Example #1
def task_deploy(request, project_name):
    work_path = os.getcwd()
    try:
        log_common.info('Entering deploy handler')
        if request.method == 'GET':
            log_common.info('Starting deploy logic')
            path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
            project_path = join(path, project_name)
            # Locate the packaged egg file
            egg = find_egg(project_path)
            if not egg:
                raise Exception('No packaged egg file found')
            with open(join(project_path, egg), 'rb') as egg_file:
                egg_file_content = egg_file.read()

            project = CrawlProject.objects.get(name=project_name, is_deleted=0)
            task = CrawlTask.objects.get(id=project.task_id)
            task.is_deploy = 1
            task.save()
            for node_id in json.loads(task.node_ids):
                node = CrawlNode.objects.get(id=node_id)
                engine = get_engine(node)
                log_common.info('{}: deploying {}'.format(node.node_ip,
                                                          project_name))
                engine.add_version(project_name, int(time.time()),
                                   egg_file_content)
                log_common.info('{}: deployed {}'.format(node.node_ip,
                                                         project_name))
                # Update deploy info: soft-delete old records, then create the new one
                deployed_at = timezone.now()
                CrawlDeploy.objects.filter(
                    node_id=node.id,
                    project_id=project.id).update(is_deleted=1)
                deploy, result = CrawlDeploy.objects.update_or_create(
                    node_id=node.id,
                    project_id=project.id,
                    deployed_at=deployed_at,
                    description=project.description)
            r = Result.success("")
            return JsonResponse(r)
    except Exception as e:
        import traceback
        log_common.error("task_deploy => ", e)
        log_common.error("task_deploy => {}".format(traceback.format_exc()))
        r = Result.fail(e)
        return JsonResponse(r)
    finally:
        os.chdir(work_path)
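The deploy handler above relies on a find_egg helper that returns the filename of the packaged egg inside a project directory. A minimal sketch of such a helper, assuming it simply picks the first *.egg file it finds (the project's real implementation may differ):

import os

def find_egg(project_path):
    # Hypothetical helper matching how task_deploy and build_egg use it:
    # return the filename of the first *.egg in project_path, or None.
    for name in sorted(os.listdir(project_path)):
        if name.endswith('.egg'):
            return name
    return None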
Example #2
def build_egg(project, include_data=False):
    """
    构建egg包
    :param project:
    :param include_data
    :return:
    """
    work_path = os.getcwd()
    try:
        path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
        project_path = join(path, project)
        os.chdir(project_path)
        settings = config(project_path, 'settings', 'default')
        if include_data:
            create_data_setup_py(project_path,
                                 settings=settings,
                                 project=project)
        else:
            create_default_setup_py(project_path,
                                    settings=settings,
                                    project=project)

        d = tempfile.mkdtemp(prefix="dt-")
        o = open(os.path.join(d, "stdout"), "wb")
        e = open(os.path.join(d, "stderr"), "wb")
        retry_on_eintr(
            check_call,
            ['python', 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
            stdout=o,
            stderr=e)

        o.close()
        e.close()
        egg = glob.glob(os.path.join(d, '*.egg'))[0]
        # Remove any previously built egg before moving in the new one
        if find_egg(project_path):
            os.remove(join(project_path, find_egg(project_path)))
        shutil.move(egg, project_path)
        return join(project_path, find_egg(project_path))
    except Exception as e:
        import traceback
        log_common.error(">build_egg ", e)
        log_common.error(">build_egg  = {}", traceback.format_exc())
    finally:
        os.chdir(work_path)
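build_egg runs setup.py through a retry_on_eintr wrapper. A minimal sketch, assuming it behaves like the similarly named helper in older Scrapy/scrapyd tooling and simply retries a call interrupted by a signal (EINTR); on modern Python the interpreter already retries interrupted system calls, so this is mostly a thin shim:

import errno

def retry_on_eintr(function, *args, **kwargs):
    # Call function(*args, **kwargs), retrying while it fails with EINTR.
    while True:
        try:
            return function(*args, **kwargs)
        except IOError as e:
            if e.errno != errno.EINTR:
                raise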
Example #3
    def run(self):
        while True:
            try:
                # Re-add the tasks awaiting scheduling
                self.scheduler.remove_all_jobs()
                sync_task_models = CrawlSyncTask.objects.filter(is_deleted=0)
                if not sync_task_models:
                    log_common.warn('No sync tasks to schedule')
                    continue
                for sync_model in sync_task_models:
                    node_ports = eval(sync_model.execute_host)
                    if not sync_model.source_cfg:
                        continue
                    source_cfg = eval(sync_model.source_cfg)
                    target_cfg = eval(sync_model.target_cfg)

                    args = {
                        "conditions": source_cfg["source_condition"],
                        "path": target_cfg["target_path"],
                    }
                    trigger = sync_model.schedule_date
                    mix = "{}-{}-{}".format(trigger, sync_model.source_cfg,
                                            sync_model.target_cfg)
                    job_id = "{}-{}".format(str(sync_model.id), mix)
                    md5_job = md5(job_id)
                    crawl_redis.set("sync#cfg#{}".format(md5_job),
                                    json.dumps(args))
                    self.scheduler.add_job(work_func,
                                           trigger="cron",
                                           **eval(trigger),
                                           id=md5_job,
                                           args=[
                                               node_ports, "pro_sync_erp",
                                               "erp_sync", md5_job,
                                               sync_model.id
                                           ])
            except Exception as ex:
                import traceback
                log_common.error("调度数据同步任务失败", ex)
                log_common.error("调度数据同步任务失败 = {}".format(
                    traceback.format_exc()))
            finally:
                connections.close_all()
                time.sleep(3 * 60)
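The scheduler derives its APScheduler job ids and redis key suffixes from an md5 helper, and expands schedule_date with **eval(trigger), so that field is presumably stored as a dict literal of cron fields (for example "{'hour': '3', 'minute': '0'}"). A plausible sketch of the helper, assuming it just returns the hex digest of the input string:

import hashlib

def md5(text):
    # Assumed helper: hex MD5 digest of a string, used as the job id.
    return hashlib.md5(text.encode('utf-8')).hexdigest()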
Example #4
def node_spider_info(request):
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        node_url = engine_url(data['node_ip'], data['node_port'])
        client_d = {}
        try:
            client_d['status'] = 'disconnect'
            client_d['projects_count'] = 0
            client_d['projects'] = []
            client_d['spiders_count'] = client_d['pending'] = client_d[
                'running'] = client_d['finished'] = 0
            response = requests.get(node_url + '/listprojects.json', timeout=2)
            if response:
                info = json.loads(response.text)
                client_d['projects_count'] = len(info['projects'])
                client_d['projects'] = info['projects']
                client_d['status'] = info['status']
                for project in info['projects']:
                    project_info = json.loads(
                        requests.get(node_url + '/listspiders.json?project=' +
                                     project,
                                     timeout=2).text)
                    client_d['spiders_count'] = client_d[
                        'spiders_count'] + len(project_info['spiders'])
                    project_info = json.loads(
                        requests.get(node_url + '/listjobs.json?project=' +
                                     project,
                                     timeout=2).text)
                    client_d['pending'] = client_d['pending'] + len(
                        project_info['pending'])
                    client_d['running'] = client_d['running'] + len(
                        project_info['running'])
                    client_d['finished'] = client_d['finished'] + len(
                        project_info['finished'])
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ConnectTimeout,
                requests.exceptions.HTTPError) as e:
            log_common.error(e)
        r = Result.success(client_d)
        return JsonResponse(r)
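engine_url is assumed to build the base URL of a node's scrapyd service from its IP and port; the paths appended above (/listprojects.json, /listspiders.json, /listjobs.json) are standard scrapyd API endpoints. A minimal sketch under that assumption:

def engine_url(node_ip, node_port):
    # Assumed helper: base URL of the node's scrapyd service.
    return 'http://{}:{}'.format(node_ip, node_port)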
Example #5
    def run(self):
        while True:
            try:
                while crawl_redis.llen('crawl_delay_queue') > 0:
                    log_common.info('Delay queue currently holds {} pending delayed tasks'.format(
                        str(crawl_redis.llen('crawl_delay_queue'))))
                    arg = crawl_redis.blpop('crawl_delay_queue', timeout=3)
                    if arg:
                        run_arg = json.loads(arg[1])
                        project = run_arg.get('project')
                        spider = run_arg.get('spider')
                        host = run_arg.get('host')
                        port = run_arg.get('port')
                        args = run_arg.get('args')

                        engine = get_general_engine(host, port)
                        engine_kit.schedule(engine, project, spider, **args)
                    time.sleep(3)
            except Exception as e:
                log_common.error('>>>> [DelayTaskSchedulerWork] scheduling raised an exception: {}'.format(e))
            finally:
                time.sleep(7 * 60)
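The worker above consumes JSON payloads from the crawl_delay_queue redis list. A hypothetical producer is sketched below; the payload keys mirror exactly what the consumer reads back with run_arg.get(...), while the function name and its caller are assumptions:

import json

def enqueue_delayed_task(crawl_redis, project, spider, host, port, args=None):
    # Push a delayed scheduling request; blpop on the consumer side makes
    # rpush here behave as a FIFO queue.
    payload = {
        'project': project,
        'spider': spider,
        'host': host,
        'port': port,
        'args': args or {},
    }
    crawl_redis.rpush('crawl_delay_queue', json.dumps(payload))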
Example #6
def work_func(nodes, project, spider, md5_job, task_id):
    log_common.warn("当前同步任务执行节点:{}".format(json.dumps(nodes)))

    # apscheduler bug fix

    try:
        lock = dlm.lock("dlm#{}".format(md5_job), 1000 * 30)
        if lock:
            for node in nodes:
                # Check that the executing node is alive and healthy here
                engine = get_engine_by_ip(node)
                try:
                    args = {
                        "redis":
                        '{{"host":"{}","port": {},"db":1,"password":"******"}}'.
                        format(db_conf.redis_host, str(db_conf.redis_port),
                               db_conf.redis_pwd),
                        "batch_id":
                        md5_job,
                        "task_id":
                        task_id
                    }
                    jobs = engine.schedule(project, spider, **args)
                    task = CrawlSyncTask.objects.get(id=task_id)
                    task.job_id = jobs
                    task.save()
                    log_common.warning("{} ,{}:  {};Jobs:{}".format(
                        str(task_id), project, spider, jobs))
                except Exception as err:
                    import traceback
                    log_common.error("请发布任务到", err)
                    log_common.error("发布分发任务失败:{}".format(
                        traceback.format_exc()))
        else:
            log_common.warning("batch:{} locked".format(md5_job))
    finally:
        pass
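Note that the lock taken with dlm.lock(...) is never released here; it presumably just expires after the 30-second TTL. If dlm is a redlock-py style client (lock(resource, ttl) returning a lock object or False, unlock(lock) releasing it), an explicit release could be wrapped as in the sketch below; this is an assumption about the client's API, not the project's actual code:

from contextlib import contextmanager

@contextmanager
def distributed_lock(dlm, resource, ttl_ms):
    # Acquire a redlock-style lock and always release it on exit
    # (assumed client API: lock()/unlock() as in redlock-py).
    lock = dlm.lock(resource, ttl_ms)
    try:
        yield lock
    finally:
        if lock:
            dlm.unlock(lock)

# Usage inside work_func would then be roughly:
#     with distributed_lock(dlm, "dlm#{}".format(md5_job), 1000 * 30) as lock:
#         if lock:
#             ...  # dispatch to nodes as above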
Example #7
def collect_script_progress(request):
    """
    采集接收保存任务执行数据
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            script_name = data["script_name"]
            batch = data["batch"]
            script_progress = CrawlScriptProgress.objects.filter(script_name=script_name, batch=batch)
            arg_key = data.get('arg_key')
            if arg_key:
                data['arg'] = bytes.decode(crawl_redis.get('args#{}'.format(arg_key)))

            log_common.error('script_name: {}, batch_id: {}'.format(script_name, batch))

            if script_progress:
                log_common.error('update progress script_name:{}, batch_id: {}'.format(script_name, batch))
                sp = script_progress[0]
                data["task_name"] = sp.task_name
                data["id"] = sp.id

                if data['status'] == -1 and not data.get('msg') and sp.msg:
                    data['msg'] = sp.msg

                result = script_progress.update(**data)

                if data['status'] == -1:
                    user_alert_rel = CrawlUserAlertRel.objects.filter(alert_id=12, is_deleted=0)
                    user_ids = list(map(lambda x: str(x.user_id), user_alert_rel))
                    to_user = '******'.join(user_ids)
                    wx_tools.env_send_card_message(to_user, 'Spider exception', 'Spider {} raised an exception'.format(script_name))
            else:
                try:
                    log_common.error('new progress script_name:{}, batch_id: {}'.format(script_name, batch))
                    css = CrawlScript.objects.filter(name=script_name, is_deleted=0)
                    if css:
                        cs = css[0]
                        data["task_name"] = cs.task_name
                        result = CrawlScriptProgress.objects.create(**data)
                    else:
                        log_common.warn("no find {} of task!".format(script_name))
                except IntegrityError as e:
                    log_common.error('>>>>>>>>>>>>>>>>>>> catch IntegrityError >>>>>>>>>>>>>>>>>>>>>')
                    # Handle the case where the script reports twice under concurrency
                    script_progress = CrawlScriptProgress.objects.filter(script_name=script_name, batch=batch)
                    sp = script_progress[0]
                    data["task_name"] = sp.task_name
                    data["id"] = sp.id
                    result = script_progress.update(**data)
                    if data['status'] == -1:
                        user_alert_rel = CrawlUserAlertRel.objects.filter(alert_id=12, is_deleted=0)
                        user_ids = list(map(lambda x: str(x.user_id), user_alert_rel))
                        to_user = '******'.join(user_ids)
                        wx_tools.env_send_card_message(to_user, 'Spider exception', 'Spider {} raised an exception'.format(script_name))
            r = Result.success({})
            return JsonResponse(r)
    except Exception as e:
        import traceback
        log_common.error('v3v3: failed to save reported progress data, details = {}'.format(traceback.format_exc()))
        r = Result.fail(e)
        return JsonResponse(r)
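For reference, a progress report POSTed to this view might look like the sketch below. The field names (script_name, batch, status, msg, arg_key) mirror what the view reads out of request.body; the URL and the example values are assumptions:

import json
import requests

payload = {
    'script_name': 'demo_spider',   # hypothetical example values
    'batch': '20240101120000',
    'status': -1,                   # -1 triggers the alert branch above
    'msg': 'spider crashed',
    'arg_key': 'demo-arg-key',      # optional; resolved via the crawl_redis key args#<arg_key>
}
requests.post('http://crawl-admin.example.com/collect_script_progress',
              data=json.dumps(payload), timeout=5)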