Example #1
def work_func(nodes, project, spider, batch_id):
    log_common.warning("project: {}:  spider:{}  batch:{}".format(project, spider, batch_id))
    try:
        lock = dlm.lock("dlm#{}".format(batch_id), 1000 * 30)
        if lock:
            for node in nodes:
                # Check here that the running node is alive and healthy
                engine = get_engine_by_ip(node)
                args = {
                    "redis": '{{"host":"{}","port": {},"db":{},"password":"******"}}'.format(
                        db_conf.redis_host, str(db_conf.redis_port), str(db_conf.redis_db_name), db_conf.redis_pwd),
                    "batch_id": batch_id,
                    "node": node
                }
                try:
                    # jobs = engine.schedule(project, spider, **args)
                    jobs = engine_kit.schedule(engine, project, spider, **args)
                    script = CrawlScript.objects.get(name=spider, project_name=project)
                    script.job_id = jobs
                    script.save()
                    log_common.warning("{}:  {};Jobs:{}".format(project, spider, jobs))
                except Exception as err:
                    log_common.warning("start task", err)
                    log_common.warning("请发布任务到:{}".format(node))
        else:
            log_common.warning("batch:{} locked".format(batch_id))
    except Exception as e:
        log_common.warn(e)
        raise AlertException(Alert.platform_exception, '{} 运行失败\n原因: {}'.format(spider, str(e)))
    finally:
        # The distributed lock is not released explicitly; it expires with its 30-second TTL.
        pass
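A note on Example #1: `dlm` is not defined in these snippets. Judging from `dlm.lock("dlm#{}".format(batch_id), 1000 * 30)`, it behaves like a redlock-py `Redlock` instance used so that only one scheduler process dispatches a given batch. A minimal sketch of how such an object could be wired up (the connection values are assumptions, not taken from the source):

from redlock import Redlock

# db_conf is the same configuration object used throughout the examples above.
dlm = Redlock([
    {"host": db_conf.redis_host, "port": db_conf.redis_port, "db": db_conf.redis_db_name},
])

# redlock-py's lock(resource, ttl_ms) returns a Lock object or False. With the 30-second TTL
# used above, work_func never needs dlm.unlock(lock); the lock simply expires on its own.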
Example #2
    def run(self):
        while True:
            try:
                # Re-add the tasks waiting to be scheduled
                self.scheduler.remove_all_jobs()
                sync_task_models = CrawlSyncTask.objects.filter(is_deleted=0)
                if not sync_task_models:
                    log_common.warn('任务获取失败')
                    continue
                for sync_model in sync_task_models:
                    node_ports = eval(sync_model.execute_host)
                    if not sync_model.source_cfg:
                        continue
                    source_cfg = eval(sync_model.source_cfg)
                    target_cfg = eval(sync_model.target_cfg)

                    args = {
                        "conditions": source_cfg["source_condition"],
                        "path": target_cfg["target_path"],
                    }
                    trigger = sync_model.schedule_date
                    mix = "{}-{}-{}".format(trigger, sync_model.source_cfg,
                                            sync_model.target_cfg)
                    job_id = "{}-{}".format(str(sync_model.id), mix)
                    md5_job = md5(job_id)
                    crawl_redis.set("sync#cfg#{}".format(md5_job),
                                    json.dumps(args))
                    self.scheduler.add_job(work_func,
                                           trigger="cron",
                                           **eval(trigger),
                                           id=md5_job,
                                           args=[
                                               node_ports, "pro_sync_erp",
                                               "erp_sync", md5_job,
                                               sync_model.id
                                           ])
            except Exception as ex:
                import traceback
                log_common.error("调度数据同步任务失败", ex)
                log_common.error("调度数据同步任务失败 = {}".format(
                    traceback.format_exc()))
            finally:
                connections.close_all()
                time.sleep(3 * 60)
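The `**eval(trigger)` call in Example #2 implies that `sync_model.schedule_date` holds a Python-dict literal of APScheduler cron fields. A small sketch of the assumed format (the field values and scheduler setup here are illustrative, not taken from the source):

from apscheduler.schedulers.background import BackgroundScheduler

scheduler = BackgroundScheduler()

# Hypothetical stored value for schedule_date; the real strings are not shown in these examples.
trigger = "{'hour': '*/2', 'minute': '0'}"

# eval(trigger) yields {'hour': '*/2', 'minute': '0'}, which expands into cron keyword fields,
# so the add_job call above is equivalent to:
scheduler.add_job(print, trigger="cron", **eval(trigger), id="demo-job", args=["tick"])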
Example #3
def work_func(nodes, project, spider, md5_job, task_id):
    log_common.warn("当前同步任务执行节点:{}".format(json.dumps(nodes)))

    # apscheduler bug fix

    try:
        lock = dlm.lock("dlm#{}".format(md5_job), 1000 * 30)
        if lock:
            for node in nodes:
                # Check here that the running node is alive and healthy
                engine = get_engine_by_ip(node)
                try:
                    args = {
                        "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                            db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd),
                        "batch_id": md5_job,
                        "task_id": task_id
                    }
                    jobs = engine.schedule(project, spider, **args)
                    task = CrawlSyncTask.objects.get(id=task_id)
                    task.job_id = jobs
                    task.save()
                    log_common.warning("{} ,{}:  {};Jobs:{}".format(
                        str(task_id), project, spider, jobs))
                except Exception as err:
                    import traceback
                    log_common.error("请发布任务到", err)
                    log_common.error("发布分发任务失败:{}".format(
                        traceback.format_exc()))
        else:
            log_common.warning("batch:{} locked".format(md5_job))
    finally:
        # The distributed lock is not released explicitly; it expires with its 30-second TTL.
        pass
Example #4
def schedule(engine, project, spider, **args):
    time.sleep(3)
    job_list = engine.list_jobs(project)
    running_task = len(job_list.get('running'))
    pending_task = len(job_list.get('pending'))
    current_count = running_task + pending_task
    task = CrawlTask.objects.get(project_name=project, is_deleted=0)
    max_count = 5 if not task.spider_concurrency else task.spider_concurrency
    jobs = ''
    if current_count > int(max_count):
        # Push the job into the delayed-run queue instead of scheduling it now
        host_arr = get_host_by_engine(engine)
        arg = {
            'project': project,
            'spider': spider,
            'args': args,
            'host': host_arr[0],
            'port': host_arr[1]
        }
        log_common.warn('添加到延迟队列=> project: {}, spider: {}, host: {}, port: {}'.format(project, spider,
                                                                                       host_arr[0], host_arr[1]))
        crawl_redis.rpush('crawl_delay_queue', json.dumps(arg))
    else:
        jobs = engine.schedule(project, spider, **args)
    return jobs
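Example #4 only pushes overflow jobs onto the `crawl_delay_queue` Redis list; the consumer side is not shown in these snippets. A minimal sketch of what a drain loop could look like, assuming the queue entries keep the project/spider/args/host/port shape written above and that `get_engine_by_ip` accepts the same "ip:port" strings used elsewhere:

import json

def drain_delay_queue():
    # Pop queued jobs one at a time; lpop returns None once the list is empty.
    raw = crawl_redis.lpop('crawl_delay_queue')
    while raw:
        item = json.loads(raw)
        engine = get_engine_by_ip('{}:{}'.format(item['host'], item['port']))
        # Re-run the same concurrency check: if the node is still saturated,
        # schedule() simply pushes the job back onto the tail of the queue.
        schedule(engine, item['project'], item['spider'], **item['args'])
        raw = crawl_redis.lpop('crawl_delay_queue')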
Example #5
                    mix = "{}-{}-{}".format(trigger, sync_model.source_cfg,
                                            sync_model.target_cfg)
                    job_id = "{}-{}".format(str(sync_model.id), mix)
                    md5_job = md5(job_id)
                    crawl_redis.set("sync#cfg#{}".format(md5_job),
                                    json.dumps(args))
                    self.scheduler.add_job(work_func,
                                           trigger="cron",
                                           **eval(trigger),
                                           id=md5_job,
                                           args=[
                                               node_ports, "pro_sync_erp",
                                               "erp_sync", md5_job,
                                               sync_model.id
                                           ])
            except Exception as ex:
                import traceback
                log_common.error("调度数据同步任务失败", ex)
                log_common.error("调度数据同步任务失败 = {}".format(
                    traceback.format_exc()))
            finally:
                connections.close_all()
                time.sleep(3 * 60)


register_events(scheduler)
# scheduler.start()
log_common.warn("同步数据任务加载")
add_work = CreateSchedulerWork(scheduler)
add_work.start()
Example #6
def time_task():
    log_common.warn("加载爬虫脚本数据~~~~")
    load_cfg_data()
    load_auto()
Example #7
def script_start(request):
    """
    Start the selected crawler scripts.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data_scripts = json.loads(request.body.decode('utf-8'))

            if not data_scripts:
                return JsonResponse(Result.fail("没有指定脚本"))

            for data_script in data_scripts:
                _job_id = ''
                crawl_script = CrawlScript.objects.get(id=data_script['id'])
                host_list = get_hosts_by_script_id(crawl_script.id)
                for host in host_list:
                    engine = get_engine_by_ip(host)
                    if "args" in data_script and data_script["args"]:
                        for arg in data_script["args"]:
                            if 'dynamic_value' in arg:
                                script_arg = json.loads(arg)
                                sql = script_arg.get('dynamic_value')
                                result = db_kit.fetch_all_to_json(sql)
                                for r in result:
                                    if isinstance(arg, str):
                                        arg = json.loads(arg)
                                    arg['dynamic_value'] = r
                                    batch_id = encrypt_kit.md5(json.dumps(arg))
                                    args = {
                                        "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                                            db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd),
                                        "batch_id": batch_id,
                                        "node": host,
                                        "args": arg
                                    }
                                    # _job_id = engine.schedule(crawl_script.project_name, crawl_script.name, **args)
                                    log_common.warn('>>>> 动态分割脚本启动 {}'.format(json.dumps(args)))
                                    _job_id = engine_kit.schedule(engine, crawl_script.project_name, crawl_script.name, **args)
                                    crawl_redis.set("args#{}".format(batch_id), json.dumps(arg))
                            else:
                                batch_id = encrypt_kit.md5(json.dumps(arg))
                                args = {
                                    "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                                        db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd),
                                    "batch_id": batch_id,
                                    "node": host,
                                    "args": arg
                                }
                                # _job_id = engine.schedule(crawl_script.project_name, crawl_script.name, **args)
                                _job_id = engine_kit.schedule(engine, crawl_script.project_name, crawl_script.name,
                                                              **args)
                                crawl_redis.set("args#{}".format(batch_id), arg)
                    else:
                        ta = time.strftime('%Y-%m-%d %H:%M:%S')
                        batch_id = encrypt_kit.md5(ta)
                        args = {
                            "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                                db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd),
                            "batch_id": batch_id,
                            "node": host,
                            "args": '{}'
                        }
                        _job_id = engine_kit.schedule(engine, crawl_script.project_name, crawl_script.name, **args)
                        # _job_id = engine.schedule(crawl_script.project_name, crawl_script.name, **args)
                        crawl_redis.set("args#{}".format(batch_id), json.dumps('{}'))
                crawl_script.job_id = _job_id
                crawl_script.save()
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as err:
        r = Result.fail(err)
        return JsonResponse(r)
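For reference, the request body that `script_start` in Example #7 expects can be inferred from the view: a JSON array of script entries, where each optional `args` element is a JSON string, and an element containing `dynamic_value` carries a SQL statement used to fan the run out over its result rows. An illustrative body (ids, argument names and SQL are made up; only the structure is inferred):

import json

# Illustrative only; nothing here is taken verbatim from the source.
example_body = json.dumps([
    {
        "id": 101,
        "args": [
            '{"dynamic_value": "SELECT shop_id FROM shops WHERE enabled = 1"}',  # one job per result row
            '{"region": "east"}'                                                 # scheduled as-is
        ]
    }
])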
Example #8
def collect_script_progress(request):
    """
    Receive and persist task execution progress reported by crawler scripts.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            script_name = data["script_name"]
            batch = data["batch"]
            script_progress = CrawlScriptProgress.objects.filter(script_name=script_name, batch=batch)
            arg_key = data.get('arg_key')
            if arg_key:
                data['arg'] = bytes.decode(crawl_redis.get('args#{}'.format(arg_key)))

            log_common.error('script_name: {}, batch_id: {}'.format(script_name, batch))

            if script_progress:
                log_common.error('update progress script_name:{}, batch_id: {}'.format(script_name, batch))
                sp = script_progress[0]
                data["task_name"] = sp.task_name
                data["id"] = sp.id

                if data['status'] == -1 and not data.get('msg') and sp.msg:
                    data['msg'] = sp.msg

                result = script_progress.update(**data)

                if data['status'] == -1:
                    user_alert_rel = CrawlUserAlertRel.objects.filter(alert_id=12, is_deleted=0)
                    user_ids = list(map(lambda x: str(x.user_id), user_alert_rel))
                    to_user = '******'.join(user_ids)
                    wx_tools.env_send_card_message(to_user, '爬虫异常', '爬虫: {} 发生异常'.format(script_name))
            else:
                try:
                    log_common.error('new progress script_name:{}, batch_id: {}'.format(script_name, batch))
                    css = CrawlScript.objects.filter(name=script_name, is_deleted=0)
                    if css:
                        cs = css[0]
                        data["task_name"] = cs.task_name
                        result = CrawlScriptProgress.objects.create(**data)
                    else:
                        log_common.warn("no find {} of task!".format(script_name))
                except IntegrityError as e:
                    log_common.error('>>>>>>>>>>>>>>>>>>> catch IntegrityError >>>>>>>>>>>>>>>>>>>>>')
                    # Handle the case where a script reports the same batch twice concurrently
                    script_progress = CrawlScriptProgress.objects.filter(script_name=script_name, batch=batch)
                    sp = script_progress[0]
                    data["task_name"] = sp.task_name
                    data["id"] = sp.id
                    result = script_progress.update(**data)
                    if data['status'] == -1:
                        user_alert_rel = CrawlUserAlertRel.objects.filter(alert_id=12, is_deleted=0)
                        user_ids = list(map(lambda x: str(x.user_id), user_alert_rel))
                        to_user = '******'.join(user_ids)
                        wx_tools.env_send_card_message(to_user, '爬虫异常', '爬虫: {} 发生异常'.format(script_name))
            r = Result.success({})
            return JsonResponse(r)
    except Exception as e:
        import traceback
        log_common.error('v3v3:上报数据异常,具体错误 = {}'.format(traceback.format_exc()))
        r = Result.fail(e)
        return JsonResponse(r)
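Similarly, the progress payload that `collect_script_progress` in Example #8 consumes appears to carry at least `script_name`, `batch` and `status` (where -1 marks a failure and triggers the WeChat alert), plus optional `msg` and `arg_key`; any remaining keys are written straight into `CrawlScriptProgress`. An illustrative payload with made-up values:

# Illustrative only; the values are invented, the keys are the ones the view reads above.
example_progress = {
    "script_name": "erp_sync",        # matched against CrawlScript.name
    "batch": "<md5 batch id>",        # batch id generated when the job was scheduled
    "status": -1,                     # -1 takes the alert path above
    "msg": "login page changed",      # optional error description
    "arg_key": "<md5 batch id>",      # optional; resolves args#<key> from Redis into data['arg']
}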
Example #9
    def run(self):
        while True:
            try:
                # Clear all jobs
                # self.scheduler.remove_all_jobs()
                log_common.warn('*********** 刷新调度器 **********')
                redis_jobs = self.scheduler.get_jobs()
                redis_job_ids = [rj.id for rj in redis_jobs]
                db_job_ids = []

                script_models = CrawlScript.objects.filter(is_deleted=0, is_disable=0)
                for script_model in script_models:
                    node_list = []
                    if not script_model.hosts or script_model.hosts == '[]':
                        project = CrawlProject.objects.get(id=script_model.project_id)
                        task = CrawlTask.objects.get(id=project.task_id)
                        for node_id in json.loads(task.node_ids):
                            node = CrawlNode.objects.get(id=node_id)
                            node_list.append('{}:{}'.format(node.node_ip, node.node_port))
                    else:
                        node_list = eval(script_model.hosts)
                    json_args = []
                    if script_model.args:
                        json_args = eval(script_model.args)
                    for json_arg in json_args:
                        script_args = json_arg["args"]
                        script_triggers = json_arg["trigger"]
                        fix_type = json_arg["fix_type"]

                        try:
                            if script_triggers:
                                # Backfill (data-fix) logic
                                if fix_type in (1, 2, 3):
                                    run_date = json_arg['fix_date']
                                    mix = "{}-{}".format(json.dumps(script_triggers), json.dumps(script_args))
                                    job_id = "fix-{}-{}".format(str(script_model.id), md5(mix))
                                    log_common.warn('添加补数据调度任务: {}'.format(script_model.id))
                                    # For immediate local testing
                                    # schedule_fix_data(node_list, script_model.project_name, script_model.name, script_model.id, script_args, job_id, fix_type)

                                    # Normal path
                                    db_job_ids.append(job_id)
                                    if datetime.datetime.strptime(run_date, '%Y-%m-%d %H:%M:%S') >= datetime.datetime.now() and job_id not in redis_job_ids:
                                        self.scheduler.add_job(schedule_fix_data,
                                                               'date',
                                                               run_date=run_date,
                                                               id=job_id,
                                                               args=[node_list, script_model.project_name,
                                                                     script_model.name, script_model.id,
                                                                     script_args, job_id, fix_type],
                                                               misfire_grace_time=60)
                                else:
                                    # Dynamic parameters: one job per SQL result row
                                    if json_arg.get('dynamic_value'):
                                        sql = json_arg.get('dynamic_value')
                                        result = db_kit.fetch_all_to_json(sql)
                                        for r in result:
                                            script_args['dynamic_value'] = r
                                            log_common.warn('>>>> 动态切割参数调度 {}, args: {}'.format(script_model.name, script_args))
                                            mix = "{}-{}".format(json.dumps(script_triggers), json.dumps(script_args))
                                            job_id = "{}-{}".format(str(script_model.id), md5(mix))
                                            log_common.warn("args#{}".format(job_id))
                                            crawl_redis.set("args#{}".format(job_id), json.dumps(script_args))
                                            # log_common.warn('添加调度任务: {}'.format(script_model.id))
                                            db_job_ids.append(job_id)
                                            if job_id not in redis_job_ids:
                                                self.scheduler.add_job(work_func,
                                                                       trigger="cron",
                                                                       **script_triggers,
                                                                       id=job_id,
                                                                       args=[node_list, script_model.project_name,
                                                                             script_model.name, job_id],
                                                                       misfire_grace_time=60)
                                    else:
                                        mix = "{}-{}".format(json.dumps(script_triggers), json.dumps(script_args))
                                        job_id = "{}-{}".format(str(script_model.id), md5(mix))
                                        crawl_redis.set("args#{}".format(job_id), json.dumps(script_args))
                                        log_common.warn('添加调度任务: {}'.format(script_model.id))
                                        db_job_ids.append(job_id)
                                        if job_id not in redis_job_ids:
                                            self.scheduler.add_job(work_func,
                                                                   trigger="cron",
                                                                   **script_triggers,
                                                                   id=job_id,
                                                                   args=[node_list, script_model.project_name,
                                                                         script_model.name, job_id],
                                                                   misfire_grace_time=60)
                        except Exception as e:
                            log_common.warn(">>>> 添加报错任务报错: ", e)
                            continue

                c_ids = [i for i in redis_job_ids if i not in db_job_ids]
                for c_id in c_ids:
                    self.scheduler.remove_job(c_id)
                    log_common.warn('移除差异任务: {}'.format(c_id))
                db_job_ids.clear()
            except Exception as ex:
                log_common.warn(ex)
                continue
            finally:
                connections.close_all()
                time.sleep(7 * 60)
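The `md5(...)` helper used to derive job ids in Examples #2, #5 and #9 is not shown in these snippets. It presumably hashes the combined trigger/args string so that a job id changes whenever the schedule or parameters change; a minimal sketch of such a helper:

import hashlib

def md5(text):
    # Hash the job-id material (trigger + args) so identical configurations map to the same id.
    return hashlib.md5(text.encode('utf-8')).hexdigest()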