def work_func(nodes, project, spider, batch_id):
    log_common.warning("project: {}: spider:{} batch:{}".format(project, spider, batch_id))
    try:
        lock = dlm.lock("dlm#{}".format(batch_id), 1000 * 30)
        if lock:
            for node in nodes:
                # Check that the target node is alive and healthy before dispatching.
                engine = get_engine_by_ip(node)
                args = {
                    "redis": '{{"host":"{}","port": {},"db":{},"password":"******"}}'.format(
                        db_conf.redis_host, str(db_conf.redis_port),
                        str(db_conf.redis_db_name), db_conf.redis_pwd),
                    "batch_id": batch_id,
                    "node": node
                }
                try:
                    # jobs = engine.schedule(project, spider, **args)
                    jobs = engine_kit.schedule(engine, project, spider, **args)
                    script = CrawlScript.objects.get(name=spider, project_name=project)
                    script.job_id = jobs
                    script.save()
                    log_common.warning("{}: {};Jobs:{}".format(project, spider, jobs))
                except Exception as err:
                    log_common.warning("start task failed: {}".format(err))
                    log_common.warning("please publish the task to node: {}".format(node))
        else:
            log_common.warning("batch:{} locked".format(batch_id))
    except Exception as e:
        log_common.warn(e)
        raise AlertException(Alert.platform_exception,
                             '{} failed to run\nreason: {}'.format(spider, str(e)))
    finally:
        pass
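# work_func above relies on module-level `dlm` (a Redis distributed lock manager) and
# `crawl_redis` objects that are not shown in these snippets. A minimal sketch of how they
# might be built, assuming the redlock-py and redis-py packages and the same db_conf
# settings object; the project's actual wiring may differ:
import redis
from redlock import Redlock

crawl_redis = redis.StrictRedis(host=db_conf.redis_host,
                                port=db_conf.redis_port,
                                db=db_conf.redis_db_name,
                                password=db_conf.redis_pwd)

# dlm.lock("dlm#<batch_id>", 1000 * 30) takes the TTL in milliseconds (30 seconds here),
# matching the call in work_func above.
dlm = Redlock([{"host": db_conf.redis_host,
                "port": db_conf.redis_port,
                "db": db_conf.redis_db_name,
                "password": db_conf.redis_pwd}])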
def run(self):
    while True:
        try:
            # Reload the sync tasks that are waiting to be scheduled.
            self.scheduler.remove_all_jobs()
            sync_task_models = CrawlSyncTask.objects.filter(is_deleted=0)
            if not sync_task_models:
                log_common.warn('failed to fetch sync tasks')
                continue
            for sync_model in sync_task_models:
                node_ports = eval(sync_model.execute_host)
                if not sync_model.source_cfg:
                    continue
                source_cfg = eval(sync_model.source_cfg)
                target_cfg = eval(sync_model.target_cfg)
                args = {
                    "conditions": source_cfg["source_condition"],
                    "path": target_cfg["target_path"],
                }
                trigger = sync_model.schedule_date
                mix = "{}-{}-{}".format(trigger, sync_model.source_cfg, sync_model.target_cfg)
                job_id = "{}-{}".format(str(sync_model.id), mix)
                md5_job = md5(job_id)
                crawl_redis.set("sync#cfg#{}".format(md5_job), json.dumps(args))
                self.scheduler.add_job(work_func,
                                       trigger="cron",
                                       **eval(trigger),
                                       id=md5_job,
                                       args=[node_ports, "pro_sync_erp", "erp_sync", md5_job, sync_model.id])
        except Exception as ex:
            import traceback
            log_common.error("failed to schedule data sync task: {}".format(ex))
            log_common.error("failed to schedule data sync task = {}".format(traceback.format_exc()))
        finally:
            connections.close_all()
            time.sleep(3 * 60)
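# `md5(job_id)` above is a project helper that is not shown in these snippets. A minimal
# sketch of what the call sites need (a stable hex digest of the input string); the real
# encrypt_kit.md5 used elsewhere in the project may differ in detail:
import hashlib

def md5(text):
    # Stable hex digest used to build deterministic APScheduler job ids.
    return hashlib.md5(text.encode('utf-8')).hexdigest()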
def work_func(nodes, project, spider, md5_job, task_id):
    log_common.warn("sync task execution nodes: {}".format(json.dumps(nodes)))
    # apscheduler bug fix
    try:
        lock = dlm.lock("dlm#{}".format(md5_job), 1000 * 30)
        if lock:
            for node in nodes:
                # Check that the target node is alive and healthy before dispatching.
                engine = get_engine_by_ip(node)
                try:
                    args = {
                        "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                            db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd),
                        "batch_id": md5_job,
                        "task_id": task_id
                    }
                    jobs = engine.schedule(project, spider, **args)
                    task = CrawlSyncTask.objects.get(id=task_id)
                    task.job_id = jobs
                    task.save()
                    log_common.warning("{} ,{}: {};Jobs:{}".format(str(task_id), project, spider, jobs))
                except Exception as err:
                    import traceback
                    log_common.error("failed to publish the task to the node: {}".format(err))
                    log_common.error("failed to dispatch sync task: {}".format(traceback.format_exc()))
        else:
            log_common.warning("batch:{} locked".format(md5_job))
    finally:
        pass
def schedule(engine, project, spider, **args):
    time.sleep(3)
    job_list = engine.list_jobs(project)
    running_task = len(job_list.get('running'))
    pending_task = len(job_list.get('pending'))
    current_count = running_task + pending_task
    task = CrawlTask.objects.get(project_name=project, is_deleted=0)
    max_count = 5 if not task.spider_concurrency else task.spider_concurrency
    jobs = ''
    if current_count > int(max_count):
        # The node is saturated: push the request into the delayed-run queue instead.
        host_arr = get_host_by_engine(engine)
        arg = {
            'project': project,
            'spider': spider,
            'args': args,
            'host': host_arr[0],
            'port': host_arr[1]
        }
        log_common.warn('added to delay queue => project: {}, spider: {}, host: {}, port: {}'.format(
            project, spider, host_arr[0], host_arr[1]))
        crawl_redis.rpush('crawl_delay_queue', json.dumps(arg))
    else:
        jobs = engine.schedule(project, spider, **args)
    return jobs
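# schedule() above only enqueues the overflow request into the `crawl_delay_queue` Redis
# list; these snippets do not show who drains it. A hypothetical consumer sketch — the
# function name `drain_delay_queue` and its polling loop are illustrative, not the
# project's actual code:
def drain_delay_queue():
    while True:
        raw = crawl_redis.lpop('crawl_delay_queue')
        if not raw:
            time.sleep(30)
            continue
        item = json.loads(raw)
        engine = get_engine_by_ip('{}:{}'.format(item['host'], item['port']))
        # Re-apply the same concurrency check; if the node is still saturated,
        # schedule() simply re-enqueues the request.
        schedule(engine, item['project'], item['spider'], **item['args'])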
register_events(scheduler)
# scheduler.start()
log_common.warn("data sync tasks loaded")

add_work = CreateSchedulerWork(scheduler)
add_work.start()
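# The `scheduler` passed to register_events() and CreateSchedulerWork above is created
# elsewhere. A minimal sketch, assuming APScheduler's BackgroundScheduler with a Redis
# job store (the redis_jobs/redis_job_ids names in the refresh loop below suggest one,
# but the project's actual job store and settings may differ):
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.jobstores.redis import RedisJobStore

scheduler = BackgroundScheduler()
scheduler.add_jobstore(RedisJobStore(host=db_conf.redis_host,
                                     port=db_conf.redis_port,
                                     db=db_conf.redis_db_name,
                                     password=db_conf.redis_pwd),
                       'default')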
def time_task():
    log_common.warn("loading spider script data ~~~~")
    load_cfg_data()
    load_auto()
def script_start(request):
    """
    Start a script.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data_scripts = json.loads(request.body.decode('utf-8'))
            if not data_scripts:
                return JsonResponse(Result.fail("no script specified"))
            for data_script in data_scripts:
                _job_id = ''
                crawl_script = CrawlScript.objects.get(id=data_script['id'])
                host_list = get_hosts_by_script_id(crawl_script.id)
                for host in host_list:
                    engine = get_engine_by_ip(host)
                    if "args" in data_script and data_script["args"]:
                        for arg in data_script["args"]:
                            if 'dynamic_value' in arg:
                                script_arg = json.loads(arg)
                                sql = script_arg.get('dynamic_value')
                                result = db_kit.fetch_all_to_json(sql)
                                # Split the script into one job per dynamic value.
                                for r in result:
                                    if isinstance(arg, str):
                                        arg = json.loads(arg)
                                    arg['dynamic_value'] = r
                                    batch_id = encrypt_kit.md5(json.dumps(arg))
                                    args = {
                                        "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                                            db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd),
                                        "batch_id": batch_id,
                                        "node": host,
                                        "args": arg
                                    }
                                    # _job_id = engine.schedule(crawl_script.project_name, crawl_script.name, **args)
                                    log_common.warn('>>>> starting dynamically split script {}'.format(json.dumps(args)))
                                    _job_id = engine_kit.schedule(engine, crawl_script.project_name, crawl_script.name, **args)
                                    crawl_redis.set("args#{}".format(batch_id), json.dumps(arg))
                            else:
                                batch_id = encrypt_kit.md5(json.dumps(arg))
                                args = {
                                    "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                                        db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd),
                                    "batch_id": batch_id,
                                    "node": host,
                                    "args": arg
                                }
                                # _job_id = engine.schedule(crawl_script.project_name, crawl_script.name, **args)
                                _job_id = engine_kit.schedule(engine, crawl_script.project_name, crawl_script.name, **args)
                                crawl_redis.set("args#{}".format(batch_id), arg)
                    else:
                        ta = time.strftime('%Y-%m-%d %H:%M:%S')
                        batch_id = encrypt_kit.md5(ta)
                        args = {
                            "redis": '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                                db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd),
                            "batch_id": batch_id,
                            "node": host,
                            "args": '{}'
                        }
                        _job_id = engine_kit.schedule(engine, crawl_script.project_name, crawl_script.name, **args)
                        # _job_id = engine.schedule(crawl_script.project_name, crawl_script.name, **args)
                        crawl_redis.set("args#{}".format(batch_id), json.dumps('{}'))
                crawl_script.job_id = _job_id
                crawl_script.save()
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as err:
        r = Result.fail(err)
        return JsonResponse(r)
def collect_script_progress(request):
    """
    Receive and persist the execution progress reported by the crawlers.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            script_name = data["script_name"]
            batch = data["batch"]
            script_progress = CrawlScriptProgress.objects.filter(script_name=script_name, batch=batch)
            arg_key = data.get('arg_key')
            if arg_key:
                data['arg'] = bytes.decode(crawl_redis.get('args#{}'.format(arg_key)))
            log_common.error('script_name: {}, batch_id: {}'.format(script_name, batch))
            if script_progress:
                log_common.error('update progress script_name:{}, batch_id: {}'.format(script_name, batch))
                sp = script_progress[0]
                data["task_name"] = sp.task_name
                data["id"] = sp.id
                if data['status'] == -1 and not data.get('msg') and sp.msg:
                    data['msg'] = sp.msg
                result = script_progress.update(**data)
                if data['status'] == -1:
                    user_alert_rel = CrawlUserAlertRel.objects.filter(alert_id=12, is_deleted=0)
                    user_ids = list(map(lambda x: str(x.user_id), user_alert_rel))
                    to_user = '******'.join(user_ids)
                    wx_tools.env_send_card_message(to_user, 'Spider exception', 'spider {} raised an exception'.format(script_name))
            else:
                try:
                    log_common.error('new progress script_name:{}, batch_id: {}'.format(script_name, batch))
                    css = CrawlScript.objects.filter(name=script_name, is_deleted=0)
                    if css:
                        cs = css[0]
                        data["task_name"] = cs.task_name
                        result = CrawlScriptProgress.objects.create(**data)
                    else:
                        log_common.warn("no task found for {}!".format(script_name))
                except IntegrityError as e:
                    log_common.error('>>>>>>>>>>>>>>>>>>> catch IntegrityError >>>>>>>>>>>>>>>>>>>>>')
                    # Handle the race where a script reports the same progress twice concurrently.
                    script_progress = CrawlScriptProgress.objects.filter(script_name=script_name, batch=batch)
                    sp = script_progress[0]
                    data["task_name"] = sp.task_name
                    data["id"] = sp.id
                    result = script_progress.update(**data)
                    if data['status'] == -1:
                        user_alert_rel = CrawlUserAlertRel.objects.filter(alert_id=12, is_deleted=0)
                        user_ids = list(map(lambda x: x.user_id, user_alert_rel))
                        to_user = '******'.join(user_ids)
                        wx_tools.env_send_card_message(to_user, 'Spider exception', 'spider {} raised an exception'.format(script_name))
        r = Result.success({})
        return JsonResponse(r)
    except Exception as e:
        import traceback
        log_common.error('v3v3: failed to save reported progress, error = {}'.format(traceback.format_exc()))
        r = Result.fail(e)
        return JsonResponse(r)
def run(self):
    while True:
        try:
            # Clear all jobs
            # self.scheduler.remove_all_jobs()
            log_common.warn('*********** refreshing scheduler **********')
            redis_jobs = self.scheduler.get_jobs()
            redis_job_ids = [rj.id for rj in redis_jobs]
            db_job_ids = []
            script_models = CrawlScript.objects.filter(is_deleted=0, is_disable=0)
            for script_model in script_models:
                node_list = []
                if not script_model.hosts or script_model.hosts == '[]':
                    project = CrawlProject.objects.get(id=script_model.project_id)
                    task = CrawlTask.objects.get(id=project.task_id)
                    for node_id in json.loads(task.node_ids):
                        node = CrawlNode.objects.get(id=node_id)
                        node_list.append('{}:{}'.format(node.node_ip, node.node_port))
                else:
                    node_list = eval(script_model.hosts)
                json_args = []
                if script_model.args:
                    json_args = eval(script_model.args)
                for json_arg in json_args:
                    script_args = json_arg["args"]
                    script_triggers = json_arg["trigger"]
                    fix_type = json_arg["fix_type"]
                    try:
                        if script_triggers:
                            # Backfill ("fix data") logic
                            if fix_type in (1, 2, 3):
                                run_date = json_arg['fix_date']
                                mix = "{}-{}".format(json.dumps(script_triggers), json.dumps(script_args))
                                job_id = "fix-{}-{}".format(str(script_model.id), md5(mix))
                                log_common.warn('adding backfill job: {}'.format(script_model.id))
                                # Immediate test run:
                                # schedule_fix_data(node_list, script_model.project_name, script_model.name, script_model.id, script_args, job_id, fix_type)
                                # Normal path
                                db_job_ids.append(job_id)
                                if datetime.datetime.strptime(run_date, '%Y-%m-%d %H:%M:%S') >= datetime.datetime.now() and job_id not in redis_job_ids:
                                    self.scheduler.add_job(schedule_fix_data, 'date', run_date=run_date, id=job_id,
                                                           args=[node_list, script_model.project_name, script_model.name,
                                                                 script_model.id, script_args, job_id, fix_type],
                                                           misfire_grace_time=60)
                            else:
                                # Dynamic arguments: one job per value returned by the SQL.
                                if json_arg.get('dynamic_value'):
                                    sql = json_arg.get('dynamic_value')
                                    result = db_kit.fetch_all_to_json(sql)
                                    for r in result:
                                        script_args['dynamic_value'] = r
                                        log_common.warn('>>>> scheduling dynamically split script {}, args: {}'.format(script_model.name, script_args))
                                        mix = "{}-{}".format(json.dumps(script_triggers), json.dumps(script_args))
                                        job_id = "{}-{}".format(str(script_model.id), md5(mix))
                                        log_common.warn("args#{}".format(job_id))
                                        crawl_redis.set("args#{}".format(job_id), json.dumps(script_args))
                                        # log_common.warn('adding scheduled job: {}'.format(script_model.id))
                                        db_job_ids.append(job_id)
                                        if job_id not in redis_job_ids:
                                            self.scheduler.add_job(work_func, trigger="cron", **script_triggers, id=job_id,
                                                                   args=[node_list, script_model.project_name, script_model.name, job_id],
                                                                   misfire_grace_time=60)
                                else:
                                    mix = "{}-{}".format(json.dumps(script_triggers), json.dumps(script_args))
                                    job_id = "{}-{}".format(str(script_model.id), md5(mix))
                                    crawl_redis.set("args#{}".format(job_id), json.dumps(script_args))
                                    log_common.warn('adding scheduled job: {}'.format(script_model.id))
                                    db_job_ids.append(job_id)
                                    if job_id not in redis_job_ids:
                                        self.scheduler.add_job(work_func, trigger="cron", **script_triggers, id=job_id,
                                                               args=[node_list, script_model.project_name, script_model.name, job_id],
                                                               misfire_grace_time=60)
                    except Exception as e:
                        log_common.warn(">>>> failed to add scheduled job: {}".format(e))
                        continue
            # Remove jobs that exist in the scheduler but no longer exist in the database.
            c_ids = [i for i in redis_job_ids if i not in db_job_ids]
            for c_id in c_ids:
                self.scheduler.remove_job(c_id)
                log_common.warn('removing stale job: {}'.format(c_id))
            db_job_ids.clear()
        except Exception as ex:
            log_common.warn(ex)
            continue
        finally:
            connections.close_all()
            time.sleep(7 * 60)