def edit_script_cfg(request):
    """
    Edit the configuration of a spider script.
    :param request: request object
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            spider_name = data['spider_name']
            script_name = data['script_name']
            apply_to_all = data['applyToAll']
            task_id = data['project_id']
            script_args = []
            for p in data.get('params', []):
                if isinstance(p['args'], str):
                    p['args'] = json.loads(p['args'])
                script_args.append(p)
                if p.get('trigger'):
                    result, message = scheduler_helper.verify_cron(p.get('trigger'))
                    if not result:
                        raise Exception('Invalid parameter: {}'.format(message))
            update_kwargs = {
                "trigger": data.get('trigger'),
                "hosts": data.get('hosts'),
                "args": json.dumps(script_args)
            }
            if apply_to_all:
                # Batch-update every script that belongs to the current task.
                crawl_scripts = CrawlScript.objects.filter(task_id=task_id)
                crawl_scripts.update(**update_kwargs)
            else:
                crawl_script = CrawlScript.objects.get(name=spider_name, task_id=task_id)
                crawl_script.trigger = data.get('trigger')
                crawl_script.hosts = data.get('hosts')
                crawl_script.args = json.dumps(script_args)
                crawl_script.save()
            if data.get('params'):
                args = data['params']
                # Store the run arguments of each spider script; runs from different
                # scheduling batches are distinguished by an md5 of the arguments.
                for arg in args:
                    if apply_to_all:
                        for script in crawl_scripts:
                            v_arg = encrypt_kit.md5(json.dumps(arg))
                            crawl_redis.set("args#{}#{}".format(script.name, v_arg), json.dumps(arg['args']))
                    else:
                        v_arg = encrypt_kit.md5(json.dumps(arg))
                        crawl_redis.set("args#{}#{}".format(spider_name, v_arg), json.dumps(arg['args']))
            r = Result.success("")
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
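
# For reference, a minimal illustration of the POST body edit_script_cfg
# expects, inferred from the fields read above; all values are hypothetical.
# Note that the "project_id" field actually carries a task id (it is assigned
# to task_id by the view):
#
# {
#     "spider_name": "demo_spider",
#     "script_name": "demo_spider.py",
#     "applyToAll": false,
#     "project_id": 3,
#     "trigger": "0 2 * * *",
#     "hosts": "10.0.0.1:6800",
#     "params": [{"args": "{\"page\": 1}", "trigger": "0 2 * * *"}]
# }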
def list_task_progress(request):
    """
    Progress of spider tasks.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            keyword = data.get('keyword')
            script_name = data.get('script_name')
            date = data.get('date')
            status = data.get('status')
            page = data.get('page', 1)
            size = data.get('size', 15)
            task_progress = CrawlScriptProgress.objects.filter(is_deleted=0).exclude(script_name='proxy')
            # Default to today when no date filter is supplied.
            condition_date = datetime.datetime.today().strftime('%Y-%m-%d') if not date else date
            stat_task_progress = task_progress.filter(start_time__gte='{} 00:00:00'.format(condition_date),
                                                      start_time__lte='{} 23:59:59'.format(condition_date))
            running_cnt = stat_task_progress.filter(status=1).count()
            success_cnt = stat_task_progress.filter(status=2).count()
            fail_cnt = stat_task_progress.filter(status=-1).count()
            if keyword:
                task_progress = task_progress.filter(task_name__icontains=keyword)
            if script_name:
                task_progress = task_progress.filter(script_name__icontains=script_name)
            if date:
                task_progress = task_progress.filter(start_time__gte='{} 00:00:00'.format(date),
                                                     start_time__lte='{} 23:59:59'.format(date))
            if status is not None:
                task_progress = task_progress.filter(status__in=status)
            task_progress = task_progress.order_by("-id")
            total = task_progress.count()
            pager = page_helper(total, page, size, task_progress,
                                {'fail_cnt': fail_cnt, 'running_cnt': running_cnt, 'success_cnt': success_cnt})
            convert_task_progress = []
            for result in pager.get('results'):
                result['run_time'] = time_kit.convert_ms(result.get('run_time'))
                result['script_id'] = CrawlScript.objects.get(task_name=result.get('task_name'),
                                                              name=result.get('script_name')).id
                convert_task_progress.append(result)
            pager['results'] = convert_task_progress
            r = Result.success(pager)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
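
# page_helper is not defined in this module; the loop above treats
# pager['results'] as a list of dicts, so a sketch consistent with that usage
# might look like this (hypothetical, not the project's actual implementation):
def page_helper(total, page, size, queryset, extra=None):
    """Return one page of rows as dicts, plus any extra counters."""
    start = (int(page) - 1) * int(size)
    pager = {
        'total': total,
        'page': page,
        'size': size,
        'results': list(queryset.values()[start:start + int(size)]),
    }
    if extra:
        pager.update(extra)  # e.g. running/success/fail counters
    return pager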
def project_create(request):
    """
    Create a spider project.
    :param request: request object
    :return: json
    """
    if request.method == 'POST':
        work_path = os.getcwd()
        try:
            data = json.loads(request.body.decode('utf-8'))
            # Validate the input before touching the database or the disk.
            if "name" not in data or "description" not in data:
                return JsonResponse(Result.fail(data="Please provide a project name and description"))
            project_name = data["name"]
            project_description = data['description']
            # If the project directory already exists, do not create it again.
            path = join(os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER)), project_name)
            if exists(path):
                return JsonResponse(Result.fail(data="Project already exists"))
            data['configurable'] = 1
            project, result = CrawlProject.objects.update_or_create(**data)
            os.mkdir(path)
            # Generate the spider project from the template.
            generate_project(project_name)
            task_id = data.get('task_id')
            CrawlTask.objects.filter(id=task_id).update(
                project_name=project_name,
                project_id=project.id,
                description=project_description)
            r = Result.success(data=model_to_dict(project))
            return JsonResponse(r)
        except Exception as e:
            r = Result.fail(e)
            return JsonResponse(r)
        finally:
            os.chdir(work_path)
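
# generate_project is assumed to materialize a new Scrapy project from a
# bundled template; a rough sketch under that assumption (the template
# location is hypothetical):
def generate_project(project_name):
    """Hypothetical helper: copy the project template into PROJECTS_FOLDER."""
    import shutil
    template_dir = join(os.path.dirname(__file__), 'templates', 'project')
    target_dir = join(os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER)), project_name)
    shutil.copytree(template_dir, target_dir, dirs_exist_ok=True)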
def project_list(request, node_id):
    """
    List the spider projects deployed on a given node.
    :param request: request object
    :param node_id: node_id
    :return: json
    """
    if request.method == 'GET':
        client = CrawlNode.objects.get(id=node_id)
        engine = get_engine(client)
        try:
            projects = engine.list_projects()
            return JsonResponse(Result.success(data=projects))
        except ConnectionError:
            return JsonResponse(Result.fail())
def get_hosts(request):
    """
    Get hosts by script id.
    :param request:
    :return:
    """
    try:
        if request.method == 'GET':
            script_id = request.GET.get('script_id')
            hosts = get_hosts_by_script_id(script_id)
            r = Result.success(hosts)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
def create_proxy_ip(request):
    """
    Create a proxy ip.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            CrawlProxyIP.objects.create(**data)
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
def script_remove(request):
    """
    Delete a script.
    :param request:
    :return:
    """
    try:
        if request.method == 'GET':
            script_id = request.GET['id']
            CrawlScript.objects.get(id=script_id).delete()
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
def get_proxy_ip(request, proxy_ip_id):
    """
    Get a single proxy ip.
    :param request:
    :param proxy_ip_id:
    :return:
    """
    try:
        if request.method == 'GET':
            proxy_ip = CrawlProxyIP.objects.get(id=proxy_ip_id)
            r = Result.success(model_to_dict(proxy_ip))
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
def task_deploy(request, project_name):
    """
    Deploy the packaged project egg to every node of its task.
    :param request: request object
    :param project_name: project name
    :return: json
    """
    work_path = os.getcwd()
    try:
        log_common.info('Entering deploy')
        if request.method == 'GET':
            log_common.info('Starting deploy logic')
            path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
            project_path = join(path, project_name)
            # Locate the packaged egg file.
            egg = find_egg(project_path)
            if not egg:
                raise Exception('No packaged egg file')
            with open(join(project_path, egg), 'rb') as egg_file:
                egg_file_content = egg_file.read()
            project = CrawlProject.objects.get(name=project_name, is_deleted=0)
            task = CrawlTask.objects.get(id=project.task_id)
            task.is_deploy = 1
            task.save()
            for node_id in json.loads(task.node_ids):
                node = CrawlNode.objects.get(id=node_id)
                engine = get_engine(node)
                log_common.info('{}: deploying {}'.format(node.node_ip, project_name))
                engine.add_version(project_name, int(time.time()), egg_file_content)
                log_common.info('{}: deployed {}'.format(node.node_ip, project_name))
                # Update deploy info: retire old records, then write the new one.
                deployed_at = timezone.now()
                CrawlDeploy.objects.filter(node_id=node.id, project_id=project.id).update(is_deleted=1)
                deploy, result = CrawlDeploy.objects.update_or_create(
                    node_id=node.id,
                    project_id=project.id,
                    deployed_at=deployed_at,
                    description=project.description)
            r = Result.success("")
            return JsonResponse(r)
    except Exception as e:
        import traceback
        log_common.error("task_deploy => {}".format(e))
        log_common.error("task_deploy => {}".format(traceback.format_exc()))
        r = Result.fail(e)
        return JsonResponse(r)
    finally:
        os.chdir(work_path)
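
# find_egg is assumed to return the file name of the first .egg in the project
# directory, or None when the project has not been packaged; a minimal sketch
# under that assumption:
def find_egg(directory):
    """Hypothetical helper: locate a packaged .egg file in directory."""
    for name in os.listdir(directory):
        if name.endswith('.egg'):
            return name
    return None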
def script_enable(request):
    """
    Enable scripts.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            ids = data.get('ids')
            control_script(ids, 0)
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
def node_create(request):
    """
    Create a crawler node.
    :param request: request object
    :return: json
    """
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        cn = CrawlNode.objects.filter(node_ip=data["node_ip"]).last()
        if not cn:
            node = CrawlNode.objects.create(**data)
            r = Result.success(model_to_dict(node))
            return JsonResponse(r)
        else:
            # Refresh the heartbeat time here to mark the node as alive.
            return JsonResponse(Result.fail('Node already exists'))
def remove_proxy_ip(request, proxy_ip_id):
    """
    Delete a proxy ip (soft delete).
    :param request:
    :param proxy_ip_id:
    :return:
    """
    try:
        if request.method == 'GET':
            proxy_ip = CrawlProxyIP.objects.get(id=proxy_ip_id)
            proxy_ip.is_deleted = 1
            proxy_ip.save()
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
def create_user(request):
    """
    Create a user.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            username = data.get('username')
            if CrawlUser.objects.filter(username=username, is_deleted=0):
                raise Exception('Username already exists')
            account = data.get('account')
            mobile = data.get('mobile')
            wx_account = data.get('wx_account')
            role_ids = data.get('role_ids')
            alert_options = data.get('alert_options')
            comment = data.get('comment')
            alert_enable = data.get('alert_enable', 0)
            password = random_password(6)
            user = CrawlUser.objects.create(
                account=account,
                username=username,
                mobile=mobile,
                comment=comment,
                wx_account=wx_account,
                password=password2md5(password),
                alert_enable=alert_enable)
            user_id = user.id
            for role_id in role_ids:
                CrawlUserRoleRel.objects.create(user_id=user_id, role_id=role_id)
            # Cache the permission tree in redis.
            user_roles = CrawlUserRoleRel.objects.filter(user_id=user_id)
            crawl_redis.set('permission#user#{}'.format(user_id), build_permission_tree(user_roles))
            response = {'username': username, 'password': password}
            r = Result.success(response)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
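
# random_password and password2md5 are not shown in this module; minimal
# sketches of what they are assumed to do (alphanumeric password generation
# and md5 hashing), not the project's actual implementations:
import hashlib
import random
import string


def random_password(length):
    """Return a random alphanumeric password of the given length."""
    return ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(length))


def password2md5(password):
    """Return the md5 hex digest used to store a plain-text password."""
    return hashlib.md5(password.encode('utf-8')).hexdigest()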
def update_proxy_ip(request, proxy_ip_id):
    """
    Edit a proxy ip.
    :param proxy_ip_id:
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            CrawlProxyIP.objects.filter(id=proxy_ip_id).update(
                source=data.get('source'),
                ip=data.get('ip'),
                port=data.get('port'),
                ip_type=data.get('ip_type'))
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
def project_tree(request, project_name):
    """
    Get the file tree of a spider project.
    :param request: request object
    :param project_name: project name
    :return: json of tree
    """
    work_cwd = os.getcwd()
    try:
        if request.method == 'GET':
            path = os.path.abspath(join(work_cwd, PROJECTS_FOLDER))
            tree = get_tree(join(path, project_name))
            r = Result.success(tree)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
    finally:
        os.chdir(work_cwd)
def project_index(request):
    """
    Project list (auto-discovers spider projects under the projects folder;
    unused in this version).
    :param request: request object
    :return: json
    """
    work_path = os.getcwd()
    try:
        if request.method == 'GET':
            path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
            files = os.listdir(path)
            project_lists = []
            for file in files:
                if os.path.isdir(join(path, file)) and file not in IGNORES:
                    project_lists.append({'name': file})
            return JsonResponse(Result.success(project_lists))
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
    finally:
        os.chdir(work_path)
def create_role(request):
    """
    Create a role.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            role_name = data.get('role_name')
            if CrawlRole.objects.filter(role_name=role_name):
                raise Exception('Role name already exists')
            role = CrawlRole.objects.create(role_name=role_name)
            permission_ids = data.get('permission_ids')
            for permission_id in permission_ids:
                CrawlRolePermission.objects.create(role_id=role.id, permission_id=permission_id)
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
def reset_pwd(request, user_id):
    """
    Reset a user's password.
    :param user_id:
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            new_pwd = data.get('new_pwd')
            confirm_pwd = data.get('confirm_pwd')
            if new_pwd != confirm_pwd:
                raise Exception('The two passwords do not match')
            user = CrawlUser.objects.get(id=user_id)
            user.password = password2md5(new_pwd)
            user.save()
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
def project_remove(request, project_name):
    """
    Remove a spider project from the database and from disk.
    :param request: request object
    :param project_name: project name
    :return: result of remove
    """
    if request.method == 'POST':
        work_path = os.getcwd()
        try:
            result = CrawlProject.objects.filter(name=project_name).delete()
            path = join(os.path.abspath(os.getcwd()), PROJECTS_FOLDER)
            project_path = join(path, project_name)
            if exists(project_path):
                rmtree(project_path)
            return JsonResponse({'result': result})
        except Exception as e:
            r = Result.fail(e)
            return JsonResponse(r)
        finally:
            os.chdir(work_path)
def job_cancel_all(request):
    """
    Stop a script's jobs on every node it runs on.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            for d in data:
                script_id = d.get('id')
                script = CrawlScript.objects.get(id=script_id)
                project_name = script.project_name
                nodes = get_hosts_by_script_id(script.id)
                for node in nodes:
                    engine = get_engine_by_ip(node)
                    host = 'http://{}'.format(node)
                    url = '{}/listjobs.json?project={}'.format(host, project_name)
                    jobs = requests.get(url).json()
                    # Keep only the jobs that belong to this spider.
                    running_task = list(filter(lambda x: x.get('spider') == script.name, jobs.get('running')))
                    pending_task = list(filter(lambda x: x.get('spider') == script.name, jobs.get('pending')))
                    running_id_list = list(map(lambda x: x.get('id'), running_task))
                    pending_id_list = list(map(lambda x: x.get('id'), pending_task))
                    stop_id_list = running_id_list + pending_id_list
                    for stop_id in stop_id_list:
                        result = engine.cancel(project_name, stop_id)
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
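
# For reference, Scrapyd's listjobs.json responds with a shape like the
# following, which is why the code above filters the "running" and "pending"
# lists by spider name before cancelling (values are illustrative):
#
# {
#     "status": "ok",
#     "pending": [{"id": "78391cc0fcaf11e1b0090800272a6d06", "spider": "spider1"}],
#     "running": [{"id": "422e608f9f28cef127b3d5ef93fe9399", "spider": "spider2",
#                  "start_time": "2012-09-12 10:14:03.594664"}],
#     "finished": [{"id": "2f16646cfcaf11e1b0090800272a6d06", "spider": "spider3",
#                   "start_time": "2012-09-12 10:14:03.594664",
#                   "end_time": "2012-09-12 10:24:03.594664"}]
# }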
def node_spider_list(request):
    """
    Distribution of spider scripts across the nodes of a project.
    :param request:
    :return:
    """
    if request.method == 'GET':
        task_id = request.GET.get("project_id")
        project_id = CrawlTask.objects.get(id=task_id).project_id
        deploys = CrawlDeploy.objects.filter(project_id=project_id)
        node_spiders = []
        for deploy in deploys:
            node_spider = []
            node = CrawlNode.objects.get(id=deploy.node_id)
            engine = get_engine(node)
            try:
                # The scripts could also be fetched live via engine.list_spiders()
                # and written back to the CrawlScript table; whether to do that in
                # the background depends on performance requirements.
                scripts = CrawlScript.objects.filter(project_id=project_id)
                node_spider.append({"node": node, "scripts": scripts})
                node_spiders.append(node_spider)
            except ConnectionError:
                return JsonResponse(Result.fail("Spider node {} is unavailable".format(node.node_name)))
        r = Result.success(node_spiders)
        return JsonResponse(r)
def project_file_read(request):
    """
    Read a spider project file.
    :param request: request object
    :return: file content
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            path = join(data['path'], data['label'])
            project_id = data.get('project_id')
            project_name = data.get("project_name")
            project_base_path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
            project_path = join(project_base_path, project_name)
            label = data.get('label')
            is_spider = 0
            _spider_name = ""
            # Decide whether the requested file defines a spider.
            project_spiders = detect_project_spiders(project_path)
            for spider_name, spider_path in project_spiders.items():
                spider_real_path = spider_path.replace('.', '/', spider_path.count('.') - 1)
                if path.endswith(spider_real_path):
                    is_spider = 1
                    _spider_name = spider_name
                    break
            script = CrawlScript.objects.filter(project_id=project_id, name=_spider_name)
            if not script and _spider_name:
                crawl_task = CrawlTask.objects.get(project_id=project_id)
                script_data = {
                    "name": _spider_name,
                    "project_id": project_id,
                    "task_id": crawl_task.id,
                    "task_name": crawl_task.task_name,
                    "project_name": crawl_task.project_name,
                    "type": 0,
                    "script_file": label,
                    "path": data['path']
                }
                CrawlScript.objects.create(**script_data)
            elif is_spider == 1 and not script[0].path:
                script[0].path = data['path']
                script[0].save()
            with open(path, 'rb') as f:
                content = f.read().decode('utf-8')
            if script:
                vo = {
                    'content': content,
                    'name': data['label'],
                    'is_spider': is_spider,
                    'trigger': script[0].trigger,
                    'hosts': script[0].hosts,
                    'params': script[0].args,
                    'spider_name': _spider_name,
                    'use_proxy': script[0].use_proxy,
                }
            else:
                vo = {
                    'content': content,
                    'name': data['label'],
                    'is_spider': is_spider,
                    'trigger': '',
                    'hosts': '',
                    'params': '',
                    'spider_name': _spider_name,
                    'use_proxy': '',
                }
            r = Result.success(vo)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
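
# detect_project_spiders is assumed to map each spider's `name` attribute to a
# dotted path such as "project.spiders.demo.py", which is what the suffix
# matching above relies on; a rough sketch under that assumption:
import ast


def detect_project_spiders(project_path):
    """Hypothetical helper: {spider_name: dotted path of the defining file}."""
    spiders = {}
    for root, _, files in os.walk(project_path):
        for file_name in files:
            if not file_name.endswith('.py'):
                continue
            file_path = os.path.join(root, file_name)
            with open(file_path, encoding='utf-8') as f:
                try:
                    tree = ast.parse(f.read())
                except SyntaxError:
                    continue
            for node in ast.walk(tree):
                if not isinstance(node, ast.ClassDef):
                    continue
                for stmt in node.body:
                    # A spider class carries a class-level `name = "..."` attribute.
                    if (isinstance(stmt, ast.Assign)
                            and any(getattr(t, 'id', None) == 'name' for t in stmt.targets)
                            and isinstance(stmt.value, ast.Constant)
                            and isinstance(stmt.value.value, str)):
                        dotted = os.path.relpath(file_path, os.path.dirname(project_path)).replace(os.sep, '.')
                        spiders[stmt.value.value] = dotted
    return spiders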
def collect_script_progress(request):
    """
    Receive and persist execution progress reported by running scripts.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            script_name = data["script_name"]
            batch = data["batch"]
            script_progress = CrawlScriptProgress.objects.filter(script_name=script_name, batch=batch)
            arg_key = data.get('arg_key')
            if arg_key:
                data['arg'] = bytes.decode(crawl_redis.get('args#{}'.format(arg_key)))
            log_common.error('script_name: {}, batch_id: {}'.format(script_name, batch))
            if script_progress:
                log_common.error('update progress script_name: {}, batch_id: {}'.format(script_name, batch))
                sp = script_progress[0]
                data["task_name"] = sp.task_name
                data["id"] = sp.id
                # Keep the previous error message if the new failure report carries none.
                if data['status'] == -1 and not data.get('msg') and sp.msg:
                    data['msg'] = sp.msg
                result = script_progress.update(**data)
                if data['status'] == -1:
                    user_alert_rel = CrawlUserAlertRel.objects.filter(alert_id=12, is_deleted=0)
                    user_ids = list(map(lambda x: str(x.user_id), user_alert_rel))
                    to_user = '******'.join(user_ids)
                    wx_tools.env_send_card_message(to_user, 'Spider exception', 'Spider: {} raised an exception'.format(script_name))
            else:
                try:
                    log_common.error('new progress script_name: {}, batch_id: {}'.format(script_name, batch))
                    css = CrawlScript.objects.filter(name=script_name, is_deleted=0)
                    if css:
                        cs = css[0]
                        data["task_name"] = cs.task_name
                        result = CrawlScriptProgress.objects.create(**data)
                    else:
                        log_common.warn("no task found for {}!".format(script_name))
                except IntegrityError:
                    log_common.error('caught IntegrityError')
                    # Handle the race where a script reports twice concurrently.
                    script_progress = CrawlScriptProgress.objects.filter(script_name=script_name, batch=batch)
                    sp = script_progress[0]
                    data["task_name"] = sp.task_name
                    data["id"] = sp.id
                    result = script_progress.update(**data)
                    if data['status'] == -1:
                        user_alert_rel = CrawlUserAlertRel.objects.filter(alert_id=12, is_deleted=0)
                        user_ids = list(map(lambda x: str(x.user_id), user_alert_rel))
                        to_user = '******'.join(user_ids)
                        wx_tools.env_send_card_message(to_user, 'Spider exception', 'Spider: {} raised an exception'.format(script_name))
            r = Result.success({})
            return JsonResponse(r)
    except Exception as e:
        import traceback
        log_common.error('v3v3: progress report failed, error = {}'.format(traceback.format_exc()))
        r = Result.fail(e)
        return JsonResponse(r)
def script_start(request):
    """
    Start scripts.
    :param request:
    :return:
    """
    try:
        if request.method == 'POST':
            data_scripts = json.loads(request.body.decode('utf-8'))
            if not data_scripts:
                return JsonResponse(Result.fail("No script specified"))
            for data_script in data_scripts:
                _job_id = ''
                crawl_script = CrawlScript.objects.get(id=data_script['id'])
                host_list = get_hosts_by_script_id(crawl_script.id)
                for host in host_list:
                    engine = get_engine_by_ip(host)
                    redis_conf = '{{"host":"{}","port": {},"db":1,"password":"******"}}'.format(
                        db_conf.redis_host, str(db_conf.redis_port), db_conf.redis_pwd)
                    if data_script.get("args"):
                        for arg in data_script["args"]:
                            if isinstance(arg, str):
                                arg = json.loads(arg)
                            if 'dynamic_value' in arg:
                                # Fan out one schedule per row returned by the dynamic SQL.
                                sql = arg.get('dynamic_value')
                                result = db_kit.fetch_all_to_json(sql)
                                for row in result:
                                    arg['dynamic_value'] = row
                                    batch_id = encrypt_kit.md5(json.dumps(arg))
                                    args = {
                                        "redis": redis_conf,
                                        "batch_id": batch_id,
                                        "node": host,
                                        "args": arg
                                    }
                                    log_common.warn('>>>> dynamic split script start {}'.format(json.dumps(args)))
                                    _job_id = engine_kit.schedule(engine, crawl_script.project_name, crawl_script.name, **args)
                                    crawl_redis.set("args#{}".format(batch_id), json.dumps(arg))
                            else:
                                batch_id = encrypt_kit.md5(json.dumps(arg))
                                args = {
                                    "redis": redis_conf,
                                    "batch_id": batch_id,
                                    "node": host,
                                    "args": arg
                                }
                                _job_id = engine_kit.schedule(engine, crawl_script.project_name, crawl_script.name, **args)
                                crawl_redis.set("args#{}".format(batch_id), json.dumps(arg))
                    else:
                        # No explicit args: derive the batch id from the start time.
                        ta = time.strftime('%Y-%m-%d %H:%M:%S')
                        batch_id = encrypt_kit.md5(ta)
                        args = {
                            "redis": redis_conf,
                            "batch_id": batch_id,
                            "node": host,
                            "args": '{}'
                        }
                        _job_id = engine_kit.schedule(engine, crawl_script.project_name, crawl_script.name, **args)
                        crawl_redis.set("args#{}".format(batch_id), json.dumps('{}'))
                crawl_script.job_id = _job_id
                crawl_script.save()
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as err:
        r = Result.fail(err)
        return JsonResponse(r)
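
# engine_kit.schedule is assumed to be a thin wrapper around the Scrapyd
# schedule endpoint that serializes non-string kwargs and returns the new job
# id; a minimal sketch under that assumption:
def schedule(engine, project_name, spider_name, **kwargs):
    """Hypothetical wrapper: schedule spider_name in project_name on engine."""
    payload = {k: (v if isinstance(v, str) else json.dumps(v)) for k, v in kwargs.items()}
    return engine.schedule(project_name, spider_name, **payload)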