def list_proxy_ip(request):
    """
    List all proxy IPs, filtered and paginated.

    :param request: POST with JSON body {keyword, page, size, ip_type, status}
    :return: JsonResponse with a page of CrawlProxyIP rows
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            keyword = data.get('keyword')
            page = data.get('page', 1)
            size = data.get('size', 15)
            ip_type = data.get('ip_type')
            # FIX: status was read only from request.GET while every other
            # filter comes from the POST body; prefer the body, but fall back
            # to the query string so existing callers keep working.
            status = data.get('status', request.GET.get('status'))
            proxy_ips = CrawlProxyIP.objects.filter(is_deleted=0)
            if keyword is not None:
                proxy_ips = proxy_ips.filter(ip__icontains=keyword)
            if ip_type is not None:
                proxy_ips = proxy_ips.filter(ip_type=ip_type)
            if status is not None:
                proxy_ips = proxy_ips.filter(status=status)
            total = proxy_ips.count()
            r = Result.success(page_helper(total, page, size, proxy_ips))
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
def process_request(self, request):
    """
    Authentication middleware hook.

    Skips checks in DEBUG mode and for white-listed paths; otherwise
    validates the JWT cookie, checks resource permission, and attaches
    user_id / user_name to the request. Returns a 403 JsonResponse on
    any failure, None to let the request proceed.
    """
    try:
        # bypass auth entirely in DEBUG mode
        if DEBUG:
            return
        # white-listed paths (e.g. login) need no token
        if request.path in self.white_list:
            return
        dt_token = request.COOKIES.get('dt_token')
        dt_user_id = request.COOKIES.get('dt_user_id')
        dt_username = request.COOKIES.get('dt_username')
        if not dt_token:
            return JsonResponse(Result.fail("缺少token"), status=403)
        res = jwt_tools.decode_token(dt_token)
        if not jwt_tools.verify(res):
            return JsonResponse(Result.fail("非法token"), status=403)
        # permission check against the requested path
        if not self.filter_auth(dt_user_id, dt_username, request.path):
            r = Result.fail("无权限访问该资源")
            return JsonResponse(r, status=403)
        # expose identity to downstream views
        request.user_id = dt_user_id
        request.user_name = dt_username
    except ExpiredSignatureError as e:
        # token past its expiry
        r = Result.fail("登录过期")
        return JsonResponse(r, status=403)
    except Exception as e:
        # any other decode/verify failure is treated as an illegal login
        r = Result.fail("非法登录")
        return JsonResponse(r, status=403)
def login(request):
    """
    Log a user in and issue JWT identity cookies (30-day lifetime).

    :param request: POST with JSON body {username, password}
    :return: JsonResponse; sets dt_token / dt_user_id / dt_username cookies
    """
    try:
        domain = settings.SESSION_COOKIE_DOMAIN
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            username = data.get('username').strip()
            password = data.get('password').strip()
            # FIX: objects.get() raises DoesNotExist instead of returning
            # None, so the original `if not user` branch was unreachable and
            # the raw ORM exception leaked to the client. Use .first().
            user = CrawlUser.objects.filter(username=username).first()
            if not user or password2md5(password) != user.password:
                raise Exception('用户名或密码不正确')
            token = jwt_tools.encode_token(user.id, user.username)
            r = Result.success(None)
            response = JsonResponse(r)
            max_age = 60 * 60 * 24 * 30  # 30 days
            response.set_cookie('dt_token', bytes.decode(token), domain=domain, max_age=max_age)
            response.set_cookie('dt_user_id', user.id, domain=domain, max_age=max_age)
            response.set_cookie('dt_username', user.username, domain=domain, max_age=max_age)
            return response
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
def reset_profile_pwd(request, user_id):
    """
    Reset the current user's own password.

    :param request: POST with JSON body {old_pwd, new_pwd, confirm_pwd}
    :param user_id: id of the user whose password is reset
    :return: JsonResponse
    """
    try:
        if request.method == 'POST':
            payload = json.loads(request.body.decode('utf-8'))
            old_pwd = payload.get('old_pwd')
            new_pwd = payload.get('new_pwd')
            confirm_pwd = payload.get('confirm_pwd')
            user = CrawlUser.objects.get(id=user_id)
            # both entries of the new password must match
            if confirm_pwd != new_pwd:
                raise Exception('两次密码输入不一致')
            # old password must match the stored hash
            if user.password != password2md5(old_pwd):
                raise Exception('密码不正确')
            user.password = password2md5(new_pwd)
            user.save()
            return JsonResponse(Result.success(None))
    except Exception as e:
        return JsonResponse(Result.fail(e))
def project_deploy(request, project_name):
    """
    Deploy (publish) a packaged crawler project to selected nodes.

    :param request: POST with JSON body {node_ids: [...]}
    :param project_name: project name
    :return: JsonResponse with deploy result
    """
    if request.method == 'POST':
        path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
        project_path = join(path, project_name)
        # locate the packaged .egg file
        egg = find_egg(project_path)
        if not egg:
            r = Result.success("没有打包文件")
            return JsonResponse(r)
        # FIX: the original called egg_file.read() inside the node loop, so
        # every node after the first received an empty payload (file pointer
        # already at EOF), and the handle was never closed. Read once, with
        # a context manager.
        with open(join(project_path, egg), 'rb') as egg_file:
            egg_data = egg_file.read()
        data = json.loads(request.body.decode('utf-8'))
        node_ids = data["node_ids"]
        nodes = CrawlNode.objects.filter(id__in=node_ids)
        project = CrawlProject.objects.get(name=project_name)
        for node in nodes:
            engine = get_engine(node)
            engine.add_version(project_name, int(time.time()), egg_data)
            deployed_at = timezone.now()
            # NOTE(review): original comment claimed "logical delete" but
            # .delete() removes rows physically (task_deploy uses
            # update(is_deleted=1)) — confirm which is intended.
            CrawlDeploy.objects.filter(
                node_id=node.id, project_id=project.id).delete()
            deploy, result = CrawlDeploy.objects.update_or_create(
                node_id=node.id, project_id=project.id,
                deployed_at=deployed_at, description=project.description)
        r = Result.success("")
        return JsonResponse(r)
def project_version(request, client_id, project_name):
    """
    Return the deploy version info of a project on one client node.

    If a deploy record exists in the db it is returned; otherwise the
    engine is queried and a record is created.

    :param request: GET request
    :param client_id: client node id
    :param project_name: project name
    :return: JsonResponse of the deploy record
    """
    if request.method == 'GET':
        client = CrawlNode.objects.get(id=client_id)
        project = CrawlProject.objects.get(name=project_name)
        engine = get_engine(client)
        existing = CrawlDeploy.objects.filter(client=client, project=project)
        if existing:
            # deploy info already recorded
            deploy = CrawlDeploy.objects.get(client=client, project=project)
        else:
            # no record yet: ask the engine and persist one
            try:
                versions = engine.list_versions(project_name)
            except ConnectionError:
                return JsonResponse({'message': 'Connect Error'}, status=500)
            if versions:
                deployed_at = timezone.datetime.fromtimestamp(
                    int(versions[-1]), tz=pytz.timezone(TIME_ZONE))
            else:
                deployed_at = None
            deploy, result = CrawlDeploy.objects.update_or_create(
                client=client, project=project, deployed_at=deployed_at)
        return JsonResponse(model_to_dict(deploy))
def script_stop(request):
    """
    Stop (cancel) running crawler scripts.

    NOTE: the original docstring said "start script"; this view cancels
    jobs on every host the script runs on.

    :param request: POST with a JSON list of {id: script_id} objects
    :return: JsonResponse
    """
    try:
        if request.method == 'POST':
            data_scripts = json.loads(request.body.decode('utf-8'))
            if not data_scripts:
                return JsonResponse(Result.fail("没有指定脚本"))
            for data_script in data_scripts:
                crawl_script = CrawlScript.objects.get(id=data_script["id"])
                host_list = get_hosts_by_script_id(crawl_script.id)
                for host in host_list:
                    engine = get_engine_by_ip(host)
                    # removed the unused `args` dict from the original: it
                    # embedded a malformed redis config string and was never
                    # passed anywhere
                    engine.cancel(crawl_script.project_name, crawl_script.name)
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as err:
        r = Result.fail(err)
        return JsonResponse(r)
def script_newest_log(request):
    """
    Fetch the latest log of a script from the host that ran it.

    :param request: POST with JSON body {script_id, host_ip}
    :return: JsonResponse with the log text, or a "no log" message
    """
    try:
        if request.method == 'POST':
            payload = json.loads(request.body.decode('utf-8'))
            host_ip = payload.get('host_ip')
            script = CrawlScript.objects.get(id=payload.get('script_id'))
            job_id = script.job_id
            # no job has ever run for this script
            if not job_id:
                return JsonResponse(Result.success('暂无日志'))
            url = 'http://{}/logs/{}/{}/{}.log'.format(
                host_ip, script.project_name, script.name, job_id)
            response = requests.get(url)
            if response.status_code != 200:
                return JsonResponse(Result.success('暂无日志'))
            log_content = response.content.decode('utf-8')
            return JsonResponse(Result.success({'message': log_content}))
    except Exception as e:
        return JsonResponse(Result.fail(e))
def task_by_script_id(request, script_id):
    """
    Return the task that owns a script, augmented with project path info.

    :param request: GET request
    :param script_id: script id
    :return: JsonResponse with the task dict plus 'path' and 'script_name'
    """
    work_path = os.getcwd()
    try:
        if request.method == 'GET':
            script = CrawlScript.objects.get(id=script_id)
            project = CrawlProject.objects.get(id=script.project_id)
            task = CrawlTask.objects.get(id=project.task_id)
            path = os.path.abspath(join(work_path, PROJECTS_FOLDER))
            vo = model_to_dict(task)
            # removed unused local `script_name`; plain item assignment
            # instead of __setitem__
            vo['path'] = path
            vo['script_name'] = script.script_file
            r = Result.success(vo)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
    finally:
        os.chdir(work_path)
def find_debug_log(request):
    """
    Read a spider's debug-run log starting at current_line.

    :param request: GET with query params project_name, spider_name,
        current_line (1-based)
    :return: JsonResponse with [{current_line, data}, ...]
    """
    work_path = os.getcwd()
    try:
        if request.method == 'GET':
            project_name = request.GET.get('project_name')
            spider_name = request.GET.get('spider_name')
            current_line = int(request.GET.get('current_line'))
            project_path = join(PROJECTS_FOLDER, project_name)
            os.chdir(project_path)
            if not os.path.exists("debug_folder"):
                r = Result.success(data='')
                return JsonResponse(r)
            # FIX: use a context manager so the handle is closed even if
            # readlines() raises (the original leaked it on error)
            with open('./debug_folder/logs/{}.log'.format(spider_name), 'r',
                      encoding='utf-8') as input_file:
                lines = input_file.readlines()
            response = []
            for line in lines[(current_line - 1):]:
                response.append({'current_line': current_line, 'data': line})
                current_line += 1
            r = Result.success(response)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
    finally:
        # always restore the working directory changed above
        os.chdir(work_path)
def find_debug_result(request):
    """
    Read a spider's debug-run item output (JSON file) as raw text.

    :param request: GET with query params project_name, spider_name
    :return: JsonResponse with {'content': <file text>}
    """
    work_path = os.getcwd()
    try:
        if request.method == 'GET':
            project_name = request.GET.get('project_name')
            spider_name = request.GET.get('spider_name')
            project_path = join(PROJECTS_FOLDER, project_name)
            os.chdir(project_path)
            if not os.path.exists("debug_folder"):
                r = Result.success(data='')
                return JsonResponse(r)
            # FIX: context manager closes the handle (original leaked it);
            # utf-8 matches how the debug log reader opens these files
            with open('./debug_folder/items/{}.json'.format(spider_name),
                      encoding='utf-8') as input_file:
                all_text = input_file.read()
            r = Result.success({'content': all_text})
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
    finally:
        # always restore the working directory changed above
        os.chdir(work_path)
def project_build(request, project_name):
    """
    Build and package a crawler project into an egg, recording the result.

    :param request: POST with JSON body {description}
    :param project_name: project name
    :return: JsonResponse with the CrawlProject dict
    """
    path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
    project_path = join(path, project_name)
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        description = data['description']
        # auto_login is the only project whose data files are bundled
        build_project(project_name,
                      include_data=project_name == 'auto_login')
        egg = find_egg(project_path)
        if not egg:
            return JsonResponse(Result.fail("编译打包失败"))
        built_at = timezone.now()
        if not CrawlProject.objects.filter(name=project_name):
            # FIX: the original saved and then re-queried by name without
            # the is_deleted=0 filter used in the other branch; keep the
            # created instance directly.
            model = CrawlProject(name=project_name, description=description,
                                 built_at=built_at, egg=egg)
            model.save()
        else:
            model = CrawlProject.objects.get(name=project_name, is_deleted=0)
            model.built_at = built_at
            model.egg = egg
            model.description = description
            model.save()
        data = model_to_dict(model)
        r = Result.success(data)
        return JsonResponse(r)
def index_status(request):
    """
    Aggregate node reachability and project counts for the dashboard.

    :param request: GET request
    :return: JsonResponse {'success': n, 'error': n, 'project': n}
    """
    if request.method == 'GET':
        work_path = os.getcwd()
        try:
            data = {'success': 0, 'error': 0, 'project': 0}
            # probe each registered node with a short timeout
            for client in CrawlNode.objects.all():
                try:
                    requests.get(engine_url(client.ip, client.port), timeout=1)
                    data['success'] += 1
                except ConnectionError:
                    data['error'] += 1
            # count project directories, skipping ignored entries
            path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
            for entry in os.listdir(path):
                if os.path.isdir(join(path, entry)) and entry not in IGNORES:
                    data['project'] += 1
            return JsonResponse(data)
        except Exception as e:
            return JsonResponse(Result.fail(e))
        finally:
            os.chdir(work_path)
def edit_script_cfg(request):
    """
    Edit a crawler script's configuration (trigger, hosts, run args).

    When applyToAll is set, the update is applied to every script of the
    task; otherwise to the single named script. Run arguments are also
    stored in redis, keyed by the md5 of each argument set.

    :param request: POST with JSON body {spider_name, script_name,
        applyToAll, project_id, params, trigger, hosts}
    :return: JsonResponse
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            spider_name = data['spider_name']
            script_name = data['script_name']
            apply_to_all = data['applyToAll']
            # NOTE(review): the task id is carried in the 'project_id'
            # field of the payload — confirm against the frontend
            task_id = data['project_id']
            script_args = []
            for p in data.get('params'):
                # params may arrive with args as a JSON string; normalize
                if isinstance(p['args'], str):
                    p['args'] = json.loads(p['args'])
                script_args.append(p)
                # validate any per-param cron trigger before persisting
                if p.get('trigger'):
                    result, message = scheduler_helper.verify_cron(p.get('trigger'))
                    if not result:
                        raise Exception('参数错误: {}'.format(message))
            update_kwargs = {
                "trigger": data.get('trigger'),
                "hosts": data.get('hosts'),
                "args": json.dumps(script_args)}
            # batch-apply to every script of the current task
            if apply_to_all:
                crawl_scripts = CrawlScript.objects.filter(task_id=task_id)
                crawl_scripts.update(**update_kwargs)
            else:
                crawl_scripts = CrawlScript.objects.get(name=spider_name, task_id=task_id)
                crawl_scripts.trigger = data.get('trigger')
                crawl_scripts.hosts = data.get('hosts')
                crawl_scripts.args = json.dumps(script_args)
                crawl_scripts.save()
            if 'params' in data and data['params']:
                args = data['params']
                # store each script's run args in redis; different schedule
                # batches are distinguished by the md5 of the arg set
                for arg in args:
                    if apply_to_all:
                        for script in crawl_scripts:
                            v_arg = encrypt_kit.md5(json.dumps(arg))
                            crawl_redis.set("args#{}#{}".format(script.name, v_arg), json.dumps(arg['args']))
                    else:
                        v_arg = encrypt_kit.md5(json.dumps(arg))
                        crawl_redis.set("args#{}#{}".format(spider_name, v_arg), json.dumps(arg['args']))
            r = Result.success("")
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
def del_project(request, client_id, project):
    """
    Delete a project from one engine node.

    :param request: GET request
    :param client_id: node id
    :param project: project name
    :return: JsonResponse with the engine result or a connect error
    """
    if request.method == 'GET':
        node = CrawlNode.objects.get(id=client_id)
        try:
            result = get_engine(node).delete_project(project=project)
            return JsonResponse(result)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'})
def list_task_progress(request):
    """
    Paginated crawler task progress listing with per-day status counts.

    The running/success/fail counters are computed over the requested
    date (defaulting to today) BEFORE keyword/script filters are applied,
    so they reflect the whole day, not the filtered page.

    :param request: POST with JSON body {keyword, script_name, date,
        status, page, size}
    :return: JsonResponse with a pager plus fail_cnt/running_cnt/success_cnt
    """
    try:
        if request.method == 'POST':
            data = json.loads(request.body.decode('utf-8'))
            keyword = data.get('keyword')
            script_name = data.get('script_name')
            date = data.get('date')
            status = data.get('status')
            page = data.get('page', 1)
            size = data.get('size', 15)
            # 'proxy' is excluded from progress listings
            task_progress = CrawlScriptProgress.objects.filter(is_deleted=0).exclude(script_name='proxy')
            # stats date defaults to today when no date was requested
            condition_date = datetime.datetime.today().strftime('%Y-%m-%d') if date == '' else date
            stat_task_progress = task_progress.filter(start_time__gte='{} 00:00:00'.format(condition_date), start_time__lte='{} 23:59:59'.format(condition_date))
            running_cnt = stat_task_progress.filter(status=1).count()
            success_cnt = stat_task_progress.filter(status=2).count()
            fail_cnt = stat_task_progress.filter(status=-1).count()
            if keyword is not None and keyword != '':
                task_progress = task_progress.filter(task_name__icontains=keyword)
            if script_name is not None and script_name != '':
                task_progress = task_progress.filter(script_name__icontains=script_name)
            if date is not None and date != '':
                task_progress = task_progress.filter(start_time__gte='{} 00:00:00'.format(date), start_time__lte='{} 23:59:59'.format(date))
            if status is not None:
                # NOTE(review): status__in implies the client sends a list —
                # confirm against the frontend payload
                task_progress = task_progress.filter(status__in=status)
            task_progress = task_progress.order_by("-id")
            total = task_progress.count()
            pager = page_helper(total, page, size, task_progress, {'fail_cnt': fail_cnt, 'running_cnt': running_cnt, 'success_cnt': success_cnt})
            convert_task_progress = []
            results = pager.get('results')
            for result in results:
                # humanize run_time (ms) and attach the script id
                result['run_time'] = time_kit.convert_ms(result.get('run_time'))
                result['script_id'] = CrawlScript.objects.get(task_name=result.get('task_name'), name=result.get('script_name')).id
                convert_task_progress.append(result)
            pager['results'] = convert_task_progress
            r = Result.success(pager)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
def task_remove(request, task_id):
    """
    Delete a task by id.

    :param request: POST request
    :param task_id: task id
    :return: JsonResponse {'result': '1'} on success, '0' on failure
    """
    if request.method == 'POST':
        try:
            CrawlTask.objects.filter(id=task_id).delete()
            return JsonResponse({'result': '1'})
        except Exception:
            # FIX: narrowed from a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt. TODO add logging.
            return JsonResponse({'result': '0'})
def fetch_user_permissions(request):
    """
    Return the menu permission tree for the current user.

    The tree is also cached in redis under 'permission#user#<id>'.

    :param request: request carrying user_id (set by auth middleware)
    :return: JsonResponse with the permission tree (empty dict if no roles)
    """
    user_id = request.user_id
    roles = CrawlUserRoleRel.objects.filter(user_id=user_id, is_deleted=0)
    if not roles:
        return JsonResponse(Result.success(data={}))
    tree = build_permission_tree(roles)
    crawl_redis.set('permission#user#{}'.format(user_id), json.dumps(tree))
    return JsonResponse(Result.success(data=tree))
def script_remove(request):
    """
    Delete a script by id.

    :param request: GET with query param 'id'
    :return: JsonResponse
    """
    try:
        if request.method == 'GET':
            # renamed local from `id`, which shadowed the builtin
            script_id = request.GET['id']
            CrawlScript.objects.get(id=script_id).delete()
            r = Result.success(None)
            return JsonResponse(r)
    except Exception as e:
        r = Result.fail(e)
        return JsonResponse(r)
def get_hosts(request):
    """
    Resolve the host list for a script id.

    :param request: GET with query param 'script_id'
    :return: JsonResponse with the host list
    """
    try:
        if request.method == 'GET':
            hosts = get_hosts_by_script_id(request.GET.get('script_id'))
            return JsonResponse(Result.success(hosts))
    except Exception as e:
        return JsonResponse(Result.fail(e))
def project_list(request, client_id):
    """
    List projects deployed on one client node.

    :param request: GET request
    :param client_id: client node id
    :return: JsonResponse with the project list or a connect error
    """
    if request.method == 'GET':
        node = CrawlNode.objects.get(id=client_id)
        engine = get_engine(node)
        try:
            return JsonResponse(engine.list_projects())
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
def node_status(request, node_id):
    """
    Probe whether one crawler node is reachable.

    :param request: GET request
    :param node_id: node id
    :return: JsonResponse success, or 500 with a connect error message
    """
    if request.method == 'GET':
        node = CrawlNode.objects.get(id=node_id)
        try:
            # reachability probe only; the response body is not used
            requests.get(engine_url(node.ip, node.port), timeout=3)
            return JsonResponse(Result.success(""))
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
def project_list(request, node_id):
    """
    List crawler projects on one node.

    :param request: GET request
    :param node_id: node id
    :return: JsonResponse with the project list, or failure on connect error
    """
    if request.method == 'GET':
        client = CrawlNode.objects.get(id=node_id)
        engine = get_engine(client)
        try:
            projects = engine.list_projects()
            # FIX: the original dropped the `return`, so the success path
            # returned None instead of the JsonResponse
            return JsonResponse(Result.success(data=projects))
        except ConnectionError:
            return JsonResponse(Result.fail())
def create_proxy_ip(request):
    """
    Create a proxy IP record from the POSTed JSON fields.

    :param request: POST with a JSON body of CrawlProxyIP fields
    :return: JsonResponse
    """
    try:
        if request.method == 'POST':
            fields = json.loads(request.body.decode('utf-8'))
            CrawlProxyIP.objects.create(**fields)
            return JsonResponse(Result.success(None))
    except Exception as e:
        return JsonResponse(Result.fail(e))
def get_proxy_ip(request, proxy_ip_id):
    """
    Fetch one proxy IP record by id.

    :param request: GET request
    :param proxy_ip_id: record id
    :return: JsonResponse with the record dict
    """
    try:
        if request.method == 'GET':
            record = CrawlProxyIP.objects.get(id=proxy_ip_id)
            return JsonResponse(Result.success(model_to_dict(record)))
    except Exception as e:
        return JsonResponse(Result.fail(e))
def task_deploy(request, project_name):
    """
    Publish a packaged project to every node configured on its task.

    :param request: GET request
    :param project_name: project name
    :return: JsonResponse
    """
    # FIX: work_path was assigned inside the try block; if anything raised
    # before the assignment, the finally clause hit a NameError.
    work_path = os.getcwd()
    try:
        log_common.info('进入发布方法')
        if request.method == 'GET':
            log_common.info('开始发布逻辑')
            path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
            project_path = join(path, project_name)
            # locate the packaged .egg file
            egg = find_egg(project_path)
            if not egg:
                raise Exception('没有打包文件')
            # FIX: close the handle deterministically (original leaked it)
            with open(join(project_path, egg), 'rb') as egg_file:
                egg_file_content = egg_file.read()
            project = CrawlProject.objects.get(name=project_name, is_deleted=0)
            task = CrawlTask.objects.get(id=project.task_id)
            task.is_deploy = 1
            task.save()
            for node_id in json.loads(task.node_ids):
                node = CrawlNode.objects.get(id=node_id)
                engine = get_engine(node)
                log_common.info('{}: 准备发布{}'.format(node.node_ip, project_name))
                engine.add_version(project_name, int(time.time()), egg_file_content)
                log_common.info('{}: 发布成功{}'.format(node.node_ip, project_name))
                # record deploy info: logically delete old rows, create new
                deployed_at = timezone.now()
                CrawlDeploy.objects.filter(
                    node_id=node.id, project_id=project.id).update(is_deleted=1)
                deploy, result = CrawlDeploy.objects.update_or_create(
                    node_id=node.id, project_id=project.id,
                    deployed_at=deployed_at, description=project.description)
            r = Result.success("")
            return JsonResponse(r)
    except Exception as e:
        import traceback
        log_common.error("task_deploy => ", e)
        log_common.error("task_deploy => {}".format(traceback.format_exc()))
        r = Result.fail(e)
        return JsonResponse(r)
    finally:
        os.chdir(work_path)
def list_role(request):
    """
    List all roles together with the names of their permissions.

    :param request: GET request
    :return: JsonResponse with a list of role dicts (each with 'permission')
    """
    if request.method == 'GET':
        # NOTE(review): the original read page/size but never applied them;
        # pagination is still unimplemented here, so the unused locals were
        # removed.
        response = []
        permission_qs = CrawlPermission.objects.filter(is_deleted=0)
        for role in CrawlRole.objects.filter(is_deleted=0):
            rels = CrawlRolePermission.objects.filter(is_deleted=0, role_id=role.id)
            role_permissions = []
            for rel in rels:
                permission = permission_qs.get(id=rel.permission_id)
                role_permissions.append(model_to_dict(permission).get('permission_name'))
            role_dict = model_to_dict(role)
            role_dict['permission'] = role_permissions
            response.append(role_dict)
        r = Result.success(response)
        return JsonResponse(r)
def list_scripts(request):
    """
    Paginated list of crawler scripts, optionally filtered by task or name.

    :param request: POST with JSON body {size, page, task_name, task_id,
        script_name}
    :return: JsonResponse with a pager; each row gains a 'hosts' string
    """
    if request.method == 'POST':
        payload = json.loads(request.body.decode('utf-8'))
        size = payload.get('size', 15)
        page = payload.get('page', 1)
        # optional filters, applied only when truthy (same as original)
        filters = {
            'task_id': payload.get("task_id"),
            'name__contains': payload.get("script_name"),
            'task_name__contains': payload.get("task_name"),
        }
        scripts = CrawlScript.objects.filter(is_deleted=0)
        for field, value in filters.items():
            if value:
                scripts = scripts.filter(**{field: value})
        scripts = scripts.order_by("-id")
        total = scripts.count()
        response = page_helper(total, page, size, scripts)
        for row in response.get('results'):
            row['hosts'] = ','.join(get_hosts_by_script_id(row.get('id')))
        r = Result.success(response)
        return JsonResponse(r)
def script_enable(request):
    """
    Enable the scripts whose ids are given in the request body.

    :param request: POST with JSON body {ids: [...]}
    :return: JsonResponse
    """
    try:
        if request.method == 'POST':
            payload = json.loads(request.body.decode('utf-8'))
            # 0 is the "enabled" control flag passed to control_script
            control_script(payload.get('ids'), 0)
            return JsonResponse(Result.success(None))
    except Exception as e:
        return JsonResponse(Result.fail(e))
def node_create(request):
    """
    Register a new crawler node; nodes are deduplicated by IP.

    :param request: POST with a JSON body of CrawlNode fields (incl. node_ip)
    :return: JsonResponse with the created node, or failure if it exists
    """
    if request.method == 'POST':
        payload = json.loads(request.body.decode('utf-8'))
        existing = CrawlNode.objects.filter(node_ip=payload["node_ip"]).last()
        if existing:
            # node already registered (original TODO: refresh its heartbeat
            # timestamp instead of rejecting)
            return JsonResponse(Result.fail('节点已存在'))
        node = CrawlNode.objects.create(**payload)
        return JsonResponse(Result.success(model_to_dict(node)))