def job_log(request, client_id, project_name, spider_name, job_id):
    """
    get log of job
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :param spider_name: spider name
    :param job_id: job id
    :return: log of job
    """
    if request.method == 'GET':
        node = Node.objects.get(id=client_id)
        # get log url
        url = log_url(node.ip, node.port, project_name, spider_name, job_id)
        try:
            # ask only for the last 1000 bytes of the log
            response = requests.get(
                url,
                timeout=5,
                headers={'Range': 'bytes=-1000'},
                auth=(node.username, node.password) if node.auth else None)
            # log not found
            if response.status_code == 404:
                return JsonResponse({'message': 'Log Not Found'}, status=404)
            # decode bytes to string using the detected encoding
            encoding = response.apparent_encoding
            text = response.content.decode(encoding, errors='replace')
            return HttpResponse(text)
        except requests.ConnectionError:
            return JsonResponse({'message': 'Load Log Error'}, status=500)

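# A minimal sketch of the byte-range trick used by job_log above: scrapyd
# serves plain logfiles over HTTP, and a 'Range: bytes=-1000' header asks for
# only the final 1000 bytes, so the tail of a large log can be previewed
# without downloading the whole file. The host, project, spider and job id
# below are hypothetical.
def _example_log_tail():
    import requests
    response = requests.get(
        'http://127.0.0.1:6800/logs/myproject/myspider/0123456789ab.log',
        headers={'Range': 'bytes=-1000'},
        timeout=5)
    # decode with the detected encoding, replacing any bytes cut mid-character
    return response.content.decode(response.apparent_encoding, errors='replace')
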
def project_version(request, node_id, project_name):
    """
    get project deploy version
    :param request: request object
    :param node_id: node id
    :param project_name: project name
    :return: deploy version of project
    """
    if request.method == 'GET':
        # get node and project model
        node = Node.objects.get(id=node_id)
        project = Project.objects.get(name=project_name)
        scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
        # if deploy info exists in db, return it
        if Deploy.objects.filter(node=node, project=project).exists():
            deploy = Deploy.objects.get(node=node, project=project)
        # if deploy info does not exist in db, create it from scrapyd's version list
        else:
            try:
                versions = scrapyd.list_versions(project_name)
            except ConnectionError:
                return JsonResponse({'message': 'Connect Error'}, status=500)
            if len(versions) > 0:
                # versions are unix timestamps, newest last
                version = versions[-1]
                deployed_at = timezone.datetime.fromtimestamp(
                    int(version), tz=pytz.timezone(TIME_ZONE))
            else:
                deployed_at = None
            deploy, created = Deploy.objects.update_or_create(
                node=node, project=project, deployed_at=deployed_at)
        # return deploy json info
        return JsonResponse(model_to_dict(deploy))

def project_build(request, project_name):
    """
    get build info or execute build operation
    :param request: request object
    :param project_name: project name
    :return: json
    """
    # get project folder
    path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
    project_path = join(path, project_name)
    # get build version
    if request.method == 'GET':
        egg = find_egg(project_path)
        # if built, save or update project in db
        if egg:
            built_at = timezone.datetime.fromtimestamp(
                os.path.getmtime(join(project_path, egg)),
                tz=pytz.timezone(TIME_ZONE))
            if not Project.objects.filter(spider_name=project_name).exists():
                Project(spider_name=project_name, built_at=built_at,
                        egg=egg).save()
                model = Project.objects.get(spider_name=project_name)
            else:
                model = Project.objects.get(spider_name=project_name)
                model.built_at = built_at
                model.egg = egg
                model.save()
        # if not built, just save the project name to db
        else:
            if not Project.objects.filter(spider_name=project_name).exists():
                Project(spider_name=project_name).save()
            model = Project.objects.get(spider_name=project_name)
        # transfer model to dict, then dump it to json
        data = model_to_dict(model)
        return JsonResponse(data)
    # build operation triggered manually by clicking the button
    elif request.method == 'POST':
        # data = json.loads(request.body.decode('utf-8'))
        # description = data['spider_desc']
        build_project(project_name)
        egg = find_egg(project_path)
        # update built_at info
        built_at = timezone.now()
        # if project does not exist in db, create it
        if not Project.objects.filter(spider_name=project_name).exists():
            Project(spider_name=project_name, built_at=built_at,
                    egg=egg).save()
            model = Project.objects.get(spider_name=project_name)
        # if project exists, update egg and built_at info
        else:
            model = Project.objects.get(spider_name=project_name)
            model.built_at = built_at
            model.egg = egg
            # model.description = description
            model.save()
        return JsonResponse({"result": 1})

def project_deploy(request, node_id, project_name):
    """
    deploy the built egg of a project to a node
    :param request: request object
    :param node_id: node id
    :param project_name: project name
    :return: json of deploy result
    """
    if request.method == 'POST':
        # get project folder
        path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
        project_path = join(path, project_name)
        # find egg file
        egg = find_egg(project_path)
        # get node and project model
        node = Node.objects.get(id=node_id)
        project = Project.objects.get(spider_name=project_name)
        # execute deploy operation
        scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
        try:
            with open(join(project_path, egg), 'rb') as egg_file:
                scrapyd.add_version(project_name, int(time.time()),
                                    egg_file.read())
            # update deploy info
            deployed_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            Deploy.objects.filter(node=node, project=project).delete()
            deploy, created = Deploy.objects.update_or_create(
                node=node,
                project=project,
                deployed_at=deployed_at,
                description=project.spider_desc)
            return JsonResponse({'result': 1, "deploy": model_to_dict(deploy)})
        except Exception:
            return JsonResponse({'message': get_traceback()}, status=500)

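# For reference, ScrapydAPI.add_version wraps scrapyd's addversion.json
# endpoint: an HTTP POST carrying the project name, a version string and the
# egg bytes as a multipart upload. A hedged sketch of the raw call (host and
# paths hypothetical):
def _example_add_version_raw():
    import time
    import requests
    with open('projects/myproject/myproject.egg', 'rb') as egg_file:
        response = requests.post(
            'http://127.0.0.1:6800/addversion.json',
            data={'project': 'myproject', 'version': int(time.time())},
            files={'egg': egg_file})
    return response.json()
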
def failUrlList(request):
    """
    get list of failed urls, newest first
    :param request: request object
    :return: json of failed urls
    """
    if request.method == "GET":
        try:
            fail_urls = Fail_url_detail.objects.all().order_by('-save_time')
            # a queryset is not JSON serializable; flatten it to a list of dicts
            fail_urls = [
                dict(x["fields"], **{"id": x["pk"]})
                for x in json.loads(serialize("json", fail_urls))
            ]
            return JsonResponse({'failUrlList': fail_urls})
        except Exception:
            return JsonResponse({'message': 'No Data'}, status=500)

def addTemplate(request):
    """
    fill the xpath slots of a stored spider template
    :param request: request object
    :return: json of the rendered template text
    """
    if request.method == "POST":
        data = json.loads(request.body.decode('utf-8'))
        item_list_xpath = data['item_list_xpath']
        item_title_xpath = data['item_title_xpath']
        item_url_xpath = data['item_url_xpath']
        item_publishdata_xpath = data['item_publishdata_xpath']
        next_page_xpath = data['next_page_xpath']
        select_template = data['select_template']
        if (next_page_xpath and item_list_xpath and item_title_xpath
                and item_url_xpath and item_publishdata_xpath):
            templateList = SpiderTemplates.objects.filter(
                tem_type=select_template).values()
            template = templateList.first().get('tem_text')
            # locate the placeholder xpaths currently in the template text
            next_page = re.findall(r'next_page_xpath = u"(.*)"', template)
            item_list = re.findall(r'list_xpath = "(.*)"', template)
            item_title = re.findall(r'title_xpath = "(.*)"', template)
            item_url = re.findall(r'url_xpath = "(.*)"', template)
            item_publishdata = re.findall(r'pdate_xpath = "(.*)"', template)
            # swap each placeholder for the xpath supplied by the caller
            func = template.replace(next_page[0], next_page_xpath).replace(
                item_list[0], item_list_xpath).replace(
                    item_title[0], item_title_xpath).replace(
                        item_url[0], item_url_xpath).replace(
                            item_publishdata[0], item_publishdata_xpath)
            return JsonResponse({'func': func})
        else:
            return JsonResponse({'messages': 'input error'})

def task(request, node_id, project_name, spider_name):
    """
    schedule a spider run on a node
    :param request: request object
    :param node_id: node id
    :param project_name: project name
    :param spider_name: spider name
    :return: json of the scheduled job
    """
    node = Node.objects.get(id=node_id)
    scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
    try:
        job = scrapyd.schedule(project_name, spider_name)
        return JsonResponse({'job': job})
    except ConnectionError:
        return JsonResponse({'message': 'Connect Error'}, status=500)

def remove_all_version(request, project, client_id):
    """
    delete a project and every deployed version of it from a node
    :param request: request object
    :param project: project name
    :param client_id: node id
    :return: json of delete result
    """
    node = Node.objects.get(id=client_id)
    scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
    try:
        result = scrapyd.delete_project(project)
        return JsonResponse({'result': result})
    except ConnectionError:
        return JsonResponse({'message': 'Connect Error'}, status=500)

def remove_depody_spider(request, client_id, project, version_name):
    """
    delete one deployed version of a project from a node
    :param request: request object
    :param client_id: node id
    :param project: project name
    :param version_name: version to delete
    :return: json of delete result
    """
    if request.method == 'POST':
        node = Node.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
        try:
            result = scrapyd.delete_version(project, version_name)
            return JsonResponse({'result': result})
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)

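# remove_all_version and remove_depody_spider map onto scrapyd's
# delproject.json and delversion.json endpoints respectively. A hedged
# sketch of the equivalent raw calls (host, project and version hypothetical):
def _example_delete_raw():
    import requests
    # drop a single deployed version of a project
    requests.post('http://127.0.0.1:6800/delversion.json',
                  data={'project': 'myproject', 'version': '1600000000'})
    # drop the project and every version of it
    requests.post('http://127.0.0.1:6800/delproject.json',
                  data={'project': 'myproject'})
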
def project_generate(request, project_name):
    """
    generate code of project
    :param request: request object
    :param project_name: project name
    :return: json of generated project
    """
    if request.method == 'POST':
        # get configuration
        configuration = Project.objects.get(name=project_name).configuration
        configuration = json.loads(configuration)
        if not is_valid_name(project_name):
            return JsonResponse({'message': 'Invalid project name'}, status=500)
        # remove original project dir
        project_dir = join(PROJECTS_FOLDER, project_name)
        if exists(project_dir):
            rmtree(project_dir)
        # generate project skeleton from the templates folder
        copy_tree(join(TEMPLATES_DIR, 'project'), project_dir)
        move(join(PROJECTS_FOLDER, project_name, 'module'),
             join(project_dir, project_name))
        for paths in TEMPLATES_TO_RENDER:
            path = join(*paths)
            tplfile = join(
                project_dir,
                string.Template(path).substitute(project_name=project_name))
            render_vars = {
                'project_name': project_name,
                'items': configuration.get('items'),
            }
            # note: str.rstrip('.tmpl') strips a *set* of characters, not a
            # suffix, so cut the extension off explicitly instead
            render_template(tplfile, tplfile[:-len('.tmpl')], **render_vars)
        # generate spiders
        spiders = configuration.get('spiders')
        for spider in spiders:
            source_tpl_file = join(TEMPLATES_DIR, 'spiders', 'crawl.tmpl')
            new_tpl_file = join(PROJECTS_FOLDER, project_name, project_name,
                                'spiders', 'crawl.tmpl')
            spider_file = "%s.py" % join(PROJECTS_FOLDER, project_name,
                                         project_name, 'spiders',
                                         spider.get('name'))
            copy(source_tpl_file, new_tpl_file)
            render_template(new_tpl_file, spider_file, spider=spider,
                            project_name=project_name)
        # save generated_at attr and clear built_at attr
        model = Project.objects.get(name=project_name)
        model.generated_at = timezone.now()
        model.built_at = None
        model.save()
        # return model
        return JsonResponse(model_to_dict(model))

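# project_generate relies on string.Template to rewrite template paths that
# carry a '${project_name}' placeholder. A minimal sketch of that
# substitution step (the path below is hypothetical):
def _example_template_substitute():
    import string
    path = '${project_name}/spiders/crawl.py.tmpl'
    rendered = string.Template(path).substitute(project_name='quotes')
    # -> 'quotes/spiders/crawl.py.tmpl'
    return rendered
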
def get_spider_version(request, project, client_id):
    """
    list spiders of a deployed project on a node
    :param request: request object
    :param project: project name
    :param client_id: node id
    :return: json of spider list
    """
    node = Node.objects.get(id=client_id)
    scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
    try:
        spiders = scrapyd.list_spiders(project)
        spiders = [{
            'name': spider,
            'id': index + 1
        } for index, spider in enumerate(spiders)]
        # a list is not a dict, so JsonResponse needs safe=False
        return JsonResponse(spiders, safe=False)
    except ConnectionError:
        return JsonResponse({'message': 'Connect Error'}, status=500)

def get_project_version(request, project, node_id):
    """
    list deployed versions of a project on a node
    :param request: request object
    :param project: project name
    :param node_id: node id
    :return: json of version list
    """
    if request.method == 'GET':
        node = Node.objects.get(id=node_id)
        scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
        try:
            versions = scrapyd.list_versions(project)
            versions = [{
                'name': version,
                'id': index + 1
            } for index, version in enumerate(versions)]
            # a list is not a dict, so JsonResponse needs safe=False
            return JsonResponse(versions, safe=False)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)

def node_status(request, node_id):
    """
    get node status
    :param request: request object
    :param node_id: node id
    :return: json
    """
    if request.method == 'GET':
        # get node object
        node = Node.objects.get(id=node_id)
        try:
            requests.get(scrapyd_url(node.node_ip, node.node_port), timeout=3)
            return JsonResponse({'result': '1'})
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)

def ruler_update(request, ruler_id):
    """
    update a ruler record with the posted fields
    :param request: request object
    :param ruler_id: ruler id
    :return: json of the updated ruler
    """
    if request.method == 'POST':
        spider = ProjectRuler.objects.filter(id=ruler_id)
        data = json.loads(request.body.decode('utf-8'))
        spider.update(**data)
        return JsonResponse(
            model_to_dict(ProjectRuler.objects.get(id=ruler_id)))

def paginator(request, obje):
    """
    page through a model's records
    :param request: request object
    :param obje: model name to page over
    :return: json of one page of records
    """
    if request.method == "POST":
        # resolve the model name through an explicit whitelist instead of
        # eval(), which would execute arbitrary code taken from the url;
        # the map below is an assumption listing the models this view is
        # expected to page over
        model_map = {
            'Task_url': Task_url,
            'Fail_url_detail': Fail_url_detail,
            'ProjectRuler': ProjectRuler,
        }
        contact_list = model_map[obje].objects.all().order_by('id')
        data = json.loads(request.body.decode('utf-8'))
        page = data['page']
        page_num = data['page_num']
        # show page_num records per page
        paginator = Paginator(contact_list, page_num)
        try:
            contacts = paginator.page(page)
        except PageNotAnInteger:
            # if page is not an integer, deliver the first page
            contacts = paginator.page(1)
        except EmptyPage:
            # if page is out of range (e.g. 9999), deliver the last page
            contacts = paginator.page(paginator.num_pages)
        contacts = [
            dict(x["fields"], **{"id": x["pk"]})
            for x in json.loads(serialize("json", contacts))
        ]
        # a list is not a dict, so JsonResponse needs safe=False
        return JsonResponse(contacts, safe=False)

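# Several views in this file repeat the serialize-then-flatten pattern used
# above to turn a queryset into [{'id': pk, **fields}, ...]. A small helper
# like this sketch could factor it out (it assumes the module-level json and
# serialize imports already used by those views):
def _flatten_queryset(queryset):
    # serialize('json', qs) yields [{'pk': ..., 'fields': {...}}, ...];
    # merge each pk back into its fields dict under the 'id' key
    return [
        dict(item["fields"], **{"id": item["pk"]})
        for item in json.loads(serialize("json", queryset))
    ]
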
def ruler_indexs(request):
    """
    get ruler list
    :param request: request object
    :return: json of ruler list
    """
    if request.method == "GET":
        rulers = ProjectRuler.objects.select_related(
            'scheduler', 'project').filter(is_lock=1)
        lis = []
        for p in rulers:
            if p.project is not None:
                project_name = p.project.spider_name
            else:
                project_name = "Not deployed"
            if p.scheduler is not None:
                schduler_name = p.scheduler.schedule_name
            else:
                schduler_name = "No schedule added"
            data = {
                "id": p.id,
                "schduler_name": schduler_name,
                "spider_name": project_name,
                "project_desc": p.project_desc,
                "dept_id": p.dept_id,
                "url": p.url,
            }
            lis.append(data)
        # a list is not a dict, so JsonResponse needs safe=False
        return JsonResponse(lis, safe=False)

def delete_spider_scheduler(request):
    """
    detach the scheduler from the given rulers
    :param request: request object
    :return: json
    """
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        ids = data['ids']
        # match every ruler whose id is in the posted list
        spider = ProjectRuler.objects.filter(id__in=ids)
        spider.update(scheduler_id=None)
        return JsonResponse({'result': 1})

def scheduler_update(request, scheduler_id):
    """
    update a scheduler record with the posted fields
    :param request: request object
    :param scheduler_id: scheduler id
    :return: json of the updated scheduler
    """
    if request.method == 'POST':
        scheder = Scheduler.objects.filter(id=scheduler_id)
        data = json.loads(request.body.decode('utf-8'))
        scheder.update(**data)
        return JsonResponse(
            model_to_dict(Scheduler.objects.get(id=scheduler_id)))

def scheduler_run_ruler(request, scheduler_id):
    """
    start scheduler task
    :param request: request object
    :param scheduler_id: scheduler id
    :return: json
    """
    if request.method == "GET":
        from zzh.scheduler.sched import reload_runnable_spider_job_execution
        # poll for runnable spider jobs every 2 minutes; the interval is
        # fixed here and does not yet use the scheduler's spider_time
        schedul.add_job(reload_runnable_spider_job_execution,
                        'interval',
                        minutes=2,
                        id='my_scheduler_job')
        schedule = Scheduler.objects.get(id=scheduler_id)
        spider_time = schedule.spider_time
        schedul.start()
        return JsonResponse({'result': 1})

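# schedul above is presumably an APScheduler instance created at module
# level. A minimal, self-contained sketch of the same interval-job pattern
# (the tick job is a stand-in for reload_runnable_spider_job_execution):
def _example_background_scheduler():
    from apscheduler.schedulers.background import BackgroundScheduler

    def tick():
        print('checking runnable spider jobs')

    scheduler = BackgroundScheduler()
    # run tick() every 2 minutes; the id lets the job be looked up or removed
    scheduler.add_job(tick, 'interval', minutes=2, id='my_scheduler_job')
    scheduler.start()
    return scheduler
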
def index_status(request):
    """
    index statistics
    :param request: request object
    :return: json
    """
    if request.method == 'GET':
        nodes = Node.objects.all()
        data = {
            'success': 0,
            'error': 0,
            'project': 0,
        }
        # nodes info
        for node in nodes:
            try:
                requests.get(scrapyd_url(node.node_ip, node.node_port),
                             timeout=1)
                data['success'] += 1
            except ConnectionError:
                data['error'] += 1
        path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
        files = os.listdir(path)
        # projects info
        for file in files:
            if os.path.isdir(join(path, file)) and file not in IGNORES:
                data['project'] += 1
        return JsonResponse(data)

def taskUrlSearch(request):
    """
    search task urls by department name or publish date
    :param request: request object
    :return: json of matching task urls
    """
    if request.method == "POST":
        data = json.loads(request.body.decode('utf-8'))
        search_keywords = data.get('dept_name_key') or data.get(
            'item_pulishdate')
        if search_keywords:
            all_orgs = Task_url.objects.filter(
                Q(dept_name_key=search_keywords)
                | Q(item_pulishdate__contains=search_keywords))
            all_orgs = json.loads(serialize('json', all_orgs))
            all_orgs = [
                dict(x["fields"], **{"id": x["pk"]}) for x in all_orgs
            ]
            # a list is not a dict, so JsonResponse needs safe=False
            return JsonResponse(all_orgs, safe=False)
        else:
            return JsonResponse({'message': 'Error'})

def search(request):
    """
    search rulers by department id, department name or description
    :param request: request object
    :return: json of matching rulers
    """
    if request.method == "POST":
        data = json.loads(request.body.decode('utf-8'))
        search_keywords = data.get('dept_id') or data.get(
            'dept_name_key') or data.get('project_desc') or data.get('is_lock')
        if search_keywords:
            all_orgs = ProjectRuler.objects.filter(
                Q(project_desc__icontains=search_keywords)
                | Q(dept_name_key__icontains=search_keywords)
                | Q(dept_id__icontains=search_keywords))
            # a queryset is not JSON serializable; flatten it first
            all_orgs = [
                dict(x["fields"], **{"id": x["pk"]})
                for x in json.loads(serialize('json', all_orgs))
            ]
            return JsonResponse(all_orgs, safe=False)
        else:
            return JsonResponse({'message': 'Error'})

def spider_start(request, node_id, project_name, spider_name):
    """
    start a spider
    :param request: request object
    :param node_id: node id
    :param project_name: project name
    :param spider_name: spider name
    :return: json
    """
    if request.method == 'GET':
        node = Node.objects.get(id=node_id)
        scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
        try:
            job = scrapyd.schedule(project_name, spider_name)
            return JsonResponse({'job': job, "result": 1})
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)

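# spider_start goes through ScrapydAPI.schedule, which wraps scrapyd's
# schedule.json endpoint. A hedged sketch of the raw call (host, project and
# spider names hypothetical):
def _example_schedule_raw():
    import requests
    response = requests.post(
        'http://127.0.0.1:6800/schedule.json',
        data={'project': 'myproject', 'spider': 'myspider'})
    # a successful response carries the job id, e.g.
    # {'status': 'ok', 'jobid': '6487ec79947edab326d6db28a2d86511e8247444'}
    return response.json()['jobid']
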
def job_cancel(request, node_id, project_name, job_id):
    """
    cancel a job
    :param request: request object
    :param node_id: node id
    :param project_name: project name
    :param job_id: job id
    :return: json of cancel
    """
    if request.method == 'GET':
        node = Node.objects.get(id=node_id)
        try:
            scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
            res = scrapyd.cancel(project_name, job_id)
            return JsonResponse({"res": res, "result": 1})
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)

def delete_project(request):
    """
    detach the project from the given rulers
    :param request: request object
    :return: json
    """
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        ids = data['ids']
        for id in ids:
            spider = ProjectRuler.objects.filter(id=id)
            spider.update(project_id=None)
        return JsonResponse({'result': 1})

def FailUrlSearch(request):
    """
    search failed urls by time, url, status code or department id
    :param request: request object
    :return: json of matching failed urls
    """
    if request.method == "POST":
        data = json.loads(request.body.decode('utf-8'))
        search_keywords = data.get('save_time') or data.get(
            'queue_url') or data.get('spider_url') or data.get(
                'status_code') or data.get('dept_id')
        if search_keywords:
            all_orgs = Fail_url_detail.objects.filter(
                Q(save_time__icontains=search_keywords)
                | Q(queue_url__icontains=search_keywords)
                | Q(spider_url__icontains=search_keywords)
                | Q(status_code__icontains=search_keywords)
                | Q(dept_id__icontains=search_keywords))
            # a queryset is not JSON serializable; flatten it first
            all_orgs = [
                dict(x["fields"], **{"id": x["pk"]})
                for x in json.loads(serialize('json', all_orgs))
            ]
            return JsonResponse(all_orgs, safe=False)
        else:
            return JsonResponse({'message': 'Error'})

def add_spider_scheduler(request):
    """
    attach a scheduler to the given rulers
    :param request: request object
    :return: json
    """
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        ids = data['ids']
        # could be changed to an enabled flag: 0 means not scheduled, 1 means scheduled
        scheduler_id = data['scheduler_id']
        # match every ruler whose id is in the posted list
        spider = ProjectRuler.objects.filter(id__in=ids)
        spider.update(scheduler_id=scheduler_id)
        return JsonResponse({'result': 1})

def node_info(request, node_id):
    """
    get node info
    :param request: request object
    :param node_id: node id
    :return: json
    """
    if request.method == 'GET':
        return JsonResponse(model_to_dict(Node.objects.get(id=node_id)))

def add_project_scheduler(request):
    """
    attach a scheduler to each of the given rulers
    :param request: request object
    :return: json
    """
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        ids = data['ids']
        scheduler_id = data['scheduler_id']
        for id in ids:
            spider = ProjectRuler.objects.filter(id=id, is_lock=1)
            spider.update(scheduler_id=scheduler_id)
        return JsonResponse({'result': 1})

def project_list(request, node_id):
    """
    project deployed list on one node
    :param request: request object
    :param node_id: node id
    :return: json
    """
    if request.method == 'GET':
        node = Node.objects.get(id=node_id)
        scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
        try:
            projects = scrapyd.list_projects()
            lis = [{'spider_name': project} for project in projects]
            return JsonResponse({'result': 1, 'lis': lis})
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
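
# These views are wired up in a urls module elsewhere; a hypothetical
# fragment for a few of them might look like this sketch (route paths and
# names are assumptions, not the project's actual configuration):
#
# from django.urls import path
# from . import views
#
# urlpatterns = [
#     path('node/<int:node_id>/status/', views.node_status),
#     path('node/<int:node_id>/projects/', views.project_list),
#     path('node/<int:node_id>/<str:project_name>/<str:spider_name>/start/',
#          views.spider_start),
# ]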