def project_deploy(request, client_id, project_name):
    """
    deploy project operation
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :return: json of deploy result
    """
    if request.method == 'POST':
        # get project folder
        path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
        project_path = join(path, project_name)
        # find egg file
        egg = find_egg(project_path)
        # get client and project model
        client = Client.objects.get(id=client_id)
        project = Project.objects.get(name=project_name)
        # execute deploy operation
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            with open(join(project_path, egg), 'rb') as egg_file:
                scrapyd.add_version(project_name, int(time.time()), egg_file.read())
            # update deploy info
            deployed_at = timezone.now()
            Deploy.objects.filter(client=client, project=project).delete()
            deploy, result = Deploy.objects.update_or_create(
                client=client, project=project,
                deployed_at=deployed_at, description=project.description)
            return JsonResponse(model_to_dict(deploy))
        except Exception:
            return JsonResponse({'message': get_traceback()}, status=500)
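# A minimal sketch of how a view like the one above might be routed in Django;
# the module path ('views') and the URL name are assumptions, not taken from
# the source. `path` is aliased to avoid clashing with the local `path`
# variable used inside the view.
from django.urls import path as url_path

from . import views  # hypothetical module holding project_deploy

urlpatterns = [
    # POST /client/<client_id>/project/<project_name>/deploy/
    url_path('client/<int:client_id>/project/<str:project_name>/deploy/',
             views.project_deploy, name='project_deploy'),
]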
def project_deploy(request, node_id, project_name):
    if request.method == 'POST':
        # get project folder
        path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
        project_path = join(path, project_name)
        # find egg file
        egg = find_egg(project_path)
        # get node and project model
        node = Node.objects.get(id=node_id)
        project = Project.objects.get(spider_name=project_name)
        # execute deploy operation
        scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
        try:
            with open(join(project_path, egg), 'rb') as egg_file:
                scrapyd.add_version(project_name, int(time.time()), egg_file.read())
            # update deploy info
            deployed_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            Deploy.objects.filter(node=node, project=project).delete()
            deploy, result = Deploy.objects.update_or_create(
                node=node, project=project,
                deployed_at=deployed_at, description=project.spider_desc)
            return JsonResponse({'result': 1, 'deploy': model_to_dict(deploy)})
        except Exception:
            return JsonResponse({'message': get_traceback()}, status=500)
def project_deploy(request, id, project):
    if request.method == 'GET':
        path = os.path.abspath(join(os.getcwd(), PROJECTS_FOLDER))
        project_path = join(path, project)
        egg = find_egg(project_path)
        deploy_version = time.time()
        client_model = Client.objects.get(id=id)
        project_model = Project.objects.get(name=project)
        Deploy.objects.filter(client=client_model, project=project_model).delete()
        deploy = Deploy.objects.update_or_create(
            client=client_model, project=project_model,
            description=project_model.description)
        scrapyd = ScrapydAPI(scrapyd_url(client_model.ip, client_model.port))
        with open(join(project_path, egg), 'rb') as egg_file:
            result = scrapyd.add_version(project, int(deploy_version), egg_file.read())
        return HttpResponse(result)
from typing import BinaryIO

from scrapyd_api import ScrapydAPI


class ScrapyAgent(object):
    """Proxy class for a Scrapy project hosted on a Scrapyd server."""

    def __init__(self, server_url):
        self.server_url = server_url
        self.scrapyd_api = ScrapydAPI(server_url)

    def __repr__(self):
        return '<ScrapyAgent %s>' % self.server_url

    @property
    def server(self):
        return self.server_url

    def list_projects(self):
        return self.scrapyd_api.list_projects()

    def del_project(self, project_name):
        try:
            return self.scrapyd_api.delete_project(project_name)
        except Exception:
            return False

    def list_spiders(self, project_name):
        return self.scrapyd_api.list_spiders(project_name)

    def start_spider(self, project_name, spider_name):
        return self.scrapyd_api.schedule(project_name, spider_name)

    def cancel_spider(self, project_name, job_id):
        return self.scrapyd_api.cancel(project_name, job_id)

    def deploy(self, project_name: str, version: int,
               egg_byte: BinaryIO) -> "Dict or bool":
        # add_version returns the number of spiders found in the egg
        spider_num = self.scrapyd_api.add_version(project_name, version, egg_byte)
        return {
            'project': project_name,
            'version': version,
            'spiders': spider_num,
        } if spider_num else False

    def log_url(self, project_name, spider_name, job_id):
        return '{}/logs/{}/{}/{}'.format(
            self.server_url, project_name, spider_name, job_id)

    def job_status(self, project_name, job_id):
        return self.scrapyd_api.job_status(project_name, job_id)
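# A hedged usage sketch for ScrapyAgent; the server address, project name and
# egg filename are illustrative assumptions.
import time

agent = ScrapyAgent('http://127.0.0.1:6800')
print(agent.list_projects())
with open('myproject.egg', 'rb') as egg:  # egg built beforehand, e.g. scrapyd-deploy --build-egg
    info = agent.deploy('myproject', int(time.time()), egg)
print(info)  # dict with project/version/spiders on success, False otherwise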
def deploy_project(project, client):
    path = get_run_path()
    path = '{path}/storage/{project}/'.format(path=path, project=project.name)
    egg = get_egg_info(project)
    if egg:
        file_path = '{path}/{egg}'.format(path=path, egg=egg.get('name'))
        deploy_version = date_format(time.time(), '%Y-%m-%d_%H_%M_%S')
        url = 'http://{ip}:{port}'.format(ip=client.ip, port=client.port)
        try:
            scrapyd = ScrapydAPI(url)
            egg_version = egg.get('version')
            with open(file_path, 'rb') as egg_file:
                result = scrapyd.add_version(project.name, deploy_version,
                                             egg_file.read())
            return result, deploy_version, egg_version
        except (ConnectionError, InvalidURL):
            return None, None, None
    else:
        return None, None, None
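# ConnectionError and InvalidURL above presumably come from requests; a
# minimal, hedged usage sketch where `project` and `client` stand in for
# hypothetical ORM instances with name/ip/port attributes:
from requests.exceptions import ConnectionError, InvalidURL

result, deploy_version, egg_version = deploy_project(project, client)
if result is None:
    print('deploy failed or no egg found for', project.name)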
class ScrapydProxy(SpiderServiceProxy):
    """
    Service proxy for a single Scrapyd server.
    Inherits from the spider-service base class and implements its interface.
    """

    def __init__(self, server):
        self.spider_status_name_dict = {
            SpiderStatus.PENDING: 'pending',
            SpiderStatus.RUNNING: 'running',
            SpiderStatus.FINISHED: 'finished'
        }
        # initialise the base class, then build the ScrapydAPI client
        super(ScrapydProxy, self).__init__(server)
        self.scrapyd_api = ScrapydAPI(self._scrapyd_url())

    def _scrapyd_url(self):
        # the Scrapyd URL is the `server` attribute set by the base class
        return self.server

    def list_projects(self):
        """
        List all project names on this Scrapyd server as strings
        (get_project_list returns objects instead).
        """
        return self.scrapyd_api.list_projects()

    def get_project_list(self):
        """
        Fetch all spider projects.
        :return: list of Project objects
        """
        data = self.scrapyd_api.list_projects()
        result = []
        if data:
            for project_name in data:
                project = Project()
                project.project_name = project_name
                result.append(project)
        return result

    def delete_project(self, project_name):
        """
        Delete the given project from Scrapyd.
        :param project_name: project name
        :return: True if the project existed and was deleted, else False
        """
        try:
            return self.scrapyd_api.delete_project(project_name)
        except Exception:
            return False

    def get_slave_spider_list(self, project_name):
        try:
            # list the names of all spiders under the given project
            data = self.scrapyd_api.list_spiders(project_name)
            return data if data else []
        except Exception:
            return []

    def get_spider_list(self, project_name):
        """
        Fetch all spider names under the given project.
        :param project_name: project name
        :return: list of SpiderInstance objects
        """
        try:
            data = self.scrapyd_api.list_spiders(project_name)
            result = []
            if data:
                for spider_name in data:
                    spider_instance = SpiderInstance()
                    spider_instance.spider_name = spider_name
                    result.append(spider_instance)
            return result
        except Exception:
            return []

    def get_daemon_status(self):
        pass

    def get_job_list(self, project_name, spider_status=None):
        """
        Fetch the status of all spider jobs under a project from Scrapyd.
        :param project_name: project name
        :param spider_status: job status; None (the default) returns all
            statuses, otherwise only the jobs in the given status
        """
        result = {
            SpiderStatus.PENDING: [],
            SpiderStatus.RUNNING: [],
            SpiderStatus.FINISHED: []
        }
        try:
            data = self.scrapyd_api.list_jobs(project_name)
            if data:
                for _status in self.spider_status_name_dict.keys():
                    for item in data[self.spider_status_name_dict[_status]]:
                        start_time, end_time = None, None
                        if item.get('start_time'):
                            start_time = datetime.datetime.strptime(
                                item['start_time'], '%Y-%m-%d %H:%M:%S.%f')
                        if item.get('end_time'):
                            end_time = datetime.datetime.strptime(
                                item['end_time'], '%Y-%m-%d %H:%M:%S.%f')
                        result[_status].append(
                            dict(id=item['id'],
                                 start_time=start_time,
                                 end_time=end_time))
            return result if not spider_status else result[spider_status]
        except Exception:
            return result

    def start_spider(self, project_name, spider_name):
        """
        Start the given spider under the given project.
        :param project_name: project name
        :param spider_name: spider name
        :return: job id of the started spider, or None on failure
        """
        data = self.scrapyd_api.schedule(project_name, spider_name,
                                         pro_name=project_name)
        return data if data else None

    def cancel_spider(self, project_name, job_id):
        """
        Cancel the given job under the given project.
        :param project_name: project name (str)
        :param job_id: job id (str)
        :return: True if cancelled, else False
        """
        data = self.scrapyd_api.cancel(project_name, job_id)
        return data is not None

    def deploy(self, project_name, file_path):
        """
        Deploy an uploaded egg to Scrapyd.
        :param project_name: project name (str)
        :param file_path: path to the egg file (str)
        :return: project info as a string on success, else False
        """
        egg = open(file_path, 'rb')
        version = int(time.time())
        spider_num = self.scrapyd_api.add_version(project_name, version, egg)
        egg.close()
        ret = {
            'version': version,
            'project': project_name,
            'spiders': spider_num,
            'node_name': socket.gethostname(),
            'status': 'ok' if spider_num else 'error'
        }
        return str(ret) if spider_num else False

    def log_url(self, project_name, spider_name, job_id):
        """
        Build the URL of a spider's log file.
        :param project_name: project name (str)
        :param spider_name: spider name (str)
        :param job_id: job id (str)
        :return: URL of the log file (str)
        """
        return self._scrapyd_url() + '/logs/%s/%s/%s.log' % (
            project_name, spider_name, job_id)
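# A hedged usage sketch for ScrapydProxy; SpiderServiceProxy, SpiderStatus,
# Project and SpiderInstance are project-specific names assumed importable,
# and the server address is illustrative.
proxy = ScrapydProxy('http://127.0.0.1:6800')
for project in proxy.get_project_list():
    jobs = proxy.get_job_list(project.project_name)
    print(project.project_name, len(jobs[SpiderStatus.FINISHED]), 'finished job(s)')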
# delversion.json: deletes a specific version of a project
curl http://120.27.34.25:6800/delversion.json -d project=weibo -d version=v1
# Takes a project parameter (the project name) and a version parameter
# (the version to delete).
# Response: {"status": "ok"}
# status reports how the request went; here the deletion succeeded.

# delproject.json: deletes a project
curl http://120.27.34.25:6800/delproject.json -d project=weibo
# Takes a single project parameter, the project name.
# Response: {"status": "ok"}
# status reports how the request went; here the deletion succeeded.

# Those are all of Scrapyd's endpoints; by calling the HTTP interface directly
# you can control deployment, starting and running of projects.

# 5. Using Scrapyd API: the Scrapyd API library wraps these endpoints. The
# underlying mechanism is exactly the same as the raw HTTP interface; the
# Python wrapper is simply more convenient.
# Create a ScrapydAPI client:
from scrapyd_api import ScrapydAPI
scrapyd = ScrapydAPI('http://120.27.34.25:6800')
# Then call its methods to hit the corresponding endpoints, e.g. a deploy:
egg = open('weibo.egg', 'rb')
scrapyd.add_version('weibo', 'v1', egg)
# This takes the project packed as an egg file and deploys the locally built
# egg to the remote Scrapyd.
# Scrapyd API implements every endpoint Scrapyd exposes, with the same names
# and the same parameters.
# For example, list_projects lists all projects deployed on Scrapyd:
scrapyd.list_projects()
# ['weibo', 'zhihu']
# See the official docs for details: http://python-scrapyd-api.readthedocs.io/
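# For comparison, a hedged sketch of hitting the same delproject.json endpoint
# with plain requests instead of the wrapper; the host is the example address
# used above.
import requests

resp = requests.post('http://120.27.34.25:6800/delproject.json',
                     data={'project': 'weibo'})
print(resp.json())  # expected: {'status': 'ok'}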
from scrapyd_api import ScrapydAPI

scrapyd = ScrapydAPI("http://127.0.0.1:6800")
egg = open("book.egg", "rb")
scrapyd.add_version("book", "V1", egg)
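# A hedged follow-up to confirm the deployment against the local Scrapyd
# assumed above; 'book' is the project just deployed.
print(scrapyd.list_projects())       # should now include 'book'
print(scrapyd.list_spiders("book"))  # spiders packed into book.egg
egg.close()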
class Schedular:

    def __init__(self):
        self._scrapyd = None
        try:
            self._scrapyd = ScrapydAPI('http://{}:{}'.format(
                config['Scrapyd']['host'], config['Scrapyd']['port']))
        except KeyError as e:
            logger.error("{}: No such key exists - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to create a scrapyd object - {}".format(
                class_fullname(e), str(e)))

    def addversion(self, project, version,
                   egg_filename='pricewatch_bot-0.0.1-py3.7.egg'):
        """ Scrapyd API: addversion
            https://scrapyd.readthedocs.io/en/stable/api.html#addversion-json
        """
        if not self._scrapyd:
            logger.error("No scrapyd object found. Unable to add a new version.")
            return None
        num_of_spiders = None
        try:
            with open(os.path.join(settings.APP_DIST_DIRPATH, egg_filename),
                      'rb') as egg:
                num_of_spiders = self._scrapyd.add_version(project, version, egg)
        except FileNotFoundError as e:
            logger.error("{}: {}".format(class_fullname(e), str(e)))
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to add a version - {}".format(
                class_fullname(e), str(e)))
        else:
            logger.info(
                "version '{}' for project '{}' added/updated - {} spider(s)".format(
                    project, version, num_of_spiders))
            # call API to create a version
            response = requests.post(
                'http://{}:{}/api/schedule/version/'.format(
                    config['PriceWatchWeb']['host'],
                    config['PriceWatchWeb']['port']),
                json={
                    'project': project,
                    'version': version,
                    'status': settings.SCHEDULES_VERSION_STATUS_ADDED,
                    'added_at': str(datetime.now()),
                    'deleted_at': None,
                })
            if not response.ok:
                logger.error(
                    "{} HTTP Error: Failed to add a version - {} - {}".format(
                        response.status_code, response.reason, response.text))
        finally:
            return num_of_spiders

    def schedule(self, project, spider, **kwargs):
        if not self._scrapyd:
            logger.error("No scrapyd object found. Unable to schedule a job.")
            return None
        _jobid = str(uuid.uuid4())
        kwargs['jobid'] = _jobid   # a scrapyd parameter
        kwargs['job_id'] = _jobid  # passed through to the spider
        jobid = None
        try:
            _s = None  # scrapy settings as a dict, e.g. {'DOWNLOAD_DELAY': 2}
            jobid = self._scrapyd.schedule(project, spider, settings=_s, **kwargs)
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to schedule a job - {}".format(
                class_fullname(e), str(e)))
        else:
            if jobid != _jobid:
                logger.error(
                    "Invalid jobid [entered vs returned] [{} vs {}]".format(
                        _jobid, jobid))
            else:
                logger.info(
                    "new scheduled job '{}' for project '{}', spider '{}' has been set".format(
                        jobid, project, spider))
                # call API to create a job
                response = requests.post(
                    'http://{}:{}/api/schedule/job/'.format(
                        config['PriceWatchWeb']['host'],
                        config['PriceWatchWeb']['port']),
                    json={
                        'job_id': jobid,
                        'project': project,
                        'spider': spider,
                        'version': kwargs.pop('_version', None),
                        'settings': _s,
                        'other_params': kwargs,
                        'status': settings.SCHEDULES_JOB_STATUS_PENDING,
                    })
                if not response.ok:
                    logger.error(
                        "{} HTTP Error: Failed to add a new job - {} - {}".format(
                            response.status_code, response.reason, response.text))
        finally:
            return jobid

    def listjobs(self, project):
        if not self._scrapyd:
            logger.error("No scrapyd object found. Unable to list jobs.")
Unable to list jobs.") return None jobs = None try: jobs = self._scrapyd.list_jobs(project) except ScrapydResponseError as e: logger.error("{}: Response error - {}".format( class_fullname(e), str(e))) except Exception as e: logger.error("{}: Failed to list jobs - {}".format( class_fullname(e), str(e))) else: logger.info("list of jobs for project '{}' - {}".format( project, str(jobs))) self._store_jobs(project, jobs) finally: return jobs def _store_jobs(self, project, jobs): """ parse jobs and store information into db """ if all(_j in jobs for _j in ['running', 'finished']): for x in jobs['running']: # call API to update a running job response = requests.put( 'http://{}:{}/api/schedule/job/{}/'.format( config['PriceWatchWeb']['host'], config['PriceWatchWeb']['port'], x['id']), json={ 'job_id': x['id'], 'project': project, 'spider': x['spider'], 'start_time': x['start_time'], 'status': settings.SCHEDULES_JOB_STATUS_RUNNING, }) if not response.ok: logger.error( "{} HTTP Error: Failed to update a running job - {} - {}" .format(response.status_code, response.reason, response.text)) for x in jobs['finished']: # call API to update a finished job response = requests.put( 'http://{}:{}/api/schedule/job/{}/'.format( config['PriceWatchWeb']['host'], config['PriceWatchWeb']['port'], x['id']), json={ 'job_id': x['id'], 'project': project, 'spider': x['spider'], 'start_time': x['start_time'], 'end_time': x['end_time'], 'status': settings.SCHEDULES_JOB_STATUS_FINISHED, }) if not response.ok: logger.error( "{} HTTP Error: Failed to update a finished job - {} - {}" .format(response.status_code, response.reason, response.text)) def delversion(self, project, version): """ delversion """ if not self._scrapyd: logger.error("No scrapyd object find. Unable to delete version.") return False deleted = False try: deleted = self._scrapyd.delete_version(project, version) except ScrapydResponseError as e: logger.error("{}: Response error - {}".format( class_fullname(e), str(e))) except Exception as e: logger.error("{}: Failed to delete version - {}".format( class_fullname(e), str(e))) else: logger.info( "successfully deleted project '{}' version '{}'".format( project, version)) # update deleted version response = requests.put( 'http://{}:{}/api/schedule/version/'.format( config['PriceWatchWeb']['host'], config['PriceWatchWeb']['port']), json={ 'project': project, 'version': version, 'status': settings.SCHEDULES_VERSION_STATUS_DELETED, 'deleted_at': str(datetime.now()), }) if not response.ok: logger.error( "{} HTTP Error: Failed to update a deleted version - {} - {}" .format(response.status_code, response.reason, response.text)) finally: return deleted def delproject(self, project): """ delproject """ if not self._scrapyd: logger.error("No scrapyd object find. 
Unable to delete version.") return False deleted = False try: deleted = self._scrapyd.delete_project(project) except ScrapydResponseError as e: logger.error("{}: Response error - {}".format( class_fullname(e), str(e))) except Exception as e: logger.error("{}: Failed to delete project - {}".format( class_fullname(e), str(e))) else: logger.info("successfully deleted project '{}'".format(project)) # update deleted project response = requests.put( 'http://{}:{}/api/schedule/version/'.format( config['PriceWatchWeb']['host'], config['PriceWatchWeb']['port']), json={ 'project': project, 'status': settings.SCHEDULES_VERSION_STATUS_DELETED, 'deleted_at': str(datetime.now()), }) if not response.ok: logger.error( "{} HTTP Error: Failed to update deleted project - {} - {}" .format(response.status_code, response.reason, response.text)) finally: return deleted def close(self): self._scrapyd.client.close()