def call_scrapyd_service():
    """Drive the spiders through the Scrapyd API.

    Reference: https://pypi.python.org/pypi/python-scrapyd-api#downloads
    """
    scrapyd = ScrapydAPI('http://localhost:6800')
    # Check the execution status of a specific crawl job
    scrapyd.job_status('govbuyscrapy', '0c838fd4b9f111e6abcc14dda97ae760')
    # List the crawl jobs of the project
    scrapyd.list_jobs('govbuyscrapy')
    # Run the given spider in the given project
    scrapyd.schedule('govbuyscrapy', 'govbuy_wan_shucheng')
def post(self, request):
    """
    :param request: request parameters for starting or stopping a spider
    :return: whether the spider operation succeeded
    """
    data = request.data
    spider_name = data.get("spider_name")
    spider_type = data.get("spider_type")
    if spider_type == "start":
        try:
            # Connect to the Scrapyd service that hosts the deployed spiders
            scrapyd = ScrapydAPI('http://localhost:6800')
            # Start the spider
            scrapyd.schedule('default', spider_name)
        except Exception:
            return Response("failed")
    else:
        try:
            scrapyd = ScrapydAPI('http://localhost:6800')
            # Collect the ids of pending and running jobs for this spider, then cancel them
            del_dict = scrapyd.list_jobs('default')
            del_jobs = []
            for k in ["pending", "running"]:
                for item in del_dict[k]:
                    if item.get("spider") == spider_name:
                        del_jobs.append(item.get("id"))
            for job_id in del_jobs:
                scrapyd.cancel('default', job_id)
        except Exception:
            return Response("failed")
    return Response("ok")
class Spider(Resource):
    def __init__(self, url):
        self.scrapyd = ScrapydAPI(url)

    def get(self):
        data = []
        for p in self.scrapyd.list_projects():
            spiders = reduce_spiders(listjobs(self.scrapyd.list_jobs(p), p))
            data.extend(spiders)
        return data

    def post(self, name):
        p, s = name.split('.')
        jobs = self.scrapyd.list_jobs(p)
        for job in (jobs['running'] + jobs['pending']):
            if job['spider'] == s:
                return 'Already Running'
        return self.scrapyd.schedule(p, s)
def jobs_remove(project_id):
    servers = agent.servers
    project = Project.query.filter(Project.id == project_id).first()
    db.session.execute('pragma foreign_keys=on')
    for job_instance in JobInstance.query.filter_by(project_id=project_id):
        db.session.delete(job_instance)
    db.session.commit()
    for server in servers:
        scrapyd = ScrapydAPI(server)
        for job in scrapyd.list_jobs(project.project_name)['pending']:
            jobid = job['id']
            prev_status = scrapyd.cancel(project.project_name, jobid, signal='KILL')
        for job in scrapyd.list_jobs(project.project_name)['running']:
            jobid = job['id']
            prev_status = scrapyd.cancel(project.project_name, jobid, signal='KILL')
    return redirect(request.referrer, code=302)
def job_list(request, id, project):
    if request.method == 'GET':
        client = Client.objects.get(id=id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        result = scrapyd.list_jobs(project)
        jobs = []
        statuses = ['pending', 'running', 'finished']
        for status in statuses:
            for job in result.get(status):
                job['status'] = status
                jobs.append(job)
        return HttpResponse(json.dumps(jobs))
def list_jobs(project, url=DEFAULT_URL):
    """
    @param project: scrapy project name
    @param url: the url which the target scrapyd daemon listens on

    To schedule a spider run:
    curl http://localhost:6800/schedule.json -d project=myproject -d spider=spider2
    """
    scrapyd = ScrapydAPI(url)
    return scrapyd.list_jobs(project)
class Scraper:
    def __init__(self):
        self.client = ScrapydAPI("http://scrapyd:6800", timeout=10)
        self.project = 'default'

    def schedule_spider(self, spider_name: str):
        print(f"RUN SPIDER: {spider_name}")
        return self.client.schedule(self.project, spider_name)

    def cancel_job(self, job_id: str):
        return self.client.cancel(self.project, job_id)

    def get_status_of_job(self, job_id: str):
        return self.client.job_status(self.project, job_id)

    def get_all_jobs(self):
        return self.client.list_jobs(self.project)

    def get_all_spiders(self):
        return self.client.list_spiders(self.project)
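A minimal usage sketch for the Scraper wrapper above. It assumes a Scrapyd service reachable at http://scrapyd:6800 with the "default" project deployed; the spider name "example_spider" is only a placeholder, not taken from the snippet.

# Usage sketch (placeholder spider name, Scrapyd reachable as configured above)
scraper = Scraper()
job_id = scraper.schedule_spider("example_spider")  # returns the Scrapyd job id
print(scraper.get_status_of_job(job_id))            # '', 'pending', 'running' or 'finished'
print(scraper.get_all_jobs())                        # {'pending': [...], 'running': [...], 'finished': [...]}
scraper.cancel_job(job_id)                           # cancel the job again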
def job_list(request, client_id, project_name):
    """
    get job list of project from one client
    :param request: request object
    :param client_id: client id
    :param project_name: project name
    :return: list of jobs
    """
    if request.method == 'GET':
        client = Client.objects.get(id=client_id)
        scrapyd = ScrapydAPI(scrapyd_url(client.ip, client.port))
        try:
            result = scrapyd.list_jobs(project_name)
            jobs = []
            statuses = ['pending', 'running', 'finished']
            for status in statuses:
                for job in result.get(status):
                    job['status'] = status
                    jobs.append(job)
            # safe=False is required because the top-level JSON value is a list
            return JsonResponse(jobs, safe=False)
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
def job_list(request, node_id, project_name):
    """
    get job list of project from one node
    :param request: request object
    :param node_id: node id
    :param project_name: project name
    :return: list of jobs
    """
    if request.method == 'GET':
        node = Node.objects.get(id=node_id)
        scrapyd = ScrapydAPI(scrapyd_url(node.node_ip, node.node_port))
        try:
            result = scrapyd.list_jobs(project_name)
            jobs = []
            statuses = ['pending', 'running', 'finished']
            for status in statuses:
                for job in result.get(status):
                    job['status'] = status
                    jobs.append(job)
            return JsonResponse({"result": 1, "jobs": jobs})
        except ConnectionError:
            return JsonResponse({'message': 'Connect Error'}, status=500)
def getRunServer(deployProject='offlineCheckSpiders'):
    """
    :return: the server with the fewest pending and running jobs; for now, tasks
             are assigned one at a time. If every server exceeds the maximum task
             count, no task is added.
    """
    servers = settings.SCRAPYD_URLS
    minTaskServer = None
    minTasks = -1
    for server in servers:
        try:
            scrapyd = ScrapydAPI(server, timeout=8)
            jobs = scrapyd.list_jobs(project=deployProject)
            taskNums = len(jobs.get('pending', [])) + len(jobs.get('running', []))
            print("server: %s Running tasks is %s" % (server, taskNums))
            if taskNums < scrapydBatchSize // 2:
                return server
            if taskNums < minTasks or minTasks < 0:
                minTaskServer = server
                minTasks = taskNums
        except BaseException as e:
            print(" %s this server is not deployed, %s" % (server, e))
    return minTaskServer
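A sketch of how the load-balancing helper above might be used: pick the least-loaded server, then schedule a spider on it. It assumes settings.SCRAPYD_URLS and scrapydBatchSize are configured as in the snippet; the spider name "offline_check" is a placeholder.

from scrapyd_api import ScrapydAPI

# Usage sketch: schedule one job on the least-loaded Scrapyd server
server = getRunServer('offlineCheckSpiders')
if server is not None:
    scrapyd = ScrapydAPI(server, timeout=8)
    job_id = scrapyd.schedule('offlineCheckSpiders', 'offline_check')  # placeholder spider name
    print("scheduled job %s on %s" % (job_id, server))
else:
    print("no Scrapyd server is currently reachable")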
class ScrapydProxy(SpiderServiceProxy):
    '''
    Service class for a single Scrapyd node.
    Inherits from the single-spider-service base class and implements its interface.
    '''

    def __init__(self, server):
        self.spider_status_name_dict = {
            SpiderStatus.PENDING: 'pending',
            SpiderStatus.RUNNING: 'running',
            SpiderStatus.FINISHED: 'finished'
        }
        super(ScrapydProxy, self).__init__(server)  # run the parent-class initialiser
        self.scrapyd_api = ScrapydAPI(self._scrapyd_url())  # instantiate ScrapydAPI

    def _scrapyd_url(self):
        return self.server  # the Scrapyd url held by the base class

    def list_projects(self):
        """
        List all projects on this Scrapyd node as project-name strings
        (get_project_list returns objects instead).
        """
        return self.scrapyd_api.list_projects()

    def get_project_list(self):
        """
        Fetch all crawler projects.
        :return: a list of Project objects
        """
        data = self.scrapyd_api.list_projects()  # all projects on the Scrapyd node
        result = []
        if data:
            for project_name in data:
                project = Project()  # instantiate a project object
                project.project_name = project_name
                result.append(project)
        return result

    def delete_project(self, project_name):
        """
        Delete the given project from Scrapyd.
        :param project_name: project name
        """
        try:
            # returns True once the existing project has been deleted
            return self.scrapyd_api.delete_project(project_name)
        except Exception:
            return False

    def get_slave_spider_list(self, project_name):
        try:
            # list the names of all spiders in the given project
            data = self.scrapyd_api.list_spiders(project_name)
            return data if data else []
        except Exception:
            return []

    def get_spider_list(self, project_name):
        """
        Fetch the names of all spiders in the given project.
        :param project_name: project name
        :return: a list of SpiderInstance objects
        """
        try:
            data = self.scrapyd_api.list_spiders(project_name)
            result = []
            if data:
                for spider_name in data:
                    spider_instance = SpiderInstance()
                    spider_instance.spider_name = spider_name
                    result.append(spider_instance)
            return result
        except Exception:
            return []

    def get_daemon_status(self):
        pass

    def get_job_list(self, project_name, spider_status=None):
        """
        Fetch the status of every job of a project from Scrapyd.
        :param project_name: project name
        :param spider_status: job status; None (the default) returns all statuses,
                              otherwise only jobs with the given status are returned
        """
        result = {
            SpiderStatus.PENDING: [],
            SpiderStatus.RUNNING: [],
            SpiderStatus.FINISHED: []
        }
        try:
            data = self.scrapyd_api.list_jobs(project_name)
            if data:
                for _status in self.spider_status_name_dict.keys():
                    for item in data[self.spider_status_name_dict[_status]]:
                        start_time, end_time = None, None
                        if item.get('start_time'):
                            start_time = datetime.datetime.strptime(
                                item['start_time'], '%Y-%m-%d %H:%M:%S.%f')
                        if item.get('end_time'):
                            end_time = datetime.datetime.strptime(
                                item['end_time'], '%Y-%m-%d %H:%M:%S.%f')
                        result[_status].append(
                            dict(id=item['id'],
                                 start_time=start_time,
                                 end_time=end_time))
            return result if not spider_status else result[spider_status]
        except Exception:
            return result

    def start_spider(self, project_name, spider_name):
        """
        Start the given spider of the given project.
        :param project_name: project name
        :param spider_name: spider name
        :return: the job id of the started spider, or None on failure
        """
        data = self.scrapyd_api.schedule(project_name, spider_name,
                                         pro_name=project_name)
        return data if data else None

    def cancel_spider(self, project_name, job_id):
        """
        Cancel the given job of the given project.
        :param project_name: project name (str)
        :param job_id: job id (str)
        :return: True if the job was cancelled, False otherwise
        """
        data = self.scrapyd_api.cancel(project_name, job_id)
        return data is not None

    def deploy(self, project_name, file_path):
        """
        Deploy an uploaded egg to Scrapyd.
        :param project_name: project name (str)
        :param file_path: path of the egg file (str)
        :return: a string with the project information on success, False otherwise
        """
        egg = open(file_path, 'rb')
        version = int(time.time())
        spider_num = self.scrapyd_api.add_version(project_name, int(time.time()), egg)
        egg.close()
        ret = {
            'version': version,
            'project': project_name,
            'spiders': spider_num,
            'node_name': socket.gethostname(),
            'status': 'ok' if spider_num else 'error'
        }
        return str(ret) if spider_num else False

    def log_url(self, project_name, spider_name, job_id):
        """
        Build the url of a spider's log file.
        :param project_name: project name (str)
        :param spider_name: spider name (str)
        :param job_id: job id (str)
        :return: the url of the log file (str)
        """
        return self._scrapyd_url() + '/logs/%s/%s/%s.log' % (
            project_name, spider_name, job_id)
class Overseer(object):
    """
    Overseer facilitate the deployment process of local spiders to a remote scrapyd server

    Available methods:
    spawn_spiders
        Create spider and deploy them to remote scrapyd server
    get_status
        Report the current status of the remote scrapyd server
    """

    DEFAULT_TYPE = 'sell'
    DEFAULT_VENDOR = 'None'

    def __init__(self, name, spider_name, host, mongodb_credentials):
        self.server = ScrapydAPI(host)
        self.host_name = self._strip_host_name(host)
        self.birth_date = datetime.utcnow()
        self.name = name
        self.spider_name = spider_name
        self.alive = True
        client = pymongo.MongoClient(mongodb_credentials['server'],
                                     mongodb_credentials['port'],
                                     connectTimeoutMS=30000,
                                     socketTimeoutMS=None,
                                     socketKeepAlive=True)
        db = client[mongodb_credentials['database']]
        self.collection = db[mongodb_credentials['collection']]

    def kill(self):
        self.alive = False
        return self.host_name

    def heartbeat(self):
        return self.alive

    def spawn_spiders(self, num_spiders=5, items_per_spider=100, **kwargs):
        type = kwargs.get('type', self.DEFAULT_TYPE)
        vendor = kwargs.get('vendor', self.DEFAULT_VENDOR)
        count = 0
        while count < num_spiders:
            count += 1
            self._spawn(vendor, type, items_per_spider)
            time.sleep(3)

    def get_status(self):
        """
        Return:
            the number of running spiders
            the number of finished spiders
            the average time for one spider to finish
        """
        status = self.server.list_jobs(self.name)
        running = status['running']
        finished = status['finished']
        finished_times = [self._time_diff_in_minute(job['end_time'], job['start_time'])
                          for job in finished]
        avg_time = np.average(finished_times)
        Notification('{} - [{}] \t Running Spiders = {}, Finished Spiders = {}, Average Runtime = {}'
                     .format(datetime.utcnow(), self.host_name,
                             len(running), len(finished), avg_time)
                     .expandtabs(3)
                     ).info()
        return len(running), len(finished), avg_time

    def _spawn(self, vendor, type, items_per_spider=100):
        # Get the tasks from the database
        tasks = self._get_tasks_from_database(vendor, type, items_per_spider)
        if not tasks:
            raise ValueError('There is no more task from the database!')
        links, property_ids = zip(*tasks)

        # Schedule the tasks with the remote scrapyd server
        job_id = self.server.schedule(self.name, self.spider_name,
                                      vendor=vendor,
                                      crawl_url=','.join(links),
                                      type=type)
        Notification('{} - [{}] \t Launch spider {}'
                     .format(datetime.utcnow(), self.host_name, job_id)
                     .expandtabs(3)
                     ).success()

        # Clear the tasks from the database
        self._clear_tasks_from_database(vendor, type, property_ids)

    def _get_tasks_from_database(self, vendor, type, items_per_spider):
        cursor = self.collection \
            .find({"last_crawled_date": None, "type": type, "vendor": vendor}) \
            .sort("created_date", pymongo.ASCENDING) \
            .limit(items_per_spider)
        tasks = [(item['link'], item['property_id']) for item in cursor]
        return tasks

    def _clear_tasks_from_database(self, vendor, type, property_ids):
        self.collection.update({"vendor": vendor,
                                "type": type,
                                "property_id": {"$in": property_ids}},
                               {"$set": {"last_crawled_date": datetime.utcnow()}},
                               multi=True, upsert=False)

    @staticmethod
    def _time_diff_in_minute(current, previous):
        return ((parser.parse(current) - parser.parse(previous)).seconds // 60) % 60

    @staticmethod
    def _strip_host_name(host):
        return host.replace('http://', '').replace('.compute.amazonaws.com:6800', '')
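A usage sketch for the Overseer above. Every value below (MongoDB connection details, project and spider names, Scrapyd host) is a placeholder, not taken from the original snippet; a reachable Scrapyd server and a populated task collection are assumed.

# Usage sketch (all connection details are placeholders)
mongodb_credentials = {
    'server': 'localhost', 'port': 27017,
    'database': 'crawl_db', 'collection': 'tasks',
}
overseer = Overseer(name='default',
                    spider_name='listing_spider',
                    host='http://localhost:6800',
                    mongodb_credentials=mongodb_credentials)
overseer.spawn_spiders(num_spiders=2, items_per_spider=50, vendor='acme', type='sell')
running, finished, avg_time = overseer.get_status()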
class Schedular:
    def __init__(self):
        self._scrapyd = None
        try:
            self._scrapyd = ScrapydAPI('http://{}:{}'.format(
                config['Scrapyd']['host'], config['Scrapyd']['port']))
        except KeyError as e:
            logger.error("{}: No such key exists - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to create a scrapyd object - {}".format(
                class_fullname(e), str(e)))

    def addversion(self, project, version,
                   egg_filename='pricewatch_bot-0.0.1-py3.7.egg'):
        """ Scrapyd API: addversion - https://scrapyd.readthedocs.io/en/stable/api.html#addversion-json """
        if not self._scrapyd:
            logger.error("No scrapyd object found. Unable to add a new version.")
            return None
        num_of_spiders = None
        try:
            with open(os.path.join(settings.APP_DIST_DIRPATH, egg_filename),
                      'rb') as egg:
                num_of_spiders = self._scrapyd.add_version(project, version, egg)
        except FileNotFoundError as e:
            logger.error("{}: {}".format(class_fullname(e), str(e)))
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to add a version - {}".format(
                class_fullname(e), str(e)))
        else:
            logger.info("version '{}' for project '{}' added/updated - {} spider(s)"
                        .format(project, version, num_of_spiders))
            # call API to create a version
            response = requests.post(
                'http://{}:{}/api/schedule/version/'.format(
                    config['PriceWatchWeb']['host'],
                    config['PriceWatchWeb']['port']),
                json={
                    'project': project,
                    'version': version,
                    'status': settings.SCHEDULES_VERSION_STATUS_ADDED,
                    'added_at': str(datetime.now()),
                    'deleted_at': None,
                })
            if not response.ok:
                logger.error("{} HTTP Error: Failed to add a version - {} - {}".format(
                    response.status_code, response.reason, response.text))
        finally:
            return num_of_spiders

    def schedule(self, project, spider, **kwargs):
        if not self._scrapyd:
            logger.error("No scrapyd object found. Unable to schedule a job.")
            return None
        _jobid = str(uuid.uuid4())
        kwargs['jobid'] = _jobid  # a scrapyd parameter
        kwargs['job_id'] = _jobid  # passed through to the spider
        jobid = None  # keep defined even if scheduling fails
        try:
            _s = None  # scrapy settings in dict, e.g. {'DOWNLOAD_DELAY': 2}
            jobid = self._scrapyd.schedule(project, spider, settings=_s, **kwargs)
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to schedule a job - {}".format(
                class_fullname(e), str(e)))
        else:
            if jobid != _jobid:
                logger.error("Invalid jobid [entered id vs returned id] [{} vs {}]"
                             .format(_jobid, jobid))
            else:
                logger.info(
                    "new scheduled job '{}' for project '{}', spider '{}' has been set"
                    .format(jobid, project, spider))
                # call API to create a job
                response = requests.post(
                    'http://{}:{}/api/schedule/job/'.format(
                        config['PriceWatchWeb']['host'],
                        config['PriceWatchWeb']['port']),
                    json={
                        'job_id': jobid,
                        'project': project,
                        'spider': spider,
                        'version': kwargs.pop('_version', None),
                        'settings': _s,
                        'other_params': kwargs,
                        'status': settings.SCHEDULES_JOB_STATUS_PENDING,
                    })
                if not response.ok:
                    logger.error("{} HTTP Error: Failed to add a new job - {} - {}"
                                 .format(response.status_code, response.reason,
                                         response.text))
        finally:
            return jobid

    def listjobs(self, project):
        if not self._scrapyd:
            logger.error("No scrapyd object found. Unable to list jobs.")
            return None
        jobs = None
        try:
            jobs = self._scrapyd.list_jobs(project)
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to list jobs - {}".format(
                class_fullname(e), str(e)))
        else:
            logger.info("list of jobs for project '{}' - {}".format(
                project, str(jobs)))
            self._store_jobs(project, jobs)
        finally:
            return jobs

    def _store_jobs(self, project, jobs):
        """ parse jobs and store information into db """
        if all(_j in jobs for _j in ['running', 'finished']):
            for x in jobs['running']:
                # call API to update a running job
                response = requests.put(
                    'http://{}:{}/api/schedule/job/{}/'.format(
                        config['PriceWatchWeb']['host'],
                        config['PriceWatchWeb']['port'], x['id']),
                    json={
                        'job_id': x['id'],
                        'project': project,
                        'spider': x['spider'],
                        'start_time': x['start_time'],
                        'status': settings.SCHEDULES_JOB_STATUS_RUNNING,
                    })
                if not response.ok:
                    logger.error(
                        "{} HTTP Error: Failed to update a running job - {} - {}"
                        .format(response.status_code, response.reason, response.text))
            for x in jobs['finished']:
                # call API to update a finished job
                response = requests.put(
                    'http://{}:{}/api/schedule/job/{}/'.format(
                        config['PriceWatchWeb']['host'],
                        config['PriceWatchWeb']['port'], x['id']),
                    json={
                        'job_id': x['id'],
                        'project': project,
                        'spider': x['spider'],
                        'start_time': x['start_time'],
                        'end_time': x['end_time'],
                        'status': settings.SCHEDULES_JOB_STATUS_FINISHED,
                    })
                if not response.ok:
                    logger.error(
                        "{} HTTP Error: Failed to update a finished job - {} - {}"
                        .format(response.status_code, response.reason, response.text))

    def delversion(self, project, version):
        """ delversion """
        if not self._scrapyd:
            logger.error("No scrapyd object found. Unable to delete version.")
            return False
        deleted = False
        try:
            deleted = self._scrapyd.delete_version(project, version)
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to delete version - {}".format(
                class_fullname(e), str(e)))
        else:
            logger.info("successfully deleted project '{}' version '{}'".format(
                project, version))
            # update deleted version
            response = requests.put(
                'http://{}:{}/api/schedule/version/'.format(
                    config['PriceWatchWeb']['host'],
                    config['PriceWatchWeb']['port']),
                json={
                    'project': project,
                    'version': version,
                    'status': settings.SCHEDULES_VERSION_STATUS_DELETED,
                    'deleted_at': str(datetime.now()),
                })
            if not response.ok:
                logger.error(
                    "{} HTTP Error: Failed to update a deleted version - {} - {}"
                    .format(response.status_code, response.reason, response.text))
        finally:
            return deleted

    def delproject(self, project):
        """ delproject """
        if not self._scrapyd:
            logger.error("No scrapyd object found. Unable to delete project.")
            return False
        deleted = False
        try:
            deleted = self._scrapyd.delete_project(project)
        except ScrapydResponseError as e:
            logger.error("{}: Response error - {}".format(
                class_fullname(e), str(e)))
        except Exception as e:
            logger.error("{}: Failed to delete project - {}".format(
                class_fullname(e), str(e)))
        else:
            logger.info("successfully deleted project '{}'".format(project))
            # update deleted project
            response = requests.put(
                'http://{}:{}/api/schedule/version/'.format(
                    config['PriceWatchWeb']['host'],
                    config['PriceWatchWeb']['port']),
                json={
                    'project': project,
                    'status': settings.SCHEDULES_VERSION_STATUS_DELETED,
                    'deleted_at': str(datetime.now()),
                })
            if not response.ok:
                logger.error(
                    "{} HTTP Error: Failed to update deleted project - {} - {}"
                    .format(response.status_code, response.reason, response.text))
        finally:
            return deleted

    def close(self):
        self._scrapyd.client.close()
class ScrapydJob(object):
    def __init__(self, scrapyd_host="localhost", scrapyd_port="6800",
                 project="default", spider="website_finder",
                 screenshot_dir='/memex-pinterest/ui/static/images/screenshots'):
        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.screenshot_dir = screenshot_dir

    def schedule(self, seed):
        if not self.screenshot_dir:
            raise Exception("Please set the screenshot path in the config before scheduling")
        self.job_id = self.scrapi.schedule(self.project, self.spider,
                                           seed_urls=seed,
                                           screenshot_dir=self.screenshot_dir)
        return self.job_id

    def schedule_keywords(self, phrases, use_splash=True):
        """ Schedule a Scrapyd job """
        if not self.screenshot_dir:
            raise Exception("Please set the screenshot path in the config before scheduling")
        self.job_id = self.scrapi.schedule(self.project, self.spider,
                                           phrases=phrases,
                                           screenshot_dir=self.screenshot_dir,
                                           use_splash=int(use_splash))
        return self.job_id

    def list_jobs(self):
        return self.scrapi.list_jobs(self.project)

    def get_state(self, job_id):
        try:
            for job in self.scrapi.list_jobs(self.project)["running"]:
                print(job_id, job["id"])
                if job["id"] == job_id:
                    return "Running"
            for job in self.scrapi.list_jobs(self.project)["pending"]:
                print(job_id, job["id"])
                if job["id"] == job_id:
                    return "Pending"
        except Exception:
            print("handled exception:")
            traceback.print_exc()
            return None
        return "Done"
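A minimal usage sketch for the ScrapydJob helper above. It assumes a Scrapyd instance on localhost:6800 with the "default" project and "website_finder" spider deployed; the seed URL is a placeholder.

import time

# Usage sketch: schedule a seed URL and poll until the job leaves the queue
job = ScrapydJob()
job_id = job.schedule("https://example.com")
while job.get_state(job_id) in ("Pending", "Running"):
    time.sleep(5)
print("final state:", job.get_state(job_id))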
from scrapyd_api import ScrapydAPI

scrapyd = ScrapydAPI('http://127.0.0.1:6800')

# List the projects uploaded to Scrapyd
list_projects = scrapyd.list_projects()
print(list_projects)

# List the spiders of a project
list_spiders = scrapyd.list_spiders('wangYiStocks')
print(list_spiders)

# List the job ids of a project
list_jobs = scrapyd.list_jobs('wangYiStocks')
print(list_jobs)

# Cancel a job by its id
cancel = scrapyd.cancel('wangYiStocks', '7c8be8661d4c11ea95d06c4b903122b5')
print(cancel)
class ScrapydLoginFinderJob(object):
    def __init__(self, seed_url, username, password, db_name,
                 scrapyd_host="localhost", scrapyd_port="6800",
                 project="default", spider="login_finder"):
        scrapy_url = "http://" + scrapyd_host + ":" + str(scrapyd_port)
        self.scrapi = ScrapydAPI(scrapy_url)
        self.project = project
        self.spider = spider
        self.seed_url = seed_url
        self.username = username
        self.password = password
        self.db_name = db_name

    def schedule(self):
        self.job_id = self.scrapi.schedule(self.project, self.spider,
                                           seed_url=self.seed_url,
                                           username=self.username,
                                           password=self.password,
                                           db_name=self.db_name)
        return self.job_id

    def list_jobs(self):
        return self.scrapi.list_jobs(self.project)

    def get_state(self):
        if not hasattr(self, "job_id"):
            # schedule() must be called first so that job_id exists
            raise Exception("You must schedule a job before getting the state!")
        try:
            for job in self.scrapi.list_jobs(self.project)["running"]:
                print(self.job_id, job["id"])
                if job["id"] == self.job_id:
                    return "Running"
            for job in self.scrapi.list_jobs(self.project)["pending"]:
                print(self.job_id, job["id"])
                if job["id"] == self.job_id:
                    return "Pending"
        except Exception:
            print("handled exception:")
            traceback.print_exc()
            return None
        return "Done"

    def block_until_done(self, timeout=120):
        exec_time = 0
        while True:
            exec_time += 1
            if exec_time == timeout:
                raise Exception("Timeout time reached for login_finder spider execution")
            time.sleep(1)
            state = self.get_state()
            if state == "Done":
                break
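A usage sketch for ScrapydLoginFinderJob. The credentials, seed URL and database name below are placeholders; a local Scrapyd instance with the login_finder spider deployed is assumed.

# Usage sketch (placeholder credentials and URLs)
login_job = ScrapydLoginFinderJob(seed_url="https://example.com/login",
                                  username="demo_user",
                                  password="demo_pass",
                                  db_name="crawl_results")
login_job.schedule()
login_job.block_until_done(timeout=120)  # raises if the spider takes too long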
from scrapyd_api import ScrapydAPI

scrapyd = ScrapydAPI('http://127.0.0.1:6800')
scrapyd.list_jobs('project_name')
from scrapyd_api import ScrapydAPI

scrapyd = ScrapydAPI('http://localhost:6800')
scrapyd.list_jobs('blogSpider')
class Scrapyd_Control(object):
    def __init__(self):
        scrapyd_url = input('Enter the scrapyd address: ')
        project = input('Enter the project name: ')
        self.project = project
        self.scrapyd = ScrapydAPI(scrapyd_url)

    # start a spider
    def schedule(self):
        spider = input('Enter the spider name: ')
        return {
            'project': self.project,
            'spider': spider,
            'jobid': self.scrapyd.schedule(self.project, spider)
        }

    start, run = schedule, schedule

    # cancel a spider
    def cancel(self):
        jobid = input('Paste the jobid of the spider to cancel: ')
        return self.scrapyd.cancel(self.project, jobid)

    # list projects
    def listprojects(self):
        return self.scrapyd.list_projects()

    # list spiders
    def listspiders(self):
        return self.scrapyd.list_spiders(self.project)

    # list all jobs
    def listjobs(self):
        return self.scrapyd.list_jobs(self.project)

    # check job status
    def jobstatus(self):
        jobid = input('Paste the jobid to inspect: ')
        return self.scrapyd.job_status(self.project, jobid)

    # list versions
    def listversions(self):
        return self.scrapyd.list_versions(self.project)

    # delete a version
    def delversion(self):
        version_name = input('Paste the version to delete: ')
        yes = input('Confirm deleting version {}? Type yes to delete, or press Enter to skip\n'.format(version_name))
        if yes == 'yes':
            return self.scrapyd.delete_version(self.project, version_name)

    # delete the project
    def delproject(self):
        yes = input('Confirm deleting project {}? Type yes to delete, or press Enter to skip\n'.format(self.project))
        if yes == 'yes':
            return self.scrapyd.delete_project(self.project)

    # list all commands
    def help(self):
        print("""
        start a spider       schedule|start|run
        cancel a spider      cancel
        list projects        listprojects
        list spiders         listspiders
        list all jobs        listjobs
        check job status     jobstatus
        list versions        listversions
        delete a version     delversion
        delete the project   delproject
        list all commands    help
        """)
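A sketch of a small interactive loop driving the Scrapyd_Control class above. The loop itself is not part of the original snippet; it simply dispatches the typed command to the method of the same name (the class already aliases start/run to schedule).

# Usage sketch: interactive dispatch loop for Scrapyd_Control
control = Scrapyd_Control()
control.help()
while True:
    command = input('command (or "quit"): ').strip()
    if command == 'quit':
        break
    method = getattr(control, command, None)
    if callable(method):
        result = method()
        if result is not None:
            print(result)
    else:
        print('unknown command, type "help" for the list')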