def start(self):
    """Start this worker.

    On start, the spider's start_tasks are moved to the pending crawl
    queue. A started worker will not be started twice.
    """
    if self.is_started:
        self.logger.warn("duplicate start")
    else:
        self.is_started = True
        self.worker_statistic.start_time = datetime.datetime.now()
        try:
            # Record this worker as running so it can be recovered later.
            RecorderManager.instance().record_doing(record(
                self._worker_name,
                self.worker_statistic.start_time.strftime(
                    "%Y-%m-%d %H:%M:%S"),
                get_class_path(self.spider.crawl_schedule.__class__),
                self.spider.crawl_schedule.schedule_kwargs,
                get_class_path(self.spider.__class__),
                self.spider.spider_kwargs))
        except Exception as e:
            self.logger.warn("record worker failed: %s" % e)
        _move_start_tasks_to_crawl_schedule(self.spider.start_tasks,
                                            self.spider.crawl_schedule)
        ioloop.IOLoop.instance().add_timeout(
            datetime.timedelta(
                milliseconds=self.spider.crawl_schedule.interval),
            self.loop_get_and_execute)
        self.logger.info("start worker")
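# For context, a minimal sketch of how this lifecycle is typically driven,
# assuming a Worker class exposing start() and hypothetical
# MemoryCrawlSchedule / MySpider classes -- none of these constructor
# signatures are confirmed by the code in this section:
#
#     from tornado import ioloop
#
#     schedule = MemoryCrawlSchedule(interval=500)  # poll queue every 500 ms
#     spider = MySpider(schedule, start_urls=["http://example.com"])
#     worker = Worker(spider)
#     worker.start()                    # queues start_tasks, schedules poll
#     ioloop.IOLoop.instance().start()  # loop_get_and_execute runs on this loop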
def api_recover_worker(params):
    """Start a worker in recovery mode.

    Args:
        params: dict, parameter dict; must contain the target worker_name.
    """
    is_ok, errors = check_params(params, 'worker_name')
    if not is_ok:
        return result(400, "params error", str(errors))
    else:
        try:
            worker_name = params.pop('worker_name')
            record = RecorderManager.instance().get_fail_worker_record(
                worker_name)
            if not record:
                return result(400, "no such failed worker", worker_name)
            else:
                schedule_params = record.get('schedule_kwargs')
                spider_params = record.get('spider_kwargs')
                schedule_path = record.get('schedule_class')
                spider_path = record.get('spider_class')
                schedule = get_schedule_class(schedule_path)(
                    **schedule_params)
                spider = get_spider_class(spider_path)(
                    schedule, **spider_params)
                recover_worker(spider)
                RecorderManager.instance().remove_last_fail_worker(
                    worker_name)
                # Assumed success response; shape not confirmed by the source.
                return result(200, "recover worker started", worker_name)
        except ScheduleError as e:
            return result(400, message="init schedule failed",
                          result=str(e))
        except SpiderError as e:
            return result(400, message="init spider failed", result=str(e))
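# The result() helper used by every API handler here is not shown in this
# section. A plausible sketch, assuming it simply packs the HTTP-style code,
# message and payload into a JSON response body (shape assumed, not
# confirmed by the source):
import json


def result(code, message="", result=None):
    """Build the uniform JSON API response (assumed shape)."""
    return json.dumps({"code": code, "message": message, "result": result},
                      ensure_ascii=False)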
def api_remove_fail_worker(params):
    """Remove a failed worker's record and its related queues.

    Args:
        params: dict, parameter dict; must contain worker_name.
    """
    is_ok, error = check_params(params, "worker_name")
    if not is_ok:
        return result(400, "params error", str(error))
    else:
        worker_name = params.pop("worker_name")
        record = RecorderManager.instance().get_fail_worker_record(
            worker_name)
        if not record:
            return result(400, "no such failed worker", worker_name)
        else:
            try:
                schedule_params = record.get('schedule_kwargs')
                spider_params = record.get('spider_kwargs')
                schedule_path = record.get('schedule_class')
                spider_path = record.get('spider_class')
                schedule = get_schedule_class(schedule_path)(
                    **schedule_params)
                spider = get_spider_class(spider_path)(
                    schedule, **spider_params)
                # Drop the schedule queues and pipeline intermediates.
                spider.clear_all()
                RecorderManager.instance().remove_last_fail_worker(
                    worker_name)
                # Assumed success response; shape not confirmed by the source.
                return result(200, "fail worker removed", worker_name)
            except ScheduleError as e:
                return result(400, message="init schedule failed",
                              result=str(e))
            except SpiderError as e:
                return result(400, message="init spider failed",
                              result=str(e))
            except WorkerError as e:
                return result(400, message="remove worker failed",
                              result=str(e))
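# Both the recover and remove paths above rebuild a spider from a
# fail-worker record. Judging from the keys they read, a record looks
# roughly like this (field values are illustrative, not from the source):
#
#     {
#         "worker_name": "news_worker",
#         "schedule_class": "core.schedule.MemoryCrawlSchedule",
#         "schedule_kwargs": {"interval": 500},
#         "spider_class": "spiders.news.NewsSpider",
#         "spider_kwargs": {"start_urls": ["http://example.com"]},
#     }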
def api_remove_all_fail_worker(params):
    """Remove all failed workers.

    Args:
        params: dict, parameter dict (unused).
    """
    records = RecorderManager.instance().get_last_fail_worker()
    remove_rs = []
    try:
        for record in records:
            worker_name = record.get('worker_name')
            schedule_params = record.get('schedule_kwargs')
            spider_params = record.get('spider_kwargs')
            schedule_path = record.get('schedule_class')
            spider_path = record.get('spider_class')
            try:
                schedule = get_schedule_class(schedule_path)(
                    **schedule_params)
                spider = get_spider_class(spider_path)(
                    schedule, **spider_params)
                # Drop the schedule queues and pipeline intermediates.
                spider.clear_all()
                RecorderManager.instance().remove_last_fail_worker(
                    worker_name)
            except Exception as e:
                remove_rs.append({"worker_name": worker_name,
                                  "result": "fail",
                                  "error": str(e)})
            else:
                remove_rs.append({"worker_name": worker_name,
                                  "result": "success",
                                  "error": ""})
    except Exception as e:
        return result(500, "unsupported exception", result=str(e))
    # Assumed success response carrying the per-worker outcomes; shape not
    # confirmed by the source.
    return result(200, "remove all fail worker", result=remove_rs)
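# The per-worker outcomes collected in remove_rs above serialize to
# something like the following (keys are exactly the ones built in the
# loop; values are illustrative):
#
#     [
#         {"worker_name": "news_worker", "result": "success", "error": ""},
#         {"worker_name": "img_worker", "result": "fail",
#          "error": "init schedule failed"},
#     ]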
def get_worker(params):
    """Render the worker overview page.

    Args:
        params: dict, parameter dict.

    Returns:
        path, {}: template path and its context dict.
    """
    workers = get_all_workers()
    fail_workers = RecorderManager.instance().get_last_fail_worker()
    return "worker.html", {'workers': workers, 'fail_workers': fail_workers}
def api_get_all_fail_worker(params):
    """Fetch the records of previously failed workers.

    Args:
        params: dict, parameter dict (unused).

    Returns:
        result: str, the serialized result.
    """
    try:
        fail_worker_records = RecorderManager.instance().get_last_fail_worker()
        last_fail_worker_str = json.dumps(fail_worker_records,
                                          ensure_ascii=False,
                                          encoding="utf-8")
    except Exception as e:
        return result(500, "get fail worker failed", str(e))
    # Assumed success response; shape not confirmed by the source.
    return result(200, "success", last_fail_worker_str)
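# Note: the encoding="utf-8" keyword accepted by json.dumps() above exists
# only on Python 2 (consistent with this codebase's except-clause syntax);
# Python 3 removed it, where the equivalent call is simply:
#
#     json.dumps(fail_worker_records, ensure_ascii=False)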
def recover(self):
    """Start this worker in recovery mode.

    Unlike start(), the spider's start_tasks are not re-queued, so the
    worker resumes from whatever is left in the schedule's queues. A
    started worker will not be started twice.
    """
    if self.is_started:
        self.logger.warn("duplicate start")
    else:
        self.worker_statistic.start_time = datetime.datetime.now()
        RecorderManager.instance().record_doing(record(
            self._worker_name,
            self.worker_statistic.start_time.strftime(
                "%Y-%m-%d %H:%M:%S"),
            get_class_path(self.spider.crawl_schedule.__class__),
            self.spider.crawl_schedule.schedule_kwargs,
            get_class_path(self.spider.__class__),
            self.spider.spider_kwargs))
        self.is_started = True
        ioloop.IOLoop.instance().add_timeout(
            datetime.timedelta(
                milliseconds=self.spider.crawl_schedule.interval),
            self.loop_get_and_execute)
        self.logger.info("recover worker")
def stop(self):
    """Stop this worker, save its statistics, and store failed tasks.

    On stop, every queue in the schedule and all intermediate data in
    the pipeline are cleared. A stopped worker will not be stopped twice.
    """
    if not self.is_started:
        self.logger.warn("duplicate stop")
    else:
        self.is_started = False
        self.worker_statistic.end_time = datetime.datetime.now()
        fail_task_file_name = (
            self.spider.__class__.__name__ + "-" +
            self.worker_statistic.start_time.strftime("%Y-%m-%d %H:%M:%S"))
        try:
            # Persist the failed tasks so the worker can be recovered.
            output_fail_http_task_file(
                WORKER_FAIL_PATH + fail_task_file_name + ".csv",
                self.spider.crawl_schedule)
        except Exception as e:
            self.logger.warn("output fail task failed error: %s" % e)
        try:
            RecorderManager.instance().record_done(self._worker_name)
        except Exception as e:
            self.logger.warn("record done failed error: %s" % e)
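# output_fail_http_task_file() is defined elsewhere; a minimal sketch of
# what it plausibly does, assuming the schedule exposes an iterable of
# failed tasks carrying a url and request kwargs (the get_fail_tasks()
# accessor and the task fields are assumptions, not confirmed by the
# source):
import csv


def output_fail_http_task_file(file_path, crawl_schedule):
    """Dump the schedule's failed HTTP tasks to a CSV file (assumed shape)."""
    with open(file_path, "wb") as f:  # "wb" for the Python 2 csv module
        writer = csv.writer(f)
        writer.writerow(["url", "request_kwargs"])
        for task in crawl_schedule.get_fail_tasks():  # assumed accessor
            writer.writerow([task.url, task.kwargs])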