class ArachnadoExecutionEngine(ExecutionEngine):
    """Scrapy ExecutionEngine extended with extra lifecycle signals.

    Broadcasts ``spider_closing`` when a shutdown is scheduled,
    ``engine_paused``/``engine_resumed`` on pause/unpause, and a
    throttled ``engine_tick`` on every scheduling pass.
    """

    def __init__(self, *args, **kwargs):
        super(ArachnadoExecutionEngine, self).__init__(*args, **kwargs)
        # Coalesces bursts of tick requests into a single delayed call.
        self.send_tick = CallLaterOnce(self._send_tick_signal)

    def close_spider(self, spider, reason='cancelled'):
        """Close *spider*, announcing the shutdown via ``spider_closing``."""
        closing = self.slot.closing
        if closing:
            # A close is already in progress; hand back its deferred.
            return closing
        self.crawler.crawling = False
        self.signals.send_catch_log(signals.spider_closing)
        return super(ArachnadoExecutionEngine, self).close_spider(spider, reason)

    def pause(self):
        """Pause the engine and broadcast ``engine_paused``."""
        super(ArachnadoExecutionEngine, self).pause()
        self.signals.send_catch_log(signals.engine_paused)

    def unpause(self):
        """Resume the engine and broadcast ``engine_resumed``."""
        super(ArachnadoExecutionEngine, self).unpause()
        self.signals.send_catch_log(signals.engine_resumed)

    def _next_request(self, spider):
        result = super(ArachnadoExecutionEngine, self)._next_request(spider)
        # Throttle: at most one engine_tick per 0.1s burst of calls.
        self.send_tick.schedule(0.1)
        return result

    def _send_tick_signal(self):
        self.signals.send_catch_log_deferred(signals.engine_tick)
class ArachnadoExecutionEngine(ExecutionEngine):
    """ExecutionEngine subclass that notifies listeners about engine
    state changes: scheduled shutdown, pause, resume, and periodic
    engine ticks.
    """

    def __init__(self, *args, **kwargs):
        super(ArachnadoExecutionEngine, self).__init__(*args, **kwargs)
        self.send_tick = CallLaterOnce(self._send_tick_signal)

    def close_spider(self, spider, reason="cancelled"):
        """Schedule the spider close, emitting ``spider_closing`` first."""
        if self.slot.closing:
            return self.slot.closing
        self.crawler.crawling = False
        # Fire spider_closing before delegating to the stock close logic.
        self.signals.send_catch_log(signals.spider_closing)
        return super(ArachnadoExecutionEngine, self).close_spider(spider, reason)

    def pause(self):
        """Pause the execution engine and emit ``engine_paused``."""
        super(ArachnadoExecutionEngine, self).pause()
        self.signals.send_catch_log(signals.engine_paused)

    def unpause(self):
        """Resume the execution engine and emit ``engine_resumed``."""
        super(ArachnadoExecutionEngine, self).unpause()
        self.signals.send_catch_log(signals.engine_resumed)

    def _next_request(self, spider):
        res = super(ArachnadoExecutionEngine, self)._next_request(spider)
        self.send_tick.schedule(0.1)  # avoid sending the signal too often
        return res

    def _send_tick_signal(self):
        """Emit ``engine_tick`` through the deferred-aware dispatcher."""
        self.signals.send_catch_log_deferred(signals.engine_tick)
class MyselfExecutionEngine(ExecutionEngine):
    """Extended execution engine: sends signals when the task is
    stopped, paused, resumed, or ticked.
    """

    def __init__(self, *args, **kwargs):
        super(MyselfExecutionEngine, self).__init__(*args, **kwargs)
        # Collapses rapid schedule() calls into one delayed invocation.
        self.send_tick = CallLaterOnce(self._send_tick_signal)

    # TODO
    def close_spider(self, spider, reason='cancelled'):
        """Close the spider and clear outstanding requests.

        ``self.slot`` drives the engine's core loop: it uses
        twisted.reactor to schedule the engine's _next_request method.
        """
        pending = self.slot.closing
        if pending:
            return pending
        self.crawler.crawling = False
        self.signals.send_catch_log(signals.spider_closing)
        return super(MyselfExecutionEngine, self).close_spider(spider, reason)

    def pause(self):
        """Pause the execution engine."""
        super(MyselfExecutionEngine, self).pause()
        self.signals.send_catch_log(signals.engine_paused)

    def unpause(self):
        """Resume a paused task."""
        super(MyselfExecutionEngine, self).unpause()
        self.signals.send_catch_log(signals.engine_resumed)

    def _next_request(self, spider):
        """Task scheduling pass; emits a throttled engine tick."""
        outcome = super(MyselfExecutionEngine, self)._next_request(spider)
        self.send_tick.schedule(0.1)
        return outcome

    def _send_tick_signal(self):
        """Send the engine_tick signal."""
        self.signals.send_catch_log_deferred(signals.engine_tick)
def createSpiderTask(site_info, settings, CHECK_POINT):
    """Create and start a periodic crawl task for one site.

    :param site_info: mapping with at least ``site_id`` (passed to
        ``select``) and ``SpiderName`` (the name of a spider callable
        defined at module level).
    :param settings: settings object forwarded to ``select``.
    :param CHECK_POINT: checkpoint state handed to the spider callable.
    """
    results = iter(select(settings, SITE_ID=site_info["site_id"]))
    # SECURITY: previously this used eval(site_info["SpiderName"]), which
    # would execute arbitrary expressions coming from the site config.
    # A plain namespace lookup resolves the same module-level callable
    # without that risk (raises KeyError for an unknown spider name).
    spider_func = globals()[site_info["SpiderName"]]
    nextcall = CallLaterOnce(spider_func, site_info, results, CHECK_POINT)
    heartbeat = task.LoopingCall(nextcall.schedule)
    # Run the first callback after a short delay.
    nextcall.schedule(delay=0.5)
    task_timer = 3  # schedule a new task every `task_timer` seconds
    # NOTE(review): LoopingCall.start defaults to now=True, so schedule()
    # also fires immediately in addition to the 0.5s call above — confirm
    # this double-fire is intended.
    heartbeat.start(task_timer)