def _handle_pipeline_result(self, result):
    """Dispatch the value produced by the response pipeline.

    * ``None`` is ignored.
    * A ``Request`` is passed to ``self.download``.
    * A ``Response`` or ``Failure`` is announced through ``self.signals``
      and then fed to the callback/errback of the request it carries
      (``request.callback``, falling back to ``self.spider.parse``).

    Returns the deferred that drives the spider callbacks in the
    ``Response``/``Failure`` case, and ``None`` otherwise.
    """
    if result is None:
        return None

    if isinstance(result, Request):
        self.download(result)
        return None

    assert isinstance(result, (Response, Failure))
    request = result.request

    if not isinstance(result, Response):
        self.signals.send(signal=signals.failure_received, failure=result)
    else:
        # Flags are only appended to the log line when there are any.
        flags = ''
        if result.flags:
            flags = ' %s' % result.flags
        log.msg(format='Crawled %(url)s [%(status)s]%(flags)s',
                level=log.DEBUG, url=result.url, status=result.status,
                flags=flags)
        self.signals.send(signal=signals.response_received, response=result)

    dfd = defer_result(result, clock=self.clock)
    # First stage: the request's own callback/errback pair.
    dfd.addCallbacks(request.callback or self.spider.parse, request.errback)
    # Second stage: process whatever the spider returned or raised; both
    # handlers receive the originating request as a keyword argument.
    dfd.addCallbacks(
        self._handle_spider_output,
        self._handle_spider_error,
        callbackKeywords=dict(request=request),
        errbackKeywords=dict(request=request),
    )
    return dfd
def _handle_pipeline_result(self, result):
    """Dispatch the value produced by the response pipeline.

    * ``None`` is ignored.
    * A ``Request`` is passed to ``self.download``.
    * A ``Response`` or ``Failure`` is announced through ``self.signals``
      and then fed to the callback/errback of the request it carries
      (``request.callback``, falling back to ``self.spider.parse``).

    Returns the deferred that drives the spider callbacks in the
    ``Response``/``Failure`` case, and ``None`` otherwise.
    """
    if result is None:
        return None

    if isinstance(result, Request):
        self.download(result)
        return None

    assert isinstance(result, (Response, Failure))
    request = result.request

    if not isinstance(result, Response):
        self.signals.send(signal=signals.failure_received, failure=result)
    else:
        # Flags are only appended to the log line when there are any.
        flags = ''
        if result.flags:
            flags = ' %s' % result.flags
        log.msg(format='Crawled %(url)s [%(status)s]%(flags)s',
                level=log.DEBUG, url=result.url, status=result.status,
                flags=flags)
        self.signals.send(signal=signals.response_received, response=result)

    dfd = defer_result(result, clock=self.clock)
    # First stage: the request's own callback/errback pair.
    dfd.addCallbacks(request.callback or self.spider.parse, request.errback)
    # Second stage: process whatever the spider returned or raised; both
    # handlers receive the originating request as a keyword argument.
    dfd.addCallbacks(
        self._handle_spider_output,
        self._handle_spider_error,
        callbackKeywords=dict(request=request),
        errbackKeywords=dict(request=request),
    )
    return dfd
def _process_queue(self):
    """Run one pass of the engine loop.

    Depending on the engine state this either does nothing (not running),
    asks ``self.processing`` for a later pass (paused / nothing ready yet),
    pushes one queued response through the pipeline, or -- when the engine
    is idle -- sends ``spider_idle`` and decides whether to stop.

    Always returns ``None``.
    """
    if not self.running:
        return

    if self.paused:
        self.processing.schedule(self.PAUSED_CHECK_FREQUENCY)
        return

    if self.response_queue:
        response = self.response_queue.pop()
        # Failures can be queued as well; only real responses are announced.
        if isinstance(response, Response):
            self.signals.send(signal=signals.response_downloaded,
                              response=response)

        def _schedule_next_pass(_ignored):
            return self.processing.schedule(0)

        dfd = defer_result(response, clock=self.clock)
        dfd.addBoth(self.pipeline.process_response)
        dfd.addBoth(self._handle_pipeline_result)
        dfd.addBoth(self._finalize_download)
        dfd.addBoth(_schedule_next_pass)
        return

    if not self.is_idle():
        self.processing.schedule(self.QUEUE_CHECK_FREQUENCY)
        return

    # Engine is idle: send the `spider_idle` signal and look for a handler
    # that answered with a DontStopEngine failure.
    res = self.signals.send(signal=signals.spider_idle,
                            dont_log=DontStopEngine)
    dont_stop = False
    for _receiver, outcome in res:
        if (isinstance(outcome, Failure)
                and isinstance(outcome.value, DontStopEngine)):
            dont_stop = True
            break

    if not self.is_idle():
        # More requests were scheduled while the signal was handled.
        self.processing.schedule(0)
    elif dont_stop or not self.stop_if_idle:
        # Slow down a little, but keep running.
        self.processing.schedule(self.IDLE_CHECK_FREQUENCY)
    else:
        self.stop('finished')
def _process_queue(self):
    """Run one pass of the engine loop.

    Depending on the engine state this either does nothing (not running),
    asks ``self.processing`` for a later pass (paused / nothing ready yet),
    pushes one queued response through the pipeline, or -- when the engine
    is idle -- sends ``spider_idle`` and decides whether to stop.

    Always returns ``None``.
    """
    if not self.running:
        return

    if self.paused:
        self.processing.schedule(self.PAUSED_CHECK_FREQUENCY)
        return

    if self.response_queue:
        response = self.response_queue.pop()
        # Failures can be queued as well; only real responses are announced.
        if isinstance(response, Response):
            self.signals.send(signal=signals.response_downloaded,
                              response=response)

        def _schedule_next_pass(_ignored):
            return self.processing.schedule(0)

        dfd = defer_result(response, clock=self.clock)
        dfd.addBoth(self.pipeline.process_response)
        dfd.addBoth(self._handle_pipeline_result)
        dfd.addBoth(self._finalize_download)
        dfd.addBoth(_schedule_next_pass)
        return

    if not self.is_idle():
        self.processing.schedule(self.QUEUE_CHECK_FREQUENCY)
        return

    # Engine is idle: send the `spider_idle` signal and look for a handler
    # that answered with a DontStopEngine failure.
    res = self.signals.send(signal=signals.spider_idle,
                            dont_log=DontStopEngine)
    dont_stop = False
    for _receiver, outcome in res:
        if (isinstance(outcome, Failure)
                and isinstance(outcome.value, DontStopEngine)):
            dont_stop = True
            break

    if not self.is_idle():
        # More requests were scheduled while the signal was handled.
        self.processing.schedule(0)
    elif dont_stop or not self.stop_if_idle:
        # Slow down a little, but keep running.
        self.processing.schedule(self.IDLE_CHECK_FREQUENCY)
    else:
        self.stop('finished')