class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS', 1)
        if self._concurrent_spiders != 1:
            warnings.warn("CONCURRENT_SPIDERS settings is deprecated, use "
                          "Scrapyd max_proc config instead", ScrapyDeprecationWarning)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return
        if self.paused:
            slot.nextcall.schedule(5)
            return
        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break
        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception as exc:
                slot.start_requests = None
                log.err(None, 'Obtaining request from start requests', spider=spider)
            else:
                self.crawl(request, spider)
        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = self.scraper.slot.is_idle()
        pending = self.slot.scheduler.has_pending_requests()
        downloading = bool(self.downloader.active)
        pending_start_requests = self.slot.start_requests is not None
        idle = scraper_idle and not (pending or downloading or pending_start_requests)
        return idle

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request, spider=spider)
        return self.slot.scheduler.enqueue_request(request)

    def download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
            if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)

        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                log.msg(spider=spider, **logkws)
                self.signals.send_catch_log(signal=signals.response_received,
                                            response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
        slot.nextcall.schedule()

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle,
                                          spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
               for _, x in res):
            self.slot.nextcall.schedule(5)
            return
        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""
        slot = self.slot
        if slot.closing:
            return slot.closing
        log.msg(format="Closing spider (%(reason)s)", reason=reason, spider=spider)
        dfd = slot.close()
        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log.err, spider=spider)
        # XXX: spider_stats argument was added for backwards compatibility with
        # stats collection refactoring added in 0.15. it should be removed in 0.17.
        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason,
            spider_stats=self.crawler.stats.get_stats()))
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: log.msg(format="Spider closed (%(reason)s)",
                                      reason=reason, spider=spider))
        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: self._spider_closed_callback(spider))
        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
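This version already resolves its scheduler and downloader from the project settings instead of hard-coding them. A minimal sketch of that mechanism, using the stock default dotted paths purely for illustration:

from scrapy.utils.misc import load_object

# SCHEDULER and DOWNLOADER are plain dotted paths in the settings; load_object()
# turns each path into the class it names, which the engine then instantiates
# with the crawler (e.g. downloader_cls(crawler)).
scheduler_cls = load_object('scrapy.core.scheduler.Scheduler')
downloader_cls = load_object('scrapy.core.downloader.Downloader')
print(scheduler_cls, downloader_cls)

Swapping either component is therefore a settings change, not an engine change.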
class ExecutionEngine:

    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        if self.running:
            raise RuntimeError("Engine already running")
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        if not self.running:
            raise RuntimeError("Engine not running")
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return
        if self.paused:
            return
        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break
        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True, extra={'spider': spider})
            else:
                self.crawl(request, spider)
        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slot
        return (
            not self.running
            or slot.closing
            or self.downloader.needs_backout()
            or self.scraper.slot.needs_backout()
        )

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request, spider):
        if not isinstance(response, (Request, Response, Failure)):
            raise TypeError(
                "Incorrect type: expected Request, Response or Failure, got "
                f"{type(response)}: {response!r}"
            )
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                            exc_info=failure_to_exc_info(f),
                                            extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False
        if self.downloader.active:
            # downloader has pending requests
            return False
        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False
        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False
        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        if spider not in self.open_spiders:
            raise RuntimeError(f"Spider {spider.name!r} not opened when crawling: {request}")
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signals.request_scheduled, request=request, spider=spider)
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signals.request_dropped, request=request, spider=spider)

    def download(self, request, spider):
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)

        def _on_success(response):
            if not isinstance(response, (Response, Request)):
                raise TypeError(
                    "Incorrect type: expected Response or Request, got "
                    f"{type(response)}: {response!r}"
                )
            if isinstance(response, Response):
                if response.request is None:
                    response.request = request
                logkws = self.logformatter.crawled(response.request, response, spider)
                if logkws is not None:
                    logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                self.signals.send_catch_log(
                    signal=signals.response_received,
                    response=response,
                    request=response.request,
                    spider=spider,
                )
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        if not self.has_capacity():
            raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
        slot.nextcall.schedule()
        slot.heartbeat.start(5)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signals.spider_idle, spider=spider,
                                          dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
               for _, x in res):
            return
        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""
        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)", {'reason': reason}, extra={'spider': spider})
        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(msg, exc_info=failure_to_exc_info(failure), extra={'spider': spider})
            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))
        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))
        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))
        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))
        dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))
        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason}, extra={'spider': spider}))
        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))
        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))
        dfd.addBoth(lambda _: self._spider_closed_callback(spider))
        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return
        if self.paused:
            return
        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break
        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True, extra={'spider': spider})
            else:
                self.crawl(request, spider)
        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                            exc_info=failure_to_exc_info(f),
                                            extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False
        if self.downloader.active:
            # downloader has pending requests
            return False
        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False
        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False
        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request, spider=spider)
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request, spider=spider)

    def download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
            if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)

        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                self.signals.send_catch_log(signal=signals.response_received,
                                            response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
        slot.nextcall.schedule()
        slot.heartbeat.start(5)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle,
                                          spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
               for _, x in res):
            return
        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""
        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)", {'reason': reason}, extra={'spider': spider})
        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(msg, exc_info=failure_to_exc_info(failure), extra={'spider': spider})
            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))
        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))
        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))
        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))
        dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))
        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason}, extra={'spider': spider}))
        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))
        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))
        dfd.addBoth(lambda _: self._spider_closed_callback(spider))
        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.settings = crawler.settings
        self.slots = {}
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        self.downloader = Downloader(crawler)
        self.scraper = Scraper(crawler)
        self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS')
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield send_catch_log_deferred(signal=signals.engine_started)
        self.running = True

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        try:
            slot = self.slots[spider]
        except KeyError:
            return
        if self.paused:
            slot.nextcall.schedule(5)
            return
        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break
        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = slot.start_requests.next()
                self.crawl(request, spider)
            except StopIteration:
                slot.start_requests = None
        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slots[spider]
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slots[spider].needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slots[spider]
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.slots \
            and self.scraper.slots[spider].is_idle()
        pending = self.slots[spider].scheduler.has_pending_requests()
        downloading = bool(self.downloader.slots)
        idle = scraper_idle and not (pending or downloading)
        return idle

    @property
    def open_spiders(self):
        return self.slots.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.slots) < self._concurrent_spiders

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slots[spider].nextcall.schedule()

    def schedule(self, request, spider):
        return self.slots[spider].scheduler.enqueue_request(request)

    def download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
            if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)

        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
                log.msg(log.formatter.crawled(request, response, spider),
                        level=log.DEBUG, spider=spider)
                send_catch_log(signal=signals.response_received,
                               response=response, request=request, spider=spider)
            return response

        def _on_error(failure):
            failure.request = request
            return failure

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success, _on_error)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=None, close_if_idle=True):
        assert self.has_capacity(), "No free spider slots when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_settings(self.settings)
        slot = Slot(start_requests or (), close_if_idle, nextcall, scheduler)
        self.slots[spider] = slot
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        stats.open_spider(spider)
        yield send_catch_log_deferred(signals.spider_opened, spider=spider)
        slot.nextcall.schedule()

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = send_catch_log(signal=signals.spider_idle,
                             spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
               for _, x in res):
            self.slots[spider].nextcall.schedule(5)
            return
        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""
        slot = self.slots[spider]
        if slot.closing:
            return slot.closing
        log.msg("Closing spider (%s)" % reason, spider=spider)
        dfd = slot.close()
        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: send_catch_log_deferred(signal=signals.spider_closed,
                                                      spider=spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: log.msg("Spider closed (%s)" % reason, spider=spider))
        dfd.addBoth(lambda _: self.slots.pop(spider))
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: self._spider_closed_callback(spider))
        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield send_catch_log_deferred(signal=signals.engine_stopped)
        yield stats.engine_stopped()
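_close_all_spiders() relies on the same Twisted pattern in every one of these versions: the per-spider close Deferreds are gathered into a DeferredList, which fires once all of them have fired and, with default arguments, never turns an individual failure into a failure of the aggregate. A standalone sketch of just that pattern (the deferreds and messages are illustrative):

from twisted.internet import defer

d1, d2 = defer.Deferred(), defer.Deferred()
dlist = defer.DeferredList([d1, d2])  # fires only after both d1 and d2 have fired
dlist.addCallback(lambda results: print("all spiders closed:", results))
d1.callback("spider-a closed")
d2.callback("spider-b closed")  # dlist now fires with [(True, ...), (True, ...)]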
"""
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        self.downloader = Downloader(crawler)
        self.scraper = Scraper(crawler)
        self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS', 1)
        if self._concurrent_spiders != 1:
            warnings.warn("CONCURRENT_SPIDERS settings is deprecated, use "
                          "Scrapyd max_proc config instead", ScrapyDeprecationWarning)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return
        if self.paused:
            slot.nextcall.schedule(5)
            return
        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break
        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception as exc:
                log.err(None, 'Obtaining request from start requests', spider=spider)
            else:
                self.crawl(request, spider)
        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = self.scraper.slot.is_idle()
        pending = self.slot.scheduler.has_pending_requests()
        downloading = bool(self.downloader.active)
        idle = scraper_idle and not (pending or downloading)
        return idle

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request, spider=spider)
        return self.slot.scheduler.enqueue_request(request)

    def download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
            if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)

        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                log.msg(spider=spider, **logkws)
                self.signals.send_catch_log(signal=signals.response_received,
                                            response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
        slot.nextcall.schedule()

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle,
                                          spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
               for _, x in res):
            self.slot.nextcall.schedule(5)
            return
        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""
        slot = self.slot
        if slot.closing:
            return slot.closing
        log.msg(format="Closing spider (%(reason)s)", reason=reason, spider=spider)
        dfd = slot.close()
        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log.err, spider=spider)
        # XXX: spider_stats argument was added for backwards compatibility with
        # stats collection refactoring added in 0.15. it should be removed in 0.17.
        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason,
            spider_stats=self.crawler.stats.get_stats()))
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: log.msg(format="Spider closed (%(reason)s)",
                                      reason=reason, spider=spider))
        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log.err, spider=spider)
        dfd.addBoth(lambda _: self._spider_closed_callback(spider))
        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
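The XXX note above concerns the legacy spider_stats keyword that this version still attaches to spider_closed for backwards compatibility. The pattern that replaced it is to keep a reference to crawler.stats and read it from a spider_closed handler; a minimal sketch of such an extension (class name and log message are illustrative):

from scrapy import signals


class DumpStats:
    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler.stats)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_closed(self, spider, reason):
        # read the stats collector directly instead of relying on a signal kwarg
        spider.logger.info("crawl finished (%s): %s", reason, self.stats.get_stats())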
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])  # resolve the Scheduler class from the SCHEDULER setting
        downloader_cls = load_object(self.settings['DOWNLOADER'])  # likewise, resolve the Downloader class
        self.downloader = downloader_cls(crawler)  # instantiate the Downloader
        self.scraper = Scraper(crawler)  # instantiate the Scraper, the bridge between the engine and the spider code
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        """_next_request is reached in two ways: periodically, via the reactor's
        5-second heartbeat, and explicitly, whenever some step of the flow needs
        another scheduling pass.
        """
        slot = self.slot
        if not slot:
            return
        if self.paused:
            return
        # Pull requests from the scheduler: this loop hands as many queued requests
        # as possible to the downloader, until a backout condition (e.g. the
        # concurrency limit) is hit.
        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):  # this call actually starts a download
                break
        if slot.start_requests and not self._needs_backout(spider):  # start_requests still has items and no backout is needed
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True, extra={'spider': spider})
            else:
                # crawl() pushes the request into the scheduler's queue and
                # immediately schedules another _next_request pass.
                self.crawl(request, spider)
        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        """Whether to back out depends on four conditions:
        1. the engine has been stopped;
        2. the slot is closing;
        3. the downloader has more in flight than its limits allow;
        4. the scraper has more responses queued than its limits allow.
        """
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()  # take the next request from the scheduler
        if not request:
            return
        # _download() runs the downloader middlewares' process_request hooks, hands the
        # request to the downloader and attaches the response callbacks (e.g. process_response).
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request, spider):
        # the download result must be a Request, a Response or a Failure
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            # a Request goes through crawl() again, i.e. the scheduler enqueue logic
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        # the scraper is what talks to the spider callbacks and the item pipelines
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                            exc_info=failure_to_exc_info(f),
                                            extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False
        if self.downloader.active:
            # downloader has pending requests
            return False
        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False
        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False
        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        # enqueue the request (kept only if its fingerprint is unique; duplicates are dropped) ...
        self.schedule(request, spider)
        # ... then schedule _next_request for the next reactor loop, so requests keep
        # flowing from the spider into the queue
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request, spider=spider)
        # requests with a duplicate fingerprint are dropped; unique ones are enqueued
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request, spider=spider)

    def download(self, request, spider):
        # note from the original annotator: apparently not called from inside the engine itself
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        # note from the original annotator: apparently not called from inside the engine itself
        slot.remove_request(request)
        # if the result is a Request (e.g. returned by a middleware), download it again;
        # a Response is returned as-is
        return self.download(response, spider) \
            if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)  # add to the set of requests currently being processed

        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):  # the download produced a Response: return it
                response.request = request  # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                self.signals.send_catch_log(signal=signals.response_received,
                                            response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            # key point: reschedule after every download completes, so the next request
            # is taken from the queue and sent
            slot.nextcall.schedule()
            return _

        # fetch() runs the downloader middlewares' process_request hooks, sends the request
        # for download and attaches the response-handling callbacks (e.g. process_response)
        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)  # the inner functions are closures, so request/slot stay available to the callbacks
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request, spider)  # register _next_request as the recurring scheduling callback
        scheduler = self.scheduler_cls.from_crawler(self.crawler)  # instantiate the scheduler
        # run the spider middlewares over the seed (start) requests
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)  # wrap everything in a Slot
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)  # scheduler.open() builds its queues and returns None
        yield self.scraper.open_spider(spider)  # mainly calls open_spider on every item pipeline
        self.crawler.stats.open_spider(spider)  # stats collector hook (a no-op for the default collector)
        yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
        slot.nextcall.schedule()  # kick off the first scheduling pass
        slot.heartbeat.start(5)  # call CallLaterOnce.schedule every 5 seconds

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle,
                                          spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
               for _, x in res):
            return
        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""
        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)", {'reason': reason}, extra={'spider': spider})
        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(msg, exc_info=failure_to_exc_info(failure), extra={'spider': spider})
            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))
        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))
        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))
        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))
        dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))
        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason}, extra={'spider': spider}))
        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))
        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))
        dfd.addBoth(lambda _: self._spider_closed_callback(spider))
        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
class ExecutionEngine: def __init__(self, crawler, spider_closed_callback): self.crawler = crawler self.settings = crawler.settings self.signals = crawler.signals #当 crawler 初始化应该初始化了一个 signalmanager 它里面的 sender 就是 crawler self.logformatter = crawler.logformatter self.slot = None self.spider = None self.running = False self.paused = False self.scheduler_cls = load_object(self.settings['SCHEDULER']) downloader_cls = load_object(self.settings['DOWNLOADER']) self.downloader = downloader_cls(crawler) self.scraper = Scraper(crawler) self._spider_closed_callback = spider_closed_callback # 这里只是设置开启标志 并返回deferred对象 真正的准备阶段在 open_spider @defer.inlineCallbacks def start(self): """Start the execution engine""" if self.running: raise RuntimeError("Engine already running") self.start_time = time() yield self.signals.send_catch_log_deferred(signal=signals.engine_started) #增加 errback 并发送一次 signal self.running = True self._closewait = defer.Deferred() yield self._closewait def stop(self): """Stop the execution engine gracefully""" if not self.running: raise RuntimeError("Engine not running") self.running = False #打标记 dfd = self._close_all_spiders() #拿到关闭deferred return dfd.addBoth(lambda _: self._finish_stopping_engine()) #给deferred添加 关闭的callback def close(self): """Close the execution engine gracefully. If it has already been started, stop it. In all cases, close all spiders and the downloader. """ if self.running: # Will also close spiders and downloader return self.stop() elif self.open_spiders: # Will also close downloader return self._close_all_spiders() else: return defer.succeed(self.downloader.close()) #其实就是相当于d.callback(result) def pause(self): """Pause the execution engine""" self.paused = True def unpause(self): """Resume the execution engine""" self.paused = False # 判断暂停否,取出一个request 调用crawl 处理request 最后判断是否空闲关闭 def _next_request(self, spider): slot = self.slot if not slot: return if self.paused: return while not self._needs_backout(spider): if not self._next_request_from_scheduler(spider): #如果从scheduler 取出 request 并添加 item处理的callback 查看其返回结果 没有了就break break if slot.start_requests and not self._needs_backout(spider): #处理关闭后仍有request情况 try: request = next(slot.start_requests) except StopIteration: slot.start_requests = None except Exception: slot.start_requests = None logger.error('Error while obtaining start requests', exc_info=True, extra={'spider': spider}) else: self.crawl(request, spider)# 调用自身crawl方法爬 if self.spider_is_idle(spider) and slot.close_if_idle: #空闲时候是否关闭spider self._spider_idle(spider) # 从不同地方的标志位置判断是否需要退出 def _needs_backout(self, spider): slot = self.slot return ( not self.running or slot.closing or self.downloader.needs_backout() or self.scraper.slot.needs_backout() ) # 从scheduler中取出request,放到_download下载 再给他添加处理返回值 和 调用下一个循环的回调 def _next_request_from_scheduler(self, spider): slot = self.slot request = slot.scheduler.next_request() # 从scheduler中取出request if not request: return d = self._download(request, spider) #将request放到_download中 生成deferred d.addBoth(self._handle_downloader_output, request, spider) # #### 这里其实就是 处理返回request的 回调函数 其中 函数的 response 就是这个deferred对象 d.addErrback(lambda f: logger.info('Error while handling downloader output', exc_info=failure_to_exc_info(f), extra={'spider': spider})) d.addBoth(lambda _: slot.remove_request(request)) #从slot中删掉对应的reqeust d.addErrback(lambda f: logger.info('Error while removing request from slot', exc_info=failure_to_exc_info(f), extra={'spider': spider})) d.addBoth(lambda _: slot.nextcall.schedule()) # 添加一个运行下一个request的回调 
d.addErrback(lambda f: logger.info('Error while scheduling new request', exc_info=failure_to_exc_info(f), extra={'spider': spider})) return d # 判断resonse具体类别 正确情况下调用scraper.enqueue_scrape 压入响应并返回deferred对象,errback 添加log def _handle_downloader_output(self, response, request, spider): if not isinstance(response, (Request, Response, Failure)): raise TypeError( "Incorrect type: expected Request, Response or Failure, got " f"{type(response)}: {response!r}" ) # downloader middleware can return requests (for example, redirects) if isinstance(response, Request): self.crawl(response, spider) #将request 压入 scheduler return # response is a Response or Failure d = self.scraper.enqueue_scrape(response, request, spider) #将 request respond 和sipider 共同的做作用好的scraper 返回来 d.addErrback(lambda f: logger.error('Error while enqueuing downloader output', exc_info=failure_to_exc_info(f), extra={'spider': spider})) return d #返回deferred #判断spider是不是闲着 def spider_is_idle(self, spider): #判断 scrapy整体是不是 闲着 if not self.scraper.slot.is_idle(): # scraper is not idle return False if self.downloader.active: # downloader has pending requests return False if self.slot.start_requests is not None: # not all start requests are handled return False if self.slot.scheduler.has_pending_requests(): # scheduler has pending requests return False return True @property def open_spiders(self): # 目前一个engine还是只能使用一个spider return [self.spider] if self.spider else [] def has_capacity(self): #一个引擎对应一个slot """Does the engine have capacity to handle more spiders""" return not bool(self.slot) #执行单个的 request 压入scheduler 并执行下一步的命令 def crawl(self, request, spider): if spider not in self.open_spiders: raise RuntimeError(f"Spider {spider.name!r} not opened when crawling: {request}") self.schedule(request, spider) #将request 压入scheduler【压入数据】 self.slot.nextcall.schedule() #执行下一步操作 这里的nextcall 是调用 _next_request()【取出request并后处理】 # 将request 压入spider的que中 def schedule(self, request, spider): self.signals.send_catch_log(signals.request_scheduled, request=request, spider=spider) if not self.slot.scheduler.enqueue_request(request): #入队列 并陪你段是否被过滤 self.signals.send_catch_log(signals.request_dropped, request=request, spider=spider) # 这里是调用 _downlaod 方法 进行下载 最后添加一个 _downloaded 到回调链路上 def download(self, request, spider): d = self._download(request, spider) d.addBoth(self._downloaded, self.slot, request, spider) #addBoth(func, ,参数 ,参数) return d #是request的话 从新调用 dowload 否则返回 rewponse def _downloaded(self, response, slot, request, spider): slot.remove_request(request) #从slot里删除这个request return self.download(response, spider) if isinstance(response, Request) else response #### 下载起点slot ##### 添加这个 request 然后调用downloader 下载 request 添加对应的 处理callback def _download(self, request, spider): # slot = self.slot slot.add_request(request) # 在slot 的正在处理的request里面 增加这个request def _on_success(response): if not isinstance(response, (Response, Request)): raise TypeError( "Incorrect type: expected Response or Request, got " f"{type(response)}: {response!r}" ) if isinstance(response, Response): if response.request is None: response.request = request logkws = self.logformatter.crawled(response.request, response, spider) if logkws is not None: logger.log(*logformatter_adapter(logkws), extra={'spider': spider}) self.signals.send_catch_log( signal=signals.response_received, response=response, request=response.request, spider=spider, ) return response def _on_complete(_): slot.nextcall.schedule() return _ dwld = self.downloader.fetch(request, spider) #实际下载发生的情况 dwld.addCallbacks(_on_success) 
# attach the success callback dwld.addBoth(_on_complete) # attach a stage that runs on both the callback and the errback path return dwld ######## This is where the crawl actually starts ######## step one: an engine drives exactly one spider, so throughout this method spider is always a single instance, never a list @defer.inlineCallbacks def open_spider(self, spider, start_requests=(), close_if_idle=True): if not self.has_capacity(): raise RuntimeError(f"No free spider slot when opening {spider.name!r}") logger.info("Spider opened", extra={'spider': spider}) nextcall = CallLaterOnce(self._next_request, spider) # create a CallLaterOnce wrapping _next_request; its job is to start pulling requests out of start_requests, but at this point it only prepares a deferred call, nothing runs yet scheduler = self.scheduler_cls.from_crawler(self.crawler) # instantiate the scheduler start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider) # run the start requests through the spider middlewares' process_start_requests slot = Slot(start_requests, close_if_idle, nextcall, scheduler) # create the slot for this spider self.slot = slot self.spider = spider yield scheduler.open(spider) # initialize the scheduler and create its queues yield self.scraper.open_spider(spider) self.crawler.stats.open_spider(spider) yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider) # emit the spider_opened signal slot.nextcall.schedule() # hand a task to the reactor; this is what actually kicks off _next_request slot.heartbeat.start(5) def _spider_idle(self, spider): """Called when a spider gets idle. This function is called when there are no remaining pages to download or schedule. It can be called multiple times. If some extension raises a DontCloseSpider exception (in the spider_idle signal handler) the spider is not closed until the next loop and this function is guaranteed to be called (at least) once again for this spider. """ res = self.signals.send_catch_log(signals.spider_idle, spider=spider, dont_log=DontCloseSpider) # send_catch_log returns a list of (receiver, result) pairs if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) for _, x in res): return # if any handler raised DontCloseSpider, keep the spider open and simply return if self.spider_is_idle(spider): self.close_spider(spider, reason='finished') def close_spider(self, spider, reason='cancelled'): """Close (cancel) spider and clear all its outstanding requests""" slot = self.slot if slot.closing: return slot.closing logger.info("Closing spider (%(reason)s)", {'reason': reason}, extra={'spider': spider}) dfd = slot.close() def log_failure(msg): def errback(failure): logger.error( msg, exc_info=failure_to_exc_info(failure), extra={'spider': spider} ) return errback dfd.addBoth(lambda _: self.downloader.close()) # ask the downloader to close dfd.addErrback(log_failure('Downloader close failure')) dfd.addBoth(lambda _: self.scraper.close_spider(spider)) # ask the scraper to close this spider dfd.addErrback(log_failure('Scraper close failure')) dfd.addBoth(lambda _: slot.scheduler.close(reason)) # ask the scheduler to close dfd.addErrback(log_failure('Scheduler close failure')) dfd.addBoth(lambda _: self.signals.send_catch_log_deferred( # send the spider_closed signal signal=signals.spider_closed, spider=spider, reason=reason)) dfd.addErrback(log_failure('Error while sending spider_close signal')) dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason)) # close the stats collector for this spider dfd.addErrback(log_failure('Stats close failure')) dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)", {'reason': reason}, extra={'spider': spider})) dfd.addBoth(lambda _: setattr(self, 'slot', None)) # release the slot reference dfd.addErrback(log_failure('Error while unassigning slot')) dfd.addBoth(lambda _: setattr(self, 'spider', None)) dfd.addErrback(log_failure('Error while unassigning spider')) dfd.addBoth(lambda _: self._spider_closed_callback(spider)) # finally invoke the spider-closed callback
return dfd def _close_all_spiders(self): dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders] dlist = defer.DeferredList(dfds) return dlist @defer.inlineCallbacks def _finish_stopping_engine(self): yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped) self._closewait.callback(None) # fire _closewait so the Deferred returned by start() finally resolves
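The close_spider chain above leans on one Twisted idiom: every shutdown step is attached with addBoth (so it runs whether the previous step succeeded or failed), and the addErrback placed right after it only logs a failure raised by that step, letting the rest of the chain continue. A minimal standalone sketch of that idiom (not Scrapy's actual shutdown code; the step names are made up):

from twisted.internet import defer

def log_failure(msg):
    def errback(failure):
        # log and swallow the failure so the remaining steps still run
        print(f"{msg}: {failure.value}")
    return errback

def close_everything():
    dfd = defer.succeed(None)
    dfd.addBoth(lambda _: print("downloader closed"))       # runs on success or failure
    dfd.addErrback(log_failure("Downloader close failure"))
    dfd.addBoth(lambda _: 1 / 0)                             # a step that blows up
    dfd.addErrback(log_failure("Scheduler close failure"))   # only sees the step above
    dfd.addBoth(lambda _: print("spider closed"))            # still reached
    return dfd

close_everything()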
class ExecutionEngine(object): def __init__(self, crawler, spider_closed_callback): ## 将爬虫实例存储在执行引擎实例中 self.crawler = crawler ## 将爬虫实例所对应的配置也存储在执行引擎实例中 self.settings = crawler.settings ## 信号 self.signals = crawler.signals ## 日志格式化器 self.logformatter = crawler.logformatter self.slot = None self.spider = None ## 是否正在运行 self.running = False ## 是否已暂停执行 self.paused = False ## 从配置文件中加载调度器类 self.scheduler_cls = load_object(self.settings['SCHEDULER']) ## 从配置文件中加载下载器类 downloader_cls = load_object(self.settings['DOWNLOADER']) ## 实例化下载器 self.downloader = downloader_cls(crawler) ## 实例化 scraper,它是引擎连接爬虫类(Spider)和管道类(Pipeline)的桥梁 self.scraper = Scraper(crawler) ## 指定爬虫关闭的回调函数 self._spider_closed_callback = spider_closed_callback @defer.inlineCallbacks def start(self): """Start the execution engine""" assert not self.running, "Engine already running" self.start_time = time() yield self.signals.send_catch_log_deferred(signal=signals.engine_started) self.running = True self._closewait = defer.Deferred() yield self._closewait def stop(self): """Stop the execution engine gracefully""" assert self.running, "Engine not running" self.running = False dfd = self._close_all_spiders() return dfd.addBoth(lambda _: self._finish_stopping_engine()) def close(self): """Close the execution engine gracefully. If it has already been started, stop it. In all cases, close all spiders and the downloader. """ if self.running: # Will also close spiders and downloader return self.stop() elif self.open_spiders: # Will also close downloader return self._close_all_spiders() else: return defer.succeed(self.downloader.close()) def pause(self): """Pause the execution engine""" self.paused = True def unpause(self): """Resume the execution engine""" self.paused = False def _next_request(self, spider): ## 该方法会被循环调度 slot = self.slot if not slot: return if self.paused: return ## 是否撤销 while not self._needs_backout(spider): ## 从 scheduler 中获取下一个 request ## 注意:第一次获取时,是没有的,也就是会 break 出来 ## 从而执行下面的逻辑 if not self._next_request_from_scheduler(spider): break ## 如果 start_requests 有数据且不需要撤销 if slot.start_requests and not self._needs_backout(spider): try: ## 获取下一个种子请求 request = next(slot.start_requests) except StopIteration: slot.start_requests = None except Exception: slot.start_requests = None logger.error('Error while obtaining start requests', exc_info=True, extra={'spider': spider}) else: ## 调用 crawl, 实际是把 request 放入 scheduler 的队列中 self.crawl(request, spider) ## 如果爬虫是空闲的则关闭爬虫 if self.spider_is_idle(spider) and slot.close_if_idle: self._spider_idle(spider) def _needs_backout(self, spider): ## 是否需要撤销,取决于 4 个条件 ## 1. engine 是否在运行 ## 2. slot 是否关闭 ## 3. 下载器网络下载是否超过预设 ## 4. 
scraper 处理输出是否超过预设 slot = self.slot return not self.running \ or slot.closing \ or self.downloader.needs_backout() \ or self.scraper.slot.needs_backout() def _next_request_from_scheduler(self, spider): slot = self.slot ## 从调度器中获取下一个请求 request = slot.scheduler.next_request() if not request: return ## 下载 d = self._download(request, spider) ## 为下载结果添加回调,下载结果可能是 Request、Response、Failure d.addBoth(self._handle_downloader_output, request, spider) d.addErrback(lambda f: logger.info('Error while handling downloader output', exc_info=failure_to_exc_info(f), extra={'spider': spider})) d.addBoth(lambda _: slot.remove_request(request)) d.addErrback(lambda f: logger.info('Error while removing request from slot', exc_info=failure_to_exc_info(f), extra={'spider': spider})) d.addBoth(lambda _: slot.nextcall.schedule()) d.addErrback(lambda f: logger.info('Error while scheduling new request', exc_info=failure_to_exc_info(f), extra={'spider': spider})) return d def _handle_downloader_output(self, response, request, spider): ## 下载结果 response 必须是 Request、Response、Failure 之一 assert isinstance(response, (Request, Response, Failure)), response # downloader middleware can return requests (for example, redirects) ## 如果下载结果是 Request,则再次调用 crawl,执行 Scheduler 的入队逻辑 if isinstance(response, Request): self.crawl(response, spider) return # response is a Response or Failure ## 如果下载结果是 Response 或 Failure,则交给 scrapy 的 enqueue_scrape 方法进一步处理 ## 主要是与 spiders 和 pipelines 交互 d = self.scraper.enqueue_scrape(response, request, spider) d.addErrback(lambda f: logger.error('Error while enqueuing downloader output', exc_info=failure_to_exc_info(f), extra={'spider': spider})) return d def spider_is_idle(self, spider): if not self.scraper.slot.is_idle(): # scraper is not idle return False if self.downloader.active: # downloader has pending requests return False if self.slot.start_requests is not None: # not all start requests are handled return False if self.slot.scheduler.has_pending_requests(): # scheduler has pending requests return False return True @property def open_spiders(self): return [self.spider] if self.spider else [] def has_capacity(self): """Does the engine have capacity to handle more spiders""" return not bool(self.slot) def crawl(self, request, spider): assert spider in self.open_spiders, \ "Spider %r not opened when crawling: %s" % (spider.name, request) ## 将请求放入调度器队列 self.schedule(request, spider) ## 调用 nextcall 的 schedule 方法,进行下一次调度 self.slot.nextcall.schedule() def schedule(self, request, spider): self.signals.send_catch_log(signal=signals.request_scheduled, request=request, spider=spider) ## 调用调度器的 enqueue_request 方法,将请求放入调度器队列 if not self.slot.scheduler.enqueue_request(request): self.signals.send_catch_log(signal=signals.request_dropped, request=request, spider=spider) def download(self, request, spider): d = self._download(request, spider) d.addBoth(self._downloaded, self.slot, request, spider) return d def _downloaded(self, response, slot, request, spider): slot.remove_request(request) return self.download(response, spider) \ if isinstance(response, Request) else response def _download(self, request, spider): slot = self.slot slot.add_request(request) ## 下载成功的回调,返回处理过的响应 def _on_success(response): assert isinstance(response, (Response, Request)) if isinstance(response, Response): ## 将请求放入响应对象的 request 属性中 response.request = request # tie request to response received logkws = self.logformatter.crawled(request, response, spider) logger.log(*logformatter_adapter(logkws), extra={'spider': spider}) 
self.signals.send_catch_log(signal=signals.response_received, \ response=response, request=request, spider=spider) return response ## 下载完成的回调,继续下一次调度 def _on_complete(_): slot.nextcall.schedule() return _ ## 调用下载器进行下载 dwld = self.downloader.fetch(request, spider) ## 注册下载成功的回调 dwld.addCallbacks(_on_success) ## 注册下载完成的回调 dwld.addBoth(_on_complete) return dwld @defer.inlineCallbacks def open_spider(self, spider, start_requests=(), close_if_idle=True): assert self.has_capacity(), "No free spider slot when opening %r" % \ spider.name logger.info("Spider opened", extra={'spider': spider}) ## 注册 _next_request 调度方法,循环调度 nextcall = CallLaterOnce(self._next_request, spider) ## 初始化调度器类 scheduler = self.scheduler_cls.from_crawler(self.crawler) ## 调用爬虫中间件的 process_start_requests 方法处理种子请求 ## 可以定义多个爬虫中间件,每个类都重写该方法,爬虫在调度之前会分别调用你定义好的 ## 爬虫中间件,来分别处理起始请求,功能独立而且维护起来更加清晰 start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider) ## 封装 slot 对象 slot = Slot(start_requests, close_if_idle, nextcall, scheduler) self.slot = slot self.spider = spider ## 调用调度器的 open 方法 yield scheduler.open(spider) ## 调用 scraper 的 open_spider 方法 yield self.scraper.open_spider(spider) self.crawler.stats.open_spider(spider) yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider) ## 发起调度 slot.nextcall.schedule() slot.heartbeat.start(5) def _spider_idle(self, spider): """Called when a spider gets idle. This function is called when there are no remaining pages to download or schedule. It can be called multiple times. If some extension raises a DontCloseSpider exception (in the spider_idle signal handler) the spider is not closed until the next loop and this function is guaranteed to be called (at least) once again for this spider. """ res = self.signals.send_catch_log(signal=signals.spider_idle, \ spider=spider, dont_log=DontCloseSpider) if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \ for _, x in res): return if self.spider_is_idle(spider): self.close_spider(spider, reason='finished') def close_spider(self, spider, reason='cancelled'): """Close (cancel) spider and clear all its outstanding requests""" slot = self.slot if slot.closing: return slot.closing logger.info("Closing spider (%(reason)s)", {'reason': reason}, extra={'spider': spider}) dfd = slot.close() def log_failure(msg): def errback(failure): logger.error( msg, exc_info=failure_to_exc_info(failure), extra={'spider': spider} ) return errback dfd.addBoth(lambda _: self.downloader.close()) dfd.addErrback(log_failure('Downloader close failure')) dfd.addBoth(lambda _: self.scraper.close_spider(spider)) dfd.addErrback(log_failure('Scraper close failure')) dfd.addBoth(lambda _: slot.scheduler.close(reason)) dfd.addErrback(log_failure('Scheduler close failure')) dfd.addBoth(lambda _: self.signals.send_catch_log_deferred( signal=signals.spider_closed, spider=spider, reason=reason)) dfd.addErrback(log_failure('Error while sending spider_close signal')) dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason)) dfd.addErrback(log_failure('Stats close failure')) dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)", {'reason': reason}, extra={'spider': spider})) dfd.addBoth(lambda _: setattr(self, 'slot', None)) dfd.addErrback(log_failure('Error while unassigning slot')) dfd.addBoth(lambda _: setattr(self, 'spider', None)) dfd.addErrback(log_failure('Error while unassigning spider')) dfd.addBoth(lambda _: self._spider_closed_callback(spider)) return dfd def _close_all_spiders(self): 
dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders] dlist = defer.DeferredList(dfds) return dlist @defer.inlineCallbacks def _finish_stopping_engine(self): yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped) self._closewait.callback(None)
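Both versions so far drive the loop through nextcall, a CallLaterOnce wrapper around _next_request. The point of the wrapper is that any number of schedule() calls within one reactor tick collapse into a single callLater, so _next_request runs at most once per tick. An illustrative re-implementation of that idea (the real helper lives in scrapy.utils.reactor and may differ in detail):

from twisted.internet import reactor

class CallLaterOnceSketch:
    """Run a function once on the next reactor tick, however often schedule() is called."""

    def __init__(self, func, *args, **kwargs):
        self._func, self._args, self._kwargs = func, args, kwargs
        self._call = None

    def schedule(self, delay=0):
        if self._call is None:                      # coalesce repeated schedule() calls
            self._call = reactor.callLater(delay, self)

    def __call__(self):
        self._call = None                           # allow the next schedule() to re-arm
        return self._func(*self._args, **self._kwargs)

# engine-style usage: nextcall = CallLaterOnceSketch(self._next_request, spider),
# then nextcall.schedule() after every enqueue or finished download.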
class ExecutionEngine: def __init__(self, crawler, spider_closed_callback: Callable) -> None: self.crawler = crawler self.settings = crawler.settings self.signals = crawler.signals self.logformatter = crawler.logformatter self.slot: Optional[Slot] = None self.spider: Optional[Spider] = None self.running = False self.paused = False self.scheduler_cls = load_object(crawler.settings["SCHEDULER"]) downloader_cls = load_object(self.settings['DOWNLOADER']) self.downloader = downloader_cls(crawler) self.scraper = Scraper(crawler) self._spider_closed_callback = spider_closed_callback @inlineCallbacks def start(self) -> Deferred: if self.running: raise RuntimeError("Engine already running") self.start_time = time() yield self.signals.send_catch_log_deferred( signal=signals.engine_started) self.running = True self._closewait = Deferred() yield self._closewait def stop(self) -> Deferred: """Gracefully stop the execution engine""" @inlineCallbacks def _finish_stopping_engine(_) -> Deferred: yield self.signals.send_catch_log_deferred( signal=signals.engine_stopped) self._closewait.callback(None) if not self.running: raise RuntimeError("Engine not running") self.running = False dfd = self.close_spider( self.spider, reason="shutdown") if self.spider is not None else succeed(None) return dfd.addBoth(_finish_stopping_engine) def close(self) -> Deferred: """ Gracefully close the execution engine. If it has already been started, stop it. In all cases, close the spider and the downloader. """ if self.running: return self.stop() # will also close spider and downloader if self.spider is not None: return self.close_spider( self.spider, reason="shutdown") # will also close downloader return succeed(self.downloader.close()) def pause(self) -> None: self.paused = True def unpause(self) -> None: self.paused = False def _next_request(self) -> None: assert self.slot is not None # typing assert self.spider is not None # typing if self.paused: return None while not self._needs_backout() and self._next_request_from_scheduler( ) is not None: pass if self.slot.start_requests is not None and not self._needs_backout(): try: request = next(self.slot.start_requests) except StopIteration: self.slot.start_requests = None except Exception: self.slot.start_requests = None logger.error('Error while obtaining start requests', exc_info=True, extra={'spider': self.spider}) else: self.crawl(request) if self.spider_is_idle() and self.slot.close_if_idle: self._spider_idle() def _needs_backout(self) -> bool: return (not self.running or self.slot.closing # type: ignore[union-attr] or self.downloader.needs_backout() or self.scraper.slot.needs_backout() # type: ignore[union-attr] ) def _next_request_from_scheduler(self) -> Optional[Deferred]: assert self.slot is not None # typing assert self.spider is not None # typing request = self.slot.scheduler.next_request() if request is None: return None d = self._download(request, self.spider) d.addBoth(self._handle_downloader_output, request) d.addErrback( lambda f: logger.info('Error while handling downloader output', exc_info=failure_to_exc_info(f), extra={'spider': self.spider})) d.addBoth(lambda _: self.slot.remove_request(request)) d.addErrback( lambda f: logger.info('Error while removing request from slot', exc_info=failure_to_exc_info(f), extra={'spider': self.spider})) d.addBoth(lambda _: self.slot.nextcall.schedule()) d.addErrback( lambda f: logger.info('Error while scheduling new request', exc_info=failure_to_exc_info(f), extra={'spider': self.spider})) return d def 
_handle_downloader_output(self, result: Union[Request, Response, Failure], request: Request) -> Optional[Deferred]: assert self.spider is not None # typing if not isinstance(result, (Request, Response, Failure)): raise TypeError( f"Incorrect type: expected Request, Response or Failure, got {type(result)}: {result!r}" ) # downloader middleware can return requests (for example, redirects) if isinstance(result, Request): self.crawl(result) return None d = self.scraper.enqueue_scrape(result, request, self.spider) d.addErrback(lambda f: logger.error( "Error while enqueuing downloader output", exc_info=failure_to_exc_info(f), extra={'spider': self.spider}, )) return d def spider_is_idle(self, spider: Optional[Spider] = None) -> bool: if spider is not None: warnings.warn( "Passing a 'spider' argument to ExecutionEngine.spider_is_idle is deprecated", category=ScrapyDeprecationWarning, stacklevel=2, ) if self.slot is None: raise RuntimeError("Engine slot not assigned") if not self.scraper.slot.is_idle(): # type: ignore[union-attr] return False if self.downloader.active: # downloader has pending requests return False if self.slot.start_requests is not None: # not all start requests are handled return False if self.slot.scheduler.has_pending_requests(): return False return True def crawl(self, request: Request, spider: Optional[Spider] = None) -> None: """Inject the request into the spider <-> downloader pipeline""" if spider is not None: warnings.warn( "Passing a 'spider' argument to ExecutionEngine.crawl is deprecated", category=ScrapyDeprecationWarning, stacklevel=2, ) if spider is not self.spider: raise RuntimeError( f"The spider {spider.name!r} does not match the open spider" ) if self.spider is None: raise RuntimeError(f"No open spider to crawl: {request}") self._schedule_request(request, self.spider) self.slot.nextcall.schedule() # type: ignore[union-attr] def _schedule_request(self, request: Request, spider: Spider) -> None: self.signals.send_catch_log(signals.request_scheduled, request=request, spider=spider) if not self.slot.scheduler.enqueue_request( request): # type: ignore[union-attr] self.signals.send_catch_log(signals.request_dropped, request=request, spider=spider) def download(self, request: Request, spider: Optional[Spider] = None) -> Deferred: """Return a Deferred which fires with a Response as result, only downloader middlewares are applied""" if spider is None: spider = self.spider else: warnings.warn( "Passing a 'spider' argument to ExecutionEngine.download is deprecated", category=ScrapyDeprecationWarning, stacklevel=2, ) if spider is not self.spider: logger.warning( "The spider '%s' does not match the open spider", spider.name) if spider is None: raise RuntimeError(f"No open spider to crawl: {request}") return self._download(request, spider).addBoth(self._downloaded, request, spider) def _downloaded(self, result: Union[Response, Request], request: Request, spider: Spider) -> Union[Deferred, Response]: assert self.slot is not None # typing self.slot.remove_request(request) return self.download(result, spider) if isinstance(result, Request) else result def _download(self, request: Request, spider: Spider) -> Deferred: assert self.slot is not None # typing self.slot.add_request(request) def _on_success( result: Union[Response, Request]) -> Union[Response, Request]: if not isinstance(result, (Response, Request)): raise TypeError( f"Incorrect type: expected Response or Request, got {type(result)}: {result!r}" ) if isinstance(result, Response): if result.request is None: 
result.request = request logkws = self.logformatter.crawled(result.request, result, spider) if logkws is not None: logger.log(*logformatter_adapter(logkws), extra={"spider": spider}) self.signals.send_catch_log( signal=signals.response_received, response=result, request=result.request, spider=spider, ) return result def _on_complete(_): self.slot.nextcall.schedule() return _ dwld = self.downloader.fetch(request, spider) dwld.addCallbacks(_on_success) dwld.addBoth(_on_complete) return dwld @inlineCallbacks def open_spider(self, spider: Spider, start_requests: Iterable = (), close_if_idle: bool = True): if self.slot is not None: raise RuntimeError( f"No free spider slot when opening {spider.name!r}") logger.info("Spider opened", extra={'spider': spider}) nextcall = CallLaterOnce(self._next_request) scheduler = create_instance(self.scheduler_cls, settings=None, crawler=self.crawler) start_requests = yield self.scraper.spidermw.process_start_requests( start_requests, spider) self.slot = Slot(start_requests, close_if_idle, nextcall, scheduler) self.spider = spider yield scheduler.open(spider) yield self.scraper.open_spider(spider) self.crawler.stats.open_spider(spider) yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider) self.slot.nextcall.schedule() self.slot.heartbeat.start(5) def _spider_idle(self) -> None: """ Called when a spider gets idle, i.e. when there are no remaining requests to download or schedule. It can be called multiple times. If a handler for the spider_idle signal raises a DontCloseSpider exception, the spider is not closed until the next loop and this function is guaranteed to be called (at least) once again. """ assert self.spider is not None # typing res = self.signals.send_catch_log(signals.spider_idle, spider=self.spider, dont_log=DontCloseSpider) if any( isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) for _, x in res): return None if self.spider_is_idle(): self.close_spider(self.spider, reason='finished') def close_spider(self, spider: Spider, reason: str = "cancelled") -> Deferred: """Close (cancel) spider and clear all its outstanding requests""" if self.slot is None: raise RuntimeError("Engine slot not assigned") if self.slot.closing is not None: return self.slot.closing logger.info("Closing spider (%(reason)s)", {'reason': reason}, extra={'spider': spider}) dfd = self.slot.close() def log_failure(msg: str) -> Callable: def errback(failure: Failure) -> None: logger.error(msg, exc_info=failure_to_exc_info(failure), extra={'spider': spider}) return errback dfd.addBoth(lambda _: self.downloader.close()) dfd.addErrback(log_failure('Downloader close failure')) dfd.addBoth(lambda _: self.scraper.close_spider(spider)) dfd.addErrback(log_failure('Scraper close failure')) dfd.addBoth(lambda _: self.slot.scheduler.close(reason)) dfd.addErrback(log_failure('Scheduler close failure')) dfd.addBoth(lambda _: self.signals.send_catch_log_deferred( signal=signals.spider_closed, spider=spider, reason=reason, )) dfd.addErrback(log_failure('Error while sending spider_close signal')) dfd.addBoth( lambda _: self.crawler.stats.close_spider(spider, reason=reason)) dfd.addErrback(log_failure('Stats close failure')) dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)", {'reason': reason}, extra={'spider': spider})) dfd.addBoth(lambda _: setattr(self, 'slot', None)) dfd.addErrback(log_failure('Error while unassigning slot')) dfd.addBoth(lambda _: setattr(self, 'spider', None)) dfd.addErrback(log_failure('Error while unassigning spider')) 
dfd.addBoth(lambda _: self._spider_closed_callback(spider)) return dfd @property def open_spiders(self) -> list: warnings.warn( "ExecutionEngine.open_spiders is deprecated, please use ExecutionEngine.spider instead", category=ScrapyDeprecationWarning, stacklevel=2, ) return [self.spider] if self.spider is not None else [] def has_capacity(self) -> bool: warnings.warn("ExecutionEngine.has_capacity is deprecated", ScrapyDeprecationWarning, stacklevel=2) return not bool(self.slot) def schedule(self, request: Request, spider: Spider) -> None: warnings.warn( "ExecutionEngine.schedule is deprecated, please use " "ExecutionEngine.crawl or ExecutionEngine.download instead", category=ScrapyDeprecationWarning, stacklevel=2, ) if self.slot is None: raise RuntimeError("Engine slot not assigned") self._schedule_request(request, spider)
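This newer engine keeps the old entry points (open_spiders, has_capacity, schedule, and the spider argument of crawl/download) alive only as deprecation shims: warn with stacklevel=2 so the warning points at the caller, then delegate to the replacement API. A small self-contained sketch of that pattern (class and method names are invented, and plain DeprecationWarning stands in for ScrapyDeprecationWarning):

import warnings

class EngineWithShim:
    def _schedule_request(self, request):           # the new, preferred entry point
        print(f"scheduling {request}")

    def schedule(self, request):                    # old name kept only as a shim
        warnings.warn(
            "schedule() is deprecated, use _schedule_request() instead",
            category=DeprecationWarning,
            stacklevel=2,                           # attribute the warning to the caller
        )
        self._schedule_request(request)

engine = EngineWithShim()
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    engine.schedule("https://example.com/")
    assert issubclass(caught[0].category, DeprecationWarning)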
class ExecutionEngine(object): """ 有三个重要的实例化 一、schedule 调度器,实例化了dupefilter过滤器,然后还初始化了三个队列。 dupefilter:过滤器,通过存储 method + url + response.body 生成sha1指纹,来进行过滤 pqclass:一个优先级队列queuelib.PriorityQueue dqclass:一个FIFO队列,先进先出规则,并且通过pickle序列化了 mqclass:一个FIFO队列,先进先出规则,直接存储在内存中 二、downloader 实例化了Handler对象,还实例化了下载器中间件 Handler:具体的下载逻辑 DownloaderMiddlewareManager:收集所有的下载中间件,在收集其中的process_request、process_exception、process_response三种方法 三、scraper 实例化了爬虫中间件,还实例化了管道处理器 SpiderMiddlewareManager:实例化后获取process_spider_input、process_spider_output、process_spider_exception、process_start_requests itemproc_cls:获取ItemPipelineManager,实例化其中的ITEM_PIPELINES,获取process_item """ def __init__(self, crawler, spider_closed_callback): self.crawler = crawler self.settings = crawler.settings self.signals = crawler.signals self.logformatter = crawler.logformatter self.slot = None self.spider = None self.running = False self.paused = False self.scheduler_cls = load_object( self.settings['SCHEDULER'] ) # SCHEDULER = 'scrapy.core.scheduler.Scheduler',仅仅获取对象,没做其他坏事 downloader_cls = load_object( self.settings['DOWNLOADER'] ) # DOWNLOADER = 'scrapy.core.downloader.Downloader' self.downloader = downloader_cls( crawler ) # 这个下载器,里面实例化了handler处理器,和到下载器之间的process_处理逻辑。就是具体的下载功能和中间件功能都已经实现了 self.scraper = Scraper( crawler) # 这里有定义有spidermw爬虫中间件和ITEM_pipeline管道对象,数据处理功能和存储功能都实现了 self._spider_closed_callback = spider_closed_callback # 这个回调很重要,关系到爬虫能不能停下来,是个匿名函数lambda _: self.stop(),最终还是执行engine的self.engine.stop @defer.inlineCallbacks def start(self): """Start the execution engine""" assert not self.running, "Engine already running" self.start_time = time() yield self.signals.send_catch_log_deferred( signal=signals.engine_started) self.running = True self._closewait = defer.Deferred() yield self._closewait def stop(self): """Stop the execution engine gracefully""" assert self.running, "Engine not running" self.running = False dfd = self._close_all_spiders() return dfd.addBoth(lambda _: self._finish_stopping_engine()) def close(self): """Close the execution engine gracefully. If it has already been started, stop it. In all cases, close all spiders and the downloader. """ if self.running: # Will also close spiders and downloader return self.stop() elif self.open_spiders: # Will also close downloader return self._close_all_spiders() else: return defer.succeed(self.downloader.close()) def pause(self): """Pause the execution engine""" self.paused = True def unpause(self): """Resume the execution engine""" self.paused = False def _next_request(self, spider): slot = self.slot if not slot: return if self.paused: return while not self._needs_backout(spider): # 什么时候才会出来呢?? """首次执行状态 True False False False """ # 当爬虫running为True # 心跳关闭slot.closing=True # 下载器有active活跃数量大于16 # 刮擦有active活跃数量大于5000000 if not self._next_request_from_scheduler( spider): # 就是加入这里我放了10个函数呢 """ 会一直递归获取所有的request,丢到下载器进行下载,最后一步为经历了scraper的润色 从调度器中pop出一个request请求 执行下载函数,获取结果,如果是request则继续入队,并递归心跳函数,否则继续往下走 执行对结果的处理 居然会在start_requests之前执行,不可思议,存在记录的话最少要走两次,一次走完,还有一次走结束 执行_next_request_from_scheduler Done! 
首次直接走掉,应为队列里面一个数据都没有 然后从start_requests里面next出一个数据,推到队列面去,所以数据怎么进去,居然是一个一个next出来推进去的,那个百度小哥着实牛逼 """ break if slot.start_requests and not self._needs_backout( spider): # 卧槽,这个_needs_backout原来可以控制并发的大小 try: request = next( slot.start_requests ) # 数据就是从这里,一个一个的从start_requests调出来然后再推进去的,神奇的异步 except StopIteration: slot.start_requests = None except Exception: slot.start_requests = None logger.error('Error while obtaining start requests', exc_info=True, extra={'spider': spider}) else: """ 所以我感觉现在的情况就很尴尬,我刚往里面push一个数据,然后继续调用时,又立马给我pop出来了,真是醉了 """ self.crawl(request, spider) if self.spider_is_idle(spider) and slot.close_if_idle: self._spider_idle(spider) def _needs_backout( self, spider ): # len(self.active) >= self.total_concurrency # return self.active_size > self.max_active_size slot = self.slot return not self.running \ or slot.closing \ or self.downloader.needs_backout() \ or self.scraper.slot.needs_backout() def _next_request_from_scheduler(self, spider): # 怎么感觉这一个函数就可以把所有流程走完啊??????? slot = self.slot request = slot.scheduler.next_request() # 从调度器中pop出一条request记录 if not request: return d = self._download(request, spider) d.addBoth(self._handle_downloader_output, request, spider) d.addErrback( lambda f: logger.info('Error while handling downloader output', exc_info=failure_to_exc_info(f), extra={'spider': spider})) d.addBoth(lambda _: slot.remove_request(request)) d.addErrback( lambda f: logger.info('Error while removing request from slot', exc_info=failure_to_exc_info(f), extra={'spider': spider})) d.addBoth(lambda _: slot.nextcall.schedule()) d.addErrback( lambda f: logger.info('Error while scheduling new request', exc_info=failure_to_exc_info(f), extra={'spider': spider})) return d def _handle_downloader_output(self, response, request, spider): # 这里链接到download下载后的response assert isinstance(response, (Request, Response, Failure)), response # downloader middleware can return requests (for example, redirects) if isinstance(response, Request): # 对于结果,如果是Request,则直接入队,进入self.crawl self.crawl(response, spider) # 对request请求指纹过滤,没问题则入队,然后递归心跳处理 return # response is a Response or Failure d = self.scraper.enqueue_scrape( response, request, spider ) # 如果是正确的response,对下载器输出的结果进行scraper的三个处理函数,如果结果是request继续入队,如果是字典或者Item则调用process_item函数进行后续处理 d.addErrback( lambda f: logger.error('Error while enqueuing downloader output', exc_info=failure_to_exc_info(f), extra={'spider': spider})) return d def spider_is_idle(self, spider): # 爬虫 闲置 状态 ?? 
if not self.scraper.slot.is_idle(): # scraper处于闲置 # scraper is not idle return False if self.downloader.active: # 下载器处于闲置 # downloader has pending requests return False if self.slot.start_requests is not None: # 所有start_requests处于空 # not all start requests are handled return False if self.slot.scheduler.has_pending_requests(): # 调度器 所有等待任务 为空 # scheduler has pending requests return False return True # 判断闲置的四个条件:start_requests、调度器、下载器、scraper均闲置,才会判断爬虫处于闲置状态。 @property def open_spiders(self): return [self.spider] if self.spider else [] def has_capacity(self): """Does the engine have capacity to handle more spiders""" return not bool(self.slot) def crawl(self, request, spider): # 将请求进行指纹过滤,没问题则入队,然后递归执行心跳 assert spider in self.open_spiders, \ "Spider %r not opened when crawling: %s" % (spider.name, request) self.schedule(request, spider) self.slot.nextcall.schedule() # 又执行一次 def schedule(self, request, spider): self.signals.send_catch_log(signal=signals.request_scheduled, request=request, spider=spider) if not self.slot.scheduler.enqueue_request( request ): # 什么是否才会走到这里呢 - 请求指纹过滤,若没有过滤掉,则入队,self._dqpush(request)也就是push进队列 self.signals.send_catch_log(signal=signals.request_dropped, request=request, spider=spider) def download(self, request, spider): d = self._download(request, spider) d.addBoth(self._downloaded, self.slot, request, spider) return d def _downloaded(self, response, slot, request, spider): # 下载结束时,心跳中移除请求 slot.remove_request(request) return self.download(response, spider) \ if isinstance(response, Request) else response # 是Request则继续调用上面函数 def _download(self, request, spider): slot = self.slot slot.add_request(request) def _on_success( response): # 如果是response,就是正常的response对象了,但是应该还没有进行回调处理吧 assert isinstance(response, (Response, Request)) if isinstance(response, Response): response.request = request # tie request to response received logkws = self.logformatter.crawled(request, response, spider) logger.log(*logformatter_adapter(logkws), extra={'spider': spider}) self.signals.send_catch_log(signal=signals.response_received, \ response=response, request=request, spider=spider) return response def _on_complete(_): slot.nextcall.schedule() return _ dwld = self.downloader.fetch( request, spider ) # 爬虫下载入口,调用middle进行下载,把真正的下载函数传递过滤,在middle中间进行回调的时候,处理第一个管道,没了再执行下载器进行处理 dwld.addCallbacks(_on_success) dwld.addBoth(_on_complete) return dwld @defer.inlineCallbacks def open_spider(self, spider, start_requests=(), close_if_idle=True): assert self.has_capacity(), "No free spider slot when opening %r" % \ spider.name logger.info("Spider opened", extra={'spider': spider}) nextcall = CallLaterOnce(self._next_request, spider) scheduler = self.scheduler_cls.from_crawler( self.crawler ) # 对调度器进行实例化。实例化了dupefilter,还有三种队列。一种是优先级队列,还有来两个都是fifo先进先出队列,不过一个是直接存储在内存memory中,一个是通过pickle实例化了 start_requests = yield self.scraper.spidermw.process_start_requests( start_requests, spider) # 第一步执行的居然是爬虫中间件里面的process_start_requests slot = Slot(start_requests, close_if_idle, nextcall, scheduler) self.slot = slot self.spider = spider yield scheduler.open(spider) # 打开内存队列FIFO,优先级队列,并打开过滤器 yield self.scraper.open_spider(spider) # 貌似没做啥事 self.crawler.stats.open_spider(spider) # pass,也没做啥事 yield self.signals.send_catch_log_deferred( signals.spider_opened, spider=spider) # 做了好多事啊,初始化日志,还有各种装啊提,中间件似乎都实现了这个函数? 
slot.nextcall.schedule() # 执行一次self._next_request # 这鬼地方居然只会走一次,也就是初始化的走完这里,但是并不会执行里面的逻辑,应为这个schedule里面用的是reactor.callLater(delay, self),所以是不会执行的,除非你start slot.heartbeat.start(5) def _spider_idle(self, spider): """Called when a spider gets idle. This function is called when there are no remaining pages to download or schedule. It can be called multiple times. If some extension raises a DontCloseSpider exception (in the spider_idle signal handler) the spider is not closed until the next loop and this function is guaranteed to be called (at least) once again for this spider. """ res = self.signals.send_catch_log(signal=signals.spider_idle, \ spider=spider, dont_log=DontCloseSpider) if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \ for _, x in res): return if self.spider_is_idle(spider): self.close_spider(spider, reason='finished') def close_spider(self, spider, reason='cancelled'): """Close (cancel) spider and clear all its outstanding requests""" slot = self.slot if slot.closing: return slot.closing logger.info("Closing spider (%(reason)s)", {'reason': reason}, extra={'spider': spider}) dfd = slot.close() def log_failure(msg): def errback(failure): logger.error(msg, exc_info=failure_to_exc_info(failure), extra={'spider': spider}) return errback dfd.addBoth(lambda _: self.downloader.close()) dfd.addErrback(log_failure('Downloader close failure')) dfd.addBoth(lambda _: self.scraper.close_spider(spider)) dfd.addErrback(log_failure('Scraper close failure')) dfd.addBoth(lambda _: slot.scheduler.close(reason)) dfd.addErrback(log_failure('Scheduler close failure')) dfd.addBoth(lambda _: self.signals.send_catch_log_deferred( signal=signals.spider_closed, spider=spider, reason=reason)) dfd.addErrback(log_failure('Error while sending spider_close signal')) dfd.addBoth( lambda _: self.crawler.stats.close_spider(spider, reason=reason)) dfd.addErrback(log_failure('Stats close failure')) dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)", {'reason': reason}, extra={'spider': spider})) dfd.addBoth(lambda _: setattr(self, 'slot', None)) dfd.addErrback(log_failure('Error while unassigning slot')) dfd.addBoth(lambda _: setattr(self, 'spider', None)) # 这里有点意思 dfd.addErrback(log_failure('Error while unassigning spider')) dfd.addBoth(lambda _: self._spider_closed_callback(spider)) return dfd def _close_all_spiders(self): dfds = [ self.close_spider(s, reason='shutdown') for s in self.open_spiders ] dlist = defer.DeferredList(dfds) return dlist @defer.inlineCallbacks def _finish_stopping_engine(self): yield self.signals.send_catch_log_deferred( signal=signals.engine_stopped) self._closewait.callback(None)
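The back-pressure conditions this version keeps circling around (_needs_backout) boil down to two counters: the downloader stops accepting work once its active set reaches the concurrency limit, and the scraper once the total size of responses being processed passes its cap. A simplified sketch of that check, with a single global counter instead of Scrapy's per-slot bookkeeping (the class names are invented; the limits of 16 and 5,000,000 mirror the defaults mentioned above but are otherwise illustrative):

class DownloaderSketch:
    def __init__(self, total_concurrency=16):       # CONCURRENT_REQUESTS default
        self.total_concurrency = total_concurrency
        self.active = set()                         # requests currently being fetched

    def needs_backout(self):
        return len(self.active) >= self.total_concurrency

class ScraperSlotSketch:
    def __init__(self, max_active_size=5_000_000):  # bytes of response data being scraped
        self.max_active_size = max_active_size
        self.active_size = 0

    def needs_backout(self):
        return self.active_size > self.max_active_size

downloader, scraper_slot = DownloaderSketch(), ScraperSlotSketch()
while not (downloader.needs_backout() or scraper_slot.needs_backout()):
    # pop the next request from the scheduler and hand it to the downloader ...
    break                                           # loop body elided in this sketch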
class ExecutionEngine(object): def __init__(self, crawler, spider_closed_callback): self.crawler = crawler self.settings = crawler.settings # 配置 self.signals = crawler.signals # 信号 self.logformatter = crawler.logformatter # 日志格式 self.slot = None self.spider = None self.running = False self.paused = False # 提取scheduler调度器类名(未进行实例化), 其在open_spdier中实例化 self.scheduler_cls = load_object(self.settings['SCHEDULER']) # 提取downloader下载器类名, 并实例化, 见scrapy/core/downloader/__init__.py文件 downloader_cls = load_object(self.settings['DOWNLOADER']) self.downloader = downloader_cls(crawler) # 实例化scrapyer: engine和spider之间的桥梁, 见scrapy/core/scraper.py self.scraper = Scraper(crawler) self._spider_closed_callback = spider_closed_callback @defer.inlineCallbacks def start(self): """Start the execution engine""" assert not self.running, "Engine already running" self.start_time = time() yield self.signals.send_catch_log_deferred( signal=signals.engine_started) self.running = True self._closewait = defer.Deferred() yield self._closewait def stop(self): """Stop the execution engine gracefully""" assert self.running, "Engine not running" self.running = False dfd = self._close_all_spiders() return dfd.addBoth(lambda _: self._finish_stopping_engine()) def close(self): """Close the execution engine gracefully. If it has already been started, stop it. In all cases, close all spiders and the downloader. """ if self.running: # Will also close spiders and downloader return self.stop() elif self.open_spiders: # Will also close downloader return self._close_all_spiders() else: return defer.succeed(self.downloader.close()) def pause(self): """Pause the execution engine""" self.paused = True def unpause(self): """Resume the execution engine""" self.paused = False def _next_request(self, spider): """ 1 在CallLaterOnce被注册 2 通过nextcall.schedule开启调度工作 """ slot = self.slot if not slot: return if self.paused: # 暂停时 return while not self._needs_backout(spider): # 是否等待 # 循环从scheduler中获取Request, 首次会失败. # 这里进行下载任务 if not self._next_request_from_scheduler(spider): break # 循环调用, 消费yield, 这里slot在open_spider中初始化, start_requests==process_start_requests方法 # 从而调用每一个爬虫中间件的process_start_requests方法, 批量处理种子Request. # 这里start_requests实际上包含 Request 的可迭代对象 if slot.start_requests and not self._needs_backout(spider): # start_requests有 Requests并且不需要等待时 try: request = next(slot.start_requests) # 提取下一个种子请求, 不管是不是首次 except StopIteration: slot.start_requests = None except Exception: slot.start_requests = None logger.error('Error while obtaining start requests', exc_info=True, extra={'spider': spider}) else: # 将request放入到scheduler的队列中 self.crawl(request, spider) if self.spider_is_idle(spider) and slot.close_if_idle: self._spider_idle(spider) # 关闭spider, 满足空闲并且设置了空闲就关闭标志位之后 def _needs_backout(self, spider): # 是否需要等待, 取决于如下条件: # 1. engine是否仍然运行 # 2. slot是否关闭 # 3. downloader下载超过预设的最大数: CONCURRENT_REQUESTS # 4. 
scraper处理response超过预设 slot = self.slot return not self.running \ or slot.closing \ or self.downloader.needs_backout() \ or self.scraper.slot.needs_backout() def _next_request_from_scheduler(self, spider): slot = self.slot # 获取下一个request, 弹出队列中的request, 见scrapy/core/scheduler.py request = slot.scheduler.next_request() if not request: return # "下载"该request, 调用Downloader中相应的下载器, 在这之前会注册一批的回调函数, 返回即表示下载成功 d = self._download(request, spider) # 对下载结果做处理(真正下载见scrapy/core/downloader/__init__.py中的_download函数) d.addBoth(self._handle_downloader_output, request, spider) d.addErrback( lambda f: logger.info('Error while handling downloader output', exc_info=failure_to_exc_info(f), extra={'spider': spider})) d.addBoth(lambda _: slot.remove_request(request)) d.addErrback( lambda f: logger.info('Error while removing request from slot', exc_info=failure_to_exc_info(f), extra={'spider': spider})) d.addBoth(lambda _: slot.nextcall.schedule()) d.addErrback( lambda f: logger.info('Error while scheduling new request', exc_info=failure_to_exc_info(f), extra={'spider': spider})) return d def _handle_downloader_output(self, response, request, spider): # 下载结果必须为下面三者之一: Request/Response/Failure assert isinstance(response, (Request, Response, Failure)), response # downloader middleware can return requests (for example, redirects) # 结果 1: Request, 则必须重新进行一轮下载操作 if isinstance(response, Request): self.crawl(response, spider) return # response is a Response or Failure # 结果 2: 利用scraper完成同spiders/pipeline交互, 见scrapy/core/scraper.py d = self.scraper.enqueue_scrape(response, request, spider) d.addErrback( lambda f: logger.error('Error while enqueuing downloader output', exc_info=failure_to_exc_info(f), extra={'spider': spider})) return d def spider_is_idle(self, spider): if not self.scraper.slot.is_idle(): # scraper is not idle return False if self.downloader.active: # downloader has pending requests return False if self.slot.start_requests is not None: # not all start requests are handled return False if self.slot.scheduler.has_pending_requests(): # scheduler has pending requests return False return True @property def open_spiders(self): return [self.spider] if self.spider else [] def has_capacity(self): """Does the engine have capacity to handle more spiders""" return not bool(self.slot) def crawl(self, request, spider): assert spider in self.open_spiders, \ "Spider %r not opened when crawling: %s" % (spider.name, request) # 将request放入scheduler队列中, 以便下次循环调用 self.schedule(request, spider) # 进行下一次调度 self.slot.nextcall.schedule() def schedule(self, request, spider): self.signals.send_catch_log(signal=signals.request_scheduled, request=request, spider=spider) # 入队列 if not self.slot.scheduler.enqueue_request(request): self.signals.send_catch_log(signal=signals.request_dropped, request=request, spider=spider) def download(self, request, spider): d = self._download(request, spider) d.addBoth(self._downloaded, self.slot, request, spider) return d def _downloaded(self, response, slot, request, spider): slot.remove_request(request) return self.download(response, spider) \ if isinstance(response, Request) else response def _download(self, request, spider): slot = self.slot slot.add_request(request) def _on_success(response): # 成功之后的回调函数, 结果须为request/response assert isinstance(response, (Response, Request)) if isinstance(response, Response): response.request = request # tie request to response received logkws = self.logformatter.crawled(request, response, spider) if logkws is not None: logger.log(*logformatter_adapter(logkws), 
extra={'spider': spider}) self.signals.send_catch_log(signal=signals.response_received, response=response, request=request, spider=spider) return response def _on_complete(_): # 下载完成之后的回调, 直接开始下一次调度 slot.nextcall.schedule() return _ # 下载请求, 调用 Downloader 进行下载(实际上没开始), 见scrapy/core/downloader/__init__.py dwld = self.downloader.fetch(request, spider) # 注册回调函数 dwld.addCallbacks(_on_success) dwld.addBoth(_on_complete) return dwld @defer.inlineCallbacks def open_spider(self, spider, start_requests=(), close_if_idle=True): assert self.has_capacity(), "No free spider slot when opening %r" % \ spider.name logger.info("Spider opened", extra={'spider': spider}) # 注册next_request调度方法, 以便循环调度(利用twisted的reactor) nextcall = CallLaterOnce(self._next_request, spider) # 实例化调度器, 见scrapy/core/scheduler.py scheduler = self.scheduler_cls.from_crawler(self.crawler) # 爬虫中间件, 处理种子Request, 见scrapy/core/spidermw.py, 其中start_requests一般为包含 Requests的可迭代对象 start_requests = yield self.scraper.spidermw.process_start_requests( start_requests, spider) # 封装slot对象, 并将返回的包含可迭代对象start_requests赋值给slot slot = Slot(start_requests, close_if_idle, nextcall, scheduler) self.slot = slot self.spider = spider # 调用调度器中的open, 进行爬虫对象绑定, 各种任务队列初始化, 并且开启指纹过滤, 见scrapy/core/scheduler.py yield scheduler.open(spider) # 见scrapy/core/scraper.py, 实例化实例化或者绑定spider到pipeline manager类上, 见scrapy/pipelines/__init__.py # 批量调度爬虫中间件所有open_spdier方法 yield self.scraper.open_spider(spider) # scrapy/statscollectors/__init__.py self.crawler.stats.open_spider(spider) yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider) # 开始调度, 实际执行上面注册的_next_request方法 slot.nextcall.schedule() slot.heartbeat.start(5) def _spider_idle(self, spider): """Called when a spider gets idle. This function is called when there are no remaining pages to download or schedule. It can be called multiple times. If some extension raises a DontCloseSpider exception (in the spider_idle signal handler) the spider is not closed until the next loop and this function is guaranteed to be called (at least) once again for this spider. 
""" res = self.signals.send_catch_log(signal=signals.spider_idle, spider=spider, dont_log=DontCloseSpider) if any( isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) for _, x in res): return if self.spider_is_idle(spider): self.close_spider(spider, reason='finished') def close_spider(self, spider, reason='cancelled'): """Close (cancel) spider and clear all its outstanding requests""" slot = self.slot if slot.closing: return slot.closing logger.info("Closing spider (%(reason)s)", {'reason': reason}, extra={'spider': spider}) dfd = slot.close() def log_failure(msg): def errback(failure): logger.error(msg, exc_info=failure_to_exc_info(failure), extra={'spider': spider}) return errback dfd.addBoth(lambda _: self.downloader.close()) dfd.addErrback(log_failure('Downloader close failure')) dfd.addBoth(lambda _: self.scraper.close_spider(spider)) dfd.addErrback(log_failure('Scraper close failure')) dfd.addBoth(lambda _: slot.scheduler.close(reason)) dfd.addErrback(log_failure('Scheduler close failure')) dfd.addBoth(lambda _: self.signals.send_catch_log_deferred( signal=signals.spider_closed, spider=spider, reason=reason)) dfd.addErrback(log_failure('Error while sending spider_close signal')) dfd.addBoth( lambda _: self.crawler.stats.close_spider(spider, reason=reason)) dfd.addErrback(log_failure('Stats close failure')) dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)", {'reason': reason}, extra={'spider': spider})) dfd.addBoth(lambda _: setattr(self, 'slot', None)) dfd.addErrback(log_failure('Error while unassigning slot')) dfd.addBoth(lambda _: setattr(self, 'spider', None)) dfd.addErrback(log_failure('Error while unassigning spider')) dfd.addBoth(lambda _: self._spider_closed_callback(spider)) return dfd def _close_all_spiders(self): dfds = [ self.close_spider(s, reason='shutdown') for s in self.open_spiders ] dlist = defer.DeferredList(dfds) return dlist @defer.inlineCallbacks def _finish_stopping_engine(self): yield self.signals.send_catch_log_deferred( signal=signals.engine_stopped) self._closewait.callback(None)
class ExecutionEngine(object): def __init__(self, crawler, spider_closed_callback): self.crawler = crawler self.settings = crawler.settings self.signals = crawler.signals #使用crawler的信号管理器 self.logformatter = crawler.logformatter self.slot = None self.spider = None self.running = False self.paused = False self.scheduler_cls = load_object( self.settings['SCHEDULER']) #根据配置的调度器类来生成对应的对象 downloader_cls = load_object( self.settings['DOWNLOADER']) #根据配置的下载器类来生成对应的类 self.downloader = downloader_cls(crawler) self.scraper = Scraper(crawler) #生成一个刮取器 self._spider_closed_callback = spider_closed_callback @defer.inlineCallbacks def start(self): """Start the execution engine""" assert not self.running, "Engine already running" self.start_time = time() yield self.signals.send_catch_log_deferred( signal=signals.engine_started) self.running = True self._closewait = defer.Deferred() yield self._closewait def stop(self): """Stop the execution engine gracefully""" assert self.running, "Engine not running" self.running = False dfd = self._close_all_spiders() return dfd.addBoth(lambda _: self._finish_stopping_engine()) def close(self): """Close the execution engine gracefully. If it has already been started, stop it. In all cases, close all spiders and the downloader. """ if self.running: # Will also close spiders and downloader return self.stop() elif self.open_spiders: # Will also close downloader return self._close_all_spiders() else: return defer.succeed(self.downloader.close()) def pause(self): """Pause the execution engine""" self.paused = True def unpause(self): """Resume the execution engine""" self.paused = False """ 被CallLaterOnce包装后被slot设置, 主要在reactor中的heartbeat中被定时调用(在slot中设置),不过也可以被代码主动调用 """ def _next_request(self, spider): slot = self.slot if not slot: return if self.paused: return #此处应该是通过调度器异步的获取待处理的request while not self._needs_backout(spider): if not self._next_request_from_scheduler(spider): break if slot.start_requests and not self._needs_backout(spider): try: request = next(slot.start_requests) except StopIteration: slot.start_requests = None except Exception: slot.start_requests = None logger.error('Error while obtaining start requests', exc_info=True, extra={'spider': spider}) else: self.crawl(request, spider) if self.spider_is_idle(spider) and slot.close_if_idle: self._spider_idle(spider) """ 判断当前引擎的状态是不是异常,需不需要回退(backout) """ def _needs_backout(self, spider): slot = self.slot return not self.running \ or slot.closing \ or self.downloader.needs_backout() \ or self.scraper.slot.needs_backout() """ 从调度器中请求下一个request,如果有request待处理, 那么就对这个request进行下载处理,并对下载的操作添加一下回调函数 """ def _next_request_from_scheduler(self, spider): slot = self.slot request = slot.scheduler.next_request() if not request: return # 调用下载器进行下载 d = self._download(request, spider) d.addBoth(self._handle_downloader_output, request, spider) d.addErrback( lambda f: logger.info('Error while handling downloader output', exc_info=failure_to_exc_info(f), extra={'spider': spider})) d.addBoth(lambda _: slot.remove_request(request)) d.addErrback( lambda f: logger.info('Error while removing request from slot', exc_info=failure_to_exc_info(f), extra={'spider': spider})) d.addBoth(lambda _: slot.nextcall.schedule()) d.addErrback( lambda f: logger.info('Error while scheduling new request', exc_info=failure_to_exc_info(f), extra={'spider': spider})) return d """ 对下载器的结果输出进行的异步处理 """ def _handle_downloader_output(self, response, request, spider): assert isinstance(response, (Request, Response, Failure)), response # downloader middleware can 
return requests (for example, redirects) if isinstance(response, Request): self.crawl(response, spider) return # response is a Response or Failure d = self.scraper.enqueue_scrape(response, request, spider) d.addErrback( lambda f: logger.error('Error while enqueuing downloader output', exc_info=failure_to_exc_info(f), extra={'spider': spider})) return d def spider_is_idle(self, spider): if not self.scraper.slot.is_idle(): # scraper is not idle return False if self.downloader.active: # downloader has pending requests return False if self.slot.start_requests is not None: # not all start requests are handled return False if self.slot.scheduler.has_pending_requests(): # scheduler has pending requests return False return True @property def open_spiders(self): return [self.spider] if self.spider else [] def has_capacity(self): """Does the engine have capacity to handle more spiders""" return not bool(self.slot) """ 调用schedule请求slot处理request,并且显式通知slot进行处理 """ def crawl(self, request, spider): assert spider in self.open_spiders, \ "Spider %r not opened when crawling: %s" % (spider.name, request) self.schedule(request, spider) self.slot.nextcall.schedule() """ 通过slot将request入队,等待被reactor处理, """ def schedule(self, request, spider): self.signals.send_catch_log(signal=signals.request_scheduled, request=request, spider=spider) if not self.slot.scheduler.enqueue_request(request): self.signals.send_catch_log(signal=signals.request_dropped, request=request, spider=spider) """ 根据请求进行下载,其实是调用_download进行下载 并且在下载完成之后,通过reactor异步调度_downloaded函数。 """ def download(self, request, spider): d = self._download(request, spider) d.addBoth(self._downloaded, self.slot, request, spider) return d """ 在下载完成之后,从slot中将要对应的request移除,然后在判断response的类型: 如果是Request,则继续进行下载;若是Response,则直接返回 """ def _downloaded(self, response, slot, request, spider): slot.remove_request(request) return self.download(response, spider) if isinstance( response, Request) else response """ 将下载的任务由下载器downloader进行下载的操作 并添加了两个回调函数: 在下载完毕complete的时候 在下载成功success的时候 """ def _download(self, request, spider): slot = self.slot slot.add_request(request) def _on_success(response): assert isinstance(response, (Response, Request)) if isinstance(response, Response): response.request = request # tie request to response received logkws = self.logformatter.crawled(request, response, spider) logger.log(*logformatter_adapter(logkws), extra={'spider': spider}) self.signals.send_catch_log(signal=signals.response_received, response=response, request=request, spider=spider) return response """ 在下载完成的时候显式调用slot进行调度处理 """ def _on_complete(_): slot.nextcall.schedule() return _ # 从下载器中获取下载的结果deferred dwld = self.downloader.fetch(request, spider) dwld.addCallbacks(_on_success) dwld.addBoth(_on_complete) return dwld """ ### 被scrapy.crawler.crawl调用 开启爬虫系统 创建调度器并开启, """ @defer.inlineCallbacks def open_spider(self, spider, start_requests=(), close_if_idle=True): assert self.has_capacity(), "No free spider slot when opening %r" % \ spider.name logger.info("Spider opened", extra={'spider': spider}) nextcall = CallLaterOnce(self._next_request, spider) scheduler = self.scheduler_cls.from_crawler(self.crawler) start_requests = yield self.scraper.spidermw.process_start_requests( start_requests, spider) #先调用spider中间件进行处理 slot = Slot(start_requests, close_if_idle, nextcall, scheduler) self.slot = slot self.spider = spider yield scheduler.open(spider) #开启调度器 yield self.scraper.open_spider(spider) self.crawler.stats.open_spider(spider) yield 
self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider) slot.nextcall.schedule() slot.heartbeat.start(5) """ 当调度器空闲的时候调用(在_next_request中判断)。 可以被多次调用。 如果某些extension引起了DontCloseSpider异常(在spider_idle 信号的处理器中),spider就不会关闭,直到下一个循环。 并且这个方法会保证至少被执行一次 """ def _spider_idle(self, spider): """Called when a spider gets idle. This function is called when there are no remaining pages to download or schedule. It can be called multiple times. If some extension raises a DontCloseSpider exception (in the spider_idle signal handler) the spider is not closed until the next loop and this function is guaranteed to be called (at least) once again for this spider. """ res = self.signals.send_catch_log(signal=signals.spider_idle, spider=spider, dont_log=DontCloseSpider) if any( isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) for _, x in res): return if self.spider_is_idle(spider): self.close_spider(spider, reason='finished') """ 关闭爬虫(引擎) 发送信息:关闭下载器、使用scrapyer关闭爬虫spider、关闭调度器 发送关闭日志、关闭scawler关闭爬虫的信息、打印日志、 重设当前的slot为空、重设当前的spider为空等 """ def close_spider(self, spider, reason='cancelled'): """Close (cancel) spider and clear all its outstanding requests""" slot = self.slot if slot.closing: return slot.closing logger.info("Closing spider (%(reason)s)", {'reason': reason}, extra={'spider': spider}) dfd = slot.close() def log_failure(msg): def errback(failure): logger.error(msg, exc_info=failure_to_exc_info(failure), extra={'spider': spider}) return errback dfd.addBoth(lambda _: self.downloader.close()) dfd.addErrback(log_failure('Downloader close failure')) dfd.addBoth(lambda _: self.scraper.close_spider(spider)) dfd.addErrback(log_failure('Scraper close failure')) dfd.addBoth(lambda _: slot.scheduler.close(reason)) dfd.addErrback(log_failure('Scheduler close failure')) dfd.addBoth(lambda _: self.signals.send_catch_log_deferred( signal=signals.spider_closed, spider=spider, reason=reason)) dfd.addErrback(log_failure('Error while sending spider_close signal')) dfd.addBoth( lambda _: self.crawler.stats.close_spider(spider, reason=reason)) dfd.addErrback(log_failure('Stats close failure')) dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)", {'reason': reason}, extra={'spider': spider})) dfd.addBoth(lambda _: setattr(self, 'slot', None)) dfd.addErrback(log_failure('Error while unassigning slot')) dfd.addBoth(lambda _: setattr(self, 'spider', None)) dfd.addErrback(log_failure('Error while unassigning spider')) dfd.addBoth(lambda _: self._spider_closed_callback(spider)) return dfd def _close_all_spiders(self): dfds = [ self.close_spider(s, reason='shutdown') for s in self.open_spiders ] dlist = defer.DeferredList(dfds) return dlist @defer.inlineCallbacks def _finish_stopping_engine(self): yield self.signals.send_catch_log_deferred( signal=signals.engine_stopped) self._closewait.callback(None)
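All of these versions share the same start/stop handshake: start() parks on a _closewait Deferred that only the stop path eventually fires, so the Deferred returned by start() resolves exactly when shutdown has completed. A stripped-down sketch of that handshake (signals, spider closing and downloader shutdown omitted):

from twisted.internet import defer, reactor

class EngineSketch:
    @defer.inlineCallbacks
    def start(self):
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait                       # parked here until stop() fires it
        print("engine fully stopped")

    def stop(self):
        self.running = False
        self._closewait.callback(None)              # resolves the Deferred returned by start()

engine = EngineSketch()
engine.start()                                      # returns a still-pending Deferred
reactor.callLater(0.1, engine.stop)
reactor.callLater(0.2, reactor.stop)
reactor.run()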