Example #1
class ExecutionEngine(object):
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self._concurrent_spiders = self.settings.getint(
            'CONCURRENT_SPIDERS', 1)
        if self._concurrent_spiders != 1:
            warnings.warn("CONCURRENT_SPIDERS settings is deprecated, use " \
                "Scrapyd max_proc config instead", ScrapyDeprecationWarning)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            slot.nextcall.schedule(5)
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                log.err(None, 'Obtaining request from start requests', \
                        spider=spider)
            else:
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = self.scraper.slot.is_idle()
        pending = self.slot.scheduler.has_pending_requests()
        downloading = bool(self.downloader.active)
        pending_start_requests = self.slot.start_requests is not None
        idle = scraper_idle and not (pending or downloading
                                     or pending_start_requests)
        return idle

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request,
                                    spider=spider)
        return self.slot.scheduler.enqueue_request(request)

    def download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)

        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                log.msg(spider=spider, **logkws)
                self.signals.send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                   spider=spider)
        slot.nextcall.schedule()

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            self.slot.nextcall.schedule(5)
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        log.msg(format="Closing spider (%(reason)s)",
                reason=reason,
                spider=spider)

        dfd = slot.close()

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log.err, spider=spider)

        # XXX: spider_stats argument was added for backwards compatibility with
        # stats collection refactoring added in 0.15. it should be removed in 0.17.
        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(signal=signals.spider_closed, \
            spider=spider, reason=reason, spider_stats=self.crawler.stats.get_stats()))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(
            lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: log.msg(
            format="Spider closed (%(reason)s)", reason=reason, spider=spider))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [
            self.close_spider(s, reason='shutdown') for s in self.open_spiders
        ]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_stopped)
        self._closewait.callback(None)
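
Both versions above drive their request loop through nextcall.schedule(), where nextcall wraps _next_request in a CallLaterOnce helper. A minimal sketch of such a helper, reconstructed from how it is used in the engine code (not copied from scrapy.utils.reactor), could look like this:

from twisted.internet import reactor

class CallLaterOnce:
    """Schedule a call on the next reactor iteration; extra schedule() calls
    made before it fires are coalesced into a single call."""

    def __init__(self, func, *a, **kw):
        self._func, self._a, self._kw = func, a, kw
        self._call = None

    def schedule(self, delay=0):
        # Only queue a new reactor call if none is already pending.
        if self._call is None:
            self._call = reactor.callLater(delay, self)

    def cancel(self):
        if self._call:
            self._call.cancel()
            self._call = None

    def __call__(self):
        self._call = None
        return self._func(*self._a, **self._kw)
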
Example #2
class ExecutionEngine:
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        if self.running:
            raise RuntimeError("Engine already running")
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        if not self.running:
            raise RuntimeError("Engine not running")
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True,
                             extra={'spider': spider})
            else:
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slot
        return (not self.running or slot.closing
                or self.downloader.needs_backout()
                or self.scraper.slot.needs_backout())

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(
            lambda f: logger.info('Error while handling downloader output',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(
            lambda f: logger.info('Error while removing request from slot',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(
            lambda f: logger.info('Error while scheduling new request',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request, spider):
        if not isinstance(response, (Request, Response, Failure)):
            raise TypeError(
                "Incorrect type: expected Request, Response or Failure, got "
                f"{type(response)}: {response!r}")
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(
            lambda f: logger.error('Error while enqueuing downloader output',
                                   exc_info=failure_to_exc_info(f),
                                   extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        if spider not in self.open_spiders:
            raise RuntimeError(
                f"Spider {spider.name!r} not opened when crawling: {request}")
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signals.request_scheduled,
                                    request=request,
                                    spider=spider)
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signals.request_dropped,
                                        request=request,
                                        spider=spider)

    def download(self, request, spider):
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) if isinstance(
            response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)

        def _on_success(response):
            if not isinstance(response, (Response, Request)):
                raise TypeError(
                    "Incorrect type: expected Response or Request, got "
                    f"{type(response)}: {response!r}")
            if isinstance(response, Response):
                if response.request is None:
                    response.request = request
                logkws = self.logformatter.crawled(response.request, response,
                                                   spider)
                if logkws is not None:
                    logger.log(*logformatter_adapter(logkws),
                               extra={'spider': spider})
                self.signals.send_catch_log(
                    signal=signals.response_received,
                    response=response,
                    request=response.request,
                    spider=spider,
                )
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        if not self.has_capacity():
            raise RuntimeError(
                f"No free spider slot when opening {spider.name!r}")
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                   spider=spider)
        slot.nextcall.schedule()
        slot.heartbeat.start(5)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signals.spider_idle,
                                          spider=spider,
                                          dont_log=DontCloseSpider)
        if any(
                isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
                for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)", {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(msg,
                             exc_info=failure_to_exc_info(failure),
                             extra={'spider': spider})

            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(
            lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [
            self.close_spider(s, reason='shutdown') for s in self.open_spiders
        ]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_stopped)
        self._closewait.callback(None)
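
The Slot object that open_spider() builds is not part of this listing. Below is a rough sketch of the interface the engine relies on (in-progress bookkeeping, a closing deferred, and the heartbeat started with slot.heartbeat.start(5)); the behaviour is inferred from the engine code above and assumes Twisted's LoopingCall, it is not a verbatim copy of Scrapy's Slot:

from twisted.internet import defer, task

class Slot:
    """Per-spider engine state: in-progress requests, the scheduler,
    the nextcall trampoline and a heartbeat that re-triggers it."""

    def __init__(self, start_requests, close_if_idle, nextcall, scheduler):
        self.closing = False            # becomes a Deferred once close() is called
        self.inprogress = set()         # requests currently being downloaded
        self.start_requests = iter(start_requests)
        self.close_if_idle = close_if_idle
        self.nextcall = nextcall
        self.scheduler = scheduler
        self.heartbeat = task.LoopingCall(nextcall.schedule)

    def add_request(self, request):
        self.inprogress.add(request)

    def remove_request(self, request):
        self.inprogress.remove(request)
        self._maybe_fire_closing()

    def close(self):
        self.closing = defer.Deferred()
        self._maybe_fire_closing()
        return self.closing

    def _maybe_fire_closing(self):
        # Only resolve the closing deferred once no download is still in flight.
        if self.closing and not self.inprogress:
            if self.nextcall:
                self.nextcall.cancel()
                if self.heartbeat.running:
                    self.heartbeat.stop()
            self.closing.callback(None)
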
Example #3
File: engine.py Project: 01-/scrapy
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True, extra={'spider': spider})
            else:
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                            exc_info=failure_to_exc_info(f),
                                            extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                request=request, spider=spider)
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request, spider=spider)

    def download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                self.signals.send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
        slot.nextcall.schedule()
        slot.heartbeat.start(5)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)",
                    {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(
                    msg,
                    exc_info=failure_to_exc_info(failure),
                    extra={'spider': spider}
                )
            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
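
For orientation, this is roughly how a crawler drives one of these engines: open a spider, start the engine (its deferred only fires after the engine stops), and let the idle logic close the spider. The run_spider helper and the callback wiring below are illustrative, not Scrapy's actual Crawler code:

from twisted.internet import defer

@defer.inlineCallbacks
def run_spider(crawler, spider, start_requests):
    # Illustrative driver, assuming a running Twisted reactor and a fully
    # configured crawler; Scrapy itself wires this up inside Crawler.crawl().
    engine = ExecutionEngine(crawler,
                             spider_closed_callback=lambda _: engine.stop())
    yield engine.open_spider(spider, start_requests, close_if_idle=True)
    yield engine.start()  # resolves once the spider has closed and stop() ran
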
Example #4
File: engine.py Project: tml/scrapy
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.settings = crawler.settings
        self.slots = {}
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        self.downloader = Downloader(crawler)
        self.scraper = Scraper(crawler)
        self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS')
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield send_catch_log_deferred(signal=signals.engine_started)
        self.running = True

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        try:
            slot = self.slots[spider]
        except KeyError:
            return

        if self.paused:
            slot.nextcall.schedule(5)
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = slot.start_requests.next()
                self.crawl(request, spider)
            except StopIteration:
                slot.start_requests = None

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slots[spider]
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slots[spider].needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slots[spider]
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.slots \
            and self.scraper.slots[spider].is_idle()
        pending = self.slots[spider].scheduler.has_pending_requests()
        downloading = bool(self.downloader.slots)
        idle = scraper_idle and not (pending or downloading)
        return idle

    @property
    def open_spiders(self):
        return self.slots.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.slots) < self._concurrent_spiders

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slots[spider].nextcall.schedule()

    def schedule(self, request, spider):
        return self.slots[spider].scheduler.enqueue_request(request)

    def download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)
        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                log.msg(log.formatter.crawled(request, response, spider), \
                    level=log.DEBUG, spider=spider)
                send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_error(failure):
            failure.request = request
            return failure

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success, _on_error)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=None, close_if_idle=True):
        assert self.has_capacity(), "No free spider slots when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_settings(self.settings)
        slot = Slot(start_requests or (), close_if_idle, nextcall, scheduler)
        self.slots[spider] = slot
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        stats.open_spider(spider)
        yield send_catch_log_deferred(signals.spider_opened, spider=spider)
        slot.nextcall.schedule()

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            self.slots[spider].nextcall.schedule(5)
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slots[spider]
        if slot.closing:
            return slot.closing
        log.msg("Closing spider (%s)" % reason, spider=spider)

        dfd = slot.close()

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: send_catch_log_deferred(signal=signals.spider_closed, \
            spider=spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: log.msg("Spider closed (%s)" % reason, spider=spider))

        dfd.addBoth(lambda _: self.slots.pop(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield send_catch_log_deferred(signal=signals.engine_stopped)
        yield stats.engine_stopped()
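
The back-out check is what throttles the request loop in every version above. On the downloader side it amounts to comparing active downloads against the configured concurrency; the attribute names in this sketch are assumptions based on how the engine polls the downloader, not a verbatim copy of Scrapy's Downloader:

class DownloaderBackoutSketch:
    """Hedged sketch of the downloader half of _needs_backout()."""

    def __init__(self, total_concurrency=16):
        self.active = set()                       # requests currently downloading
        self.total_concurrency = total_concurrency

    def needs_backout(self):
        # Back out once the number of in-flight downloads reaches the limit.
        return len(self.active) >= self.total_concurrency
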
Example #6
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        self.downloader = Downloader(crawler)
        self.scraper = Scraper(crawler)
        self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS', 1)
        if self._concurrent_spiders != 1:
            warnings.warn("CONCURRENT_SPIDERS settings is deprecated, use " \
                "Scrapyd max_proc config instead", ScrapyDeprecationWarning)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            slot.nextcall.schedule(5)
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                log.err(None, 'Obtaining request from start requests', \
                        spider=spider)
            else:
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = self.scraper.slot.is_idle()
        pending = self.slot.scheduler.has_pending_requests()
        downloading = bool(self.downloader.active)
        idle = scraper_idle and not (pending or downloading)
        return idle

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                request=request, spider=spider)
        return self.slot.scheduler.enqueue_request(request)

    def download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                log.msg(spider=spider, **logkws)
                self.signals.send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
        slot.nextcall.schedule()

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            self.slot.nextcall.schedule(5)
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        log.msg(format="Closing spider (%(reason)s)", reason=reason, spider=spider)

        dfd = slot.close()

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log.err, spider=spider)

        # XXX: spider_stats argument was added for backwards compatibility with
        # stats collection refactoring added in 0.15. it should be removed in 0.17.
        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(signal=signals.spider_closed, \
            spider=spider, reason=reason, spider_stats=self.crawler.stats.get_stats()))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: log.msg(format="Spider closed (%(reason)s)", reason=reason, spider=spider))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
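
As the _spider_idle() docstring notes, an extension can veto closing by raising DontCloseSpider from a spider_idle handler. A minimal example of such an extension follows; the has_pending_work() check is a placeholder for application-specific logic:

from scrapy import signals
from scrapy.exceptions import DontCloseSpider

class KeepSpiderAlive:
    """Keep the spider open while some external work queue is non-empty."""

    def __init__(self, crawler):
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_idle(self, spider):
        if self.has_pending_work(spider):
            # The engine schedules another idle check instead of closing.
            raise DontCloseSpider

    def has_pending_work(self, spider):
        return False  # placeholder: replace with an application-specific check
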
Example #7
class ExecutionEngine(object):
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        # Look up the Scheduler class configured in settings.
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        # Likewise, look up the Downloader class and instantiate it.
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        # Instantiate the Scraper, the bridge between the engine and the spider class.
        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        """
        这个_next_request方法有2种调用途径,一种是通过reactor的5s心跳定时启动运行,另一种则是在流程中需要时主动调用。
        """
        slot = self.slot
        if not slot:
            return

        if self.paused:
            return

        # Pull requests from the scheduler: keep scheduling queued requests
        # for asynchronous download until a back-out condition (for example,
        # the maximum concurrency) is reached.
        while not self._needs_backout(spider):
            # _next_request_from_scheduler() is what actually starts a download.
            if not self._next_request_from_scheduler(spider):
                break

        # If start_requests still has items and no back-out is needed.
        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True,
                             extra={'spider': spider})
            else:
                # crawl() puts the request into the scheduler's in-memory queue
                # and then schedules an immediate _next_request call.
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        """
        是否需要等待,取决4个条件
        1. Engine是否stop
        2. slot是否close
        3. downloader下载超过预设
        4. scraper处理response超过预设
        """
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()  # take the next request from the scheduler
        if not request:
            return
        # _download() runs each downloader middleware's process_request, sends
        # the request for download via the downloader's _enqueue_request, and
        # attaches the response callbacks (such as process_response).
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(
            lambda f: logger.info('Error while handling downloader output',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(
            lambda f: logger.info('Error while removing request from slot',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(
            lambda f: logger.info('Error while scheduling new request',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request, spider):
        # the download result must be a Request, a Response or a Failure
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):  # a Request goes back through crawl() and the scheduler's enqueue logic
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        # enqueue_scrape is where the engine talks to the spiders and the item pipelines
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(
            lambda f: logger.error('Error while enqueuing downloader output',
                                   exc_info=failure_to_exc_info(f),
                                   extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        # Put the request into the scheduler's queue (only if its fingerprint is
        # unique; duplicates are dropped).
        self.schedule(request, spider)
        # Schedule _next_request for the next reactor loop, so the engine keeps
        # pulling requests from the spider and feeding them into the queue.
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request,
                                    spider=spider)
        if not self.slot.scheduler.enqueue_request(request):  # duplicate fingerprints are dropped; unique requests are enqueued
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request,
                                        spider=spider)

    def download(self, request, spider):  # not called from within the engine itself
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):  # only used as a callback by download() above
        slot.remove_request(request)
        # If the result is a Request (e.g. returned by a middleware), download it again;
        # if it is a Response, return it as-is.
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)  # add to the set of requests currently being processed

        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):  # if the download produced a Response, return that Response
                response.request = request  # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                logger.log(*logformatter_adapter(logkws),
                           extra={'spider': spider})
                self.signals.send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            slot.nextcall.schedule()  # key point: once the download finishes, schedule again so the next request is pulled and sent
            return _

        # Runs the downloader middlewares' process_request methods, then enqueues the
        # request for download and attaches the callbacks (e.g. process_response).
        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)  # the inner function closes over the local variables, so request and slot stay available
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request, spider)  # register _next_request as the method to schedule repeatedly
        scheduler = self.scheduler_cls.from_crawler(self.crawler)  # initialize the scheduler
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)  # run the spider middlewares over the seed requests
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)  # wrap everything in a Slot object
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)  # scheduler.open sets up its priority queue (and little else); returns None
        yield self.scraper.open_spider(spider)  # mainly calls open_spider on every item pipeline
        self.crawler.stats.open_spider(spider)  # notify the stats collector that the spider has opened
        yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                   spider=spider)
        slot.nextcall.schedule()  # kick off the first scheduling pass
        slot.heartbeat.start(5)  # call CallLaterOnce.schedule every 5 seconds

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)", {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(msg,
                             exc_info=failure_to_exc_info(failure),
                             extra={'spider': spider})

            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(
            lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [
            self.close_spider(s, reason='shutdown') for s in self.open_spiders
        ]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_stopped)
        self._closewait.callback(None)
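The 5-second heartbeat and the nextcall object used throughout this example can be made concrete with a small, self-contained sketch. This is a simplified stand-in for scrapy.utils.reactor.CallLaterOnce and the Slot heartbeat, not Scrapy's exact code; the tick function and the demo timings are invented for illustration.

from twisted.internet import reactor, task

class CallLaterOnceSketch:
    """Collapse many schedule() calls into a single pending reactor.callLater."""

    def __init__(self, func, *a, **kw):
        self._func, self._a, self._kw = func, a, kw
        self._call = None

    def schedule(self, delay=0):
        if self._call is None:          # ignore schedule() while a call is already pending
            self._call = reactor.callLater(delay, self)

    def __call__(self):
        self._call = None
        return self._func(*self._a, **self._kw)

def tick():
    print("next request pass")          # stands in for engine._next_request(spider)

nextcall = CallLaterOnceSketch(tick)
heartbeat = task.LoopingCall(nextcall.schedule)
heartbeat.start(5)                      # the heartbeat: schedule a pass every 5 seconds
reactor.callLater(12, reactor.stop)     # stop the demo after a couple of beats
reactor.run()

Because schedule() is a no-op while a call is already pending, the many nextcall.schedule() calls sprinkled through the engine cost at most one _next_request pass per reactor turn.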
Exemplo n.º 8
0
class ExecutionEngine:

    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals  # the crawler created a SignalManager at init time; its sender is the crawler itself
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback
    # start() only sets the running flag and parks on a Deferred; the real setup happens in open_spider
    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        if self.running:
            raise RuntimeError("Engine already running")
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(signal=signals.engine_started)  # attach errbacks and send the engine_started signal once
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        if not self.running:
            raise RuntimeError("Engine not running")
        self.running = False  # mark the engine as stopped
        dfd = self._close_all_spiders()  # get the Deferred that closes all spiders
        return dfd.addBoth(lambda _: self._finish_stopping_engine())  # chain the final shutdown step onto that Deferred

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())  # equivalent to creating a Deferred and firing d.callback(result) right away

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False
    # Check the pause flag, pull requests and crawl them, then close the spider if it has gone idle.
    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):  # pull a request from the scheduler and attach its callbacks; break when nothing is left
                break

        if slot.start_requests and not self._needs_backout(spider):  # start_requests still has entries and no backout is needed
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True, extra={'spider': spider})
            else:
                self.crawl(request, spider)  # hand the request to the engine's own crawl method

        if self.spider_is_idle(spider) and slot.close_if_idle:  # close the spider once it is idle, if configured to do so
            self._spider_idle(spider)
    # Check the various flags to decide whether the engine needs to back out.
    def _needs_backout(self, spider):
        slot = self.slot
        return (
            not self.running
            or slot.closing
            or self.downloader.needs_backout()
            or self.scraper.slot.needs_backout()
        )
    # Pop a request from the scheduler, hand it to _download, then attach callbacks that handle the result and trigger the next loop.
    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()  # pop the next request from the scheduler
        if not request:
            return
        d = self._download(request, spider)  # feed the request to _download and get a Deferred back
        d.addBoth(self._handle_downloader_output, request, spider)  # callback that handles the download result; its first argument is whatever the Deferred fired with
        d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))  # remove the corresponding request from the slot
        d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())  # schedule the next pass so another request can run
        d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        return d
    # Check the concrete type of the result; in the normal case push it into scraper.enqueue_scrape, which returns a Deferred with a logging errback attached.
    def _handle_downloader_output(self, response, request, spider):
        if not isinstance(response, (Request, Response, Failure)):
            raise TypeError(
                "Incorrect type: expected Request, Response or Failure, got "
                f"{type(response)}: {response!r}"
            )
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)  # push the request back into the scheduler
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)  # hand the response, the request and the spider over to the scraper
        d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                            exc_info=failure_to_exc_info(f),
                                            extra={'spider': spider}))
        return d  # return the Deferred
    # Is the spider (and the crawl as a whole) idle?
    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        return True

    @property
    def open_spiders(self):  # an engine currently drives only one spider
        return [self.spider] if self.spider else []

    def has_capacity(self):  # one engine owns exactly one slot
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)
    # Push a single request into the scheduler and trigger the next scheduling step.
    def crawl(self, request, spider):
        if spider not in self.open_spiders:
            raise RuntimeError(f"Spider {spider.name!r} not opened when crawling: {request}")
        self.schedule(request, spider)  # push the request into the scheduler (data in)
        self.slot.nextcall.schedule()  # trigger the next step; nextcall invokes _next_request() (data out)
    # Push the request into the scheduler's queue.
    def schedule(self, request, spider):
        self.signals.send_catch_log(signals.request_scheduled, request=request, spider=spider)
        if not self.slot.scheduler.enqueue_request(request):  # enqueue the request and check whether the dupefilter dropped it
            self.signals.send_catch_log(signals.request_dropped, request=request, spider=spider)
    # Call _download to perform the download, then append _downloaded to the callback chain.
    def download(self, request, spider):
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)  # addBoth(func, *extra_args) passes the extra arguments on to func
        return d
    # If the result is a Request, call download again; otherwise return the Response.
    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)  # remove this request from the slot
        return self.download(response, spider) if isinstance(response, Request) else response
    # Download entry point: register the request on the slot, let the downloader fetch it,
    # and attach the matching callbacks.
    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)  # add this request to the slot's set of in-flight requests

        def _on_success(response):
            if not isinstance(response, (Response, Request)):
                raise TypeError(
                    "Incorrect type: expected Response or Request, got "
                    f"{type(response)}: {response!r}"
                )
            if isinstance(response, Response):
                if response.request is None:
                    response.request = request
                logkws = self.logformatter.crawled(response.request, response, spider)
                if logkws is not None:
                    logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                self.signals.send_catch_log(
                    signal=signals.response_received,
                    response=response,
                    request=response.request,
                    spider=spider,
                )
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)  # this is where the actual download happens
        dwld.addCallbacks(_on_success)  # add the success callback
        dwld.addBoth(_on_complete)  # runs on both success and failure
        return dwld
    # The real starting point of the crawl. In practice one engine drives a single spider,
    # so the spider arguments throughout this method are single instances, not lists.
    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        if not self.has_capacity():
            raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request, spider)  # wrap _next_request in a CallLaterOnce; nothing runs until schedule() is called
        scheduler = self.scheduler_cls.from_crawler(self.crawler)  # instantiate the scheduler
        start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)  # run the spider middlewares' process_start_requests over the start requests
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)  # create the slot for this spider
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)  # initialize the scheduler and build its queues
        yield self.scraper.open_spider(spider)  # open the scraper (spider middlewares and item pipelines)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)  # send the spider_opened signal
        slot.nextcall.schedule()  # add the task to the reactor; this actually kicks off _next_request
        slot.heartbeat.start(5)  # heartbeat: re-schedule _next_request every 5 seconds

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signals.spider_idle, spider=spider, dont_log=DontCloseSpider)  # returns a list of (receiver, result) pairs
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) for _, x in res):
            return  # some handler raised DontCloseSpider, so do not close the spider yet

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)",
                    {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(
                    msg,
                    exc_info=failure_to_exc_info(failure),
                    extra={'spider': spider}
                )
            return errback

        dfd.addBoth(lambda _: self.downloader.close())  # tell the downloader to close
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))  # tell the scraper to close
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))  # tell the scheduler to close
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(  # send the spider_closed signal
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))  # close the stats collection for this spider
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))  # release the slot
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))  # invoke the spider-closed callback

        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
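As the comments on start() and stop() in this example note, start() only raises the running flag and then waits on _closewait, which _finish_stopping_engine fires much later. A minimal, hedged sketch of that handshake (MiniEngine and the timings below are invented for illustration; this is not Scrapy code):

from twisted.internet import defer, reactor

class MiniEngine:
    """Toy engine showing how start() stays suspended until stop() fires _closewait."""

    def __init__(self):
        self.running = False

    @defer.inlineCallbacks
    def start(self):
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait            # suspend here until stop() fires the Deferred

    def stop(self):
        self.running = False
        self._closewait.callback(None)   # releases the start() coroutine

engine = MiniEngine()
engine.start().addCallback(lambda _: print("engine finished"))
reactor.callLater(1, engine.stop)        # pretend the crawl took one second
reactor.callLater(2, reactor.stop)
reactor.run()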
Exemplo n.º 9
0
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        ## store the crawler instance on the engine
        self.crawler = crawler
        ## keep the crawler's settings around as well
        self.settings = crawler.settings
        ## signal manager
        self.signals = crawler.signals
        ## log formatter
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        ## whether the engine is running
        self.running = False
        ## whether execution has been paused
        self.paused = False
        ## load the scheduler class from the settings
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        ## load the downloader class from the settings
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        ## instantiate the downloader
        self.downloader = downloader_cls(crawler)
        ## instantiate the scraper, the bridge between the engine, the Spider and the item Pipelines
        self.scraper = Scraper(crawler)
        ## callback to invoke when the spider is closed
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        ## this method is scheduled over and over again

        slot = self.slot
        if not slot:
            return

        if self.paused:
            return

        ## back out if a limit has been reached
        while not self._needs_backout(spider):
            ## get the next request from the scheduler
            ## note: on the very first pass the scheduler is empty, so this breaks out
            ## and the start_requests logic below runs instead
            if not self._next_request_from_scheduler(spider):
                break

        ## if start_requests still has data and no backout is needed
        if slot.start_requests and not self._needs_backout(spider):
            try:
                ## get the next seed request
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True, extra={'spider': spider})
            else:
                ## crawl() actually puts the request into the scheduler's queue
                self.crawl(request, spider)

        ## close the spider if it has gone idle
        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        ## whether to back out depends on four conditions:
        ## 1. is the engine still running
        ## 2. is the slot closing
        ## 3. has the downloader exceeded its concurrency limit
        ## 4. has the scraper's output queue exceeded its size limit

        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        ## get the next request from the scheduler
        request = slot.scheduler.next_request()
        if not request:
            return
        ## download it
        d = self._download(request, spider)

        ## attach callbacks to the download result, which may be a Request, a Response or a Failure

        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request, spider):
        ## the download result must be a Request, a Response or a Failure
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        ## if the result is a Request, call crawl again so the Scheduler's enqueue logic runs
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        ## if the result is a Response or a Failure, hand it to the scraper's enqueue_scrape
        ## method, which mainly interacts with the spiders and the pipelines
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                            exc_info=failure_to_exc_info(f),
                                            extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        ## put the request into the scheduler's queue
        self.schedule(request, spider)
        ## call nextcall.schedule() to trigger the next scheduling pass
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                request=request, spider=spider)
        ## call the scheduler's enqueue_request to put the request into its queue
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request, spider=spider)

    def download(self, request, spider):
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        ## success callback: returns the processed response
        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                ## store the request on the response's request attribute
                response.request = request # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                self.signals.send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        ## completion callback: keep the scheduling loop going
        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        ## ask the downloader to fetch the request
        dwld = self.downloader.fetch(request, spider)
        ## register the success callback
        dwld.addCallbacks(_on_success)
        ## register the completion callback
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})
        ## register _next_request as the method that will be scheduled repeatedly
        nextcall = CallLaterOnce(self._next_request, spider)
        ## instantiate the scheduler class
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        ## run each spider middleware's process_start_requests method over the seed requests;
        ## several spider middlewares can override this hook, and they are all applied before
        ## scheduling, which keeps start-request handling modular and easy to maintain
        start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
        ## wrap everything in a Slot object
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        ## call the scheduler's open method
        yield scheduler.open(spider)
        ## call the scraper's open_spider method
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
        ## kick off scheduling
        slot.nextcall.schedule()
        slot.heartbeat.start(5)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)",
                    {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(
                    msg,
                    exc_info=failure_to_exc_info(failure),
                    extra={'spider': spider}
                )
            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
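The schedule()/enqueue_request comments above hinge on the scheduler's dupefilter: enqueue_request returns False for a request whose fingerprint has already been seen, and only then does the engine emit request_dropped. A toy sketch of that idea (heavily simplified and hypothetical; Scrapy's real dupefilter lives in scrapy.dupefilters and fingerprints canonicalized requests):

import hashlib
from collections import deque

class TinyScheduler:
    """Toy scheduler: fingerprint-based dedup plus a plain FIFO queue."""

    def __init__(self):
        self.seen = set()
        self.queue = deque()

    @staticmethod
    def fingerprint(method, url, body=b""):
        h = hashlib.sha1()
        h.update(method.encode())
        h.update(url.encode())
        h.update(body)
        return h.hexdigest()

    def enqueue_request(self, method, url, body=b""):
        fp = self.fingerprint(method, url, body)
        if fp in self.seen:
            return False                 # duplicate: the caller sends request_dropped
        self.seen.add(fp)
        self.queue.append((method, url))
        return True

    def next_request(self):
        return self.queue.popleft() if self.queue else None

s = TinyScheduler()
print(s.enqueue_request("GET", "https://example.com"))   # True, enqueued
print(s.enqueue_request("GET", "https://example.com"))   # False, filtered out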
Exemplo n.º 10
0
class ExecutionEngine:
    def __init__(self, crawler, spider_closed_callback: Callable) -> None:
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot: Optional[Slot] = None
        self.spider: Optional[Spider] = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(crawler.settings["SCHEDULER"])
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback

    @inlineCallbacks
    def start(self) -> Deferred:
        if self.running:
            raise RuntimeError("Engine already running")
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_started)
        self.running = True
        self._closewait = Deferred()
        yield self._closewait

    def stop(self) -> Deferred:
        """Gracefully stop the execution engine"""
        @inlineCallbacks
        def _finish_stopping_engine(_) -> Deferred:
            yield self.signals.send_catch_log_deferred(
                signal=signals.engine_stopped)
            self._closewait.callback(None)

        if not self.running:
            raise RuntimeError("Engine not running")

        self.running = False
        dfd = self.close_spider(
            self.spider,
            reason="shutdown") if self.spider is not None else succeed(None)
        return dfd.addBoth(_finish_stopping_engine)

    def close(self) -> Deferred:
        """
        Gracefully close the execution engine.
        If it has already been started, stop it. In all cases, close the spider and the downloader.
        """
        if self.running:
            return self.stop()  # will also close spider and downloader
        if self.spider is not None:
            return self.close_spider(
                self.spider, reason="shutdown")  # will also close downloader
        return succeed(self.downloader.close())

    def pause(self) -> None:
        self.paused = True

    def unpause(self) -> None:
        self.paused = False

    def _next_request(self) -> None:
        assert self.slot is not None  # typing
        assert self.spider is not None  # typing

        if self.paused:
            return None

        while not self._needs_backout() and self._next_request_from_scheduler(
        ) is not None:
            pass

        if self.slot.start_requests is not None and not self._needs_backout():
            try:
                request = next(self.slot.start_requests)
            except StopIteration:
                self.slot.start_requests = None
            except Exception:
                self.slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True,
                             extra={'spider': self.spider})
            else:
                self.crawl(request)

        if self.spider_is_idle() and self.slot.close_if_idle:
            self._spider_idle()

    def _needs_backout(self) -> bool:
        return (not self.running
                or self.slot.closing  # type: ignore[union-attr]
                or self.downloader.needs_backout() or
                self.scraper.slot.needs_backout()  # type: ignore[union-attr]
                )

    def _next_request_from_scheduler(self) -> Optional[Deferred]:
        assert self.slot is not None  # typing
        assert self.spider is not None  # typing

        request = self.slot.scheduler.next_request()
        if request is None:
            return None

        d = self._download(request, self.spider)
        d.addBoth(self._handle_downloader_output, request)
        d.addErrback(
            lambda f: logger.info('Error while handling downloader output',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': self.spider}))
        d.addBoth(lambda _: self.slot.remove_request(request))
        d.addErrback(
            lambda f: logger.info('Error while removing request from slot',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': self.spider}))
        d.addBoth(lambda _: self.slot.nextcall.schedule())
        d.addErrback(
            lambda f: logger.info('Error while scheduling new request',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': self.spider}))
        return d

    def _handle_downloader_output(self, result: Union[Request, Response,
                                                      Failure],
                                  request: Request) -> Optional[Deferred]:
        assert self.spider is not None  # typing

        if not isinstance(result, (Request, Response, Failure)):
            raise TypeError(
                f"Incorrect type: expected Request, Response or Failure, got {type(result)}: {result!r}"
            )

        # downloader middleware can return requests (for example, redirects)
        if isinstance(result, Request):
            self.crawl(result)
            return None

        d = self.scraper.enqueue_scrape(result, request, self.spider)
        d.addErrback(lambda f: logger.error(
            "Error while enqueuing downloader output",
            exc_info=failure_to_exc_info(f),
            extra={'spider': self.spider},
        ))
        return d

    def spider_is_idle(self, spider: Optional[Spider] = None) -> bool:
        if spider is not None:
            warnings.warn(
                "Passing a 'spider' argument to ExecutionEngine.spider_is_idle is deprecated",
                category=ScrapyDeprecationWarning,
                stacklevel=2,
            )
        if self.slot is None:
            raise RuntimeError("Engine slot not assigned")
        if not self.scraper.slot.is_idle():  # type: ignore[union-attr]
            return False
        if self.downloader.active:  # downloader has pending requests
            return False
        if self.slot.start_requests is not None:  # not all start requests are handled
            return False
        if self.slot.scheduler.has_pending_requests():
            return False
        return True

    def crawl(self, request: Request, spider: Optional[Spider] = None) -> None:
        """Inject the request into the spider <-> downloader pipeline"""
        if spider is not None:
            warnings.warn(
                "Passing a 'spider' argument to ExecutionEngine.crawl is deprecated",
                category=ScrapyDeprecationWarning,
                stacklevel=2,
            )
            if spider is not self.spider:
                raise RuntimeError(
                    f"The spider {spider.name!r} does not match the open spider"
                )
        if self.spider is None:
            raise RuntimeError(f"No open spider to crawl: {request}")
        self._schedule_request(request, self.spider)
        self.slot.nextcall.schedule()  # type: ignore[union-attr]

    def _schedule_request(self, request: Request, spider: Spider) -> None:
        self.signals.send_catch_log(signals.request_scheduled,
                                    request=request,
                                    spider=spider)
        if not self.slot.scheduler.enqueue_request(
                request):  # type: ignore[union-attr]
            self.signals.send_catch_log(signals.request_dropped,
                                        request=request,
                                        spider=spider)

    def download(self,
                 request: Request,
                 spider: Optional[Spider] = None) -> Deferred:
        """Return a Deferred which fires with a Response as result, only downloader middlewares are applied"""
        if spider is None:
            spider = self.spider
        else:
            warnings.warn(
                "Passing a 'spider' argument to ExecutionEngine.download is deprecated",
                category=ScrapyDeprecationWarning,
                stacklevel=2,
            )
            if spider is not self.spider:
                logger.warning(
                    "The spider '%s' does not match the open spider",
                    spider.name)
        if spider is None:
            raise RuntimeError(f"No open spider to crawl: {request}")
        return self._download(request, spider).addBoth(self._downloaded,
                                                       request, spider)

    def _downloaded(self, result: Union[Response, Request], request: Request,
                    spider: Spider) -> Union[Deferred, Response]:
        assert self.slot is not None  # typing
        self.slot.remove_request(request)
        return self.download(result, spider) if isinstance(result,
                                                           Request) else result

    def _download(self, request: Request, spider: Spider) -> Deferred:
        assert self.slot is not None  # typing

        self.slot.add_request(request)

        def _on_success(
                result: Union[Response, Request]) -> Union[Response, Request]:
            if not isinstance(result, (Response, Request)):
                raise TypeError(
                    f"Incorrect type: expected Response or Request, got {type(result)}: {result!r}"
                )
            if isinstance(result, Response):
                if result.request is None:
                    result.request = request
                logkws = self.logformatter.crawled(result.request, result,
                                                   spider)
                if logkws is not None:
                    logger.log(*logformatter_adapter(logkws),
                               extra={"spider": spider})
                self.signals.send_catch_log(
                    signal=signals.response_received,
                    response=result,
                    request=result.request,
                    spider=spider,
                )
            return result

        def _on_complete(_):
            self.slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @inlineCallbacks
    def open_spider(self,
                    spider: Spider,
                    start_requests: Iterable = (),
                    close_if_idle: bool = True):
        if self.slot is not None:
            raise RuntimeError(
                f"No free spider slot when opening {spider.name!r}")
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request)
        scheduler = create_instance(self.scheduler_cls,
                                    settings=None,
                                    crawler=self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)
        self.slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                   spider=spider)
        self.slot.nextcall.schedule()
        self.slot.heartbeat.start(5)

    def _spider_idle(self) -> None:
        """
        Called when a spider gets idle, i.e. when there are no remaining requests to download or schedule.
        It can be called multiple times. If a handler for the spider_idle signal raises a DontCloseSpider
        exception, the spider is not closed until the next loop and this function is guaranteed to be called
        (at least) once again.
        """
        assert self.spider is not None  # typing
        res = self.signals.send_catch_log(signals.spider_idle,
                                          spider=self.spider,
                                          dont_log=DontCloseSpider)
        if any(
                isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
                for _, x in res):
            return None
        if self.spider_is_idle():
            self.close_spider(self.spider, reason='finished')

    def close_spider(self,
                     spider: Spider,
                     reason: str = "cancelled") -> Deferred:
        """Close (cancel) spider and clear all its outstanding requests"""
        if self.slot is None:
            raise RuntimeError("Engine slot not assigned")

        if self.slot.closing is not None:
            return self.slot.closing

        logger.info("Closing spider (%(reason)s)", {'reason': reason},
                    extra={'spider': spider})

        dfd = self.slot.close()

        def log_failure(msg: str) -> Callable:
            def errback(failure: Failure) -> None:
                logger.error(msg,
                             exc_info=failure_to_exc_info(failure),
                             extra={'spider': spider})

            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: self.slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed,
            spider=spider,
            reason=reason,
        ))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(
            lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    @property
    def open_spiders(self) -> list:
        warnings.warn(
            "ExecutionEngine.open_spiders is deprecated, please use ExecutionEngine.spider instead",
            category=ScrapyDeprecationWarning,
            stacklevel=2,
        )
        return [self.spider] if self.spider is not None else []

    def has_capacity(self) -> bool:
        warnings.warn("ExecutionEngine.has_capacity is deprecated",
                      ScrapyDeprecationWarning,
                      stacklevel=2)
        return not bool(self.slot)

    def schedule(self, request: Request, spider: Spider) -> None:
        warnings.warn(
            "ExecutionEngine.schedule is deprecated, please use "
            "ExecutionEngine.crawl or ExecutionEngine.download instead",
            category=ScrapyDeprecationWarning,
            stacklevel=2,
        )
        if self.slot is None:
            raise RuntimeError("Engine slot not assigned")
        self._schedule_request(request, spider)
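_spider_idle in every variant above honours DontCloseSpider: any handler connected to the spider_idle signal can raise it to keep the spider open for another heartbeat. A short, hedged example of such an extension (the class, its backlog attribute and the URL are invented; it would be enabled through the EXTENSIONS setting, and it uses the single-argument engine.crawl signature of this last example, while older engines also expect the spider argument):

from scrapy import Request, signals
from scrapy.exceptions import DontCloseSpider

class KeepAliveExtension:
    """Illustrative extension: keeps the spider open while an assumed backlog has URLs."""

    def __init__(self, crawler):
        self.crawler = crawler
        self.backlog = ["https://example.com/more"]   # assumed pending work
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_idle(self, spider):
        if self.backlog:
            # feed more work back into the engine, then veto the close
            self.crawler.engine.crawl(Request(self.backlog.pop()))
            raise DontCloseSpider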
Exemplo n.º 11
0
class ExecutionEngine(object):
    """
    有三个重要的实例化
    一、schedule
        调度器,实例化了dupefilter过滤器,然后还初始化了三个队列。
        dupefilter:过滤器,通过存储 method + url + response.body 生成sha1指纹,来进行过滤
        pqclass:一个优先级队列queuelib.PriorityQueue
        dqclass:一个FIFO队列,先进先出规则,并且通过pickle序列化了
        mqclass:一个FIFO队列,先进先出规则,直接存储在内存中

    二、downloader
        实例化了Handler对象,还实例化了下载器中间件
        Handler:具体的下载逻辑
        DownloaderMiddlewareManager:收集所有的下载中间件,在收集其中的process_request、process_exception、process_response三种方法

    三、scraper
        实例化了爬虫中间件,还实例化了管道处理器
        SpiderMiddlewareManager:实例化后获取process_spider_input、process_spider_output、process_spider_exception、process_start_requests
        itemproc_cls:获取ItemPipelineManager,实例化其中的ITEM_PIPELINES,获取process_item

    """
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        # SCHEDULER = 'scrapy.core.scheduler.Scheduler' by default; load_object only
        # resolves the dotted path to a class, it does not instantiate anything.
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        # DOWNLOADER = 'scrapy.core.downloader.Downloader' by default
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        # The downloader instantiates the download handlers and the downloader-middleware
        # process_* chain, so both the raw download logic and the middleware hooks are ready.
        self.downloader = downloader_cls(crawler)
        # The scraper holds the spider middlewares and the item pipeline manager, i.e. the
        # parsing and storage side of the crawl.
        self.scraper = Scraper(crawler)
        # This callback determines whether the crawl can actually stop; it is the lambda
        # (lambda _: self.stop()) set up by the crawler, which ultimately calls engine.stop().
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            return

        while not self._needs_backout(spider):  # the loop exits once a backout condition becomes true
            # On the first pass the four checks read: running=True, slot.closing=False,
            # downloader backout=False, scraper backout=False. Backout happens when the
            # engine stops running, the slot is closing, the downloader has more than 16
            # requests in flight, or the scraper is holding more than 5,000,000 bytes of
            # active response data.
            if not self._next_request_from_scheduler(spider):
                # This call keeps draining the scheduler: pop a request, download it,
                # and if the result is another Request re-enqueue it and re-schedule
                # the heartbeat; otherwise pass the result on to the scraper.
                #
                # It runs before start_requests are consumed, so on the very first
                # pass the queue is empty and the loop breaks straight away; requests
                # only reach the queue one at a time, via next(slot.start_requests)
                # below.
                break

        if slot.start_requests and not self._needs_backout(
                spider):  # _needs_backout is also what bounds the concurrency here
            try:
                # Seed requests are pulled out of start_requests one at a time and
                # pushed into the scheduler, which keeps memory usage flat.
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True,
                             extra={'spider': spider})
            else:
                # The request pushed here may well be popped again almost immediately
                # by the scheduler loop above on the next pass.
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        # the downloader backs out when len(self.active) >= self.total_concurrency,
        # the scraper when self.active_size > self.max_active_size
        # (condensed versions of both checks follow this method)
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()
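
    # For reference, the two backout checks above boil down to the following
    # (condensed from scrapy/core/downloader/__init__.py and
    # scrapy/core/scraper.py; treat the exact attribute names as a sketch of
    # this Scrapy version, not a definitive listing):
    #
    #     # Downloader
    #     def needs_backout(self):
    #         return len(self.active) >= self.total_concurrency  # CONCURRENT_REQUESTS
    #
    #     # Scraper slot
    #     def needs_backout(self):
    #         return self.active_size > self.max_active_size  # 5,000,000 bytes by default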

    def _next_request_from_scheduler(self, spider):
        # this single method drives most of the crawl loop: pop a request,
        # download it, and chain the handling of the result
        slot = self.slot
        request = slot.scheduler.next_request()  # pop the next request off the scheduler
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(
            lambda f: logger.info('Error while handling downloader output',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(
            lambda f: logger.info('Error while removing request from slot',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(
            lambda f: logger.info('Error while scheduling new request',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request, spider):
        # called with whatever the download deferred produced
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            # a Request goes straight back through self.crawl: fingerprint
            # filtering, enqueueing, then another heartbeat tick
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        # a Response (or Failure) is queued on the scraper, which runs the
        # spider middlewares and the callback; Requests yielded there are
        # re-enqueued, dicts and Items go through the item pipelines
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(
            lambda f: logger.error('Error while enqueuing downloader output',
                                   exc_info=failure_to_exc_info(f),
                                   extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        # idle only when all four are quiet: start_requests, scheduler,
        # downloader and scraper
        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        # fingerprint-filter the request, enqueue it, then schedule another tick
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()  # trigger the next engine tick

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request,
                                    spider=spider)
        # enqueue_request applies the dupe filter; a surviving request is
        # pushed onto the queue (self._dqpush(request) inside the scheduler),
        # otherwise request_dropped is signalled (see the sketch below)
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request,
                                        spider=spider)
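
    # For reference, enqueue_request is roughly the following (condensed from
    # scrapy/core/scheduler.py; stats bookkeeping omitted): the dupe filter
    # runs first, then the request is pushed to the disk queue, falling back
    # to the memory queue.
    #
    #     def enqueue_request(self, request):
    #         if not request.dont_filter and self.df.request_seen(request):
    #             self.df.log(request, self.spider)
    #             return False
    #         if not self._dqpush(request):
    #             self._mqpush(request)
    #         return True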

    def download(self, request, spider):
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        # once the download finishes, drop the request from the slot;
        # a returned Request is downloaded again, anything else passes through
        slot.remove_request(request)
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)

        def _on_success(response):
            # a Response here has been downloaded, but the spider callback
            # has not run yet
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                logger.log(*logformatter_adapter(logkws),
                           extra={'spider': spider})
                self.signals.send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        # entry point into the downloader: fetch() runs the request through
        # the downloader middlewares, passing the real download function along
        # as the final step once every middleware has had its turn
        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request, spider)
        # instantiate the scheduler: it builds the dupefilter and its queues
        # (a priority queue backed by FIFO queues, one in memory and one on
        # disk via pickle)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        # the spider middlewares' process_start_requests runs first, wrapping
        # the seed requests
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)  # open the memory/priority queues and the dupefilter
        yield self.scraper.open_spider(spider)  # set up the scraper slot and open the item pipelines
        self.crawler.stats.open_spider(spider)  # a no-op for the default stats collector
        yield self.signals.send_catch_log_deferred(
            signals.spider_opened,
            spider=spider)  # many extensions and middlewares hook this signal
        # queue the first call to self._next_request; schedule() only does
        # reactor.callLater(delay, self), so nothing runs until the reactor
        # starts (see the CallLaterOnce sketch after this example)
        slot.nextcall.schedule()
        slot.heartbeat.start(5)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)", {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(msg,
                             exc_info=failure_to_exc_info(failure),
                             extra={'spider': spider})

            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(
            lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))  # clear the spider reference
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [
            self.close_spider(s, reason='shutdown') for s in self.open_spiders
        ]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_stopped)
        self._closewait.callback(None)
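The annotations above keep circling back to nextcall.schedule(). As a reference, here is a minimal sketch of the CallLaterOnce helper the engine relies on, closely following the utility in scrapy/utils/reactor.py: schedule() arms at most one pending reactor.callLater at a time, and the slot's heartbeat simply calls schedule() every few seconds.

from twisted.internet import reactor


class CallLaterOnce(object):
    """Schedule a function to run on the next reactor iteration, keeping at
    most one pending call no matter how often schedule() is invoked."""

    def __init__(self, func, *a, **kw):
        self._func = func
        self._a = a
        self._kw = kw
        self._call = None  # the pending IDelayedCall, if any

    def schedule(self, delay=0):
        # arm a new reactor call only if none is already pending
        if self._call is None:
            self._call = reactor.callLater(delay, self)

    def cancel(self):
        if self._call:
            self._call.cancel()

    def __call__(self):
        # clear the pending marker so the next schedule() arms a fresh call
        self._call = None
        return self._func(*self._a, **self._kw)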
Exemplo n.º 12
0
class ExecutionEngine(object):
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings  # settings
        self.signals = crawler.signals  # signal manager
        self.logformatter = crawler.logformatter  # log formatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        # load the scheduler class (not instantiated here; that happens in open_spider)
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        # load and instantiate the downloader, see scrapy/core/downloader/__init__.py
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        # instantiate the scraper: the bridge between engine and spiders, see scrapy/core/scraper.py
        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        """ 1 在CallLaterOnce被注册 2 通过nextcall.schedule开启调度工作 """
        slot = self.slot
        if not slot:
            return

        if self.paused:  # engine is paused
            return

        while not self._needs_backout(spider):  # back out when throttled
            # keep pulling requests from the scheduler and downloading them;
            # on the very first pass the scheduler is still empty
            if not self._next_request_from_scheduler(spider):
                break

        # slot is built in open_spider; slot.start_requests is the iterable of
        # seed Requests returned by the spider middlewares'
        # process_start_requests, consumed lazily here one request per tick
        if slot.start_requests and not self._needs_backout(spider):
            # there are still seed requests and no backout condition applies
            try:
                request = next(slot.start_requests)  # pull the next seed request
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True,
                             extra={'spider': spider})
            else:
                # push the request onto the scheduler queue
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)  # idle and close_if_idle is set, so try to close the spider

    def _needs_backout(self, spider):
        # back out (stop feeding new work) when any of these holds:
        # 1. the engine is no longer running
        # 2. the slot is closing
        # 3. the downloader has CONCURRENT_REQUESTS or more requests in flight
        # 4. the scraper is holding more response data than its limit allows
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        # pop the next request off the scheduler queue, see scrapy/core/scheduler.py
        request = slot.scheduler.next_request()
        if not request:
            return
        # download the request via the Downloader; a chain of callbacks is
        # attached to the returned deferred below
        d = self._download(request, spider)
        # handle the download result (the actual download lives in
        # scrapy/core/downloader/__init__.py)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(
            lambda f: logger.info('Error while handling downloader output',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(
            lambda f: logger.info('Error while removing request from slot',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(
            lambda f: logger.info('Error while scheduling new request',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request, spider):
        # the download result must be a Request, a Response or a Failure
        assert isinstance(response, (Request, Response, Failure)), response

        # downloader middleware can return requests (for example, redirects)
        # case 1: a Request triggers another round of scheduling and download
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        # case 2: hand it to the scraper, which talks to the spider and the
        # item pipelines, see scrapy/core/scraper.py
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(
            lambda f: logger.error('Error while enqueuing downloader output',
                                   exc_info=failure_to_exc_info(f),
                                   extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        # push the request onto the scheduler queue for a later tick
        self.schedule(request, spider)
        # trigger the next engine tick
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request,
                                    spider=spider)
        # enqueue; the scheduler's dupe filter may reject the request
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request,
                                        spider=spider)

    def download(self, request, spider):
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
            if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)

        def _on_success(response):
            # success callback; the result must be a Request or a Response
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                if logkws is not None:
                    logger.log(*logformatter_adapter(logkws),
                               extra={'spider': spider})
                self.signals.send_catch_log(signal=signals.response_received,
                                            response=response,
                                            request=request,
                                            spider=spider)
            return response

        def _on_complete(_):
            # runs whether the download succeeded or failed; schedule the next tick
            slot.nextcall.schedule()
            return _

        # hand the request to the Downloader (the download itself is
        # asynchronous), see scrapy/core/downloader/__init__.py
        dwld = self.downloader.fetch(request, spider)
        # register the callbacks
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})
        # wrap _next_request so it can be scheduled repeatedly on the Twisted reactor
        nextcall = CallLaterOnce(self._next_request, spider)
        # instantiate the scheduler, see scrapy/core/scheduler.py
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        # let the spider middlewares process the seed Requests, see
        # scrapy/core/spidermw.py; start_requests is an iterable of Requests
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)
        # wrap everything in a Slot and keep the returned start_requests
        # iterable on it (a sketch of Slot follows this example)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        # scheduler.open binds the spider, initializes the task queues and
        # opens the fingerprint-based dupe filter, see scrapy/core/scheduler.py
        yield scheduler.open(spider)
        # scraper.open_spider binds the spider to the item pipeline manager
        # and calls every pipeline's open_spider, see scrapy/core/scraper.py
        # and scrapy/pipelines/__init__.py
        yield self.scraper.open_spider(spider)
        # scrapy/statscollectors/__init__.py
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                   spider=spider)
        # kick off scheduling: queue the first call to the _next_request registered above
        slot.nextcall.schedule()
        slot.heartbeat.start(5)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle,
                                          spider=spider,
                                          dont_log=DontCloseSpider)
        if any(
                isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
                for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)", {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(msg,
                             exc_info=failure_to_exc_info(failure),
                             extra={'spider': spider})

            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(
            lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [
            self.close_spider(s, reason='shutdown') for s in self.open_spiders
        ]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_stopped)
        self._closewait.callback(None)
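All three examples lean on a Slot object defined alongside ExecutionEngine in scrapy/core/engine.py. Below is a condensed sketch of what the engine assumes about it (attribute names follow that module; minor details vary between Scrapy versions): it tracks in-flight requests, owns the nextcall/heartbeat pair, and only fires its closing deferred once every in-flight request is gone.

from twisted.internet import defer, task


class Slot(object):
    """Per-spider state kept by the engine while a spider is open."""

    def __init__(self, start_requests, close_if_idle, nextcall, scheduler):
        self.closing = False          # becomes a Deferred once close() is called
        self.inprogress = set()       # requests currently in flight
        self.start_requests = iter(start_requests)
        self.close_if_idle = close_if_idle
        self.nextcall = nextcall
        self.scheduler = scheduler
        self.heartbeat = task.LoopingCall(nextcall.schedule)

    def add_request(self, request):
        self.inprogress.add(request)

    def remove_request(self, request):
        self.inprogress.remove(request)
        self._maybe_fire_closing()

    def close(self):
        self.closing = defer.Deferred()
        self._maybe_fire_closing()
        return self.closing

    def _maybe_fire_closing(self):
        # the slot is really closed only when nothing is in flight any more
        if self.closing and not self.inprogress:
            if self.nextcall:
                self.nextcall.cancel()
                if self.heartbeat.running:
                    self.heartbeat.stop()
            self.closing.callback(None)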
Exemplo n.º 13
0
class ExecutionEngine(object):
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals  # use the crawler's signal manager
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        # load the scheduler class named in the SCHEDULER setting (instantiated later)
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        # load the downloader class named in the DOWNLOADER setting
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)  # create the scraper
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    """
    被CallLaterOnce包装后被slot设置,
    主要在reactor中的heartbeat中被定时调用(在slot中设置),不过也可以被代码主动调用
    """

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            return
        # keep pulling pending requests from the scheduler asynchronously
        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True,
                             extra={'spider': spider})
            else:
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    """
    判断当前引擎的状态是不是异常,需不需要回退(backout)
    """

    def _needs_backout(self, spider):
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    """
    从调度器中请求下一个request,如果有request待处理,
    那么就对这个request进行下载处理,并对下载的操作添加一下回调函数
    """

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return

        # hand the request to the downloader
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(
            lambda f: logger.info('Error while handling downloader output',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(
            lambda f: logger.info('Error while removing request from slot',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(
            lambda f: logger.info('Error while scheduling new request',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        return d

    """
    对下载器的结果输出进行的异步处理
    """

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(
            lambda f: logger.error('Error while enqueuing downloader output',
                                   exc_info=failure_to_exc_info(f),
                                   extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    """
    调用schedule请求slot处理request,并且显式通知slot进行处理
    """

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    """
    通过slot将request入队,等待被reactor处理,
    """

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request,
                                    spider=spider)
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request,
                                        spider=spider)

    """
    根据请求进行下载,其实是调用_download进行下载
    并且在下载完成之后,通过reactor异步调度_downloaded函数。
    """

    def download(self, request, spider):
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)
        return d

    """
    在下载完成之后,从slot中将要对应的request移除,然后在判断response的类型:
        如果是Request,则继续进行下载;若是Response,则直接返回
    """

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) if isinstance(
            response, Request) else response

    """
    将下载的任务由下载器downloader进行下载的操作
    并添加了两个回调函数:
        在下载完毕complete的时候
        在下载成功success的时候
    """

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)

        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                logger.log(*logformatter_adapter(logkws),
                           extra={'spider': spider})
                self.signals.send_catch_log(signal=signals.response_received,
                                            response=response,
                                            request=request,
                                            spider=spider)
            return response

        """
        在下载完成的时候显式调用slot进行调度处理
        """

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        # get the deferred that will fire with the download result
        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    """
    ### 被scrapy.crawler.crawl调用
    开启爬虫系统
    创建调度器并开启,
    """

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)  # let the spider middlewares process them first
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)  # open the scheduler
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                   spider=spider)
        slot.nextcall.schedule()
        slot.heartbeat.start(5)

    """ 
    当调度器空闲的时候调用(在_next_request中判断)。
    可以被多次调用。
    如果某些extension引起了DontCloseSpider异常(在spider_idle 信号的处理器中),spider就不会关闭,直到下一个循环。
    并且这个方法会保证至少被执行一次
    """

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle,
                                          spider=spider,
                                          dont_log=DontCloseSpider)
        if any(
                isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
                for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    """
    关闭爬虫(引擎)
    发送信息:关闭下载器、使用scrapyer关闭爬虫spider、关闭调度器
        发送关闭日志、关闭scawler关闭爬虫的信息、打印日志、
        重设当前的slot为空、重设当前的spider为空等
    """

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)", {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(msg,
                             exc_info=failure_to_exc_info(failure),
                             extra={'spider': spider})

            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(
            lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [
            self.close_spider(s, reason='shutdown') for s in self.open_spiders
        ]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_stopped)
        self._closewait.callback(None)
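For context on who calls these engines, here is a condensed sketch of scrapy.crawler.Crawler (error handling and most attributes omitted; _create_spider and stop() belong to the real class). It shows where spider_closed_callback comes from and why engine.start() only returns after stop() fires _closewait.

from twisted.internet import defer


class Crawler(object):  # condensed sketch, not the full class

    def _create_engine(self):
        # this lambda is the spider_closed_callback the engine receives;
        # Crawler.stop() in turn calls engine.stop()
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        self.crawling = True
        self.spider = self._create_spider(*args, **kwargs)
        self.engine = self._create_engine()
        start_requests = iter(self.spider.start_requests())
        # open_spider builds the slot/scheduler and queues the first
        # _next_request; start() then waits on _closewait until stop() fires it
        yield self.engine.open_spider(self.spider, start_requests)
        yield defer.maybeDeferred(self.engine.start)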