def __init__(self, engine, settings):
    self.sites = {}
    self.spidermw = SpiderMiddlewareManager.from_settings(settings)
    itemproc_cls = load_object(settings['ITEM_PROCESSOR'])
    self.itemproc = itemproc_cls.from_settings(settings)
    self.concurrent_items = settings.getint('CONCURRENT_ITEMS')
    self.engine = engine

def __init__(self, crawler):
    self.slots = {}
    self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
    itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
    self.itemproc = itemproc_cls.from_crawler(crawler)
    self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
    self.crawler = crawler

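The first two variants show the constructor moving from a bare settings object to the crawler-centric API. A minimal sketch of the convention both rely on; the class name and attribute here are illustrative, not part of Scrapy:

class MyComponent(object):

    def __init__(self, concurrent_items):
        self.concurrent_items = concurrent_items

    @classmethod
    def from_settings(cls, settings):
        # older style: the component only sees the settings object
        return cls(settings.getint('CONCURRENT_ITEMS'))

    @classmethod
    def from_crawler(cls, crawler):
        # newer style: the crawler exposes settings plus runtime services
        # (signals, stats, engine), which is why later variants keep a
        # reference to it
        return cls(crawler.settings.getint('CONCURRENT_ITEMS'))
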
def __init__(self, crawler):
    super(Scraper, self).__init__(crawler.metas)
    self.slots = {}
    self.spidermw = SpiderMiddlewareManager(crawler.metas)
    itemproc_cls = load_object(self.item_processor.to_value())
    self.itemproc = itemproc_cls(self.metas)
    self.concurrent_items = self.concurrent_items.to_value()
    self.crawler = crawler

def __init__(self, crawler): self.slot = None self.spidermw = SpiderMiddlewareManager.from_crawler(crawler) itemproc_cls = load_object(crawler.settings["ITEM_PROCESSOR"]) self.itemproc = itemproc_cls.from_crawler(crawler) self.concurrent_items = crawler.settings.getint("CONCURRENT_ITEMS") self.crawler = crawler self.signals = crawler.signals self.logformatter = crawler.logformatter
def __init__(self, crawler):
    self.slot = None
    self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
    # ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
    itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
    self.itemproc = itemproc_cls.from_crawler(crawler)
    # CONCURRENT_ITEMS = 100
    self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
    self.crawler = crawler
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter

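Both settings read here fall back to the defaults called out in the comments. Assuming a standard project settings module, they could be overridden like this; the lowered value is illustrative:

# settings.py (illustrative overrides of the defaults noted above)
ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'  # the default named in the comment
CONCURRENT_ITEMS = 50  # lower the default of 100 to bound in-flight items per response
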
def setUp(self):
    class TestSpider(Spider):
        name = 'test'

    self.spider = TestSpider
    scrapy_default_middlewares = {
        'scrapy.spidermiddlewares.referer.RefererMiddleware': 700
    }
    # monkey patch SPIDER_MIDDLEWARES_BASE to include only the referer middleware
    sys.modules['scrapy.settings.default_settings'].SPIDER_MIDDLEWARES_BASE = \
        scrapy_default_middlewares
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000,
        },
    }
    crawler = get_crawler(self.spider, custom_settings)
    self.add_frontera_scheduler(crawler)
    self.smw = SpiderMiddlewareManager.from_crawler(crawler)

def __init__(self, crawler):
    self.slot = None
    self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
    # ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager'
    # ItemPipelineManager is a subclass of MiddlewareManager. It adds one
    # feature: each pipeline's process_item method is appended to a callback
    # chain (see _add_middleware), and the chain is then fired per item
    # (see ItemPipelineManager.process_item).
    itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
    self.itemproc = itemproc_cls.from_crawler(crawler)  # instantiate the pipeline manager
    self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')  # defaults to 100
    self.crawler = crawler
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter

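A minimal sketch of the callback-chain behaviour those comments describe; this is an illustration of the idea, not ItemPipelineManager's actual code:

from twisted.internet import defer

class TinyPipelineManager(object):
    """Illustrative: chains each pipeline's process_item on one Deferred,
    so the item returned by one pipeline is fed to the next."""

    def __init__(self, *pipelines):
        self.pipelines = pipelines

    def process_item(self, item, spider):
        dfd = defer.succeed(item)
        for pipeline in self.pipelines:
            # each callback receives the previous pipeline's returned item
            dfd.addCallback(pipeline.process_item, spider)
        return dfd
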
class Scraper(object):

    def __init__(self, engine):
        self.sites = {}
        self.spidermw = SpiderMiddlewareManager()
        self.itemproc = load_object(settings['ITEM_PROCESSOR'])()
        self.concurrent_items = settings.getint('CONCURRENT_ITEMS')
        self.engine = engine

    def open_spider(self, spider):
        """Open the given spider for scraping and allocate resources for it"""
        assert spider not in self.sites, "Spider already opened: %s" % spider
        self.sites[spider] = SpiderInfo()
        self.itemproc.open_spider(spider)

    def close_spider(self, spider):
        """Close a spider being scraped and release its resources"""
        assert spider in self.sites, "Spider not opened: %s" % spider
        site = self.sites[spider]
        site.closing = defer.Deferred()
        self.itemproc.close_spider(spider)

    def is_idle(self):
        """Return True if there aren't any more spiders to process"""
        return not self.sites

    def enqueue_scrape(self, response, request, spider):
        site = self.sites[spider]
        dfd = site.add_response_request(response, request)
        # FIXME: this can't be called here because the stats spider may be
        # already closed
        #stats.max_value('scraper/max_active_size', site.active_size, \
        #    spider=spider)
        def finish_scraping(_):
            site.finish_response(response)
            if site.closing and site.is_idle():
                del self.sites[spider]
                site.closing.callback(None)
            self._scrape_next(spider, site)
            return _
        dfd.addBoth(finish_scraping)
        dfd.addErrback(log.err, 'Scraper bug processing %s' % request, \
            spider=spider)
        self._scrape_next(spider, site)
        return dfd

    def _scrape_next(self, spider, site):
        while site.queue:
            response, request, deferred = site.next_response_request_deferred()
            self._scrape(response, request, spider).chainDeferred(deferred)

    def _scrape(self, response, request, spider):
        """Handle the downloaded response or failure through the spider
        callback/errback"""
        assert isinstance(response, (Response, Failure))
        dfd = self._scrape2(response, request, spider)  # returns the spider's processed output
        dfd.addErrback(self.handle_spider_error, request, spider)
        dfd.addCallback(self.handle_spider_output, request, response, spider)
        return dfd

    def _scrape2(self, request_result, request, spider):
        """Handle the different cases of the request's result being a Response
        or a Failure"""
        if not isinstance(request_result, Failure):
            return self.spidermw.scrape_response(self.call_spider, \
                request_result, request, spider)
        else:
            # FIXME: don't ignore errors in spider middleware
            dfd = self.call_spider(request_result, request, spider)
            return dfd.addErrback(self._check_propagated_failure, \
                request_result, request, spider)

    def call_spider(self, result, request, spider):
        dfd = defer_result(result)
        dfd.addCallbacks(request.callback or spider.parse, request.errback)
        return dfd.addCallback(iterate_spider_output)

    def handle_spider_error(self, _failure, request, spider, propagated_failure=None):
        referer = request.headers.get('Referer', None)
        msg = "Spider exception caught while processing <%s> (referer: <%s>): %s" % \
            (request.url, referer, _failure)
        log.msg(msg, log.ERROR, spider=spider)
        stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__, \
            spider=spider)

    def handle_spider_output(self, result, request, response, spider):
        if not result:
            return defer_succeed(None)
        dfd = parallel(iter(result), self.concurrent_items,
            self._process_spidermw_output, request, response, spider)
        return dfd

    def _process_spidermw_output(self, output, request, response, spider):
        """Process each Request/Item (given in the output parameter) returned
        from the given spider
        """
        # TODO: keep closing state internally instead of checking engine
        if spider in self.engine.closing:
            return
        elif isinstance(output, Request):
            send_catch_log(signal=signals.request_received, request=output, \
                spider=spider)
            self.engine.crawl(request=output, spider=spider)
        elif isinstance(output, BaseItem):
            log.msg("Scraped %s in <%s>" % (output, request.url), level=log.DEBUG, \
                spider=spider)
            send_catch_log(signal=signals.item_scraped, sender=self.__class__, \
                item=output, spider=spider, response=response)
            self.sites[spider].itemproc_size += 1
            # FIXME: this can't be called here because the stats spider may be
            # already closed
            #stats.max_value('scraper/max_itemproc_size', \
            #    self.sites[spider].itemproc_size, spider=spider)
            dfd = self.itemproc.process_item(output, spider)
            dfd.addBoth(self._itemproc_finished, output, spider)
            return dfd
        elif output is None:
            pass
        else:
            log.msg("Spider must return Request, BaseItem or None, got %r in %s" % \
                (type(output).__name__, request), log.ERROR, spider=spider)

    def _check_propagated_failure(self, spider_failure, propagated_failure, request, spider):
        """Log and silence the bugs raised outside of spiders, but still allow
        spiders to be notified about general failures while downloading spider
        generated requests
        """
        # ignored requests are commonly propagated exceptions that are safe to silence
        if isinstance(spider_failure.value, IgnoreRequest):
            return
        elif spider_failure is propagated_failure:
            log.err(spider_failure, 'Unhandled error propagated to spider', \
                spider=spider)
            return  # stop propagating this error
        else:
            return spider_failure  # exceptions raised in the spider code

    def _itemproc_finished(self, output, item, spider):
        """ItemProcessor finished for the given ``item`` and returned ``output``
        """
        self.sites[spider].itemproc_size -= 1
        if isinstance(output, Failure):
            ex = output.value
            if isinstance(ex, DropItem):
                log.msg("Dropped %s - %s" % (item, str(ex)), level=log.WARNING,
                    spider=spider)
                send_catch_log(signal=signals.item_dropped, sender=self.__class__, \
                    item=item, spider=spider, exception=output.value)
            else:
                log.msg('Error processing %s - %s' % (item, output), \
                    log.ERROR, spider=spider)
        else:
            log.msg("Passed %s" % item, log.INFO, spider=spider)
            send_catch_log(signal=signals.item_passed, sender=self.__class__, \
                item=item, spider=spider, output=output)

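handle_spider_output caps item-pipeline concurrency with parallel(iter(result), self.concurrent_items, ...). A sketch of how such a helper can be built on Twisted's Cooperator, close in spirit to scrapy.utils.defer.parallel but simplified here:

from twisted.internet import defer, task

def parallel(iterable, count, callable, *args, **named):
    """Apply callable to every element of iterable, keeping at most `count`
    calls in flight at once; the returned Deferred fires once the whole
    iterable is exhausted."""
    coop = task.Cooperator()
    # all `count` cooperative tasks pull from the same generator, so each
    # element is processed exactly once
    work = (callable(elem, *args, **named) for elem in iterable)
    return defer.DeferredList([coop.coiterate(work) for _ in range(count)])
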
class Scraper(SettingObject):

    item_processor = StringField(default="scrapy.contrib.pipeline.ItemPipelineManager")
    concurrent_items = IntegerField(default=100)

    def __init__(self, crawler):
        super(Scraper, self).__init__(crawler.metas)
        self.slots = {}
        self.spidermw = SpiderMiddlewareManager(crawler.metas)
        itemproc_cls = load_object(self.item_processor.to_value())
        self.itemproc = itemproc_cls(self.metas)
        self.concurrent_items = self.concurrent_items.to_value()
        self.crawler = crawler

    @defer.inlineCallbacks
    def open_spider(self, spider):
        """Open the given spider for scraping and allocate resources for it"""
        assert spider not in self.slots, "Spider already opened: %s" % spider
        self.slots[spider] = Slot()
        yield self.itemproc.open_spider(spider)

    def close_spider(self, spider):
        """Close a spider being scraped and release its resources"""
        assert spider in self.slots, "Spider not opened: %s" % spider
        slot = self.slots[spider]
        slot.closing = defer.Deferred()
        slot.closing.addCallback(self.itemproc.close_spider)
        self._check_if_closing(spider, slot)
        return slot.closing

    def is_idle(self):
        """Return True if there aren't any more spiders to process"""
        return not self.slots

    def _check_if_closing(self, spider, slot):
        if slot.closing and slot.is_idle():
            del self.slots[spider]
            slot.closing.callback(spider)

    def enqueue_scrape(self, response, request, spider):
        slot = self.slots[spider]
        dfd = slot.add_response_request(response, request)
        def finish_scraping(_):
            slot.finish_response(response, request)
            self._check_if_closing(spider, slot)
            self._scrape_next(spider, slot)
            return _
        dfd.addBoth(finish_scraping)
        dfd.addErrback(log.err, "Scraper bug processing %s" % request,
            spider=spider)
        self._scrape_next(spider, slot)
        return dfd

    def _scrape_next(self, spider, slot):
        while slot.queue:
            response, request, deferred = slot.next_response_request_deferred()
            self._scrape(response, request, spider).chainDeferred(deferred)

    def _scrape(self, response, request, spider):
        """Handle the downloaded response or failure through the spider
        callback/errback"""
        assert isinstance(response, (Response, Failure))
        dfd = self._scrape2(response, request, spider)  # returns the spider's processed output
        dfd.addErrback(self.handle_spider_error, request, response, spider)
        dfd.addCallback(self.handle_spider_output, request, response, spider)
        return dfd

    def _scrape2(self, request_result, request, spider):
        """Handle the different cases of the request's result being a Response
        or a Failure"""
        if not isinstance(request_result, Failure):
            return self.spidermw.scrape_response(self.call_spider,
                request_result, request, spider)
        else:
            # FIXME: don't ignore errors in spider middleware
            dfd = self.call_spider(request_result, request, spider)
            return dfd.addErrback(self._log_download_errors,
                request_result, request, spider)

    def call_spider(self, result, request, spider):
        dfd = defer_result(result)
        dfd.addCallbacks(request.callback or spider.parse, request.errback)
        return dfd.addCallback(iterate_spider_output)

    def handle_spider_error(self, _failure, request, response, spider):
        exc = _failure.value
        if isinstance(exc, CloseSpider):
            self.crawler.engine.close_spider(spider, exc.reason or "cancelled")
            return
        log.err(_failure, "Spider error processing %s" % request, spider=spider)
        send_catch_log(signal=signals.spider_error, failure=_failure,
            response=response, spider=spider)
        stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__,
            spider=spider)

    def handle_spider_output(self, result, request, response, spider):
        if not result:
            return defer_succeed(None)
        it = iter_errback(result, self.handle_spider_error, request, response, spider)
        dfd = parallel(it, self.concurrent_items,
            self._process_spidermw_output, request, response, spider)
        return dfd

    def _process_spidermw_output(self, output, request, response, spider):
        """Process each Request/Item (given in the output parameter) returned
        from the given spider
        """
        if isinstance(output, Request):
            send_catch_log(signal=signals.request_received, request=output,
                spider=spider)
            self.crawler.engine.crawl(request=output, spider=spider)
        elif isinstance(output, BaseItem):
            self.slots[spider].itemproc_size += 1
            dfd = self.itemproc.process_item(output, spider)
            dfd.addBoth(self._itemproc_finished, output, response, spider)
            return dfd
        elif output is None:
            pass
        else:
            log.msg("Spider must return Request, BaseItem or None, got %r in %s" %
                (type(output).__name__, request), log.ERROR, spider=spider)

    def _log_download_errors(self, spider_failure, download_failure, request, spider):
        """Log and silence errors that come from the engine (typically download
        errors that got propagated through here)
        """
        if spider_failure is download_failure:
            log.msg("Error downloading %s: %s" %
                (request, spider_failure.getErrorMessage()), log.ERROR, spider=spider)
            return
        return spider_failure

    def _itemproc_finished(self, output, item, response, spider):
        """ItemProcessor finished for the given ``item`` and returned ``output``
        """
        self.slots[spider].itemproc_size -= 1
        if isinstance(output, Failure):
            ex = output.value
            if isinstance(ex, DropItem):
                log.msg(log.formatter.dropped(item, ex, response, spider),
                    level=log.WARNING, spider=spider)
                return send_catch_log_deferred(signal=signals.item_dropped,
                    item=item, spider=spider, exception=output.value)
            else:
                log.err(output, "Error processing %s" % item, spider=spider)
        else:
            log.msg(log.formatter.scraped(output, response, spider),
                log.DEBUG, spider=spider)
            return send_catch_log_deferred(signal=signals.item_scraped,
                item=output, response=response, spider=spider)

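In _itemproc_finished, the Failure branch is reached whenever a pipeline raises DropItem. A pipeline that triggers that path could look like this; the price check is an illustrative rule, not from the source:

from scrapy.exceptions import DropItem

class PriceValidationPipeline(object):
    """Illustrative pipeline: raising DropItem makes _itemproc_finished
    receive a Failure whose .value is the DropItem, which is logged via
    the formatter's dropped() and fires the item_dropped signal."""

    def process_item(self, item, spider):
        if not item.get('price'):  # hypothetical validation rule
            raise DropItem("missing price in %s" % item)
        return item
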
def setUp(self):
    self.request = Request('http://example.com/index.html')
    self.response = Response(self.request.url, request=self.request)
    self.crawler = get_crawler(Spider)
    self.spider = self.crawler._create_spider('foo')
    self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)

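A test built on this fixture usually drives scrape_response directly. A hedged sketch: _scrape_func is a hypothetical stand-in for Scraper.call_spider, and the assertion assumes the default spider middlewares pass items through unchanged:

def test_scrape_response(self):
    def _scrape_func(response, request, spider):
        # stand-in for Scraper.call_spider: returns the callback's output
        return [{'url': response.url}]

    dfd = self.mwman.scrape_response(_scrape_func, self.response,
                                     self.request, self.spider)
    dfd.addCallback(lambda result: self.assertEqual(
        list(result), [{'url': 'http://example.com/index.html'}]))
    return dfd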