def _logerror(self, failure, request, spider):
    if failure.type is not IgnoreRequest:
        logger.error("Error downloading %(request)s: %(f_exception)s",
                     {'request': request, 'f_exception': failure.value},
                     exc_info=failure_to_exc_info(failure),
                     extra={'spider': spider})
    return failure
def logerror(failure, recv):
    # `dont_log` and `spider` are free variables from the enclosing scope
    if dont_log is None or not isinstance(failure.value, dont_log):
        logger.error("Error caught on signal handler: %(receiver)s",
                     {'receiver': recv},
                     exc_info=failure_to_exc_info(failure),
                     extra={'spider': spider})
    return failure
def _process_request(self, request, info):
    fp = request_fingerprint(request)
    cb = request.callback or (lambda _: _)
    eb = request.errback
    request.callback = None
    request.errback = None

    # Return cached result if request was already seen
    if fp in info.downloaded:
        return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

    # Otherwise, wait for result
    wad = Deferred().addCallbacks(cb, eb)
    info.waiting[fp].append(wad)

    # Check if request is downloading right now to avoid doing it twice
    if fp in info.downloading:
        return wad

    # Download request checking media_to_download hook output first
    info.downloading.add(fp)
    dfd = mustbe_deferred(self.media_to_download, request, info)
    dfd.addCallback(self._check_media_to_download, request, info)
    dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
    dfd.addErrback(lambda f: logger.error(
        f.value, exc_info=failure_to_exc_info(f),
        extra={'spider': info.spider}))
    return dfd.addBoth(lambda _: wad)  # it must return wad at last
def test_failure(self):
    try:
        0 / 0
    except ZeroDivisionError:
        exc_info = sys.exc_info()
        failure = Failure()
    self.assertTupleEqual(exc_info, failure_to_exc_info(failure))
def err(_stuff=None, _why=None, **kw):
    warnings.warn('log.err has been deprecated, create a python logger and '
                  'use its error method instead',
                  ScrapyDeprecationWarning, stacklevel=2)
    level = kw.pop('level', logging.ERROR)
    failure = kw.pop('failure', _stuff) or Failure()
    message = kw.pop('why', _why) or failure.value
    logger.log(level, message, *[kw] if kw else [],
               exc_info=failure_to_exc_info(failure))
def _logerror(self, failure, request, spider):
    if failure.type is not IgnoreRequest:
        logger.error(
            "Error downloading %(request)s: %(f_exception)s",
            {"request": request, "f_exception": failure.value},
            exc_info=failure_to_exc_info(failure),
            extra={"spider": spider},
        )
    return failure
def _next_request_from_scheduler(self, spider):
    slot = self.slot
    request = slot.scheduler.next_request()
    if not request:
        return
    d = self._download(request, spider)
    d.addBoth(self._handle_downloader_output, request, spider)
    d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    d.addBoth(lambda _: slot.remove_request(request))
    d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    d.addBoth(lambda _: slot.nextcall.schedule())
    d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    return d
def _handle_downloader_output(self, response, request, spider):
    assert isinstance(response, (Request, Response, Failure)), response
    # downloader middleware can return requests (for example, redirects)
    if isinstance(response, Request):
        self.crawl(response, spider)
        return
    # response is a Response or Failure
    d = self.scraper.enqueue_scrape(response, request, spider)
    d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                        exc_info=failure_to_exc_info(f),
                                        extra={'spider': spider}))
    return d
def item_completed(self, results, item, info):
    if self.LOG_FAILED_RESULTS:
        for ok, value in results:
            if not ok:
                logger.error(
                    '%(class)s found errors processing %(item)s',
                    {'class': self.__class__.__name__, 'item': item},
                    exc_info=failure_to_exc_info(value),
                    extra={'spider': info.spider}
                )
    item["files"] = [{"file_id": x['file_id'], "url": x['url']}
                     for ok, x in results if ok]
    return item
def item_completed(self, results, item, info): """Called per item when all media requests has been processed""" if self.LOG_FAILED_RESULTS: for ok, value in results: if not ok: logger.error( '%(class)s found errors processing %(item)s', {'class': self.__class__.__name__, 'item': item}, exc_info=failure_to_exc_info(value), extra={'spider': info.spider} ) return item
def item_completed(self, results, item, info): """Called per item when all media requests has been processed""" if self.LOG_FAILED_RESULTS: for ok, value in results: if not ok: logger.error('%(class)s found errors processing %(item)s', { 'class': self.__class__.__name__, 'item': item }, exc_info=failure_to_exc_info(value), extra={'spider': info.spider}) return item
def _next_request_from_scheduler(self) -> Optional[Deferred]:
    assert self.slot is not None  # typing
    assert self.spider is not None  # typing
    request = self.slot.scheduler.next_request()
    if request is None:
        return None
    d = self._download(request, self.spider)
    d.addBoth(self._handle_downloader_output, request)
    d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': self.spider}))
    d.addBoth(lambda _: self.slot.remove_request(request))
    d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': self.spider}))
    d.addBoth(lambda _: self.slot.nextcall.schedule())
    d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': self.spider}))
    return d
def _handle_downloader_output(self, response, request, spider):
    assert isinstance(response, (Request, Response, Failure)), response
    # downloader middleware can return requests (for example, redirects)
    if isinstance(response, Request):
        self.crawl(response, spider)
        return
    # response is a Response or Failure
    d = self.scraper.enqueue_scrape(response, request, spider)
    d.addErrback(
        lambda f: logger.error('Error while enqueuing downloader output',
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
    return d
def _next_request_from_scheduler(self, spider):
    slot = self.slot
    request = slot.scheduler.next_request()
    if not request:
        return
    d = self._download(request, spider)
    d.addBoth(self._handle_downloader_output, request, spider)
    d.addErrback(
        lambda f: logger.info('Error while handling downloader output',
                              exc_info=failure_to_exc_info(f),
                              extra={'spider': spider}))
    d.addBoth(lambda _: slot.remove_request(request))
    d.addErrback(
        lambda f: logger.info('Error while removing request from slot',
                              exc_info=failure_to_exc_info(f),
                              extra={'spider': spider}))
    d.addBoth(lambda _: slot.nextcall.schedule())
    d.addErrback(
        lambda f: logger.info('Error while scheduling new request',
                              exc_info=failure_to_exc_info(f),
                              extra={'spider': spider}))
    return d
def item_completed(self, results, item, info):
    for ok, value in results:
        if ok:
            item['isload'] = 'download succeeded'
            item['file_path'] = value['path']
        else:
            item['isload'] = 'download failed'
            logger.error('%(class)s found errors processing %(item)s',
                         {'class': self.__class__.__name__, 'item': item},
                         exc_info=failure_to_exc_info(value),
                         extra={'spider': info.spider})
    return item
def err(_stuff=None, _why=None, **kw):
    warnings.warn(
        'log.err has been deprecated, create a python logger and '
        'use its error method instead',
        ScrapyDeprecationWarning, stacklevel=2)
    level = kw.pop('level', logging.ERROR)
    failure = kw.pop('failure', _stuff) or Failure()
    message = kw.pop('why', _why) or failure.value
    logger.log(level, message, *[kw] if kw else [],
               exc_info=failure_to_exc_info(failure))
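The shim above forwards old `log.err` calls to the stdlib logging module. A minimal sketch of the replacement pattern its warning recommends, using Scrapy's real `scrapy.utils.log.failure_to_exc_info` helper; the try/except and message are illustrative only:

import logging

from twisted.python.failure import Failure
from scrapy.utils.log import failure_to_exc_info

logger = logging.getLogger(__name__)

try:
    1 / 0
except ZeroDivisionError:
    # Failure() with no arguments captures the currently handled exception
    failure = Failure()
    # Equivalent of the deprecated log.err(failure): ERROR level, with traceback
    logger.error("Division failed", exc_info=failure_to_exc_info(failure))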
def _next_request_from_scheduler(self, spider):
    slot = self.slot
    request = slot.scheduler.next_request()  # pull the next request from the scheduler
    if not request:
        return
    # Run each middleware's process_request, then hand the request to
    # self._enqueue_request for download and attach the callbacks
    # (e.g. process_response)
    d = self._download(request, spider)
    d.addBoth(self._handle_downloader_output, request, spider)
    d.addErrback(
        lambda f: logger.info('Error while handling downloader output',
                              exc_info=failure_to_exc_info(f),
                              extra={'spider': spider}))
    d.addBoth(lambda _: slot.remove_request(request))
    d.addErrback(
        lambda f: logger.info('Error while removing request from slot',
                              exc_info=failure_to_exc_info(f),
                              extra={'spider': spider}))
    d.addBoth(lambda _: slot.nextcall.schedule())
    d.addErrback(
        lambda f: logger.info('Error while scheduling new request',
                              exc_info=failure_to_exc_info(f),
                              extra={'spider': spider}))
    return d
def _handle_downloader_output(self, response, request, spider):
    if not isinstance(response, (Request, Response, Failure)):
        raise TypeError(
            "Incorrect type: expected Request, Response or Failure, got "
            f"{type(response)}: {response!r}")
    # downloader middleware can return requests (for example, redirects)
    if isinstance(response, Request):
        self.crawl(response, spider)
        return
    # response is a Response or Failure
    d = self.scraper.enqueue_scrape(response, request, spider)
    d.addErrback(
        lambda f: logger.error('Error while enqueuing downloader output',
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
    return d
def _handle_downloader_output(self, response, request, spider):
    ## The download result `response` must be a Request, Response, or Failure
    assert isinstance(response, (Request, Response, Failure)), response
    # downloader middleware can return requests (for example, redirects)
    ## If the result is a Request, call crawl again to run the Scheduler's
    ## enqueue logic
    if isinstance(response, Request):
        self.crawl(response, spider)
        return
    # response is a Response or Failure
    ## If the result is a Response or Failure, hand it to the scraper's
    ## enqueue_scrape method for further processing, which mainly means
    ## interacting with the spiders and pipelines
    d = self.scraper.enqueue_scrape(response, request, spider)
    d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                        exc_info=failure_to_exc_info(f),
                                        extra={'spider': spider}))
    return d
def _handle_downloader_output(self, response, request, spider):
    # This receives the response produced by the download step
    assert isinstance(response, (Request, Response, Failure)), response
    # downloader middleware can return requests (for example, redirects)
    if isinstance(response, Request):
        # If the result is a Request, enqueue it directly via self.crawl:
        # the request is fingerprint-filtered and, if it passes, queued,
        # then the heartbeat loop continues recursively
        self.crawl(response, spider)
        return
    # response is a Response or Failure
    # For a proper Response, the scraper runs its three processing steps on
    # the downloader output; if the result is a Request it is enqueued again,
    # and if it is a dict or Item, process_item handles it downstream
    d = self.scraper.enqueue_scrape(response, request, spider)
    d.addErrback(
        lambda f: logger.error('Error while enqueuing downloader output',
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
    return d
def item_completed(self, results, item, info):
    if self.LOG_FAILED_RESULTS:
        for ok, value in results:
            if not ok:
                logger.error('%(class)s found errors processing %(item)s',
                             {'class': self.__class__.__name__, 'item': item},
                             exc_info=failure_to_exc_info(value),
                             extra={'spider': info.spider})
    item["files"] = [{"file_id": x['file_id'], "url": x['url']}
                     for ok, x in results if ok]
    return item
def close_spider(self, spider):
    slot = self.slot
    if not slot.itemcount and not self.store_empty:
        return
    slot.exporter.finish_exporting()
    logfmt = "%s %%(format)s feed (%%(itemcount)d items) in: %%(uri)s"
    log_args = {'format': self.format,
                'itemcount': slot.itemcount,
                'uri': slot.uri}
    d = defer.maybeDeferred(slot.storage.store, slot.file)
    d.addCallback(lambda _: logger.info(logfmt % "Stored", log_args,
                                        extra={'spider': spider}))
    d.addErrback(lambda f: logger.error(logfmt % "Error storing", log_args,
                                        exc_info=failure_to_exc_info(f),
                                        extra={'spider': spider}))
    return d
def enqueue_scrape(self, response, request, spider):
    slot = self.slot
    dfd = slot.add_response_request(response, request)

    def finish_scraping(_):
        slot.finish_response(response, request)
        self._check_if_closing(spider, slot)
        self._scrape_next(spider, slot)
        return _

    dfd.addBoth(finish_scraping)
    dfd.addErrback(
        lambda f: logger.error('Scraper bug processing %(request)s',
                               {'request': request},
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
    self._scrape_next(spider, slot)
    return dfd
def _handle_downloader_output(self, response, request, spider):
    # The download result must be one of: Request / Response / Failure
    assert isinstance(response, (Request, Response, Failure)), response
    # downloader middleware can return requests (for example, redirects)
    # Case 1: a Request, which must go through another round of downloading
    if isinstance(response, Request):
        self.crawl(response, spider)
        return
    # response is a Response or Failure
    # Case 2: use the scraper to interact with the spiders/pipelines,
    # see scrapy/core/scraper.py
    d = self.scraper.enqueue_scrape(response, request, spider)
    d.addErrback(
        lambda f: logger.error('Error while enqueuing downloader output',
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
    return d
def _handle_downloader_output(self, response, request, spider):
    # The download result must be a Request, Response, or Failure
    assert isinstance(response, (Request, Response, Failure)), response
    # downloader middleware can return requests (for example, redirects)
    if isinstance(response, Request):
        # If it is a Request, call crawl again to run the Scheduler's
        # enqueue logic
        self.crawl(response, spider)
        return
    # response is a Response or Failure
    # Mainly interacts with the spiders and pipelines
    d = self.scraper.enqueue_scrape(response, request, spider)
    d.addErrback(
        lambda f: logger.error('Error while enqueuing downloader output',
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
    return d
def handle_spider_error(self, _failure, request, response, spider):
    exc = _failure.value
    if isinstance(exc, CloseSpider):
        self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
        return
    logkws = self.logformatter.spider_error(_failure, request, response, spider)
    logger.log(*logformatter_adapter(logkws),
               exc_info=failure_to_exc_info(_failure),
               extra={'spider': spider})
    self.signals.send_catch_log(signal=signals.spider_error, failure=_failure,
                                response=response, spider=spider)
    self.crawler.stats.inc_value(
        "spider_exceptions/%s" % _failure.value.__class__.__name__,
        spider=spider)
def process_pokemon_request(self, request, info):
    fingerprint = request_fingerprint(request)
    callback = request.callback or (lambda _: _)
    errorback = request.errback
    request.callback = None
    request.errback = None
    # Otherwise, wait for result
    wad = Deferred().addCallbacks(callback, errorback)
    info.waiting[fingerprint].append(wad)
    info.downloading.add(fingerprint)
    dfd = mustbe_deferred(self.media_to_download, request, info)
    dfd.addCallback(self._check_media_to_download, request, info)
    dfd.addBoth(self._cache_result_and_execute_waiters, fingerprint, info)
    dfd.addErrback(lambda f: logger.error(f.value,
                                          exc_info=failure_to_exc_info(f),
                                          extra={'spider': info.spider}))
    return dfd.addBoth(lambda _: wad)  # it must return wad at last
def _itemproc_finished(self, output, item, response, spider): """ItemProcessor finished for the given ``item`` and returned ``output``""" self.slot.itemproc_size -= 1 if isinstance(output, Failure): ex = output.value if isinstance(ex, DropItem): logkws = self.logformatter.dropped(item, ex, response, spider) if logkws is not None: logger.log(*logformatter_adapter(logkws), extra={"spider": spider}) return self.signals.send_catch_log_deferred( signal=signals.item_dropped, item=item, response=response, spider=spider, exception=output.value, ) else: logkws = self.logformatter.item_error(item, ex, response, spider) logger.log( *logformatter_adapter(logkws), extra={"spider": spider}, exc_info=failure_to_exc_info(output), ) return self.signals.send_catch_log_deferred( signal=signals.item_error, item=item, response=response, spider=spider, failure=output, ) else: logkws = self.logformatter.scraped(output, response, spider) if logkws is not None: logger.log(*logformatter_adapter(logkws), extra={"spider": spider}) return self.signals.send_catch_log_deferred( signal=signals.item_scraped, item=output, response=response, spider=spider, )
def media_to_download(self, request, info):
    def _onsuccess(result):
        if not result:
            return  # returning None forces download
        last_modified = result.get('last_modified', None)
        if not last_modified:
            return  # returning None forces download
        age_seconds = time.time() - last_modified
        age_days = age_seconds / 60 / 60 / 24
        if age_days > self.expires:
            return  # returning None forces download
        referer = referer_str(request)
        logger.debug(
            'File (uptodate): Downloaded %(medianame)s from %(request)s '
            'referred in <%(referer)s>',
            {'medianame': self.MEDIA_NAME, 'request': request,
             'referer': referer},
            extra={'spider': info.spider})
        self.inc_stats(info.spider, 'uptodate')
        checksum = result.get('checksum', None)
        return {
            'checksum': checksum,
            # TODO: Refactor this!
            'image_fields': info.urls_fields[request.url],
            'url': request.url,
            'path': '%s%s' % (self.image_base_url, path),
        }

    path = self.file_path(request, info=info)  # closed over by _onsuccess
    dfd = defer.maybeDeferred(self.store.stat_file, path, info)
    dfd.addCallbacks(_onsuccess, lambda _: None)
    dfd.addErrback(lambda f: logger.error(self.__class__.__name__ + '.store.stat_file',
                                          exc_info=failure_to_exc_info(f),
                                          extra={'spider': info.spider}))
    return dfd
def enqueue_scrape(self, result: Union[Response, Failure], request: Request,
                   spider: Spider) -> Deferred:
    if self.slot is None:
        raise RuntimeError("Scraper slot not assigned")
    dfd = self.slot.add_response_request(result, request)

    def finish_scraping(_):
        self.slot.finish_response(result, request)
        self._check_if_closing(spider)
        self._scrape_next(spider)
        return _

    dfd.addBoth(finish_scraping)
    dfd.addErrback(
        lambda f: logger.error('Scraper bug processing %(request)s',
                               {'request': request},
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
    self._scrape_next(spider)
    return dfd
def handle_spider_error(self, _failure, request, response, spider): exc = _failure.value if isinstance(exc, CloseSpider): self.crawler.engine.close_spider(spider, exc.reason or 'cancelled') return logger.error( "Spider error processing %(request)s (referer: %(referer)s)", { 'request': request, 'referer': referer_str(request) }, exc_info=failure_to_exc_info(_failure), extra={'spider': spider}) self.signals.send_catch_log(signal=signals.spider_error, failure=_failure, response=response, spider=spider) self.crawler.stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__, spider=spider)
def _log_download_errors(self, spider_failure, download_failure, request, spider):
    """Log and silence errors that come from the engine (typically download
    errors that got propagated through here)
    """
    if (isinstance(download_failure, Failure)
            and not download_failure.check(IgnoreRequest)):
        if download_failure.frames:
            logger.error('Error downloading %(request)s',
                         {'request': request},
                         exc_info=failure_to_exc_info(download_failure),
                         extra={'spider': spider})
        else:
            errmsg = download_failure.getErrorMessage()
            if errmsg:
                logger.error('Error downloading %(request)s: %(errmsg)s',
                             {'request': request, 'errmsg': errmsg},
                             extra={'spider': spider})
    if spider_failure is not download_failure:
        return spider_failure
def handle_spider_error(self, _failure, request, response, spider): exc = _failure.value if isinstance(exc, CloseSpider): self.crawler.engine.close_spider(spider, exc.reason or 'cancelled') return logger.error( "Spider error processing %(request)s (referer: %(referer)s)", {'request': request, 'referer': referer_str(request)}, exc_info=failure_to_exc_info(_failure), extra={'spider': spider} ) self.signals.send_catch_log( signal=signals.spider_error, failure=_failure, response=response, spider=spider ) self.crawler.stats.inc_value( "spider_exceptions/%s" % _failure.value.__class__.__name__, spider=spider )
def enqueue_scrape(self, response, request, spider):
    # Runs process_spider_input, process_spider_exception and
    # process_spider_output, then the pipelines' process_item handlers
    slot = self.slot
    dfd = slot.add_response_request(response, request)  # push the data onto self.queue

    def finish_scraping(_):
        slot.finish_response(response, request)
        self._check_if_closing(spider, slot)
        self._scrape_next(spider, slot)
        return _

    dfd.addBoth(finish_scraping)
    dfd.addErrback(lambda f: logger.error(
        'Scraper bug processing %(request)s',
        {'request': request},
        exc_info=failure_to_exc_info(f),
        extra={'spider': spider}))
    # Mainly to run scrape_response, i.e. the scraper's response-handling logic
    self._scrape_next(spider, slot)
    return dfd
def media_to_download(self, request, info):
    path = self.file_path(request, info=info)

    def _onsuccess(result):
        if not result:
            return  # returning None forces download
        last_modified = result.get('last_modified', None)
        if not last_modified:
            return  # returning None forces download
        age_seconds = time.time() - last_modified
        age_days = age_seconds / 60 / 60 / 24
        if age_days > self.EXPIRES:
            return  # returning None forces download
        referer = request.headers.get('Referer')
        logger.debug(
            'File (uptodate): Downloaded %(medianame)s from %(request)s '
            'referred in <%(referer)s>',
            {'medianame': self.MEDIA_NAME, 'request': request,
             'referer': referer},
            extra={'spider': info.spider})
        self.inc_stats(info.spider, 'uptodate')
        checksum = result.get('checksum', None)
        # ret = {'url': request.url, 'path': path, 'checksum': checksum}
        # filename = result.get('filename', None)
        # if filename:
        #     ret['filename'] = filename
        return {'url': request.url, 'path': path, 'checksum': checksum}

    dfd = defer.maybeDeferred(self.store.stat_file, path, info)
    dfd.addCallbacks(_onsuccess, lambda _: None)
    dfd.addErrback(lambda f: logger.error(self.__class__.__name__ + '.store.stat_file',
                                          exc_info=failure_to_exc_info(f),
                                          extra={'spider': info.spider}))
    return dfd
def media_to_download(self, request, info):
    path = self.file_path(request, info=info)

    def _onsuccess(result):
        if not result:
            return  # returning None forces download
        last_modified = result.get('last_modified', None)
        if not last_modified:
            return  # returning None forces download
        age_seconds = time.time() - last_modified
        age_days = age_seconds / 60 / 60 / 24
        if age_days > self.EXPIRES:
            return  # returning None forces download
        referer = request.headers.get('Referer')
        logger.debug(
            'File (uptodate): Downloaded %(medianame)s from %(request)s '
            'referred in <%(referer)s>',
            {'medianame': self.MEDIA_NAME, 'request': request,
             'referer': referer},
            extra={'spider': info.spider}
        )
        self.inc_stats(info.spider, 'uptodate')
        checksum = result.get('checksum', None)
        # ret = {'url': request.url, 'path': path, 'checksum': checksum}
        # filename = result.get('filename', None)
        # if filename:
        #     ret['filename'] = filename
        return {'url': request.url, 'path': path, 'checksum': checksum}

    dfd = defer.maybeDeferred(self.store.stat_file, path, info)
    dfd.addCallbacks(_onsuccess, lambda _: None)
    dfd.addErrback(
        lambda f: logger.error(self.__class__.__name__ + '.store.stat_file',
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': info.spider})
    )
    return dfd
def _itemproc_finished(self, output, item, response, spider): """ItemProcessor finished for the given ``item`` and returned ``output`` """ self.slot.itemproc_size -= 1 if isinstance(output, Failure): ex = output.value if isinstance(ex, DropItem): logkws = self.logformatter.dropped(item, ex, response, spider) logger.log(*logformatter_adapter(logkws), extra={'spider': spider}) return self.signals.send_catch_log_deferred( signal=signals.item_dropped, item=item, response=response, spider=spider, exception=output.value) else: logger.error('Error processing %(item)s', {'item': item}, exc_info=failure_to_exc_info(output), extra={'spider': spider}) else: logkws = self.logformatter.scraped(output, response, spider) logger.log(*logformatter_adapter(logkws), extra={'spider': spider}) return self.signals.send_catch_log_deferred( signal=signals.item_scraped, item=output, response=response, spider=spider)
def _handle_downloader_output(self, result: Union[Request, Response, Failure],
                              request: Request) -> Optional[Deferred]:
    assert self.spider is not None  # typing
    if not isinstance(result, (Request, Response, Failure)):
        raise TypeError(
            f"Incorrect type: expected Request, Response or Failure, got {type(result)}: {result!r}"
        )
    # downloader middleware can return requests (for example, redirects)
    if isinstance(result, Request):
        self.crawl(result)
        return None
    d = self.scraper.enqueue_scrape(result, request, self.spider)
    d.addErrback(lambda f: logger.error(
        "Error while enqueuing downloader output",
        exc_info=failure_to_exc_info(f),
        extra={'spider': self.spider},
    ))
    return d
def enqueue_scrape(self, response, request, spider):
    # `response` here is actually a deferred result
    slot = self.slot
    dfd = slot.add_response_request(response, request)  # push the data into the cache

    def finish_scraping(_):
        slot.finish_response(response, request)  # remove this result from the slot
        self._check_if_closing(spider, slot)  # check whether we are shutting down
        # register processing of the next response once this one finishes
        # (this only registers the work on the deferred chain)
        self._scrape_next(spider, slot)
        return _

    dfd.addBoth(finish_scraping)
    dfd.addErrback(lambda f: logger.error(
        'Scraper bug processing %(request)s',
        {'request': request},
        exc_info=failure_to_exc_info(f),
        extra={'spider': spider}))
    # This is where the slot's queued work actually gets kicked off
    # (note: it runs the slot's work, not necessarily this request)
    self._scrape_next(spider, slot)
    return dfd
def test_default_item_completed(self):
    item = dict(name='name')
    assert self.pipe.item_completed([], item, self.info) is item

    # Check that failures are logged by default
    fail = Failure(Exception())
    results = [(True, 1), (False, fail)]
    with LogCapture() as l:
        new_item = self.pipe.item_completed(results, item, self.info)
    assert new_item is item
    assert len(l.records) == 1
    record = l.records[0]
    assert record.levelname == 'ERROR'
    self.assertTupleEqual(record.exc_info, failure_to_exc_info(fail))

    # disable failure logging and check again
    self.pipe.LOG_FAILED_RESULTS = False
    with LogCapture() as l:
        new_item = self.pipe.item_completed(results, item, self.info)
    assert new_item is item
    assert len(l.records) == 0
def _itemproc_finished(self, output, item, response, spider): """ItemProcessor finished for the given ``item`` and returned ``output`` """ self.slot.itemproc_size -= 1 if isinstance(output, Failure): ex = output.value ## 如果在 pipeline 处理中抛 DropItem 异常,则忽略处理结果 ## 从这里可以看到,如果想在 Pipeline 中丢弃某个结果,直接抛出 DropItem 异常即可 ## scrapy 会进行相应的处理 if isinstance(ex, DropItem): logkws = self.logformatter.dropped(item, ex, response, spider) logger.log(*logformatter_adapter(logkws), extra={'spider': spider}) return self.signals.send_catch_log_deferred( signal=signals.item_dropped, item=item, response=response, spider=spider, exception=output.value) else: logger.error('Error processing %(item)s', {'item': item}, exc_info=failure_to_exc_info(output), extra={'spider': spider}) return self.signals.send_catch_log_deferred( signal=signals.item_error, item=item, response=response, spider=spider, failure=output) else: logkws = self.logformatter.scraped(output, response, spider) logger.log(*logformatter_adapter(logkws), extra={'spider': spider}) return self.signals.send_catch_log_deferred( signal=signals.item_scraped, item=output, response=response, spider=spider)
def close_spider(self, spider):
    deferred_list = []
    for slot in self.slots:
        if not slot.itemcount and not slot.store_empty:
            # We need to call slot.storage.store nonetheless to get the file
            # properly closed.
            return defer.maybeDeferred(slot.storage.store, slot.file)
        slot.finish_exporting()
        logfmt = "%s %%(format)s feed (%%(itemcount)d items) in: %%(uri)s"
        log_args = {'format': slot.format,
                    'itemcount': slot.itemcount,
                    'uri': slot.uri}
        d = defer.maybeDeferred(slot.storage.store, slot.file)
        d.addCallback(lambda _: logger.info(
            logfmt % "Stored", log_args, extra={'spider': spider}))
        d.addErrback(
            lambda f: logger.error(logfmt % "Error storing", log_args,
                                   exc_info=failure_to_exc_info(f),
                                   extra={'spider': spider}))
        deferred_list.append(d)
    return defer.DeferredList(deferred_list) if deferred_list else None
def _close_slot(self, slot, spider):
    if not slot.itemcount and not slot.store_empty:
        # We need to call slot.storage.store nonetheless to get the file
        # properly closed.
        return defer.maybeDeferred(slot.storage.store, slot.file)
    slot.finish_exporting()
    logfmt = "%s %%(format)s feed (%%(itemcount)d items) in: %%(uri)s"
    log_args = {'format': slot.format,
                'itemcount': slot.itemcount,
                'uri': slot.uri}
    d = defer.maybeDeferred(slot.storage.store, slot.file)
    # Use `largs=log_args` to copy log_args into function's scope
    # instead of using `log_args` from the outer scope
    d.addCallback(lambda _, largs=log_args: logger.info(
        logfmt % "Stored", largs, extra={'spider': spider}))
    d.addErrback(lambda f, largs=log_args: logger.error(
        logfmt % "Error storing", largs,
        exc_info=failure_to_exc_info(f),
        extra={'spider': spider}))
    return d
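The `largs=log_args` default argument in `_close_slot` exists because Python closures bind names, not values; a self-contained illustration of the pitfall it avoids (not Scrapy code):

# Closures capture variables, not values: every lambda sees the loop
# variable's final value unless a default argument freezes it.
fns_late = [lambda: i for i in range(3)]
fns_bound = [lambda i=i: i for i in range(3)]

print([f() for f in fns_late])   # [2, 2, 2]: all closures see i's final value
print([f() for f in fns_bound])  # [0, 1, 2]: defaults bound at definition time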
def errback(failure):
    # `msg` and `spider` are free variables from the enclosing scope
    logger.error(
        msg,
        exc_info=failure_to_exc_info(failure),
        extra={'spider': spider}
    )
def test_non_failure(self):
    self.assertIsNone(failure_to_exc_info("test"))
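Together, test_failure and test_non_failure pin down the helper's contract: a Failure maps to an exc_info triple, anything else maps to None. A minimal sketch consistent with Scrapy's scrapy.utils.log.failure_to_exc_info:

from twisted.python.failure import Failure

def failure_to_exc_info(failure):
    """Extract an exc_info tuple from a twisted.python.failure.Failure."""
    if isinstance(failure, Failure):
        # Same (type, value, traceback) shape that sys.exc_info() returns,
        # which is why it can be passed straight to logging's exc_info=
        return (failure.type, failure.value, failure.getTracebackObject())
    return None  # non-Failure input: nothing to extract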