def _cache_result_and_execute_waiters(self, result, fp, info):
    if isinstance(result, Failure):
        # minimize cached information for failure
        result.cleanFailure()
        result.frames = []
        result.stack = None

        # This code fixes a memory leak by avoiding keeping references to
        # the Request and Response objects on the Media Pipeline cache.
        #
        # Twisted inline callbacks pass return values using the function
        # twisted.internet.defer.returnValue, which encapsulates the return
        # value inside a _DefGen_Return base exception.
        #
        # What happens when the media_downloaded callback raises another
        # exception, for example a FileException('download-error') when
        # the Response status code is not 200 OK, is that it stores the
        # _DefGen_Return exception on the FileException context.
        #
        # To avoid keeping references to the Response and therefore Request
        # objects on the Media Pipeline cache, we should wipe the context of
        # the exception encapsulated by the Twisted Failure when it is a
        # _DefGen_Return instance.
        #
        # This problem does not occur in Python 2.7 since we don't have
        # Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
        context = getattr(result.value, '__context__', None)
        if isinstance(context, _DefGen_Return):
            setattr(result.value, '__context__', None)

    info.downloading.remove(fp)
    info.downloaded[fp] = result  # cache result
    for wad in info.waiting.pop(fp):
        defer_result(result).chainDeferred(wad)
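The implicit exception chaining described in the comments above is plain Python 3 behavior and can be reproduced without Twisted or Scrapy. A minimal sketch follows; `_DefGen_Return` and `FileException` here are local stand-ins, not the real Twisted and Scrapy types. Raising a new exception inside an `except` block silently stores the old one on `__context__`, which keeps alive everything the old exception references.

class _DefGen_Return(BaseException):  # local stand-in for Twisted's internal class
    def __init__(self, value):
        self.value = value  # in Twisted this would carry the Response

class FileException(Exception):  # local stand-in for Scrapy's FileException
    pass

def media_downloaded_like():
    try:
        raise _DefGen_Return('big response payload')
    except _DefGen_Return:
        # raising a different exception here chains the old one implicitly
        raise FileException('download-error')

try:
    media_downloaded_like()
except FileException as exc:
    # the chained exception (and whatever it references) is still reachable
    assert isinstance(exc.__context__, _DefGen_Return)
    assert exc.__context__.value == 'big response payload'
    exc.__context__ = None  # the snippet's fix: drop that reference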
def _cache_result_and_execute_waiters(self, result, fp, info):
    if isinstance(result, Failure):
        # minimize cached information for failure
        result.cleanFailure()
        result.frames = []
        result.stack = None

    info.downloading.remove(fp)
    info.downloaded[fp] = result  # cache result
    for wad in info.waiting.pop(fp):
        defer_result(result).chainDeferred(wad)
def call_spider(self, result, request, spider):
    result.request = request
    dfd = defer_result(result)
    dfd.addCallbacks(callback=request.callback or spider.parse,
                     errback=request.errback,
                     callbackKeywords=request.cb_kwargs)
    return dfd.addCallback(iterate_spider_output)
def _process_request(self, request, info):
    fp = request_fingerprint(request)
    cb = request.callback or (lambda _: _)
    eb = request.errback
    request.callback = None
    request.errback = None

    # Return cached result if request was already seen
    if fp in info.downloaded:
        return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

    # Otherwise, wait for result
    wad = Deferred().addCallbacks(cb, eb)
    info.waiting[fp].append(wad)

    # Check if request is downloading right now to avoid doing it twice
    if fp in info.downloading:
        return wad

    # Download request checking media_to_download hook output first
    info.downloading.add(fp)
    dfd = mustbe_deferred(self.media_to_download, request, info)
    dfd.addCallback(self._check_media_to_download, request, info)
    dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
    dfd.addErrback(lambda f: logger.error(
        f.value, exc_info=failure_to_exc_info(f),
        extra={'spider': info.spider})
    )
    return dfd.addBoth(lambda _: wad)  # it must return wad at last
def _process_request(self, request, info):
    fp = request_fingerprint(request)
    cb = request.callback or (lambda _: _)
    eb = request.errback
    request.callback = None
    request.errback = None

    # Return cached result if request was already seen
    if fp in info.downloaded:
        return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

    # Otherwise, wait for result
    wad = Deferred().addCallbacks(cb, eb)
    info.waiting[fp].append(wad)

    # Check if request is downloading right now to avoid doing it twice
    if fp in info.downloading:
        return wad

    # Download request checking media_to_download hook output first
    info.downloading.add(fp)
    dfd = mustbe_deferred(self.media_to_download, request, info)
    dfd.addCallback(self._check_media_to_download, request, info)
    dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
    dfd.addErrback(log.err, spider=info.spider)
    return dfd.addBoth(lambda _: wad)  # it must return wad at last
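The deduplication flow in the two `_process_request` variants above can be condensed into a standalone sketch using only `twisted.internet.defer`. This is a simplified stand-in, not Scrapy's implementation: `defer.succeed` takes the place of `defer_result`/`mustbe_deferred`, and `fp` stands for a `request_fingerprint` result. Identical requests share one in-flight download: the first caller triggers it, later callers get a Deferred parked in a waiting list, and the cached result answers everyone once it arrives.

from collections import defaultdict

from twisted.internet import defer

downloaded = {}              # fingerprint -> cached result
downloading = set()          # fingerprints with a download in flight
waiting = defaultdict(list)  # fingerprint -> Deferreds parked for the result

def process(fp, do_download):
    if fp in downloaded:  # cache hit: answer immediately
        return defer.succeed(downloaded[fp])
    wad = defer.Deferred()  # park this caller until the result arrives
    waiting[fp].append(wad)
    if fp in downloading:  # a download is already in flight; just wait
        return wad
    downloading.add(fp)
    dfd = do_download(fp)
    dfd.addBoth(_cache_and_fire, fp)
    return dfd.addBoth(lambda _: wad)

def _cache_and_fire(result, fp):
    downloading.remove(fp)
    downloaded[fp] = result
    for wad in waiting.pop(fp):  # fan the single result out to every waiter
        defer.succeed(result).chainDeferred(wad)
    return result

# both calls resolve to the same payload; the download itself runs only once
d1 = process('fp1', lambda fp: defer.succeed('payload'))
d2 = process('fp1', lambda fp: defer.succeed('payload'))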
def call_spider(self, result, request, spider):  ## call back into the spider module
    result.request = request
    dfd = defer_result(result)
    ## register callbacks; if no callback is defined, fall back to the spider's parse method
    dfd.addCallbacks(request.callback or spider.parse, request.errback)
    return dfd.addCallback(iterate_spider_output)
def call_spider(self, result, request, spider):
    result.request = request
    dfd = defer_result(result)
    callback = request.callback or spider.parse
    warn_on_generator_with_return_value(spider, callback)
    warn_on_generator_with_return_value(spider, request.errback)
    dfd.addCallbacks(callback=callback,
                     errback=request.errback,
                     callbackKeywords=request.cb_kwargs)
    return dfd.addCallback(iterate_spider_output)
def _deferred_value(self, value, spider):
    labels = self.labels.get(value)
    if labels is not None:
        self.logger.debug("found labels in cache for %s: %s", value, labels)
        return defer_result(labels)
    request = Request(self.url.format(value), priority=1)
    deferred = spider.crawler.engine.download(request, spider)
    deferred.addBoth(self._extract_labels, value)
    return deferred
def call_spider(self, result, request, spider):
    result.request = request
    # Schedule handling of result (which is really the response) onto the
    # event loop; on the next iteration the deferred's callback chain
    # processes result.
    dfd = defer_result(result)
    # The callback registered on the request is tried first; if none is
    # set, parse is called, so requests without an explicit callback
    # default to spider.parse.
    dfd.addCallbacks(callback=request.callback or spider.parse,
                     errback=request.errback,
                     callbackKeywords=request.cb_kwargs)
    # Note: this is where the spider's parse method gets attached as a
    # callback; calling parse usually returns a generator.
    return dfd.addCallback(iterate_spider_output)
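The default-callback dispatch that the comments above describe can be shown with plain Deferreds. A hedged sketch: `FakeSpider` and `FakeRequest` are illustrative stand-ins for Scrapy's types, `defer.succeed` stands in for `defer_result` on a plain value, and `iterate_spider_output` (which normalizes the callback's return value) is omitted.

from twisted.internet import defer

class FakeSpider:  # illustrative stand-in
    def parse(self, response):
        return f"parse handled {response}"

class FakeRequest:  # illustrative stand-in
    def __init__(self, callback=None, errback=None, cb_kwargs=None):
        self.callback = callback
        self.errback = errback
        self.cb_kwargs = cb_kwargs or {}

def call_spider_like(result, request, spider):
    dfd = defer.succeed(result)  # defer_result for a plain value
    # request.callback wins; with no callback, spider.parse is the default
    dfd.addCallbacks(callback=request.callback or spider.parse,
                     errback=request.errback,
                     callbackKeywords=request.cb_kwargs)
    return dfd

call_spider_like('<response>', FakeRequest(), FakeSpider()).addCallback(print)
# prints: parse handled <response>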
def call_spider(self, result, request, spider):
    result.request = request
    dfd = defer_result(result)
    if request.callback:
        logger.info('Called request.callback %s', request.callback)
    else:
        logger.info('Called spider.parse')
    dfd.addCallbacks(callback=request.callback or spider.parse,
                     errback=request.errback,
                     callbackKeywords=request.cb_kwargs)
    return dfd.addCallback(iterate_spider_output)
def _deferred_field(self, field, item, spider):
    deferreds = [
        self._deferred_value(value, spider)
        for value in arg_to_iter(item.get(field))
    ]
    if not deferreds:
        item[field] = None
        return defer_result(item)
    deferred = DeferredList(deferreds, consumeErrors=True)
    deferred.addBoth(self._add_value, field, item)
    return deferred
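A small sketch of the aggregation step above, assuming only `twisted.internet.defer`: `DeferredList` gathers the per-value lookups and fires once with `(success, result)` pairs, and `consumeErrors=True` keeps individual failures from being reported as unhandled. The filtering callback is a guess at what a helper like `_add_value` might do, not its actual implementation.

from twisted.internet import defer

lookups = [defer.succeed('label-a'),
           defer.fail(RuntimeError('lookup failed'))]
dl = defer.DeferredList(lookups, consumeErrors=True)

def keep_successes(results):
    # keep only successful lookups; hypothetical stand-in for _add_value
    return [value for ok, value in results if ok]

dl.addBoth(keep_successes)
dl.addCallback(print)  # ['label-a']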
def _post_media_to_download(result):
    if result is None:  # continue with download
        dwld = mustbe_deferred(self.download, request, info)
        dwld.addCallbacks(
            callback=self.media_downloaded,
            callbackArgs=(request, info),
            errback=self.media_failed,
            errbackArgs=(request, info))
    else:  # or use media_to_download return value as result
        dwld = defer_result(result)
    info.downloading[fp] = (request, dwld)  # fill downloading state data
    dwld.addBoth(_downloaded)  # append post-download hook
    dwld.addErrback(log.err, spider=info.spider)
def _cache_result_and_execute_waiters(self, result, fp, info):
    if isinstance(result, Failure):
        # minimize cached information for failure
        result.cleanFailure()
        result.frames = []
        result.stack = None

        # This code fixes a memory leak by avoiding keeping references to
        # the Request and Response objects on the Media Pipeline cache.
        #
        # What happens when the media_downloaded callback raises an
        # exception, for example a FileException('download-error') when
        # the Response status code is not 200 OK, is that the original
        # StopIteration exception (which in turn contains the failed
        # Response and, by extension, the original Request) gets
        # encapsulated within the FileException context.
        #
        # Originally, Scrapy was using twisted.internet.defer.returnValue
        # inside functions decorated with twisted.internet.defer.inlineCallbacks,
        # encapsulating the returned Response in a _DefGen_Return exception
        # instead of a StopIteration.
        #
        # To avoid keeping references to the Response and therefore Request
        # objects on the Media Pipeline cache, we should wipe the context of
        # the encapsulated exception when it is a StopIteration instance.
        #
        # This problem does not occur in Python 2.7 since we don't have
        # Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
        context = getattr(result.value, '__context__', None)
        if isinstance(context, StopIteration):
            setattr(result.value, '__context__', None)

    info.downloading.remove(fp)
    info.downloaded[fp] = result  # cache result
    for wad in info.waiting.pop(fp):
        defer_result(result).chainDeferred(wad)
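Complementing the earlier chaining sketch, here is a hedged way to check that wiping `__context__` actually releases the chained objects. `Resp` and `DownloadError` are illustrative stand-ins; `StopIteration` carries the stand-in response the way a generator's return value does. Clearing the traceback alone (roughly what `cleanFailure()` achieves) is not enough; the weakly-referenced "response" only becomes collectable once the context is also dropped.

import gc
import weakref

class Resp:  # stand-in for a Response holding large data
    pass

class DownloadError(Exception):  # stand-in for FileException('download-error')
    pass

def fail_with_chained_response():
    resp = Resp()
    try:
        raise StopIteration(resp)  # carries resp, like a generator's return value
    except StopIteration:
        raise DownloadError('download-error')  # implicitly chains the StopIteration

cached = None
try:
    fail_with_chained_response()
except DownloadError as exc:
    cached = exc  # emulate caching the failure

ref = weakref.ref(cached.__context__.value)  # weakly track the Resp instance
cached.__traceback__ = None  # roughly what cleanFailure() achieves
gc.collect()
assert ref() is not None  # still alive through the __context__ chain
cached.__context__ = None  # the snippet's actual fix
gc.collect()
assert ref() is None  # the response stand-in is now collectable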
def _enqueue(self, request, info):
    wad = request.deferred or Deferred()
    fp = request_fingerprint(request)

    # if already downloaded, return cached result.
    if fp in info.downloaded:
        return defer_result(info.downloaded[fp]).chainDeferred(wad)

    # add to pending list for this request, and wait for result like the others.
    info.waiting.setdefault(fp, []).append(wad)

    # if request is not downloading, download it.
    if fp not in info.downloading:
        self._download(request, info, fp)

    return wad
def call_spider(self, result, request, spider):
    # result is really the Response produced once the download completes
    result.request = request  # finished executing
    dfd = defer_result(result)  # what does this step accomplish? It does not seem to run anything yet
    """
    Entry URLs all come from start_urls, so the response the downloader
    produces corresponds to a bare URL; no callback has run at this point.
    The callback (or parse) here applies to the previous level: for
    requests yielded by start_requests, the fetched result is handled by
    the parse function.
    """
    # found it: callback takes priority over parse; if none is given, parse is the default
    dfd.addCallbacks(request.callback or spider.parse, request.errback)
    """
    OK, I was overthinking it: the binding here targets the previous level
    and has nothing to do with what this level returns; only if a Request
    is returned do we come back through here on the next round. The
    callback is attached here as the follow-up processing step. For
    example, with five home-page entry points, the crawler first records
    all five entry responses and only then moves on to pagination; how is
    that implemented?
    """
    return dfd.addCallback(iterate_spider_output)
def _enqueue(self, request, info):
    fp = request_fingerprint(request)
    cb = request.callback or (lambda _: _)
    eb = request.errback

    # if already downloaded, return cached result.
    if fp in info.downloaded:
        return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

    wad = Deferred().addCallbacks(cb, eb)
    # add to pending list for this request, and wait for result like the others.
    info.waiting.setdefault(fp, []).append(wad)

    # if request is not downloading, download it.
    if fp not in info.downloading:
        self._download(request, info, fp)

    return wad
def call_spider(self, result, request, spider):
    dfd = defer_result(result)
    dfd.addCallbacks(request.callback or spider.parse, request.errback)
    return dfd.addCallback(iterate_spider_output)
def call_spider(self, result, request, spider):
    result.request = request
    dfd = defer_result(result)
    # vvvvvvvvvvvv - patched
    dfd.addCallbacks(request.callback or spider.from_scraper, request.errback)
    return dfd.addCallback(iterate_spider_output)
def call_spider(self, result, request, spider):
    defer_result(result).chainDeferred(request.deferred)
    return request.deferred.addCallback(iterate_spider_output)
def call_spider(self, result, request, spider):
    result.request = request
    dfd = defer_result(result)
    dfd.addCallbacks(request.callback or spider.parse, request.errback)
    return dfd.addCallback(iterate_spider_output)
def _cache_result_and_execute_waiters(self, result, fp, info):
    info.downloading.remove(fp)
    info.downloaded[fp] = result  # cache result
    for wad in info.waiting.pop(fp):
        defer_result(result).chainDeferred(wad)
def _downloaded(result):
    info.downloading.pop(fp)
    info.downloaded[fp] = result
    for wad in info.waiting.pop(fp):  # pass result to each waiting client
        defer_result(result).chainDeferred(wad)
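A hedged sketch of the fan-out step used by the two waiter-execution functions above, assuming only `twisted.internet.defer`. The `defer_result` here is a simplified stand-in for `scrapy.utils.defer.defer_result` (the real one also passes Deferreds through): it wraps a plain value or a Failure in an already-fired Deferred, so chaining it replays the cached outcome into every waiter, routing Failures to their errbacks.

from twisted.internet import defer
from twisted.python.failure import Failure

def defer_result(result):  # simplified stand-in
    if isinstance(result, Failure):
        return defer.fail(result)
    return defer.succeed(result)

waiters = [defer.Deferred(), defer.Deferred()]
for wad in waiters:
    wad.addCallbacks(lambda r: print('got', r),
                     lambda f: print('failed:', f.value))

result = 'cached response'  # or Failure(RuntimeError()) to exercise errbacks
for wad in waiters:
    defer_result(result).chainDeferred(wad)  # replay the outcome to each waiter
# prints "got cached response" twice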