def basic_items_check(items, obligate_fields, primary_fields, request_url):
    """Validate that every item has the required and the non-empty fields.

    Args:
        items: iterable of mapping-like items (must offer ``keys()``/``get()``).
        obligate_fields: field names that must be present as keys of every
            item. Any iterable is accepted; it is coerced to a set once.
        primary_fields: field names whose value must be truthy in every item.
        request_url: URL included in the error message for context.

    Raises:
        _InvalidOutput: if an item lacks an obligate field, or a primary
            field is missing or holds a falsy value.
    """
    # Sets/frozensets are used as-is (keeps the original message repr);
    # lists, tuples and other iterables are converted so that the set
    # arithmetic below works instead of failing with AttributeError.
    if isinstance(obligate_fields, (set, frozenset)):
        required = obligate_fields
    else:
        required = set(obligate_fields)

    for item in items:
        missing_fields = required.difference(item.keys())
        if missing_fields:
            raise _InvalidOutput(
                "Obligate fields check failed. Request url: %s. "
                "Missing fields: %s" % (request_url, missing_fields))

        for field in primary_fields:
            # A missing key and a falsy value are both treated as "empty".
            if not item.get(field, ""):
                raise _InvalidOutput(
                    "Primary fields check failed. Request url: %s. "
                    "Empty field: %s" % (request_url, field))
def process_spider_output(result, start_index=0):
    """Feed ``result`` through the ``process_spider_output`` methods.

    Uses ``self``, ``response`` and ``spider`` from the enclosing scope
    (not visible in this block).

    Args:
        result: the value handed to the first applicable method.
        start_index: position in the method list to start from.

    Returns:
        A ``MutableChain`` of the final result plus recovered items, or
        whatever ``process_spider_exception`` returned when a method raised
        and the exception was handled.

    Raises:
        _InvalidOutput: if a method returns a non-iterable.
    """
    # Items in this iterable do not need to go through the
    # process_spider_output chain again; they already went through it from
    # the process_spider_exception method.
    recovered = MutableChain()

    remaining = islice(self.methods['process_spider_output'], start_index, None)
    for position, middleware in enumerate(remaining, start=start_index):
        if middleware is None:
            continue
        try:
            # Might fail directly if the output value is not a generator.
            result = middleware(response=response, result=result, spider=spider)
        except Exception as ex:
            outcome = process_spider_exception(ex, position + 1)
            if isinstance(outcome, (Exception, BaseException)):
                # Nobody handled it: re-raise the exception being handled.
                raise
            return outcome

        if not _isiterable(result):
            msg = "Middleware {} must return an iterable, got {}"
            raise _InvalidOutput(
                msg.format(_fname(middleware), type(result)))
        result = _evaluate_iterable(result, position + 1, recovered)

    return MutableChain(result, recovered)
def process_spider_exception(_failure, start_index=0):
    """Offer the exception in ``_failure`` to the exception-handling methods.

    Uses ``self``, ``response`` and ``spider`` from the enclosing scope
    (not visible in this block).

    Args:
        _failure: object whose ``.value`` is the exception being handled.
        start_index: position in the method list to start from.

    Returns:
        The result of ``process_spider_output`` when a method returns an
        iterable; otherwise ``_failure`` unchanged.

    Raises:
        _InvalidOutput: if a method returns something other than ``None``
            or an iterable.
    """
    exception = _failure.value
    # _InvalidOutput exceptions are never handed to the methods.
    if isinstance(exception, _InvalidOutput):
        return _failure

    handlers = islice(
        self.methods['process_spider_exception'], start_index, None)
    # Run each spider middleware's process_spider_exception in turn.
    for position, handler in enumerate(handlers, start=start_index):
        if handler is None:
            continue
        outcome = handler(
            response=response, exception=exception, spider=spider)
        if _isiterable(outcome):
            # An iterable stops exception handling: control moves to the
            # process_spider_output chain, starting after this method.
            return process_spider_output(outcome, position + 1)
        if outcome is not None:
            raise _InvalidOutput(
                'Middleware {} must return None or an iterable, got {}'
                .format(fname(handler), type(outcome)))
        # None means "not handled here": try the next method.

    return _failure
def _process_spider_output(self, response, spider, result, start_index=0):
    """Feed ``result`` through ``self.methods['process_spider_output']``.

    Args:
        response: passed to every method as ``response``.
        spider: passed to every method as ``spider``.
        result: the value handed to the first applicable method.
        start_index: position in the method list to start from.

    Returns:
        ``MutableChain(result, recovered)``, or the value returned by
        ``self._process_spider_exception`` when a method raised and that
        value is not a ``Failure``.

    Raises:
        _InvalidOutput: if a method returns a non-iterable.
    """
    # Items in this iterable do not need to go through the
    # process_spider_output chain again; they already went through it from
    # the process_spider_exception method.
    recovered = MutableChain()

    pending = islice(self.methods['process_spider_output'], start_index, None)
    for method_index, method in enumerate(pending, start=start_index):
        if method is None:
            continue
        next_index = method_index + 1
        try:
            # Might fail directly if the output value is not a generator.
            result = method(response=response, result=result, spider=spider)
        except Exception as ex:
            exception_result = self._process_spider_exception(
                response, spider, Failure(ex), next_index)
            if isinstance(exception_result, Failure):
                # Unhandled: re-raise the exception being handled.
                raise
            return exception_result

        if not _isiterable(result):
            msg = (f"Middleware {method.__qualname__} must return an "
                   f"iterable, got {type(result)}")
            raise _InvalidOutput(msg)
        result = self._evaluate_iterable(
            response, spider, result, next_index, recovered)

    return MutableChain(result, recovered)
def _process_spider_exception(self, response, spider, _failure, start_index=0):
    """Offer ``_failure.value`` to ``self.methods['process_spider_exception']``.

    Args:
        response: passed to every method as ``response``.
        spider: passed to every method as ``spider``.
        _failure: object whose ``.value`` is the exception being handled.
        start_index: position in the method list to start from.

    Returns:
        The result of ``self._process_spider_output`` when a method returns
        an iterable; otherwise ``_failure`` unchanged.

    Raises:
        _InvalidOutput: if a method returns something other than ``None``
            or an iterable.
    """
    exception = _failure.value
    # _InvalidOutput exceptions are never handed to the methods.
    if isinstance(exception, _InvalidOutput):
        return _failure

    pending = islice(self.methods['process_spider_exception'], start_index, None)
    for method_index, method in enumerate(pending, start=start_index):
        if method is None:
            continue
        result = method(response=response, exception=exception, spider=spider)
        if _isiterable(result):
            # An iterable stops exception handling: control moves to the
            # process_spider_output chain, starting after this method.
            return self._process_spider_output(
                response, spider, result, method_index + 1)
        if result is not None:
            msg = (f"Middleware {method.__qualname__} must return None "
                   f"or an iterable, got {type(result)}")
            raise _InvalidOutput(msg)
        # None means "not handled here": try the next method.

    return _failure
def process_spider_output(result, start_index=0):
    """Feed ``result`` through the ``process_spider_output`` methods.

    Uses ``self``, ``response`` and ``spider`` from the enclosing scope
    (not visible in this block).

    Args:
        result: the value handed to the first applicable method.
        start_index: position in the method list to start from.

    Returns:
        ``chain(result, recovered)``, or the value returned by
        ``process_spider_exception`` when a method raised on call and that
        value is not a ``Failure``.

    Raises:
        _InvalidOutput: if a method returns a non-iterable.
    """
    # Items in this iterable do not need to go through the
    # process_spider_output chain again; they already went through it from
    # the process_spider_exception method.
    recovered = MutableChain()

    def evaluate_iterable(iterable, index):
        """Yield from ``iterable``; divert handled exceptions to ``recovered``."""
        try:
            for element in iterable:
                yield element
        except Exception as ex:
            handled = process_spider_exception(Failure(ex), index + 1)
            if isinstance(handled, Failure):
                raise
            recovered.extend(handled)

    remaining = islice(self.methods['process_spider_output'], start_index, None)
    for position, middleware in enumerate(remaining, start=start_index):
        if middleware is None:
            continue
        # The call itself might fail if the output value is not a generator.
        try:
            result = middleware(response=response, result=result, spider=spider)
        except Exception as ex:
            handled = process_spider_exception(Failure(ex), position + 1)
            if isinstance(handled, Failure):
                raise
            return handled

        if not _isiterable(result):
            raise _InvalidOutput(
                'Middleware {} must return an iterable, got {}'
                .format(fname(middleware), type(result)))
        result = evaluate_iterable(result, position)

    return chain(result, recovered)
def process_spider_output(result, start_index=0):
    """Run ``result`` through each ``process_spider_output`` method in order.

    Depends on ``self``, ``response`` and ``spider`` being available in the
    enclosing scope (not visible in this block).

    Args:
        result: initial value passed to the first applicable method.
        start_index: index of the first method to apply.

    Returns:
        ``chain(result, recovered)`` on normal completion; if a method call
        raises and ``process_spider_exception`` returns a non-``Failure``
        value, that value is returned instead.

    Raises:
        _InvalidOutput: when a method returns something that is not iterable.
    """
    # Output collected here came out of the exception path and has already
    # been through the process_spider_output chain, so it is only appended.
    recovered = MutableChain()
    not_iterable_msg = 'Middleware {} must return an iterable, got {}'

    def evaluate_iterable(iterable, index):
        """Lazily re-yield ``iterable``, handling errors raised mid-iteration."""
        try:
            for entry in iterable:
                yield entry
        except Exception as ex:
            exception_result = process_spider_exception(Failure(ex), index + 1)
            if isinstance(exception_result, Failure):
                raise
            recovered.extend(exception_result)

    methods = islice(self.methods['process_spider_output'], start_index, None)
    for idx, method in enumerate(methods, start=start_index):
        if method is None:
            continue
        try:
            # Calling the method can fail directly when the output value is
            # not a generator.
            result = method(response=response, result=result, spider=spider)
        except Exception as ex:
            exception_result = process_spider_exception(Failure(ex), idx + 1)
            if isinstance(exception_result, Failure):
                raise
            return exception_result

        if _isiterable(result):
            result = evaluate_iterable(result, idx)
            continue
        raise _InvalidOutput(not_iterable_msg.format(fname(method), type(result)))

    return chain(result, recovered)
def process_request(request):
    """Run ``request`` through the ``process_request`` methods, then download.

    Generator that yields the value of each method call and delivers its
    result through ``defer.returnValue`` (presumably driven by
    ``defer.inlineCallbacks`` — the decorator is not visible here). Uses
    ``self``, ``spider`` and ``download_func`` from the enclosing scope.

    The first truthy value sent back for a method is returned immediately;
    if none is, the outcome of ``download_func`` is returned.

    Raises:
        _InvalidOutput: if a method yields something other than ``None``,
            a ``Response`` or a ``Request``.
    """
    for middleware in self.methods['process_request']:
        response = yield middleware(request=request, spider=spider)
        acceptable = response is None or isinstance(response, (Response, Request))
        if not acceptable:
            owner = six.get_method_self(middleware).__class__.__name__
            raise _InvalidOutput(
                'Middleware %s.process_request must return None, Response or Request, got %s'
                % (owner, response.__class__.__name__))
        if response:
            defer.returnValue(response)

    downloaded = yield download_func(request=request, spider=spider)
    defer.returnValue(downloaded)
def process_exception(_failure):
    """Offer the exception in ``_failure`` to the ``process_exception`` methods.

    Generator that yields the value of each method call and delivers its
    result through ``defer.returnValue`` (presumably driven by
    ``defer.inlineCallbacks`` — the decorator is not visible here). Uses
    ``self``, ``request`` and ``spider`` from the enclosing scope.

    The first truthy value sent back for a method is returned; if none is,
    ``_failure`` itself is returned.

    Raises:
        _InvalidOutput: if a method yields something other than ``None``,
            a ``Response`` or a ``Request``.
    """
    exception = _failure.value
    for handler in self.methods['process_exception']:
        response = yield handler(request=request, exception=exception, spider=spider)
        acceptable = response is None or isinstance(response, (Response, Request))
        if not acceptable:
            owner = six.get_method_self(handler).__class__.__name__
            raise _InvalidOutput(
                'Middleware %s.process_exception must return None, Response or Request, got %s'
                % (owner, type(response)))
        if response:
            defer.returnValue(response)

    defer.returnValue(_failure)
def process_request(request):
    """Run ``request`` through the ``process_request`` methods, then download.

    Generator that yields the value of each method call and delivers its
    result through ``defer.returnValue`` (presumably driven by
    ``defer.inlineCallbacks`` — the decorator is not visible here). Uses
    ``self``, ``spider`` and ``download_func`` from the enclosing scope.

    The first truthy value sent back for a method short-circuits the chain;
    otherwise the outcome of ``download_func`` is returned.

    Raises:
        _InvalidOutput: if a method yields something other than ``None``,
            a ``Response`` or a ``Request``.
    """
    for middleware in self.methods['process_request']:
        response = yield middleware(request=request, spider=spider)
        if not (response is None or isinstance(response, (Response, Request))):
            names = (
                middleware.__self__.__class__.__name__,
                response.__class__.__name__,
            )
            raise _InvalidOutput(
                'Middleware %s.process_request must return None, Response or Request, got %s'
                % names)
        if response:
            defer.returnValue(response)

    downloaded = yield download_func(request=request, spider=spider)
    defer.returnValue(downloaded)
def process_request(request):
    """Run ``request`` through the ``process_request`` methods, then download.

    Generator: each method's return value is wrapped with
    ``deferred_from_coro`` and yielded; the value sent back is inspected.
    Uses ``self``, ``spider`` and ``download_func`` from the enclosing scope
    (not visible in this block).

    Returns (as the generator's return value) the first truthy value sent
    back for a method, or else the outcome of ``download_func``.

    Raises:
        _InvalidOutput: if a method yields something other than ``None``,
            a ``Response`` or a ``Request``.
    """
    for middleware in self.methods['process_request']:
        pending = deferred_from_coro(middleware(request=request, spider=spider))
        response = yield pending
        acceptable = response is None or isinstance(response, (Response, Request))
        if not acceptable:
            raise _InvalidOutput(
                "Middleware %s.process_request must return None, Response or Request, got %s"
                % (middleware.__self__.__class__.__name__, response.__class__.__name__)
            )
        if response:
            return response

    downloaded = yield download_func(request=request, spider=spider)
    return downloaded
def _process_spider_exception(
        self, response: Response, spider: Spider, _failure: Failure,
        start_index: int = 0) -> Union[Failure, MutableChain]:
    """Offer ``_failure.value`` to ``self.methods['process_spider_exception']``.

    Args:
        response: passed to every method as ``response``.
        spider: passed to every method as ``spider``.
        _failure: failure whose ``.value`` is the exception being handled.
        start_index: position in the method list to start from.

    Returns:
        The already-available result of ``self._process_spider_output`` when
        a method returns an iterable; otherwise ``_failure`` unchanged.

    Raises:
        _InvalidOutput: if a method returns something other than ``None`` or
            an iterable, or if the Deferred from ``_process_spider_output``
            has not fired yet.
    """
    exception = _failure.value
    # _InvalidOutput exceptions are never handed to the methods.
    if isinstance(exception, _InvalidOutput):
        return _failure

    pending = islice(self.methods['process_spider_exception'], start_index, None)
    for method_index, method in enumerate(pending, start=start_index):
        if method is None:
            continue
        method = cast(Callable, method)
        result = method(response=response, exception=exception, spider=spider)

        if result is not None and not _isiterable(result):
            msg = (f"{method.__qualname__} must return None "
                   f"or an iterable, got {type(result)}")
            raise _InvalidOutput(msg)
        if result is None:
            # Not handled here: try the next method.
            continue

        # An iterable stops exception handling: control moves to the
        # process_spider_output chain, starting after this method.
        dfd: Deferred = self._process_spider_output(
            response, spider, result, method_index + 1)
        # _process_spider_output() returns a Deferred only because of
        # downgrading, so this can be simplified once downgrading is removed.
        if not dfd.called:
            # Waiting is forbidden here: otherwise this method would need to
            # return a Deferred too, which complicates the architecture.
            msg = f"Async iterable returned from {method.__qualname__} cannot be downgraded"
            raise _InvalidOutput(msg)
        # The result is available immediately when no downgrading happened.
        return dfd.result

    return _failure
def process_spider_input(response):
    """Run ``response`` through the ``process_spider_input`` methods.

    Uses ``self``, ``spider``, ``request`` and ``scrape_func`` from the
    enclosing scope (not visible in this block).

    Returns:
        ``scrape_func(response, request, spider)`` when every method returns
        ``None``; ``scrape_func(Failure(), request, spider)`` as soon as a
        method raises an exception other than ``_InvalidOutput``.

    Raises:
        _InvalidOutput: if a method returns a non-``None`` value (or raises
            ``_InvalidOutput`` itself).
    """
    for middleware in self.methods['process_spider_input']:
        try:
            outcome = middleware(response=response, spider=spider)
            if outcome is not None:
                raise _InvalidOutput(
                    'Middleware {} must return None or raise an exception, got {}'
                    .format(fname(middleware), type(outcome)))
        except _InvalidOutput:
            # Contract violations must surface, not be routed to scrape_func.
            raise
        except Exception:
            # Failure() is created with no arguments while the exception is
            # being handled — presumably it captures that active exception
            # (Twisted behaviour; confirm).
            return scrape_func(Failure(), request, spider)

    return scrape_func(response, request, spider)
def process_spider_input(response):
    """Apply every ``process_spider_input`` method to ``response``.

    Depends on ``self``, ``spider``, ``request`` and ``scrape_func`` from
    the enclosing scope (not visible in this block).

    Returns:
        The value of ``scrape_func``: called with ``response`` if all
        methods return ``None``, or with a fresh ``Failure()`` at the first
        method that raises something other than ``_InvalidOutput``.

    Raises:
        _InvalidOutput: when a method returns anything other than ``None``.
    """
    template = 'Middleware {} must return None or raise an exception, got {}'
    for hook in self.methods['process_spider_input']:
        try:
            returned = hook(response=response, spider=spider)
            if returned is not None:
                raise _InvalidOutput(template.format(fname(hook), type(returned)))
        except _InvalidOutput:
            # Let contract violations propagate unchanged.
            raise
        except Exception:
            # Any other error is handed to scrape_func as a Failure.
            return scrape_func(Failure(), request, spider)

    return scrape_func(response, request, spider)
def process_exception(failure):
    """Offer ``failure.value`` to the ``process_exception`` methods.

    Generator: each method's return value is wrapped with
    ``deferred_from_coro`` and yielded; the value sent back is inspected.
    Uses ``self``, ``request`` and ``spider`` from the enclosing scope
    (not visible in this block).

    Returns (as the generator's return value) the first truthy value sent
    back for a method, or else ``failure`` itself.

    Raises:
        _InvalidOutput: if a method yields something other than ``None``,
            a ``Response`` or a ``Request``.
    """
    exception = failure.value
    for handler in self.methods['process_exception']:
        pending = deferred_from_coro(
            handler(request=request, exception=exception, spider=spider))
        response = yield pending
        acceptable = response is None or isinstance(response, (Response, Request))
        if not acceptable:
            raise _InvalidOutput(
                "Middleware %s.process_exception must return None, Response or Request, got %s"
                % (handler.__self__.__class__.__name__, type(response))
            )
        if response:
            return response

    return failure
def check_req_rules(reqclass, requests, request_url):
    """Run every rule defined on ``reqclass`` against every request.

    A rule is any callable attribute of ``reqclass`` whose name does not
    start with a double underscore. Each rule is called with a single
    request and signals failure by raising ``AssertionError``.

    Args:
        reqclass: object whose callable attributes are the rules.
        requests: iterable of requests to validate.
        request_url: URL included in the error message for context.

    Raises:
        _InvalidOutput: if any rule raises ``AssertionError`` for a request.
    """
    # Collect (name, callable) pairs once, before looping over requests.
    req_rules = []
    for attr_name in dir(reqclass):
        if attr_name.startswith('__'):
            continue
        attr = getattr(reqclass, attr_name)
        if callable(attr):
            req_rules.append((attr_name, attr))

    for req in requests:
        for rule_name, rule in req_rules:
            try:
                rule(req)
            except AssertionError:
                raise _InvalidOutput(
                    "A request produced by the request with url %s has "
                    "failed the rule %s" % (request_url, rule_name))
def process_exception(failure):
    """Offer ``failure.value`` to the ``process_exception`` methods.

    Generator: each method's return value is wrapped with
    ``deferred_from_coro`` and yielded; the value sent back is inspected.
    Uses ``self``, ``request`` and ``spider`` from the enclosing scope
    (not visible in this block).

    Returns (as the generator's return value) the first truthy value sent
    back for a method, or else ``failure`` itself.

    Raises:
        _InvalidOutput: if a method yields something other than ``None``,
            a ``Response`` or a ``Request``.
    """
    exception = failure.value
    for method in self.methods['process_exception']:
        pending = deferred_from_coro(
            method(request=request, exception=exception, spider=spider))
        response = yield pending
        acceptable = response is None or isinstance(response, (Response, Request))
        if not acceptable:
            raise _InvalidOutput(
                f"Middleware {method.__qualname__} must return None, Response or "
                f"Request, got {type(response)}"
            )
        if response:
            return response

    return failure
def process_request(request: Request):
    """Run ``request`` through the ``process_request`` methods, then download.

    Generator: each method's return value is wrapped with
    ``deferred_from_coro`` and yielded; the value sent back is inspected.
    Uses ``self``, ``spider`` and ``download_func`` from the enclosing scope
    (not visible in this block).

    Returns (as the generator's return value) the first truthy value sent
    back for a method, or else the outcome of ``download_func``.

    Raises:
        _InvalidOutput: if a method yields something other than ``None``,
            a ``Response`` or a ``Request``.
    """
    for method in self.methods['process_request']:
        method = cast(Callable, method)
        pending = deferred_from_coro(method(request=request, spider=spider))
        response = yield pending
        acceptable = response is None or isinstance(response, (Response, Request))
        if not acceptable:
            raise _InvalidOutput(
                f"Middleware {method.__qualname__} must return None, Response or "
                f"Request, got {response.__class__.__name__}")
        if response:
            return response

    downloaded = yield download_func(request=request, spider=spider)
    return downloaded
def process_response(response):
    """Run ``response`` through the ``process_response`` methods.

    Generator that yields the value of each method call and delivers its
    result through ``defer.returnValue`` (presumably driven by
    ``defer.inlineCallbacks`` — the decorator is not visible here). Uses
    ``self``, ``request`` and ``spider`` from the enclosing scope.

    A ``Request`` (given as input or produced by a method) is returned
    immediately; otherwise the value from the last method is returned.

    Raises:
        AssertionError: if ``response`` is ``None``.
        _InvalidOutput: if a method yields something other than a
            ``Response`` or a ``Request``.
    """
    assert response is not None, 'Received None in process_response'
    if isinstance(response, Request):
        defer.returnValue(response)

    for middleware in self.methods['process_response']:
        response = yield middleware(request=request, response=response, spider=spider)
        if not isinstance(response, (Response, Request)):
            owner = six.get_method_self(middleware).__class__.__name__
            raise _InvalidOutput(
                'Middleware %s.process_response must return Response or Request, got %s'
                % (owner, type(response)))
        if isinstance(response, Request):
            defer.returnValue(response)

    defer.returnValue(response)
def check_item_rules(itemclass, items, request_url):
    """Run every rule defined on ``itemclass`` against every item.

    A rule is any callable attribute of ``itemclass`` whose name does not
    start with a double underscore. Each rule is called with a single item
    and signals failure by raising ``AssertionError``.

    Args:
        itemclass: object whose callable attributes are the rules.
        items: iterable of items to validate.
        request_url: URL included in the error message for context.

    Raises:
        _InvalidOutput: if any rule raises ``AssertionError`` for an item.
    """
    public_names = [name for name in dir(itemclass) if not name.startswith('__')]
    members = [(name, getattr(itemclass, name)) for name in public_names]
    # Only callables count as rules; plain data attributes are ignored.
    item_rules = [member for member in members if callable(member[1])]

    for item in items:
        for rule_name, rule in item_rules:
            try:
                rule(item)
            except AssertionError:
                raise _InvalidOutput(
                    "An item produced by the request with url %s has "
                    "failed the rule %s" % (request_url, rule_name))
def _process_spider_input(self, scrape_func, response, request, spider): for method in self.methods["process_spider_input"]: try: result = method(response=response, spider=spider) if result is not None: msg = (f"Middleware {_fname(method)} must return None " f"or raise an exception, got {type(result)}") raise _InvalidOutput(msg) except _InvalidOutput: raise except Exception: return scrape_func(Failure(), request, spider) return scrape_func(response, request, spider)
def process_exception(_failure):
    """Offer the exception in ``_failure`` to the ``process_exception`` methods.

    Generator that yields the value of each method call and delivers its
    result through ``defer.returnValue`` (presumably driven by
    ``defer.inlineCallbacks`` — the decorator is not visible here). Uses
    ``self``, ``request`` and ``spider`` from the enclosing scope.

    The first truthy value sent back for a method is returned; if none is,
    ``_failure`` itself is returned.

    Raises:
        _InvalidOutput: if a method yields something other than ``None``,
            a ``Response`` or a ``Request``.
    """
    exception = _failure.value
    for handler in self.methods['process_exception']:
        response = yield handler(request=request, exception=exception, spider=spider)
        if not (response is None or isinstance(response, (Response, Request))):
            details = (handler.__self__.__class__.__name__, type(response))
            raise _InvalidOutput(
                'Middleware %s.process_exception must return None, Response or Request, got %s'
                % details)
        if response:
            defer.returnValue(response)

    defer.returnValue(_failure)
def process_response(response):
    """Run ``response`` through the ``process_response`` methods.

    Generator: each method's return value is wrapped with
    ``deferred_from_coro`` and yielded; results are delivered through
    ``defer.returnValue`` (presumably under ``defer.inlineCallbacks`` — the
    decorator is not visible here). Uses ``self``, ``request`` and
    ``spider`` from the enclosing scope.

    A ``Request`` (given as input or produced by a method) is returned
    immediately; otherwise the value from the last method is returned.

    Raises:
        AssertionError: if ``response`` is ``None``.
        _InvalidOutput: if a method yields something other than a
            ``Response`` or a ``Request``.
    """
    assert response is not None, 'Received None in process_response'
    if isinstance(response, Request):
        defer.returnValue(response)

    for middleware in self.methods['process_response']:
        pending = deferred_from_coro(
            middleware(request=request, response=response, spider=spider))
        response = yield pending
        if not isinstance(response, (Response, Request)):
            raise _InvalidOutput(
                'Middleware %s.process_response must return Response or Request, got %s'
                % (middleware.__self__.__class__.__name__, type(response)))
        if isinstance(response, Request):
            defer.returnValue(response)

    defer.returnValue(response)
def process_spider_input(response):
    """Run ``response`` through the ``process_spider_input`` methods.

    Uses ``self``, ``spider``, ``request`` and ``scrape_func`` from the
    enclosing scope (not visible in this block).

    Returns:
        ``scrape_func(response, request, spider)`` when every method returns
        ``None``; ``scrape_func(Failure(), request, spider)`` as soon as a
        method raises an exception other than ``_InvalidOutput``.

    Raises:
        _InvalidOutput: if a method returns a non-``None`` value.
    """
    # Call every middleware to process the response.
    for method in self.methods['process_spider_input']:
        try:
            result = method(response=response, spider=spider)
            if result is None:
                continue
            # A middleware's process_spider_input must return None.
            msg = (f"Middleware {_fname(method)} must return None "
                   f"or raise an exception, got {type(result)}")
            raise _InvalidOutput(msg)
        except _InvalidOutput:
            raise
        except Exception:
            return scrape_func(Failure(), request, spider)

    return scrape_func(response, request, spider)
def process_request(request):
    """Run ``request`` through the ``process_request`` methods, then download.

    Generator: each method's return value is wrapped with
    ``deferred_from_coro`` and yielded; the value sent back is inspected.
    Uses ``self``, ``spider`` and ``download_func`` from the enclosing scope
    (not visible in this block).

    Returns (as the generator's return value) the first truthy value sent
    back for a method, or else the outcome of ``download_func``.

    Raises:
        _InvalidOutput: if a method yields something other than ``None``,
            a ``Response`` or a ``Request``.
    """
    for method in self.methods['process_request']:
        # NOTE(review): per the original author's comment, deferred_from_coro
        # converts the middleware method's asyncio result into a
        # reactor-style (Twisted) object before it is yielded — confirm.
        pending = deferred_from_coro(method(request=request, spider=spider))
        response = yield pending
        acceptable = response is None or isinstance(response, (Response, Request))
        if not acceptable:
            raise _InvalidOutput(
                f"Middleware {method.__self__.__class__.__name__}"
                ".process_request must return None, Response or "
                f"Request, got {response.__class__.__name__}")
        if response:
            return response

    downloaded = yield download_func(request=request, spider=spider)
    return downloaded
async def process_request(self, spider, request):
    """Run ``request`` through ``self.methods['process_request']``.

    Each method is called with ``request`` and ``spider``. If the call
    returns an awaitable (coroutine functions, but also sync wrappers,
    decorated callables or callable objects that hand back a coroutine),
    it is awaited before the result is validated.

    Args:
        spider: passed to every method as ``spider``.
        request: passed to every method as ``request``.

    Returns:
        The first truthy value produced by a method, or ``None`` when no
        method produces one.

    Raises:
        _InvalidOutput: if a method produces something other than ``None``,
            a ``Response`` or a ``Request``.
    """
    # Local import keeps this block self-contained; the original relied on
    # a module-level ``iscoroutinefunction`` import instead.
    import inspect

    for method in self.methods['process_request']:
        response = method(request=request, spider=spider)
        # Checking the returned value (rather than the callable) also covers
        # callables that are not ``async def`` but still return a coroutine.
        if inspect.isawaitable(response):
            response = await response
        if response is not None and not isinstance(response, (Response, Request)):
            raise _InvalidOutput(
                "Middleware %s.process_request must return None, Response or Request, got %s"
                % (method.__self__.__class__.__name__, response.__class__.__name__))
        if response:
            return response
def process_request(request):
    """Run ``request`` through the ``process_request`` methods, then download.

    Generator that yields the value of each method call and delivers its
    result through ``defer.returnValue`` (presumably driven by
    ``defer.inlineCallbacks`` — the decorator is not visible here). Uses
    ``self``, ``spider`` and ``download_func`` from the enclosing scope.

    Raises:
        _InvalidOutput: if a method yields something other than ``None``,
            a ``Response`` or a ``Request``.
    """
    # Run each downloader middleware's process_request method in turn.
    for middleware in self.methods['process_request']:
        # This is where request and spider are passed into process_request.
        response = yield middleware(request=request, spider=spider)
        acceptable = response is None or isinstance(response, (Response, Request))
        if not acceptable:
            owner = six.get_method_self(middleware).__class__.__name__
            raise _InvalidOutput(
                'Middleware %s.process_request must return None, Response or Request, got %s'
                % (owner, response.__class__.__name__))
        if response:
            defer.returnValue(response)

    # No middleware produced a value, so call the registered download
    # function. NOTE(review): per the original author's comment this is
    # Downloader._enqueue_request — not verifiable from this block.
    downloaded = yield download_func(request=request, spider=spider)
    defer.returnValue(downloaded)
def process_spider_input(response):
    """Run ``response`` through the ``process_spider_input`` methods.

    NOTE(review): per the original author's comment, this is invoked when
    the engine hands a response to the spider, passing through the spider
    middlewares — not verifiable from this block. Uses ``self``, ``spider``,
    ``request`` and ``scrape_func`` from the enclosing scope.

    Returns:
        ``scrape_func(response, request, spider)`` when every method returns
        ``None``; ``scrape_func(Failure(), request, spider)`` as soon as a
        method raises an exception other than ``_InvalidOutput``.

    Raises:
        _InvalidOutput: if a method returns a non-``None`` value.
    """
    # Run each spider middleware's process_spider_input in turn.
    for middleware in self.methods['process_spider_input']:
        try:
            outcome = middleware(response=response, spider=spider)
            if outcome is not None:
                raise _InvalidOutput(
                    'Middleware {} must return None or raise an exception, got {}'
                    .format(fname(middleware), type(outcome)))
        except _InvalidOutput:
            raise
        except Exception:
            return scrape_func(Failure(), request, spider)

    # After the middlewares are done the response is handed to scrape_func.
    # NOTE(review): the original comment says this registers an event on the
    # event loop that eventually calls the spider's parse method — confirm.
    return scrape_func(response, request, spider)
def process_request(request):
    """Run ``request`` through the ``process_request`` methods, then download.

    Generator that yields the value of each method call and delivers its
    result through ``defer.returnValue`` (presumably driven by
    ``defer.inlineCallbacks`` — the decorator is not visible here). Uses
    ``self``, ``spider`` and ``download_func`` from the enclosing scope.

    Raises:
        _InvalidOutput: if a method yields something other than ``None``,
            a ``Response`` or a ``Request``.
    """
    # Run the downloader middlewares' process_request methods one by one
    # before downloading (pre-processing, handling, validation, ...).
    for middleware in self.methods['process_request']:
        response = yield middleware(request=request, spider=spider)
        if not (response is None or isinstance(response, (Response, Request))):
            names = (
                six.get_method_self(middleware).__class__.__name__,
                response.__class__.__name__,
            )
            raise _InvalidOutput(
                'Middleware %s.process_request must return None, Response or Request, got %s'
                % names)
        if response:
            defer.returnValue(response)

    # This is the actual download. NOTE(review): the original comment states
    # download_func is the _enqueue_request method — not verifiable here.
    downloaded = yield download_func(request=request, spider=spider)
    defer.returnValue(downloaded)
def process_response(response: Union[Response, Request]):
    """Run ``response`` through the ``process_response`` methods.

    Generator: each method's return value is wrapped with
    ``deferred_from_coro`` and yielded; the value sent back replaces
    ``response``. Uses ``self``, ``request`` and ``spider`` from the
    enclosing scope (not visible in this block).

    Returns (as the generator's return value) a ``Request`` as soon as one
    is seen (as input or from a method), otherwise the final response.

    Raises:
        TypeError: if ``response`` is ``None``.
        _InvalidOutput: if a method yields something other than a
            ``Response`` or a ``Request``.
    """
    if response is None:
        raise TypeError("Received None in process_response")
    if isinstance(response, Request):
        return response

    for method in self.methods['process_response']:
        pending = deferred_from_coro(
            method(request=request, response=response, spider=spider))
        response = yield pending
        if not isinstance(response, (Response, Request)):
            raise _InvalidOutput(
                f"Middleware {method.__qualname__} must return Response or Request, "
                f"got {type(response)}")
        if isinstance(response, Request):
            return response

    return response
def process_response(response):
    """Run ``response`` through the ``process_response`` methods.

    Generator: each method's return value is wrapped with
    ``deferred_from_coro`` and yielded; the value sent back replaces
    ``response``. Uses ``self``, ``request`` and ``spider`` from the
    enclosing scope (not visible in this block).

    Returns (as the generator's return value) a ``Request`` as soon as one
    is seen (as input or from a method), otherwise the final response.

    Raises:
        TypeError: if ``response`` is ``None``.
        _InvalidOutput: if a method yields something other than a
            ``Response`` or a ``Request``.
    """
    if response is None:
        raise TypeError("Received None in process_response")
    if isinstance(response, Request):
        return response

    for middleware in self.methods['process_response']:
        pending = deferred_from_coro(
            middleware(request=request, response=response, spider=spider))
        response = yield pending
        if not isinstance(response, (Response, Request)):
            raise _InvalidOutput(
                "Middleware %s.process_response must return Response or Request, got %s"
                % (middleware.__self__.__class__.__name__, type(response))
            )
        if isinstance(response, Request):
            return response

    return response
def process_spider_input(response):
    """Run ``response`` through the ``process_spider_input`` methods, with logging.

    Logs once before each method call (body length and request URL) and
    once before handing the response to ``scrape_func``. Uses ``self``,
    ``spider``, ``request``, ``scrape_func`` and ``logger`` from the
    enclosing scope (not visible in this block).

    Returns:
        ``scrape_func(response, request, spider)`` when every method returns
        ``None``; ``scrape_func(Failure(), request, spider)`` as soon as a
        method raises an exception other than ``_InvalidOutput``.

    Raises:
        _InvalidOutput: if a method returns a non-``None`` value.
    """
    for middleware in self.methods['process_spider_input']:
        # Logged outside the try block, so an error here is not converted
        # into a Failure.
        logger.info('[spider_input] Processing %s pages in url: %s',
                    len(response.body), request.url)
        try:
            outcome = middleware(response=response, spider=spider)
            if outcome is not None:
                raise _InvalidOutput(
                    'Middleware {} must return None or raise an exception, got {}'
                    .format(fname(middleware), type(outcome)))
        except _InvalidOutput:
            raise
        except Exception:
            return scrape_func(Failure(), request, spider)

    logger.info('[spider_input] Processing input for url: %s', request.url)
    return scrape_func(response, request, spider)
def _process_spider_input(self, scrape_func: ScrapeFunc, response: Response,
                          request: Request, spider: Spider) -> Any:
    """Run ``response`` through ``self.methods['process_spider_input']``.

    Args:
        scrape_func: callable invoked as ``scrape_func(x, request, spider)``
            where ``x`` is the response or a ``Failure``.
        response: passed to every method as ``response``.
        request: forwarded to ``scrape_func``.
        spider: passed to every method and to ``scrape_func``.

    Returns:
        ``scrape_func(response, request, spider)`` when every method returns
        ``None``; ``scrape_func(Failure(), request, spider)`` as soon as a
        method raises an exception other than ``_InvalidOutput``.

    Raises:
        _InvalidOutput: if a method returns a non-``None`` value.
    """
    for method in self.methods['process_spider_input']:
        try:
            result = method(response=response, spider=spider)
            if result is None:
                continue
            msg = (
                f"Middleware {method.__qualname__} must return None "
                f"or raise an exception, got {type(result)}")
            raise _InvalidOutput(msg)
        except _InvalidOutput:
            # Contract violations must surface, not be routed to scrape_func.
            raise
        except Exception:
            return scrape_func(Failure(), request, spider)

    return scrape_func(response, request, spider)
def process_spider_input(response):
    """Run ``response`` through the ``process_spider_input`` methods.

    Uses ``self``, ``spider``, ``request`` and ``scrape_func`` from the
    enclosing scope (not visible in this block).

    When a method raises an exception other than ``_InvalidOutput``, the
    exception object is passed to ``scrape_func``. If ``scrape_func`` hands
    back that very same object it is raised; otherwise whatever
    ``scrape_func`` returned is returned.

    Returns:
        ``scrape_func(response, request, spider)`` when every method returns
        ``None``, or the value ``scrape_func`` produced for a raised error.

    Raises:
        _InvalidOutput: if a method returns a non-``None`` value.
        Exception: the original error, when ``scrape_func`` returns it
            unchanged.
    """
    for middleware in self.methods['process_spider_input']:
        try:
            outcome = middleware(response=response, spider=spider)
            if outcome is not None:
                msg = "Middleware {} must return None or raise an exception, got {}"
                raise _InvalidOutput(
                    msg.format(_fname(middleware), type(outcome)))
        except _InvalidOutput:
            raise
        except Exception as error:
            handled = scrape_func(error, request, spider)
            if handled is not error:
                return handled
            # Same object came back: nothing handled it, so raise it.
            raise handled

    return scrape_func(response, request, spider)
def process_response(response):
    """Run ``response`` through the ``process_response`` methods.

    Generator that yields the value of each method call and delivers its
    result through ``defer.returnValue`` (presumably driven by
    ``defer.inlineCallbacks`` — the decorator is not visible here). Uses
    ``self``, ``request`` and ``spider`` from the enclosing scope.

    A ``Request`` (given as input or produced by a method) is returned
    immediately; otherwise the value from the last method is returned.

    Raises:
        AssertionError: if ``response`` is ``None``.
        _InvalidOutput: if a method yields something other than a
            ``Response`` or a ``Request``.
    """
    assert response is not None, 'Received None in process_response'
    if isinstance(response, Request):
        defer.returnValue(response)

    # Run the process_response methods the downloader middlewares define,
    # in order.
    for middleware in self.methods['process_response']:
        response = yield middleware(request=request, response=response,
                                    spider=spider)
        if not isinstance(response, (Response, Request)):
            details = (
                six.get_method_self(middleware).__class__.__name__,
                type(response),
            )
            raise _InvalidOutput(
                'Middleware %s.process_response must return Response or Request, got %s'
                % details)
        if isinstance(response, Request):
            defer.returnValue(response)

    defer.returnValue(response)
def process_spider_exception(_failure, start_index=0):
    """Offer the exception in ``_failure`` to the exception-handling methods.

    Uses ``self``, ``response`` and ``spider`` from the enclosing scope
    (not visible in this block).

    Args:
        _failure: object whose ``.value`` is the exception being handled.
        start_index: position in the method list to start from.

    Returns:
        The result of ``process_spider_output`` when a method returns an
        iterable; otherwise ``_failure`` unchanged.

    Raises:
        _InvalidOutput: if a method returns something other than ``None``
            or an iterable.
    """
    exception = _failure.value
    # _InvalidOutput exceptions are never handed to the methods.
    if isinstance(exception, _InvalidOutput):
        return _failure

    template = 'Middleware {} must return None or an iterable, got {}'
    candidates = islice(self.methods['process_spider_exception'], start_index, None)
    for idx, handler in enumerate(candidates, start=start_index):
        if handler is None:
            continue
        outcome = handler(response=response, exception=exception, spider=spider)
        if _isiterable(outcome):
            # An iterable stops exception handling: control moves to the
            # process_spider_output chain, starting after this method.
            return process_spider_output(outcome, idx + 1)
        if outcome is not None:
            raise _InvalidOutput(template.format(fname(handler), type(outcome)))
        # None means "not handled here": try the next method.

    return _failure