Exemplo n.º 1
0
 def post_process(self, output):
     """Check that every item in ``output`` has all required fields set.

     For each element of ``output`` accepted by ``is_item``, every name in
     ``self.args`` must be present in the item and must not be ``None``.
     Elements that are not items are skipped.

     Raises:
         ContractFail: on the first item with a missing or ``None`` field;
             the message lists the offending field names and the item.
     """
     for x in output:
         if not is_item(x):
             continue
         # Wrap the item once instead of up to twice per checked field.
         adapter = ItemAdapter(x)
         # Identity test on purpose: ``== None`` would defer to the field
         # value's own ``__eq__``, which need not answer sensibly for None.
         missing = [
             arg for arg in self.args
             if arg not in adapter or adapter[arg] is None
         ]
         if missing:
             missing_str = ", ".join(missing)
             raise ContractFail("Missing or None fields: %s. Item is %s" % (missing_str, x))
Exemplo n.º 2
0
 def _process_spidermw_output(
     self,
     output: Any,
     request: Request,
     response: Response,
     spider: Spider,
 ) -> Optional[Deferred]:
     """Dispatch a single value produced by a spider callback.

     * A ``Request`` is handed to ``self.crawler.engine.crawl``.
     * An item is passed to ``self.itemproc.process_item``; the resulting
       deferred gets ``self._itemproc_finished`` attached and is returned.
     * ``None`` is ignored.
     * Anything else is reported through ``logger.error``.

     Returns the item-processing deferred for items, ``None`` otherwise.
     """
     assert self.slot is not None  # typing

     if isinstance(output, Request):
         self.crawler.engine.crawl(request=output)
         return None

     if is_item(output):
         # One more item is now in flight through the item processor.
         self.slot.itemproc_size += 1
         item_dfd = self.itemproc.process_item(output, spider)
         item_dfd.addBoth(self._itemproc_finished, output, response, spider)
         return item_dfd

     if output is not None:
         log_args = {
             'request': request,
             'typename': type(output).__name__,
         }
         logger.error(
             'Spider must return request, item, or None, got %(typename)r in %(request)s',
             log_args,
             extra={'spider': spider},
         )
     return None
Exemplo n.º 3
0
 def _process_spidermw_output(self, output, request, response, spider):
     """Route one value returned by the spider to the right handler.

     Requests go to the engine, items go through ``self.itemproc``, ``None``
     is ignored, and any other type is logged as an error. For items the
     deferred returned by ``process_item`` is returned to the caller; in
     every other case the method returns ``None``.
     """
     if isinstance(output, Request):
         # Hand the request over to the engine for processing.
         self.crawler.engine.crawl(request=output, spider=spider)
         return None

     if is_item(output):
         # Bump the slot's counter of items currently being processed.
         self.slot.itemproc_size += 1
         # Run the item through the item processor (the original note calls
         # it the ItemPipelineManager -- not verifiable from this block).
         item_dfd = self.itemproc.process_item(output, spider)
         # Append self._itemproc_finished to the item's callback chain; it
         # runs for both success and failure.
         item_dfd.addBoth(self._itemproc_finished, output, response, spider)
         return item_dfd

     if output is None:
         return None

     log_args = {
         'request': request,
         'typename': type(output).__name__,
     }
     logger.error(
         'Spider must return request, item, or None, got %(typename)r in %(request)s',
         log_args,
         extra={'spider': spider},
     )
     return None
 def item_scraped(self, item, spider):
     """Export a scraped item and write it out, tagged with its type name.

     Objects rejected by ``is_item`` are logged as an error and dropped.
     Otherwise the item is converted with ``self.exporter.export_item``,
     the class name of the original item is stored under ``"_type"``
     (unless the exported mapping already has that key), and the result is
     passed to ``self._write_item``.

     ``spider`` is accepted for signature compatibility and is not used.
     """
     if not is_item(item):
         # Pass the object as a logging argument rather than formatting
         # eagerly: ``"%s" % item`` raises TypeError when ``item`` is a
         # tuple whose length is not exactly 1.
         self.logger.error("Wrong item type: %s", item)
         return
     # Capture the type name before ``item`` is rebound to the exported form.
     type_ = type(item).__name__
     item = self.exporter.export_item(item)
     item.setdefault("_type", type_)
     self._write_item(item)
Exemplo n.º 5
0
 def post_process(self, output):
     """Verify that each item in ``output`` contains every required field.

     Elements rejected by ``is_item`` are skipped. For the first item that
     lacks one or more of the names in ``self.args``, ``ContractFail`` is
     raised with a comma-separated list of the absent names.
     """
     for entry in output:
         if not is_item(entry):
             continue
         absent = []
         for field_name in self.args:
             if field_name not in ItemAdapter(entry):
                 absent.append(field_name)
         if not absent:
             continue
         missing_fields = ", ".join(absent)
         raise ContractFail(f"Missing fields: {missing_fields}")
Exemplo n.º 6
0
    def run_callback(self, response, callback, cb_kwargs=None):
        """Invoke ``callback`` on ``response`` and sort what it produces.

        ``cb_kwargs`` (if truthy) is expanded as keyword arguments to the
        callback. The callback's result is normalised with
        ``iterate_spider_output``; items and ``Request`` objects are
        collected separately and anything else is discarded.

        Returns a ``(items, requests)`` tuple of lists.
        """
        kwargs = cb_kwargs if cb_kwargs else {}
        scraped_items = []
        follow_ups = []

        produced = callback(response, **kwargs)
        for entry in iterate_spider_output(produced):
            if is_item(entry):
                scraped_items.append(entry)
                continue
            if isinstance(entry, Request):
                follow_ups.append(entry)
        return scraped_items, follow_ups
Exemplo n.º 7
0
 def _serialize_value(self, value):
     """Recursively convert ``value`` into its exported representation.

     * ``_BaseItem`` instances are exported with ``self.export_item``.
     * Other items become a ``dict`` built from ``self._serialize_item``.
     * List-like values are serialized element by element into a list.
     * ``str``/``bytes`` are converted with ``to_bytes`` when
       ``self.binary`` is truthy, otherwise with ``to_unicode``, using
       ``self.encoding``.
     * Every other value is returned unchanged.
     """
     if isinstance(value, _BaseItem):
         return self.export_item(value)
     if is_item(value):
         serialized_pairs = self._serialize_item(value)
         return dict(serialized_pairs)
     if is_listlike(value):
         return [self._serialize_value(element) for element in value]

     # Choose the text converter that matches the exporter's output mode.
     if self.binary:
         convert = to_bytes
     else:
         convert = to_unicode
     if not isinstance(value, (str, bytes)):
         return value
     return convert(value, encoding=self.encoding)
Exemplo n.º 8
0
def arg_to_iter(arg):
    """Return ``arg`` in a form that can be iterated over.

    ``None`` becomes an empty list. An object that has ``__iter__`` is
    returned as-is, unless it is an instance of ``_ITERABLE_SINGLE_VALUES``
    or is accepted by ``is_item``; such objects, like every non-iterable
    value, are wrapped in a one-element list.
    """
    if arg is None:
        return []

    # Treated as one value when it cannot be iterated, or when it is
    # iterable but is a designated single-value type or an item.
    treat_as_single = (
        not hasattr(arg, '__iter__')
        or isinstance(arg, _ITERABLE_SINGLE_VALUES)
        or is_item(arg)
    )
    if treat_as_single:
        return [arg]
    return arg
Exemplo n.º 9
0
 async def _process_spidermw_output(self, output, request, response,
                                    spider):
     """Handle one value returned by the spider (coroutine variant).

     An item is awaited through ``self.itemproc.process_item`` and then
     ``self._itemproc_finished`` is awaited with the original output and
     the processed result. ``None`` is ignored. Any other value is logged
     as an error.
     """
     # NOTE(review): there is no Request branch in this variant, so a
     # Request that is_item rejects ends up in the error log even though
     # the message lists "request" as acceptable -- confirm requests are
     # filtered out before this method is called.
     if is_item(output):
         self.slot.itemproc_size += 1
         processed = await self.itemproc.process_item(output, spider)
         await self._itemproc_finished(output, processed, response, spider)
         return

     if output is None:
         return

     log_args = {
         'request': request,
         'typename': type(output).__name__,
     }
     logger.error(
         'Spider must return request, item, or None, got %(typename)r in %(request)s',
         log_args,
         extra={'spider': spider},
     )
Exemplo n.º 10
0
 def default(self, o):
     """Return a JSON-friendly stand-in for ``o``.

     Sets become lists; datetimes, dates and times are formatted with
     ``self.DATE_FORMAT`` / ``self.TIME_FORMAT``; ``Decimal`` and
     ``Deferred`` objects are stringified; items are converted with
     ``ItemAdapter(o).asdict()``; requests and responses are rendered as
     short ``<Type method-or-status url>`` strings. Everything else is
     delegated to the parent class.
     """
     if isinstance(o, set):
         return list(o)
     # datetime.datetime must be tested before datetime.date, because it is
     # a subclass of date.
     if isinstance(o, datetime.datetime):
         combined_format = "%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT)
         return o.strftime(combined_format)
     if isinstance(o, datetime.date):
         return o.strftime(self.DATE_FORMAT)
     if isinstance(o, datetime.time):
         return o.strftime(self.TIME_FORMAT)
     if isinstance(o, decimal.Decimal):
         return str(o)
     if isinstance(o, defer.Deferred):
         return str(o)
     if is_item(o):
         return ItemAdapter(o).asdict()
     if isinstance(o, Request):
         return "<%s %s %s>" % (type(o).__name__, o.method, o.url)
     if isinstance(o, Response):
         return "<%s %s %s>" % (type(o).__name__, o.status, o.url)
     return super().default(o)
Exemplo n.º 11
0
 def _serialize_value(self, value, pre=None, field_filter=None):
     try:
         if isinstance(value, dict):
             return dict(
                 self._serialize_dict(value,
                                      pre=pre,
                                      field_filter=field_filter))
         elif isinstance(value, _BaseItem):
             return self.export_item(value,
                                     pre=pre,
                                     field_filter=field_filter)
         elif is_item(value):
             return dict(self._serialize_item(value))
         value = super(TextDictKeyPythonItemExporter,
                       self)._serialize_value(value)
     except UnicodeDecodeError as e:
         if self.ensure_base64 and isinstance(value, bytes):
             value = to_unicode(base64.b64encode(value))
         else:
             raise e
     return value
Exemplo n.º 12
0
 def default(self, o):
     """Convert ``o`` to something the JSON encoder can emit.

     Handles sets, ``datetime`` / ``date`` / ``time`` objects (formatted
     with ``self.DATE_FORMAT`` and ``self.TIME_FORMAT``), ``Decimal``,
     ``Deferred``, items, ``Request`` and ``Response``. Any other object is
     passed to the parent implementation.
     """
     if isinstance(o, set):
         return list(o)

     # Temporal types: datetime is checked ahead of date because it is a
     # subclass of date.
     if isinstance(o, datetime.datetime):
         return o.strftime(f"{self.DATE_FORMAT} {self.TIME_FORMAT}")
     if isinstance(o, datetime.date):
         return o.strftime(self.DATE_FORMAT)
     if isinstance(o, datetime.time):
         return o.strftime(self.TIME_FORMAT)

     # Objects that are represented by their plain string form.
     if isinstance(o, decimal.Decimal):
         return str(o)
     if isinstance(o, defer.Deferred):
         return str(o)

     if is_item(o):
         adapter = ItemAdapter(o)
         return adapter.asdict()

     if isinstance(o, Request):
         return f"<{type(o).__name__} {o.method} {o.url}>"
     if isinstance(o, Response):
         return f"<{type(o).__name__} {o.status} {o.url}>"

     return super().default(o)
Exemplo n.º 13
0
 def _process_spidermw_output(self, output, request, response, spider):
     """Act on a single object yielded by the spider.

     ``Request`` objects are scheduled via ``self.crawler.engine.crawl``.
     Items are sent to ``self.itemproc.process_item`` and the deferred it
     returns (with ``self._itemproc_finished`` attached for both outcomes)
     is returned. ``None`` is silently accepted. Any other type produces an
     error log entry. Non-item outputs yield a ``None`` return value.
     """
     if isinstance(output, Request):
         self.crawler.engine.crawl(request=output, spider=spider)
         return None

     if is_item(output):
         # Track the item as in flight before starting to process it.
         self.slot.itemproc_size += 1
         pending = self.itemproc.process_item(output, spider)
         pending.addBoth(self._itemproc_finished, output, response, spider)
         return pending

     if output is not None:
         details = {
             "request": request,
             "typename": type(output).__name__,
         }
         logger.error(
             "Spider must return request, item, or None, got %(typename)r in %(request)s",
             details,
             extra={"spider": spider},
         )
     return None
Exemplo n.º 14
0
 def _is_relevant(self, value):
     return isinstance(value, self.relevant_classes) or is_item(value)