def _urlencode(seq, enc='utf-8'):
    """Encode an iterable of ``(key, value-or-values)`` pairs as a query string.

    A falsy ``seq`` (empty or ``None``) short-circuits to ``''``. Otherwise
    each pair is expanded to one ``(key, value)`` tuple per value: a value
    that ``is_listlike`` reports as list-like contributes one tuple per
    element, anything else contributes a single tuple. Keys and values are
    passed through ``to_bytes(..., enc)`` before ``urlencode`` is applied.
    """
    if not seq:
        return ''

    pairs = []
    for key, raw in seq:
        candidates = raw if is_listlike(raw) else [raw]
        for candidate in candidates:
            pairs.append((to_bytes(key, enc), to_bytes(candidate, enc)))
    return urlencode(pairs, doseq=1)
def batch_make_requests(self, spider, depth=0, link_or_url_list=None, meta=None):
    """Yield one request per entry of ``link_or_url_list``.

    Each entry is handed to ``self.make_request(spider, depth, entry, meta)``
    and the result is yielded. Nothing is yielded when ``link_or_url_list``
    is empty, ``None``, or not list-like according to ``is_listlike``.

    :param spider: spider passed through to ``make_request``
    :param depth: depth passed through to ``make_request``
    :param link_or_url_list: list-like collection of links or URLs
    :param meta: dict passed through to ``make_request``; a fresh empty dict
        is used when omitted, and the same dict is shared by every request
        of one batch
    """
    # The defaults used to be mutable literals ([] / {}), which Python
    # creates once and shares between calls; since meta is handed to
    # make_request, any mutation there would leak into later batches.
    if meta is None:
        meta = {}
    # Cheap truthiness test first; both conditions must hold either way.
    if link_or_url_list and is_listlike(link_or_url_list):
        for link_or_url in link_or_url_list:
            yield self.make_request(spider, depth, link_or_url, meta)
def _serialize_value(self, value):
    """Recursively convert ``value`` into an exportable form.

    * ``BaseItem`` instances are delegated to ``self.export_item``.
    * dicts are rebuilt from ``self._serialize_dict``.
    * list-like values (per ``is_listlike``) become a list whose elements
      are serialized recursively.
    * text/bytes values are encoded with ``to_bytes`` when ``self.binary``
      is truthy, otherwise with ``to_unicode``, using ``self.encoding``.
    * anything else is returned unchanged.
    """
    if isinstance(value, BaseItem):
        return self.export_item(value)
    if isinstance(value, dict):
        return dict(self._serialize_dict(value))
    if is_listlike(value):
        return [self._serialize_value(element) for element in value]

    if self.binary:
        encoder = to_bytes
    else:
        encoder = to_unicode
    if not isinstance(value, (six.text_type, bytes)):
        return value
    return encoder(value, encoding=self.encoding)
def _serialize_value(self, value):
    """Recursively convert ``value`` into an exportable form.

    * ``Item`` instances are delegated to ``self.export_item``.
    * other values accepted by ``is_item`` are rebuilt as a dict from
      ``self._serialize_item``.
    * list-like values (per ``is_listlike``) become a list whose elements
      are serialized recursively.
    * ``str``/``bytes`` values are encoded with ``to_bytes`` when
      ``self.binary`` is truthy, otherwise with ``to_unicode``, using
      ``self.encoding``.
    * anything else is returned unchanged.
    """
    if isinstance(value, Item):
        return self.export_item(value)
    if is_item(value):
        return dict(self._serialize_item(value))
    if is_listlike(value):
        return [self._serialize_value(element) for element in value]

    if self.binary:
        encoder = to_bytes
    else:
        encoder = to_unicode
    if not isinstance(value, (str, bytes)):
        return value
    return encoder(value, encoding=self.encoding)
def _export_xml_field(self, name, serialized_value):
    """Write ``serialized_value`` as an XML element called ``name``.

    The element is opened on ``self.xg``, its content is written, and the
    element is closed again. Content depends on the value:

    * objects exposing ``items`` produce one child element per key/value;
    * list-like values (per ``is_listlike``) produce one ``value`` child
      per element;
    * anything else is written through ``self._xg_characters``.
    """
    self.xg.startElement(name, {})

    if hasattr(serialized_value, 'items'):
        children = serialized_value.items()
    elif is_listlike(serialized_value):
        # Lazily pair every element with the fixed child tag.
        children = (('value', element) for element in serialized_value)
    else:
        children = ()
        self._xg_characters(serialized_value)

    for child_name, child_value in children:
        self._export_xml_field(child_name, child_value)

    self.xg.endElement(name)
def _export_xml_field(self, name, serialized_value):
    """Emit one XML element named ``name`` holding ``serialized_value``.

    Mapping-like values (anything with an ``items`` attribute) are expanded
    into one child element per entry. List-like values (per ``is_listlike``)
    are expanded into one ``value`` child per element. Every other value is
    written as the element's content via ``self._xg_characters``.
    """
    self.xg.startElement(name, {})

    if hasattr(serialized_value, "items"):
        for key, child in serialized_value.items():
            self._export_xml_field(key, child)
    elif not is_listlike(serialized_value):
        # Leaf value: written as the content of the element.
        self._xg_characters(serialized_value)
    else:
        for child in serialized_value:
            self._export_xml_field("value", child)

    self.xg.endElement(name)
def _serialize_value(self, value):
    """Recursively convert ``value`` into an exportable form.

    * ``BaseItem`` instances are delegated to ``self.export_item``.
    * dicts are rebuilt from ``self._serialize_dict``.
    * list-like values (per ``is_listlike``) become a list whose elements
      are serialized recursively.
    * text/bytes values are encoded with ``to_bytes`` when ``self.binary``
      is truthy, otherwise with ``to_unicode``, using ``self.encoding``.
    * any other value (numbers, ``None``, booleans, ...) is returned
      unchanged.
    """
    if isinstance(value, BaseItem):
        return self.export_item(value)
    if isinstance(value, dict):
        return dict(self._serialize_dict(value))
    if is_listlike(value):
        return [self._serialize_value(v) for v in value]

    # Only text and bytes can be re-encoded. Previously every remaining
    # value was handed to to_bytes/to_unicode, so a nested int, float or
    # None could not be exported (presumably those helpers reject non-text
    # input, as the scrapy helpers of that name do -- confirm).
    # type(u'') is the text type on both Python 2 and 3.
    if not isinstance(value, (bytes, type(u''))):
        return value

    if self.binary:
        return to_bytes(value, encoding=self.encoding)
    return to_unicode(value, encoding=self.encoding)
def _check_field_len_validity(item, field_name, length=1):
    """Tell whether ``item[field_name]`` holds at least ``length`` characters.

    Returns ``False`` when ``_check_field_in_item`` rejects the field, when
    the stored value is falsy, or when it is neither a ``str`` nor list-like
    (per ``is_listlike``). A ``str`` is measured after stripping
    surrounding whitespace; a list-like value is flattened with ``flatten``,
    joined into one string, stripped, and then measured.
    """
    if not _check_field_in_item(item, field_name):
        return False

    field_value = item[field_name]
    if not field_value:
        return False

    if isinstance(field_value, str):
        text = field_value.strip()
    elif is_listlike(field_value):
        text = ''.join(flatten(field_value)).strip()
    else:
        # Unsupported value type: treated as invalid.
        return False
    return len(text) >= length
def _export_xml_field(self, name, serialized_value):
    """Write ``serialized_value`` as an XML element called ``name``.

    * objects exposing ``items`` produce one child element per key/value;
    * list-like values (per ``is_listlike``) produce one ``value`` child
      per element;
    * text (``six.text_type``) is written as-is through
      ``self._xg_characters``;
    * any other value is converted with ``str()`` first.
    """
    self.xg.startElement(name, {})

    if hasattr(serialized_value, 'items'):
        children = serialized_value.items()
    elif is_listlike(serialized_value):
        children = (('value', element) for element in serialized_value)
    else:
        children = ()
        if isinstance(serialized_value, six.text_type):
            text = serialized_value
        else:
            text = str(serialized_value)
        self._xg_characters(text)

    for child_name, child_value in children:
        self._export_xml_field(child_name, child_value)

    self.xg.endElement(name)
def _export_xml_field(self, name, serialized_value, depth):
    """Write ``serialized_value`` as an XML element ``name`` at ``depth``.

    ``self._beautify_indent`` is called before the opening tag and
    ``self._beautify_newline`` after the closing tag. Container values
    additionally get a newline after the opening tag and an indent before
    the closing tag, with their children written at ``depth + 1``:

    * objects exposing ``items`` produce one child element per key/value;
    * list-like values (per ``is_listlike``) produce one ``value`` child
      per element.

    Leaf values are written through ``self._xg_characters``; text
    (``six.text_type``) as-is, anything else after ``str()`` conversion.
    """
    self._beautify_indent(depth=depth)
    self.xg.startElement(name, {})

    is_mapping = hasattr(serialized_value, 'items')
    if is_mapping or is_listlike(serialized_value):
        self._beautify_newline()
        if is_mapping:
            children = serialized_value.items()
        else:
            children = (('value', element) for element in serialized_value)
        for child_name, child_value in children:
            self._export_xml_field(child_name, child_value, depth=depth + 1)
        self._beautify_indent(depth=depth)
    else:
        if isinstance(serialized_value, six.text_type):
            text = serialized_value
        else:
            text = str(serialized_value)
        self._xg_characters(text)

    self.xg.endElement(name)
    self._beautify_newline()
def upload_item(self, item, file_key, force_override_flag=False, content_type="application/json"):
    """Upload ``item`` to Ks3 (a PUT operation); ``file_key`` must be unique.

    :param item: list-like value (per ``is_listlike``), serialized with
        ``json.dumps``, or a ``str`` that is uploaded as-is
    :param file_key: key under which the body is stored
    :param force_override_flag: when falsy (the default), a key that
        ``self.is_exist_key`` reports as present is not uploaded again
    :param content_type: passed through to ``self._upload_body``
    :return: ``True`` when the key already exists and is not overridden;
        ``False`` when there is nothing to upload (unsupported item type or
        empty text); otherwise the result of ``self._upload_body``
    """
    path = self.get_path(file_key)

    # Forced overwrite is off by default: skip keys that already exist.
    if not force_override_flag and self.is_exist_key(file_key):
        log.warning("文件:{} 已经存在于ks3中,不进行重复上传".format(path))
        return True

    payload = json.dumps(item, ensure_ascii=False) if is_listlike(item) else None
    if isinstance(item, str):
        payload = item
    if not payload:
        return False

    return self._upload_body(file_key, payload.encode(encoding='utf-8'), content_type)
def get_file_key_by_category(self, category, file_id):
    """Build the URL-encoded file key ``<category path>/<file_id>``.

    :param category: list-like or str
    :param file_id: corresponds to the MongoDB document ``_id``
    :return: file_key str
    """
    if not is_listlike(category):
        dir_path = category.strip()
    else:
        # Blank entries are dropped; the rest are joined into a path.
        segments = [c.strip() for c in category if c.strip()]
        dir_path = '/'.join(segments).strip()
    dir_path = dir_path or '未知类别'

    # The key has to be URL-encoded at the end because non-ASCII (e.g.
    # Chinese) characters need encoding; the encoding leaves '/' as-is.
    encoded = url_encode("{}/{}".format(dir_path, file_id))

    # Replace special characters; leaving them in results in a 403 error.
    return encoded.replace('//', '/%2F').replace('%7E', '~')
def _urlencode(seq: Iterable, enc: str) -> str:
    """Encode an iterable of ``(key, value-or-values)`` pairs as a query string.

    Each pair is expanded to one ``(key, value)`` tuple per value: a value
    that ``is_listlike`` reports as list-like contributes one tuple per
    element, anything else contributes a single tuple. Keys and values are
    passed through ``to_bytes(..., enc)`` before ``urlencode`` is applied.
    """
    pairs = []
    for key, raw in seq:
        candidates = raw if is_listlike(raw) else [raw]
        for candidate in candidates:
            pairs.append((to_bytes(key, enc), to_bytes(candidate, enc)))
    return urlencode(pairs, doseq=True)
def _export_xml_field(self, name, serialized_value, depth, attrs=None, out_value=None):
    """Write ``serialized_value`` as an XML element ``name`` at ``depth``.

    Mapping values may carry metadata in keys that start with ``_``: such
    keys are never written as child elements. When a child value is itself
    a mapping, its ``_``-prefixed keys (prefix removed) become the child's
    XML attributes, except for two reserved ones:

    * ``_tag``   -- overrides the child's element name;
    * ``_value`` -- replaces the child's content (via ``out_value``).

    The same rules apply to the elements of a list-like value when every
    element is a mapping; their default element name is ``value``.

    :param name: element name
    :param serialized_value: content; ``None`` produces a self-closing
        ``<name/>`` written through ``ignorableWhitespace``
    :param depth: nesting level handed to ``self._beautify_indent``
    :param attrs: attributes for the opening tag (defaults to none);
        passed to ``self.xg.startElement`` unchanged
    :param out_value: when not ``None``, used instead of
        ``serialized_value``
    """

    def collect_meta(mapping):
        """Return the ``_``-prefixed entries of ``mapping`` without the prefix."""
        return {key[1:]: mapping[key] for key in mapping.keys() if key.startswith("_")}

    self._beautify_indent(depth=depth)
    if attrs is None:
        attrs = {}
    if out_value is not None:
        serialized_value = out_value

    if serialized_value is None:
        # Empty element: written raw as a self-closing tag.
        self.xg.ignorableWhitespace("<%s/>" % name)
        self._beautify_newline()
        return

    self.xg.startElement(name, attrs)
    if hasattr(serialized_value, 'items'):
        self._beautify_newline()
        for subname, value in serialized_value.items():
            if subname.startswith("_"):
                # Metadata key, already consumed by the caller.
                continue
            child_attrs = collect_meta(value) if hasattr(value, 'items') else {}
            child_tag = child_attrs.pop("tag", subname)
            child_out = child_attrs.pop("value", None)
            self._export_xml_field(child_tag, value, depth=depth + 1,
                                   attrs=child_attrs, out_value=child_out)
        self._beautify_indent(depth=depth)
    elif is_listlike(serialized_value):
        self._beautify_newline()
        # Materialize once: the elements are inspected and then exported,
        # which would exhaust a one-shot iterable on the first pass.
        elements = list(serialized_value)
        if all(hasattr(element, 'items') for element in elements):
            for element in elements:
                child_attrs = collect_meta(element)
                child_tag = child_attrs.pop("tag", 'value')
                child_out = child_attrs.pop("value", None)
                # The collected attributes used to be dropped here, unlike
                # in the mapping branch; pass them on so list elements get
                # their XML attributes too.
                self._export_xml_field(child_tag, element, depth=depth + 1,
                                       attrs=child_attrs, out_value=child_out)
        else:
            for element in elements:
                self._export_xml_field('value', element, depth=depth + 1)
        self._beautify_indent(depth=depth)
    elif isinstance(serialized_value, six.text_type):
        self._xg_characters(serialized_value)
    else:
        self._xg_characters(str(serialized_value))

    self.xg.endElement(name)
    self._beautify_newline()
def _urlencode(seq, enc):
    """Encode ``seq`` for use as a URL-encoded payload.

    A ``str`` is returned as ``bytes`` in the ``enc`` encoding, without any
    further processing. Otherwise ``seq`` is treated as an iterable of
    ``(key, value-or-values)`` pairs: a value that ``is_listlike`` reports
    as list-like contributes one ``(key, value)`` tuple per element,
    anything else a single tuple; keys and values go through
    ``to_bytes(..., enc)`` and the tuples are passed to ``urlencode``.

    Note the return type differs between the two paths (``bytes`` for a
    ``str`` input, the ``urlencode`` result otherwise).
    """
    if isinstance(seq, str):
        return bytes(seq, enc)

    pairs = []
    for key, raw in seq:
        candidates = raw if is_listlike(raw) else [raw]
        for candidate in candidates:
            pairs.append((to_bytes(key, enc), to_bytes(candidate, enc)))
    return urlencode(pairs, doseq=1)
def start_requests(self):
    """Yield the initial requests described by ``config['start_seeds_rules']``.

    By default the resulting requests are handled by CrawlSpider's
    ``parse`` method.

    Keys read from the rules:

    * ``request_factory_class`` -- loaded with ``load_object`` and
      instantiated without arguments; its ``batch_make_requests`` builds
      the requests.
    * ``depth`` -- converted with ``int``.
    * ``check_before_request_flag`` -- when truthy, entries for which
      ``self.check_url_in_redis_set`` returns a truthy value are dropped.
    * ``category_urls`` -- dict mapping a category to a list-like of URLs.
    * ``callback_urls`` -- dict mapping a category to a list of dicts with
      a ``callback`` (dotted path string, or a callable) and ``params``
      (dict of keyword arguments). The callback result may mix URL strings
      starting with ``http`` and ``Request`` objects.

    Every request built from URLs gets ``meta={'category': category}``.
    Raises ``AttributeError`` when ``start_seeds_rules`` is missing, since
    the rules are then ``None``.
    """
    rules = self.config.get('start_seeds_rules', None)
    request_factory_cls = load_object(rules.get("request_factory_class"))
    req_factory_obj = request_factory_cls()
    depth = int(rules.get("depth"))
    check_before_request_flag = rules.get("check_before_request_flag", False)

    # Case 1: URL collections configured statically per category.
    category_urls = rules.get("category_urls")
    if isinstance(category_urls, dict) and category_urls:
        for category, urls in category_urls.items():
            if not is_listlike(urls):
                continue
            if check_before_request_flag:
                urls = [
                    url for url in urls
                    if not self.check_url_in_redis_set(url)
                ]
            if not urls:
                continue
            yield from req_factory_obj.batch_make_requests(
                spider=self,
                depth=depth,
                link_or_url_list=urls,
                meta={'category': category},
            )

    # Case 2: URLs / requests generated by calling a configured function.
    callback_urls = rules.get("callback_urls")
    if isinstance(callback_urls, dict) and callback_urls:
        for category, callback_url_infos in callback_urls.items():
            for infos in callback_url_infos:
                callback = infos.get('callback')
                # Only a dotted-path string needs stripping and loading;
                # calling .strip() unconditionally crashed on a missing
                # callback and on an already-callable one.
                if isinstance(callback, str):
                    callback = callback.strip()
                    if not callback:
                        continue
                    callback = load_object(callback)

                params = infos.get('params')
                if not (callable(callback) and isinstance(params, dict)):
                    continue

                # Materialize the result: it is scanned twice below (URLs,
                # then Request objects), which would silently lose the
                # Requests of a generator. None is treated as "nothing".
                produced = callback(**params)
                urls_or_requests = [] if produced is None else list(produced)
                if check_before_request_flag:
                    urls_or_requests = [
                        e for e in urls_or_requests
                        if not self.check_url_in_redis_set(e)
                    ]
                if not urls_or_requests:
                    continue

                urls = [
                    e for e in urls_or_requests
                    if isinstance(e, str) and e.startswith('http')
                ]
                yield from req_factory_obj.batch_make_requests(
                    spider=self,
                    depth=depth,
                    link_or_url_list=urls,
                    meta={'category': category},
                )
                # Ready-made Request objects are passed through untouched,
                # after the requests built from plain URLs.
                for entry in urls_or_requests:
                    if isinstance(entry, Request):
                        yield entry
def _urlencode(seq, enc):
    """Encode an iterable of ``(key, value-or-values)`` pairs as a query string.

    Each pair is expanded to one ``(key, value)`` tuple per value: a value
    that ``is_listlike`` reports as list-like contributes one tuple per
    element, anything else contributes a single tuple. Values equal to
    ``'+'`` are left out entirely. Keys and values are passed through
    ``to_bytes(..., enc)`` before ``urlencode`` is applied.
    """
    pairs = []
    for key, raw in seq:
        candidates = raw if is_listlike(raw) else [raw]
        for candidate in candidates:
            if candidate == '+':
                # A bare plus sign is never sent.
                continue
            pairs.append((to_bytes(key, enc), to_bytes(candidate, enc)))
    return urlencode(pairs, doseq=1)
def _urlencode(seq, enc):
    """Encode an iterable of ``(key, value-or-values)`` pairs as a query string.

    Each pair is expanded to one ``(key, value)`` tuple per value: a value
    that ``is_listlike`` reports as list-like contributes one tuple per
    element, anything else contributes a single tuple. Keys and values are
    passed through ``to_bytes(..., enc)`` before ``urlencode`` is applied.
    """
    pairs = []
    for key, raw in seq:
        candidates = raw if is_listlike(raw) else [raw]
        for candidate in candidates:
            pairs.append((to_bytes(key, enc), to_bytes(candidate, enc)))
    return urlencode(pairs, doseq=1)