def get_selector_values(self, field_name, selector_rules, selector, **kw):
    """Provides an abstraction to _get_xpathvalues() and _get_cssvalues() since
    they share the same components.

    ``selector`` is the bound selector callable (e.g. ``self.selector.css``);
    its ``__name__`` is used to tag the stats entries.  Returns the flattened
    list of everything extracted by the given rules.
    """
    self._check_selector_method()
    selector_type = selector.__name__  # either 'css' or 'xpath'
    # The optional arg in methods like `add_css()` for context in stats
    name = kw.get("name")
    # For every call of `add_css()` and `add_xpath()` this is incremented.
    # We'll use it as the base index of the position of the logged stats.
    index = self.field_tracker[f"{field_name}_{selector_type}"]
    values = []
    for position, rule in enumerate(arg_to_iter(selector_rules), index):
        parsed_data = selector(rule).getall()
        values.append(parsed_data)
        # Log one stats entry per rule, positioned after any earlier calls.
        self.write_to_stats(field_name, parsed_data, position, selector_type, name=name)
    return flatten(values)
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:
    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned
    """
    warnings.warn(
        "scrapy.utils.misc.extract_regex has moved to parsel.utils.extract_regex.",
        ScrapyDeprecationWarning,
        stacklevel=2
    )
    if isinstance(regex, str):
        regex = re.compile(regex, re.UNICODE)
    try:
        # AttributeError: search() found no match (returned None);
        # IndexError: the pattern has no group named 'extract'.
        strings = [regex.search(text).group('extract')]  # named group
    except (AttributeError, IndexError):
        strings = regex.findall(text)  # full regex or numbered groups
    strings = flatten(strings)
    if isinstance(text, str):
        return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [replace_entities(to_unicode(s, encoding), keep=['lt', 'amp'])
                for s in strings]
def _get_jsonpathvalues(self, jsonpaths, **kw):
    """Run one or more JSONPath expressions against the selector's JSON
    and return the hierarchical result, or ``None`` when nothing matched.
    """
    self._check_selector_method()
    extracted = self._extract_hier_jsonpaths(
        self.selector.json, arg_to_iter(jsonpaths), **kw)
    return extracted if flatten(extracted) else None
def _get_cssvalues(self, csss, **kw):
    """Run one or more CSS rules against the selector and return the
    hierarchical result, or ``None`` when nothing (or ``None``) came back.
    """
    self._check_selector_method()
    extracted = self._extract_hier_csss(self.selector, arg_to_iter(csss), **kw)
    if extracted is not None and flatten(extracted):
        return extracted
    return None
def _get_jmes_values(self, jmes_paths):
    """Evaluate one or more JMESPath expressions against the loaded JSON
    object and return the flattened results.

    Raises ``RuntimeError`` when no JSON object has been loaded.
    """
    if self.json_obj is None:
        raise RuntimeError("no JSON object found")
    searches = [jmespath.search(path, self.json_obj)
                for path in arg_to_iter(jmes_paths)]
    return flatten(searches)
def add_xpathWithCondition(self, field_name, conditionxpath, successXpath,
                           failXpath, *processors, **kw):
    """Choose between two XPath expressions depending on whether
    ``conditionxpath`` matches anything, extract the chosen one, and feed
    the flattened values to ``add_value``.
    """
    condition_met = (
        self.selector.xpath(conditionxpath).extract_first() is not None
    )
    chosen_xpath = successXpath if condition_met else failXpath
    values = flatten([self.selector.xpath(chosen_xpath).extract()])
    self.add_value(field_name, values, *processors, **kw)
def get_value(self, value, *processors, **kw):
    """Optionally apply the ``re`` keyword regex to *value*, then run it
    through each processor in order, stopping early if it becomes ``None``.
    """
    pattern = kw.get('re', None)
    if pattern:
        value = flatten(extract_regex(pattern, item)
                        for item in arg_to_iter(value))
    for processor in processors:
        if value is None:
            break
        bound = wrap_loader_context(processor, self.context)
        value = bound(value)
    return value
def _check_field_len_validity(item, field_name, length=1):
    """Return True when ``item[field_name]`` holds at least ``length``
    non-whitespace characters (list-like values are joined first).
    """
    if not _check_field_in_item(item, field_name):
        return False
    content = item[field_name]
    if not content:
        return False
    if isinstance(content, str):
        return len(content.strip()) >= length
    if is_listlike(content):
        joined = ''.join(flatten(content)).strip()
        return len(joined) >= length
    return False
def get_value(self, value, *processors, **kw):
    """Like the plain ``get_value`` but wraps any processor failure in a
    ``ValueError`` that names the offending processor and the value it
    choked on.
    """
    pattern = kw.get('re', None)
    if pattern:
        value = flatten(extract_regex(pattern, item)
                        for item in arg_to_iter(value))
    for original_proc in processors:
        if value is None:
            break
        wrapped = wrap_loader_context(original_proc, self.context)
        try:
            value = wrapped(value)
        except Exception as e:
            raise ValueError("Error with processor %s value=%r error='%s: %s'" %
                             (original_proc.__class__.__name__, value,
                              type(e).__name__, str(e)))
    return value
def get_value(self, value, *processors, **kw):
    """Apply the optional ``re`` extraction and the optional ``grouped``
    named-group matching to *value*, then run the processors in sequence.

    ``grouped`` is a pattern whose named groups become dicts, one per match.
    """
    pattern = kw.get('re', None)
    if pattern:
        value = flatten(extract_regex(pattern, item)
                        for item in arg_to_iter(value))
    grouped_pattern = kw.get('grouped')
    if grouped_pattern:
        compiled = re.compile(grouped_pattern, re.UNICODE)
        value = [match.groupdict()
                 for item in arg_to_iter(value)
                 for match in compiled.finditer(item)]
    for processor in processors:
        if value is None:
            break
        value = wrap_loader_context(processor, self.context)(value)
    return value
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:
    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned
    """
    # NOTE: this variant targets Python 2 (basestring/unicode builtins).
    if isinstance(regex, basestring):
        regex = re.compile(regex, re.UNICODE)
    try:
        # AttributeError: search() found no match (returned None);
        # IndexError: the pattern has no group named 'extract'.
        strings = [regex.search(text).group('extract')]  # named group
    except (AttributeError, IndexError):
        strings = regex.findall(text)  # full regex or numbered groups
    # flatten() collapses nested list/dict structures into one flat list.
    strings = flatten(strings)
    if isinstance(text, unicode):
        return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [replace_entities(unicode(s, encoding), keep=['lt', 'amp'])
                for s in strings]
def extract_regex(regex, text, encoding="utf-8"):
    """Extract a list of unicode strings from the given text/encoding using the following policies:
    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned
    """
    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)
    try:
        # AttributeError: search() found no match (returned None);
        # IndexError: the pattern has no group named 'extract'.
        strings = [regex.search(text).group("extract")]  # named group
    except (AttributeError, IndexError):
        strings = regex.findall(text)  # full regex or numbered groups
    strings = flatten(strings)
    if isinstance(text, six.text_type):
        return [replace_entities(s, keep=["lt", "amp"]) for s in strings]
    else:
        return [replace_entities(to_unicode(s, encoding), keep=["lt", "amp"])
                for s in strings]
def test_get_selector_values():
    """Selectors must be properly called as well as correctly flatten the data."""
    selector_rules = ["#rule1", "#rule2", "#rule3"]
    field_name = "field"
    parsed_data = ["data1", "data2"]
    mock_selector = mock.Mock()
    # NOTE: invoking ``mock_selector()`` here also records a call on the mock;
    # the assert_has_calls below relies on that exact call ordering.
    mock_selector().getall.return_value = parsed_data
    mock_selector.__name__ = "css"
    loader = ItemLoader(selector=mock_selector)
    loader.write_to_stats = mock.Mock()
    # This wasn't actually initialized so it will return 0 by default otherwise.
    loader.field_tracker["field_css"] = 1
    result = loader.get_selector_values(field_name, selector_rules, mock_selector)
    assert result == flatten([parsed_data] * len(selector_rules))
    mock_selector.assert_has_calls([
        mock.call(selector_rules[0]),
        mock.call().getall(),
        mock.call(selector_rules[1]),
        mock.call().getall(),
        mock.call(selector_rules[2]),
        mock.call().getall(),
    ])
    # Positions start at the pre-seeded field_tracker index (1).
    loader.write_to_stats.assert_has_calls([
        mock.call(field_name, parsed_data, 1, "css", name=None),
        mock.call(field_name, parsed_data, 2, "css", name=None),
        mock.call(field_name, parsed_data, 3, "css", name=None),
    ])
def _get_values(self, xpaths, **kw):
    """Evaluate each XPath expression against the selector and return the
    flattened selection results."""
    selected = [self.selector.xpath(expr) for expr in arg_to_iter(xpaths)]
    return flatten(selected)
def select(self, xpath):
    """Apply ``select`` to every member and wrap the flattened results in
    the same list class."""
    nested = [member.select(xpath) for member in self]
    return self.__class__(flatten(nested))
def _get_cssvalues(self, csss, **kw):
    """Extract every CSS rule in ``csss`` and return the flattened values."""
    per_rule = [self.selector.css(rule).extract() for rule in arg_to_iter(csss)]
    return flatten(per_rule)
def _get_xpathvalues(self, xpaths, **kw):
    """Extract every XPath expression in ``xpaths`` and return the
    flattened values."""
    self._check_selector_method()
    per_expr = [self.selector.xpath(expr).extract()
                for expr in arg_to_iter(xpaths)]
    return flatten(per_expr)
def re(self, regex):
    """Perform the re() method on each XPathSelector of the list, and
    return the result as a flattened list of unicode strings"""
    matches = [selector.re(regex) for selector in self]
    return flatten(matches)
def re(self, regex):
    """Run ``re()`` on every member and return the flattened matches."""
    collected = []
    for selector in self:
        collected.append(selector.re(regex))
    return flatten(collected)
def _get_xpathvalues(self, xpaths, **kw):
    """Despite the name, parses ``xpaths`` as a JSONPath expression,
    evaluates it against the selector, and returns the flattened values."""
    self._check_selector_method()
    expression = parse(xpaths)
    matched = [found.value for found in expression.find(self.selector)]
    return flatten(matched)
def _get_xpathvalues(self, xpaths, **kw):
    """Evaluate ``xpaths`` as a JSONPath expression against the selector
    and return the flattened matched values."""
    self._check_selector_method()
    matches = parse(xpaths).find(self.selector)
    return flatten([match.value for match in matches])
def _get_revalues(self, regexes, **kw):
    """Apply each regex through the selector's ``re()`` and return the
    flattened matches."""
    self._check_selector_method()
    per_pattern = [self.selector.re(pattern)
                   for pattern in arg_to_iter(regexes)]
    return flatten(per_pattern)
def xpath(self, xpath):
    """Apply the XPath query to every member and wrap the flattened
    results in the same list class."""
    combined = flatten([member.xpath(xpath) for member in self])
    return self.__class__(combined)
def _get_xpathvalues(self, xpaths, **kw):
    """Extract each XPath expression and return the flattened values."""
    self._check_selector_method()
    collected = []
    for expr in arg_to_iter(xpaths):
        collected.append(self.selector.xpath(expr).extract())
    return flatten(collected)
def css(self, xpath):
    """Apply the CSS query to every member and wrap the flattened results
    in the same list class."""
    per_member = [member.css(xpath) for member in self]
    return self.__class__(flatten(per_member))
def __init__(self, locations=None, unique=True, canonicalize=True):
    # flatten() lets callers pass a single location or a nested list;
    # the result is always stored as a flat list.
    self.locations = flatten([locations])
    self.unique = unique
    self.canonicalize = canonicalize
def select(self, xpath):
    """Perform the given XPath query on each XPathSelector of the list
    and return a new (flattened) XPathSelectorList of the results"""
    per_selector = [member.select(xpath) for member in self]
    return XPathSelectorList(flatten(per_selector))
def text(self):
    """Return a list of unicode strings with the content text referenced
    by each XPathSelector of the list"""
    contents = [
        member.text() if isinstance(member, LxmlSelector) else member
        for member in self
    ]
    return LxmlSelectorList(flatten(contents))
def _get_jmes_values(self, jmes_paths):
    """Evaluate each JMESPath expression against ``self.json_obj`` and
    return the flattened combined results."""
    searches = [jmespath.search(path, self.json_obj)
                for path in arg_to_iter(jmes_paths)]
    return flatten(searches)
def _add_link(url_sel, alt_sel=None):
    # Collect the extracted URL(s); flatten() guards against nested results.
    url = flatten([url_sel.extract()])
    # Fall back to an empty alt text when no alt selector was given.
    alt = flatten([alt_sel.extract()]) if alt_sel else (u'', )
    if url:
        # NOTE: ``ret`` and ``encoding`` come from the enclosing scope —
        # this is a closure helper that appends to the outer result list.
        ret.append(Link(unicode_to_str(url[0], encoding), alt[0]))
def _get_cssvalues(self, csss, **kw):
    """Extract each CSS rule in ``csss`` and return the flattened values."""
    self._check_selector_method()
    per_rule = [self.selector.css(rule).extract()
                for rule in arg_to_iter(csss)]
    return flatten(per_rule)
def _add_value(self, result, field, item):
    """Resolve label strings from ``result`` and store them on ``item``
    (``None`` when nothing survives cleaning)."""
    raw_labels = (entry[1] for entry in arg_to_iter(result))
    labels = clear_list(flatten(raw_labels)) or None
    self.logger.debug("resolved labels for %s: %s", item.get(field), labels)
    item[field] = labels
    return item
def __call__(self, data):
    """Run ``data`` through ``MapCompose(self.parse)`` and flatten the
    resulting values."""
    composed = MapCompose(self.parse)
    return flatten(composed(data))
def css(self, expr):
    """Perform the given CSS query on each selector of the list
    and return a new (flattened) LxmlSelectorList of the results"""
    return LxmlSelectorList(flatten([
        x.css(expr) for x in self
    ]))