Exemplo n.º 1
0
    def get_value(self, value, *processors, **kw):
        """Run ``value`` through an optional regex and a chain of processors.

        If a truthy ``re`` keyword argument is given, ``value`` is turned
        into an iterable with ``arg_to_iter``, ``extract_regex`` is applied
        to every element and the results are flattened.

        Each processor is then wrapped with ``wrap_loader_context`` (using
        ``self.context``) and applied in order. The chain stops as soon as
        the current value is ``None``.

        Returns the final value, which may be ``None``.
        """
        pattern = kw.get('re')
        if pattern:
            matches = []
            for item in arg_to_iter(value):
                matches.append(extract_regex(pattern, item))
            value = flatten(matches)

        for processor in processors:
            if value is None:
                # Nothing left to process; skip the remaining processors.
                return value
            bound = wrap_loader_context(processor, self.context)
            value = bound(value)
        return value
Exemplo n.º 2
0
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding.

    The following policies are applied:

    * if the regex contains a named group called "extract", the value of
      that group for the first match is returned
    * if the regex contains multiple numbered groups, all of them are
      returned (flattened)
    * if the regex doesn't contain any group, every full match is returned

    ``regex`` may be a pattern string (compiled here with ``re.UNICODE``)
    or an already compiled pattern. ``encoding`` is only used to decode the
    matches when ``text`` is not a unicode object.

    Every returned string is passed through ``replace_entities`` with
    ``keep=['lt', 'amp']``.
    """

    if isinstance(regex, basestring):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group('extract')]   # named group
    except (AttributeError, IndexError):
        # AttributeError: search() found no match and returned None.
        # IndexError: the pattern defines no group named "extract".
        # Anything else (e.g. KeyboardInterrupt) must not be swallowed.
        strings = regex.findall(text)    # full regex or numbered groups
    strings = flatten(strings)

    if isinstance(text, unicode):
        return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
    return [replace_entities(unicode(s, encoding), keep=['lt', 'amp'])
            for s in strings]
Exemplo n.º 3
0
 def re(self, regex):
     """Call ``re(regex)`` on every element and return the flattened results."""
     per_element = []
     for element in self:
         per_element.append(element.re(regex))
     return flatten(per_element)
Exemplo n.º 4
0
 def css(self, xpath):
     """Call ``css()`` on every element and wrap the flattened results.

     Despite its name, ``xpath`` is passed unchanged to each element's
     ``css()`` method. The flattened results are wrapped in a new instance
     of this object's own class.
     """
     wrap = self.__class__
     per_element = []
     for element in self:
         per_element.append(element.css(xpath))
     return wrap(flatten(per_element))
Exemplo n.º 5
0
 def xpath(self, xpath):
     """Call ``xpath()`` on every element and wrap the flattened results.

     The flattened results are wrapped in a new instance of this object's
     own class.
     """
     wrap = self.__class__
     per_element = []
     for element in self:
         per_element.append(element.xpath(xpath))
     return wrap(flatten(per_element))
Exemplo n.º 6
0
 def _get_cssvalues(self, csss, **kw):
     """Extract and flatten the results of one or more CSS queries.

     ``csss`` is normalised with ``arg_to_iter``; each entry is run through
     ``self.selector.css(...).extract()``. ``self._check_selector_method()``
     is called first. Extra keyword arguments are accepted but unused.
     """
     self._check_selector_method()
     extracted = []
     for query in arg_to_iter(csss):
         extracted.append(self.selector.css(query).extract())
     return flatten(extracted)
Exemplo n.º 7
0
 def _get_xpathvalues(self, xpaths, **kw):
     """Extract and flatten the results of one or more XPath queries.

     ``xpaths`` is normalised with ``arg_to_iter``; each entry is run through
     ``self.selector.xpath(...).extract()``. ``self._check_selector_method()``
     is called first. Extra keyword arguments are accepted but unused.
     """
     self._check_selector_method()
     extracted = []
     for query in arg_to_iter(xpaths):
         extracted.append(self.selector.xpath(query).extract())
     return flatten(extracted)