Exemplo n.º 1
0
def filter_regex(regex, texts):
    """Apply ``regex`` to ``texts`` and collect whatever ``extract_regex`` finds.

    :param regex: pattern handed to ``extract_regex``; when it is falsy
        (``None``, ``''``) ``texts`` is returned untouched.
    :param texts: either a non-iterable value, which is passed to
        ``extract_regex`` as is, or an iterable of items. Each item is
        converted to text first (dicts are serialised with ``json.dumps``).
    :returns: ``texts`` unchanged when ``regex`` is falsy; the direct result
        of ``extract_regex`` for a non-iterable ``texts``; otherwise a new
        list holding the non-empty results of every item, concatenated.

    NOTE(review): a plain string is itself iterable, so it takes the
    per-item path and is processed character by character -- confirm that
    callers always pass a collection of texts.
    """
    # ``collections.Iterable`` was removed in Python 3.10. The ABCs live in
    # ``collections.abc`` on Python 3 and only in ``collections`` on Python 2.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2
        from collections import Iterable

    # ``unicode`` only exists on Python 2; ``str`` is the text type on Python 3.
    try:
        to_text = unicode  # noqa: F821
    except NameError:
        to_text = str

    if not regex:
        return texts
    if not isinstance(texts, Iterable):
        return extract_regex(regex, texts)

    matches = []
    for item in texts:
        if isinstance(item, dict):
            # Dicts are matched against their JSON representation.
            item = json.dumps(item)
        found = extract_regex(regex, to_text(item))
        if found:
            matches.extend(found)
    return matches
Exemplo n.º 2
0
def parse_int(text):
    """Convert every ``INT_REGEX`` match found in ``text`` to ``int``.

    Values that are not strings are handed back unchanged. For strings,
    whitespace and commas are removed before matching. ``None`` is returned
    when a ``ValueError`` is raised while processing the string.
    """
    if isinstance(text, six.string_types):
        try:
            compact = re.sub(r'[\s,]*', '', text)
            return list(map(int, extract_regex(INT_REGEX, compact)))
        except ValueError:
            return None
    return text
Exemplo n.º 3
0
def parse_float(text):
    """Convert every ``FLOAT_REGEX`` match found in ``text`` to ``float``.

    Values that are not ``str`` are handed back unchanged. For strings,
    whitespace and commas are removed before matching. ``None`` is returned
    when a ``ValueError`` is raised while processing the string.
    """
    if isinstance(text, str):
        try:
            compact = re.sub(r'[\s,]*', '', text)
            numbers = []
            for candidate in extract_regex(FLOAT_REGEX, compact):
                numbers.append(float(candidate))
            return numbers
        except ValueError:
            return None
    return text
Exemplo n.º 4
0
    def re(self, regex, replace_entities=True, **kwargs):
        """
        Run ``regex`` against the value returned by ``self.get()`` and return
        a list of unicode strings with the matches.

        ``regex`` may be a compiled regular expression or a string. A string
        is compiled with ``re.compile(regex, **kwargs)``; ``kwargs`` are not
        used when ``regex`` is not a string.

        Character entity references are replaced by their corresponding
        character by default (except for ``&`` and ``<``). Pass
        ``replace_entities=False`` to switch these replacements off.
        """
        if isinstance(regex, str):
            regex = re.compile(regex, **kwargs)
        matches = extract_regex(
            regex, self.get(), replace_entities=replace_entities
        )
        return matches
Exemplo n.º 5
0
    def get_value(self, value, *processors, **kw):
        """
        Process the given ``value`` by the given ``processors`` and keyword
        arguments.

        The processors are applied in order, each one receiving the result of
        the previous one; processing stops early as soon as the value
        becomes ``None``.

        Available keyword arguments:

        :param re: a regular expression to use for extracting data from the
            given value using :func:`~parsel.utils.extract_regex` method,
            applied before processors
        :type re: str or typing.Pattern

        :raises ValueError: if a processor raises any exception; the original
            exception is attached as the direct cause (``__cause__``).

        Examples:

        >>> from itemloaders import ItemLoader
        >>> from itemloaders.processors import TakeFirst
        >>> loader = ItemLoader()
        >>> loader.get_value('name: foo', TakeFirst(), str.upper, re='name: (.+)')
        'FOO'
        """
        regex = kw.get('re')
        if regex:
            value = arg_to_iter(value)
            value = flatten(extract_regex(regex, x) for x in value)

        for proc in processors:
            if value is None:
                break
            # Keep the unwrapped processor so the error message can name it.
            _proc = proc
            proc = wrap_loader_context(proc, self.context)
            try:
                value = proc(value)
            except Exception as e:
                # Chain explicitly so the traceback shows the processor's
                # failure as the direct cause of this ValueError.
                raise ValueError(
                    "Error with processor %s value=%r error='%s: %s'" %
                    (_proc.__class__.__name__, value, type(e).__name__,
                     str(e))) from e
        return value
Exemplo n.º 6
0
def test_extract_regex(regex, text, replace_entities, expected):
    """Check that ``extract_regex`` returns ``expected`` for the given arguments."""
    result = extract_regex(regex, text, replace_entities)
    assert result == expected