Example #1
0
File: rex.py  Project: abael/weblib
def rex_text_list(body, rex, flags=0):
    """
    Find every match of `rex` in `body` and return a list with group 1 of
    each match, entity-decoded and whitespace-normalized.
    """

    return [
        normalize_space(decode_entities(found.group(1)))
        for found in rex_list(body, rex, flags=flags)
    ]
Example #2
0
def rex_text_list(body, rex, flags=0):
    """
    Run `rex` against `body` and collect group 1 of every match,
    with entities decoded and whitespace collapsed.
    """

    result = []
    for found in rex_list(body, rex, flags=flags):
        text = decode_entities(found.group(1))
        result.append(normalize_space(text))
    return result
Example #3
0
def find_content_blocks(tree, min_length=None):
    """
    Extract plain-text content blocks from an lxml DOM tree.

    :param tree: lxml element tree to analyse (left unmodified)
    :param min_length: if given, blocks shorter than this are dropped
    :return: list of text blocks that look like real content
    """
    from lxml.html import tostring
    from lxml.etree import strip_tags, strip_elements, Comment

    # Work on a private copy so the caller's tree stays intact
    tree = deepcopy(tree)

    # Remove these tags together with their whole content
    strip_elements(tree, 'head', 'style', 'script')

    # Unwrap comment nodes (their tail text survives)
    strip_tags(tree, Comment)

    # Unwrap links
    strip_tags(tree, 'a')

    # Unwrap inline formatting tags
    strip_tags(tree, 'br', 'hr', 'p', 'b', 'i', 'strong', 'em', 'a',
               'span', 'font')

    # Unwrap media tags
    strip_tags(tree, 'img')

    body = tostring(tree, encoding='utf-8').decode('utf-8')
    body = normalize_space(body)

    # Collapse every remaining tag to an empty "<>" marker
    body = re.compile(r'<[^>]+>').sub(r'<>', body)

    # Text blocks are the runs of characters between the "<>" markers
    blocks = []
    for piece in re.compile(r'[^<>]+').finditer(body):
        candidate = piece.group(0)
        if min_length is not None and len(candidate) < min_length:
            continue
        # Too much punctuation/digit noise means this is not content
        if _trash_ratio(candidate) >= 0.05:
            continue
        # Reject blocks that contain absurdly long "words"
        if any(len(word) > 50 for word in candidate.split()):
            continue
        blocks.append(candidate)
    return blocks
Example #4
0
def find_content_blocks(tree, min_length=None):
    """
    Pull candidate text content blocks out of an lxml DOM tree.

    :param tree: lxml element tree (a deep copy is taken, the original
        is never touched)
    :param min_length: optional lower bound on block length
    :return: list of accepted text blocks
    """
    from lxml.html import tostring
    from lxml.etree import strip_tags, strip_elements, Comment

    doc = deepcopy(tree)

    # Tags whose content must disappear entirely
    strip_elements(doc, 'head', 'style', 'script')
    # Comments go away but their tail text is kept
    strip_tags(doc, Comment)
    # Links are unwrapped
    strip_tags(doc, 'a')
    # Inline formatting and media tags are unwrapped too
    for group in (('br', 'hr', 'p', 'b', 'i', 'strong', 'em', 'a',
                   'span', 'font'),
                  ('img',)):
        strip_tags(doc, *group)

    markup = tostring(doc, encoding='utf-8').decode('utf-8')
    markup = normalize_space(markup)
    # Every surviving tag becomes a bare "<>" delimiter
    markup = re.sub(r'<[^>]+>', r'<>', markup)

    def _acceptable(text):
        # One-line filter: length, trash ratio and no giant "words"
        if min_length is not None and len(text) < min_length:
            return False
        if _trash_ratio(text) >= 0.05:
            return False
        return all(len(word) <= 50 for word in text.split())

    return [found.group(0)
            for found in re.finditer(r'[^<>]+', markup)
            if _acceptable(found.group(0))]
Example #5
0
File: text.py  Project: abael/weblib
 def test_normalize_space(self):
     """Whitespace runs collapse to one separator and edges are stripped."""
     cyr = u' тр и гла' + '\t' + '\n' + u' за '
     self.assertEqual('', normalize_space(' '))
     self.assertEqual('f', normalize_space(' f '))
     self.assertEqual('f b', normalize_space(' f b '))
     self.assertEqual(u'тр и гла за', normalize_space(cyr))
     self.assertEqual(u'тр_и_гла_за', normalize_space(cyr, replace='_'))
     self.assertEqual(u'трABCиABCглаABCза', normalize_space(cyr, replace='ABC'))
Example #6
0
def rex_text(body, regexp, flags=0, default=NULL):
    """
    Search `regexp` in `body` and return group 1 of the match with
    entities decoded and whitespace normalized.

    When nothing matches: return `default` if one was given, otherwise
    raise :class:`DataNotFound`.
    """

    found = rex(body, regexp, flags=flags, default=default)
    try:
        # `found` is the fall-back value (not a match object) when the
        # regexp did not match and a default was supplied
        return normalize_space(decode_entities(found.group(1)))
    except AttributeError:
        if default is NULL:
            raise DataNotFound('Regexp not found')
        return default
Example #7
0
File: rex.py  Project: abael/weblib
def rex_text(body, regexp, flags=0, default=NULL):
    """
    Match `regexp` against `body`; the first capture group is returned
    entity-decoded and whitespace-normalized.

    A failed match yields `default` when provided, else raises
    :class:`DataNotFound`.
    """

    result = rex(body, regexp, flags=flags, default=default)
    try:
        # A non-match gives back the default object, which has no .group
        return normalize_space(decode_entities(result.group(1)))
    except AttributeError:
        if default is not NULL:
            return default
        raise DataNotFound('Regexp not found')
Example #8
0
    def rex_text(self, regexp, flags=0, byte=False, default=NULL):
        """
        Search a regular expression in the response body and return the
        content of its first capture group, entity-decoded and
        whitespace-normalized.

        :param byte: if False the search runs over
        `response.unicode_body()`, otherwise over `response.body`.
        """

        try:
            found = self.rex_search(regexp, flags=flags, byte=byte)
        except DataNotFound:
            # No match: hand back the default, or re-raise if none given
            if default is NULL:
                raise DataNotFound('Regexp not found')
            return default
        return normalize_space(decode_entities(found.group(1)))
Example #9
0
    def rex_text(self, regexp, flags=0, byte=False, default=NULL):
        """
        Run `regexp` over the response body and return group 1 of the
        match after entity decoding and whitespace normalization.

        :param byte: if False then search is performed in
        `response.unicode_body()` else the rex is searched in `response.body`.
        """

        try:
            match = self.rex_search(regexp, flags=flags, byte=byte)
        except DataNotFound:
            if default is not NULL:
                return default
            raise DataNotFound('Regexp not found')
        else:
            decoded = decode_entities(match.group(1))
            return normalize_space(decoded)
Example #10
0
File: text.py  Project: khokhlov/hmedia
 def test_normalize_space(self):
     """Runs of whitespace become a single separator; edges are trimmed."""
     garbled = u' тр и гла' + '\t' + '\n' + u' за '
     # Simple ASCII cases first
     for expected, text in (('', ' '), ('f', ' f '), ('f b', ' f b ')):
         self.assertEqual(expected, normalize_space(text))
     # Unicode input, default and custom `replace` separators
     self.assertEqual(u'тр и гла за', normalize_space(garbled))
     self.assertEqual(u'тр_и_гла_за',
                      normalize_space(garbled, replace='_'))
     self.assertEqual(u'трABCиABCглаABCза',
                      normalize_space(garbled, replace='ABC'))