Примеры использования handle_textelem в Python (trafilatura.core.handle_textelem)

Пример #1

0

Показать файл

Файл: unit_tests.py Проект: vkuberan/trafilatura

def test_links():
    '''Test link extraction with and without the include_links option.'''
    # An empty <ref> element is expected to be discarded.
    assert handle_textelem(etree.Element('ref'), [], False,
                           DEFAULT_CONFIG) is None
    anchor = html.fromstring('<a href="testlink.html">Test link text.</a>')
    assert handle_formatting(anchor) is not None

    mydoc = html.fromstring(
        '<html><body><p><a href="testlink.html">Test link text.</a></p></body></html>'
    )
    # The link target must only appear when include_links is requested.
    assert 'testlink.html' not in extract(mydoc)
    assert 'testlink.html' in extract(mydoc,
                                      include_links=True,
                                      no_fallback=True,
                                      config=ZERO_CONFIG)

    sample_path = os.path.join(TEST_DIR, 'resources', 'http_sample.html')
    # Explicit encoding keeps the fixture read independent of the locale.
    with open(sample_path, encoding='utf-8') as f:
        teststring = f.read()
    assert 'testlink.html' not in extract(teststring, config=ZERO_CONFIG)
    # Default (text) output renders the link in Markdown-like notation.
    assert '[link](testlink.html)' in extract(teststring,
                                              include_links=True,
                                              no_fallback=True,
                                              config=ZERO_CONFIG)
    # XML output renders the link as a <ref> element with a target.
    assert '<ref target="testlink.html">link</ref>' in extract(
        teststring,
        include_links=True,
        no_fallback=True,
        output_format='xml',
        config=ZERO_CONFIG)

Пример #2

0

Показать файл

Файл: unit_tests.py Проект: vkuberan/trafilatura

def test_images():
    '''Test image handling and the include_images extraction option.'''
    # An <img> needs a usable source attribute (src or data-src) to be kept.
    assert handle_image(html.fromstring('<img src="test.jpg"/>')) is not None
    assert handle_image(
        html.fromstring('<img data-src="test.jpg" alt="text" title="a title"/>'
                        )) is not None
    assert handle_image(html.fromstring('<img other="test.jpg"/>')) is None
    assert utils.is_image_file('test.jpg') is True
    assert utils.is_image_file('test.txt') is False
    # An empty <graphic> element is expected to be discarded.
    assert handle_textelem(etree.Element('graphic'), [], False,
                           DEFAULT_CONFIG) is None

    sample_path = os.path.join(TEST_DIR, 'resources', 'http_sample.html')
    # Explicit encoding keeps the fixture read independent of the locale.
    with open(sample_path, encoding='utf-8') as f:
        teststring = f.read()
    # Image information must only appear when include_images is requested.
    assert 'test.jpg Example image' not in extract(teststring)
    assert 'test.jpg Example image' in extract(teststring,
                                               include_images=True,
                                               no_fallback=True)
    assert '<graphic src="test.jpg" title="Example image"/>' in extract(
        teststring,
        include_images=True,
        no_fallback=True,
        output_format='xml',
        config=ZERO_CONFIG)

Пример #3

0

Показать файл

def test_links():
    '''Test link extraction: empty links, links with/without target, license links.'''
    # An empty <ref> element is expected to be discarded.
    assert handle_textelem(etree.Element('ref'), [], False, DEFAULT_CONFIG) is None
    anchor = html.fromstring('<a href="testlink.html">Test link text.</a>')
    assert handle_formatting(anchor, dedupbool=False, config=ZERO_CONFIG) is not None
    # empty link
    mydoc = html.fromstring('<html><body><p><a></a><b>Some text.</b></p></body></html>')
    assert extract(mydoc) is not None
    # link with target
    mydoc = html.fromstring('<html><body><p><a href="testlink.html">Test link text.</a> This part of the text has to be long enough.</p></body></html>')
    assert 'testlink.html' not in extract(mydoc)
    assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
    # link without target
    mydoc = html.fromstring('<html><body><p><a>Test link text.</a> This part of the text has to be long enough.</p></body></html>')
    assert '[Test link text.] This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
    # links at several nesting levels: all three segments must survive
    mydoc = html.fromstring('<html><body><article><a>Segment 1</a><h1><a>Segment 2</a></h1><p>Segment 3</p></article></body></html>')
    result = extract(mydoc, output_format='xml', include_links=True, no_fallback=True, config=ZERO_CONFIG)
    # Separate asserts so a failure names the missing segment.
    assert '1' in result
    assert '2' in result
    assert '3' in result
    # Explicit encoding keeps the fixture read independent of the locale.
    with open(os.path.join(RESOURCES_DIR, 'http_sample.html'), encoding='utf-8') as f:
        teststring = f.read()
    assert 'testlink.html' not in extract(teststring, config=ZERO_CONFIG)
    assert '[link](testlink.html)' in extract(teststring, include_links=True, no_fallback=True, config=ZERO_CONFIG)
    assert '<ref target="testlink.html">link</ref>' in extract(teststring, include_links=True, no_fallback=True, output_format='xml', config=ZERO_CONFIG)
    # test license link
    mydoc = html.fromstring('<html><body><p>Test text under <a rel="license" href="">CC BY-SA license</a>.</p></body></html>')
    assert 'license="CC BY-SA license"' in extract(mydoc, include_links=True, no_fallback=True, output_format='xml', config=ZERO_CONFIG)

Пример #4

0

Показать файл

def test_images():
    '''Test image handling, the include_images option and responsive <img> markup.'''
    # An <img> needs a usable source attribute (src or data-src) to be kept.
    assert handle_image(html.fromstring('<img src="test.jpg"/>')) is not None
    assert handle_image(html.fromstring('<img data-src="test.jpg" alt="text" title="a title"/>')) is not None
    assert handle_image(html.fromstring('<img other="test.jpg"/>')) is None
    assert utils.is_image_file('test.jpg') is True
    assert utils.is_image_file('test.txt') is False
    # An empty <graphic> element is expected to be discarded.
    assert handle_textelem(etree.Element('graphic'), [], False, DEFAULT_CONFIG) is None
    sample_path = os.path.join(TEST_DIR, 'resources', 'http_sample.html')
    # Explicit encoding keeps the fixture read independent of the locale.
    with open(sample_path, encoding='utf-8') as f:
        teststring = f.read()
    # Image information must only appear when include_images is requested.
    assert 'test.jpg Example image' not in extract(teststring)
    assert 'test.jpg Example image' in extract(teststring, include_images=True, no_fallback=True)
    assert '<graphic src="test.jpg" title="Example image"/>' in extract(teststring, include_images=True,
                                                                        no_fallback=True, output_format='xml',
                                                                        config=ZERO_CONFIG)
    # CNN example
    mydoc = html.fromstring(
        '<img class="media__image media__image--responsive" alt="Harry and Meghan last March, in their final royal engagement." data-src-mini="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-169.jpg" data-src-xsmall="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-medium-plus-169.jpg" data-src-small="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-large-169.jpg" data-src-medium="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg" data-src-large="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-super-169.jpg" data-src-full16x9="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-full-169.jpg" data-src-mini1x1="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-11.jpg" data-demand-load="loaded" data-eq-pts="mini: 0, xsmall: 221, small: 308, medium: 461, large: 781" src="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg" data-eq-state="mini xsmall small medium" data-src="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg">')
    myimage = handle_image(mydoc)
    # Separate asserts so a failure names the condition that broke.
    assert myimage is not None
    assert 'alt' in myimage.attrib
    assert 'src' in myimage.attrib
    # modified CNN example: no src/data-src attribute, only data-src-* variants
    mydoc = html.fromstring(
        '<img class="media__image media__image--responsive" alt="Harry and Meghan last March, in their final royal engagement." data-src-mini="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-169.jpg" data-src-xsmall="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-medium-plus-169.jpg" data-src-small="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-large-169.jpg" data-src-medium="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg" data-src-large="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-super-169.jpg" data-src-full16x9="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-full-169.jpg" data-src-mini1x1="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-11.jpg" data-demand-load="loaded" data-eq-pts="mini: 0, xsmall: 221, small: 308, medium: 461, large: 781">')
    myimage = handle_image(mydoc)
    assert myimage is not None
    assert 'alt' in myimage.attrib
    assert 'src' in myimage.attrib
    # The resulting src is expected to be an absolute http(s) URL.
    assert myimage.get('src').startswith('http')

Пример #5

0

Показать файл

Файл: unit_tests.py Проект: EiffelFly/trafilatura

def test_images():
    '''Test image handling helpers.'''
    # An <img> needs a usable source attribute (src or data-src) to be kept.
    assert handle_image(html.fromstring('<img src="test.jpg"/>')) is not None
    assert handle_image(
        html.fromstring('<img data-src="test.jpg" alt="text" title="a title"/>'
                        )) is not None
    assert handle_image(html.fromstring('<img other="test.jpg"/>')) is None
    assert utils.is_image_file('test.jpg') is True
    assert utils.is_image_file('test.txt') is False
    # An empty <image> element is expected to be discarded.
    assert handle_textelem(etree.Element('image'), [], False) is None

Пример #6

0

Показать файл

def test_links():
    '''Test link extraction: links with/without target and license links.'''
    # An empty <ref> element is expected to be discarded.
    assert handle_textelem(etree.Element('ref'), [], False,
                           DEFAULT_CONFIG) is None
    anchor = html.fromstring('<a href="testlink.html">Test link text.</a>')
    assert handle_formatting(anchor,
                             dedupbool=False,
                             config=ZERO_CONFIG) is not None
    # link with target
    mydoc = html.fromstring(
        '<html><body><p><a href="testlink.html">Test link text.</a></p></body></html>'
    )
    assert 'testlink.html' not in extract(mydoc)
    assert '[Test link text.](testlink.html)' in extract(mydoc,
                                                         include_links=True,
                                                         no_fallback=True,
                                                         config=ZERO_CONFIG)
    # link without target
    mydoc = html.fromstring(
        '<html><body><p><a>Test link text.</a></p></body></html>')
    assert '[Test link text.]' in extract(mydoc,
                                          include_links=True,
                                          no_fallback=True,
                                          config=ZERO_CONFIG)
    sample_path = os.path.join(TEST_DIR, 'resources', 'http_sample.html')
    # Explicit encoding keeps the fixture read independent of the locale.
    with open(sample_path, encoding='utf-8') as f:
        teststring = f.read()
    assert 'testlink.html' not in extract(teststring, config=ZERO_CONFIG)
    assert '[link](testlink.html)' in extract(teststring,
                                              include_links=True,
                                              no_fallback=True,
                                              config=ZERO_CONFIG)
    assert '<ref target="testlink.html">link</ref>' in extract(
        teststring,
        include_links=True,
        no_fallback=True,
        output_format='xml',
        config=ZERO_CONFIG)
    # test license link
    mydoc = html.fromstring(
        '<html><body><p>Test text under <a rel="license" href="">CC BY-SA license</a>.</p></body></html>'
    )
    assert 'license="CC BY-SA license"' in extract(mydoc,
                                                   include_links=True,
                                                   no_fallback=True,
                                                   output_format='xml',
                                                   config=ZERO_CONFIG)

Примеры использования handle_textelem в Python