def test_links():
    '''Test link extraction function.

    Checks that link targets are left out of the output by default and
    show up (Markdown-style in text output, as <ref> in XML output) once
    include_links is switched on.
    '''
    # an empty 'ref' element is expected to yield None
    assert handle_textelem(etree.Element('ref'), [], False, DEFAULT_CONFIG) is None
    assert handle_formatting(
        html.fromstring('<a href="testlink.html">Test link text.</a>')) is not None

    mydoc = html.fromstring(
        '<html><body><p><a href="testlink.html">Test link text.</a></p></body></html>'
    )
    assert 'testlink.html' not in extract(mydoc)
    assert 'testlink.html' in extract(mydoc, include_links=True,
                                      no_fallback=True, config=ZERO_CONFIG)

    # same checks against the sample page shipped with the test resources
    resources_dir = os.path.join(TEST_DIR, 'resources')
    with open(os.path.join(resources_dir, 'http_sample.html')) as f:
        teststring = f.read()
    assert 'testlink.html' not in extract(teststring, config=ZERO_CONFIG)
    # NOTE: the former debug print() of the extraction result was dropped,
    # it only duplicated the call asserted on below
    assert '[link](testlink.html)' in extract(teststring, include_links=True,
                                              no_fallback=True, config=ZERO_CONFIG)
    assert '<ref target="testlink.html">link</ref>' in extract(
        teststring, include_links=True, no_fallback=True,
        output_format='xml', config=ZERO_CONFIG)
def test_images():
    '''Test image extraction function.

    Covers the element-level helper (handle_image), the file-name check
    (utils.is_image_file) and full extraction with include_images.
    '''
    # the unused local holding a parsed '<html><body><img .../></body></html>'
    # document was removed: nothing in the test ever read it
    assert handle_image(html.fromstring('<img src="test.jpg"/>')) is not None
    assert handle_image(
        html.fromstring('<img data-src="test.jpg" alt="text" title="a title"/>')
    ) is not None
    # no src-like attribute at all: expected to be rejected
    assert handle_image(html.fromstring('<img other="test.jpg"/>')) is None

    assert utils.is_image_file('test.jpg') is True
    assert utils.is_image_file('test.txt') is False

    # an empty 'graphic' element is expected to yield None
    assert handle_textelem(etree.Element('graphic'), [], False, DEFAULT_CONFIG) is None

    resources_dir = os.path.join(TEST_DIR, 'resources')
    with open(os.path.join(resources_dir, 'http_sample.html')) as f:
        teststring = f.read()
    assert 'test.jpg Example image' not in extract(teststring)
    assert 'test.jpg Example image' in extract(teststring, include_images=True,
                                               no_fallback=True)
    assert '<graphic src="test.jpg" title="Example image"/>' in extract(
        teststring, include_images=True, no_fallback=True,
        output_format='xml', config=ZERO_CONFIG)
def test_links():
    '''Exercise link handling: empty links, links with and without a target,
    nested links, the sample page and license links.'''
    link_opts = {'include_links': True, 'no_fallback': True, 'config': ZERO_CONFIG}
    xml_link_opts = dict(link_opts, output_format='xml')

    # element-level helpers
    empty_ref = etree.Element('ref')
    assert handle_textelem(empty_ref, [], False, DEFAULT_CONFIG) is None
    anchor = html.fromstring('<a href="testlink.html">Test link text.</a>')
    assert handle_formatting(anchor, dedupbool=False, config=ZERO_CONFIG) is not None

    # empty link
    tree = html.fromstring('<html><body><p><a></a><b>Some text.</b></p></body></html>')
    assert extract(tree) is not None

    # link with target
    tree = html.fromstring(
        '<html><body><p><a href="testlink.html">Test link text.</a>'
        ' This part of the text has to be long enough.</p></body></html>'
    )
    plain_output = extract(tree)
    assert 'testlink.html' not in plain_output
    linked_output = extract(tree, **link_opts)
    assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in linked_output

    # link without target
    tree = html.fromstring(
        '<html><body><p><a>Test link text.</a>'
        ' This part of the text has to be long enough.</p></body></html>'
    )
    linked_output = extract(tree, **link_opts)
    assert '[Test link text.] This part of the text has to be long enough.' in linked_output

    # links spread over several kinds of elements: every segment has to survive
    tree = html.fromstring(
        '<html><body><article><a>Segment 1</a><h1><a>Segment 2</a></h1>'
        '<p>Segment 3</p></article></body></html>'
    )
    xml_output = extract(tree, **xml_link_opts)
    assert all(marker in xml_output for marker in ('1', '2', '3'))

    # sample page from the resources directory
    sample_path = os.path.join(RESOURCES_DIR, 'http_sample.html')
    with open(sample_path) as infile:
        sample = infile.read()
    assert 'testlink.html' not in extract(sample, config=ZERO_CONFIG)
    assert '[link](testlink.html)' in extract(sample, **link_opts)
    assert '<ref target="testlink.html">link</ref>' in extract(sample, **xml_link_opts)

    # test license link
    tree = html.fromstring(
        '<html><body><p>Test text under <a rel="license" href="">CC BY-SA license</a>.</p></body></html>'
    )
    assert 'license="CC BY-SA license"' in extract(tree, **xml_link_opts)
def test_images():
    '''Exercise image handling on single elements, on the sample page and on
    real-world markup with many data-src-* variants.'''
    # element-level checks
    plain_img = html.fromstring('<img src="test.jpg"/>')
    assert handle_image(plain_img) is not None
    lazy_img = html.fromstring('<img data-src="test.jpg" alt="text" title="a title"/>')
    assert handle_image(lazy_img) is not None
    unusable_img = html.fromstring('<img other="test.jpg"/>')
    assert handle_image(unusable_img) is None

    assert utils.is_image_file('test.jpg') is True
    assert utils.is_image_file('test.txt') is False

    assert handle_textelem(etree.Element('graphic'), [], False, DEFAULT_CONFIG) is None

    # sample page from the resources directory
    sample_path = os.path.join(os.path.join(TEST_DIR, 'resources'), 'http_sample.html')
    with open(sample_path) as infile:
        sample = infile.read()
    assert 'test.jpg Example image' not in extract(sample)
    assert 'test.jpg Example image' in extract(sample, include_images=True, no_fallback=True)
    xml_output = extract(sample, include_images=True, no_fallback=True,
                         output_format='xml', config=ZERO_CONFIG)
    assert '<graphic src="test.jpg" title="Example image"/>' in xml_output

    # attributes shared by both CNN snippets below (the tag is closed later)
    cnn_img_head = (
        '<img class="media__image media__image--responsive"'
        ' alt="Harry and Meghan last March, in their final royal engagement."'
        ' data-src-mini="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-169.jpg"'
        ' data-src-xsmall="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-medium-plus-169.jpg"'
        ' data-src-small="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-large-169.jpg"'
        ' data-src-medium="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg"'
        ' data-src-large="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-super-169.jpg"'
        ' data-src-full16x9="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-full-169.jpg"'
        ' data-src-mini1x1="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-11.jpg"'
        ' data-demand-load="loaded"'
        ' data-eq-pts="mini: 0, xsmall: 221, small: 308, medium: 461, large: 781"'
    )

    # CNN example
    cnn_img = html.fromstring(
        cnn_img_head
        + ' src="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg"'
        + ' data-eq-state="mini xsmall small medium"'
        + ' data-src="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg">'
    )
    converted = handle_image(cnn_img)
    assert converted is not None
    assert 'alt' in converted.attrib
    assert 'src' in converted.attrib

    # modified CNN example: neither src nor data-src, only the data-src-* variants
    cnn_img = html.fromstring(cnn_img_head + '>')
    converted = handle_image(cnn_img)
    assert converted is not None
    assert 'alt' in converted.attrib
    assert 'src' in converted.attrib
    assert converted.get('src').startswith('http')
def test_images():
    '''Test image extraction function.

    Covers the element-level helper (handle_image) and the file-name
    check (utils.is_image_file).
    '''
    # the unused local holding a parsed '<html><body><img .../></body></html>'
    # document was removed: nothing in the test ever read it
    assert handle_image(html.fromstring('<img src="test.jpg"/>')) is not None
    assert handle_image(
        html.fromstring('<img data-src="test.jpg" alt="text" title="a title"/>')
    ) is not None
    # no src-like attribute at all: expected to be rejected
    assert handle_image(html.fromstring('<img other="test.jpg"/>')) is None

    assert utils.is_image_file('test.jpg') is True
    assert utils.is_image_file('test.txt') is False

    # an empty 'image' element is expected to yield None
    assert handle_textelem(etree.Element('image'), [], False) is None
def test_links():
    '''Exercise link handling: links with and without a target, the sample
    page and license links.'''
    as_text = {'include_links': True, 'no_fallback': True, 'config': ZERO_CONFIG}
    as_xml = dict(as_text, output_format='xml')

    # element-level helpers
    assert handle_textelem(etree.Element('ref'), [], False, DEFAULT_CONFIG) is None
    anchor = html.fromstring('<a href="testlink.html">Test link text.</a>')
    formatted = handle_formatting(anchor, dedupbool=False, config=ZERO_CONFIG)
    assert formatted is not None

    # link with target
    page = html.fromstring(
        '<html><body><p><a href="testlink.html">Test link text.</a></p></body></html>'
    )
    without_links = extract(page)
    assert 'testlink.html' not in without_links
    with_links = extract(page, **as_text)
    assert '[Test link text.](testlink.html)' in with_links

    # link without target
    page = html.fromstring('<html><body><p><a>Test link text.</a></p></body></html>')
    with_links = extract(page, **as_text)
    assert '[Test link text.]' in with_links

    # sample page from the resources directory
    resources_dir = os.path.join(TEST_DIR, 'resources')
    sample_path = os.path.join(resources_dir, 'http_sample.html')
    with open(sample_path) as handle:
        sample = handle.read()
    assert 'testlink.html' not in extract(sample, config=ZERO_CONFIG)
    assert '[link](testlink.html)' in extract(sample, **as_text)
    assert '<ref target="testlink.html">link</ref>' in extract(sample, **as_xml)

    # test license link
    page = html.fromstring(
        '<html><body><p>Test text under <a rel="license" href="">CC BY-SA license</a>.</p></body></html>'
    )
    xml_result = extract(page, **as_xml)
    assert 'license="CC BY-SA license"' in xml_result