def test_external():
    '''Check tree sanitizing and the options used with external components.'''
    # a lone footer is unwanted content: nothing is left after sanitizing
    footer_doc = html.fromstring(
        '<html><body><footer>Test text</footer></body></html>')
    _, _, remaining = sanitize_tree(footer_doc)
    assert remaining == 0

    # table content survives
    table_doc = html.fromstring(
        '<html><body><table><th>Test text</th><tr><td>Test</td></tr></table></body></html>')
    _, _, remaining = sanitize_tree(table_doc)
    assert remaining > 0

    # unknown tags are stripped; links and images are kept only on request
    fancy_markup = '<html><body><p>Text here <fancy>Test text</fancy><a href="">with a link</a>.</p><img src="test.jpg"/></body></html>'
    cleaned, _, _ = sanitize_tree(
        html.fromstring(fancy_markup),
        include_links=False,
        include_images=False,
    )
    assert len(cleaned) == 1

    # parse the markup again so the second run starts from a fresh tree
    cleaned, _, _ = sanitize_tree(
        html.fromstring(fancy_markup),
        include_links=True,
        include_images=True,
    )
    tags = {node.tag for node in cleaned.iter()}
    assert 'graphic' in tags
    assert 'ref' in tags

    # language identification (only if the detector is available)
    if LANGID_FLAG is True:
        italian = html.fromstring(
            '<html><body>' + '<p>Non è inglese.</p>'*20 + '</body></html>')
        assert extract(italian, no_fallback=False, target_language='en',
                       deduplicate=False) is None

    # table inclusion switch
    with open(os.path.join(RESOURCES_DIR, 'apache.html')) as infile:
        apache_page = infile.read()
    assert 'localhost:80' in extract(
        apache_page, no_fallback=False, include_tables=True)
    assert 'localhost:80' not in extract(
        apache_page, no_fallback=False, include_tables=False)

    # this page yields an empty string with and without fallback
    with open(os.path.join(RESOURCES_DIR, 'scam.html')) as infile:
        scam_page = infile.read()
    for fallback_off in (True, False):
        assert extract(scam_page, no_fallback=fallback_off,
                       include_tables=False) == ''
def test_images():
    '''Check image handling, from single <img> elements to full extraction.'''
    # elements with a usable source attribute are kept, others are dropped
    for snippet in ('<img src="test.jpg"/>',
                    '<img data-src="test.jpg" alt="text" title="a title"/>'):
        assert handle_image(html.fromstring(snippet)) is not None
    assert handle_image(html.fromstring('<img other="test.jpg"/>')) is None

    # file name check
    assert utils.is_image_file('test.jpg') is True
    assert utils.is_image_file('test.txt') is False

    # an empty graphic element yields nothing
    assert handle_textelem(
        etree.Element('graphic'), [], False, DEFAULT_CONFIG) is None

    # extraction from a local sample page
    sample_path = os.path.join(TEST_DIR, 'resources', 'http_sample.html')
    with open(sample_path) as infile:
        page = infile.read()
    assert 'test.jpg Example image' not in extract(page)
    assert 'test.jpg Example image' in extract(
        page, include_images=True, no_fallback=True)
    xml_result = extract(page, include_images=True, no_fallback=True,
                         output_format='xml', config=ZERO_CONFIG)
    assert '<graphic src="test.jpg" title="Example image"/>' in xml_result

    # attributes shared by both CNN examples below
    cnn_base = (
        '<img class="media__image media__image--responsive" '
        'alt="Harry and Meghan last March, in their final royal engagement." '
        'data-src-mini="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-169.jpg" '
        'data-src-xsmall="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-medium-plus-169.jpg" '
        'data-src-small="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-large-169.jpg" '
        'data-src-medium="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg" '
        'data-src-large="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-super-169.jpg" '
        'data-src-full16x9="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-full-169.jpg" '
        'data-src-mini1x1="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-11.jpg" '
        'data-demand-load="loaded" '
        'data-eq-pts="mini: 0, xsmall: 221, small: 308, medium: 461, large: 781"'
    )

    # CNN example
    cnn_full = cnn_base + (
        ' src="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg" '
        'data-eq-state="mini xsmall small medium" '
        'data-src="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg">'
    )
    image = handle_image(html.fromstring(cnn_full))
    assert image is not None
    assert 'alt' in image.attrib
    assert 'src' in image.attrib

    # modified CNN example: no src/data-src, only the data-src-* variants
    image = handle_image(html.fromstring(cnn_base + '>'))
    assert image is not None
    assert 'alt' in image.attrib
    assert 'src' in image.attrib
    assert image.get('src').startswith('http')
def test_tei():
    '''Check that TEI output generated from local sample pages validates.'''
    # options shared by every extraction in this test
    tei_opts = dict(no_fallback=True, tei_output=True, tei_validation=False)
    samples_dir = os.path.join(TEST_DIR, 'resources')

    # local resources are used to avoid redownloading at each run
    with open(os.path.join(samples_dir, 'httpbin_sample.html')) as infile:
        page = infile.read()
    tei_doc = extract(page, "mocked", **tei_opts)
    assert tei_doc is not None
    assert xml.validate_tei(etree.fromstring(tei_doc)) is True
    # the raw HTML page itself is not valid TEI
    assert xml.validate_tei(etree.fromstring(page)) is False

    # second sample file
    with open(os.path.join(samples_dir, 'http_sample.html')) as infile:
        page = infile.read()
    tei_doc = extract(page, "mocked", **tei_opts)
    assert tei_doc is not None
    assert xml.validate_tei(etree.fromstring(tei_doc)) is True

    # same page, with a record ID added to the metadata
    tei_doc = extract(page, "mocked", record_id='0001', **tei_opts)
    assert tei_doc is not None
    assert xml.validate_tei(etree.fromstring(tei_doc)) is True
def test_links():
    '''Check link handling on single elements and in extraction output.'''
    # an empty ref element yields nothing
    assert handle_textelem(
        etree.Element('ref'), [], False, DEFAULT_CONFIG) is None
    anchor = html.fromstring('<a href="testlink.html">Test link text.</a>')
    assert handle_formatting(anchor) is not None

    # options shared by the link-preserving extractions
    link_opts = dict(include_links=True, no_fallback=True, config=ZERO_CONFIG)

    # link targets only appear in the output on request
    tree = html.fromstring(
        '<html><body><p><a href="testlink.html">Test link text.</a></p></body></html>')
    assert 'testlink.html' not in extract(tree)
    assert 'testlink.html' in extract(tree, **link_opts)

    # same checks on a local sample page
    sample_path = os.path.join(TEST_DIR, 'resources', 'http_sample.html')
    with open(sample_path) as infile:
        page = infile.read()
    assert 'testlink.html' not in extract(page, config=ZERO_CONFIG)
    assert '[link](testlink.html)' in extract(page, **link_opts)
    xml_result = extract(page, output_format='xml', **link_opts)
    assert '<ref target="testlink.html">link</ref>' in xml_result
def test_htmlprocessing():
    '''Check tree cleaning, tag conversion, element merging and paywalls.'''
    processing = trafilatura.htmlprocessing
    assert processing.tree_cleaning(etree.Element('html'), True) is not None
    assert processing.prune_html(etree.Element('unwanted')) is not None

    # tag conversion followed by cleaning of the same tree
    tree = html.fromstring(
        '<html><body><table><a href="">Link</a></table><img src="test.jpg"/><u>Underlined</u><tt>True Type</tt><sub>Text</sub><sup>Text</sup></body></html>')
    converted = processing.convert_tags(
        tree,
        include_formatting=True,
        include_tables=True,
        include_images=True,
        include_links=True,
    )
    for query in ('.//ref', './/graphic', './/hi[@rend="#t"]', './/table'):
        assert converted.xpath(query)
    cleaned = processing.tree_cleaning(
        tree, include_tables=False, include_images=True)
    assert cleaned.xpath('.//graphic')
    assert not cleaned.xpath('.//table')

    # headline rendering in XML and XML-TEI
    tree = html.fromstring(
        '<html><body><article><h1>Test headline</h1><p>Test</p></article></body></html>')
    assert '<head rend="h1">Test headline</head>' in extract(
        tree, output_format='xml', config=ZERO_CONFIG, no_fallback=True)
    assert '<fw rend="h1" type="header">Test headline</fw>' in extract(
        tree, output_format='xmltei', config=ZERO_CONFIG, no_fallback=True)

    # merge with parent function: an element without parent must not fail
    orphan = etree.Element('test')
    xml.merge_with_parent(orphan)

    tree = html.fromstring(
        '<html><body><p><span>A</span><span>B</span><span>C</span></p></body></html>')
    for span in tree.iter('span'):
        xml.merge_with_parent(span)
    assert b'<p>A B C</p>' in etree.tostring(tree)

    # tail text is kept in place
    tree = html.fromstring(
        '<html><body><p><span>A</span><span>B</span> tail<span>C</span></p></body></html>')
    for span in tree.iter('span'):
        xml.merge_with_parent(span)
    assert b'<p>A B tail C</p>' in etree.tostring(tree)

    # paywalls: the marked paragraph is dropped with and without fallback
    paywalled = '<html><body><main><p>1</p><p id="paywall">2</p><p>3</p></main></body></html>'
    for fallback_off in (True, False):
        assert extract(paywalled, config=ZERO_CONFIG,
                       no_fallback=fallback_off) == '1\n3'
def test_txttocsv():
    '''Check the CSV, JSON and dict output formats.'''
    # all metadata fields start out as None
    meta = dict.fromkeys(METADATA_LIST)
    empty_row = 'None\tNone\tNone\tNone\tNone\t\t\tNone\n'
    assert utils.txttocsv('', '', meta) == empty_row

    meta.update(
        title='Test title',
        url='https://example.org',
        hostname='example.org',
        id='1',
        license='CC BY-SA',
    )
    filled_row = '1\thttps://example.org\tNone\texample.org\tTest title\tNone\tTest text\tTest comment\tCC BY-SA\n'
    assert utils.txttocsv('Test text', 'Test comment', meta) == filled_row

    # CSV through the main entry point
    markup = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'
    assert extract(markup, output_format='csv', config=ZERO_CONFIG) is not None
    csv_row = extract(markup, output_format='csv', include_comments=False,
                      config=ZERO_CONFIG)
    assert csv_row.endswith('\tNone\n')

    # test json
    as_json = extract(markup, output_format='json', config=ZERO_CONFIG)
    assert as_json.endswith('}')
    assert '"fingerprint":' in as_json

    # bare extraction for python
    bare = bare_extraction(markup, config=ZERO_CONFIG)
    assert isinstance(bare, dict)
    assert len(bare) == 14
def test_images():
    '''Check image handling, from single <img> elements to full extraction.

    The document parsed into an unused local at the top of the earlier
    version of this test has been removed; it was never referenced.
    '''
    # elements with a usable source attribute are kept, others are dropped
    assert handle_image(html.fromstring('<img src="test.jpg"/>')) is not None
    assert handle_image(html.fromstring(
        '<img data-src="test.jpg" alt="text" title="a title"/>')) is not None
    assert handle_image(html.fromstring('<img other="test.jpg"/>')) is None

    # file name check
    assert utils.is_image_file('test.jpg') is True
    assert utils.is_image_file('test.txt') is False

    # an empty graphic element yields nothing
    assert handle_textelem(
        etree.Element('graphic'), [], False, DEFAULT_CONFIG) is None

    # extraction from a local sample page
    resources_dir = os.path.join(TEST_DIR, 'resources')
    with open(os.path.join(resources_dir, 'http_sample.html')) as f:
        teststring = f.read()
    assert 'test.jpg Example image' not in extract(teststring)
    assert 'test.jpg Example image' in extract(
        teststring, include_images=True, no_fallback=True)
    assert '<graphic src="test.jpg" title="Example image"/>' in extract(
        teststring, include_images=True, no_fallback=True,
        output_format='xml', config=ZERO_CONFIG)
def test_htmlprocessing():
    '''Check tree cleaning, tag conversion and headline rendering.'''
    processing = trafilatura.htmlprocessing
    assert processing.tree_cleaning(etree.Element('html'), True) is not None
    assert processing.prune_html(etree.Element('unwanted')) is not None

    # tag conversion followed by cleaning of the same tree
    tree = html.fromstring(
        '<html><body><table><a href="">Link</a></table><img src="test.jpg"/><u>Underlined</u><tt>True Type</tt><sub>Text</sub><sup>Text</sup></body></html>')
    converted = processing.convert_tags(
        tree,
        include_formatting=True,
        include_tables=True,
        include_images=True,
    )
    for query in ('.//ref', './/graphic', './/hi[@rend="#t"]', './/table'):
        assert converted.xpath(query)
    cleaned = processing.tree_cleaning(
        tree, include_tables=False, include_images=True)
    assert cleaned.xpath('.//graphic')
    assert not cleaned.xpath('.//table')

    # headline rendering in XML and XML-TEI
    tree = html.fromstring(
        '<html><body><article><h1>Test headline</h1><p>Test</p></article></body></html>')
    xml_result = extract(tree, output_format='xml', config=ZERO_CONFIG,
                         no_fallback=True)
    assert '<head rend="h1">Test headline</head>' in xml_result
    tei_result = extract(tree, output_format='xmltei', config=ZERO_CONFIG,
                         no_fallback=True)
    assert '<fw rend="h1" type="header">Test headline</fw>' in tei_result
def test_precision_recall():
    '''Check that both precision- and recall-oriented modes return a result.'''
    # the test cases could be better
    tree = html.fromstring(
        '<html><body><p>This here is the text.</p></body></html>')
    for option in ('favor_precision', 'favor_recall'):
        outcome = extract(tree, config=ZERO_CONFIG, **{option: True})
        assert outcome is not None
def test_txttocsv():
    '''Check the CSV and JSON output formats.'''
    # all metadata fields start out as None
    meta = dict.fromkeys(METADATA_LIST)
    empty_row = 'None\tNone\tNone\tNone\tNone\t\t\n'
    assert utils.txttocsv('', '', meta) == empty_row

    meta.update(
        title='Test title',
        url='https://example.org',
        hostname='example.org',
    )
    filled_row = 'https://example.org\tNone\texample.org\tTest title\tNone\tTest text\tTest comment\n'
    assert utils.txttocsv('Test text', 'Test comment', meta) == filled_row

    # CSV through the main entry point
    markup = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'
    assert extract(markup, csv_output=True) is not None
    csv_row = extract(markup, csv_output=True, include_comments=False)
    assert csv_row.endswith('\t\n')

    # test json
    as_json = extract(markup, json_output=True)
    assert as_json.endswith('}')
def test_formatting():
    '''Check how inline formatting, links and line breaks are rendered.'''
    # (markup, fragment expected in the XML output): simple, nested, empty
    xml_cases = (
        ('<html><body><p><b>This here is in bold font.</b></p></body></html>',
         '<hi rend="#b">This here is in bold font.</hi>'),
        ('<html><body><p><b>This here is in bold and <i>italic</i> font.</b></p></body></html>',
         '<hi rend="#b">This here is in bold and italic font.</hi>'),
        ('<html><body><p><b><i></i></b></p></body></html>',
         '<main/>'),
    )
    for markup, fragment in xml_cases:
        tree = html.fromstring(markup)
        assert fragment in extract(tree, xml_output=True,
                                   include_formatting=True)

    # wild div
    wild = html.fromstring(
        '<html><body><article><div><strong>Wild text</strong></div></article></body></html>')
    as_xml = extract(wild, xml_output=True, include_formatting=True)
    assert '<p>' in as_xml
    assert '<hi>Wild text</hi>' in as_xml  # no rend so far
    # the same tree is extracted a second time, as plain text
    assert extract(wild) == 'Wild text'

    # links, then line-breaks: (markup, expected plain text)
    text_cases = (
        ('<html><body><p><a href="">Link text</a></p></body></html>',
         'Link text'),
        ('<html><body><p><br/></p></body></html>',
         ''),
        ('<html><body><p><br/>Here is the text.</p></body></html>',
         'Here is the text.'),
    )
    for markup, expected in text_cases:
        assert extract(html.fromstring(markup)) == expected
def test_tei():
    '''Check TEI output validation and header writing from a metadata dict.'''
    tei_opts = dict(no_fallback=True, output_format='xmltei')

    # local resources are used to avoid redownloading at each run
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html')) as infile:
        page = infile.read()
    unvalidated = extract(page, "mocked", tei_validation=False, **tei_opts)
    validated = extract(page, "mocked", tei_validation=True, **tei_opts)
    assert unvalidated is not None
    assert unvalidated == validated
    assert xml.validate_tei(etree.fromstring(unvalidated)) is True
    # the raw HTML page itself is not valid TEI
    assert xml.validate_tei(etree.fromstring(page)) is False

    # second sample file
    with open(os.path.join(RESOURCES_DIR, 'http_sample.html')) as infile:
        page = infile.read()
    tei_doc = extract(page, "mocked", tei_validation=False, **tei_opts)
    assert tei_doc is not None
    assert xml.validate_tei(etree.fromstring(tei_doc)) is True

    # same page, with a record ID added to the metadata
    tei_doc = extract(page, "mocked", tei_validation=False, record_id='0001',
                      **tei_opts)
    assert tei_doc is not None
    assert xml.validate_tei(etree.fromstring(tei_doc)) is True

    # test header + metadata: each step changes some fields, then the
    # header is written again
    root = etree.Element('TEI', xmlns='http://www.tei-c.org/ns/1.0')
    header = etree.SubElement(root, 'teiHeader')
    docmeta = dict.fromkeys(METADATA_LIST)
    steps = (
        {'categories': [], 'tags': [], 'title': 'Title'},
        {'sitename': 'Site Name'},
        {'hostname': 'hostname', 'sitename': None},
    )
    for changes in steps:
        docmeta.update(changes)
        assert xml.write_fullheader(header, docmeta) is not None
def test_filters():
    '''Check language, URL blacklist, tree size and duplicate filtering.'''

    def repeated(paragraph, count):
        # parse a page whose body is the paragraph repeated count times
        return html.fromstring(
            '<html><body>' + paragraph*count + '</body></html>')

    lang_filter = trafilatura.filters.language_filter
    if LANGID_FLAG is True:
        # main text
        assert lang_filter(
            'Hier ist ein Text auf Deutsch', '', 'de', SAMPLE_META) is False
        assert lang_filter(
            'Hier ist ein Text auf Deutsch', '', 'en', SAMPLE_META) is True
        # comments
        assert lang_filter(
            'Hier ist ein Text.', 'Die Kommentare sind aber etwas länger.',
            'de', SAMPLE_META) is False
    else:
        # no detection
        assert lang_filter('Hier ist ein Text.', '', 'en', SAMPLE_META) is False

    # test URL blacklist
    blacklisted = trafilatura.extract(
        '<html><head><link rel="canonical" href="https://example.org"/></head><body></body></html>',
        output_format='xml',
        url_blacklist={'https://example.org'},
    )
    assert blacklisted is None

    ## recursion limit
    plain_p = '<p>abc</p>'
    assert extract(repeated(plain_p, 50), max_tree_size=500) is not None
    assert extract(repeated(plain_p, 501), max_tree_size=500) is None
    rich_p = '<p><hi rend="#i">abc</hi></p>'
    assert extract(repeated(rich_p, 501), include_formatting=True,
                   max_tree_size=500) is None
    assert extract(repeated(rich_p, 499), include_formatting=True,
                   max_tree_size=500) is not None

    ## deduplication: the same tree passes three times, then is rejected
    tree = repeated(rich_p, 50)
    trafilatura.filters.LRU_TEST = LRUCache(maxsize=2)
    for _ in range(3):
        assert extract(tree, deduplicate=True) is not None
    assert extract(tree, deduplicate=True) is None
def test_links():
    '''Check link handling on single elements and in extraction output.'''
    # options shared by the link-preserving extractions
    link_opts = dict(include_links=True, no_fallback=True, config=ZERO_CONFIG)

    # an empty ref element yields nothing
    assert handle_textelem(
        etree.Element('ref'), [], False, DEFAULT_CONFIG) is None
    anchor = html.fromstring('<a href="testlink.html">Test link text.</a>')
    assert handle_formatting(
        anchor, dedupbool=False, config=ZERO_CONFIG) is not None

    # empty link
    tree = html.fromstring(
        '<html><body><p><a></a><b>Some text.</b></p></body></html>')
    assert extract(tree) is not None

    # link with target
    tree = html.fromstring(
        '<html><body><p><a href="testlink.html">Test link text.</a> This part of the text has to be long enough.</p></body></html>')
    assert 'testlink.html' not in extract(tree)
    with_links = extract(tree, **link_opts)
    assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in with_links

    # link without target
    tree = html.fromstring(
        '<html><body><p><a>Test link text.</a> This part of the text has to be long enough.</p></body></html>')
    with_links = extract(tree, **link_opts)
    assert '[Test link text.] This part of the text has to be long enough.' in with_links

    # links at several places in the tree: every segment number is present
    tree = html.fromstring(
        '<html><body><article><a>Segment 1</a><h1><a>Segment 2</a></h1><p>Segment 3</p></article></body></html>')
    as_xml = extract(tree, output_format='xml', **link_opts)
    assert all(number in as_xml for number in ('1', '2', '3'))

    # same checks on a local sample page
    with open(os.path.join(RESOURCES_DIR, 'http_sample.html')) as infile:
        page = infile.read()
    assert 'testlink.html' not in extract(page, config=ZERO_CONFIG)
    assert '[link](testlink.html)' in extract(page, **link_opts)
    as_xml = extract(page, output_format='xml', **link_opts)
    assert '<ref target="testlink.html">link</ref>' in as_xml

    # test license link
    tree = html.fromstring(
        '<html><body><p>Test text under <a rel="license" href="">CC BY-SA license</a>.</p></body></html>')
    as_xml = extract(tree, output_format='xml', **link_opts)
    assert 'license="CC BY-SA license"' in as_xml
def test_filters():
    '''Check language, blacklist, tree size, duplicate and HTML lang filters.'''

    def repeated(paragraph, count, opening='<html><body>'):
        # parse a page whose body is the paragraph repeated count times
        return html.fromstring(opening + paragraph*count + '</body></html>')

    lang_filter = trafilatura.filters.language_filter
    if LANGID_FLAG is True:
        # main text
        assert lang_filter(
            'Hier ist ein Text auf Deutsch', '', 'de', SAMPLE_META) is False
        assert lang_filter(
            'Hier ist ein Text auf Deutsch', '', 'en', SAMPLE_META) is True
        # comments
        assert lang_filter(
            'Hier ist ein Text.', 'Die Kommentare sind aber etwas länger.',
            'de', SAMPLE_META) is False
    else:
        # no detection
        assert lang_filter('Hier ist ein Text.', '', 'en', SAMPLE_META) is False

    # test URL blacklist
    blacklisted = trafilatura.extract(
        '<html><head><link rel="canonical" href="https://example.org"/></head><body></body></html>',
        output_format='xml',
        url_blacklist={'https://example.org'},
    )
    assert blacklisted is None

    ## recursion limit
    plain_p = '<p>abc</p>'
    assert extract(repeated(plain_p, 50), max_tree_size=500) is not None
    assert extract(repeated(plain_p, 501), max_tree_size=500) is None
    rich_p = '<p><hi rend="#i">abc</hi></p>'
    assert extract(repeated(rich_p, 501), include_formatting=True,
                   max_tree_size=500) is None
    assert extract(repeated(rich_p, 499), include_formatting=True,
                   max_tree_size=500) is not None

    ## deduplication: the same tree passes three times, then is rejected
    tree = repeated(rich_p, 50)
    trafilatura.filters.LRU_TEST = LRUCache(maxsize=2)
    for _ in range(3):
        assert extract(tree, deduplicate=True) is not None
    assert extract(tree, deduplicate=True) is None

    # HTML lang filter
    english_p = '<p>In sleep a king, but waking no such matter.</p>'
    english_open = '<html lang="en-US"><body>'
    assert extract(repeated(english_p, 50, english_open),
                   target_language='en') is not None
    assert extract(repeated(english_p, 50, english_open),
                   target_language='de') is None

    # language declared on the html element
    assert check_html_lang(
        html.fromstring('<html lang="de_DE, en_US"><body></body></html>'),
        target_language='de') is True
    assert check_html_lang(
        html.fromstring('<html lang="en"><body></body></html>'),
        target_language='it') is False

    # language declared through a meta element
    meta_markup = '<html><head><meta http-equiv="content-language" content="en"></head><body></body></html>'
    assert check_html_lang(
        html.fromstring(meta_markup), target_language='en') is True
    assert check_html_lang(
        html.fromstring(meta_markup), target_language='de') is False
def test_formatting():
    '''Check how inline formatting, links, line breaks and tails are rendered.'''
    xml_opts = dict(output_format='xml', include_formatting=True,
                    config=ZERO_CONFIG)

    # (markup, fragment expected in the XML output): simple, nested, empty
    xml_cases = (
        ('<html><body><p><b>This here is in bold font.</b></p></body></html>',
         '<hi rend="#b">This here is in bold font.</hi>'),
        ('<html><body><p><b>This here is in bold and <i>italic</i> font.</b></p></body></html>',
         '<hi rend="#b">This here is in bold and italic font.</hi>'),
        ('<html><body><p><b><i></i></b></p></body></html>',
         '<main/>'),
    )
    for markup, fragment in xml_cases:
        assert fragment in extract(html.fromstring(markup), **xml_opts)

    # wild div
    wild = html.fromstring(
        '<html><body><article><div><strong>Wild text</strong></div></article></body></html>')
    as_xml = extract(wild, **xml_opts)
    assert '<p>' in as_xml
    assert '<hi>Wild text</hi>' in as_xml  # no rend so far
    # the same tree is extracted a second time, as plain text
    assert extract(wild, config=ZERO_CONFIG) == 'Wild text'

    # links, then line-breaks: (markup, expected plain text)
    text_cases = (
        ('<html><body><p><a href="">Link text</a></p></body></html>',
         'Link text'),
        ('<html><body><p><br/></p></body></html>',
         ''),
        ('<html><body><p><br/>Here is the text.</p></body></html>',
         'Here is the text.'),
    )
    for markup, expected in text_cases:
        assert extract(html.fromstring(markup), config=ZERO_CONFIG) == expected

    # handle formatting tails
    styled = etree.Element("hi")
    styled.text, styled.tail = 'Here is the text.', 'And a tail.'
    wrapped = handle_formatting(styled)
    serialized = etree.tostring(wrapped)
    assert serialized == b'<p><hi>Here is the text.</hi>And a tail.</p>'
def test_tei():
    '''Check TEI output validation and header writing from a Document.'''
    tei_opts = dict(no_fallback=True, output_format='xmltei')

    # local resources are used to avoid redownloading at each run
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html')) as infile:
        page = infile.read()
    unvalidated = extract(page, "mocked", tei_validation=False, **tei_opts)
    validated = extract(page, "mocked", tei_validation=True, **tei_opts)
    assert unvalidated is not None
    assert unvalidated == validated
    assert xml.validate_tei(etree.fromstring(unvalidated)) is True
    # the raw HTML page itself is not valid TEI
    assert xml.validate_tei(etree.fromstring(page)) is False

    # second sample file, with and without comments
    with open(os.path.join(RESOURCES_DIR, 'http_sample.html')) as infile:
        page = infile.read()
    for with_comments in (True, False):
        tei_doc = extract(page, "mocked", include_comments=with_comments,
                          tei_validation=False, **tei_opts)
        assert tei_doc is not None
        assert xml.validate_tei(etree.fromstring(tei_doc)) is True

    # same page, with a record ID added to the metadata
    tei_doc = extract(page, "mocked", tei_validation=False, record_id='0001',
                      **tei_opts)
    assert tei_doc is not None
    assert xml.validate_tei(etree.fromstring(tei_doc)) is True

    # test header + metadata: each step sets some attributes, then the
    # header is written again
    root = etree.Element('TEI', xmlns='http://www.tei-c.org/ns/1.0')
    header = etree.SubElement(root, 'teiHeader')
    docmeta = Document()
    steps = (
        {'categories': [], 'tags': [], 'title': 'Title'},
        {'sitename': 'Site Name', 'date': '2021-01-01'},
        {'date': None},
        {'hostname': 'hostname'},
        {'sitename': None, 'license': 'CC BY-SA',
         'url': 'https://test.org/', 'categories': ['cat1', 'cat2']},
        {'date': '2021-01-01'},
        {'title': None, 'sitename': None},
    )
    for changes in steps:
        for field, value in changes.items():
            setattr(docmeta, field, value)
        assert xml.write_fullheader(header, docmeta) is not None
def test_exotic_tags(xmloutput=False):
    '''Cover some edge cases with a specially crafted file and odd elements.'''
    mock_result = load_mock_page(
        'http://exotic_tags', xml_flag=xmloutput, tei_output=True)
    assert 'Teletype text' in mock_result
    assert 'My new car is silver.' in mock_result

    # TEI check on the cached conversion of the same page
    tei_path = os.path.join(TEST_DIR, 'cache', 'exotic_tags_tei.html')
    with open(tei_path) as infile:
        raw_tei = infile.read()
    checked = xml.check_tei(etree.fromstring(raw_tei), 'http://dummy')
    expected_start = b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>'
    assert etree.tostring(checked).startswith(expected_start)

    # misformed HTML declaration
    broken_doctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd"><html><head></head><body><p>ABC</p></body></html>'
    # outputs '012"http://www.w3.org/TR/html4/loose.dtd">\nABC'
    assert 'ABC' in extract(broken_doctype, config=ZERO_CONFIG)

    # empty quote and table elements
    assert handle_quotes(etree.Element('quote')) is None
    assert handle_table(etree.Element('table')) is None

    # p within p
    outer = etree.Element('p')
    inner = etree.Element('p')
    outer.text = '1st part.'
    inner.text = '2nd part.'
    outer.append(inner)
    converted = handle_paragraphs(outer, ['p'], False, ZERO_CONFIG)
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'

    # delete last <lb>
    # NOTE(review): 'converted' was computed before the <lb> is appended, so
    # this only checks that the earlier result is unchanged -- confirm whether
    # handle_paragraphs was meant to be called again here.
    outer.append(etree.Element('lb'))
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
def test_fetch():
    '''Check URL fetching, response handling and request header selection.'''
    # invalid input, error status, undecodable payload, expired certificate
    assert fetch_url('1234') == ''
    assert fetch_url('https://httpbin.org/status/404') is None
    assert decode_response(b'\x1f\x8babcdef') is not None
    assert fetch_url('https://expired.badssl.com/', no_ssl=True) is not None

    # no decoding
    undecoded = fetch_url('https://httpbin.org/status/200', decode=False)
    assert undecoded == ''

    # response object
    target = 'https://httpbin.org/encoding/utf8'
    response = _send_request(target, False, DEFAULT_CONFIG)
    handled = _handle_response(target, response, False, DEFAULT_CONFIG)
    assert handled.data.startswith(b'<h1>Unicode Demo</h1>')
    # straight handling of response object
    assert load_html(response) is not None
    # nothing to see here
    assert extract(response, url=response.geturl(),
                   config=ZERO_CONFIG) is None

    # default config is none
    assert _parse_config(DEFAULT_CONFIG) == (None, None)
    # default user-agent
    default_headers = _determine_headers(DEFAULT_CONFIG)
    assert default_headers['User-Agent'] == USER_AGENT
    assert 'Cookie' not in default_headers

    # user-agents rotation
    expected_cookie = 'yummy_cookie=choco; tasty_cookie=strawberry'
    assert _parse_config(UA_CONFIG) == (['Firefox', 'Chrome'], expected_cookie)
    custom_headers = _determine_headers(UA_CONFIG)
    assert custom_headers['User-Agent'] in ('Chrome', 'Firefox')
    assert custom_headers['Cookie'] == expected_cookie
def load_mock_page(url, xml_flag=False, langcheck=None, tei_output=False):
    '''Load a mock page from the cached samples and run extract() on it.

    Args:
        url: key into MOCK_PAGES, also passed to extract() as the URL.
        xml_flag: forwarded to extract() as xml_output.
        langcheck: forwarded to extract() as target_language.
        tei_output: forwarded to extract() as tei_output.

    Returns:
        Whatever extract() returns for the cached page.
    '''
    filepath = os.path.join(TEST_DIR, 'cache', MOCK_PAGES[url])
    try:
        with open(filepath, 'r') as inputf:
            htmlstring = inputf.read()
    # encoding/windows fix for the tests
    except UnicodeDecodeError:
        # read as binary
        with open(filepath, 'rb') as inputf:
            htmlbinary = inputf.read()
        # Fall back to the raw bytes in every failure case so that
        # htmlstring is always bound before extract() is called; previously
        # a missing encoding guess left it undefined (NameError).
        htmlstring = htmlbinary
        guessed_encoding = chardet.detect(htmlbinary)['encoding']
        if guessed_encoding is not None:
            try:
                htmlstring = htmlbinary.decode(guessed_encoding)
            except (UnicodeDecodeError, LookupError):
                # wrong guess or codec unknown to Python: keep the bytes
                htmlstring = htmlbinary
        else:
            print('Encoding error')
    return extract(
        htmlstring,
        url,
        record_id='0000',
        no_fallback=False,
        xml_output=xml_flag,
        tei_output=tei_output,
        target_language=langcheck,
    )
def test_input():
    '''Check that strings, bytes, compressed data and bad input are handled.'''
    # unsupported input type
    assert utils.load_html(123) is None

    # text and byte strings, including bytes outside of ASCII
    loadable = (
        '<html><body>ÄÖÜ</body></html>',
        b'<html><body>\x2f\x2e\x9f</body></html>',
        '<html><body>\x2f\x2e\x9f</body></html>'.encode('latin-1'),
    )
    for candidate in loadable:
        assert utils.load_html(candidate) is not None

    # missing input
    assert extract(None, 'url', '0000', target_language=None) is None

    # GZip
    gz_path = os.path.join(RESOURCES_DIR, 'webpage.html.gz')
    with open(gz_path, 'rb') as gzfile:
        compressed = gzfile.read()
    assert 'Long story short,' in extract(compressed)

    # legacy
    assert process_record(None, 'url', '0000', target_language=None) is None
def test_external():
    '''Check tree sanitizing and language filtering with external components.

    The tag collection uses a set comprehension instead of building a
    throwaway list inside set().
    '''
    # remove unwanted elements
    mydoc = html.fromstring(
        '<html><body><footer>Test text</footer></body></html>')
    _, _, mylen = sanitize_tree(mydoc)
    assert mylen == 0
    mydoc = html.fromstring(
        '<html><body><table><th>Test text</th><tr><td>Test</td></tr></table></body></html>')
    _, _, mylen = sanitize_tree(mydoc)
    assert mylen > 0

    # strip fancy tags while including links and images
    mydoc = html.fromstring(
        '<html><body><p>Text here <fancy>Test text</fancy><a href="">with a link</a>.</p><img src="test.jpg"/></body></html>')
    mytree, _, _ = sanitize_tree(
        mydoc, include_links=False, include_images=False)
    assert len(mytree) == 1
    mydoc = html.fromstring(
        '<html><body><p>Text here <fancy>Test text</fancy><a href="">with a link</a>.</p><img src="test.jpg"/></body></html>')
    mytree, _, _ = sanitize_tree(
        mydoc, include_links=True, include_images=True)
    myelems = {element.tag for element in mytree.iter()}
    assert 'graphic' in myelems and 'ref' in myelems

    # test langid
    if LANGID_FLAG is True:
        doc = html.fromstring(
            '<html><body>' + '<p>Non è inglese.</p>' * 20 + '</body></html>')
        assert extract(doc, no_fallback=False, target_language='en',
                       deduplicate=False) is None
def test_input():
    '''Check that loaded strings/trees and invalid input are handled properly'''
    # unsupported input type
    assert utils.load_html(123) is None
    # well-formed string
    simple_doc = '<html><body>XYZ</body></html>'
    assert utils.load_html(simple_doc) is not None
    #assert utils.load_html(b'0'*int(10e3)) is None
    # options shared by the current and the legacy entry point
    options = {
        'xml_output': False,
        'tei_output': False,
        'target_language': None,
    }
    assert extract(None, 'url', '0000', **options) is None
    # legacy
    assert process_record(None, 'url', '0000', **options) is None
def test_txttocsv():
    '''Check the CSV, JSON and bare Python outputs'''
    mymeta = dict.fromkeys(METADATA_LIST)
    # row built from metadata whose values are all None
    blank_row = utils.txttocsv('', '', mymeta)
    assert blank_row == 'None\tNone\tNone\tNone\tNone\t\t\n'
    # row built after setting a few fields
    mymeta.update({
        'title': 'Test title',
        'url': 'https://example.org',
        'hostname': 'example.org',
    })
    filled_row = utils.txttocsv('Test text', 'Test comment', mymeta)
    assert filled_row == 'https://example.org\tNone\texample.org\tTest title\tNone\tTest text\tTest comment\n'
    mystring = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'
    assert extract(mystring, csv_output=True) is not None
    csv_without_comments = extract(mystring, csv_output=True, include_comments=False)
    assert csv_without_comments.endswith('\t\n')
    # test json
    json_result = extract(mystring, json_output=True)
    assert json_result.endswith('}')
    # bare extraction for python
    result = bare_extraction(mystring)
    assert isinstance(result, dict)
    assert len(result) == 13
def test_txttocsv():
    '''Check the CSV, JSON and bare Python outputs based on a Document'''
    mymeta = Document()
    # row built from a Document on which no field has been set
    blank_row = utils.txttocsv('', '', mymeta)
    assert blank_row == 'None\tNone\tNone\tNone\tNone\t\t\tNone\n'
    # row built after setting a few fields
    mymeta.title = 'Test title'
    mymeta.url = 'https://example.org'
    mymeta.hostname = 'example.org'
    mymeta.id = '1'
    mymeta.license = 'CC BY-SA'
    filled_row = utils.txttocsv('Test text', 'Test comment', mymeta)
    assert filled_row == '1\thttps://example.org\tNone\texample.org\tTest title\tNone\tTest text\tTest comment\tCC BY-SA\n'
    mystring = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'
    assert extract(mystring, output_format='csv', config=ZERO_CONFIG) is not None
    csv_without_comments = extract(
        mystring, output_format='csv', include_comments=False,
        config=ZERO_CONFIG)
    assert csv_without_comments.endswith('\tNone\n')
    # test json
    result = extract(mystring, output_format='json', config=ZERO_CONFIG)
    assert result.endswith('}') and '"fingerprint":' in result
    json_without_comments = extract(
        mystring, output_format='json', include_comments=False,
        config=ZERO_CONFIG)
    assert json_without_comments.endswith('}')
    # bare extraction for python
    result = bare_extraction(mystring, config=ZERO_CONFIG, as_dict=True)
    assert isinstance(result, dict)
    assert len(result) == 17
def test_exotic_tags(xmloutput=False):
    '''Cover edge cases: exotic tags, broken doctype, nested and malformed elements'''
    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags', xml_flag=xmloutput,
                            tei_output=True)
    assert 'Teletype text' in result and 'My new car is silver.' in result
    filepath = os.path.join(TEST_DIR, 'cache', 'exotic_tags_tei.html')
    with open(filepath) as f:
        content = etree.fromstring(f.read())
    res = xml.check_tei(content, 'http://dummy')
    assert etree.tostring(res).startswith(
        b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>'
    )
    # misformed HTML declaration
    htmlstring = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd"><html><head></head><body><p>ABC</p></body></html>'
    # outputs '012"http://www.w3.org/TR/html4/loose.dtd">\nABC'
    assert 'ABC' in extract(htmlstring, config=ZERO_CONFIG)
    # quotes and tables: empty elements yield nothing
    assert handle_quotes(etree.Element('quote'), False, ZERO_CONFIG) is None
    assert handle_table(etree.Element('table'), False, ZERO_CONFIG) is None
    # p within p
    element, second = etree.Element('p'), etree.Element('p')
    element.text, second.text = '1st part.', '2nd part.'
    element.append(second)
    # delete last <lb>
    element.append(etree.Element('lb'))
    converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG)
    # the expected value has to stay on one physical line: a single-quoted
    # bytes literal cannot contain a raw line break
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
    # malformed lists (common error)
    result = etree.tostring(
        handle_lists(
            etree.fromstring(
                '<list>Description of the list:<item>List item 1</item><item>List item 2</item><item>List item 3</item></list>'
            ), False, ZERO_CONFIG))
    assert result.count(b'List item') == 3
    assert b"Description" in result
    # HTML5: <details>
    htmlstring = '<html><body><article><details><summary>Epcot Center</summary><p>Epcot is a theme park at Walt Disney World Resort featuring exciting attractions, international pavilions, award-winning fireworks and seasonal special events.</p></details></article></body></html>'
    my_result = extract(htmlstring, no_fallback=True, config=ZERO_CONFIG)
    print(my_result)
    assert 'Epcot Center' in my_result and 'award-winning fireworks' in my_result
    my_result = extract(htmlstring, no_fallback=False, config=ZERO_CONFIG)
    assert 'Epcot Center' in my_result and 'award-winning fireworks' in my_result
def test_extraction_options():
    '''Exercise the parameters accepted by extract() and bare_extraction()'''
    my_html = '<html><head><meta http-equiv="content-language" content="EN"/></head><body><div="article-body"><p>Text.<!-- comment --></p></div></body></html>'
    # this call is expected to raise a NameError
    with pytest.raises(NameError):
        extract(my_html, json_output=True)
    # outcome depends on the configuration in use
    assert extract(my_html, config=NEW_CONFIG) is None
    assert extract(my_html, config=ZERO_CONFIG) is not None
    # metadata requirements
    with_meta = extract(my_html, with_metadata=True, output_format='xml',
                        config=ZERO_CONFIG)
    assert with_meta is None
    only_with_meta = extract(my_html, only_with_metadata=True,
                             output_format='xml', config=ZERO_CONFIG)
    assert only_with_meta is None
    # target language set to 'de'
    assert extract(my_html, target_language='de', config=ZERO_CONFIG) is None
    justext_body = try_justext(html.fromstring(my_html), None, 'de')
    assert etree.tostring(justext_body) == b'<body/>'
def test_exotic_tags(xmloutput=False):
    '''Cover edge cases: exotic tags and a broken doctype declaration'''
    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags', xmloutput, tei_output=True)
    assert 'Teletype text' in result
    assert 'My new car is silver.' in result
    # run the TEI check on the cached file
    filepath = os.path.join(TEST_DIR, 'cache', 'exotic_tags_tei.html')
    with open(filepath) as f:
        raw_tei = f.read()
    content = etree.fromstring(raw_tei)
    res = xml.check_tei(content, 'http://dummy')
    serialized = etree.tostring(res)
    expected_start = b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>'
    assert serialized.startswith(expected_start)
    # misformed HTML declaration
    htmlstring = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd"><html><head></head><body><p>ABC</p></body></html>'
    # outputs '012"http://www.w3.org/TR/html4/loose.dtd">\nABC'
    extracted = extract(htmlstring)
    assert 'ABC' in extracted
def test_external():
    '''Check external components: tree sanitizing and extraction with a target language'''
    # remove unwanted elements
    footer_doc = html.fromstring('<html><body><footer>Test text</footer></body></html>')
    _, _, mylen = sanitize_tree(footer_doc)
    assert mylen == 0
    # strip fancy tags
    fancy_doc = html.fromstring('<html><body><p>Text here <fancy>Test text</fancy></p></body></html>')
    mytree, _, _ = sanitize_tree(fancy_doc)
    assert len(mytree) == 1
    # justext stoplist
    # if LANGID_FLAG is True:
    paragraphs = '<p>abc</p>' * 10
    doc = html.fromstring('<html><body>' + paragraphs + '</body></html>')
    # only checks that the call completes, the result is not inspected
    result = extract(doc, no_fallback=False, target_language='en')
def test_precision_recall():
    '''Check the precision- and recall-oriented settings'''
    # the test cases could be better
    # plain paragraph: both settings return a result
    my_document = html.fromstring('<html><body><p>This here is the text.</p></body></html>')
    for setting in ('favor_precision', 'favor_recall'):
        assert extract(my_document, config=ZERO_CONFIG, **{setting: True}) is not None
    # teaser: only present in the output when recall is favored
    my_document = html.fromstring('<html><body><div class="article-body"><div class="teaser-content"><p>This here is a teaser text.</p></div><div><p>This here is the text.</p></div></body></html>')
    recall_text = extract(my_document, favor_recall=True, config=ZERO_CONFIG)
    assert 'teaser text' in recall_text
    default_text = extract(my_document, config=ZERO_CONFIG)
    assert 'teaser text' not in default_text
    precision_text = extract(my_document, favor_precision=True, config=ZERO_CONFIG)
    assert 'teaser text' not in precision_text
    # paragraph consisting of links only: the link text is absent with both settings
    my_document = html.fromstring('<html><body><article><div><p><a href="test.html">1.</a><br/><a href="test2.html">2.</a></p></div></article></body></html>')
    for setting in ('favor_recall', 'favor_precision'):
        assert '1' not in extract(my_document, config=ZERO_CONFIG, **{setting: True})