def test_exotic_tags(xmloutput=False):
    """Check extraction and conversion on unusual tags and malformed markup.

    Args:
        xmloutput: forwarded to ``load_mock_page`` as ``xml_flag``.
    """
    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags', xml_flag=xmloutput, tei_output=True)
    assert 'Teletype text' in result
    assert 'My new car is silver.' in result
    filepath = os.path.join(TEST_DIR, 'cache', 'exotic_tags_tei.html')
    # explicit encoding: do not depend on the locale default to decode the fixture
    with open(filepath, encoding='utf-8') as f:
        content = etree.fromstring(f.read())
    res = xml.check_tei(content, 'http://dummy')
    assert etree.tostring(res).startswith(
        b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>'
    )
    # malformed HTML declaration
    htmlstring = (
        '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" '
        '2012"http://www.w3.org/TR/html4/loose.dtd">'
        '<html><head></head><body><p>ABC</p></body></html>'
    )
    # outputs '012"http://www.w3.org/TR/html4/loose.dtd">\nABC'
    assert 'ABC' in extract(htmlstring, config=ZERO_CONFIG)
    # quotes
    assert handle_quotes(etree.Element('quote')) is None
    assert handle_table(etree.Element('table')) is None
    # p within p
    element, second = etree.Element('p'), etree.Element('p')
    element.text, second.text = '1st part.', '2nd part.'
    element.append(second)
    converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG)
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
    # delete last <lb>: the element has to be converted *after* the <lb> is
    # appended, otherwise the assertion only re-checks the previous result
    element, second = etree.Element('p'), etree.Element('p')
    element.text, second.text = '1st part.', '2nd part.'
    element.append(second)
    element.append(etree.Element('lb'))
    converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG)
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
def test_exotic_tags(xmloutput=False):
    """Check extraction and TEI validation on a page with unusual tags.

    Args:
        xmloutput: forwarded positionally to ``load_mock_page``.
    """
    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags', xmloutput, tei_output=True)
    assert 'Teletype text' in result
    assert 'My new car is silver.' in result
    filepath = os.path.join(TEST_DIR, 'cache', 'exotic_tags_tei.html')
    # explicit encoding: do not depend on the locale default to decode the fixture
    with open(filepath, encoding='utf-8') as f:
        content = etree.fromstring(f.read())
    res = xml.check_tei(content, 'http://dummy')
    assert etree.tostring(res).startswith(
        b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>'
    )
def test_exotic_tags(xmloutput=False):
    """Check extraction on unusual tags and on a malformed doctype.

    Args:
        xmloutput: forwarded positionally to ``load_mock_page``.
    """
    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags', xmloutput, tei_output=True)
    assert 'Teletype text' in result
    assert 'My new car is silver.' in result
    filepath = os.path.join(TEST_DIR, 'cache', 'exotic_tags_tei.html')
    # explicit encoding: do not depend on the locale default to decode the fixture
    with open(filepath, encoding='utf-8') as f:
        content = etree.fromstring(f.read())
    res = xml.check_tei(content, 'http://dummy')
    assert etree.tostring(res).startswith(
        b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>'
    )
    # malformed HTML declaration
    htmlstring = (
        '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" '
        '2012"http://www.w3.org/TR/html4/loose.dtd">'
        '<html><head></head><body><p>ABC</p></body></html>'
    )
    # outputs '012"http://www.w3.org/TR/html4/loose.dtd">\nABC'
    assert 'ABC' in extract(htmlstring)
def test_exotic_tags(xmloutput=False):
    """Check extraction and conversion on unusual tags and malformed markup.

    Covers the crafted fixture page, a malformed doctype, empty quote/table
    elements, nested paragraphs with a trailing ``<lb>``, a list with leading
    text and the HTML5 ``<details>`` element.

    Args:
        xmloutput: forwarded to ``load_mock_page`` as ``xml_flag``.
    """
    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags', xml_flag=xmloutput, tei_output=True)
    assert 'Teletype text' in result
    assert 'My new car is silver.' in result
    filepath = os.path.join(TEST_DIR, 'cache', 'exotic_tags_tei.html')
    # explicit encoding: do not depend on the locale default to decode the fixture
    with open(filepath, encoding='utf-8') as f:
        content = etree.fromstring(f.read())
    res = xml.check_tei(content, 'http://dummy')
    assert etree.tostring(res).startswith(
        b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>'
    )
    # malformed HTML declaration
    htmlstring = (
        '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" '
        '2012"http://www.w3.org/TR/html4/loose.dtd">'
        '<html><head></head><body><p>ABC</p></body></html>'
    )
    # outputs '012"http://www.w3.org/TR/html4/loose.dtd">\nABC'
    assert 'ABC' in extract(htmlstring, config=ZERO_CONFIG)
    # quotes
    assert handle_quotes(etree.Element('quote'), False, ZERO_CONFIG) is None
    assert handle_table(etree.Element('table'), False, ZERO_CONFIG) is None
    # p within p
    element, second = etree.Element('p'), etree.Element('p')
    element.text, second.text = '1st part.', '2nd part.'
    element.append(second)
    # delete last <lb>
    element.append(etree.Element('lb'))
    converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG)
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
    # malformed lists (common error)
    malformed_list = etree.fromstring(
        '<list>Description of the list:<item>List item 1</item>'
        '<item>List item 2</item><item>List item 3</item></list>'
    )
    result = etree.tostring(handle_lists(malformed_list, False, ZERO_CONFIG))
    assert result.count(b'List item') == 3
    assert b"Description" in result
    # HTML5: <details>
    htmlstring = (
        '<html><body><article><details><summary>Epcot Center</summary>'
        '<p>Epcot is a theme park at Walt Disney World Resort featuring exciting attractions, '
        'international pavilions, award-winning fireworks and seasonal special events.</p>'
        '</details></article></body></html>'
    )
    # both settings of no_fallback must keep the summary and the paragraph
    for no_fallback in (True, False):
        my_result = extract(htmlstring, no_fallback=no_fallback, config=ZERO_CONFIG)
        assert 'Epcot Center' in my_result
        assert 'award-winning fireworks' in my_result
def test_exotic_tags(xmloutput=False):
    """Check extraction and conversion on unusual tags and malformed markup.

    Covers the crafted fixture page, a malformed doctype, empty quote/table
    elements, nested paragraphs with a trailing ``<lb>``, a bare ``<div>``
    with line breaks, a list with leading text, the HTML5 ``<details>``
    element, and nested or link-only tables and lists.

    Args:
        xmloutput: forwarded to ``load_mock_page`` as ``xml_flag``.
    """

    def squash(text):
        # collapse every whitespace run so that XML fragments can be compared
        # without depending on the indentation of the serialised output
        return ' '.join(text.split())

    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags', xml_flag=xmloutput, tei_output=True)
    assert 'Teletype text' in result
    assert 'My new car is silver.' in result
    filepath = os.path.join(TEST_DIR, 'resources', 'exotic_tags_tei.html')
    # explicit encoding: do not depend on the locale default to decode the fixture
    with open(filepath, encoding='utf-8') as f:
        content = etree.fromstring(f.read())
    res = xml.check_tei(content, 'http://dummy')
    assert etree.tostring(res).startswith(
        b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>'
    )
    # malformed HTML declaration
    htmlstring = (
        '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" '
        '2012"http://www.w3.org/TR/html4/loose.dtd">'
        '<html><head></head><body><p>ABC</p></body></html>'
    )
    # outputs '012"http://www.w3.org/TR/html4/loose.dtd">\nABC'
    assert 'ABC' in extract(htmlstring, config=ZERO_CONFIG)
    # quotes
    assert handle_quotes(etree.Element('quote'), False, ZERO_CONFIG) is None
    assert handle_table(etree.Element('table'), TAG_CATALOG, False, ZERO_CONFIG) is None
    # p within p
    element, second = etree.Element('p'), etree.Element('p')
    element.text, second.text = '1st part.', '2nd part.'
    element.append(second)
    # delete last <lb>
    element.append(etree.Element('lb'))
    converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG)
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
    # naked div with <lb>
    htmlstring = '<html><body><main><div>1.<br/>2.<br/>3.<br/></div></main></body></html>'
    assert '1.\n2.\n3.' in extract(htmlstring, no_fallback=True, config=ZERO_CONFIG)
    # malformed lists (common error)
    malformed_list = etree.fromstring(
        '<list>Description of the list:<item>List item 1</item>'
        '<item>List item 2</item><item>List item 3</item></list>'
    )
    result = etree.tostring(handle_lists(malformed_list, False, ZERO_CONFIG))
    assert result.count(b'List item') == 3
    assert b"Description" in result
    # HTML5: <details>
    htmlstring = (
        '<html><body><article><details><summary>Epcot Center</summary>'
        '<p>Epcot is a theme park at Walt Disney World Resort featuring exciting attractions, '
        'international pavilions, award-winning fireworks and seasonal special events.</p>'
        '</details></article></body></html>'
    )
    # both settings of no_fallback must keep the summary and the paragraph
    for no_fallback in (True, False):
        my_result = extract(htmlstring, no_fallback=no_fallback, config=ZERO_CONFIG)
        assert 'Epcot Center' in my_result
        assert 'award-winning fireworks' in my_result
    # tables with nested elements
    htmlstring = (
        '<html><body><article> <table> <tr><td><b>Present Tense</b></td> '
        '<td>I buy</td> <td>you buy</td> <td>he/she/it buys</td> '
        '<td>we buy</td> <td>you buy</td> <td>they buy</td> </tr> '
        '</table></article></body></html>'
    )
    my_result = extract(htmlstring, no_fallback=True, output_format='xml',
                        include_formatting=True, config=ZERO_CONFIG)
    expected_row = (
        '<row> <cell> <hi>Present Tense</hi> </cell> <cell>I buy</cell> '
        '<cell>you buy</cell> <cell>he/she/it buys</cell> <cell>we buy</cell> '
        '<cell>you buy</cell> <cell>they buy</cell> </row>'
    )
    assert squash(expected_row) in squash(my_result)
    # nested list
    htmlstring = (
        '<html><body><article> <ul> <li>Coffee</li> <li>Tea <ul> '
        '<li>Black tea</li> <li>Green tea</li> </ul> </li> <li>Milk</li> </ul> '
        '</article></body></html>'
    )
    my_result = extract(htmlstring, no_fallback=True, output_format='xml', config=ZERO_CONFIG)
    expected_list = (
        '<list> <item>Coffee</item> <item> <item>Tea</item> <list> '
        '<item>Black tea</item> <item>Green tea</item> </list> </item> '
        '<item>Milk</item> </list>'
    )
    assert squash(expected_list) in squash(my_result)
    # table with links
    # todo: further tests and adjustments
    htmlstring = (
        '<html><body><article><table><tr><td><a href="test.html">'
        + 'ABCD'*100
        + '</a></td></tr></table></article></body></html>'
    )
    result = extract(htmlstring, no_fallback=True, output_format='xml', config=ZERO_CONFIG,
                     include_tables=True, include_links=True)
    assert 'ABCD' not in result
    # nested table
    htmlstring = (
        '<html><body><article><table><th>1</th>'
        '<table><tr><td>2</td></tr></table></table></article></body></html>'
    )
    result = extract(htmlstring, no_fallback=True, output_format='xml', config=ZERO_CONFIG,
                     include_tables=True)
    # todo: all elements are there, but output not nested
    # todo: th conversion
    assert '<cell>1</cell>' in result
    assert '<cell>2</cell>' in result