def test_html_encoding(self):
    """Read a bare ``<img>`` document under every codec in CODEC_NAMES.

    For each codec name the markup is encoded with that codec, handed to
    ``HTMLReader.read_links`` with the same name as ``encoding``, and the
    first element produced is expected to carry the tag ``'html'``.
    """
    html_reader = HTMLReader()

    for codec in CODEC_NAMES:
        stream = io.BytesIO('<img>'.encode(codec))
        # Drain the reader completely before inspecting the result.
        found = tuple(html_reader.read_links(stream, encoding=codec))
        self.assertEqual('html', found[0].tag)
def test_html_early_html(self):
    """Check that ``<img>`` is still found when ``<html>`` closes early.

    Every fixture places an ``<img>`` outside (before or after) an
    already-closed ``<html>`` element, with and without a DOCTYPE and
    with ``<head>``/``<body>`` in either order.  For each fixture:

    * the last element from ``read_links`` must have the tag ``'img'``;
    * the fourth-from-last element from ``read_tree`` must have the tag
      ``'img'``.
    """
    html_reader = HTMLReader()

    documents = (
        b'''<!DOCTYPE HTML><html></html><img>''',
        b'''<html></html><img>''',
        b'''<!DOCTYPE HTML><img><html></html>''',
        b'''<img><html></html>''',
        b'''<!DOCTYPE HTML> <html><body></body></html><p><img>''',
        b''' <html><body></body></html><p><img>''',
        b''' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <head> <title>Download</title> </head> <body> <br /> </body> </html> <pre><img></pre> ''',
        b''' <html> <head> <title>Download</title> </head> <body> <br /> </body> </html> <pre><img></pre> ''',
        b''' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <body> <br /> </body> <head> <title>Download</title> </head> </html> <pre><img></pre> ''',
        b''' <html> <body> <br /> </body> <head> <title>Download</title> </head> </html> <pre><img></pre> ''',
    )

    for document in documents:
        link_stream = io.BytesIO(document)
        link_elements = tuple(
            html_reader.read_links(link_stream, encoding='ascii')
        )
        self.assertEqual('img', link_elements[-1].tag)

        tree_stream = io.BytesIO(document)
        tree_elements = tuple(
            html_reader.read_tree(tree_stream, encoding='ascii')
        )
        # NOTE(review): the -4 offset presumably skips trailing entries
        # emitted after the <img>; confirm against read_tree's output.
        self.assertEqual('img', tree_elements[-4].tag)