def test_html_layout(self):
    """Check the order of tags ``read_tree`` yields for a small document.

    The document is one ``<html>`` element holding a ``<head>`` with a
    ``<title>`` and a ``<body>`` with a single ``<img>``.
    """
    document = (
        b' <html> <head> <title>hi</title> </head>'
        b' <body> <img> </body> </html>'
    )
    reader = HTMLReader()
    elements = tuple(
        reader.read_tree(io.BytesIO(document), encoding='ascii')
    )

    # Every tag is expected twice, nested like the markup -- presumably one
    # entry when the element opens and one when it closes (confirm against
    # read_tree).
    expected_tags = [
        'html',
        'head',
        'title',
        'title',
        'head',
        'body',
        'img',
        'img',
        'body',
        'html',
    ]

    # Only the leading entries are compared; anything yielded after them is
    # left unconstrained. A single list comparison reports the complete
    # actual sequence on failure, including when too few entries came back.
    actual_tags = [
        element.tag for element in elements[:len(expected_tags)]
    ]
    self.assertEqual(expected_tags, actual_tags)
def test_html_early_html(self):
    """Check that an ``<img>`` outside the ``<html>`` element is still read.

    Each sample document places markup containing an ``<img>`` before or
    after a complete ``<html>...</html>`` element; some also put ``<body>``
    ahead of ``<head>`` or start with a DOCTYPE declaration. For every
    document the last item from ``read_links`` and the fourth-from-last
    item from ``read_tree`` must carry the ``img`` tag.
    """
    reader = HTMLReader()

    # Fragments shared by the longer samples. The leading and trailing
    # spaces are part of the test input.
    loose_doctype = (
        b' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"'
        b' "http://www.w3.org/TR/html4/loose.dtd">'
    )
    head_then_body = (
        b' <html> <head> <title>Download</title> </head>'
        b' <body> <br /> </body> </html> <pre><img></pre> '
    )
    body_then_head = (
        b' <html> <body> <br /> </body>'
        b' <head> <title>Download</title> </head> </html> <pre><img></pre> '
    )

    documents = [
        b'<!DOCTYPE HTML><html></html><img>',
        b'<html></html><img>',
        b'<!DOCTYPE HTML><img><html></html>',
        b'<img><html></html>',
        b'<!DOCTYPE HTML> <html><body></body></html><p><img>',
        b' <html><body></body></html><p><img>',
        loose_doctype + head_then_body,
        head_then_body,
        loose_doctype + body_then_head,
        body_then_head,
    ]

    for test_string in documents:
        # subTest lets the remaining documents run after a failure and
        # names the offending input in the report.
        with self.subTest(document=test_string):
            links = tuple(
                reader.read_links(io.BytesIO(test_string), encoding='ascii')
            )
            self.assertEqual('img', links[-1].tag)

            tree = tuple(
                reader.read_tree(io.BytesIO(test_string), encoding='ascii')
            )
            # Index -4 allows for three entries after the img -- presumably
            # the closing entries of the elements that enclose it (confirm
            # against read_tree).
            self.assertEqual('img', tree[-4].tag)