def test_single_element(self):
    """A bytes source holding one ``<img>`` tag yields exactly one link tuple."""
    source = BytesIO(b'<img src="#">')
    context = iterparse(source)
    elements = list(context)  # consume the generator

    # assertEqual reports the actual length on failure; assertTrue would
    # only say "False is not true".
    self.assertEqual(len(elements), 1)

    el, attr, url, pos = elements.pop()
    self.assertEqual(url, '#')
    self.assertEqual(pos, 0)
    self.assertEqual(attr, 'src')
    self.assertEqual(el.tag, 'img')
    self.assertEqual(el.attrib, {'src': '#'})
def test_source_is_not_bytes(self):
    """A text (``StringIO``) source is parsed the same way as a bytes source."""
    source = StringIO('<img src="#">')
    context = iterparse(source)
    lst = list(context)

    # assertEqual shows the real length when the expectation is not met.
    self.assertEqual(len(lst), 1)

    el, attr, url, pos = lst.pop()
    self.assertEqual(url, '#')
    self.assertEqual(pos, 0)
    self.assertEqual(attr, 'src')
    self.assertEqual(el.tag, 'img')
    self.assertEqual(el.attrib, {'src': '#'})
def test_meta_charset_insertion(self):
    """With ``include_meta_charset_tag=True`` an empty source gains a ``<meta charset>``."""
    empty_source = BytesIO(b'')
    context = iterparse(empty_source, include_meta_charset_tag=True)

    # Nothing to extract from an empty document.
    extracted = list(context)
    self.assertEqual(extracted, [])

    # Exactly one <meta> element must have been inserted into the tree.
    found = context.root.xpath('//meta')
    self.assertEqual(len(found), 1)

    meta = found[-1]
    self.assertEqual(meta.tag, 'meta')
    self.assertEqual(meta.attrib['charset'], 'iso-8859-1')

    # The inserted element serializes like a freshly built <meta> element.
    serialized_meta = tostring(meta)
    reference_meta = lxml.html.Element('meta', charset='iso-8859-1')
    self.assertEqual(serialized_meta, tostring(reference_meta))

    # The whole tree serializes with the <meta> element wrapped in <head>.
    expected_document = b'<html><head>%s</head></html>' % tostring(meta)
    self.assertEqual(tostring(context.root), expected_document)
def test_root_post_iteration(self):
    """Once the iterator is exhausted, ``context.root`` is an lxml element."""
    source = BytesIO(b'<img src="#">')
    context = iterparse(source)
    list(context)  # consume the generator

    # Dedicated assertions print the offending value on failure.
    self.assertIsNotNone(context.root)
    self.assertIsInstance(context.root, lxml.etree._Element)
def test_root_pre_iteration(self):
    """Before any iteration, ``context.root`` exists but is still ``None``."""
    source = BytesIO(b'<img src="#">')
    context = iterparse(source)

    self.assertTrue(hasattr(context, 'root'))
    # assertIsNone shows what the root actually was if it is already set.
    self.assertIsNone(context.root)
def test_return_type(self):
    """``iterparse`` returns an object that satisfies the ``Iterator`` ABC."""
    source = BytesIO(b'<img src="#">')
    context = iterparse(source)

    # assertIsInstance names the actual type in the failure message.
    self.assertIsInstance(context, Iterator)
class TestFullLatin1EncodedHTMLParsing(unittest.TestCase):
    """Verify every link tuple extracted from the latin1-encoded ``html`` fixture.

    The fixture (the module-level ``html`` text, encoded to latin1 with
    ``'replace'`` error handling) is parsed once per class in
    ``setUpClass`` and the yielded ``(el, attr, url, pos)`` tuples are
    stored in ``links``.  Each test inspects one stored tuple by index, so
    the tests no longer depend on running in alphabetical order and any of
    them can be run on its own.
    """

    # Number of link tuples the fixture is expected to yield (tests a-k).
    expected_link_count = 11

    @classmethod
    def setUpClass(cls):
        """Parse the fixture once and keep both the context and its output."""
        # Single instance of the parser to avoid creating anew each time.
        cls.context = iterparse(
            BytesIO(html.encode('latin1', 'replace')), encoding='latin1')
        # Drain the iterator up front; tests index into the result instead
        # of each calling next() on shared state.
        cls.links = list(cls.context)

    def _check_link(self, index, tag, attr, url, pos, attrib):
        """Assert that the ``index``-th extracted tuple matches the given values."""
        el, found_attr, found_url, found_pos = self.links[index]
        self.assertEqual(el.tag, tag)
        self.assertEqual(found_url, url)
        self.assertEqual(found_pos, pos)
        self.assertEqual(found_attr, attr)
        self.assertEqual(el.attrib, attrib)

    # NOTE: Methods are ordered in the sequence the links are parsed.

    def test_a_first_meta_element(self):
        """The refresh ``<meta>`` yields the URL found inside ``content``."""
        self._check_link(
            0, 'meta', 'content', 'http://nx-domain.com/redirect', 2,
            {
                'http-equiv': 'refresh',
                'content': "3;http://nx-domain.com/redirect"
            })

    def test_b_first_link_element(self):
        """The first ``<link>`` yields its relative stylesheet href."""
        self._check_link(
            1, 'link', 'href', 'css/main.css', 0,
            {
                'rel': 'stylesheet',
                'href': "css/main.css"
            })

    def test_c_second_link_element(self):
        """The second ``<link>`` yields its absolute stylesheet href."""
        self._check_link(
            2, 'link', 'href', 'http://nx-domain.com/css/style.css', 0,
            {
                'rel': 'stylesheet',
                'href': 'http://nx-domain.com/css/style.css'
            })

    def test_d_style_tag_css_url_element(self):
        """A URL in ``<style>`` text is yielded with ``attr`` set to None."""
        self._check_link(3, 'style', None, "img/background.png", 49, {})

    def test_e_style_tag_css_import_element(self):
        """A second URL from the ``<style>`` text is yielded with ``attr`` None."""
        self._check_link(4, 'style', None, "css/theme.css", 10, {})

    def test_f_first_anchor_element(self):
        """The first ``<a>`` yields a bare fragment href."""
        self._check_link(5, 'a', 'href', "#", 0, {'href': '#'})

    def test_g_second_anchor_element(self):
        """The second ``<a>`` yields its ``javascript:`` href unchanged."""
        self._check_link(
            6, 'a', 'href', "javascript:void(0);", 0,
            {'href': "javascript:void(0);"})

    def test_h_third_anchor_element(self):
        """The third ``<a>`` yields its absolute href."""
        self._check_link(
            7, 'a', 'href', "http://new-site.com", 0,
            {'href': "http://new-site.com"})

    def test_i_inline_style_element(self):
        """A URL inside an inline ``style`` attribute is yielded for the ``<div>``."""
        self._check_link(
            8, 'div', 'style', "img/background.png", 17,
            {'style': "background: url('img/background.png');"})

    def test_j_first_img_element(self):
        """The first ``<img>`` yields its relative ``src``."""
        self._check_link(
            9, 'img', 'src', "img/img1.png", 0,
            {'src': "img/img1.png", 'alt': 'img1-alt'})

    def test_k_second_img_element(self):
        """The second ``<img>`` yields its absolute ``src``."""
        self._check_link(
            10, 'img', 'src', "http://static-site.com/img/img3.png", 0,
            {
                'src': "http://static-site.com/img/img3.png",
                'alt': ""
            })

    def test_y_empty_iterator(self):
        """The fixture yields exactly the expected links and then stays exhausted."""
        # Preserves the original guarantee that nothing follows the last
        # expected link.
        self.assertEqual(len(self.links), self.expected_link_count)
        with self.assertRaises(StopIteration):
            next(self.context)
        self.assertEqual(len(list(self.context)), 0)

    def test_z_root_tree_attribute(self):
        """After parsing, ``context.root`` is an lxml element with a root tree."""
        root = self.context.root
        self.assertTrue(hasattr(root, 'getroottree'))
        self.assertIsInstance(root, lxml.etree.ElementBase)
        self.assertIsInstance(root.getroottree(), lxml.etree._ElementTree)
def test_source_is_empty(self):
    """An empty bytes source yields no links and a bare ``<html>`` root."""
    context = iterparse(BytesIO(b''))

    # Exhaust the iterator first; the root is only inspected afterwards.
    extracted = list(context)
    self.assertEqual(extracted, [])

    serialized_root = tostring(context.root)
    self.assertEqual(serialized_root, b'<html></html>')
def test_source_without_read_method(self):
    """Passing ``object()`` or ``None`` as the source raises ``TypeError``."""
    # Checked in the same order as before: a plain object first, then None.
    for invalid_source in (object(), None):
        with self.assertRaises(TypeError):
            iterparse(invalid_source)
def test_source_is_empty_string(self):
    """An empty ``str`` passed directly as the source raises ``TypeError``."""
    # Callable form of assertRaises: invokes iterparse('') and expects TypeError.
    self.assertRaises(TypeError, iterparse, '')