def setUp(self):
    """Attach a newly constructed HTMLVisitor to the test as self.visitor."""
    self.visitor = HTMLVisitor()
class TestVisitor(TestCase):
    """Tests that run HTMLVisitor over parse trees built from html_grammar."""

    def setUp(self):
        """Create the HTMLVisitor instance that the tests visit with."""
        self.visitor = HTMLVisitor()

    def _visit_rule(self, rule, source):
        """Parse source with html_grammar[rule]; return the visitor output."""
        return self.visitor.visit(html_grammar[rule].parse(source))

    def assert_attrs(self, text, expected_attrs):
        """Assert the 'attrs' rule output for text equals expected_attrs.

        The visitor output is compared through its as_dict() form.
        """
        attributes = self._visit_rule('attrs', text)
        self.assertEqual(attributes.as_dict(), expected_attrs)

    def test_attrs_empty(self):
        """An empty attribute string gives an empty dict."""
        self.assert_attrs('', {})

    def test_attrs_mixed(self):
        """Double-quoted, single-quoted and unquoted values are all read."""
        self.assert_attrs(
            'foo="bar" key=\'value\' selected=1',
            {'foo': 'bar', 'key': 'value', 'selected': 1})

    def test_option_selected(self):
        """An <option> with a bare 'selected' attribute is an HTMLElement."""
        element = self._visit_rule(
            'option_element',
            '<option value="value2" selected>Value 2</option>')
        self.assertIsInstance(element, HTMLElement)

    def test_open(self):
        """An opening <a> tag reports its span, tag name and attributes."""
        text = '<a href="http://example.com">'
        open_tag = self._visit_rule('a_open', text)
        self.assertEqual(open_tag.start, 0)
        self.assertEqual(open_tag.end, len(text))
        self.assertEqual(open_tag.tag, 'a')
        self.assertEqual(
            open_tag.attributes.as_dict(), {'href': 'http://example.com'})

    def test_br(self):
        """A <br> element reports its span and tag name."""
        text = '<br>'
        br = self._visit_rule('br_element', text)
        self.assertEqual(br.start, 0)
        self.assertEqual(br.end, len(text))
        self.assertEqual(br.tag, 'br')

    def test_element(self):
        """A <p> element exposes its open tag, close tag and one child."""
        text = '<p>This is a simple paragraph.</p>'
        paragraph = self._visit_rule('p_element', text)
        total = len(text)

        # Whole element.
        self.assertEqual(paragraph.start, 0)
        self.assertEqual(paragraph.end, total)

        # Opening tag ends just past the first '>'.
        opening = paragraph.open_tag
        self.assertEqual(opening.tag, 'p')
        self.assertEqual(opening.start, 0)
        self.assertEqual(opening.end, text.index('>') + 1)

        # Closing tag runs from '</p>' to the end of the text.
        closing = paragraph.close_tag
        self.assertEqual(closing.tag, 'p')
        self.assertEqual(closing.start, text.index('</p>'))
        self.assertEqual(closing.end, total)

        self.assertEqual(paragraph.tag, 'p')
        self.assertEqual(len(paragraph.children), 1)
        self.assertEqual(
            paragraph.children[0].raw, 'This is a simple paragraph.')

    def test_text_block(self):
        """A text block yields one item spanning the whole input."""
        text = 'This is a simple paragraph.'
        items = self._visit_rule('text_block', text)
        self.assertEqual(len(items), 1)
        only = items[0]
        self.assertEqual(only.start, 0)
        self.assertEqual(only.end, len(text))
        self.assertEqual(only.raw, 'This is a simple paragraph.')

    def test_html_simple_element(self):
        """The 'html' rule on a lone paragraph yields one <p> HTMLElement."""
        nodes = self._visit_rule('html', '<p>Simple Paragraph</p>')
        self.assertEqual(len(nodes), 1)
        self.assertIsInstance(nodes[0], HTMLElement)
        self.assertEqual(nodes[0].tag, 'p')

    def test_html_simple_text(self):
        """The 'html' rule on bare text yields one HTMLInterval at 0."""
        nodes = self._visit_rule('html', 'Simple Text')
        self.assertEqual(len(nodes), 1)
        self.assertIsInstance(nodes[0], HTMLInterval)
        self.assertEqual(nodes[0].raw, 'Simple Text')
        self.assertEqual(nodes[0].start, 0)

    def test_html_simple_text_with_offset(self):
        """With visitor.offset set to 100, the text item starts at 100."""
        # Kept inline: the offset is assigned after parsing, before visiting.
        tree = html_grammar['html'].parse('Simple Text')
        self.visitor.offset = 100
        nodes = self.visitor.visit(tree)
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0].raw, 'Simple Text')
        self.assertEqual(nodes[0].start, 100)

    def test_html_complex(self):
        """Two spaced-out paragraphs give text/element/text/element/text."""
        text = ''' <p> Paragraph 1 </p> <p> Paragraph 2 </p> '''
        nodes = self._visit_rule('html', text)
        self.assertEqual(len(nodes), 5)
        expectations = (
            (HTMLText, ''),
            (HTMLElement, '<p>Paragraph 1</p>'),
            (HTMLText, ''),
            (HTMLElement, '<p>Paragraph 2</p>'),
            (HTMLText, ''),
        )
        for node, (node_class, rendered) in zip(nodes, expectations):
            self.assertIsInstance(node, node_class)
            self.assertEqual(str(node), rendered)

    def test_html_with_code(self):
        """A paragraph with inline <code> has text, code, text children."""
        nodes = self._visit_rule('html', '<p>Here is <code>code</code>.</p>')
        self.assertEqual(len(nodes), 1)
        paragraph = nodes[0]
        self.assertIsInstance(paragraph, HTMLElement)
        self.assertEqual('p', paragraph.tag)
        before, code_elem, after = paragraph.children
        self.assertEqual(text_type(before), 'Here is')
        self.assertEqual(text_type(code_elem), '<code>code</code>')
        self.assertEqual(text_type(after), '.')

    def test_html_simple_table(self):
        """A one-cell table nests as table > tr > td > text."""
        nodes = self._visit_rule(
            'html', '<table><tr><td>A very dumb table</td></tr></table>')
        self.assertEqual(len(nodes), 1)

        table = nodes[0]
        self.assertEqual(table.tag, 'table')
        self.assertEqual(len(table.children), 1)

        row = table.children[0]
        self.assertEqual(row.tag, 'tr')
        self.assertEqual(len(row.children), 1)

        cell = row.children[0]
        self.assertEqual(cell.tag, 'td')
        self.assertEqual(len(cell.children), 1)

        self.assertEqual(text_type(cell.children[0]), 'A very dumb table')

    def test_html_empty_tag(self):
        """An empty <td> still has one child, which renders as ''."""
        nodes = self._visit_rule('html', '<td></td>')
        self.assertEqual(len(nodes), 1)
        cell = nodes[0]
        self.assertEqual(cell.tag, 'td')
        self.assertEqual(len(cell.children), 1)
        self.assertEqual(text_type(cell.children[0]), '')

    def test_add_issue(self):
        """add_issue appends a (slug, start, end, {}) tuple to issues."""
        nodes = self._visit_rule('html', '<p>A paragraph</p>')
        self.assertFalse(self.visitor.issues)
        self.assertEqual(len(nodes), 1)
        self.visitor.add_issue('halt_import', nodes[0])
        self.assertEqual(
            self.visitor.issues, [('halt_import', 0, 18, {})])

    def test_html_headers(self):
        """An <h1> is visited as an HnElement, without raising issues."""
        # The backslash after the opening quotes joins the first line, so
        # the string begins directly with '<h1>'.
        text = """\
<h1>An H1 Header</h1>
<p>This is in the h1 section</p>
"""
        nodes = self._visit_rule('html', text)
        self.assertFalse(self.visitor.issues)
        h1_elem, gap, p_elem, tail = nodes
        expectations = (
            (h1_elem, HnElement, '<h1>An H1 Header</h1>'),
            (gap, HTMLText, ''),
            (p_elem, HTMLElement, '<p>This is in the h1 section</p>'),
            (tail, HTMLText, ''),
        )
        for node, node_class, rendered in expectations:
            self.assertIsInstance(node, node_class)
            self.assertEqual(text_type(node), rendered)

    def test_double_quoted_text(self):
        """Double-quoted text is returned without its quotes."""
        self.assertEqual(['text'], self._visit_rule('text', '"text"'))

    def test_double_quoted_escaped_text(self):
        """Backslash-escaped double quotes come back as plain quotes."""
        visited = self._visit_rule('text', '"the \\"text\\"."')
        self.assertEqual(['the "text".'], visited)

    def test_single_quoted_escaped_text(self):
        """A backslash-escaped single quote comes back as a plain quote."""
        visited = self._visit_rule('text', "'I don\\'t like escaped text'")
        self.assertEqual(["I don't like escaped text"], visited)
class TestVisitor(TestCase):
    """Checks of HTMLVisitor output for individual html_grammar rules."""

    def setUp(self):
        """Build the HTMLVisitor stored on self.visitor."""
        self.visitor = HTMLVisitor()

    def assert_attrs(self, text, expected_attrs):
        """Parse text as 'attrs', visit it, and compare as_dict() output."""
        tree = html_grammar['attrs'].parse(text)
        result = self.visitor.visit(tree)
        self.assertEqual(result.as_dict(), expected_attrs)

    def test_attrs_empty(self):
        """No attributes: the dict form is empty."""
        self.assert_attrs('', {})

    def test_attrs_mixed(self):
        """Mixed quoting styles and an unquoted value are all captured."""
        source = 'foo="bar" key=\'value\' selected=1'
        wanted = {'foo': 'bar', 'key': 'value', 'selected': 1}
        self.assert_attrs(source, wanted)

    def test_option_selected(self):
        """Visiting an option element returns an HTMLElement."""
        source = '<option value="value2" selected>Value 2</option>'
        tree = html_grammar['option_element'].parse(source)
        result = self.visitor.visit(tree)
        self.assertIsInstance(result, HTMLElement)

    def test_open(self):
        """Span, tag and attributes of an opening anchor tag."""
        source = '<a href="http://example.com">'
        tree = html_grammar['a_open'].parse(source)
        result = self.visitor.visit(tree)
        self.assertEqual(result.start, 0)
        self.assertEqual(result.end, len(source))
        self.assertEqual(result.tag, 'a')
        self.assertEqual(
            result.attributes.as_dict(),
            {'href': 'http://example.com'},
        )

    def test_br(self):
        """Span and tag of a <br> element."""
        source = '<br>'
        tree = html_grammar['br_element'].parse(source)
        result = self.visitor.visit(tree)
        self.assertEqual(result.start, 0)
        self.assertEqual(result.end, len(source))
        self.assertEqual(result.tag, 'br')

    def test_element(self):
        """Span, tags and single child of a simple paragraph element."""
        source = '<p>This is a simple paragraph.</p>'
        tree = html_grammar['p_element'].parse(source)
        result = self.visitor.visit(tree)
        self.assertEqual(result.start, 0)
        self.assertEqual(result.end, len(source))
        self.assertEqual(result.open_tag.tag, 'p')
        self.assertEqual(result.open_tag.start, 0)
        self.assertEqual(result.open_tag.end, source.index('>') + 1)
        self.assertEqual(result.close_tag.tag, 'p')
        self.assertEqual(result.close_tag.start, source.index('</p>'))
        self.assertEqual(result.close_tag.end, len(source))
        self.assertEqual(result.tag, 'p')
        self.assertEqual(len(result.children), 1)
        first_child = result.children[0]
        self.assertEqual(first_child.raw, 'This is a simple paragraph.')

    def test_text_block(self):
        """The 'text_block' rule gives one item covering all of the text."""
        source = 'This is a simple paragraph.'
        tree = html_grammar['text_block'].parse(source)
        result = self.visitor.visit(tree)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].start, 0)
        self.assertEqual(result[0].end, len(source))
        self.assertEqual(result[0].raw, 'This is a simple paragraph.')

    def test_html_simple_element(self):
        """One paragraph in, one 'p' HTMLElement out."""
        source = '<p>Simple Paragraph</p>'
        tree = html_grammar['html'].parse(source)
        result = self.visitor.visit(tree)
        self.assertEqual(len(result), 1)
        self.assertIsInstance(result[0], HTMLElement)
        self.assertEqual(result[0].tag, 'p')

    def test_html_simple_text(self):
        """Plain text in, one HTMLInterval starting at 0 out."""
        source = 'Simple Text'
        tree = html_grammar['html'].parse(source)
        result = self.visitor.visit(tree)
        self.assertEqual(len(result), 1)
        self.assertIsInstance(result[0], HTMLInterval)
        self.assertEqual(result[0].raw, 'Simple Text')
        self.assertEqual(result[0].start, 0)

    def test_html_simple_text_with_offset(self):
        """Setting visitor.offset to 100 moves the item's start to 100."""
        source = 'Simple Text'
        tree = html_grammar['html'].parse(source)
        self.visitor.offset = 100
        result = self.visitor.visit(tree)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].raw, 'Simple Text')
        self.assertEqual(result[0].start, 100)

    def test_html_complex(self):
        """Paragraphs separated by whitespace alternate with HTMLText."""
        source = ''' <p> Paragraph 1 </p> <p> Paragraph 2 </p> '''
        tree = html_grammar['html'].parse(source)
        result = self.visitor.visit(tree)
        self.assertEqual(len(result), 5)
        self.assertIsInstance(result[0], HTMLText)
        self.assertEqual(str(result[0]), '')
        self.assertIsInstance(result[1], HTMLElement)
        self.assertEqual(str(result[1]), '<p>Paragraph 1</p>')
        self.assertIsInstance(result[2], HTMLText)
        self.assertEqual(str(result[2]), '')
        self.assertIsInstance(result[3], HTMLElement)
        self.assertEqual(str(result[3]), '<p>Paragraph 2</p>')
        self.assertIsInstance(result[4], HTMLText)
        self.assertEqual(str(result[4]), '')

    def test_html_with_code(self):
        """Children of a paragraph holding a <code> element."""
        source = '<p>Here is <code>code</code>.</p>'
        tree = html_grammar['html'].parse(source)
        result = self.visitor.visit(tree)
        self.assertEqual(len(result), 1)
        paragraph = result[0]
        self.assertIsInstance(paragraph, HTMLElement)
        self.assertEqual('p', paragraph.tag)
        lead, code_node, trail = paragraph.children
        self.assertEqual(text_type(lead), 'Here is')
        self.assertEqual(text_type(code_node), '<code>code</code>')
        self.assertEqual(text_type(trail), '.')

    def test_html_simple_table(self):
        """Each level of a one-cell table has exactly one child."""
        source = '<table><tr><td>A very dumb table</td></tr></table>'
        tree = html_grammar['html'].parse(source)
        result = self.visitor.visit(tree)
        self.assertEqual(len(result), 1)
        table_node = result[0]
        self.assertEqual(table_node.tag, 'table')
        self.assertEqual(len(table_node.children), 1)
        tr_node = table_node.children[0]
        self.assertEqual(tr_node.tag, 'tr')
        self.assertEqual(len(tr_node.children), 1)
        td_node = tr_node.children[0]
        self.assertEqual(td_node.tag, 'td')
        self.assertEqual(len(td_node.children), 1)
        content = td_node.children[0]
        self.assertEqual(text_type(content), 'A very dumb table')

    def test_html_empty_tag(self):
        """An empty cell has a single child rendering as the empty string."""
        source = '<td></td>'
        tree = html_grammar['html'].parse(source)
        result = self.visitor.visit(tree)
        self.assertEqual(len(result), 1)
        td_node = result[0]
        self.assertEqual(td_node.tag, 'td')
        self.assertEqual(len(td_node.children), 1)
        content = td_node.children[0]
        self.assertEqual(text_type(content), '')

    def test_add_issue(self):
        """Issues start empty; add_issue records slug, span and {}."""
        source = '<p>A paragraph</p>'
        tree = html_grammar['html'].parse(source)
        result = self.visitor.visit(tree)
        self.assertFalse(self.visitor.issues)
        self.assertEqual(len(result), 1)
        paragraph = result[0]
        self.visitor.add_issue('halt_import', paragraph)
        recorded = self.visitor.issues
        self.assertEqual(recorded, [('halt_import', 0, 18, {})])

    def test_html_headers(self):
        """A header followed by a paragraph: types and rendered text."""
        # Backslash-newline right after the opening quotes keeps the
        # string from starting with a newline.
        source = """\
<h1>An H1 Header</h1>
<p>This is in the h1 section</p>
"""
        tree = html_grammar['html'].parse(source)
        result = self.visitor.visit(tree)
        self.assertFalse(self.visitor.issues)
        header, spacer1, paragraph, spacer2 = result
        self.assertIsInstance(header, HnElement)
        self.assertEqual(text_type(header), '<h1>An H1 Header</h1>')
        self.assertIsInstance(spacer1, HTMLText)
        self.assertEqual(text_type(spacer1), '')
        self.assertIsInstance(paragraph, HTMLElement)
        self.assertEqual(
            text_type(paragraph), '<p>This is in the h1 section</p>')
        self.assertIsInstance(spacer2, HTMLText)
        self.assertEqual(text_type(spacer2), '')

    def test_double_quoted_text(self):
        """The 'text' rule drops surrounding double quotes."""
        source = '"text"'
        tree = html_grammar['text'].parse(source)
        result = self.visitor.visit(tree)
        self.assertEqual(['text'], result)

    def test_double_quoted_escaped_text(self):
        """Escaped double quotes inside double-quoted text are unescaped."""
        source = '"the \\"text\\"."'
        tree = html_grammar['text'].parse(source)
        result = self.visitor.visit(tree)
        self.assertEqual(['the "text".'], result)

    def test_single_quoted_escaped_text(self):
        """An escaped single quote inside single-quoted text is unescaped."""
        source = "'I don\\'t like escaped text'"
        tree = html_grammar['text'].parse(source)
        result = self.visitor.visit(tree)
        self.assertEqual(["I don't like escaped text"], result)