def test_remove_insignificant_text_nodes():
    # The expected output keeps the spacing inside <p> but has none between
    # the table tags.
    # NOTE(review): the fixture is on a single line in this view; its
    # original line breaks may have been collapsed -- confirm.
    markup = dedent(''' <html> <head /> <body> <p> one <em>two</em> <strong>three</strong> </p> <table> <tr> <td>stuff</td> </tr> </table> </body> </html> ''')
    expected = (
        '<p> one <em>two</em> <strong>three</strong> </p> '
        '<table><tr><td>stuff</td></tr></table>'
    )

    # Pass 1 cleans the fixture; pass 2 re-parses the cleaned output to check
    # that the operation is idempotent.
    for _ in range(2):
        tree = parse_minidom(markup)
        remove_insignificant_text_nodes(tree)
        markup = minidom_tostring(tree)
        assert_equal(markup, expected)
def test_remove_insignificant_text_nodes():
    def clean(markup):
        # Parse, strip the insignificant text nodes, and serialize again.
        tree = parse_minidom(markup)
        remove_insignificant_text_nodes(tree)
        return minidom_tostring(tree)

    # NOTE(review): the fixture is on a single line in this view; its
    # original line breaks may have been collapsed -- confirm.
    source = dedent(""" <html> <head /> <body> <p> one <em>two</em> <strong>three</strong> </p> <table> <tr> <td>stuff</td> </tr> </table> </body> </html> """)
    # Parenthesised so the adjacent literals stay a single expression.
    expected = (
        "<p> one <em>two</em> <strong>three</strong> </p> "
        "<table><tr><td>stuff</td></tr></table>"
    )

    first_pass = clean(source)
    assert_equal(first_pass, expected)

    # Check that it is idempotent: cleaning the cleaned output changes nothing.
    second_pass = clean(first_pass)
    assert_equal(second_pass, expected)
def test(original, distributed):
    # Apply distribute() to the node found at location [0] of the parsed
    # `original` markup, then compare the serialized result with the
    # serialized parse of the `distributed` markup.
    actual_dom = parse_minidom(original)
    expected_dom = parse_minidom(distributed)

    distribute(get_location(actual_dom, [0]))

    actual_html = minidom_tostring(actual_dom)
    expected_html = minidom_tostring(expected_dom)
    assert_html_equal(actual_html, expected_html)
def test_parse_comments():
    # Each case pairs input markup with the serialization expected after a
    # parse round trip; in every case the comment is absent from the output.
    cases = (
        ('<!-- -->', ''),
        ('<!--\n-->', ''),
        ('<p>stuff<!-- \n -->stuff</p>', '<p>stuffstuff</p>'),
    )
    for source, expected in cases:
        serialized = minidom_tostring(parse_minidom(source))
        assert_html_equal(serialized, expected)
def diff(old_html, new_html, cutoff=0.0, plaintext=False, pretty=False):
    """Render the changes between two documents as marked-up html.

    The result is the document html with <ins> tags wrapped around newly
    added sections and <del> tags wrapped around deleted sections.

    Parameters:
        old_html: the previous version of the document.
        new_html: the new version of the document.
        cutoff: forwarded to check_text_similarity; when that check fails a
            fixed "too large" notice is returned instead of a diff.
        plaintext: parse both inputs with parse_text instead of
            parse_minidom, and skip the list/table fix-ups.
        pretty: forwarded to minidom_tostring.

    Returns the serialized contents of the <body> element when the diffed
    document has exactly one, otherwise the serialized whole document.
    """
    parse = parse_text if plaintext else parse_minidom
    old_dom = parse(old_html)
    new_dom = parse(new_html)

    # Bail out with a notice when the documents are not similar enough.
    if not check_text_similarity(old_dom, new_dom, cutoff):
        return (
            '<h2>The differences from the previous version are too large to '
            'show concisely.</h2>'
        )

    changes = dom_diff(old_dom, new_dom)

    if not plaintext:
        # HTML-specific cleanup.
        fix_lists(changes)
        fix_tables(changes)

    # Only the document body contents are returned, when there is one body.
    bodies = changes.getElementsByTagName('body')
    root = bodies[0] if len(bodies) == 1 else changes
    return minidom_tostring(root, pretty=pretty)
def diff(old_html, new_html, cutoff=0.0, plaintext=False, pretty=False):
    """Return html showing how new_html differs from old_html.

    Added sections are wrapped in <ins> tags and deleted sections in <del>
    tags.

    With ``plaintext`` set, the inputs are parsed by parse_text and the
    list/table fix-ups are skipped; otherwise parse_minidom is used.
    ``cutoff`` is handed to check_text_similarity, and when that check fails
    a fixed notice is returned in place of the diff. ``pretty`` is passed on
    to minidom_tostring.
    """
    if plaintext:
        old_dom, new_dom = parse_text(old_html), parse_text(new_html)
    else:
        old_dom, new_dom = parse_minidom(old_html), parse_minidom(new_html)

    # If the two documents are not similar enough, don't show the changes.
    similar_enough = check_text_similarity(old_dom, new_dom, cutoff)
    if not similar_enough:
        # Parenthesised so the adjacent literals stay a single expression.
        return (
            "<h2>The differences from the previous version are too large to "
            "show concisely.</h2>"
        )

    result = dom_diff(old_dom, new_dom)

    # HTML-specific cleanup.
    if not plaintext:
        fix_lists(result)
        fix_tables(result)

    # Serialize the whole document unless there is exactly one <body>, in
    # which case only that element is serialized.
    body_elements = result.getElementsByTagName("body")
    if len(body_elements) != 1:
        return minidom_tostring(result, pretty=pretty)
    return minidom_tostring(body_elements[0], pretty=pretty)
def test_remove_insignificant_text_nodes_nbsp():
    # The cell contents must survive the cleanup while the whitespace between
    # the table tags is dropped.
    # NOTE(review): the test name suggests the cells hold non-breaking
    # spaces, and the fixture's line breaks look collapsed in this view --
    # confirm the exact characters in both literals.
    markup = dedent(""" <table> <tbody> <tr> <td> </td> <td> </td> <td> </td> </tr> </tbody> </table> """)
    expected = (
        "<table><tbody><tr><td> </td><td> </td><td> </td>"
        "</tr></tbody></table>"
    )

    tree = parse_minidom(markup)
    remove_insignificant_text_nodes(tree)
    assert_equal(minidom_tostring(tree), expected)
def test_remove_insignificant_text_nodes_nbsp():
    # NOTE(review): the test name suggests the cells hold non-breaking
    # spaces, and the fixture's line breaks look collapsed in this view --
    # confirm the exact characters in both literals.
    source = dedent(''' <table> <tbody> <tr> <td> </td> <td> </td> <td> </td> </tr> </tbody> </table> ''')

    document = parse_minidom(source)
    remove_insignificant_text_nodes(document)
    cleaned = minidom_tostring(document)

    # Cell contents are kept; nothing remains between the table tags.
    wanted = (
        '<table><tbody><tr><td> </td><td> </td><td> </td>'
        '</tr></tbody></table>'
    )
    assert_equal(cleaned, wanted)
def test():
    # Parse old_html once, then check two serializations of the same DOM.
    tree = parse_minidom(old_html)

    serialized = minidom_tostring(tree)
    assert_equal(serialized, target)

    # The toxml() output is compared after remove_xml_declaration is applied.
    raw = remove_xml_declaration(tree.toxml())
    assert_equal(raw, target_raw)
def test():
    # Parse `changes` with strict_xml enabled, run fix_tables on the DOM, and
    # compare the serialized result with `fixed_changes`.
    tree = parse_minidom(changes, strict_xml=True)
    fix_tables(tree)
    actual = minidom_tostring(tree)
    assert_html_equal(actual, fixed_changes)
def test():
    # Parse `changes`, run fix_lists on the DOM, and compare the serialized
    # result with `fixed_changes`.
    tree = parse_minidom(changes)
    fix_lists(tree)
    actual = minidom_tostring(tree)
    assert_html_equal(actual, fixed_changes)
def strip_changes_new(html):
    """Return `html` re-serialized after _strip_changes_new has run on it.

    The markup is parsed with parse_minidom and the resulting DOM is handed
    to _strip_changes_new. That helper's return value is ignored; the same
    DOM object is then serialized with minidom_tostring.
    """
    tree = parse_minidom(html)
    _strip_changes_new(tree)
    return minidom_tostring(tree)
def html_patch(old_html, edit_script):
    """Apply `edit_script` to `old_html` and return the serialized result.

    The markup is parsed with parse_minidom and passed through
    split_text_nodes before an EditScriptRunner is built from the DOM and
    the script. Whatever run_edit_script() returns is serialized with
    minidom_tostring.
    """
    tree = parse_minidom(old_html)
    split_text_nodes(tree)
    patched = EditScriptRunner(tree, edit_script).run_edit_script()
    return minidom_tostring(patched)
def reverse_changes_html(changes):
    """Return the `changes` markup re-serialized after reverse_changes.

    The markup is parsed with parse_minidom and the DOM is handed to
    reverse_changes. That call's return value is ignored; the same DOM
    object is then serialized with minidom_tostring.
    """
    tree = parse_minidom(changes)
    reverse_changes(tree)
    return minidom_tostring(tree)
def test_xml_diff():
    # Each case is (name, old markup, new markup, expected diff). The name is
    # unpacked but not used by the check itself.
    for _name, before, after, expected in test_cases:
        old_dom = parse_minidom(before, strict_xml=True)
        new_dom = parse_minidom(after, strict_xml=True)
        changes_dom = dom_diff(old_dom, new_dom)
        assert_equal(minidom_tostring(changes_dom), expected)
def remove_attributes(html):
    """Return `html` re-serialized after remove_dom_attributes has run on it.

    The markup is parsed with parse_minidom and the DOM is handed to
    remove_dom_attributes. That call's return value is ignored; the same DOM
    object is then serialized with minidom_tostring.
    """
    tree = parse_minidom(html)
    remove_dom_attributes(tree)
    return minidom_tostring(tree)