def htmldiff(old_html, new_html):
    """Return an HTML diff between ``old_html`` and ``new_html``.

    Both inputs are tokenized with ``include_hrefs=False``. If tokenizing
    either input raises ``KeyError`` or ``ParserError``, ``new_html`` is
    returned unchanged. Otherwise the diffed tokens are joined, stripped of
    surrounding whitespace, and passed through ``fixup_ins_del_tags``.
    """
    try:
        before_tokens = tokenize(old_html, include_hrefs=False)
        after_tokens = tokenize(new_html, include_hrefs=False)
    except (KeyError, ParserError):
        # Tokenizing failed: fall back to the new markup as-is.
        return new_html

    merged = ''.join(htmldiff_tokens(before_tokens, after_tokens))
    return fixup_ins_del_tags(merged.strip())
def htmldiff(old_html, new_html):
    """Diff two HTML strings; a trimmed-down ``lxml.html.diff.htmldiff``.

    Differences from the upstream function, as noted by the original author:

    * Tokenization uses ``include_hrefs=False``: the " Link: href " output
      is hard to fix up and is believed not to be needed here.
    * ``fixup_ins_del_tags`` is not applied, since it re-parses everything
      and is not needed here.

    Returns the diffed tokens joined into one string, with surrounding
    whitespace stripped.
    """
    before_tokens = lxml_diff.tokenize(old_html, include_hrefs=False)
    after_tokens = lxml_diff.tokenize(new_html, include_hrefs=False)
    diffed_chunks = lxml_diff.htmldiff_tokens(before_tokens, after_tokens)
    return ''.join(diffed_chunks).strip()
def compare_html(self, original_text, output_text):
    """Assert that two HTML documents are equivalent.

    Runs two checks in order: a token-level comparison, then a strict
    string-equality comparison. For the token-level step, only the text,
    <img> tags and <a href=***> attributes in the HTML are diffed.
    """
    # Step 1: compare the tokenized content alone. This confirms the content
    # matches, whether or not some non-visible structural elements were
    # added, removed or modified.
    output_tokens = tokenize(output_text)
    original_tokens = tokenize(original_text)
    opcodes = InsensitiveSequenceMatcher(
        a=output_tokens, b=original_tokens
    ).get_opcodes()

    # Identical content yields exactly one opcode, tagged 'equal'.
    self.assertEqual(len(opcodes), 1)
    self.assertEqual('equal', opcodes[0][0])

    # Step 2: the real equality test between the original and the
    # de-encapsulated copy.
    self.assertEqual(original_text, output_text)