def compare_sanitized_html():
    """
    Report all changes that result from applying sanitize() to
    ContentNode.headnote and TextBlock.content.

    Every object whose field is neither empty nor NULL is run through
    sanitize(). When the sanitized text differs from the stored text, both
    versions are parsed and compared as element trees (with
    tidy_style_attrs=True), and one line per differing object is printed.
    Nothing is written back to the database by this function.
    """
    from main.models import TextBlock, ContentNode
    from main.sanitize import sanitize
    from main.utils import parse_html_fragment, elements_equal

    sanitized_fields = (
        (TextBlock, 'content'),
        (ContentNode, 'headnote'),
    )
    for model, field in sanitized_fields:
        # This must be an f-string; a plain literal prints the braces verbatim.
        print(f"Getting tags from {model.__name__}.{field}")
        queryset = model.objects.exclude(**{field: ''}).exclude(**{field: None})
        # The row count is not computed up front, so tqdm is given an
        # infinite total and displays running statistics only.
        for obj in tqdm(queryset.iterator(), total=float("inf")):
            content = getattr(obj, field)
            sanitized = sanitize(content)
            if content == sanitized:
                continue
            content_tree = parse_html_fragment(content)
            sanitized_tree = parse_html_fragment(sanitized)
            # elements_equal signals a mismatch by raising exc_class (the
            # other helpers in this file rely on that). Name the class
            # explicitly and catch it, so one differing object is reported
            # without aborting the scan of the remaining objects.
            try:
                elements_equal(content_tree, sanitized_tree, tidy_style_attrs=True, exc_class=ValueError)
            except ValueError as e:
                print(f"{model.__name__}-{obj.pk}: {e}")
def assert_docx_equal(path_or_file_a, path_or_file_b):
    """
    Assert that two .docx files are equal.

    Two .docx files are considered equal if all zipped files inside have the
    same contents, except for docProps/core.xml which contains a timestamp.

    This function compares member names and CRCs first. If both archives hold
    the same members but some CRCs differ, each differing member is decoded
    and the contents are compared as an xml tree.

    If the trees differ, we recommend running the tests with --pdb to drop
    into the debugger at the moment of failure, and inspecting the differing
    elements using lxml/etree utilities, e.g.
    ... etree.tostring(e1)
    ... etree.tostring(e2)

    Args:
        path_or_file_a: path or file-like object of the first .docx archive.
        path_or_file_b: path or file-like object of the second .docx archive.

    Raises:
        AssertionError: if the archives contain different sets of members, or
            if a member with differing bytes also differs as an xml tree.
    """
    ignored = 'docProps/core.xml'
    with ZipFile(path_or_file_a) as zip_a, ZipFile(path_or_file_b) as zip_b:
        crcs_a = {f.filename: f.CRC for f in zip_a.infolist() if f.filename != ignored}
        crcs_b = {f.filename: f.CRC for f in zip_b.infolist() if f.filename != ignored}

        # Quick comparison: same members with the same checksums.
        if crcs_a == crcs_b:
            return

        # A member present in only one archive is a difference in itself.
        # Checking this up front catches extra members in the second archive
        # and avoids a KeyError when reading a member the second one lacks.
        if crcs_a.keys() != crcs_b.keys():
            only_a = sorted(crcs_a.keys() - crcs_b.keys())
            only_b = sorted(crcs_b.keys() - crcs_a.keys())
            raise AssertionError(
                f"Archives contain different files: only in first: {only_a}; only in second: {only_b}"
            )

        # Slow comparison: byte-level differences may still be equivalent xml.
        # Only members whose CRC differs need parsing; identical members
        # (including any non-xml ones) are already known to match.
        for filename, crc in crcs_a.items():
            if crc == crcs_b[filename]:
                continue
            data_a = etree.XML(zip_a.read(filename))
            data_b = etree.XML(zip_b.read(filename))
            assert elements_equal(data_a, data_b, exc_class=AssertionError)
def assert_html_equal(bytes_a, bytes_b):
    """
    Assert that two UTF-8 encoded HTML fragments parse to equal trees.

    Both inputs are decoded as UTF-8, parsed with parse_html_fragment, and
    compared with elements_equal using ignore_trailing_whitespace=True.

    If the trees differ, we recommend running the tests with --pdb to drop
    into the debugger at the moment of failure, and inspecting the differing
    elements using lxml/etree utilities, e.g.
    > etree.tostring(e1, method='html')
    > etree.tostring(e2, method='html')
    """
    tree_a, tree_b = (
        parse_html_fragment(raw.decode('utf-8'))
        for raw in (bytes_a, bytes_b)
    )
    assert elements_equal(tree_a, tree_b, ignore_trailing_whitespace=True, exc_class=AssertionError)