예제 #1
0
def compare_sanitized_html():
    """
        Report all changes that result from applying sanitize() to ContentNode.headnote and TextBlock.content.

        For each (model, field) pair, every object whose field is neither '' nor None is loaded, its field
        value is passed through sanitize(), and -- only when the sanitized string differs from the stored
        one -- both versions are parsed with parse_html_fragment() and compared with elements_equal().
        Differences are printed together with the id of the offending object.
    """
    from main.models import TextBlock, ContentNode
    from main.sanitize import sanitize
    from main.utils import parse_html_fragment, elements_equal

    sanitized_fields = (
        (TextBlock, 'content'),
        (ContentNode, 'headnote'),
    )
    for model, field in sanitized_fields:
        # The original message used {placeholders} without an f-prefix and so printed the braces
        # literally; format it the same way the sibling reporting tasks do.
        print("Getting tags from %s.%s" % (model.__name__, field))
        queryset = model.objects.exclude(**{field: ''}).exclude(**{field: None})
        for obj in tqdm(queryset.iterator(), total=float("inf")):
            content = getattr(obj, field)
            sanitized = sanitize(content)
            if content == sanitized:
                # Sanitizing is a no-op for this object; nothing to report.
                continue
            content_tree = parse_html_fragment(content)
            sanitized_tree = parse_html_fragment(sanitized)
            try:
                elements_equal(content_tree,
                               sanitized_tree,
                               tidy_style_attrs=True)
            except ValueError as e:
                # NOTE(review): assumes elements_equal() signals a mismatch by raising ValueError
                # when no exc_class is given -- confirm against main.utils. Catching it lets the
                # scan continue past the first difference instead of aborting.
                print("Error comparing %s:\n%s" % (obj.id, e))
예제 #2
0
파일: test_export.py 프로젝트: ktfhale/h2o
def assert_html_equal(bytes_a, bytes_b):
    """
        Assert that two UTF-8 encoded HTML fragments compare equal as parsed element trees.

        Each argument is decoded as UTF-8 and parsed with parse_html_fragment(); the resulting trees are
        handed to elements_equal() with ignore_trailing_whitespace=True and exc_class=AssertionError.

        Debugging tip: if the trees differ, run the tests with --pdb to drop into the debugger at the
        moment of failure, and inspect the differing elements using lxml/etree utilities, e.g.
        > etree.tostring(e1, method='html')
        > etree.tostring(e2, method='html')
    """
    tree_a, tree_b = (
        parse_html_fragment(raw.decode('utf-8')) for raw in (bytes_a, bytes_b)
    )
    assert elements_equal(tree_a, tree_b, ignore_trailing_whitespace=True, exc_class=AssertionError)
예제 #3
0
def report_tags():
    """
        Report all HTML tags, attributes, and styles used in ContentNode.headnote and TextBlock.content.

        Three summaries are printed:
          * every tag seen, mapped to the set of attribute names used on it;
          * for every tag that carried a style attribute, the style property names used on it;
          * the union of all style property names across all tags.
    """
    from main.models import TextBlock, ContentNode
    from main.utils import parse_html_fragment
    from pprint import pprint
    import re

    # Compiled once rather than per style attribute. url(...) values are blanked out before the
    # declaration is split -- presumably because a URL may itself contain ':' or ';'.
    url_value = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*')

    tags = {}        # tag -> set of attribute names seen on that tag
    tag_styles = {}  # tag -> set of style property names seen on that tag

    sanitized_fields = ((TextBlock, 'content'), (ContentNode, 'headnote'))

    for model, field in sanitized_fields:
        print("Getting tags from %s.%s" % (model.__name__, field))
        queryset = model.objects.exclude(**{field: ''}).exclude(**{field: None})
        # Stream rows with .iterator(), as the sibling reporting tasks do, instead of
        # caching the whole result set on the queryset.
        for obj in tqdm(queryset.iterator(), total=float("inf")):
            tree = parse_html_fragment(getattr(obj, field))
            for el in tree.iter():
                attr_names = tags.setdefault(el.tag, set())
                for k, v in el.items():
                    attr_names.add(k)
                    if k != 'style':
                        continue
                    style_names = tag_styles.setdefault(el.tag, set())
                    for pair in url_value.sub(' ', v).split(';'):
                        name = pair.split(':', 1)[0].strip().lower()
                        # A trailing ';' yields an empty declaration; never record it.
                        if name:
                            style_names.add(name)

    print("Tags and attributes in use:")
    pprint(tags)

    print("Styles in use:")
    for tag in sorted(tag_styles):
        # Sorted so the report is stable from run to run.
        print("%s[%s]" % (tag, ",".join(sorted(tag_styles[tag]))))

    print("Unique styles in use:")
    # Build the complete union first and print it once; printing inside the accumulation loop
    # emitted a partial result for every tag.
    unique_styles = set().union(*tag_styles.values())
    print(" ".join(sorted(unique_styles)))
예제 #4
0
def compare_sanitized_html():
    """
        Report all changes that result from applying sanitize() to ContentNode.headnote and TextBlock.content.

        For each (model, field) pair, every object whose field is neither '' nor None is loaded and its
        field value is passed through sanitize(). When the sanitized string differs from the stored one,
        both are parsed with parse_html_fragment() and compared element by element; the first structural
        difference found for that object is printed along with the object's id.
    """
    import difflib
    from main.models import TextBlock, ContentNode
    from main.sanitize import sanitize
    from main.utils import parse_html_fragment

    def local_name(tag):
        """ Return the tag name with any '{namespace}' prefix removed. """
        return tag.rsplit('}', 1)[-1]

    def comparable_attrs(element, ignore_attrs):
        """
            Return the element's attributes as a dict, minus ignored ones, with the style value tidied.
        """
        attrs = {
            k: v
            for k, v in element.attrib.items() if k not in ignore_attrs
        }
        if attrs.get('style'):
            # allow easy comparison of sanitized style tags by removing all spaces and final semicolon
            attrs['style'] = attrs['style'].replace(' ', '').rstrip(';')
        return attrs

    def elements_equal(e1, e2, ignore=None):
        """
            Recursively compare two lxml Elements. Raise ValueError if not identical.

            `ignore` is an optional dict that may contain:
              'attrs': set of attribute names to skip on every element;
              'tag_attrs': dict mapping a tag name to a set of attribute names to skip on that tag;
              'tags': collection of child tag names to leave out of the comparison entirely.
        """
        # None instead of a shared mutable {} default.
        if ignore is None:
            ignore = {}
        if e1.tag != e2.tag:
            raise ValueError("e1.tag != e2.tag (%s != %s)" % (e1.tag, e2.tag))
        if e1.text != e2.text:
            diff = '\n'.join(difflib.ndiff([e1.text or ''], [e2.text or '']))
            raise ValueError("e1.text != e2.text:\n%s" % diff)
        if e1.tail != e2.tail:
            raise ValueError("e1.tail != e2.tail (%s != %s)" %
                             (e1.tail, e2.tail))
        ignore_attrs = ignore.get('attrs', set()) | ignore.get(
            'tag_attrs', {}).get(local_name(e1.tag), set())
        # Each side is tidied independently. Previously e2's style was indexed whenever e1 had one,
        # so a style attribute removed by sanitizing raised KeyError instead of being reported.
        e1_attrib = comparable_attrs(e1, ignore_attrs)
        e2_attrib = comparable_attrs(e2, ignore_attrs)
        if e1_attrib != e2_attrib:
            diff = "\n".join(difflib.Differ().compare(
                ["%s: %s" % i for i in sorted(e1_attrib.items())],
                ["%s: %s" % i for i in sorted(e2_attrib.items())]))
            raise ValueError("e1.attrib != e2.attrib:\n%s" % diff)
        ignored_tags = ignore.get('tags', ())
        s1 = [i for i in e1 if local_name(i.tag) not in ignored_tags]
        s2 = [i for i in e2 if local_name(i.tag) not in ignored_tags]
        if len(s1) != len(s2):
            diff = "\n".join(difflib.Differ().compare([s.tag for s in s1],
                                                      [s.tag for s in s2]))
            raise ValueError("e1 children != e2 children:\n%s" % diff)
        for c1, c2 in zip(s1, s2):
            elements_equal(c1, c2, ignore)

    sanitized_fields = (
        (TextBlock, 'content'),
        (ContentNode, 'headnote'),
    )
    for model, field in sanitized_fields:
        print("Getting tags from %s.%s" % (model.__name__, field))
        queryset = model.objects.exclude(**{field: ''}).exclude(**{field: None})
        for obj in tqdm(queryset.iterator(), total=float("inf")):
            content = getattr(obj, field)
            sanitized = sanitize(content)
            if content == sanitized:
                # Sanitizing is a no-op for this object; nothing to report.
                continue
            content_tree = parse_html_fragment(content)
            sanitized_tree = parse_html_fragment(sanitized)
            try:
                elements_equal(content_tree, sanitized_tree)
            except ValueError as e:
                print("Error comparing %s:\n%s" % (obj.id, e))