Exemplo n.º 1
0
def load_comparison(left_bill,
                    left_version,
                    right_bill,
                    right_version,
                    timelimit=10):
    """Return comparison data for two bill text versions.

    A previously stored ``BillTextComparison`` record (in either
    left/right order) is returned when one exists; otherwise the two
    documents are loaded, compared with ``xml_diff.compare`` and the
    result is stored for later calls.

    Args:
        left_bill: Value passed as ``id`` to ``Bill.objects.get`` to look
            up the left-hand bill.
        left_version: Text version of the left bill. The empty string
            selects whatever ``get_current_version`` returns.
        right_bill: Same as ``left_bill``, for the right-hand side.
        right_version: Same as ``left_version``, for the right-hand side.
        timelimit: Accepted but not used by this implementation.

    Returns:
        A dict with the keys ``left_meta``, ``right_meta``, ``left_text``
        and ``right_text``. On a fresh computation the ``*_meta`` values
        are the dicts from ``load_bill_text`` with ``docdate`` replaced by
        its ``strftime("%x")`` string, and the ``*_text`` values are the
        serialized documents after the comparison.

    Raises:
        IOError: If the metadata for a bill has neither an ``xml_file``
            nor an ``html_file`` entry.
    """
    from billtext import load_bill_text, get_current_version
    from xml_diff import compare
    # Import the submodule explicitly: a bare ``import lxml`` does not
    # load ``lxml.etree``, so relying on it only works if some other
    # module happened to import lxml.etree first.
    import lxml.etree

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    # An empty version string means "use the current version".
    if left_version == "":
        left_version = get_current_version(left_bill)
    if right_version == "":
        right_version = get_current_version(right_bill)

    use_cache = True

    if use_cache:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(bill1=left_bill,
                                                 ver1=left_version,
                                                 bill2=right_bill,
                                                 ver2=right_version)
            btc.decompress()
            return btc.data
        except BillTextComparison.DoesNotExist:
            pass

        # Load from cache - Try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(bill2=left_bill,
                                                  ver2=left_version,
                                                  bill1=right_bill,
                                                  ver1=right_version)
            btc2.decompress()
            data = btc2.data
            # The record was stored in the opposite order, so un-swap the
            # two sides before returning it.
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass

    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")

    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)

    def make_tag_func(ins_del):
        # The same wrapper element is produced regardless of ins_del.
        return lxml.etree.Element("comparison-change")

    # The return value is not used; the documents are serialized below
    # after the comparison has run on them.
    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func)

    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    if use_cache:
        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(bill1=left_bill,
                                 ver1=left_version,
                                 bill2=right_bill,
                                 ver2=right_version,
                                 data=dict(ret))  # clone before compress()
        btc.compress()
        btc.save()

    # Return JSON comparison data.
    return ret
Exemplo n.º 2
0
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, use_cache=True, force_update=False):
    """Return comparison data for two bill text versions.

    Args:
        left_bill: Value passed as ``id`` to ``Bill.objects.get`` to look
            up the left-hand bill.
        left_version: Text version of the left bill. The empty string
            selects whatever ``get_current_version`` returns.
        right_bill: Same as ``left_bill``, for the right-hand side.
        right_version: Same as ``left_version``, for the right-hand side.
        timelimit: Passed as ``timelimit`` to ``diff_match_patch.diff``.
        use_cache: When true, return a stored ``BillTextComparison``
            record if one exists (in either left/right order) and store
            freshly computed results.
        force_update: When true, skip the stored record, recompute the
            comparison and replace the stored record with the new result.

    Returns:
        A dict with the keys ``left_meta``, ``right_meta``, ``left_text``
        and ``right_text``. On a fresh computation the ``*_meta`` values
        are the dicts from ``load_bill_text`` with ``docdate`` replaced by
        its ``strftime("%x")`` string, and the ``*_text`` values are the
        documents serialized to ``str`` after the comparison.

    Raises:
        IOError: If the metadata for a bill has neither an ``xml_file``
            nor an ``html_file`` entry.
    """
    from .billtext import load_bill_text, get_current_version
    from xml_diff import compare
    # Import the submodule explicitly: a bare ``import lxml`` does not
    # load ``lxml.etree``, so relying on it only works if some other
    # module happened to import lxml.etree first.
    import lxml.etree

    left_bill = Bill.objects.get(id = left_bill)
    right_bill = Bill.objects.get(id = right_bill)

    # An empty version string means "use the current version".
    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)

    # A forced update must not be short-circuited by an existing record,
    # otherwise force_update would have no effect while use_cache is on.
    if use_cache and not force_update:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(
                bill1 = left_bill,
                ver1 = left_version,
                bill2 = right_bill,
                ver2 = right_version)
            btc.decompress()
            return btc.data
        except BillTextComparison.DoesNotExist:
            pass

        # Load from cache - Try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(
                bill2 = left_bill,
                ver2 = left_version,
                bill1 = right_bill,
                ver1 = right_version)
            btc2.decompress()
            data = btc2.data
            # The record was stored in the opposite order, so un-swap the
            # two sides before returning it.
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass

    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")
    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)
    def make_tag_func(ins_del):
        # The same wrapper element is produced regardless of ins_del.
        return lxml.etree.Element("comparison-change")
    def differ(text1, text2):
        # ensure we use the C++ Google DMP and can specify the time limit
        import diff_match_patch
        for x in diff_match_patch.diff(text1, text2, timelimit=timelimit):
            yield x
    # The return value is not used; the documents are serialized below
    # after the comparison has run on them.
    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func, differ=differ)

    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1, encoding=str),
        "right_text": lxml.etree.tostring(doc2, encoding=str),
    }

    if use_cache or force_update:
        # For force_update, or race conditions, delete any existing record.
        fltr = { "bill1": left_bill,
            "ver1": left_version,
            "bill2": right_bill,
            "ver2": right_version }
        BillTextComparison.objects.filter(**fltr).delete()

        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(
            data = dict(ret), # clone before compress()
            **fltr)
        btc.compress()
        btc.save()

    # Return JSON comparison data.
    return ret
Exemplo n.º 3
0
 def assertEqualXML(self, xml1, xml2):
     """Parse both XML strings with lxml and return compare()'s result for them.

     NOTE(review): despite the name, nothing is asserted here; the caller
     only receives whatever compare() returns.
     """
     first_root, second_root = (
         lxml.etree.fromstring(source) for source in (xml1, xml2)
     )
     return compare(first_root, second_root)
Exemplo n.º 4
0
import sys
import lxml.etree
from xml_diff import compare

# make an alias for Py3
if sys.version_info >= (3, ):
    unicode = str

# Command-line arguments without the program name.
args = sys.argv[1:]

# Tag names handed to compare(); "--tags a,b" overrides the default pair.
tags = ['del', 'ins']
if len(args) >= 2 and args[0] == "--tags":
    tags = args[1].split(",")
    args = args[2:]

# Validate only after the optional "--tags" pair has been consumed, so
# that e.g. "--tags del,ins before.xml" prints the usage message instead
# of failing with an IndexError further down.
if len(args) < 2:
    print("Usage: python3 xml_diff.py [--tags del,ins] before.xml after.xml")
    sys.exit(1)

# Load the documents and munge them in-place.
dom1 = lxml.etree.parse(args[0]).getroot()
dom2 = lxml.etree.parse(args[1]).getroot()
compare(dom1, dom2, tags=tags)

# Output changed documents, both wrapped in a single <documents> element.
output = lxml.etree.Element("documents")
output.append(dom1)
output.append(dom2)
print(lxml.etree.tostring(output, encoding=unicode))
Exemplo n.º 5
0
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, use_cache=True, force_update=False):
    """Return comparison data for two bill text versions.

    Args:
        left_bill: Value passed as ``id`` to ``Bill.objects.get`` to look
            up the left-hand bill.
        left_version: Text version of the left bill. The empty string
            selects whatever ``get_current_version`` returns.
        right_bill: Same as ``left_bill``, for the right-hand side.
        right_version: Same as ``left_version``, for the right-hand side.
        timelimit: Passed as ``timelimit`` to
            ``diff_match_patch.diff_unicode``.
        use_cache: When true, return a stored ``BillTextComparison``
            record if one exists (in either left/right order) and store
            freshly computed results.
        force_update: When true, skip the stored record, recompute the
            comparison and replace the stored record with the new result.

    Returns:
        A dict with the keys ``left_meta``, ``right_meta``, ``left_text``
        and ``right_text``. On a fresh computation the ``*_meta`` values
        are the dicts from ``load_bill_text`` with ``docdate`` replaced by
        its ``strftime("%x")`` string, and the ``*_text`` values are the
        serialized documents after the comparison.

    Raises:
        IOError: If the metadata for a bill has neither an ``xml_file``
            nor an ``html_file`` entry.
    """
    from billtext import load_bill_text, get_current_version
    from xml_diff import compare
    # Import the submodule explicitly: a bare ``import lxml`` does not
    # load ``lxml.etree``, so relying on it only works if some other
    # module happened to import lxml.etree first.
    import lxml.etree

    left_bill = Bill.objects.get(id = left_bill)
    right_bill = Bill.objects.get(id = right_bill)

    # An empty version string means "use the current version".
    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)

    # A forced update must not be short-circuited by an existing record,
    # otherwise force_update would have no effect while use_cache is on.
    if use_cache and not force_update:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(
                bill1 = left_bill,
                ver1 = left_version,
                bill2 = right_bill,
                ver2 = right_version)
            btc.decompress()
            return btc.data
        except BillTextComparison.DoesNotExist:
            pass

        # Load from cache - Try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(
                bill2 = left_bill,
                ver2 = left_version,
                bill1 = right_bill,
                ver1 = right_version)
            btc2.decompress()
            data = btc2.data
            # The record was stored in the opposite order, so un-swap the
            # two sides before returning it.
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass

    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")
    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)
    def make_tag_func(ins_del):
        # The same wrapper element is produced regardless of ins_del.
        return lxml.etree.Element("comparison-change")
    def differ(text1, text2):
        # ensure we use the C++ Google DMP and can specify the time limit
        import diff_match_patch
        for x in diff_match_patch.diff_unicode(text1, text2, timelimit=timelimit):
            yield x
    # The return value is not used; the documents are serialized below
    # after the comparison has run on them.
    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func, differ=differ)

    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    if use_cache or force_update:
        # For force_update, or race conditions, delete any existing record.
        fltr = { "bill1": left_bill,
            "ver1": left_version,
            "bill2": right_bill,
            "ver2": right_version }
        BillTextComparison.objects.filter(**fltr).delete()

        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(
            data = dict(ret), # clone before compress()
            **fltr)
        btc.compress()
        btc.save()

    # Return JSON comparison data.
    return ret
Exemplo n.º 6
0
def create_diff(version1, version2, output_fn):
    """Generate an HTML diff of two HTML report versions.

    Args:
        version1: Path of the file holding the first HTML fragment.
        version2: Path of the file holding the second HTML fragment.
        output_fn: Path the diff HTML is written to. The percent change
            is written to a sibling file whose name is ``output_fn`` with
            a trailing ".html" (if present) replaced by "-pctchg.txt".

    Returns:
        None. If loading either input raises ``ValueError``, the function
        returns without writing anything.
    """

    def load_html(fn):
        # Open file.
        with open(fn) as f:
            doc = f.read()

        # Parse DOM. It's a fragment so we need to use parseFragment,
        # which returns a list which we re-assemble into a node.
        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            fragment = html5lib.parseFragment(doc, treebuilder="lxml")

        dom = lxml.etree.Element("div")
        for node in fragment:
            dom.append(node)

        # Comments are not removed here. xml_diff can't handle them, but
        # they seem to already be stripped by the HTML sanitization.

        # Take everything out of the HTML namespace so
        # that when we serialize at the end there are no
        # namespaces and it's plain HTML.
        for node in dom.xpath("//*"):
            node.tag = node.tag.replace("{http://www.w3.org/1999/xhtml}", "")

        return (doc, dom)

    try:
        version1_text, version1_dom = load_html(version1)
        version2_text, version2_dom = load_html(version2)
    except ValueError:
        # Deliberately best-effort: no diff is produced for this pair.
        return

    # Compute diff. Each DOM is updated in place with
    # <ins>/<del> tags.
    xml_diff.compare(version1_dom, version2_dom, merge=True)

    # Serialize. If we used tostring like normal, we'd get
    # the extra <div> that we wrapped the fragment in. So
    # serialize what's inside of the div and concatenate.
    diff_html = "".join(
        lxml.etree.tostring(n, encoding=str, method="html"
                            ) if isinstance(n, lxml.etree._Element) else str(n)
        for n in version1_dom.xpath("node()"))

    # Also compute a percent change.
    percent_change = 1.0 - difflib.SequenceMatcher(
        None, version1_text, version2_text).quick_ratio()

    # Derive the percent-change file name from the suffix only. A plain
    # str.replace would also rewrite ".html" inside directory names, and
    # would leave the name unchanged when output_fn has no ".html" at
    # all, so the second write would overwrite the diff written first.
    html_suffix = ".html"
    if output_fn.endswith(html_suffix):
        pctchg_fn = output_fn[:-len(html_suffix)] + "-pctchg.txt"
    else:
        pctchg_fn = output_fn + "-pctchg.txt"

    # Save.
    with open(output_fn, "w") as f:
        f.write(diff_html)
    with open(pctchg_fn, "w") as f:
        f.write(str(percent_change))
Exemplo n.º 7
0
import sys
import lxml.etree
from xml_diff import compare

# make an alias for Py3
if sys.version_info >= (3,):
	unicode = str

# Command-line arguments without the program name.
args = sys.argv[1:]

# Tag names handed to compare(); "--tags a,b" overrides the default pair.
tags = ['del', 'ins']
if len(args) >= 2 and args[0] == "--tags":
	tags = args[1].split(",")
	args = args[2:]

# Validate only after the optional "--tags" pair has been consumed, so
# that e.g. "--tags del,ins before.xml" prints the usage message instead
# of failing with an IndexError further down.
if len(args) < 2:
	print("Usage: python3 xml_diff.py [--tags del,ins] before.xml after.xml")
	sys.exit(1)

# Load the documents and munge them in-place.
dom1 = lxml.etree.parse(args[0]).getroot()
dom2 = lxml.etree.parse(args[1]).getroot()
compare(dom1, dom2, tags=tags)

# Output changed documents, both wrapped in a single <documents> element.
output = lxml.etree.Element("documents")
output.append(dom1)
output.append(dom2)
print(lxml.etree.tostring(output, encoding=unicode))