def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, use_cache=True):
    """Return comparison data for two bill text versions.

    Both bills are looked up by id, an empty version string is resolved
    with ``get_current_version``, and the two documents are run through
    ``xml_diff.compare``, which marks up both DOMs in place.

    Args:
        left_bill, right_bill: ids passed to ``Bill.objects.get(id=...)``.
        left_version, right_version: version codes; ``""`` selects the
            bill's current version.
        timelimit: accepted for interface compatibility; this
            implementation does not pass it on to the comparison.
        use_cache: when true (the default), a stored
            ``BillTextComparison`` row is returned if one exists (in
            either bill order) and a freshly computed result is stored.

    Returns:
        A dict with the keys ``left_meta``, ``right_meta``, ``left_text``
        and ``right_text``.

    Raises:
        IOError: if a document has neither an ``xml_file`` nor an
            ``html_file`` entry in its metadata.
    """
    from billtext import load_bill_text, get_current_version
    from xml_diff import compare
    # Import the submodule explicitly: a bare "import lxml" does not
    # guarantee that lxml.etree is loaded.
    import lxml.etree

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    if left_version == "":
        left_version = get_current_version(left_bill)
    if right_version == "":
        right_version = get_current_version(right_bill)

    if use_cache:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(
                bill1=left_bill, ver1=left_version,
                bill2=right_bill, ver2=right_version)
        except BillTextComparison.DoesNotExist:
            pass
        else:
            btc.decompress()
            return btc.data

        # Load from cache - try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(
                bill2=left_bill, ver2=left_version,
                bill1=right_bill, ver1=right_version)
        except BillTextComparison.DoesNotExist:
            pass
        else:
            btc2.decompress()
            data = btc2.data
            # un-swap
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }

    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")

    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)

    def make_tag_func(ins_del):
        # The same wrapper element is used whatever ins_del is.
        return lxml.etree.Element("comparison-change")

    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func)

    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    if use_cache:
        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(
            bill1=left_bill, ver1=left_version,
            bill2=right_bill, ver2=right_version,
            data=dict(ret))  # clone before compress()
        btc.compress()
        btc.save()

    # Return JSON comparison data.
    return ret
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, use_cache=True, force_update=False):
    """Return comparison data for two bill text versions.

    Both bills are looked up by id, an empty version string is resolved
    with ``get_current_version``, and the two documents are run through
    ``xml_diff.compare``, which marks up both DOMs in place.

    Args:
        left_bill, right_bill: ids passed to ``Bill.objects.get(id=...)``.
        left_version, right_version: version codes; ``""`` selects the
            bill's current version.
        timelimit: time limit handed to ``diff_match_patch.diff``.
        use_cache: when true, a stored ``BillTextComparison`` row is
            returned if one exists (in either bill order), and a freshly
            computed result is stored.
        force_update: when true, the cache is never read; the comparison
            is recomputed and the stored row is replaced.

    Returns:
        A dict with the keys ``left_meta``, ``right_meta``, ``left_text``
        and ``right_text``.

    Raises:
        IOError: if a document has neither an ``xml_file`` nor an
            ``html_file`` entry in its metadata.
    """
    from .billtext import load_bill_text, get_current_version
    from xml_diff import compare
    # Import the submodule explicitly: a bare "import lxml" does not
    # guarantee that lxml.etree is loaded.
    import lxml.etree

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    if left_version == "":
        left_version = get_current_version(left_bill)
    if right_version == "":
        right_version = get_current_version(right_bill)

    # A forced update must recompute, so it is never answered from cache.
    if use_cache and not force_update:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(
                bill1=left_bill, ver1=left_version,
                bill2=right_bill, ver2=right_version)
        except BillTextComparison.DoesNotExist:
            pass
        else:
            btc.decompress()
            return btc.data

        # Load from cache - try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(
                bill2=left_bill, ver2=left_version,
                bill1=right_bill, ver1=right_version)
        except BillTextComparison.DoesNotExist:
            pass
        else:
            btc2.decompress()
            data = btc2.data
            # un-swap
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }

    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")

    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)

    def make_tag_func(ins_del):
        # The same wrapper element is used whatever ins_del is.
        return lxml.etree.Element("comparison-change")

    def differ(text1, text2):
        # ensure we use the C++ Google DMP and can specify the time limit
        import diff_match_patch
        yield from diff_match_patch.diff(text1, text2, timelimit=timelimit)

    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func, differ=differ)

    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1, encoding=str),
        "right_text": lxml.etree.tostring(doc2, encoding=str),
    }

    if use_cache or force_update:
        # For force_update, or race conditions, delete any existing record.
        fltr = {
            "bill1": left_bill, "ver1": left_version,
            "bill2": right_bill, "ver2": right_version,
        }
        BillTextComparison.objects.filter(**fltr).delete()

        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(
            data=dict(ret),  # clone before compress()
            **fltr)
        btc.compress()
        btc.save()

    # Return JSON comparison data.
    return ret
def assertEqualXML(self, xml1, xml2):
    """Parse both XML strings and pass the parsed elements to ``compare``.

    Returns whatever ``compare`` returns for the two root elements.
    """
    parsed = [lxml.etree.fromstring(source) for source in (xml1, xml2)]
    return compare(*parsed)
# Command-line driver: compare two XML files with xml_diff.compare and
# print both marked-up documents wrapped in a <documents> element.
import sys

import lxml.etree

from xml_diff import compare

# make an alias for Py3
if sys.version_info >= (3, ):
    unicode = str

args = sys.argv[1:]

# Consume the optional "--tags a,b" pair first, so that the argument
# count check below sees only the two file names.
tags = ['del', 'ins']
if len(args) >= 2 and args[0] == "--tags":
    tags = args[1].split(",")
    args = args[2:]

# Anything short of two file names (including a dangling "--tags") is a
# usage error rather than an IndexError further down.
if len(args) < 2:
    print("Usage: python3 xml_diff.py [--tags del,ins] before.xml after.xml")
    sys.exit(1)

# Load the documents and munge them in-place.
dom1 = lxml.etree.parse(args[0]).getroot()
dom2 = lxml.etree.parse(args[1]).getroot()
compare(dom1, dom2, tags=tags)

# Output changed documents.
output = lxml.etree.Element("documents")
output.append(dom1)
output.append(dom2)
print(lxml.etree.tostring(output, encoding=unicode))
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, use_cache=True, force_update=False):
    """Return comparison data for two bill text versions.

    Both bills are looked up by id, an empty version string is resolved
    with ``get_current_version``, and the two documents are run through
    ``xml_diff.compare``, which marks up both DOMs in place.

    Args:
        left_bill, right_bill: ids passed to ``Bill.objects.get(id=...)``.
        left_version, right_version: version codes; ``""`` selects the
            bill's current version.
        timelimit: time limit handed to ``diff_match_patch.diff_unicode``.
        use_cache: when true, a stored ``BillTextComparison`` row is
            returned if one exists (in either bill order), and a freshly
            computed result is stored.
        force_update: when true, the cache is never read; the comparison
            is recomputed and the stored row is replaced.

    Returns:
        A dict with the keys ``left_meta``, ``right_meta``, ``left_text``
        and ``right_text``.

    Raises:
        IOError: if a document has neither an ``xml_file`` nor an
            ``html_file`` entry in its metadata.
    """
    from billtext import load_bill_text, get_current_version
    from xml_diff import compare
    # Import the submodule explicitly: a bare "import lxml" does not
    # guarantee that lxml.etree is loaded.
    import lxml.etree

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    if left_version == "":
        left_version = get_current_version(left_bill)
    if right_version == "":
        right_version = get_current_version(right_bill)

    # A forced update must recompute, so it is never answered from cache.
    if use_cache and not force_update:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(
                bill1=left_bill, ver1=left_version,
                bill2=right_bill, ver2=right_version)
        except BillTextComparison.DoesNotExist:
            pass
        else:
            btc.decompress()
            return btc.data

        # Load from cache - try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(
                bill2=left_bill, ver2=left_version,
                bill1=right_bill, ver1=right_version)
        except BillTextComparison.DoesNotExist:
            pass
        else:
            btc2.decompress()
            data = btc2.data
            # un-swap
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }

    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")

    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)

    def make_tag_func(ins_del):
        # The same wrapper element is used whatever ins_del is.
        return lxml.etree.Element("comparison-change")

    def differ(text1, text2):
        # ensure we use the C++ Google DMP and can specify the time limit
        import diff_match_patch
        for x in diff_match_patch.diff_unicode(text1, text2, timelimit=timelimit):
            yield x

    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func, differ=differ)

    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    if use_cache or force_update:
        # For force_update, or race conditions, delete any existing record.
        fltr = {
            "bill1": left_bill, "ver1": left_version,
            "bill2": right_bill, "ver2": right_version,
        }
        BillTextComparison.objects.filter(**fltr).delete()

        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(
            data=dict(ret),  # clone before compress()
            **fltr)
        btc.compress()
        btc.save()

    # Return JSON comparison data.
    return ret
def create_diff(version1, version2, output_fn):
    """Write an HTML diff of two HTML report versions plus a percent change.

    Args:
        version1: path of the first HTML fragment file.
        version2: path of the second HTML fragment file.
        output_fn: path the diff HTML is written to. The percent change
            is written next to it, to ``output_fn`` with ".html" replaced
            by "-pctchg.txt" (or with "-pctchg.txt" appended when
            ``output_fn`` contains no ".html").

    Returns:
        None. Nothing is written if loading either file raises
        ValueError.
    """

    def load_html(fn):
        # Open file.
        with open(fn) as f:
            doc = f.read()

        # Parse DOM. It's a fragment so we need to use parseFragment,
        # which returns a list which we re-assemble into a node.
        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            fragment = html5lib.parseFragment(doc, treebuilder="lxml")
        dom = lxml.etree.Element("div")
        for node in fragment:
            dom.append(node)

        # NOTE: comments are not removed here. xml_diff can't handle
        # them, but they seem to already be stripped by the HTML
        # sanitization.

        # Take everything out of the HTML namespace so
        # that when we serialize at the end there are no
        # namespaces and it's plain HTML.
        for node in dom.xpath("//*"):
            node.tag = node.tag.replace("{http://www.w3.org/1999/xhtml}", "")

        return (doc, dom)

    try:
        version1_text, version1_dom = load_html(version1)
        version2_text, version2_dom = load_html(version2)
    except ValueError:
        # Best effort: a document that cannot be loaded produces no diff.
        return

    # Compute diff. Each DOM is updated in place with
    # <ins>/<del> tags.
    xml_diff.compare(version1_dom, version2_dom, merge=True)

    # Serialize. If we used tostring like normal, we'd get
    # the extra <div> that we wrapped the fragment in. So
    # serialize what's inside of the div and concatenate.
    diff_html = "".join(
        lxml.etree.tostring(n, encoding=str, method="html")
        if isinstance(n, lxml.etree._Element)
        else str(n)
        for n in version1_dom.xpath("node()"))

    # Also compute a percent change.
    percent_change = 1.0 - difflib.SequenceMatcher(
        None, version1_text, version2_text).quick_ratio()

    # Derive the percent-change file name. If output_fn has no ".html"
    # in it, replace() returns it unchanged and the second write below
    # would overwrite the diff, so append the suffix instead.
    pctchg_fn = output_fn.replace(".html", "-pctchg.txt")
    if pctchg_fn == output_fn:
        pctchg_fn = output_fn + "-pctchg.txt"

    # Save.
    with open(output_fn, "w") as f:
        f.write(diff_html)
    with open(pctchg_fn, "w") as f:
        f.write(str(percent_change))
# Command-line driver: compare two XML files with xml_diff.compare and
# print both marked-up documents wrapped in a <documents> element.
import sys

import lxml.etree

from xml_diff import compare

# make an alias for Py3
if sys.version_info >= (3,):
    unicode = str

args = sys.argv[1:]

# Consume the optional "--tags a,b" pair first, so that the argument
# count check below sees only the two file names.
tags = ['del', 'ins']
if len(args) >= 2 and args[0] == "--tags":
    tags = args[1].split(",")
    args = args[2:]

# Anything short of two file names (including a dangling "--tags") is a
# usage error rather than an IndexError further down.
if len(args) < 2:
    print("Usage: python3 xml_diff.py [--tags del,ins] before.xml after.xml")
    sys.exit(1)

# Load the documents and munge them in-place.
dom1 = lxml.etree.parse(args[0]).getroot()
dom2 = lxml.etree.parse(args[1]).getroot()
compare(dom1, dom2, tags=tags)

# Output changed documents.
output = lxml.etree.Element("documents")
output.append(dom1)
output.append(dom2)
print(lxml.etree.tostring(output, encoding=unicode))