def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, force=False):
    """Return comparison data for two bill text versions, using a DB cache.

    Args:
        left_bill: id of the ``Bill`` shown on the left.
        left_version: version code for the left bill; an empty string means
            "use whatever ``get_current_version`` returns for that bill".
        right_bill: id of the ``Bill`` shown on the right.
        right_version: version code for the right bill; an empty string is
            resolved the same way as ``left_version``.
        timelimit: forwarded to ``compare_xml_text`` as ``timelimit``.
        force: when true, ignore any cached ``BillTextComparison`` (in either
            orientation), recompute, and overwrite/create the cached record.

    Returns:
        A dict with the keys ``left_meta``, ``right_meta``, ``left_text`` and
        ``right_text``. On a cache hit the stored ``data`` of the cached
        record is returned (with left/right un-swapped if the record was
        stored in the opposite orientation).
    """
    from billtext import load_bill_text, compare_xml_text, get_current_version
    # Import the submodule explicitly: a bare ``import lxml`` does not load
    # ``lxml.etree`` and only works if some other module imported it first.
    import lxml.etree

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    if left_version == "":
        left_version = get_current_version(left_bill)
    if right_version == "":
        right_version = get_current_version(right_bill)

    # Look for a cached comparison in the requested orientation. When
    # forcing, keep the record around so it can be overwritten below.
    btc = None
    try:
        btc = BillTextComparison.objects.get(
            bill1=left_bill, ver1=left_version,
            bill2=right_bill, ver2=right_version)
        btc.decompress()
        if not force:
            return btc.data
    except BillTextComparison.DoesNotExist:
        pass

    # Try with the bills swapped. Skipped when forcing so that a stale
    # swapped record cannot short-circuit the recomputation.
    if not force:
        try:
            btc2 = BillTextComparison.objects.get(
                bill2=left_bill, ver2=left_version,
                bill1=right_bill, ver1=right_version)
            btc2.decompress()
            data = btc2.data
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass

    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    doc1 = lxml.etree.parse(left["basename"] + ".html")
    doc2 = lxml.etree.parse(right["basename"] + ".html")
    compare_xml_text(doc1, doc2, timelimit=timelimit)  # revises DOMs in-place

    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")

    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    # Store the result: reuse the existing record when one was found (the
    # force case), otherwise create a new one.
    if not btc:
        btc = BillTextComparison(
            bill1=left_bill, ver1=left_version,
            bill2=right_bill, ver2=right_version,
            data=dict(ret))  # clone before compress()
    else:
        btc.data = dict(ret)  # clone before compress()
    btc.compress()
    btc.save()

    return ret
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, use_cache=True, force_update=False):
    """Return comparison data for two bill text versions, using a DB cache.

    Args:
        left_bill: id of the ``Bill`` shown on the left.
        left_version: version code for the left bill; an empty string means
            "use whatever ``get_current_version`` returns for that bill".
        right_bill: id of the ``Bill`` shown on the right.
        right_version: version code for the right bill; an empty string is
            resolved the same way as ``left_version``.
        timelimit: forwarded to ``diff_match_patch.diff`` as ``timelimit``.
        use_cache: when true, return a cached ``BillTextComparison`` if one
            exists (in either orientation) and store freshly computed
            results.
        force_update: when true, skip reading the cache, recompute the
            comparison, and replace any stored record for this orientation.

    Returns:
        A dict with the keys ``left_meta``, ``right_meta``, ``left_text`` and
        ``right_text``. On a cache hit the stored ``data`` of the cached
        record is returned (with left/right un-swapped if the record was
        stored in the opposite orientation).

    Raises:
        IOError: if neither ``xml_file`` nor ``html_file`` is present in the
            metadata returned by ``load_bill_text`` for one of the bills.
    """
    from .billtext import load_bill_text, get_current_version
    from xml_diff import compare
    # Import the submodule explicitly: a bare ``import lxml`` does not load
    # ``lxml.etree`` and only works if some other module imported it first.
    import lxml.etree

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    if left_version == "":
        left_version = get_current_version(left_bill)
    if right_version == "":
        right_version = get_current_version(right_bill)

    # A forced update must not be short-circuited by an existing record,
    # otherwise ``force_update`` would be a no-op whenever the cache is warm.
    if use_cache and not force_update:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(
                bill1=left_bill, ver1=left_version,
                bill2=right_bill, ver2=right_version)
            btc.decompress()
            return btc.data
        except BillTextComparison.DoesNotExist:
            pass

        # Load from cache - Try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(
                bill2=left_bill, ver2=left_version,
                bill1=right_bill, ver1=right_version)
            btc2.decompress()
            data = btc2.data
            # un-swap
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass

    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")

    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)

    def make_tag_func(ins_del):
        # The same wrapper element is produced regardless of ``ins_del``.
        return lxml.etree.Element("comparison-change")

    def differ(text1, text2):
        # ensure we use the C++ Google DMP and can specify the time limit
        import diff_match_patch
        for x in diff_match_patch.diff(text1, text2, timelimit=timelimit):
            yield x

    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func, differ=differ)

    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1, encoding=str),
        "right_text": lxml.etree.tostring(doc2, encoding=str),
    }

    if use_cache or force_update:
        # For force_update, or race conditions, delete any existing record.
        fltr = {
            "bill1": left_bill, "ver1": left_version,
            "bill2": right_bill, "ver2": right_version,
        }
        BillTextComparison.objects.filter(**fltr).delete()

        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(
            data=dict(ret),  # clone before compress()
            **fltr)
        btc.compress()
        btc.save()

    # Return JSON comparison data.
    return ret
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, use_cache=True):
    """Return comparison data for two bill text versions, using a DB cache.

    Args:
        left_bill: id of the ``Bill`` shown on the left.
        left_version: version code for the left bill; an empty string means
            "use whatever ``get_current_version`` returns for that bill".
        right_bill: id of the ``Bill`` shown on the right.
        right_version: version code for the right bill; an empty string is
            resolved the same way as ``left_version``.
        timelimit: accepted for interface compatibility; this implementation
            does not pass it on to ``compare``.
        use_cache: when true (the default, matching the previous hard-coded
            behavior), return a cached ``BillTextComparison`` if one exists
            in either orientation and store freshly computed results.

    Returns:
        A dict with the keys ``left_meta``, ``right_meta``, ``left_text`` and
        ``right_text``. On a cache hit the stored ``data`` of the cached
        record is returned (with left/right un-swapped if the record was
        stored in the opposite orientation).

    Raises:
        IOError: if neither ``xml_file`` nor ``html_file`` is present in the
            metadata returned by ``load_bill_text`` for one of the bills.
    """
    from billtext import load_bill_text, get_current_version
    from xml_diff import compare
    # Import the submodule explicitly: a bare ``import lxml`` does not load
    # ``lxml.etree`` and only works if some other module imported it first.
    import lxml.etree

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    if left_version == "":
        left_version = get_current_version(left_bill)
    if right_version == "":
        right_version = get_current_version(right_bill)

    if use_cache:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(
                bill1=left_bill, ver1=left_version,
                bill2=right_bill, ver2=right_version)
            btc.decompress()
            return btc.data
        except BillTextComparison.DoesNotExist:
            pass

        # Load from cache - Try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(
                bill2=left_bill, ver2=left_version,
                bill1=right_bill, ver1=right_version)
            btc2.decompress()
            data = btc2.data
            # un-swap
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass

    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")

    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)

    def make_tag_func(ins_del):
        # The same wrapper element is produced regardless of ``ins_del``.
        return lxml.etree.Element("comparison-change")

    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func)

    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    if use_cache:
        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(
            bill1=left_bill, ver1=left_version,
            bill2=right_bill, ver2=right_version,
            data=dict(ret))  # clone before compress()
        btc.compress()
        btc.save()

    # Return JSON comparison data.
    return ret
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, force=False):
    """Return comparison data for two bill text versions, using a DB cache.

    Args:
        left_bill: id of the ``Bill`` shown on the left.
        left_version: version code for the left bill; an empty string means
            "use whatever ``get_current_version`` returns for that bill".
        right_bill: id of the ``Bill`` shown on the right.
        right_version: version code for the right bill; an empty string is
            resolved the same way as ``left_version``.
        timelimit: forwarded to ``compare_xml_text`` as ``timelimit``.
        force: when true, ignore any cached ``BillTextComparison`` (in either
            orientation), recompute, and overwrite/create the cached record.

    Returns:
        A dict with the keys ``left_meta``, ``right_meta``, ``left_text`` and
        ``right_text``. On a cache hit the stored ``data`` of the cached
        record is returned (with left/right un-swapped if the record was
        stored in the opposite orientation).

    Raises:
        IOError: if the metadata returned by ``load_bill_text`` for either
            bill has no ``html_file`` entry.
    """
    from billtext import load_bill_text, compare_xml_text, get_current_version
    # Import the submodule explicitly: a bare ``import lxml`` does not load
    # ``lxml.etree`` and only works if some other module imported it first.
    import lxml.etree

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    if left_version == "":
        left_version = get_current_version(left_bill)
    if right_version == "":
        right_version = get_current_version(right_bill)

    # Look for a cached comparison in the requested orientation. When
    # forcing, keep the record around so it can be overwritten below.
    btc = None
    try:
        btc = BillTextComparison.objects.get(
            bill1=left_bill, ver1=left_version,
            bill2=right_bill, ver2=right_version)
        btc.decompress()
        if not force:
            return btc.data
    except BillTextComparison.DoesNotExist:
        pass

    # Try with the bills swapped. Skipped when forcing so that a stale
    # swapped record cannot short-circuit the recomputation.
    if not force:
        try:
            btc2 = BillTextComparison.objects.get(
                bill2=left_bill, ver2=left_version,
                bill1=right_bill, ver1=right_version)
            btc2.decompress()
            data = btc2.data
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass

    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Only the dict lookups are guarded, so that a KeyError raised while
    # parsing is not misreported as "format not available".
    try:
        left_path = left["html_file"]
        right_path = right["html_file"]
    except KeyError:
        raise IOError(
            "The HTML bill text format is not available for one of the bills.")
    doc1 = lxml.etree.parse(left_path)
    doc2 = lxml.etree.parse(right_path)

    compare_xml_text(doc1, doc2, timelimit=timelimit)  # revises DOMs in-place

    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")

    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    # Store the result: reuse the existing record when one was found (the
    # force case), otherwise create a new one.
    if not btc:
        btc = BillTextComparison(
            bill1=left_bill, ver1=left_version,
            bill2=right_bill, ver2=right_version,
            data=dict(ret))  # clone before compress()
    else:
        btc.data = dict(ret)  # clone before compress()
    btc.compress()
    btc.save()

    return ret
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, use_cache=True, force_update=False):
    """Return comparison data for two bill text versions, using a DB cache.

    Args:
        left_bill: id of the ``Bill`` shown on the left.
        left_version: version code for the left bill; an empty string means
            "use whatever ``get_current_version`` returns for that bill".
        right_bill: id of the ``Bill`` shown on the right.
        right_version: version code for the right bill; an empty string is
            resolved the same way as ``left_version``.
        timelimit: forwarded to ``diff_match_patch.diff_unicode`` as
            ``timelimit``.
        use_cache: when true, return a cached ``BillTextComparison`` if one
            exists (in either orientation) and store freshly computed
            results.
        force_update: when true, skip reading the cache, recompute the
            comparison, and replace any stored record for this orientation.

    Returns:
        A dict with the keys ``left_meta``, ``right_meta``, ``left_text`` and
        ``right_text``. On a cache hit the stored ``data`` of the cached
        record is returned (with left/right un-swapped if the record was
        stored in the opposite orientation).

    Raises:
        IOError: if neither ``xml_file`` nor ``html_file`` is present in the
            metadata returned by ``load_bill_text`` for one of the bills.
    """
    from billtext import load_bill_text, get_current_version
    from xml_diff import compare
    # Import the submodule explicitly: a bare ``import lxml`` does not load
    # ``lxml.etree`` and only works if some other module imported it first.
    import lxml.etree

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    if left_version == "":
        left_version = get_current_version(left_bill)
    if right_version == "":
        right_version = get_current_version(right_bill)

    # A forced update must not be short-circuited by an existing record,
    # otherwise ``force_update`` would be a no-op whenever the cache is warm.
    if use_cache and not force_update:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(
                bill1=left_bill, ver1=left_version,
                bill2=right_bill, ver2=right_version)
            btc.decompress()
            return btc.data
        except BillTextComparison.DoesNotExist:
            pass

        # Load from cache - Try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(
                bill2=left_bill, ver2=left_version,
                bill1=right_bill, ver1=right_version)
            btc2.decompress()
            data = btc2.data
            # un-swap
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass

    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")

    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)

    def make_tag_func(ins_del):
        # The same wrapper element is produced regardless of ``ins_del``.
        return lxml.etree.Element("comparison-change")

    def differ(text1, text2):
        # ensure we use the C++ Google DMP and can specify the time limit
        import diff_match_patch
        for x in diff_match_patch.diff_unicode(text1, text2, timelimit=timelimit):
            yield x

    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func, differ=differ)

    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    if use_cache or force_update:
        # For force_update, or race conditions, delete any existing record.
        fltr = {
            "bill1": left_bill, "ver1": left_version,
            "bill2": right_bill, "ver2": right_version,
        }
        BillTextComparison.objects.filter(**fltr).delete()

        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(
            data=dict(ret),  # clone before compress()
            **fltr)
        btc.compress()
        btc.save()

    # Return JSON comparison data.
    return ret