def bill_text(request, congress, type_slug, number, version=None):
    """Bill text page: template context for one text version of a bill.

    Resolves the bill from the URL components, loads the requested text
    version (latest when ``version`` is None), and collects alternate
    versions and related bills for the sidebar.
    Raises Http404 for an unknown bill type slug.
    """
    # An empty version string in the URL means "latest".
    if version == "": version = None
    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)
    from billtext import load_bill_text, get_bill_text_versions
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        # Text is not available for this bill/version; template shows a notice.
        textdata = None
    # Get a list of the alternate versions of this bill.
    alternates = None
    is_latest = True
    if textdata:
        alternates = []
        for v in get_bill_text_versions(bill):
            try:
                alternates.append(load_bill_text(bill, v, mods_only=True))
            except IOError:
                # Skip versions whose metadata cannot be loaded.
                pass
        alternates.sort(key = lambda mods : mods["docdate"])
        # The displayed version is "latest" iff it matches the last one by docdate.
        if len(alternates) > 0:
            is_latest = False
            if textdata["doc_version"] == alternates[-1]["doc_version"]:
                is_latest = True
    # Get a list of related bills.
    from billtext import get_current_version
    related_bills = []
    # Reintroductions plus formally related bills, deduplicated in order.
    for rb in list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]:
        try:
            rbv = get_current_version(rb)
            if not (rb, rbv) in related_bills: related_bills.append((rb, rbv))
        except IOError:
            pass # text not available
    # Also include any bill this one has previously been compared against.
    for btc in BillTextComparison.objects.filter(bill1=bill).exclude(bill2=bill):
        if not (btc.bill2, btc.ver2) in related_bills: related_bills.append((btc.bill2, btc.ver2))
    for btc in BillTextComparison.objects.filter(bill2=bill).exclude(bill1=bill):
        if not (btc.bill1, btc.ver1) in related_bills: related_bills.append((btc.bill1, btc.ver1))
    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "is_latest": is_latest,
        "alternates": alternates,
        "related_bills": related_bills,
        "days_old": (datetime.datetime.now().date() - bill.current_status_date).days,
        "is_on_bill_text_page": True, # for the header tabs
    }
def bill_text(request, congress, type_slug, number, version=None):
    """Bill text page (older revision): template context for a bill text version.

    Unlike the newer variant, alternates are discovered by probing the MODS
    file on disk for each GPO status code.
    Raises Http404 for an unknown bill type slug.
    """
    # An empty version string in the URL means "latest".
    if version == "": version = None
    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)
    from billtext import load_bill_text, bill_gpo_status_codes
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        # Text is not available for this bill/version.
        textdata = None
    # Get a list of the alternate versions of this bill.
    alternates = None
    if textdata:
        alternates = []
        for v in bill_gpo_status_codes:
            # Probe the on-disk MODS metadata file for this status code.
            # NOTE(review): relies on a module-level `import os` — confirm it
            # exists at the top of the file.
            fn = "data/us/bills.text/%s/%s/%s%d%s.mods.xml" % (bill.congress, BillType.by_value(bill.bill_type).xml_code, BillType.by_value(bill.bill_type).xml_code, bill.number, v)
            if os.path.exists(fn):
                alternates.append(load_bill_text(bill, v, mods_only=True))
        alternates.sort(key = lambda mods : mods["docdate"])
    # Get a list of related bills.
    from billtext import get_current_version
    related_bills = []
    # Reintroductions plus formally related bills, deduplicated in order.
    for rb in list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]:
        try:
            rbv = get_current_version(rb)
            if not (rb, rbv) in related_bills: related_bills.append((rb, rbv))
        except IOError:
            pass # text not available
    # Also include any bill this one has previously been compared against.
    for btc in BillTextComparison.objects.filter(bill1=bill).exclude(bill2=bill):
        if not (btc.bill2, btc.ver2) in related_bills: related_bills.append((btc.bill2, btc.ver2))
    for btc in BillTextComparison.objects.filter(bill2=bill).exclude(bill1=bill):
        if not (btc.bill1, btc.ver1) in related_bills: related_bills.append((btc.bill1, btc.ver1))
    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "alternates": alternates,
        "related_bills": related_bills,
    }
def get_text_info(bill):
    """Return bill text metadata (with citation info) for *bill*, or None
    when no text has been published for it yet."""
    from billtext import load_bill_text
    try:
        metadata = load_bill_text(bill, None, mods_only=True, with_citations=True)
    except IOError:
        # Bill text is not available.
        return None
    return metadata
def get_bill_paragraphs(bill):
    """Fingerprint the paragraphs of a bill's text.

    Returns a dict mapping the MD5 hex digest of each normalized paragraph
    to its occurrence count, or None if the bill text is not available.
    Normalization lowercases the text, strips a leading list-enumeration
    marker like "(a) ", and collapses non-word characters to single spaces.
    """
    from billtext import load_bill_text
    from hashlib import md5
    try:
        dom = lxml.etree.fromstring(load_bill_text(bill, None)["text_html"])
    except IOError:
        # Bill text not available.
        return None
    hashes = { }
    for node in dom.xpath("//p"):
        text = lxml.etree.tostring(node, method="text", encoding="utf8")
        text = text.lower() # normalize case
        # Raw string literals: the previous non-raw patterns relied on
        # backslash escapes (\( , \s) passing through unchanged, which is
        # deprecated behavior; the pattern text itself is unchanged.
        text = re.sub(r"^\(.*?\)\s*", "", text) # remove initial list numbering
        text = re.sub(r"\W+", " ", text).strip() # normalize spaces and other non-word characters
        if text == "": continue # skip paragraphs that normalize to nothing
        text = md5(text).hexdigest()
        hashes[text] = hashes.get(text, 0) + 1
    return hashes
def get_bill_paragraphs(bill):
    """Fingerprint the paragraphs of a bill's text.

    Returns a dict mapping the MD5 hex digest of each normalized paragraph
    to its occurrence count, or None if the bill text is not available.
    Normalization lowercases the text, strips a leading list-enumeration
    marker like "(a) ", and collapses non-word characters to single spaces.
    """
    from billtext import load_bill_text
    from hashlib import md5
    try:
        dom = lxml.etree.fromstring(load_bill_text(bill, None)["text_html"])
    except IOError:
        # Bill text not available.
        return None
    hashes = {}
    for node in dom.xpath("//p"):
        text = lxml.etree.tostring(node, method="text", encoding="utf8")
        text = text.lower() # normalize case
        # Raw string literals: the previous non-raw patterns relied on
        # backslash escapes (\( , \s) passing through unchanged, which is
        # deprecated behavior; the pattern text itself is unchanged.
        text = re.sub(r"^\(.*?\)\s*", "", text) # remove initial list numbering
        text = re.sub(r"\W+", " ", text).strip() # normalize spaces and other non-word characters
        if text == "": continue # skip paragraphs that normalize to nothing
        text = md5(text).hexdigest()
        hashes[text] = hashes.get(text, 0) + 1
    return hashes
def bill_details(request, congress, type_slug, number): bill = load_bill_from_url(congress, type_slug, number) # get related bills related_bills = [] reintro_prev = None reintro_next = None for reintro in bill.find_reintroductions(): if reintro.congress < bill.congress: reintro_prev = reintro if reintro.congress > bill.congress and not reintro_next: reintro_next = reintro if reintro_prev: related_bills.append({ "bill": reintro_prev, "note": "was a previous version of this bill.", "show_title": False }) if reintro_next: related_bills.append({ "bill": reintro_next, "note": "was a re-introduction of this bill in a later Congress.", "show_title": False }) for rb in bill.get_related_bills(): if rb.relation in ("identical", "rule"): related_bills.append({ "bill": rb.related_bill, "note": "(%s)" % rb.relation, "show_title": False }) elif rb.relation == "ruled-by": related_bills.append({ "bill": rb.related_bill, "prenote": "Debate on", "note": " is governed by these rules.", "show_title": False }) else: related_bills.append({ "bill": rb.related_bill, "note": ("(%s)" % (rb.relation.title() if rb.relation != "unknown" else "Related")), "show_title": True }) # bill text info and areas of law affected from billtext import load_bill_text try: text_info = load_bill_text(bill, None, mods_only=True, with_citations=True) except IOError: text_info = None return { 'bill': bill, "congressdates": get_congress_dates(bill.congress), "subtitle": get_secondary_bill_title(bill, bill.titles), "current": bill.congress == CURRENT_CONGRESS, "dead": bill.congress != CURRENT_CONGRESS and bill.current_status not in BillStatus.final_status_obvious, "feed": bill.get_feed(), "text_info": text_info, "related": related_bills, }
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, force=False):
    """Compute (or load from cache) a side-by-side text comparison of two bill versions.

    left_bill/right_bill are Bill primary keys; empty version strings mean
    "current version". Returns a dict with left/right metadata and the
    diff-marked HTML text. Results are cached in BillTextComparison; pass
    force=True to recompute a cached (unswapped) comparison.
    """
    from billtext import load_bill_text, compare_xml_text, get_current_version
    import lxml
    left_bill = Bill.objects.get(id = left_bill)
    right_bill = Bill.objects.get(id = right_bill)
    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)
    btc = None
    try:
        btc = BillTextComparison.objects.get(
            bill1 = left_bill, ver1 = left_version,
            bill2 = right_bill, ver2 = right_version)
        btc.decompress()
        # Serve the cached comparison unless a recompute is forced; when
        # forced, btc is kept so the cache row is updated below.
        if not force: return btc.data
    except BillTextComparison.DoesNotExist:
        pass
    # Try with the bills swapped.
    # NOTE(review): force does not bypass this swapped-cache path — confirm
    # that is intended.
    try:
        btc2 = BillTextComparison.objects.get(
            bill2 = left_bill, ver2 = left_version,
            bill1 = right_bill, ver1 = right_version)
        btc2.decompress()
        data = btc2.data
        # Un-swap the cached result before returning it.
        return {
            "left_meta": data["right_meta"],
            "right_meta": data["left_meta"],
            "left_text": data["right_text"],
            "right_text": data["left_text"],
        }
    except BillTextComparison.DoesNotExist:
        pass
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)
    doc1 = lxml.etree.parse(left["basename"] + ".html")
    doc2 = lxml.etree.parse(right["basename"] + ".html")
    compare_xml_text(doc1, doc2, timelimit=timelimit) # revises DOMs in-place
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }
    # Create or update the cache row.
    if not btc:
        btc = BillTextComparison(
            bill1 = left_bill, ver1 = left_version,
            bill2 = right_bill, ver2 = right_version,
            data = dict(ret)) # clone before compress()
    else:
        btc.data = dict(ret) # clone before compress()
    btc.compress()
    btc.save()
    return ret
def get_text_info():
    """Load bill text metadata and restructure its citation list for display.

    NOTE(review): this function references a free variable `bill` that is not
    a parameter — it only works if `bill` is in an enclosing scope (it was
    likely a nested function originally). Confirm before reuse.

    Returns the metadata dict with metadata["citations"] replaced by a dict of
    categorized citations (slip laws, statutes at large, a flattened U.S. Code
    tree, and everything else), or None if the text is unavailable.
    """
    from models import USCSection
    from billtext import load_bill_text
    from search import parse_slip_law_number
    import re
    try:
        metadata = load_bill_text(bill, None, mods_only=True)
        # do interesting stuff with citations
        if "citations" in metadata:
            slip_laws = []
            statutes = []
            usc = { }  # nested dict keyed by USCSection objects: title -> ... -> section
            other = []
            # Fallback parent for citations we cannot match to a real section.
            usc_other = USCSection(name="Other Citations", ordering=99999)
            for cite in metadata["citations"]:
                if cite["type"] == "slip_law":
                    slip_laws.append(cite)
                    cite["bill"] = parse_slip_law_number(cite["text"])
                elif cite["type"] == "statutes_at_large":
                    statutes.append(cite)
                elif cite["type"] == "usc":
                    # build a normalized citation and a link to LII
                    cite_norm = "usc/" + cite["title"]
                    cite_link = "http://www.law.cornell.edu/uscode/text/" + cite["title"]
                    if cite["section"]:
                        cite_link += "/" + cite["section"]
                        cite_norm += "/" + cite["section"]
                        if cite["paragraph"]: cite_link += "#" + "_".join(re.findall(r"\(([^)]+)\)", cite["paragraph"]))
                    # Build a tree of title-chapter-...-section nodes so we can
                    # display the citations in context.
                    try:
                        sec_obj = USCSection.objects.get(citation=cite_norm)
                    except: # USCSection.DoesNotExist and MultipleObjectsReturned both possible
                        # NOTE(review): bare except also hides unrelated errors.
                        # the 'id' field is set to make these objects properly hashable
                        sec_obj = USCSection(id=cite["text"], name=cite["text"], parent_section=usc_other)
                    sec_obj.link = cite_link
                    if "range_to_section" in cite:
                        sec_obj.range_to_section = cite["range_to_section"]
                    # recursively go up to the title
                    path = [sec_obj]
                    while sec_obj.parent_section:
                        sec_obj = sec_obj.parent_section
                        path.append(sec_obj)
                    # now pop off from the path to put the node at the right point in a tree
                    container = usc
                    while path:
                        p = path.pop(-1)
                        if p not in container: container[p] = { }
                        container = container[p]
                else:
                    other.append(cite)
            slip_laws.sort(key = lambda x : (x["congress"], x["number"]))
            # restructure data format
            def ucfirst(s): return s[0].upper() + s[1:]
            # Flatten the nested USC dict into an indented display list.
            def rebuild_usc_sec(seclist, indent=0):
                ret = []
                seclist = sorted(seclist.items(), key=lambda x : x[0].ordering)
                for sec, subparts in seclist:
                    ret.append({
                        "text": (ucfirst(sec.level_type + ((" " + sec.number) if sec.number else "") + (": " if sec.name else "")) if sec.level_type else "") + (sec.name if sec.name else ""),
                        "link": getattr(sec, "link", None),
                        "range_to_section": getattr(sec, "range_to_section", None),
                        "indent": indent,
                    })
                    ret.extend(rebuild_usc_sec(subparts, indent=indent+1))
                return ret
            usc = rebuild_usc_sec(usc)
            metadata["citations"] = {
                "slip_laws": slip_laws,
                "statutes": statutes,
                "usc": usc,
                "other": other,
                "count": len(slip_laws)+len(statutes)+len(usc)+len(other),
            }
        return metadata
    except IOError:
        # Bill text not available.
        return None
def bill_text_image(request, congress, type_slug, number, image_type):
    """Render a PNG preview image of the first page(s) of a bill's PDF text.

    Query params: ?aspect= overrides the crop aspect ratio, ?width= resizes.
    image_type == "thumbnail" additionally overlays a party-color banner,
    the sponsor's photo, and a border. Raises Http404 when no text/PDF exists.
    """
    bill = load_bill_from_url(congress, type_slug, number)
    from billtext import load_bill_text
    # Rasterizes a page of a PDF to a greyscale PIL.Image.
    # Crop out the GPO seal & the vertical margins.
    def pdftopng(pdffile, pagenumber, width=900):
        from PIL import Image
        import subprocess, StringIO
        pngbytes = subprocess.check_output([
            "/usr/bin/pdftoppm", "-f", str(pagenumber), "-l", str(pagenumber),
            "-scale-to", str(width), "-png", pdffile ])
        im = Image.open(StringIO.StringIO(pngbytes))
        im = im.convert("L")
        # crop out the GPO seal:
        im = im.crop((0, int((.06 if pagenumber == 1 else 0) * im.size[0]), im.size[0], im.size[1]))
        # zealous-crop the vertical margins, but at least leaving a little
        # at the bottom so that when we paste the two pages of the two images
        # together they don't get totally scruntched, and put in some padding
        # at the top.
        # (.getbbox() crops out zeroes, so we'll invert the image to make it work with white)
        from PIL import ImageOps
        bbox = ImageOps.invert(im).getbbox()
        vpad = int(.02 * im.size[1])
        im = im.crop((0, max(0, bbox[1] - vpad), im.size[0], min(im.size[1], bbox[3] + vpad)))
        return im
    # Find the PDF file and rasterize the first two pages.
    try:
        metadata = load_bill_text(bill, None, mods_only=True)
    except IOError:
        # if bill text metadata isn't available, trap the error
        # and just 404 it
        raise Http404()
    if metadata.get("pdf_file"):
        # Use the PDF files on disk.
        pg1 = pdftopng(metadata.get("pdf_file"), 1)
        try:
            pg2 = pdftopng(metadata.get("pdf_file"), 2)
        except:
            # NOTE(review): bare except — any rasterization failure is treated
            # as "single-page document".
            pg2 = pg1.crop((0, 0, pg1.size[0], 0)) # may only be one page!
    elif settings.DEBUG:
        # When debugging in a local environment we may not have bill text available
        # so download the PDF from GPO.
        import os, tempfile, subprocess
        try:
            (fd1, fn1) = tempfile.mkstemp(suffix=".pdf")
            os.close(fd1)
            subprocess.check_call(["/usr/bin/wget", "-O", fn1, "-q", metadata["gpo_pdf_url"]])
            pg1 = pdftopng(fn1, 1)
            pg2 = pdftopng(fn1, 2)
        finally:
            os.unlink(fn1)
    else:
        # No PDF is available.
        raise Http404()
    # Since some bills have big white space at the top of the first page,
    # we'll combine the first two pages and then shift the window down
    # until the real start of the bill.
    from PIL import Image
    img = Image.new(pg1.mode, (pg1.size[0], int(pg1.size[1] + pg2.size[1])))
    img.paste(pg1, (0, 0))
    img.paste(pg2, (0, pg1.size[1]))
    # Zealous crop the (horizontal) margins. We do this only after the two
    # pages have been combined so that we don't mess up their alignment.
    # Add some padding.
    from PIL import ImageOps
    hpad = int(.02 * img.size[0])
    bbox = ImageOps.invert(img).getbbox()
    img = img.crop((max(0, bbox[0] - hpad), 0, min(img.size[0], bbox[2] + hpad), img.size[1]))
    # Now take a window from the top matching a particular aspect ratio.
    # We're going to display this next to photos of members of congress,
    # so use that aspect ratio.
    try:
        aspect = float(request.GET["aspect"])
    except:
        # Missing or malformed ?aspect= falls back to the member-photo ratio.
        aspect = 240.0 / 200.0
    img = img.crop((0, 0, img.size[0], int(aspect * img.size[0])))
    # Resize to requested width.
    if "width" in request.GET:
        img.thumbnail((int(request.GET["width"]), int(aspect * float(request.GET["width"]))), Image.ANTIALIAS)
    # Add symbology.
    if image_type == "thumbnail":
        img = img.convert("RGBA")
        # Banner color keyed by the sponsor's party (RGBA with alpha).
        banner_color = None
        party_colors = { "Republican": (230, 14, 19, 150), "Democrat": (0, 65, 161, 150) }
        if bill.sponsor_role: banner_color = party_colors.get(bill.sponsor_role.party)
        if banner_color:
            from PIL import ImageDraw
            im = Image.new("RGBA", img.size, (0, 0, 0, 0))
            draw = ImageDraw.Draw(im)
            draw.rectangle(((0, int(.85 * im.size[1])), im.size), outline=None, fill=banner_color)
            del draw
            img = Image.alpha_composite(img, im)
        # Overlay the sponsor's photo in the bottom-left corner.
        if bill.sponsor and bill.sponsor.has_photo():
            im = Image.open("." + bill.sponsor.get_photo_url(200))
            im.thumbnail([int(x / 2.5) for x in img.size])
            img.paste(im, (int(.05 * img.size[1]), int(.95 * img.size[1]) - im.size[1]))
        # Draw a thin grey border.
        from PIL import ImageDraw
        draw = ImageDraw.Draw(img)
        draw.rectangle(((0, 0), (img.size[0] - 1, img.size[1] - 1)), outline=(100, 100, 100, 255), fill=None)
        del draw
    # Serialize & return.
    import StringIO
    imgbytesbuf = StringIO.StringIO()
    img.save(imgbytesbuf, "PNG")
    imgbytes = imgbytesbuf.getvalue()
    imgbytesbuf.close()
    return HttpResponse(imgbytes, content_type="image/png")
def get_text_info():
    """Load bill text metadata and restructure its citation list for display
    (revision handling usc-section/usc-chapter citation types).

    NOTE(review): this function references a free variable `bill` that is not
    a parameter — it only works if `bill` is in an enclosing scope (it was
    likely a nested function originally). Confirm before reuse. Also note the
    citation restructuring is skipped when settings.DEBUG is on.

    Returns the metadata dict with metadata["citations"] replaced by a dict of
    categorized citations, or None if the text is unavailable.
    """
    from models import USCSection
    from billtext import load_bill_text
    from search import parse_slip_law_number
    import re
    try:
        metadata = load_bill_text(bill, None, mods_only=True)
        # do interesting stuff with citations
        if "citations" in metadata and not settings.DEBUG:
            slip_laws = []
            statutes = []
            usc = { }  # nested dict keyed by USCSection objects: title -> ... -> section
            other = []
            # Fallback parent for citations we cannot match to a real section.
            usc_other = USCSection(name="Other Citations", ordering=99999)
            for cite in metadata["citations"]:
                if cite["type"] == "slip_law":
                    slip_laws.append(cite)
                    cite["bill"] = parse_slip_law_number(cite["text"])
                elif cite["type"] == "statutes_at_large":
                    statutes.append(cite)
                elif cite["type"] in ("usc-section", "usc-chapter"):
                    # Build a tree of title-chapter-...-section nodes so we can
                    # display the citations in context.
                    try:
                        sec_obj = USCSection.objects.get(citation=cite["key"])
                    except: # USCSection.DoesNotExist and MultipleObjectsReturned both possible
                        # create a fake entry for the sake of output
                        # the 'id' field is set to make these objects properly hashable
                        sec_obj = USCSection(id=cite["text"], name=cite["text"], parent_section=usc_other)
                    if "range_to_section" in cite:
                        sec_obj.range_to_section = cite["range_to_section"]
                    # recursively go up to the title
                    path = [sec_obj]
                    so = sec_obj
                    while so.parent_section:
                        so = so.parent_section
                        path.append(so)
                    # build a link to LII
                    if cite["type"] == "usc-section":
                        cite_link = "http://www.law.cornell.edu/uscode/text/" + cite["title"]
                        if cite["section"]:
                            cite_link += "/" + cite["section"]
                            if cite["paragraph"]: cite_link += "#" + "_".join(re.findall(r"\(([^)]+)\)", cite["paragraph"]))
                    elif cite["type"] == "usc-chapter":
                        # Chapter links are built from the ancestor path
                        # (title down to the chapter itself).
                        cite_link = "http://www.law.cornell.edu/uscode/text/" + cite["title"] + "/" + "/".join(
                            (so.level_type + "-" + so.number) for so in reversed(path[:-1]) )
                    sec_obj.link = cite_link
                    # now pop off from the path to put the node at the right point in a tree
                    container = usc
                    while path:
                        p = path.pop(-1)
                        if p not in container: container[p] = { }
                        container = container[p]
                else:
                    other.append(cite)
            slip_laws.sort(key = lambda x : (x["congress"], x["number"]))
            # restructure data format
            def ucfirst(s): return s[0].upper() + s[1:]
            # Flatten the nested USC dict into an indented display list.
            def rebuild_usc_sec(seclist, indent=0):
                ret = []
                seclist = sorted(seclist.items(), key=lambda x : x[0].ordering)
                for sec, subparts in seclist:
                    ret.append({
                        "text": (ucfirst(sec.level_type + ((" " + sec.number) if sec.number else "") + (": " if sec.name else "")) if sec.level_type else "") + (sec.name_recased if sec.name else ""),
                        "link": getattr(sec, "link", None),
                        "range_to_section": getattr(sec, "range_to_section", None),
                        "indent": indent,
                    })
                    ret.extend(rebuild_usc_sec(subparts, indent=indent+1))
                return ret
            usc = rebuild_usc_sec(usc)
            metadata["citations"] = {
                "slip_laws": slip_laws,
                "statutes": statutes,
                "usc": usc,
                "other": other,
                "count": len(slip_laws)+len(statutes)+len(usc)+len(other),
            }
        return metadata
    except IOError:
        # Bill text not available.
        return None
def get_text_info(bill):
    """Return bill text metadata for *bill*, or None if text is unavailable.

    Fix: the original took no parameters but referenced a free variable
    `bill`, which raises NameError when called standalone; `bill` is now an
    explicit parameter (matching the sibling get_text_info(bill) definition).
    """
    from billtext import load_bill_text
    try:
        return load_bill_text(bill, None, mods_only=True)
    except IOError:
        # Bill text not available.
        return None
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, force=False):
    """Compute (or load from cache) a side-by-side comparison of two bill versions.

    left_bill/right_bill are Bill primary keys; empty version strings mean
    "current version". Uses the pre-scraped HTML files; raises IOError when a
    bill has no HTML text. Results are cached in BillTextComparison; pass
    force=True to recompute a cached (unswapped) comparison.
    """
    from billtext import load_bill_text, compare_xml_text, get_current_version
    import lxml
    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)
    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)
    btc = None
    try:
        btc = BillTextComparison.objects.get(bill1=left_bill, ver1=left_version, bill2=right_bill, ver2=right_version)
        btc.decompress()
        # Serve the cached comparison unless a recompute is forced; when
        # forced, btc is kept so the cache row is updated below.
        if not force: return btc.data
    except BillTextComparison.DoesNotExist:
        pass
    # Try with the bills swapped.
    try:
        btc2 = BillTextComparison.objects.get(bill2=left_bill, ver2=left_version, bill1=right_bill, ver1=right_version)
        btc2.decompress()
        data = btc2.data
        # Un-swap the cached result before returning it.
        return {
            "left_meta": data["right_meta"],
            "right_meta": data["left_meta"],
            "left_text": data["right_text"],
            "right_text": data["left_text"],
        }
    except BillTextComparison.DoesNotExist:
        pass
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)
    try:
        doc1 = lxml.etree.parse(left["html_file"])
        doc2 = lxml.etree.parse(right["html_file"])
    except KeyError:
        # Convert the missing-key condition into the IOError callers expect.
        raise IOError("The HTML bill text format is not available for one of the bills.")
    compare_xml_text(doc1, doc2, timelimit=timelimit) # revises DOMs in-place
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }
    # Create or update the cache row.
    if not btc:
        btc = BillTextComparison(bill1=left_bill, ver1=left_version, bill2=right_bill, ver2=right_version, data=dict(ret)) # clone before compress()
    else:
        btc.data = dict(ret) # clone before compress()
    btc.compress()
    btc.save()
    return ret
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10):
    """Compute (or load from cache) a side-by-side comparison of two bill versions
    using the xml_diff library.

    left_bill/right_bill are Bill primary keys; empty version strings mean
    "current version". NOTE(review): the `timelimit` parameter is accepted but
    never used in this revision — the compare() call has no time limit.
    """
    from billtext import load_bill_text, get_current_version
    from xml_diff import compare
    import lxml
    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)
    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)
    use_cache = True  # caching is always on in this revision
    if use_cache:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(bill1=left_bill, ver1=left_version, bill2=right_bill, ver2=right_version)
            btc.decompress()
            return btc.data
        except BillTextComparison.DoesNotExist:
            pass
        # Load from cache - Try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(bill2=left_bill, ver2=left_version, bill1=right_bill, ver1=right_version)
            btc2.decompress()
            data = btc2.data
            # un-swap
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass
    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)
    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")
    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)
    # Changed regions are wrapped in <comparison-change> elements.
    def make_tag_func(ins_del):
        import lxml.etree
        elem = lxml.etree.Element("comparison-change")
        return elem
    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func)
    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }
    if use_cache:
        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(bill1=left_bill, ver1=left_version, bill2=right_bill, ver2=right_version, data=dict(ret)) # clone before compress()
        btc.compress()
        btc.save()
    # Return JSON comparison data.
    return ret
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, use_cache=True, force_update=False):
    """Compute (or load from cache) a side-by-side comparison of two bill versions
    using xml_diff with a diff_match_patch time limit.

    left_bill/right_bill are Bill primary keys; empty version strings mean
    "current version". force_update recomputes and replaces the cached row.
    """
    from billtext import load_bill_text, get_current_version
    from xml_diff import compare
    import lxml
    left_bill = Bill.objects.get(id = left_bill)
    right_bill = Bill.objects.get(id = right_bill)
    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)
    if use_cache:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(
                bill1 = left_bill, ver1 = left_version,
                bill2 = right_bill, ver2 = right_version)
            btc.decompress()
            return btc.data
        except BillTextComparison.DoesNotExist:
            pass
        # Load from cache - Try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(
                bill2 = left_bill, ver2 = left_version,
                bill1 = right_bill, ver1 = right_version)
            btc2.decompress()
            data = btc2.data
            # un-swap
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass
    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)
    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")
    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)
    # Changed regions are wrapped in <comparison-change> elements.
    def make_tag_func(ins_del):
        import lxml.etree
        elem = lxml.etree.Element("comparison-change")
        return elem
    def differ(text1, text2):
        # ensure we use the C++ Google DMP and can specify the time limit
        import diff_match_patch
        for x in diff_match_patch.diff_unicode(text1, text2, timelimit=timelimit):
            yield x
    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func, differ=differ)
    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }
    if use_cache or force_update:
        # For force_update, or race conditions, delete any existing record.
        fltr = { "bill1": left_bill, "ver1": left_version,
            "bill2": right_bill, "ver2": right_version }
        BillTextComparison.objects.filter(**fltr).delete()
        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(
            data = dict(ret), # clone before compress()
            **fltr)
        btc.compress()
        btc.save()
    # Return JSON comparison data.
    return ret
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, use_cache=True, force_update=False):
    """Compute (or load from cache) a side-by-side comparison of two bill versions
    using xml_diff with a diff_match_patch time limit (reformatted revision).

    left_bill/right_bill are Bill primary keys; empty version strings mean
    "current version". force_update recomputes and replaces the cached row.
    """
    from billtext import load_bill_text, get_current_version
    from xml_diff import compare
    import lxml
    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)
    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)
    if use_cache:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(bill1=left_bill, ver1=left_version, bill2=right_bill, ver2=right_version)
            btc.decompress()
            return btc.data
        except BillTextComparison.DoesNotExist:
            pass
        # Load from cache - Try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(bill2=left_bill, ver2=left_version, bill1=right_bill, ver1=right_version)
            btc2.decompress()
            data = btc2.data
            # un-swap
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass
    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)
    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")
    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)
    # Changed regions are wrapped in <comparison-change> elements.
    def make_tag_func(ins_del):
        import lxml.etree
        elem = lxml.etree.Element("comparison-change")
        return elem
    def differ(text1, text2):
        # ensure we use the C++ Google DMP and can specify the time limit
        import diff_match_patch
        for x in diff_match_patch.diff_unicode(text1, text2, timelimit=timelimit):
            yield x
    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func, differ=differ)
    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }
    if use_cache or force_update:
        # For force_update, or race conditions, delete any existing record.
        fltr = { "bill1": left_bill, "ver1": left_version, "bill2": right_bill, "ver2": right_version }
        BillTextComparison.objects.filter(**fltr).delete()
        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(
            data=dict(ret), # clone before compress()
            **fltr)
        btc.compress()
        btc.save()
    # Return JSON comparison data.
    return ret
def bill_text_image(request, congress, type_slug, number):
    """Render a PNG preview image of the first page(s) of a bill's PDF text
    (older revision: fixed aspect ratio, no thumbnail symbology).

    Query param ?width= resizes the output. Raises Http404 when no text/PDF
    exists.
    """
    bill = load_bill_from_url(congress, type_slug, number)
    from billtext import load_bill_text
    # Rasterizes a page of a PDF to a greyscale PIL.Image.
    # Crop out the GPO seal & the vertical margins.
    def pdftopng(pdffile, pagenumber, width=900):
        from PIL import Image
        import subprocess, StringIO
        pngbytes = subprocess.check_output(["/usr/bin/pdftoppm", "-f", str(pagenumber), "-l", str(pagenumber), "-scale-to", str(width), "-png", pdffile])
        im = Image.open(StringIO.StringIO(pngbytes))
        im = im.convert("L")
        # crop out the GPO seal:
        im = im.crop((0, int((.06 if pagenumber==1 else 0) * im.size[0]), im.size[0], im.size[1]))
        # zealous-crop the vertical margins, but at least leaving a little
        # at the bottom so that when we paste the two pages of the two images
        # together they don't get totally scruntched, and put in some padding
        # at the top.
        # (.getbbox() crops out zeroes, so we'll invert the image to make it work with white)
        from PIL import ImageOps
        bbox = ImageOps.invert(im).getbbox()
        vpad = int(.02*im.size[1])
        im = im.crop( (0, max(0, bbox[1]-vpad), im.size[0], min(im.size[1], bbox[3]+vpad) ) )
        return im
    # Find the PDF file and rasterize the first two pages.
    try:
        metadata = load_bill_text(bill, None, mods_only=True)
    except IOError:
        # if bill text metadata isn't available, trap the error
        # and just 404 it
        raise Http404()
    if metadata.get("pdf_file"):
        # Use the PDF files on disk.
        pg1 = pdftopng(metadata.get("pdf_file"), 1)
        try:
            pg2 = pdftopng(metadata.get("pdf_file"), 2)
        except:
            # NOTE(review): bare except — any rasterization failure is treated
            # as "single-page document".
            pg2 = pg1.crop((0, 0, pg1.size[0], 0)) # may only be one page!
    elif settings.DEBUG:
        # When debugging in a local environment we may not have bill text available
        # so download the PDF from GPO.
        import os, tempfile, subprocess
        try:
            (fd1, fn1) = tempfile.mkstemp(suffix=".pdf")
            os.close(fd1)
            subprocess.check_call(["/usr/bin/wget", "-O", fn1, "-q", metadata["gpo_pdf_url"]])
            pg1 = pdftopng(fn1, 1)
            pg2 = pdftopng(fn1, 2)
        finally:
            os.unlink(fn1)
    else:
        # No PDF is available.
        raise Http404()
    # Since some bills have big white space at the top of the first page,
    # we'll combine the first two pages and then shift the window down
    # until the real start of the bill.
    from PIL import Image
    img = Image.new(pg1.mode, (pg1.size[0], int(pg1.size[1]+pg2.size[1])))
    img.paste(pg1, (0,0))
    img.paste(pg2, (0,pg1.size[1]))
    # Zealous crop the (horizontal) margins. We do this only after the two
    # pages have been combined so that we don't mess up their alignment.
    # Add some padding.
    from PIL import ImageOps
    hpad = int(.02*img.size[0])
    bbox = ImageOps.invert(img).getbbox()
    img = img.crop( (max(0, bbox[0]-hpad), 0, min(img.size[0], bbox[2]+hpad), img.size[1]) )
    # Now take a window from the top matching a particular aspect ratio.
    # We're going to display this next to photos of members of congress,
    # so use that aspect ratio.
    img = img.crop((0,0, img.size[0], int(240.0/200.0*img.size[0])))
    # Resize to requested width.
    # NOTE(review): the height passed to thumbnail is a float, and uses an
    # 11/8 ratio that differs from the 240/200 crop above — the newer
    # revision of this view casts to int and reuses the crop aspect; confirm
    # which behavior is wanted.
    if "width" in request.GET:
        img.thumbnail((int(request.GET["width"]), 11.0/8.0*int(request.GET["width"])), Image.ANTIALIAS)
    # Serialize & return.
    import StringIO
    imgbytesbuf = StringIO.StringIO()
    img.save(imgbytesbuf, "PNG")
    imgbytes = imgbytesbuf.getvalue()
    imgbytesbuf.close()
    return HttpResponse(imgbytes, mimetype="image/png")