def was_bill_enacted_2013(b, startdate, enddate):
    """Return True if bill ``b`` looks enacted between ``startdate`` and ``enddate``.

    Two signals are checked, in order:

    1. ``b.current_status`` is one of ``BillStatus.final_status_passed_bill``
       and ``b.current_status_date`` lies within the (inclusive) date range.
    2. The bill's ``data.json`` contains an action whose ``type`` is
       ``"signed"`` and whose ``acted_at`` string sorts between the ISO
       forms of ``startdate`` and ``enddate``.

    Errors from opening or parsing the data file propagate to the caller.
    """
    # Our status code is currently tied to the assignment of a slip
    # law number, which isn't what we mean exactly.
    #
    # (Additionally, we should count a bill as enacted if any identified companion
    # bill is enacted.)
    # TODO: See new function in the Bill model.

    # If it *was* assigned a slip law number, which in the future might
    # be useful for veto overrides, then OK.
    if (b.current_status in BillStatus.final_status_passed_bill
            and startdate <= b.current_status_date <= enddate):
        return True

    # Otherwise, check the actions for a <signed> action.
    slug = BillType.by_value(b.bill_type).slug
    fn = "data/congress/%s/bills/%s/%s%d/data.json" % (
        b.congress, slug, slug, b.number)
    # Use a context manager so the file handle is closed promptly.
    with open(fn) as f:
        bj = json.load(f)

    # The comparison below is a plain string comparison against the ISO date
    # strings, so compute those once instead of once per action.
    start_iso = startdate.isoformat()
    end_iso = enddate.isoformat()
    for axn in bj["actions"]:
        if axn["type"] == "signed" and start_iso <= axn["acted_at"] <= end_iso:
            return True

    return False
def was_bill_enacted_2013(b, startdate, enddate):
    """Return True if bill ``b`` looks enacted between ``startdate`` and ``enddate``.

    Two signals are checked, in order:

    1. ``b.current_status`` is one of ``BillStatus.final_status_passed_bill``
       and ``b.current_status_date`` lies within the (inclusive) date range.
    2. The bill's ``data.json`` contains an action whose ``type`` is
       ``"signed"`` and whose ``acted_at`` string sorts between the ISO
       forms of ``startdate`` and ``enddate``.

    Errors from opening or parsing the data file propagate to the caller.
    """
    # Our status code is currently tied to the assignment of a slip
    # law number, which isn't what we mean exactly.
    #
    # (Additionally, we should count a bill as enacted if any identified companion
    # bill is enacted.)
    # TODO: See new function in the Bill model.

    # If it *was* assigned a slip law number, which in the future might
    # be useful for veto overrides, then OK.
    if (b.current_status in BillStatus.final_status_passed_bill
            and startdate <= b.current_status_date <= enddate):
        return True

    # Otherwise, check the actions for a <signed> action.
    slug = BillType.by_value(b.bill_type).slug
    fn = "data/congress/%s/bills/%s/%s%d/data.json" % (
        b.congress, slug, slug, b.number)
    # Use a context manager so the file handle is closed promptly.
    with open(fn) as f:
        bj = json.load(f)

    # The comparison below is a plain string comparison against the ISO date
    # strings, so compute those once instead of once per action.
    start_iso = startdate.isoformat()
    end_iso = enddate.isoformat()
    for axn in bj["actions"]:
        if axn["type"] == "signed" and start_iso <= axn["acted_at"] <= end_iso:
            return True

    return False
def was_bill_enacted(b, startdate, enddate, recurse=True):
    """Return True if bill ``b`` looks enacted between ``startdate`` and ``enddate``.

    Two signals are checked, in order:

    1. ``b.current_status`` is one of ``BillStatus.final_status_passed_bill``
       and ``b.current_status_date`` lies within the (inclusive) date range.
    2. The bill's ``data.json`` contains an action whose ``type`` is
       ``"signed"`` and whose ``acted_at`` string sorts between the ISO
       forms of ``startdate`` and ``enddate``.

    ``recurse`` is accepted for the companion-bill check, which is currently
    commented out; the argument has no effect at the moment.

    Errors from opening or parsing the data file propagate to the caller.
    """
    # Our status code is currently tied to the assignment of a slip
    # law number, which isn't what we mean exactly.
    #
    # (Additionally, we should count a bill as enacted if any identified companion
    # bill is enacted.)

    # If it *was* assigned a slip law number, which in the future might
    # be useful for veto overrides, then OK.
    if (b.current_status in BillStatus.final_status_passed_bill
            and startdate <= b.current_status_date <= enddate):
        return True

    # Otherwise, check the actions for a <signed> action.
    slug = BillType.by_value(b.bill_type).slug
    fn = "data/congress/%s/bills/%s/%s%d/data.json" % (
        b.congress, slug, slug, b.number)
    # Use a context manager so the file handle is closed promptly.
    with open(fn) as f:
        bj = json.load(f)

    # The comparison below is a plain string comparison against the ISO date
    # strings, so compute those once instead of once per action.
    start_iso = startdate.isoformat()
    end_iso = enddate.isoformat()
    for axn in bj["actions"]:
        if axn["type"] == "signed" and start_iso <= axn["acted_at"] <= end_iso:
            return True

    # Otherwise check companion bills.
    #if recurse:
    #    for rb in RelatedBill.objects.filter(bill=b, relation="identical").select_related("related_bill"):
    #        if was_bill_enacted(rb.related_bill, startdate, enddate, recurse=False):
    #            return True

    return False
def get_bill_text_metadata(bill, version):
    """Load the metadata dict for one text version of ``bill``.

    ``version`` is a version code naming a sub-directory of the bill's
    ``text-versions`` directory.  When it is None, every
    ``*/data.json`` under that directory is read and the one with the
    greatest ``issued_on`` value is used; None is returned if there are none.

    The loaded dict is augmented with paths of the content files that exist
    on disk (``mods_file``, ``text_file``, ``html_file``, ``pdf_file``,
    ``xml_file``) and with the flags ``has_displayable_text`` /
    ``has_thumbnail`` and related keys.

    Errors from opening or parsing an explicitly requested version propagate.
    """
    from bill.models import BillType # has to be here and not module-level to avoid cyclic dependency
    import glob, json

    bt = BillType.by_value(bill.bill_type).slug
    basename = "data/congress/%d/bills/%s/%s%d/text-versions" % (bill.congress, bt, bt, bill.number)

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*/data.json"):
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
        if not dat:
            return None
    else:
        with open(basename + "/%s/data.json" % version) as f:
            dat = json.load(f)

    # From here on, basename points at the directory of the chosen version.
    basename += "/" + dat["version_code"]

    bt2 = BillType.by_value(bill.bill_type).xml_code
    html_fn = "data/us/bills.text/%s/%s/%s%d%s.html" % (bill.congress, bt2, bt2, bill.number, dat["version_code"])

    if os.path.exists(basename + "/mods.xml"):
        dat["mods_file"] = basename + "/mods.xml"

    # get a plain text file if one exists
    if os.path.exists(basename + "/document.txt"):
        dat["text_file"] = basename + "/document.txt"
        dat["has_displayable_text"] = True
        for source in dat.get("sources", []):
            if source["source"] == "statutes":
                dat["text_file_source"] = "statutes"

    # get an HTML file if one exists
    if os.path.exists(html_fn):
        dat["html_file"] = html_fn
        dat["has_displayable_text"] = True

    # get a PDF file if one exists
    pdf_fn = "data/us/bills.text/%s/%s/%s%d%s.pdf" % (bill.congress, bt2, bt2, bill.number, dat["version_code"])
    if os.path.exists(pdf_fn):
        dat["pdf_file"] = pdf_fn
        dat["has_thumbnail"] = True
        dat["thumbnail_path"] = bill.get_absolute_url() + "/_text_image"

    # get an XML file if one exists
    # (the Cato "deepbills" XML takes precedence over the plain document XML)
    if os.path.exists(basename + "/catoxml.xml"):
        dat["xml_file"] = basename + "/catoxml.xml"
        dat["has_displayable_text"] = True
        dat["xml_file_source"] = "cato-deepbills"
    elif os.path.exists(basename + "/document.xml"):
        dat["xml_file"] = basename + "/document.xml"
        dat["has_displayable_text"] = True

    return dat
def get_bill_text_metadata(bill, version):
    """Load the metadata dict for one text version of ``bill``.

    ``version`` is a version code naming a sub-directory of the bill's
    ``text-versions`` directory.  When it is None, every
    ``*/data.json`` under that directory is read and the one with the
    greatest ``issued_on`` value is used; None is returned if there are none.

    The loaded dict is augmented with paths of the content files that exist
    on disk (``mods_file``, ``text_file``, ``html_file``, ``xml_file``,
    ``thumbnail_path``) and with the ``has_displayable_text`` flag.

    Errors from opening or parsing an explicitly requested version propagate.
    """
    from bill.models import BillType # has to be here and not module-level to avoid cyclic dependency
    import glob, json

    bt = BillType.by_value(bill.bill_type).slug
    basename = "data/congress/%d/bills/%s/%s%d/text-versions" % (
        bill.congress, bt, bt, bill.number)

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*/data.json"):
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
        if not dat:
            return None
    else:
        with open(basename + "/%s/data.json" % version) as f:
            dat = json.load(f)

    # From here on, basename points at the directory of the chosen version.
    basename += "/" + dat["version_code"]

    bt2 = BillType.by_value(bill.bill_type).xml_code
    html_fn = "data/us/bills.text/%s/%s/%s%d%s.html" % (
        bill.congress, bt2, bt2, bill.number, dat["version_code"])

    if os.path.exists(basename + "/mods.xml"):
        dat["mods_file"] = basename + "/mods.xml"

    # get a plain text file if one exists
    if os.path.exists(basename + "/document.txt"):
        dat["text_file"] = basename + "/document.txt"
        dat["has_displayable_text"] = True
        for source in dat.get("sources", []):
            if source["source"] == "statutes":
                dat["text_file_source"] = "statutes"

    # get an HTML file if one exists
    if os.path.exists(html_fn):
        dat["html_file"] = html_fn
        dat["has_displayable_text"] = True

    # get an XML file if one exists
    # (the Cato "deepbills" XML takes precedence over the plain document XML)
    if os.path.exists(basename + "/catoxml.xml"):
        dat["xml_file"] = basename + "/catoxml.xml"
        dat["has_displayable_text"] = True
        dat["xml_file_source"] = "cato-deepbills"
    elif os.path.exists(basename + "/document.xml"):
        dat["xml_file"] = basename + "/document.xml"
        dat["has_displayable_text"] = True

    # pre-rendered thumbnail image, if one exists
    thumb_fn = "data/us/bills.text/%s/%s/%s%d%s-thumb200.png" % (
        bill.congress, bt2, bt2, bill.number, dat["version_code"])
    if os.path.exists(thumb_fn):
        dat["thumbnail_path"] = thumb_fn

    return dat
def get_transparency_stats(person, role, stats, congress, startdate, enddate):
    """Record the "transparency-bills" statistic for ``person`` in ``stats``.

    The list of bills comes from ``analysis/transparency-bills.txt`` (first
    whitespace-delimited token of each line is a bill id), filtered to
    ``congress`` and cached in the module-level ``transparency_bills``.

    ``stats["transparency-bills"]`` is set to a dict holding the score
    (three points per sponsored bill plus one per cosponsored bill), the
    sponsored and cosponsored bill entries, the number of listed bills that
    originate in the chamber of ``role``, and that chamber.  Nothing is
    returned.
    """
    global transparency_bills
    if not transparency_bills:
        # NOTE(review): the cache is filled for the congress of the first call
        # and reused as long as it is non-empty -- confirm callers never mix
        # congresses within one process.
        transparency_bills = []
        with open("analysis/transparency-bills.txt") as f:
            for line in f:
                bill = Bill.from_congressproject_id(re.split(r"\s", line)[0])
                if bill.congress != congress:
                    continue
                transparency_bills.append(bill)

    chamber = RoleType.by_value(role.role_type).congress_chamber

    # which bills are in the right chamber?
    plausible_bills = [
        bill for bill in transparency_bills
        if BillType.by_value(bill.bill_type).chamber == chamber]

    # did person sponsor any of these within this session?
    # (checked against all listed bills, not only the same-chamber ones)
    sponsored = [
        bill for bill in transparency_bills
        if startdate <= bill.introduced_date <= enddate and bill.sponsor == person]

    # did person cosponsor any of these within this session?
    cosponsored = [
        cosp.bill
        for cosp in Cosponsor.objects.filter(
            person=person, bill__in=transparency_bills,
            joined__gte=startdate, joined__lte=enddate)]

    stats["transparency-bills"] = {
        "value": len(sponsored)*3 + len(cosponsored),
        "sponsored": make_bill_entries(sponsored),
        "cosponsored": make_bill_entries(cosponsored),
        "num_bills": len(plausible_bills),
        "chamber": chamber,
    }
def get_bill_number(bill, show_congress_number="ARCHIVAL"):
    """Compute display form of bill number.

    The result is the bill type's label followed by the bill number.  The
    ordinal of the congress is appended in parentheses when
    ``show_congress_number`` is "ALL", or when it is "ARCHIVAL" and the bill
    is not from ``settings.CURRENT_CONGRESS``.
    """
    from bill.models import BillType

    type_label = BillType.by_value(bill.bill_type).label
    display = '%s %s' % (type_label, bill.number)

    from_other_congress = bill.congress != settings.CURRENT_CONGRESS
    wants_congress = (
        (from_other_congress and show_congress_number == "ARCHIVAL")
        or show_congress_number == "ALL")
    if not wants_congress:
        return display

    return display + ' (%s)' % ordinal(bill.congress)
def load_bill_text(bill, version, plain_text=False, mods_only=False):
    """Load display data for one text version of ``bill``.

    Bills before the 103rd Congress, and all ``plain_text`` requests, are
    delegated to ``load_bill_text_alt``.  Otherwise the version's
    ``.mods.xml`` file is parsed and, unless ``mods_only`` is true, the
    ``.html`` rendition is read as well.

    Returns a dict with the bill id and title, the file ``basename``, the
    HTML text (None when ``mods_only``), issue date, GPO URLs, version code
    and name, page count and the list of citations found in the MODS file.

    An IOError from a missing ``.html`` file propagates to the caller.
    """
    if bill.congress < 103 or plain_text:
        return load_bill_text_alt(bill, version, plain_text=plain_text, mods_only=mods_only)

    from bill.models import BillType # has to be here and not module-level to avoid cyclic dependency

    bt = BillType.by_value(bill.bill_type).xml_code
    basename = "data/us/bills.text/%s/%s/%s%d%s" % (bill.congress, bt, bt, bill.number, version if version is not None else "")

    if mods_only:
        bill_text_content = None
    else:
        # Close the file promptly instead of relying on garbage collection.
        with open(basename + ".html") as f:
            bill_text_content = f.read()

    mods = lxml.etree.parse(basename + ".mods.xml")
    ns = { "mods": "http://www.loc.gov/mods/v3" }
    docdate = mods.xpath("string(mods:originInfo/mods:dateIssued)", namespaces=ns)
    gpo_url = "http://www.gpo.gov/fdsys/search/pagedetails.action?packageId=" + mods.xpath("string(mods:recordInfo/mods:recordIdentifier[@source='DGPO'])", namespaces=ns)
    #gpo_url = mods.xpath("string(mods:identifier[@type='uri'])", namespaces=ns)
    gpo_pdf_url = mods.xpath("string(mods:location/mods:url[@displayLabel='PDF rendition'])", namespaces=ns)
    doc_version = mods.xpath("string(mods:extension/mods:billVersion)", namespaces=ns)
    numpages = mods.xpath("string(mods:physicalDescription/mods:extent)", namespaces=ns)
    if numpages:
        numpages = re.sub(r" p\.$", " pages", numpages)

    # dateIssued is split on "-" into year, month, day.
    docdate = datetime.date(*(int(d) for d in docdate.split("-")))
    doc_version_name = bill_gpo_status_codes[doc_version]

    # load a list of citations as marked up by GPO
    citations = []
    for cite in mods.xpath("//mods:identifier", namespaces=ns):
        cite_type = cite.get("type")
        if cite_type == "USC citation":
            citations.append( parse_usc_citation(cite) )
        elif cite_type == "Statute citation":
            citations.append({ "type": "statutes_at_large", "text": cite.text })
        elif cite_type == "public law citation":
            # Anything that does not look like "Public Law <congress>-<number>"
            # (including an element without text) is recorded as unknown.
            m = re.match(r"Public Law (\d+)-(\d+)$", cite.text) if cite.text else None
            if m:
                congress_cite, slip_law_num = m.groups()
                citations.append({ "type": "slip_law", "text": cite.text, "congress": int(congress_cite), "number": int(slip_law_num) })
            else:
                citations.append({ "type": "unknown", "text": cite.text })

    return {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "basename": basename,
        "text_html": bill_text_content,
        "docdate": docdate,
        "gpo_url": gpo_url,
        "gpo_pdf_url": gpo_pdf_url,
        "doc_version": doc_version,
        "doc_version_name": doc_version_name,
        "numpages": numpages,
        "has_html_text": True,
        "citations": citations,
    }
def get_bill_text_version_regular(bill, version):
    """Load the metadata dict for the text version ``version`` of ``bill``.

    Reads ``<bill.data_dir_path>/text-versions/<version>/data.json`` and
    augments it with a human-readable status name, the corresponding status
    codes, ``issued_on`` parsed into a ``datetime.date``, the paths of the
    content files that exist on disk and the ``has_displayable_text`` /
    ``has_thumbnail`` flags.

    Errors from opening or parsing the data file propagate to the caller.
    """
    basename = bill.data_dir_path + "/text-versions"

    # Use a context manager so the file handle is closed promptly.
    with open(basename + "/%s/data.json" % version) as f:
        dat = json.load(f)

    # human readable status name
    dat["status_name"] = get_gpo_status_code_name(dat["version_code"])
    dat["corresponding_status_codes"] = get_gpo_status_code_corresponding_status(
        dat["version_code"])

    # parse date ("-"-separated year, month, day)
    dat["issued_on"] = datetime.date(*(int(d) for d in dat["issued_on"].split("-")))

    # find content files
    basename += "/" + dat["version_code"]

    from bill.models import BillType # has to be here and not module-level to avoid cyclic dependency
    bt = BillType.by_value(bill.bill_type).slug
    html_fn = "data/congress-bill-text-legacy/%d/%s/%s%d/%s.html" % (
        bill.congress, bt, bt, bill.number, dat["version_code"])

    if os.path.exists(basename + "/mods.xml"):
        dat["mods_file"] = basename + "/mods.xml"

    # get a plain text file if one exists
    if os.path.exists(basename + "/document.txt"):
        dat["text_file"] = basename + "/document.txt"
        dat["has_displayable_text"] = True
        for source in dat.get("sources", []):
            if source["source"] == "statutes":
                dat["text_file_source"] = "statutes"

    # get an HTML file if one exists
    if os.path.exists(html_fn):
        dat["html_file"] = html_fn
        dat["has_displayable_text"] = True

    # get a PDF file if one exists
    pdf_fn = basename + "/document.pdf"
    if os.path.exists(pdf_fn):
        dat["pdf_file"] = pdf_fn
        dat["has_thumbnail"] = True
        dat["thumbnail_path"] = bill.get_absolute_url() + "/_text_image"

    # get an XML file if one exists
    # (the Cato "deepbills" XML takes precedence over the plain document XML)
    if os.path.exists(basename + "/catoxml.xml"):
        dat["xml_file"] = basename + "/catoxml.xml"
        dat["has_displayable_text"] = True
        dat["xml_file_source"] = "cato-deepbills"
    elif os.path.exists(basename + "/document.xml"):
        dat["xml_file"] = basename + "/document.xml"
        dat["has_displayable_text"] = True

    # In debug mode a thumbnail is always advertised, even without a PDF.
    if settings.DEBUG:
        dat["has_thumbnail"] = True

    return dat
def get_bill_number(bill, show_congress_number="ARCHIVAL"):
    """Compute display form of bill number.

    For congresses up to and including the 42nd the number is taken from the
    part of ``bill.title`` before the first colon; otherwise it is the bill
    type's label followed by the bill number.  The ordinal of the congress is
    appended in parentheses when ``show_congress_number`` is "ALL", or when
    it is "ARCHIVAL" and the bill is not from ``settings.CURRENT_CONGRESS``.
    """
    is_american_memory = bill.congress <= 42
    if not is_american_memory:
        from bill.models import BillType
        type_label = BillType.by_value(bill.bill_type).label
        display = '%s %s' % (type_label, bill.number)
    else:
        # This is an American Memory bill. Its number is stored
        # at the front of the title.
        display = bill.title.partition(":")[0]

    from_other_congress = bill.congress != settings.CURRENT_CONGRESS
    wants_congress = (
        (from_other_congress and show_congress_number == "ARCHIVAL")
        or show_congress_number == "ALL")
    if not wants_congress:
        return display

    return display + ' (%s)' % ordinal(bill.congress)
def bill_text(request, congress, type_slug, number, version=None):
    """Build the context dict for a bill's text page.

    Resolves the bill from ``congress``/``type_slug``/``number`` (raising
    Http404 for an unknown bill type or bill), loads the text of the
    requested ``version`` (an empty string is treated as None), collects the
    other versions that exist on disk sorted by document date, and gathers
    (bill, version) pairs for reintroductions, related bills and bills that
    have a stored text comparison with this one.
    """
    version = None if version == "" else version

    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from billtext import load_bill_text, bill_gpo_status_codes
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        # No text on disk for this bill/version.
        textdata = None

    # Get a list of the alternate versions of this bill.
    alternates = None
    if textdata:
        alternates = []
        for status_code in bill_gpo_status_codes:
            xml_code = BillType.by_value(bill.bill_type).xml_code
            mods_path = "data/us/bills.text/%s/%s/%s%d%s.mods.xml" % (bill.congress, xml_code, xml_code, bill.number, status_code)
            if not os.path.exists(mods_path):
                continue
            alternates.append(load_bill_text(bill, status_code, mods_only=True))
        alternates.sort(key = lambda mods : mods["docdate"])

    # Get a list of related bills.
    from billtext import get_current_version
    related_bills = []

    def remember(other_bill, other_version):
        # Keep the list free of duplicate (bill, version) pairs.
        pair = (other_bill, other_version)
        if pair not in related_bills:
            related_bills.append(pair)

    reintroductions = list(bill.find_reintroductions())
    explicitly_related = [r.related_bill for r in bill.get_related_bills()]
    for rb in reintroductions + explicitly_related:
        try:
            rbv = get_current_version(rb)
        except IOError:
            continue # text not available
        remember(rb, rbv)

    for btc in BillTextComparison.objects.filter(bill1=bill).exclude(bill2=bill):
        remember(btc.bill2, btc.ver2)
    for btc in BillTextComparison.objects.filter(bill2=bill).exclude(bill1=bill):
        remember(btc.bill1, btc.ver1)

    context = {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "alternates": alternates,
        "related_bills": related_bills,
    }
    return context
def get_bill_text_metadata(bill, version):
    """Load the metadata dict for one text version of ``bill``.

    ``version`` is a version code naming a sub-directory of the bill's
    ``text-versions`` directory.  When it is None, every
    ``*/data.json`` under that directory is read and the one with the
    greatest ``issued_on`` value is used; None is returned if there are none.

    The returned dict gets a ``plain_text_file`` key holding the path where
    the version's ``document.txt`` would be; its existence is not checked.

    Errors from opening or parsing an explicitly requested version propagate.
    """
    from bill.models import BillType # has to be here and not module-level to avoid cyclic dependency
    import glob, json

    bt = BillType.by_value(bill.bill_type).slug
    basename = "data/congress/%d/bills/%s/%s%d/text-versions" % (bill.congress, bt, bt, bill.number)

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*/data.json"):
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
        if not dat:
            return None
    else:
        with open(basename + "/%s/data.json" % version) as f:
            dat = json.load(f)

    dat["plain_text_file"] = basename + "/" + dat["version_code"] + "/document.txt"

    return dat
def load_bill_text_alt(bill, version, plain_text=False, mods_only=False):
    """Load bill text info from the Congress project JSON files.

    ``version`` names a ``<version>.json`` file in the bill's
    ``text-versions`` directory; when it is None the file with the greatest
    ``issued_on`` value is used.  Only metadata is available from this
    source: unless ``mods_only`` is true an ``Exception`` is raised.
    ``plain_text`` is accepted for signature compatibility and is not used.

    Returns a dict with the bill id and title, the ``basename`` directory,
    issue date, GPO URLs and the version code and name, with
    ``has_html_text`` set to False.

    Raises:
        IOError: the requested version file cannot be opened, or no version
            file exists at all.
        Exception: ``mods_only`` is false.
    """
    from bill.models import BillType # has to be here and not module-level to avoid cyclic dependency
    import glob, json

    bt = BillType.by_value(bill.bill_type).slug
    basename = "data/congress/%d/bills/%s/%s%d/text-versions" % (bill.congress, bt, bt, bill.number)

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*.json"):
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
    else:
        with open(basename + "/%s.json" % version) as f:
            dat = json.load(f)

    if not mods_only:
        raise Exception("Bill text not available.")

    if dat is None:
        # No version files were found.  Report it the same way a missing
        # file is reported (callers catch IOError) instead of failing below
        # with a TypeError on dat["urls"].
        raise IOError("Bill text metadata not available.")

    gpo_url = dat["urls"]["pdf"]

    # For Statutes at Large PDFs, link to the granule's detail page instead
    # of the PDF itself.
    m = re.match(r"http://www.gpo.gov/fdsys/pkg/(STATUTE-\d+)/pdf/(STATUTE-\d+-.*).pdf", gpo_url)
    if m:
        gpo_url = "http://www.gpo.gov/fdsys/granule/%s/%s/content-detail.html" % m.groups()

    return {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "basename": basename,
        "docdate": datetime.date(*(int(d) for d in dat["issued_on"].split("-"))),
        "gpo_url": gpo_url,
        "gpo_pdf_url": dat["urls"]["pdf"],
        "doc_version": dat["version_code"],
        "doc_version_name": bill_gpo_status_codes[dat["version_code"]],
        "has_html_text": False,
    }
def get_bill_text_metadata(bill, version):
    """Load the metadata dict for one text version of ``bill``.

    ``version`` is a version code naming a sub-directory of the bill's
    ``text-versions`` directory.  When it is None, every
    ``*/data.json`` under that directory is read and the one with the
    greatest ``issued_on`` value is used; None is returned if there are none.

    The returned dict gets a ``plain_text_file`` key holding the path where
    the version's ``document.txt`` would be; its existence is not checked.

    Errors from opening or parsing an explicitly requested version propagate.
    """
    from bill.models import BillType # has to be here and not module-level to avoid cyclic dependency
    import glob, json

    bt = BillType.by_value(bill.bill_type).slug
    basename = "data/congress/%d/bills/%s/%s%d/text-versions" % (
        bill.congress, bt, bt, bill.number)

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*/data.json"):
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
        if not dat:
            return None
    else:
        with open(basename + "/%s/data.json" % version) as f:
            dat = json.load(f)

    dat["plain_text_file"] = basename + "/" + dat[
        "version_code"] + "/document.txt"

    return dat
#!script

# Report every Bill row in the database whose data.json file is missing
# from the data directory.

import os.path

from bill.models import Bill, BillType

all_bill_ids = list(Bill.objects.all().values_list('id', flat=True))


def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items."""
    total = len(iterable)
    start = 0
    while start < total:
        stop = min(start + n, total)
        yield iterable[start:stop]
        start += n


# Work through the ids in chunks so that only a limited number of Bill
# objects is loaded at a time.
for idset in batch(all_bill_ids, n=2000):
    print("...")
    bills = Bill.objects.only('congress', 'bill_type', 'number').in_bulk(idset)
    for bill in bills.values():
        fn = "data/congress/%s/bills/%s/%s%d/data.json" % (
            bill.congress,
            BillType.by_value(bill.bill_type).slug,
            BillType.by_value(bill.bill_type).slug,
            bill.number)
        if os.path.exists(fn):
            continue
        print(bill.id, bill)
#!script

# Report every Bill row in the database whose data.json file is missing
# from the data directory.

import os.path

from bill.models import Bill, BillType

all_bill_ids = list(Bill.objects.all().values_list('id', flat=True))


def batch(iterable, n = 1):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items."""
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx+n, l)]


# Work through the ids in chunks so that only a limited number of Bill
# objects is loaded at a time.
for idset in batch(all_bill_ids, n=2000):
    # print() with a single string argument behaves the same on Python 2
    # and Python 3; the bare print statement is a syntax error on Python 3.
    print("...")
    for bill in Bill.objects.only('congress', 'bill_type', 'number').in_bulk(idset).values():
        slug = BillType.by_value(bill.bill_type).slug
        fn = "data/congress/%s/bills/%s/%s%d/data.json" % (
            bill.congress, slug, slug, bill.number)
        if not os.path.exists(fn):
            # Same "<id> <bill>" output as the old two-argument print statement.
            print("%s %s" % (bill.id, bill))
def load_bill_text(bill, version, plain_text=False, mods_only=False):
    """Load display data for one text version of ``bill``.

    Bills before the 103rd Congress, and all ``plain_text`` requests, are
    delegated to ``load_bill_text_alt``.  Otherwise the version's
    ``.mods.xml`` file is parsed and, unless ``mods_only`` is true, the
    ``.html`` rendition is read as well.

    Returns a dict with the bill id and title, the file ``basename``, the
    HTML text (None when ``mods_only``), issue date, GPO URLs, version code
    and name, page count and the list of citations found in the MODS file.
    U.S. Code citations are parsed into title/section/paragraph, and a
    dashed section is reported as a range when both halves are known
    sections with the same parent.

    An IOError from a missing ``.html`` file propagates to the caller.
    """
    if bill.congress < 103 or plain_text:
        return load_bill_text_alt(bill, version, plain_text=plain_text, mods_only=mods_only)

    from bill.models import BillType # has to be here and not module-level to avoid cyclic dependency

    bt = BillType.by_value(bill.bill_type).xml_code
    basename = "data/us/bills.text/%s/%s/%s%d%s" % (bill.congress, bt, bt, bill.number, version if version is not None else "")

    if mods_only:
        bill_text_content = None
    else:
        # Close the file promptly instead of relying on garbage collection.
        with open(basename + ".html") as f:
            bill_text_content = f.read()

    mods = lxml.etree.parse(basename + ".mods.xml")
    ns = { "mods": "http://www.loc.gov/mods/v3" }
    docdate = mods.xpath("string(mods:originInfo/mods:dateIssued)", namespaces=ns)
    gpo_url = "http://www.gpo.gov/fdsys/search/pagedetails.action?packageId=" + mods.xpath("string(mods:recordInfo/mods:recordIdentifier[@source='DGPO'])", namespaces=ns)
    #gpo_url = mods.xpath("string(mods:identifier[@type='uri'])", namespaces=ns)
    gpo_pdf_url = mods.xpath("string(mods:location/mods:url[@displayLabel='PDF rendition'])", namespaces=ns)
    doc_version = mods.xpath("string(mods:extension/mods:billVersion)", namespaces=ns)
    numpages = mods.xpath("string(mods:physicalDescription/mods:extent)", namespaces=ns)
    if numpages:
        numpages = re.sub(r" p\.$", " pages", numpages)

    # dateIssued is split on "-" into year, month, day.
    docdate = datetime.date(*(int(d) for d in docdate.split("-")))
    doc_version_name = bill_gpo_status_codes[doc_version]

    # citations
    citations = []
    for cite in mods.xpath("//mods:identifier", namespaces=ns):
        if cite.get("type") == "USC citation":
            try:
                title_cite, title_app_cite, sec_cite, para_cite = re.match(r"(\d+\S*)\s*U.S.C.(\s*App.)?\s*([^\s(]+?)?\s*(\(.*|et ?seq\.?|note)?$", cite.text).groups()
                if title_app_cite:
                    title_cite += "a"
                if para_cite and para_cite.strip() == "":
                    para_cite = None

                if not para_cite and "-" in sec_cite:
                    # This dash may indicate a range of sections, or it may just be
                    # a dash that occurs within section names. Be smart and try to
                    # figure it out.
                    found_range = False
                    sec_dash_parts = sec_cite.split("-")
                    # range() works on both Python 2 and 3; xrange() raised a
                    # NameError on Python 3 that the except below swallowed.
                    for i in range(1, len(sec_dash_parts)):
                        # Split the citation around each particular dash, and if both
                        # halves are valid citations with the same parent then assume
                        # this is a range. (A nice case is 16 U.S.C. 3839aa-8, where
                        # both 3839aa and 8 are valid sections but are far apart.)
                        sec_parts = ["-".join(sec_dash_parts[:i]), "-".join(sec_dash_parts[i:])]
                        from models import USCSection
                        sec_parent = None
                        for sec_part in sec_parts:
                            matched_sec = list(USCSection.objects.filter(citation="usc/" + title_cite + "/" + sec_part))
                            if len(matched_sec) == 0:
                                break # part doesn't exist, skip the else block below and fall through to assume this is not a range
                            if sec_parent is None:
                                sec_parent = matched_sec[0].parent_section_id
                            else:
                                if sec_parent != matched_sec[0].parent_section_id:
                                    break # likewise, parents dont match so not a range
                        else:
                            # Both parts exist. Treat as a USC citation range.
                            citations.append({ "type": "usc", "text": cite.text, "title": title_cite, "section": sec_parts[0], "paragraph": None, "range_to_section": sec_parts[1] })
                            found_range = True
                            break
                    if found_range:
                        continue # range recorded; go on to the next identifier

                citations.append({ "type": "usc", "text": cite.text, "title": title_cite, "section": sec_cite, "paragraph" : para_cite })

            except Exception:
                # Best effort: anything that cannot be parsed or looked up is
                # kept as an unknown citation rather than failing the page.
                citations.append({ "type": "unknown", "text": cite.text })
        elif cite.get("type") == "Statute citation":
            citations.append({ "type": "statutes_at_large", "text": cite.text })
        elif cite.get("type") == "public law citation":
            # Anything that does not look like "Public Law <congress>-<number>"
            # (including an element without text) is recorded as unknown.
            m = re.match(r"Public Law (\d+)-(\d+)$", cite.text) if cite.text else None
            if m:
                congress_cite, slip_law_num = m.groups()
                citations.append({ "type": "slip_law", "text": cite.text, "congress": int(congress_cite), "number": int(slip_law_num) })
            else:
                citations.append({ "type": "unknown", "text": cite.text })
        else:
            continue

    return {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "basename": basename,
        "text_html": bill_text_content,
        "docdate": docdate,
        "gpo_url": gpo_url,
        "gpo_pdf_url": gpo_pdf_url,
        "doc_version": doc_version,
        "doc_version_name": doc_version_name,
        "numpages": numpages,
        "has_html_text": True,
        "citations": citations,
    }
def analysis_methodology(request):
    """Build the context dict for the analysis methodology page.

    The context holds a deferred ``ideology`` callable producing the
    ideology/leadership chart series for the House and Senate, the current
    congress, and the factors and test results of the bill prognosis model,
    each annotated with its bill type and sorted for display.
    """
    from settings import CURRENT_CONGRESS
    from person.models import RoleType
    from bill.models import BillType
    from us import get_congress_dates
    import json

    from person.analysis import load_sponsorship_analysis2
    def make_chart_series(role_type):
        """Return the sponsorship analysis for ``role_type`` with a JSON
        ``series`` entry added (one series per party), or None if there is
        no analysis data."""
        data = load_sponsorship_analysis2(CURRENT_CONGRESS, role_type, None)
        if not data:
            return None

        ret = { }
        for p in data["all"]:
            ret.setdefault(p["party"], {
                "type": "party",
                "party": p["party"],
                "data": [],
            })["data"].append({
                "x": float(p["ideology"]),
                "y": float(p["leadership"]),
                "name": p["name"],
            })
        # Largest party first.
        ret = list(ret.values())
        ret.sort(key = lambda s : len(s["data"]), reverse=True)

        data = dict(data) # clone before modifying, just in case
        data["series"] = json.dumps(ret)

        return data

    import bill.prognosis_model
    import bill.prognosis_model_test

    # Copy each model's dict so the annotations below do not touch the
    # module-level data.
    prognosis_factors = list((k, dict(v)) for k, v in bill.prognosis_model.factors.items()) # clone
    for k, v in prognosis_factors:
        v["bill_type"] = BillType.by_value(k[0])
        v["is_introduced_model"] = k[1]
        v["factors"] = sorted(v["factors"].values(), key = lambda f : f["regression_beta"], reverse=True)
    prognosis_factors = [kv[1] for kv in prognosis_factors]
    # House/Senate bills first, then by descending count.
    prognosis_factors.sort(key = lambda m : (m["bill_type"] in (BillType.house_bill, BillType.senate_bill), m["count"]), reverse=True)

    # list(items()) alone copies only the list, not the per-model dicts, so
    # copy each dict as well; otherwise every call would write the keys
    # below into the module-level model_test_results.
    prognosis_test = [(k, dict(v)) for k, v in bill.prognosis_model_test.model_test_results.items()] # clone
    for k, v in prognosis_test:
        v["bill_type"] = BillType.by_value(k[0])
        v["is_introduced_model"] = (k[1] == 0)
        v["success_name"] = bill.prognosis_model.factors[(k[0], (k[1] == 0))]["success_name"]
    # NOTE(review): this sort looks up factors by the raw test key kv[0],
    # whereas success_name above uses (k[0], k[1] == 0) -- confirm that the
    # two are meant to refer to the same model.
    prognosis_test.sort(key = lambda kv : (kv[0][0] in (BillType.house_bill, BillType.senate_bill), bill.prognosis_model.factors[kv[0]]["count"]), reverse=True)
    prognosis_test = [kv[1] for kv in prognosis_test]

    return {
        "ideology": lambda : { # defer until cache miss
            "house": make_chart_series(RoleType.representative),
            "senate": make_chart_series(RoleType.senator),
        },
        "current_congress": CURRENT_CONGRESS,
        "prognosis_training_congress": bill.prognosis_model.congress,
        "prognosis_training_congress_dates": get_congress_dates(bill.prognosis_model.congress),
        "prognosis_factors": prognosis_factors,
        "prognosis_test": prognosis_test,
        "prognosis_testing_traincongress": bill.prognosis_model_test.train_congress,
        "prognosis_testing_testcongress": bill.prognosis_model_test.test_congress,
    }
def reference(bill):
    """Return a short reference string for ``bill``: the bill type's
    ``xml_code``, the congress number, a dash and the bill number."""
    type_code = BillType.by_value(bill.bill_type).xml_code
    pieces = [type_code, str(bill.congress), "-", str(bill.number)]
    return "".join(pieces)
def load_bill_text(bill, version, plain_text=False, mods_only=False):
    """Load display data, or the plain text, for one text version of ``bill``.

    Bills before the 103rd Congress are delegated to ``load_bill_text_alt``.

    With ``plain_text`` true (and ``mods_only`` false) the contents of the
    version's ``.txt`` file are returned as decoded text, or an empty string
    if the file cannot be read.

    Otherwise the version's ``.mods.xml`` file is parsed and, unless
    ``mods_only`` is true, the ``.html`` rendition is read as well.  The
    result is a dict with the bill id and title, the file ``basename``, the
    HTML text (None when ``mods_only``), issue date, GPO URLs, version code
    and name, page count and the list of citations found in the MODS file.

    An IOError from a missing ``.html`` file propagates to the caller.
    """
    if bill.congress < 103:
        return load_bill_text_alt(bill, version, plain_text=plain_text, mods_only=mods_only)

    from bill.models import BillType # has to be here and not module-level to avoid cyclic dependency

    bt = BillType.by_value(bill.bill_type).xml_code
    basename = "data/us/bills.text/%s/%s/%s%d%s" % (bill.congress, bt, bt, bill.number, version if version is not None else "")

    if mods_only:
        bill_text_content = None
    else:
        if plain_text:
            # plain_text never raises an IOError
            try:
                # Read bytes and decode explicitly so this works on both
                # Python 2 and Python 3 (a text-mode read returns str, which
                # has no .decode() on Python 3).
                with open(basename + ".txt", "rb") as f:
                    return f.read().decode("utf8", "ignore") # otherwise we get 'Chuck failed' in the xapian_backend apparently due to decoding issue.
            except IOError:
                return ""

        elif os.path.exists(basename + ".xml") and False:
            # Rendering from XML via XSLT is deliberately disabled by the
            # "and False" above; the code is kept for reference.
            dom = lxml.etree.parse(basename + ".xml")
            transform = lxml.etree.parse(os.path.join(os.path.dirname(os.path.realpath(__file__)), "textxsl/billres.xsl"))
            transform = lxml.etree.XSLT(transform)
            result = transform(dom)

            # empty nodes cause HTML parsing problems, so remove them.
            # iterate in reverse document order so that we hit parents after
            # their children, since if we remove all of the children then we may
            # want to remove the parent too.
            for node in reversed(list(result.getiterator())):
                if node.xpath("string(.)") == "":
                    node.getparent().remove(node)

            bill_text_content = lxml.etree.tostring(result.xpath("head/style")[0]) + lxml.etree.tostring(result.xpath("body")[0])
        else:
            # Close the file promptly instead of relying on garbage collection.
            with open(basename + ".html") as f:
                bill_text_content = f.read()

    mods = lxml.etree.parse(basename + ".mods.xml")
    ns = { "mods": "http://www.loc.gov/mods/v3" }
    docdate = mods.xpath("string(mods:originInfo/mods:dateIssued)", namespaces=ns)
    gpo_url = "http://www.gpo.gov/fdsys/search/pagedetails.action?packageId=" + mods.xpath("string(mods:recordInfo/mods:recordIdentifier[@source='DGPO'])", namespaces=ns)
    #gpo_url = mods.xpath("string(mods:identifier[@type='uri'])", namespaces=ns)
    gpo_pdf_url = mods.xpath("string(mods:location/mods:url[@displayLabel='PDF rendition'])", namespaces=ns)
    doc_version = mods.xpath("string(mods:extension/mods:billVersion)", namespaces=ns)
    numpages = mods.xpath("string(mods:physicalDescription/mods:extent)", namespaces=ns)
    if numpages:
        numpages = re.sub(r" p\.$", " pages", numpages)

    # dateIssued is split on "-" into year, month, day.
    docdate = datetime.date(*(int(d) for d in docdate.split("-")))
    doc_version_name = bill_gpo_status_codes[doc_version]

    # citations
    citations = []
    for cite in mods.xpath("//mods:identifier", namespaces=ns):
        cite_type = cite.get("type")
        if cite_type == "USC citation":
            # Text that does not match the pattern (or an element without
            # text) is recorded as an unknown citation.
            m = re.match(r"(\d+\S*)\s*U.S.C.(\s*App.)?\s*([^\s(]+?)?\s*(\(.*|et ?seq\.?|note)?$", cite.text) if cite.text else None
            if m:
                title_cite, title_app_cite, sec_cite, para_cite = m.groups()
                if title_app_cite:
                    title_cite += "a"
                citations.append({ "type": "usc", "text": cite.text, "title": title_cite, "section": sec_cite, "paragraph" : para_cite })
            else:
                citations.append({ "type": "unknown", "text": cite.text })
        elif cite_type == "Statute citation":
            citations.append({ "type": "statutes_at_large", "text": cite.text })
        elif cite_type == "public law citation":
            m = re.match(r"Public Law (\d+)-(\d+)$", cite.text) if cite.text else None
            if m:
                congress_cite, slip_law_num = m.groups()
                citations.append({ "type": "slip_law", "text": cite.text, "congress": int(congress_cite), "number": int(slip_law_num) })
            else:
                citations.append({ "type": "unknown", "text": cite.text })
        else:
            continue

    return {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "basename": basename,
        "text_html": bill_text_content,
        "docdate": docdate,
        "gpo_url": gpo_url,
        "gpo_pdf_url": gpo_pdf_url,
        "doc_version": doc_version,
        "doc_version_name": doc_version_name,
        "numpages": numpages,
        "has_html_text": True,
        "citations": citations,
    }
def load_bill_text(bill, version, plain_text=False, mods_only=False):
    """Load display data for one text version of ``bill``.

    Bills before the 103rd Congress, and all ``plain_text`` requests, are
    delegated to ``load_bill_text_alt``.  Otherwise the version's
    ``.mods.xml`` file is parsed and, unless ``mods_only`` is true, the
    ``.html`` rendition is read as well.

    Returns a dict with the bill id and title, the file ``basename``, the
    HTML text (None when ``mods_only``), issue date, GPO URLs, version code
    and name, page count and the list of citations found in the MODS file.

    An IOError from a missing ``.html`` file propagates to the caller.
    """
    if bill.congress < 103 or plain_text:
        return load_bill_text_alt(bill, version, plain_text=plain_text, mods_only=mods_only)

    from bill.models import BillType # has to be here and not module-level to avoid cyclic dependency

    bt = BillType.by_value(bill.bill_type).xml_code
    basename = "data/us/bills.text/%s/%s/%s%d%s" % (
        bill.congress, bt, bt, bill.number, version if version is not None else "")

    if mods_only:
        bill_text_content = None
    else:
        # Close the file promptly instead of relying on garbage collection.
        with open(basename + ".html") as f:
            bill_text_content = f.read()

    mods = lxml.etree.parse(basename + ".mods.xml")
    ns = {"mods": "http://www.loc.gov/mods/v3"}
    docdate = mods.xpath("string(mods:originInfo/mods:dateIssued)", namespaces=ns)
    gpo_url = "http://www.gpo.gov/fdsys/search/pagedetails.action?packageId=" + mods.xpath(
        "string(mods:recordInfo/mods:recordIdentifier[@source='DGPO'])", namespaces=ns)
    #gpo_url = mods.xpath("string(mods:identifier[@type='uri'])", namespaces=ns)
    gpo_pdf_url = mods.xpath(
        "string(mods:location/mods:url[@displayLabel='PDF rendition'])", namespaces=ns)
    doc_version = mods.xpath("string(mods:extension/mods:billVersion)", namespaces=ns)
    numpages = mods.xpath("string(mods:physicalDescription/mods:extent)", namespaces=ns)
    if numpages:
        numpages = re.sub(r" p\.$", " pages", numpages)

    # dateIssued is split on "-" into year, month, day.
    docdate = datetime.date(*(int(d) for d in docdate.split("-")))
    doc_version_name = bill_gpo_status_codes[doc_version]

    # load a list of citations as marked up by GPO
    citations = []
    for cite in mods.xpath("//mods:identifier", namespaces=ns):
        cite_type = cite.get("type")
        if cite_type == "USC citation":
            citations.append(parse_usc_citation(cite))
        elif cite_type == "Statute citation":
            citations.append({"type": "statutes_at_large", "text": cite.text})
        elif cite_type == "public law citation":
            # Anything that does not look like "Public Law <congress>-<number>"
            # (including an element without text) is recorded as unknown.
            m = re.match(r"Public Law (\d+)-(\d+)$", cite.text) if cite.text else None
            if m:
                congress_cite, slip_law_num = m.groups()
                citations.append({
                    "type": "slip_law",
                    "text": cite.text,
                    "congress": int(congress_cite),
                    "number": int(slip_law_num)
                })
            else:
                citations.append({"type": "unknown", "text": cite.text})

    return {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "basename": basename,
        "text_html": bill_text_content,
        "docdate": docdate,
        "gpo_url": gpo_url,
        "gpo_pdf_url": gpo_pdf_url,
        "doc_version": doc_version,
        "doc_version_name": doc_version_name,
        "numpages": numpages,
        "has_html_text": True,
        "citations": citations,
    }
def get_bill_text_metadata(bill, version):
    """Return the metadata dict for one text version of a bill.

    Args:
        bill: object providing data_dir_path, bill_type, congress, number
            and get_absolute_url().
        version: version code naming the sub-directory to read; when None,
            every "*/data.json" under the text-versions directory is
            scanned and the one with the largest "issued_on" value is used.

    Returns:
        The parsed data.json dict, extended with status_name,
        corresponding_status_codes, issued_on (as a datetime.date) and,
        depending on which files exist on disk, mods_file, text_file,
        text_file_source, html_file, pdf_file, thumbnail_path, xml_file,
        xml_file_source, has_displayable_text and has_thumbnail.
        Returns None when version is None and no version file is found.

    Raises:
        Errors from opening or parsing a requested data.json propagate.
    """
    from bill.models import BillType  # has to be here and not module-level to avoid cyclic dependency
    import glob
    import json

    basename = bill.data_dir_path + "/text-versions"

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*/data.json"):
            # Close each handle promptly; there may be many version files.
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
        if not dat:
            return None
    else:
        with open(basename + "/%s/data.json" % version) as f:
            dat = json.load(f)

    # human readable status name
    dat["status_name"] = get_gpo_status_code_name(dat["version_code"])
    dat["corresponding_status_codes"] = get_gpo_status_code_corresponding_status(dat["version_code"])

    # parse date
    dat["issued_on"] = datetime.date(*(int(d) for d in dat["issued_on"].split("-")))

    # find content files
    basename += "/" + dat["version_code"]

    bt = BillType.by_value(bill.bill_type).slug
    html_fn = "data/congress-bill-text-legacy/%d/%s/%s%d/%s.html" % (
        bill.congress, bt, bt, bill.number, dat["version_code"])

    if os.path.exists(basename + "/mods.xml"):
        dat["mods_file"] = basename + "/mods.xml"

    # get a plain text file if one exists
    if os.path.exists(basename + "/document.txt"):
        dat["text_file"] = basename + "/document.txt"
        dat["has_displayable_text"] = True

        # NOTE(review): kept inside the document.txt branch; confirm against
        # the intended layout.
        for source in dat.get("sources", []):
            if source["source"] == "statutes":
                dat["text_file_source"] = "statutes"

    # get an HTML file if one exists
    if os.path.exists(html_fn):
        dat["html_file"] = html_fn
        dat["has_displayable_text"] = True

    # get a PDF file if one exists
    pdf_fn = basename + "/document.pdf"
    if os.path.exists(pdf_fn):
        dat["pdf_file"] = pdf_fn
        dat["has_thumbnail"] = True
        dat["thumbnail_path"] = bill.get_absolute_url() + "/_text_image"

    # get an XML file if one exists
    if os.path.exists(basename + "/catoxml.xml"):
        dat["xml_file"] = basename + "/catoxml.xml"
        dat["has_displayable_text"] = True
        dat["xml_file_source"] = "cato-deepbills"
    elif os.path.exists(basename + "/document.xml"):
        dat["xml_file"] = basename + "/document.xml"
        dat["has_displayable_text"] = True

    # In DEBUG mode a thumbnail is always advertised, even without a PDF.
    if settings.DEBUG:
        dat["has_thumbnail"] = True

    return dat
def load_bill_text(bill, version, plain_text=False, mods_only=False):
    """Load the text and MODS metadata for one text version of a bill.

    Requests for bills before the 103rd Congress are delegated to
    load_bill_text_alt.

    Args:
        bill: object providing congress, bill_type, number, id and title.
        version: version code appended to the file basename; None means
            no suffix is appended.
        plain_text: if true (and mods_only is false), return the decoded
            contents of the .txt file as a string, or "" if it cannot
            be opened.
        mods_only: if true, no text file is read and "text_html" is None
            in the result.

    Returns:
        In plain-text mode, a string. Otherwise a dict with the keys
        bill_id, bill_name, basename, text_html, docdate (a datetime.date),
        gpo_url, gpo_pdf_url, doc_version, doc_version_name, numpages and
        has_html_text.

    Raises:
        Errors from opening/parsing the .html or .mods.xml files, from
        parsing the issue date, and from looking up the version code in
        bill_gpo_status_codes all propagate to the caller.
    """
    if bill.congress < 103:
        return load_bill_text_alt(bill, version, plain_text=plain_text, mods_only=mods_only)

    from bill.models import BillType  # has to be here and not module-level to avoid cyclic dependency

    bt = BillType.by_value(bill.bill_type).xml_code
    basename = "data/us/bills.text/%s/%s/%s%d%s" % (
        bill.congress, bt, bt, bill.number, version if version is not None else "")

    if mods_only:
        bill_text_content = None
    else:
        if plain_text:
            try:
                with open(basename + ".txt") as f:
                    # otherwise we get 'Chuck failed' in the xapian_backend
                    # apparently due to decoding issue.
                    return f.read().decode("utf8", "ignore")
            except IOError:
                return ""

        elif os.path.exists(basename + ".xml") and False:
            # Deliberately disabled ("and False"): XSLT rendering of the XML.
            dom = lxml.etree.parse(basename + ".xml")
            transform = lxml.etree.parse(os.path.join(
                os.path.dirname(os.path.realpath(__file__)), "textxsl/billres.xsl"))
            transform = lxml.etree.XSLT(transform)
            result = transform(dom)

            # empty nodes cause HTML parsing problems, so remove them.
            # iterate in reverse document order so that we hit parents after
            # their children, since if we remove all of the children then we may
            # want to remove the parent too.
            for node in reversed(list(result.getiterator())):
                if node.xpath("string(.)") == "":
                    node.getparent().remove(node)

            bill_text_content = lxml.etree.tostring(result.xpath("head/style")[0]) \
                + lxml.etree.tostring(result.xpath("body")[0])

        else:
            # Close the handle deterministically instead of relying on GC.
            with open(basename + ".html") as f:
                bill_text_content = f.read()

    mods = lxml.etree.parse(basename + ".mods.xml")
    ns = {"mods": "http://www.loc.gov/mods/v3"}

    docdate = mods.xpath("string(mods:originInfo/mods:dateIssued)", namespaces=ns)
    gpo_url = "http://www.gpo.gov/fdsys/search/pagedetails.action?packageId=" + mods.xpath(
        "string(mods:recordInfo/mods:recordIdentifier[@source='DGPO'])", namespaces=ns)
    # Alternative (unused): mods.xpath("string(mods:identifier[@type='uri'])", namespaces=ns)
    gpo_pdf_url = mods.xpath(
        "string(mods:location/mods:url[@displayLabel='PDF rendition'])", namespaces=ns)
    doc_version = mods.xpath("string(mods:extension/mods:billVersion)", namespaces=ns)
    numpages = mods.xpath("string(mods:physicalDescription/mods:extent)", namespaces=ns)
    if numpages:
        # Rewrite a trailing " p." as " pages".
        numpages = re.sub(r" p\.$", " pages", numpages)

    docdate = datetime.date(*(int(d) for d in docdate.split("-")))
    doc_version_name = bill_gpo_status_codes[doc_version]

    return {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "basename": basename,
        "text_html": bill_text_content,
        "docdate": docdate,
        "gpo_url": gpo_url,
        "gpo_pdf_url": gpo_pdf_url,
        "doc_version": doc_version,
        "doc_version_name": doc_version_name,
        "numpages": numpages,
        "has_html_text": True,
    }
def get_bill_text_metadata(bill, version):
    """Return the metadata dict for one text version of a bill.

    Args:
        bill: object providing bill_type, congress, number and
            get_absolute_url().
        version: version code naming the sub-directory to read; when None,
            every "*/data.json" under the bill's text-versions directory
            is scanned and the one with the largest "issued_on" value is
            used.

    Returns:
        The parsed data.json dict, extended with status_name,
        corresponding_status_codes, issued_on (as a datetime.date) and,
        depending on which files exist on disk, mods_file, text_file,
        text_file_source, html_file, pdf_file, thumbnail_path, xml_file,
        xml_file_source, has_displayable_text and has_thumbnail.
        Returns None when version is None and no version file is found.

    Raises:
        Errors from opening or parsing a requested data.json propagate.
    """
    from bill.models import BillType  # has to be here and not module-level to avoid cyclic dependency
    import glob
    import json

    bt = BillType.by_value(bill.bill_type).slug
    basename = "data/congress/%d/bills/%s/%s%d/text-versions" % (
        bill.congress, bt, bt, bill.number)

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*/data.json"):
            # Close each handle promptly; there may be many version files.
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
        if not dat:
            return None
    else:
        with open(basename + "/%s/data.json" % version) as f:
            dat = json.load(f)

    # human readable status name
    dat["status_name"] = get_gpo_status_code_name(dat["version_code"])
    dat["corresponding_status_codes"] = get_gpo_status_code_corresponding_status(
        dat["version_code"])

    # parse date
    dat["issued_on"] = datetime.date(*(int(d) for d in dat["issued_on"].split("-")))

    # find content files
    basename += "/" + dat["version_code"]

    # The legacy HTML path is keyed by the xml_code, not the slug used above.
    bt2 = BillType.by_value(bill.bill_type).xml_code
    html_fn = "data/congress-bill-text-legacy/%s/%s/%s%d%s.html" % (
        bill.congress, bt2, bt2, bill.number, dat["version_code"])

    if os.path.exists(basename + "/mods.xml"):
        dat["mods_file"] = basename + "/mods.xml"

    # get a plain text file if one exists
    if os.path.exists(basename + "/document.txt"):
        dat["text_file"] = basename + "/document.txt"
        dat["has_displayable_text"] = True

        # NOTE(review): kept inside the document.txt branch; confirm against
        # the intended layout.
        for source in dat.get("sources", []):
            if source["source"] == "statutes":
                dat["text_file_source"] = "statutes"

    # get an HTML file if one exists
    if os.path.exists(html_fn):
        dat["html_file"] = html_fn
        dat["has_displayable_text"] = True

    # get a PDF file if one exists
    pdf_fn = basename + "/document.pdf"
    if os.path.exists(pdf_fn):
        dat["pdf_file"] = pdf_fn
        dat["has_thumbnail"] = True
        dat["thumbnail_path"] = bill.get_absolute_url() + "/_text_image"

    # get an XML file if one exists
    if os.path.exists(basename + "/catoxml.xml"):
        dat["xml_file"] = basename + "/catoxml.xml"
        dat["has_displayable_text"] = True
        dat["xml_file_source"] = "cato-deepbills"
    elif os.path.exists(basename + "/document.xml"):
        dat["xml_file"] = basename + "/document.xml"
        dat["has_displayable_text"] = True

    # In DEBUG mode a thumbnail is always advertised, even without a PDF.
    if settings.DEBUG:
        dat["has_thumbnail"] = True

    return dat
def load_bill_text_alt(bill, version, plain_text=False, mods_only=False):
    """Load bill text info from the Congress project JSON files.

    Args:
        bill: object providing congress, bill_type, number, id and title.
        version: version code naming the "<version>.json" file to read;
            when None, every "*.json" in the text-versions directory is
            scanned and the one with the largest "issued_on" value is used.
        plain_text: if true (and mods_only is false), return only the text
            as a string, with form feeds replaced by a line of "=" signs;
            "" is returned when the text is unavailable.
        mods_only: if true, the document text is not read and "text_html"
            is None in the result.

    Returns:
        In plain-text mode, a string. Otherwise a dict with the keys
        bill_id, bill_name, text_html (text escaped and wrapped in <pre>,
        form feeds replaced by <hr>), basename, docdate (a datetime.date),
        gpo_url, gpo_pdf_url, doc_version, doc_version_name and
        has_html_text.

    Raises:
        IOError: if no version metadata is found or the document text
            cannot be read (except in plain-text mode), or if the metadata
            lacks a PDF URL.
    """
    from bill.models import BillType  # has to be here and not module-level to avoid cyclic dependency
    import glob
    import json

    bt = BillType.by_value(bill.bill_type).slug
    basename = "data/congress/%d/bills/%s/%s%d/text-versions" % (
        bill.congress, bt, bt, bill.number)

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*.json"):
            # Close each handle promptly; there may be many version files.
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
    else:
        with open(basename + "/%s.json" % version) as f:
            dat = json.load(f)

    # Load the text content (unless mods_only is set).
    bill_text_content = None
    try:
        if not dat:
            raise IOError("Bill text is not available for this bill.")
        if not mods_only:
            with open(basename + "/" + dat["version_code"] + "/document.txt") as f:
                bill_text_content = f.read().decode("utf8")
    except IOError:
        # text not available
        if mods_only or not plain_text:
            raise  # these calls require raising
        bill_text_content = ""  # plain_text gets "" returned instead

    # Caller just wants the plain text?
    if not mods_only and plain_text:
        # replace form feeds with an indication of the page break
        return bill_text_content.replace(u"\u000C", "=============================================")

    # Caller wants HTML.
    if not mods_only:
        # Return the text wrapped in <pre>, and replace form feeds with an <hr>.
        import cgi
        bill_text_content = "<pre>" + cgi.escape(bill_text_content) + "</pre>"
        bill_text_content = bill_text_content.replace(u"\u000C", "<hr>")

    # Returning metadata?
    try:
        gpo_url = dat["urls"]["pdf"]
    except (KeyError, TypeError):
        # hmm, data format problem
        raise IOError("Bill metadata not available.")

    # Statutes PDFs get a link to the content-detail page instead of the PDF.
    m = re.match(r"http://www.gpo.gov/fdsys/pkg/(STATUTE-\d+)/pdf/(STATUTE-\d+-.*).pdf", gpo_url)
    if m:
        gpo_url = "http://www.gpo.gov/fdsys/granule/%s/%s/content-detail.html" % m.groups()

    return {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "text_html": bill_text_content,
        "basename": basename,
        "docdate": datetime.date(*(int(d) for d in dat["issued_on"].split("-"))),
        "gpo_url": gpo_url,
        "gpo_pdf_url": dat["urls"]["pdf"],
        "doc_version": dat["version_code"],
        "doc_version_name": bill_gpo_status_codes[dat["version_code"]],
        "has_html_text": True,
    }