def form_bill_json_dict(xml_as_dict):
    """
    Handles converting a government bulk XML file to legacy dictionary form.

    @param xml_as_dict: parsed bill status document; the bill itself is read
        from C{xml_as_dict['billStatus']['bill']}
    @type xml_as_dict: dict
    @return: dictionary of bill attributes
    @rtype: dict
    """
    bill_dict = xml_as_dict['billStatus']['bill']

    # Identity of the bill, in the format [type][number]-[congress].
    bill_type = bill_dict['billType'].lower()
    bill_id = build_bill_id(bill_type, bill_dict['billNumber'], bill_dict['congress'])
    introduced_at = bill_dict.get('introducedDate', '')

    titles = bill_info.titles_for(bill_dict['titles']['item'])
    actions = bill_info.actions_for(
        bill_dict['actions']['item'],
        bill_id,
        bill_info.current_title_for(titles, 'official'),
    )
    status, status_date = bill_info.latest_status(actions, introduced_at)

    # The dictionary is filled in section by section; the key order matches
    # the legacy layout.
    bill_data = {
        'bill_id': bill_id,
        'bill_type': bill_type,
        'number': bill_dict.get('billNumber'),
        'congress': bill_dict.get('congress'),
        'url': billstatus_url_for(bill_id),
        'introduced_at': introduced_at,
    }

    # Sponsorship: only the first listed sponsor is used.
    primary_sponsor = bill_dict['sponsors']['item'][0]
    bill_data['by_request'] = primary_sponsor['byRequestType'] is not None
    bill_data['sponsor'] = bill_info.sponsor_for(primary_sponsor)
    bill_data['cosponsors'] = bill_info.cosponsors_for(bill_dict['cosponsors'])

    # Actions and the status derived from them.
    bill_data['actions'] = actions
    bill_data['history'] = bill_info.history_from_actions(actions)
    bill_data['status'] = status
    bill_data['status_at'] = status_date
    bill_data['enacted_as'] = bill_info.slip_law_from(actions)

    # Titles and summary.
    bill_data['titles'] = titles
    bill_data['official_title'] = bill_info.current_title_for(titles, 'official')
    bill_data['short_title'] = bill_info.current_title_for(titles, 'short')
    bill_data['popular_title'] = bill_info.current_title_for(titles, 'popular')
    bill_data['summary'] = bill_info.summary_for(bill_dict['summaries']['billSummaries'])

    # Subjects. The top term's case has changed with the new bulk data (it is
    # now in Title Case), so for backwards compatibility it is passed through
    # _fixup_top_term_case to match the old string. TODO: Remove one day?
    policy_area = bill_dict['policyArea']
    if policy_area:
        top_term = _fixup_top_term_case(policy_area['name'])
        subjects = [top_term]
    else:
        top_term = None
        subjects = []

    legislative_subjects = bill_dict['subjects']['billSubjects']['legislativeSubjects']
    if legislative_subjects:
        subjects.extend(item['name'] for item in legislative_subjects['item'])

    bill_data['subjects_top_term'] = top_term
    bill_data['subjects'] = sorted(subjects)

    # Cross references and bookkeeping.
    bill_data['related_bills'] = bill_info.related_bills_for(bill_dict['relatedBills'])
    bill_data['committees'] = bill_info.committees_for(bill_dict['committees']['billCommittees'])
    bill_data['amendments'] = bill_info.amendments_for(bill_dict['amendments'])
    bill_data['updated_at'] = bill_dict.get('updateDate', '')

    return bill_data
def proc_statute_volume(path, options):
    """
    Build bill data and "enr" bill-version records from one volume's MODS file.

    Parses C{path + "/mods.xml"} and walks its related items. Items whose
    granule class is not PUBLICLAW, PRIVATELAW, HCONRES or SCONRES are
    skipped, as are items without exactly one primary bill reference and laws
    whose Congress disagrees with the bill's Congress (each logged as an
    error). For every remaining item a bill dictionary is assembled and,
    unless C{options['textversions']} is truthy, written via
    C{bill_info.output_bill}. A bill-version JSON record is always written,
    and the granule's PDF (if present on disk) is optionally hard-linked
    and/or converted to text with C{pdftotext -layout}.

    @param path: directory containing C{mods.xml}; granule PDFs are looked up
        at C{<path>/<access id>/document.pdf}
    @type path: str
    @param options: run options. Keys read here: C{textversions}, C{linkpdf}
        and C{extracttext}; the mapping is also handed to
        C{utils.fetch_committee_names} and C{bill_info.output_bill}
    @type options: dict
    @return: C{{'ok': True, 'saved': True}} once all items were processed
    @rtype: dict
    @raise Exception: if C{pdftotext} exits with a non-zero status
    """
    mods = etree.parse(path + "/mods.xml")
    mods_ns = {"mods": "http://www.loc.gov/mods/v3"}

    # Load the THOMAS committee names for this Congress, which is our best
    # bet for normalizing committee names in the GPO data.
    congress = mods.find("/mods:extension[2]/mods:congress", mods_ns).text
    utils.fetch_committee_names(congress, options)

    # logging.warn is a deprecated alias (removed in Python 3.13).
    logging.warning("Processing %s (Congress %s)", path, congress)

    package_id = mods.find("/mods:extension[2]/mods:accessId", mods_ns).text

    for bill in mods.findall("/mods:relatedItem", mods_ns):
        # MODS files also contain information about:
        # ['BACKMATTER', 'FRONTMATTER', 'CONSTAMEND', 'PROCLAMATION', 'REORGPLAN']
        granule_class = bill.find("mods:extension/mods:granuleClass", mods_ns).text
        if granule_class not in ("PUBLICLAW", "PRIVATELAW", "HCONRES", "SCONRES"):
            continue

        # Get the title and source URL (used in error messages).
        title_text = bill.find("mods:titleInfo/mods:title", mods_ns).text.replace('""', '"')
        source_url = bill.find("mods:location/mods:url[@displayLabel='Content Detail']", mods_ns).text

        # Bill number: exactly one primary <bill> element is required.
        bill_elements = bill.findall("mods:extension/mods:bill[@priority='primary']", mods_ns)
        if not bill_elements:
            logging.error("No bill number identified for '%s' (%s)", title_text, source_url)
            continue
        if len(bill_elements) > 1:
            logging.error("Multiple bill numbers identified for '%s'", title_text)
            for be in bill_elements:
                # etree.tostring() returns bytes on Python 3; decode before
                # concatenating with a str to avoid a TypeError.
                be_xml = etree.tostring(be)
                if isinstance(be_xml, bytes):
                    be_xml = be_xml.decode("utf-8")
                logging.error(" -- " + be_xml.strip())
            logging.error(" @ " + source_url)
            continue

        bill_congress = bill_elements[0].attrib["congress"]
        bill_type = bill_elements[0].attrib["type"].lower()
        bill_number = bill_elements[0].attrib["number"]
        bill_id = "%s%s-%s" % (bill_type, bill_number, bill_congress)

        # Title
        titles = [{
            "title": title_text,
            "as": "enacted",
            "type": "official",
            "is_for_portion": False,
        }]

        # Subject
        descriptor = bill.find("mods:extension/mods:descriptor", mods_ns)
        subject = descriptor.text if descriptor is not None else None

        # Committees
        committees = []
        cong_committee = bill.find("mods:extension/mods:congCommittee", mods_ns)
        if cong_committee is not None:
            chambers = {"H": "House", "S": "Senate", "J": "Joint"}
            committee = chambers[cong_committee.attrib["chamber"]] + " " + cong_committee.find("mods:name", mods_ns).text
            committees.append({
                "committee": committee,
                "activity": [],  # XXX
                "committee_id": utils.committee_names[committee] if committee in utils.committee_names else None,
            })

        # The 'granuleDate' is the enactment date?
        granule_date = bill.find("mods:extension/mods:granuleDate", mods_ns).text

        sources = [{
            "source": "statutes",
            "package_id": package_id,
            "access_id": bill.find("mods:extension/mods:accessId", mods_ns).text,
            "source_url": source_url,
            "volume": bill.find("mods:extension/mods:volume", mods_ns).text,
            "page": bill.find("mods:part[@type='article']/mods:extent[@unit='pages']/mods:start", mods_ns).text,
            "position": bill.find("mods:extension/mods:pagePosition", mods_ns).text,
        }]

        law_elements = bill.findall("mods:extension/mods:law", mods_ns)

        # XXX: If <law> is missing, this assumes it is a concurrent resolution.
        # This may be a problem if the code is updated to accept joint resolutions for constitutional amendments.
        # (findall() always returns a list, so only the length needs checking.)
        if len(law_elements) != 1:
            other_chamber = {"HOUSE": "s", "SENATE": "h"}
            actions = [{
                "type": "vote",
                "vote_type": "vote2",
                "where": other_chamber[bill.find("mods:extension/mods:originChamber", mods_ns).text],
                "result": "pass",  # XXX
                "how": "unknown",  # XXX
                # "text": "",
                "acted_at": granule_date,  # XXX
                "status": "PASSED:CONCURRENTRES",
                "references": [],  # XXX
            }]
        else:
            law_congress = law_elements[0].attrib["congress"]
            law_number = law_elements[0].attrib["number"]
            law_type = "private" if law_elements[0].attrib["isPrivate"] == "true" else "public"

            # Check for typos in the metadata.
            if law_congress != bill_congress:
                logging.error(
                    "Congress mismatch for %s%s: %s or %s? (%s)",
                    bill_type, bill_number, bill_congress, law_congress, source_url,
                )
                continue

            actions = [{
                "congress": law_congress,
                "number": law_number,
                "type": "enacted",
                "law": law_type,
                "text": "Became %s Law No: %s-%s." % (law_type.capitalize(), law_congress, law_number),
                "acted_at": granule_date,  # XXX
                "status": "ENACTED:SIGNED",  # XXX: Check for overridden vetoes!
                "references": [],  # XXX
            }]

        status, status_date = bill_info.latest_status(actions)

        bill_data = {
            'bill_id': bill_id,
            'bill_type': bill_type,
            'number': bill_number,
            'congress': bill_congress,

            'introduced_at': None,  # XXX
            'sponsor': None,  # XXX
            'cosponsors': [],  # XXX

            'actions': actions,  # XXX
            'history': bill_info.history_from_actions(actions),
            'status': status,
            'status_at': status_date,
            'enacted_as': bill_info.slip_law_from(actions),

            'titles': titles,
            'official_title': bill_info.current_title_for(titles, "official"),
            'short_title': bill_info.current_title_for(titles, "short"),  # XXX
            'popular_title': bill_info.current_title_for(titles, "popular"),  # XXX

            'subjects_top_term': subject,
            'subjects': [],

            'related_bills': [],  # XXX: <associatedBills> usually only lists the current bill.
            'committees': committees,
            'amendments': [],  # XXX

            'sources': sources,
            'updated_at': datetime.datetime.fromtimestamp(time.time()),
        }

        if not options.get('textversions', False):
            bill_info.output_bill(bill_data, options)

        # XXX: Can't use bill_versions.fetch_version() because it depends on fdsys.
        version_code = "enr"
        bill_version_id = "%s%s-%s-%s" % (bill_type, bill_number, bill_congress, version_code)
        bill_version = {
            'bill_version_id': bill_version_id,
            'version_code': version_code,
            'issued_on': status_date,
            'urls': {"pdf": bill.find("mods:location/mods:url[@displayLabel='PDF rendition']", mods_ns).text},
            'sources': sources,
        }
        utils.write(
            json.dumps(bill_version, sort_keys=True, indent=2, default=utils.format_datetime),
            bill_versions.output_for_bill_version(bill_version_id)
        )

        # Process the granule PDF.
        # - Hard-link it into the right place to be seen as bill text.
        # - Run "pdftotext -layout" to convert it to plain text and save it in the bill text location.
        pdf_file = path + "/" + sources[0]["access_id"] + "/document.pdf"
        if os.path.exists(pdf_file):
            dst_path = fdsys.output_for_bill(bill_data["bill_id"], "text-versions/" + version_code, is_data_dot=False)
            if options.get("linkpdf", False):
                os.link(pdf_file, dst_path + "/document.pdf")  # a good idea
            if options.get("extracttext", False):
                # NOTE(review): this progress message is logged at error level
                # in the original; kept as-is so it stays visible by default.
                logging.error("Running pdftotext on %s...", pdf_file)
                if subprocess.call(["pdftotext", "-layout", pdf_file, dst_path + "/document.txt"]) != 0:
                    raise Exception("pdftotext failed on %s" % pdf_file)

    return {'ok': True, 'saved': True}
def proc_statute(path, options):
    """
    Build bill data and "enr" bill-version records from a MODS file.

    Parses C{path + "/mods.xml"} and walks its related items. Items whose
    granule class is not PUBLICLAW, PRIVATELAW, HCONRES or SCONRES are
    skipped, as are items without exactly one <bill> element and laws whose
    Congress disagrees with the bill's Congress (both logged as errors). For
    each remaining item the bill dictionary is written with
    C{bill_info.output_bill} and a bill-version JSON record is written with
    C{utils.write}.

    @param path: directory containing C{mods.xml}
    @type path: str
    @param options: run options, passed through to
        C{utils.fetch_committee_names} and C{bill_info.output_bill}
    @type options: dict
    @return: C{{'ok': True, 'saved': True}} once all items were processed
    @rtype: dict
    """
    # Kept as function-local imports (the original imported them inside the
    # loop body); hoisted so they run once per call.
    import json
    import bill_versions

    mods = etree.parse(path + "/mods.xml")
    mods_ns = {"mods": "http://www.loc.gov/mods/v3"}

    # Load the THOMAS committee names for this Congress, which is our best
    # bet for normalizing committee names in the GPO data.
    congress = mods.find("/mods:extension[2]/mods:congress", mods_ns).text
    utils.fetch_committee_names(congress, options)

    # logging.warn is a deprecated alias (removed in Python 3.13).
    logging.warning("Processing %s (Congress %s)", path, congress)

    for bill in mods.findall("/mods:relatedItem", mods_ns):
        # MODS files also contain information about:
        # ['BACKMATTER', 'FRONTMATTER', 'CONSTAMEND', 'PROCLAMATION', 'REORGPLAN']
        # Filter these out first so that non-bill granules are skipped before
        # any of their (possibly absent) child elements are dereferenced.
        granule_class = bill.find("mods:extension/mods:granuleClass", mods_ns).text
        if granule_class not in ("PUBLICLAW", "PRIVATELAW", "HCONRES", "SCONRES"):
            continue

        titles = [{
            "title": bill.find("mods:titleInfo/mods:title", mods_ns).text.replace('""', '"'),
            "as": "enacted",
            "type": "official",
        }]

        descriptor = bill.find("mods:extension/mods:descriptor", mods_ns)
        subject = descriptor.text if descriptor is not None else None

        committees = []
        cong_committee = bill.find("mods:extension/mods:congCommittee", mods_ns)
        if cong_committee is not None:
            chambers = {"H": "House", "S": "Senate", "J": "Joint"}
            committee = chambers[cong_committee.attrib["chamber"]] + " " + cong_committee.find("mods:name", mods_ns).text
            committees.append({
                "committee": committee,
                "activity": [],  # XXX
                "committee_id": utils.committee_names[committee] if committee in utils.committee_names else None,
            })

        # Exactly one <bill> element is required (findall() returns a list).
        bill_elements = bill.findall("mods:extension/mods:bill", mods_ns)
        if len(bill_elements) != 1:
            logging.error("Could not get bill data for %s", repr(titles))
            continue

        bill_congress = bill_elements[0].attrib["congress"]
        bill_type = bill_elements[0].attrib["type"].lower()
        bill_number = bill_elements[0].attrib["number"]
        bill_id = "%s%s-%s" % (bill_type, bill_number, bill_congress)

        acted_at = bill.find("mods:extension/mods:granuleDate", mods_ns).text

        law_elements = bill.findall("mods:extension/mods:law", mods_ns)

        # XXX: If <law> is missing, this assumes it is a concurrent resolution.
        # This may be a problem if the code is updated to accept joint resolutions for constitutional amendments.
        if len(law_elements) != 1:
            other_chamber = {"HOUSE": "s", "SENATE": "h"}
            action = {
                "type": "vote",
                "vote_type": "vote2",
                "where": other_chamber[bill.find("mods:extension/mods:originChamber", mods_ns).text],
                "result": "pass",  # XXX
                "how": "unknown",  # XXX
                # "text": "",
                "acted_at": acted_at,  # XXX
                "status": "PASSED:CONCURRENTRES",
                "references": [],  # XXX
            }
        else:
            law_congress = law_elements[0].attrib["congress"]
            law_number = law_elements[0].attrib["number"]
            law_type = "private" if law_elements[0].attrib["isPrivate"] == "true" else "public"

            # Check for typos in the metadata. This check only makes sense
            # when a <law> element exists: law_congress is not bound in the
            # concurrent-resolution branch, so running it unconditionally
            # would either fail or compare against a previous item's value.
            if law_congress != bill_congress:
                logging.error(
                    "Congress mismatch for %s%s: %s or %s?",
                    bill_type, bill_number, bill_congress, law_congress,
                )
                continue

            action = {
                "congress": law_congress,
                "number": law_number,
                "type": "enacted",
                "law": law_type,
                "text": "Became %s Law No: %s-%s." % (law_type.capitalize(), law_congress, law_number),
                "acted_at": acted_at,  # XXX
                "status": "ENACTED:SIGNED",  # XXX: Check for overridden vetoes!
                "references": [],  # XXX
            }

        actions = [action]

        status, status_date = bill_info.latest_status(actions)

        bill_data = {
            'bill_id': bill_id,
            'bill_type': bill_type,
            'number': bill_number,
            'congress': bill_congress,

            'introduced_at': None,  # XXX
            'sponsor': None,  # XXX
            'cosponsors': [],  # XXX

            'actions': actions,  # XXX
            'history': bill_info.history_from_actions(actions),
            'status': status,
            'status_at': status_date,
            'enacted_as': bill_info.slip_law_from(actions),

            'titles': titles,
            'official_title': bill_info.current_title_for(titles, "official"),
            'short_title': bill_info.current_title_for(titles, "short"),  # XXX
            'popular_title': bill_info.current_title_for(titles, "popular"),  # XXX

            # 'summary': summary,
            'subjects_top_term': subject,
            'subjects': [],

            'related_bills': [],  # XXX: <associatedBills> usually only lists the current bill.
            'committees': committees,
            'amendments': [],  # XXX

            'updated_at': datetime.datetime.fromtimestamp(time.time()),
        }

        bill_info.output_bill(bill_data, options)

        # XXX: Can't use bill_versions.fetch_version() because it depends on fdsys.
        version_code = "enr"
        bill_version_id = "%s%s-%s-%s" % (bill_type, bill_number, bill_congress, version_code)
        bill_version = {
            'bill_version_id': bill_version_id,
            'version_code': version_code,
            'issued_on': status_date,
            'urls': {"pdf": bill.find("mods:location/mods:url[@displayLabel='PDF rendition']", mods_ns).text},
        }

        utils.write(
            json.dumps(bill_version, sort_keys=True, indent=2, default=utils.format_datetime),
            bill_versions.output_for_bill_version(bill_version_id)
        )

    return {'ok': True, 'saved': True}
def form_bill_json_dict(xml_as_dict):
    """
    Handles converting a government bulk XML file to legacy dictionary form.

    @param xml_as_dict: parsed bill status document; the bill itself is read
        from C{xml_as_dict['billStatus']['bill']}
    @type xml_as_dict: dict
    @return: dictionary of bill attributes
    @rtype: dict
    """
    bill_dict = xml_as_dict['billStatus']['bill']

    # Identity of the bill, in the format [type][number]-[congress].
    bill_type = bill_dict['billType'].lower()
    bill_id = build_bill_id(bill_type, bill_dict['billNumber'], bill_dict['congress'])
    introduced_at = bill_dict.get('introducedDate', '')

    titles = bill_info.titles_for(bill_dict['titles']['item'])
    actions = bill_info.actions_for(
        bill_dict['actions']['item'],
        bill_id,
        bill_info.current_title_for(titles, 'official'),
    )
    status, status_date = bill_info.latest_status(actions, introduced_at)

    # The dictionary is filled in section by section; the key order matches
    # the legacy layout.
    bill_data = {
        'bill_id': bill_id,
        'bill_type': bill_type,
        'number': bill_dict.get('billNumber'),
        'congress': bill_dict.get('congress'),
        'url': billstatus_url_for(bill_id),
        'introduced_at': introduced_at,
    }

    # Sponsorship: only the first listed sponsor is used.
    primary_sponsor = bill_dict['sponsors']['item'][0]
    bill_data['by_request'] = primary_sponsor['byRequestType'] is not None
    bill_data['sponsor'] = bill_info.sponsor_for(primary_sponsor)
    bill_data['cosponsors'] = bill_info.cosponsors_for(bill_dict['cosponsors'])

    # Actions and the status derived from them.
    bill_data['actions'] = actions
    bill_data['history'] = bill_info.history_from_actions(actions)
    bill_data['status'] = status
    bill_data['status_at'] = status_date
    bill_data['enacted_as'] = bill_info.slip_law_from(actions)

    # Titles and summary.
    bill_data['titles'] = titles
    bill_data['official_title'] = bill_info.current_title_for(titles, 'official')
    bill_data['short_title'] = bill_info.current_title_for(titles, 'short')
    bill_data['popular_title'] = bill_info.current_title_for(titles, 'popular')
    bill_data['summary'] = bill_info.summary_for(bill_dict['summaries']['billSummaries'])

    # Subjects. The top term's case has changed with the new bulk data (it is
    # now in Title Case), so for backwards compatibility it is passed through
    # _fixup_top_term_case to match the old string. TODO: Remove one day?
    policy_area = bill_dict['policyArea']
    if policy_area:
        top_term = _fixup_top_term_case(policy_area['name'])
        subjects = [top_term]
    else:
        top_term = None
        subjects = []

    legislative_subjects = bill_dict['subjects']['billSubjects']['legislativeSubjects']
    if legislative_subjects:
        subjects.extend(item['name'] for item in legislative_subjects['item'])

    bill_data['subjects_top_term'] = top_term
    bill_data['subjects'] = sorted(subjects)

    # Cross references and bookkeeping.
    bill_data['related_bills'] = bill_info.related_bills_for(bill_dict['relatedBills'])
    bill_data['committees'] = bill_info.committees_for(bill_dict['committees']['billCommittees'])
    bill_data['amendments'] = bill_info.amendments_for(bill_dict['amendments'])
    bill_data['committee_reports'] = bill_info.committee_reports_for(bill_dict['committeeReports'])
    bill_data['updated_at'] = bill_dict.get('updateDate', '')

    return bill_data