def path_to_docid05(earmark_path): """ Input: list of lists of extracted earmark info \ Ouput: list of lists containing relevant document id \ """ bill_path = pt.BillPathUtils() report_path = pt.ReportPathUtils() congress = earmark_path[0] bill_report = earmark_path[1] chamber = earmark_path[2] number = earmark_path[3] if bill_report == 'bill': if chamber == 'senate': path = os.path.join(configuration.get_path_to_bills(), str(congress), '/bills/s/', str(number)) else: path = os.path.join(configuration.get_path_to_bills(), str(congress), '/bills/hr/', str(number)) all_versions = bill_path.get_all_versions(path) best_date = date(1900, 1, 1) for version in all_versions: npth = path + '/text-versions/' + version bill_date = pt.BillPathUtils(npth).bill_date() bill_date = datetime.strptime(bill_date, "%Y-%m-%d").date() if bill_date > best_date: best_date = bill_date best_version = version PATH_BILL = bill_path.get_bill_path(congress, number, best_version) doc_id = pt.BillPathUtils(PATH_BILL).get_db_document_id() else: if chamber == "senate": path = os.path.join(configuration.get_path_to_reports(), str(congress), "/senate/", str(number)) else: path = os.path.join(configuration.get_path_to_reports(), str(congress), "/house/", str(number)) all_versions = report_path.get_all_versions(path) rep_path = report_path.get_report_path(int(congress), chamber, int(number), all_versions[0]) doc_id = pt.ReportPathUtils(rep_path).get_db_document_id() return doc_id
def path_to_docid05(earmark_path): """ Input: list of lists of extracted earmark info \ Ouput: list of lists containing relevant document id \ """ bill_path = pt.BillPathUtils() report_path = pt.ReportPathUtils() congress = earmark_path[0] bill_report = earmark_path[1] chamber = earmark_path[2] number = earmark_path[3] if bill_report == 'bill': if chamber == 'senate': path = os.path.join(configuration.get_path_to_bills(), str(congress), '/bills/s/', str(number)) else: path = os.path.join(configuration.get_path_to_bills(), str(congress), '/bills/hr/', str(number)) all_versions = bill_path.get_all_versions(path) best_date = date(1900,1,1) for version in all_versions: npth = path + '/text-versions/' + version bill_date = pt.BillPathUtils(npth).bill_date() bill_date = datetime.strptime(bill_date,"%Y-%m-%d").date() if bill_date > best_date: best_date = bill_date best_version = version PATH_BILL = bill_path.get_bill_path(congress,number,best_version) doc_id = pt.BillPathUtils(PATH_BILL).get_db_document_id() else: if chamber == "senate": path = os.path.join(configuration.get_path_to_reports(), str(congress) , "/senate/", str(number)) else: path = os.path.join(configuration.get_path_to_reports(), str(congress), "/house/", str(number)) all_versions = report_path.get_all_versions(path) rep_path = report_path.get_report_path(int(congress),chamber,int(number),all_versions[0]) doc_id = pt.ReportPathUtils(rep_path).get_db_document_id() return doc_id
def path_to_docid08(earmarks): """ Input: list of lists of extracted earmark info Ouput: list of lists containing relevant document id """ bill_path = pt.BillPathUtils() report_path = pt.ReportPathUtils() database = [] for earmark in earmarks: earmark_id = earmark[0] page = earmark[2] excerpt = earmark[3] earmark_info = earmark[1] congress = int(earmark_info[0]) bill = earmark_info[1] chamber = earmark_info[2] number = earmark_info[3] if bill == 'bill': if isinstance(number,tuple): doc_ref = number[0] document_name = number[1] all_versions = bill_path.get_all_versions(os.path.join(configuration.get_path_to_bills(), '110/bills/hr/hr2764/')) if re.search('\Division\s\w',document_name): doc_string = re.findall('\Division\s\w',document_name)[0].replace(" ","") version_index = [div_type for div_type in all_versions if doc_string in i] version = version_index[0] pth = bill_path.get_bill_path(congress,doc_ref,version) doc_id = pt.BillPathUtils(pth).get_db_document_id() database.append([earmark_id,22552,page,excerpt]) database.append([earmark_id,22553,page,excerpt]) database.append([earmark_id,74460,page,excerpt]) database.append([earmark_id,74678,page,excerpt]) else: if chamber == 'senate': pth = os.path.join(configuration.get_path_to_bills(), str(congress), '/bills/s/', str(number)) else: pth = os.path.join(configuration.get_path_to_bills(), str(congress), '/bills/hr/', str(number)) all_versions = bill_path.get_all_versions(pth) best_date = date(1900,1,1) for version in all_versions: npth = pth + '/text-versions/' + version bill_date = pt.BillPathUtils(npth).bill_date() bill_date = datetime.strptime(bill_date,"%Y-%m-%d").date() if bill_date > best_date: best_date = bill_date best_version = version PATH_BILL = bill_path.get_bill_path(congress,number,best_version) doc_id = pt.BillPathUtils(PATH_BILL).get_db_document_id() if number == 'hr3222': database.append([earmark_id,74360,page,excerpt]) elif bill=='report': if chamber == "senate": pth = os.path.join(configuration.get_path_to_reports(), str(congress), "/senate/", str(number)) else: pth = os.path.join(configuration.get_path_to_reports(), str(congress), "/house/", str(number)) all_versions = report_path.get_all_versions(pth) rep_path = report_path.get_report_path(int(congress),chamber,int(number),all_versions[0]) doc_id = pt.ReportPathUtils(rep_path).get_db_document_id() database.append([earmark_id,doc_id,page,excerpt]) database_dict = {} for ids in database: key = (ids[0],ids[1]) value = [ids[2],ids[3]] if not key in database_dict.keys(): database_dict[key] = [value] else: database_dict[key].append(value) new_database = [] for key in database_dict.keys(): item = list(key) + database_dict[key][0] new_database.append(item) return new_database
import os, path, sys sys.path.insert(0, os.path.realpath(os.path.abspath(os.path.join(os.path.split(inspect.getfile( inspect.currentframe() ))[0],"..")))) from util import configuration import psycopg2 import re from bs4 import BeautifulSoup from datetime import datetime import json CONN_STRING = configuration.get_connection_string() root_dir = configuration.get_path_to_reports() reports_bills_dict = {} conn = psycopg2.connect(CONN_STRING) cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor) # loop through each folder def get_reports(path, congress): for report in os.listdir(path): if "mods.xml" not in os.listdir(os.path.join(path, report)): print "No metadata file found for directory %s" % os.path.join(path, report) else: xml_file = os.path.join(path, report, "mods.xml") associated_bill = get_associated_bill(xml_file) report_dir = os.listdir(os.path.join(path, report)) if "mods.xml" in report_dir: report_dir.remove("mods.xml") write_bill_association(os.path.join(path, report, report_dir[0]), get_bill_directory(associated_bill, congress)) def get_associated_bill(xml_file): bill_name = "" soup = BeautifulSoup(open(xml_file))
sys.path.insert( 0, os.path.realpath( os.path.abspath( os.path.join( os.path.split(inspect.getfile(inspect.currentframe()))[0], "..")))) from util import configuration import psycopg2 import re from bs4 import BeautifulSoup from datetime import datetime import json CONN_STRING = configuration.get_connection_string() root_dir = configuration.get_path_to_reports() reports_bills_dict = {} conn = psycopg2.connect(CONN_STRING) cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor) # loop through each folder def get_reports(path, congress): for report in os.listdir(path): if "mods.xml" not in os.listdir(os.path.join(path, report)): print "No metadata file found for directory %s" % os.path.join( path, report) else: xml_file = os.path.join(path, report, "mods.xml") associated_bill = get_associated_bill(xml_file) report_dir = os.listdir(os.path.join(path, report))
def path_to_docid08(earmarks): """ Input: list of lists of extracted earmark info Ouput: list of lists containing relevant document id """ bill_path = pt.BillPathUtils() report_path = pt.ReportPathUtils() database = [] for earmark in earmarks: earmark_id = earmark[0] page = earmark[2] excerpt = earmark[3] earmark_info = earmark[1] congress = int(earmark_info[0]) bill = earmark_info[1] chamber = earmark_info[2] number = earmark_info[3] if bill == 'bill': if isinstance(number, tuple): doc_ref = number[0] document_name = number[1] all_versions = bill_path.get_all_versions( os.path.join(configuration.get_path_to_bills(), '110/bills/hr/hr2764/')) if re.search('\Division\s\w', document_name): doc_string = re.findall('\Division\s\w', document_name)[0].replace(" ", "") version_index = [ div_type for div_type in all_versions if doc_string in i ] version = version_index[0] pth = bill_path.get_bill_path(congress, doc_ref, version) doc_id = pt.BillPathUtils(pth).get_db_document_id() database.append([earmark_id, 22552, page, excerpt]) database.append([earmark_id, 22553, page, excerpt]) database.append([earmark_id, 74460, page, excerpt]) database.append([earmark_id, 74678, page, excerpt]) else: if chamber == 'senate': pth = os.path.join(configuration.get_path_to_bills(), str(congress), '/bills/s/', str(number)) else: pth = os.path.join(configuration.get_path_to_bills(), str(congress), '/bills/hr/', str(number)) all_versions = bill_path.get_all_versions(pth) best_date = date(1900, 1, 1) for version in all_versions: npth = pth + '/text-versions/' + version bill_date = pt.BillPathUtils(npth).bill_date() bill_date = datetime.strptime(bill_date, "%Y-%m-%d").date() if bill_date > best_date: best_date = bill_date best_version = version PATH_BILL = bill_path.get_bill_path(congress, number, best_version) doc_id = pt.BillPathUtils(PATH_BILL).get_db_document_id() if number == 'hr3222': database.append([earmark_id, 74360, page, excerpt]) elif bill == 'report': if chamber == "senate": pth = os.path.join(configuration.get_path_to_reports(), str(congress), "/senate/", str(number)) else: pth = os.path.join(configuration.get_path_to_reports(), str(congress), "/house/", str(number)) all_versions = report_path.get_all_versions(pth) rep_path = report_path.get_report_path(int(congress), chamber, int(number), all_versions[0]) doc_id = pt.ReportPathUtils(rep_path).get_db_document_id() database.append([earmark_id, doc_id, page, excerpt]) database_dict = {} for ids in database: key = (ids[0], ids[1]) value = [ids[2], ids[3]] if not key in database_dict.keys(): database_dict[key] = [value] else: database_dict[key].append(value) new_database = [] for key in database_dict.keys(): item = list(key) + database_dict[key][0] new_database.append(item) return new_database