def g(options, filename, *args, **kwargs): f = open(filename, "rb") progress = Progress(name='rows [%s]' % filename, step=2000) f.seek(0, os.SEEK_END) total=f.tell() f.seek(0, os.SEEK_SET) fmt = 'tsv' if "BillSubject" in filename: fmt = 'csv' for rownum, row in BT50FileReader( f, open(filename.replace(".txt", "FD.txt"), "rb"), "cp1252", delimiter = ',' if fmt == 'csv' else '\t', quotechar = '"' if fmt == 'csv' else None): # make a hash of the row so we can tell quickly if it's been changed, but # skip the LastUpdated column because it might be spurriously updated and # we don't care if the other fields didn't change anyway. row["_hash"] = hashlib.sha1(repr(sorted(kv for kv in row.items() if kv[0] != 'LastUpdated'))).hexdigest() rownum += 1 progress.tick(x=f.tell(), y=total) try: func(row, options, filename, *args, **kwargs) except: print "Error in %s line %d." % (filename, rownum) raise return None
def g(options, filename, *args, **kwargs): f = open(filename, "rb") progress = Progress(name='rows [%s]' % filename, step=2000) f.seek(0, os.SEEK_END) total = f.tell() f.seek(0, os.SEEK_SET) fmt = 'tsv' if "BillSubject" in filename: fmt = 'csv' for rownum, row in BT50FileReader( f, open(filename.replace(".txt", "FD.txt"), "rb"), "cp1252", delimiter=',' if fmt == 'csv' else '\t', quotechar='"' if fmt == 'csv' else None): # make a hash of the row so we can tell quickly if it's been changed, but # skip the LastUpdated column because it might be spurriously updated and # we don't care if the other fields didn't change anyway. row["_hash"] = hashlib.sha1( repr(sorted(kv for kv in row.items() if kv[0] != 'LastUpdated'))).hexdigest() rownum += 1 progress.tick(x=f.tell(), y=total) try: func(row, options, filename, *args, **kwargs) except: print "Error in %s line %d." % (filename, rownum) raise return None
def _sync_subcommittee(parent_obj, parent_code, subcom):
    """Upsert one subcommittee record under parent_obj; return its DB id."""
    code = parent_code + subcom["thomas_id"]
    try:
        sobj = Committee.objects.get(code=code)
    except Committee.DoesNotExist:
        print("New subcommittee:", code)
        sobj = Committee(code=code)
    sobj.name = subcom["name"]
    sobj.url = subcom.get("url", None)
    # NOTE(review): assigns ``type`` while the parent uses ``committee_type``
    # -- looks intentional in the original; confirm against the model.
    sobj.type = None
    sobj.committee = parent_obj
    sobj.obsolete = False
    sobj.save()
    return sobj.id

def parse_committee_names(options):
    """Sync Committee rows (and their subcommittees) from
    committees-current.yaml, marking DB rows not present in the file as
    obsolete."""
    log.info('Processing committees')
    COMMITTEES_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committees-current.yaml'
    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
        return
    tree = yaml_load(COMMITTEES_FILE)
    progress = Progress(total=len(tree))
    seen_committees = set()
    for committee in tree:
        code = committee["thomas_id"]
        try:
            cobj = Committee.objects.get(code=code)
        except Committee.DoesNotExist:
            print("New committee:", code)
            cobj = Committee(code=code)
        cobj.committee_type = TYPE_MAPPING[committee["type"]]
        cobj.name = committee["name"]
        cobj.url = committee.get("url", None)
        cobj.obsolete = False
        cobj.committee = None
        cobj.jurisdiction = committee.get("jurisdiction")
        cobj.jurisdiction_link = committee.get("jurisdiction_source")
        cobj.save()
        seen_committees.add(cobj.id)
        for subcom in committee.get('subcommittees', []):
            seen_committees.add(_sync_subcommittee(cobj, code, subcom))
        progress.tick()
    # Check for non-obsolete committees in the database that aren't in our
    # file.
    other_committees = Committee.objects.filter(obsolete=False).exclude(id__in=seen_committees)
    if len(other_committees) > 0:
        print("Marking obsolete:", ", ".join(c.code for c in other_committees))
        other_committees.update(obsolete=True)
    File.objects.save_file(COMMITTEES_FILE)
def parse_committee_names(options):
    """Load committees-current.yaml and mirror it into Committee rows.

    Creates missing committees/subcommittees, refreshes the fields of
    existing ones, and flips ``obsolete`` on any non-obsolete DB row that no
    longer appears in the file. Skipped entirely when the file is unchanged
    and --force is not given.
    """
    log.info('Processing committees')
    COMMITTEES_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committees-current.yaml'
    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        progress = Progress(total=len(tree))
        seen = set()
        for node in tree:
            code = node["thomas_id"]
            try:
                parent = Committee.objects.get(code=code)
            except Committee.DoesNotExist:
                print("New committee:", code)
                parent = Committee(code=code)
            parent.committee_type = TYPE_MAPPING[node["type"]]
            parent.name = node["name"]
            parent.url = node.get("url", None)
            parent.obsolete = False
            parent.committee = None
            parent.jurisdiction = node.get("jurisdiction")
            parent.jurisdiction_link = node.get("jurisdiction_source")
            parent.save()
            seen.add(parent.id)
            # Subcommittee codes are the parent code plus the sub's own id.
            for sub in node.get('subcommittees', []):
                subcode = code + sub["thomas_id"]
                try:
                    child = Committee.objects.get(code=subcode)
                except Committee.DoesNotExist:
                    print("New subcommittee:", subcode)
                    child = Committee(code=subcode)
                child.name = sub["name"]
                child.url = sub.get("url", None)
                child.type = None
                child.committee = parent
                child.obsolete = False
                child.save()
                seen.add(child.id)
            progress.tick()
        # Anything still live in the DB but absent from the file is obsolete.
        leftovers = Committee.objects.filter(obsolete=False).exclude(id__in=seen)
        if len(leftovers) > 0:
            print("Marking obsolete:", ", ".join(c.code for c in leftovers))
            leftovers.update(obsolete=True)
        File.objects.save_file(COMMITTEES_FILE)
def parse_committee_members(options):
    """Rebuild all CommitteeMember rows from
    committee-membership-current.yaml (full delete-and-recreate)."""
    log.info('Processing committee members')
    MEMBERS_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committee-membership-current.yaml'
    if not File.objects.is_changed(MEMBERS_FILE) and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
        return
    # map Bioguide IDs to GovTrack IDs
    person_id_map = dict(
        (m["id"]["bioguide"], m["id"]["govtrack"])
        for m in yaml_load(settings.CONGRESS_PROJECT_PATH + "/congress-legislators/legislators-current.yaml")
        if "id" in m and "govtrack" in m["id"] and "bioguide" in m["id"])
    # load committee members
    tree = yaml_load(MEMBERS_FILE)
    progress = Progress(total=len(tree), name='committees')
    # Nothing holds a foreign key to CommitteeMember, so wiping and
    # recreating everything is safe.
    CommitteeMember.objects.all().delete()
    for code, members in tree.items():
        try:
            cobj = Committee.objects.get(code=code)
        except Committee.DoesNotExist:
            print("Committee not found:", code)
            continue
        for member in members:
            mobj = CommitteeMember()
            mobj.person = Person.objects.get(id=person_id_map[member["bioguide"]])
            mobj.committee = cobj
            if "title" in member:
                mobj.role = ROLE_MAPPING[member["title"]]
            mobj.save()
        progress.tick()
    File.objects.save_file(MEMBERS_FILE)
def parse_committee_members(options):
    """Reload committee membership: drop every CommitteeMember row and
    recreate them from the committee-membership YAML file."""
    log.info('Processing committee members')
    MEMBERS_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)
    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # Membership is keyed by Bioguide ID; our Person table is keyed by
        # GovTrack ID, so build a translation table first.
        current_legislators = yaml_load(settings.CONGRESS_PROJECT_PATH + "/congress-legislators/legislators-current.yaml")
        bioguide_to_govtrack = {}
        for record in current_legislators:
            if "id" in record and "govtrack" in record["id"] and "bioguide" in record["id"]:
                bioguide_to_govtrack[record["id"]["bioguide"]] = record["id"]["govtrack"]
        membership = yaml_load(MEMBERS_FILE)
        progress = Progress(total=len(membership), name='committees')
        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()
        for committee_code, member_list in membership.items():
            try:
                committee = Committee.objects.get(code=committee_code)
            except Committee.DoesNotExist:
                print("Committee not found:", committee_code)
                continue
            for entry in member_list:
                membership_row = CommitteeMember()
                membership_row.person = Person.objects.get(id=bioguide_to_govtrack[entry["bioguide"]])
                membership_row.committee = committee
                if "title" in entry:
                    membership_row.role = ROLE_MAPPING[entry["title"]]
                membership_row.save()
            progress.tick()
        File.objects.save_file(MEMBERS_FILE)
def main(options):
    """
    Process amendments

    Walks the amendment XML files (optionally limited to one congress and/or
    a filename regex), upserts Amendment records, and links House amendments
    to the roll-call Votes named in their action lines. Files that are
    unchanged (and not --force) are skipped but still counted as "seen" so
    their amendments survive the end-of-run missing check.
    """
    if options.congress:
        files = glob.glob('data/us/%s/bills.amdt/*.xml' % options.congress)
        log.info('Parsing amendments of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills.amdt/*.xml')
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
    log.info('Processing amendments: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)
    amendment_processor = AmendmentProcessor()
    # IDs of every amendment encountered this run, used for the final
    # "should be deleted" report.
    seen_amdt_ids = []
    for fname in files:
        progress.tick()
        if not File.objects.is_changed(fname) and not options.force:
            # Unchanged file: look up the existing record just to record its
            # id so it isn't reported as missing below.
            m = re.match(r"data/us/(\d+)/bills.amdt/([sh])(\d+).xml", fname)
            if not m:
                print "Invalid file name", fname
            else:
                amdt = Amendment.objects.get(congress=m.group(1), amendment_type=AmendmentType.by_slug(m.group(2)), number=m.group(3))
                seen_amdt_ids.append(amdt.id) # don't delete me later
            continue
        tree = etree.parse(fname)
        node = tree.xpath('/amendment')[0]
        try:
            amdt = amendment_processor.process(Amendment(), node)
        except:
            # name the file before letting the error propagate
            print fname
            raise
        if not amdt:
            # Amendments to treaties. Can't process.
            continue
        # update if already in db
        try:
            amdt.id = Amendment.objects.get(congress=amdt.congress, amendment_type=amdt.amendment_type, number=amdt.number).id
        except Amendment.DoesNotExist:
            pass # a new amendment
        seen_amdt_ids.append(amdt.id) # don't delete me later
        try:
            amdt.save()
        except:
            print amdt
            raise
        # For House votes on amendments, the only way to associate the vote with the
        # amendment is to use the THOMAS/Congress.gov action lines. The House vote XML
        # has an amendment-num field but its meaning is ambiguous, so it is useless.
        # When we parse a House amendment with an action line referencing a roll call vote,
        # save this amendment as that vote's related_amendment, then mark the vote as
        # 'missing data' (below) so that on the next parse of votes its title gets updated.
        if amdt.amendment_type == AmendmentType.house_amendment:
            for vote in node.xpath("actions/vote[@how='roll']"):
                # session is derived from the action date, not stored in the node
                v_congress, v_session = get_session_from_date(XmlProcessor.parse_datetime(vote.get('datetime')).date())
                v_roll = int(vote.get("roll"))
                try:
                    vote = Vote.objects.get(congress=v_congress, chamber=CongressChamber.house, session=v_session, number=v_roll)
                    vote.related_amendment = amdt
                    vote.save()
                except Vote.DoesNotExist:
                    print "Missing vote data in", fname
        # If this amendment is related to a vote, mark the vote as missing data because
        # we may need to update the vote title if the amendment title has changed.
        Vote.objects.filter(related_amendment=amdt).update(missing_data=True)
        File.objects.save_file(fname)
    # Are any amendments in the database no longer on disk?
    if options.congress and not options.filter:
        missing = Amendment.objects.filter(congress=options.congress).exclude(id__in = seen_amdt_ids)
        if missing.exists():
            # report only -- deletion is left to a human
            print "Amendments should be deleted: ", missing
def main(options):
    """
    Parse rolls.

    Loads roll-call vote XML files (selected by --filter glob, --congress,
    or all), upserts Vote/VoteOption/Voter records, rewrites vote question
    text using related bills/amendments, and recomputes vote totals. Each
    file is processed inside one DB transaction; per-file failures are
    logged and parsing continues.
    """
    # Setup XML processors
    vote_processor = VoteProcessor()
    option_processor = VoteOptionProcessor()
    voter_processor = VoterProcessor()
    # Preload every Person so the voter processor never hits the DB per-voter.
    voter_processor.PERSON_CACHE = dict((x.pk, x) for x in Person.objects.all())
    # The pattern which the roll file matches
    # Filename contains info which should be placed to DB
    # along with info extracted from the XML file
    re_path = re.compile('data/us/(\d+)/rolls/([hs])(\w+)-(\d+)\.xml')
    chamber_mapping = {'s': CongressChamber.senate, 'h': CongressChamber.house}
    if options.filter:
        files = glob.glob(options.filter)
        log.info('Parsing rolls matching %s' % options.filter)
    elif options.congress:
        files = glob.glob('data/us/%s/rolls/*.xml' % options.congress)
        log.info('Parsing rolls of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/rolls/*.xml')
    log.info('Processing votes: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=10)

    def log_delete_qs(qs):
        # Delete a queryset of obsoleted records, announcing what goes.
        if qs.count() == 0: return
        print "Deleting obsoleted records: ", qs
        #if qs.count() > 3:
        #    print "Delete skipped..."
        #    return
        qs.delete()

    seen_obj_ids = set()
    had_error = False
    for fname in files:
        progress.tick()
        match = re_path.search(fname)
        try:
            existing_vote = Vote.objects.get(congress=match.group(1), chamber=chamber_mapping[match.group(2)], session=match.group(3), number=match.group(4))
        except Vote.DoesNotExist:
            existing_vote = None
        # Skip unchanged files -- unless forced, or the vote previously
        # recorded that it is missing data and needs another pass.
        if not File.objects.is_changed(fname) and not options.force and existing_vote != None and not existing_vote.missing_data:
            seen_obj_ids.add(existing_vote.id)
            continue
        try:
            tree = etree.parse(fname)
            ## Look for votes with VP tie breakers.
            #if len(tree.xpath("/roll/voter[@VP='1']")) == 0:
            #    had_error = True # prevent delete at the end
            #    continue
            # Process role object
            roll_node = tree.xpath('/roll')[0]
            # Sqlite is much faster when lots of saves are wrapped in a transaction,
            # and we do a lot of saves because it's a lot of voters.
            from django.db import transaction
            with transaction.atomic():
                vote = vote_processor.process(Vote(), roll_node)
                if existing_vote:
                    # reuse the PK so save() updates rather than inserts
                    vote.id = existing_vote.id
                match = re_path.search(fname)
                vote.congress = int(match.group(1))
                vote.chamber = chamber_mapping[match.group(2)]
                vote.session = match.group(3)
                vote.number = int(match.group(4))

                # Get related bill & amendment.
                for bill_node in roll_node.xpath("bill"):
                    related_bill_num = bill_node.get("number")
                    if 9 <= vote.congress <= 42 and vote.session in ('1', '2'):
                        # Bill numbering from the American Memory colletion is different. The number combines
                        # the session, bill number, and a 0 or 5 for regular or 'half' numbering. Prior to
                        # the 9th congress numbering seems to be wholly assigned by us and not related to
                        # actual numbering, so we skip matching those bills.
                        related_bill_num = "%d%04d%d" % (int(vote.session), int(bill_node.get("number")), 0)
                    try:
                        vote.related_bill = Bill.objects.get(congress=bill_node.get("session"), bill_type=BillType.by_xml_code(bill_node.get("type")), number=related_bill_num)
                    except Bill.DoesNotExist:
                        if vote.congress >= 93:
                            # modern-era vote referencing a bill we don't have
                            vote.missing_data = True
                for amdt_node in roll_node.xpath("amendment"):
                    if amdt_node.get("ref") == "regular" and vote.related_bill is not None:
                        # "regular" refs carry a chamber letter + number (e.g. s123)
                        try:
                            vote.related_amendment = Amendment.objects.get(congress=vote.related_bill.congress, amendment_type=AmendmentType.by_slug(amdt_node.get("number")[0]), number=amdt_node.get("number")[1:])
                        except Amendment.DoesNotExist:
                            if vote.congress >= 93:
                                print "Missing amendment", fname
                                vote.missing_data = True
                    elif amdt_node.get("ref") == "bill-serial":
                        # It is impossible to associate House votes with amendments just from the House
                        # vote XML because the amendment-num might correspond either with the A___ number
                        # or with the "An amendment, numbered ____" number from the amendment purpose,
                        # and there's really no way to figure out which. Maybe we can use the amendment
                        # sponsor instead?
                        #vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number"))
                        # Instead, we set related_amendment from the amendment parser. Here, we have to
                        # preserve the related_amendment if it is set.
                        if existing_vote:
                            vote.related_amendment = existing_vote.related_amendment

                # clean up some question text and use the question_details field
                if vote.category in (VoteCategory.passage, VoteCategory.passage_suspension, VoteCategory.veto_override) and vote.related_bill:
                    # For passage votes, set the question to the bill title and put the question
                    # details in the details field.
                    vote.question = vote.related_bill.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()
                elif vote.category == VoteCategory.amendment and vote.related_amendment:
                    # For votes on amendments, make a better title/explanation.
                    vote.question = vote.related_amendment.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()
                elif vote.related_bill and vote.question.startswith("On the Cloture Motion " + vote.related_bill.display_number):
                    vote.question = "Cloture on " + vote.related_bill.title
                elif vote.related_bill and vote.question.startswith("On Cloture on the Motion to Proceed " + vote.related_bill.display_number):
                    vote.question = "Cloture on " + vote.related_bill.title
                    vote.question_details = "On Cloture on the Motion to Proceed in the " + vote.get_chamber_display()
                elif vote.related_bill and vote.question.startswith("On the Motion to Proceed " + vote.related_bill.display_number):
                    vote.question = "Motion to Proceed on " + vote.related_bill.title
                elif vote.related_amendment and vote.question.startswith("On the Cloture Motion " + vote.related_amendment.get_amendment_type_display() + " " + str(vote.related_amendment.number)):
                    vote.question = "Cloture on " + vote.related_amendment.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()
                # weird House foratting of bill numbers ("H RES 123 Blah blah")
                if vote.related_bill:
                    vote.question = re.sub("(On [^:]+): " + vote.related_bill.display_number.replace(". ", " ").replace(".", " ").upper() + " .*", r"\1: " + truncatewords(vote.related_bill.title, 15), vote.question)

                vote.save()
                seen_obj_ids.add(vote.id) # don't delete me later

                # Process roll options, overwrite existing options where possible.
                seen_option_ids = set()
                roll_options = {}
                for option_node in roll_node.xpath('./option'):
                    option = option_processor.process(VoteOption(), option_node)
                    option.vote = vote
                    if existing_vote:
                        try:
                            option.id = VoteOption.objects.filter(vote=vote, key=option.key)[0].id # get is better, but I had the database corruption problem
                        except IndexError:
                            pass
                    option.save()
                    roll_options[option.key] = option
                    seen_option_ids.add(option.id)
                log_delete_qs(VoteOption.objects.filter(vote=vote).exclude(id__in=seen_option_ids)) # may cascade and delete the Voters too?

                # Process roll voters, overwriting existing voters where possible.
                if existing_vote:
                    existing_voters = dict(Voter.objects.filter(vote=vote).values_list("person", "id"))
                seen_voter_ids = set()
                voters = list()
                for voter_node in roll_node.xpath('./voter'):
                    voter = voter_processor.process(roll_options, Voter(), voter_node)
                    voter.vote = vote
                    voter.created = vote.created
                    # for VP votes, load the actual person & role...
                    if voter.voter_type == VoterType.vice_president:
                        try:
                            r = PersonRole.objects.get(role_type=RoleType.vicepresident, startdate__lte=vote.created, enddate__gte=vote.created)
                            voter.person_role = r
                            voter.person = r.person
                        except PersonRole.DoesNotExist:
                            # overlapping roles? missing data?
                            log.error('Could not resolve vice president in %s' % fname)
                    if existing_vote and voter.person:
                        try:
                            voter.id = existing_voters[voter.person.id]
                        except KeyError:
                            pass
                    voters.append(voter)
                    if voter.voter_type == VoterType.unknown and not vote.missing_data:
                        vote.missing_data = True
                        vote.save()

                # pre-fetch the role of each voter
                load_roles_at_date([x.person for x in voters if x.person != None], vote.created, vote.congress)
                # iterate over a copy because voters may be removed below
                for voter in list(voters):
                    if voter.voter_type != VoterType.vice_president:
                        voter.person_role = voter.person.role
                    # If we couldn't match a role for this person on the date of the vote, and if the voter was Not Voting,
                    # and we're looking at historical data, then this is probably a data error --- the voter wasn't even in office.
                    if voter.person_role is None:
                        if vote.source == VoteSource.keithpoole and voter.option.key == "0":
                            # Drop this record.
                            voters.remove(voter)
                        else:
                            log.error("%s: Could not find role for %s on %s." % (fname, voter.person, vote.created))
                            vote.missing_data = True
                            vote.save()

                # save all of the records (inserting/updating)
                for voter in voters:
                    voter.save()
                    seen_voter_ids.add(voter.id)

                # remove obsolete voter records
                log_delete_qs(Voter.objects.filter(vote=vote).exclude(id__in=seen_voter_ids)) # possibly already deleted by cascade above

                # pre-calculate totals
                vote.calculate_totals()

                if not options.disable_events:
                    vote.create_event()

            File.objects.save_file(fname)

        except Exception, ex:
            # log and move on so one bad file doesn't abort the whole run
            log.error('Error in processing %s' % fname, exc_info=ex)
            had_error = True
def main(options):
    """
    Process bill terms and bills

    Four phases: (1) sync BillTerm records from the old (liv.xml) and new
    (liv111.xml, crsnet.xml) vocabularies, deleting terms no longer present;
    (2) upsert Bill records from per-congress XML files; (3) scrape
    docs.house.gov for bills scheduled this week; (4) scrape the Senate
    floor schedule page.
    """
    # Terms
    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            # KeyError -> new term; create it
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()
            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        # NOTE(review): logs the parent term, not the subterm
                        # being created -- possibly intentional; confirm.
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    # Any cached term that was never seen in the files is gone: delete it.
    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills
    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        if not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)
                # Update the index/events for any bill with recently changed text
                textfile = "data/us/bills.text/%s/%s/%s%s.txt" % (m.group(1), m.group(2), m.group(2), m.group(3))
                if (bill_index and not options.disable_events) and os.path.exists(textfile) and File.objects.is_changed(textfile):
                    bill_index.update_object(b, using="bill") # index the full text
                    b.create_events() # events for new bill text documents
                    File.objects.save_file(textfile)
                continue
            except Bill.DoesNotExist:
                pass # just parse as normal

        if options.slow:
            time.sleep(1)

        skip_stuff = False

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            if not skip_stuff:
                try:
                    bill = bill_processor.process(Bill(), node)
                except:
                    print fname
                    raise
            else:
                m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
                bill = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))

            seen_bill_ids.append(bill.id) # don't delete me later

            # Build the list of major actions and pick out slip-law numbers
            # from enactment actions.
            actions = []
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/*[@state]"):
                actions.append((repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))), BillStatus.by_xml_code(axn.xpath("string(@state)")), axn.xpath("string(text)")))
                if actions[-1][1] in (BillStatus.enacted_signed, BillStatus.enacted_veto_override):
                    bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                    bill.sliplawnum = int(axn.get("number").split("-")[1])
            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise
            if bill_index:
                bill_index.update_object(bill, using="bill")
            if not options.disable_events:
                bill.create_events()

        if not skip_stuff:
            File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter and False:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        Bill.objects.filter(congress=options.congress).exclude(id__in = seen_bill_ids).delete()

    # Parse docs.house.gov for what might be coming up this week.
    import iso8601
    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"", dhg_html)
    if not m:
        log.error('No docs.house.gov download link found at http://docs.house.gov.')
    else:
        def bt_re(bt):
            # regex fragment matching a bill-type display string, with
            # optional dots/whitespace
            return re.escape(bt[1]).replace(r"\.", "\.?\s*")
        try:
            dhg = etree.parse(urllib.urlopen("http://docs.house.gov/floor/" + m.group(1))).getroot()
        except:
            print "http://docs.house.gov/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            m = re.match("\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?(" + "|".join(bt_re(bt) for bt in BillType) + ")(\d+)\s*(\[Conference Report\]\s*)?$", billname, re.I)
            if not m:
                if billname.strip() != "H.R. __":
                    log.error('Could not parse legis-num "%s" in docs.house.gov.' % billname)
            else:
                # figure out which BillType actually matched
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1)):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS, bill_type=bt[0], number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index:
                                bill_index.update_object(bill, using="bill")
                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error('Could not find bill "%s" in docs.house.gov.' % billname)
                        break
                else:
                    # for-else: no bill type matched
                    log.error('Could not parse legis-num bill type "%s" in docs.house.gov.' % billname)

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen("http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm").read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            # only refresh the postdate if it's unset or older than a week
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index:
                    bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
def main(options):
    """
    Update Person and PersonRole models.

    Do safe update: touch only those records
    which have been changed.

    Merges the four congress-legislators YAML files into one record per
    GovTrack ID, then upserts Person rows and fuzzily matches each person's
    YAML terms against existing PersonRole rows before creating/updating
    them. Persons missing from the files are reported (deletion is guarded
    off). Skips entirely if no source file changed and --force is not set.
    """
    BASE_PATH = CONGRESS_LEGISLATORS_PATH
    SRC_FILES = ['legislators-current', 'legislators-historical', 'legislators-social-media', 'executive'] # order matters

    # Bail out early unless at least one source file changed (or --force).
    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        if not File.objects.is_changed(f) and not options.force:
            log.info('File %s was not changed' % f)
        else:
            # file modified...
            break
    else:
        # no 'break' ==> no files modified
        return

    # Start parsing.
    had_error = False

    # Get combined data.
    legislator_data = {}
    leg_id_map = {}
    for p in SRC_FILES:
        log.info('Opening %s...' % p)
        f = BASE_PATH + p + ".yaml"
        y = yaml_load(f)
        for m in y:
            if p == "legislators-current":
                # We know all terms but the last are non-current and the last is.
                for r in m["terms"]:
                    r["current"] = False
                m["terms"][-1]["current"] = True
            elif p == "legislators-historical":
                # We know all terms are non-current.
                for r in m["terms"]:
                    r["current"] = False

            if p != 'legislators-social-media':
                govtrack_id = m["id"].get("govtrack")
                # For the benefit of the social media file, make a mapping of IDs.
                for k, v in m["id"].items():
                    if type(v) != list:
                        leg_id_map[(k, v)] = govtrack_id
            else:
                # GovTrack IDs are not always listed in this file.
                govtrack_id = None
                for k, v in m["id"].items():
                    if type(v) != list and (k, v) in leg_id_map:
                        govtrack_id = leg_id_map[(k, v)]
                        break

            if not govtrack_id:
                print("No GovTrack ID:")
                pprint.pprint(m)
                had_error = True
                continue

            # First file to mention an ID wins; later files only layer on
            # social data or extra executive terms.
            if govtrack_id not in legislator_data:
                legislator_data[govtrack_id] = m
            elif p == "legislators-social-media":
                legislator_data[govtrack_id]["social"] = m["social"]
            elif p == "executive":
                legislator_data[govtrack_id]["terms"].extend(m["terms"])
            else:
                raise ValueError("Duplication in an unexpected way (%d, %s)." % (govtrack_id, p))

    person_processor = PersonProcessor()
    role_processor = PersonRoleProcessor()

    existing_persons = set(Person.objects.values_list('pk', flat=True))
    processed_persons = set()
    created_persons = set()

    progress = Progress(total=len(legislator_data))
    log.info('Processing persons')

    for node in legislator_data.values():
        # Wrap each iteration in try/except
        # so that if some node breaks the parsing process
        # then other nodes could be parsed
        try:
            person = person_processor.process(Person(), node)

            # Create cached name strings. This is done again later
            # after the roles are updated.
            person.set_names()

            # Now try to load the person with such ID from
            # database. If found it then just update it
            # else create new Person object
            try:
                ex_person = Person.objects.get(pk=person.pk)
                if person_processor.changed(ex_person, person) or options.force:
                    # If the person has PK of existing record,
                    # coming in via the YAML-specified GovTrack ID,
                    # then Django ORM will update existing record
                    if not options.force:
                        log.warn("Updated %s" % person)
                    person.save()
            except Person.DoesNotExist:
                created_persons.add(person.pk)
                person.save()
                log.warn("Created %s" % person)

            processed_persons.add(person.pk)

            # Parse all of the roles.
            new_roles = []
            for termnode in node['terms']:
                role = role_processor.process(PersonRole(), termnode)
                role.person = person
                role.extra = filter_yaml_term_structure(termnode) # copy in the whole YAML structure

                # Is this role current? For legislators, same as whether it came from legislators-current, which eases Jan 3 transitions when we can't distinguish by date.
                if "current" in termnode:
                    role.current = termnode["current"]
                # But executives...
                else:
                    now = datetime.now().date()
                    role.current = role.startdate <= now and role.enddate >= now
                    # Because of date overlaps at noon transition dates, ensure that only the last term that covers
                    # today is current --- reset past roles to not current. Doesn't handle turning off retirning people tho.
                    for r in new_roles:
                        r.current = False

                # Scan for most recent leadership role within the time period of this term,
                # which isn't great for Senators because it's likely it changed a few times
                # within a term, especially if there was a party switch.
                role.leadership_title = None
                for leadership_node in node.get("leadership_roles", []):
                    # must match on date and chamber
                    if leadership_node["start"] >= role.enddate.isoformat():
                        continue # might start on the same day but is for the next Congress
                    if "end" in leadership_node and leadership_node["end"] <= role.startdate.isoformat():
                        continue # might start on the same day but is for the previous Congress
                    if leadership_node["chamber"] != RoleType.by_value(role.role_type).congress_chamber.lower():
                        continue
                    role.leadership_title = leadership_node["title"]

                new_roles.append(role)

            # Try matching the new roles to existing db records. Since we don't have a primry key
            # in the source data, we have to match on the record values. But because of errors in data,
            # term start/end dates can change, so matching has to be a little fuzzy.
            existing_roles = list(PersonRole.objects.filter(person=person))
            matches = []
            def run_match_rule(rule):
                # Pair up unmatched new/existing roles that satisfy `rule`,
                # removing each matched role from its pool as we go.
                import itertools
                for new_role, existing_role in itertools.product(new_roles, existing_roles):
                    if new_role not in new_roles or existing_role not in existing_roles:
                        continue # already matched on a previous iteration
                    if new_role.role_type != existing_role.role_type:
                        continue
                    if new_role.state != existing_role.state:
                        continue
                    if rule(new_role, existing_role):
                        matches.append((new_role, existing_role))
                        new_roles.remove(new_role)
                        existing_roles.remove(existing_role)
            # First match exactly, then exact on just one date, then on contractions and expansions.
            run_match_rule(lambda new_role, existing_role: new_role.startdate == existing_role.startdate and new_role.enddate == existing_role.enddate)
            run_match_rule(lambda new_role, existing_role: new_role.startdate == existing_role.startdate or new_role.enddate == existing_role.enddate)
            run_match_rule(lambda new_role, existing_role: new_role.startdate >= existing_role.startdate and new_role.enddate <= existing_role.enddate)
            run_match_rule(lambda new_role, existing_role: new_role.startdate <= existing_role.startdate and new_role.enddate >= existing_role.enddate)

            # Update the database entries that correspond with records in the data file.
            did_update_any = False
            for new_role, existing_role in matches:
                new_role.id = existing_role.id
                if role_processor.changed(existing_role, new_role) or options.force:
                    new_role.save()
                    did_update_any = True
                    if not options.force:
                        log.warn("Updated %s" % new_role)

            # If we have mutliple records on disk that didn't match and multiple records in the database
            # that didn't match, then we don't know how to align them.
            if len(new_roles) > 0 and len(existing_roles) > 0:
                print(new_roles)
                print(existing_roles)
                raise Exception("There is an unmatched role.")

            # Otherwise if there are any unmatched new roles, we can just add them.
            for role in new_roles:
                log.warn("Created %s" % role)
                role.save()
                did_update_any = True

            # And likewise for any existing roles that are left over.
            # NOTE: the raise below is a deliberate guard; the delete after
            # it is intentionally unreachable.
            for pr in existing_roles:
                print(pr.person.id, pr)
                raise ValueError("Deleted role??")
                log.warn("Deleted %s" % pr)
                pr.delete()

            if did_update_any and not options.disable_events:
                # Create the events for the roles after all have been loaded
                # because we don't create events for ends of terms and
                # starts of terms that are adjacent. Refresh the list to get
                # the roles in order.
                role_list = list(PersonRole.objects.filter(person=person).order_by('startdate'))
                for i in range(len(role_list)):
                    role_list[i].create_events(
                        role_list[i - 1] if i > 0 else None,
                        role_list[i + 1] if i < len(role_list) - 1 else None)

            # The name can't be determined until all of the roles are set. If
            # it changes, re-save. Unfortunately roles are cached so this actually
            # doesn't work yet. Re-run the parser to fix names.
            nn = (person.name, person.sortname)
            if hasattr(person, "role"):
                delattr(person, "role") # clear the cached info
            person._most_recent_role = None # clear cache here too
            person.set_names()
            if nn != (person.name, person.sortname):
                log.warn("%s is now %s." % (nn[0], person.name))
                person.save()

        except Exception as ex:
            # Catch unexpected exceptions and log them
            pprint.pprint(node)
            log.error('', exc_info=ex)
            had_error = True

        progress.tick()

    log.info('Processed persons: %d' % len(processed_persons))
    log.info('Created persons: %d' % len(created_persons))

    if not had_error:
        # Remove person which were not found in XML file
        removed_persons = existing_persons - processed_persons
        for pk in removed_persons:
            p = Person.objects.get(pk=pk)
            if p.roles.all().count() > 0:
                log.warn("Missing? Deleted? %d: %s" % (p.id, p))
            else:
                log.warn("Deleting... %d: %s (remember to prune_index!)" % (p.id, p))
                # deliberate guard: never actually delete (p.delete() is unreachable)
                raise Exception("Won't delete!")
                p.delete()
        log.info('Missing/deleted persons: %d' % len(removed_persons))

    # Mark the files as processed.
    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        File.objects.save_file(f)

    update_twitter_list()
def main(options):
    """
    Process committees, subcommittees and members of current congress committees.

    Runs three phases, each guarded by a file-change check that is skipped
    when options.force is set:
      1) committees-current.yaml -> Committee rows (and subcommittees);
      2) committee-membership-current.yaml -> CommitteeMember rows;
      3) per-chamber committee_meetings_*.json -> CommitteeMeeting rows.
    """

    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH
    meeting_processor = CommitteeMeetingProcessor()

    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'

    # Phase 1: committees. Skip if the source file is unchanged and not forced.
    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()  # Committee PKs present in the YAML, for obsolete-marking below.
        for committee in tree:
            # Update an existing record keyed by THOMAS id, or create a new one.
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print "New committee:", committee["thomas_id"]
                cobj = Committee(code=committee["thomas_id"])

            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None  # top-level committee: no parent
            cobj.jurisdiction = committee.get("jurisdiction")
            cobj.jurisdiction_link = committee.get("jurisdiction_source")
            cobj.save()
            seen_committees.add(cobj.id)

            for subcom in committee.get('subcommittees', []):
                # Subcommittee codes are the parent's THOMAS id plus the
                # subcommittee's own THOMAS id.
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print "New subcommittee:", code
                    sobj = Committee(code=code)

                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                sobj.type = None
                sobj.committee = cobj  # link to parent committee
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)

            progress.tick()

        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(
            id__in=seen_committees)
        if len(other_committees) > 0:
            print "Marking obsolete:", ", ".join(c.code for c in other_committees)
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)

    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)

    # Phase 2: committee membership.
    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = {}
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]

        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')

        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print "Committee not found:", committee
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(
                    id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()

            progress.tick()

        File.objects.save_file(MEMBERS_FILE)

    log.info('Processing committee schedule')
    # Phase 3: committee meetings, one JSON file per chamber.
    for chamber in ("house", "senate"):
        meetings_file = 'data/congress/committee_meetings_%s.json' % chamber
        file_changed = File.objects.is_changed(meetings_file)
        if not file_changed and not options.force:
            log.info('File %s was not changed' % meetings_file)
        else:
            meetings = json.load(open(meetings_file))

            # Process committee event nodes
            for meeting in meetings:
                try:
                    # Associate it with an existing meeting object if GUID is already known.
                    # Must get it like this, vs just assigning the ID as we do in other parsers,
                    # because of the auto_now_add created field, which otherwise misbehaves.
                    try:
                        mobj = CommitteeMeeting.objects.get(
                            guid=meeting['guid'])
                    except CommitteeMeeting.DoesNotExist:
                        mobj = CommitteeMeeting()

                    # Parse.
                    mobj = meeting_processor.process(mobj, meeting)

                    # Attach the meeting to the subcommittee if set.
                    if mobj.subcommittee:
                        mobj.committee = Committee.objects.get(
                            code=mobj.committee.code + mobj.subcommittee)

                    mobj.save()

                    # Rebuild the bill associations from scratch for this meeting.
                    mobj.bills.clear()
                    for bill in meeting["bill_ids"]:
                        try:
                            # bill ids look like "hr1234-113": type, number, congress.
                            bill_type, bill_num, bill_cong = re.match(
                                r"([a-z]+)(\d+)-(\d+)$", bill).groups()
                            bill = Bill.objects.get(
                                congress=bill_cong,
                                bill_type=BillType.by_slug(bill_type),
                                number=int(bill_num))
                            mobj.bills.add(bill)
                        except AttributeError:
                            pass  # regex failed
                        except common.enum.NotFound:
                            pass  # invalid bill type code in source data
                        except Bill.DoesNotExist:
                            pass  # we don't know about bill yet
                except Committee.DoesNotExist:
                    log.error(
                        'Could not load Committee object for meeting %s' %
                        meeting_processor.display_node(meeting))

            for committee in Committee.objects.all():
                if not options.disable_events:
                    committee.create_events()

            File.objects.save_file(meetings_file)
def main(options):
    """
    Update Person and PersonRole models.

    Do safe update: touch only those records which have been changed.

    Merges the legislators-current, legislators-historical,
    legislators-social-media, and executive YAML files into one dict keyed by
    GovTrack id, then syncs Person records and fuzzy-matches term records to
    existing PersonRole rows so unchanged rows aren't rewritten.
    """

    BASE_PATH = CONGRESS_LEGISLATORS_PATH
    SRC_FILES = ['legislators-current', 'legislators-historical', 'legislators-social-media', 'executive'] # order matters

    # If none of the source files changed (and we're not forced), bail early.
    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        if not File.objects.is_changed(f) and not options.force:
            log.info('File %s was not changed' % f)
        else:
            # file modified...
            break
    else:
        # no 'break' ==> no files modified
        return

    # Start parsing.
    had_error = False

    # Get combined data.
    legislator_data = { }
    leg_id_map = { }  # (id-type, id-value) -> govtrack id, for files lacking a govtrack id
    for p in SRC_FILES:
        log.info('Opening %s...' % p)
        f = BASE_PATH + p + ".yaml"
        y = yaml_load(f)
        for m in y:
            if p == "legislators-current":
                # We know all terms but the last are non-current and the last is.
                for r in m["terms"]: r["current"] = False
                m["terms"][-1]["current"] = True
            elif p == "legislators-historical":
                # We know all terms are non-current.
                for r in m["terms"]: r["current"] = False

            if p != 'legislators-social-media':
                govtrack_id = m["id"].get("govtrack")

                # For the benefit of the social media file, make a mapping of IDs.
                for k, v in m["id"].items():
                    if type(v) != list:
                        leg_id_map[(k,v)] = govtrack_id
            else:
                # GovTrack IDs are not always listed in this file.
                govtrack_id = None
                for k, v in m["id"].items():
                    if type(v) != list and (k, v) in leg_id_map:
                        govtrack_id = leg_id_map[(k,v)]
                        break

            if not govtrack_id:
                print "No GovTrack ID:"
                pprint.pprint(m)
                had_error = True
                continue

            if govtrack_id not in legislator_data:
                legislator_data[govtrack_id] = m
            elif p == "legislators-social-media":
                legislator_data[govtrack_id]["social"] = m["social"]
            elif p == "executive":
                # A legislator who also served in the executive gets the
                # executive terms appended to the legislative terms.
                legislator_data[govtrack_id]["terms"].extend( m["terms"] )
            else:
                raise ValueError("Duplication in an unexpected way (%d, %s)."
                                 % (govtrack_id, p))

    person_processor = PersonProcessor()
    role_processor = PersonRoleProcessor()

    existing_persons = set(Person.objects.values_list('pk', flat=True))
    processed_persons = set()
    created_persons = set()

    progress = Progress(total=len(legislator_data))
    log.info('Processing persons')

    for node in legislator_data.values():
        # Wrap each iteration in try/except
        # so that if some node breaks the parsing process
        # then other nodes could be parsed
        try:
            person = person_processor.process(Person(), node)

            # Create cached name strings. This is done again later
            # after the roles are updated.
            person.set_names()

            # Now try to load the person with such ID from
            # database. If found it then just update it
            # else create new Person object
            try:
                ex_person = Person.objects.get(pk=person.pk)
                if person_processor.changed(ex_person, person) or options.force:
                    # If the person has PK of existing record,
                    # coming in via the YAML-specified GovTrack ID,
                    # then Django ORM will update existing record
                    if not options.force:
                        log.warn("Updated %s" % person)
                    person.save()
            except Person.DoesNotExist:
                created_persons.add(person.pk)
                person.save()
                log.warn("Created %s" % person)

            processed_persons.add(person.pk)

            # Parse all of the roles.
            new_roles = []
            for termnode in node['terms']:
                role = role_processor.process(PersonRole(), termnode)
                role.person = person
                role.extra = filter_yaml_term_structure(termnode) # copy in the whole YAML structure

                # Is this role current? For legislators, same as whether it came from legislators-current, which eases Jan 3 transitions when we can't distinguish by date.
                if "current" in termnode:
                    role.current = termnode["current"]
                # But executives...
                else:
                    now = datetime.now().date()
                    role.current = role.startdate <= now and role.enddate >= now

                    # Because of date overlaps at noon transition dates, ensure that only the last term that covers
                    # today is current --- reset past roles to not current. Doesn't handle turning off retirning people tho.
                    for r in new_roles: r.current = False

                # Scan for most recent leadership role within the time period of this term,
                # which isn't great for Senators because it's likely it changed a few times
                # within a term, especially if there was a party switch.
                role.leadership_title = None
                for leadership_node in node.get("leadership_roles", []):
                    # must match on date and chamber
                    if leadership_node["start"] >= role.enddate.isoformat(): continue # might start on the same day but is for the next Congress
                    if "end" in leadership_node and leadership_node["end"] <= role.startdate.isoformat(): continue # might start on the same day but is for the previous Congress
                    if leadership_node["chamber"] != RoleType.by_value(role.role_type).congress_chamber.lower(): continue
                    role.leadership_title = leadership_node["title"]

                new_roles.append(role)

            # Try matching the new roles to existing db records. Since we don't have a primry key
            # in the source data, we have to match on the record values. But because of errors in data,
            # term start/end dates can change, so matching has to be a little fuzzy.
            existing_roles = list(PersonRole.objects.filter(person=person))
            matches = []
            def run_match_rule(rule):
                # Pair up new and existing roles that satisfy `rule`, removing
                # matched items from both lists so later (looser) rules only
                # see the leftovers.
                import itertools
                for new_role, existing_role in itertools.product(new_roles, existing_roles):
                    if new_role not in new_roles or existing_role not in existing_roles: continue # already matched on a previous iteration
                    if new_role.role_type != existing_role.role_type: continue
                    if new_role.state != existing_role.state: continue
                    if rule(new_role, existing_role):
                        matches.append((new_role, existing_role))
                        new_roles.remove(new_role)
                        existing_roles.remove(existing_role)

            # First match exactly, then exact on just one date, then on contractions and expansions.
            run_match_rule(lambda new_role, existing_role : new_role.startdate == existing_role.startdate and new_role.enddate == existing_role.enddate)
            run_match_rule(lambda new_role, existing_role : new_role.startdate == existing_role.startdate or new_role.enddate == existing_role.enddate)
            run_match_rule(lambda new_role, existing_role : new_role.startdate >= existing_role.startdate and new_role.enddate <= existing_role.enddate)
            run_match_rule(lambda new_role, existing_role : new_role.startdate <= existing_role.startdate and new_role.enddate >= existing_role.enddate)

            # Update the database entries that correspond with records in the data file.
            did_update_any = False
            for new_role, existing_role in matches:
                # Reuse the matched row's PK so save() updates in place.
                new_role.id = existing_role.id
                if role_processor.changed(existing_role, new_role) or options.force:
                    new_role.save()
                    did_update_any = True
                    if not options.force:
                        log.warn("Updated %s" % new_role)

            # If we have mutliple records on disk that didn't match and multiple records in the database
            # that didn't match, then we don't know how to align them.
            if len(new_roles) > 0 and len(existing_roles) > 0:
                print(new_roles)
                print(existing_roles)
                raise Exception("There is an unmatched role.")

            # Otherwise if there are any unmatched new roles, we can just add them.
            for role in new_roles:
                log.warn("Created %s" % role)
                role.save()
                did_update_any = True

            # And likewise for any existing roles that are left over.
            # NOTE: the raise makes the log/delete below unreachable --- deletion
            # is deliberately disabled; any leftover role aborts this person.
            for pr in existing_roles:
                print pr.person.id, pr
                raise ValueError("Deleted role??")
                log.warn("Deleted %s" % pr)
                pr.delete()

            if did_update_any and not options.disable_events:
                # Create the events for the roles after all have been loaded
                # because we don't create events for ends of terms and
                # starts of terms that are adjacent. Refresh the list to get
                # the roles in order.
                role_list = list(PersonRole.objects.filter(person=person).order_by('startdate'))
                for i in xrange(len(role_list)):
                    role_list[i].create_events(
                        role_list[i-1] if i > 0 else None,
                        role_list[i+1] if i < len(role_list)-1 else None
                        )

            # The name can't be determined until all of the roles are set. If
            # it changes, re-save. Unfortunately roles are cached so this actually
            # doesn't work yet. Re-run the parser to fix names.
            nn = (person.name, person.sortname)
            if hasattr(person, "role"): delattr(person, "role") # clear the cached info
            person.set_names()
            if nn != (person.name, person.sortname):
                log.warn("%s is now %s." % (nn[0], person.name))
                person.save()

        except Exception, ex:
            # Catch unexpected exceptions and log them
            pprint.pprint(node)
            log.error('', exc_info=ex)
            had_error = True

        progress.tick()
def main(options):
    """
    Parse rolls.

    Reads vote XML files (selected by options.filter / options.congress, or
    all congresses), creating or updating Vote, VoteOption, and Voter rows,
    then deletes vote records no longer present on disk when a whole
    congress was parsed without errors.
    """

    # Setup XML processors
    vote_processor = VoteProcessor()
    option_processor = VoteOptionProcessor()
    voter_processor = VoterProcessor()
    # Pre-load all persons once so voter processing doesn't hit the DB per voter.
    voter_processor.PERSON_CACHE = dict((x.pk, x) for x in Person.objects.all())

    chamber_mapping = {'s': CongressChamber.senate, 'h': CongressChamber.house}

    if options.filter:
        files = glob.glob(options.filter)
        log.info('Parsing rolls matching %s' % options.filter)
    elif options.congress:
        files = glob.glob(settings.CONGRESS_DATA_PATH + '/%s/votes/*/*/data.xml' % options.congress)
        log.info('Parsing rolls of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/congress/*/votes/*/*/data.xml')
    log.info('Processing votes: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=10)

    def log_delete_qs(qs):
        # Log and delete a queryset of records that no longer appear in the
        # source data; no-op if it is empty.
        if qs.count() == 0: return
        print("Deleting obsoleted records: ", qs)
        #if qs.count() > 3:
        #    print "Delete skipped..."
        #    return
        qs.delete()

    seen_obj_ids = set()  # Vote PKs confirmed present on disk; others may be purged at the end.
    had_error = False

    for fname in files:
        progress.tick()

        # congress/session/chamber/number are encoded in the file path.
        match = re.match(r"data/congress/(?P<congress>\d+)/votes/(?P<session>[ABC0-9]+)/(?P<chamber>[hs])(?P<number>\d+)/data.xml$", fname)

        try:
            existing_vote = Vote.objects.get(congress=int(match.group("congress")), chamber=chamber_mapping[match.group("chamber")], session=match.group("session"), number=int(match.group("number")))
        except Vote.DoesNotExist:
            existing_vote = None

        # Skip unchanged files, but re-process votes previously flagged as
        # having missing data in case the gaps can now be filled.
        if not File.objects.is_changed(fname) and not options.force and existing_vote != None and not existing_vote.missing_data:
            seen_obj_ids.add(existing_vote.id)
            continue

        try:
            tree = etree.parse(fname)

            ## Look for votes with VP tie breakers.
            #if len(tree.xpath("/roll/voter[@VP='1']")) == 0:
            #    had_error = True # prevent delete at the end
            #    continue

            # Process role object
            roll_node = tree.xpath('/roll')[0]

            # Sqlite is much faster when lots of saves are wrapped in a transaction,
            # and we do a lot of saves because it's a lot of voters.
            from django.db import transaction
            with transaction.atomic():
                vote = vote_processor.process(Vote(), roll_node)
                if existing_vote: vote.id = existing_vote.id
                vote.congress = int(match.group("congress"))
                vote.chamber = chamber_mapping[match.group("chamber")]
                vote.session = match.group("session")
                vote.number = int(match.group("number"))

                # Get related bill & amendment.
                for bill_node in roll_node.xpath("bill"):
                    related_bill_num = bill_node.get("number")
                    if 9 <= vote.congress <= 42 and vote.session in ('1', '2'):
                        # Bill numbering from the American Memory colletion is different. The number combines
                        # the session, bill number, and a 0 or 5 for regular or 'half' numbering. Prior to
                        # the 9th congress numbering seems to be wholly assigned by us and not related to
                        # actual numbering, so we skip matching those bills.
                        related_bill_num = "%d%04d%d" % (int(vote.session), int(bill_node.get("number")), 0)
                    try:
                        vote.related_bill = Bill.objects.get(congress=bill_node.get("session"), bill_type=BillType.by_xml_code(bill_node.get("type")), number=related_bill_num)
                    except Bill.DoesNotExist:
                        if vote.congress >= 93:
                            # Bills should be known from the 93rd Congress onward.
                            vote.missing_data = True

                for amdt_node in roll_node.xpath("amendment"):
                    if amdt_node.get("ref") == "regular" and vote.related_bill is not None:
                        try:
                            # Amendment number like "s123"/"h123": first char is the type slug prefix.
                            vote.related_amendment = Amendment.objects.get(congress=vote.related_bill.congress, amendment_type=AmendmentType.by_slug(amdt_node.get("number")[0]+"amdt"), number=amdt_node.get("number")[1:])
                        except Amendment.DoesNotExist:
                            if vote.congress >= 93:
                                print("Missing amendment", fname)
                                vote.missing_data = True
                    elif amdt_node.get("ref") == "bill-serial":
                        # It is impossible to associate House votes with amendments just from the House
                        # vote XML because the amendment-num might correspond either with the A___ number
                        # or with the "An amendment, numbered ____" number from the amendment purpose,
                        # and there's really no way to figure out which. Maybe we can use the amendment
                        # sponsor instead?
                        #vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number"))
                        # Instead, we set related_amendment from the amendment parser. Here, we have to
                        # preserve the related_amendment if it is set.
                        if existing_vote: vote.related_amendment = existing_vote.related_amendment

                # clean up some question text and use the question_details field
                if vote.category in (VoteCategory.passage, VoteCategory.passage_suspension, VoteCategory.veto_override) and vote.related_bill:
                    # For passage votes, set the question to the bill title and put the question
                    # details in the details field.
                    vote.question = vote.related_bill.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()
                elif vote.category == VoteCategory.amendment and vote.related_amendment:
                    # For votes on amendments, make a better title/explanation.
                    vote.question = vote.related_amendment.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()
                elif vote.related_bill and vote.question.startswith("On the Cloture Motion " + vote.related_bill.display_number):
                    vote.question = "Cloture on " + vote.related_bill.title
                elif vote.related_bill and vote.question.startswith("On Cloture on the Motion to Proceed " + vote.related_bill.display_number):
                    vote.question = "Cloture on " + vote.related_bill.title
                    vote.question_details = "On Cloture on the Motion to Proceed in the " + vote.get_chamber_display()
                elif vote.related_bill and vote.question.startswith("On the Motion to Proceed " + vote.related_bill.display_number):
                    vote.question = "Motion to Proceed on " + vote.related_bill.title
                elif vote.related_amendment and vote.question.startswith("On the Cloture Motion " + vote.related_amendment.get_amendment_type_display() + " " + str(vote.related_amendment.number)):
                    vote.question = "Cloture on " + vote.related_amendment.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                # weird House foratting of bill numbers ("H RES 123 Blah blah")
                if vote.related_bill:
                    vote.question = re.sub(
                        "(On [^:]+): " + vote.related_bill.display_number.replace(". ", " ").replace(".", " ").upper() + " .*",
                        r"\1: " + truncatewords(vote.related_bill.title, 15),
                        vote.question)

                vote.save()
                seen_obj_ids.add(vote.id) # don't delete me later

                # Process roll options, overwrite existing options where possible.
                seen_option_ids = set()
                roll_options = {}
                for option_node in roll_node.xpath('./option'):
                    option = option_processor.process(VoteOption(), option_node)
                    option.vote = vote
                    if existing_vote:
                        try:
                            option.id = VoteOption.objects.filter(vote=vote, key=option.key)[0].id # get is better, but I had the database corruption problem
                        except IndexError:
                            pass
                    option.save()
                    roll_options[option.key] = option
                    seen_option_ids.add(option.id)
                log_delete_qs(VoteOption.objects.filter(vote=vote).exclude(id__in=seen_option_ids)) # may cascade and delete the Voters too?

                # Process roll voters, overwriting existing voters where possible.
                if existing_vote:
                    existing_voters = dict(Voter.objects.filter(vote=vote).values_list("person", "id"))
                seen_voter_ids = set()
                voters = list()
                for voter_node in roll_node.xpath('./voter'):
                    voter = voter_processor.process(roll_options, Voter(), voter_node)
                    voter.vote = vote
                    voter.created = vote.created

                    # for VP votes, load the actual person & role...
                    if voter.voter_type == VoterType.vice_president:
                        try:
                            r = PersonRole.objects.get(role_type=RoleType.vicepresident, startdate__lte=vote.created, enddate__gte=vote.created)
                            voter.person_role = r
                            voter.person = r.person
                        except PersonRole.DoesNotExist:
                            # overlapping roles? missing data?
                            log.error('Could not resolve vice president in %s' % fname)

                    if existing_vote and voter.person:
                        try:
                            voter.id = existing_voters[voter.person.id]
                        except KeyError:
                            pass

                    voters.append(voter)

                    if voter.voter_type == VoterType.unknown and not vote.missing_data:
                        vote.missing_data = True
                        vote.save()

                # pre-fetch the role of each voter
                load_roles_at_date([x.person for x in voters if x.person != None], vote.created, vote.congress)
                for voter in list(voters):
                    if voter.voter_type != VoterType.vice_president:
                        voter.person_role = voter.person.role
                    # If we couldn't match a role for this person on the date of the vote, and if the voter was Not Voting,
                    # and we're looking at historical data, then this is probably a data error --- the voter wasn't even in office.
                    # At the start of each Congress, the House does a Call by States and Election of the Speaker, before swearing
                    # in. In the 116th Congress, these votes had a Not Voting for Walter Jones who had not yet made it to DC, and
                    # then omitted Jones in the votes after swearing in. In those cases, look for a role coming up.
                    if voter.person_role is None and voter.option.key == "0" and vote.question in ("Call by States", "Election of the Speaker"):
                        voter.person_role = voter.person.roles.filter(startdate__gt=vote.created, startdate__lt=vote.created+timedelta(days=30)).first()
                    if voter.person_role is None:
                        if vote.source == VoteSource.keithpoole and voter.option.key == "0":
                            # Drop this record.
                            voters.remove(voter)
                        else:
                            log.error("%s: Could not find role for %s on %s."
                                      % (fname, voter.person, vote.created))
                            vote.missing_data = True
                            vote.save()

                # save all of the records (inserting/updating)
                for voter in voters:
                    voter.save()
                    seen_voter_ids.add(voter.id)

                # remove obsolete voter records
                log_delete_qs(Voter.objects.filter(vote=vote).exclude(id__in=seen_voter_ids)) # possibly already deleted by cascade above

                # pre-calculate totals
                vote.calculate_totals()

                if not options.disable_events:
                    vote.create_event()

            File.objects.save_file(fname)

        except Exception as ex:
            log.error('Error in processing %s' % fname, exc_info=ex)
            had_error = True

    # delete vote objects that are no longer represented on disk
    if options.congress and not options.filter and not had_error:
        log_delete_qs(Vote.objects.filter(congress=options.congress).exclude(id__in = seen_obj_ids))
def main(options):
    """
    Update Person and PersonRole models.

    Do safe update: touch only those records which have been changed.

    Older variant of the person/role sync: merges the legislator YAML files
    into one dict keyed by GovTrack id, then matches each term to an existing
    PersonRole by exact type/start/end, falling back to type/start only.
    """

    BASE_PATH = CONGRESS_LEGISLATORS_PATH
    SRC_FILES = ['legislators-current', 'legislators-historical', 'legislators-social-media', 'executive'] # order matters

    # If none of the source files changed (and we're not forced), bail early.
    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        if not File.objects.is_changed(f) and not options.force:
            log.info('File %s was not changed' % f)
        else:
            # file modified...
            break
    else:
        # no 'break' ==> no files modified
        return

    # Start parsing.
    had_error = False

    # Get combined data.
    legislator_data = { }
    leg_id_map = { }  # (id-type, id-value) -> govtrack id, for files lacking a govtrack id
    for p in SRC_FILES:
        log.info('Opening %s...' % p)
        f = BASE_PATH + p + ".yaml"
        y = yaml_load(f)
        for m in y:
            if p != 'legislators-social-media':
                govtrack_id = m["id"].get("govtrack")

                # For the benefit of the social media file, make a mapping of IDs.
                for k, v in m["id"].items():
                    if type(v) != list:
                        leg_id_map[(k,v)] = govtrack_id
            else:
                # GovTrack IDs are not always listed in this file.
                govtrack_id = None
                for k, v in m["id"].items():
                    if type(v) != list and (k, v) in leg_id_map:
                        govtrack_id = leg_id_map[(k,v)]
                        break

            if not govtrack_id:
                print "No GovTrack ID:"
                pprint.pprint(m)
                had_error = True
                continue

            if govtrack_id not in legislator_data:
                legislator_data[govtrack_id] = m
            elif p == "legislators-social-media":
                legislator_data[govtrack_id]["social"] = m["social"]
            elif p == "executive":
                # A legislator who also served in the executive gets the
                # executive terms appended to the legislative terms.
                legislator_data[govtrack_id]["terms"].extend( m["terms"] )
            else:
                raise ValueError("Duplication in an unexpected way (%d, %s)."
                                 % (govtrack_id, p))

    person_processor = PersonProcessor()
    role_processor = PersonRoleProcessor()

    existing_persons = set(Person.objects.values_list('pk', flat=True))
    processed_persons = set()
    created_persons = set()

    progress = Progress(total=len(legislator_data))
    log.info('Processing persons')

    for node in legislator_data.values():
        # Wrap each iteration in try/except
        # so that if some node breaks the parsing process
        # then other nodes could be parsed
        try:
            person = person_processor.process(Person(), node)

            # Create cached name strings. This is done again later
            # after the roles are updated.
            person.set_names()

            # Now try to load the person with such ID from
            # database. If found it then just update it
            # else create new Person object
            try:
                ex_person = Person.objects.get(pk=person.pk)
                if person_processor.changed(ex_person, person) or options.force:
                    # If the person has PK of existing record,
                    # coming in via the YAML-specified GovTrack ID,
                    # then Django ORM will update existing record
                    if not options.force:
                        log.warn("Updated %s" % person)
                    person.save()
            except Person.DoesNotExist:
                created_persons.add(person.pk)
                person.save()
                log.warn("Created %s" % person)

            processed_persons.add(person.pk)

            # Process roles of the person
            roles = list(PersonRole.objects.filter(person=person))
            existing_roles = set(PersonRole.objects.filter(person=person).values_list('pk', flat=True))
            processed_roles = set()
            role_list = []
            for role in node['terms']:
                role = role_processor.process(PersonRole(), role)
                role.person = person
                role.current = role.startdate <= datetime.now().date() and role.enddate >= datetime.now().date() # \
                    #and CURRENT_CONGRESS in role.congress_numbers()

                # Scan for most recent leadership role within the time period of this term,
                # which isn't great for Senators because it's likely it changed a few times
                # within a term, especially if there was a party switch.
                role.leadership_title = None
                for leadership_node in node.get("leadership_roles", []):
                    # must match on date and chamber
                    if leadership_node["start"] >= role.enddate.isoformat(): continue # might start on the same day but is for the next Congress
                    if "end" in leadership_node and leadership_node["end"] <= role.startdate.isoformat(): continue # might start on the same day but is for the previous Congress
                    if leadership_node["chamber"] != RoleType.by_value(role.role_type).congress_chamber.lower(): continue
                    role.leadership_title = leadership_node["title"]

                # Try to match this role with one already in the database.
                # First search for an exact match on type/start/end.
                ex_role = None
                for r in roles:
                    if role.role_type == r.role_type and r.startdate == role.startdate and r.enddate == role.enddate:
                        ex_role = r
                        break

                # Otherwise match on type/start only.
                if not ex_role:
                    for r in roles:
                        if role.role_type == r.role_type and r.startdate == role.startdate:
                            ex_role = r
                            break

                if ex_role:
                    # These roles correspond.
                    processed_roles.add(ex_role.id)
                    # Reuse the matched row's PK so save() updates in place.
                    role.id = ex_role.id
                    if role_processor.changed(ex_role, role) or options.force:
                        role.save()
                        role_list.append(role)
                        if not options.force:
                            log.warn("Updated %s" % role)
                    roles.remove(ex_role) # don't need to try matching this to any other node
                else:
                    # Didn't find a matching role.
                    if len([r for r in roles if r.role_type == role.role_type]) > 0:
                        print role, "is one of these?"
                        for ex_role in roles:
                            print "\t", ex_role
                        raise Exception("There is an unmatched role.")
                    log.warn("Created %s" % role)
                    role.save()
                    role_list.append(role)

            # create the events for the roles after all have been loaded
            # because we don't create events for ends of terms and
            # starts of terms that are adjacent.
            if not options.disable_events:
                for i in xrange(len(role_list)):
                    role_list[i].create_events(
                        role_list[i-1] if i > 0 else None,
                        role_list[i+1] if i < len(role_list)-1 else None
                        )

            # NOTE: the raise makes the log/delete below unreachable --- deletion
            # is deliberately disabled; any orphaned role aborts this person.
            removed_roles = existing_roles - processed_roles
            for pk in removed_roles:
                pr = PersonRole.objects.get(pk=pk)
                print pr.person.id, pr
                raise ValueError("Deleted role??")
                log.warn("Deleted %s" % pr)
                pr.delete()

            # The name can't be determined until all of the roles are set. If
            # it changes, re-save. Unfortunately roles are cached so this actually
            # doesn't work yet. Re-run the parser to fix names.
            nn = (person.name, person.sortname)
            if hasattr(person, "role"): delattr(person, "role") # clear the cached info
            person.set_names()
            if nn != (person.name, person.sortname):
                log.warn("%s is now %s." % (nn[0], person.name))
                person.save()

        except Exception, ex:
            # Catch unexpected exceptions and log them
            pprint.pprint(node)
            log.error('', exc_info=ex)
            had_error = True

        progress.tick()
def main(options): """ Process bill terms and bills """ # Terms term_processor = TermProcessor() terms_parsed = set() # Cache existing terms. There aren't so many. existing_terms = {} for term in BillTerm.objects.all(): existing_terms[(int(term.term_type), term.name)] = term log.info("Processing old bill terms") TERMS_FILE = "data/us/liv.xml" tree = etree.parse(TERMS_FILE) for node in tree.xpath("/liv/top-term"): term = term_processor.process(BillTerm(), node) term.term_type = TermType.old try: # No need to update an existing term because there are no other attributes. term = existing_terms[(int(term.term_type), term.name)] terms_parsed.add(term.id) except: log.debug("Created %s" % term) term.save() term.subterms.clear() for subnode in node.xpath("./term"): subterm = term_processor.process(BillTerm(), subnode) subterm.term_type = TermType.old try: # No need to update an existing term because there are no other attributes. subterm = existing_terms[(int(subterm.term_type), subterm.name)] term.subterms.add(subterm) terms_parsed.add(subterm.id) except: try: log.debug("Created %s" % subterm) subterm.save() term.subterms.add(subterm) existing_terms[(int(subterm.term_type), subterm.name)] = subterm terms_parsed.add(subterm.id) except IntegrityError: log.error("Duplicated term %s" % term_processor.display_node(subnode)) log.info("Processing new bill terms") for FILE in ("data/us/liv111.xml", "data/us/crsnet.xml"): tree = etree.parse(FILE) for node in tree.xpath("/liv/top-term"): term = term_processor.process(BillTerm(), node) term.term_type = TermType.new try: # No need to update an existing term because there are no other attributes. 
term = existing_terms[(int(term.term_type), term.name)] terms_parsed.add(term.id) except: log.debug("Created %s" % term) term.save() term.subterms.clear() for subnode in node.xpath("./term"): subterm = term_processor.process(BillTerm(), subnode) subterm.term_type = TermType.new try: # No need to update an existing term because there are no other attributes. subterm = existing_terms[(int(subterm.term_type), subterm.name)] terms_parsed.add(subterm.id) term.subterms.add(subterm) except: try: log.debug("Created %s" % term) subterm.save() term.subterms.add(subterm) existing_terms[(int(subterm.term_type), subterm.name)] = subterm terms_parsed.add(subterm.id) except IntegrityError: log.error("Duplicated term %s" % term_processor.display_node(subnode)) for term in existing_terms.values(): if not term.id in terms_parsed: log.debug("Deleted %s" % term) term.delete() # Bills bill_index = None if not options.disable_indexing: from bill.search_indexes import BillIndex bill_index = BillIndex() if options.congress and int(options.congress) <= 42: files = glob.glob("data/congress/%s/bills/*/*/*.xml" % options.congress) log.info("Parsing unitedstates/congress bills of only congress#%s" % options.congress) elif options.congress: files = glob.glob("data/us/%s/bills/*.xml" % options.congress) log.info("Parsing bills of only congress#%s" % options.congress) else: files = glob.glob("data/us/*/bills/*.xml") if options.filter: files = [f for f in files if re.match(options.filter, f)] log.info("Processing bills: %d files" % len(files)) total = len(files) progress = Progress(total=total, name="files", step=100) bill_processor = BillProcessor() seen_bill_ids = [] for fname in files: progress.tick() # With indexing or events enabled, if the bill metadata file hasn't changed check # the bill's latest text file for changes so we can create a text-is-available # event and so we can index the bill's text. 
if ( (not options.congress or options.congress > 42) and (bill_index and not options.disable_events) and not File.objects.is_changed(fname) and not options.force ): m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname) try: b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3)) seen_bill_ids.append(b.id) # Update the index/events for any bill with recently changed text textfile = get_bill_text_metadata(b, None) if not textfile: if b.congress >= 103 and b.introduced_date < (datetime.now() - timedelta(days=14)).date(): print "No bill text?", fname, b.introduced_date continue textfile = textfile["text_file"] if os.path.exists(textfile) and File.objects.is_changed(textfile): bill_index.update_object(b, using="bill") # index the full text b.create_events() # events for new bill text documents File.objects.save_file(textfile) continue except Bill.DoesNotExist: print "Unchanged metadata file but bill doesn't exist:", fname pass # just parse as normal if options.slow: time.sleep(1) tree = etree.parse(fname) for node in tree.xpath("/bill"): try: bill = bill_processor.process(Bill(), node) except: print fname raise seen_bill_ids.append(bill.id) # don't delete me later if bill.congress >= 93: bill.source = "thomas-congproj" elif bill.congress >= 82: bill.source = "statutesatlarge" if bill.current_status == BillStatus.enacted_signed: bill.current_status = BillStatus.enacted_unknown elif bill.congress <= 42: bill.source = "americanmemory" else: raise ValueError() # So far this is just for American Memory bills. 
if node.xpath("string(source/@url)"): bill.source_link = unicode(node.xpath("string(source/@url)")) else: bill.source_link = None actions = [] for axn in tree.xpath("actions/*[@state]"): actions.append( ( repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))), BillStatus.by_xml_code(axn.xpath("string(@state)")), axn.xpath("string(text)"), etree.tostring(axn), ) ) bill.sliplawpubpriv = None bill.sliplawnum = None for axn in tree.xpath("actions/enacted"): bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI" bill.sliplawnum = int(axn.get("number").split("-")[1]) bill.major_actions = actions try: bill.save() except: print bill raise if bill_index: bill_index.update_object(bill, using="bill") if not options.disable_events: bill.create_events() File.objects.save_file(fname) # delete bill objects that are no longer represented on disk.... this is too dangerous. if options.congress and not options.filter: # this doesn't work because seen_bill_ids is too big for sqlite! for b in Bill.objects.filter(congress=options.congress).exclude(id__in=seen_bill_ids): print "Bill is no longer on disk: ", b.id, b # The rest is for current only... if options.congress and int(options.congress) != CURRENT_CONGRESS: return # Parse docs.house.gov for what might be coming up this week. 
import iso8601 dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read() m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"", dhg_html) if not m: log.error("No docs.house.gov download link found at http://docs.house.gov.") else: def bt_re(bt): return re.escape(bt[1]).replace(r"\.", r"\.?\s*") try: dhg = etree.parse(urllib.urlopen("http://docs.house.gov/floor/" + m.group(1))).getroot() except: print "http://docs.house.gov/floor/" + m.group(1) raise # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date() for item in dhg.xpath("category/floor-items/floor-item"): billname = item.xpath("legis-num")[0].text if billname is None: continue # weird but OK m = re.match( r"\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?(" + "|".join(bt_re(bt) for bt in BillType) + r")(\d+)\s*(\[Conference Report\]\s*)?$", billname, re.I, ) if not m: if not billname.strip().endswith(" __"): log.error('Could not parse legis-num "%s" in docs.house.gov.' % billname) else: for bt in BillType: if re.match(bt_re(bt) + "$", m.group(1), re.I): try: bill = Bill.objects.get(congress=CURRENT_CONGRESS, bill_type=bt[0], number=m.group(2)) bill.docs_house_gov_postdate = iso8601.parse_date(item.get("add-date")).replace(tzinfo=None) bill.save() if bill_index: bill_index.update_object(bill, using="bill") if not options.disable_events: bill.create_events() except Bill.DoesNotExist: log.error('Could not find bill "%s" in docs.house.gov.' % billname) break else: log.error('Could not parse legis-num bill type "%s" in docs.house.gov.' % m.group(1)) # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow. 
now = datetime.now() sfs = urllib.urlopen( "http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm" ).read() try: sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting", sfs).group(1) for congress, bill_type, number in re.findall( r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs ): bill_type = BillType.by_slug(bill_type) bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number) if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta( days=7 ): bill.senate_floor_schedule_postdate = now bill.save() if bill_index: bill_index.update_object(bill, using="bill") if not options.disable_events: bill.create_events() except Exception as e: log.error("Could not parse Senate Floor Schedule: " + repr(e))
def main(options):
    """
    Process bill terms and bills
    """

    # --- Phase 1: sync BillTerm records from the term XML vocabularies. ---

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            # NOTE(review): a KeyError from the dict lookup is the expected miss here;
            # the bare except also swallows anything else — consider `except KeyError`.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        # NOTE(review): logs the parent `term`; the old-terms branch
                        # above logs `subterm` — looks like a copy-paste slip.
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    # Any cached term not seen in the source files is stale: delete it.
    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # --- Phase 2: parse bill XML files into Bill records. ---

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress and int(options.congress) <= 42:
        files = glob.glob('data/congress/%s/bills/*/*/*.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' % options.congress)
    elif options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        # NOTE(review): `options.congress > 42` compares a string with an int — under
        # Python 2 this is always True; presumably `int(options.congress) > 42` was meant.
        if (not options.congress or options.congress > 42) and (
                bill_index and not options.disable_events
        ) and not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
            try:
                b = Bill.objects.get(congress=m.group(1),
                                     bill_type=BillType.by_xml_code(m.group(2)),
                                     number=m.group(3))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    # Warn when text ought to exist by now (two weeks post-introduction).
                    if b.congress >= 103 and b.introduced_date < (
                            datetime.now() - timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    bill_index.update_object(b, using="bill")  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)
                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise

            seen_bill_ids.append(bill.id)  # don't delete me later

            # Tag the provenance of the bill data by Congress era.
            if bill.congress >= 93:
                bill.source = "thomas-congproj"
            elif bill.congress >= 82:
                bill.source = "statutesatlarge"
                if bill.current_status == BillStatus.enacted_signed:
                    bill.current_status = BillStatus.enacted_unknown
            elif bill.congress <= 42:
                bill.source = "americanmemory"
            else:
                raise ValueError()

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            # Collect status-changing actions into (datetime-repr, status, text, xml) tuples.
            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append((
                    repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn),
                ))
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise
            if bill_index:
                bill_index.update_object(bill, using="bill")
            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in=seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b

    # The rest is for current only...
    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return

    # Parse docs.house.gov for what might be coming up this week.
    import iso8601
    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"", dhg_html)
    if not m:
        log.error('No docs.house.gov download link found at http://docs.house.gov.')
    else:
        def bt_re(bt):
            # Build a lenient regex from a BillType display abbreviation,
            # allowing the dots to be omitted or followed by whitespace.
            return re.escape(bt[1]).replace(r"\.", r"\.?\s*")

        try:
            dhg = etree.parse(
                urllib.urlopen("http://docs.house.gov/floor/" + m.group(1))).getroot()
        except:
            print "http://docs.house.gov/floor/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            if billname is None:
                continue  # weird but OK
            m = re.match(
                r"\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?("
                + "|".join(bt_re(bt) for bt in BillType)
                + r")(\d+)\s*(\[Conference Report\]\s*)?$",
                billname, re.I)
            if not m:
                if not billname.strip().endswith(" __"):
                    log.error('Could not parse legis-num "%s" in docs.house.gov.' % billname)
            else:
                # Find which BillType the matched prefix belongs to; the
                # for-else fires only when no type matched.
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1), re.I):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS,
                                                    bill_type=bt[0],
                                                    number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(
                                item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index:
                                bill_index.update_object(bill, using="bill")
                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error('Could not find bill "%s" in docs.house.gov.' % billname)
                        break
                else:
                    log.error('Could not parse legis-num bill type "%s" in docs.house.gov.' % m.group(1))

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen(
        "http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm"
    ).read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(
                r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            # Re-stamp only if never stamped or the stamp is older than a week.
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(
                    days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index:
                    bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
def main(options): """ Parse rolls. """ # Setup XML processors vote_processor = VoteProcessor() option_processor = VoteOptionProcessor() voter_processor = VoterProcessor() voter_processor.PERSON_CACHE = dict((x.pk, x) for x in Person.objects.all()) # The pattern which the roll file matches # Filename contains info which should be placed to DB # along with info extracted from the XML file re_path = re.compile('data/us/(\d+)/rolls/([hs])(\w+)-(\d+)\.xml') chamber_mapping = {'s': CongressChamber.senate, 'h': CongressChamber.house} if options.filter: files = glob.glob(options.filter) log.info('Parsing rolls matching %s' % options.filter) elif options.congress: files = glob.glob('data/us/%s/rolls/*.xml' % options.congress) log.info('Parsing rolls of only congress#%s' % options.congress) else: files = glob.glob('data/us/*/rolls/*.xml') log.info('Processing votes: %d files' % len(files)) total = len(files) progress = Progress(total=total, name='files', step=10) def log_delete_qs(qs): if qs.count() > 0: try: print "Deleting: ", qs except Exception as e: print "Deleting [%s]..." % str(e) if qs.count() > 3: print "Delete skipped..." return qs.delete() seen_obj_ids = set() had_error = False for fname in files: progress.tick() match = re_path.search(fname) try: existing_vote = Vote.objects.get(congress=match.group(1), chamber=chamber_mapping[match.group(2)], session=match.group(3), number=match.group(4)) except Vote.DoesNotExist: existing_vote = None if not File.objects.is_changed(fname) and not options.force and existing_vote != None and not existing_vote.missing_data: seen_obj_ids.add(existing_vote.id) continue try: tree = etree.parse(fname) ## Look for votes with VP tie breakers. 
#if len(tree.xpath("/roll/voter[@VP='1']")) == 0: # had_error = True # prevent delete at the end # continue # Process role object for roll_node in tree.xpath('/roll'): vote = vote_processor.process(Vote(), roll_node) if existing_vote: vote.id = existing_vote.id match = re_path.search(fname) vote.congress = int(match.group(1)) vote.chamber = chamber_mapping[match.group(2)] vote.session = match.group(3) vote.number = int(match.group(4)) for bill_node in roll_node.xpath("bill"): try: vote.related_bill = Bill.objects.get(congress=bill_node.get("session"), bill_type=BillType.by_xml_code(bill_node.get("type")), number=bill_node.get("number")) # for votes on passage, reverse the order of the title so that the # name of the bill comes first, but keep the vote_type at the end # to distinguish suspension votes etc. also, the title that comes # from the upstream source is not formatted in our style. if vote.category in (VoteCategory.passage, VoteCategory.passage_suspension, VoteCategory.veto_override): vote.question = truncatewords(vote.related_bill.title, 12) + " (" + vote.vote_type + ")" except Bill.DoesNotExist: vote.missing_data = True vote.save() seen_obj_ids.add(vote.id) # don't delete me later # Process roll options, overwrite existing options where possible. seen_option_ids = set() roll_options = {} for option_node in roll_node.xpath('./option'): option = option_processor.process(VoteOption(), option_node) option.vote = vote if existing_vote: try: option.id = VoteOption.objects.filter(vote=vote, key=option.key)[0].id # get is better, but I had the database corruption problem except IndexError: pass option.save() roll_options[option.key] = option seen_option_ids.add(option.id) log_delete_qs(VoteOption.objects.filter(vote=vote).exclude(id__in=seen_option_ids)) # may cascade and delete the Voters too? # Process roll voters, overwriting existing voters where possible. 
if existing_vote: existing_voters = dict(Voter.objects.filter(vote=vote).values_list("person", "id")) seen_voter_ids = set() for voter_node in roll_node.xpath('./voter'): voter = voter_processor.process(roll_options, Voter(), voter_node) voter.vote = vote voter.created = vote.created # for VP votes, load the actual person... if voter.voter_type == VoterType.vice_president: try: r = PersonRole.objects.get(role_type=RoleType.vicepresident, startdate__lte=vote.created, enddate__gte=vote.created) voter.person = r.person except: # overlapping roles? missing data? log.error('Could not resolve vice president in %s' % fname, exc_info=ex) if existing_vote and voter.person: try: voter.id = existing_voters[voter.person.id] except KeyError: pass voter.save() if voter.voter_type == VoterType.unknown and not vote.missing_data: vote.missing_data = True vote.save() seen_voter_ids.add(voter.id) log_delete_qs(Voter.objects.filter(vote=vote).exclude(id__in=seen_voter_ids)) # possibly already deleted by cascade above vote.calculate_totals() if not options.disable_events: vote.create_event() File.objects.save_file(fname) except Exception, ex: log.error('Error in processing %s' % fname, exc_info=ex) had_error = True
def main(options): """ Update Person and PersonRole models. Do safe update: touch only those records which have been changed. """ BASE_PATH = CONGRESS_LEGISLATORS_PATH SRC_FILES = ['legislators-current', 'legislators-historical', 'legislators-social-media', 'executive'] # order matters for p in SRC_FILES: f = BASE_PATH + p + ".yaml" if not File.objects.is_changed(f) and not options.force: log.info('File %s was not changed' % f) else: # file modified... break else: # no 'break' ==> no files modified return # Start parsing. had_error = False # Get combined data. legislator_data = { } leg_id_map = { } for p in SRC_FILES: log.info('Opening %s...' % p) f = BASE_PATH + p + ".yaml" y = yaml_load(f) for m in y: if p != 'legislators-social-media': govtrack_id = m["id"].get("govtrack") # For the benefit of the social media file, make a mapping of IDs. for k, v in m["id"].items(): if type(v) != list: leg_id_map[(k,v)] = govtrack_id else: # GovTrack IDs are not always listed in this file. for k, v in m["id"].items(): if type(v) != list and (k, v) in leg_id_map: govtrack_id = leg_id_map[(k,v)] break if not govtrack_id: print "No GovTrack ID:" pprint.pprint(m) had_error = True continue if govtrack_id not in legislator_data: legislator_data[govtrack_id] = m elif p == "legislators-social-media": legislator_data[govtrack_id]["social"] = m["social"] elif p == "executive": legislator_data[govtrack_id]["terms"].extend( m["terms"] ) else: raise ValueError("Duplication in an unexpected way.") person_processor = PersonProcessor() role_processor = PersonRoleProcessor() existing_persons = set(Person.objects.values_list('pk', flat=True)) processed_persons = set() created_persons = set() progress = Progress(total=len(legislator_data)) log.info('Processing persons') for node in legislator_data.values(): # Wrap each iteration in try/except # so that if some node breaks the parsing process # then other nodes could be parsed try: person = person_processor.process(Person(), node) # Create cached 
name strings. This is done again later # after the roles are updated. person.set_names() # Now try to load the person with such ID from # database. If found it then just update it # else create new Person object try: ex_person = Person.objects.get(pk=person.pk) if person_processor.changed(ex_person, person) or options.force: # If the person has PK of existing record, # coming in via the YAML-specified GovTrack ID, # then Django ORM will update existing record if not options.force: log.warn("Updated %s" % person) person.save() except Person.DoesNotExist: created_persons.add(person.pk) person.save() log.warn("Created %s" % person) processed_persons.add(person.pk) # Process roles of the person roles = list(PersonRole.objects.filter(person=person)) existing_roles = set(PersonRole.objects.filter(person=person).values_list('pk', flat=True)) processed_roles = set() role_list = [] for role in node['terms']: role = role_processor.process(PersonRole(), role) role.person = person role.current = role.startdate <= datetime.now().date() and role.enddate >= datetime.now().date() # \ #and CURRENT_CONGRESS in role.congress_numbers() # Try to match this role with one already in the database. # First search for an exact match on type/start/end. ex_role = None for r in roles: if role.role_type == r.role_type and r.startdate == role.startdate and r.enddate == role.enddate: ex_role = r break # Otherwise match on type/start only. if not ex_role: for r in roles: if role.role_type == r.role_type and r.startdate == role.startdate: ex_role = r break if ex_role: # These roles correspond. processed_roles.add(ex_role.id) role.id = ex_role.id if role_processor.changed(ex_role, role) or options.force: role.save() role_list.append(role) if not options.force: log.warn("Updated %s" % role) roles.remove(ex_role) # don't need to try matching this to any other node else: # Didn't find a matching role. if len([r for r in roles if r.role_type == role.role_type]) > 0: print role, "is one of these?" 
for ex_role in roles: print "\t", ex_role raise Exception("There is an unmatched role.") log.warn("Created %s" % role) role.save() role_list.append(role) # create the events for the roles after all have been loaded # because we don't create events for ends of terms and # starts of terms that are adjacent. if not options.disable_events: for i in xrange(len(role_list)): role_list[i].create_events( role_list[i-1] if i > 0 else None, role_list[i+1] if i < len(role_list)-1 else None ) removed_roles = existing_roles - processed_roles for pk in removed_roles: pr = PersonRole.objects.get(pk=pk) print pr.person.id, pr raise ValueError("Deleted role??") log.warn("Deleted %s" % pr) pr.delete() # The name can't be determined until all of the roles are set. If # it changes, re-save. Unfortunately roles are cached so this actually # doesn't work yet. Re-run the parser to fix names. nn = (person.name, person.sortname) if hasattr(person, "role"): delattr(person, "role") # clear the cached info person.set_names() if nn != (person.name, person.sortname): log.warn("%s is now %s." % (nn[0], person.name)) person.save() except Exception, ex: # Catch unexpected exceptions and log them pprint.pprint(node) log.error('', exc_info=ex) had_error = True progress.tick()
def main(options):
    """
    Process amendments
    """

    # Select amendment data files, optionally narrowed to one Congress
    # and/or by a path-filter regex.
    if options.congress:
        files = glob.glob(CONGRESS_DATA_PATH + '/{congress}/amendments/*/*/data.xml'.format(congress=options.congress))
        log.info('Parsing amendments of only congress#%s' % options.congress)
    else:
        files = glob.glob(CONGRESS_DATA_PATH + '/*/amendments/*/*/data.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing amendments: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    amendment_processor = AmendmentProcessor()
    seen_amdt_ids = []
    for fname in files:
        progress.tick()

        # The amendment's congress/type/number come from the file path, not the XML.
        m = re.match(re.escape(CONGRESS_DATA_PATH) + r'/(?P<congress>\d+)/amendments/(?P<amendment_type>[a-z]+)/(?P<amendment_type2>[a-z]+)(?P<number>[0-9]+)/data.xml', fname)

        if not File.objects.is_changed(fname) and not options.force:
            # File unchanged: just record the existing amendment's id so it
            # isn't reported as missing at the end.
            # NOTE(review): Amendment.DoesNotExist is not caught here — an
            # unchanged file whose record is absent from the DB would crash.
            if not m:
                raise ValueError("Invalid file name", fname)
            else:
                amdt = Amendment.objects.get(congress=int(m.group("congress")), amendment_type=AmendmentType.by_slug(m.group("amendment_type")), number=int(m.group("number")))
                seen_amdt_ids.append(amdt.id)  # don't delete me later
            continue

        # NOTE(review): on this (changed-file) path `m` is used without the
        # `if not m` guard above — a non-matching path would raise AttributeError.
        tree = etree.parse(fname)
        node = tree.xpath('/amendment')[0]
        node.set("amendment_type", m.group("amendment_type"))  # move from the filename to a place where we can see it in the XML
        try:
            amdt = amendment_processor.process(Amendment(), node)
        except:
            print(fname)
            raise
        if not amdt:
            # Amendments to treaties. Can't process.
            continue

        # update if already in db
        try:
            amdt.id = Amendment.objects.get(congress=amdt.congress, amendment_type=amdt.amendment_type, number=amdt.number).id
        except Amendment.DoesNotExist:
            pass  # a new amendment

        seen_amdt_ids.append(amdt.id)  # don't delete me later

        try:
            amdt.save()
        except:
            print(amdt)
            raise

        # For House votes on amendments, the only way to associate the vote with the
        # amendment is to use the THOMAS/Congress.gov action lines. The House vote XML
        # has an amendment-num field but its meaning is ambiguous, so it is useless.
        # When we parse a House amendment with an action line referencing a roll call vote,
        # save this amendment as that vote's related_amendment, then mark the vote as
        # 'missing data' (below) so that on the next parse of votes its title gets updated.
        if amdt.amendment_type == AmendmentType.house_amendment:
            for vote in node.xpath("actions/vote[@how='roll']"):
                v_congress, v_session = get_session_from_date(XmlProcessor.parse_datetime(vote.get('datetime')).date())
                v_roll = int(vote.get("roll"))
                try:
                    vote = Vote.objects.get(congress=v_congress, chamber=CongressChamber.house, session=v_session, number=v_roll)
                    vote.related_amendment = amdt
                    vote.save()
                except Vote.DoesNotExist:
                    print("Missing vote data in", fname)

        # If this amendment is related to a vote, mark the vote as missing data because
        # we may need to update the vote title if the amendment title has changed.
        Vote.objects.filter(related_amendment=amdt).update(missing_data=True)

        File.objects.save_file(fname)

    # Are any amendments in the database no longer on disk?
    if options.congress and not options.filter:
        missing = Amendment.objects.filter(congress=options.congress).exclude(id__in = seen_amdt_ids)
        if missing.exists():
            print("Amendments should be deleted: ", missing)
def main(options):
    """
    Process bill terms and bills (Python 2 variant).

    Phase 1 syncs BillTerm records from the old-style (liv.xml) and
    new-style (liv111.xml, crsnet.xml) vocabulary files, deleting terms
    no longer present in any file. Phase 2 parses bill XML files into
    Bill records, updates the search index, and creates events.

    options -- command-line flags; this code reads .congress, .filter,
    .force, .slow, .disable_indexing, and .disable_events.
    """
    # Terms
    term_processor = TermProcessor()
    terms_parsed = set()
    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term
    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            # Cache miss (KeyError): a term we haven't seen before.
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()
        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))
    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()
            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        # NOTE(review): logs the parent 'term' here -- the
                        # parallel old-terms loop logs 'subterm'; probably a typo.
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))
    # Any cached term not touched by either pass has disappeared upstream.
    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()
    # Bills
    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()
    if options.congress and int(options.congress) <= 42:
        files = glob.glob('data/congress/%s/bills/*/*/*.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' % options.congress)
    elif options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)
    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()
        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        # NOTE(review): options.congress is presumably a string here; under
        # Python 2, 'str > int' is always True, so 'options.congress>42'
        # likely never excludes anything -- confirm option parsing (the
        # Python 3 variant of this function uses int(options.congress) > 42).
        if (not options.congress or options.congress>42) and (bill_index and not options.disable_events) and not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)
                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (datetime.now()-timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    b.update_index(bill_index) # index the full text
                    b.create_events() # events for new bill text documents
                    File.objects.save_file(textfile)
                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass # just parse as normal
        if options.slow:
            time.sleep(1)
        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise
            seen_bill_ids.append(bill.id) # don't delete me later
            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None
            # Collect the bill's status-changing actions as serialized tuples.
            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append( (
                    repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn),
                ) )
            # Slip-law (public/private law) number, taken from an 'enacted' action.
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])
            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise
            if bill_index:
                bill.update_index(bill_index)
            if not options.disable_events:
                bill.create_events()
        File.objects.save_file(fname)
    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in = seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b
    # The rest is for current only...
    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return
    # Find what might be coming up this week.
    load_docs_house_gov(options, bill_index)
    load_senate_floor_schedule(options, bill_index)
def main(options):
    """
    Process committees, subcommittees and members of current congress committees.

    Three phases: (1) sync Committee records from committees-current.yaml,
    marking committees absent from the file as obsolete; (2) rebuild all
    CommitteeMember records from committee-membership-current.yaml;
    (3) sync CommitteeMeeting records from per-chamber JSON schedule files.

    options -- command-line flags; this code reads .force and
    .disable_events.
    """
    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH
    meeting_processor = CommitteeMeetingProcessor()
    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'
    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            # Upsert the top-level committee keyed by its THOMAS code.
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print "New committee:", committee["thomas_id"]
                cobj = Committee(code=committee["thomas_id"])
            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None
            cobj.jurisdiction = committee.get("jurisdiction")
            cobj.jurisdiction_link = committee.get("jurisdiction_source")
            cobj.save()
            seen_committees.add(cobj.id)
            # Subcommittee codes are the parent code plus the sub code.
            for subcom in committee.get('subcommittees', []):
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print "New subcommittee:", code
                    sobj = Committee(code=code)
                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)
            progress.tick()
        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(id__in=seen_committees)
        if len(other_committees) > 0:
            print "Marking obsolete:", ", ".join(c.code for c in other_committees)
            other_committees.update(obsolete=True)
        File.objects.save_file(COMMITTEES_FILE)
    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)
    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = { }
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]
        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')
        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()
        # Process committee nodes
        for committee, members in tree.items():
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print "Committee not found:", committee
                continue
            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()
            progress.tick()
        File.objects.save_file(MEMBERS_FILE)
    log.info('Processing committee schedule')
    for chamber in ("house", "senate"):
        meetings_file = 'data/congress/committee_meetings_%s.json' % chamber
        file_changed = File.objects.is_changed(meetings_file)
        if not file_changed and not options.force:
            log.info('File %s was not changed' % meetings_file)
        else:
            meetings = json.load(open(meetings_file))
            # Process committee event nodes
            for meeting in meetings:
                try:
                    # Associate it with an existing meeting object if GUID is
                    # already known.
                    # Must get it like this, vs just assigning the ID as we do in other parsers,
                    # because of the auto_now_add created field, which otherwise misbehaves.
                    try:
                        mobj = CommitteeMeeting.objects.get(guid=meeting['guid'])
                    except CommitteeMeeting.DoesNotExist:
                        mobj = CommitteeMeeting()
                    # Parse.
                    mobj = meeting_processor.process(mobj, meeting)
                    # Attach the meeting to the subcommittee if set.
                    if mobj.subcommittee:
                        mobj.committee = Committee.objects.get(code=mobj.committee.code + mobj.subcommittee)
                    mobj.save()
                    mobj.bills.clear()
                    # Bill ids look like "hr1234-113": type, number, congress.
                    for bill in meeting["bill_ids"]:
                        try:
                            bill_type, bill_num, bill_cong = re.match(r"([a-z]+)(\d+)-(\d+)$", bill).groups()
                            bill = Bill.objects.get(congress=bill_cong, bill_type=BillType.by_slug(bill_type), number=int(bill_num))
                            mobj.bills.add(bill)
                        except AttributeError:
                            pass # regex failed
                        except common.enum.NotFound:
                            pass # invalid bill type code in source data
                        except Bill.DoesNotExist:
                            pass # we don't know about bill yet
                except Committee.DoesNotExist:
                    log.error('Could not load Committee object for meeting %s' % meeting_processor.display_node(meeting))
    for committee in Committee.objects.all():
        if not options.disable_events:
            committee.create_events()
    # NOTE(review): this sits outside the per-chamber loop and its
    # changed-file guard, so it only records the last (senate) file and
    # runs even when neither file changed -- verify it shouldn't be inside
    # the else branch above, once per chamber.
    File.objects.save_file(meetings_file)
def main(options):
    """
    Process bill terms and bills (Python 3 variant).

    Phase 1 syncs BillTerm records from the old-style (bill/liv.xml) and
    new-style (bill/liv111.xml, bill/crsnet.xml) vocabulary files,
    deleting terms no longer present in any file. Phase 2 parses
    unitedstates/congress data.xml bill files into Bill records, updates
    the search index, and creates events.

    options -- command-line flags; this code reads .congress, .filter,
    .force, .slow, .disable_indexing, and .disable_events.
    """
    # Terms
    term_processor = TermProcessor()
    terms_parsed = set()
    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term
    log.info('Processing old bill terms')
    TERMS_FILE = 'bill/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            # Cache miss (KeyError): a term we haven't seen before.
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()
        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))
    log.info('Processing new bill terms')
    for FILE in ('bill/liv111.xml', 'bill/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()
            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        # NOTE(review): logs the parent 'term' here -- the
                        # parallel old-terms loop logs 'subterm'; probably a typo.
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))
    # Any cached term not touched by either pass has disappeared upstream.
    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()
    # Bills
    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()
    if options.congress:
        files = glob.glob(settings.CONGRESS_DATA_PATH + '/%s/bills/*/*/data.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' % options.congress)
    else:
        files = glob.glob(settings.CONGRESS_DATA_PATH + '/*/bills/*/*/data.xml')
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)
    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()
        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or int(options.congress) > 42) and (
                bill_index and not options.disable_events
        ) and not File.objects.is_changed(fname) and not options.force:
            m = re.match(
                re.escape(settings.CONGRESS_DATA_PATH) +
                r'/(?P<congress>\d+)/bills/(?P<bill_type>[a-z]+)/(?P<bill_type_2>[a-z]+)(?P<number>\d+)/data.xml',
                fname)
            try:
                b = Bill.objects.get(congress=int(m.group("congress")),
                                     bill_type=BillType.by_slug(m.group("bill_type")),
                                     number=m.group("number"))
                seen_bill_ids.append(b.id)
                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    # Complain only when text should plausibly exist by now.
                    if b.congress >= 103 and b.introduced_date < (
                            datetime.now() - timedelta(days=14)).date():
                        print("No bill text?", fname, b.introduced_date)
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    b.update_index(bill_index)  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)
                continue
            except Bill.DoesNotExist:
                print("Unchanged metadata file but bill doesn't exist:", fname)
                pass  # just parse as normal
        if options.slow:
            time.sleep(1)
        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print(fname)
                raise
            seen_bill_ids.append(bill.id)  # don't delete me later
            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = str(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None
            # Collect the bill's status-changing actions as serialized tuples.
            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                if axn.xpath("string(@state)") == "REFERRED":
                    continue  # we don't track this state
                actions.append((
                    repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn, encoding=str),
                ))
            # Slip-law (public/private law) number, taken from an 'enacted' action.
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])
            bill.major_actions = actions
            try:
                bill.save()
            except:
                print(bill)
                raise
            if bill_index:
                bill.update_index(bill_index)
            if not options.disable_events:
                bill.create_events()
        File.objects.save_file(fname)
    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in=seen_bill_ids):
            print("Bill is no longer on disk: ", b.id, b)
    # The rest is for current only...
    if options.congress and int(options.congress) != settings.CURRENT_CONGRESS:
        return
    # Find what might be coming up this week.
    load_docs_house_gov(options, bill_index)
    load_senate_floor_schedule(options, bill_index)
def main(options):
    """
    Parse roll-call vote XML files into Vote/VoteOption/Voter records.

    For each roll file: upsert the Vote, resolve the related bill and
    amendment, normalize the question text, then overwrite the vote's
    options and voters, deleting obsoleted child records. Per-file errors
    are logged and processing continues.

    options -- command-line flags; this code reads .filter, .congress,
    .force, and .disable_events.
    """
    # Setup XML processors
    vote_processor = VoteProcessor()
    option_processor = VoteOptionProcessor()
    voter_processor = VoterProcessor()
    voter_processor.PERSON_CACHE = dict((x.pk, x) for x in Person.objects.all())
    # The pattern which the roll file matches
    # Filename contains info which should be placed to DB
    # along with info extracted from the XML file
    re_path = re.compile("data/us/(\d+)/rolls/([hs])(\w+)-(\d+)\.xml")
    chamber_mapping = {"s": CongressChamber.senate, "h": CongressChamber.house}
    if options.filter:
        files = glob.glob(options.filter)
        log.info("Parsing rolls matching %s" % options.filter)
    elif options.congress:
        files = glob.glob("data/us/%s/rolls/*.xml" % options.congress)
        log.info("Parsing rolls of only congress#%s" % options.congress)
    else:
        files = glob.glob("data/us/*/rolls/*.xml")
    log.info("Processing votes: %d files" % len(files))
    total = len(files)
    progress = Progress(total=total, name="files", step=10)

    def log_delete_qs(qs):
        # Delete a queryset of obsoleted child records, logging what goes.
        if qs.count() == 0:
            return
        print "Deleting obsoleted records: ", qs
        # if qs.count() > 3:
        #     print "Delete skipped..."
        #     return
        qs.delete()

    seen_obj_ids = set()
    had_error = False
    for fname in files:
        progress.tick()
        match = re_path.search(fname)
        try:
            existing_vote = Vote.objects.get(
                congress=match.group(1),
                chamber=chamber_mapping[match.group(2)],
                session=match.group(3),
                number=match.group(4),
            )
        except Vote.DoesNotExist:
            existing_vote = None
        # Skip unchanged files unless forced, the vote is new, or the vote
        # was previously flagged as missing data (and so needs a re-parse).
        if (
            not File.objects.is_changed(fname)
            and not options.force
            and existing_vote != None
            and not existing_vote.missing_data
        ):
            seen_obj_ids.add(existing_vote.id)
            continue
        try:
            tree = etree.parse(fname)
            ## Look for votes with VP tie breakers.
            # if len(tree.xpath("/roll/voter[@VP='1']")) == 0:
            #     had_error = True # prevent delete at the end
            #     continue
            # Process role object
            for roll_node in tree.xpath("/roll"):
                vote = vote_processor.process(Vote(), roll_node)
                if existing_vote:
                    vote.id = existing_vote.id
                # Identity fields come from the file path, not the XML.
                match = re_path.search(fname)
                vote.congress = int(match.group(1))
                vote.chamber = chamber_mapping[match.group(2)]
                vote.session = match.group(3)
                vote.number = int(match.group(4))
                # Get related bill & amendment.
                for bill_node in roll_node.xpath("bill"):
                    related_bill_num = bill_node.get("number")
                    if 9 <= vote.congress <= 42 and vote.session in ("1", "2"):
                        # Bill numbering from the American Memory colletion is different. The number combines
                        # the session, bill number, and a 0 or 5 for regular or 'half' numbering. Prior to
                        # the 9th congress numbering seems to be wholly assigned by us and not related to
                        # actual numbering, so we skip matching those bills.
                        related_bill_num = "%d%04d%d" % (int(vote.session), int(bill_node.get("number")), 0)
                    try:
                        vote.related_bill = Bill.objects.get(
                            congress=bill_node.get("session"),
                            bill_type=BillType.by_xml_code(bill_node.get("type")),
                            number=related_bill_num,
                        )
                    except Bill.DoesNotExist:
                        # Only treat a missing bill as a data problem for
                        # congresses where we expect complete bill data.
                        if vote.congress >= 93:
                            vote.missing_data = True
                for amdt_node in roll_node.xpath("amendment"):
                    if amdt_node.get("ref") == "regular" and vote.related_bill is not None:
                        try:
                            vote.related_amendment = Amendment.objects.get(
                                congress=vote.related_bill.congress,
                                amendment_type=AmendmentType.by_slug(amdt_node.get("number")[0]),
                                number=amdt_node.get("number")[1:],
                            )
                        except Amendment.DoesNotExist:
                            if vote.congress >= 93:
                                print "Missing amendment", fname
                                vote.missing_data = True
                    elif amdt_node.get("ref") == "bill-serial":
                        # It is impossible to associate House votes with amendments just from the House
                        # vote XML because the amendment-num might correspond either with the A___ number
                        # or with the "An amendment, numbered ____" number from the amendment purpose,
                        # and there's really no way to figure out which. Maybe we can use the amendment
                        # sponsor instead?
                        # vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number"))
                        # Instead, we set related_amendment from the amendment parser. Here, we have to
                        # preserve the related_amendment if it is set.
                        if existing_vote:
                            vote.related_amendment = existing_vote.related_amendment
                # clean up some question text and use the question_details field
                if (
                    vote.category in (VoteCategory.passage, VoteCategory.passage_suspension, VoteCategory.veto_override)
                    and vote.related_bill
                ):
                    # For passage votes, set the question to the bill title and put the question
                    # details in the details field.
                    vote.question = truncatewords(vote.related_bill.title, 20)
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()
                elif vote.category == VoteCategory.amendment and vote.related_amendment:
                    # For votes on amendments, make a better title/explanation.
                    vote.question = truncatewords(vote.related_amendment.title, 20)
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()
                elif vote.related_bill and vote.question.startswith(
                    "On the Cloture Motion " + vote.related_bill.display_number
                ):
                    vote.question = "Cloture on " + truncatewords(vote.related_bill.title, 20)
                elif vote.related_bill and vote.question.startswith(
                    "On Cloture on the Motion to Proceed " + vote.related_bill.display_number
                ):
                    vote.question = "Cloture on " + truncatewords(vote.related_bill.title, 20)
                    vote.question_details = "On Cloture on the Motion to Proceed in the " + vote.get_chamber_display()
                elif vote.related_bill and vote.question.startswith(
                    "On the Motion to Proceed " + vote.related_bill.display_number
                ):
                    vote.question = "Motion to Proceed on " + truncatewords(vote.related_bill.title, 20)
                elif vote.related_amendment and vote.question.startswith(
                    "On the Cloture Motion "
                    + vote.related_amendment.get_amendment_type_display()
                    + " "
                    + str(vote.related_amendment.number)
                ):
                    vote.question = "Cloture on " + truncatewords(vote.related_amendment.title, 20)
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()
                # weird House foratting of bill numbers ("H RES 123 Blah blah")
                if vote.related_bill:
                    vote.question = re.sub(
                        "(On [^:]+): " + vote.related_bill.display_number.replace(". ", " ").replace(".", " ").upper() + " .*",
                        r"\1: " + truncatewords(vote.related_bill.title, 15),
                        vote.question,
                    )
                vote.save()
                seen_obj_ids.add(vote.id)  # don't delete me later
                # Process roll options, overwrite existing options where possible.
                seen_option_ids = set()
                roll_options = {}
                for option_node in roll_node.xpath("./option"):
                    option = option_processor.process(VoteOption(), option_node)
                    option.vote = vote
                    if existing_vote:
                        try:
                            option.id = VoteOption.objects.filter(vote=vote, key=option.key)[
                                0
                            ].id  # get is better, but I had the database corruption problem
                        except IndexError:
                            pass
                    option.save()
                    roll_options[option.key] = option
                    seen_option_ids.add(option.id)
                log_delete_qs(
                    VoteOption.objects.filter(vote=vote).exclude(id__in=seen_option_ids)
                )  # may cascade and delete the Voters too?
                # Process roll voters, overwriting existing voters where possible.
                if existing_vote:
                    existing_voters = dict(Voter.objects.filter(vote=vote).values_list("person", "id"))
                seen_voter_ids = set()
                voters = list()
                for voter_node in roll_node.xpath("./voter"):
                    voter = voter_processor.process(roll_options, Voter(), voter_node)
                    voter.vote = vote
                    voter.created = vote.created
                    # for VP votes, load the actual person...
                    if voter.voter_type == VoterType.vice_president:
                        try:
                            r = PersonRole.objects.get(
                                role_type=RoleType.vicepresident, startdate__lte=vote.created, enddate__gte=vote.created
                            )
                            voter.person_role = r
                            voter.person = r.person
                        except:
                            # overlapping roles? missing data?
                            # NOTE(review): 'ex' is not bound inside this bare
                            # except clause -- if this path is hit it raises
                            # NameError instead of logging. Should be
                            # 'except Exception, ex:' (or log without exc_info).
                            log.error("Could not resolve vice president in %s" % fname, exc_info=ex)
                    if existing_vote and voter.person:
                        try:
                            voter.id = existing_voters[voter.person.id]
                        except KeyError:
                            pass
                    voters.append(voter)
                    if voter.voter_type == VoterType.unknown and not vote.missing_data:
                        vote.missing_data = True
                        vote.save()
                # pre-fetch the role of each voter
                load_roles_at_date([x.person for x in voters if x.person != None], vote.created)
                for voter in voters:
                    # NOTE(review): assumes voter.person is set here; a voter
                    # with person None (e.g. unresolved VP above) would raise
                    # AttributeError -- confirm upstream guarantees.
                    voter.person_role = voter.person.role
                    if voter.person_role is None:
                        log.error("%s: Could not find role for %s on %s." % (fname, voter.person, vote.created))
                        vote.missing_data = True
                        vote.save()
                # save all of the records (inserting/updating)
                for voter in voters:
                    voter.save()
                    seen_voter_ids.add(voter.id)
                # remove obsolete voter records
                log_delete_qs(
                    Voter.objects.filter(vote=vote).exclude(id__in=seen_voter_ids)
                )  # possibly already deleted by cascade above
                # pre-calculate totals
                vote.calculate_totals()
                if not options.disable_events:
                    vote.create_event()
            File.objects.save_file(fname)
        except Exception, ex:
            # Keep going on per-file failures; remember that an error
            # occurred so downstream cleanup can be suppressed.
            log.error("Error in processing %s" % fname, exc_info=ex)
            had_error = True
def main(options):
    """
    Process committees, subcommittees and members of current congress committees.

    Older variant: syncs Committee records from committees-current.yaml,
    rebuilds CommitteeMember records (Senate only -- House membership data
    is skipped as out of date), then returns. The committee-schedule
    processing below the return is unreachable, apparently deliberately
    disabled.

    options -- command-line flags; this code reads .force and
    .disable_events (the latter only in the dead code).
    """
    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH
    meeting_processor = CommitteeMeetingProcessor()
    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'
    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            # Upsert the top-level committee keyed by its THOMAS code.
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print "New committee:", committee["thomas_id"]
                cobj = Committee(code=committee["thomas_id"])
            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None
            cobj.save()
            seen_committees.add(cobj.id)
            # Subcommittee codes are the parent code plus the sub code.
            for subcom in committee.get('subcommittees', []):
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print "New subcommittee:", code
                    sobj = Committee(code=code)
                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)
            progress.tick()
        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(id__in=seen_committees)
        if len(other_committees) > 0:
            print "Marking obsolete:", ", ".join(c.code for c in other_committees)
            other_committees.update(obsolete=True)
        File.objects.save_file(COMMITTEES_FILE)
    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)
    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = { }
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]
        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')
        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()
        # Process committee nodes
        for committee, members in tree.items():
            if committee[0] == "H":
                continue # House data is out of date
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print "Committee not found:", committee
                continue
            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()
            progress.tick()
        File.objects.save_file(MEMBERS_FILE)
    return
    # NOTE(review): everything below is unreachable because of the bare
    # return above -- the committee schedule processing appears to have
    # been deliberately disabled. Left in place for reference.
    log.info('Processing committee schedule')
    SCHEDULE_FILE = 'data/us/112/committeeschedule.xml'
    file_changed = File.objects.is_changed(SCHEDULE_FILE)
    if not file_changed and not options.force:
        log.info('File %s was not changed' % SCHEDULE_FILE)
    else:
        tree = etree.parse(SCHEDULE_FILE)
        # We have to clear out all CommitteeMeeting objects when we refresh because
        # we have no unique identifier in the upstream data for a meeting. We might use
        # the meeting's committee & date as an identifier, but since meeting times can
        # change this might have awkward consequences for the end user if we even
        # attempted to track that.
        CommitteeMeeting.objects.all().delete()
        # Process committee event nodes
        for meeting in tree.xpath('/committee-schedule/meeting'):
            try:
                mobj = meeting_processor.process(CommitteeMeeting(), meeting)
                mobj.save()
                mobj.bills.clear()
                for bill in meeting.xpath('bill'):
                    bill = Bill.objects.get(congress=bill.get("session"), bill_type=BillType.by_xml_code(bill.get("type")), number=int(bill.get("number")))
                    mobj.bills.add(bill)
            except Committee.DoesNotExist:
                log.error('Could not load Committee object for meeting %s' % meeting_processor.display_node(meeting))
    for committee in Committee.objects.all():
        if not options.disable_events:
            committee.create_events()
    File.objects.save_file(SCHEDULE_FILE)
def main(options): """ Parse rolls. """ # Setup XML processors vote_processor = VoteProcessor() option_processor = VoteOptionProcessor() voter_processor = VoterProcessor() voter_processor.PERSON_CACHE = dict( (x.pk, x) for x in Person.objects.all()) # The pattern which the roll file matches # Filename contains info which should be placed to DB # along with info extracted from the XML file re_path = re.compile('data/us/(\d+)/rolls/([hs])(\w+)-(\d+)\.xml') chamber_mapping = {'s': CongressChamber.senate, 'h': CongressChamber.house} if options.filter: files = glob.glob(options.filter) log.info('Parsing rolls matching %s' % options.filter) elif options.congress: files = glob.glob('data/us/%s/rolls/*.xml' % options.congress) log.info('Parsing rolls of only congress#%s' % options.congress) else: files = glob.glob('data/us/*/rolls/*.xml') log.info('Processing votes: %d files' % len(files)) total = len(files) progress = Progress(total=total, name='files', step=10) def log_delete_qs(qs): if qs.count() > 0: try: print "Deleting: ", qs except Exception as e: print "Deleting [%s]..." % str(e) if qs.count() > 3: print "Delete skipped..." return qs.delete() seen_obj_ids = set() had_error = False for fname in files: progress.tick() match = re_path.search(fname) try: existing_vote = Vote.objects.get( congress=match.group(1), chamber=chamber_mapping[match.group(2)], session=match.group(3), number=match.group(4)) except Vote.DoesNotExist: existing_vote = None if not File.objects.is_changed( fname ) and not options.force and existing_vote != None and not existing_vote.missing_data: seen_obj_ids.add(existing_vote.id) continue try: tree = etree.parse(fname) ## Look for votes with VP tie breakers. 
#if len(tree.xpath("/roll/voter[@VP='1']")) == 0: # had_error = True # prevent delete at the end # continue # Process role object for roll_node in tree.xpath('/roll'): vote = vote_processor.process(Vote(), roll_node) if existing_vote: vote.id = existing_vote.id match = re_path.search(fname) vote.congress = int(match.group(1)) vote.chamber = chamber_mapping[match.group(2)] vote.session = match.group(3) vote.number = int(match.group(4)) # Get related bill & amendment. for bill_node in roll_node.xpath("bill"): try: vote.related_bill = Bill.objects.get( congress=bill_node.get("session"), bill_type=BillType.by_xml_code( bill_node.get("type")), number=bill_node.get("number")) except Bill.DoesNotExist: vote.missing_data = True for amdt_node in roll_node.xpath("amendment"): if amdt_node.get("ref") == "regular": try: vote.related_amendment = Amendment.objects.get( congress=vote.related_bill.congress, amendment_type=AmendmentType.by_slug( amdt_node.get("number")[0]), number=amdt_node.get("number")[1:]) except Amendment.DoesNotExist: print "Missing amendment", fname vote.missing_data = True elif amdt_node.get("ref") == "bill-serial": # It is impossible to associate House votes with amendments just from the House # vote XML because the amendment-num might correspond either with the A___ number # or with the "An amendment, numbered ____" number from the amendment purpose, # and there's really no way to figure out which. Maybe we can use the amendment # sponsor instead? #vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number")) # Instead, we set related_amendment from the amendment parser. Here, we have to # preserve the related_amendment if it is set. 
if existing_vote: vote.related_amendment = existing_vote.related_amendment # clean up some question text and use the question_details field if vote.category in ( VoteCategory.passage, VoteCategory.passage_suspension, VoteCategory.veto_override) and vote.related_bill: # For passage votes, set the question to the bill title and put the question # details in the details field. vote.question = truncatewords(vote.related_bill.title, 20) vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display( ) elif vote.category == VoteCategory.amendment and vote.related_amendment: # For votes on amendments, make a better title/explanation. vote.question = truncatewords(vote.related_amendment.title, 20) vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display( ) elif vote.related_bill and vote.question.startswith( "On the Cloture Motion " + vote.related_bill.display_number): vote.question = "Cloture on " + truncatewords( vote.related_bill.title, 20) elif vote.related_bill and vote.question.startswith( "On Cloture on the Motion to Proceed " + vote.related_bill.display_number): vote.question = "Cloture on " + truncatewords( vote.related_bill.title, 20) vote.question_details = "On Cloture on the Motion to Proceed in the " + vote.get_chamber_display( ) elif vote.related_bill and vote.question.startswith( "On the Motion to Proceed " + vote.related_bill.display_number): vote.question = "Motion to Proceed on " + truncatewords( vote.related_bill.title, 20) elif vote.related_amendment and vote.question.startswith( "On the Cloture Motion " + vote.related_amendment.get_amendment_type_display() + " " + str(vote.related_amendment.number)): vote.question = "Cloture on " + truncatewords( vote.related_amendment.title, 20) vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display( ) # weird House foratting of bill numbers ("H RES 123 Blah blah") if vote.related_bill: vote.question = re.sub( "(On [^:]+): " + 
vote.related_bill.display_number.replace( ". ", " ").replace(".", " ").upper() + " .*", r"\1: " + truncatewords(vote.related_bill.title, 15), vote.question) vote.save() seen_obj_ids.add(vote.id) # don't delete me later # Process roll options, overwrite existing options where possible. seen_option_ids = set() roll_options = {} for option_node in roll_node.xpath('./option'): option = option_processor.process(VoteOption(), option_node) option.vote = vote if existing_vote: try: option.id = VoteOption.objects.filter( vote=vote, key=option.key )[0].id # get is better, but I had the database corruption problem except IndexError: pass option.save() roll_options[option.key] = option seen_option_ids.add(option.id) log_delete_qs( VoteOption.objects.filter(vote=vote).exclude( id__in=seen_option_ids) ) # may cascade and delete the Voters too? # Process roll voters, overwriting existing voters where possible. if existing_vote: existing_voters = dict( Voter.objects.filter(vote=vote).values_list( "person", "id")) seen_voter_ids = set() for voter_node in roll_node.xpath('./voter'): voter = voter_processor.process(roll_options, Voter(), voter_node) voter.vote = vote voter.created = vote.created # for VP votes, load the actual person... if voter.voter_type == VoterType.vice_president: try: r = PersonRole.objects.get( role_type=RoleType.vicepresident, startdate__lte=vote.created, enddate__gte=vote.created) voter.person = r.person except: # overlapping roles? missing data? 
log.error( 'Could not resolve vice president in %s' % fname, exc_info=ex) if existing_vote and voter.person: try: voter.id = existing_voters[voter.person.id] except KeyError: pass voter.save() if voter.voter_type == VoterType.unknown and not vote.missing_data: vote.missing_data = True vote.save() seen_voter_ids.add(voter.id) log_delete_qs( Voter.objects.filter(vote=vote).exclude( id__in=seen_voter_ids) ) # possibly already deleted by cascade above vote.calculate_totals() if not options.disable_events: vote.create_event() File.objects.save_file(fname) except Exception, ex: log.error('Error in processing %s' % fname, exc_info=ex) had_error = True