def parse_committee_members(options):
    """
    Load current committee membership from the congress-legislators dataset.

    Builds a Bioguide ID -> GovTrack ID map from legislators-current.yaml,
    wipes the CommitteeMember table (nothing holds a foreign key to it) and
    recreates one row per (person, committee) pair from
    committee-membership-current.yaml. Skipped when the source file is
    unchanged, unless options.force is set.
    """
    log.info('Processing committee members')
    MEMBERS_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)
    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
        return

    # map Bioguide IDs to GovTrack IDs
    y = yaml_load(settings.CONGRESS_PROJECT_PATH + "/congress-legislators/legislators-current.yaml")
    person_id_map = {}
    for m in y:
        if "id" in m and "govtrack" in m["id"] and "bioguide" in m["id"]:
            person_id_map[m["id"]["bioguide"]] = m["id"]["govtrack"]

    # load committee members
    tree = yaml_load(MEMBERS_FILE)
    progress = Progress(total=len(tree), name='committees')

    # We can delete CommitteeMember objects because we don't have
    # any foreign keys to them.
    CommitteeMember.objects.all().delete()

    # Process committee nodes
    for committee, members in tree.items():
        try:
            cobj = Committee.objects.get(code=committee)
        except Committee.DoesNotExist:
            print("Committee not found:", committee)
            continue

        # Process members of current committee node
        for member in members:
            # BUGFIX: an unknown Bioguide ID used to raise KeyError (and a
            # missing Person raised DoesNotExist), aborting the whole run
            # *after* the table had already been wiped, leaving membership
            # empty. Log and skip the member instead.
            try:
                person = Person.objects.get(id=person_id_map[member["bioguide"]])
            except (KeyError, Person.DoesNotExist):
                log.error("Could not resolve member %s of committee %s" % (member.get("bioguide"), committee))
                continue
            mobj = CommitteeMember()
            mobj.person = person
            mobj.committee = cobj
            if "title" in member:
                mobj.role = ROLE_MAPPING[member["title"]]
            mobj.save()

        progress.tick()

    File.objects.save_file(MEMBERS_FILE)
def parse_committee_members(options):
    """Rebuild all CommitteeMember records from committee-membership-current.yaml."""
    log.info('Processing committee members')
    MEMBERS_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committee-membership-current.yaml'
    if not File.objects.is_changed(MEMBERS_FILE) and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
        return

    # Resolve each legislator's Bioguide ID to their GovTrack person ID.
    legislators = yaml_load(settings.CONGRESS_PROJECT_PATH + "/congress-legislators/legislators-current.yaml")
    person_id_map = {
        leg["id"]["bioguide"]: leg["id"]["govtrack"]
        for leg in legislators
        if "id" in leg and "govtrack" in leg["id"] and "bioguide" in leg["id"]
    }

    membership = yaml_load(MEMBERS_FILE)
    progress = Progress(total=len(membership), name='committees')

    # Safe to wipe wholesale: no other table holds a foreign key to
    # CommitteeMember.
    CommitteeMember.objects.all().delete()

    for committee_code, members in membership.items():
        try:
            committee_obj = Committee.objects.get(code=committee_code)
        except Committee.DoesNotExist:
            print("Committee not found:", committee_code)
            continue

        # Create one membership row per member of this committee.
        for entry in members:
            record = CommitteeMember()
            record.person = Person.objects.get(id=person_id_map[entry["bioguide"]])
            record.committee = committee_obj
            if "title" in entry:
                record.role = ROLE_MAPPING[entry["title"]]
            record.save()

        progress.tick()

    File.objects.save_file(MEMBERS_FILE)
def parse_committee_names(options):
    """
    Sync Committee records (committees and their subcommittees) from
    committees-current.yaml.

    Existing rows are updated in place; new ones are created. Committees
    present in the database but absent from the file are marked obsolete
    rather than deleted. Skipped when the source file is unchanged, unless
    options.force is set.
    """
    log.info('Processing committees')
    COMMITTEES_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committees-current.yaml'
    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
        return

    tree = yaml_load(COMMITTEES_FILE)
    progress = Progress(total=len(tree))
    seen_committees = set()
    for committee in tree:
        try:
            cobj = Committee.objects.get(code=committee["thomas_id"])
        except Committee.DoesNotExist:
            print("New committee:", committee["thomas_id"])
            cobj = Committee(code=committee["thomas_id"])

        cobj.committee_type = TYPE_MAPPING[committee["type"]]
        cobj.name = committee["name"]
        cobj.url = committee.get("url", None)
        cobj.obsolete = False
        cobj.committee = None  # top-level committees have no parent
        cobj.jurisdiction = committee.get("jurisdiction")
        cobj.jurisdiction_link = committee.get("jurisdiction_source")
        cobj.save()
        seen_committees.add(cobj.id)

        for subcom in committee.get('subcommittees', []):
            # Subcommittee code = parent THOMAS id + subcommittee suffix.
            code = committee["thomas_id"] + subcom["thomas_id"]
            try:
                sobj = Committee.objects.get(code=code)
            except Committee.DoesNotExist:
                print("New subcommittee:", code)
                sobj = Committee(code=code)

            sobj.name = subcom["name"]
            sobj.url = subcom.get("url", None)
            # BUGFIX: this used to assign `sobj.type`, which is not the
            # field name used elsewhere (`committee_type`), so the value
            # was silently dropped on save. Subcommittees carry no type of
            # their own; clear the real field.
            sobj.committee_type = None
            sobj.committee = cobj
            sobj.obsolete = False
            sobj.save()
            seen_committees.add(sobj.id)

        progress.tick()

    # Check for non-obsolete committees in the database that aren't in our
    # file; mark them obsolete instead of deleting them.
    other_committees = Committee.objects.filter(obsolete=False).exclude(id__in=seen_committees)
    if other_committees.exists():
        print("Marking obsolete:", ", ".join(c.code for c in other_committees))
        other_committees.update(obsolete=True)

    File.objects.save_file(COMMITTEES_FILE)
def parse_committee_names(options):
    """Sync Committee rows (and subcommittees) from committees-current.yaml."""
    log.info('Processing committees')
    COMMITTEES_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committees-current.yaml'
    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
        return

    committees = yaml_load(COMMITTEES_FILE)
    progress = Progress(total=len(committees))
    seen = set()
    for node in committees:
        thomas_id = node["thomas_id"]
        # Fetch-or-create the top-level committee, then refresh its fields.
        try:
            parent = Committee.objects.get(code=thomas_id)
        except Committee.DoesNotExist:
            print("New committee:", thomas_id)
            parent = Committee(code=thomas_id)
        parent.committee_type = TYPE_MAPPING[node["type"]]
        parent.name = node["name"]
        parent.url = node.get("url", None)
        parent.obsolete = False
        parent.committee = None
        parent.jurisdiction = node.get("jurisdiction")
        parent.jurisdiction_link = node.get("jurisdiction_source")
        parent.save()
        seen.add(parent.id)

        # Subcommittees are stored as Committee rows pointing at the parent.
        for sub in node.get('subcommittees', []):
            sub_code = thomas_id + sub["thomas_id"]
            try:
                child = Committee.objects.get(code=sub_code)
            except Committee.DoesNotExist:
                print("New subcommittee:", sub_code)
                child = Committee(code=sub_code)
            child.name = sub["name"]
            child.url = sub.get("url", None)
            child.type = None
            child.committee = parent
            child.obsolete = False
            child.save()
            seen.add(child.id)

        progress.tick()

    # Rows the file no longer lists are flagged obsolete, never deleted.
    leftovers = Committee.objects.filter(obsolete=False).exclude(id__in=seen)
    if len(leftovers) > 0:
        print("Marking obsolete:", ", ".join(c.code for c in leftovers))
        leftovers.update(obsolete=True)

    File.objects.save_file(COMMITTEES_FILE)
def main(options):
    """
    Process committees, subcommittees and members of current congress
    committees.

    Legacy Python 2 variant (print statements). Syncs Committee rows from
    committees-current.yaml and CommitteeMember rows from
    committee-membership-current.yaml. The committee-schedule section at
    the bottom is unreachable -- see the bare `return` below.
    """
    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH
    meeting_processor = CommitteeMeetingProcessor()

    # ---- Committees and subcommittees ----
    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'
    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            # Update the existing Committee row or create a new one.
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print "New committee:", committee["thomas_id"]
                cobj = Committee(code=committee["thomas_id"])
            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None  # top-level committees have no parent
            cobj.save()
            seen_committees.add(cobj.id)
            for subcom in committee.get('subcommittees', []):
                # Subcommittee code = parent THOMAS id + subcommittee suffix.
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print "New subcommittee:", code
                    sobj = Committee(code=code)
                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                # NOTE(review): `type` is not the attribute name used above
                # (`committee_type`); this assignment may have no effect on
                # the saved row -- confirm against the model definition.
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)
            progress.tick()

        # Check for non-obsolete committees in the database that aren't in
        # our file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(id__in=seen_committees)
        if len(other_committees) > 0:
            print "Marking obsolete:", ", ".join(c.code for c in other_committees)
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)

    # ---- Committee membership ----
    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)
    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = { }
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]

        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')

        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            if committee[0] == "H": continue # House data is out of date
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print "Committee not found:", committee
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                # NOTE(review): raises KeyError if this member's THOMAS id
                # is absent from legislators-current.yaml -- after the
                # table has already been wiped above.
                mobj.person = Person.objects.get(id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()
            progress.tick()

        File.objects.save_file(MEMBERS_FILE)

    # NOTE(review): early exit -- everything below is dead code. Remove the
    # return (or the schedule section) once the XML schedule source is
    # usable again.
    return

    log.info('Processing committee schedule')
    SCHEDULE_FILE = 'data/us/112/committeeschedule.xml'
    file_changed = File.objects.is_changed(SCHEDULE_FILE)
    if not file_changed and not options.force:
        log.info('File %s was not changed' % SCHEDULE_FILE)
    else:
        tree = etree.parse(SCHEDULE_FILE)

        # We have to clear out all CommitteeMeeting objects when we refresh
        # because we have no unique identifier in the upstream data for a
        # meeting. We might use the meeting's committee & date as an
        # identifier, but since meeting times can change this might have
        # awkward consequences for the end user if we even attempted to
        # track that.
        CommitteeMeeting.objects.all().delete()

        # Process committee event nodes
        for meeting in tree.xpath('/committee-schedule/meeting'):
            try:
                mobj = meeting_processor.process(CommitteeMeeting(), meeting)
                mobj.save()

                mobj.bills.clear()
                for bill in meeting.xpath('bill'):
                    bill = Bill.objects.get(congress=bill.get("session"), bill_type=BillType.by_xml_code(bill.get("type")), number=int(bill.get("number")))
                    mobj.bills.add(bill)
            except Committee.DoesNotExist:
                log.error('Could not load Committee object for meeting %s' % meeting_processor.display_node(meeting))

        for committee in Committee.objects.all():
            if not options.disable_events:
                committee.create_events()

        File.objects.save_file(SCHEDULE_FILE)
def main(options):
    """
    Update Person and PersonRole models.

    Do safe update: touch only those records which have been changed.

    Merges the four congress-legislators YAML files into one record per
    GovTrack ID, then syncs Person rows and fuzzy-matches each person's
    terms against existing PersonRole rows (exact dates first, then
    progressively looser rules) so unchanged rows are left untouched.
    """
    BASE_PATH = CONGRESS_LEGISLATORS_PATH
    SRC_FILES = [
        'legislators-current', 'legislators-historical',
        'legislators-social-media', 'executive'
    ]  # order matters

    # Bail out early if no source file changed (for-else: `break` skips
    # the else clause).
    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        if not File.objects.is_changed(f) and not options.force:
            log.info('File %s was not changed' % f)
        else:
            # file modified...
            break
    else:
        # no 'break' ==> no files modified
        return

    # Start parsing.
    had_error = False

    # Get combined data: one merged dict per GovTrack ID.
    legislator_data = {}
    leg_id_map = {}
    for p in SRC_FILES:
        log.info('Opening %s...' % p)
        f = BASE_PATH + p + ".yaml"
        y = yaml_load(f)
        for m in y:
            if p == "legislators-current":
                # We know all terms but the last are non-current and the last is.
                for r in m["terms"]:
                    r["current"] = False
                m["terms"][-1]["current"] = True
            elif p == "legislators-historical":
                # We know all terms are non-current.
                for r in m["terms"]:
                    r["current"] = False

            if p != 'legislators-social-media':
                govtrack_id = m["id"].get("govtrack")
                # For the benefit of the social media file, make a mapping of IDs.
                for k, v in m["id"].items():
                    if type(v) != list:
                        leg_id_map[(k, v)] = govtrack_id
            else:
                # GovTrack IDs are not always listed in this file; look one
                # up via any other ID the record shares with earlier files.
                govtrack_id = None
                for k, v in m["id"].items():
                    if type(v) != list and (k, v) in leg_id_map:
                        govtrack_id = leg_id_map[(k, v)]
                        break

            if not govtrack_id:
                print("No GovTrack ID:")
                pprint.pprint(m)
                had_error = True
                continue

            if govtrack_id not in legislator_data:
                legislator_data[govtrack_id] = m
            elif p == "legislators-social-media":
                legislator_data[govtrack_id]["social"] = m["social"]
            elif p == "executive":
                # Legislators who became president/VP get their executive
                # terms appended to their legislative record.
                legislator_data[govtrack_id]["terms"].extend(m["terms"])
            else:
                raise ValueError("Duplication in an unexpected way (%d, %s)."
                                 % (govtrack_id, p))

    person_processor = PersonProcessor()
    role_processor = PersonRoleProcessor()

    existing_persons = set(Person.objects.values_list('pk', flat=True))
    processed_persons = set()
    created_persons = set()

    progress = Progress(total=len(legislator_data))
    log.info('Processing persons')
    for node in legislator_data.values():
        # Wrap each iteration in try/except
        # so that if some node breaks the parsing process
        # then other nodes could be parsed
        try:
            person = person_processor.process(Person(), node)

            # Create cached name strings. This is done again later
            # after the roles are updated.
            person.set_names()

            # Now try to load the person with such ID from
            # database. If found it then just update it
            # else create new Person object
            try:
                ex_person = Person.objects.get(pk=person.pk)
                if person_processor.changed(ex_person, person) or options.force:
                    # If the person has PK of existing record,
                    # coming in via the YAML-specified GovTrack ID,
                    # then Django ORM will update existing record
                    if not options.force:
                        log.warn("Updated %s" % person)
                    person.save()
            except Person.DoesNotExist:
                created_persons.add(person.pk)
                person.save()
                log.warn("Created %s" % person)

            processed_persons.add(person.pk)

            # Parse all of the roles.
            new_roles = []
            for termnode in node['terms']:
                role = role_processor.process(PersonRole(), termnode)
                role.person = person
                role.extra = filter_yaml_term_structure(termnode)  # copy in the whole YAML structure

                # Is this role current? For legislators, same as whether it
                # came from legislators-current, which eases Jan 3
                # transitions when we can't distinguish by date.
                if "current" in termnode:
                    role.current = termnode["current"]
                # But executives...
                else:
                    now = datetime.now().date()
                    role.current = role.startdate <= now and role.enddate >= now
                    # Because of date overlaps at noon transition dates,
                    # ensure that only the last term that covers today is
                    # current --- reset past roles to not current. Doesn't
                    # handle turning off retirning people tho.
                    for r in new_roles:
                        r.current = False

                # Scan for most recent leadership role within the time
                # period of this term, which isn't great for Senators
                # because it's likely it changed a few times within a term,
                # especially if there was a party switch.
                role.leadership_title = None
                for leadership_node in node.get("leadership_roles", []):
                    # must match on date and chamber
                    if leadership_node["start"] >= role.enddate.isoformat():
                        continue  # might start on the same day but is for the next Congress
                    if "end" in leadership_node and leadership_node["end"] <= role.startdate.isoformat():
                        continue  # might start on the same day but is for the previous Congress
                    if leadership_node["chamber"] != RoleType.by_value(role.role_type).congress_chamber.lower():
                        continue
                    role.leadership_title = leadership_node["title"]

                new_roles.append(role)

            # Try matching the new roles to existing db records. Since we
            # don't have a primry key in the source data, we have to match
            # on the record values. But because of errors in data, term
            # start/end dates can change, so matching has to be a little
            # fuzzy.
            existing_roles = list(PersonRole.objects.filter(person=person))
            matches = []

            def run_match_rule(rule):
                # Pair off new and existing roles that agree on
                # type/state and satisfy `rule`; matched roles are removed
                # from both pools so later (looser) rules can't reuse them.
                import itertools
                for new_role, existing_role in itertools.product(new_roles, existing_roles):
                    if new_role not in new_roles or existing_role not in existing_roles:
                        continue  # already matched on a previous iteration
                    if new_role.role_type != existing_role.role_type:
                        continue
                    if new_role.state != existing_role.state:
                        continue
                    if rule(new_role, existing_role):
                        matches.append((new_role, existing_role))
                        new_roles.remove(new_role)
                        existing_roles.remove(existing_role)

            # First match exactly, then exact on just one date, then on
            # contractions and expansions.
            run_match_rule(lambda new_role, existing_role: new_role.startdate == existing_role.startdate and new_role.enddate == existing_role.enddate)
            run_match_rule(lambda new_role, existing_role: new_role.startdate == existing_role.startdate or new_role.enddate == existing_role.enddate)
            run_match_rule(lambda new_role, existing_role: new_role.startdate >= existing_role.startdate and new_role.enddate <= existing_role.enddate)
            run_match_rule(lambda new_role, existing_role: new_role.startdate <= existing_role.startdate and new_role.enddate >= existing_role.enddate)

            # Update the database entries that correspond with records in
            # the data file.
            did_update_any = False
            for new_role, existing_role in matches:
                new_role.id = existing_role.id
                if role_processor.changed(existing_role, new_role) or options.force:
                    new_role.save()
                    did_update_any = True
                    if not options.force:
                        log.warn("Updated %s" % new_role)

            # If we have mutliple records on disk that didn't match and
            # multiple records in the database that didn't match, then we
            # don't know how to align them.
            if len(new_roles) > 0 and len(existing_roles) > 0:
                print(new_roles)
                print(existing_roles)
                raise Exception("There is an unmatched role.")

            # Otherwise if there are any unmatched new roles, we can just
            # add them.
            for role in new_roles:
                log.warn("Created %s" % role)
                role.save()
                did_update_any = True

            # And likewise for any existing roles that are left over.
            # NOTE(review): the unconditional raise makes the warn/delete
            # below unreachable -- a deliberate safety stop, apparently.
            for pr in existing_roles:
                print(pr.person.id, pr)
                raise ValueError("Deleted role??")
                log.warn("Deleted %s" % pr)
                pr.delete()

            if did_update_any and not options.disable_events:
                # Create the events for the roles after all have been
                # loaded because we don't create events for ends of terms
                # and starts of terms that are adjacent. Refresh the list
                # to get the roles in order.
                role_list = list(PersonRole.objects.filter(person=person).order_by('startdate'))
                for i in range(len(role_list)):
                    role_list[i].create_events(
                        role_list[i - 1] if i > 0 else None,
                        role_list[i + 1] if i < len(role_list) - 1 else None)

            # The name can't be determined until all of the roles are set.
            # If it changes, re-save. Unfortunately roles are cached so
            # this actually doesn't work yet. Re-run the parser to fix
            # names.
            nn = (person.name, person.sortname)
            if hasattr(person, "role"):
                delattr(person, "role")  # clear the cached info
            person._most_recent_role = None  # clear cache here too
            person.set_names()
            if nn != (person.name, person.sortname):
                log.warn("%s is now %s." % (nn[0], person.name))
                person.save()
        except Exception as ex:
            # Catch unexpected exceptions and log them
            pprint.pprint(node)
            log.error('', exc_info=ex)
            had_error = True

        progress.tick()

    log.info('Processed persons: %d' % len(processed_persons))
    log.info('Created persons: %d' % len(created_persons))

    if not had_error:
        # Remove person which were not found in XML file
        removed_persons = existing_persons - processed_persons
        for pk in removed_persons:
            p = Person.objects.get(pk=pk)
            if p.roles.all().count() > 0:
                log.warn("Missing? Deleted? %d: %s" % (p.id, p))
            else:
                # NOTE(review): the raise makes p.delete() unreachable --
                # another deliberate "won't delete" safety stop.
                log.warn("Deleting... %d: %s (remember to prune_index!)" % (p.id, p))
                raise Exception("Won't delete!")
                p.delete()
        log.info('Missing/deleted persons: %d' % len(removed_persons))

    # Mark the files as processed.
    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        File.objects.save_file(f)

    update_twitter_list()
def main(options):
    """
    Update Person and PersonRole models.

    Do safe update: touch only those records which have been changed.

    Legacy Python 2 variant: matches roles to existing rows by exact
    type/start/end (then type/start) rather than the fuzzy four-rule
    matcher used in the newer version of this parser.
    """
    BASE_PATH = CONGRESS_LEGISLATORS_PATH
    SRC_FILES = ['legislators-current', 'legislators-historical', 'legislators-social-media', 'executive'] # order matters

    # Bail out early if no source file changed (for-else: `break` skips
    # the else clause).
    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        if not File.objects.is_changed(f) and not options.force:
            log.info('File %s was not changed' % f)
        else:
            # file modified...
            break
    else:
        # no 'break' ==> no files modified
        return

    # Start parsing.
    had_error = False

    # Get combined data.
    legislator_data = { }
    leg_id_map = { }
    for p in SRC_FILES:
        log.info('Opening %s...' % p)
        f = BASE_PATH + p + ".yaml"
        y = yaml_load(f)
        for m in y:
            if p != 'legislators-social-media':
                govtrack_id = m["id"].get("govtrack")
                # For the benefit of the social media file, make a mapping of IDs.
                for k, v in m["id"].items():
                    if type(v) != list:
                        leg_id_map[(k,v)] = govtrack_id
            else:
                # GovTrack IDs are not always listed in this file.
                # NOTE(review): unlike the newer version of this parser,
                # govtrack_id is not reset to None here first -- if no ID
                # matches, it silently keeps the value from the previous
                # record. Confirm whether that is intended.
                for k, v in m["id"].items():
                    if type(v) != list and (k, v) in leg_id_map:
                        govtrack_id = leg_id_map[(k,v)]
                        break

            if not govtrack_id:
                print "No GovTrack ID:"
                pprint.pprint(m)
                had_error = True
                continue

            if govtrack_id not in legislator_data:
                legislator_data[govtrack_id] = m
            elif p == "legislators-social-media":
                legislator_data[govtrack_id]["social"] = m["social"]
            elif p == "executive":
                legislator_data[govtrack_id]["terms"].extend( m["terms"] )
            else:
                raise ValueError("Duplication in an unexpected way.")

    person_processor = PersonProcessor()
    role_processor = PersonRoleProcessor()

    existing_persons = set(Person.objects.values_list('pk', flat=True))
    processed_persons = set()
    created_persons = set()

    progress = Progress(total=len(legislator_data))
    log.info('Processing persons')
    for node in legislator_data.values():
        # Wrap each iteration in try/except
        # so that if some node breaks the parsing process
        # then other nodes could be parsed
        try:
            person = person_processor.process(Person(), node)

            # Create cached name strings. This is done again later
            # after the roles are updated.
            person.set_names()

            # Now try to load the person with such ID from
            # database. If found it then just update it
            # else create new Person object
            try:
                ex_person = Person.objects.get(pk=person.pk)
                if person_processor.changed(ex_person, person) or options.force:
                    # If the person has PK of existing record,
                    # coming in via the YAML-specified GovTrack ID,
                    # then Django ORM will update existing record
                    if not options.force:
                        log.warn("Updated %s" % person)
                    person.save()
            except Person.DoesNotExist:
                created_persons.add(person.pk)
                person.save()
                log.warn("Created %s" % person)

            processed_persons.add(person.pk)

            # Process roles of the person
            roles = list(PersonRole.objects.filter(person=person))
            existing_roles = set(PersonRole.objects.filter(person=person).values_list('pk', flat=True))
            processed_roles = set()
            role_list = []
            for role in node['terms']:
                role = role_processor.process(PersonRole(), role)
                role.person = person
                role.current = role.startdate <= datetime.now().date() and role.enddate >= datetime.now().date() # \
                    #and CURRENT_CONGRESS in role.congress_numbers()

                # Try to match this role with one already in the database.
                # First search for an exact match on type/start/end.
                ex_role = None
                for r in roles:
                    if role.role_type == r.role_type and r.startdate == role.startdate and r.enddate == role.enddate:
                        ex_role = r
                        break

                # Otherwise match on type/start only.
                if not ex_role:
                    for r in roles:
                        if role.role_type == r.role_type and r.startdate == role.startdate:
                            ex_role = r
                            break

                if ex_role:
                    # These roles correspond.
                    processed_roles.add(ex_role.id)
                    role.id = ex_role.id
                    if role_processor.changed(ex_role, role) or options.force:
                        role.save()
                        role_list.append(role)
                        if not options.force:
                            log.warn("Updated %s" % role)
                    roles.remove(ex_role) # don't need to try matching this to any other node
                else:
                    # Didn't find a matching role.
                    if len([r for r in roles if r.role_type == role.role_type]) > 0:
                        print role, "is one of these?"
                        for ex_role in roles:
                            print "\t", ex_role
                        raise Exception("There is an unmatched role.")
                    log.warn("Created %s" % role)
                    role.save()
                    role_list.append(role)

            # create the events for the roles after all have been loaded
            # because we don't create events for ends of terms and
            # starts of terms that are adjacent.
            if not options.disable_events:
                for i in xrange(len(role_list)):
                    role_list[i].create_events(
                        role_list[i-1] if i > 0 else None,
                        role_list[i+1] if i < len(role_list)-1 else None
                    )

            # NOTE(review): the unconditional raise makes the warn/delete
            # below unreachable -- a deliberate safety stop, apparently.
            removed_roles = existing_roles - processed_roles
            for pk in removed_roles:
                pr = PersonRole.objects.get(pk=pk)
                print pr.person.id, pr
                raise ValueError("Deleted role??")
                log.warn("Deleted %s" % pr)
                pr.delete()

            # The name can't be determined until all of the roles are set. If
            # it changes, re-save. Unfortunately roles are cached so this actually
            # doesn't work yet. Re-run the parser to fix names.
            nn = (person.name, person.sortname)
            if hasattr(person, "role"):
                delattr(person, "role") # clear the cached info
            person.set_names()
            if nn != (person.name, person.sortname):
                log.warn("%s is now %s." % (nn[0], person.name))
                person.save()
        except Exception, ex:
            # Catch unexpected exceptions and log them
            pprint.pprint(node)
            log.error('', exc_info=ex)
            had_error = True

        progress.tick()
def main(options):
    """
    Process committees, subcommittees and members of current congress
    committees.

    Legacy Python 2 variant. Syncs Committee rows from
    committees-current.yaml, CommitteeMember rows from
    committee-membership-current.yaml, and CommitteeMeeting rows from the
    per-chamber committee_meetings_*.json files.
    """
    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH
    meeting_processor = CommitteeMeetingProcessor()

    # ---- Committees and subcommittees ----
    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'
    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            # Update the existing Committee row or create a new one.
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print "New committee:", committee["thomas_id"]
                cobj = Committee(code=committee["thomas_id"])
            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None  # top-level committees have no parent
            cobj.jurisdiction = committee.get("jurisdiction")
            cobj.jurisdiction_link = committee.get("jurisdiction_source")
            cobj.save()
            seen_committees.add(cobj.id)
            for subcom in committee.get('subcommittees', []):
                # Subcommittee code = parent THOMAS id + subcommittee suffix.
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print "New subcommittee:", code
                    sobj = Committee(code=code)
                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                # NOTE(review): `type` is not the attribute name used above
                # (`committee_type`); this assignment may have no effect on
                # the saved row -- confirm against the model definition.
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)
            progress.tick()

        # Check for non-obsolete committees in the database that aren't in
        # our file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(id__in=seen_committees)
        if len(other_committees) > 0:
            print "Marking obsolete:", ", ".join(c.code for c in other_committees)
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)

    # ---- Committee membership ----
    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)
    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = {}
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]

        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')

        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print "Committee not found:", committee
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                # NOTE(review): raises KeyError if this member's THOMAS id
                # is absent from legislators-current.yaml -- after the
                # table has already been wiped above.
                mobj.person = Person.objects.get(id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()
            progress.tick()

        File.objects.save_file(MEMBERS_FILE)

    # ---- Committee meetings (per-chamber JSON) ----
    log.info('Processing committee schedule')
    for chamber in ("house", "senate"):
        meetings_file = 'data/congress/committee_meetings_%s.json' % chamber
        file_changed = File.objects.is_changed(meetings_file)
        if not file_changed and not options.force:
            log.info('File %s was not changed' % meetings_file)
        else:
            meetings = json.load(open(meetings_file))

            # Process committee event nodes
            for meeting in meetings:
                try:
                    # Associate it with an existing meeting object if GUID
                    # is already known.
                    # Must get it like this, vs just assigning the ID as we
                    # do in other parsers, because of the auto_now_add
                    # created field, which otherwise misbehaves.
                    try:
                        mobj = CommitteeMeeting.objects.get(guid=meeting['guid'])
                    except CommitteeMeeting.DoesNotExist:
                        mobj = CommitteeMeeting()

                    # Parse.
                    mobj = meeting_processor.process(mobj, meeting)

                    # Attach the meeting to the subcommittee if set.
                    if mobj.subcommittee:
                        mobj.committee = Committee.objects.get(code=mobj.committee.code + mobj.subcommittee)

                    mobj.save()

                    mobj.bills.clear()
                    for bill in meeting["bill_ids"]:
                        try:
                            # Bill ids look like "hr1234-112".
                            bill_type, bill_num, bill_cong = re.match(r"([a-z]+)(\d+)-(\d+)$", bill).groups()
                            bill = Bill.objects.get(congress=bill_cong, bill_type=BillType.by_slug(bill_type), number=int(bill_num))
                            mobj.bills.add(bill)
                        except AttributeError:
                            pass # regex failed
                        except common.enum.NotFound:
                            pass # invalid bill type code in source data
                        except Bill.DoesNotExist:
                            pass # we don't know about bill yet
                except Committee.DoesNotExist:
                    log.error('Could not load Committee object for meeting %s' % meeting_processor.display_node(meeting))

            for committee in Committee.objects.all():
                if not options.disable_events:
                    committee.create_events()

            File.objects.save_file(meetings_file)
def main(options):
    """
    Update Person and PersonRole models from the congress-legislators YAML
    files (current, historical, social media, executive).

    Do safe update: touch only those records which have been changed.
    """
    BASE_PATH = CONGRESS_LEGISLATORS_PATH
    SRC_FILES = ['legislators-current', 'legislators-historical', 'legislators-social-media', 'executive'] # order matters

    # Skip the whole update if none of the source files changed
    # (unless --force was given).
    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        if not File.objects.is_changed(f) and not options.force:
            log.info('File %s was not changed' % f)
        else:
            # file modified...
            break
    else:
        # no 'break' ==> no files modified
        return

    # Start parsing.
    had_error = False

    # Get combined data: merge the per-file records into one dict keyed on
    # GovTrack ID.
    legislator_data = { }
    leg_id_map = { }
    for p in SRC_FILES:
        log.info('Opening %s...' % p)
        f = BASE_PATH + p + ".yaml"
        y = yaml_load(f)
        for m in y:
            if p != 'legislators-social-media':
                govtrack_id = m["id"].get("govtrack")

                # For the benefit of the social media file, make a mapping of IDs.
                for k, v in m["id"].items():
                    if not isinstance(v, list):
                        leg_id_map[(k, v)] = govtrack_id
            else:
                # GovTrack IDs are not always listed in this file.
                govtrack_id = None
                for k, v in m["id"].items():
                    if not isinstance(v, list) and (k, v) in leg_id_map:
                        govtrack_id = leg_id_map[(k, v)]
                        break

            if not govtrack_id:
                print("No GovTrack ID:")
                pprint.pprint(m)
                had_error = True
                continue

            if govtrack_id not in legislator_data:
                legislator_data[govtrack_id] = m
            elif p == "legislators-social-media":
                legislator_data[govtrack_id]["social"] = m["social"]
            elif p == "executive":
                legislator_data[govtrack_id]["terms"].extend(m["terms"])
            else:
                raise ValueError("Duplication in an unexpected way (%d, %s)." % (govtrack_id, p))

    person_processor = PersonProcessor()
    role_processor = PersonRoleProcessor()

    existing_persons = set(Person.objects.values_list('pk', flat=True))
    processed_persons = set()
    created_persons = set()

    progress = Progress(total=len(legislator_data))
    log.info('Processing persons')

    for node in legislator_data.values():
        # Wrap each iteration in try/except
        # so that if some node breaks the parsing process
        # then other nodes could be parsed
        try:
            person = person_processor.process(Person(), node)

            # Create cached name strings. This is done again later
            # after the roles are updated.
            person.set_names()

            # Now try to load the person with such ID from
            # database. If found it then just update it
            # else create new Person object
            try:
                ex_person = Person.objects.get(pk=person.pk)
                if person_processor.changed(ex_person, person) or options.force:
                    # If the person has PK of existing record,
                    # coming in via the YAML-specified GovTrack ID,
                    # then Django ORM will update existing record
                    if not options.force:
                        log.warn("Updated %s" % person)
                    person.save()
            except Person.DoesNotExist:
                created_persons.add(person.pk)
                person.save()
                log.warn("Created %s" % person)

            processed_persons.add(person.pk)

            # Process roles of the person
            roles = list(PersonRole.objects.filter(person=person))
            existing_roles = set(PersonRole.objects.filter(person=person).values_list('pk', flat=True))
            processed_roles = set()
            role_list = []
            today = datetime.now().date() # hoisted: loop-invariant
            for role in node['terms']:
                role = role_processor.process(PersonRole(), role)
                role.person = person
                role.current = role.startdate <= today and role.enddate >= today

                # Scan for most recent leadership role within the time period of this term,
                # which isn't great for Senators because it's likely it changed a few times
                # within a term, especially if there was a party switch.
                role.leadership_title = None
                for leadership_node in node.get("leadership_roles", []):
                    # must match on date and chamber
                    if leadership_node["start"] >= role.enddate.isoformat():
                        continue # might start on the same day but is for the next Congress
                    if "end" in leadership_node and leadership_node["end"] <= role.startdate.isoformat():
                        continue # might start on the same day but is for the previous Congress
                    if leadership_node["chamber"] != RoleType.by_value(role.role_type).congress_chamber.lower():
                        continue
                    role.leadership_title = leadership_node["title"]

                # Try to match this role with one already in the database.
                # First search for an exact match on type/start/end.
                ex_role = None
                for r in roles:
                    if role.role_type == r.role_type and r.startdate == role.startdate and r.enddate == role.enddate:
                        ex_role = r
                        break

                # Otherwise match on type/start only.
                if not ex_role:
                    for r in roles:
                        if role.role_type == r.role_type and r.startdate == role.startdate:
                            ex_role = r
                            break

                if ex_role:
                    # These roles correspond.
                    processed_roles.add(ex_role.id)
                    role.id = ex_role.id
                    if role_processor.changed(ex_role, role) or options.force:
                        role.save()
                        role_list.append(role)
                        if not options.force:
                            log.warn("Updated %s" % role)
                    roles.remove(ex_role) # don't need to try matching this to any other node
                else:
                    # Didn't find a matching role.
                    if len([r for r in roles if r.role_type == role.role_type]) > 0:
                        print(role, "is one of these?")
                        for ex_role in roles:
                            print("\t", ex_role)
                        raise Exception("There is an unmatched role.")
                    log.warn("Created %s" % role)
                    role.save()
                    role_list.append(role)

            # create the events for the roles after all have been loaded
            # because we don't create events for ends of terms and
            # starts of terms that are adjacent.
            if not options.disable_events:
                for i in range(len(role_list)):
                    role_list[i].create_events(
                        role_list[i-1] if i > 0 else None,
                        role_list[i+1] if i < len(role_list)-1 else None
                    )

            # Database roles that were not matched above would be deleted ---
            # but that is unexpected, so raise instead of deleting.
            removed_roles = existing_roles - processed_roles
            for pk in removed_roles:
                pr = PersonRole.objects.get(pk=pk)
                print(pr.person.id, pr)
                raise ValueError("Deleted role??")
                # NOTE: intentionally unreachable --- deletion is disabled
                # until the condition above is understood.
                log.warn("Deleted %s" % pr)
                pr.delete()

            # The name can't be determined until all of the roles are set. If
            # it changes, re-save. Unfortunately roles are cached so this actually
            # doesn't work yet. Re-run the parser to fix names.
            nn = (person.name, person.sortname)
            if hasattr(person, "role"):
                delattr(person, "role") # clear the cached info
            person.set_names()
            if nn != (person.name, person.sortname):
                log.warn("%s is now %s." % (nn[0], person.name))
                person.save()

        except Exception as ex:
            # Catch unexpected exceptions and log them
            pprint.pprint(node)
            log.error('', exc_info=ex)
            had_error = True

        progress.tick()
def main(options):
    """
    Process committees, subcommittees and members of current congress
    committees, then import the committee meeting schedule.
    """
    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH
    meeting_processor = CommitteeMeetingProcessor()

    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'

    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            # Update or create the parent committee record.
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print("New committee:", committee["thomas_id"])
                cobj = Committee(code=committee["thomas_id"])

            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None
            cobj.jurisdiction = committee.get("jurisdiction")
            cobj.jurisdiction_link = committee.get("jurisdiction_source")
            cobj.save()
            seen_committees.add(cobj.id)

            # Subcommittee codes are the parent code plus the subcommittee's
            # own thomas_id suffix.
            for subcom in committee.get('subcommittees', []):
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print("New subcommittee:", code)
                    sobj = Committee(code=code)

                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                # NOTE(review): parent committees set committee_type above;
                # 'type' here may be a stray attribute --- confirm against the
                # Committee model.
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)

            progress.tick()

        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(id__in=seen_committees)
        if len(other_committees) > 0:
            print("Marking obsolete:", ", ".join(c.code for c in other_committees))
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)

    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)

    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = { }
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]

        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')

        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print("Committee not found:", committee)
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()

            progress.tick()

        File.objects.save_file(MEMBERS_FILE)

    log.info('Processing committee schedule')
    for chamber in ("house", "senate"):
        meetings_file = 'data/congress/committee_meetings_%s.json' % chamber
        file_changed = File.objects.is_changed(meetings_file)
        if not file_changed and not options.force:
            log.info('File %s was not changed' % meetings_file)
        else:
            # Close the file handle promptly (the original leaked it via
            # json.load(open(...))).
            with open(meetings_file) as fp:
                meetings = json.load(fp)

            # Process committee event nodes
            for meeting in meetings:
                try:
                    # Associate it with an existing meeting object if GUID is already known.
                    # Must get it like this, vs just assigning the ID as we do in other parsers,
                    # because of the auto_now_add created field, which otherwise misbehaves.
                    try:
                        mobj = CommitteeMeeting.objects.get(guid=meeting['guid'])
                    except CommitteeMeeting.DoesNotExist:
                        mobj = CommitteeMeeting()

                    # Parse.
                    mobj = meeting_processor.process(mobj, meeting)

                    # Attach the meeting to the subcommittee if set.
                    if mobj.subcommittee:
                        mobj.committee = Committee.objects.get(code=mobj.committee.code + mobj.subcommittee)

                    mobj.save()

                    mobj.bills.clear()
                    for bill in meeting["bill_ids"]:
                        try:
                            bill_type, bill_num, bill_cong = re.match(r"([a-z]+)(\d+)-(\d+)$", bill).groups()
                            bill = Bill.objects.get(congress=bill_cong, bill_type=BillType.by_slug(bill_type), number=int(bill_num))
                            mobj.bills.add(bill)
                        except AttributeError:
                            pass # regex failed
                        except common.enum.NotFound:
                            pass # invalid bill type code in source data
                        except Bill.DoesNotExist:
                            pass # we don't know about bill yet

                except Committee.DoesNotExist:
                    log.error('Could not load Committee object for meeting %s' % meeting_processor.display_node(meeting))

            for committee in Committee.objects.all():
                if not options.disable_events:
                    committee.create_events()

            File.objects.save_file(meetings_file)
def main(options): """ Update Person and PersonRole models. Do safe update: touch only those records which have been changed. """ BASE_PATH = CONGRESS_LEGISLATORS_PATH SRC_FILES = ['legislators-current', 'legislators-historical', 'legislators-social-media', 'executive'] # order matters for p in SRC_FILES: f = BASE_PATH + p + ".yaml" if not File.objects.is_changed(f) and not options.force: log.info('File %s was not changed' % f) else: # file modified... break else: # no 'break' ==> no files modified return # Start parsing. had_error = False # Get combined data. legislator_data = { } leg_id_map = { } for p in SRC_FILES: log.info('Opening %s...' % p) f = BASE_PATH + p + ".yaml" y = yaml_load(f) for m in y: if p == "legislators-current": # We know all terms but the last are non-current and the last is. for r in m["terms"]: r["current"] = False m["terms"][-1]["current"] = True elif p == "legislators-historical": # We know all terms are non-current. for r in m["terms"]: r["current"] = False if p != 'legislators-social-media': govtrack_id = m["id"].get("govtrack") # For the benefit of the social media file, make a mapping of IDs. for k, v in m["id"].items(): if type(v) != list: leg_id_map[(k,v)] = govtrack_id else: # GovTrack IDs are not always listed in this file. govtrack_id = None for k, v in m["id"].items(): if type(v) != list and (k, v) in leg_id_map: govtrack_id = leg_id_map[(k,v)] break if not govtrack_id: print "No GovTrack ID:" pprint.pprint(m) had_error = True continue if govtrack_id not in legislator_data: legislator_data[govtrack_id] = m elif p == "legislators-social-media": legislator_data[govtrack_id]["social"] = m["social"] elif p == "executive": legislator_data[govtrack_id]["terms"].extend( m["terms"] ) else: raise ValueError("Duplication in an unexpected way (%d, %s)." 
% (govtrack_id, p)) person_processor = PersonProcessor() role_processor = PersonRoleProcessor() existing_persons = set(Person.objects.values_list('pk', flat=True)) processed_persons = set() created_persons = set() progress = Progress(total=len(legislator_data)) log.info('Processing persons') for node in legislator_data.values(): # Wrap each iteration in try/except # so that if some node breaks the parsing process # then other nodes could be parsed try: person = person_processor.process(Person(), node) # Create cached name strings. This is done again later # after the roles are updated. person.set_names() # Now try to load the person with such ID from # database. If found it then just update it # else create new Person object try: ex_person = Person.objects.get(pk=person.pk) if person_processor.changed(ex_person, person) or options.force: # If the person has PK of existing record, # coming in via the YAML-specified GovTrack ID, # then Django ORM will update existing record if not options.force: log.warn("Updated %s" % person) person.save() except Person.DoesNotExist: created_persons.add(person.pk) person.save() log.warn("Created %s" % person) processed_persons.add(person.pk) # Parse all of the roles. new_roles = [] for termnode in node['terms']: role = role_processor.process(PersonRole(), termnode) role.person = person role.extra = filter_yaml_term_structure(termnode) # copy in the whole YAML structure # Is this role current? For legislators, same as whether it came from legislators-current, which eases Jan 3 transitions when we can't distinguish by date. if "current" in termnode: role.current = termnode["current"] # But executives... else: now = datetime.now().date() role.current = role.startdate <= now and role.enddate >= now # Because of date overlaps at noon transition dates, ensure that only the last term that covers # today is current --- reset past roles to not current. Doesn't handle turning off retirning people tho. 
for r in new_roles: r.current = False # Scan for most recent leadership role within the time period of this term, # which isn't great for Senators because it's likely it changed a few times # within a term, especially if there was a party switch. role.leadership_title = None for leadership_node in node.get("leadership_roles", []): # must match on date and chamber if leadership_node["start"] >= role.enddate.isoformat(): continue # might start on the same day but is for the next Congress if "end" in leadership_node and leadership_node["end"] <= role.startdate.isoformat(): continue # might start on the same day but is for the previous Congress if leadership_node["chamber"] != RoleType.by_value(role.role_type).congress_chamber.lower(): continue role.leadership_title = leadership_node["title"] new_roles.append(role) # Try matching the new roles to existing db records. Since we don't have a primry key # in the source data, we have to match on the record values. But because of errors in data, # term start/end dates can change, so matching has to be a little fuzzy. existing_roles = list(PersonRole.objects.filter(person=person)) matches = [] def run_match_rule(rule): import itertools for new_role, existing_role in itertools.product(new_roles, existing_roles): if new_role not in new_roles or existing_role not in existing_roles: continue # already matched on a previous iteration if new_role.role_type != existing_role.role_type: continue if new_role.state != existing_role.state: continue if rule(new_role, existing_role): matches.append((new_role, existing_role)) new_roles.remove(new_role) existing_roles.remove(existing_role) # First match exactly, then exact on just one date, then on contractions and expansions. 
run_match_rule(lambda new_role, existing_role : new_role.startdate == existing_role.startdate and new_role.enddate == existing_role.enddate) run_match_rule(lambda new_role, existing_role : new_role.startdate == existing_role.startdate or new_role.enddate == existing_role.enddate) run_match_rule(lambda new_role, existing_role : new_role.startdate >= existing_role.startdate and new_role.enddate <= existing_role.enddate) run_match_rule(lambda new_role, existing_role : new_role.startdate <= existing_role.startdate and new_role.enddate >= existing_role.enddate) # Update the database entries that correspond with records in the data file. did_update_any = False for new_role, existing_role in matches: new_role.id = existing_role.id if role_processor.changed(existing_role, new_role) or options.force: new_role.save() did_update_any = True if not options.force: log.warn("Updated %s" % new_role) # If we have mutliple records on disk that didn't match and multiple records in the database # that didn't match, then we don't know how to align them. if len(new_roles) > 0 and len(existing_roles) > 0: print(new_roles) print(existing_roles) raise Exception("There is an unmatched role.") # Otherwise if there are any unmatched new roles, we can just add them. for role in new_roles: log.warn("Created %s" % role) role.save() did_update_any = True # And likewise for any existing roles that are left over. for pr in existing_roles: print pr.person.id, pr raise ValueError("Deleted role??") log.warn("Deleted %s" % pr) pr.delete() if did_update_any and not options.disable_events: # Create the events for the roles after all have been loaded # because we don't create events for ends of terms and # starts of terms that are adjacent. Refresh the list to get # the roles in order. 
role_list = list(PersonRole.objects.filter(person=person).order_by('startdate')) for i in xrange(len(role_list)): role_list[i].create_events( role_list[i-1] if i > 0 else None, role_list[i+1] if i < len(role_list)-1 else None ) # The name can't be determined until all of the roles are set. If # it changes, re-save. Unfortunately roles are cached so this actually # doesn't work yet. Re-run the parser to fix names. nn = (person.name, person.sortname) if hasattr(person, "role"): delattr(person, "role") # clear the cached info person.set_names() if nn != (person.name, person.sortname): log.warn("%s is now %s." % (nn[0], person.name)) person.save() except Exception, ex: # Catch unexpected exceptions and log them pprint.pprint(node) log.error('', exc_info=ex) had_error = True progress.tick()