def import_event(data):
    """Insert a scraped event or update the stored event it matches.

    An existing event is located first by the scraper-provided '_guid',
    then (failing that) by the combination of when/end/type/description
    within the same level.  Newly inserted records are stamped with
    created_at/updated_at.
    """
    data = normalize_dates(data)
    level = data[settings.LEVEL_FIELD]

    existing = None
    if '_guid' in data:
        existing = db.events.find_one({
            settings.LEVEL_FIELD: level,
            '_guid': data['_guid'],
        })

    if not existing:
        existing = db.events.find_one({
            settings.LEVEL_FIELD: level,
            'when': data['when'],
            'end': data['end'],
            'type': data['type'],
            'description': data['description'],
        })

    data = apply_filters(filters, data)

    if existing:
        update(existing, data, db.events)
    else:
        now = datetime.datetime.utcnow()
        data['created_at'] = now
        data['updated_at'] = now
        _insert_with_id(data)
def import_event(data):
    """Create or refresh one event record.

    Lookup order: the scraper GUID first, then the event's natural key
    (when, end, type, description), both scoped to the level field.
    """
    data = normalize_dates(data)
    scope = {settings.LEVEL_FIELD: data[settings.LEVEL_FIELD]}

    event = None
    if "_guid" in data:
        event = db.events.find_one(dict(scope, _guid=data["_guid"]))

    if not event:
        natural_key = dict(scope)
        for field in ("when", "end", "type", "description"):
            natural_key[field] = data[field]
        event = db.events.find_one(natural_key)

    data = apply_filters(filters, data)

    if not event:
        # brand-new event: stamp both timestamps with the same instant
        stamp = datetime.datetime.utcnow()
        data["created_at"] = stamp
        data["updated_at"] = stamp
        _insert_with_id(data)
    else:
        update(event, data, db.events)
def import_event(data):
    """Import one event: update the stored copy if found, else insert.

    Matching prefers the '_guid' the scraper assigned; when that is
    absent or misses, fall back to when/end/type/description.
    """
    data = normalize_dates(data)

    def _find(extra):
        # every lookup is scoped to this jurisdiction's level field
        query = {settings.LEVEL_FIELD: data[settings.LEVEL_FIELD]}
        query.update(extra)
        return db.events.find_one(query)

    event = _find({'_guid': data['_guid']}) if '_guid' in data else None
    if not event:
        event = _find({'when': data['when'], 'end': data['end'],
                       'type': data['type'],
                       'description': data['description']})

    data = apply_filters(filters, data)

    if event:
        update(event, data, db.events)
    else:
        data['created_at'] = datetime.datetime.utcnow()
        data['updated_at'] = data['created_at']
        _insert_with_id(data)
def import_bill(data, standalone_votes, categorizer):
    """
        insert or update a bill

        data - raw bill JSON
        standalone_votes - votes scraped separately, a dict keyed on
            (chamber, session, bill_id) tuples; matching entries are
            popped off and merged into the bill's votes
        categorizer - SubjectCategorizer (None - no categorization)

        returns "insert" if no prior copy of the bill was found,
        otherwise "update"
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [
            fix_bill_id(bid) for bid in data['alternate_bill_ids']
        ]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions: attach the db _id of each companion bill when it exists
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])
        # query based on companion
        spec = companion.copy()
        spec[settings.LEVEL_FIELD] = abbr
        if not spec['chamber']:
            spec.pop('chamber')
        companion_obj = db.bills.find_one(spec)
        if companion_obj:
            companion['internal_id'] = companion_obj['_id']
        else:
            logger.warning(
                'Unknown companion: {chamber} {session} {bill_id}'.format(
                    **companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({
        settings.LEVEL_FIELD: abbr,
        'session': data['session'],
        'chamber': data['chamber'],
        'bill_id': data['bill_id']
    })

    # keep doc ids consistent across re-imports
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    match_sponsor_ids(abbr, data)

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # this is a hack initially added for Rhode Island where we can't
        # determine the full bill_id, if this key is in the metadata
        # we just use the numeric portion, not ideal as it won't work
        # where HB/SBs overlap, but in RI they never do
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes += standalone_votes.pop(
            (data['chamber'], data['session'], numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes += standalone_votes.pop(
            (data['chamber'], data['session'], data['bill_id']), [])

    # do id matching and other vote prep
    if bill:
        prepare_votes(abbr, data['session'], bill['_id'], bill_votes)
    else:
        prepare_votes(abbr, data['session'], None, bill_votes)

    # process actions ###########

    dates = {
        'first': None,
        'last': None,
        'passed_upper': None,
        'passed_lower': None,
        'signed': None
    }

    # action types eligible for linking to a roll-call vote
    vote_flags = {
        "bill:passed",
        "bill:failed",
        "bill:veto_override:passed",
        "bill:veto_override:failed",
        "amendment:passed",
        "amendment:failed",
        "committee:passed",
        "committee:passed:favorable",
        "committee:passed:unfavorable",
        "committee:passed:failed"
    }
    already_linked = set()   # vote_ids already attached to some action
    remove_vote = set()      # vote_ids attached to >1 action; unlinked below

    for action in data['actions']:
        adate = action['date']

        # closures are rebuilt each iteration so they see this action's actor
        def _match_committee(name):
            return get_committee_id(abbr, action['actor'], name)

        def _match_legislator(name):
            return get_legislator_id(abbr, data['session'], action['actor'],
                                     name)

        resolvers = {
            "committee": _match_committee,
            "legislator": _match_legislator
        }

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolver = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                # NOTE(review): 'id' shadows the builtin; harmless here
                id = resolver(entity['name'])
                entity['id'] = id

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates (only the first occurrence of each is kept)
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
                and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

        # vote-action matching
        action_attached = False
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue

                # a vote in the same chamber within 20 hours counts as a match
                delta = abs(vote['date'] - action['date'])
                if (delta < datetime.timedelta(hours=20) and
                        vote['chamber'] == action['actor']):
                    if action_attached:
                        # multiple votes match, we can't guess
                        action.pop('related_votes', None)
                    else:
                        related_vote = vote['vote_id']
                        if related_vote in already_linked:
                            remove_vote.add(related_vote)

                        already_linked.add(related_vote)
                        action['related_votes'] = [related_vote]
                        action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for vote in remove_vote:
            if vote in action.get('related_votes', []):
                action['related_votes'].remove(vote)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)
    data = apply_filters(filters, data)

    if not bill:
        insert_with_id(data)
        elasticsearch_push(data)
        git_add_bill(data)
        save_votes(data, bill_votes)
        return "insert"
    else:
        update(bill, data, db.bills)
        elasticsearch_push(bill)
        git_add_bill(bill)
        save_votes(bill, bill_votes)
        return "update"
def import_legislator(data):
    """Insert or update a single scraped legislator record.

    Matching strategy, tried in order against the stored collection:
      1. same _scraped_name with an active role matching the scraped
         role spec (type/term and, when present, district/chamber)
      2. same _scraped_name with a matching role in old_roles for the
         scraped term
      3. same _scraped_name with a matching active role from any term

    Returns "update" when an existing record was merged, else "insert".
    """
    data = prepare_obj(data)
    if data.get('_scraped_name') is None:
        data['_scraped_name'] = data['full_name']

    # Rename 'role' -> 'type'
    for role in data['roles']:
        if 'role' in role:
            role['type'] = role.pop('role')

        # copy over LEVEL_FIELD into role
        if settings.LEVEL_FIELD in data:
            role[settings.LEVEL_FIELD] = data[settings.LEVEL_FIELD]

    # the first scraped role is used for matching; presumably it is the
    # legislator's current role — TODO confirm against the scrapers
    scraped_role = data['roles'][0]
    scraped_term = scraped_role['term']
    abbr = data[settings.LEVEL_FIELD]

    spec = {
        settings.LEVEL_FIELD: abbr,
        'type': scraped_role['type'],
        'term': scraped_term
    }
    if 'district' in scraped_role:
        spec['district'] = scraped_role['district']
    if 'chamber' in scraped_role:
        spec['chamber'] = scraped_role['chamber']

    # find matching legislator in current term
    leg = db.legislators.find_one({
        settings.LEVEL_FIELD: abbr,
        '_scraped_name': data['_scraped_name'],
        'roles': {
            '$elemMatch': spec
        }
    })

    # legislator with a matching old_role
    if not leg:
        # the term is implied by the old_roles key, so drop it from spec
        spec.pop('term')
        leg = db.legislators.find_one({
            settings.LEVEL_FIELD: abbr,
            '_scraped_name': data['_scraped_name'],
            'old_roles.%s' % scraped_term: {
                '$elemMatch': spec
            }
        })

        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})

            # put scraped roles into their old_roles
            data['old_roles'][scraped_term] = data['roles']
            data['roles'] = leg['roles']  # don't overwrite their current roles

    # active matching legislator from different term
    if not leg:
        spec.pop('term', None)
        leg = db.legislators.find_one({
            settings.LEVEL_FIELD: abbr,
            '_scraped_name': data['_scraped_name'],
            'roles': {
                '$elemMatch': spec
            }
        })

        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})

            # scraped_term < leg's term: archive the scraped roles and keep
            # the stored (newer) roles current
            if term_older_than(abbr, scraped_term, leg['roles'][0]['term']):
                # move scraped roles into old_roles
                data['old_roles'][scraped_term] = data['roles']
                data['roles'] = leg['roles']
            else:
                # scraped data is newer: archive the stored roles instead
                data['old_roles'][leg['roles'][0]['term']] = leg['roles']

    data = apply_filters(filters, data)
    if leg:
        update(leg, data, db.legislators)
        return "update"
    else:
        insert_with_id(data)
        return "insert"
def import_bill(data, standalone_votes, categorizer):
    """
        insert or update a bill

        data - raw bill JSON
        standalone_votes - votes scraped separately, a dict keyed on
            (chamber, session, bill_id) tuples; matching entries are
            popped off and merged into the bill's votes
        categorizer - SubjectCategorizer (None - no categorization)

        returns "insert" when no prior copy of the bill exists in the
        database, "update" otherwise
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [fix_bill_id(bid) for bid in
                                      data['alternate_bill_ids']]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions: resolve each companion spec to a stored bill's _id
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])
        # query based on companion
        spec = companion.copy()
        spec[settings.LEVEL_FIELD] = abbr
        if not spec['chamber']:
            spec.pop('chamber')
        companion_obj = db.bills.find_one(spec)
        if companion_obj:
            companion['internal_id'] = companion_obj['_id']
        else:
            logger.warning('Unknown companion: {chamber} {session} {bill_id}'
                           .format(**companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({settings.LEVEL_FIELD: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep doc ids consistent between runs
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    match_sponsor_ids(abbr, data)

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # this is a hack initially added for Rhode Island where we can't
        # determine the full bill_id, if this key is in the metadata
        # we just use the numeric portion, not ideal as it won't work
        # where HB/SBs overlap, but in RI they never do
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes += standalone_votes.pop((data['chamber'], data['session'],
                                            numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes += standalone_votes.pop((data['chamber'], data['session'],
                                            data['bill_id']), [])

    # do id matching and other vote prep
    if bill:
        prepare_votes(abbr, data['session'], bill['_id'], bill_votes)
    else:
        prepare_votes(abbr, data['session'], None, bill_votes)

    # process actions ###########

    dates = {'first': None, 'last': None, 'passed_upper': None,
             'passed_lower': None, 'signed': None}

    # only actions of these types may be linked to a vote
    vote_flags = {
        "bill:passed",
        "bill:failed",
        "bill:veto_override:passed",
        "bill:veto_override:failed",
        "amendment:passed",
        "amendment:failed",
        "committee:passed",
        "committee:passed:favorable",
        "committee:passed:unfavorable",
        "committee:passed:failed"
    }
    already_linked = set()   # vote_ids already attached to an action
    remove_vote = set()      # vote_ids linked to >1 action; stripped below

    for action in data['actions']:
        adate = action['date']

        # closures are re-created for each action so they capture the
        # current action's actor
        def _match_committee(name):
            return get_committee_id(abbr, action['actor'], name)

        def _match_legislator(name):
            return get_legislator_id(abbr, data['session'], action['actor'],
                                     name)

        resolvers = {
            "committee": _match_committee,
            "legislator": _match_legislator
        }

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolver = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                # NOTE(review): 'id' shadows the builtin; harmless here
                id = resolver(entity['name'])
                entity['id'] = id

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates (first matching action wins)
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
                and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

        # vote-action matching
        action_attached = False
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue

                # same-chamber votes within 20 hours are treated as matches
                delta = abs(vote['date'] - action['date'])
                if (delta < datetime.timedelta(hours=20) and
                        vote['chamber'] == action['actor']):
                    if action_attached:
                        # multiple votes match, we can't guess
                        action.pop('related_votes', None)
                    else:
                        related_vote = vote['vote_id']
                        if related_vote in already_linked:
                            remove_vote.add(related_vote)

                        already_linked.add(related_vote)
                        action['related_votes'] = [related_vote]
                        action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for vote in remove_vote:
            if vote in action.get('related_votes', []):
                action['related_votes'].remove(vote)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # add/update tracked_versions collection
        track_version(data, version)

        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)
    data = apply_filters(filters, data)

    if not bill:
        insert_with_id(data)
        git_add_bill(data)
        save_votes(data, bill_votes)
        return "insert"
    else:
        update(bill, data, db.bills)
        git_add_bill(bill)
        save_votes(bill, bill_votes)
        return "update"
def import_legislator(data):
    """Import one scraped legislator, merging with any stored record.

    Three lookups are attempted in turn, all keyed on _scraped_name:
    an active role matching the scraped role spec, an old_roles entry
    for the scraped term, and finally an active role from another term.
    When a match is found from a different term, whichever side's roles
    are older are moved into old_roles.

    Returns "update" if a stored record was merged, "insert" otherwise.
    """
    data = prepare_obj(data)
    if data.get('_scraped_name') is None:
        data['_scraped_name'] = data['full_name']

    # Rename 'role' -> 'type'
    for role in data['roles']:
        if 'role' in role:
            role['type'] = role.pop('role')

        # copy over LEVEL_FIELD into role
        if settings.LEVEL_FIELD in data:
            role[settings.LEVEL_FIELD] = data[settings.LEVEL_FIELD]

    # matching is driven off the first scraped role; presumably this is
    # the current role — TODO confirm against scraper output
    scraped_role = data['roles'][0]
    scraped_term = scraped_role['term']
    abbr = data[settings.LEVEL_FIELD]

    spec = {settings.LEVEL_FIELD: abbr,
            'type': scraped_role['type'],
            'term': scraped_term}
    if 'district' in scraped_role:
        spec['district'] = scraped_role['district']
    if 'chamber' in scraped_role:
        spec['chamber'] = scraped_role['chamber']

    # find matching legislator in current term
    leg = db.legislators.find_one(
        {settings.LEVEL_FIELD: abbr,
         '_scraped_name': data['_scraped_name'],
         'roles': {'$elemMatch': spec}})

    # legislator with a matching old_role
    if not leg:
        # term is encoded in the old_roles key, so remove it from spec
        spec.pop('term')
        leg = db.legislators.find_one({
            settings.LEVEL_FIELD: abbr,
            '_scraped_name': data['_scraped_name'],
            'old_roles.%s' % scraped_term: {'$elemMatch': spec}
        })

        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})

            # put scraped roles into their old_roles
            data['old_roles'][scraped_term] = data['roles']
            data['roles'] = leg['roles']  # don't overwrite their current roles

    # active matching legislator from different term
    if not leg:
        spec.pop('term', None)
        leg = db.legislators.find_one(
            {settings.LEVEL_FIELD: abbr,
             '_scraped_name': data['_scraped_name'],
             'roles': {'$elemMatch': spec}})

        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})

            # scraped_term < leg's term: the stored roles stay current
            if term_older_than(abbr, scraped_term, leg['roles'][0]['term']):
                # move scraped roles into old_roles
                data['old_roles'][scraped_term] = data['roles']
                data['roles'] = leg['roles']
            else:
                # scraped roles are newer: archive the stored ones
                data['old_roles'][leg['roles'][0]['term']] = leg['roles']

    data = apply_filters(filters, data)
    if leg:
        update(leg, data, db.legislators)
        return "update"
    else:
        insert_with_id(data)
        return "insert"
from billy.core import db
from billy_settings import LEGISLATOR_FILTERS
from billy.importers.filters import apply_filters
from dictdiffer import diff

# One-off audit script: report which stored legislators would be changed
# by the current LEGISLATOR_FILTERS, without writing anything back.
filters = LEGISLATOR_FILTERS

for leg in db.legislators.find():
    original = leg
    # apply_filters may mutate its argument, so filter a copy and diff
    # the result against the untouched original document
    filtered = apply_filters(filters, leg.copy())
    changes = list(diff(original, filtered))
    if changes:
        # FIX: the original used a Python 2 print statement
        # (`print leg['_id'], changes`), a syntax error under Python 3;
        # this form produces identical output on both 2 and 3
        print('%s %s' % (original['_id'], changes))