def test_fix_bill_id():
    """All punctuation/zero-padding variants of 'A.B. 74' normalize to 'AB 74'."""
    canonical = "AB 74"
    variants = ("A.B. 74", "A.B.74", "AB74", "AB 0074",
                "AB074", "A.B.074", "A.B. 074", "A.B\t074")
    for variant in variants:
        assert utils.fix_bill_id(variant) == canonical
    # hyphenated ids keep their numeric suffix intact
    assert utils.fix_bill_id("PR19-0041") == "PR 19-0041"
def test_fix_bill_id():
    """fix_bill_id collapses dots, tabs and zero padding into the 'AB 74' form."""
    expected = 'AB 74'
    raw_ids = ['A.B. 74', 'A.B.74', 'AB74', 'AB 0074',
               'AB074', 'A.B.074', 'A.B. 074', 'A.B\t074']
    assert all(utils.fix_bill_id(raw) == expected for raw in raw_ids)
    # dashed ids keep everything after the dash
    assert utils.fix_bill_id('PR19-0041') == 'PR 19-0041'
def bill(request, abbr, session, bill_id):
    """Render the detail page for a single bill.

    URL bill ids are the canonical id with spaces stripped; any other
    spelling is 301'd to the canonical URL before rendering.
    Raises Http404 when no bill matches (abbr, session, bill_id).
    """
    # get fixed version
    fixed_bill_id = fix_bill_id(bill_id)

    # redirect if URL's id isn't fixed id without spaces
    if fixed_bill_id.replace(' ', '') != bill_id:
        return redirect('bill', abbr=abbr, session=session,
                        bill_id=fixed_bill_id.replace(' ', ''))

    bill = db.bills.find_one({'state': abbr, 'session': session,
                              'bill_id': fixed_bill_id})
    if bill is None:
        raise Http404('no bill found {0} {1} {2}'.format(abbr, session,
                                                         bill_id))

    # events that reference this bill, most recent first
    events = db.events.find({
        "state": abbr,
        "related_bills.bill_id": bill['_id']
    }).sort("when", -1)

    # record a page view for popularity tracking
    popularity.counter.inc('bills', bill['_id'], abbr=abbr, session=session)

    # by default only the first fifteen sponsors are shown
    show_all_sponsors = request.GET.get('show_all_sponsors')
    if show_all_sponsors:
        sponsors = bill.sponsors_manager
    else:
        sponsors = bill.sponsors_manager.first_fifteen

    return render(request, templatename('bill'),
                  dict(vote_preview_row_template=templatename('vote_preview_row'),
                       abbr=abbr,
                       metadata=Metadata.get_object(abbr),
                       bill=bill,
                       events=events,
                       show_all_sponsors=show_all_sponsors,
                       sponsors=sponsors,
                       sources=bill['sources'],
                       statenav_active='bills'))
def test_fix_bill_id():
    """Every spelling variant of 'AB 74' is normalized identically."""
    for raw in ('A.B. 74', 'A.B.74', 'AB74', 'AB 0074',
                'AB074', 'A.B.074', 'A.B. 074', 'A.B\t074'):
        assert utils.fix_bill_id(raw) == 'AB 74'
def import_events(abbr, data_dir, import_actions=False):
    """Import scraped event JSON for `abbr`, resolving participants and bills.

    For each events/*.json file: resolve committee/legislator participants
    to database ids, replace related-bill ids with database _ids (None when
    unmatched), then hand the event to import_event().  Rebuilds indexes
    when done.  `import_actions` is currently unused here.
    """
    data_dir = os.path.join(data_dir, abbr)
    pattern = os.path.join(data_dir, "events", "*.json")

    for path in glob.iglob(pattern):
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # NOTE: these closures capture `data` from the current loop
        # iteration; they are rebuilt (and used) once per file.
        def _resolve_ctty(committee):
            return get_committee_id(data[settings.LEVEL_FIELD],
                                    committee["chamber"],
                                    committee["participant"])

        def _resolve_leg(leg):
            # only 'upper'/'lower' are meaningful chambers for lookup
            chamber = leg["chamber"] if leg["chamber"] in ["upper", "lower"] else None
            return get_legislator_id(abbr, data["session"], chamber,
                                     leg["participant"])

        resolvers = {"committee": _resolve_ctty, "legislator": _resolve_leg}

        for entity in data["participants"]:
            type = entity["participant_type"]
            id = None
            if type in resolvers:
                id = resolvers[type](entity)
            else:
                logger.warning("I don't know how to resolve a %s" % type)
            entity["id"] = id

        for bill in data["related_bills"]:
            # keep the raw scraped id around before normalizing
            bill["_scraped_bill_id"] = bill["bill_id"]
            bill_id = bill["bill_id"]
            bill_id = fix_bill_id(bill_id)
            bill["bill_id"] = ""
            db_bill = db.bills.find_one(
                {
                    "$or": [
                        {settings.LEVEL_FIELD: abbr, "session": data["session"], "bill_id": bill_id},
                        {settings.LEVEL_FIELD: abbr, "session": data["session"], "alternate_bill_ids": bill_id},
                    ]
                }
            )
            if not db_bill:
                logger.warning("Error: Can't find %s" % bill_id)
                db_bill = {}
                db_bill["_id"] = None
            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            bill["bill_id"] = db_bill["_id"]

        import_event(data)
    ensure_indexes()
def import_votes(state, data_dir):
    """Import standalone vote JSON files and merge them onto existing bills.

    Each vote is matched against an already-imported bill by state, chamber,
    session and normalized bill_id; unmatched votes are logged and skipped.
    A vote with the same motion and date as an existing one updates it in
    place, otherwise it is appended to the bill's vote list.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'votes', '*.json')
    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id, needs to match the one already in the database
        data['bill_id'] = fix_bill_id(data['bill_id'])

        bill = db.bills.find_one({'state': state,
                                  'chamber': data['bill_chamber'],
                                  'session': data['session'],
                                  'bill_id': data['bill_id']})
        if not bill:
            _log.warning("Couldn't find bill %s" % data['bill_id'])
            continue

        # bill_id/filename are lookup keys, not vote payload
        del data['bill_id']
        try:
            del data['filename']
        except KeyError:
            pass

        # resolve voter names to legislator ids
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            svlist = []
            for svote in data[vtype]:
                id = get_legislator_id(state, data['session'],
                                       data['chamber'], svote)
                svlist.append({'name': svote, 'leg_id': id})
            data[vtype] = svlist

        # update a matching existing vote in place, else append
        # (for/else: the else runs only when no break occurred)
        for vote in bill['votes']:
            if (vote['motion'] == data['motion'] and
                vote['date'] == data['date']):
                vote.update(data)
                break
        else:
            bill['votes'].append(data)

        db.bills.save(bill, safe=True)

    print 'imported %s vote files' % len(paths)
def import_votes(state, data_dir):
    """Load standalone vote JSON files, grouped for later bill matching.

    Returns a dict mapping (bill_chamber, session, normalized bill_id) to
    the list of vote dicts scraped for that bill.  NOTE(review): unlike
    other import_* functions here, `state` is not joined onto `data_dir`;
    confirm callers pass the state-specific directory.
    """
    pattern = os.path.join(data_dir, 'votes', '*.json')
    paths = glob.glob(pattern)

    votes = defaultdict(list)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # need to match bill_id already in the database
        bill_id = fix_bill_id(data.pop('bill_id'))

        votes[(data['bill_chamber'], data['session'],
               bill_id)].append(data)

    print 'imported %s vote files' % len(paths)

    return votes
def import_votes(data_dir):
    """Load scraped vote JSON files and group them for bill matching.

    Returns a dict keyed by (bill_chamber, session, normalized bill_id),
    each value being the list of vote dicts scraped for that bill.
    """
    vote_paths = glob.glob(os.path.join(data_dir, "votes", "*.json"))

    grouped = defaultdict(list)
    for vote_path in vote_paths:
        with open(vote_path) as fh:
            vote = prepare_obj(json.load(fh))
        # normalize the id so it matches the bill already in the database
        key = (vote["bill_chamber"], vote["session"],
               fix_bill_id(vote.pop("bill_id")))
        grouped[key].append(vote)

    logger.info("imported %s vote files" % len(vote_paths))
    return grouped
def import_events(abbr, data_dir, import_actions=False):
    """Import scraped event JSON for `abbr` (state-keyed schema variant).

    Resolves committee participants to committee ids and related-bill ids
    to database _ids (None when the bill can't be found), then passes each
    event to import_event().  Rebuilds indexes at the end.
    `import_actions` is currently unused here.
    """
    data_dir = os.path.join(data_dir, abbr)
    pattern = os.path.join(data_dir, 'events', '*.json')
    for path in glob.iglob(pattern):
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # attach committee ids where we can resolve them
        for committee in data['participants']:
            cttyid = get_committee_id(data['level'], data['state'],
                                      committee['participant'],
                                      committee['chamber'])
            if cttyid:
                committee['committee_id'] = cttyid

        for bill in data['related_bills']:
            # keep the raw scraped id before normalizing
            bill['_scraped_bill_id'] = bill['bill_id']
            bill_id = bill['bill_id']
            bill_id = fix_bill_id(bill_id)
            bill['bill_id'] = ""
            # match on either the primary or an alternate bill id
            db_bill = db.bills.find_one({
                "$or": [
                    {
                        "state": abbr,
                        'session': data['session'],
                        'bill_id': bill_id
                    },
                    {
                        "state": abbr,
                        'session': data['session'],
                        'alternate_bill_ids': bill_id
                    }
                ]
            })

            if not db_bill:
                logger.warning("Error: Can't find %s" % bill_id)
                db_bill = {}
                db_bill['_id'] = None

            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            bill['bill_id'] = db_bill['_id']

        import_event(data)
    ensure_indexes()
def func(request, abbr, session, bill_id, key):
    """Render the 'bill_all_<key>' page for a single bill.

    Redirects to the canonical space-free bill id when the URL uses any
    other spelling; raises Http404 when the bill does not exist.
    """
    canonical_id = fix_bill_id(bill_id)
    compact_id = canonical_id.replace(' ', '')

    # canonicalize the URL: bill ids in URLs are the fixed id sans spaces
    if compact_id != bill_id:
        return redirect('bill', abbr=abbr, session=session,
                        bill_id=compact_id)

    bill = db.bills.find_one({'state': abbr, 'session': session,
                              'bill_id': canonical_id})
    if bill is None:
        raise Http404('no bill found {0} {1} {2}'.format(abbr, session,
                                                         bill_id))

    context = dict(abbr=abbr,
                   metadata=Metadata.get_object(abbr),
                   bill=bill,
                   sources=bill['sources'],
                   statenav_active='bills')
    return render(request, templatename('bill_all_%s' % key), context)
def search_by_bill_id(abbr, search_text):
    '''Find bills with ids like "HB1234".

    Returns the matching documents sorted most-recent-session first, or
    None when the input contains no digit or nothing matches.
    '''
    spec = {}

    # If the input looks like a bill id, try to fetch the bill.
    if re.search(r'\d', search_text):
        bill_id = fix_bill_id(search_text).upper()
        collection = db.bills
        spec.update(bill_id=bill_id)
        if abbr != 'all':
            spec['state'] = abbr
        docs = collection.find(spec)

        # Do a regex search if the input consists solely of digits.
        if 0 == docs.count():
            spec['bill_id'] = {'$regex': bill_id}
            docs = collection.find(spec)

        # If there were actual results, return a bill_id result view.
        if 0 < docs.count():
            def sortkey(doc):
                # sort numerically by the last 4-digit year found in the
                # session name, falling back to the raw session string
                session = doc['session']
                years = re.findall(r'\d{4}', session)
                try:
                    return int(years[-1])
                except IndexError:
                    return session

            # BUG FIX: sortkey was defined but unused — sorting fell back
            # to a lexical compare of the raw session strings.
            docs = sorted(docs, key=sortkey, reverse=True)
            return docs
def import_bills(state, data_dir):
    """Import scraped bill JSON for `state`, merging in standalone votes.

    Builds a session->term map from metadata, attaches loaded votes and
    resolved legislator/committee ids, merges version titles into
    alternate_titles, then inserts or updates each bill.  Unmatched vote
    files are reported at the end.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, "bills", "*.json")

    meta = db.metadata.find_one({"_id": state})

    # Build a session to term mapping
    sessions = {}
    for term in meta["terms"]:
        for session in term["sessions"]:
            sessions[session] = term["name"]

    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data["bill_id"] = fix_bill_id(data["bill_id"])

        # move subjects to scraped_subjects
        subjects = data.pop("subjects", None)
        # NOTE: intentionally doesn't copy blank lists of subjects
        # this avoids the problem where a bill is re-run but we can't
        # get subjects anymore (quite common in fact)
        if subjects:
            data["scraped_subjects"] = subjects

        # add loaded votes to data
        bill_votes = votes.pop((data["chamber"], data["session"],
                                data["bill_id"]), [])
        data["votes"].extend(bill_votes)

        bill = db.bills.find_one(
            {"state": data["state"], "session": data["session"],
             "chamber": data["chamber"], "bill_id": data["bill_id"]}
        )

        # keep vote ids stable across re-imports
        vote_matcher = VoteMatcher(data["state"])
        if bill:
            vote_matcher.learn_vote_ids(bill["votes"])
        vote_matcher.set_vote_ids(data["votes"])

        # match sponsor leg_ids
        for sponsor in data["sponsors"]:
            id = get_legislator_id(state, data["session"], None,
                                   sponsor["name"])
            sponsor["leg_id"] = id

        for vote in data["votes"]:
            # committee_ids
            if "committee" in vote:
                committee_id = get_committee_id(state, vote["chamber"],
                                                vote["committee"])
                vote["committee_id"] = committee_id

            # vote leg_ids
            for vtype in ("yes_votes", "no_votes", "other_votes"):
                svlist = []
                for svote in vote[vtype]:
                    id = get_legislator_id(state, data["session"],
                                           vote["chamber"], svote)
                    svlist.append({"name": svote, "leg_id": id})
                vote[vtype] = svlist

        data["_term"] = sessions[data["session"]]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get("alternate_titles", []))
        for version in data["versions"]:
            if "title" in version:
                alt_titles.add(version["title"])
            if "+short_title" in version:
                alt_titles.add(version["+short_title"])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data["title"])
        except KeyError:
            pass
        data["alternate_titles"] = list(alt_titles)

        if not bill:
            data["_keywords"] = list(bill_keywords(data))
            insert_with_id(data)
        else:
            data["_keywords"] = list(bill_keywords(data))
            update(bill, data, db.bills)

    print "imported %s bill files" % len(paths)

    # anything left in `votes` never matched a bill
    for remaining in votes.keys():
        print "Failed to match vote %s %s %s" % tuple([
            r.encode("ascii", "replace") for r in remaining])

    populate_current_fields(state)
    ensure_indexes()
def import_bill(data, votes, categorizer):
    """Insert or update a single bill document.

    data - raw bill JSON (mutated in place)
    votes - dict of standalone votes keyed by (chamber, session, bill_id)
    categorizer - subject categorizer, or None to skip categorization

    Returns "insert" or "update" depending on whether a prior version of
    the bill existed.
    """
    level = data['level']
    abbr = data[level]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [fix_bill_id(bid) for bid in
                                      data['alternate_bill_ids']]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # this is a hack added for Rhode Island where we can't
    # determine the full bill_id, if this key is in the metadata
    # we just use the numeric portion, not ideal as it won't work
    # in states where HB/SBs overlap, but in RI they never do
    if metadata(abbr).get('_partial_vote_bill_id'):
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes = votes.pop((data['chamber'], data['session'],
                                numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes = votes.pop((data['chamber'], data['session'],
                                data['bill_id']), [])

    data['votes'].extend(bill_votes)

    bill = db.bills.find_one({'level': level, level: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep vote/doc ids consistent
    vote_matcher = VoteMatcher(abbr)
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        vote_matcher.learn_ids(bill['votes'])
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    vote_matcher.set_ids(data['votes'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    for sponsor in data['sponsors']:
        id = get_legislator_id(abbr, data['session'], None,
                               sponsor['name'])
        sponsor['leg_id'] = id
        if id is None:
            # sponsor may be a committee rather than a legislator
            cid = get_committee_id(level, abbr, data['chamber'],
                                   sponsor['name'])
            if not cid is None:
                sponsor['committee_id'] = cid

    # process votes
    for vote in data['votes']:
        # committee_ids
        if 'committee' in vote:
            committee_id = get_committee_id(level, abbr, vote['chamber'],
                                            vote['committee'])
            vote['committee_id'] = committee_id

        # vote leg_ids
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            svlist = []
            for svote in vote[vtype]:
                id = get_legislator_id(abbr, data['session'],
                                       vote['chamber'], svote)
                svlist.append({'name': svote, 'leg_id': id})
            vote[vtype] = svlist

    # process actions
    dates = {'first': None, 'last': None, 'passed_upper': None,
             'passed_lower': None, 'signed': None}
    for action in data['actions']:
        # We'll try to recover some Committee IDs here.
        if "committee" in action:
            cid = get_committee_id(level, abbr, data['chamber'],
                                   action['committee'])
            action['_scraped_committee_name'] = action['committee']
            if cid is not None:
                action['committee'] = cid
            else:
                del(action['committee'])

        adate = action['date']
        # first & last
        # NOTE(review): elif means an action that updates 'first' can
        # never also update 'last' in the same pass — confirm intended.
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        elif not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
                and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))
    for version in data['versions']:
        # push versions to oyster
        if settings.ENABLE_OYSTER and 'url' in version:
            oysterize_version(data, version)

        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)

    if not bill:
        bill_id = insert_with_id(data)
        denormalize_votes(data, bill_id)
        return "insert"
    else:
        update(bill, data, db.bills)
        denormalize_votes(data, bill['_id'])
        return "update"
def import_bills(state, data_dir):
    """Import scraped bill JSON for `state`, merging in standalone votes.

    Attaches loaded votes, resolves legislator/committee ids, merges
    version titles into alternate_titles, then inserts or updates each
    bill.  Unmatched vote files are reported at the end.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})

    # Build a session to term mapping
    sessions = {}
    for term in meta['terms']:
        for session in term['sessions']:
            sessions[session] = term['name']

    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # move subjects to scraped_subjects
        subjects = data.pop('subjects', None)
        # NOTE: intentionally doesn't copy blank lists of subjects
        # this avoids the problem where a bill is re-run but we can't
        # get subjects anymore (quite common in fact)
        if subjects:
            data['scraped_subjects'] = subjects

        # add loaded votes to data
        bill_votes = votes.pop((data['chamber'], data['session'],
                                data['bill_id']), [])
        data['votes'].extend(bill_votes)

        bill = db.bills.find_one({'state': data['state'],
                                  'session': data['session'],
                                  'chamber': data['chamber'],
                                  'bill_id': data['bill_id']})

        # keep vote ids stable across re-imports
        vote_matcher = VoteMatcher(data['state'])
        if bill:
            vote_matcher.learn_vote_ids(bill['votes'])
        vote_matcher.set_vote_ids(data['votes'])

        # match sponsor leg_ids
        for sponsor in data['sponsors']:
            id = get_legislator_id(state, data['session'], None,
                                   sponsor['name'])
            sponsor['leg_id'] = id

        for vote in data['votes']:
            # committee_ids
            if 'committee' in vote:
                committee_id = get_committee_id(state, vote['chamber'],
                                                vote['committee'])
                vote['committee_id'] = committee_id

            # vote leg_ids
            for vtype in ('yes_votes', 'no_votes', 'other_votes'):
                svlist = []
                for svote in vote[vtype]:
                    id = get_legislator_id(state, data['session'],
                                           vote['chamber'], svote)
                    svlist.append({'name': svote, 'leg_id': id})
                vote[vtype] = svlist

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get('alternate_titles', []))
        for version in data['versions']:
            if 'title' in version:
                alt_titles.add(version['title'])
            if '+short_title' in version:
                alt_titles.add(version['+short_title'])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data['title'])
        except KeyError:
            pass
        data['alternate_titles'] = list(alt_titles)

        if not bill:
            data['_keywords'] = list(bill_keywords(data))
            insert_with_id(data)
        else:
            data['_keywords'] = list(bill_keywords(data))
            update(bill, data, db.bills)

    print 'imported %s bill files' % len(paths)

    # anything left in `votes` never matched a bill
    for remaining in votes.keys():
        print 'Failed to match vote %s %s %s' % tuple([
            r.encode('ascii', 'replace') for r in remaining])

    populate_current_fields(state)
    ensure_indexes()
def import_bill(data, votes, categorizer):
    """Insert or update a single bill document.

    data - raw bill JSON (mutated in place)
    votes - dict of standalone votes keyed by (chamber, session, bill_id)
    categorizer - subject categorizer, or None to skip categorization

    Returns "insert" or "update" depending on whether a prior version of
    the bill existed.
    """
    level = data["level"]
    abbr = data[level]

    # clean up bill_ids
    data["bill_id"] = fix_bill_id(data["bill_id"])
    if "alternate_bill_ids" in data:
        data["alternate_bill_ids"] = [fix_bill_id(bid) for bid in
                                      data["alternate_bill_ids"]]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop("subjects", None)
    if subjects:
        data["scraped_subjects"] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # this is a hack added for Rhode Island where we can't
    # determine the full bill_id, if this key is in the metadata
    # we just use the numeric portion, not ideal as it won't work
    # in states where HB/SBs overlap, but in RI they never do
    if metadata(abbr).get("_partial_vote_bill_id"):
        # pull off numeric portion of bill_id
        numeric_bill_id = data["bill_id"].split()[1]
        bill_votes = votes.pop((data["chamber"], data["session"],
                                numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes = votes.pop((data["chamber"], data["session"],
                                data["bill_id"]), [])

    data["votes"].extend(bill_votes)

    bill = db.bills.find_one(
        {
            "level": level,
            level: abbr,
            "session": data["session"],
            "chamber": data["chamber"],
            "bill_id": data["bill_id"],
        }
    )

    # keep vote/doc ids consistent
    vote_matcher = VoteMatcher(abbr)
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        vote_matcher.learn_ids(bill["votes"])
        doc_matcher.learn_ids(bill["versions"] + bill["documents"])
    vote_matcher.set_ids(data["votes"])
    doc_matcher.set_ids(data["versions"] + data["documents"])

    # match sponsor leg_ids
    for sponsor in data["sponsors"]:
        id = get_legislator_id(abbr, data["session"], None,
                               sponsor["name"])
        sponsor["leg_id"] = id

    for vote in data["votes"]:
        # committee_ids
        if "committee" in vote:
            committee_id = get_committee_id(level, abbr, vote["chamber"],
                                            vote["committee"])
            vote["committee_id"] = committee_id

        # vote leg_ids
        for vtype in ("yes_votes", "no_votes", "other_votes"):
            svlist = []
            for svote in vote[vtype]:
                id = get_legislator_id(abbr, data["session"],
                                       vote["chamber"], svote)
                svlist.append({"name": svote, "leg_id": id})
            vote[vtype] = svlist

    data["_term"] = term_for_session(abbr, data["session"])

    alt_titles = set(data.get("alternate_titles", []))
    for version in data["versions"]:
        # push versions to oyster
        if settings.ENABLE_OYSTER and "url" in version:
            oysterize_version(data, version)

        # Merge any version titles into the alternate_titles list
        if "title" in version:
            alt_titles.add(version["title"])
        if "+short_title" in version:
            alt_titles.add(version["+short_title"])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data["title"])
    except KeyError:
        pass
    data["alternate_titles"] = list(alt_titles)

    if not bill:
        insert_with_id(data)
        return "insert"
    else:
        update(bill, data, db.bills)
        return "update"
def scrape_senate_vote(self, bill, url):
    """Scrape a senate roll-call PDF at `url` and attach the Vote to `bill`.

    Downloads the PDF, extracts its text, parses date/time, the
    YEAS/NAYS/EXCUSED/NOT VOTING tallies and individual voter names, then
    takes the motion text from two lines below the bill id.  Logs and
    returns early (adding nothing) when any required piece is missing.
    """
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, 'text')
    os.remove(path)

    lines = text.split('\n')

    date_match = re.search(r'Date:\s+(\d+/\d+/\d+)', text)
    if not date_match:
        self.log("Couldn't find date on %s" % url)
        return

    time_match = re.search(r'Time:\s+(\d+:\d+:\d+)\s+(AM|PM)', text)
    # BUG FIX: a missing time previously crashed with AttributeError on
    # time_match.group(); bail out cleanly like the date case instead.
    if not time_match:
        self.log("Couldn't find time on %s" % url)
        return

    date = "%s %s %s" % (date_match.group(1), time_match.group(1),
                         time_match.group(2))
    date = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M:%S %p")
    date = self._tz.localize(date)

    vote_type = None
    yes_count, no_count, other_count = None, None, 0
    votes = []
    # tallies and voter names start after the fixed 21-line PDF header
    for line in lines[21:]:
        line = line.strip()
        if not line:
            continue

        if line.startswith('YEAS'):
            yes_count = int(line.split(' - ')[1])
            vote_type = 'yes'
        elif line.startswith('NAYS'):
            no_count = int(line.split(' - ')[1])
            vote_type = 'no'
        elif line.startswith('EXCUSED') or line.startswith('NOT VOTING'):
            other_count += int(line.split(' - ')[1])
            vote_type = 'other'
        else:
            # voter names are separated by runs of 2+ spaces and belong
            # to the most recently seen tally section
            votes.extend([(n.strip(), vote_type)
                          for n in re.split(r'\s{2,}', line)])

    if yes_count is None or no_count is None:
        # BUG FIX: message previously read "Couldne't"
        self.log("Couldn't find vote counts in %s" % url)
        return

    passed = yes_count > no_count + other_count

    clean_bill_id = fix_bill_id(bill['bill_id'])

    # the motion text appears two lines below the (last) bill id line
    motion_line = None
    for i, line in enumerate(lines):
        if line.strip() == clean_bill_id:
            motion_line = i + 2

    # BUG FIX: when the bill id never appeared, lines[None] raised
    # TypeError; treat it the same as an empty motion.
    if motion_line is None:
        self.log("Couldn't find motion for %s" % url)
        return

    motion = lines[motion_line]
    if not motion:
        self.log("Couldn't find motion for %s" % url)
        return

    vote = Vote('upper', date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    insert_specific_votes(vote, votes)
    check_vote_counts(vote)

    bill.add_vote(vote)
def import_bill(data, standalone_votes, categorizer):
    """
    insert or update a bill

    data - raw bill JSON
    standalone_votes - votes scraped separately
    categorizer - SubjectCategorizer (None - no categorization)

    Returns "insert" or "update" depending on whether a prior version of
    the bill existed.
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [fix_bill_id(bid) for bid in
                                      data['alternate_bill_ids']]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions: resolve each companion bill to its database _id
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])
        # query based on companion
        spec = companion.copy()
        spec[settings.LEVEL_FIELD] = abbr
        if not spec['chamber']:
            spec.pop('chamber')
        companion_obj = db.bills.find_one(spec)
        if companion_obj:
            companion['internal_id'] = companion_obj['_id']
        else:
            logger.warning('Unknown companion: {chamber} {session} {bill_id}'
                           .format(**companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({settings.LEVEL_FIELD: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep doc ids consistent
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    for sponsor in data['sponsors']:
        # use sponsor's chamber if specified
        id = get_legislator_id(abbr, data['session'],
                               sponsor.get('chamber'),
                               sponsor['name'])
        sponsor['leg_id'] = id
        if id is None:
            # sponsor may be a committee rather than a legislator
            cid = get_committee_id(abbr, data['chamber'], sponsor['name'])
            if not cid is None:
                sponsor['committee_id'] = cid

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # this is a hack initially added for Rhode Island where we can't
        # determine the full bill_id, if this key is in the metadata
        # we just use the numeric portion, not ideal as it won't work
        # where HB/SBs overlap, but in RI they never do
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes += standalone_votes.pop((data['chamber'],
                                            data['session'],
                                            numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes += standalone_votes.pop((data['chamber'],
                                            data['session'],
                                            data['bill_id']), [])

    # do id matching and other vote prep
    if bill:
        prepare_votes(abbr, data['session'], bill['_id'], bill_votes)
    else:
        prepare_votes(abbr, data['session'], None, bill_votes)

    # process actions ###########

    dates = {'first': None, 'last': None, 'passed_upper': None,
             'passed_lower': None, 'signed': None}

    # action types that may correspond to a recorded vote
    vote_flags = {
        "bill:passed",
        "bill:failed",
        "bill:veto_override:passed",
        "bill:veto_override:failed",
        "amendment:passed",
        "amendment:failed",
        "committee:passed",
        "committee:passed:favorable",
        "committee:passed:unfavorable",
        "committee:passed:failed"
    }
    already_linked = set()
    remove_vote = set()

    for action in data['actions']:
        adate = action['date']

        # NOTE: these closures capture `action` from the current loop
        # iteration; they are rebuilt (and used) once per action.
        def _match_committee(name):
            return get_committee_id(abbr, action['actor'], name)

        def _match_legislator(name):
            return get_legislator_id(abbr,
                                     data['session'],
                                     action['actor'],
                                     name)

        resolvers = {
            "committee": _match_committee,
            "legislator": _match_legislator
        }

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolver = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                id = resolver(entity['name'])
                entity['id'] = id

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
                and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

        # vote-action matching
        action_attached = False
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue

                # a vote within ~20h in the same chamber counts as a match
                delta = abs(vote['date'] - action['date'])
                if (delta < datetime.timedelta(hours=20) and
                        vote['chamber'] == action['actor']):
                    if action_attached:
                        # multiple votes match, we can't guess
                        action.pop('related_votes', None)
                    else:
                        related_vote = vote['vote_id']
                        if related_vote in already_linked:
                            remove_vote.add(related_vote)

                        already_linked.add(related_vote)
                        action['related_votes'] = [related_vote]
                        action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for vote in remove_vote:
            if vote in action.get('related_votes', []):
                action['related_votes'].remove(vote)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))
    for version in data['versions']:
        # push versions to oyster
        if settings.ENABLE_OYSTER and 'url' in version:
            oysterize_version(data, version)

        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)

    data = apply_filters(filters, data)

    if not bill:
        bill_id = insert_with_id(data)
        git_add_bill(data)
        save_votes(data, bill_votes)
        return "insert"
    else:
        git_add_bill(bill)
        update(bill, data, db.bills)
        save_votes(bill, bill_votes)
        return "update"
def import_bill(data, votes, categorizer):
    """Insert or update a single bill document.

    data - raw bill JSON (mutated in place)
    votes - dict of standalone votes keyed by (chamber, session, bill_id)
    categorizer - subject categorizer, or None to skip categorization

    Returns "insert" or "update" depending on whether a prior version of
    the bill existed.
    """
    level = data['level']
    abbr = data[level]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [
            fix_bill_id(bid) for bid in data['alternate_bill_ids']
        ]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # this is a hack added for Rhode Island where we can't
    # determine the full bill_id, if this key is in the metadata
    # we just use the numeric portion, not ideal as it won't work
    # in states where HB/SBs overlap, but in RI they never do
    if metadata(abbr).get('_partial_vote_bill_id'):
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes = votes.pop(
            (data['chamber'], data['session'], numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes = votes.pop(
            (data['chamber'], data['session'], data['bill_id']), [])

    data['votes'].extend(bill_votes)

    bill = db.bills.find_one({
        'level': level,
        level: abbr,
        'session': data['session'],
        'chamber': data['chamber'],
        'bill_id': data['bill_id']
    })

    # keep vote/doc ids consistent
    vote_matcher = VoteMatcher(abbr)
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        vote_matcher.learn_ids(bill['votes'])
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    vote_matcher.set_ids(data['votes'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    for sponsor in data['sponsors']:
        id = get_legislator_id(abbr, data['session'], None,
                               sponsor['name'])
        sponsor['leg_id'] = id

    for vote in data['votes']:
        # committee_ids
        if 'committee' in vote:
            committee_id = get_committee_id(level, abbr, vote['chamber'],
                                            vote['committee'])
            vote['committee_id'] = committee_id

        # vote leg_ids
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            svlist = []
            for svote in vote[vtype]:
                id = get_legislator_id(abbr, data['session'],
                                       vote['chamber'], svote)
                svlist.append({'name': svote, 'leg_id': id})
            vote[vtype] = svlist

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))
    for version in data['versions']:
        # push versions to oyster
        if settings.ENABLE_OYSTER and 'url' in version:
            oysterize_version(data, version)

        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)

    if not bill:
        insert_with_id(data)
        return "insert"
    else:
        update(bill, data, db.bills)
        return "update"
def import_bills(state, data_dir):
    """Import scraped bill JSON for `state` (no standalone-vote merging).

    Resolves legislator/committee ids on sponsors and votes, maps each
    session to its term, merges version titles into alternate_titles,
    then inserts or updates each bill.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})

    # Build a session to term mapping
    sessions = {}
    for term in meta['terms']:
        for session in term['sessions']:
            sessions[session] = term['name']

    paths = glob.glob(pattern)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # move subjects to scraped_subjects (blank lists intentionally
        # not copied, matching the other importers)
        subjects = data.pop('subjects', None)
        if subjects:
            data['scraped_subjects'] = subjects

        bill = db.bills.find_one({'state': data['state'],
                                  'session': data['session'],
                                  'chamber': data['chamber'],
                                  'bill_id': data['bill_id']})

        # match sponsor leg_ids
        for sponsor in data['sponsors']:
            id = get_legislator_id(state, data['session'], None,
                                   sponsor['name'])
            sponsor['leg_id'] = id

        for vote in data['votes']:
            # committee ids
            if 'committee' in vote:
                committee_id = get_committee_id(state, vote['chamber'],
                                                vote['committee'])
                vote['committee_id'] = committee_id

            # resolve voter names to legislator ids
            for vtype in ('yes_votes', 'no_votes', 'other_votes'):
                svlist = []
                for svote in vote[vtype]:
                    id = get_legislator_id(state, data['session'],
                                           vote['chamber'], svote)
                    svlist.append({'name': svote, 'leg_id': id})
                vote[vtype] = svlist

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get('alternate_titles', []))
        for version in data['versions']:
            if 'title' in version:
                alt_titles.add(version['title'])
            if '+short_title' in version:
                alt_titles.add(version['+short_title'])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data['title'])
        except KeyError:
            pass
        data['alternate_titles'] = list(alt_titles)

        if not bill:
            data['_keywords'] = list(bill_keywords(data))
            insert_with_id(data)
        else:
            data['_keywords'] = list(bill_keywords(data))
            update(bill, data, db.bills)

    print 'imported %s bill files' % len(paths)

    populate_current_fields(state)
    ensure_indexes()
def search(request, scope):
    """Free-text search over bills and legislators.

    If the query contains a digit it is treated as a bill id and matching
    bills are returned sorted newest-session-first; otherwise bill titles
    and legislator names are searched (Elasticsearch when enabled, regex
    over Mongo otherwise).  ``scope`` is either ``'all'`` or a state
    abbreviation that restricts the search.
    """
    abbr = None
    search_text = request.GET['q']
    scope_name = None
    spec = {}

    # If the input looks like a bill id, try to fetch the bill.
    if re.search(r'\d', search_text):
        bill_id = fix_bill_id(search_text).upper()
        collection = db.bills
        spec.update(bill_id=bill_id)
        if scope != 'all':
            abbr = scope
        docs = collection.find(spec, limit=10)

        # If there were actual results, return a bill_id result view.
        if 0 < docs.count():

            def sortkey(doc):
                # Sort on the last 4-digit year in the session name;
                # fall back to the raw session string when none exists.
                session = doc['session']
                years = re.findall(r'\d{4}', session)
                try:
                    return int(years[-1])
                except IndexError:
                    return session

            # Bug fix: the original sorted on the raw 'session' string
            # (operator.itemgetter('session')) and never used sortkey,
            # so e.g. '2009' sorted after '2011-2012' lexically.
            docs = sorted(docs, key=sortkey, reverse=True)

            return render(request, templatename('search_results_bill_id'),
                          dict(bill_id=bill_id, abbr=abbr,
                               rowtemplate_name=templatename(
                                   'bills_list_row_with_state_and_session'),
                               object_list=IteratorPaginator(docs),
                               use_table=True,
                               column_headers=('Title', 'Session',
                                               'Introduced',
                                               'Recent Action', 'Votes'),
                               statenav_active=None))

    # The input didn't contain \d{4}, so assuming it's not a bill,
    # search bill title and legislator names.
    if settings.ENABLE_ELASTICSEARCH:
        kwargs = {}
        if scope != 'all':
            kwargs['state'] = scope
        bill_results = Bill.search(search_text, **kwargs)
    else:
        spec = {'title': {'$regex': search_text, '$options': 'i'}}
        if scope != 'all':
            abbr = scope
            scope_name = Metadata.get_object(abbr)['name']
            spec.update(state=abbr)
        bill_results = db.bills.find(spec)

    # See if any legislator names match.
    spec = {'full_name': {'$regex': search_text, '$options': 'i'}}
    if scope != 'all':
        abbr = scope
        scope_name = Metadata.get_object(abbr)['name']
        spec.update(state=abbr)
    legislator_results = db.legislators.find(spec)

    return render(request, templatename('search_results_bills_legislators'),
                  dict(search_text=search_text,
                       abbr=abbr,
                       scope_name=scope_name,
                       bills_list=bill_results.limit(5),
                       more_bills_available=(5 < bill_results.count()),
                       legislators_list=legislator_results.limit(5),
                       more_legislators_available=(
                           5 < legislator_results.count()),
                       bill_column_headers=('State', 'Title', 'Session',
                                            'Introduced', 'Recent Action',),
                       rowtemplate_name=templatename(
                           'bills_list_row_with_state_and_session'),
                       show_chamber_column=True,
                       statenav_active=None))
def import_bills(state, data_dir):
    """Import scraped bill JSON files for a state, merging external votes.

    Like the basic importer, but first loads separately-scraped votes via
    ``import_votes`` and matches them to bills, and uses ``VoteMatcher``
    to keep stable vote ids across re-imports.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})

    # Build a session to term mapping
    sessions = {}
    for term in meta['terms']:
        for session in term['sessions']:
            sessions[session] = term['name']

    # externally-scraped votes keyed by (chamber, session, bill_id);
    # entries are popped as they are matched to bills below
    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # move subjects to scraped_subjects
        subjects = data.pop('subjects', None)
        if subjects:
            data['scraped_subjects'] = subjects

        # add loaded votes to data
        bill_votes = votes.pop(
            (data['chamber'], data['session'], data['bill_id']), [])
        data['votes'].extend(bill_votes)

        bill = db.bills.find_one({
            'state': data['state'],
            'session': data['session'],
            'chamber': data['chamber'],
            'bill_id': data['bill_id']
        })

        # keep vote ids stable across re-imports: learn ids from the
        # stored bill (if any) before assigning ids to incoming votes
        vote_matcher = VoteMatcher(data['state'])
        if bill:
            vote_matcher.learn_vote_ids(bill['votes'])
        vote_matcher.set_vote_ids(data['votes'])

        # match sponsor leg_ids
        for sponsor in data['sponsors']:
            id = get_legislator_id(state, data['session'], None,
                                   sponsor['name'])
            sponsor['leg_id'] = id

        for vote in data['votes']:
            # committee_ids
            if 'committee' in vote:
                committee_id = get_committee_id(state, vote['chamber'],
                                                vote['committee'])
                vote['committee_id'] = committee_id

            # vote leg_ids
            for vtype in ('yes_votes', 'no_votes', 'other_votes'):
                svlist = []
                for svote in vote[vtype]:
                    id = get_legislator_id(state, data['session'],
                                           vote['chamber'], svote)
                    svlist.append({'name': svote, 'leg_id': id})
                vote[vtype] = svlist

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get('alternate_titles', []))
        for version in data['versions']:
            if 'title' in version:
                alt_titles.add(version['title'])
            if '+short_title' in version:
                alt_titles.add(version['+short_title'])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data['title'])
        except KeyError:
            pass
        data['alternate_titles'] = list(alt_titles)

        if not bill:
            data['_keywords'] = list(bill_keywords(data))
            insert_with_id(data)
        else:
            data['_keywords'] = list(bill_keywords(data))
            update(bill, data, db.bills)

    print 'imported %s bill files' % len(paths)

    # any votes left over never matched a bill file
    for remaining in votes.keys():
        print 'Failed to match vote %s %s %s' % tuple(
            [r.encode('ascii', 'replace') for r in remaining])

    populate_current_fields(state)
    ensure_indexes()
def scrape_senate_vote(self, bill, url):
    """Parse a Senate roll-call vote PDF at *url* and attach it to *bill*.

    Downloads the PDF, converts it to text, extracts the date/time,
    the YEAS/NAYS/EXCUSED/NOT VOTING tallies and the individual voter
    names, finds the motion line following the bill id, and adds a
    ``Vote`` to the bill.  Logs and returns (best effort) on any
    malformed PDF instead of raising.
    """
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, "text")
    os.remove(path)

    lines = text.split("\n")

    date_match = re.search(r"Date:\s+(\d+/\d+/\d+)", text)
    if not date_match:
        self.log("Couldn't find date on %s" % url)
        return

    time_match = re.search(r"Time:\s+(\d+:\d+:\d+)\s+(AM|PM)", text)
    # Bug fix: the original dereferenced time_match unconditionally and
    # raised AttributeError when the time stamp was missing; bail out
    # gracefully like the date check above.
    if not time_match:
        self.log("Couldn't find time on %s" % url)
        return

    date = "%s %s %s" % (date_match.group(1), time_match.group(1),
                         time_match.group(2))
    date = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M:%S %p")
    date = self._tz.localize(date)

    vote_type = None
    yes_count, no_count, other_count = None, None, 0
    votes = []
    # the first 21 lines are header material; tally headers switch the
    # current vote_type, any other non-blank line lists voter names
    # separated by runs of whitespace
    for line in lines[21:]:
        line = line.strip()
        if not line:
            continue

        if line.startswith("YEAS"):
            yes_count = int(line.split(" - ")[1])
            vote_type = "yes"
        elif line.startswith("NAYS"):
            no_count = int(line.split(" - ")[1])
            vote_type = "no"
        elif line.startswith("EXCUSED") or line.startswith("NOT VOTING"):
            other_count += int(line.split(" - ")[1])
            vote_type = "other"
        else:
            votes.extend([(n.strip(), vote_type)
                          for n in re.split(r"\s{2,}", line)])

    if yes_count is None or no_count is None:
        # Bug fix: typo "Couldne't" in the original log message.
        self.log("Couldn't find vote counts in %s" % url)
        return

    passed = yes_count > no_count + other_count

    # the motion text appears two lines after the line holding the bill id
    clean_bill_id = fix_bill_id(bill["bill_id"])
    motion_line = None
    for i, line in enumerate(lines):
        if line.strip() == clean_bill_id:
            motion_line = i + 2
    # Bug fix: when the bill id never appeared, motion_line stayed None
    # and lines[motion_line] raised TypeError; also guard against the
    # id appearing too close to the end of the document.
    if motion_line is None or motion_line >= len(lines):
        self.log("Couldn't find motion for %s" % url)
        return
    motion = lines[motion_line]
    if not motion:
        self.log("Couldn't find motion for %s" % url)
        return

    vote = Vote("upper", date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    for name, vtype in votes:
        if vtype == "yes":
            vote.yes(name)
        elif vtype == "no":
            vote.no(name)
        elif vtype == "other":
            vote.other(name)

    # sanity check: parsed names must agree with the printed tallies
    assert yes_count == len(vote["yes_votes"])
    assert no_count == len(vote["no_votes"])
    assert other_count == len(vote["other_votes"])

    bill.add_vote(vote)