示例#1
0
def test_fix_bill_id():
    """Every listed spelling of the id must normalise to "AB 74"."""
    canonical = "AB 74"
    variants = (
        "A.B. 74",
        "A.B.74",
        "AB74",
        "AB 0074",
        "AB074",
        "A.B.074",
        "A.B. 074",
        "A.B\t074",
    )

    for raw in variants:
        normalised = utils.fix_bill_id(raw)
        assert normalised == canonical

    # A hyphenated number keeps its suffix; only the prefix is split off.
    hyphenated = utils.fix_bill_id("PR19-0041")
    assert hyphenated == "PR 19-0041"
示例#2
0
def test_fix_bill_id():
    """fix_bill_id must strip dots, tabs and zero padding from bill ids."""
    wanted = 'AB 74'
    spellings = ['A.B. 74', 'A.B.74', 'AB74', 'AB 0074']
    spellings += ['AB074', 'A.B.074', 'A.B. 074', 'A.B\t074']

    position = 0
    while position < len(spellings):
        # Same order as the list above, one assertion per spelling.
        assert utils.fix_bill_id(spellings[position]) == wanted
        position += 1

    # Ids with a hyphenated number only gain a space after the prefix.
    assert utils.fix_bill_id('PR19-0041') == 'PR 19-0041'
示例#3
0
文件: bills.py 项目: JT5D/billy
def bill(request, abbr, session, bill_id):
    """Render the detail page for a single bill.

    The URL is expected to carry the normalised bill id with its spaces
    removed; any other spelling is redirected to that canonical URL.
    Raises Http404 when no bill document matches.  Only the first fifteen
    sponsors are shown unless the ``show_all_sponsors`` GET parameter is
    set.
    """
    canonical_id = fix_bill_id(bill_id)
    url_id = canonical_id.replace(' ', '')
    if url_id != bill_id:
        return redirect('bill', abbr=abbr, session=session, bill_id=url_id)

    bill = db.bills.find_one({
        'state': abbr,
        'session': session,
        'bill_id': canonical_id,
    })
    if bill is None:
        message = 'no bill found {0} {1} {2}'.format(abbr, session, bill_id)
        raise Http404(message)

    # Events that reference this bill, newest first.
    event_spec = {"state": abbr, "related_bills.bill_id": bill['_id']}
    events = db.events.find(event_spec).sort("when", -1)

    popularity.counter.inc('bills', bill['_id'], abbr=abbr, session=session)

    show_all_sponsors = request.GET.get('show_all_sponsors')
    manager = bill.sponsors_manager
    sponsors = manager if show_all_sponsors else manager.first_fifteen

    template = templatename('bill')
    context = {
        'vote_preview_row_template': templatename('vote_preview_row'),
        'abbr': abbr,
        'metadata': Metadata.get_object(abbr),
        'bill': bill,
        'events': events,
        'show_all_sponsors': show_all_sponsors,
        'sponsors': sponsors,
        'sources': bill['sources'],
        'statenav_active': 'bills',
    }
    return render(request, template, context)
示例#4
0
def test_fix_bill_id():
    """All punctuation/padding variants below must come back as 'AB 74'."""
    target = 'AB 74'
    candidates = (
        'A.B. 74', 'A.B.74', 'AB74', 'AB 0074',
        'AB074', 'A.B.074', 'A.B. 074', 'A.B\t074',
    )

    for candidate in candidates:
        fixed = utils.fix_bill_id(candidate)
        assert fixed == target
示例#5
0
文件: events.py 项目: JT5D/billy
def import_events(abbr, data_dir, import_actions=False):
    """Import every scraped event JSON file for ``abbr``.

    Files are read from ``<data_dir>/<abbr>/events/*.json``.  Each
    participant gets an ``id`` resolved through a per-type resolver
    (``None`` for unknown types), each related bill is pointed at the
    ``_id`` of the matching bill document (``None`` when nothing matches),
    and the event is passed to ``import_event``.  ``import_actions`` is
    accepted but not used in this function.
    """
    events_glob = os.path.join(os.path.join(data_dir, abbr),
                               "events", "*.json")

    for json_path in glob.iglob(events_glob):
        with open(json_path) as handle:
            data = prepare_obj(json.load(handle))

        def _committee_id(participant):
            return get_committee_id(
                data[settings.LEVEL_FIELD],
                participant["chamber"],
                participant["participant"],
            )

        def _legislator_id(participant):
            chamber = participant["chamber"]
            # Anything other than a real chamber is looked up chamber-less.
            if chamber not in ("upper", "lower"):
                chamber = None
            return get_legislator_id(abbr, data["session"], chamber,
                                     participant["participant"])

        resolver_for = {
            "committee": _committee_id,
            "legislator": _legislator_id,
        }

        for participant in data["participants"]:
            kind = participant["participant_type"]
            if kind in resolver_for:
                resolved = resolver_for[kind](participant)
            else:
                logger.warning("I don't know how to resolve a %s" % kind)
                resolved = None
            participant["id"] = resolved

        for related in data["related_bills"]:
            scraped_id = related["bill_id"]
            related["_scraped_bill_id"] = scraped_id
            wanted_id = fix_bill_id(scraped_id)
            related["bill_id"] = ""

            # Match on either the primary id or any alternate id.
            by_primary = {
                settings.LEVEL_FIELD: abbr,
                "session": data["session"],
                "bill_id": wanted_id,
            }
            by_alternate = {
                settings.LEVEL_FIELD: abbr,
                "session": data["session"],
                "alternate_bill_ids": wanted_id,
            }
            found = db.bills.find_one({"$or": [by_primary, by_alternate]})

            if found:
                internal_id = found["_id"]
            else:
                logger.warning("Error: Can't find %s" % wanted_id)
                internal_id = None

            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            related["bill_id"] = internal_id

        import_event(data)

    ensure_indexes()
示例#6
0
def import_votes(state, data_dir):
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'votes', '*.json')

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id, needs to match the one already in the database
        data['bill_id'] = fix_bill_id(data['bill_id'])

        bill = db.bills.find_one({'state': state,
                                  'chamber': data['bill_chamber'],
                                  'session': data['session'],
                                  'bill_id': data['bill_id']})

        if not bill:
            _log.warning("Couldn't find bill %s" % data['bill_id'])
            continue

        del data['bill_id']

        try:
            del data['filename']
        except KeyError:
            pass

        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            svlist = []
            for svote in data[vtype]:
                id = get_legislator_id(state, data['session'],
                                       data['chamber'], svote)
                svlist.append({'name': svote, 'leg_id': id})

            data[vtype] = svlist

        for vote in bill['votes']:
            if (vote['motion'] == data['motion']
                and vote['date'] == data['date']):
                vote.update(data)
                break
        else:
            bill['votes'].append(data)

        db.bills.save(bill, safe=True)

    print 'imported %s vote files' % len(paths)
示例#7
0
def import_votes(state, data_dir):
    pattern = os.path.join(data_dir, 'votes', '*.json')
    paths = glob.glob(pattern)

    votes = defaultdict(list)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # need to match bill_id already in the database
        bill_id = fix_bill_id(data.pop('bill_id'))

        votes[(data['bill_chamber'], data['session'], bill_id)].append(data)

    print 'imported %s vote files' % len(paths)
    return votes
示例#8
0
文件: bills.py 项目: msabramo/billy
def import_votes(data_dir):
    """Load scraped vote files and group them by the bill they belong to.

    Reads ``<data_dir>/votes/*.json`` and returns a ``defaultdict(list)``
    keyed by ``(bill_chamber, session, fixed bill_id)``; the ``bill_id``
    key is removed from each stored vote.
    """
    vote_files = glob.glob(os.path.join(data_dir, "votes", "*.json"))
    by_bill = defaultdict(list)

    for vote_file in vote_files:
        with open(vote_file) as fh:
            record = prepare_obj(json.load(fh))

        # need to match bill_id already in the database
        raw_id = record.pop("bill_id")
        bill_key = (record["bill_chamber"], record["session"],
                    fix_bill_id(raw_id))
        by_bill[bill_key].append(record)

    logger.info("imported %s vote files" % len(vote_files))
    return by_bill
示例#9
0
def import_votes(state, data_dir):
    pattern = os.path.join(data_dir, 'votes', '*.json')
    paths = glob.glob(pattern)

    votes = defaultdict(list)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # need to match bill_id already in the database
        bill_id = fix_bill_id(data.pop('bill_id'))

        votes[(data['bill_chamber'], data['session'], bill_id)].append(data)

    print 'imported %s vote files' % len(paths)
    return votes
示例#10
0
def import_events(abbr, data_dir, import_actions=False):
    """Import every scraped event JSON file for ``abbr``.

    Files are read from ``<data_dir>/<abbr>/events/*.json``.  Participants
    get a ``committee_id`` when one can be resolved, each related bill is
    pointed at the ``_id`` of the matching bill document (``None`` when no
    bill matches), and the event is passed to ``import_event``.
    ``import_actions`` is accepted but not used in this function.
    """
    events_root = os.path.join(data_dir, abbr)
    file_glob = os.path.join(events_root, 'events', '*.json')

    for event_path in glob.iglob(file_glob):
        with open(event_path) as handle:
            data = prepare_obj(json.load(handle))

        for participant in data['participants']:
            committee_id = get_committee_id(data['level'], data['state'],
                                            participant['participant'],
                                            participant['chamber'])
            if committee_id:
                participant['committee_id'] = committee_id

        for related in data['related_bills']:
            scraped_id = related['bill_id']
            related['_scraped_bill_id'] = scraped_id
            wanted_id = fix_bill_id(scraped_id)
            related['bill_id'] = ""

            # Match on either the primary id or any alternate id.
            by_primary = {
                "state": abbr,
                'session': data['session'],
                'bill_id': wanted_id,
            }
            by_alternate = {
                "state": abbr,
                'session': data['session'],
                'alternate_bill_ids': wanted_id,
            }
            found = db.bills.find_one({"$or": [by_primary, by_alternate]})

            if found:
                internal_id = found['_id']
            else:
                logger.warning("Error: Can't find %s" % wanted_id)
                internal_id = None

            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            related['bill_id'] = internal_id

        import_event(data)

    ensure_indexes()
示例#11
0
文件: bills.py 项目: JT5D/billy
 def func(request, abbr, session, bill_id, key):
     """Render the ``bill_all_<key>`` template for a single bill.

     A URL whose bill id is not the normalised id with spaces removed is
     redirected to the 'bill' route under that canonical id.  Raises
     Http404 when no bill document matches.
     """
     canonical_id = fix_bill_id(bill_id)
     url_id = canonical_id.replace(' ', '')
     if url_id != bill_id:
         return redirect('bill', abbr=abbr, session=session, bill_id=url_id)

     bill = db.bills.find_one({
         'state': abbr,
         'session': session,
         'bill_id': canonical_id,
     })
     if bill is None:
         message = 'no bill found {0} {1} {2}'.format(abbr, session, bill_id)
         raise Http404(message)

     template = templatename('bill_all_%s' % key)
     context = {
         'abbr': abbr,
         'metadata': Metadata.get_object(abbr),
         'bill': bill,
         'sources': bill['sources'],
         'statenav_active': 'bills',
     }
     return render(request, template, context)
示例#12
0
文件: search.py 项目: JT5D/billy
def search_by_bill_id(abbr, search_text):
    '''Find bills with ids like "HB1234".

    abbr        -- value matched against the bills' ``state`` field, or
                   'all' to search without that restriction
    search_text -- raw user input

    Returns a list of bill documents sorted by their ``session`` value in
    descending order, or None when ``search_text`` contains no digit or
    no bill matches.
    '''
    # Text without a digit can't be a bill id; skip the lookup entirely.
    if not re.search(r'\d', search_text):
        return None

    bill_id = fix_bill_id(search_text).upper()
    collection = db.bills

    spec = {'bill_id': bill_id}
    if abbr != 'all':
        spec['state'] = abbr

    docs = collection.find(spec)

    # Nothing matched the exact id: fall back to a substring match.  The
    # id comes from user input, so it is escaped to keep it a literal
    # match rather than an arbitrary user-supplied regular expression.
    if 0 == docs.count():
        spec['bill_id'] = {'$regex': re.escape(bill_id)}
        docs = collection.find(spec)

    if 0 == docs.count():
        return None

    return sorted(docs, key=operator.itemgetter('session'), reverse=True)
示例#13
0
文件: bills.py 项目: rzuck/openstates
def import_bills(state, data_dir):
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, "bills", "*.json")

    meta = db.metadata.find_one({"_id": state})

    # Build a session to term mapping
    sessions = {}
    for term in meta["terms"]:
        for session in term["sessions"]:
            sessions[session] = term["name"]

    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data["bill_id"] = fix_bill_id(data["bill_id"])

        # move subjects to scraped_subjects
        subjects = data.pop("subjects", None)

        # NOTE: intentionally doesn't copy blank lists of subjects
        # this avoids the problem where a bill is re-run but we can't
        # get subjects anymore (quite common in fact)
        if subjects:
            data["scraped_subjects"] = subjects

        # add loaded votes to data
        bill_votes = votes.pop((data["chamber"], data["session"], data["bill_id"]), [])
        data["votes"].extend(bill_votes)

        bill = db.bills.find_one(
            {"state": data["state"], "session": data["session"], "chamber": data["chamber"], "bill_id": data["bill_id"]}
        )

        vote_matcher = VoteMatcher(data["state"])
        if bill:
            vote_matcher.learn_vote_ids(bill["votes"])
        vote_matcher.set_vote_ids(data["votes"])

        # match sponsor leg_ids
        for sponsor in data["sponsors"]:
            id = get_legislator_id(state, data["session"], None, sponsor["name"])
            sponsor["leg_id"] = id

        for vote in data["votes"]:

            # committee_ids
            if "committee" in vote:
                committee_id = get_committee_id(state, vote["chamber"], vote["committee"])
                vote["committee_id"] = committee_id

            # vote leg_ids
            for vtype in ("yes_votes", "no_votes", "other_votes"):
                svlist = []
                for svote in vote[vtype]:
                    id = get_legislator_id(state, data["session"], vote["chamber"], svote)
                    svlist.append({"name": svote, "leg_id": id})

                vote[vtype] = svlist

        data["_term"] = sessions[data["session"]]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get("alternate_titles", []))
        for version in data["versions"]:
            if "title" in version:
                alt_titles.add(version["title"])
            if "+short_title" in version:
                alt_titles.add(version["+short_title"])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data["title"])
        except KeyError:
            pass
        data["alternate_titles"] = list(alt_titles)

        if not bill:
            data["_keywords"] = list(bill_keywords(data))
            insert_with_id(data)
        else:
            data["_keywords"] = list(bill_keywords(data))
            update(bill, data, db.bills)

    print "imported %s bill files" % len(paths)

    for remaining in votes.keys():
        print "Failed to match vote %s %s %s" % tuple([r.encode("ascii", "replace") for r in remaining])

    populate_current_fields(state)
    ensure_indexes()
示例#14
0
文件: bills.py 项目: annerajb/billy
def import_bill(data, votes, categorizer):
    """Insert or update a single bill.

    data        -- raw bill JSON; modified in place
    votes       -- mapping of (chamber, session, bill_id) -> list of
                   separately scraped votes; the entry matching this bill
                   is popped from it
    categorizer -- object with ``categorize_bill(data)``; skipped when
                   falsy

    Returns "insert" when no matching bill existed, otherwise "update".
    """
    level = data['level']
    abbr = data[level]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [fix_bill_id(bid) for bid in
                                      data['alternate_bill_ids']]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # this is a hack added for Rhode Island where we can't
    # determine the full bill_id, if this key is in the metadata
    # we just use the numeric portion, not ideal as it won't work
    # in states where HB/SBs overlap, but in RI they never do
    if metadata(abbr).get('_partial_vote_bill_id'):
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes = votes.pop((data['chamber'], data['session'],
                                numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes = votes.pop((data['chamber'], data['session'],
                                data['bill_id']), [])

    data['votes'].extend(bill_votes)

    bill = db.bills.find_one({'level': level, level: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep vote/doc ids consistent
    vote_matcher = VoteMatcher(abbr)
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        vote_matcher.learn_ids(bill['votes'])
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    vote_matcher.set_ids(data['votes'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    for sponsor in data['sponsors']:
        id = get_legislator_id(abbr, data['session'], None,
                               sponsor['name'])
        sponsor['leg_id'] = id
        if id is None:
            # Not a known legislator: the sponsor may be a committee.
            cid = get_committee_id(level, abbr, data['chamber'],
                                   sponsor['name'])
            if cid is not None:
                sponsor['committee_id'] = cid

    # process votes
    for vote in data['votes']:

        # committee_ids
        if 'committee' in vote:
            committee_id = get_committee_id(level, abbr, vote['chamber'],
                                            vote['committee'])
            vote['committee_id'] = committee_id

        # vote leg_ids
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            svlist = []
            for svote in vote[vtype]:
                id = get_legislator_id(abbr, data['session'],
                                       vote['chamber'], svote)
                svlist.append({'name': svote, 'leg_id': id})

            vote[vtype] = svlist

    # process actions
    dates = {'first': None, 'last': None, 'passed_upper': None,
             'passed_lower': None, 'signed': None}
    for action in data['actions']:

        # We'll try to recover some Committee IDs here.
        if "committee" in action:
            cid = get_committee_id(level, abbr, data['chamber'],
                                   action['committee'])
            action['_scraped_committee_name'] = action['committee']
            if cid is not None:
                action['committee'] = cid
            else:
                del action['committee']

        adate = action['date']

        # first & last
        # These are two independent checks: a single action is both the
        # first and the last one, and actions may arrive in any order.
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed (only the earliest-seen occurrence is recorded)
        if (not dates['passed_upper'] and action['actor'] == 'upper'
            and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
            and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # push versions to oyster
        if settings.ENABLE_OYSTER and 'url' in version:
            oysterize_version(data, version)

        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)

    if not bill:
        bill_id = insert_with_id(data)
        denormalize_votes(data, bill_id)
        return "insert"
    else:
        update(bill, data, db.bills)
        denormalize_votes(data, bill['_id'])
        return "update"
示例#15
0
def import_bills(state, data_dir):
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})

    # Build a session to term mapping
    sessions = {}
    for term in meta['terms']:
        for session in term['sessions']:
            sessions[session] = term['name']

    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # move subjects to scraped_subjects
        subjects = data.pop('subjects', None)

        # NOTE: intentionally doesn't copy blank lists of subjects
        # this avoids the problem where a bill is re-run but we can't
        # get subjects anymore (quite common in fact)
        if subjects:
            data['scraped_subjects'] = subjects

        # add loaded votes to data
        bill_votes = votes.pop((data['chamber'], data['session'],
                                data['bill_id']), [])
        data['votes'].extend(bill_votes)

        bill = db.bills.find_one({'state': data['state'],
                                  'session': data['session'],
                                  'chamber': data['chamber'],
                                  'bill_id': data['bill_id']})

        vote_matcher = VoteMatcher(data['state'])
        if bill:
            vote_matcher.learn_vote_ids(bill['votes'])
        vote_matcher.set_vote_ids(data['votes'])

        # match sponsor leg_ids
        for sponsor in data['sponsors']:
            id = get_legislator_id(state, data['session'], None,
                                   sponsor['name'])
            sponsor['leg_id'] = id

        for vote in data['votes']:

            # committee_ids
            if 'committee' in vote:
                committee_id = get_committee_id(state,
                                                vote['chamber'],
                                                vote['committee'])
                vote['committee_id'] = committee_id

            # vote leg_ids
            for vtype in ('yes_votes', 'no_votes', 'other_votes'):
                svlist = []
                for svote in vote[vtype]:
                    id = get_legislator_id(state, data['session'],
                                           vote['chamber'], svote)
                    svlist.append({'name': svote, 'leg_id': id})

                vote[vtype] = svlist

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get('alternate_titles', []))
        for version in data['versions']:
            if 'title' in version:
                alt_titles.add(version['title'])
            if '+short_title' in version:
                alt_titles.add(version['+short_title'])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data['title'])
        except KeyError:
            pass
        data['alternate_titles'] = list(alt_titles)

        if not bill:
            data['_keywords'] = list(bill_keywords(data))
            insert_with_id(data)
        else:
            data['_keywords'] = list(bill_keywords(data))
            update(bill, data, db.bills)

    print 'imported %s bill files' % len(paths)

    for remaining in votes.keys():
        print 'Failed to match vote %s %s %s' % tuple([
            r.encode('ascii', 'replace') for r in remaining])

    populate_current_fields(state)
    ensure_indexes()
示例#16
0
文件: bills.py 项目: msabramo/billy
def import_bill(data, votes, categorizer):
    """Insert or update a single bill.

    data        -- raw bill JSON; modified in place
    votes       -- mapping of (chamber, session, bill_id) -> list of
                   separately scraped votes; the entry matching this bill
                   is popped from it
    categorizer -- object with ``categorize_bill(data)``; skipped when
                   falsy

    Returns "insert" when no matching bill existed, otherwise "update".
    """
    level = data["level"]
    abbr = data[level]

    # clean up bill_ids
    data["bill_id"] = fix_bill_id(data["bill_id"])
    if "alternate_bill_ids" in data:
        fixed_alternates = []
        for alternate in data["alternate_bill_ids"]:
            fixed_alternates.append(fix_bill_id(alternate))
        data["alternate_bill_ids"] = fixed_alternates

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    scraped_subjects = data.pop("subjects", None)
    if scraped_subjects:
        data["scraped_subjects"] = scraped_subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # this is a hack added for Rhode Island where we can't
    # determine the full bill_id, if this key is in the metadata
    # we just use the numeric portion, not ideal as it won't work
    # in states where HB/SBs overlap, but in RI they never do
    if metadata(abbr).get("_partial_vote_bill_id"):
        # pull off numeric portion of bill_id
        vote_bill_id = data["bill_id"].split()[1]
    else:
        vote_bill_id = data["bill_id"]

    # add loaded votes to data
    vote_key = (data["chamber"], data["session"], vote_bill_id)
    data["votes"].extend(votes.pop(vote_key, []))

    existing = db.bills.find_one({
        "level": level,
        level: abbr,
        "session": data["session"],
        "chamber": data["chamber"],
        "bill_id": data["bill_id"],
    })

    # keep vote/doc ids consistent
    vote_matcher = VoteMatcher(abbr)
    doc_matcher = DocumentMatcher(abbr)
    if existing:
        vote_matcher.learn_ids(existing["votes"])
        doc_matcher.learn_ids(existing["versions"] + existing["documents"])
    vote_matcher.set_ids(data["votes"])
    doc_matcher.set_ids(data["versions"] + data["documents"])

    # match sponsor leg_ids
    for sponsor in data["sponsors"]:
        sponsor["leg_id"] = get_legislator_id(abbr, data["session"], None,
                                              sponsor["name"])

    for vote in data["votes"]:
        # committee_ids
        if "committee" in vote:
            vote["committee_id"] = get_committee_id(level, abbr,
                                                    vote["chamber"],
                                                    vote["committee"])

        # vote leg_ids
        for vtype in ("yes_votes", "no_votes", "other_votes"):
            resolved = []
            for voter in vote[vtype]:
                leg_id = get_legislator_id(abbr, data["session"],
                                           vote["chamber"], voter)
                resolved.append({"name": voter, "leg_id": leg_id})
            vote[vtype] = resolved

    data["_term"] = term_for_session(abbr, data["session"])

    alt_titles = set(data.get("alternate_titles", []))

    for version in data["versions"]:
        # push versions to oyster
        if settings.ENABLE_OYSTER and "url" in version:
            oysterize_version(data, version)

        # Merge any version titles into the alternate_titles list
        for field in ("title", "+short_title"):
            if field in version:
                alt_titles.add(version[field])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data["title"])
    except KeyError:
        pass
    data["alternate_titles"] = list(alt_titles)

    if existing:
        update(existing, data, db.bills)
        return "update"

    insert_with_id(data)
    return "insert"
示例#17
0
    def scrape_senate_vote(self, bill, url):
        """Scrape one upper-chamber vote PDF and attach the vote to ``bill``.

        bill -- bill object; ``bill['bill_id']`` is read and
                ``bill.add_vote`` is called on success
        url  -- location of the vote PDF

        The PDF is downloaded, converted to text and parsed for the date,
        time, tallies, voter names and motion.  If any of those cannot be
        found a message is logged and nothing is added to the bill.
        """
        (path, resp) = self.urlretrieve(url)
        try:
            text = convert_pdf(path, 'text')
        finally:
            # Remove the downloaded file even when conversion fails.
            os.remove(path)

        lines = text.split('\n')

        date_match = re.search(r'Date:\s+(\d+/\d+/\d+)', text)
        if not date_match:
            self.log("Couldn't find date on %s" % url)
            return

        time_match = re.search(r'Time:\s+(\d+:\d+:\d+)\s+(AM|PM)', text)
        if not time_match:
            # Without this guard a missing time crashed on .group().
            self.log("Couldn't find time on %s" % url)
            return

        date = "%s %s %s" % (date_match.group(1), time_match.group(1),
                             time_match.group(2))
        date = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M:%S %p")
        date = self._tz.localize(date)

        vote_type = None
        yes_count, no_count, other_count = None, None, 0
        votes = []
        # NOTE(review): the first 21 lines are skipped, presumably a fixed
        # header in these PDFs -- confirm against a sample document.
        for line in lines[21:]:
            line = line.strip()
            if not line:
                continue

            if line.startswith('YEAS'):
                yes_count = int(line.split(' - ')[1])
                vote_type = 'yes'
            elif line.startswith('NAYS'):
                no_count = int(line.split(' - ')[1])
                vote_type = 'no'
            elif line.startswith(('EXCUSED', 'NOT VOTING')):
                other_count += int(line.split(' - ')[1])
                vote_type = 'other'
            else:
                # Names are separated by runs of two or more spaces and
                # belong to the most recently seen heading.
                votes.extend([(n.strip(), vote_type)
                              for n in re.split(r'\s{2,}', line)])

        if yes_count is None or no_count is None:
            self.log("Couldn't find vote counts in %s" % url)
            return

        passed = yes_count > no_count + other_count

        # The motion is read two lines below the line that consists of the
        # normalised bill id; the last such line wins.
        clean_bill_id = fix_bill_id(bill['bill_id'])
        motion_line = None
        for i, line in enumerate(lines):
            if line.strip() == clean_bill_id:
                motion_line = i + 2

        motion = None
        if motion_line is not None and motion_line < len(lines):
            motion = lines[motion_line]
        if not motion:
            self.log("Couldn't find motion for %s" % url)
            return

        vote = Vote('upper', date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        insert_specific_votes(vote, votes)
        check_vote_counts(vote)

        bill.add_vote(vote)
示例#18
0
文件: bills.py 项目: JT5D/billy
def import_bill(data, standalone_votes, categorizer):
    """
        insert or update a bill

        data - raw bill JSON (normalized in place before being saved)
        standalone_votes - votes scraped separately, keyed by
                           (chamber, session, bill_id); the entry matching
                           this bill is popped off
        categorizer - SubjectCategorizer (None - no categorization)

        returns "insert" if no prior version of the bill was found,
        "update" otherwise
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [fix_bill_id(bid) for bid in
                                      data['alternate_bill_ids']]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])
        # query based on companion
        spec = companion.copy()
        spec[settings.LEVEL_FIELD] = abbr
        # a falsy chamber means "unknown", so don't constrain the query on it
        if not spec['chamber']:
            spec.pop('chamber')
        companion_obj = db.bills.find_one(spec)
        if companion_obj:
            companion['internal_id'] = companion_obj['_id']
        else:
            logger.warning('Unknown companion: {chamber} {session} {bill_id}'
                           .format(**companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({settings.LEVEL_FIELD: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep doc ids consistent
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    for sponsor in data['sponsors']:
        # use sponsor's chamber if specified
        leg_id = get_legislator_id(abbr, data['session'],
                                   sponsor.get('chamber'), sponsor['name'])
        sponsor['leg_id'] = leg_id
        if leg_id is None:
            # not a known legislator, the sponsor may be a committee
            cid = get_committee_id(abbr, data['chamber'], sponsor['name'])
            if cid is not None:
                sponsor['committee_id'] = cid

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # this is a hack initially added for Rhode Island where we can't
        # determine the full bill_id, if this key is in the metadata
        # we just use the numeric portion, not ideal as it won't work
        # where HB/SBs overlap, but in RI they never do
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes += standalone_votes.pop((data['chamber'], data['session'],
                                            numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes += standalone_votes.pop((data['chamber'], data['session'],
                                            data['bill_id']), [])

    # do id matching and other vote prep
    if bill:
        prepare_votes(abbr, data['session'], bill['_id'], bill_votes)
    else:
        prepare_votes(abbr, data['session'], None, bill_votes)

    # process actions ###########

    dates = {'first': None, 'last': None, 'passed_upper': None,
             'passed_lower': None, 'signed': None}

    vote_flags = {
        "bill:passed",
        "bill:failed",
        "bill:veto_override:passed",
        "bill:veto_override:failed",
        "amendment:passed",
        "amendment:failed",
        "committee:passed",
        "committee:passed:favorable",
        "committee:passed:unfavorable",
        "committee:passed:failed"
    }
    already_linked = set()
    remove_vote = set()
    # a vote is only considered for an action when the two are dated less
    # than this far apart
    match_window = datetime.timedelta(hours=20)

    for action in data['actions']:
        adate = action['date']

        # the resolvers close over the current action so that entities are
        # matched within the acting chamber
        def _match_committee(name):
            return get_committee_id(abbr, action['actor'], name)

        def _match_legislator(name):
            return get_legislator_id(abbr,
                                     data['session'],
                                     action['actor'],
                                     name)

        resolvers = {
            "committee": _match_committee,
            "legislator": _match_legislator
        }

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolver = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                entity['id'] = resolver(entity['name'])

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates
        if (not dates['passed_upper'] and action['actor'] == 'upper'
            and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
            and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

        # vote-action matching
        action_attached = False
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue

                delta = abs(vote['date'] - action['date'])
                if (delta < match_window and
                    vote['chamber'] == action['actor']):
                    if action_attached:
                        # multiple votes match, we can't guess
                        action.pop('related_votes', None)
                    else:
                        related_vote = vote['vote_id']
                        # a vote claimed by a second action is ambiguous,
                        # mark it so it is unlinked everywhere below
                        if related_vote in already_linked:
                            remove_vote.add(related_vote)

                        already_linked.add(related_vote)
                        action['related_votes'] = [related_vote]
                        action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for vote in remove_vote:
            if vote in action.get('related_votes', []):
                action['related_votes'].remove(vote)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # push versions to oyster
        if settings.ENABLE_OYSTER and 'url' in version:
            oysterize_version(data, version)

        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)
    data = apply_filters(filters, data)

    if not bill:
        # the id returned by insert_with_id isn't needed here
        insert_with_id(data)
        git_add_bill(data)
        save_votes(data, bill_votes)
        return "insert"
    else:
        git_add_bill(bill)
        update(bill, data, db.bills)
        save_votes(bill, bill_votes)
        return "update"
示例#19
0
def import_bill(data, votes, categorizer):
    """
    Insert or update one bill document.

    data - raw bill dict; normalized in place before it is stored
    votes - separately loaded votes keyed by (chamber, session, bill_id);
            the entry belonging to this bill is popped and appended to
            data['votes']
    categorizer - object with a categorize_bill method, or a falsy value to
                  skip categorization

    Returns "insert" when no stored bill matched, "update" otherwise.
    """
    level = data['level']
    abbr = data[level]

    # normalize the primary id and any alternate ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = list(
            map(fix_bill_id, data['alternate_bill_ids']))

    # subjects are stored under scraped_subjects; an empty list is
    # deliberately dropped rather than copied so that a re-run which can no
    # longer fetch subjects does not blank out the stored ones
    scraped = data.pop('subjects', None)
    if scraped:
        data['scraped_subjects'] = scraped

    if categorizer:
        categorizer.categorize_bill(data)

    # Rhode Island hack: when the metadata carries _partial_vote_bill_id the
    # full bill_id can't be determined for votes, so only the numeric portion
    # is used as the key (would break where HB/SB numbers overlap, which
    # never happens in RI)
    if metadata(abbr).get('_partial_vote_bill_id'):
        vote_bill_id = data['bill_id'].split()[1]
    else:
        vote_bill_id = data['bill_id']
    vote_key = (data['chamber'], data['session'], vote_bill_id)
    loaded_votes = votes.pop(vote_key, [])

    data['votes'].extend(loaded_votes)

    lookup = {
        'level': level,
        level: abbr,
        'session': data['session'],
        'chamber': data['chamber'],
        'bill_id': data['bill_id'],
    }
    existing = db.bills.find_one(lookup)

    # keep vote/doc ids stable between runs
    vote_ids = VoteMatcher(abbr)
    doc_ids = DocumentMatcher(abbr)
    if existing:
        vote_ids.learn_ids(existing['votes'])
        doc_ids.learn_ids(existing['versions'] + existing['documents'])
    vote_ids.set_ids(data['votes'])
    doc_ids.set_ids(data['versions'] + data['documents'])

    # attach leg_ids to sponsors (chamber is not used for the lookup)
    for sponsor in data['sponsors']:
        sponsor['leg_id'] = get_legislator_id(abbr, data['session'], None,
                                              sponsor['name'])

    for vote in data['votes']:
        # resolve the committee, if the vote names one
        if 'committee' in vote:
            vote['committee_id'] = get_committee_id(level, abbr,
                                                    vote['chamber'],
                                                    vote['committee'])

        # replace each list of voter names with name/leg_id records
        for bucket in ('yes_votes', 'no_votes', 'other_votes'):
            vote[bucket] = [
                {'name': voter,
                 'leg_id': get_legislator_id(abbr, data['session'],
                                             vote['chamber'], voter)}
                for voter in vote[bucket]
            ]

    data['_term'] = term_for_session(abbr, data['session'])

    titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # push versions to oyster
        if settings.ENABLE_OYSTER and 'url' in version:
            oysterize_version(data, version)

        # version titles count as alternate titles
        for title_key in ('title', '+short_title'):
            if title_key in version:
                titles.add(version[title_key])

    # the primary title must not appear among the alternates; a bill with
    # no 'title' key at all is tolerated
    try:
        primary_title = data['title']
    except KeyError:
        pass
    else:
        titles.discard(primary_title)
    data['alternate_titles'] = list(titles)

    if existing:
        update(existing, data, db.bills)
        return "update"

    insert_with_id(data)
    return "insert"
示例#20
0
def import_bills(state, data_dir):
    """
    Import every bill JSON file found for a state.

    state - state abbreviation; also the _id of its metadata document
    data_dir - base data directory; files are read from
               <data_dir>/<state>/bills/*.json

    Each file is loaded, normalized (bill_id, subjects, sponsor and voter
    leg_ids, committee ids, term, alternate titles, keywords) and then
    inserted into db.bills, or used to update the matching existing bill.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})

    # Build a session to term mapping
    sessions = {}
    for term in meta['terms']:
        for session in term['sessions']:
            sessions[session] = term['name']

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # move subjects to scraped_subjects (empty lists are not copied)
        subjects = data.pop('subjects', None)
        if subjects:
            data['scraped_subjects'] = subjects

        bill = db.bills.find_one({'state': data['state'],
                                  'session': data['session'],
                                  'chamber': data['chamber'],
                                  'bill_id': data['bill_id']})

        # match sponsor leg_ids
        for sponsor in data['sponsors']:
            sponsor['leg_id'] = get_legislator_id(state, data['session'],
                                                  None, sponsor['name'])

        for vote in data['votes']:
            # committee_ids
            if 'committee' in vote:
                vote['committee_id'] = get_committee_id(state,
                                                        vote['chamber'],
                                                        vote['committee'])

            # vote leg_ids: replace bare names with name/leg_id records
            for vtype in ('yes_votes', 'no_votes', 'other_votes'):
                svlist = []
                for svote in vote[vtype]:
                    leg_id = get_legislator_id(state, data['session'],
                                               vote['chamber'], svote)
                    svlist.append({'name': svote, 'leg_id': leg_id})

                vote[vtype] = svlist

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get('alternate_titles', []))
        for version in data['versions']:
            if 'title' in version:
                alt_titles.add(version['title'])
            if '+short_title' in version:
                alt_titles.add(version['+short_title'])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data['title'])
        except KeyError:
            pass
        data['alternate_titles'] = list(alt_titles)

        # keywords are computed the same way for new and existing bills
        data['_keywords'] = list(bill_keywords(data))

        if not bill:
            insert_with_id(data)
        else:
            update(bill, data, db.bills)

    # parenthesized single-argument form prints the same under the print
    # statement and the print function
    print('imported %s bill files' % len(paths))

    populate_current_fields(state)
    ensure_indexes()
示例#21
0
文件: region.py 项目: annerajb/billy
def search(request, scope):
    """
    Search view: try a bill-id lookup first, then fall back to searching
    bill titles and legislator names.

    request - request whose GET parameter 'q' holds the search text
    scope - 'all' for an unrestricted search, otherwise the abbreviation
            that results are restricted to

    Returns the rendered bill-id results template when the text contains a
    digit and matching bills exist, otherwise the rendered combined
    bills/legislators results template.
    """
    abbr = None
    search_text = request.GET['q']
    scope_name = None
    spec = {}

    # If the input looks like a bill id, try to fetch the bill.
    if re.search(r'\d', search_text):
        bill_id = fix_bill_id(search_text).upper()
        collection = db.bills
        spec.update(bill_id=bill_id)

        if scope != 'all':
            abbr = scope
            # honor the requested scope, as the title and legislator
            # queries below do
            spec.update(state=abbr)

        docs = collection.find(spec, limit=10)

        # If there were actual results, return a bill_id result view.
        if 0 < docs.count():
            # newest session first (plain ordering of the session value)
            docs = sorted(docs, key=operator.itemgetter('session'),
                          reverse=True)

            return render(request, templatename('search_results_bill_id'),
              dict(bill_id=bill_id,
               abbr=abbr,
               rowtemplate_name=templatename('bills_list_row_with_state_and_session'),
               object_list=IteratorPaginator(docs),
               use_table=True,
               column_headers=('Title', 'Session', 'Introduced',
                               'Recent Action', 'Votes'),
               statenav_active=None))

    # Either the input contained no digit or no bill matched it as an id,
    # so search bill titles and legislator names.

    # The text comes straight from the query string: escape it so it is
    # matched literally instead of being interpreted as a regex (avoids
    # invalid-pattern errors and pathological patterns).
    literal_text = re.escape(search_text)

    if settings.ENABLE_ELASTICSEARCH:
        kwargs = {}
        if scope != 'all':
            kwargs['state'] = scope
        bill_results = Bill.search(search_text, **kwargs)
    else:
        spec = {'title': {'$regex': literal_text, '$options': 'i'}}
        if scope != 'all':
            abbr = scope
            scope_name = Metadata.get_object(abbr)['name']
            spec.update(state=abbr)
        bill_results = db.bills.find(spec)

    # See if any legislator names match.
    spec = {'full_name': {'$regex': literal_text, '$options': 'i'}}
    if scope != 'all':
        abbr = scope
        scope_name = Metadata.get_object(abbr)['name']
        spec.update(state=abbr)
    legislator_results = db.legislators.find(spec)

    return render(request, templatename('search_results_bills_legislators'),
        dict(search_text=search_text,
             abbr=abbr,
             scope_name=scope_name,
             bills_list=bill_results.limit(5),
             more_bills_available=(5 < bill_results.count()),
             legislators_list=legislator_results.limit(5),
             more_legislators_available=(5 < legislator_results.count()),
             bill_column_headers=('State', 'Title', 'Session', 'Introduced',
                                  'Recent Action',),
             rowtemplate_name=templatename('bills_list_row_with_state_and_session'),
             show_chamber_column=True,
             statenav_active=None))
示例#22
0
def import_bills(state, data_dir):
    """
    Import every bill JSON file found for a state, merging in votes that
    were loaded separately.

    state - state abbreviation; also the _id of its metadata document
    data_dir - base data directory; files are read from
               <data_dir>/<state>/bills/*.json

    Votes returned by import_votes are keyed by (chamber, session, bill_id)
    and attached to the matching bill as it is imported; any votes left
    unmatched after all bills are processed are reported on stdout.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})

    # Build a session to term mapping
    sessions = {}
    for term in meta['terms']:
        for session in term['sessions']:
            sessions[session] = term['name']

    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # move subjects to scraped_subjects (empty lists are not copied)
        subjects = data.pop('subjects', None)
        if subjects:
            data['scraped_subjects'] = subjects

        # add loaded votes to data
        bill_votes = votes.pop(
            (data['chamber'], data['session'], data['bill_id']), [])
        data['votes'].extend(bill_votes)

        bill = db.bills.find_one({
            'state': data['state'],
            'session': data['session'],
            'chamber': data['chamber'],
            'bill_id': data['bill_id']
        })

        # keep vote ids consistent with the previously stored bill
        vote_matcher = VoteMatcher(data['state'])
        if bill:
            vote_matcher.learn_vote_ids(bill['votes'])
        vote_matcher.set_vote_ids(data['votes'])

        # match sponsor leg_ids
        for sponsor in data['sponsors']:
            sponsor['leg_id'] = get_legislator_id(state, data['session'],
                                                  None, sponsor['name'])

        for vote in data['votes']:

            # committee_ids
            if 'committee' in vote:
                vote['committee_id'] = get_committee_id(state,
                                                        vote['chamber'],
                                                        vote['committee'])

            # vote leg_ids: replace bare names with name/leg_id records
            for vtype in ('yes_votes', 'no_votes', 'other_votes'):
                svlist = []
                for svote in vote[vtype]:
                    leg_id = get_legislator_id(state, data['session'],
                                               vote['chamber'], svote)
                    svlist.append({'name': svote, 'leg_id': leg_id})

                vote[vtype] = svlist

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get('alternate_titles', []))
        for version in data['versions']:
            if 'title' in version:
                alt_titles.add(version['title'])
            if '+short_title' in version:
                alt_titles.add(version['+short_title'])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data['title'])
        except KeyError:
            pass
        data['alternate_titles'] = list(alt_titles)

        # keywords are computed the same way for new and existing bills
        data['_keywords'] = list(bill_keywords(data))

        if not bill:
            insert_with_id(data)
        else:
            update(bill, data, db.bills)

    # parenthesized single-argument form prints the same under the print
    # statement and the print function
    print('imported %s bill files' % len(paths))

    # whatever is still in votes never matched an imported bill
    for remaining in votes:
        # key parts are forced to ASCII so that printing cannot fail on
        # non-ASCII characters
        print('Failed to match vote %s %s %s' % tuple(
            [r.encode('ascii', 'replace') for r in remaining]))

    populate_current_fields(state)
    ensure_indexes()
示例#23
0
    def scrape_senate_vote(self, bill, url):
        """
        Download a vote PDF, parse it and attach the resulting vote to bill.

        bill - bill object; its "bill_id" is used to locate the motion text
               and the parsed vote is added via bill.add_vote
        url - location of the vote PDF, also recorded as the vote's source

        The vote is skipped (a message is logged and None returned) when the
        date, time, yes/no totals or motion can't be found in the text.
        Raises AssertionError if the number of names parsed for a category
        doesn't match the total printed in the document.
        """
        (path, _) = self.urlretrieve(url)
        try:
            text = convert_pdf(path, "text")
        finally:
            # remove the downloaded file even if the conversion fails
            os.remove(path)

        lines = text.split("\n")

        date_match = re.search(r"Date:\s+(\d+/\d+/\d+)", text)
        if not date_match:
            self.log("Couldn't find date on %s" % url)
            return

        time_match = re.search(r"Time:\s+(\d+:\d+:\d+)\s+(AM|PM)", text)
        if not time_match:
            # handled like a missing date instead of failing on None
            self.log("Couldn't find time on %s" % url)
            return

        date = "%s %s %s" % (date_match.group(1), time_match.group(1),
                             time_match.group(2))
        date = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M:%S %p")
        date = self._tz.localize(date)

        vote_type = None
        yes_count, no_count, other_count = None, None, 0
        votes = []
        # lines before index 21 are skipped; NOTE(review): presumably the
        # document header - confirm against a sample PDF
        for line in lines[21:]:
            line = line.strip()
            if not line:
                continue

            if line.startswith("YEAS"):
                yes_count = int(line.split(" - ")[1])
                vote_type = "yes"
            elif line.startswith("NAYS"):
                no_count = int(line.split(" - ")[1])
                vote_type = "no"
            elif line.startswith(("EXCUSED", "NOT VOTING")):
                other_count += int(line.split(" - ")[1])
                vote_type = "other"
            else:
                # any other line holds names, separated by runs of 2+
                # spaces, belonging to the most recent category heading
                votes.extend([(n.strip(), vote_type)
                              for n in re.split(r"\s{2,}", line)])

        if yes_count is None or no_count is None:
            self.log("Couldne't find vote counts in %s" % url)
            return

        # passes only when yeas outnumber nays and other votes combined
        passed = yes_count > no_count + other_count

        # the motion is read from two lines below the (last) line that
        # consists solely of the cleaned bill id
        clean_bill_id = fix_bill_id(bill["bill_id"])
        motion_line = None
        for i, line in enumerate(lines):
            if line.strip() == clean_bill_id:
                motion_line = i + 2

        motion = None
        # guard against the bill id never appearing, or appearing too close
        # to the end of the text for a motion line to follow it
        if motion_line is not None and motion_line < len(lines):
            motion = lines[motion_line]
        if not motion:
            self.log("Couldn't find motion for %s" % url)
            return

        vote = Vote("upper", date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        for name, vtype in votes:
            if vtype == "yes":
                vote.yes(name)
            elif vtype == "no":
                vote.no(name)
            elif vtype == "other":
                vote.other(name)

        # sanity check: parsed names must agree with the printed totals
        assert yes_count == len(vote["yes_votes"])
        assert no_count == len(vote["no_votes"])
        assert other_count == len(vote["other_votes"])

        bill.add_vote(vote)