Exemplo n.º 1
0
def import_speeches(abbr, data_dir):
    """Re-import every scraped speech JSON file for ``abbr``.

    The files under ``<data_dir>/<abbr>/speeches/*.json`` are read twice.
    The first pass only collects the (session, chamber, record_id)
    combinations present in the dump and calls ``clear_old_speeches`` for
    each of them; the second pass hands every document to ``import_speech``.
    """
    speech_glob = os.path.join(data_dir, abbr, 'speeches', '*.json')

    # Pass 1: gather the record identifiers, grouped by session, without
    # keeping the documents themselves around.
    records_by_session = defaultdict(set)
    for json_path in glob.iglob(speech_glob):
        with open(json_path) as handle:
            speech = prepare_obj(json.load(handle))
        records_by_session[speech['session']].add(
            (speech['chamber'], speech['record_id']))

    # XXX (PRT): everything is cleared up front, before anything is
    #     re-imported.  If the import below fails part-way the stored data
    #     is left incomplete.  Clearing per record block while importing
    #     would avoid that; it is an open question whether that is worth it.
    for session, records in records_by_session.items():
        for chamber, record_id in records:
            clear_old_speeches(abbr, chamber, session, record_id)

    # Pass 2: parse the files again and import them.  Re-reading costs a few
    # milliseconds but avoids holding every document in memory.
    for json_path in glob.iglob(speech_glob):
        with open(json_path) as handle:
            speech = prepare_obj(json.load(handle))
        import_speech(speech)
Exemplo n.º 2
0
def import_committees(abbr, data_dir):
    """Import the scraped committee JSON files for ``abbr``.

    Steps, in order:

    1. Look up the jurisdiction's metadata document to find its level and
       the most recent term and session (the last entries of ``terms`` and
       of that term's ``sessions``).
    2. Reset ``members`` to an empty list on every stored committee of this
       jurisdiction.
    3. If no committee files were scraped, call
       ``import_committees_from_legislators`` instead.
    4. Pass each scraped file to ``import_committee``.
    5. Call ``link_parents`` and ``ensure_indexes``.

    Raises ``TypeError`` if no metadata document exists for ``abbr``
    (``find_one`` returns ``None``).
    """
    data_dir = os.path.join(data_dir, abbr)
    pattern = os.path.join(data_dir, 'committees', '*.json')

    meta = db.metadata.find_one({'_id': abbr})
    latest_term = meta['terms'][-1]
    current_term = latest_term['name']
    current_session = latest_term['sessions'][-1]
    level = meta['level']

    paths = glob.glob(pattern)

    # Membership is rebuilt from scratch on every import.
    for committee in db.committees.find({'level': level, level: abbr}):
        committee['members'] = []
        db.committees.save(committee, safe=True)

    # import committees from legislator roles, no standalone committees scraped
    if not paths:
        import_committees_from_legislators(current_term, level, abbr)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        import_committee(data, current_session, current_term)

    # Parenthesised single-argument form: same output on Python 2, and valid
    # syntax on Python 3.
    print('imported %s committee files' % len(paths))

    link_parents(level, abbr)

    ensure_indexes()
Exemplo n.º 3
0
def import_bills(abbr, data_dir):
    """Import the scraped bill JSON files for ``abbr`` and return counters.

    Returns a dict with the keys ``"update"``, ``"insert"`` and ``"total"``.
    ``"total"`` counts the files processed; the other two are incremented
    according to the value returned by ``import_bill`` for each file.

    Standalone votes are loaded first via ``import_votes`` and passed to
    ``import_bill``.  Whatever keys remain in that mapping afterwards are
    logged as unmatched (presumably ``import_bill`` removes the votes it
    consumes -- verify against its implementation).
    """
    data_dir = os.path.join(data_dir, abbr)
    counts = {"update": 0, "insert": 0, "total": 0}

    votes = import_votes(data_dir)

    # The subject categorizer is optional: any failure to build it is logged
    # and the import carries on without one.
    categorizer = None
    try:
        categorizer = SubjectCategorizer(abbr)
    except Exception as e:
        logger.debug("Proceeding without subject categorizer: %s" % e)

    bill_paths = glob.glob(os.path.join(data_dir, "bills", "*.json"))
    for bill_path in bill_paths:
        with open(bill_path) as handle:
            bill_data = prepare_obj(json.load(handle))

        counts["total"] += 1
        outcome = import_bill(bill_data, votes, categorizer)
        counts[outcome] += 1

    logger.info("imported %s bill files" % len(bill_paths))

    for vote_key in votes:
        ascii_parts = tuple(
            [part.encode("ascii", "replace") for part in vote_key])
        logger.debug("Failed to match vote %s %s %s" % ascii_parts)

    level = db.metadata.find_one({"_id": abbr})["level"]
    populate_current_fields(level, abbr)

    ensure_indexes()

    return counts
Exemplo n.º 4
0
def import_events(state, data_dir, import_actions=True):
    """Insert or update every scraped event JSON file for ``state``.

    Each file under ``<data_dir>/<state>/events/*.json`` is matched against
    ``db.events`` first by ``_guid`` (when the scraped document has one) and
    then by the combination of state, when, end, type and description.
    Unmatched documents get ``created_at``/``updated_at`` timestamps and are
    inserted; matched ones are passed to ``update``.

    ``import_actions`` is currently unused: the ``actions_to_events(state)``
    call that would depend on it is commented out in this version.
    """
    event_glob = os.path.join(data_dir, state, 'events', '*.json')

    for event_path in glob.iglob(event_glob):
        with open(event_path) as handle:
            scraped = prepare_obj(json.load(handle))

        existing = None

        # Preferred match: the GUID carried by the scraped document.
        if '_guid' in scraped:
            existing = db.events.find_one({'state': scraped['state'],
                                           '_guid': scraped['_guid']})

        # Fallback match: the descriptive fields of the event.
        if not existing:
            fallback_spec = {
                'state': scraped['state'],
                'when': scraped['when'],
                'end': scraped['end'],
                'type': scraped['type'],
                'description': scraped['description'],
            }
            existing = db.events.find_one(fallback_spec)

        if existing:
            update(existing, scraped, db.events)
            continue

        # New event: both timestamps share the same value.
        now = datetime.datetime.utcnow()
        scraped['created_at'] = now
        scraped['updated_at'] = now
        _insert_with_id(scraped)

    ensure_indexes()
Exemplo n.º 5
0
def import_legislator(data):
    """Insert a scraped legislator or merge it into the stored one.

    The scraped name is kept in ``_scraped_name``.  Every role has its
    ``role`` key renamed to ``type`` and receives the legislator's level
    plus, when present, its country and state.

    A stored legislator is considered the same person when it has the same
    level, jurisdiction and scraped name, and one of its roles matches the
    first scraped role (same type, optional district/chamber) in the
    current, previous or next term.  When the stored roles belong to the
    previous term they are archived under ``old_roles``; when they belong to
    the next term the scraped roles are archived instead and the stored
    roles are kept.
    """
    data = prepare_obj(data)
    data['_scraped_name'] = data['full_name']

    for role in data['roles']:
        # Rename 'role' -> 'type'
        if 'role' in role:
            role['type'] = role.pop('role')

        # copy over country and/or state into role
        # TODO: base this on all possible level fields
        role['level'] = data['level']
        for field in ('country', 'state'):
            if field in data:
                role[field] = data[field]

    cur_role = data['roles'][0]
    term = cur_role['term']

    level = data['level']
    abbrev = data[level]

    prev_term = get_previous_term(abbrev, term)
    next_term = get_next_term(abbrev, term)

    role_spec = {level: abbrev,
                 'type': cur_role['type'],
                 'term': {'$in': [term, prev_term, next_term]}}
    for optional in ('district', 'chamber'):
        if optional in cur_role:
            role_spec[optional] = cur_role[optional]

    leg = db.legislators.find_one({
        'level': level,
        level: abbrev,
        '_scraped_name': data['full_name'],
        'roles': {'$elemMatch': role_spec},
    })

    if not leg:
        insert_with_id(data)
        return

    leg.setdefault('old_roles', {})

    stored_term = leg['roles'][0]['term']
    if stored_term == prev_term:
        # Move to old
        leg['old_roles'][stored_term] = leg['roles']
    elif stored_term == next_term:
        # The stored roles are newer: archive the scraped ones and keep the
        # stored ones as current.
        leg['old_roles'][term] = data['roles']
        data['roles'] = leg['roles']

    update(leg, data, db.legislators)
Exemplo n.º 6
0
def import_events(abbr, data_dir, import_actions=False):
    """Feed every scraped event JSON file for ``abbr`` to ``import_event``.

    Files are read from ``<data_dir>/<abbr>/events/*.json``.
    ``import_actions`` is not used by this function.
    """
    events_dir = os.path.join(data_dir, abbr, 'events')

    for event_path in glob.iglob(os.path.join(events_dir, '*.json')):
        with open(event_path) as handle:
            # The event is imported while the file handle is still open.
            import_event(prepare_obj(json.load(handle)))

    ensure_indexes()
Exemplo n.º 7
0
Arquivo: events.py Projeto: JT5D/billy
def import_events(abbr, data_dir, import_actions=False):
    """Resolve participants and related bills of each event, then import it.

    For every file under ``<data_dir>/<abbr>/events/*.json``:

    * each participant gets an ``id`` looked up according to its
      ``participant_type`` (``committee`` or ``legislator``); unknown types
      are logged and get ``None``;
    * each related bill keeps its scraped identifier in
      ``_scraped_bill_id`` and has ``bill_id`` replaced by the ``_id`` of
      the matching stored bill (matched on ``bill_id`` or
      ``alternate_bill_ids`` within the event's session), or ``None`` when
      no bill is found;
    * the event is passed to ``import_event``.

    ``import_actions`` is not used by this function.
    """

    def _committee_id(event, entity):
        return get_committee_id(event[settings.LEVEL_FIELD],
                                entity["chamber"],
                                entity["participant"])

    def _legislator_id(event, entity):
        # Only the two known chambers are passed on; anything else is None.
        chamber = entity["chamber"]
        if chamber not in ["upper", "lower"]:
            chamber = None
        return get_legislator_id(abbr, event["session"], chamber,
                                 entity["participant"])

    resolvers = {"committee": _committee_id, "legislator": _legislator_id}

    event_glob = os.path.join(data_dir, abbr, "events", "*.json")

    for event_path in glob.iglob(event_glob):
        with open(event_path) as handle:
            event = prepare_obj(json.load(handle))

        for participant in event["participants"]:
            kind = participant["participant_type"]
            resolver = resolvers.get(kind)
            if resolver is None:
                logger.warning("I don't know how to resolve a %s" % kind)
                resolved = None
            else:
                resolved = resolver(event, participant)
            participant["id"] = resolved

        for related in event["related_bills"]:
            related["_scraped_bill_id"] = related["bill_id"]
            bill_id = fix_bill_id(related["bill_id"])
            related["bill_id"] = ""

            scope = {settings.LEVEL_FIELD: abbr, "session": event["session"]}
            db_bill = db.bills.find_one({
                "$or": [
                    dict(scope, bill_id=bill_id),
                    dict(scope, alternate_bill_ids=bill_id),
                ]
            })

            if not db_bill:
                logger.warning("Error: Can't find %s" % bill_id)
                db_bill = {"_id": None}

            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            related["bill_id"] = db_bill["_id"]

        import_event(event)

    ensure_indexes()
Exemplo n.º 8
0
def import_votes(state, data_dir):
    """Merge the scraped standalone vote files for ``state`` into their bills.

    For each file under ``<data_dir>/<state>/votes/*.json`` the matching
    bill is looked up by state, bill chamber, session and normalised bill
    id.  Votes whose bill cannot be found are logged and skipped.  The names
    in ``yes_votes``/``no_votes``/``other_votes`` are replaced by
    ``{'name': ..., 'leg_id': ...}`` dicts.  A vote with the same motion and
    date as one already on the bill updates that vote in place; otherwise it
    is appended.  The bill is saved after each vote.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'votes', '*.json')

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id, needs to match the one already in the database;
        # it is only used for the lookup and is not stored on the vote
        bill_id = fix_bill_id(data.pop('bill_id'))

        bill = db.bills.find_one({'state': state,
                                  'chamber': data['bill_chamber'],
                                  'session': data['session'],
                                  'bill_id': bill_id})

        if not bill:
            _log.warning("Couldn't find bill %s" % bill_id)
            continue

        # 'filename' is optional and is not stored on the vote either.
        data.pop('filename', None)

        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            data[vtype] = [
                {'name': voter_name,
                 'leg_id': get_legislator_id(state, data['session'],
                                             data['chamber'], voter_name)}
                for voter_name in data[vtype]]

        for vote in bill['votes']:
            if (vote['motion'] == data['motion']
                    and vote['date'] == data['date']):
                vote.update(data)
                break
        else:
            bill['votes'].append(data)

        db.bills.save(bill, safe=True)

    # Parenthesised single-argument form: same output on Python 2, and valid
    # syntax on Python 3.
    print('imported %s vote files' % len(paths))
Exemplo n.º 9
0
def import_metadata(state, data_dir):
    """Load ``state_metadata.json`` for ``state`` and save it to ``db.metadata``.

    Fields named in ``PRESERVED_FIELDS`` that exist on the currently stored
    metadata document overwrite the freshly scraped values, so they survive
    a re-import.  The saved document always has ``_type`` set to
    ``'metadata'`` before ``prepare_obj`` runs and ``_id`` set to ``state``.
    """
    stored = db.metadata.find_one({'_id': state}) or {}
    carried_over = {name: stored[name]
                    for name in PRESERVED_FIELDS if name in stored}

    metadata_path = os.path.join(data_dir, state, 'state_metadata.json')
    with open(metadata_path) as handle:
        raw = json.load(handle)
    raw['_type'] = 'metadata'
    metadata = prepare_obj(raw)

    metadata['_id'] = state
    metadata.update(carried_over)
    db.metadata.save(metadata, safe=True)
Exemplo n.º 10
0
def import_votes(state, data_dir):
    """Load the standalone vote files in ``data_dir`` grouped by bill.

    Reads ``<data_dir>/votes/*.json`` (``data_dir`` is used as given; it is
    not joined with ``state``, which this function does not use) and returns
    a ``defaultdict(list)`` mapping
    ``(bill_chamber, session, normalised bill_id)`` to the list of vote
    documents for that bill.  The ``bill_id`` key is removed from each vote.
    """
    pattern = os.path.join(data_dir, 'votes', '*.json')
    paths = glob.glob(pattern)

    votes = defaultdict(list)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # need to match bill_id already in the database
        bill_id = fix_bill_id(data.pop('bill_id'))

        votes[(data['bill_chamber'], data['session'], bill_id)].append(data)

    # Parenthesised single-argument form: same output on Python 2, and valid
    # syntax on Python 3.
    print('imported %s vote files' % len(paths))
    return votes
Exemplo n.º 11
0
def import_votes(data_dir):
    """Load the standalone vote files in ``data_dir`` grouped by bill.

    Reads ``<data_dir>/votes/*.json`` and returns a ``defaultdict(list)``
    mapping ``(bill_chamber, session, normalised bill_id)`` to the list of
    vote documents for that bill.  The ``bill_id`` key is removed from each
    vote document.
    """
    vote_paths = glob.glob(os.path.join(data_dir, "votes", "*.json"))
    grouped = defaultdict(list)

    for vote_path in vote_paths:
        with open(vote_path) as handle:
            vote = prepare_obj(json.load(handle))

        # need to match bill_id already in the database
        bill_id = fix_bill_id(vote.pop("bill_id"))

        bill_key = (vote["bill_chamber"], vote["session"], bill_id)
        grouped[bill_key].append(vote)

    logger.info("imported %s vote files" % len(vote_paths))
    return grouped
Exemplo n.º 12
0
def import_events(abbr, data_dir, import_actions=False):
    """Resolve committees and related bills of each event, then import it.

    For every file under ``<data_dir>/<abbr>/events/*.json``:

    * each participant gets a ``committee_id`` when ``get_committee_id``
      returns a truthy value for it;
    * each related bill keeps its scraped identifier in
      ``_scraped_bill_id`` and has ``bill_id`` replaced by the ``_id`` of
      the matching stored bill (matched on ``bill_id`` or
      ``alternate_bill_ids`` within the event's session), or ``None`` when
      no bill is found;
    * the event is passed to ``import_event``.

    ``import_actions`` is not used by this function.
    """
    event_glob = os.path.join(data_dir, abbr, 'events', '*.json')

    for event_path in glob.iglob(event_glob):
        with open(event_path) as handle:
            event = prepare_obj(json.load(handle))

        for participant in event['participants']:
            committee_id = get_committee_id(event['level'],
                                            event['state'],
                                            participant['participant'],
                                            participant['chamber'])
            if committee_id:
                participant['committee_id'] = committee_id

        for related in event['related_bills']:
            related['_scraped_bill_id'] = related['bill_id']
            bill_id = fix_bill_id(related['bill_id'])
            related['bill_id'] = ""

            scope = {"state": abbr, 'session': event['session']}
            db_bill = db.bills.find_one({
                "$or": [
                    dict(scope, bill_id=bill_id),
                    dict(scope, alternate_bill_ids=bill_id),
                ]
            })

            if not db_bill:
                logger.warning("Error: Can't find %s" % bill_id)
                db_bill = {'_id': None}

            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            related['bill_id'] = db_bill['_id']

        import_event(event)

    ensure_indexes()
Exemplo n.º 13
0
def import_metadata(abbr, data_dir):
    """Load ``metadata.json`` for ``abbr`` and save it to ``db.metadata``.

    Fields named in ``PRESERVED_FIELDS`` that exist on the currently stored
    metadata document overwrite the freshly scraped values, so they survive
    a re-import.  The saved document has ``_id`` set to ``abbr`` and
    ``latest_update`` set to the current UTC time.
    """
    stored = db.metadata.find_one({'_id': abbr}) or {}
    carried_over = {name: stored[name]
                    for name in PRESERVED_FIELDS if name in stored}

    metadata_path = os.path.join(data_dir, abbr, 'metadata.json')
    with open(metadata_path) as handle:
        raw = json.load(handle)
    raw['_type'] = 'metadata'
    metadata = prepare_obj(raw)

    metadata['_id'] = abbr
    metadata.update(carried_over)
    metadata['latest_update'] = datetime.datetime.utcnow()

    db.metadata.save(metadata, safe=True)
Exemplo n.º 14
0
def import_legislator(data):
    """Insert a scraped legislator or merge it into the stored one.

    The scraped name is kept in ``_scraped_name`` and every role has its
    ``role`` key renamed to ``type``.

    A stored legislator is considered the same person when it has the same
    state and scraped name, and one of its roles matches the first scraped
    role (same type, optional district/chamber) in the current, previous or
    next term.  When the stored roles belong to the previous term they are
    archived under ``old_roles``; when they belong to the next term the
    scraped roles are archived instead and the stored roles are kept.
    """
    data = prepare_obj(data)
    data['_scraped_name'] = data['full_name']

    # Rename 'role' -> 'type'
    for role in data['roles']:
        if 'role' in role:
            role['type'] = role.pop('role')

    cur_role = data['roles'][0]
    term = cur_role['term']
    prev_term = get_previous_term(data['state'], term)
    next_term = get_next_term(data['state'], term)

    role_spec = {'state': data['state'],
                 'type': cur_role['type'],
                 'term': {'$in': [term, prev_term, next_term]}}
    for optional in ('district', 'chamber'):
        if optional in cur_role:
            role_spec[optional] = cur_role[optional]

    leg = db.legislators.find_one({
        'state': data['state'],
        '_scraped_name': data['full_name'],
        'roles': {'$elemMatch': role_spec},
    })

    if not leg:
        insert_with_id(data)
        return

    leg.setdefault('old_roles', {})

    stored_term = leg['roles'][0]['term']
    if stored_term == prev_term:
        # Move to old
        leg['old_roles'][stored_term] = leg['roles']
    elif stored_term == next_term:
        # The stored roles are newer: archive the scraped ones and keep the
        # stored ones as current.
        leg['old_roles'][term] = data['roles']
        data['roles'] = leg['roles']

    update(leg, data, db.legislators)
Exemplo n.º 15
0
def import_bills(abbr, data_dir):
    """Import the scraped bill JSON files for ``abbr`` and return counters.

    Returns a dict with the keys ``"update"``, ``"insert"`` and ``"total"``.
    ``"total"`` counts the files processed; the other two are incremented
    according to the value returned by ``import_bill`` for each file.

    The import is bracketed by ``git_prelod(abbr)`` before any file is read
    and ``git_commit("Import Update")`` after the current fields have been
    populated.  Standalone votes are loaded via ``load_standalone_votes``;
    keys still present in that mapping after all bills are processed are
    logged as unmatched.
    """
    data_dir = os.path.join(data_dir, abbr)

    git_prelod(abbr)

    counts = {"update": 0, "insert": 0, "total": 0}

    votes = load_standalone_votes(data_dir)

    # The subject categorizer is optional: any failure to build it is logged
    # and the import carries on without one.
    categorizer = None
    try:
        categorizer = SubjectCategorizer(abbr)
    except Exception as e:
        logger.debug('Proceeding without subject categorizer: %s' % e)

    bill_paths = glob.glob(os.path.join(data_dir, 'bills', '*.json'))
    for bill_path in bill_paths:
        with open(bill_path) as handle:
            bill_data = prepare_obj(json.load(handle))

        counts["total"] += 1
        outcome = import_bill(bill_data, votes, categorizer)
        counts[outcome] += 1

    logger.info('imported %s bill files' % len(bill_paths))

    for vote_key in votes:
        ascii_parts = tuple(
            [part.encode('ascii', 'replace') for part in vote_key])
        logger.debug('Failed to match vote %s %s %s' % ascii_parts)

    populate_current_fields(abbr)

    git_commit("Import Update")

    ensure_indexes()

    return counts
Exemplo n.º 16
0
def import_committees(abbr, data_dir):
    """Import the scraped committee JSON files for ``abbr``; return counters.

    Returns a dict with the keys ``"update"``, ``"insert"`` and ``"total"``.
    ``"total"`` counts the files processed; the other two are incremented
    according to the value returned by ``import_committee`` for each file.

    Before importing, ``members`` is reset to an empty list on every stored
    committee of this jurisdiction.  When no committee files were scraped,
    ``import_committees_from_legislators`` is called instead.
    """
    committee_glob = os.path.join(data_dir, abbr, 'committees', '*.json')
    counts = {"update": 0, "insert": 0, "total": 0}

    # The current term/session are the last ones listed in the metadata.
    latest_term = db.metadata.find_one({'_id': abbr})['terms'][-1]
    current_term = latest_term['name']
    current_session = latest_term['sessions'][-1]

    committee_paths = glob.glob(committee_glob)

    # Membership is rebuilt from scratch on every import.
    for stored in db.committees.find({settings.LEVEL_FIELD: abbr}):
        stored['members'] = []
        db.committees.save(stored, safe=True)

    # import committees from legislator roles, no standalone committees scraped
    if not committee_paths:
        import_committees_from_legislators(current_term, abbr)

    for committee_path in committee_paths:
        with open(committee_path) as handle:
            committee_data = prepare_obj(json.load(handle))

        counts["total"] += 1
        outcome = import_committee(committee_data, current_session,
                                   current_term)
        counts[outcome] += 1

    logger.info('imported %s committee files' % len(committee_paths))

    link_parents(abbr)

    ensure_indexes()
    return counts
Exemplo n.º 17
0
def import_bills(abbr, data_dir):
    """Import the scraped bill JSON files for ``abbr``.

    Standalone votes are loaded first via ``import_votes`` and passed to
    ``import_bill`` together with each bill document.  Keys still present in
    the votes mapping after all bills are processed are printed as
    unmatched.  Finally the current fields are populated for the
    jurisdiction's level and the indexes are ensured.
    """
    data_dir = os.path.join(data_dir, abbr)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    votes = import_votes(data_dir)

    paths = glob.glob(pattern)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        import_bill(data, votes)

    # Parenthesised single-argument form: same output on Python 2, and valid
    # syntax on Python 3.
    print('imported %s bill files' % len(paths))

    for remaining in votes:
        print('Failed to match vote %s %s %s' % tuple([
            r.encode('ascii', 'replace') for r in remaining]))

    meta = db.metadata.find_one({'_id': abbr})
    level = meta['level']
    populate_current_fields(level, abbr)

    ensure_indexes()
Exemplo n.º 18
0
def _committees_from_legislator_roles(state, current_term):
    """Create/fill committees from legislators' 'committee member' roles."""
    for legislator in db.legislators.find({
            'roles': {'$elemMatch': {'term': current_term,
                                     'state': state}}}):

        for role in legislator['roles']:
            # Only committee-member roles not yet linked to a committee.
            if (role['type'] != 'committee member' or
                    'committee_id' in role):
                continue

            spec = {'state': role['state'],
                    'chamber': role['chamber'],
                    'committee': role['committee']}
            if 'subcommittee' in role:
                spec['subcommittee'] = role['subcommittee']

            committee = db.committees.find_one(spec)

            if not committee:
                # No such committee yet: the lookup spec becomes the new
                # committee document.
                committee = spec
                committee['_type'] = 'committee'
                committee['members'] = []
                committee['sources'] = []
                if 'subcommittee' not in committee:
                    committee['subcommittee'] = None
                insert_with_id(committee)

            already_member = any(
                member['leg_id'] == legislator['leg_id']
                for member in committee['members'])
            if not already_member:
                committee['members'].append(
                    {'name': legislator['full_name'],
                     'leg_id': legislator['leg_id'],
                     'role': role.get('position') or 'member'})
                db.committees.save(committee, safe=True)

                # NOTE(review): the role is linked only when the member was
                # newly added; a role whose legislator is already listed is
                # left without 'committee_id'.  Kept as-is -- confirm intent.
                role['committee_id'] = committee['_id']

        db.legislators.save(legislator, safe=True)


def _import_committee_document(state, data, current_session, current_term):
    """Insert/update one scraped committee and link its members' roles."""
    spec = {'state': state,
            'chamber': data['chamber'],
            'committee': data['committee']}
    if 'subcommittee' in data:
        spec['subcommittee'] = data['subcommittee']

    committee = db.committees.find_one(spec)

    if not committee:
        insert_with_id(data)
        committee = data
    else:
        update(committee, data, db.committees)

    for member in committee['members']:
        if not member['name']:
            continue

        leg_id = get_legislator_id(state, current_session,
                                   data['chamber'],
                                   member['name'])

        if not leg_id:
            print("No matches for %s" % member['name'].encode(
                'ascii', 'ignore'))
            member['leg_id'] = None
            continue

        legislator = db.legislators.find_one({'_id': leg_id})

        member['leg_id'] = leg_id

        for role in legislator['roles']:
            # .get(): committee-member roles without a 'committee_id' exist
            # (the legislator-role import explicitly checks for them) and
            # must not raise KeyError here.
            if (role['type'] == 'committee member' and
                    role['term'] == current_term and
                    role.get('committee_id') == committee['_id']):
                break
        else:
            new_role = {'type': 'committee member',
                        'committee': committee['committee'],
                        'term': current_term,
                        'chamber': committee['chamber'],
                        'committee_id': committee['_id'],
                        'state': state}
            if 'subcommittee' in committee:
                new_role['subcommittee'] = committee['subcommittee']
            legislator['roles'].append(new_role)
            legislator['updated_at'] = datetime.datetime.utcnow()
            db.legislators.save(legislator, safe=True)

    db.committees.save(committee, safe=True)


def import_committees(state, data_dir):
    """Import committees for ``state`` from scraped files or legislator roles.

    Steps, in order:

    1. Find the most recent term and session in the state's metadata (the
       last entries of ``terms`` and of that term's ``sessions``).
    2. Reset ``members`` to an empty list on every stored committee of the
       state.
    3. If no files exist under ``<data_dir>/<state>/committees/*.json``,
       derive committees and memberships from the 'committee member' roles
       of legislators that have a role in the current term.
    4. Otherwise insert/update each scraped committee, resolve its members
       to legislator ids and add a committee-member role for the current
       term to each matched legislator that lacks one.
    5. Call ``link_parents`` and ``ensure_indexes``.

    Raises ``TypeError`` if no metadata document exists for ``state``
    (``find_one`` returns ``None``).
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'committees', '*.json')

    meta = db.metadata.find_one({'_id': state})
    current_term = meta['terms'][-1]['name']
    current_session = meta['terms'][-1]['sessions'][-1]

    paths = glob.glob(pattern)

    # Membership is rebuilt from scratch on every import.  safe=True matches
    # every other save performed by this importer.
    for committee in db.committees.find({'state': state}):
        committee['members'] = []
        db.committees.save(committee, safe=True)

    if not paths:
        # Not standalone committees
        _committees_from_legislator_roles(state, current_term)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        _import_committee_document(state, data, current_session,
                                   current_term)

    # Parenthesised single-argument form: same output on Python 2, and valid
    # syntax on Python 3.
    print('imported %s committee files' % len(paths))

    link_parents(state)

    ensure_indexes()
Exemplo n.º 19
0
def import_bills(state, data_dir):
    """Insert or update every scraped bill JSON file for ``state``.

    For each file under ``<data_dir>/<state>/bills/*.json``:

    * the bill id is normalised with ``fix_bill_id``;
    * a non-empty ``subjects`` list is moved to ``scraped_subjects``;
    * sponsors and the names in each vote's ``yes_votes``/``no_votes``/
      ``other_votes`` are resolved to legislator ids, and votes carrying a
      ``committee`` get a ``committee_id``;
    * ``_term`` is set from the session-to-term mapping built from the
      state's metadata (``KeyError`` if the bill's session is not listed);
    * titles found on versions are merged into ``alternate_titles``, with
      the primary title removed;
    * ``_keywords`` is computed and the bill is inserted, or merged into the
      stored bill with the same state, session, chamber and bill id.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})

    # Build a session to term mapping
    sessions = {}
    for term in meta['terms']:
        for session in term['sessions']:
            sessions[session] = term['name']

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # Empty subject lists are dropped rather than copied.
        subjects = data.pop('subjects', None)
        if subjects:
            data['scraped_subjects'] = subjects

        bill = db.bills.find_one({'state': data['state'],
                                  'session': data['session'],
                                  'chamber': data['chamber'],
                                  'bill_id': data['bill_id']})

        for sponsor in data['sponsors']:
            sponsor['leg_id'] = get_legislator_id(state, data['session'],
                                                  None, sponsor['name'])

        for vote in data['votes']:
            if 'committee' in vote:
                vote['committee_id'] = get_committee_id(state,
                                                        vote['chamber'],
                                                        vote['committee'])

            for vtype in ('yes_votes', 'no_votes', 'other_votes'):
                vote[vtype] = [
                    {'name': voter_name,
                     'leg_id': get_legislator_id(state, data['session'],
                                                 vote['chamber'],
                                                 voter_name)}
                    for voter_name in vote[vtype]]

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get('alternate_titles', []))
        for version in data['versions']:
            for title_key in ('title', '+short_title'):
                if title_key in version:
                    alt_titles.add(version[title_key])
        # Make sure the primary title isn't included in the alternate title
        # list.  A bill without a 'title' key is tolerated, as before, but
        # explicitly rather than through a swallowed KeyError.
        if 'title' in data:
            alt_titles.discard(data['title'])
        data['alternate_titles'] = list(alt_titles)

        data['_keywords'] = list(bill_keywords(data))
        if bill:
            update(bill, data, db.bills)
        else:
            insert_with_id(data)

    # Parenthesised single-argument form: same output on Python 2, and valid
    # syntax on Python 3.
    print('imported %s bill files' % len(paths))

    populate_current_fields(state)
    ensure_indexes()
Exemplo n.º 20
0
def import_events(abbr, data_dir, import_actions=False):
    """Resolve participants and related bills of each event, then import it.

    For every file under ``<data_dir>/<abbr>/events/*.json``:

    * each participant gets an ``id`` looked up according to its
      ``participant_type`` (``committee`` or ``legislator``); unknown types
      are logged and get ``None``;
    * each related bill gets an ``id`` holding the ``_id`` of the matching
      stored bill (matched on ``bill_id`` or ``alternate_bill_ids`` within
      the event's session), or ``None`` when no bill is found; the scraped
      ``bill_id`` is left untouched;
    * the event is passed to ``import_event``.

    ``import_actions`` is not used by this function.
    """

    def _committee_id(event, entity):
        return get_committee_id(event[settings.LEVEL_FIELD],
                                entity['chamber'],
                                entity['participant'])

    def _legislator_id(event, entity):
        # Only the two known chambers are passed on; anything else is None.
        chamber = entity['chamber']
        if chamber not in ['upper', 'lower']:
            chamber = None
        return get_legislator_id(abbr,
                                 event['session'],
                                 chamber,
                                 entity['participant'])

    resolvers = {
        "committee": _committee_id,
        "legislator": _legislator_id,
    }

    event_glob = os.path.join(data_dir, abbr, 'events', '*.json')

    for event_path in glob.iglob(event_glob):
        with open(event_path) as handle:
            event = prepare_obj(json.load(handle))

        for participant in event['participants']:
            kind = participant['participant_type']
            resolver = resolvers.get(kind)
            if resolver is None:
                logger.warning("I don't know how to resolve a %s" % kind)
                resolved = None
            else:
                resolved = resolver(event, participant)
            participant['id'] = resolved

        for related in event['related_bills']:
            bill_id = fix_bill_id(related['bill_id'])

            scope = {settings.LEVEL_FIELD: abbr, 'session': event['session']}
            db_bill = db.bills.find_one({
                "$or": [
                    dict(scope, bill_id=bill_id),
                    dict(scope, alternate_bill_ids=bill_id),
                ]
            })

            if not db_bill:
                logger.warning("Error: Can't find %s" % bill_id)
                db_bill = {'_id': None}

            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            related['id'] = db_bill['_id']

        import_event(event)

    ensure_indexes()
Exemplo n.º 21
0
def import_bills(state, data_dir):
    """Insert or update every scraped bill JSON file for ``state``.

    Standalone votes are loaded first via ``import_votes``; the votes keyed
    by a bill's (chamber, session, bill_id) are removed from that mapping
    and appended to the bill.  Keys left over once all bills have been
    processed are printed as unmatched.

    For each file under ``<data_dir>/<state>/bills/*.json``:

    * the bill id is normalised with ``fix_bill_id``;
    * a non-empty ``subjects`` list is moved to ``scraped_subjects``;
    * a ``VoteMatcher`` is taught the vote ids of the stored bill (if any)
      and then sets ids on the scraped votes;
    * sponsors and the names in each vote's ``yes_votes``/``no_votes``/
      ``other_votes`` are resolved to legislator ids, and votes carrying a
      ``committee`` get a ``committee_id``;
    * ``_term`` is set from the session-to-term mapping built from the
      state's metadata (``KeyError`` if the bill's session is not listed);
    * titles found on versions are merged into ``alternate_titles``, with
      the primary title removed;
    * ``_keywords`` is computed and the bill is inserted, or merged into the
      stored bill with the same state, session, chamber and bill id.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})

    # Build a session to term mapping
    sessions = {}
    for term in meta['terms']:
        for session in term['sessions']:
            sessions[session] = term['name']

    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # move subjects to scraped_subjects
        subjects = data.pop('subjects', None)

        # NOTE: intentionally doesn't copy blank lists of subjects
        # this avoids the problem where a bill is re-run but we can't
        # get subjects anymore (quite common in fact)
        if subjects:
            data['scraped_subjects'] = subjects

        # add loaded votes to data
        bill_votes = votes.pop((data['chamber'], data['session'],
                                data['bill_id']), [])
        data['votes'].extend(bill_votes)

        bill = db.bills.find_one({'state': data['state'],
                                  'session': data['session'],
                                  'chamber': data['chamber'],
                                  'bill_id': data['bill_id']})

        vote_matcher = VoteMatcher(data['state'])
        if bill:
            vote_matcher.learn_vote_ids(bill['votes'])
        vote_matcher.set_vote_ids(data['votes'])

        # match sponsor leg_ids
        for sponsor in data['sponsors']:
            sponsor['leg_id'] = get_legislator_id(state, data['session'],
                                                  None, sponsor['name'])

        for vote in data['votes']:

            # committee_ids
            if 'committee' in vote:
                vote['committee_id'] = get_committee_id(state,
                                                        vote['chamber'],
                                                        vote['committee'])

            # vote leg_ids
            for vtype in ('yes_votes', 'no_votes', 'other_votes'):
                vote[vtype] = [
                    {'name': voter_name,
                     'leg_id': get_legislator_id(state, data['session'],
                                                 vote['chamber'],
                                                 voter_name)}
                    for voter_name in vote[vtype]]

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get('alternate_titles', []))
        for version in data['versions']:
            for title_key in ('title', '+short_title'):
                if title_key in version:
                    alt_titles.add(version[title_key])
        # Make sure the primary title isn't included in the alternate title
        # list.  A bill without a 'title' key is tolerated, as before, but
        # explicitly rather than through a swallowed KeyError.
        if 'title' in data:
            alt_titles.discard(data['title'])
        data['alternate_titles'] = list(alt_titles)

        data['_keywords'] = list(bill_keywords(data))
        if bill:
            update(bill, data, db.bills)
        else:
            insert_with_id(data)

    # Parenthesised single-argument form: same output on Python 2, and valid
    # syntax on Python 3.
    print('imported %s bill files' % len(paths))

    for remaining in votes:
        print('Failed to match vote %s %s %s' % tuple([
            r.encode('ascii', 'replace') for r in remaining]))

    populate_current_fields(state)
    ensure_indexes()
Exemplo n.º 22
0
def import_bills(state, data_dir):
    """Import every scraped bill JSON file for ``state``.

    Reads ``<data_dir>/<state>/bills/*.json``, attaches the votes returned
    by ``import_votes``, resolves legislator and committee ids, and then
    either inserts the bill (``insert_with_id``) or merges it into the
    matching ``db.bills`` document (``update``).

    state -- abbreviation used as the metadata ``_id`` and for id lookups
    data_dir -- base directory holding one sub-directory per abbreviation
    """
    data_dir = os.path.join(data_dir, state)
    bill_glob = os.path.join(data_dir, "bills", "*.json")

    meta = db.metadata.find_one({"_id": state})

    # Map each session to the name of the term that contains it.
    term_for_session = dict(
        (session_name, term["name"])
        for term in meta["terms"]
        for session_name in term["sessions"]
    )

    votes = import_votes(state, data_dir)

    paths = glob.glob(bill_glob)

    for path in paths:
        with open(path) as handle:
            data = prepare_obj(json.load(handle))

        # Normalize the bill_id before it is used as a lookup key.
        data["bill_id"] = fix_bill_id(data["bill_id"])

        # Subjects live under scraped_subjects.  An empty/missing list is
        # deliberately NOT copied: a re-run often cannot fetch subjects
        # any more and must not wipe the ones already stored.
        scraped_subjects = data.pop("subjects", None)
        if scraped_subjects:
            data["scraped_subjects"] = scraped_subjects

        # Attach the separately loaded votes that belong to this bill.
        vote_key = (data["chamber"], data["session"], data["bill_id"])
        loaded_votes = votes.pop(vote_key, [])
        data["votes"].extend(loaded_votes)

        bill = db.bills.find_one(
            {
                "state": data["state"],
                "session": data["session"],
                "chamber": data["chamber"],
                "bill_id": data["bill_id"],
            }
        )

        # Let the matcher see the stored votes (if any) before it assigns
        # ids to the freshly scraped ones.
        matcher = VoteMatcher(data["state"])
        if bill:
            matcher.learn_vote_ids(bill["votes"])
        matcher.set_vote_ids(data["votes"])

        # Resolve sponsor names to legislator ids (no chamber restriction).
        for sponsor in data["sponsors"]:
            sponsor["leg_id"] = get_legislator_id(
                state, data["session"], None, sponsor["name"]
            )

        for vote in data["votes"]:
            # Resolve the committee name, when the vote carries one.
            if "committee" in vote:
                vote["committee_id"] = get_committee_id(
                    state, vote["chamber"], vote["committee"]
                )

            # Replace each list of voter names with name/leg_id records.
            for vtype in ("yes_votes", "no_votes", "other_votes"):
                vote[vtype] = [
                    {
                        "name": voter,
                        "leg_id": get_legislator_id(
                            state, data["session"], vote["chamber"], voter
                        ),
                    }
                    for voter in vote[vtype]
                ]

        data["_term"] = term_for_session[data["session"]]

        # Fold every version title into the alternate_titles list.
        alt_titles = set(data.get("alternate_titles", []))
        for version in data["versions"]:
            for title_key in ("title", "+short_title"):
                if title_key in version:
                    alt_titles.add(version[title_key])
        try:
            # The primary title must not also appear as an alternate.
            alt_titles.remove(data["title"])
        except KeyError:
            pass
        data["alternate_titles"] = list(alt_titles)

        # Keywords are computed the same way for new and existing bills.
        data["_keywords"] = list(bill_keywords(data))
        if bill:
            update(bill, data, db.bills)
        else:
            insert_with_id(data)

    print("imported %s bill files" % len(paths))

    # Whatever is still in ``votes`` never found its bill.
    for unmatched in votes.keys():
        print(
            "Failed to match vote %s %s %s"
            % tuple(part.encode("ascii", "replace") for part in unmatched)
        )

    populate_current_fields(state)
    ensure_indexes()
Exemplo n.º 23
0
def import_legislator(data):
    """Insert or update one scraped legislator in ``db.legislators``.

    The scraped record is matched against stored legislators by level
    abbreviation, ``_scraped_name`` and the first scraped role, trying in
    order:

    1. a current role in the same term,
    2. an ``old_roles`` entry for the scraped term,
    3. a current role in any term.

    Depending on which lookup hits, the scraped roles are either kept as
    the current ``roles`` or filed under ``old_roles[<term>]``.

    data -- scraped legislator dict; must contain ``roles`` (non-empty),
            ``settings.LEVEL_FIELD`` and ``full_name`` or ``_scraped_name``

    Returns ``"update"`` when an existing document was matched, otherwise
    ``"insert"``.
    """
    data = prepare_obj(data)

    if data.get('_scraped_name') is None:
        data['_scraped_name'] = data['full_name']

    # Rename 'role' -> 'type'
    for role in data['roles']:
        if 'role' in role:
            role['type'] = role.pop('role')

        # copy over LEVEL_FIELD into role
        if settings.LEVEL_FIELD in data:
            role[settings.LEVEL_FIELD] = data[settings.LEVEL_FIELD]

    # Matching is driven by the first scraped role only.
    scraped_role = data['roles'][0]
    scraped_term = scraped_role['term']

    abbr = data[settings.LEVEL_FIELD]

    spec = {settings.LEVEL_FIELD: abbr,
            'type': scraped_role['type'],
            'term': scraped_term}
    if 'district' in scraped_role:
        spec['district'] = scraped_role['district']
    if 'chamber' in scraped_role:
        spec['chamber'] = scraped_role['chamber']

    # find matching legislator in current term
    leg = db.legislators.find_one(
        {settings.LEVEL_FIELD: abbr,
         '_scraped_name': data['_scraped_name'],
         'roles': {'$elemMatch': spec}})

    # legislator with a matching old_role
    if not leg:
        # the term is encoded in the old_roles key, not in the role spec
        spec.pop('term')
        leg = db.legislators.find_one({
            settings.LEVEL_FIELD: abbr,
            '_scraped_name': data['_scraped_name'],
            'old_roles.%s' % scraped_term: {'$elemMatch': spec}
        })

        if leg:
            if 'old_roles' not in data:
                # Copy instead of aliasing: the assignment below must not
                # modify the stored document in place, otherwise `leg` and
                # `data` share one dict and can never differ on this key.
                # NOTE(review): presumably update() diffs old vs. new to
                # decide whether to save -- confirm against its definition.
                data['old_roles'] = dict(leg.get('old_roles', {}))
            # put scraped roles into their old_roles
            data['old_roles'][scraped_term] = data['roles']
            data['roles'] = leg['roles']  # don't overwrite their current roles

    # active matching legislator from different term
    if not leg:
        # 'term' was already removed above; the default keeps this a no-op
        spec.pop('term', None)
        leg = db.legislators.find_one(
            {settings.LEVEL_FIELD: abbr,
             '_scraped_name': data['_scraped_name'],
             'roles': {'$elemMatch': spec}})
        if leg:
            if 'old_roles' not in data:
                # Shallow copy for the same reason as above: keep the
                # stored document untouched until update() runs.
                data['old_roles'] = dict(leg.get('old_roles', {}))

            # scraped_term < leg's term
            if term_older_than(abbr, scraped_term, leg['roles'][0]['term']):
                # move scraped roles into old_roles
                data['old_roles'][scraped_term] = data['roles']
                data['roles'] = leg['roles']
            else:
                # scraped roles stay current; archive the stored ones
                data['old_roles'][leg['roles'][0]['term']] = leg['roles']

    data = apply_filters(filters, data)

    if leg:
        update(leg, data, db.legislators)
        return "update"
    else:
        insert_with_id(data)
        return "insert"