Exemplo n.º 1
0
def scan_committees(abbr):
    """Build a data-quality report for the committees stored under ``abbr``.

    The ``level`` of the metadata document whose ``_id`` is ``abbr`` selects
    which ``db.committees`` documents are scanned.

    Returns a dict with per-chamber committee counts, the number of
    committees without members, member totals, a set of
    ``(term, chamber, name)`` tuples for members that have no ``leg_id``
    and a ``duplicate_sources`` list of source URLs seen more than once.
    The ``_updated_*`` and ``sourceless_count`` slots are only initialised
    here; they are presumably maintained by ``update_common``.
    """
    metadata = db.metadata.find_one({'_id': abbr})
    level = metadata['level']

    # url -> number of committees citing it
    duplicate_sources = defaultdict(int)
    report = {'upper_count': 0,
              'lower_count': 0,
              'joint_count': 0,
              'empty_count': 0,
              '_updated_today_count': 0,
              '_updated_this_month_count': 0,
              '_updated_this_year_count': 0,
              '_member_count': 0,
              '_members_with_leg_id_count': 0,
              'sourceless_count': 0,
              'unmatched_leg_ids': set(),
             }

    for com in db.committees.find({'level': level, level: abbr}):

        update_common(com, report)

        # committees in any other chamber are not counted per chamber
        if com['chamber'] == 'upper':
            report['upper_count'] += 1
        elif com['chamber'] == 'lower':
            report['lower_count'] += 1
        elif com['chamber'] == 'joint':
            report['joint_count'] += 1

        # members
        if not com['members']:
            report['empty_count'] += 1

        for member in com['members']:
            report['_member_count'] += 1
            if member.get('leg_id'):
                report['_members_with_leg_id_count'] += 1
            else:
                # remember who could not be matched to a legislator id
                report['unmatched_leg_ids'].add((com.get('term', ''),
                                                 com['chamber'],
                                                 member['name']))

        # sources
        for source in com['sources']:
            duplicate_sources[source['url']] += 1

    # .items() instead of .iteritems(): the latter does not exist on Python 3
    report['duplicate_sources'] = [url
                                   for url, n in duplicate_sources.items()
                                   if n > 1]

    return report
Exemplo n.º 2
0
def scan_legislators(abbr):
    """Build a data-quality report for the legislators stored under ``abbr``.

    The ``level`` of the metadata document whose ``_id`` is ``abbr`` selects
    which ``db.legislators`` documents are scanned.

    Returns a dict with the number of active upper/lower legislators, the
    number of inactive ones, one counter per entry of the module-level
    ``checked_keys`` (how many counted active legislators have a truthy
    value for that key) and a ``duplicate_sources`` list of source URLs
    seen more than once.  The ``_updated_*`` and ``sourceless_count`` slots
    are only initialised here; they are presumably maintained by
    ``update_common``.
    """
    metadata = db.metadata.find_one({'_id': abbr})
    level = metadata['level']

    # url -> number of legislators citing it
    duplicate_sources = defaultdict(int)
    report = {'upper_active_count': 0,
              'lower_active_count': 0,
              'inactive_count': 0,
              '_updated_today_count': 0,
              '_updated_this_month_count': 0,
              '_updated_this_year_count': 0,
              'sourceless_count': 0,
             }
    # tallied per chamber/district but not yet reported (see TODO below)
    seats_filled = {'upper': defaultdict(int), 'lower': defaultdict(int)}
    for key in checked_keys:
        report[key] = 0

    for leg in db.legislators.find({'level': level, level: abbr}):

        # do common details
        update_common(leg, report)

        # most checks only apply to active set
        if leg.get('active'):
            chamber = leg.get('chamber')
            if chamber == 'upper':
                report['upper_active_count'] += 1
            elif chamber == 'lower':
                report['lower_active_count'] += 1
            else:
                # TODO: track these? (executives)
                # note: this also skips the duplicate-source tally below
                continue

            seats_filled[chamber][leg['district']] += 1
            # TODO: check seats_filled against districts

            for key in checked_keys:
                if leg.get(key):
                    report[key] += 1
        else:
            report['inactive_count'] += 1

        for source in leg['sources']:
            duplicate_sources[source['url']] += 1

    # .items() instead of .iteritems(): the latter does not exist on Python 3
    report['duplicate_sources'] = [url
                                   for url, n in duplicate_sources.items()
                                   if n > 1]

    return report
Exemplo n.º 3
0
def scan_legislators(abbr):
    """Build a data-quality report for the legislators stored under ``abbr``.

    Legislators are selected with ``{settings.LEVEL_FIELD: abbr}`` and the
    expected seats come from ``db.districts`` documents whose ``abbr``
    matches.

    Returns a dict with the number of active upper/lower legislators, the
    number of inactive ones, one counter per entry of the module-level
    ``checked_keys`` (how many counted active legislators have a truthy
    value for that key), a ``duplicate_sources`` list of source URLs seen
    more than once, and ``overfilled_seats`` / ``vacant_seats`` lists of
    ``(chamber, district name, seat difference)`` tuples.  The
    ``_updated_*`` slots are only initialised here; they are presumably
    maintained by ``update_common``.
    """
    # url -> number of legislators citing it
    duplicate_sources = defaultdict(int)
    report = {'upper_active_count': 0,
              'lower_active_count': 0,
              'inactive_count': 0,
              '_updated_today_count': 0,
              '_updated_this_month_count': 0,
              '_updated_this_year_count': 0,
             }
    for key in checked_keys:
        report[key] = 0

    # initialize seat counts
    district_seats = {'upper': defaultdict(int), 'lower': defaultdict(int)}
    for district in db.districts.find({'abbr': abbr}):
        district_seats[district['chamber']][district['name']] = \
                district['num_seats']

    for leg in db.legislators.find({settings.LEVEL_FIELD: abbr}):

        # do common details
        update_common(leg, report)

        # most checks only apply to active set
        if leg.get('active'):
            chamber = leg.get('chamber')
            if chamber == 'upper':
                report['upper_active_count'] += 1
            elif chamber == 'lower':
                report['lower_active_count'] += 1
            else:
                # TODO: track these? (executives)
                # note: this also skips the duplicate-source tally below
                continue

            # decrement empty seats (if it goes negative, we have too many)
            district_seats[chamber][leg['district']] -= 1

            for key in checked_keys:
                if leg.get(key):
                    report[key] += 1
        else:
            report['inactive_count'] += 1

        for source in leg['sources']:
            duplicate_sources[source['url']] += 1

    # .items() instead of .iteritems(): the latter does not exist on Python 3
    report['duplicate_sources'] = [url
                                   for url, n in duplicate_sources.items()
                                   if n > 1]

    # copy over seat issues into report
    report['overfilled_seats'] = []
    report['vacant_seats'] = []
    for chamber, chamber_seats in district_seats.items():
        for seat, count in chamber_seats.items():
            if count < 0:
                # more active legislators than seats; report the excess
                report['overfilled_seats'].append((chamber, seat, -count))
            elif count > 0:
                report['vacant_seats'].append((chamber, seat, count))

    return report
Exemplo n.º 4
0
def scan_bills(abbr):
    """Collect data-quality statistics for every bill stored under ``abbr``.

    The ``level`` of the metadata document whose ``_id`` is ``abbr`` selects
    which ``db.bills`` documents are scanned.  Statistics are accumulated
    per session in dicts produced by ``_bill_report_dict``.

    Returns a dict with:
      duplicate_versions -- version URLs seen more than once
      duplicate_sources -- source URLs seen more than once
      other_actions -- (action text, count) pairs for actions typed 'other'
      uncategorized_subjects -- (subject, count) pairs taken from each
          bill's 'scraped_subjects'
      sessions -- the per-session report dicts, keyed by session
    """
    metadata = db.metadata.find_one({'_id': abbr})
    level = metadata['level']

    duplicate_sources = defaultdict(int)
    duplicate_versions = defaultdict(int)
    other_actions = defaultdict(int)
    uncategorized_subjects = defaultdict(int)
    # one report dict per session, created on first access
    sessions = defaultdict(_bill_report_dict)

    for bill in db.bills.find({'level': level, level: abbr}):
        session_d = sessions[bill['session']]

        # chamber count & bill_types
        if bill['chamber'] == 'lower':
            session_d['lower_count'] += 1
        elif bill['chamber'] == 'upper':
            session_d['upper_count'] += 1
        for bill_type in bill['type']:
            session_d['bill_types'][bill_type] += 1

        update_common(bill, session_d)

        # actions
        last_date = datetime.datetime(1900, 1, 1)
        for action in bill['actions']:
            date = action['date']
            if date < last_date:
                session_d['actions_unsorted'].add(bill['_id'])
            # advance the reference date; without this every action was
            # compared against 1900-01-01 and the check could never fire
            last_date = date
            session_d['action_count'] += 1
            for action_type in action['type']:
                session_d['actions_per_type'][action_type] += 1
            if 'other' in action['type']:
                other_actions[action['action']] += 1
            session_d['actions_per_actor'][action['actor']] += 1
            session_d['actions_per_month'][date.strftime('%Y-%m')] += 1
        if not bill['actions']:
            session_d['actionless_count'] += 1

        # sponsors
        for sponsor in bill['sponsors']:
            session_d['_sponsor_count'] += 1
            if sponsor.get('leg_id'):
                session_d['_sponsors_with_leg_id_count'] += 1
            else:
                # keep missing leg_ids
                session_d['unmatched_leg_ids'].add(
                    (term_for_session(abbr, bill['session']), bill['chamber'],
                     sponsor['name']))
            session_d['sponsors_per_type'][sponsor['type']] += 1
        if not bill['sponsors']:
            session_d['sponsorless_count'] += 1

        # votes
        for vote in bill['votes']:
            session_d['vote_count'] += 1
            if vote['passed']:
                session_d['_passed_vote_count'] += 1
            session_d['votes_per_chamber'][vote['chamber']] += 1
            # a vote without type or date is logged and gets no further checks
            if not vote.get('type'):
                logger.warning('vote is missing type on %s', bill['_id'])
                continue
            session_d['votes_per_type'][vote.get('type')] += 1
            if not vote.get('date'):
                logger.warning('vote is missing date on %s', bill['_id'])
                continue
            session_d['votes_per_month'][vote['date'].strftime('%Y-%m')] += 1

            # roll calls
            rollcalls = (vote['yes_votes'] + vote['no_votes'] +
                         vote['other_votes'])
            for rc in rollcalls:
                session_d['_rollcall_count'] += 1
                if rc.get('leg_id'):
                    session_d['_rollcalls_with_leg_id_count'] += 1
                else:
                    # keep missing leg_ids
                    session_d['unmatched_leg_ids'].add(
                        (term_for_session(abbr, bill['session']),
                         vote['chamber'], rc['name']))

            # check counts if any rollcalls are present
            if (rollcalls
                    and (len(vote['yes_votes']) != vote['yes_count']
                         or len(vote['no_votes']) != vote['no_count']
                         or len(vote['other_votes']) != vote['other_count'])):
                session_d['bad_vote_counts'].add(bill['_id'])

        # subjects
        for subj in bill.get('scraped_subjects', []):
            uncategorized_subjects[subj] += 1
        if bill.get('subjects'):
            session_d['_subjects_count'] += 1
            for subject in bill['subjects']:
                session_d['bills_per_subject'][subject] += 1

        # sources
        for source in bill['sources']:
            duplicate_sources[source['url']] += 1

        # versions
        if not bill['versions']:
            # total num of bills w/o versions
            session_d['versionless_count'] += 1
        else:
            # total num of versions
            session_d['version_count'] += len(bill['versions'])
        for doc in bill['versions']:
            duplicate_versions[doc['url']] += 1
        # TODO: add a duplicate documents back in?

    # .items() instead of .iteritems(): the latter does not exist on Python 3
    dup_version_urls = [url for url, n in duplicate_versions.items() if n > 1]
    dup_source_urls = [url for url, n in duplicate_sources.items() if n > 1]

    return {
        'duplicate_versions': dup_version_urls,
        'duplicate_sources': dup_source_urls,
        'other_actions': other_actions.items(),
        'uncategorized_subjects': uncategorized_subjects.items(),
        'sessions': sessions,
    }
Exemplo n.º 5
0
def scan_legislators(abbr):
    """Summarise data-quality figures for the legislators under ``abbr``.

    Legislators are selected with ``{settings.LEVEL_FIELD: abbr}``; the
    expected seats per district are read from ``db.districts``.

    The returned dict carries active/inactive counts, one counter for each
    entry of ``checked_keys``, the source URLs that occur more than once,
    and ``(chamber, district name, seat difference)`` tuples for
    overfilled and vacant seats.
    """
    url_hits = defaultdict(int)

    report = dict.fromkeys(
        ('upper_active_count',
         'lower_active_count',
         'inactive_count',
         '_updated_today_count',
         '_updated_this_month_count',
         '_updated_this_year_count'),
        0)
    report.update((field, 0) for field in checked_keys)

    # seats still open per chamber and district; starts at num_seats
    open_seats = {'upper': defaultdict(int), 'lower': defaultdict(int)}
    for row in db.districts.find({'abbr': abbr}):
        open_seats[row['chamber']][row['name']] = row['num_seats']

    for person in db.legislators.find({settings.LEVEL_FIELD: abbr}):
        # shared bookkeeping for every document type
        update_common(person, report)

        if not person.get('active'):
            report['inactive_count'] += 1
        else:
            house = person.get('chamber')
            if house == 'upper':
                report['upper_active_count'] += 1
            elif house == 'lower':
                report['lower_active_count'] += 1
            else:
                # TODO: track these? (executives)
                continue

            # one seat taken; a negative balance means too many legislators
            open_seats[house][person['district']] -= 1

            for field in checked_keys:
                if person.get(field):
                    report[field] += 1

        for src in person['sources']:
            url_hits[src['url']] += 1

    report['duplicate_sources'] = [
        link for link, hits in url_hits.items() if hits > 1
    ]

    # translate the seat balances into the two issue lists
    too_many = []
    unfilled = []
    for house, balances in open_seats.items():
        for district_name, balance in balances.items():
            if balance < 0:
                too_many.append((house, district_name, -balance))
            elif balance > 0:
                unfilled.append((house, district_name, balance))
    report['overfilled_seats'] = too_many
    report['vacant_seats'] = unfilled

    return report
Exemplo n.º 6
0
def scan_bills(abbr):
    """Collect data-quality statistics for every bill stored under ``abbr``.

    Bills are selected with ``{settings.LEVEL_FIELD: abbr}`` and statistics
    are accumulated per session in dicts produced by ``_bill_report_dict``.
    Bills listed in the quality exceptions for ``abbr`` are not counted as
    actionless / sponsorless / versionless; exceptions that were never
    needed are logged as warnings at the end.

    Returns a dict with:
      duplicate_versions -- version URLs seen more than once
      duplicate_sources -- source URLs seen more than once
      other_actions -- (action text, count) pairs for actions typed 'other'
      uncategorized_subjects -- (subject, count) pairs taken from each
          bill's 'scraped_subjects'
      sessions -- the per-session report dicts, keyed by session
      progress_meter_gaps -- always an empty list; the detected gaps are
          stored in each session's own 'progress_meter_gaps' set
    """
    duplicate_sources = defaultdict(int)
    duplicate_versions = defaultdict(int)
    other_actions = defaultdict(int)
    uncategorized_subjects = defaultdict(int)
    # one report dict per session, created on first access
    sessions = defaultdict(_bill_report_dict)

    # load exception data into sets of ids indexed by exception type
    quality_exceptions = get_quality_exceptions(abbr)

    for bill in db.bills.find({settings.LEVEL_FIELD: abbr}):
        session_d = sessions[bill['session']]

        # chamber count & bill_types
        if bill['chamber'] == 'lower':
            session_d['lower_count'] += 1
        elif bill['chamber'] == 'upper':
            session_d['upper_count'] += 1
        for bill_type in bill['type']:
            session_d['bill_types'][bill_type] += 1

        update_common(bill, session_d)

        # actions
        last_date = datetime.datetime(1900, 1, 1)
        for action in bill['actions']:
            date = action['date']
            if date < last_date:
                session_d['actions_unsorted'].add(bill['_id'])
            # advance the reference date; without this every action was
            # compared against 1900-01-01 and the check could never fire
            last_date = date
            session_d['action_count'] += 1
            for action_type in action['type']:
                session_d['actions_per_type'][action_type] += 1
            if 'other' in action['type']:
                other_actions[action['action']] += 1
            session_d['actions_per_actor'][action['actor']] += 1
            session_d['actions_per_month'][date.strftime('%Y-%m')] += 1

        # handle no_actions bills
        if not bill['actions']:
            if bill['_id'] not in quality_exceptions['bills:no_actions']:
                session_d['actionless_count'] += 1
            else:
                # exception was needed; whatever remains is reported later
                quality_exceptions['bills:no_actions'].remove(bill['_id'])

        # sponsors
        for sponsor in bill['sponsors']:
            session_d['_sponsor_count'] += 1
            if sponsor.get('leg_id') or sponsor.get('committee_id'):
                session_d['_sponsors_with_id_count'] += 1
            else:
                # keep list of unmatched sponsors
                session_d['unmatched_sponsors'].add(
                    (term_for_session(abbr, bill['session']), bill['chamber'],
                     sponsor['name'])
                )
            session_d['sponsors_per_type'][sponsor['type']] += 1

        # handle no sponsors bills
        if not bill['sponsors']:
            if bill['_id'] not in quality_exceptions['bills:no_sponsors']:
                session_d['sponsorless_count'] += 1
            else:
                quality_exceptions['bills:no_sponsors'].remove(bill['_id'])

        # subjects
        for subj in bill.get('scraped_subjects', []):
            uncategorized_subjects[subj] += 1
        if bill.get('subjects'):
            session_d['_subjects_count'] += 1
            for subject in bill['subjects']:
                session_d['bills_per_subject'][subject] += 1

        # sources
        for source in bill['sources']:
            duplicate_sources[source['url']] += 1

        # versions
        if not bill['versions']:
            # total num of bills w/o versions
            if bill['_id'] not in quality_exceptions['bills:no_versions']:
                session_d['versionless_count'] += 1
            else:
                quality_exceptions['bills:no_versions'].remove(bill['_id'])
        else:
            # total num of versions
            session_d['version_count'] += len(bill['versions'])
        for doc in bill['versions']:
            duplicate_versions[doc['url']] += 1
        # TODO: add duplicate document detection back in?

        # Check for progress meter gaps.
        progress_meter_gaps = session_d['progress_meter_gaps']
        action_dates = bill['action_dates']
        bill_chamber = bill['chamber']
        other_chamber = dict(lower='upper', upper='lower')[bill_chamber]
        signed = action_dates['signed']

        # bill['type'] is a collection of type names (it is iterated
        # above), so test membership; comparing it to 'bill' with == was
        # always False and made this branch unreachable.
        if 'bill' in bill['type']:
            # Check for bills that were signed but didn't pass both chambers.
            if signed and not (action_dates['passed_upper'] and
                               action_dates['passed_lower']):
                progress_meter_gaps.add(bill['_id'])
        elif signed and not action_dates['passed_' + bill_chamber]:
            # Check for nonbills that were signed but didn't pass their
            # house of origin.
            progress_meter_gaps.add(bill['_id'])

        # passed the other chamber without passing the chamber of origin
        if (action_dates['passed_' + other_chamber] and
                not action_dates['passed_' + bill_chamber]):
            progress_meter_gaps.add(bill['_id'])

    dup_version_urls = [url for url, n in duplicate_versions.items() if n > 1]
    dup_source_urls = [url for url, n in duplicate_sources.items() if n > 1]

    # do logging of unnecessary exceptions
    for qe_type, qes in quality_exceptions.items():
        if qes:
            logger.warning('unnecessary {0} exceptions for {1} bills: \n  {2}'
                           .format(qe_type, len(qes), '\n  '.join(qes)))

    return {'duplicate_versions': dup_version_urls,
            'duplicate_sources': dup_source_urls,
            'other_actions': other_actions.items(),
            'uncategorized_subjects': uncategorized_subjects.items(),
            'sessions': sessions,
            'progress_meter_gaps': []
           }
Exemplo n.º 7
0
def scan_bills(abbr):
    """Collect data-quality statistics for every bill stored under ``abbr``.

    Bills are selected with ``{settings.LEVEL_FIELD: abbr}`` and statistics
    are accumulated per session in dicts produced by ``_bill_report_dict``.
    Actions whose date is falsy are skipped entirely (not counted).  Bills
    listed in the quality exceptions for ``abbr`` are not counted as
    actionless / sponsorless / versionless; exceptions that were never
    needed are logged as warnings at the end.

    Returns a dict with:
      duplicate_versions -- version URLs seen more than once
      duplicate_sources -- source URLs seen more than once
      other_actions -- (action text, count) pairs for actions typed 'other'
      uncategorized_subjects -- (subject, count) pairs taken from each
          bill's 'scraped_subjects'
      sessions -- the per-session report dicts, keyed by session
      progress_meter_gaps -- always an empty list; the detected gaps are
          stored in each session's own 'progress_meter_gaps' set
    """
    duplicate_sources = defaultdict(int)
    duplicate_versions = defaultdict(int)
    other_actions = defaultdict(int)
    uncategorized_subjects = defaultdict(int)
    # one report dict per session, created on first access
    sessions = defaultdict(_bill_report_dict)

    # load exception data into sets of ids indexed by exception type
    quality_exceptions = get_quality_exceptions(abbr)

    for bill in db.bills.find({settings.LEVEL_FIELD: abbr}):
        session_d = sessions[bill['session']]

        # chamber count & bill_types
        if bill['chamber'] == 'lower':
            session_d['lower_count'] += 1
        elif bill['chamber'] == 'upper':
            session_d['upper_count'] += 1
        for bill_type in bill['type']:
            session_d['bill_types'][bill_type] += 1

        update_common(bill, session_d)

        # actions
        last_date = datetime.datetime(1900, 1, 1)
        for action in bill['actions']:
            date = action['date']

            # undated actions cannot be ordered or bucketed by month
            if not date:
                continue

            if date < last_date:
                session_d['actions_unsorted'].add(bill['_id'])
            # advance the reference date; without this every action was
            # compared against 1900-01-01 and the check could never fire
            last_date = date
            session_d['action_count'] += 1
            for action_type in action['type']:
                session_d['actions_per_type'][action_type] += 1
            if 'other' in action['type']:
                other_actions[action['action']] += 1
            session_d['actions_per_actor'][action['actor']] += 1
            session_d['actions_per_month'][date.strftime('%Y-%m')] += 1

        # handle no_actions bills
        if not bill['actions']:
            if bill['_id'] not in quality_exceptions['bills:no_actions']:
                session_d['actionless_count'] += 1
            else:
                # exception was needed; whatever remains is reported later
                quality_exceptions['bills:no_actions'].remove(bill['_id'])

        # sponsors
        for sponsor in bill['sponsors']:
            session_d['_sponsor_count'] += 1
            if sponsor.get('leg_id') or sponsor.get('committee_id'):
                session_d['_sponsors_with_id_count'] += 1
            else:
                # keep list of unmatched sponsors
                session_d['unmatched_sponsors'].add(
                    (term_for_session(abbr, bill['session']), bill['chamber'],
                     sponsor['name'])
                )
            session_d['sponsors_per_type'][sponsor['type']] += 1

        # handle no sponsors bills
        if not bill['sponsors']:
            if bill['_id'] not in quality_exceptions['bills:no_sponsors']:
                session_d['sponsorless_count'] += 1
            else:
                quality_exceptions['bills:no_sponsors'].remove(bill['_id'])

        # subjects
        for subj in bill.get('scraped_subjects', []):
            uncategorized_subjects[subj] += 1
        if bill.get('subjects'):
            session_d['_subjects_count'] += 1
            for subject in bill['subjects']:
                session_d['bills_per_subject'][subject] += 1

        # sources
        for source in bill['sources']:
            duplicate_sources[source['url']] += 1

        # versions
        if not bill['versions']:
            # total num of bills w/o versions
            if bill['_id'] not in quality_exceptions['bills:no_versions']:
                session_d['versionless_count'] += 1
            else:
                quality_exceptions['bills:no_versions'].remove(bill['_id'])
        else:
            # total num of versions
            session_d['version_count'] += len(bill['versions'])
        for doc in bill['versions']:
            duplicate_versions[doc['url']] += 1
        # TODO: add duplicate document detection back in?

        # Check for progress meter gaps.
        progress_meter_gaps = session_d['progress_meter_gaps']
        action_dates = bill['action_dates']
        bill_chamber = bill['chamber']
        other_chamber = dict(lower='upper', upper='lower')[bill_chamber]
        signed = action_dates['signed']

        # bill['type'] is a collection of type names (it is iterated
        # above), so test membership; comparing it to 'bill' with == was
        # always False and made this branch unreachable.
        if 'bill' in bill['type']:
            # Check for bills that were signed but didn't pass both chambers.
            if signed and not (action_dates['passed_upper'] and
                               action_dates['passed_lower']):
                progress_meter_gaps.add(bill['_id'])
        elif signed and not action_dates['passed_' + bill_chamber]:
            # Check for nonbills that were signed but didn't pass their
            # house of origin.
            progress_meter_gaps.add(bill['_id'])

        # passed the other chamber without passing the chamber of origin
        if (action_dates['passed_' + other_chamber] and
                not action_dates['passed_' + bill_chamber]):
            progress_meter_gaps.add(bill['_id'])

    dup_version_urls = [url for url, n in duplicate_versions.items() if n > 1]
    dup_source_urls = [url for url, n in duplicate_sources.items() if n > 1]

    # do logging of unnecessary exceptions
    for qe_type, qes in quality_exceptions.items():
        if qes:
            logger.warning('unnecessary {0} exceptions for {1} bills: \n  {2}'
                           .format(qe_type, len(qes), '\n  '.join(qes)))

    return {'duplicate_versions': dup_version_urls,
            'duplicate_sources': dup_source_urls,
            'other_actions': other_actions.items(),
            'uncategorized_subjects': uncategorized_subjects.items(),
            'sessions': sessions,
            'progress_meter_gaps': []
           }
Exemplo n.º 8
0
def scan_bills(abbr):
    """Collect data-quality statistics for every bill stored under ``abbr``.

    The ``level`` of the metadata document whose ``_id`` is ``abbr`` selects
    which ``db.bills`` documents are scanned.  Statistics are accumulated
    per session in dicts produced by ``_bill_report_dict``.

    Returns a dict with:
      duplicate_versions -- version URLs seen more than once
      duplicate_sources -- source URLs seen more than once
      other_actions -- (action text, count) pairs for actions typed 'other'
      uncategorized_subjects -- (subject, count) pairs taken from each
          bill's 'scraped_subjects'
      sessions -- the per-session report dicts, keyed by session
    """
    metadata = db.metadata.find_one({'_id': abbr})
    level = metadata['level']

    duplicate_sources = defaultdict(int)
    duplicate_versions = defaultdict(int)
    other_actions = defaultdict(int)
    uncategorized_subjects = defaultdict(int)
    # one report dict per session, created on first access
    sessions = defaultdict(_bill_report_dict)

    for bill in db.bills.find({'level': level, level: abbr}):
        session_d = sessions[bill['session']]

        # chamber count & bill_types
        if bill['chamber'] == 'lower':
            session_d['lower_count'] += 1
        elif bill['chamber'] == 'upper':
            session_d['upper_count'] += 1
        for bill_type in bill['type']:
            session_d['bill_types'][bill_type] += 1

        update_common(bill, session_d)

        # actions
        last_date = datetime.datetime(1900, 1, 1)
        for action in bill['actions']:
            date = action['date']
            if date < last_date:
                session_d['actions_unsorted'].add(bill['_id'])
            # advance the reference date; without this every action was
            # compared against 1900-01-01 and the check could never fire
            last_date = date
            session_d['action_count'] += 1
            for action_type in action['type']:
                session_d['actions_per_type'][action_type] += 1
            if 'other' in action['type']:
                other_actions[action['action']] += 1
            session_d['actions_per_actor'][action['actor']] += 1
            session_d['actions_per_month'][date.strftime('%Y-%m')] += 1
        if not bill['actions']:
            session_d['actionless_count'] += 1

        # sponsors
        for sponsor in bill['sponsors']:
            session_d['_sponsor_count'] += 1
            if sponsor.get('leg_id'):
                session_d['_sponsors_with_leg_id_count'] += 1
            else:
                # keep missing leg_ids
                session_d['unmatched_leg_ids'].add(
                    (term_for_session(abbr, bill['session']), bill['chamber'],
                     sponsor['name'])
                )
            session_d['sponsors_per_type'][sponsor['type']] += 1
        if not bill['sponsors']:
            session_d['sponsorless_count'] += 1

        # votes
        for vote in bill['votes']:
            session_d['vote_count'] += 1
            if vote['passed']:
                session_d['_passed_vote_count'] += 1
            session_d['votes_per_chamber'][vote['chamber']] += 1
            # a vote without type or date is logged and gets no further checks
            if not vote.get('type'):
                logger.warning('vote is missing type on %s', bill['_id'])
                continue
            session_d['votes_per_type'][vote.get('type')] += 1
            if not vote.get('date'):
                logger.warning('vote is missing date on %s', bill['_id'])
                continue
            session_d['votes_per_month'][vote['date'].strftime('%Y-%m')] += 1

            # roll calls
            rollcalls = (vote['yes_votes'] + vote['no_votes'] +
                         vote['other_votes'])
            for rc in rollcalls:
                session_d['_rollcall_count'] += 1
                if rc.get('leg_id'):
                    session_d['_rollcalls_with_leg_id_count'] += 1
                else:
                    # keep missing leg_ids
                    session_d['unmatched_leg_ids'].add(
                        (term_for_session(abbr, bill['session']),
                         vote['chamber'],
                         rc['name'])
                    )

            # check counts if any rollcalls are present
            if (rollcalls and
                (len(vote['yes_votes']) != vote['yes_count'] or
                 len(vote['no_votes']) != vote['no_count'] or
                 len(vote['other_votes']) != vote['other_count'])):
                session_d['bad_vote_counts'].add(bill['_id'])

        # subjects
        for subj in bill.get('scraped_subjects', []):
            uncategorized_subjects[subj] += 1
        if bill.get('subjects'):
            session_d['_subjects_count'] += 1
            for subject in bill['subjects']:
                session_d['bills_per_subject'][subject] += 1

        # sources
        for source in bill['sources']:
            duplicate_sources[source['url']] += 1

        # versions
        if not bill['versions']:
            # total num of bills w/o versions
            session_d['versionless_count'] += 1
        else:
            # total num of versions
            session_d['version_count'] += len(bill['versions'])
        for doc in bill['versions']:
            duplicate_versions[doc['url']] += 1
        # TODO: add a duplicate documents back in?

    # .items() instead of .iteritems(): the latter does not exist on Python 3
    dup_version_urls = [url for url, n in duplicate_versions.items() if n > 1]
    dup_source_urls = [url for url, n in duplicate_sources.items() if n > 1]

    return {'duplicate_versions': dup_version_urls,
            'duplicate_sources': dup_source_urls,
            'other_actions': other_actions.items(),
            'uncategorized_subjects': uncategorized_subjects.items(),
            'sessions': sessions,
           }
Exemplo n.º 9
0
Arquivo: bills.py Projeto: JT5D/billy
def scan_bills(abbr):
    """Collect data-quality statistics for every bill stored under ``abbr``.

    Bills are selected with ``{settings.LEVEL_FIELD: abbr}`` and statistics
    are accumulated per session in dicts produced by ``_bill_report_dict``.
    Bills listed in the quality exceptions for ``abbr`` are not counted as
    actionless / sponsorless / versionless; exceptions that were never
    needed are logged as warnings at the end.

    Returns a dict with:
      duplicate_versions -- version URLs seen more than once
      duplicate_sources -- source URLs seen more than once
      other_actions -- (action text, count) pairs for actions typed "other"
      uncategorized_subjects -- (subject, count) pairs taken from each
          bill's "scraped_subjects"
      sessions -- the per-session report dicts, keyed by session
    """
    duplicate_sources = defaultdict(int)
    duplicate_versions = defaultdict(int)
    other_actions = defaultdict(int)
    uncategorized_subjects = defaultdict(int)
    # one report dict per session, created on first access
    sessions = defaultdict(_bill_report_dict)

    # load exception data into sets of ids indexed by exception type
    quality_exceptions = get_quality_exceptions(abbr)

    for bill in db.bills.find({settings.LEVEL_FIELD: abbr}):
        session_d = sessions[bill["session"]]

        # chamber count & bill_types
        if bill["chamber"] == "lower":
            session_d["lower_count"] += 1
        elif bill["chamber"] == "upper":
            session_d["upper_count"] += 1
        for bill_type in bill["type"]:
            session_d["bill_types"][bill_type] += 1

        update_common(bill, session_d)

        # actions
        last_date = datetime.datetime(1900, 1, 1)
        for action in bill["actions"]:
            date = action["date"]
            if date < last_date:
                session_d["actions_unsorted"].add(bill["_id"])
            # advance the reference date; without this every action was
            # compared against 1900-01-01 and the check could never fire
            last_date = date
            session_d["action_count"] += 1
            for action_type in action["type"]:
                session_d["actions_per_type"][action_type] += 1
            if "other" in action["type"]:
                other_actions[action["action"]] += 1
            session_d["actions_per_actor"][action["actor"]] += 1
            session_d["actions_per_month"][date.strftime("%Y-%m")] += 1

        # handle no_actions bills
        if not bill["actions"]:
            if bill["_id"] not in quality_exceptions["bills:no_actions"]:
                session_d["actionless_count"] += 1
            else:
                # exception was needed; whatever remains is reported later
                quality_exceptions["bills:no_actions"].remove(bill["_id"])

        # sponsors
        for sponsor in bill["sponsors"]:
            session_d["_sponsor_count"] += 1
            if sponsor.get("leg_id") or sponsor.get("committee_id"):
                session_d["_sponsors_with_id_count"] += 1
            else:
                # keep list of unmatched sponsors
                session_d["unmatched_sponsors"].add(
                    (term_for_session(abbr, bill["session"]), bill["chamber"], sponsor["name"])
                )
            session_d["sponsors_per_type"][sponsor["type"]] += 1

        # handle no sponsors bills
        if not bill["sponsors"]:
            if bill["_id"] not in quality_exceptions["bills:no_sponsors"]:
                session_d["sponsorless_count"] += 1
            else:
                quality_exceptions["bills:no_sponsors"].remove(bill["_id"])

        # subjects
        for subj in bill.get("scraped_subjects", []):
            uncategorized_subjects[subj] += 1
        if bill.get("subjects"):
            session_d["_subjects_count"] += 1
            for subject in bill["subjects"]:
                session_d["bills_per_subject"][subject] += 1

        # sources
        for source in bill["sources"]:
            duplicate_sources[source["url"]] += 1

        # versions
        if not bill["versions"]:
            # total num of bills w/o versions
            if bill["_id"] not in quality_exceptions["bills:no_versions"]:
                session_d["versionless_count"] += 1
            else:
                quality_exceptions["bills:no_versions"].remove(bill["_id"])
        else:
            # total num of versions
            session_d["version_count"] += len(bill["versions"])
        for doc in bill["versions"]:
            duplicate_versions[doc["url"]] += 1
        # TODO: add duplicate document detection back in?

    # .items() instead of .iteritems(): the latter does not exist on Python 3
    dup_version_urls = [url for url, n in duplicate_versions.items() if n > 1]
    dup_source_urls = [url for url, n in duplicate_sources.items() if n > 1]

    # do logging of unnecessary exceptions
    for qe_type, qes in quality_exceptions.items():
        if qes:
            logger.warning(
                "unnecessary {0} exceptions for {1} bills: \n  {2}".format(qe_type, len(qes), "\n  ".join(qes))
            )

    return {
        "duplicate_versions": dup_version_urls,
        "duplicate_sources": dup_source_urls,
        "other_actions": other_actions.items(),
        "uncategorized_subjects": uncategorized_subjects.items(),
        "sessions": sessions,
    }