示例#1
0
def ky_by_candidate():

    generated_js.write('candidate_contributions = [')

    current_cand_ky_contrib = ky_candidate_contributions.where(
        lambda r: r['cmte_id'] in current_candidate_cmte_ids)
    current_candidate_groups = current_cand_ky_contrib.group_by('cand_nm')
    current_candidate_totals = current_candidate_groups.aggregate([
        ('contributions_count', agate.Count()),
        ('contributions_sum', agate.Sum('contb_receipt_amt'))
    ])
    sorted_current_candidate_totals = current_candidate_totals.order_by(
        'contributions_sum', reverse=True)
    for row in sorted_current_candidate_totals.rows:
        generated_js.write('{name: "' + row[0] + '", count: ' + str(row[1]) +
                           ', sum: ' + str(row[2]) + ', status: "current"},')

    dropped_cand_ky_contrib = ky_candidate_contributions.where(
        lambda r: r['cmte_id'] not in current_candidate_cmte_ids)
    dropped_candidate_groups = dropped_cand_ky_contrib.group_by('cand_nm')
    dropped_candidate_totals = dropped_candidate_groups.aggregate([
        ('contributions_count', agate.Count()),
        ('contributions_sum', agate.Sum('contb_receipt_amt'))
    ])
    sorted_dropped_candidate_totals = dropped_candidate_totals.order_by(
        'contributions_sum', reverse=True)
    filtered_dropped_candidate_totals = sorted_dropped_candidate_totals.where(
        lambda r: r['contributions_sum'] > 25000)
    for row in filtered_dropped_candidate_totals.rows:
        generated_js.write('{name: "' + row[0] + '", count: ' + str(row[1]) +
                           ', sum: ' + str(row[2]) + ', status: "dropped"},')

    generated_js.write(']\n')
    def _overall_stats(self):
        count_open_licenses = agate.Summary(
            'license_id', agate.Number(),
            lambda r: sum(license_id in utils.OPEN_LICENSES
                          for license_id in r.values()))

        self.overall_package_stats = self._package_table().aggregate([
            ('open_data_count', count_open_licenses),
        ])
        self.resource_stats = self._package_resource_table().compute([
            ('open_format', agate.Formula(agate.Boolean(),
                                          open_formats_count)),
        ])
        if len(self._package_resource_table()) > 0:
            self.resource_stats = self.resource_stats.aggregate([
                ('open_format_count', agate.Count('open_format', True)),
                ('min_date', agate.Min('created')),
                ('max_date', agate.Max('created'))
            ])
            format_table = self._package_resource_table().group_by(
                "format").aggregate([
                    ('count', agate.Count()),
                ])
            count = format_table.aggregate([
                ('different_formats', agate.Count()),
            ])
            self.open_datasets = self.overall_package_stats.get(
                "open_data_count", 0)
            self.open_format_count = self.resource_stats.get(
                "open_format_count", 0)
            self.format_count = count.get("different_formats", 0)
            self.compute_dates()
示例#3
0
def race_and_age(data):
    # Filters rows without age data
    only_with_age = data['with_years_in_prison'].where(
        lambda r: r['age'] is not None
    )

    # Group by race
    race_groups = only_with_age.group_by('race')

    # Sub-group by age cohorts (20s, 30s, etc.)
    race_and_age_groups = race_groups.group_by(
        lambda r: '%i0s' % (r['age'] // 10),
        key_name='age_group'
    )

    # Aggregate medians for each group
    medians = race_and_age_groups.aggregate([
        ('count', agate.Count()),
        ('median_years_in_prison', agate.Median('years_in_prison'))
    ])

    # Sort the results
    sorted_groups = medians.order_by('median_years_in_prison', reverse=True)

    # Print out the results
    sorted_groups.print_table(max_rows=10)
示例#4
0
def year_sum_counts(data):
    data['groupped_year'] = data['table'].group_by('year').aggregate([
        ('killed', agate.Sum('killed')), ('injured', agate.Sum('injured')),
        ('accidents', agate.Count()),
        ('accidents_injured', count_accidents_injured)
    ])
    return data
 def test_charting(self):
     exonerations = agate.Table.from_csv(
         '../../../data/exonerations-20150828.csv')
     by_year_exonerated = exonerations.group_by('exonerated')
     counts = by_year_exonerated.aggregate([('count', agate.Count())])
     chart = counts.order_by('exonerated').line_chart('exonerated',
                                                      'count',
                                                      path="line_chart.svg")
     print(chart)
示例#6
0
def states(data):
    by_state = data['with_years_in_prison'].group_by('state')
    state_totals = by_state.aggregate([
        ('count', agate.Count())
    ])

    sorted_totals = state_totals.order_by('count', reverse=True)

    sorted_totals.print_table(max_rows=5)

    medians = by_state.aggregate([
        ('count', agate.Count()),
        ('median_years_in_prison', agate.Median('years_in_prison'))
    ])

    sorted_medians = medians.order_by('median_years_in_prison', reverse=True)

    sorted_medians.print_table(max_rows=5)
示例#7
0
 def distinct_values(self, key=None, with_count=False) -> agate.Table:
     logger.debug('distinct values for: ' + key)
     if with_count:
         table = self._table.group_by(key)
         # group_by returns a TableSet, so another step is required
         table = table.aggregate([('count', agate.Count())])
     else:
         table = self._table.select(key).distinct(key)
     return table
示例#8
0
def statistics(data):
    data['statistics'] = data['table'].aggregate([
        ('killed', agate.Sum('killed')), ('injured', agate.Sum('injured')),
        ('accidents', agate.Count()),
        ('mean_accidents', agate.Mean('accidents')),
        ('mean_killed', agate.Mean('killed')),
        ('mean_injured', agate.Mean('injured'))
    ])
    return data
示例#9
0
def year_police_beat_sum_counts(data):
    data['year_police_beat'] = data['table'].group_by('year').group_by(
        'police_beat').aggregate([
            ('killed', agate.Sum('killed')), ('injured', agate.Sum('injured')),
            ('accidents', agate.Count())
        ]).compute([('weighted_rank', RankWeightedAccidents('year')),
                    ('killed_rank', GroupRanking('killed', 'year')),
                    ('accidents_rank', GroupRanking('accidents', 'year')),
                    ('injured_rank', GroupRanking('injured', 'year'))])
    return data
示例#10
0
    def _analyze_count(self, event):
        # Average emails per day
        total = self.table.aggregate([("total", agate.Count())])["total"]
        total_senders = (
            self.table.distinct("fields/from").select("fields/from").aggregate(
                [("total", agate.Count())])["total"])

        if total == 0:
            first_email_date = ""
            last_email_date = None
        else:
            date_data = self.table.where(
                lambda row: row["fields/date"] is not None).compute([(
                    "reduce_to_datetime",
                    agate.Formula(
                        agate.DateTime(datetime_format="%Y-%m-%d %H:%M:%S"),
                        lambda row: helpers.reduce_to_datetime(row[
                            "fields/date"]),
                    ),
                )])
            first_email_date = (date_data.order_by("reduce_to_datetime").limit(
                1).columns["fields/date"].values()[0])
            last_email_date = (date_data.order_by(
                "reduce_to_datetime",
                reverse=True).limit(1).columns["fields/date"].values()[0])
        event.set()

        metrics = [
            ["Total emails", total],
            ["Senders", total_senders],
            ["First Email Date", first_email_date],
        ]

        if last_email_date:
            date_delta = helpers.convert_date(
                last_email_date) - helpers.convert_date(first_email_date)
            avg_email_per_day = total / date_delta.days
            metrics.append(["Avg. Emails/Day", f"{avg_email_per_day:.2f}"])

        print(f"\n\n{helpers.h1_icn} Stats\n")
        print(termtables.to_string(metrics))
 def get_org_format_aggregates(self, package_table):
     # format can not exist!?!
     format_table = package_table.group_by("format").aggregate([
         ('count', agate.Count()),
     ])
     open_format_table = package_table.compute([
         ('open_format', agate.Formula(agate.Boolean(), open_formats_count))
     ])
     open_format_table_aggregates = open_format_table.aggregate([
         ('open_formats', agate.Count('open_format', True)),
     ])
     new_table = format_table.compute([
         ('open_format', agate.Formula(agate.Boolean(), open_formats_count))
     ])
     count = new_table.aggregate([
         ('different_formats', agate.Count()),
         ('open_formats', agate.Count('open_format', True)),
     ])
     count["open_formats_datasets"] = open_format_table_aggregates[
         "open_formats"]
     return count
 def get_org_groups_aggregate(self):
     self.groups = len(set(self.package_groups))
     group_table = self._group_table()
     if (len(group_table) > 0):
         group_aggregates = group_table.group_by('a').aggregate([
             ('count', agate.Count())
         ])
         if len(group_aggregates) > 0:
             self.groups_dataset_variance = 1
             if len(group_aggregates) > 1:
                 self.groups_dataset_variance = group_aggregates.aggregate(
                     agate.Variance('count'))
                 self.group_score = self.score_for_groups(
                     len(group_aggregates))
示例#13
0
def print_ky_current_candidate_sum():
    current_cand_ky_contrib = ky_candidate_contributions.where(
        lambda r: r['cmte_id'] in current_candidate_cmte_ids)

    ky_current_candidate_count = current_cand_ky_contrib.aggregate(
        agate.Count())
    ky_current_candidate_sum = current_cand_ky_contrib.aggregate(
        agate.Sum('contb_receipt_amt'))
    current_candidate_count = len(current_candidate_cmte_ids)

    print('There are currently ' + str(current_candidate_count) +
          ' candidates running for president. Those ' +
          str(current_candidate_count) + ' candidates have received ' +
          str(ky_current_candidate_count) + ' donations totaling $' +
          str(ky_current_candidate_sum))
示例#14
0
def print_ky_overall_summary():

    # How much money has been donated by Kentuckians to the 2016 presidential race?
    ky_contrib_sum = ky_all_contributions.aggregate(
        agate.Sum('TRANSACTION_AMT'))
    # How many contributions have Kentuckians made to the presidential race?
    ky_contrib_count = ky_all_contributions.aggregate(agate.Count())

    print(
        str(ky_contrib_count) + ' donations, totaling $' +
        str(ky_contrib_sum) +
        ' have been donated by Kentuckians to the 2016 presidential race.')

    generated_js.write('total_donated_sum = ' + str(ky_contrib_sum) +
                       '\ntotal_donated_count = ' + str(ky_contrib_count) +
                       '\n')
 def test_grouping(self):
     exonerations = agate.Table.from_csv(
         '../../../data/exonerations-20150828.csv')
     clean_state_data = exonerations.compute(
         [('federal',
           agate.Formula(agate.Boolean(),
                         lambda row: row['state'].startswith('F-'))),
          ('state',
           agate.Formula(
               agate.Text(), lambda row: row['state'][2:]
               if row['state'].startswith('F-') else row['state']))],
         replace=True)
     by_state = clean_state_data.group_by('state')
     state_totals = by_state.aggregate([('count', agate.Count())])
     sorted_totals = state_totals.order_by('count', reverse=True)
     sorted_totals.print_table(max_rows=10)
示例#16
0
def top_ky_donors_pac():
    contributor_groups = ky_all_contributions.group_by('NAME')

    contributor_totals = contributor_groups.aggregate([
        ('contributions_count', agate.Count()),
        ('contributions_sum', agate.Sum('TRANSACTION_AMT'))
    ])
    sorted_contributor_totals = contributor_totals.order_by(
        'contributions_sum', reverse=True)

    sorted_contributor_totals.print_table()

    generated_js.write('top_donors_to_pacs = [')
    for row in itertools.islice(sorted_contributor_totals.rows, 0, 5):
        generated_js.write('{name: "' + row[0] + '", count: ' + str(row[1]) +
                           ', sum: ' + str(row[2]) + '},')
    generated_js.write(']\n')
示例#17
0
def top_ky_donors_candidates():
    contributor_groups = ky_candidate_contributions.group_by('contbr_nm')

    contributor_totals = contributor_groups.aggregate([
        ('contributions_count', agate.Count()),
        ('contributions_sum', agate.Sum('contb_receipt_amt'))
    ])
    sorted_contributor_totals = contributor_totals.order_by(
        'contributions_sum', reverse=True)

    sorted_contributor_totals.print_table()

    generated_js.write('top_donors_to_candidates = [')
    for row in itertools.islice(sorted_contributor_totals.rows, 0, 5):
        generated_js.write('{name: "' + row[0] + '", count: ' + str(row[1]) +
                           ', sum: ' + str(row[2]) + '},')
    generated_js.write(']\n')
示例#18
0
def print_ky_candidate_summary():

    # How much money has been donated by Kentuckians to the presidential candidates?
    ky_candidate_sum = ky_candidate_contributions.aggregate(
        agate.Sum('contb_receipt_amt'))
    # How many contributions have Kentuckians made to presidential candidates?
    ky_candidate_count = ky_candidate_contributions.aggregate(agate.Count())

    print(
        str(ky_candidate_count) + ' donations, totaling $' +
        str(ky_candidate_sum) +
        ' have been donated by Kentuckians specifically to the 2016 presidential candidates.'
    )

    generated_js.write('total_candidate_donated_sum = ' +
                       str(ky_candidate_sum) +
                       '\ntotal_candidate_donated_count = ' +
                       str(ky_candidate_count) + '\n')
示例#19
0
def sum_counts_by_hour(data):
    data['hour'] = data['table'].group_by('hour').aggregate([
        ('killed', agate.Sum('killed')), ('injured', agate.Sum('injured')),
        ('accidents', agate.Count()),
        ('accidents_injured', count_accidents_injured)
    ]).compute([
        ('killed_percent', agate.Percent('killed')),
        ('injured_percent', agate.Percent('injured')),
        ('accidents_percent', agate.Percent('accidents')),
    ]).compute([
        ('weighted',
         agate.Formula(agate.Number(),
                       lambda r: r['killed_percent'] + r['injured_percent'])),
        ('accidents_within_half_deviation',
         StandardDeviations('accidents', 0.5)),
        ('killed_within_half_deviation', StandardDeviations('killed', 0.5)),
        ('injured_within_half_deviation', StandardDeviations('injured', 0.5))
    ])
    return data
示例#20
0
def organisations(event, context):
    table = get_all_orgs()
    table.to_csv('/tmp/open_data_germany.csv')
    upload_file_to_s3('open_data_cities.csv','/tmp/open_data_germany.csv')
    aggregates = table.aggregate([
            ('count', agate.Count()),
            ('sum', agate.Sum('datasets'))
            ])
    with open('/tmp/summary.json', 'w') as f:
        json.dump(aggregates, f)
    upload_file_to_s3('open_data_cities_summary.json','/tmp/summary.json')
    body = {
        "message": "Go Serverless v1.0! Your function executed successfully!",
        "input": event
    }

    response = {
        "statusCode": 200,
        "body": json.dumps(body)
    }

    return response
 def get_org_groups_aggregate(self, package_data):
     package_groups = [
         group for p in package_data
         for group in self.get_group_title_from_package(p)
     ]
     group_table = agate.Table.from_object(package_groups)
     result = {}
     if (len(group_table) > 0):
         group_aggregates = group_table.group_by('a').aggregate([
             ('count', agate.Count())
         ])
         if len(group_aggregates) > 0:
             result = {"groups": 1, "groups_dataset_variance": 1}
             if len(group_aggregates) > 1:
                 result = {
                     "groups":
                     len(set(package_groups)),
                     "groups_dataset_variance":
                     group_aggregates.aggregate(agate.Variance('count')),
                     "group_score":
                     self.score_for_groups(len(group_aggregates))
                 }
     return result
 def get_open_stats(self, package_data):
     table = agate.Table.from_object(
         self.get_open_formats_and_license(package_data))
     return table.aggregate([('open_data_count', agate.Count())])
def year_police_beat_sum_counts(data):
    data['year_police_beat'] = data['table'].group_by('year').group_by(
        'police_beat').aggregate([('killed', agate.Sum('killed')),
                                  ('injured', agate.Sum('injured')),
                                  ('accidents', agate.Count())])
def sum_counts_by_full_hour(data):
    data['full_hour'] = data['table'].group_by('date_hour').aggregate([
        ('killed', agate.Sum('killed')), ('injured', agate.Sum('injured')),
        ('accidents', agate.Count())
    ])
示例#25
0
# tables - now separate by gender
youthf = youth.where(lambda row: row['gender'] == 'F')
youthm = youth.where(lambda row: row['gender'] == 'M')

adultf = adult.where(lambda row: row['gender'] == 'F')
adultm = adult.where(lambda row: row['gender'] == 'M')

elderlyf = elderly.where(lambda row: row['gender'] == 'F')
elderlym = elderly.where(lambda row: row['gender'] == 'M')

# I could also use a pivot, but I'm being explicit, for the sake of learning
# and ensuring I don't screw up
# http://agate.readthedocs.org/en/1.3.0/cookbook/transform.html

# aggregate the count for all
cmhayouthf = youthf.aggregate(agate.Count('housing_other', 'CMHA'))
cmhayouthm = youthm.aggregate(agate.Count('housing_other', 'CMHA'))

cmhaadultf = adultf.aggregate(agate.Count('housing_other', 'CMHA'))
cmhaadultm = adultm.aggregate(agate.Count('housing_other', 'CMHA'))

cmhaelderlyf = elderlyf.aggregate(agate.Count('housing_other', 'CMHA'))
cmhaelderlym = elderlym.aggregate(agate.Count('housing_other', 'CMHA'))

print("For CMHA clients")
print("Youth F clients this month:", (cmhayouthf))
print("Youth M clients this month:", (cmhayouthm))
print("adult F clients this month:", (cmhaadultf))
print("Youth M clients this month:", (cmhayouthm))
print("Elderly F clients this month", (cmhaelderlyf))
print("Elderly M clients this month:", (cmhaelderlym))
示例#26
0
#!/usr/bin/env python

import agate

table = agate.Table.from_csv(
    'examples/realdata/Datagov_FY10_EDU_recp_by_State.csv')

table.limit(10).bar_chart('State Name', 'TOTAL', 'docs/images/bar_chart.svg')
table.limit(10).column_chart('State Name', 'TOTAL',
                             'docs/images/column_chart.svg')

table = agate.Table.from_csv('examples/realdata/exonerations-20150828.csv')

by_year_exonerated = table.group_by('exonerated')
counts = by_year_exonerated.aggregate([('count', agate.Count())])

counts.order_by('exonerated').line_chart('exonerated', 'count',
                                         'docs/images/line_chart.svg')
table.scatterplot('exonerated', 'age', 'docs/images/dots_chart.svg')

top_crimes = table.group_by('crime').having([('count', agate.Count())],
                                            lambda t: t['count'] > 100)

by_year = top_crimes.group_by('exonerated')

counts = by_year.aggregate([('count', agate.Count())])

by_crime = counts.group_by('crime')

by_crime.order_by('exonerated').line_chart('exonerated', 'count',
                                           'docs/images/lattice.svg')
示例#27
0
def confessions(data):
    num_false_confessions = data['exonerations'].aggregate(agate.Count('false_confession', True))

    print('False confessions: %i' % num_false_confessions)
示例#28
0
# -*- coding: utf-8 -*-
"""
Generates basic tables from okc_relevant_data_genbin.csv.
"""
import agate

# create agate table from gender-binary csv data
okc_table = agate.Table.from_csv('okc_relevant_data_genbin.csv')

# group by gender, returns TableSet
by_gender = okc_table.group_by('gender', 'Gender')
# count aggregate by_gender, returns table
by_gender_count = by_gender.aggregate([('Count', agate.Count())])
# compute by_gender percentages
by_gender_count_percent = by_gender_count.compute([('%',
                                                    agate.Percent('Count'))])
by_gender_count_percent.print_table()
"""
| Gender |  Count |       % |
| ------ | ------ | ------- |
| Woman  |  9,235 | 31.972… |
| Man    | 19,650 | 68.028… |
"""

# group by orientation, returns TableSet
by_orientation = okc_table.group_by('orientation', 'Orientation')
# count aggregate by_orientation, returns table
by_orientation_count = by_orientation.aggregate([('Count', agate.Count())])
# compute by_orientation percentages
by_orientation_count_percent = by_orientation_count.compute([
    ('%', agate.Percent('Count'))
示例#29
0
ic = incorrect.where(lambda r: r['review_decision'] == 'IC')
inc = incorrect.where(lambda r: r['review_decision'] == 'INC')
cc = data.where(lambda r: r['review_decision'] == 'CC')
cnc = data.where(lambda r: r['review_decision'] == 'CNC')

# In[2]:

# game data and play data
game_include = [
    'game_id', 'away', 'home', 'date', 'ref_1', 'ref_2', 'ref_3', 'score_away',
    'score_home', 'original_pdf', 'box_score_url'
]

by_game = incorrect.group_by('game_id')

by_game_with_count = by_game.aggregate([('incorrect', agate.Count())])

game_data = data.select(game_include).distinct()

# merge
game_data_merged = game_data.join(by_game_with_count, 'game_id', 'game_id')
game_data_merged.to_csv('output/web_games.csv')

# In[3]:

play_include = [
    'game_id', 'period', 'time', 'seconds_left', 'call_type',
    'committing_player', 'disadvantaged_player', 'review_decision', 'comment',
    'video', 'committing_team', 'disadvantaged_team', 'play_id',
    'ref_made_call'
]
 def test_aggregate(self):
     exonerations = agate.Table.from_csv(
         '../../../data/exonerations-20150828.csv')
     print(
         exonerations.aggregate(agate.Count('false_confession',
                                            value=True)))