def race_and_age(data):
    """Print median years in prison broken down by race and age decade.

    Reads the table stored under ``data['with_years_in_prison']``, drops
    rows whose ``age`` is ``None``, groups the remainder by ``race`` and
    then by an age-decade label, aggregates a row count and the median of
    ``years_in_prison`` per group, sorts by that median in descending
    order and prints at most 10 rows.

    Args:
        data: Mapping that holds the source table under the key
            ``'with_years_in_prison'``.
    """
    def has_age(row):
        # Rows without an age cannot be assigned to a decade.
        return row['age'] is not None

    def age_decade(row):
        # Floor-divide by ten and append "0s": 23 -> '20s', 37 -> '30s'.
        return '%i0s' % (row['age'] // 10)

    (
        data['with_years_in_prison']
        .where(has_age)
        .group_by('race')
        .group_by(age_decade, key_name='age_group')
        .aggregate([
            ('count', agate.Count()),
            ('median_years_in_prison', agate.Median('years_in_prison')),
        ])
        .order_by('median_years_in_prison', reverse=True)
        .print_table(max_rows=10)
    )
def test_computing_new_columns(self):
    """Compute a new column from two existing ones and print its median.

    Loads the exonerations CSV, adds a ``years_in_prison`` column defined
    as ``agate.Change('convicted', 'exonerated')``, and prints the median
    of that new column.
    """
    csv_path = '../../../data/exonerations-20150828.csv'
    source_table = agate.Table.from_csv(csv_path)

    # One (new column name, computation) pair describing the derived column.
    years_column = ('years_in_prison', agate.Change('convicted', 'exonerated'))
    augmented = source_table.compute([years_column])

    median_years = augmented.aggregate(agate.Median('years_in_prison'))
    print(median_years)
def median_age(data):
    """Print the median age and three bar charts of the age distribution.

    Args:
        data: Mapping that holds the source table under the key
            ``'exonerations'``.
    """
    exonerations = data['exonerations']

    age_median = exonerations.aggregate(agate.Median('age'))
    print('Median age at time of arrest: %i' % age_median)

    # Chart 1: bins requested explicitly as bins('age', 10, 0, 100).
    fixed_bins = exonerations.bins('age', 10, 0, 100)
    fixed_bins.print_bars('age', width=80)

    # Chart 2: pivot on the raw age values, ordered by age.
    per_age = exonerations.pivot('age').order_by('age')
    per_age.print_bars('age', width=80)

    # Chart 3: bins chosen by the library defaults.
    default_bins = exonerations.bins('age')
    default_bins.print_bars('age', width=80)
def states(data):
    """Print the states ranked by median years in prison.

    Groups ``data['with_years_in_prison']`` by ``state``, aggregates the
    median per state, sorts the result in descending order of
    ``median_years_in_prison`` and prints the output of
    ``format(max_rows=5)``.

    Args:
        data: Mapping that holds the source table under the key
            ``'with_years_in_prison'``.
    """
    by_state = data['with_years_in_prison'].group_by('state')

    # NOTE(review): this three-element tuple looks like the older agate
    # aggregate signature (source column, aggregation, output column) --
    # confirm against the agate version this file targets.
    median_spec = ('years_in_prison', agate.Median(), 'median_years_in_prison')

    ranked = by_state.aggregate([median_spec]).order_by(
        'median_years_in_prison', reverse=True
    )
    print(ranked.format(max_rows=5))
def states(data):
    """Print two per-state summaries of the exoneration data.

    The first table lists states by row count, largest first. The second
    lists states by median ``years_in_prison``, largest first, alongside
    the row count. Each table is limited to 5 rows.

    Args:
        data: Mapping that holds the source table under the key
            ``'with_years_in_prison'``.
    """
    grouped = data['with_years_in_prison'].group_by('state')

    # Table 1: number of rows per state.
    grouped.aggregate([
        ('count', agate.Count()),
    ]).order_by('count', reverse=True).print_table(max_rows=5)

    # Table 2: count plus median time served, ranked by the median.
    grouped.aggregate([
        ('count', agate.Count()),
        ('median_years_in_prison', agate.Median('years_in_prison')),
    ]).order_by('median_years_in_prison', reverse=True).print_table(max_rows=5)
# Build a country -> parent lookup from the JSON file, tag every row of
# cpi_and_cl with a 'continent' value, summarise by continent and pickle
# the augmented table.
# NOTE(review): `f`, `cpi_and_cl` and `get_country` are defined outside this
# excerpt; `f` is presumably an open file handle from an enclosing `with`.
country_json = json.load(f)
country_dict = {}
# Map each entry's 'name' to its 'parent'.
# NOTE(review): presumably `get_country` reads `country_dict` -- confirm.
for dct in country_json:
    country_dict[dct['name']] = dct['parent']

# Add a text column 'continent' computed per row by get_country.
cpi_and_cl = cpi_and_cl.compute([('continent', agate.Formula(agate.Text(), get_country))])
grp_by_cont = cpi_and_cl.group_by('continent')
print(grp_by_cont)
# Show how many rows fall into each continent group.
for cont, table in grp_by_cont.items():
    print(cont, len(table.rows))

# Inspecting the output by eye, Africa and Asia appear to have high values.
# That alone, however, is not an easy way to approach the data.
# This is where aggregation methods come in.
# Compare how the continents differ with respect to perceived government
# corruption and child labor.
agg = grp_by_cont.aggregate([
    ('cl_mean', agate.Mean('Total (%)')),
    ('cl_max', agate.Max('Total (%)')),
    ('cpi_median', agate.Median('CPI 2013 Score')),
    ('cpi_min', agate.Min('CPI 2013 Score'))
])
agg.print_table()
print()
agg.print_bars('continent', 'cl_max')

# Persist the table (including the new 'continent' column) next to this script.
with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cpi_and_cl_2.pickle'), 'wb') as f:
    pickle.dump(cpi_and_cl, f)
import agate #tester = agate.TypeTester(force={'通道': agate.Boolean()})强制指定类型 purchases = agate.Table.from_csv('data.csv', encoding='GBK') print(purchases) #agate.Table.print_structure(purchases) #两种查看列名和数据类型 print(purchases.columns['通道']) exonerations = agate.Table.from_csv('20.csv', row_names=lambda r: '%(电池名)s' % (r)) #添加列名 print(exonerations.rows[0]) print(exonerations.rows['M45182110100D3']) for row in exonerations.rows[:5]: print(row['电池名']) median = exonerations.aggregate(agate.Median('通道')) print(median) with_age = exonerations.where(lambda row: row['通道'] is not None ) #where方法按照lambda 函数筛选出符合条件的所有行,组成新的Table for row in exonerations.rows[:5]: print(row[:]) print(len(exonerations.rows) - len(with_age.rows)) with_years_in_prison = exonerations.compute([ ('years_in_prison', agate.Change('convicted', 'exonerated')) ]) with_years_in_prison.aggregate(agate.Median('years_in_prison')) #print(with_age) #by_county = purchases.group_by('电池名') '''totals = by_county.aggregate([ ('county_cost', agate.Sum('total_cost')) ])
def median_age(data):
    """Print the median of the ``age`` column, ignoring rows without an age.

    Args:
        data: Mapping that holds the source table under the key
            ``'exonerations'``.
    """
    def has_age(row):
        # Keep only rows that actually carry an age value.
        return row['age'] is not None

    age_column = data['exonerations'].where(has_age).columns['age']
    print('Median age at time of arrest: %i' % age_column.aggregate(agate.Median()))
if __name__ == '__main__': data_lists = generate_test_data() # Create data table tbl = agate.Table(data_lists, column_names=column_names, column_types=column_types) # Produce summary table by_payband = tbl.group_by('pb') summary_tbl = by_payband.aggregate([('count', agate.Count()), ('sal_min', agate.Min('salary')), ('sal_max', agate.Max('salary')), ('sal_median', agate.Median('salary')) ]) # Display summary of generated test data print('Model data summary:\n') summary_tbl.print_table() print() summary_tbl.print_bars('pb', 'count', width=40) # ---Generate random numbers for simulation new_table = _add_random_column(tbl) # Show distributions of new table rand_tbl_count = new_table.pivot('random_group') rand_tbl_count = rand_tbl_count.order_by('random_group')
# Decode the text cells of `data_row` in place, one index at a time.
# NOTE(review): `data_row` comes from outside this excerpt, presumably from
# an enclosing loop over rows -- the original nesting is not visible here.
# NOTE(review): `unicode` exists only on Python 2, and calling .decode() on a
# value that is already unicode may raise for non-ASCII text -- confirm.
c = 0
for item in data_row:
    if isinstance(item, (str, unicode)):
        data_row[c] = item.decode('utf-8', 'replace')
    c += 1

# Build the homicide table and keep only rows whose 'GHO (DISPLAY)' value
# contains the substring 'rates'.
homicide_table = get_table(homicide_data, homicide_types, homicide_titles)
homicide_table = homicide_table.where(lambda x: 'rates' in x['GHO (DISPLAY)'])

# Inner-join on country name: 'Countries and areas' in africa_ranked against
# 'COUNTRY (DISPLAY)' in the homicide table.
hom_and_cl = africa_ranked.join(homicide_table, 'Countries and areas', 'COUNTRY (DISPLAY)', inner=True)
print(hom_and_cl.columns['Numeric'].aggregate(agate.Median()))
# List the joined rows from highest to lowest 'Numeric' value.
for r in hom_and_cl.order_by('Numeric', reverse=True).rows:
    print(r['Countries and areas'], r['Numeric'], r['Total (%)'])

# Investigating CPI (Corruption perception index)
pci_workbook = xlrd.open_workbook(DATA_FOLDER + 'perceived_corruption_index.xls')
pci_sheet = pci_workbook.sheets()[0]
# Dump every row of the first sheet with its index.
for r in range(pci_sheet.nrows):
    print(r, pci_sheet.row_values(r))

# Column titles are built by pairing the cells of sheet rows 1 and 2 and
# joining each pair with a single space.
pci_title_rows = zip(pci_sheet.row_values(1), pci_sheet.row_values(2))
pci_titles = [t[0] + ' ' + t[1] for t in pci_title_rows]