def compute_ranks(table):
    """Add per-metric ranks, standardized ranks and an overall rank.

    Four ``compute`` passes are chained, each applied to the result of the
    previous one:

    1. one ``agate.Rank`` column per metric,
    2. one ``StandadizeScore`` column (suffix ``_std``) per rank column,
    3. ``overall_rank_data``, produced by the ``overall_rank`` formula,
    4. ``overall_rank``, the rank of ``overall_rank_data``.

    :param table: table holding the metric columns named below.
    :returns: the table produced by the last pass.
    """
    # Currently disabled metrics (kept for reference):
    #   open_formats_rank   <- agate.Rank('open_formats', reverse=True)
    #   open_datasets_rank  <- agate.Rank('open_datasets', reverse=True)
    #   update_start_rank   <- agate.Rank('days_between_start_and_last_update')
    #   start_rank          <- agate.Rank('days_since_start', reverse=True)
    #   openess_score       <- agate.Formula(number, openness_score)
    rank_columns = [
        ('dataset_rank', agate.Rank('datasets', reverse=True)),
        ('formats_rank', agate.Rank('format_count', reverse=True)),
        ('last_update_rank', agate.Rank('days_since_last_update')),
        ('category_rank', agate.Rank('category_count', reverse=True)),
        ('category_variance_rank', agate.Rank('category_variance')),
        ('dataset_score_rank', agate.Rank('dataset_score', reverse=True)),
        ('category_score_rank', agate.Rank('category_score', reverse=True)),
    ]

    # Every rank column gets a standardized twin named '<rank column>_std'.
    std_columns = [
        (rank_name + '_std', StandadizeScore(rank_name))
        for rank_name, _ in rank_columns
    ]

    ranked = table.compute(rank_columns)
    standardized = ranked.compute(std_columns)
    scored = standardized.compute([
        ('overall_rank_data', agate.Formula(agate.Number(), overall_rank)),
    ])
    return scored.compute([
        ('overall_rank', agate.Rank('overall_rank_data')),
    ])
def compute_ranks(self):
    """Add rank columns and an overall rank to ``self.orgs_table``.

    ``self.orgs_table`` is replaced three times: first with one
    ``agate.Rank`` column per metric, then with ``overall_rank_data`` (the
    ``overall_rank`` formula), and finally with ``overall_rank``, which ranks
    ``overall_rank_data`` using ``reverse=True``. Nothing is returned.
    """
    # (new rank column, column it is ranked on)
    # NOTE(review): 'open_formats_rank' is ranked on 'datasets', the same
    # source as 'dataset_rank' -- possibly a copy/paste slip; confirm which
    # column was intended before changing it.
    rank_sources = (
        ('dataset_rank', 'datasets'),
        ('formats_rank', 'format_count'),
        ('open_formats_rank', 'datasets'),
        ('last_update_rank', 'days_since_last_update'),
        ('open_datasets_rank', 'open_datasets'),
    )
    per_metric_ranks = [
        (rank_name, agate.Rank(source))
        for rank_name, source in rank_sources
    ]
    self.orgs_table = self.orgs_table.compute(per_metric_ranks)

    combined_score = agate.Formula(number, overall_rank)
    self.orgs_table = self.orgs_table.compute([
        ('overall_rank_data', combined_score),
    ])

    final_rank = agate.Rank('overall_rank_data', reverse=True)
    self.orgs_table = self.orgs_table.compute([
        ('overall_rank', final_rank),
    ])
def unicef_data():
    """Load the UNICEF spreadsheet and return it as a ranked agate table.

    Reads the first sheet of 'unicef_oct_2014.xlsx'. Sheet rows at index 4
    and 5 are joined cell by cell into the column titles, rows at index 6
    through 113 are the data rows (every cell passed through
    ``remove_bad_chars``), and the column types come from ``get_types``
    applied to the row at index 6. The returned table carries an extra
    'Total Child Labor Rank' column: an ``agate.Rank`` of 'Total (%)' with
    ``reverse=True``.
    """
    book = xlrd.open_workbook('unicef_oct_2014.xlsx')
    sheet = book.sheets()[0]

    # Two stacked header rows become one stripped title per column.
    header_pairs = zip(sheet.row_values(4), sheet.row_values(5))
    titles = [(upper + ' ' + lower).strip() for upper, lower in header_pairs]

    cleaned_rows = [
        [remove_bad_chars(cell) for cell in sheet.row_values(index)]
        for index in range(6, 114)
    ]

    # The first data row serves as the sample for type detection.
    types = get_types(sheet.row(6))

    table = agate.Table(cleaned_rows, titles, types)
    return table.compute([
        ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
    ])
(lambda x: 'Positive' if x >= 1 else 'Zero or Negative')(0) (lambda x: 'Positive' if x >= 1 else 'Zero or Negative')(4) #table.columns['Place of residence (%) Urban'].aggregate(agate.Mean()) col = table.columns['Place of residence (%) Urban'] table.aggregate(agate.Mean('Place of residence (%) Urban')) has_por = table.where(lambda r: r['Place of residence (%) Urban'] is not None) has_por.aggregate(agate.Mean('Place of residence (%) Urban')) first_match = has_por.find(lambda x: x['Rural'] > 50) print(first_match['Countries and areas']) ranked = table.compute([ ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)), ]) for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows: print row['Total (%)'], row['Total Child Labor Rank'] def reverse_percent(row): return 100 - row['Total (%)'] ranked = table.compute([ ('Children not working (%)', agate.Formula(number_type, reverse_percent)), ]) ranked = ranked.compute([ ('Total Child Labor Rank', agate.Rank('Children not working (%)')),
# Null 값을 제외하고 aggregate에 Mean 함수를 넣어 계산해본다. # 이를 이용하여 Min과 Max도 계산이 가능하다. has_por = table.where(lambda x: x['Place of residence (%) Urban'] is not None) print(has_por.aggregate(agate.Mean('Place of residence (%) Urban'))) print() # 지방 아동 노동률이 50% 이상인 행 가운데 하나를 찾아 보자. # 조건을 만족하는 첫 번째 행을 반환한다. first_match = has_por.find(lambda x: x['Rural'] > 50) print(first_match['Countries and areas']) print() # 아동 노동률이 높은 국가의 순위를 알아보자 # 이를 위해서는 Total(%) 열을 기반으로 데이터를 정렬하면 된다. ranked = table.compute([ ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)), ]) for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows: print(row['Total (%)'], row['Total Child Labor Rank']) print() # reverse를 사용하지 않고 오름차순 정렬을 하고 싶다면 역 백분율을 기준으로 열을 생성하면 된다. def reverse_percent(row): return 100 - row['Total (%)'] ranked = table.compute([('Children not working (%)', agate.Formula(agate.Number(), reverse_percent))]) ranked = ranked.compute([ ('Total Child Labor Rank', agate.Rank('Children not working (%)')),
#~
#~ STEP 1
#~
#~ Build a shortlist of courses from a popularity ranking derived from the
#~ stated preferences.
#~

# Preference matrix: one row per course, one count column per preference.
pref_count = prefs_n.pivot('course', 'preference')

# Attach the popularity index ('pop'), rank the courses on it with
# reverse=True, and order the table by that rank.
pref_count = (
    pref_count
    .compute([('pop', agate.Formula(agate.Number(), popularity_function))])
    .compute([('rank', agate.Rank('pop', reverse=True))])
    .order_by('rank')
)

# Column order for the output: rank, pop, course, then the remaining
# (preference) columns sorted by name.
pref_count_na = ['rank', 'pop', 'course'] + sorted(
    pref_count.exclude(['course', 'pop', 'rank']).column_names
)
pref_count = pref_count.select(pref_count_na)

# Write the full ranked list to disk.
pref_count.to_csv(outputdir + 'longlist.csv')

# make sure the n_courses matches the length of the course list. For duplicating courses
# add a 'runtimes' column which indicates if columns are to run multiple times
# Keep only the rows whose continent is 'africa'.
africa_cpi_cl = cpi_and_cl.where(lambda row: row['continent'] == 'africa')

# List the selected rows, ordered by 'Total (%)' descending, next to their
# CPI score.
africa_by_child_labor = africa_cpi_cl.order_by('Total (%)', reverse=True)
for r in africa_by_child_labor.rows:
    print(f"{r['Country / Territory']}: {r['Total (%)']}% - {r['CPI 2013 Score']}")
print()

# Correlation coefficient between child labor percentage and CPI score,
# taken from the off-diagonal entry of the correlation matrix.
child_labor_values = [
    float(t) for t in africa_cpi_cl.columns['Total (%)'].values()
]
cpi_values = [
    float(c) for c in africa_cpi_cl.columns['CPI 2013 Score'].values()
]
print(numpy.corrcoef(child_labor_values, cpi_values)[0, 1])
print()

africa_cpi_cl = africa_cpi_cl.compute([
    ('Africa Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
])
africa_cpi_cl = africa_cpi_cl.compute([
    ('Africa CPI Rank', agate.Rank('CPI 2013 Score')),
])

# Author's note: the correlation coefficient decreased compared with the one
# computed over the whole data set, which means that, looking at the African
# data alone, child labor and perceived corruption are somewhat more closely
# related.
# Next, take the mean CPI score and the mean child labor percentage, and look
# for the countries with the highest child labor rates and the worst
# perceived corruption.
cl_mean = africa_cpi_cl.aggregate(agate.Mean('Total (%)'))
cpi_mean = africa_cpi_cl.aggregate(agate.Mean('CPI 2013 Score'))


def highest_rates(row):
    """Return True when child labor is above the mean and CPI is below it."""
    return row['Total (%)'] > cl_mean and row['CPI 2013 Score'] < cpi_mean
# Best-effort call of mean() on the unfiltered column; a failure is ignored
# on purpose so the script carries on.
try:
    table.columns['Place of residence (%) Urban'].mean()
except Exception:
    # Catch Exception rather than using a bare ``except`` so that
    # KeyboardInterrupt / SystemExit still propagate.
    # NOTE(review): presumably this fails because the column holds nulls
    # (they are filtered out right below) -- confirm the exact exception type.
    pass

# Keep only the rows whose urban value is not None, then aggregate on them.
# The aggregate results below are not stored.
has_por = table.where(lambda r: r['Place of residence (%) Urban'] is not None)

has_por.columns['Place of residence (%) Urban'].aggregate(agate.Mean())
has_por.columns['Place of residence (%) Urban'].aggregate(agate.Max())

has_por.columns['Rural'].aggregate(agate.Mean())
has_por.columns['Rural'].aggregate(agate.Max())

# Look up a row whose 'Rural' value is above 50 (result discarded).
has_por.find(lambda x: x['Rural'] > 50)

# Rank on 'Total (%)' with reverse=True. The tuples here are written as
# (computation, new column name).
ranked = table.compute([(agate.Rank('Total (%)', reverse=True),
                         'Total Child Labor Rank')])

# If we wanted a column showing children not working percentage ...


def reverse_percent(row):
    """Return 100 minus the row's 'Total (%)' value."""
    return 100 - row['Total (%)']


table = table.compute([(agate.Formula(number_type, reverse_percent),
                        'Children not working (%)')])

# some investigation into other possible connections
hiv_workbook = xlrd.open_workbook(DATA_FOLDER + 'hiv_aids_2014.xlsx')
hiv_workbook.sheet_names()
# Join each pair of stacked header cells into one stripped column title.
titles = [(t[0] + ' ' + t[1]).strip() for t in title_rows]

# Data rows sit at sheet indexes 6..113; every cell goes through
# remove_bad_chars.
country_rows = [sheet.row_values(r) for r in range(6, 114)]
cleaned_rows = [
    [remove_bad_chars(rv) for rv in country_row]
    for country_row in country_rows
]

# The first data row serves as the sample for type detection.
example_row = sheet.row(6)
types = get_types(example_row)

table = agate.Table(cleaned_rows, titles, types)
ranked = table.compute([
    ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
])

# Load the CPI workbook and dump every row of its first sheet for inspection.
cpi_workbook = xlrd.open_workbook(
    '../../data/chp9/corruption_perception_index.xls')
cpi_sheet = cpi_workbook.sheets()[0]

for r in range(cpi_sheet.nrows):
    print(r, cpi_sheet.row_values(r))

# CPI titles come from sheet rows 1 and 2; the data starts at row 3.
cpi_title_rows = zip(cpi_sheet.row_values(1), cpi_sheet.row_values(2))
cpi_titles = [(t[0] + ' ' + t[1]).strip() for t in cpi_title_rows]
cpi_rows = [cpi_sheet.row_values(r) for r in range(3, cpi_sheet.nrows)]
titles = [t[0] + ' ' + t[1] for t in title_rows] titles = [t.strip() for t in titles] country_rows = [sheet.row_values(r) for r in range(6, 114)] cleaned_rows = [] for row in country_rows: cleaned_row = [remove_bad_chars(rv) for rv in row] cleaned_rows.append(cleaned_row) example_row = sheet.row(6) types = get_types(example_row) table = agate.Table(cleaned_rows, titles, types) ranked = table.compute([ ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)), ]) cpi_workbook = xlrd.open_workbook( '../../data/chp9/corruption_perception_index.xls') cpi_sheet = cpi_workbook.sheets()[0] for r in range(cpi_sheet.nrows): print r, cpi_sheet.row_values(r) cpi_title_rows = zip(cpi_sheet.row_values(1), cpi_sheet.row_values(2)) cpi_titles = [t[0] + ' ' + t[1] for t in cpi_title_rows] cpi_titles = [t.strip() for t in cpi_titles] cpi_rows = [cpi_sheet.row_values(r) for r in range(3, cpi_sheet.nrows)]