Пример #1
0
def compute_ranks(table):
    """Add per-metric ranks, standardized ranks and an overall rank.

    The columns are added in four ``compute`` passes:

    1. one ``agate.Rank`` column per metric,
    2. one ``StandadizeScore`` column (suffix ``_std``) per rank column
       created in pass 1,
    3. ``overall_rank_data``, produced by the ``overall_rank`` formula,
    4. ``overall_rank``, an ``agate.Rank`` over ``overall_rank_data``.

    Returns the table produced by the final pass.
    """
    # Ranks for open_formats, open_datasets,
    # days_between_start_and_last_update, days_since_start and an
    # openness-score formula were commented out in the original and stay
    # disabled here.
    rank_specs = [
        ('dataset_rank', agate.Rank('datasets', reverse=True)),
        ('formats_rank', agate.Rank('format_count', reverse=True)),
        ('last_update_rank', agate.Rank('days_since_last_update')),
        ('category_rank', agate.Rank('category_count', reverse=True)),
        ('category_variance_rank', agate.Rank('category_variance')),
        ('dataset_score_rank', agate.Rank('dataset_score', reverse=True)),
        ('category_score_rank', agate.Rank('category_score', reverse=True)),
    ]
    ranked = table.compute(rank_specs)

    # Every rank column gets a "<rank column>_std" companion, in the same
    # order as the rank columns themselves.
    std_specs = [
        (rank_name + '_std', StandadizeScore(rank_name))
        for rank_name, _ in rank_specs
    ]
    standardized = ranked.compute(std_specs)

    scored = standardized.compute([
        ('overall_rank_data', agate.Formula(agate.Number(), overall_rank)),
    ])

    return scored.compute([
        ('overall_rank', agate.Rank('overall_rank_data')),
    ])
 def compute_ranks(self):
     """Add rank columns and an overall rank to ``self.orgs_table``.

     ``self.orgs_table`` is replaced three times: first with the
     per-metric ``agate.Rank`` columns, then with ``overall_rank_data``
     (computed by the ``overall_rank`` formula), and finally with
     ``overall_rank``, a reversed rank over ``overall_rank_data``.
     Nothing is returned; the result is stored on the instance.
     """
     self.orgs_table = self.orgs_table.compute([
         ('dataset_rank', agate.Rank('datasets')),
         ('formats_rank', agate.Rank('format_count')),
         # Previously ranked on 'datasets', which made this column a copy
         # of dataset_rank. NOTE(review): assumes orgs_table has an
         # 'open_formats' column -- confirm against the table schema.
         ('open_formats_rank', agate.Rank('open_formats')),
         ('last_update_rank', agate.Rank('days_since_last_update')),
         ('open_datasets_rank', agate.Rank('open_datasets')),
     ])
     self.orgs_table = self.orgs_table.compute([
         ('overall_rank_data', agate.Formula(number, overall_rank)),
     ])
     self.orgs_table = self.orgs_table.compute([
         ('overall_rank', agate.Rank('overall_rank_data', reverse=True)),
     ])
Пример #3
0
def unicef_data():
    """Load the UNICEF workbook and return a ranked agate table.

    Reads the first sheet of ``unicef_oct_2014.xlsx``. Column titles are
    built by joining sheet rows 4 and 5 cell by cell; sheet rows 6 through
    113 are the data rows, each value passed through ``remove_bad_chars``.
    Column types come from ``get_types`` applied to sheet row 6. The
    returned table carries an extra ``'Total Child Labor Rank'`` column,
    a reversed rank over ``'Total (%)'``.
    """
    book = xlrd.open_workbook('unicef_oct_2014.xlsx')
    first_sheet = book.sheets()[0]

    # Each title is "<row 4 cell> <row 5 cell>" with outer whitespace removed.
    header_pairs = zip(first_sheet.row_values(4), first_sheet.row_values(5))
    column_titles = [(upper + ' ' + lower).strip()
                     for upper, lower in header_pairs]

    raw_rows = [first_sheet.row_values(index) for index in range(6, 114)]
    data_rows = [[remove_bad_chars(value) for value in raw_row]
                 for raw_row in raw_rows]

    column_types = get_types(first_sheet.row(6))

    unranked = agate.Table(data_rows, column_titles, column_types)
    return unranked.compute([
        ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
    ])
Пример #4
0
# Exercise a conditional lambda on both branches (results are discarded).
(lambda x: 'Positive' if x >= 1 else 'Zero or Negative')(0)
(lambda x: 'Positive' if x >= 1 else 'Zero or Negative')(4)

#table.columns['Place of residence (%) Urban'].aggregate(agate.Mean())
col = table.columns['Place of residence (%) Urban']
table.aggregate(agate.Mean('Place of residence (%) Urban'))

# Keep only rows that have an urban value, then average over those.
has_por = table.where(lambda r: r['Place of residence (%) Urban'] is not None)
has_por.aggregate(agate.Mean('Place of residence (%) Urban'))

# First row whose 'Rural' value is above 50.
first_match = has_por.find(lambda x: x['Rural'] > 50)
print(first_match['Countries and areas'])

# Rank on 'Total (%)' with reverse=True and show the 20 largest totals.
ranked = table.compute([
    ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
])
for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows:
    # print() call rather than the Python 2 print statement, matching the
    # print(...) call used earlier in this snippet.
    print(row['Total (%)'], row['Total Child Labor Rank'])


def reverse_percent(row):
    """Return 100 minus the row's ``'Total (%)'`` value.

    Returns ``None`` when ``'Total (%)'`` is ``None``, so a row without a
    total yields a null result instead of raising ``TypeError`` on
    ``100 - None``.
    """
    total = row['Total (%)']
    if total is None:
        return None
    return 100 - total


# Add a 'Children not working (%)' column computed row by row with
# reverse_percent.
not_working_formula = agate.Formula(number_type, reverse_percent)
ranked = table.compute([('Children not working (%)', not_working_formula)])

ranked = ranked.compute([
    ('Total Child Labor Rank', agate.Rank('Children not working (%)')),
Пример #5
0
# Exclude null values, then pass Mean to aggregate to compute the average.
# Min and Max can be computed the same way.
has_por = table.where(
    lambda row: row['Place of residence (%) Urban'] is not None)
urban_mean = has_por.aggregate(agate.Mean('Place of residence (%) Urban'))
print(urban_mean)
print()

# Look for a row where the rural child labor rate is above 50%.
# find returns the first row that satisfies the condition.
first_match = has_por.find(lambda row: row['Rural'] > 50)
print(first_match['Countries and areas'])
print()

# Find out how countries rank by child labor rate.
# To do that, order the data by the 'Total (%)' column.
ranked = table.compute([
    ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
])
top_twenty = ranked.order_by('Total (%)', reverse=True).limit(20)
for row in top_twenty.rows:
    print(row['Total (%)'], row['Total Child Labor Rank'])
print()


# To get an ascending order without using reverse, build a column from the
# inverse percentage instead.
def reverse_percent(row):
    """Return 100 minus the row's ``'Total (%)'`` value.

    Returns ``None`` when ``'Total (%)'`` is ``None``, so a row without a
    total yields a null result instead of raising ``TypeError`` on
    ``100 - None``.
    """
    total = row['Total (%)']
    if total is None:
        return None
    return 100 - total


# Add a 'Children not working (%)' column computed row by row with
# reverse_percent.
not_working_formula = agate.Formula(agate.Number(), reverse_percent)
ranked = table.compute([
    ('Children not working (%)', not_working_formula),
])
ranked = ranked.compute([
    ('Total Child Labor Rank', agate.Rank('Children not working (%)')),
Пример #6
0
#~
#~ STEP 1
#~
#~ Create a shortlist of courses by using a popularity ranking based on preferences.
#~

# Pivot the preferences into a per-course count of each preference position
# (the "preference matrix").
pref_count = prefs_n.pivot('course', 'preference')

# Score every course with popularity_function, rank the scores with
# reverse=True, and order the table by that rank.
pref_count = pref_count.compute([
    ('pop', agate.Formula(agate.Number(), popularity_function)),
])
pref_count = pref_count.compute([
    ('rank', agate.Rank('pop', reverse=True)),
])
pref_count = pref_count.order_by('rank')

# Column order: rank, pop, course, then the remaining (preference) columns
# sorted by name.
pref_count_na = ['rank', 'pop', 'course'] + sorted(
    pref_count.exclude(['course', 'pop', 'rank']).column_names)
pref_count = pref_count.select(pref_count_na)

# Write the long list to disk.
pref_count.to_csv(outputdir + 'longlist.csv')

# make sure the n_courses matches the length of the course list. For duplicating courses
# add a 'runtimes' column which indicates if columns are to run multiple times
Пример #7
0
# Restrict the joined table to rows whose continent is 'africa'.
africa_cpi_cl = cpi_and_cl.where(lambda row: row['continent'] == 'africa')

by_child_labor = africa_cpi_cl.order_by('Total (%)', reverse=True)
for r in by_child_labor.rows:
    print(f"{r['Country / Territory']}: {r['Total (%)']}% - {r['CPI 2013 Score']}")
print()

# Correlation coefficient between child labor percentage and CPI score.
child_labor_values = [
    float(t) for t in africa_cpi_cl.columns['Total (%)'].values()
]
cpi_values = [
    float(c) for c in africa_cpi_cl.columns['CPI 2013 Score'].values()
]
print(numpy.corrcoef(child_labor_values, cpi_values)[0, 1])
print()

africa_cpi_cl = africa_cpi_cl.compute([
    ('Africa Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
])
africa_cpi_cl = africa_cpi_cl.compute([
    ('Africa CPI Rank', agate.Rank('CPI 2013 Score')),
])

# The correlation coefficient is lower than when it was computed over the
# whole data set. Looking at the African data alone, child labor and
# perceived corruption are somewhat more closely related.

# Compute the mean corruption perception score and the mean child labor
# percentage, to then find the countries with the highest child labor rates
# and the worst corruption perception.
cl_mean = africa_cpi_cl.aggregate(agate.Mean('Total (%)'))
cpi_mean = africa_cpi_cl.aggregate(agate.Mean('CPI 2013 Score'))


def highest_rates(row):
    """Tell whether a row is above ``cl_mean`` and below ``cpi_mean``.

    Returns ``True`` only when the row's ``'Total (%)'`` is greater than
    the module-level ``cl_mean`` and its ``'CPI 2013 Score'`` is less than
    the module-level ``cpi_mean``; otherwise ``False``.
    """
    # Guard first so the CPI comparison is only evaluated when needed.
    if not row['Total (%)'] > cl_mean:
        return False
    return bool(row['CPI 2013 Score'] < cpi_mean)
# Best-effort attempt to call mean() directly on the column; a failure must
# not stop the script. NOTE(review): presumably this fails because of null
# values in the column -- confirm.
try:
    table.columns['Place of residence (%) Urban'].mean()
except Exception as exc:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt
    # and SystemExit still propagate, and report the error instead of
    # hiding it.
    print('mean() on the unfiltered column failed: %r' % (exc,))

# Keep only the rows that have an urban place-of-residence value.
has_por = table.where(
    lambda row: row['Place of residence (%) Urban'] is not None)

# Mean and max of the urban column, then of the rural column (the results
# are not stored).
for column_name in ('Place of residence (%) Urban', 'Rural'):
    has_por.columns[column_name].aggregate(agate.Mean())
    has_por.columns[column_name].aggregate(agate.Max())

# First row whose 'Rural' value is above 50 (result is not stored).
has_por.find(lambda row: row['Rural'] > 50)

# NOTE(review): the tuple here is (computation, column name); other snippets
# in this file pass (column name, computation) -- confirm which order the
# installed agate version expects.
total_rank = agate.Rank('Total (%)', reverse=True)
ranked = table.compute([(total_rank, 'Total Child Labor Rank')])

# If we wanted a column showing children not working percentage ...


def reverse_percent(row):
    """Return 100 minus the row's ``'Total (%)'`` value.

    Returns ``None`` when ``'Total (%)'`` is ``None``, so a row without a
    total yields a null result instead of raising ``TypeError`` on
    ``100 - None``.
    """
    total = row['Total (%)']
    if total is None:
        return None
    return 100 - total


# Add a 'Children not working (%)' column computed with reverse_percent.
# NOTE(review): the tuple here is (computation, column name) -- confirm this
# matches the installed agate version.
not_working_formula = agate.Formula(number_type, reverse_percent)
table = table.compute([(not_working_formula, 'Children not working (%)')])

# Look into other possible connections: open the HIV/AIDS workbook and list
# its sheet names (the list is not stored).
hiv_workbook = xlrd.open_workbook(DATA_FOLDER + 'hiv_aids_2014.xlsx')
hiv_workbook.sheet_names()
Пример #9
0
# Each title is the two header cells joined by a space, with surrounding
# whitespace removed.
titles = [(t[0] + ' ' + t[1]).strip() for t in title_rows]

# Sheet rows 6 through 113 are the data rows; every value is passed through
# remove_bad_chars.
country_rows = [sheet.row_values(r) for r in range(6, 114)]
cleaned_rows = [
    [remove_bad_chars(rv) for rv in row]
    for row in country_rows
]

# Column types are derived from sheet row 6 by get_types.
example_row = sheet.row(6)
types = get_types(example_row)

table = agate.Table(cleaned_rows, titles, types)
ranked = table.compute([
    ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
])


# Corruption Perception Index workbook: use its first sheet.
cpi_workbook = xlrd.open_workbook(
    '../../data/chp9/corruption_perception_index.xls')
cpi_sheet = cpi_workbook.sheets()[0]

# Print every row with its index to inspect the sheet layout.
for r in range(cpi_sheet.nrows):
    print(r, cpi_sheet.row_values(r))

# Titles come from sheet rows 1 and 2; data rows start at sheet row 3.
cpi_title_rows = zip(cpi_sheet.row_values(1), cpi_sheet.row_values(2))
cpi_titles = [(t[0] + ' ' + t[1]).strip() for t in cpi_title_rows]
cpi_rows = [cpi_sheet.row_values(r) for r in range(3, cpi_sheet.nrows)]

Пример #10
0
# Each title is the two header cells joined by a space; surrounding
# whitespace is stripped in a second pass.
titles = [t[0] + ' ' + t[1] for t in title_rows]
titles = [t.strip() for t in titles]

# Sheet rows 6 through 113 are the data rows; every value is passed through
# remove_bad_chars.
country_rows = [sheet.row_values(r) for r in range(6, 114)]
cleaned_rows = []

for row in country_rows:
    cleaned_row = [remove_bad_chars(rv) for rv in row]
    cleaned_rows.append(cleaned_row)

# Column types are derived from sheet row 6 by get_types.
example_row = sheet.row(6)
types = get_types(example_row)

table = agate.Table(cleaned_rows, titles, types)
ranked = table.compute([
    ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
])

# Corruption Perception Index workbook: use its first sheet.
cpi_workbook = xlrd.open_workbook(
    '../../data/chp9/corruption_perception_index.xls')
cpi_sheet = cpi_workbook.sheets()[0]

# Print every row with its index to inspect the sheet layout. Uses the
# print() call form; the Python 2 print statement is a SyntaxError on
# Python 3.
for r in range(cpi_sheet.nrows):
    print(r, cpi_sheet.row_values(r))

# Titles come from sheet rows 1 and 2; data rows start at sheet row 3.
cpi_title_rows = zip(cpi_sheet.row_values(1), cpi_sheet.row_values(2))
cpi_titles = [t[0] + ' ' + t[1] for t in cpi_title_rows]
cpi_titles = [t.strip() for t in cpi_titles]
cpi_rows = [cpi_sheet.row_values(r) for r in range(3, cpi_sheet.nrows)]