def run(timezones_amount=20):
    """Print and plot the most frequent values of the 'tz' column.

    Loads 'bitlygov_ua.json' through ``load_jsoned_file``, builds a
    DataFrame from it, prints the whole frame, pretty-prints the
    ``timezones_amount`` most frequent time zones and hands the full
    tally to ``plot_top``.

    Records with no 'tz' value are counted under the label 'Missing'.

    Args:
        timezones_amount: number of leading entries of the tally to
            print; also forwarded to ``plot_top``.
    """
    # pd.DataFrame accepts a list of dicts (presumably what
    # load_jsoned_file returns -- verify against the helper).
    frame = pd.DataFrame(load_jsoned_file('bitlygov_ua.json'))
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(frame)

    # Fill the gaps *before* counting: value_counts() drops NaN by default
    # and its counts are never NaN, so filling the result afterwards would
    # never produce a 'Missing' entry.
    tz_counts = frame['tz'].fillna('Missing').value_counts()
    pprint(tz_counts[:timezones_amount])

    print('ploting...')
    plot_top(tz_counts, timezones_amount)
def run():
    """Plot the ten busiest time zones, split by Windows / non-Windows.

    Loads 'bitlygov_ua.json' through ``misc.load_jsoned_file``, keeps the
    rows whose 'a' field is not null, labels each row 'Windows' or
    'Non Windows' depending on whether 'a' contains the text 'Windows',
    counts rows per (time zone, label) pair and draws the ten time zones
    with the largest totals as a stacked horizontal bar chart.
    """
    records = misc.load_jsoned_file('bitlygov_ua.json')
    frame = pd.DataFrame(records)

    # Keep only rows that have a value in 'a' (presumably the user-agent
    # string -- confirm against the data).
    with_agent = frame[frame.a.notnull()]

    # np.where(cond, x, y) picks x where cond is true and y elsewhere, e.g.
    #   np.where([1, 0, 1, 1], "tak", "nie")
    #   -> array(['tak', 'nie', 'tak', 'tak'])
    mentions_windows = with_agent['a'].str.contains('Windows')
    os_labels = np.where(mentions_windows, 'Windows', 'Non Windows')

    # Count rows for every (time zone, OS label) combination.
    grouped = with_agent.groupby([with_agent['tz'], os_labels])
    pair_counts = grouped.size()

    # unstack() turns the OS label level into columns, giving one row per
    # time zone; combinations that never occurred come out as NaN, so
    # replace those with a zero count.
    counts_by_tz = pair_counts.unstack().fillna(0)

    # argsort() on the per-time-zone totals yields the row positions that
    # would sort the totals in ascending order.
    ascending_positions = counts_by_tz.sum(axis=1).argsort()

    # Reorder the rows by those positions and keep the last ten, i.e. the
    # time zones with the largest totals.
    busiest = counts_by_tz.take(ascending_positions)[-10:]
    busiest.plot(kind='barh', stacked=True)