def zscore(df, index=False, datecolumn='acquisition'):
    """Standardise every column against its day-of-year statistics.

    For each row the mean and standard deviation of all rows sharing the
    same day of year (across all years) are computed per column, and the
    row is replaced by ``(value - mean) / std``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.  It is not modified.
    index : bool, default False
        If false, dates are read from ``df[datecolumn]`` and that column is
        dropped from the result; if true, ``df.index`` is used as the dates.
    datecolumn : str, default 'acquisition'
        Name of the date column, used only when ``index`` is false.

    Returns
    -------
    pandas.DataFrame
        Z-scores with a ``DatetimeIndex`` named ``'date'``.  The standard
        deviation is the population one (``ddof=0``, NaN-skipping), i.e.
        what ``numpy.nanstd`` computes.  Days with a single observation or
        zero spread yield NaN/inf.
    """
    import pandas as pd

    if not index:
        dates = pd.DatetimeIndex(df[datecolumn])
        df = df.drop(datecolumn, axis=1)
    else:
        dates = pd.DatetimeIndex(df.index)
        # Copy so that re-indexing below never touches the caller's frame.
        df = df.copy()
    df.index = dates

    by_day = df.groupby(df.index.dayofyear)
    mean = by_day.transform('mean')
    # A lambda (not the 'std' shortcut) keeps ddof=0 on every pandas version.
    std = by_day.transform(lambda block: block.std(ddof=0))

    df2 = (df - mean) / std
    df2.index.name = 'date'
    return df2
def conjoint(self):
    '''
    Compute the partner identifier of each person living in a couple and
    report how many partner links are not reciprocal.

    Reads ``self.ind`` (columns ``couple``, ``men``, ``lienpref``, ``id``)
    and replaces it with the same table plus a ``conj`` column holding the
    partner's ``id`` (NaN when none was found).
    '''
    print("travail sur les conjoints")
    ind = self.ind
    conj = ind.loc[ind['couple'] == 1, ['men', 'lienpref', 'id']].copy()
    # Recode lienpref so that both members of a couple share one value
    # (1 -> 0, 31 -> 2, 32 -> 3, 50 -> 10) and can be paired by a self-merge.
    conj['lienpref'] = conj['lienpref'].replace({1: 0, 31: 2, 32: 3, 50: 10})

    pairs = merge(conj, conj, on=['men', 'lienpref'])
    pairs = pairs[pairs['id_x'] != pairs['id_y']].copy()
    assert len(pairs) == len(conj)
    conj = pairs

    sizes = conj.groupby(['men', 'lienpref']).size()
    assert max(sizes) == 2 and min(sizes) == 2

    for ident, potential in conj.groupby('id_x'):
        if len(potential) == 1:
            conj.loc[conj['id_x'] == ident, 'id_y'] = potential['id_y']
        else:
            # NOTE(review): debugger breakpoint kept from the original code
            # (its TODO says this case was not expected); replace it with
            # real handling before running unattended.
            pdb.set_trace()

    conj = conj.rename(columns={'id_x': 'id', 'id_y': 'conj'})
    ind = merge(ind, conj[['id', 'conj']], on='id', how='left')
    self.ind = ind

    # Check that partner links are reciprocal.
    test_conj = merge(ind[['conj', 'id']], ind[['conj', 'id']],
                      left_on='id', right_on='conj')
    mismatches = sum(test_conj['id_x'] != test_conj['conj_y'])
    print("le nombre de couple non réciproque est: %s" % mismatches)
    print("fin du travail sur les conjoints")
def _mean_diff_by_label(frame, label, feature_names):
    """Return per-feature ``mean(label == 1) - mean(label == 0)`` as a list."""
    groups = frame.groupby(label)
    active = groups.get_group(1)
    inactive = groups.get_group(0)
    return [active[name].mean() - inactive[name].mean()
            for name in feature_names]


def get_avg_diff(temp, selected_feature_names):
    """Average difference between active and drop-off groups per label.

    Parameters
    ----------
    temp : pandas.DataFrame
        Features plus the 0/1 label columns ``isactive_interested``,
        ``isactive_engaged`` and ``isactive_subscribed``.
    selected_feature_names : list of str
        Feature columns to compare.

    Returns
    -------
    tuple of three lists
        ``(mean_diff_interested, mean_diff_engaged, mean_diff_subscribed)``;
        each list holds, in the order of ``selected_feature_names``, the
        mean of the feature over rows labelled 1 minus the mean over rows
        labelled 0.

    Raises
    ------
    KeyError
        If a label column lacks either the value 0 or the value 1.
    """
    mean_diff_interested = _mean_diff_by_label(
        temp, 'isactive_interested', selected_feature_names)
    mean_diff_engaged = _mean_diff_by_label(
        temp, 'isactive_engaged', selected_feature_names)
    mean_diff_subscribed = _mean_diff_by_label(
        temp, 'isactive_subscribed', selected_feature_names)
    return mean_diff_interested, mean_diff_engaged, mean_diff_subscribed
def socioeconomic_ratios():
    """Compute enrollment/employment totals relative to resident totals.

    Reads ``mgra13_based_input2012.csv``, ``mgra13_based_input2012_sb.csv``,
    ``households.csv`` and ``persons.csv`` from the current directory.

    Returns
    -------
    tuple of float
        ``(a, b, c)`` where ``a`` is total school enrollment divided by the
        number of school students (``PSTUDENT == 1``), ``b`` is total
        college enrollment divided by the number of college students
        (``PSTUDENT == 2``) and ``c`` is total employment divided by total
        ``HWORKERS``.  (The original computed these values and discarded
        them.)
    """
    mgra_b = pd.read_csv("mgra13_based_input2012.csv")
    # NOTE(review): read but not used by the visible code; kept so a missing
    # file still fails here exactly as before.
    mgra_sb = pd.read_csv("mgra13_based_input2012_sb.csv")  # noqa: F841
    households = pd.read_csv("households.csv")
    persons = pd.read_csv("persons.csv")

    # Per-person 0/1 indicators, summed per household below.
    person_flags = pd.DataFrame({
        "HHID": persons.HHID,
        "college_students": (persons.PSTUDENT == 2).astype(int),
        "school_students": (persons.PSTUDENT == 1).astype(int),
        "employed": persons.PEMPLOY.isin([1, 2]).astype(int),
    })
    df1 = pd.merge(
        person_flags.groupby("HHID", as_index=False, sort=True).sum(),
        households, on="HHID", sort=True)

    # The original wrote by="mgra" and "taz", which Python evaluates to just
    # "taz"; both keys are passed explicitly here.  Only grand totals are
    # used afterwards, so the ratios are unchanged for non-null keys.
    zone_keys = ["mgra", "taz"]
    df2 = pd.DataFrame({
        "taz": df1.TAZ,
        "mgra": df1.MGRA,
        "college_students": df1.college_students,
        "school_students": df1.school_students,
        "employed": df1.employed,
        "HWORKERS": df1.HWORKERS,
    }).groupby(zone_keys, as_index=False, sort=True)[
        ["college_students", "school_students", "HWORKERS"]].sum()

    df3 = pd.DataFrame({
        "mgra": mgra_b.mgra,
        "taz": mgra_b.TAZ,
        "school_enrollments": mgra_b.EnrollGradeKto8 + mgra_b.EnrollGrade9to12,
        "college_enrollments": (mgra_b.collegeEnroll
                                + mgra_b.otherCollegeEnroll
                                + mgra_b.AdultSchEnrl),
        "emp_total": mgra_b.emp_total,
    })
    df4 = df3.groupby(zone_keys, as_index=False, sort=True)[
        ["college_enrollments", "school_enrollments", "emp_total"]].sum()

    a = df4.school_enrollments.sum() / df2.school_students.sum()
    b = df4.college_enrollments.sum() / df2.college_students.sum()
    c = df4.emp_total.sum() / df2.HWORKERS.sum()
    return a, b, c
def generateNumericSummary(dat, group):
    """Summarise ``dat`` per group.

    Parameters
    ----------
    dat : pandas.DataFrame
    group : label or list of labels
        Column(s) to group by.

    Returns
    -------
    dict
        ``'std'`` and ``'mean'``: per-group standard deviation and mean of
        the remaining columns; ``'numMissing'``: number of missing values
        per column over the whole frame (not per group).
    """
    grouped = dat.groupby(group)
    return {
        'std': grouped.std(),
        'numMissing': dat.shape[0] - dat.count(),
        'mean': grouped.mean(),
    }
def bayesMean(dt_in, dt_out, t_col="brand", y_col="target"):
    """Encode a categorical column by a prior-smoothed target mean.

    For every category of ``dt_in[t_col]`` the encoded value is
    ``(count * mean + prior * global_mean) / (count + prior)`` with
    ``prior = 5``, where ``count``/``mean`` are taken over the non-missing
    ``y_col`` values of that category and ``global_mean`` is the mean of
    ``dt_in[y_col]``.

    Parameters
    ----------
    dt_in : pandas.DataFrame
        Frame the statistics are learned from.
    dt_out : pandas.DataFrame
        Frame whose ``t_col`` is encoded.
    t_col, y_col : str
        Category and target column names.

    Returns
    -------
    numpy.ndarray
        One value per row of ``dt_out``; categories unseen in ``dt_in`` get
        the global mean.
    """
    stats = dt_in.groupby(t_col)[y_col].agg(['mean', 'count'])
    glbmean = dt_in[y_col].values.mean()

    def bMeanSngl(vc, vm, glbmean=glbmean, prior=5):
        return ((vc * vm) + (prior * glbmean)) / (vc + prior)

    # Count and mean come from the same row, so they cannot be mismatched
    # the way two independently iterated dicts could.
    bmean_dict = dict(
        (key, bMeanSngl(count, mean))
        for key, count, mean in zip(stats.index, stats['count'], stats['mean'])
    )
    out = dt_out[t_col].apply(lambda x: bmean_dict.get(x, glbmean)).values
    return out
def create_freq_feats(data, column_name):
    '''
    Add a frequency feature for ``column_name``: the average, over weeks
    (column ``Semana``), of the number of ``target`` values recorded for
    each value of the column.

    Returns ``data`` left-joined with the new ``<column_name>_freq`` column.
    '''
    freq_feat = column_name + '_freq'
    print('Creating frequency feature: %s' % freq_feat)
    # Count of target per (column value, week).
    freq_frame = (data.groupby([column_name, 'Semana'])['target']
                  .count().reset_index())
    freq_frame.rename(columns={'target': freq_feat}, inplace=True)
    # Average of those weekly counts per column value.
    freq_frame = freq_frame.groupby([column_name])[freq_feat].mean().reset_index()
    # Join the averages back onto the original rows.
    return pd.merge(data, freq_frame, how='left', on=[column_name],
                    left_index=False, right_index=False,
                    suffixes=('', '_freq'), copy=False)
def add_value(target_df, data_df, group_col, typ, dropbox_path):
    '''
    Help function for regression_data.

    Splits ``target_df`` by ``group_col`` and index-joins extra 5Y columns
    onto each part:

    * ``typ == 'swap'``: columns ``'5Y'``, ``'Butterfly 5y'`` and
      ``'Curve 5y'`` of the ``data_df`` rows whose ``Currency`` equals
      ``NATION_CURRENCY_DICT[key]``;
    * ``typ == 'credit'``: column ``'5Y'`` of the CSV named
      ``CREDIT_DICT[key]`` under
      ``dropbox_path + 'cleaned data/Monthly credit spread curves/'``,
      indexed by its parsed ``Date`` column.

    Returns the concatenation of the joined parts.  Raises ``ValueError``
    for any other ``typ`` (previously this surfaced as the less helpful
    "No objects to concatenate" ``ValueError``).
    '''
    if typ not in ('swap', 'credit'):
        raise ValueError("typ must be 'swap' or 'credit', got %r" % (typ,))

    df_ls = []
    for key, sub_df in target_df.groupby(group_col):
        if typ == 'swap':
            nation_swap = data_df[data_df.Currency == NATION_CURRENCY_DICT[key]]
            df_ls.append(
                sub_df.join(nation_swap[['5Y', 'Butterfly 5y', 'Curve 5y']]))
        else:
            credit_df = pd.read_csv(
                dropbox_path + 'cleaned data/Monthly credit spread curves/' +
                CREDIT_DICT[key],
                parse_dates=True)
            credit_df.Date = pd.to_datetime(credit_df.Date)
            credit_df.set_index('Date', inplace=True)
            df_ls.append(sub_df.join(credit_df['5Y']))
    return pd.concat(df_ls)
def campaign_count(request):
    """Render a column chart of campaign registrations per month.

    Counts non-removed ``Campaign`` objects by the ``'<year>-<month>'``
    label of their ``created_at`` and passes the result to FusionCharts.
    """
    # NOTE(review): this mutates the module-level CHART dict, exactly as
    # the original did; the caption is shared with every other user of it.
    CHART["caption"] = "Total campaign registrations"
    # Chart data is passed to the `dataSource` parameter as a dict.
    data_source = dict()
    data_source['chart'] = CHART

    my_campaigns = list(Campaign.objects.filter(removed=False))
    data = pd.DataFrame()
    data['id'] = [c.pk for c in my_campaigns]
    data['created_at'] = [c.created_at for c in my_campaigns]
    data['month'] = data['created_at'].apply(lambda date: '{y}-{m}'.format(
        y=date.year, m=get_month_format(date.month)))

    # groupby sorts its keys, so the labels come out in month order.
    counts = data.groupby('month')['id'].count()
    data_source['data'] = [{'label': month, 'value': str(total)}
                           for month, total in counts.items()]

    # Create the Column 2D chart using the FusionCharts class constructor.
    column_2d = FusionCharts("column2D", "ex1", "600", "350", "chart-1",
                             "json", data_source)
    return render(request, cts.STATS_INDEX, {'output': column_2d.render()})
def plot_deliveries_by_team():
    """Bar-plot the ``delivery`` column of ``ipl_df`` per batting team.

    ``plot`` is called on the grouped frame, as in the original, so pandas
    draws one plot per ``batting_team`` group before ``plt.show()``.
    """
    team_deliveries = ipl_df[['batting_team', 'delivery']]
    ipl_bat_group = team_deliveries.groupby('batting_team')
    ipl_bat_group.plot(kind='bar')
    plt.show()
def analysis(self):
    """Bucket each day's rows by alpha value and summarise the buckets.

    For every distinct ``date`` of ``self.research_data.data`` the rows with
    a non-zero alpha (column ``self.alpha.name``) and no missing values are
    sorted by alpha and split, in order, into ``self.num_of_bins`` nearly
    equal parts.  Part ``i`` of every day is pooled into bin ``i``.  One row
    per bin (mean alpha, mean ``return`` times 10000) is appended to
    ``self.alpha_return``, which is then indexed by ``'bin'``, plotted via
    ``self._plot()`` and returned.
    """
    alpha_name = self.alpha.name
    n_bins = self.num_of_bins

    def split_rows(frame):
        # Same partition sizes as numpy.array_split: the first
        # len(frame) % n_bins parts receive one extra row.
        base, extra = divmod(len(frame), n_bins)
        start = 0
        for k in range(n_bins):
            stop = start + base + (1 if k < extra else 0)
            yield frame.iloc[start:stop]
            start = stop

    data = self.research_data.data
    bin_parts = {i: [] for i in range(1, n_bins + 1)}
    for _, group in data.groupby(data.date):
        # Filter the input data.
        group = group[group[alpha_name] != 0].dropna().sort_values(alpha_name)
        # Partition daily data into n bins.
        for i, part in enumerate(split_rows(group), 1):
            bin_parts[i].append(part)

    # Alpha/Return analysis.  Parts are concatenated once per bin instead of
    # growing a frame row by row.
    rows = []
    for i in range(1, n_bins + 1):
        members = pd.concat(bin_parts[i]) if bin_parts[i] else pd.DataFrame()
        alpha_bin = pd.DataFrame({
            'bin': [i],
            alpha_name: [members[alpha_name].mean()],
            'return': [members['return'].mean() * 10000],
        })
        rows.append(alpha_bin[['bin', alpha_name, 'return']])

    previous = self.alpha_return
    # A completely empty starting frame carries no information and is left
    # out so that it cannot disturb the column dtypes.
    frames = [previous] if (len(previous.columns) or len(previous)) else []
    self.alpha_return = pd.concat(frames + rows, ignore_index=True)
    self.alpha_return = self.alpha_return.set_index('bin')
    self._plot()
    return self.alpha_return
def recommended_candidates(request):
    """Render a column chart of recommended candidates per month.

    Counts non-removed ``Candidate`` objects whose state code is ``'GTJ'``
    or ``'STC'`` by the ``'<year>-<month>'`` label of ``created_at``.
    """
    data_source = dict()
    # NOTE(review): mutates the module-level CHART dict, as the original did.
    CHART["caption"] = "Recommended candidates"
    data_source['chart'] = CHART

    columns = ['id', 'created_at']
    data = pd.DataFrame(list(
        Candidate.objects.filter(state__code__in=['GTJ', 'STC'],
                                 removed=False).values_list(*columns)),
        columns=columns)
    data['month'] = data['created_at'].apply(lambda date: '{y}-{m}'.format(
        y=date.year, m=get_month_format(date.month)))

    counts = data.groupby('month')['id'].count().sort_index()
    data_source['data'] = [{'label': month, 'value': str(total)}
                           for month, total in counts.items()]

    # Create the Column 2D chart using the FusionCharts class constructor.
    column_2d = FusionCharts("column2D", "ex1", "600", "350", "chart-1",
                             "json", data_source)
    return render(request, cts.STATS_INDEX, {'output': column_2d.render()})
def get_unique_users_registrations(request):
    """Render a column chart of user registrations per month.

    Counts all ``User`` objects by the ``'<year>-<month>'`` label of their
    ``created_at``.
    """
    data_source = dict()
    # NOTE(review): mutates the module-level CHART dict, as the original did.
    CHART["caption"] = "Unique user registrations"
    data_source['chart'] = CHART

    columns = ['id', 'created_at']
    data = pd.DataFrame(list(User.objects.all().values_list(*columns)),
                        columns=columns)
    data['month'] = data['created_at'].apply(lambda date: '{y}-{m}'.format(
        y=date.year, m=get_month_format(date.month)))

    counts = data.groupby('month')['id'].count().sort_index()
    data_source['data'] = [{'label': month, 'value': str(total)}
                           for month, total in counts.items()]

    # Create the Column 2D chart using the FusionCharts class constructor.
    column_2d = FusionCharts("column2D", "ex1", "600", "350", "chart-1",
                             "json", data_source)
    return render(request, cts.STATS_INDEX, {'output': column_2d.render()})
def generate_conv_timestamp():
    """Write one simulated conversion timestamp per user to a CSV file.

    Rows of the input CSV are grouped by ``uid``.  Columns are addressed by
    position: for a group whose first row has value 1 in the third column,
    the timestamp is the last row's fourth column plus a gamma-distributed
    delay whose parameters are looked up in the parameter CSV by the last
    row's second column; otherwise it is the last row's fourth column
    plus 15.  (Presumably these columns are channel, conversion flag and
    time -- TODO confirm against the data files.)

    The output has columns ``uid`` (0 .. n_users-1, in group order) and
    ``tconv``.  Input and output paths are hard-coded.
    """
    # add timestamp of conversion for each user
    path = r'C:\Users\sesig\Documents\master data science\tfm\r_dataset_cleaned\data_all_1u.csv'
    data = pd.read_csv(filepath_or_buffer=path, sep=',')
    nuser = data['uid'].nunique()

    path_params = r'C:\Users\sesig\Documents\master data science\tfm\criteo_cleaned_data\gamma_dist_params.csv'
    channel_params = pd.read_csv(filepath_or_buffer=path_params, sep=',')

    tconv = np.zeros(nuser, dtype=np.float64)
    for i, (_, group) in enumerate(data.groupby('uid')):
        if group.iloc[0, 2] == 1:
            ch = group.iloc[-1, 1]
            a = channel_params.loc[ch, 'shape parameter']
            loc = channel_params.loc[ch, 'location parameter']
            scale = channel_params.loc[ch, 'scale parameter']
            # random_state=i makes each user's draw reproducible; [0] turns
            # the size-1 sample into a scalar.
            delay = stats.gamma.rvs(a, loc=loc, scale=scale, size=1,
                                    random_state=i)[0]
        else:
            delay = 15
        tconv[i] = group.iloc[-1, 3] + delay

    x = pd.DataFrame(
        data={
            'uid': np.arange(nuser, dtype=np.int_),
            'tconv': tconv,
        })
    path_out = r'C:\Users\sesig\Documents\master data science\tfm\r_dataset_cleaned\r_dataset_tconv.csv'
    x.to_csv(path_or_buf=path_out, sep=',', index=False)
def get_number_of_unique_users():
    """Count distinct users per month among non-prospect candidates.

    Selects non-removed ``Candidate`` rows whose state is not in
    ``get_prospect_states()``, labels each with the ``'<year>-<month>'`` of
    ``user__created_at`` and counts distinct ``user_id`` per label.

    The original docstring gave this SQL as the reference query::

        select date_trunc('month', created_at) m,
               count(distinct user_id) unique_users
        from candidates where state_id!=11 and not removed
        group by m order by m;

    Returns a DataFrame indexed by ``month`` with one column ``user_id``.
    """
    first_candidate_columns = ['user_id', 'user__created_at']
    data = pd.DataFrame(list(
        Candidate.objects.filter(
            ~Q(state__in=get_prospect_states()),
            removed=False).values_list(*first_candidate_columns)),
        columns=first_candidate_columns)
    data['month'] = data['user__created_at'].apply(
        lambda date: '{y}-{m}'.format(y=date.year,
                                      m=get_month_format(date.month)))
    data = data.drop('user__created_at', axis=1)
    return data.groupby('month').aggregate({'user_id': 'nunique'})
def get_reinspection_current_count(bbh):
    """Count hangers inside the reinspection at every timestep of a
    BitBusHist dataframe.

    Intended result (per the original docstring) is a three-tuple:
        counter: Series, index like bbh, values are number of carcasses
            in reinspection at the given time.
        irregulars: list of uids that do not conform to expectations
        leftovers: list of uids that are not registered as leaving the
            reinspection

    NOTE(review): the function is unfinished.  As in the original, nothing
    is returned and ``irregulars``/``leftovers`` are never filled; only the
    crashes have been fixed (undefined ``entering``, subtraction of boolean
    masks, removed ``pd.groupby``).
    """
    bbh = bbh.sort_index().reset_index()
    irregulars = []

    # +1 for a transition into the reinspection, -1 for one out of it.
    entering = np.isin(bbh.Tx.values, REINSPECTION_TX_IN).astype(int)
    leaving = np.isin(bbh.Tx.values, REINSPECTION_TX_OUT).astype(int)
    bbh['movements'] = entering - leaving

    n_uids = len(bbh.uids.unique())
    inspect = []
    for uid, vals in IProgressBar(bbh.groupby('uids'), n_uids):
        # A uid whose entries and exits do not cancel needs a closer look.
        if vals.movements.sum() != 0:
            inspect.append(uid)

    # TODO: use multiprocessing.Pool.map here to examine the uids in
    # `inspect`, comparing per-uid REINSPECTION_TX_OUT transitions with
    # REINSPECTION_TX_DEFINITELY_OUT ones (the original paired the two
    # groupings in a loop whose body was `pass`).

    counter = bbh['movements'].cumsum()
def group_data(table):
    """Sum the third field of every row per value of the second field.

    Parameters
    ----------
    table : iterable of sequences, or pandas.DataFrame
        Rows are indexed positionally: ``row[1]`` is the group key and
        ``int(row[2])`` the amount.  A DataFrame is read row by row.

    Returns
    -------
    list of (key, total) tuples
        One entry per distinct key, in order of first appearance.

    The original called ``list.append`` with two arguments (a TypeError),
    iterated column names instead of rows, and never returned the result.
    """
    if hasattr(table, 'itertuples'):
        rows = table.itertuples(index=False, name=None)
    else:
        rows = table

    totals = {}
    order = []
    for item in rows:
        key = item[1]
        if key not in totals:
            totals[key] = 0
            order.append(key)
        totals[key] += int(item[2])
    return [(key, totals[key]) for key in order]
def plot_yrly_result(self, **kwargs):
    """Bar-plot yearly compounded returns, in percent.

    Takes the ``'result'`` column and the ``self.benchmarks`` columns of
    ``self.data`` as per-period returns, compounds them within each calendar
    year of the index (product of ``1 + r`` minus 1), scales by 100, and
    labels ``'result'`` as ``self.name``.  ``kwargs`` go to ``plot.bar``.
    """
    growth = self.data[['result'] + self.benchmarks] + 1
    yearly = growth.groupby(growth.index.year).prod()
    yearly = (yearly - 1) * 100
    yearly = yearly.rename(columns={'result': self.name})
    yearly.plot.bar(legend=True, **kwargs)
def groupby_year_month(df):
    """
    Groups a pandas `DataFrame` by the year and month of its index.

    :param df: A pandas `DataFrame` whose index exposes `.year` and
        `.month` (e.g. a `DatetimeIndex`).
    :returns: The `DataFrameGroupBy` object keyed by `(year, month)`.
    """
    return df.groupby([df.index.year, df.index.month])
def plotEQCountByMonth(df):
    """Bar-plot the number of non-missing ``code`` values per (month, year)
    of the index and show the figure.

    NOTE(review): the title hard-codes "2016" although the grouping also
    keys on the year found in the data.
    """
    counts = df.groupby([df.index.month, df.index.year]).count()[['code']]
    plot = counts.plot(kind='bar', legend=False,
                       title="Count of Earthquakes by Month in 2016")
    plot.set(xlabel="Months", ylabel="No. of EarthQuakes")
    plt.show()
def gen_normalize_by_month_1(stock, pct, month):
    """Yield cumulative performance of ``stock`` relative to ``'spx'`` for
    every occurrence of a calendar month.

    Parameters
    ----------
    stock : label
        Column of ``pct`` to compare with column ``'spx'``.
    pct : pandas.DataFrame
        Per-period returns with a datetime-like index.
    month : int
        Calendar month (1-12) to keep.

    Yields
    ------
    (key, frame, normalized)
        ``key`` is ``(month, year)``; ``frame`` holds the cumulative product
        of ``1 + return`` for that month plus a ``'normalized'`` column
        equal to ``frame[stock] - frame['spx'] + 1``; ``normalized`` is that
        column.  Groups are yielded in sorted key order.
    """
    pct = pct.loc[pct.index.month == month]
    grouped = pct.groupby([pct.index.month, pct.index.year])
    for key, frame in grouped:
        cumulative = (frame + 1).cumprod()
        cumulative['normalized'] = (cumulative[stock] - cumulative['spx']) + 1
        yield key, cumulative, cumulative['normalized']
def enfants(self):
    '''
    Compute the parents' identifiers (``pere``/``mere``) of each child.

    Reads ``self.ind`` and replaces it with the same table left-joined with
    the columns ``pere`` and ``mere``.  The lookups by label assume that
    ``self.ind`` is indexed by ``id`` -- TODO confirm.
    '''
    ind = self.ind
    links = ind[['men', 'lienpref', 'id']]
    enf = ind.loc[ind['enf'] != 0, ['men', 'lienpref', 'id', 'enf']]

    # enf in (1, 2): link to the household member with lienpref 0 -> id_1.
    enf0 = enf[enf['enf'].isin([1, 2])].copy()
    enf0['lienpref'] = 0
    enf0 = merge(enf0, links, on=['men', 'lienpref'], how='left',
                 suffixes=('', '_1'))

    # enf in (1, 3): link to the household member with lienpref 1 -> id_2.
    enf1 = enf[enf['enf'].isin([1, 3])].copy()
    enf1['lienpref'] = 1
    enf1 = merge(enf1, links, on=['men', 'lienpref'], how='left',
                 suffixes=('', '_2'))

    # Grandchildren: the children of the reference person (enf = 1, 2 or 3)
    # are candidate parents (id_4) of the household members with
    # lienpref 21 (id).  Strictly speaking a link should be kept when no
    # parent is found, and the grandchild status remembered (e.g. for
    # inheritance).
    par4 = enf[enf['enf'].isin([1, 2, 3])].copy()
    par4['lienpref'] = 21
    par4 = merge(par4, links, on=['men', 'lienpref'], how='inner',
                 suffixes=('_4', ''))

    first_parent = {}
    for idx, parent in par4.groupby('id'):
        child = int(idx)
        if len(parent) == 1:
            first_parent[idx] = int(parent['id_4'].iloc[0])
        else:
            # Several candidates: keep those more than 16 years older than
            # the child and prefer a woman, taking the smallest 'anais'.
            potential = ind.loc[parent['id_4'],
                                ['anais', 'lienpref', 'sexe', 'couple', 'conj']]
            potential = potential[
                ind.loc[child, 'anais'] - potential['anais'] > 16]
            pot_mother = potential[potential['sexe'] == 2]
            if len(pot_mother):
                first_parent[idx] = pot_mother['anais'].idxmin()
            else:
                first_parent[idx] = potential['anais'].idxmin()

    grandchild_ids = par4['id'].unique()
    enf4 = DataFrame(
        {'id': grandchild_ids,
         'id_1': [first_parent.get(i, np.nan) for i in grandchild_ids]},
        index=grandchild_ids)
    # Second parent = partner of the first one (NaN when unknown).
    enf4['id_2'] = ind['conj'].reindex(enf4['id_1'].values).values

    enf = merge(enf0[['id', 'id_1']], enf1[['id', 'id_2']], how='outer')
    if len(enf4):
        enf = pd.concat([enf, enf4[['id', 'id_1', 'id_2']]])
    enf = merge(enf, ind[['id', 'sexe']], left_on='id_1', right_on='id',
                how='left', suffixes=('', '_'))
    del enf['id_']

    # id_1 is the father when that person is male, the mother when female.
    enf['pere'] = enf['id_1'].where(enf['sexe'] == 1).astype(float)
    enf['mere'] = enf['id_1'].where(enf['sexe'] == 2).astype(float)
    # id_2 fills whichever role id_1 left open.
    has_second = notnull(enf['id_2'])
    cond_pere = notnull(enf['mere']) & has_second
    cond_mere = ~notnull(enf['mere']) & has_second
    enf.loc[cond_pere, 'pere'] = enf.loc[cond_pere, 'id_2']
    enf.loc[cond_mere, 'mere'] = enf.loc[cond_mere, 'id_2']
    # sum(sexe1==sexe2): 6 same-sex parent couples (original author's note)

    ind = merge(ind, enf[['id', 'pere', 'mere']], on='id', how='left')
    self.ind = ind
def cat_report():
    ''' Return expenditure totals per category, year and month.

    Loads statements via ``consolidate('/statements')``, drops rows whose
    ``Transaction_Ref2`` is ``'LeeEJ'``, indexes by ``Transaction_Date`` and
    sums per (``Transaction_Ref1``, year, month).  Nothing is printed.
    '''
    c = consolidate('/statements')
    d = c[c['Transaction_Ref2'] != 'LeeEJ'].set_index('Transaction_Date')
    return d.groupby(['Transaction_Ref1', d.index.year, d.index.month]).sum()
def function_over_events(function, dataframe, branch_selection=None, **kwargs):
    """Generator which yields `function(event, **kwargs)` of each processed
    data event in dataframe.

    The dataframe is split by its ``run_number`` column; for each run the
    ``event_number`` values are handed to ``function_results_datasets`` and
    its results are yielded in turn.
    """
    for run_number, events in dataframe.groupby('run_number'):
        yield from function_results_datasets(
            run_number, function, events.event_number.values,
            branch_selection=branch_selection, kwargs=kwargs)
def groupby_reset(col):
    # NOTE(review): this function cannot work as written.  No DataFrame is
    # passed to it or referenced by it: `pd.groupby` receives only a list
    # holding the quoted column name and no `by` argument, and top-level
    # `pd.groupby` has since been removed from pandas.  Presumably the intent
    # was `<frame>.groupby(col).sum()` on some sales table -- the data source
    # must be confirmed before this can be fixed.
    colname = "'%s'" % col  # wraps the name in literal single quotes
    df = (
        pd.groupby([colname]).sum()
        # largest 'Global_Sales' totals first
        .sort_values('Global_Sales', ascending = False)
        .reset_index(col_level = 1)
    )
    return df
def make_propspertext(distrawcounts, label):
    """Average per-segment values into per-text values.

    ``distrawcounts`` has one column per segment; the first six characters
    of a segment's column label identify its text.  The frame is transposed,
    rows are grouped by that six-character id (``idnos``) and averaged, and
    a constant ``label`` column is added.

    Returns a DataFrame indexed by ``idnos``.
    """
    distrawcounts = distrawcounts.T
    distrawcounts["idnos"] = [item[0:6] for item in distrawcounts.index]
    distpropspertext = distrawcounts.groupby("idnos").mean()
    distpropspertext["label"] = label
    return distpropspertext
def create_hist_data(evts, limit, max_top=10):
    """Scaled monthly event counts for months 2 through 6.

    Counts the first ``limit`` rows of ``evts`` per month of the index
    (via the ``sdt`` column), scales the counts so the busiest month equals
    ``max_top``, and returns the values for February..June as a list of
    five numbers (0 for months without events).

    An empty selection yields five zeros instead of raising.
    """
    nevts = evts.iloc[0:limit]
    if len(nevts) == 0:
        return [0] * 5
    tmp = nevts.groupby(nevts.index.month).count()
    tmp['real_top'] = max_top * tmp['sdt'] / max(tmp['sdt'])
    return [tmp['real_top'].get(i, 0) for i in range(2, 7)]
def create_hist_data(evts, limit, max_top=10):
    """Scaled monthly event counts for months 2 through 6.

    Counts the first ``limit`` rows of ``evts`` per month of the index
    (via the ``sdt`` column), scales the counts so the busiest month equals
    ``max_top``, and returns the values for February..June as a list of
    five numbers (0 for months without events).

    An empty selection yields five zeros instead of raising.
    """
    nevts = evts.iloc[0:limit]
    if len(nevts) == 0:
        return [0] * 5
    tmp = nevts.groupby(nevts.index.month).count()
    tmp['real_top'] = max_top * tmp['sdt'] / max(tmp['sdt'])
    return [tmp['real_top'].get(i, 0) for i in range(2, 7)]
def enfants(self):
    '''
    Compute the parents' identifiers (``pere``/``mere``) of each child.

    Reads ``self.ind`` and replaces it with the same table left-joined with
    the columns ``pere`` and ``mere``.
    '''
    ind = self.ind
    print("travail sur les enfants")
    links = ind[['men', 'lienpref', 'id']]
    enf = ind.loc[ind['enf'] != 0, ['men', 'lienpref', 'id', 'enf']]

    # enf in (1, 2): link to the household member with lienpref 0 -> id_1.
    enf0 = enf[enf['enf'].isin([1, 2])].copy()
    enf0['lienpref'] = 0
    enf0 = merge(enf0, links, on=['men', 'lienpref'], how='left',
                 suffixes=('', '_1'))

    # enf in (1, 3): link to the household member with lienpref 1 -> id_2.
    enf1 = enf[enf['enf'].isin([1, 3])].copy()
    enf1['lienpref'] = 1
    enf1 = merge(enf1, links, on=['men', 'lienpref'], how='left',
                 suffixes=('', '_2'))

    # Grandchildren: the direction is reversed -- the children are selected
    # as candidate parents (id_4) of the household members with
    # lienpref 21 (id).
    print("cas des petits-enfants")
    enf4 = enf[enf['enf'].isin([1, 2, 3])].copy()
    enf4['lienpref'] = 21
    enf4 = merge(enf4, links, on=['men', 'lienpref'], how='inner',
                 suffixes=('_4', ''))
    enf4['id_1'] = float('nan')
    enf4['id_2'] = float('nan')

    # Materialise the groups first: enf4 is written to inside the loop.
    for child, candidates in list(enf4.groupby('id')['id_4']):
        rows = enf4['id'] == child
        parent_ids = candidates.values
        if len(parent_ids) == 1:
            enf4.loc[rows, 'id_1'] = parent_ids[0]
        elif len(parent_ids) == 2:
            enf4.loc[rows, 'id_1'] = parent_ids[0]
            enf4.loc[rows, 'id_2'] = parent_ids[1]
        else:
            # NOTE(review): unresolved case -- the original assigns the
            # hard-coded id 15043 as a placeholder; kept unchanged.
            enf4.loc[rows, 'id_1'] = 15043

    # NOTE(review): enf4 keeps one row per (child, candidate) pair, so a
    # child with several candidates appears several times below and is
    # duplicated by the final merge; behaviour kept from the original.
    enf = merge(enf0[['id', 'id_1']], enf1[['id', 'id_2']], how='outer')
    if len(enf4):
        enf = pd.concat([enf, enf4[['id', 'id_1', 'id_2']]])
    enf = merge(enf, ind[['id', 'sexe']], left_on='id_1', right_on='id',
                how='left', suffixes=('', '_'))
    del enf['id_']

    # id_1 is the father when that person is male, the mother when female.
    enf['pere'] = enf['id_1'].where(enf['sexe'] == 1).astype(float)
    enf['mere'] = enf['id_1'].where(enf['sexe'] == 2).astype(float)
    # id_2 fills whichever role id_1 left open.
    has_second = notnull(enf['id_2'])
    cond_pere = notnull(enf['mere']) & has_second
    cond_mere = ~notnull(enf['mere']) & has_second
    enf.loc[cond_pere, 'pere'] = enf.loc[cond_pere, 'id_2']
    enf.loc[cond_mere, 'mere'] = enf.loc[cond_mere, 'id_2']
    # sum(sexe1==sexe2): 6 same-sex parent couples (original author's note)

    ind = merge(ind, enf[['id', 'pere', 'mere']], on='id', how='left')
    print("fin du travail sur les enfants")
    self.ind = ind
def monthly_report():
    """Print monthly totals and savings and return the monthly grouping.

    Loads statements via ``consolidate('/statements')``, drops rows whose
    ``Transaction_Ref2`` is ``'LeeEJ'``, indexes by ``Transaction_Date`` and
    groups by (year, month).  Prints, in order: the per-month sums, their
    column totals, the per-month savings (``Credit_Amount`` minus
    ``Debit_Amount``) and the overall savings.

    Returns the ``DataFrameGroupBy`` object.
    """
    c = consolidate('/statements')
    d = c[c['Transaction_Ref2'] != 'LeeEJ'].set_index('Transaction_Date')
    e = d.groupby([d.index.year, d.index.month])
    savings = e['Credit_Amount'].sum() - e['Debit_Amount'].sum()
    totals = e.sum()
    print(totals)
    print(totals.sum())
    print(savings)
    print(savings.sum())
    return e
def aggregate_num_mosquitos(train, test):
    """Add per-trap mosquito statistics learned from ``train``.

    Two columns are added in place to both frames, keyed on ``Trap``:

    * ``trap_mosq_rate``: the trap's share of all ``NumMosquitos``;
    * ``trap_wnv_rate``: the trap's summed ``WnvPresent`` divided by its
      summed ``NumMosquitos``.

    Traps of ``test`` that do not occur in ``train`` get 0 for both.

    Returns ``(train, test)``, the same (mutated) objects.
    """
    num_by_trap = (train[['Trap', 'NumMosquitos', 'WnvPresent']]
                   .groupby('Trap').agg('sum'))
    mosq_share = num_by_trap['NumMosquitos'] / num_by_trap['NumMosquitos'].sum()
    wnv_rate = num_by_trap['WnvPresent'] / num_by_trap['NumMosquitos']

    # Both Series are indexed by trap, so they serve directly as lookups.
    train['trap_mosq_rate'] = train.Trap.map(mosq_share)
    train['trap_wnv_rate'] = train.Trap.map(wnv_rate)
    test['trap_mosq_rate'] = test.Trap.map(mosq_share).fillna(0)
    test['trap_wnv_rate'] = test.Trap.map(wnv_rate).fillna(0)
    return train, test
def target_encoding(df, columns, target, new_column=False):
    """Encode categorical features as the mean of ``target`` per category.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place and returned.
    columns : iterable of str
        Categorical columns to encode.
    target : str
        Column whose per-category mean is used.
    new_column : bool, default False
        If true the encoding is stored in ``<column>_target_encoding``;
        otherwise it overwrites the column itself.

    Missing category values map to NaN (the original per-row lookup raised
    IndexError for them and was quadratic in the number of rows).
    """
    for column in columns:
        means = df.groupby(column)[target].mean()
        new_column_name = column + "_target_encoding" if new_column else column
        df[new_column_name] = df[column].map(means)
    return df
def create_hist(evts, max_top=10):
    """Build the rectangle table of a per-month event histogram.

    Rows of ``evts`` are counted per month of its index.  The result is
    indexed by month number and holds the counts plus:

    * ``i``: the month number;
    * ``left``: first day of that month in the (hard-coded) year 2014;
    * ``right``: consecutive month ends starting from the first ``left``
      (so months are assumed to be contiguous);
    * ``top``, ``bottom``: zeros;
    * ``real_top``: the ``sdt`` count scaled so the busiest month equals
      ``max_top``.
    """
    main_grpd = evts.groupby(evts.index.month).count()
    main_grpd['i'] = main_grpd.index
    main_grpd['left'] = [dt.datetime(2014, int(month), 1)
                         for month in main_grpd.index]
    # An offset object instead of the 'M' alias, which newer pandas
    # versions deprecate.
    main_grpd['right'] = pd.date_range(
        start=main_grpd['left'].iloc[0], periods=len(main_grpd),
        freq=pd.offsets.MonthEnd())
    main_grpd['top'] = np.zeros(len(main_grpd.index))
    main_grpd['bottom'] = np.zeros(len(main_grpd.index))
    main_grpd['real_top'] = max_top * main_grpd['sdt'] / max(main_grpd['sdt'])
    return main_grpd
def parallelize_on_flid_save_to_db(df_in, func):
    """Group the dataframe by ``flight_id`` and process the groups in
    parallel.

    ``func`` is called once per group in a pool of ``cpu_count()`` worker
    processes; its return values are discarded.  The pool is closed and
    joined even when a worker raises.

    Returns ``True``.
    """
    print("Number of unique icaos: %d" % len(df_in['flight_id'].unique()))
    groups = [group for _, group in df_in.groupby('flight_id')]
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    try:
        pool.map(func, groups)
    finally:
        pool.close()
        pool.join()
    return True
def create_hist(evts, max_top=10):
    """Build the rectangle table of a per-month event histogram.

    Rows of ``evts`` are counted per month of its index.  The result is
    indexed by month number and holds the counts plus:

    * ``i``: the month number;
    * ``left``: first day of that month in the (hard-coded) year 2014;
    * ``right``: consecutive month ends starting from the first ``left``
      (so months are assumed to be contiguous);
    * ``top``, ``bottom``: zeros;
    * ``real_top``: the ``sdt`` count scaled so the busiest month equals
      ``max_top``.
    """
    main_grpd = evts.groupby(evts.index.month).count()
    main_grpd['i'] = main_grpd.index
    main_grpd['left'] = [dt.datetime(2014, int(month), 1)
                         for month in main_grpd.index]
    # An offset object instead of the 'M' alias, which newer pandas
    # versions deprecate.
    main_grpd['right'] = pd.date_range(
        start=main_grpd['left'].iloc[0], periods=len(main_grpd),
        freq=pd.offsets.MonthEnd())
    main_grpd['top'] = np.zeros(len(main_grpd.index))
    main_grpd['bottom'] = np.zeros(len(main_grpd.index))
    main_grpd['real_top'] = max_top * main_grpd['sdt'] / max(main_grpd['sdt'])
    return main_grpd
def create_hist_layers(evts, max_top):
    """
    Takes a dataframe of events (where the index contains the datetime of
    each evt) and yields, for every prefix ``evts[:i]``, a per-month table
    with the columns needed to build a histogram:
    left, right, bottom, top, real_top.

    All columns except ``real_top`` come from the full data set (see the
    column meanings in the code below); ``real_top`` of layer ``i`` is the
    per-month count of the first ``i`` events divided by the largest such
    count (``max_top`` is only used for the base table, as in the original).

    Yields nothing when ``evts`` is empty (e.g. an empty list).
    """
    if not len(evts):
        # Replaces the original's swallowed AttributeError, which only
        # worked because the loop below had nothing to iterate over.
        return

    main_grpd = evts.groupby(evts.index.month).count()
    main_grpd['i'] = main_grpd.index
    # Rectangles are laid out on the (hard-coded) year 2014.
    main_grpd['left'] = [dt.datetime(2014, int(month), 1)
                         for month in main_grpd.index]
    main_grpd['right'] = pd.date_range(
        start=main_grpd['left'].iloc[0], periods=len(main_grpd),
        freq=pd.offsets.MonthEnd())
    main_grpd['top'] = np.zeros(len(main_grpd.index))
    main_grpd['bottom'] = np.zeros(len(main_grpd.index))
    main_grpd['real_top'] = max_top * main_grpd['sdt'] / max(main_grpd['sdt'])

    for i in range(1, len(evts) + 1):
        nevts = evts.iloc[0:i]
        # build a temp df to compute this layer's histogram counts
        tmp = nevts.groupby(nevts.index.month).count()
        tmp['real_top'] = tmp['sdt'] / max(tmp['sdt'])
        # copy the base table so all basic config is already set
        grpd = main_grpd.copy()
        grpd['real_top'] = [tmp['real_top'].get(ind, 0) for ind in grpd.index]
        yield grpd
def create_hist_layers(evts, max_top):
    """
    Takes a dataframe of events (where the index contains the datetime of
    each evt) and yields, for every prefix ``evts[:i]``, a per-month table
    with the columns needed to build a histogram:
    left, right, bottom, top, real_top.

    All columns except ``real_top`` come from the full data set (see the
    column meanings in the code below); ``real_top`` of layer ``i`` is the
    per-month count of the first ``i`` events divided by the largest such
    count (``max_top`` is only used for the base table, as in the original).

    Yields nothing when ``evts`` is empty (e.g. an empty list).
    """
    if not len(evts):
        # Replaces the original's swallowed AttributeError, which only
        # worked because the loop below had nothing to iterate over.
        return

    main_grpd = evts.groupby(evts.index.month).count()
    main_grpd['i'] = main_grpd.index
    # Rectangles are laid out on the (hard-coded) year 2014.
    main_grpd['left'] = [dt.datetime(2014, int(month), 1)
                         for month in main_grpd.index]
    main_grpd['right'] = pd.date_range(
        start=main_grpd['left'].iloc[0], periods=len(main_grpd),
        freq=pd.offsets.MonthEnd())
    main_grpd['top'] = np.zeros(len(main_grpd.index))
    main_grpd['bottom'] = np.zeros(len(main_grpd.index))
    main_grpd['real_top'] = max_top * main_grpd['sdt'] / max(main_grpd['sdt'])

    for i in range(1, len(evts) + 1):
        nevts = evts.iloc[0:i]
        # build a temp df to compute this layer's histogram counts
        tmp = nevts.groupby(nevts.index.month).count()
        tmp['real_top'] = tmp['sdt'] / max(tmp['sdt'])
        # copy the base table so all basic config is already set
        grpd = main_grpd.copy()
        grpd['real_top'] = [tmp['real_top'].get(ind, 0) for ind in grpd.index]
        yield grpd
def calib_func(x, data, buckets, option, minfactor, maxfactor):
    """Price adjustment model and its calibration error.

    A two-layer sigmoid adjustment is added to ``data['bs_probability']``:
    the second layer is a sigmoid of ``x[3] * range / duration`` scaled by
    ``x[2]`` and centred on zero; the first is a sigmoid of
    ``load * x[1] * factor1`` scaled by ``x[0]`` and centred on zero.

    ``data`` is modified in place: ``newprice`` and ``newpnl``
    (``wins - newprice``) are always written.

    Parameters
    ----------
    x : sequence of 4 numbers
        Model parameters.
    data : pandas.DataFrame
        Needs columns ``factor1``, ``range``, ``duration``,
        ``bs_probability`` and ``wins``.
    buckets
        Unused; the number of quantile buckets is fixed at 7.
    option : str
        ``'yes'``: clamp ``factor1`` to ``[minfactor, maxfactor]`` first,
        also store ``factorload``/``factorload2`` and return ``1.0``.
        Otherwise: return the calibration error described below.
    minfactor, maxfactor : number
        Clamping bounds, used only when ``option == 'yes'``.

    Returns
    -------
    float
        ``1.0`` for ``option == 'yes'``; otherwise the square root of the
        mean, over 7 quantile buckets of ``range``, of the mean squared
        per-bucket average ``newpnl`` over 7 quantile buckets of
        ``factor1``.
    """
    if option == 'no':
        # NOTE(review): these percentiles are computed, as in the original,
        # but never used on this code path.
        minfactor = np.percentile(data['factor1'].values, 2)
        maxfactor = np.percentile(data['factor1'].values, 98)
    # Limit movement outside model bounds.
    if option == 'yes':
        data.loc[data['factor1'] >= maxfactor, 'factor1'] = maxfactor
        data.loc[data['factor1'] <= minfactor, 'factor1'] = minfactor

    # The second layer of this simplified network controls the trend load.
    factorload2 = 1.0 / (1.0 + np.exp(-x[3] * data['range'] / data['duration']))
    # Scale this layer without allowing fixed movement.
    factorload = x[2] * factorload2 - x[2] / 2.0
    # Layer 1 - main classification factor (sigmoid).
    z = 1.0 / (1.0 + np.exp(-factorload * x[1] * data['factor1']))
    # Scale the model.
    adjustment_series = x[0] * pd.Series(z) - x[0] / 2

    data['newprice'] = data['bs_probability'] + adjustment_series.values
    data['newpnl'] = data['wins'] - data['newprice']

    if option == 'yes':
        data['factorload'] = x[2] * factorload2 - x[2] / 2.0
        data['factorload2'] = factorload2
        return 1.0

    d1 = np.repeat(0.0, 7)
    ranges = data.groupby(pd.qcut(data['range'], 7))
    for i, (_, m_data) in enumerate(ranges):
        bucket_pnl = m_data.groupby(
            pd.qcut(m_data['factor1'], 7))['newpnl'].mean()
        d1[i] = np.mean(bucket_pnl ** 2)
    return np.mean(d1) ** 0.5
def extend(data, treemakers):
    """Extends the dataframe data by loading treemakers for the remaining
    events.  See https://github.com/XENON1T/hax/pull/52 for more
    information.

    For every ``run_number`` in ``data`` the treemakers are loaded for that
    run's ``event_number`` values via ``load_single_dataset`` (first element
    of its result); the loaded frames are concatenated and merged into
    ``data`` with ``_merge_minitrees``.

    :param data: dataframe, assumed to be event-per-row
    :param treemakers: list of treemakers to load
    :returns: the merged result, carrying over ``data.cut_history``
    """
    new_minitrees = [
        load_single_dataset(run_number, treemakers,
                            event_list=events.event_number.values)[0]
        for run_number, events in data.groupby('run_number')
    ]
    result = _merge_minitrees(data, pd.concat(new_minitrees))
    result.cut_history = data.cut_history
    return result
def fit(self, df, y=None):
    """Learn per-category target statistics for every column of ``df``.

    For each column, ``y`` is grouped by the column's values.  Categories
    with at least ``max(self.min_freq, ceil(N * self.min_freq_ratio))``
    non-missing targets are kept, and ``self.mappers_[col]`` receives

    * ``'freq_ratio'``: ``count / N`` per kept category, if
      ``self.report_freq_ratio`` is set;
    * one entry per aggregator in ``self.aggregators``: the aggregated
      target per kept category.

    Also sets ``self.features_`` to the column list.

    Parameters
    ----------
    df : pandas.DataFrame
    y : pandas.Series or array-like
        Target values, required.  Anything that is not a Series is wrapped
        in one sharing ``df``'s index.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``y`` is None.
    """
    if y is None:
        raise ValueError("fit() requires the target values `y`")
    if not isinstance(y, pd.Series):
        y = pd.Series(np.asarray(y), index=df.index)

    N = df.shape[0]
    min_freq = max(self.min_freq, np.ceil(N * self.min_freq_ratio))
    self.mappers_ = {}
    self.features_ = list(df.columns)
    for col in self.features_:
        group_by = y.groupby(df[col])
        group_by_count = group_by.count()
        active_groups = group_by_count[group_by_count >= min_freq].index
        mapper = {}
        if self.report_freq_ratio:
            mapper['freq_ratio'] = (group_by_count / N)[active_groups].to_dict()
        for agg in self.aggregators:
            mapper[agg] = group_by.agg(agg)[active_groups].to_dict()
        self.mappers_[col] = mapper
    return self
def process_delta(delta, name):
    # type: (pd.Series, str) -> pd.Series
    """Sum a delta series per (year, month) and label the months.

    The first month is dropped, the remaining index entries are converted
    with ``pretty_month`` and the resulting series is named ``name``.
    """
    delta_per_month_gb = delta.groupby(
        [delta.index.year, delta.index.month]
    )
    # Drop the first month.
    delta_per_month = delta_per_month_gb.sum().iloc[1:]
    delta_per_month.index.name = "Month"
    delta_per_month.index = delta_per_month.index.to_series().apply(
        pretty_month)
    delta_per_month.name = name
    return delta_per_month
def parallelize_on_icao(df_in, func):
    """Group the dataframe by icao address and process the groups in
    parallel.

    ``func`` is called once per ``icao`` group in a pool of worker
    processes and must return something ``pd.concat`` accepts (e.g. a
    DataFrame).  The pool is closed and joined even when a worker raises.

    Returns the concatenation of all results.
    """
    # `cpu_count` was passed to Pool without being called; accept both the
    # function (e.g. multiprocessing.cpu_count) and a plain integer.
    pool_cpu_size = cpu_count() if callable(cpu_count) else cpu_count
    print("Number of unique icaos: %d" % len(df_in['icao'].unique()))
    groups = [group for _, group in df_in.groupby('icao')]
    pool = multiprocessing.Pool(pool_cpu_size)
    try:
        df_processed = pd.concat(pool.map(func, groups))
    finally:
        pool.close()
        pool.join()
    return df_processed
def add_value(target_df, data_df, group_col, typ):
    """Index-join extra 10Y columns onto each ``group_col`` group.

    * ``typ == 'swap'``: columns ``'10Y'``, ``'Butterfly 10y'`` and
      ``'Curve 10y'`` of the ``data_df`` rows whose ``Currency`` equals
      ``NATION_CURRENCY_DICT[key]``;
    * ``typ == 'credit'``: column ``'10Y'`` of the CSV named
      ``CREDIT_DICT[NATION_CURRENCY_DICT[key]]`` under
      ``ROOT_DIR + 'cleaned data/Monthly credit spread curves/'``, indexed
      by its parsed ``Date`` column.

    Returns the concatenation of the joined groups.  Raises ``ValueError``
    for any other ``typ`` (previously this surfaced as the less helpful
    "No objects to concatenate" ``ValueError``).
    """
    if typ not in ('swap', 'credit'):
        raise ValueError("typ must be 'swap' or 'credit', got %r" % (typ,))

    df_ls = []
    for key, sub_df in target_df.groupby(group_col):
        if typ == 'swap':
            nation_swap = data_df[data_df.Currency == NATION_CURRENCY_DICT[key]]
            df_ls.append(
                sub_df.join(nation_swap[['10Y', 'Butterfly 10y', 'Curve 10y']]))
        else:
            credit_df = pd.read_csv(
                ROOT_DIR + 'cleaned data/Monthly credit spread curves/' +
                CREDIT_DICT[NATION_CURRENCY_DICT[key]],
                parse_dates=True)
            credit_df.Date = pd.to_datetime(credit_df.Date)
            credit_df.set_index('Date', inplace=True)
            df_ls.append(sub_df.join(credit_df['10Y']))
    return pd.concat(df_ls)
def equity_resharp(newtotalpo):
    """Aggregate intraday records into one row per (shifted) day.

    ``newtotalpo`` is modified in place: ``stockdate`` is moved forward by
    six hours, and the columns ``day`` (``YYYYMMDD`` of the shifted time)
    and ``deltaposition`` (absolute change of ``totalposition`` from the
    previous row) are added.  Calling this twice on the same frame shifts
    the dates twice.

    Returns a DataFrame with one row per day and the columns ``profit``,
    ``comm``, ``equity``, ``deltaposition`` (daily sums),
    ``lastdayposition`` (the ``totalposition`` recorded at shifted time
    20:59:00, forward-filled over days without such a record) and ``day``.
    """
    newtotalpo['stockdate'] = newtotalpo['stockdate'] + offsets.DateOffset(hours=6)
    newtotalpo['day'] = newtotalpo['stockdate'].dt.strftime('%Y%m%d')

    at_close = newtotalpo['stockdate'].dt.strftime('%H%M%S') == '205900'
    mydailposition = newtotalpo.loc[at_close, ['day', 'totalposition']].rename(
        columns={'totalposition': 'lastdayposition'})

    newtotalpo['deltaposition'] = (
        newtotalpo['totalposition'] - newtotalpo['totalposition'].shift()).abs()

    # numeric_only skips the datetime column; reset_index makes 'day' an
    # ordinary column so the merge key is not ambiguous with the index.
    day_equity = newtotalpo.groupby('day').sum(numeric_only=True).reset_index()
    day_equity = pd.merge(day_equity, mydailposition, how='left', on='day')
    day_equity['lastdayposition'] = day_equity['lastdayposition'].ffill()
    return day_equity[['profit', 'comm', 'equity', 'deltaposition',
                       'lastdayposition', 'day']]
def run(self):
    """Pair generic and brand rows per practice and chemical, then write
    the combined table for every input file.

    For each (input, output) path pair from ``self.Config.append_dir``:
    load the frame, derive ``ChemID`` (first nine characters of the BNF
    code), aggregate with ``util.sumBy``, split on the generic flag
    (1.0 = generic, 0.0 = brand), outer-merge the two halves on practice
    and ``ChemID``, and add ``sum<col>`` / ``percent<col>`` (brand share)
    for the items, quantity and NIC columns.  Inputs that fail to load or
    are empty are skipped.
    """
    keys = self.Config.keys
    infiles = self.Config.append_dir("MakeDrugPairsIn")
    outfiles = self.Config.append_dir("MakeDrugPairsOut")
    for infile, outfile in zip(infiles, outfiles):
        data = self.loadDF(infile)
        # `if not data` raises for a DataFrame; test emptiness explicitly.
        if data is None or (data.empty if hasattr(data, 'empty') else not data):
            continue

        data['ChemID'] = data[keys['bnf']].str[0:9]
        data = util.sumBy(
            data, [keys['practice'], 'ChemID', keys['gen'], 'postal code'])
        grouped = data.groupby(keys['gen'])
        data = pandas.merge(
            grouped.get_group(1.0), grouped.get_group(0.0),
            on=[keys['practice'], 'ChemID'],
            left_index=False, right_index=False,
            how='outer', sort=False, suffixes=('_gen', '_brand'))

        # Take the brand row's postal code, falling back to the generic
        # row's.  This must happen before the zero-fill below: the original
        # tested for NaN after the NaNs had already been replaced by 0, so
        # the fallback never triggered.
        data['postal code'] = data['postal code_brand'].combine_first(
            data['postal code_gen'])
        # Missing counterparts count as zero.
        data = data.fillna(0)

        for col in [keys['items'], keys['quantity'], keys['nic']]:
            data['sum' + col] = data[col + '_brand'] + data[col + '_gen']
            data['percent' + col] = data[col + '_brand'] / data['sum' + col]

        data = data.drop(['INCLUDE_gen', 'INCLUDE_brand',
                          'GENERIC_gen', 'GENERIC_brand',
                          'postal code_gen', 'postal code_brand'], axis=1)
        data.to_csv(outfile, index=False)
def equity_resharp_huibao(newtotalpo):
    """Sum ``profit``, ``comm`` and ``equity`` per day.

    Each ``stockdate`` timestamp is moved forward by six hours and the day
    label (``'%Y%m%d'``) is taken from the shifted value.

    Side effects: ``newtotalpo`` is modified in place -- ``stockdate`` is
    overwritten with the shifted timestamps and a ``day`` column is added.

    Returns a DataFrame indexed by ``day`` with the columns ``profit``,
    ``comm`` and ``equity``.
    """
    deltatime = offsets.DateOffset(hours=6)
    newtotalpo['stockdate'] = newtotalpo['stockdate'] + deltatime
    newtotalpo['day'] = newtotalpo['stockdate'].apply(
        lambda ts: ts.strftime('%Y%m%d'))
    # Select the returned columns before summing; summing the datetime
    # ``stockdate`` column raises on current pandas.
    return newtotalpo.groupby('day')[['profit', 'comm', 'equity']].sum()


##############################
def get_benchmark_from_db(type):
    """Load a benchmark price series and derive monthly return statistics.

    Parameters
    ----------
    type : str
        ``'Top'`` selects the security code ``PERF-GLOBAL``; ``'Equity'``
        selects ``SPX INDEX``.  (The name shadows the ``type`` builtin; it is
        kept so that keyword callers continue to work.)

    Returns
    -------
    (bench_returns, bench_sharpe_ratio)
        ``bench_returns`` is the ``Returns`` column of the month-resampled
        prices after ``to_multindex``, with missing values dropped.
        ``bench_sharpe_ratio`` is that series (as a one-column frame named
        ``Returns``) divided by the standard deviation (``np.std``) of the
        row-to-row returns of the raw series within each (year, month).

    Raises
    ------
    ValueError
        If ``type`` is neither ``'Top'`` nor ``'Equity'`` (previously this
        surfaced as an ``UnboundLocalError`` after the connection was opened).
    """
    import binascii

    if type == 'Top':
        symbol = "PERF-GLOBAL"
    elif type == 'Equity':
        symbol = "SPX INDEX"
    else:
        raise ValueError("unknown benchmark type: %r" % (type,))

    # NOTE(review): host and credentials are hard-coded; they should come
    # from configuration or the environment.
    db = MySQLdb.connect("192.168.51.100", "PASS_DEV", "Develop-2015", "PASS_SYS")
    try:
        query = "select HD_PK from PASS_SYS.V_SERIE where ST_SECURITY_CODE='%s'" % (symbol)
        select_0 = pd.read_sql(query, db, coerce_float=False)
        # hexlify works on Python 2 and 3, unlike ``str.encode('hex')``.
        serie_key = binascii.hexlify(select_0.values[0][0]).decode('ascii')
        query1 = "select DT_DATE, NU_PX_LAST from PASS_SYS.V_MKTDATA where LK_SERIE = unhex('%s')" % (serie_key)
        bench = pd.read_sql(query1, db, index_col='DT_DATE')
    finally:
        # The connection used to be left open.
        db.close()

    # calculation of benchmark standard deviation by month
    # Computed on a separate Series: the old code added a ``Returns`` column
    # to ``bench`` itself through an alias, so the monthly step below tried
    # to store a two-column frame in one column.
    raw_returns = bench['NU_PX_LAST'].pct_change()
    bench_st_deviation = raw_returns.groupby(
        [raw_returns.index.year, raw_returns.index.month]
    ).apply(lambda rets: np.std(rets))

    # calculation of benchmark return by month
    bench = bench.resample("M").ffill()
    bench['Returns'] = bench['NU_PX_LAST'].pct_change()
    bench = to_multindex(bench)
    bench_returns = bench['Returns'].dropna()

    bench_sharpe_ratio = (bench_returns.to_frame(name='Returns')
                          / bench_st_deviation.to_frame(name='Returns'))
    return bench_returns, bench_sharpe_ratio
def combine_columns(dfOrig, codeName='SEO.Code.', prcName='SEO.Percentage.',
                    codeRange=5, index='Grant.Application.ID'):
    """Turn numbered code/percentage column pairs into weighted dummies.

    For each ``i`` in ``1..codeRange`` the column ``codeName + str(i)`` is
    reduced to its leading digits (missing codes become ``990000`` first,
    then every code is floor-divided by ``10000``) and one-hot encoded with
    ``codeName`` as the column prefix.  The dummies are reduced to one row
    per ``index`` value with ``max`` and multiplied by that group's maximum
    of ``prcName + str(i)``.  The per-``i`` tables are then added together,
    treating a column that is absent from one table as ``0``.

    ``dfOrig`` is not modified.

    Returns a DataFrame indexed by the ``index`` values, holding one column
    per encoded code plus a column named ``index`` that repeats the group
    key.  Remaining missing values are replaced by ``0``.
    """
    df = dfOrig.copy()
    code_columns = ['{}{}'.format(codeName, i) for i in range(1, codeRange + 1)]
    cleanDf = df[code_columns].fillna(990000) // 10000
    cleanDf[index] = df[index]

    dummyDf = []
    for i in range(1, codeRange + 1):
        code_col = '{}{}'.format(codeName, i)
        prc_col = '{}{}'.format(prcName, i)
        cleanDf[prc_col] = df[prc_col]

        currDummy = pd.get_dummies(cleanDf[code_col], prefix=codeName)
        currDummy[index] = cleanDf[index]
        currDummy[prc_col] = cleanDf[prc_col]
        # Column order is now: dummies..., index, percentage.  The key is
        # selected explicitly so that it also survives as a regular column.
        currDummy = currDummy.groupby(index)[currDummy.columns].max()

        # Weight every dummy (all but the last two columns) by the
        # percentage (last column); column-wise equivalent of the former
        # row-wise ``x[:-2] * x[-1]``.
        currDummy2 = currDummy.iloc[:, :-2].mul(currDummy.iloc[:, -1], axis=0)
        currDummy2[index] = currDummy[index]
        dummyDf.append(currDummy2)

    currDummy = dummyDf[0]
    for i in range(1, codeRange):
        currDummy = currDummy.add(dummyDf[i], fill_value=0.)
        # The addition also summed the key column; restore it.
        currDummy[index] = dummyDf[i][index]
    currDummy.fillna(0, inplace=True)
    return currDummy
print('Loaded load')

# Keep only samples strictly before 2015-01-01.
winddf = winddf[winddf.index < pd.Timestamp('2015-01-01 00:00:00')]
solardf = solardf[solardf.index < pd.Timestamp('2015-01-01 00:00:00')]
loaddf = loaddf[loaddf.index < pd.Timestamp('2015-01-01 00:00:00')]

winddf.index.name = 'Time'
solardf.index.name = 'Time'
loaddf.index.name = 'Time'

winddf.to_csv(outdir + 'wind_signal_ECMWF.csv', float_format='%.4f')
solardf.to_csv(outdir + 'solar_signal_ECMWF.csv', float_format='%.4f')
loaddf.to_csv(outdir + 'load_signal.csv', float_format='%.4f')

# NOTE(review): the script stops here; everything below is unreachable as
# written and appears to be kept for interactive use.
raise SystemExit

#
# Interesting plots

# Mean production for each hour of the day, relative to yearly mean.
# Increase of ~50% during midday.
# (``pd.groupby`` was removed from pandas; the method form is equivalent.)
(df.groupby(by=df.index.hour).mean()/df.mean(axis=0)).plot(c='k', alpha=0.1)

# Each signal averaged over its columns and divided by its overall mean.
w = winddf.mean(axis=1)/winddf.mean().mean()
s = solardf.mean(axis=1)/solardf.mean().mean()
l = loaddf.mean(axis=1)/loaddf.mean().mean()

# One column per value a in ``alphas``: a*w - (1-a)*s - l.
df = pd.DataFrame(data=np.array([a*w-(1-a)*s-l for a in alphas]).T,
                  columns=alphas, index=w.index)
def _box_reshape(vals, groupby, names, order): """Reshape the box/violinplot input options and find plot labels.""" # Set up default label outputs xlabel, ylabel = None, None # If order is provided, make sure it was used correctly if order is not None: # Assure that order is the same length as names, if provided if names is not None: if len(order) != len(names): raise ValueError("`order` must have same length as `names`") # Assure that order is only used with the right inputs is_pd = isinstance(vals, pd.Series) or isinstance(vals, pd.DataFrame) if not is_pd: raise ValueError("`vals` must be a Pandas object to use `order`.") # Handle case where data is a wide DataFrame if isinstance(vals, pd.DataFrame): if order is not None: vals = vals[order] if names is None: names = vals.columns.tolist() if vals.columns.name is not None: xlabel = vals.columns.name vals = vals.values.T # Handle case where data is a long Series and there is a grouping object elif isinstance(vals, pd.Series) and groupby is not None: groups = pd.groupby(vals, groupby).groups order = sorted(groups) if order is None else order if hasattr(groupby, "name"): if groupby.name is not None: xlabel = groupby.name if vals.name is not None: ylabel = vals.name vals = [vals.reindex(groups[name]) for name in order] if names is None: names = order else: # Handle case where the input data is an array or there was no groupby if hasattr(vals, 'shape'): if len(vals.shape) == 1: if np.isscalar(vals[0]): vals = [vals] else: vals = list(vals) elif len(vals.shape) == 2: nr, nc = vals.shape if nr == 1: vals = [vals] elif nc == 1: vals = [vals.ravel()] else: vals = [vals[:, i] for i in range(nc)] else: error = "Input `vals` can have no more than 2 dimensions" raise ValueError(error) # This should catch things like flat lists elif np.isscalar(vals[0]): vals = [vals] # By default, just use the plot positions as names if names is None: names = list(range(1, len(vals) + 1)) elif hasattr(names, "name"): if names.name is not None: 
xlabel = names.name # Now convert vals to a common representation # The plotting functions will work with a list of arrays # The list allows each array to possibly be of a different length vals = [np.asarray(a, np.float) for a in vals] return vals, xlabel, ylabel, names
beginning = pd.to_datetime("2014-07-01")
comments = comments[comments['date'].notnull()]
comments = comments[comments['date'] >= beginning]
comments.index = comments['date']
comments.index = comments.index.tz_localize('UTC').tz_convert('US/Central')

# determining gender membership
# A message counts for a set when it shares at least one token with it.
comments['male'] = [1 if mset.intersection(row.split()) else 0 for row in comments['norm_message']]
comments['female'] = [1 if fset.intersection(row.split()) else 0 for row in comments['norm_message']]
# Vectorised equivalents of the former row-by-row ``iterrows`` loops.
is_male = comments['male'] == 1
is_female = comments['female'] == 1
comments['both'] = (is_male & is_female).astype(int)
comments['none'] = (~(is_male | is_female)).astype(int)

# start of analysis on different categories in the data
# (``pd.groupby`` was removed from pandas; the method form is equivalent.)
grouped = comments.groupby(by=[comments.index.year, comments.index.month])
res = pd.DataFrame(columns=colNames)
for name, group in grouped:
    if name[0] >= 2015 or (name[0] == 2014 and name[1] >= 7):
        print(name)
        t = agg_groups(group, name)
        # NOTE(review): DataFrame.append no longer exists in pandas >= 2.0;
        # the replacement depends on what agg_groups returns -- confirm.
        res = res.append(t, ignore_index=True)
res.to_csv(runName + 'month_year.csv', sep=',')

grouped = comments.groupby(by=[comments.index.dayofweek, comments.index.hour])
res = pd.DataFrame(columns=colNames)
for name, group in grouped:
    print(name)
    t = agg_groups(group, name)
77177] rng_seed_list2 = range(9725, 9727+50*7, 7) rng_seed_list3 = range(9726, 9728+50*7, 7) rng_seed_list = rng_seed_list1 + rng_seed_list2 + rng_seed_list3 assert len(rng_seed_list) >= NUM_RAND ####### 3. Augment training data ####### data = np.load("./data/processed_train.npy") obs_ids_all = np.load("./valid/obs_ids_valid_cv%s.npy" % (CV)) data_pd = pd.DataFrame(data=data[:,0:], columns=COLUMNS) data_pd_ids_all = np.array(data_pd['Id']) data_pd_ids_selected = np.in1d(data_pd_ids_all, obs_ids_all) data_pd_filtered = data_pd[data_pd_ids_selected] data_pd_gp = pd.groupby(data_pd_filtered, "Id") data_size = len(data_pd_gp) for jj, rng_seed in enumerate(rng_seed_list[0:NUM_RAND]): rng = np.random.RandomState(rng_seed) output = np.empty((data_size, INPUT_WIDTH, 22)) y_output = np.zeros(data_size) i = 0 for _, group in data_pd_gp: group_array = np.array(group) X = extend_series(group_array[:,1:23], rng, target_len=INPUT_WIDTH) y = group_array[0,23] output[i,:,:] = X[:,:] y_output[i]= y i += 1
def test_groupby(self):
    # Calling the top-level ``pd.groupby`` function must emit a
    # FutureWarning; the warning's stack level is not checked.
    values = pd.Series([1, 2, 3])
    keys = [1, 1, 1]
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        pd.groupby(values, keys)
bench_sharpe_ratio = get_benchmark_from_db(group)[1]

# Get the cumulative return/risk (sharpe ratio) of all strategies for each month
for freq in freqs:
    resultsdir = os.getcwd() + '/synology/%s/RESULTS_%s/' % (group, freq)
    if not os.path.exists(resultsdir + "Ranking/"):
        os.makedirs(resultsdir + "Ranking/")

    # Strategy id = file name without directory and extension.
    strategy_ids = [results.split('/')[-1].split('.')[0] for results in glob.glob(resultsdir + '*.csv')]

    i = 0      # 0 until the result frames have been created
    ident = 0  # 1-based running number used as the strategy's column label
    for strategy_id in strategy_ids:
        ident += 1
        returns = pd.read_csv(resultsdir + strategy_id + ".csv", index_col='Unnamed: 0', parse_dates=True)['returns']

        # Group once by (year, month) and reuse the grouping.
        # (``pd.groupby`` was removed from pandas; the method is equivalent.)
        monthly = returns.groupby([returns.index.year, returns.index.month])
        returns_copy1 = monthly.apply(lambda rets: pa.ts_metrics.cum_returns(rets).iloc[-1])
        st_deviation = monthly.apply(lambda rets: np.std(rets))
        sharpe_ratio = returns_copy1 / st_deviation
        strategy_result = pd.concat([returns_copy1.to_frame(), st_deviation.to_frame(), sharpe_ratio.to_frame()], axis=1)

        if i == 0:
            # The first strategy defines the index of both result frames.
            sratio = pd.DataFrame(index=strategy_result.index)
            cum_returns = pd.DataFrame(index=strategy_result.index)
            i = 1
        sratio = pd.concat([sratio, sharpe_ratio.to_frame(name=ident)], axis=1)
        cum_returns = pd.concat([cum_returns, returns_copy1.to_frame(name=ident)], axis=1)

    # NOTE(review): the nesting level of the next two statements was
    # reconstructed from flattened source -- confirm against the original.
    bench_returns.name = 14
    bench_sharpe_ratio.columns = [14]
# TMAX - maximum daily temperature, Degrees Celsius
# TMIN - minimum daily temperature, Degrees Celsius
# AWND - average daily wind speed, meters per second
#
# An example row from the data:

# In[10]:

weather_chicago.head(1)

# You can investigate the weather conditions in Chicago using the plot below.
# Clicking an entry in the legend toggles the visibility of that data series.

# In[11]:

# Mean of every column per ISO week of the year.  ``pd.groupby`` and
# ``DatetimeIndex.week`` were both removed from pandas;
# ``isocalendar().week`` yields the same week numbers.
weather_temp = weather_chicago.groupby(
    by=[weather_chicago.index.isocalendar().week.astype('int64').values]
).mean()

layout = go.Layout(
    title='Mean values of weather conditions in Chicago per week of the year',
    yaxis=dict(
        rangemode='tozero',
        autorange=True,
        hoverformat='.1f',
        title='[respective units]'
    ),
    xaxis=dict(
        title='Week of the year'
    )
)

data_list = []
def process_weather_stations(weather_stations, path='', frequency='A',
                             plot_monthly_pattern=False,
                             plot_yearly_rainfall=False):
    """Read station text files and summarise rainfall and evaporation.

    Parameters
    ----------
    weather_stations : list of str
        Station names; each is read from ``<path>/<station>.txt``.
    path : str
        Directory containing the station files.
    frequency : {'A', 'M'}
        ``'A'`` returns the mean of the annual sums (one value per column);
        ``'M'`` returns the table of monthly means.
    plot_monthly_pattern : bool
        Draw a bar chart of the average value per calendar month.
    plot_yearly_rainfall : bool
        Draw the rainfall series of each station (raw, monthly sum, annual
        sum) and a box plot of the annual sums.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Columns are ``<station>`` (from the ``Rain`` column) and
        ``<station>_ET`` (from the ``Evap`` column).

    Raises
    ------
    AssertionError
        If ``frequency`` is not ``'A'`` or ``'M'``.
    """
    assert frequency in ['A', 'M'], "Frequency must be either: 'A' or 'M'"

    weather_station_details = {}
    weather_dfs = {}

    for station in weather_stations:
        station_file = os.path.join(path, station + '.txt')
        with open(station_file, 'r') as f:
            text = f.read()

        # Raw strings: '\S' and friends are invalid escapes in normal ones.
        # Get station number:
        station_number = re.search(r'Patched Point data for station: (\S+)', text).group(1)
        # Get station Lat Long which corresponds to GDA94:
        station_latlong = re.search(r'Lat: (\S+) Long: (\S+)', text).group().strip('"')
        # Get elevation of station:
        station_elev = re.search(r'Elevation:\s+(\w+)', text).group()
        weather_station_details[station] = [station_number, station_latlong, station_elev]

        # Read in time series data:
        weather_dfs[station] = pd.read_csv(station_file,
                                           index_col=0,
                                           skiprows=[41],
                                           parse_dates=True,
                                           infer_datetime_format=True,
                                           delim_whitespace=True,
                                           comment='"',
                                           skipinitialspace=True,
                                           usecols=[0,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17])

    def cm2inch(*tupl):
        # Convert centimetres to inches; accepts numbers or a single tuple.
        inch = 2.54
        if isinstance(tupl[0], tuple):
            return tuple(i/inch for i in tupl[0])
        else:
            return tuple(i/inch for i in tupl)

    def get_rain_and_ET_from_df(df, stations, freq, how='sum'):
        # One 'Rain' and one 'Evap' (suffix '_ET') column per station,
        # resampled to ``freq`` with the requested aggregation.  Any other
        # ``how`` adds nothing and yields an empty frame.
        new_df = pd.DataFrame()
        for station in stations:
            if how == 'mean':
                new_df.loc[:, station] = df[station]['Rain'].resample(freq).mean()
                new_df.loc[:, station + '_ET'] = df[station]['Evap'].resample(freq).mean()
            elif how == 'sum':
                new_df.loc[:, station] = df[station]['Rain'].resample(freq).sum()
                new_df.loc[:, station + '_ET'] = df[station]['Evap'].resample(freq).sum()
            # end if
        # end for
        return new_df

    annual_weather_df = get_rain_and_ET_from_df(weather_dfs, weather_stations, 'A', how='sum')
    monthly_weather_df = get_rain_and_ET_from_df(weather_dfs, weather_stations, 'M', how='mean')

    if plot_yearly_rainfall:
        plt.figure(figsize=cm2inch(18,8))
        plt.ylabel("Annual Rainfall [mm]")
        for station in weather_stations:
            rain = weather_dfs[station]['Rain']
            # ``resample(..., how=...)`` is no longer accepted by pandas;
            # ``.resample(rule).sum()`` is the equivalent used elsewhere in
            # this function.
            annual_rain = rain.resample("A").sum()
            rain.plot()
            rain.resample("M").sum().plot()
            annual_rain.plot(legend=True,
                             label=station + ', '
                             + weather_station_details[station][0] + ', '
                             + weather_station_details[station][2]
                             + ', Average: '
                             + str(annual_rain.mean())[:5] + 'mm')
        # NOTE(review): the nesting of the following statements was
        # reconstructed from flattened source -- confirm.
        plt.xlabel("Year")
        plt.legend(bbox_to_anchor=(0, 1), loc='upper left', ncol=1)
        annual_weather_df.plot(kind='box')
        plt.ylabel("Annual Rainfall [mm]")

    if plot_monthly_pattern:
        Months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        # (``pd.groupby`` was removed from pandas; the method is equivalent.)
        month_avg = monthly_weather_df.groupby(by=[monthly_weather_df.index.month]).mean()
        # Requires all twelve calendar months to be present in the data.
        month_avg['Months'] = Months
        month_avg.plot(kind='bar', x='Months', y=weather_stations)
        plt.ylabel('Average Monthly Rainfall [mm]')
        plt.xlabel("")
        plt.tight_layout()
        plt.legend(bbox_to_anchor=(0, 1), loc='upper left', ncol=1)

    if frequency == 'A':
        # Keeping this as is for now but should not calculate mean here
        return annual_weather_df.mean()
    if frequency == 'M':
        return monthly_weather_df
def genplot(df2, day, pdf=None):
    """Draw a team-by-site bubble chart for one day.

    Parameters
    ----------
    df2 : DataFrame
        Needs the columns ``Team``, ``Location``, ``Day``, ``Site`` and
        ``TeamMap``.  ``Site`` and ``TeamMap`` are used as x and y
        coordinates; each marker is centred in its cell by adding 0.5.
    day : value compared against ``df2['Day']``
        Only rows of this day are plotted; axis extents and team labels are
        taken from the full frame.
    pdf : object with a ``savefig()`` method, or None
        When given, the figure is saved through it and closed; otherwise the
        figure is shown.

    The marker area is 100 times the number of rows of that day sharing the
    same (Team, Site) pair.
    """
    from collections import Counter

    # Figure out how many teams and locations (across all days)
    num_teams = len(df2['Team'].unique())
    num_locations = len(df2['Location'].unique())

    # Complete for all days. Consider loop based on 'Day'.unique()
    daydf = df2[df2['Day'] == day]
    colors = getColors()

    X = daydf['Site'].values + 0.5
    Y = daydf['TeamMap'].values + 0.5

    # Count rows per (Team, Site) once instead of filtering the frame again
    # for every row.
    pairs = list(zip(daydf['Team'].values, daydf['Site'].values))
    occupancy = Counter(pairs)
    t = [100 * occupancy[pair] for pair in pairs]

    plt.scatter(X, Y, s=t, alpha=0.5)

    # Fill in the background
    # TODO: colors is conveniently sized to match data, should expand
    for i in range(0, num_locations):
        xgap = [i, i+1]
        plt.fill_between(x=xgap, y1=num_teams, y2=0, color=colors[i], alpha=0.2)

    # Consider setting the color or alpha here
    for i in range(0, num_teams-1):  # don't bother with top line
        plt.axhline(y=i+1, alpha=0.2)

    # Location labels are fixed rather than derived from the data.
    loc_labels = ['Off', 'Tel', 'Loc A', 'Loc B', 'Loc C', 'Vacation', 'Other', ]
    plt.xticks(np.arange(len(loc_labels))+0.5, loc_labels)

    # Make sure to use the full df not day-specific
    team_labels = df2['Team'].unique()
    plt.yticks(np.arange(len(team_labels))+0.5, team_labels)

    ax = plt.gca()
    ax.set_autoscale_on(False)
    ax.invert_yaxis()
    ax.xaxis.tick_top()
    plt.tick_params(labelsize=10)
    # These arguments are booleans; the string 'off' is truthy and would
    # leave the ticks switched on.
    plt.tick_params(axis='x', top=False)
    plt.tick_params(axis='y', left=False)
    plt.tick_params(axis='y', right=False)

    plt.xlabel('Locations for %s'%(day))
    plt.ylabel('Team')
    plt.xlim(0, num_locations)
    plt.ylim(0, num_teams)

    if pdf:
        pdf.savefig()
        plt.close()
    else:
        plt.show()
print_image_directive(filename, figure)

best_index = numpy.where(scores==numpy.max(scores))
print(" Best Score, {0:.2f}".format(scores[best_index][0]))
print(" max-depth parameter with best score,{0}".format(parameters[best_index][0]))

# Histogram of the best parameter values, one bin per unit of their range.
bin_range = best_models.parameter.max() - best_models.parameter.min()
bins = pandas.cut(best_models.parameter, bin_range)
counts = bins.value_counts()
for bounds in counts.index:
    # NOTE(review): this expects string bin labels such as '(a, b]';
    # current pandas yields Interval objects from ``cut`` -- confirm which
    # pandas version this script targets.
    parameter = bounds.split(',')[0].lstrip('()')
    print(' {0},{1}'.format(int(round(float(parameter))), counts.loc[bounds][0]))

# (``pandas.groupby`` was removed from pandas; the method is equivalent.)
parameter_group = best_models.groupby('parameter')

medians = parameter_group.score.median()
for max_depth in medians.index:
    print(' {0},{1:.2f}'.format(max_depth, medians.loc[max_depth]))

maxes = parameter_group.score.max()
for max_depth in maxes.index:
    print(' {0},{1:.2f}'.format(max_depth, maxes.loc[max_depth]))

# First model that reached the maximum score.
best_model = models[best_index[0][0]]
sale_price = best_model.predict(CLIENT_FEATURES)
predicted = sale_price[0] * 1000
actual_median = housing_frame.median_value.median() * 1000
print(" Predicted value of client's home; ${0:,.2f}".format(predicted))
print(" Difference between median and predicted; ${0:,.2f}".format(actual_median - predicted))
def munge_data(df_orig):
    """Build per-application feature tables and split them into train/test.

    The input holds one row per (application, investigator).  Investigator
    columns are aggregated per ``Grant.Application.ID``, categorical columns
    are one-hot encoded, the RFCD/SEO code columns are expanded by
    ``combine_columns``, and everything is merged on the application id.
    ``time_mask`` then decides which rows form the test set; the remaining
    rows form the training set.

    ``df_orig`` is not modified.

    Returns
    -------
    (X_train, y_train, X_test, y_test, finalDf_test, finalDf_train)
        ``y_*`` are the ``Grant.Status`` values, ``X_*`` the values of the
        returned frames, which no longer contain ``Grant.Status`` and
        ``Start.date``.
    """
    key = 'Grant.Application.ID'
    df = df_orig.copy()
    del df['Person.ID.1']

    # Find the oldest investigator's birth date
    oldest = pd.DataFrame(df.groupby(key)['Year.of.Birth.1'].min())

    # Get the number of investigators for each role
    # (``pd.groupby`` was removed from pandas; the method is equivalent.
    # The key is part of the selection, so it survives as a summed column.)
    numRole = pd.get_dummies(df['Role.1'])
    numRole[key] = df[key]
    numRole = numRole.groupby(key)[numRole.columns].sum()

    # Get the % of aussies
    numAussies = pd.get_dummies(df['Country.of.Birth.1'])
    numAussies[key] = df[key]
    numAussies = numAussies.groupby(key)[numAussies.columns].sum()
    # The denominator must only count investigators with a known country;
    # it used to include the summed application-id column as well.
    country_counts = numAussies.drop([key], axis=1, errors='ignore')
    # We just imputed all values with NaN (no country info) to zero
    prcAussies = pd.DataFrame(
        (country_counts['Australia'] / country_counts.sum(axis=1)).fillna(0),
        columns=['% Australians'])

    # Sum the # of published papers
    # (Selecting several columns of a groupby needs a list, not a tuple.)
    numPapers = df.groupby(key)[['A..1', 'A.1', 'B.1', 'C.1',
                                 'Number.of.Successful.Grant.1',
                                 'Number.of.Unsuccessful.Grant.1']].sum()

    # Missing band -> 'A'; the band letter is then stored as its code point.
    band = 'Contract.Value.Band...see.note.A'
    df[band] = df[band].fillna('A')
    df[band] = df[band].apply(lambda x: ord(x.rstrip(' ')))

    # converting categories to dummy variables
    grant_cats = pd.get_dummies(df['Grant.Category.Code'], dummy_na=True)
    grant_cats[key] = df[key]
    grant_cats = grant_cats.groupby(key)[grant_cats.columns].min()
    grant_cats = pd.DataFrame(grant_cats)

    # imputing missing percentages for the RFCD.Percentage and
    # SEO.Percentage columns with the column mean (assigned back rather than
    # filled in place on a column selection, which may act on a copy)
    for prefix in ('RFCD.Percentage.', 'SEO.Percentage.'):
        for i in range(1, 6):
            col = '{}{}'.format(prefix, i)
            df[col] = df[col].fillna(df[col].mean())

    rfcds = combine_columns(df, 'RFCD.Code.', 'RFCD.Percentage.')
    seos = combine_columns(df, 'SEO.Code.', 'SEO.Percentage.')

    # Get rid of everything we don't need
    # REMINDER - LATER COME BACK AND DEAL WITH DEPARTMENT, FACULTY, NO YEARS AT FACULTY, PHD, ETC
    df.drop(['A..1', u'A.1', u'B.1', u'C.1', u'Country.of.Birth.1',
             u'Dept.No..1', u'Faculty.No..1', u'Home.Language.1',
             u'No..of.Years.in.Uni.at.Time.of.Grant.1',
             u'Number.of.Successful.Grant.1',
             u'Number.of.Unsuccessful.Grant.1', u'Role.1', u'Sponsor.Code',
             u'With.PHD.1', u'Year.of.Birth.1',
             u'SEO.Code.4', u'SEO.Code.5', u'SEO.Code.1', u'SEO.Code.2',
             u'SEO.Code.3',
             u'RFCD.Code.1', u'RFCD.Code.2', u'RFCD.Code.3', u'RFCD.Code.4',
             u'RFCD.Code.5',
             'Grant.Category.Code',
             u'RFCD.Percentage.1', u'RFCD.Percentage.2',
             u'RFCD.Percentage.3', u'RFCD.Percentage.4',
             u'RFCD.Percentage.5',
             u'SEO.Percentage.1', u'SEO.Percentage.2', u'SEO.Percentage.3',
             u'SEO.Percentage.4', u'SEO.Percentage.5', ],
            inplace=True, axis=1)
    df.drop_duplicates(inplace=True)
    df.set_index(key, inplace=True)

    # Merge every per-application table on the application id (the index).
    finalDf = df
    for part in (oldest, numRole, prcAussies, numPapers, grant_cats, rfcds, seos):
        finalDf = pd.merge(finalDf, part, left_index=True, right_index=True)

    # imputing ages with median
    finalDf['Year.of.Birth.1'] = finalDf['Year.of.Birth.1'].fillna(finalDf['Year.of.Birth.1'].median())

    # imputing missing papers and missing successful/unsuccessful grants
    # with 0
    for col in ('A..1', 'A.1', 'B.1', 'C.1',
                'Number.of.Successful.Grant.1',
                'Number.of.Unsuccessful.Grant.1'):
        finalDf[col] = finalDf[col].fillna(0)

    # Drop the suffixed copies of the id column created by the merges above.
    del finalDf['Grant.Application.ID_y']
    del finalDf['Grant.Application.ID_x']

    # 'Start.date' (day/month/2-digit year) as seconds since the epoch in
    # local time.
    finalDf['Proc.Start.Date'] = finalDf['Start.date'].apply(
        lambda x: time.mktime(datetime.datetime.strptime(x, '%d/%m/%y').timetuple()))

    # splitting dataframe
    mask = time_mask(finalDf)
    finalDf_test = finalDf[mask]
    # ``~`` is the boolean negation; unary minus is rejected for boolean
    # arrays by current NumPy.
    finalDf_train = finalDf[~mask]

    # creating X, y splits for test and train dataframes
    # (``drop`` returns new frames instead of deleting columns from slices.)
    y_train = finalDf_train['Grant.Status'].values
    finalDf_train = finalDf_train.drop(['Grant.Status', 'Start.date'], axis=1)
    X_train = finalDf_train.values

    y_test = finalDf_test['Grant.Status'].values
    finalDf_test = finalDf_test.drop(['Grant.Status', 'Start.date'], axis=1)
    X_test = finalDf_test.values

    return X_train, y_train, X_test, y_test, finalDf_test, finalDf_train