def test_pivot_table_multi(self):
    res_sparse = pd.pivot_table(self.sparse, index='A', columns='B',
                                values=['D', 'E'])
    res_dense = pd.pivot_table(self.dense, index='A', columns='B',
                               values=['D', 'E'])
    res_dense = res_dense.apply(lambda x: x.astype("Sparse[float64]"))
    tm.assert_frame_equal(res_sparse, res_dense)
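# Hedged standalone sketch (not part of the test suite above): a toy frame
# showing what a multi-values pivot like the one under test returns -- a
# DataFrame whose columns are a (value, B) MultiIndex.
import pandas as pd

toy = pd.DataFrame({'A': ['x', 'x', 'y', 'y'],
                    'B': ['p', 'q', 'p', 'q'],
                    'D': [1.0, 2.0, 3.0, 4.0],
                    'E': [10.0, 20.0, 30.0, 40.0]})
print(pd.pivot_table(toy, index='A', columns='B', values=['D', 'E']))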
def data2sub_matrix(data_fit, values_col, index_col, type_form, aggfunc='mean'):
    """
    This creates a substitution matrix from input data (frequencies `data_lbl` or `data_fit`).

    :param data_fit: fitness data (`data_fit`).
    :param values_col: column name with values (str).
    :param index_col: column name with index (str).
    :param type_form: type of values ["aas" : amino acids | "cds" : codons].
    :param aggfunc: aggregation function passed to `pd.pivot_table` (default 'mean').
    """
    from dms2dfe.lib.global_vars import aas_21, cds_64
    if aggfunc == "mean":
        data_sub_matrix = pd.pivot_table(data_fit, values=values_col,
                                         index=index_col, columns='ref')
    else:
        data_sub_matrix = pd.pivot_table(data_fit, values=values_col,
                                         index=index_col, columns='ref',
                                         aggfunc=aggfunc)
    # make it 21X21
    if type_form == "aas":
        sub_matrix = pd.DataFrame(index=aas_21, columns=aas_21)
        sub_matrix.index.name = 'Mutation to'
        sub_matrix.columns.name = 'Wild type'
        for ref in aas_21:
            for mut in aas_21:
                try:
                    sub_matrix.loc[mut, ref] = data_sub_matrix.loc[mut, ref]
                except KeyError:
                    sub_matrix.loc[mut, ref] = np.nan
        return sub_matrix.astype(float)
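# Hedged aside, not from the original module: the nested try/except loop above
# can be written as a single reindex, which inserts NaN for any missing
# (mutation, wild-type) pair. Toy labels stand in for aas_21 here.
import pandas as pd

labels = ['A', 'C', 'D']  # stand-in for the 21 amino-acid labels
pivot = pd.DataFrame([[1.0, 2.0]], index=['A'], columns=['A', 'C'])
square = pivot.reindex(index=labels, columns=labels).astype(float)
square.index.name, square.columns.name = 'Mutation to', 'Wild type'
print(square)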
def main(fh_xls,well): info=pd.read_excel(fh_xls,'info') info=info.set_index('varname') for var in info.iterrows() : val=info['input'][var[0]] if not pd.isnull(val): exec("%s=info['input']['%s']" % (var[0],var[0]),locals(), globals()) else: exec("%s=info['default']['%s']" % (var[0],var[0]),locals(), globals()) data_job=pd.read_excel(fh_xls,'JobView') data_fns=pd.pivot_table(data_job,values='File Name',index='Loop_bleach Index',columns='Well Name', aggfunc=lambda x: x.iloc[0]) data_fns_P =pd.pivot_table(data_job,values='File Name',index='TimeLapse1 Index',columns='Well Name', aggfunc=lambda x: x.iloc[0]) data_fns =pd.concat([data_fns,data_fns_P],axis=0) wells=[str(x) for x in list(data_job['Well Name'].unique())] if not any(x in well for x in wells): print "### ERROR : Could not find '%s'!" % well sys.exit(1) if not exists("%s.%sstb1.mp4" % (fh_xls,well)): print ">>> STATUS : nd2vid : %s" % well nd_fns=data_fns[well].dropna().unique() arr_list=nd2arr_list(nd_dh,nd_fns) arr_list_stb=raw2phasecorr(arr_list,clip=0.125) plt.imshow(arr_list_stb[0]) plt.savefig('%s.%s_ref_frame.png' % (fh_xls,well)) regions,kins_mean=arr_list2regions(arr_list_stb,time_increment) kins_mean.to_csv('%s.%s_kins_mean.csv' % (fh_xls,well)) arr_list2vid(arr_list_stb,regions,kins_mean,('%s.%sstb.mp4' % (fh_xls,well)),384, 384) # arr_list2vid(arr_list ,regions,kins_mean,('%s.%sraw.mp4' % (fh_xls,well)),384, 384) else: print ">>> STATUS : nd2vid :already done"
def num_hpo(self, fdr_cutoff=0.3, min_snp2gene_obs=2, dropna=False, drop_empty_cols=True):
    '''
        Returns a summary table with the number of HPO genes discovered
        for the different networks.
    '''
    candidates = self.high_priority_candidates(fdr_cutoff=fdr_cutoff,
                                               min_snp2gene_obs=min_snp2gene_obs)
    candidates['fdr'] = fdr_cutoff
    # Calculate Totals
    total = pd.pivot_table(
        candidates,
        index=['fdr', 'Method', 'COB'],
        values='gene',
        dropna=dropna,
        aggfunc=lambda x: len(set(x))
    ).fillna(0).astype(int)
    total.columns = ['Total']
    total = total.T
    # Pivot and aggregate
    by_term = pd.pivot_table(
        candidates,
        columns=['fdr', 'Method', 'COB'],
        index=['Term'],
        values='gene',
        dropna=dropna,
        aggfunc=lambda x: len(set(x))
    ).fillna(0).astype(int)
    num_hpo = by_term.append(total)
    if drop_empty_cols:
        num_hpo = num_hpo.loc[:, num_hpo.sum() > 0]
    return num_hpo
def get_emp_SE(data, code_type):
    from scipy import stats

    allcor = data[data['acc'].isin([1])]
    cor_pivot = pd.pivot_table(allcor, values='rt', columns=['stim', 'cue'],
                               index=['subj_idx'], aggfunc=np.average)
    acc_pivot = pd.pivot_table(data, values='acc', columns=['stim', 'cue'],
                               index=['subj_idx'], aggfunc=np.average)

    # Get theoretical RT S.E.M.'s
    sem_rt = []
    for img, cue in cor_pivot.columns:
        x = stats.sem(cor_pivot[img][cue])
        sem_rt.append(x)

    # Get theoretical ACCURACY S.E.M.'s
    sem_acc = []
    for img, cue in acc_pivot.columns:
        x = stats.sem(acc_pivot[img][cue])
        sem_acc.append(x)

    if code_type == 'HNL':
        face_emp_acc_SE = sem_acc[:3]
        house_emp_acc_SE = sem_acc[3:]
        face_emp_rts_SE = sem_rt[:3]
        house_emp_rts_SE = sem_rt[3:]
    else:
        face_emp_acc_SE = sem_acc[:5]
        house_emp_acc_SE = sem_acc[5:]
        face_emp_rts_SE = sem_rt[:5]
        house_emp_rts_SE = sem_rt[5:]

    sem_list = [face_emp_acc_SE, house_emp_acc_SE, face_emp_rts_SE, house_emp_rts_SE]
    return sem_list
def change_finder(extra_sites, long_term, all_sites):
    '''
    Function to calculate % difference between the long-term site time series
    and the bootstrapped all-sites time series
    '''
    for n in range(len(extra_sites)):
        # subset all sites to drop one value
        drop_site = extra_sites.iloc[n][0]
        sites2test = all_sites[all_sites.Site != drop_site]

        # Find normal Volumes
        long_t = pivot_df(long_term)
        all_s = pivot_df(sites2test)

        date_influence = (all_s['Norm_Vol'] - long_t['Norm_Vol']) / (all_s['Norm_Vol'])
        date_influence = date_influence.rename('Pct_Change')

        long_t = pd.pivot_table(long_t.reset_index(), index=['TripDate'], values=['Norm_Vol'])
        all_s = pd.pivot_table(all_s.reset_index(), index=['TripDate'], values=['Norm_Vol'])

        if n == 0:
            in_df = None
            in_df2 = None
            df_back, df2_back = merge_df(long_t, all_s, drop_site, in_df, date_influence, in_df2)
        else:
            df_back, df2_back = merge_df(long_t, all_s, drop_site, df_back, date_influence, df2_back)
    return df_back, df2_back
def calc_cleaning(df_calc, option=1):
    """ Take data, clean it, return for calculations
        option=1 means total minutes in raw or percent
        option=2 means for salaries
    """
    df_calc.fillna(0, inplace=True)  # Replace NaN with 0 for calcs
    df_calc['total_time'] = ((df_calc['hours_spent'] * 60) +
                             df_calc['minutes_spent'])
    df_calc['username'] = df_calc['username'].apply(lambda x: x.lower())
    if option == 1:  # General, returns total minutes (raw or percent)
        table = pd.pivot_table(df_calc, values=['total_time'],
                               index=['username'], columns=['program_name'],
                               aggfunc=np.sum, margins=True)  # Pivot data
        return table.total_time
    else:  # option == 2, for salary
        df_calc['hourly_wage'] = df_calc['time.user.userprofile.salary'] / 52 / 35  # Annual to hourly
        df_calc.rename(columns={'time.user.userprofile.department.name': 'dept'}, inplace=True)
        df_calc['cost'] = (df_calc['total_time'] / 60) * df_calc['hourly_wage']  # Cost by dept
        table = pd.pivot_table(df_calc, values=['cost'],
                               index=['dept'], columns=['program_name'],
                               aggfunc=np.sum, margins=True)
        return table
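# Hedged usage sketch for calc_cleaning (column names follow the function's
# expectations; the data itself is made up).
import numpy as np
import pandas as pd

toy = pd.DataFrame({'username': ['Ann', 'ANN', 'Bob'],
                    'program_name': ['p1', 'p2', 'p1'],
                    'hours_spent': [1, 0, 2],
                    'minutes_spent': [30, 45, 0]})
print(calc_cleaning(toy, option=1))  # total minutes per user x program, with margins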
def getFitLine(df):  # returns one-column df with avg speed by hr (index)
    df2 = copy.deepcopy(df)
    counter = 1
    while counter < len(df2) - 6:
        diff = np.sqrt((df2.speeds[counter] - df2.speeds[counter - 1])**2)
        percentDiff = diff / (df2.speeds[counter - 1] + .00000001)
        if percentDiff > .3:
            df2.loc[counter, 'speeds'] = np.nan
            df2.loc[counter - 1, 'speeds'] = np.nan
            df2.loc[counter + 1, 'speeds'] = np.nan
            df2.loc[counter + 2, 'speeds'] = np.nan
            df2.loc[counter + 3, 'speeds'] = np.nan
            df2.loc[counter + 4, 'speeds'] = np.nan
            counter += 6
        else:
            counter += 1

    # only keeping entries where altDelta is less than 1, speed is greater than 0.5,
    # and the change in speed is below the threshold handled above
    df2 = df2[(np.sqrt(df2.altDeltas**2) < 1.0) & (df2.speeds > 0.5)]

    # consolidating to hr groups
    hrGroups = np.arange(60, 200, 5)  # buckets five wide

    def assignHrGroups(x):
        for i in range(len(hrGroups) - 1):
            if x >= hrGroups[i] and x < hrGroups[i + 1]:
                return hrGroups[i]

    hrs = df2.hr
    hrs = hrs.map(assignHrGroups)
    df2['hrGroup'] = hrs

    # making pivot table to consolidate by hr
    fitLine = pivot_table(df2, ['speeds'], 'hrGroup', aggfunc=np.mean)
    f = pivot_table(df2, ['speeds'], 'hrGroup', aggfunc=len)
    fitLine = DataFrame({'date': df.date[0], 'hr': fitLine.index,
                         'avgSpeed': fitLine.speeds, 'count': f.speeds})
    fitLine = fitLine[fitLine['count'] >= 10]  # only keeping groups with at least 10 entries
    return fitLine
def mkfile(dfd, dfs): #first open file writer = pd.ExcelWriter('grouped_data_4heat.xlsx') dfd.sort_index() print(dfd) #using pivot_table method to create the half matrix pivotplay1 = pd.pivot_table(dfd, values="fc", index=['position1','mut_type1'], columns=['position2','mut_type2']) #create the transposed half matrix pivotplay2 = pd.pivot_table(dfd, values="fc", index=['position2','mut_type2'], columns=['position1','mut_type1']) #sneakily rename the levels in transposed matrix pivotplay2.index.names = pivotplay1.index.names pivotplay2.columns.names = pivotplay1.columns.names #merge the two matrices combined_pivot = pivotplay1.combine_first(pivotplay2) combined_pivot.to_excel(writer,'combined_pivot') #need to fill the ID portion of the matrix, using the set_value fxn: #df.set_value('Col', 'Index', value) or using some nifty solns found here: http://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/ print(dfs) def fill_id(x): if type(x) is NaN: if dfs('mut_type', 'position') == combined_pivot(x, 'mut_type1','position1'): return dfs('fc') else: return 0 combined_pivot.applymap(fill_id)
def pivot_one_sfdc(sfdc_file, sfdc_load_key, sfdc_pivot_key, sfdc_pivot_header):
    if sfdc_file is None or not os.path.isfile(sfdc_file) or not os.path.exists(sfdc_file):
        return None
    sfdc_dataframe = pd.read_csv(sfdc_file, index_col=sfdc_load_key)

    # generate pivot file without big deal filtering
    pivoted_file_name = os.path.join(os.path.dirname(sfdc_file),
                                     "Pivot_" + os.path.basename(sfdc_file))
    pivot_dataframe = pd.pivot_table(sfdc_dataframe, index=sfdc_pivot_key,
                                     values=sfdc_pivot_header, aggfunc=np.sum)
    pivot_dataframe.to_csv(pivoted_file_name)

    # generate pivot file with big deal as filtering
    pivoted_file_name = os.path.join(os.path.dirname(sfdc_file),
                                     "Pivot_bigdealfiltering" + os.path.basename(sfdc_file))
    pivot_dataframe = pd.pivot_table(sfdc_dataframe, index=sfdc_pivot_key,
                                     columns=["BIG DEAL"], values=sfdc_pivot_header,
                                     aggfunc=np.sum)
    pivot_dataframe.to_csv(pivoted_file_name)

    # generate pivot file with big deal as index
    pivoted_file_name = os.path.join(os.path.dirname(sfdc_file),
                                     "Pivot_bigdealindexing" + os.path.basename(sfdc_file))
    index_keys = [sfdc_pivot_key, "BIG DEAL"]
    pivot_dataframe = pd.pivot_table(sfdc_dataframe, index=index_keys,
                                     values=sfdc_pivot_header, aggfunc=np.sum)
    pivot_dataframe.to_csv(pivoted_file_name)
    return pivoted_file_name
def blockify_data(self, data, get_var=False, measures=['rt', 'acc']):
    data = data.copy()
    if self.blocksCol not in data.columns:
        data = blockify_trials(data, nblocks=self.nblocks)
    goDF = data[data.response == 1.]
    ssDF = data[data.ttype == 0.]
    tableList = []
    if 'rt' in measures:
        rtTable = pd.pivot_table(goDF, values='rt', columns=self.blocksCol, index='idx')
        tableList.append(rtTable)
    if 'acc' in measures:
        ssTable = pd.pivot_table(ssDF, values='acc', columns=self.blocksCol, index='idx')
        tableList.append(ssTable)
    if 'bound' in measures:
        scoreTable = pd.pivot_table(data, values='bound', columns=self.blocksCol, index='idx')
        tableList.append(scoreTable)
    if 'score' in measures:
        scoreTable = pd.pivot_table(data, values='score', columns=self.blocksCol, index='idx')
        tableList.append(scoreTable)
    if 'vTrial' in measures:
        vTable = pd.pivot_table(data, values='drift', columns=self.blocksCol, index='idx')
        tableList.append(vTable)
    if get_var:
        blockedMeasures = [[table.mean().values, table.sem().values * 1.96]
                           for table in tableList]
        blockedMeasures = list(itertools.chain.from_iterable(blockedMeasures))
    else:
        blockedMeasures = [table.mean().values for table in tableList]
    return blockedMeasures
def main(in_file='avito_train.tsv'):
    # Takes 1min on full training set
    # category, subcategory, attr, is_blocked
    df = pd.read_csv(in_file, sep='\t', usecols=np.array([1, 2, 5, 8]))
    frac_blocked(df)
    # view totals in category/subcategory
    print(pd.pivot_table(df, values='is_blocked', index='subcategory',
                         columns='category', aggfunc='count'))
def understand_data(input_list): for choice in input_list: if choice == 1: print raw_data.head(10) # print pd.unique(raw_data.TripType) print "Count of unique product Upc", int(pd.DataFrame(pd.unique(raw_data.Upc)).count()) print "Count of unique product department descriptions", int( pd.DataFrame(pd.unique(raw_data.DepartmentDescription)).count() ) elif choice == 2: share_of_trip_type = pd.DataFrame( raw_data.groupby(["TripType"], axis=0)["VisitNumber"].count() * 100 / len(raw_data) ) print share_of_trip_type products_departments = pd.DataFrame( raw_data.groupby(["DepartmentDescription"], axis=0)["Upc"].nunique() ) # http://stackoverflow.com/questions/15411158/pandas-countdistinct-equivalent print products_departments elif choice == 3: # http://pandas.pydata.org/pandas-docs/stable/reshaping.html # department_triptype_pivot = pd.pivot_table(raw_data, values='VisitNumber', index='DepartmentDescription', columns='TripType', aggfunc=np.size) # print department_triptype_pivot department_finelinenum_pivot = pd.pivot_table( raw_data, values="VisitNumber", index="FinelineNumber", columns="DepartmentDescription", aggfunc=np.size ) print department_finelinenum_pivot department_weekday_pivot = pd.pivot_table( raw_data, values="VisitNumber", index="DepartmentDescription", columns="Weekday", aggfunc=np.size ) print department_weekday_pivot Weekday_trip_type_pivot = pd.pivot_table( raw_data, values="VisitNumber", index="TripType", columns="Weekday", aggfunc=np.size ) print Weekday_trip_type_pivot elif choice == 10: # http://stackoverflow.com/questions/21654635/scatter-plots-in-pandas-pyplot-how-to-plot-by-category groups = raw_data.groupby("TripType") fig, ax = plt.subplots() for name, group in groups: # print name # print group.DepartmentDescription ax.plot(group.ScanCount, group.Weekday_num, marker="o", linestyle="", ms=5, label=name) ax.legend() plt.show() elif choice == 20: # (pd.DataFrame(pd.unique(raw_data.TripType))).to_csv('Unique_trip_types.csv',sep = ',',index = False) # share_of_trip_type.to_csv('TripType_percentage_share.csv',sep = ',') # products_departments.to_csv('Unique products per department.csv',sep = ',') # department_triptype_pivot.to_csv('DepartmentDescription trip Type visit number frequency pivot table.csv',sep=',') # department_finelinenum_pivot.to_csv('DepartmentDescription finelinenumber visit number frequency pivot table.csv',sep=',') # department_weekday_pivot.to_csv('DepartmentDescription Weekday visit number frequency pivot table.csv',sep=',') Weekday_trip_type_pivot.to_csv("TripType Weekday visit number frequency pivot table.csv", sep=",")
def _count_departments(data_items):
    # create table of trip_id/count_departments
    count_department = pd.pivot_table(data_items, index=['trip_id', 'department_name'])
    count_department.reset_index(drop=False, inplace=True)
    count_department = pd.pivot_table(count_department, index='trip_id',
                                      aggfunc='count')['department_name']
    count_department_df = pd.DataFrame(count_department)
    count_department_df.rename(columns={'department_name': 'department_count'}, inplace=True)
    return count_department_df
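# Hedged aside, not part of the module above: the two-step pivot in
# _count_departments amounts to counting distinct departments per trip,
# which groupby/nunique expresses directly. Toy data only.
import pandas as pd

items = pd.DataFrame({'trip_id': [1, 1, 1, 2],
                      'department_name': ['dairy', 'dairy', 'bakery', 'produce'],
                      'qty': [2, 1, 1, 3]})
dept_counts = (items.groupby('trip_id')['department_name']
                    .nunique()
                    .rename('department_count')
                    .to_frame())
print(dept_counts)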
def analyze_trials(self, resultsDF):
    goDF = resultsDF[resultsDF.response == 1.]
    ssDF = resultsDF[resultsDF.ttype == 0.]
    rtBlocks = pd.pivot_table(goDF, values='rt', columns=self.blocksCol,
                              index='idx').mean().values
    saccBlocks = pd.pivot_table(ssDF, values='acc', columns=self.blocksCol,
                                index='idx').mean().values
    rtErr = np.sum((rtBlocks * 10 - self.rtBlocks * 10)**2)
    saccErr = np.sum((saccBlocks - self.saccBlocks)**2)
    return rtErr + saccErr
def getrecommendedbeer(x, y, k=20, address = "shippingProvince"): #get number of customer groups n = max(x.customerGroup) + 1 #create the output list customer_output = [] #get top beers for each group top_beer = [] for i in range(n): #split data by two groups group1 = x.loc[x.customerGroup == i,['productTitle','totalSales']] group2 = x.loc[x.customerGroup != i,['productTitle','totalSales']] group1 = pd.pivot_table(group1, index = ['productTitle'], values = ['totalSales'], aggfunc = np.sum, fill_value = 0) group2 = pd.pivot_table(group2, index = ['productTitle'], values = ['totalSales'], aggfunc = np.sum, fill_value = 0) #if # of recommended beers is larger than # of beers in that group, append top beers from other groups if group1.shape[0] >= k: beer_list = group1.sort('totalSales',ascending = False)[0:k] beer_recommended = beer_list.index.values.tolist() else: beer_list1 = group1.sort('totalSales',ascending = False) beer_list2 = group2.sort('totalSales',ascending = False)[0:k-group1.shape[0]] beer_recommended1 = beer_list1.index.values.tolist() beer_recommended2 = beer_list2.index.values.tolist() beer_recommended = beer_recommended1 + beer_recommended2 top_beer.append(beer_recommended) #get predicted recommended beers for new customers more = y.get("bestBeers",[]) + y.get("rareBeers",[]) + y.get("discountedBeers",[]) new = {} new["address"] = "new customer" new["customerGroup"] = "none" newx = pd.pivot_table(x,index=['productTitle'],values=['totalSales'],aggfunc=np.sum,fill_value=0) rec = newx.sort('totalSales',ascending = False).index.values.tolist()[0:k] add = [] for item in more: if item not in rec: add.append(item) new["recommendedBeer"] = rec[0:k-len(add)] + add customer_output.append(new) #reshape data x = pd.pivot_table(x, index = [address], values = ['customerGroup'], aggfunc = np.min, fill_value = 0) #get recommended beers for each customer for i in range(x.shape[0]): customer = {} customer[address] = x.index.values.tolist()[i] label = x["customerGroup"][i] customer["customerGroup"] = label #add top beers, rare beers and discounted beers rec = top_beer[label] add = [] for item in more: if item not in rec: add.append(item) #note that the number of more beers <= k customer["recommendedBeer"] = rec[0:k-len(add)] + add customer_output.append(customer) return(customer_output)
def tabulate_data( allele_df, dbh ): buf = [] buf2 = [] buf3 = [] table = pivot_table( allele_df, index='sample_id', columns='marker_id', values='value', aggfunc = lambda x: tuple(x) ) heights = pivot_table( allele_df, index='sample_id', columns='marker_id', values='height', aggfunc = lambda x: tuple(x) ) assay_ids = pivot_table( allele_df, index='sample_id', columns='marker_id', values='assay_id', aggfunc = lambda x: tuple(x) ) buf.append( tuple( ['Sample', 'ID'] + [ dbh.get_marker_by_id(x).code for x in table.columns ] ) ) buf2.append( tuple( ['Sample', 'ID'] + [ dbh.get_marker_by_id(x).code for x in heights.columns ] ) ) buf3.append( tuple( ['Sample', 'ID'] + [ dbh.get_marker_by_id(x).code for x in assay_ids.columns ] ) ) empty = tuple() rows = [ ((dbh.get_sample_by_id(r[0]).code,), (r[0],)) + r[1:] for r in table.itertuples() ] rows.sort() height_rows = [ ((dbh.get_sample_by_id(r[0]).code,), (r[0],)) + r[1:] for r in heights.itertuples() ] height_rows.sort() assayid_rows = [ ((dbh.get_sample_by_id(r[0]).code,), (r[0],)) + r[1:] for r in assay_ids.itertuples() ] assayid_rows.sort() for row in rows: data = [ x if type(x) == tuple else empty for x in row ] for cols in zip_longest( *data, fillvalue='' ): buf.append( cols ) for height_row in height_rows: data = [ x if type(x) == tuple else empty for x in height_row ] for cols in zip_longest( *data, fillvalue='' ): buf2.append( cols ) for assayid_row in assayid_rows: data = [ x if type(x) == tuple else empty for x in assayid_row ] for cols in zip_longest( *data, fillvalue='' ): buf3.append( cols ) return (buf, buf2, buf3)
def calc_averages(mc_t, gc_t):
    pvt_mc = pd.pivot_table(mc_t, index=['TripDate'], values=['Thickness'],
                            aggfunc=np.nanmean).reset_index()
    pvt_gc = pd.pivot_table(gc_t, index=['TripDate'], values=['Thickness'],
                            aggfunc=np.nanmean).reset_index()

    pvt_mc = pvt_mc[(pvt_mc['TripDate'] == '1990-06-10') |
                    (pvt_mc['TripDate'] == '2003-09-20') |
                    (pvt_mc['TripDate'] == '2016-10-01')]
    pvt_gc = pvt_gc[(pvt_gc['TripDate'] == '1990-06-10') |
                    (pvt_gc['TripDate'] == '2003-09-20') |
                    (pvt_gc['TripDate'] == '2016-10-01')]

    pvt_mc = pvt_mc.rename(columns={'Thickness': 'Marble Canyon Average Thickness'})
    pvt_gc = pvt_gc.rename(columns={'Thickness': 'Grand Canyon Average Thickness'})
    merge = pvt_mc.merge(pvt_gc, left_on='TripDate', right_on='TripDate', how='left')
    return merge
def getsimilarbeer(x, beerIds_dict, k = 20, l = 40): #add 1 to k in order to remove the beer itself from the similar beer list later on k = k + 1 #get number of beer groups n = max(x.beerGroup) + 1 #create the output list beer_output = [] #get top beers for each group top_beer = [] for i in range(n): #split data by two groups group1 = x.loc[x.beerGroup == i,['productTitle','totalSales']] group2 = x.loc[x.beerGroup != i,['productTitle','totalSales']] group1 = pd.pivot_table(group1, index = ['productTitle'], values = ['totalSales'], aggfunc = np.sum, fill_value = 0) group2 = pd.pivot_table(group2, index = ['productTitle'], values = ['totalSales'], aggfunc = np.sum, fill_value = 0) #if # of similar beers is larger than # of beers in that group, append top beers from other groups if group1.shape[0] >= k: beer_list = group1.sort('totalSales',ascending = False)[0:k] beer_similar = beer_list.index.values.tolist() else: beer_list1 = group1.sort('totalSales',ascending = False) beer_list2 = group2.sort('totalSales',ascending = False)[0:k-group1.shape[0]] beer_similar1 = beer_list1.index.values.tolist() beer_similar2 = beer_list2.index.values.tolist() beer_similar = beer_similar1 + beer_similar2 top_beer.append(beer_similar) #get predicted similar beers for new beers #when new beers are available, they won't have a group label because they have no sales #so the similar beers of "new beers" will be best-selling beers for now new = {} new["beerId"] = '' new["productTitle"] = "new beer" new["beerGroup"] = "none" newx = pd.pivot_table(x,index=['productTitle'],values=['totalSales'],aggfunc=np.sum,fill_value=0) new["similarBeer"] = newx.sort('totalSales',ascending = False).index.values.tolist()[0:l] beer_output.append(new) #reshape data x = pd.pivot_table(x, index = ['productTitle'], values = ['beerGroup'], aggfunc = np.min, fill_value = 0) #get similar beers for each product for i in range(x.shape[0]): beer_item = {} beer_item["productTitle"] = x.index.values.tolist()[i] beer_item["beerId"] = beerIds_dict.get(beer_item.get("productTitle",''),'') label = x["beerGroup"][i] beer_item["beerGroup"] = label #check whether the beer itself is in the similar beer list if beer_item.get("productTitle") in top_beer[label]: new_top_beer = [item for item in top_beer[label] if item != beer_item.get("productTitle")] beer_item["similarBeer"] = new_top_beer else: beer_item["similarBeer"] = top_beer[label][0:k-1] beer_output.append(beer_item) return(beer_output)
def prepare_data( allele_df ): #buf = io.StringIO() #temp_csv = csv.writer( buf, delimiter='\t' ) buf = [] buf2 = [] buf3 = [] table = pivot_table( allele_df, rows='sample_id', cols='marker_id', values='value', aggfunc = lambda x: tuple(x) ) heights = pivot_table( allele_df, rows='sample_id', cols='marker_id', values='height', aggfunc = lambda x: tuple(x) ) assay_ids = pivot_table( allele_df, rows='sample_id', cols='marker_id', values='assay_id', aggfunc = lambda x: tuple(x) ) buf.append( tuple( ['Sample'] + [ Marker.get(x).code for x in table.columns ] ) ) buf2.append( tuple( ['Sample'] + [ Marker.get(x).code for x in heights.columns ] ) ) buf3.append( tuple( ['Sample'] + [ Marker.get(x).code for x in assay_ids.columns ] ) ) empty = tuple() rows = [ (((Sample.get(r[0]).code, r[0]),),) + r[1:] for r in table.itertuples() ] rows.sort() height_rows = [ (((Sample.get(r[0]).code, r[0]),),) + r[1:] for r in heights.itertuples() ] height_rows.sort() assayid_rows = [ (((Sample.get(r[0]).code, r[0]),),) + r[1:] for r in assay_ids.itertuples() ] assayid_rows.sort() for row in rows: data = [ x if type(x) == tuple else empty for x in row ] for cols in zip_longest( *data, fillvalue='' ): buf.append( cols ) for height_row in height_rows: data = [ x if type(x) == tuple else empty for x in height_row ] for cols in zip_longest( *data, fillvalue='' ): buf2.append( cols ) for assayid_row in assayid_rows: data = [ x if type(x) == tuple else empty for x in assayid_row ] for cols in zip_longest( *data, fillvalue='' ): buf3.append( cols ) return (buf, buf2, buf3)
def main(fh_xls): # fh_xls=sys.argv[1] info=pd.read_excel(fh_xls,'info') info=info.set_index('varname') for var in info.iterrows() : val=info['input'][var[0]] if not pd.isnull(val): exec("%s=info['input']['%s']" % (var[0],var[0]),locals(), globals()) else: exec("%s=info['default']['%s']" % (var[0],var[0]),locals(), globals()) #../tests/test.xlsx # fh_xls='../test/test.xlsx' data_job=pd.read_excel(fh_xls,'JobView') # nd_dh="/media/Transcend/20160219_000356_267" data_fns=pd.pivot_table(data_job,values='File Name',index='Loop_bleach Index',columns='Well Name', aggfunc=lambda x: x.iloc[0]) data_fns_P =pd.pivot_table(data_job,values='File Name',index='TimeLapse1 Index',columns='Well Name', aggfunc=lambda x: x.iloc[0]) data_fns =pd.concat([data_fns,data_fns_P],axis=0) wells=[str(x) for x in list(data_job['Well Name'].unique())] wells.sort() wells_b, wells_u =wells[::2],wells[1::2] pt00s=range(24) for rowi in range(2,16): for coli in range(12): pt00s.append(coli) pt00s_all=[None] * 384 pt00s_all[::2]=pt00s pt00s_all[1::2]=pt00s info_pt00s=pd.DataFrame({'well' : wells, \ 'pt00': pt00s_all}) info_pt00s=info_pt00s.set_index('well') time=np.array(range(12))*2 # data_num=pd.DataFrame(columns=wells) # data_num.loc[:,'time']=time if not exists("%s.data_num_kin" % (fh_xls)): data_num_kin=pd.DataFrame(columns=wells) data_num_kin.loc[:,'time']=time for well in wells: logging.info("processing: %s" % well) nd_fns=data_fns[well].dropna().unique() well_kin=nd2kins(nd_fns,nd_dh,time_increment) if not pd.isnull(info_pt00s.loc[well])[0]: pt00=int(info_pt00s.loc[well]) data_num_kin[well]=well_kin[pt00:pt00+12].values else : data_num_kin[well]=well_kin[0:12].values data_num_kin.to_csv("%s.data_num_kin" % (fh_xls)) else: logging.info("already processed: %s.data_num_kin" % (fh_xls)) data_num_kin=pd.read_csv("%s.data_num_kin" % (fh_xls)) data_num_kin2ana(data_num_kin,wells,wells_b,wells_u)
def pathway_scores_from_variants(variants_df, pathway_df, index_field):
    """
    Similar to `pathway_scores_from_zscores`, but with different subsetting
    logic that makes sense with integer variants per gene.
    """
    x = variants_df.join(pathway_df)
    dfs = []
    dfs.append(index_converter(pd.pivot_table(x, index=index_field, aggfunc=np.sum), 'sum_var'))
    dfs.append(index_converter(pd.pivot_table(x, index=index_field, aggfunc=np.mean), 'mean_var'))
    return pd.concat(dfs)
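# Hedged aside (toy frame; index_converter is external to this snippet): a
# pivot_table call with only `index` and `aggfunc` is essentially the
# per-group aggregate that groupby().agg() returns, so the sum/mean pair
# above could also come from a single groupby.
import pandas as pd

toy = pd.DataFrame({'pathway': ['wnt', 'wnt', 'p53'],
                    'variants': [2, 1, 4]})
print(toy.groupby('pathway')['variants'].agg(sum_var='sum', mean_var='mean'))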
def make_dfs(): """ Creates dataframes for plotting using SQL queries. Returns a dataframe for each positional group (dfRB, dfWR, dfQB). """ con = sqlite3.connect('data/nflPPdb.sqlite') df1 = pd.read_sql_query( 'SELECT combine.name, combine.fortyyd, combine.heightinchestotal,\ combine.weight, combine.twentyss, combine.vertical, combine.year\ FROM combine\ WHERE combine.year < 2009 AND combine.pickround != 0', con) df1['speedscore'] = (df1['weight']*200)/(df1['fortyyd']**4) df2 = pd.read_sql_query( 'SELECT combine.name, combine.year, players.position\ FROM combine, players\ WHERE combine.name = players.name AND combine.year = players.draft_year', con) df3 = pd.merge(df1, df2, on=['name', 'year'], how='inner', suffixes=('df1', 'df2')) df3 = df3.drop_duplicates(subset='name', keep=False) df4 = pd.read_sql_query( 'SELECT DISTINCT combine.name, rr.rushing_yards, rr.receiving_yards\ FROM combine, rr\ WHERE combine.name = rr.name AND combine.year < 2009', con) df4 = pd.pivot_table(df4, index=['name'], aggfunc=np.sum).reset_index().fillna(0) df4['totYds'] = (df4['receiving_yards'].fillna(0.0) + df4['rushing_yards'].fillna(0.0)).astype(int) df5 = pd.merge(df3, df4, on='name', how='inner', suffixes=('df3', 'df4')) dfRB = df5[df5.position == 'RB'] dfRB = dfRB[dfRB.fortyyd < 5] # remove outliers dfWR = df5[df5.position == 'WR'] dfWR = dfWR[dfWR.fortyyd < 5] # remove outliers # Create QB data frame dfQB = pd.read_sql_query( 'SELECT DISTINCT combine.name, combine.fortyyd, combine.heightinchestotal,\ combine.weight, combine.twentyss, combine.vertical, passing.passing_yards\ FROM combine, passing\ WHERE combine.name = passing.name AND combine.year < 2009', con) # use to get 40 yard time back after aggregating dfQB['count'] = 1 dfQB = pd.pivot_table(dfQB, index=['name'], aggfunc=np.sum).reset_index() dfQB['fortyyd'] = dfQB['fortyyd']/dfQB['count'] dfQB['heightinchestotal'] = dfQB['heightinchestotal']/dfQB['count'] dfQB['twentyss'] = dfQB['twentyss']/dfQB['count'] dfQB['vertical'] = dfQB['vertical']/dfQB['count'] dfQB['weight'] = dfQB['weight']/dfQB['count'] dfQB['speedscore'] = (dfQB['weight']*200)/(dfQB['fortyyd']**4) dfQB = dfQB.drop('count', 1) # remove outliers dfQB = dfQB[dfQB.passing_yards > 175] return (dfRB, dfWR, dfQB)
def GetQIE8DataFrame(rel_qie8_path):
    sql_engine_qie8 = create_engine(rel_qie8_path)
    sql_con_qie8 = sql_engine_qie8.raw_connection()
    df_qie8 = pd.read_sql("SELECT * FROM QIE8CalibFNALNormal", con=sql_con_qie8,
                          index_col=['index'])
    df_qie8.drop(['Mode', 'Run'], axis=1, inplace=True)
    df_qie8_offset_pv = pd.pivot_table(df_qie8, index=['QIE8ID', 'Ch'],
                                       columns=['Cap', 'Rng'], values='Offset')
    df_qie8_slope_pv = pd.pivot_table(df_qie8, index=['QIE8ID', 'Ch'],
                                      columns=['Cap', 'Rng'], values='Slope')
    sql_con_qie8.close()
    return df_qie8_offset_pv, df_qie8_slope_pv
def SNP2Gene_breakdown(self,COB=None): ''' Provides a breakdown of SNP to gene mapping parameters for each term in the Overlap. Includes the number of initial Loci, the number of collapsed Loci (within a window) and the number of candidate genes (within a window and up to a flank limit) Parameters ---------- COB : str (default: 'average') If specfified, the results will be composed only of SNP to gene mappings from a single COB network. If 'average' is specified, the results will be the SET of genes across all COB networks. ''' # Get some help def bp_to_kb(bp): return "{}KB".format(int(bp/1000)) def get_level(df,level): ''' Returns the level values by name ''' level_index = df.columns.names.index(level) return df.columns.levels[level_index] # Prepare the data frame results if COB == None: results = self.results else: results = self.results.query('COB=="{}"'.format(COB)) # Total for the Ionome ont = co.GWAS(self.results.Ontology.unique()[0]) ref = co.COB(self.results.COB.unique()[0])._parent_refgen # Make an aggregate term total = co.Term('total',loci=set(chain(* [x.loci for x in ont.terms()]))) # Calculate number of SNPs snps = pd.DataFrame(pd.pivot_table(results,index="Term",values='TermLoci')) snps.columns = pd.MultiIndex.from_product([['GWAS SNPs'],['-'],['-']],names=['Name','WindowSize','FlankLimit']) snps.ix['Total'] = len(total.loci) # Calculate number of Candidate Loci loci = pd.pivot_table(results,index="Term",columns=['WindowSize'],values='TermCollapsedLoci') for window_size in loci.columns: loci.ix['Total',window_size] = len(total.effective_loci(window_size)) loci.columns = pd.MultiIndex.from_product([['Collapsed Loci'],list(map(bp_to_kb,loci.columns)),['-']],names=['Name','WindowSize','FlankLimit']) # Calculate number of Candidate Genes genes = pd.pivot_table(results,index='Term',columns=['WindowSize','FlankLimit'],values='gene',aggfunc=lambda x: len(set(x))) for window_size in get_level(genes,'WindowSize'): for flank_limit in get_level(genes,'FlankLimit'): genes.ix['Total',(window_size,flank_limit)] = len(ref.candidate_genes(total.effective_loci(window_size=window_size),flank_limit=flank_limit)) genes.columns = pd.MultiIndex.from_product( [['Candidate Genes'], list(map(bp_to_kb,get_level(genes,"WindowSize"))), get_level(genes,'FlankLimit') ], names=['Name','WindowSize','FlankLimit'] ) results = snps.join(loci).join(genes) #ionome_eff_loci = [len()] return results
def __UNUSED_prepare_array_like_data(action_hourly): """ これだと、user_idで対応関係が作れないことがわかったので、この関数は使わない scikit-learnのinputに適した形でのデータの作成をする 作成すべきデータは、1時間毎のユーザーログインするか、しないか、の対応関係のデータ 0時のログインデータ(月曜から〜日曜)が訓練で、次の週のある日のログインデータが答え training ndarray: [[user_1_monday, user_1_tuesday, ... user_1_sunday], [user_2_monday, user_2_tuesday, ... user_2_sunday], ... [user_n_monday, user_n_tuesday, ... user_n_sunday]] gold: [login_user_1, login_user_2, ..., login_user_n] :return: Map {key int hour: value tuple train_ans_tuple (train_2d_array, ans_1d_array)} """ dates = list(action_hourly.log_date.unique()) dates_for_train = dates[:-1] date_for_gold = dates[-1] ifelse_func = lambda x: 1 if sum(x) > 7 else 0 # 訓練データの日付けだけを選択 df_for_train = action_hourly[action_hourly['log_date'].str.contains('|'.join(dates_for_train))] df_for_gold = action_hourly[action_hourly['log_date'].str.contains(date_for_gold)] stack_map = {} # 時間ごとに訓練データとゴールドデータの作成 for target_hour in range(0, 24): df_train_for_target_hour = df_for_train[df_for_train.log_hour == target_hour] df_gold_for_target_hour = df_for_gold[df_for_gold.log_hour == target_hour] # 0,1のデータに変換する training_data = pd.pivot_table(df_train_for_target_hour, index='user_id', columns='log_date', values='count', fill_value=0, aggfunc=ifelse_func).reset_index() array_training_data = training_data.drop(u'user_id', axis=1).values gold_data = pd.pivot_table(df_gold_for_target_hour, index='user_id', columns='log_date', values='count', fill_value=0, aggfunc=ifelse_func).reset_index() array_gold_data = gold_data.drop(u'user_id', axis=1).values array_gold_data_1d = np.transpose(array_gold_data)[0, :] # user_idとarray_training_dataとarray_gold_data_1dをmapに記録する stack_map[target_hour] = (user_id, array_training_data, array_gold_data_1d)
def test_pivot_table_with_iterator_values(self):
    # GH 12017
    aggs = {"D": "sum", "E": "mean"}

    pivot_values_list = pd.pivot_table(self.data, index=["A"],
                                       values=list(aggs.keys()), aggfunc=aggs)
    pivot_values_keys = pd.pivot_table(self.data, index=["A"],
                                       values=aggs.keys(), aggfunc=aggs)
    tm.assert_frame_equal(pivot_values_keys, pivot_values_list)

    agg_values_gen = (value for value in aggs.keys())
    pivot_values_gen = pd.pivot_table(self.data, index=["A"],
                                      values=agg_values_gen, aggfunc=aggs)
    tm.assert_frame_equal(pivot_values_gen, pivot_values_list)
def generateGraphData(self): safePrint('Generating and uploading data files') allData = read_table(self.combinedFile, sep='\t', na_filter=False, parse_dates=[0], infer_datetime_format=True) xcsList = [xcs for xcs in allData.xcs.unique() if xcs != 'ERROR' and xcs[0:4] != 'TEST' and xcs != '000-00'] # filter type==DATA and site==wikipedia allData = allData[(allData['xcs'].isin(xcsList)) & (allData['site'] == 'wikipedia')] # By "iszero+via", e.g. a,b,aO,bO,..., where 'a' == zero-rated, 'b' == non-zero-rated, and 'O' == Opera data = DataFrame(pivot_table(allData, 'count', ['date', 'xcs', 'via', 'iszero'], aggfunc=np.sum)) data.reset_index(inplace=True) data['via'] = data.apply(lambda r: ('a' if r['iszero'][:1] == 'y' else 'b') + r['via'][:1], axis=1) data.drop('iszero', axis=1, inplace=True) self.createClippedData('RawData:YearDailyViaIsZero', data) self.createPeriodData('RawData:WeeklyViaIsZero', data, weekly) self.createPeriodData('RawData:MonthlyViaIsZero', data, monthly) allowedSubdomains = ['m', 'zero'] data = allData[(allData.ison == 'y') & (allData.iszero == 'y') & (allData.subdomain.isin(allowedSubdomains))] data = DataFrame(pivot_table(data, 'count', ['date', 'xcs', 'subdomain'], aggfunc=np.sum)) data.reset_index(inplace=True) self.createClippedData('RawData:YearDailySubdomains', data) self.createPeriodData('RawData:WeeklySubdomains', data, weekly) self.createPeriodData('RawData:MonthlySubdomains', data, monthly) # create an artificial yes/no/opera sums opera = allData[(allData.via == 'OPERA') & (allData.iszero == 'y')] opera['str'] = 'o' yes = allData[allData.iszero == 'y'] yes['str'] = 'y' no = allData[allData.iszero == 'n'] no['str'] = 'n' combined = opera.append(yes).append(no) data = DataFrame(pivot_table(combined, 'count', ['date', 'xcs', 'str'], aggfunc=np.sum)) data.reset_index(inplace=True) headerFields = 'date,xcs,iszero,count' # Override "str" as "iszero" self.createClippedData('RawData:YearDailyTotals', data, headerFields) self.createPeriodData('RawData:MonthlyTotals', data, monthly, headerFields) data = [] for xcsId in list(allData.xcs.unique()): byLang = pivot_table(allData[allData.xcs == xcsId], 'count', ['lang'], aggfunc=np.sum) \ .order('count', ascending=False) top = byLang.head(5) vals = list(top.iteritems()) vals.append(('other', byLang.sum() - top.sum())) valsTotal = sum([v[1] for v in vals]) / 100.0 data.extend(['%s,%s,%.1f' % (l, xcsId, c / valsTotal) for l, c in vals]) self.saveWikiPage('RawData:LangPercent', data, 'lang,xcs,count')
def area_data(df, section=None):
    if section is not None:
        df = df.query(section)
    tmp_pvt = pd.pivot_table(df, values=['Area_2D'], index=['TripDate'], aggfunc=np.std)
    tmp_pvt = tmp_pvt.rename(columns={'Area_2D': 'std_dev'})
    tmp_count = pd.pivot_table(df, values=['Area_2D'], index=['TripDate'], aggfunc='count')
    tmp_count = tmp_count.rename(columns={'Area_2D': 'count'})
    tmp_pvt['std_error'] = tmp_pvt['std_dev'] / np.sqrt(tmp_count['count'])
    yerr = tmp_pvt[['std_error']]
    tmp_pvt = pd.pivot_table(df, values=['Area_2D'], index=['TripDate'], aggfunc=np.average)
    tmp_pvt['y_err'] = yerr
    return tmp_pvt
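# Hedged aside (toy frame): the three pivots in area_data can be collapsed
# into one call by passing a list of aggregation functions; the standard
# error then falls out of the resulting (agg, column) MultiIndex.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'TripDate': ['2016-10-01', '2016-10-01', '2003-09-20'],
                    'Area_2D': [10.0, 12.0, 8.0]})
agg = pd.pivot_table(toy, values='Area_2D', index='TripDate',
                     aggfunc=['mean', 'std', 'count'])
agg[('sem', 'Area_2D')] = agg[('std', 'Area_2D')] / np.sqrt(agg[('count', 'Area_2D')])
print(agg)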
def fit_decay_time(track_dfs, plot=True, savename=None): def expon_func(x, a, b, c): return a * np.exp(-b * x) + c # assemble all dataframes into one big df. full_data = [df for bid, df in track_dfs.items()] fdf = pd.concat(full_data, axis=0) fdf['m'] = np.array(fdf['minutes'], dtype=int) avg = pd.pivot_table(fdf, values='bl / s', cols='m', aggfunc=np.mean) counts = pd.pivot_table(fdf, values='bl / s', cols='m', aggfunc=len) x = np.array(avg.index) + 0.5 y = np.array(avg) params, pcov = curve_fit(expon_func, x, y) if plot: fig = plt.figure(figsize=(10,5)) ax = plt.subplot(axisbg='white') #ax.plot(fdf['minutes'], fdf['bl / s'], '.', alpha=0.1, label="Origional Data") ax.plot(x, y, 'o', label="Binned Data", color='steelblue', alpha=0.5) ax.plot(x, expon_func(x, *params), '-', color='k', label="Fitted Curve") ax.set_ylim([0, 0.2]) ax.legend(fontsize=15) ax.set_xlabel('time (min)', size=20) ax.set_ylabel('speed (bl / s)', size=20) plt.yticks(fontsize=15) plt.xticks(fontsize=15) if savename is not None: plt.savefig(savename) plt.show() fig = plt.figure(figsize=(10,3)) ax = plt.subplot(axisbg='white') #fig, ax = plt.subplots(figsize=(10,3)) x = np.array(counts.index) y = np.array(counts) ax.plot(x, y, 'k.-', label="Binned Data") ax.legend(loc='best', fontsize=15) ax.set_xlabel('time (min)', size=20) ax.set_ylabel('counts', size=20) ax.set_ylim([0, max(y)*1.1]) plt.yticks(fontsize=15) plt.xticks(fontsize=15) plt.show() return fdf, params, pcov
def get_hourly_data(df, values):
    df_c = df.copy()
    df_c["day"] = df_c.index.dayofyear
    df_c["hour"] = df_c.index.hour
    return pd.pivot_table(df_c, index=["hour"], columns=["day"], values=values)
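# Hedged usage sketch for get_hourly_data: the function assumes a
# DatetimeIndex, so build one from a toy hourly series.
import numpy as np
import pandas as pd

idx = pd.date_range('2021-01-01', periods=48, freq='H')
toy = pd.DataFrame({'temp': np.arange(48, dtype=float)}, index=idx)
hourly = get_hourly_data(toy, values='temp')  # 24 rows (hours) x 2 columns (days 1 and 2)
print(hourly.shape)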
def merge_pgr_data(): with open("data/source/mapas-data-concentrado.xlsx", "rb") as f: dfs = pd.read_excel(f, sheet_name=None) df = dfs['00 PGR'] pivot = pd.pivot_table(df, index=["state_code", "municipio_code", "year"], fill_value=0, aggfunc=np.sum, values=PROPS) pivot_dict = pivot.reset_index().to_dict("records") lookup = {} for row in pivot_dict: if not lookup.get(row['state_code']): lookup[row['state_code']] = {} if not lookup[row['state_code']].get(row['municipio_code']): lookup[row['state_code']][row['municipio_code']] = {} lookup[row['state_code']][row['municipio_code']][int( row['year'])] = dict( (prop, int(row.get(prop, 0))) for prop in PROPS) centers = { 'type': 'FeatureCollection', 'name': 'municipalescentroids', 'features': [], } with codecs.open('data/source-geojson/municipales.json', encoding='utf-8', errors="replace") as f: data = json.load(f) for feature in data['features']: totals = dict((prop, 0) for prop in PROPS) state_code = int(feature['properties']['CVE_ENT']) mun_code = int(feature['properties']['CVE_MUN']) try: mun_data = lookup[state_code].get(mun_code) except KeyError: continue cumulative_fosas_data = [] for year in range(2006, 2017): cumulative_dict = {'year': year} for prop in PROPS: if mun_data and mun_data.get(year) and mun_data[year].get( prop, 0) > 0: feature['properties'][prop + '_' + str(year)] = int( mun_data[year].get(prop)) totals[prop] += int(mun_data[year].get(prop)) else: feature['properties'][prop + '_' + str(year)] = 0 if len(cumulative_fosas_data): cumulative_dict[prop] = cumulative_fosas_data[-1].get( prop) + feature['properties'][prop + '_' + str(year)] feature['properties'][ prop + '_cumulative_' + str(year)] = cumulative_fosas_data[-1].get( prop) + feature['properties'][prop + '_' + str(year)] else: cumulative_dict[prop] = feature['properties'][ prop + '_' + str(year)] feature['properties'][ prop + '_cumulative_' + str(year)] = feature['properties'][prop + '_' + str(year)] cumulative_fosas_data.append(cumulative_dict) for prop in PROPS: feature['properties'][prop + '_total'] = totals[prop] # if totals[prop] > maxes[prop]: # maxes[prop] = totals[prop] make_centroid = False for prop in PROPS: if feature['properties'][prop + '_total'] > 0: make_centroid = True break if make_centroid: shp = shape(feature['geometry']) feature_centroid = mapping(shp.representative_point()) center_feature = deepcopy(feature) center_feature['geometry'] = feature_centroid centers['features'].append(center_feature) # Just adding to the ugliness. Only centroids have data... feature['properties'] = { 'CVE_ENT': int(feature['properties']['CVE_ENT']) } with open('data/processed-geojson/pgr-centroids.json', 'w') as f: json.dump(centers, f)
def merge_municipality_data(): lookup = {} with open("data/source/mapas-data-concentrado.xlsx", "rb") as f: dfs = pd.read_excel(f, sheet_name=None) for sheetname, df in dfs.items(): try: state_code, state_name = sheetname.split(' ', 1) state_code = int(state_code) except ValueError: print("%s is not a valid sheet name" % sheetname, file=sys.stderr) continue pivot = pd.pivot_table(df, index=["municipio_code", "year"], fill_value=0, aggfunc=np.sum, values=PROPS) pivot_dict = pivot.reset_index().to_dict("records") final_dict = {} for row in pivot_dict: if not final_dict.get(row['municipio_code']): final_dict[row['municipio_code']] = {'state_name': state_name} try: final_dict[row['municipio_code']][int(row['year'])] = dict( (prop, int(row.get(prop, 0))) for prop in PROPS) except ValueError: print(sheetname) lookup[state_code] = final_dict maxes = {prop: 0 for prop in PROPS} centers = { 'type': 'FeatureCollection', 'name': 'municipalescentroids', 'features': [], } with codecs.open('data/source-geojson/municipales.json', encoding='utf-8', errors="replace") as f: data = json.load(f) for feature in data['features']: totals = dict((prop, 0) for prop in PROPS) state_code = int(feature['properties']['CVE_ENT']) mun_code = int(feature['properties']['CVE_MUN']) mun_data = lookup[state_code].get(mun_code) cumulative_fosas_data = [] for year in range(2006, 2017): cumulative_dict = {'year': year} if mun_data: feature['properties']['state_name'] = mun_data[ 'state_name'] for prop in PROPS: if mun_data and mun_data.get(year) and mun_data[year].get( prop, 0) > 0: feature['properties'][prop + '_' + str(year)] = int( mun_data[year].get(prop)) totals[prop] += int(mun_data[year].get(prop)) # elif mun_data and mun_data.get(year) and mun_data[year].get(prop) == 0: # feature['properties'][prop + '_' + str(year)] = 0 else: feature['properties'][prop + '_' + str(year)] = 0 if len(cumulative_fosas_data): cumulative_dict[prop] = cumulative_fosas_data[-1].get( prop) + feature['properties'][prop + '_' + str(year)] feature['properties'][ prop + '_cumulative_' + str(year)] = cumulative_fosas_data[-1].get( prop) + feature['properties'][prop + '_' + str(year)] else: cumulative_dict[prop] = feature['properties'][ prop + '_' + str(year)] feature['properties'][ prop + '_cumulative_' + str(year)] = feature['properties'][prop + '_' + str(year)] cumulative_fosas_data.append(cumulative_dict) for prop in PROPS: feature['properties'][prop + '_total'] = totals[prop] if totals[prop] > maxes[prop]: maxes[prop] = totals[prop] make_centroid = False for prop in PROPS: if feature['properties'][prop + '_total'] > 0: make_centroid = True break if make_centroid: shp = shape(feature['geometry']) feature_centroid = mapping(shp.representative_point()) center_feature = deepcopy(feature) center_feature['geometry'] = feature_centroid centers['features'].append(center_feature) # Just adding to the ugliness. Only centroids have data... feature['properties'] = { 'CVE_ENT': int(feature['properties']['CVE_ENT']) } with open('data/processed-geojson/municipales.json', 'w') as f: json.dump(data, f) with open('data/processed-geojson/municipales-centroids.json', 'w') as f: json.dump(centers, f)
# Get the handles and labels. For this example it'll be 2 tuples # of length 4 each. handles, labels = ax.get_legend_handles_labels() # When creating the legend, only use the first two elements # to effectively remove the last two. # l = ax.legend(handles[0:2], labels[0:2], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) loc = 'best' if metric == 'pearson_r2_score': loc = 'lower right' l = ax.legend(handles[0:3], split_names, frameon=False, loc=loc) ax.set_xlabel('') ax.set_ylabel(metric_name) ax.set_title( # f'ACNN performance on PDBbind {version} core subset\ncontaining {component_name} ' f'Performance on PDBbind {version} Core Set\n' f'Dataset: PDBbind ({component_name})') ax.set_yticks(np.linspace(0, 1, 11)) ax.set_ylim([0.5, 0.9]) fig.savefig(root / f"{version}.{component}.{metric}.png", dpi=300) #%% d = df.loc[df['set'] == 'test'] d = d.loc[d['metric'] == 'pearson_r2_score'] pt = pd.pivot_table(d, index=['version', 'split', 'component'], columns=['subset'], values=['value'], aggfunc=[np.mean, np.std]) pt.to_csv(root / 'pivot_table.csv') pt #%%
encoder = OneHotEncoder().fit(train[feaures_list]) train = pd.concat([ train, pd.DataFrame(encoder.transform(train[feaures_list]).toarray(), index=train.index, columns=encoder.get_feature_names(feaures_list)) ], axis=1) train.drop(feaures_list, axis=1, inplace=True) columns = ['Product_ID', 'Product_Category_1'] for column in columns: top_100 = data[column].value_counts().index[:100] user_purchase = pd.pivot_table(data[['User_ID', column, 'Purchase']], values='Purchase', index='User_ID', columns=column, aggfunc=np.sum).fillna(0)[top_100] train = train.join(user_purchase) train = train.join(data[['User_ID', 'Purchase']].groupby('User_ID').agg('sum')) train_scaled = StandardScaler().fit_transform(train) k_values = np.arange(1, 11) models = [] dists = [] for k in k_values: model = KMeans(k).fit(train_scaled) models.append(model) dists.append(model.inertia_)
y.drop(y.columns[1], axis=1, inplace=True)
y = y.set_index("Year")
z = y / x
z.rename(columns={'Number of Cases Solved': 'Ratio'}, inplace=True)
dataset_cases = dataset_cases.join(z)
dataset_cases = dataset_cases.reset_index()
dataset_cases = dataset_cases.set_index(['Year', 'Ratio', 'Arrest'])
dataset_cases

# In[136]:

plt.figure(figsize=(30, 15))
heatmap1_data = pd.pivot_table(dataset_cases, values="Number of Cases Solved",
                               index=['Arrest'], columns='Year')
plt.title("Solved and Unsolved Arrests per Year")
cases_heatmap = sns.heatmap(heatmap1_data, annot=True, cmap="coolwarm", center=2000)
cases_heatmap
plt.show()

# In[137]:

plt.figure(figsize=(20, 15))
sns.lineplot(data=dataset_cases,
             x=dataset_cases.index.get_level_values('Year'),
             y=dataset_cases.index.get_level_values('Ratio'))
2. Month: month
3. Day: day
4. PM: PM2.5 value
'''
import pandas as pd
import matplotlib.pyplot as plt

filename_path = 'data/data_pd/pm2.csv'
data_df = pd.read_csv(filename_path)

print('Basic information about the data:')
print(data_df.head())
print(data_df.info())
print(data_df.describe())
print(data_df.shape)

year_month_data = data_df.groupby(by=['Year', 'Month'])['PM'].mean()
pivot_results = pd.pivot_table(data_df, index='Year', columns='Month',
                               values=['PM'], aggfunc='mean')
pivot_results.plot(kind='bar')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
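# Hedged aside (toy frame standing in for pm2.csv): the pivot above is
# essentially the unstacked form of the Year/Month groupby mean computed just
# before it (modulo the extra 'PM' column level from values=['PM']).
import pandas as pd

toy = pd.DataFrame({'Year': [2010, 2010, 2011],
                    'Month': [1, 2, 1],
                    'PM': [50.0, 70.0, 60.0]})
print(toy.groupby(['Year', 'Month'])['PM'].mean().unstack('Month'))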
        '/Users/zhouxiaocong/Documents/pythoncode/text/duoqingjianke/多情剑客无情剑.txt',
        'r'):
    adj['content'].append(line)
    words = pseg.cut(line)
    adj_cnt = 0
    cnt = 0
    for word, flag in words:
        if flag in ['Ag', 'a', 'ad', 'an']:
            adj_cnt += 1
        cnt += 1
    adj['cnt'].append(adj_cnt / cnt)

adj2 = pd.DataFrame(adj)
adj2 = adj2.sort_values(by='cnt', axis=0, ascending=False)
print(adj2[:5])

c = pd.pivot_table(adj2, index=['content'], values=['cnt'], aggfunc=np.sum)
c.to_csv(
    '/Users/zhouxiaocong/Documents/pythoncode/text/duoqingjianke/duoqingjianke.csv',
    mode='w', encoding='utf8')

# load the background image
cloud_mask = np.array(
    Image.open(
        "/Users/zhouxiaocong/Documents/pythoncode/text/duoqingjianke/lxh.jpg"))
# words to suppress in the display
# st = set(["东西", "这是"])
# create the WordCloud object
wc = WordCloud(
    background_color="white",
    mask=cloud_mask,
'winnerset4', 'winnerset5', 'loserset1', 'loserset2', 'loserset3', 'loserset4', 'loserset5' ] mathch_melt = pd.melt(mathch, id_vars=match_column, value_vars=['winner1id', 'loser1id'], var_name='type', value_name='palyer_id') mathch_melt = mathch_melt.sort_values( ['palyer_id', 'resultdate'], ascending=False).reset_index(drop=True) mathch_melt = mathch_melt.groupby(['palyer_id']).head(30) player = mathch_melt['palyer_id'].drop_duplicates() mathch_pivot = pd.pivot_table(mathch_melt, index=match_column, columns='type', fill_value=0).reset_index() match_column.extend(['loser1id', 'winner1id']) mathch_pivot.columns = match_column mathch_pivot[['loser1id', 'winner1id']] = mathch_pivot[['loser1id', 'winner1id']].astype("int64") data_match = mathch_pivot.copy() data_match = data_match.sort_values('resultid') player_score, data_match = match_player_rank(data_match) player_score = player_score.sort_values( 'score', ascending=False).reset_index(drop=True) player_score['rank'] = player_score.index + 1 player_score[['rank', 'playerid']].to_csv('playe_rank.csv', index=False)
    fillstyle='none',
    label='50 percentile')
plt.plot(
    guide_date,
    weight_guide.P25.values,
    'ro:',
    fillstyle='none',
    label='25 percentile')
plt.legend(loc='upper left')

# ----------- plot daily feeding quantity --------------------
ax3 = fig.add_subplot(gs[2], sharex=ax2)
by_date = pd.pivot_table(
    df1,
    values='Vol',
    index=df1.index.date,
    columns=['BM'],
    aggfunc=np.sum,
    fill_value=0)
# Pandas bar plot doesn't work out of box, see link below
# https://stackoverflow.com/questions/49269927/missing-bars-in-matplotlib-bar-chart
rects_bm = plt.bar(
    by_date.index,
    by_date[True].values,
    bottom=by_date[False].values,
    # width=0.3,
    label='breast milk',
    color='g')
rects_f = plt.bar(
    by_date.index,
    by_date[False].values,
    label='formula',
    color='r')
utils.autolabel(rects_bm, rects_f)
today = datetime.today() df['PassD'] = df['Issue Resolution Date'] < today Passdue = {True: 'Over due' , False: 'Not due'} df['Pass Due?'] = df['PassD'].map(Passdue) # calc date 12 month backward ---------------------------------------------------xxx df['datetoday'] = today N = 365 date_N_days_ago = today - timedelta(days=N) # get year/month form date object df['y.notificationdate'] = pd.DatetimeIndex(df['Notification Date']).year df['y.issueindentifiedate'] = pd.DatetimeIndex(df['Date Issue Identified']).year # pivoting df1 = pd.pivot_table(df , values='Issue ID' , index=['Issue Status' , 'Pass Due?' , 'Classification'] , aggfunc=pd.Series.nunique , ) # pd.Series.nunique df2 = df1.xs(('Open') , level=0) df3 = pd.pivot_table(df , values='Issue ID' , index=['Division' , 'Organisation Level'] , columns='Source' , aggfunc=pd.Series.nunique , ) # pd.Series.nunique df4 = pd.pivot_table(df , values='Issue ID' , index=['Division' , 'Organisation Level'] , columns='Classification' , aggfunc=pd.Series.nunique , ) # pd.Series.nunique # df2 = pd.pivot_table(df, values='Issue ID', index='y.notificationdate', columns=['Pass Due?', 'Classification'], # aggfunc=pd.Series.nunique, ) # pd.Series.nunique # df0 = pd.pivot_table(df, values='Issue ID', index=['Source', 'Issue Status', 'Pass Due?'], columns='Classification', # aggfunc=pd.Series.nunique, ) # pd.Series.nunique # df3 = pd.pivot_table(df, values='Issue ID', index=['Issue Status', 'Pass Due?','Classification'], # aggfunc=pd.Series.nunique, ) # pd.Series.nunique # df4 = pd.pivot_table(df, values='Issue ID', index='y.notificationdate', columns=['Pass Due?', 'Classification'], # aggfunc=pd.Series.nunique, ) # pd.Series.nunique
def user_id_feture(data, end_time, beforeoneday): # data = Data[(Data['daystime']<LabelDay) & (Data['daystime']>LabelDay-datetime.timedelta(days=FEATURE_EXTRACTION_SLOT))] user_count = pd.crosstab(data.user_id, data.behavior_type) user_count_before5 = None if (((end_time - datetime.timedelta(days=5)) < datetime.datetime( 2014, 12, 13, 0, 0, 0)) & ((end_time - datetime.timedelta(days=5)) > datetime.datetime( 2014, 12, 10, 0, 0, 0))): beforefiveday = data[data['daystime'] >= end_time - datetime.timedelta(days=5 + 2)] user_count_before5 = pd.crosstab(beforefiveday.user_id, beforefiveday.behavior_type) else: beforefiveday = data[data['daystime'] >= end_time - datetime.timedelta(days=5)] user_count_before5 = pd.crosstab(beforefiveday.user_id, beforefiveday.behavior_type) user_count_before_3 = None if (((end_time - datetime.timedelta(days=5)) < datetime.datetime( 2014, 12, 13, 0, 0, 0)) & ((end_time - datetime.timedelta(days=5)) > datetime.datetime( 2014, 12, 10, 0, 0, 0))): beforethreeday = data[data['daystime'] >= end_time - datetime.timedelta(days=3 + 2)] user_count_before_3 = pd.crosstab(beforethreeday.user_id, beforethreeday.behavior_type) else: beforethreeday = data[data['daystime'] >= end_time - datetime.timedelta(days=3)] user_count_before_3 = pd.crosstab(beforethreeday.user_id, beforethreeday.behavior_type) user_count_before_2 = None if (((end_time - datetime.timedelta(days=5)) < datetime.datetime( 2014, 12, 13, 0, 0, 0)) & ((end_time - datetime.timedelta(days=5)) > datetime.datetime( 2014, 12, 10, 0, 0, 0))): beforethreeday = data[data['daystime'] >= end_time - datetime.timedelta(days=7 + 2)] user_count_before_2 = pd.crosstab(beforethreeday.user_id, beforethreeday.behavior_type) else: beforethreeday = data[data['daystime'] >= end_time - datetime.timedelta(days=7)] user_count_before_2 = pd.crosstab(beforethreeday.user_id, beforethreeday.behavior_type) # beforeoneday = Data[Data['daystime'] == LabelDay-datetime.timedelta(days=1)] beforeonedayuser_count = pd.crosstab(beforeoneday.user_id, beforeoneday.behavior_type) countAverage = user_count / FEATURE_EXTRACTION_SLOT buyRate = pd.DataFrame() buyRate['click'] = user_count[1] / user_count[4] buyRate['skim'] = user_count[2] / user_count[4] buyRate['collect'] = user_count[3] / user_count[4] buyRate.index = user_count.index buyRate_2 = pd.DataFrame() buyRate_2['click'] = user_count_before5[1] / user_count_before5[4] buyRate_2['skim'] = user_count_before5[2] / user_count_before5[4] buyRate_2['collect'] = user_count_before5[3] / user_count_before5[4] buyRate_2.index = user_count_before5.index buyRate_3 = pd.DataFrame() buyRate_3['click'] = user_count_before_3[1] / user_count_before_3[4] buyRate_3['skim'] = user_count_before_3[2] / user_count_before_3[4] buyRate_3['collect'] = user_count_before_3[3] / user_count_before_3[4] buyRate_3.index = user_count_before_3.index buyRate = buyRate.replace([np.inf, -np.inf], 0) buyRate_2 = buyRate_2.replace([np.inf, -np.inf], 0) buyRate_3 = buyRate_3.replace([np.inf, -np.inf], 0) long_online = pd.pivot_table(beforeoneday, index=['user_id'], values=['hours'], aggfunc=[np.min, np.max, np.ptp]) user_id_feture = pd.merge(user_count, beforeonedayuser_count, how='left', right_index=True, left_index=True) user_id_feture = pd.merge(user_id_feture, countAverage, how='left', right_index=True, left_index=True) user_id_feture = pd.merge(user_id_feture, buyRate, how='left', right_index=True, left_index=True) user_id_feture = pd.merge(user_id_feture, user_count_before5, how='left', right_index=True, left_index=True) 
user_id_feture = pd.merge(user_id_feture, user_count_before_3, how='left', right_index=True, left_index=True) user_id_feture = pd.merge(user_id_feture, user_count_before_2, how='left', right_index=True, left_index=True) user_id_feture = pd.merge(user_id_feture, long_online, how='left', right_index=True, left_index=True) # user_id_feture = pd.merge(user_id_feture,buyRate_2,how='left',right_index=True,left_index=True) # user_id_feture = pd.merge(user_id_feture,buyRate_3,how='left',right_index=True,left_index=True) user_id_feture.fillna(0, inplace=True) return user_id_feture
import pandas as pd
import numpy as np
import os
import webbrowser

df_critics = pd.read_json("../output/albums.json")

# TODO: Create unique id's
# need to differentiate based on URL -- we have too much overlap from albums with similar names
ratings_df = pd.pivot_table(df_critics, index="publication name", columns="album id")

html = ratings_df.to_html(na_rep="")
with open("review_matrix.html", "w") as f:
    f.write(html)
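# Hedged aside (toy frame; the real albums.json schema isn't shown): when
# pivot_table is given no `values` argument, every remaining numeric column
# becomes the outer level of the resulting column MultiIndex.
import pandas as pd

toy = pd.DataFrame({'publication name': ['P1', 'P1', 'P2'],
                    'album id': ['a', 'b', 'a'],
                    'rating': [7.0, 8.5, 6.0]})
print(pd.pivot_table(toy, index='publication name', columns='album id'))
# columns come out as ('rating', 'a'), ('rating', 'b')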
import matplotlib.pyplot as plt

dfx = pd.DataFrame({
    'nama': ['a', 'b', 'a', 'b', 'a', 'c'],
    'class': ['x', 'x', 'x', 'x', 'x', 'x'],
    'type': ['y', 'z', 'y', 'z', 'y', 'z'],
    'nilai': ['1', '1', '1', '1', '2', '2']
})
dfy = pd.DataFrame({'nama': ['a', 'b', 'c']})

# print(dfx.nilai.unique())
listNilai = pd.DataFrame({'nilai': dfx.nilai.unique()})
# print(listNilai)
dfNamaNilai = pd.merge(listNilai, dfx, on='nilai')

dfPrint = pd.pivot_table(dfx, values='nilai', index=['nama'],
                         columns=['class', 'type'], aggfunc='max')
print(dfPrint.index[2])
print(list(dfPrint))
print(dfPrint.iloc[2])
# print(dfPrint)
print('-----------------')

nilaiList = []
nilaiList.append([None if x != x else int(x) for x in dfPrint[('x', 'z')]])
print(nilaiList)

listNow = [1, 2, 3, 99, 70]
print(listNow[len(listNow) - 1])
'''
The task is to reshape the DataFrame using the pivot function and plot it on a
heatmap, just like explained here. The year column of the dataset will be set
as columns, the month will be set as row indexes, and the passengers will be
the values. After reshaping, use the heatmap function of seaborn to plot the
visualization.
'''

#### Heatmap ###############
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = sns.load_dataset('flights')  # Reading flights dataset from seaborn package

# Reshaping the DataFrame
df = pd.pivot_table(df, values='passengers', index=['month'], columns='year')

# Plotting the heatmap
sns.heatmap(
    df, annot=True, fmt=''
)  # The fmt parameter keeps the numbers on the heatmap concise for a better view and understanding.
plt.show()
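# The docstring above mentions the pivot function; since every (month, year)
# pair occurs exactly once in the flights dataset, DataFrame.pivot would give
# the same reshaped table without any aggregation. A minimal equivalent sketch
# (illustrative variable names):
flights = sns.load_dataset('flights')
df_pivot = flights.pivot(index='month', columns='year', values='passengers')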
def recomendaciones(usuario, elecciones): users = pd.read_csv("users.csv") images = pd.read_csv("images.csv") print(usuarios_test(pd.read_csv("election.csv"), users)) clasificacion = pd.read_csv("election.csv") print(usuarios_test(clasificacion, users)) n_users = clasificacion.idUser.unique().shape[0] n_images = clasificacion.IdImage.unique().shape[0] print(str(n_users) + ' users') print(str(n_images) + ' images') #plt.hist(clasificacion.election,bins=8) #plt.show() print("prueba") print(clasificacion.groupby(["election"])["idUser"].count()) print("prueba") matrix = pd.pivot_table(clasificacion, values='election', index='idUser', columns='IdImage').fillna(0) print(matrix) ratings = matrix.values sparsity = float(len(ratings.nonzero()[0])) sparsity /= (ratings.shape[0] * ratings.shape[1]) sparsity *= 100 print('Sparsity: {:4.2f}%'.format(sparsity)) ratings_train, ratings_test = train_test_split(ratings, test_size=0.1, random_state=42) print(ratings_train.shape) print(ratings_train.shape[0]) print(ratings_test.shape) sim_matrix = 1 - sklearn.metrics.pairwise.cosine_distances(ratings) print(sim_matrix.shape) #plt.imshow(sim_matrix); #plt.colorbar() #plt.show() #separar las filas y columnas de train y test tam_sim_matrix_test = ratings_train.shape[0] + ratings_test.shape[0] sim_matrix_train = sim_matrix[0:ratings_train.shape[0], 0:ratings_train.shape[0]] sim_matrix_test = sim_matrix[ratings_train.shape[0]:tam_sim_matrix_test, ratings_train.shape[0]:tam_sim_matrix_test] print("MATRICES DE SIMUILITUD") print(sim_matrix_train.shape) print(sim_matrix_test.shape) users_predictions = sim_matrix_train.dot(ratings_train) / np.array( [np.abs(sim_matrix_train).sum(axis=1)]).T print("PREDICCIONES") print(users_predictions.shape) #plt.rcParams['figure.figsize'] = (20.0, 5.0) #plt.imshow(users_predictions) #plt.colorbar() #plt.show() imagen = 0 puntaje = 0 user = usuario data = users[users['user'] == user] usuario_ver = data.iloc[0][ 'idUser'] - 1 # resta 1 para obtener el index de pandas. print("USUARIO" + repr(usuario_ver)) user0 = users_predictions.argsort()[usuario_ver] print(user0) # Veamos los tres recomendados con mayor puntaje en la predic para este usuario for i, aImage in enumerate(user0[-10:]): selImage = images[images['idImage'] == (aImage + 1)] print("ANTES" + repr(imagen)) imagen = selImage.iloc[0]['idImage'] print("DESPUES" + repr(imagen)) puntaje = users_predictions[usuario_ver][aImage] if puntaje >= 0.51: elecciones['election'].append({ 'imagen': repr(imagen), 'puntaje': repr(puntaje) }) with open('election.json', 'w') as file: json.dump(elecciones, file, indent=4) return "exito"
print("n_features: ", X.shape[1]) from sklearn.model_selection import GridSearchCV from sklearn.ensemble import RandomForestClassifier param_grid = {'max_depth': [3, 5, 8, 10, 15, 20, 30], 'max_features': [4, 8, 16, 20, 25, 40]} grid = GridSearchCV(RandomForestClassifier(), param_grid=param_grid) # use [::10] to subsample by a factor of 10 for impatience # could also have used StratifiedShuffleSplit(train_size=.1) grid.fit(X_train[::10], y_train[::10]) res = pd.DataFrame(grid.cv_results_) print(res.keys()) res_piv = pd.pivot_table( res, values='mean_test_score', index='param_max_depth', columns='param_max_features') display(res_piv) import matplotlib.pyplot as plt %matplotlib inline plt.matshow(res_piv.values) plt.xlabel(res_piv.columns.name) plt.xticks(range(res_piv.shape[1]), res_piv.columns) plt.ylabel(res_piv.index.name) plt.yticks(range(res_piv.shape[0]), res_piv.index) plt.colorbar()
print(banks.isnull().sum())
bank_mode = banks.mode()
#print(bank_mode)
print(banks.columns.values)
cols = banks.columns.values
banks[cols] = banks[cols].fillna(bank_mode.iloc[0])
print(banks.isnull().sum())
#code ends here

# --------------
# Code starts here
avg_loan_amount = pd.pivot_table(banks,
                                 index=['Gender', 'Married', 'Self_Employed'],
                                 values='LoanAmount', aggfunc='mean')
print(avg_loan_amount)

# Another way to achieve this (named aggregation; the older dict-renaming form
# was removed in pandas 1.0)
temp_var = banks.groupby(['Gender', 'Married', 'Self_Employed'])['LoanAmount'].agg(LoanAmount='mean')
#print(temp_var)
# code ends here

# --------------
# code starts here
loan_approved_se = banks[(banks['Self_Employed'] == 'Yes')
                         & (banks['Loan_Status'] == 'Y')].shape[0]
loan_approved_nse = banks[(banks['Self_Employed'] == 'No')
                          & (banks['Loan_Status'] == 'Y')].shape[0]
# Cross-validated performance distribution
facet_grid = sns.factorplot(x='l1_ratio', y='score', col='alpha',
                            data=cv_score_df, kind='violin', size=4, aspect=1)
facet_grid.set_ylabels('AUROC')

# In[23]:

# Cross-validated performance heatmap
cv_score_mat = pd.pivot_table(cv_score_df, values='score',
                              index='l1_ratio', columns='alpha')
ax = sns.heatmap(cv_score_mat, annot=True, fmt='.1%')
ax.set_xlabel('Regularization strength multiplier (alpha)')
ax.set_ylabel('Elastic net mixing parameter (l1_ratio)')

# ## Use Optimal Hyperparameters to Output ROC Curve

# In[24]:

y_pred_train = pipeline.decision_function(X_train)
y_pred_test = pipeline.decision_function(X_test)


def get_threshold_metrics(y_true, y_pred):
    roc_columns = ['fpr', 'tpr', 'threshold']
import pandas as pd
import numpy as np
import csv
import pickle
import MySQLdb as mdb  # assumed: the snippet calls mdb.connect without importing it

# Load prediction rules from data files
U = pickle.load(open("user_features.dat", "rb"))
M = pickle.load(open("product_features.dat", "rb"))
predicted_ratings = pickle.load(open("predicted_ratings.dat", "rb"))
#print np.shape(predicted_ratings)

raw_dataset_df = pd.read_csv('out.csv')
#print raw_dataset_df

# to know the index value of user_id
ratings_df = pd.pivot_table(raw_dataset_df, index='comp_id',
                            columns='user_id', aggfunc=np.max)
#print ratings_df

db = mdb.connect(host="localhost", user="******", passwd="pragya",
                 db="recommendation")
cursor = db.cursor()
cursor.execute("select * from company;")
desc = cursor.description

with open("company2.csv", "wb") as csv_file:  # Python 2 version
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow([i[0] for i in desc])  # write headers
    csv_writer.writerows(cursor)
markersize=7, markerfacecolor='blue', markeredgecolor='teal', data=transaction ) plt.title('Linear Relationship between Price of Item and Units Sold', fontdict=title_font) plt.xlabel('Price of Item', fontdict=secondary_font) plt.ylabel('Units Sold', fontdict=secondary_font) plt.show() # Not a linear relationship ## Feature by units sold pivot table and bar plot feature_pivot = pd.pivot_table(data=transaction, index=['FEATURE'], values=['UNITS']) feature_pivot_t = pd.pivot_table(data=transaction, index=['FEATURE'], values=['UNITS']).plot(kind='bar', color='teal', title='Do Units Purchased vary by Feature Status?' ) # There are far more units sold when featured than not ## Display by units sold pivot table and bar plot display_pivot = pd.pivot_table(data=transaction, index=['DISPLAY'],
def call_KM(genre1, genre2, genre3): movies = pd.read_csv('mysite/movies.csv') ratings = pd.read_csv('mysite/ratings.csv') # genre1='Adventure' # genre2='Sci-Fi' # genre3='Action' my_clusters = 0 helper.set_Variables(genre1, genre2, genre3) genre_ratings = helper.get_genre_ratings(ratings, movies, [genre1, genre2], [Dict[genre1], Dict[genre2]]) biased_dataset = helper.bias_genre_rating_dataset(genre_ratings, 3.2, 2.5) print("Number of records: ", len(biased_dataset)) biased_dataset.head() helper.draw_scatterplot(biased_dataset[Dict[genre2]], Dict[genre2], biased_dataset[Dict[genre1]], Dict[genre1], 'mysite/static/mysite/Normal.png') # plt.savefig('mysite/static/mysite/Normal.png') # # plt.close('mysite/static/mysite/Normal.png') X = biased_dataset[[Dict[genre2], Dict[genre1]]].values # TODO: Create an instance of KMeans to find two clusters kmeans_1 = KMeans(n_clusters=2, random_state=0) predictions = kmeans_1.fit_predict(X) helper.draw_clusters(biased_dataset, predictions, 'mysite/static/mysite/TwoCluster.png') # plt.savefig('mysite/static/mysite/TwoCluster.png') # plt.close('TwoCluster.png') # TODO: Create an instance of KMeans to find three clusters kmeans_2 = KMeans(n_clusters=3, random_state=1) predictions_2 = kmeans_2.fit_predict(X) helper.draw_clusters(biased_dataset, predictions_2, 'mysite/static/mysite/ThreeCluster.png') # plt.savefig('mysite/static/mysite/ThreeCluster.png') # plt.close('ThreeCluster.png') # TODO: Create an instance of KMeans to find four clusters kmeans_3 = KMeans(n_clusters=4, random_state=3) predictions_3 = kmeans_3.fit_predict(X) helper.draw_clusters(biased_dataset, predictions_3, 'mysite/static/mysite/FourCluster.png') # plt.savefig('mysite/static/mysite/FourCluster.png') # plt.close('FourCluster.png') possible_k_values = range(2, len(X) + 1, 5) errors_per_k = [helper.clustering_errors(k, X) for k in possible_k_values] list(zip(possible_k_values, errors_per_k)) fig, ax = plt.subplots(figsize=(16, 6)) ax.set_xlabel('K - number of clusters') ax.set_ylabel('Silhouette Score (higher is better)') ax.plot(possible_k_values, errors_per_k) fig.savefig('mysite/static/mysite/score.png') plt.close(fig) # Ticks and grid xticks = np.arange(min(possible_k_values), max(possible_k_values) + 1, 5.0) ax.set_xticks(xticks, minor=False) ax.set_xticks(xticks, minor=True) ax.xaxis.grid(True, which='both') yticks = np.arange(round(min(errors_per_k), 2), max(errors_per_k), .05) ax.set_yticks(yticks, minor=False) ax.set_yticks(yticks, minor=True) ax.yaxis.grid(True, which='both') # TODO: Create an instance of KMeans to find seven clusters kmeans_4 = KMeans(n_clusters=7, random_state=6) predictions_4 = kmeans_4.fit_predict(X) helper.draw_clusters(biased_dataset, predictions_4, 'mysite/static/mysite/BestCluster.png', cmap='Accent') # plt.savefig('mysite/static/mysite/BestCluster.png') # plt.close('BestCluster.png') biased_dataset_3_genres = helper.get_genre_ratings( ratings, movies, [genre1, genre2, genre3], [Dict[genre1], Dict[genre2], Dict[genre3]]) biased_dataset_3_genres = helper.bias_genre_rating_dataset( biased_dataset_3_genres, 3.2, 2.5).dropna() print("Number of records: ", len(biased_dataset_3_genres)) X_with_action = biased_dataset_3_genres[[ Dict[genre2], Dict[genre1], Dict[genre3] ]].values # TODO: Create an instance of KMeans to find seven clusters kmeans_5 = KMeans(n_clusters=7) predictions_5 = kmeans_5.fit_predict(X_with_action) helper.draw_clusters_3d(biased_dataset_3_genres, predictions_5, 'mysite/static/mysite/3DCluster.png') # 
plt.savefig('mysite/static/mysite/3DCluster.png') # plt.close('3DCluster.png') #Merge the two tables then pivot so we have Users X Movies dataframe ratings_title = pd.merge(ratings, movies[['movieId', 'title']], on='movieId') user_movie_ratings = pd.pivot_table(ratings_title, index='userId', columns='title', values='rating') user_movie_ratings.iloc[:6, :10] n_movies = 30 n_users = 18 most_rated_movies_users_selection = helper.sort_by_rating_density( user_movie_ratings, n_movies, n_users) most_rated_movies_users_selection.head() helper.draw_movies_heatmap(most_rated_movies_users_selection, 'mysite/static/mysite/HeatMap.png')
def test(): """ import data """ print("* importing data") players = pd.read_csv("../data/nba-players-stats/player_data.csv") players = players[players.year_start >= 1980] # only choose players who started after 1980 players["player_id"] = range(0,len(players.name)) # assign id stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv") stats = stats[stats.Player.isin(players.name)] # only after 1980 stats = stats[stats.Year >= 1980] # without duplicated names --> to do: how to distinguish multiple player with the same name stats = removeDuplicated(players, stats) stats.Year = stats.Year.astype(int) stats.year_count = stats.year_count.astype(int) print("* preparing data") # transform stats to a dictionary composed of df's for each stat # the stats are re-calculated to get one stat for each year metricsPerGameColNames = ["PTS","AST","TOV","TRB","STL","BLK","3P"] metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames) metricsPerCentColNames = ["FG","FT"] metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames) metricsWeightedColNames = ["PER"] metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames) allMetricsDict = {**metricsPerGameDict, **metricsPerCentDict, **metricsWeightedDict} allPivotedTableDict = getPivotedTableDict(allMetricsDict) allMetrics = list(allMetricsDict.keys()) # this matrix will be used to mask the table df_year = pd.pivot_table(stats, values="Year", index="Player", columns = "year_count") """ experiment setup """ activePlayers = getActivePlayers(stats, 2016, 4) activePlayers.sort() activePlayers.remove("Kevin Garnett") activePlayers.remove("Kobe Bryant") # offMetrics = ["PTS_G","AST_G","TOV_G","3P_G","PER_w", "FG%","FT%"] # defMetrics = ["TRB_G","STL_G","BLK_G"] expSetup = ["sliding", "SVD", "all", "pinv", False] threshold = 0.97 metrics_to_use = ["PTS_G","AST_G","TOV_G","3P_G","PER_w", "FG%","FT%","TRB_G","STL_G","BLK_G"] print("* start experiment") print("*******************") print("one at once, 10 models in total") pred_all = pd.DataFrame() true_all = pd.DataFrame() for playerName in activePlayers: target = Target(playerName, allPivotedTableDict, df_year) donor = Donor(allPivotedTableDict, df_year) mrsc = mRSC(donor, target, probObservation=1) player_pred = pd.DataFrame() player_true = pd.DataFrame() for metric in metrics_to_use: mrsc.fit_threshold([metric], [1.], 2016, pred_length = 1, threshold = threshold, setup = expSetup) pred = mrsc.predict() true = mrsc.getTrue() pred.columns = [playerName] true.columns = [playerName] player_pred = pd.concat([player_pred, pred], axis=0) player_true = pd.concat([player_true, true], axis=0) pred_all = pd.concat([pred_all, player_pred], axis=1) true_all = pd.concat([true_all, player_true], axis=1) with open('pred_all.pkl', 'wb') as f: pickle.dump(pred_all, f, pickle.HIGHEST_PROTOCOL) with open('true_all.pkl', 'wb') as f: pickle.dump(true_all, f, pickle.HIGHEST_PROTOCOL) print() print("*** MAPE ***") mask = (true_all !=0 ) mape = np.abs(pred_all - true_all) / true_all[mask] print(mape.mean(axis=1)) print("MAPE for all: ", mape.mean().mean()) rmse = utils.rmse_2d(true_all, pred_all) print() print("*** RMSE ***") print(rmse) print("RMSE for all: ", rmse.mean())
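# Reference for the error metrics printed at the end of test() above (a
# descriptive note, not part of the original script): the MAPE table is
# |pred_all - true_all| / true_all with cells where true == 0 masked out, then
# averaged per metric (axis=1) and overall; rmse_2d is the project's own
# helper, so its exact definition is not reproduced here.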
def merge_state_data(): lookup = {} with open("data/source/mapas-data-concentrado.xlsx", "rb") as f: dfs = pd.read_excel(f, sheet_name=None) for sheetname, df in dfs.items(): try: sheet_state_code = int(sheetname.split(' ')[0]) except ValueError: print("%s is not a valid sheet name" % sheetname, file=sys.stderr) continue pivot = pd.pivot_table(df, index=["state_code", "year"], fill_value=0, aggfunc=np.sum, values=PROPS) pivot_dict = pivot.reset_index().to_dict("records") final_data = [ dict([k, int(v)] for k, v in row.items()) for row in pivot_dict ] lookup[sheet_state_code] = final_data maxes = {prop: 0 for prop in PROPS} centers = { 'type': 'FeatureCollection', 'name': 'estatalescentroids', 'features': [], } state_meta = [] with codecs.open('data/source-geojson/estatales.json', encoding='utf-8', errors="replace") as f: data = json.load(f) for feature in data['features']: state_code = int(feature['properties']['CVE_ENT']) state_data = lookup[state_code] yearlyData = [] state_data_dict = dict([(row['year'], row) for row in state_data]) for year in range(2006, 2017): row = state_data_dict.get(year, False) if not row: row = {'year': year, 'state_code': state_code} for prop in PROPS: row[prop] = -1 yearlyData.append(row) feature['properties']['yearlyFosasData'] = yearlyData for prop in PROPS: feature['properties'][prop + '_total'] = sum( item.get(prop, 0) for item in state_data) try: feature['properties'][prop + '_max'] = max( item.get(prop, 0) for item in state_data) except ValueError: feature['properties'][prop + '_max'] = 0 feature['properties']['all_max'] = max( feature['properties'][prop + '_max'] for prop in PROPS) shp = shape(feature['geometry']) representative_point = mapping(shp.representative_point()) centroid = mapping(shp.centroid) bounds = shp.bounds center_feature = deepcopy(feature) center_feature['geometry'] = representative_point centers['features'].append(center_feature) # Colima has islands if state_code == 6: bounds = [-104.8, 18.6, -103.4, 19.6] state_meta.append({ 'state_code': feature['properties']['CVE_ENT'], 'state_name': feature['properties']['NOM_ENT'], 'representative_point': representative_point, 'centroid': centroid, 'bounds': bounds, **feature['properties'] }) with open('data/processed-geojson/estatales.json', 'w') as f: json.dump(data, f) with open('data/processed-geojson/estatales-centroids.json', 'w') as f: json.dump(centers, f) with open('src/data/mxstates.json', 'w') as f: json.dump(state_meta, f)
def user_cate_feture(data, end_time, beforeoneday): # data = Data[(Data['daystime']<LabelDay) & (Data['daystime']>LabelDay-datetime.timedelta(days=FEATURE_EXTRACTION_SLOT))] user_item_count = pd.crosstab([data.user_id, data.item_category], data.behavior_type) # beforeoneday = Data[Data['daystime'] == LabelDay-datetime.timedelta(days=1)] user_cate_count_5 = None if (((end_time - datetime.timedelta(days=5)) < datetime.datetime( 2014, 12, 13, 0, 0, 0)) & ((end_time - datetime.timedelta(days=5)) > datetime.datetime( 2014, 12, 10, 0, 0, 0))): beforefiveday = data[data['daystime'] >= ( end_time - datetime.timedelta(days=5 + 2))] user_cate_count_5 = pd.crosstab( [beforefiveday.user_id, beforefiveday.item_category], beforefiveday.behavior_type) else: beforefiveday = data[data['daystime'] >= (end_time - datetime.timedelta(days=5))] user_cate_count_5 = pd.crosstab( [beforefiveday.user_id, beforefiveday.item_category], beforefiveday.behavior_type) user_cate_count_3 = None if (((end_time - datetime.timedelta(days=5)) < datetime.datetime( 2014, 12, 13, 0, 0, 0)) & ((end_time - datetime.timedelta(days=5)) > datetime.datetime( 2014, 12, 10, 0, 0, 0))): beforethreeday = data[data['daystime'] >= ( end_time - datetime.timedelta(days=3 + 2))] user_cate_count_3 = pd.crosstab( [beforethreeday.user_id, beforethreeday.item_category], beforethreeday.behavior_type) else: beforethreeday = data[data['daystime'] >= (end_time - datetime.timedelta(days=3))] user_cate_count_3 = pd.crosstab( [beforethreeday.user_id, beforethreeday.item_category], beforethreeday.behavior_type) user_cate_count_2 = None if (((end_time - datetime.timedelta(days=5)) < datetime.datetime( 2014, 12, 13, 0, 0, 0)) & ((end_time - datetime.timedelta(days=5)) > datetime.datetime( 2014, 12, 10, 0, 0, 0))): beforethreeday = data[data['daystime'] >= ( end_time - datetime.timedelta(days=7 + 2))] user_cate_count_2 = pd.crosstab( [beforethreeday.user_id, beforethreeday.item_category], beforethreeday.behavior_type) else: beforethreeday = data[data['daystime'] >= (end_time - datetime.timedelta(days=7))] user_cate_count_2 = pd.crosstab( [beforethreeday.user_id, beforethreeday.item_category], beforethreeday.behavior_type) # _live = user_cate_long_touch(data) beforeonedayuser_item_count = pd.crosstab( [beforeoneday.user_id, beforeoneday.item_category], beforeoneday.behavior_type) max_touchtime = pd.pivot_table(beforeoneday, index=['user_id', 'item_category'], values=['hours'], aggfunc=[np.min, np.max]) max_touchtype = pd.pivot_table(beforeoneday, index=['user_id', 'item_category'], values=['behavior_type'], aggfunc=np.max) user_cate_feture = pd.merge(user_item_count, beforeonedayuser_item_count, how='left', right_index=True, left_index=True) user_cate_feture = pd.merge(user_cate_feture, max_touchtime, how='left', right_index=True, left_index=True) user_cate_feture = pd.merge(user_cate_feture, max_touchtype, how='left', right_index=True, left_index=True) # user_cate_feture = pd.merge(user_cate_feture,_live,how='left',right_index=True,left_index=True) user_cate_feture = pd.merge(user_cate_feture, user_cate_count_5, how='left', right_index=True, left_index=True) user_cate_feture = pd.merge(user_cate_feture, user_cate_count_3, how='left', right_index=True, left_index=True) user_cate_feture = pd.merge(user_cate_feture, user_cate_count_2, how='left', right_index=True, left_index=True) user_cate_feture.fillna(0, inplace=True) return user_cate_feture
train_data, test_data = sklearn.model_selection.train_test_split(
    raw,
    train_size=0.8,  # % of data to use for training (rest used for testing)
    random_state=1,
    shuffle=True)

# Up-sample rare outcome variable in training data to match proportion made up
# by other variable values
print('Training data before up-sampling outcome variable:')
print(pd.pivot_table(data=train_data,
                     index=outcome_variables,
                     values=train_data.columns[0],
                     aggfunc='count')
      .rename(columns={train_data.columns[0]: 'Count'}))  # Before up-sampling

train_data_variables_no_upsampling = train_data[
    train_data[outcome_variables[0]].apply(
        lambda x: x not in outcome_variable_values_to_upsample)]
train_data_variables_upsampling = train_data[
    train_data[outcome_variables[0]].apply(
        lambda x: x in outcome_variable_values_to_upsample)]

if len(outcome_variable_values_to_upsample) > 0:
    train_data_variables_upsampled = train_data_variables_upsampling.sample(
        n=int(np.ceil(len(train_data_variables_no_upsampling)
                      * upsample_proportion_of_common_variable)),
        replace=True)
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# df = pd.read_csv("……")
# table = pd.pivot_table(df, index=['...', '..'], values='..', aggfunc='sum', columns=['>>>'])
# aggfunc chooses the aggregation method (sum, mean, ...)
# the result looks a bit like a primary-school class timetable

df = DataFrame(np.random.randint(10, size=[20, 5]),
               index=list("AZXSAZXSDCXSAZXBSDCX"),
               columns=list("HJKLP"))
print(df)

df1 = pd.pivot_table(df, index=['H', "J"], values=["K", 'L'],
                     aggfunc="sum", columns="P", fill_value=0)
print(df1)
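# The same table can be built without pivot_table; a minimal groupby/unstack
# sketch that mirrors df1 above (sum of K and L per (H, J), spread over P):
df1_alt = df.groupby(['H', 'J', 'P'])[['K', 'L']].sum().unstack('P', fill_value=0)
print(df1_alt)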
def divvy_create_graph(station_id, station_name, weather): def haversine(row): # convert decimal degrees to radians #convert lon, lat to floats row['FROM LONGITUDE'], row['FROM LATITUDE'], row['TO LONGITUDE'], row['TO LATITUDE'] = \ map(float, [row['FROM LONGITUDE'], row['FROM LATITUDE'], row['TO LONGITUDE'], row['TO LATITUDE']]) lon1, lat1, lon2, lat2 = \ map(radians, [row['FROM LONGITUDE'], row['FROM LATITUDE'], row['TO LONGITUDE'], row['TO LATITUDE']]) # haversine formula dlon = lon2 - lon1 dlat = lat2 - lat1 a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 c = 2 * asin(sqrt(a)) r = 3956 # Radius of earth in kilometers. Use 3956 for miles 6371km for km return c * r font = {'family' : 'arial', 'weight' : 'bold', 'size' : 15} rc('font', **font); rc("figure", facecolor="white"); rc('axes', edgecolor='darkgray'); def runplot(pvtdf, title, path): pvtdf.plot(kind='bar', edgecolor='w',figsize=(20,5), width=0.5, fontsize = 10) locs, labels = plt.xticks() plt.title(title, weight='bold', size=24) lgd = plt.legend(loc='right', ncol=14, frameon=True, shadow=False, prop={'size': 12}, bbox_to_anchor=(1, 0.95)) for i in range(len(lgd.get_texts())): text = lgd.get_texts()[i].get_text().replace('(DISTANCE_Miles, ', '(') lgd.get_texts()[i].set_text(text) plt.xlabel('Year', weight='bold', size=24) plt.ylabel('Distance', weight='bold', size=20) plt.tick_params(axis='x', bottom='off', top='off', labelsize=15) plt.tick_params(axis='y', left='off', right='off', labelsize=15) plt.grid(b=True) plt.setp(labels, rotation=0, rotation_mode="anchor", ha="center") plt.tight_layout() plt.savefig(path) def htmlpage(datastring, title, filename): html = '''<!DOCTYPE html> <html> <head> <style type="text/css"> body{{ margin:15px; padding: 20px 100px 20px 100px; font-family:Arial, Helvetica, sans-serif; font-size:88%; }} h1, h2 {{ font:Arial black; color: #383838; valign: top; }} img {{ float: right; }} table{{ width:100%; font-size:13px; border-collapse:collapse; text-align: right; }} th{{ color: #383838 ; padding:2px; text-align:right; }} td{{ padding: 2px 5px 2px 5px; }} .footer{{ text-align: right; color: #A8A8A8; font-size: 12px; margin: 5px; }} </style> </head> <body> <h3>{0}<img src="../../divvylogo.svg" alt="divvy icon" height="50px"></h3> {1} </body> <div class="footer">Source: City of Chicago Data Portal</div> </html>'''.format(title, datastring) with open(os.path.join(cd, 'stations', station_name, filename), 'w') as f: f.write(html) CONN = sql.connect("/Users/divya/projects/pandas/divvy-example/Divvy_Trips.db") stations = "SELECT * FROM temp WHERE [FROM STATION ID] = {0} OR [TO STATION ID] = {0}".format(station_id); df = pd.read_sql(stations, con = CONN) CONN.close() # create folder cd = os.path.dirname(os.path.abspath(__file__)) if os.path.exists(os.path.join(cd, 'stations', station_name)) == False: os.makedirs(os.path.join(cd, 'stations', station_name)) #NEW COLUMNS df['DISTANCE_Miles'] = df.apply(haversine, axis=1) df["START TIME"] = pd.to_datetime(df["START TIME"], format="%m/%d/%Y %I:%M:%S %p") df["STOP TIME"] = pd.to_datetime(df["STOP TIME"], format="%m/%d/%Y %I:%M:%S %p") df.loc[:, 'TRIP DURATION'] = df['TRIP DURATION'].astype(float)/60 df.loc[:, 'START TIME MONTH'] = df['START TIME'].dt.month df.loc[:, 'START TIME YEAR'] = df['START TIME'].dt.year df.loc[:, 'STOP TIME MONTH'] = df['STOP TIME'].dt.month df.loc[:, 'STOP TIME YEAR'] = df['STOP TIME'].dt.year df.loc[:, 'START WEEKDAY'] = [cal.day_name[i] for i in df['START TIME'].dt.weekday] df['BIRTH YEAR'] = np.where(df["BIRTH YEAR"] == '', 
np.nan, df["BIRTH YEAR"]) df['BIRTH YEAR'] = df['BIRTH YEAR'].astype('float') df.loc[:, 'AGE'] = df['START TIME YEAR'] - df['BIRTH YEAR'] df['AGE_GROUP'] = np.where(df['AGE'].between(0,18, inclusive = True), '0 - 18', np.where(df['AGE'].between(19,29, inclusive = True), '19 - 29', np.where(df['AGE'].between(30,39, inclusive = True), '30 - 39', np.where(df['AGE'].between(40,49, inclusive = True), '40 - 49', np.where(df['AGE'].between(50,65, inclusive = True), '50 - 65', np.where(df['AGE'].between(65,80, inclusive = True), '65+', np.nan)))))) df.loc[:, 'START TIME HOUR'] = df['START TIME'].dt.hour df['TIME FRAME'] = np.where(df['START TIME HOUR'].between(0,1, inclusive = True), '00:00-01:00', np.where(df['START TIME HOUR'].between(1,2, inclusive = True), '01:00-02:00', np.where(df['START TIME HOUR'].between(2,3, inclusive = True), '02:00-03:00', np.where(df['START TIME HOUR'].between(3,4, inclusive = True), '03:00-04:00', np.where(df['START TIME HOUR'].between(4,5, inclusive = True), '04:00-05:00', np.where(df['START TIME HOUR'].between(5,6, inclusive = True), '05:00-06:00', np.where(df['START TIME HOUR'].between(6,7, inclusive = True), '06:00-07:00', np.where(df['START TIME HOUR'].between(7,8, inclusive = True), '07:00-08:00', np.where(df['START TIME HOUR'].between(8,9, inclusive = True), '08:00-09:00', np.where(df['START TIME HOUR'].between(9,10, inclusive = True), '09:00-10:00', np.where(df['START TIME HOUR'].between(10,11, inclusive = True), '10:00-11:00', np.where(df['START TIME HOUR'].between(11,12, inclusive = True), '11:00-12:00', np.where(df['START TIME HOUR'].between(12,13, inclusive = True), '12:00-13:00', np.where(df['START TIME HOUR'].between(13,14, inclusive = True), '13:00-14:00', np.where(df['START TIME HOUR'].between(14,15, inclusive = True), '14:00-15:00', np.where(df['START TIME HOUR'].between(15,16, inclusive = True), '15:00-16:00', np.where(df['START TIME HOUR'].between(16,17, inclusive = True), '16:00-17:00', np.where(df['START TIME HOUR'].between(17,18, inclusive = True), '17:00-18:00', np.where(df['START TIME HOUR'].between(18,19, inclusive = True), '18:00-19:00', np.where(df['START TIME HOUR'].between(19,20, inclusive = True), '19:00-20:00', np.where(df['START TIME HOUR'].between(20,21, inclusive = True), '20:00-21:00', np.where(df['START TIME HOUR'].between(21,22, inclusive = True), '21:00-22:00', np.where(df['START TIME HOUR'].between(22,23, inclusive = True), '22:00-23:00', np.where(df['START TIME HOUR'].between(23,24, inclusive = True), '23:00-24:00', np.nan)))))))))))))))))))))))) #overall df.groupby(['START TIME YEAR', 'START TIME MONTH'])['TRIP DURATION'].count().reset_index().to_csv(os.path.join(cd, 'stations', station_name, 'tripCount.csv')) htmlpage(df.groupby(['START TIME YEAR', 'START TIME MONTH'])['TRIP DURATION'].count().reset_index().to_html(), '{} - Trip Count'.format(station_name), 'TripCount.html') #df.groupby(['START TIME YEAR', 'START TIME MONTH'])['TRIP DURATION'].count().reset_index().to_html(os.path.join(cd, 'stations', station_name, 'tripCount.html')) df.groupby(['START TIME YEAR', 'START TIME MONTH'])['TRIP DURATION'].mean().reset_index().to_csv(os.path.join(cd, 'stations', station_name, 'averageDuration.csv')) #df.groupby(['START TIME YEAR', 'START TIME MONTH'])['TRIP DURATION'].mean().reset_index().to_html(os.path.join(cd, 'stations', station_name, 'averageDuration.html')) htmlpage(df.groupby(['START TIME YEAR', 'START TIME MONTH'])['TRIP DURATION'].mean().reset_index().to_html(), '{} - Trip Duration'.format(station_name), 
'AvgDuration.html') df.groupby(['START TIME YEAR', 'START TIME MONTH'])['DISTANCE_Miles'].mean().reset_index().to_csv(os.path.join(cd, 'stations', station_name, 'averageDistance.csv')) htmlpage(df.groupby(['START TIME YEAR', 'START TIME MONTH'])['DISTANCE_Miles'].mean().reset_index().to_html(), '{} - Avg Miles'.format(station_name), 'AvgDistance.html') #df.groupby(['START TIME YEAR', 'START TIME MONTH'])['DISTANCE_Miles'].mean().reset_index().to_html(os.path.join(cd, 'stations', station_name, 'averageDistance.html')) #distance by gender distanceByGender = pd.pivot_table(df[(df['USER TYPE'] == 'Subscriber') & (df['GENDER'] != '')], index=['START TIME YEAR', 'START TIME MONTH'], columns=['GENDER'], values=['DISTANCE_Miles'], aggfunc=len) distanceByGender.plot(title= "{} - Distance in miles by gender".format(station_name), figsize=(20,5)) lgd = plt.legend() for i in range(len(lgd.get_texts())): text = lgd.get_texts()[i].get_text().replace('(DISTANCE_Miles, ', '(') lgd.get_texts()[i].set_text(text) plt.savefig(os.path.join(cd, 'stations', station_name, 'distanceByGender.png')) miles_by_gender = pd.pivot_table(df[(df['USER TYPE'] == 'Subscriber') & (df["AGE"] < 80) & (df["GENDER"].str.len() > 0) & (df["DISTANCE_Miles"] > 0)], index = ['START TIME YEAR'], columns=['GENDER'], values=['DISTANCE_Miles'], aggfunc = len) miles_by_gender.reset_index().to_csv(os.path.join(cd, 'stations', station_name, 'milesByGender.csv')) htmlpage(miles_by_gender.reset_index().to_html(), '{} - Miles By Gender'.format(station_name), 'MilesByGender.html') #miles_by_gender.reset_index().to_html(os.path.join(cd, 'stations', station_name, 'milesByGender.html')) wes.set_palette('FantasticFox') runplot(miles_by_gender,' {} - Divvy Data subscribers, distance by age and gender'.format(station_name), os.path.join(cd, 'stations', station_name, 'milesByGender.png')) mergelist = [] for yr in list(range(df['START TIME YEAR'].min(), df['START TIME YEAR'].max() + 1)): miles_by_gender_pie = None miles_by_gender_pie = df[(df['USER TYPE'] == 'Subscriber') & (df['START TIME YEAR']==yr) & (df["GENDER"].str.len() > 0)].\ groupby(["GENDER", "START TIME YEAR"])["DISTANCE_Miles"].apply(len).reset_index() mergelist.append(miles_by_gender_pie[['GENDER', 'DISTANCE_Miles']]) miles_by_gender_pie = reduce(lambda left,right: pd.merge(left, right, on='GENDER'), mergelist) miles_by_gender_pie.columns = ['GENDER'] + [str(i) for i in list(range(df['START TIME YEAR'].min(), df['START TIME YEAR'].max() + 1))] labels = 'Female', 'Male' #pie = plt.pie(miles_by_gender_pie, labels=labels) miles_by_gender_pie.plot.pie(subplots=True, labels=['', ''], figsize=(12,3.8), autopct='%.2f') plt.axis('equal') plt.legend(labels=labels) plt.savefig(os.path.join(cd, 'stations', station_name, 'milesByGenderPIE.png')) #Customer v Subscriber user_type_pie = df.groupby(["START TIME YEAR","USER TYPE"]).apply(len).reset_index() user_type_pie.columns = ['Year', 'User Type', 'People'] customervsubscriber = pd.pivot_table(user_type_pie, index=['User Type'], columns=['Year'], values=['People']) labels = ['Customer', 'Subscriber'] customervsubscriber.plot.pie(subplots=True, labels=['', ''], figsize=(12,3.8), autopct='%.2f') plt.axis('equal') plt.legend(labels=labels) plt.savefig(os.path.join(cd, 'stations', station_name, 'customervSubscriber.png')) #Miles by Age# miles_by_age = pd.pivot_table(df[(df['USER TYPE'] == 'Subscriber') & (df["AGE"] < 80) & (df["GENDER"].str.len() > 0) & (df["DISTANCE_Miles"] > 0)], index = ['START TIME YEAR'], columns=['AGE_GROUP'], 
values=['DISTANCE_Miles'], aggfunc = np.sum) runplot(miles_by_age, '{} - Divvy Data subscribers, total distance by age'.format(station_name), os.path.join(cd, 'stations', station_name, 'milesByAge.png')) htmlpage(miles_by_age.reset_index().to_html(), '{} - Miles By Age'.format(station_name), 'MilesByAge.html') #avDistanceByGender over Weather avDistanceMiles = pd.pivot_table(df[df['GENDER'] != ''] , index=['START TIME MONTH'], columns=['GENDER'], values=['DISTANCE_Miles'], aggfunc=np.sum) total = avDistanceMiles.join(weather) DistancevsWeather = total.sort_values(['ATF']) DistancevsWeather["START TIME MONTH"] = DistancevsWeather.index.values DistancevsWeather = DistancevsWeather.set_index(['ATF']) line = DistancevsWeather[[0,1]].plot(title="Mean distance travelled by gender", figsize=(20,5)) line.set_ylabel("Distance(miles)") line.set_xlabel("Time") lgd = plt.legend() for i in range(len(lgd.get_texts())): text = lgd.get_texts()[i].get_text().replace('(DISTANCE_Miles, ', '(') lgd.get_texts()[i].set_text(text) plt.savefig(os.path.join(cd, 'stations', station_name, 'milesvsWeather.png')) htmlpage(DistancevsWeather.reset_index().to_html(), '{} - Distance vs Weather'.format(station_name), 'DistancevsWeather.html')
books = pd.read_csv(
    "C:\\Users\\trupti\\Desktop\\BookRack\\home\\book_data2.csv",
    engine="python")
# display(ratings.head())
# display(books.head())

# In[4]:

# Merge the two tables then pivot so we have a Users X Books dataframe.
ratings_title = pd.merge(ratings, books[['book_id', 'book_title']], on='book_id')
user_book_ratings = pd.pivot_table(ratings_title, index='user_id',
                                   columns='book_title', values='rating')
print('dataset dimensions: ', user_book_ratings.shape, '\n\nSubset example:')
#user_book_ratings.iloc[:25, :10]

# In[5]:

# Drop users that have given fewer than 100 ratings of these most-rated books
user_book_ratings = user_book_ratings.dropna(thresh=100)
# print('dataset dimensions: ', user_book_ratings.shape, '\n\nSubset example:')
# user_book_ratings.iloc[:25, :10]

# In[6]:
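# Note on the dropna call above: thresh=100 keeps a row (user) only if it has
# at least 100 non-NaN ratings. The equivalent explicit filter, as a sketch:
# user_book_ratings = user_book_ratings[user_book_ratings.count(axis=1) >= 100]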
# loop through offices and find statewide ones
for office in offices_tot:
    counties = []
    # list of counties that had an election for that office
    counties.append(elec_df.loc[elec_df['office'] == office, 'county'])
    counties_office[office] = counties
    c = counties_office[office][0].values
    D = {I: True for I in c}
    count = D.keys()
    if len(count) == len(counties_tot):
        state_offices.append(office)

state_elec = elec_df.loc[elec_df['office'].isin(state_offices)]

# get table of elections by precinct
prec_elec = pd.pivot_table(state_elec,
                           index=['loc_prec', 'county', 'precinct'],
                           columns=['office', 'party', 'candidate'],
                           values=['votes'],
                           aggfunc=np.sum)
prec_elec.columns = prec_elec.columns.to_series().str.join(' ')
columns = prec_elec.columns.values

# print columns and assign each one a 10 character name for the shapefile
print(columns)

# make dict for column name replacement using the columns printed in the module above
# columns can only have 10 character names
prec_elec_rn = prec_elec.rename(columns={
    'votes Attorney General DEM Mike Lee': 'G18DATG1',
    'votes Attorney General Dem Mike Lee': 'G18DATG2',
    'votes Attorney General LIB Kerry Hicks': 'G18LATG1',
    'votes Attorney General Lib Kerry Hicks': 'G18LATG2',