Example #1
 def test_pivot_table_multi(self):
     res_sparse = pd.pivot_table(self.sparse, index='A', columns='B',
                                 values=['D', 'E'])
     res_dense = pd.pivot_table(self.dense, index='A', columns='B',
                                values=['D', 'E'])
     res_dense = res_dense.apply(lambda x: x.astype("Sparse[float64]"))
     tm.assert_frame_equal(res_sparse, res_dense)
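For readers without the test fixtures, here is a minimal, self-contained sketch of the same multi-value pivot on a made-up frame (only the column names A/B/D/E are mirrored from the test above; the data is invented):

import pandas as pd

# Toy frame with the fixture's column names.
df = pd.DataFrame({
    "A": ["foo", "foo", "bar", "bar"],
    "B": ["one", "two", "one", "two"],
    "D": [1.0, 2.0, 3.0, 4.0],
    "E": [10.0, 20.0, 30.0, 40.0],
})

# One column level per requested value ('D', 'E') crossed with each unique B.
wide = pd.pivot_table(df, index="A", columns="B", values=["D", "E"])
print(wide)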
Example #2
def data2sub_matrix(data_fit,values_col,index_col,type_form,aggfunc='mean'): 
    """
    This creates substitition matrix from input data (frequncies `data_lbl` or `data_fit`).
    
    :param data_fit: fitness data (`data_fit`).
    :param values_col: column name with values (str). 
    :param index_col: column name with index (str).
    :param type_form: type of values ["aas" : amino acid | "cds" : codons].
    """                  
    from dms2dfe.lib.global_vars import aas_21,cds_64
    if aggfunc=="mean":
        data_sub_matrix=pd.pivot_table(data_fit,values=values_col,index=index_col,columns='ref')
    else:
        data_sub_matrix=pd.pivot_table(data_fit,values=values_col,index=index_col,columns='ref',aggfunc=aggfunc)        
    #make it 21X21
    if type_form=="aas":
        sub_matrix=pd.DataFrame(index=aas_21,columns=aas_21)
        sub_matrix.index.name='Mutation to'
        sub_matrix.columns.name='Wild type'
        for ref in aas_21:
            for mut in aas_21:
                try:
                    sub_matrix.loc[mut,ref]=data_sub_matrix.loc[mut,ref]
                except KeyError:
                    # combination absent from the pivot: leave it missing
                    sub_matrix.loc[mut,ref]=np.nan
    else:
        # only the 21x21 amino-acid form is implemented here
        raise NotImplementedError("data2sub_matrix: type_form='%s' is not handled" % type_form)
    return sub_matrix.astype(float)
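A side note on the nested try/except loop above: it only realigns the pivot onto the full 21x21 amino-acid grid, and DataFrame.reindex can do that alignment in one call. A hedged sketch on a toy grid (the four labels below merely stand in for aas_21):

import pandas as pd

# Toy pivot with only a few (mutation, reference) cells filled in.
labels = list('ACDE')
pivot = pd.DataFrame({'A': {'C': 1.0}, 'C': {'A': -0.5}})

# reindex aligns onto the full grid in one step; missing cells become NaN,
# which is what the nested try/except loop does by hand.
sub_matrix = pivot.reindex(index=labels, columns=labels).astype(float)
sub_matrix.index.name = 'Mutation to'
sub_matrix.columns.name = 'Wild type'
print(sub_matrix)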
Example #3
def main(fh_xls,well):
    info=pd.read_excel(fh_xls,'info')
    info=info.set_index('varname')
    for var in info.iterrows() :
        val=info['input'][var[0]]
        if not pd.isnull(val):
            exec("%s=info['input']['%s']" % (var[0],var[0]),locals(), globals())
        else:
            exec("%s=info['default']['%s']" % (var[0],var[0]),locals(), globals())

    data_job=pd.read_excel(fh_xls,'JobView')
    data_fns=pd.pivot_table(data_job,values='File Name',index='Loop_bleach Index',columns='Well Name', aggfunc=lambda x: x.iloc[0])
    data_fns_P =pd.pivot_table(data_job,values='File Name',index='TimeLapse1 Index',columns='Well Name', aggfunc=lambda x: x.iloc[0])
    data_fns   =pd.concat([data_fns,data_fns_P],axis=0)

    wells=[str(x) for x in list(data_job['Well Name'].unique())]
    if not any(x in well for x in wells):
        print "### ERROR : Could not find '%s'!" % well
        sys.exit(1)
    if not exists("%s.%sstb1.mp4" % (fh_xls,well)):
        print ">>> STATUS  : nd2vid : %s" % well 
        nd_fns=data_fns[well].dropna().unique()
        arr_list=nd2arr_list(nd_dh,nd_fns)
        arr_list_stb=raw2phasecorr(arr_list,clip=0.125)
        plt.imshow(arr_list_stb[0])
        plt.savefig('%s.%s_ref_frame.png' % (fh_xls,well))
        regions,kins_mean=arr_list2regions(arr_list_stb,time_increment)
        kins_mean.to_csv('%s.%s_kins_mean.csv' % (fh_xls,well))
        arr_list2vid(arr_list_stb,regions,kins_mean,('%s.%sstb.mp4' % (fh_xls,well)),384, 384)
        # arr_list2vid(arr_list    ,regions,kins_mean,('%s.%sraw.mp4' % (fh_xls,well)),384, 384)
    else:
        print ">>> STATUS  : nd2vid :already done"
Example #4
 def num_hpo(self,fdr_cutoff=0.3,min_snp2gene_obs=2,dropna=False,drop_empty_cols=True):
     '''
         Returns a summary table with the number of HPO genes discovered
         for different networks
     '''
     candidates = self.high_priority_candidates(fdr_cutoff=fdr_cutoff,min_snp2gene_obs=min_snp2gene_obs)
     candidates['fdr'] = fdr_cutoff
     # Calculate Totals
     total = pd.pivot_table(
         candidates,
         index=['fdr','Method','COB'],
         values='gene',
         dropna=dropna,
         aggfunc=lambda x: len(set(x))
     ).fillna(0).astype(int)
     total.columns = ['Total']
     total = total.T
     # Pivot and aggregate
     by_term =  pd.pivot_table(
         candidates,
         columns=['fdr','Method','COB'],
         index=['Term'],
         values='gene',
         dropna=dropna,
         aggfunc=lambda x: len(set(x))
     ).fillna(0).astype(int)
     num_hpo = by_term.append(total)
     if drop_empty_cols:
         num_hpo = num_hpo.loc[:,num_hpo.sum()>0]
     return num_hpo
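Note that by_term.append(total) relies on DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0; on current pandas the same row-wise concatenation is spelled with pd.concat. A minimal sketch on toy frames (the names only mirror the ones above):

import pandas as pd

by_term = pd.DataFrame({'net1': [3, 5]}, index=['Term1', 'Term2'])
total = pd.DataFrame({'net1': [8]}, index=['Total'])

# Equivalent of by_term.append(total) on pandas >= 2.0:
num_hpo = pd.concat([by_term, total], axis=0)
print(num_hpo)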
Example #5
def get_emp_SE(data, code_type):	

	from scipy.stats import stats
	
	
	allcor=data[data['acc'].isin([1])]
	cor_pivot=pd.pivot_table(allcor, values='rt', cols=['stim', 'cue'], rows=['subj_idx'], aggfunc=np.average)
	acc_pivot=pd.pivot_table(data, values='acc', cols=['stim', 'cue'], rows=['subj_idx'], aggfunc=np.average)
	#Get theoretical RT S.E.M's
	sem_rt=[]
	for img, cue in cor_pivot.columns:
		x=stats.sem(cor_pivot[img][cue])
		sem_rt.append(x)

	#Get theoretical ACCURACY S.E.M's
	sem_acc=[]
	for img, cue in acc_pivot.columns:
		x=stats.sem(acc_pivot[img][cue])
		sem_acc.append(x)

	if code_type=='HNL':
		face_emp_acc_SE=sem_acc[:3]
		house_emp_acc_SE=sem_acc[3:]
		face_emp_rts_SE=sem_rt[:3]
		house_emp_rts_SE=sem_rt[3:]
	else:
		face_emp_acc_SE=sem_acc[:5]
		house_emp_acc_SE=sem_acc[5:]
		face_emp_rts_SE=sem_rt[:5]
		house_emp_rts_SE=sem_rt[5:]
	
	sem_list=[face_emp_acc_SE, house_emp_acc_SE, face_emp_rts_SE, house_emp_rts_SE]

	return sem_list
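Example #5 is written against the pre-0.14 pandas API, where pivot_table took rows= and cols=; those keywords were later renamed to index= and columns=. A self-contained sketch of the equivalent call on current pandas, with invented toy data that only mirrors the column names used above:

import numpy as np
import pandas as pd

data = pd.DataFrame({
    'subj_idx': [0, 0, 1, 1],
    'stim': ['face', 'house', 'face', 'house'],
    'cue': ['valid', 'valid', 'invalid', 'invalid'],
    'rt': [0.45, 0.52, 0.61, 0.58],
    'acc': [1, 1, 0, 1],
})

# Modern spelling of the pivots above: rows= -> index=, cols= -> columns=.
acc_pivot = pd.pivot_table(data, values='acc', columns=['stim', 'cue'],
                           index=['subj_idx'], aggfunc=np.average)
print(acc_pivot)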
Example #6
def change_finder(extra_sites,long_term, all_sites):
    '''
    Function to calculate the % difference between the long-term-site time series
    and the bootstrapped all-sites time series
    '''
    for n in xrange(len(extra_sites)):
        
        #subset all sites to drop one value
        drop_site = extra_sites.iloc[n][0]
        sites2test =all_sites[all_sites.Site != drop_site]
    
        #Find normal Volumes 
        long_t = pivot_df(long_term)
        all_s = pivot_df(sites2test)
        date_influence = (all_s['Norm_Vol'] - long_t['Norm_Vol'])/(all_s['Norm_Vol'])
        date_influence = date_influence.rename('Pct_Change')
        long_t= pd.pivot_table(long_t.reset_index(), index=['TripDate'], values=['Norm_Vol'])
        all_s = pd.pivot_table(all_s.reset_index(), index=['TripDate'], values=['Norm_Vol'])
        
        if n == 0:
            in_df=None
            in_df2 = None
            df_back, df2_back = merge_df(long_t,all_s,drop_site,in_df,date_influence,in_df2)
        else:
            df_back, df2_back = merge_df(long_t,all_s,drop_site,df_back,date_influence,df2_back)
        
    return df_back, df2_back
Example #7
def calc_cleaning(df_calc, option=1):
    """ Take data, clean it, return for calculations
        option=1 means total minutes in raw or percent
        option=2 means for salaries
     """
    df_calc.fillna(0, inplace=True)  # Replace NaN with 0 for calcs
    df_calc['total_time'] = ((df_calc['hours_spent'] * 60)
                             + df_calc['minutes_spent'])
    df_calc['username'] = df_calc['username'].apply(
        lambda x: x.lower())

    if option == 1:  # General, returns total minutes (raw or percent)
        table = pd.pivot_table(df_calc, values=['total_time'],
                               rows=['username'],
                               columns=['program_name'],
                               aggfunc=np.sum, margins=True)  # Pivot data
        return table.total_time

    else:  # option == 2
        # For salary
        df_calc['hourly_wage'] = df_calc[['time.user.userprofile.salary']] / 52 / 35  # Annual to Hourly
        df_calc.rename(columns={'time.user.userprofile.department.name': 'dept'},
                       inplace=True)
        df_calc['cost'] = ((df_calc['total_time'] / 60) *
                           df_calc['hourly_wage'])  # Cost by dept
        table = pd.pivot_table(df_calc, values=['cost'], rows=['dept'],
                               columns=['program_name'], aggfunc=np.sum,
                               margins=True)
        return table
Example #8
def getFitLine(df): # returns one-column df with avg speed by hr (index)
    df2 = copy.deepcopy(df)
    counter = 1
    while counter < len(df2)-6:
        diff = np.sqrt((df2.speeds[counter] - df2.speeds[counter-1])**2) 
        percentDiff = diff / (df2.speeds[counter-1] + .00000001)    
        if percentDiff > .3:
            df2.ix[counter, 'speeds'] = np.nan
            df2.ix[counter-1, 'speeds'] = np.nan
            df2.ix[counter+1, 'speeds'] = np.nan
            df2.ix[counter+2, 'speeds'] = np.nan
            df2.ix[counter+3, 'speeds'] = np.nan
            df2.ix[counter+4, 'speeds'] = np.nan
            counter += 6
        else: counter += 1
    # only keep entries where |altDelta| < 1 and speed > 0.5 (speed spikes were already NaN'd above)
    df2 = df2[(np.sqrt(df2.altDeltas**2) < 1.0) & (df2.speeds > 0.5)]    
    # consolidating to hr groups
    hrGroups = np.arange(60,200,5) # buckets five wide
    def assignHrGroups(x):
        for i in range(len(hrGroups)-1):
            if x >= hrGroups[i] and x < hrGroups[i+1]:
                return hrGroups[i]                             
    hrs = df2.hr
    hrs = hrs.map(assignHrGroups)
    df2['hrGroup'] = hrs    
    # making pivot table to consolidate by hr
    fitLine = pivot_table(df2, ['speeds'], 'hrGroup', aggfunc=np.mean) 
    f = pivot_table(df2, ['speeds'], 'hrGroup', aggfunc=len) 
    
    fitLine = DataFrame({'date': df.date[0], 'hr':fitLine.index, 'avgSpeed':fitLine.speeds, 'count':f.speeds})   
    fitLine = fitLine[fitLine['count'] >= 10] # only keeping groups with at least 10 entries
    return fitLine
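Two portability notes on Example #8: DataFrame.ix (used to blank out speed spikes) was removed in pandas 1.0 in favour of .loc, and the positional pivot_table(df2, ['speeds'], 'hrGroup', ...) call maps onto values= and index=. A small self-contained sketch of the keyword form on a toy frame:

import numpy as np
import pandas as pd

df2 = pd.DataFrame({
    'speeds': [2.1, 2.3, 2.2, 3.0, 2.9],
    'hrGroup': [120, 120, 125, 130, 130],
})

# df2.ix[counter, 'speeds'] = np.nan  ->  df2.loc[counter, 'speeds'] = np.nan
df2.loc[0, 'speeds'] = np.nan

# Keyword form of pivot_table(df2, ['speeds'], 'hrGroup', aggfunc=...):
fitLine = pd.pivot_table(df2, values=['speeds'], index='hrGroup', aggfunc='mean')
counts = pd.pivot_table(df2, values=['speeds'], index='hrGroup', aggfunc=len)
print(fitLine)
print(counts)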
Example #9
def mkfile(dfd, dfs):
	#first open file
	writer = pd.ExcelWriter('grouped_data_4heat.xlsx')
	dfd = dfd.sort_index()  # sort_index returns a copy; assign it back
	print(dfd)

	#using pivot_table method to create the half matrix
	pivotplay1 = pd.pivot_table(dfd, values="fc", index=['position1','mut_type1'], columns=['position2','mut_type2']) 
	
	#create the transposed half matrix
	pivotplay2 = pd.pivot_table(dfd, values="fc", index=['position2','mut_type2'], columns=['position1','mut_type1'])

	#sneakily rename the levels in transposed matrix
	pivotplay2.index.names = pivotplay1.index.names
	pivotplay2.columns.names = pivotplay1.columns.names

	#merge the two matrices
	combined_pivot = pivotplay1.combine_first(pivotplay2)
	combined_pivot.to_excel(writer,'combined_pivot')

   	#need to fill the ID portion of the matrix, using the set_value fxn: 
	#df.set_value('Col', 'Index', value) or using some nifty solns found here: http://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/
	print(dfs)

	def fill_id(x):
		if pd.isnull(x):
			if dfs('mut_type', 'position') == combined_pivot(x, 'mut_type1','position1'):
				return dfs('fc')
			else:
				return 0

	combined_pivot = combined_pivot.applymap(fill_id)  # applymap returns a new frame
	writer.save()  # flush grouped_data_4heat.xlsx to disk
Example #10
def pivot_one_sfdc(sfdc_file, sfdc_load_key, sfdc_pivot_key, sfdc_pivot_header):
    if sfdc_file is None or not os.path.isfile(sfdc_file) or not os.path.exists(sfdc_file):
        return None

    sfdc_dataframe = pd.read_csv(sfdc_file, index_col=sfdc_load_key)

    # generate pivot file without big deal filtering
    pivoted_file_name = os.path.join(os.path.dirname(sfdc_file), "Pivot_" + os.path.basename(sfdc_file))

    pivot_dataframe = pd.pivot_table(sfdc_dataframe, index=sfdc_pivot_key, values=sfdc_pivot_header, aggfunc=np.sum)

    pivot_dataframe.to_csv(pivoted_file_name)

    # generate pivot file with big deal as filtering
    pivoted_file_name = os.path.join(os.path.dirname(sfdc_file), "Pivot_bigdealfiltering" + os.path.basename(sfdc_file))

    pivot_dataframe = pd.pivot_table(sfdc_dataframe, index=sfdc_pivot_key, columns=["BIG DEAL"],
                                     values=sfdc_pivot_header, aggfunc=np.sum)

    pivot_dataframe.to_csv(pivoted_file_name)

    # generate pivot file with big deal as index
    pivoted_file_name = os.path.join(os.path.dirname(sfdc_file), "Pivot_bigdealindexing" + os.path.basename(sfdc_file))

    index_keys = [sfdc_pivot_key, "BIG DEAL"]

    pivot_dataframe = pd.pivot_table(sfdc_dataframe, index=index_keys, values=sfdc_pivot_header, aggfunc=np.sum)

    pivot_dataframe.to_csv(pivoted_file_name)

    return pivoted_file_name
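For the single-key sums in Example #10, pivot_table(..., aggfunc=np.sum) and a plain groupby(...).sum() produce the same table, so the choice is mostly one of readability. A sketch on an invented frame (the column names here are placeholders, not the real SFDC headers):

import numpy as np
import pandas as pd

sfdc = pd.DataFrame({
    'Owner': ['a', 'a', 'b'],
    'Amount': [100, 50, 200],
})

via_pivot = pd.pivot_table(sfdc, index='Owner', values='Amount', aggfunc=np.sum)
via_groupby = sfdc.groupby('Owner')[['Amount']].sum()
print(via_pivot)
print(via_groupby)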
Example #11
    def blockify_data(self, data, get_var=False, measures=['rt', 'acc']):

        data = data.copy()
        if self.blocksCol not in data.columns:
            data = blockify_trials(data, nblocks=self.nblocks)
        goDF = data[data.response==1.]
        ssDF = data[data.ttype==0.]
        tableList = []
        if 'rt' in measures:
            rtTable = pd.pivot_table(goDF, values='rt', columns=self.blocksCol, index='idx')
            tableList.append(rtTable)
        if 'acc' in measures:
            ssTable = pd.pivot_table(ssDF, values='acc', columns=self.blocksCol, index='idx')
            tableList.append(ssTable)
        if 'bound' in measures:
            scoreTable = pd.pivot_table(data, values='bound', columns=self.blocksCol, index='idx')
            tableList.append(scoreTable)
        if 'score' in measures:
            scoreTable = pd.pivot_table(data, values='score', columns=self.blocksCol, index='idx')
            tableList.append(scoreTable)
        if 'vTrial' in measures:
            vTable = pd.pivot_table(data, values='drift', columns=self.blocksCol, index='idx')
            tableList.append(vTable)
        if get_var:
            blockedMeasures = [[table.mean().values, table.sem().values*1.96] for table in tableList]
            blockedMeasures = list(itertools.chain.from_iterable(blockedMeasures))
        else:
            blockedMeasures = [table.mean().values for table in tableList]
        return blockedMeasures
Example #12
def main(in_file='avito_train.tsv'):
   # Takes 1min on full training set
   # category, subcategory, attr, is_blocked
   df = pd.read_csv(in_file, sep='\t', usecols=np.array([1,2,5,8]))
   frac_blocked(df)
   # view totals in category/subcategory
   print pd.pivot_table(df,'is_blocked','subcategory','category',aggfunc='count')
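The positional call in Example #12 reads as values='is_blocked', index='subcategory', columns='category'. A self-contained sketch of the keyword form on a toy frame:

import pandas as pd

df = pd.DataFrame({
    'category': ['cars', 'cars', 'pets'],
    'subcategory': ['sedan', 'suv', 'dogs'],
    'is_blocked': [0, 1, 0],
})

# Keyword equivalent of pd.pivot_table(df, 'is_blocked', 'subcategory', 'category', aggfunc='count')
totals = pd.pivot_table(df, values='is_blocked', index='subcategory',
                        columns='category', aggfunc='count')
print(totals)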
Example #13
def understand_data(input_list):
    for choice in input_list:
        if choice == 1:
            print raw_data.head(10)
            # print pd.unique(raw_data.TripType)
            print "Count of unique product Upc", int(pd.DataFrame(pd.unique(raw_data.Upc)).count())
            print "Count of unique product department descriptions", int(
                pd.DataFrame(pd.unique(raw_data.DepartmentDescription)).count()
            )
        elif choice == 2:
            share_of_trip_type = pd.DataFrame(
                raw_data.groupby(["TripType"], axis=0)["VisitNumber"].count() * 100 / len(raw_data)
            )
            print share_of_trip_type

            products_departments = pd.DataFrame(
                raw_data.groupby(["DepartmentDescription"], axis=0)["Upc"].nunique()
            )  # http://stackoverflow.com/questions/15411158/pandas-countdistinct-equivalent

            print products_departments
        elif choice == 3:
            # http://pandas.pydata.org/pandas-docs/stable/reshaping.html
            # department_triptype_pivot = pd.pivot_table(raw_data, values='VisitNumber', index='DepartmentDescription', columns='TripType', aggfunc=np.size)
            # print department_triptype_pivot

            department_finelinenum_pivot = pd.pivot_table(
                raw_data, values="VisitNumber", index="FinelineNumber", columns="DepartmentDescription", aggfunc=np.size
            )
            print department_finelinenum_pivot

            department_weekday_pivot = pd.pivot_table(
                raw_data, values="VisitNumber", index="DepartmentDescription", columns="Weekday", aggfunc=np.size
            )
            print department_weekday_pivot

            Weekday_trip_type_pivot = pd.pivot_table(
                raw_data, values="VisitNumber", index="TripType", columns="Weekday", aggfunc=np.size
            )
            print Weekday_trip_type_pivot

        elif choice == 10:
            # http://stackoverflow.com/questions/21654635/scatter-plots-in-pandas-pyplot-how-to-plot-by-category
            groups = raw_data.groupby("TripType")
            fig, ax = plt.subplots()
            for name, group in groups:
                # print name
                # print group.DepartmentDescription
                ax.plot(group.ScanCount, group.Weekday_num, marker="o", linestyle="", ms=5, label=name)
                ax.legend()
            plt.show()

        elif choice == 20:
            # (pd.DataFrame(pd.unique(raw_data.TripType))).to_csv('Unique_trip_types.csv',sep = ',',index = False)
            # share_of_trip_type.to_csv('TripType_percentage_share.csv',sep = ',')
            # products_departments.to_csv('Unique products per department.csv',sep = ',')
            # department_triptype_pivot.to_csv('DepartmentDescription trip Type visit number frequency pivot table.csv',sep=',')
            # department_finelinenum_pivot.to_csv('DepartmentDescription finelinenumber visit number frequency pivot table.csv',sep=',')
            # department_weekday_pivot.to_csv('DepartmentDescription Weekday visit number frequency pivot table.csv',sep=',')
            Weekday_trip_type_pivot.to_csv("TripType Weekday visit number frequency pivot table.csv", sep=",")
Example #14
def _count_departments(data_items):
  # create table of trip_id/count_departments
  count_department = pd.pivot_table(data_items, index = ['trip_id', 'department_name'])
  count_department.reset_index(drop = False, inplace = True)
  count_department = pd.pivot_table(count_department, index = 'trip_id', aggfunc = 'count')['department_name']
  count_department_df = pd.DataFrame(count_department)
  count_department_df.rename(columns={'department_name': 'department_count'}, inplace=True)
  return count_department_df
Example #15
 def analyze_trials(self, resultsDF):
     goDF = resultsDF[resultsDF.response==1.]
     ssDF = resultsDF[resultsDF.ttype==0.]
     rtBlocks = pd.pivot_table(goDF, values='rt', columns=self.blocksCol, index='idx').mean().values
     saccBlocks = pd.pivot_table(ssDF, values='acc', columns=self.blocksCol, index='idx').mean().values
     rtErr = np.sum((rtBlocks*10 - self.rtBlocks*10)**2)
     saccErr = np.sum((saccBlocks - self.saccBlocks)**2)
     return rtErr + saccErr
Example #16
def getrecommendedbeer(x, y, k=20, address = "shippingProvince"):
    #get number of customer groups
    n = max(x.customerGroup) + 1
    #create the output list
    customer_output = []
    #get top beers for each group
    top_beer = []
    for i in range(n):
        #split data by two groups
        group1 = x.loc[x.customerGroup == i,['productTitle','totalSales']]
        group2 = x.loc[x.customerGroup != i,['productTitle','totalSales']]
        group1 = pd.pivot_table(group1, index = ['productTitle'],
                                values = ['totalSales'], aggfunc = np.sum, fill_value = 0)
        group2 = pd.pivot_table(group2, index = ['productTitle'],
                                values = ['totalSales'], aggfunc = np.sum, fill_value = 0)
        #if # of recommended beers is larger than # of beers in that group, append top beers from other groups
        if group1.shape[0] >= k:
            beer_list = group1.sort('totalSales',ascending = False)[0:k]
            beer_recommended = beer_list.index.values.tolist()
        else:
            beer_list1 = group1.sort('totalSales',ascending = False)
            beer_list2 = group2.sort('totalSales',ascending = False)[0:k-group1.shape[0]]
            beer_recommended1 = beer_list1.index.values.tolist()
            beer_recommended2 = beer_list2.index.values.tolist()
            beer_recommended = beer_recommended1 + beer_recommended2
        top_beer.append(beer_recommended)
    #get predicted recommended beers for new customers
    more = y.get("bestBeers",[]) + y.get("rareBeers",[]) + y.get("discountedBeers",[])
    new = {}
    new["address"] = "new customer"
    new["customerGroup"] = "none"
    newx = pd.pivot_table(x,index=['productTitle'],values=['totalSales'],aggfunc=np.sum,fill_value=0)
    rec = newx.sort('totalSales',ascending = False).index.values.tolist()[0:k]
    add = []
    for item in more:
        if item not in rec:
            add.append(item)
    new["recommendedBeer"] = rec[0:k-len(add)] + add
    customer_output.append(new)
    #reshape data
    x = pd.pivot_table(x, index = [address],
                            values = ['customerGroup'], aggfunc = np.min, fill_value = 0)
    #get recommended beers for each customer
    for i in range(x.shape[0]):
        customer = {}
        customer[address] = x.index.values.tolist()[i]
        label = x["customerGroup"][i]
        customer["customerGroup"] = label
        #add top beers, rare beers and discounted beers
        rec = top_beer[label]
        add = []
        for item in more:
            if item not in rec:
                add.append(item)
        #note that the number of more beers <= k
        customer["recommendedBeer"] = rec[0:k-len(add)] + add
        customer_output.append(customer)
    return(customer_output)
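Both beer-recommendation helpers call DataFrame.sort, which was removed in pandas 0.20; the surviving spelling is sort_values. A minimal sketch of the top-k selection with the modern call, on invented sales data:

import numpy as np
import pandas as pd

sales = pd.DataFrame({'productTitle': ['ipa', 'stout', 'lager', 'ipa'],
                      'totalSales': [30, 10, 20, 5]})
group1 = pd.pivot_table(sales, index=['productTitle'], values=['totalSales'],
                        aggfunc=np.sum, fill_value=0)

# group1.sort('totalSales', ascending=False)  ->  group1.sort_values(...)
beer_recommended = (group1.sort_values('totalSales', ascending=False)
                          .head(2).index.tolist())
print(beer_recommended)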
Example #17
def tabulate_data( allele_df, dbh ):

    buf = []
    buf2 = []
    buf3 = []

    table = pivot_table( allele_df,
                            index='sample_id', columns='marker_id', values='value',
                            aggfunc = lambda x: tuple(x) )

    heights = pivot_table( allele_df,
                            index='sample_id', columns='marker_id', values='height',
                            aggfunc = lambda x: tuple(x) )

    assay_ids = pivot_table( allele_df,
                            index='sample_id', columns='marker_id', values='assay_id',
                            aggfunc = lambda x: tuple(x) )

    buf.append( tuple( ['Sample', 'ID'] +
                    [ dbh.get_marker_by_id(x).code for x in table.columns ] ) )
    buf2.append( tuple( ['Sample', 'ID'] +
                    [ dbh.get_marker_by_id(x).code for x in heights.columns ] ) )
    buf3.append( tuple( ['Sample', 'ID'] +
                    [ dbh.get_marker_by_id(x).code for x in assay_ids.columns ] ) )


    empty = tuple()

    rows = [ ((dbh.get_sample_by_id(r[0]).code,), (r[0],)) + r[1:]
                for r in table.itertuples() ]
    rows.sort()

    height_rows = [ ((dbh.get_sample_by_id(r[0]).code,), (r[0],)) + r[1:]
                                for r in heights.itertuples() ]
    height_rows.sort()

    assayid_rows = [ ((dbh.get_sample_by_id(r[0]).code,), (r[0],)) + r[1:]
                                for r in assay_ids.itertuples() ]
    assayid_rows.sort()

    for row in rows:
        data = [ x if type(x) == tuple else empty for x in row ]
        for cols in zip_longest( *data, fillvalue='' ):
            buf.append( cols )

    for height_row in height_rows:
        data = [ x if type(x) == tuple else empty for x in height_row ]
        for cols in zip_longest( *data, fillvalue='' ):
            buf2.append( cols )

    for assayid_row in assayid_rows:
        data = [ x if type(x) == tuple else empty for x in assayid_row ]
        for cols in zip_longest( *data, fillvalue='' ):
            buf3.append( cols )


    return (buf, buf2, buf3)
Example #18
def calc_averages(mc_t,gc_t):
    pvt_mc = pd.pivot_table(mc_t, index=['TripDate'],values=['Thickness'],aggfunc=np.nanmean).reset_index()
    pvt_gc = pd.pivot_table(gc_t, index=['TripDate'],values=['Thickness'],aggfunc=np.nanmean).reset_index()
    pvt_mc = pvt_mc[(pvt_mc['TripDate'] == '1990-06-10') | (pvt_mc['TripDate'] == '2003-09-20') | (pvt_mc['TripDate'] == '2016-10-01')]
    pvt_gc = pvt_gc[(pvt_gc['TripDate'] == '1990-06-10') | (pvt_gc['TripDate'] == '2003-09-20') | (pvt_gc['TripDate'] == '2016-10-01')]
    pvt_mc = pvt_mc.rename(columns={'Thickness':'Marble Canyon Average Thickness'})
    pvt_gc = pvt_gc.rename(columns={'Thickness':'Grand Canyon Average Thickness'})
    merge = pvt_mc.merge(pvt_gc, left_on='TripDate', right_on='TripDate',how='left')
    return merge
Example #19
def getsimilarbeer(x, beerIds_dict, k = 20, l = 40):
    #add 1 to k in order to remove the beer itself from the similar beer list later on
    k = k + 1
    #get number of beer groups
    n = max(x.beerGroup) + 1
    #create the output list
    beer_output = []
    #get top beers for each group
    top_beer = []
    for i in range(n):
        #split data by two groups
        group1 = x.loc[x.beerGroup == i,['productTitle','totalSales']]
        group2 = x.loc[x.beerGroup != i,['productTitle','totalSales']]
        group1 = pd.pivot_table(group1, index = ['productTitle'],
                                values = ['totalSales'], aggfunc = np.sum, fill_value = 0)
        group2 = pd.pivot_table(group2, index = ['productTitle'],
                                values = ['totalSales'], aggfunc = np.sum, fill_value = 0)
        #if # of similar beers is larger than # of beers in that group, append top beers from other groups
        if group1.shape[0] >= k:
            beer_list = group1.sort('totalSales',ascending = False)[0:k]
            beer_similar = beer_list.index.values.tolist()
        else:
            beer_list1 = group1.sort('totalSales',ascending = False)
            beer_list2 = group2.sort('totalSales',ascending = False)[0:k-group1.shape[0]]
            beer_similar1 = beer_list1.index.values.tolist()
            beer_similar2 = beer_list2.index.values.tolist()
            beer_similar = beer_similar1 + beer_similar2
        top_beer.append(beer_similar)
    #get predicted similar beers for new beers
    #when new beers are available, they won't have a group label because they have no sales
    #so the similar beers of "new beers" will be best-selling beers for now
    new = {}
    new["beerId"] = ''
    new["productTitle"] = "new beer"
    new["beerGroup"] = "none"
    newx = pd.pivot_table(x,index=['productTitle'],values=['totalSales'],aggfunc=np.sum,fill_value=0)
    new["similarBeer"] = newx.sort('totalSales',ascending = False).index.values.tolist()[0:l]
    beer_output.append(new)
    #reshape data
    x = pd.pivot_table(x, index = ['productTitle'],
                            values = ['beerGroup'], aggfunc = np.min, fill_value = 0)
    #get similar beers for each product
    for i in range(x.shape[0]):
        beer_item = {}
        beer_item["productTitle"] = x.index.values.tolist()[i]
        beer_item["beerId"] = beerIds_dict.get(beer_item.get("productTitle",''),'')
        label = x["beerGroup"][i]
        beer_item["beerGroup"] = label
        #check whether the beer itself is in the similar beer list
        if beer_item.get("productTitle") in top_beer[label]:
            new_top_beer = [item for item in top_beer[label] if item != beer_item.get("productTitle")]
            beer_item["similarBeer"] = new_top_beer
        else:
            beer_item["similarBeer"] = top_beer[label][0:k-1]
        beer_output.append(beer_item)
    return(beer_output)
Example #20
def prepare_data( allele_df ):

    #buf = io.StringIO()

    #temp_csv = csv.writer( buf, delimiter='\t' )
    buf = []
    buf2 = []
    buf3 = []

    table = pivot_table( allele_df, rows='sample_id', cols='marker_id', values='value',
                            aggfunc = lambda x: tuple(x) )

    heights = pivot_table( allele_df, rows='sample_id', cols='marker_id', values='height',
                            aggfunc = lambda x: tuple(x) )

    assay_ids = pivot_table( allele_df, rows='sample_id', cols='marker_id', values='assay_id',
                            aggfunc = lambda x: tuple(x) )

    buf.append( tuple( ['Sample'] + 
                    [ Marker.get(x).code for x in table.columns ] ) )
    buf2.append( tuple( ['Sample'] + 
                    [ Marker.get(x).code for x in heights.columns ] ) )
    buf3.append( tuple( ['Sample'] +
                    [ Marker.get(x).code for x in assay_ids.columns ] ) )


    empty = tuple()

    rows = [ (((Sample.get(r[0]).code, r[0]),),) + r[1:] for r in table.itertuples() ]
    rows.sort()

    height_rows = [ (((Sample.get(r[0]).code, r[0]),),) + r[1:]
                                for r in heights.itertuples() ]
    height_rows.sort()

    assayid_rows = [ (((Sample.get(r[0]).code, r[0]),),) + r[1:]
                                for r in assay_ids.itertuples() ]
    assayid_rows.sort()

    for row in rows:
        data = [ x if type(x) == tuple else empty for x in row ]
        for cols in zip_longest( *data, fillvalue='' ):
            buf.append( cols )

    for height_row in height_rows:
        data = [ x if type(x) == tuple else empty for x in height_row ]
        for cols in zip_longest( *data, fillvalue='' ):
            buf2.append( cols )

    for assayid_row in assayid_rows:
        data = [ x if type(x) == tuple else empty for x in assayid_row ]
        for cols in zip_longest( *data, fillvalue='' ):
            buf3.append( cols )


    return (buf, buf2, buf3)
Example #21
def main(fh_xls):
    
    # fh_xls=sys.argv[1]
    info=pd.read_excel(fh_xls,'info')
    info=info.set_index('varname')
    for var in info.iterrows() :
        val=info['input'][var[0]]
        if not pd.isnull(val):
            exec("%s=info['input']['%s']" % (var[0],var[0]),locals(), globals())
        else:
            exec("%s=info['default']['%s']" % (var[0],var[0]),locals(), globals())

    #../tests/test.xlsx
    # fh_xls='../test/test.xlsx'
    data_job=pd.read_excel(fh_xls,'JobView')
    # nd_dh="/media/Transcend/20160219_000356_267"
    data_fns=pd.pivot_table(data_job,values='File Name',index='Loop_bleach Index',columns='Well Name', aggfunc=lambda x: x.iloc[0])
    data_fns_P =pd.pivot_table(data_job,values='File Name',index='TimeLapse1 Index',columns='Well Name', aggfunc=lambda x: x.iloc[0])
    data_fns   =pd.concat([data_fns,data_fns_P],axis=0)

    wells=[str(x) for x in list(data_job['Well Name'].unique())]
    wells.sort()
    wells_b, wells_u =wells[::2],wells[1::2]


    pt00s=range(24)
    for rowi in range(2,16):
        for coli in range(12):
            pt00s.append(coli) 
    pt00s_all=[None] * 384        
    pt00s_all[::2]=pt00s
    pt00s_all[1::2]=pt00s
    info_pt00s=pd.DataFrame({'well' : wells, \
                            'pt00': pt00s_all})
    info_pt00s=info_pt00s.set_index('well')
    time=np.array(range(12))*2
    # data_num=pd.DataFrame(columns=wells)
    # data_num.loc[:,'time']=time

    if not exists("%s.data_num_kin" % (fh_xls)):
        data_num_kin=pd.DataFrame(columns=wells)
        data_num_kin.loc[:,'time']=time
        for well in wells:
            logging.info("processing: %s" % well) 
            nd_fns=data_fns[well].dropna().unique()
            well_kin=nd2kins(nd_fns,nd_dh,time_increment)
            if not pd.isnull(info_pt00s.loc[well])[0]:
                pt00=int(info_pt00s.loc[well])
                data_num_kin[well]=well_kin[pt00:pt00+12].values
            else :   
                data_num_kin[well]=well_kin[0:12].values
        data_num_kin.to_csv("%s.data_num_kin" % (fh_xls))
    else:
        logging.info("already processed: %s.data_num_kin" % (fh_xls))
    data_num_kin=pd.read_csv("%s.data_num_kin" % (fh_xls))
    data_num_kin2ana(data_num_kin,wells,wells_b,wells_u)
Example #22
def pathway_scores_from_variants(variants_df, pathway_df, index_field):
    """
    Similar to `pathway_scores_from_zscores`, but with different subsetting
    logic that makes sense with integer variants per gene.
    """
    x = variants_df.join(pathway_df)
    dfs = []
    dfs.append(index_converter(pd.pivot_table(x, index=index_field, aggfunc=np.sum), 'sum_var'))
    dfs.append(index_converter(pd.pivot_table(x, index=index_field, aggfunc=np.mean), 'mean_var'))
    return pd.concat(dfs)
Example #23
def make_dfs():
    """
    Creates dataframes for plotting using SQL queries.

    Returns a dataframe for each positional group (dfRB, dfWR, dfQB).
    """
    con = sqlite3.connect('data/nflPPdb.sqlite')
    df1 = pd.read_sql_query(
     'SELECT combine.name, combine.fortyyd, combine.heightinchestotal,\
     combine.weight, combine.twentyss, combine.vertical, combine.year\
     FROM combine\
     WHERE combine.year < 2009 AND combine.pickround != 0', con)
    df1['speedscore'] = (df1['weight']*200)/(df1['fortyyd']**4)
    df2 = pd.read_sql_query(
     'SELECT combine.name, combine.year, players.position\
     FROM combine, players\
     WHERE combine.name = players.name AND combine.year = players.draft_year',
     con)
    df3 = pd.merge(df1, df2, on=['name', 'year'],
                   how='inner', suffixes=('df1', 'df2'))
    df3 = df3.drop_duplicates(subset='name', keep=False)
    df4 = pd.read_sql_query(
     'SELECT DISTINCT combine.name, rr.rushing_yards, rr.receiving_yards\
     FROM combine, rr\
     WHERE combine.name = rr.name AND combine.year < 2009', con)
    df4 = pd.pivot_table(df4, index=['name'],
                         aggfunc=np.sum).reset_index().fillna(0)
    df4['totYds'] = (df4['receiving_yards'].fillna(0.0) +
                     df4['rushing_yards'].fillna(0.0)).astype(int)
    df5 = pd.merge(df3, df4, on='name', how='inner', suffixes=('df3', 'df4'))
    dfRB = df5[df5.position == 'RB']
    dfRB = dfRB[dfRB.fortyyd < 5]  # remove outliers
    dfWR = df5[df5.position == 'WR']
    dfWR = dfWR[dfWR.fortyyd < 5]  # remove outliers

    # Create QB data frame
    dfQB = pd.read_sql_query(
     'SELECT DISTINCT combine.name, combine.fortyyd, combine.heightinchestotal,\
     combine.weight, combine.twentyss, combine.vertical, passing.passing_yards\
     FROM combine, passing\
     WHERE combine.name = passing.name AND combine.year < 2009', con)
    # use to get 40 yard time back after aggregating
    dfQB['count'] = 1
    dfQB = pd.pivot_table(dfQB, index=['name'], aggfunc=np.sum).reset_index()
    dfQB['fortyyd'] = dfQB['fortyyd']/dfQB['count']
    dfQB['heightinchestotal'] = dfQB['heightinchestotal']/dfQB['count']
    dfQB['twentyss'] = dfQB['twentyss']/dfQB['count']
    dfQB['vertical'] = dfQB['vertical']/dfQB['count']
    dfQB['weight'] = dfQB['weight']/dfQB['count']
    dfQB['speedscore'] = (dfQB['weight']*200)/(dfQB['fortyyd']**4)
    dfQB = dfQB.drop('count', 1)
    # remove outliers
    dfQB = dfQB[dfQB.passing_yards > 175]

    return (dfRB, dfWR, dfQB)
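One more API note for make_dfs: dfQB.drop('count', 1) passes axis positionally, which newer pandas (2.0+) rejects; drop(columns='count') is the durable spelling. A one-line sketch on a toy frame:

import pandas as pd

dfQB = pd.DataFrame({'name': ['a', 'b'], 'count': [1, 1], 'fortyyd': [4.5, 4.7]})

# dfQB.drop('count', 1)  ->  keyword form accepted by current pandas:
dfQB = dfQB.drop(columns='count')
print(dfQB)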
Example #24
def GetQIE8DataFrame( rel_qie8_path ):
  sql_engine_qie8 = create_engine( rel_qie8_path )
  sql_con_qie8 = sql_engine_qie8.raw_connection()
  
  df_qie8 = pd.read_sql("SELECT * FROM QIE8CalibFNALNormal", con = sql_con_qie8, index_col = ['index'])
  df_qie8.drop(['Mode', 'Run'], axis=1, inplace=True)
  df_qie8_offset_pv = pd.pivot_table(df_qie8, index = ['QIE8ID', 'Ch'], columns=['Cap','Rng'], values='Offset')
  df_qie8_slope_pv = pd.pivot_table(df_qie8, index = ['QIE8ID', 'Ch'], columns=['Cap','Rng'], values='Slope')

  sql_con_qie8.close()
  return df_qie8_offset_pv, df_qie8_slope_pv
Example #25
    def SNP2Gene_breakdown(self,COB=None):
        '''
        Provides a breakdown of SNP to gene mapping parameters for each term in the Overlap.
        Includes the number of initial Loci, the number of collapsed Loci (within a window)
        and the number of candidate genes (within a window and up to a flank limit)

        Parameters
        ----------
        COB : str (default: 'average')
            If specified, the results will be composed only of SNP to gene
            mappings from a single COB network. If 'average' is specified,
            the results will be the SET of genes across all COB networks.
        '''
        # Get some help
        def bp_to_kb(bp):
            return "{}KB".format(int(bp/1000))
        def get_level(df,level):
            ''' Returns the level values by name '''
            level_index = df.columns.names.index(level)
            return df.columns.levels[level_index]
        # Prepare the data frame results
        if COB == None:
            results = self.results
        else:
            results = self.results.query('COB=="{}"'.format(COB))
        # Total for the Ionome
        ont = co.GWAS(self.results.Ontology.unique()[0])
        ref = co.COB(self.results.COB.unique()[0])._parent_refgen
        # Make an aggregate term
        total = co.Term('total',loci=set(chain(* [x.loci for x in ont.terms()])))
        # Calculate number of SNPs
        snps = pd.DataFrame(pd.pivot_table(results,index="Term",values='TermLoci'))
        snps.columns = pd.MultiIndex.from_product([['GWAS SNPs'],['-'],['-']],names=['Name','WindowSize','FlankLimit'])
        snps.ix['Total'] = len(total.loci)
        # Calculate number of Candidate Loci
        loci = pd.pivot_table(results,index="Term",columns=['WindowSize'],values='TermCollapsedLoci')
        for window_size in loci.columns:
            loci.ix['Total',window_size] = len(total.effective_loci(window_size))
        loci.columns = pd.MultiIndex.from_product([['Collapsed Loci'],list(map(bp_to_kb,loci.columns)),['-']],names=['Name','WindowSize','FlankLimit'])
        # Calculate number of Candidate Genes
        genes = pd.pivot_table(results,index='Term',columns=['WindowSize','FlankLimit'],values='gene',aggfunc=lambda x: len(set(x)))
        for window_size in get_level(genes,'WindowSize'):
            for flank_limit in get_level(genes,'FlankLimit'):
                genes.ix['Total',(window_size,flank_limit)] = len(ref.candidate_genes(total.effective_loci(window_size=window_size),flank_limit=flank_limit))
        genes.columns = pd.MultiIndex.from_product(
            [['Candidate Genes'],
                list(map(bp_to_kb,get_level(genes,"WindowSize"))),
                get_level(genes,'FlankLimit')
            ],
            names=['Name','WindowSize','FlankLimit']
        )
        results = snps.join(loci).join(genes)
        #ionome_eff_loci = [len()]
        return results
Example #26
def __UNUSED_prepare_array_like_data(action_hourly):
    """
    This turned out not to allow building the correspondence by user_id, so this function is unused.

    Builds data in the shape expected as scikit-learn input.
    The data to build is the per-hour mapping of whether each user logs in or not.

    Hourly login data from Monday through Sunday is the training input; the login data for a day in the following week is the answer.
    training
    ndarray:
    [[user_1_monday, user_1_tuesday, ... user_1_sunday],
    [user_2_monday, user_2_tuesday, ... user_2_sunday],
    ...
    [user_n_monday, user_n_tuesday, ... user_n_sunday]]

    gold:
    [login_user_1, login_user_2, ..., login_user_n]

    :return: Map {key int hour: value tuple train_ans_tuple (train_2d_array, ans_1d_array)}
    """
    dates = list(action_hourly.log_date.unique())
    dates_for_train = dates[:-1]
    date_for_gold = dates[-1]
    ifelse_func = lambda x: 1 if sum(x) > 7 else 0
    # select only the dates used for training
    df_for_train = action_hourly[action_hourly['log_date'].str.contains('|'.join(dates_for_train))]
    df_for_gold = action_hourly[action_hourly['log_date'].str.contains(date_for_gold)]

    stack_map = {}
    # build training and gold data for each hour
    for target_hour in range(0, 24):

        df_train_for_target_hour = df_for_train[df_for_train.log_hour == target_hour]
        df_gold_for_target_hour = df_for_gold[df_for_gold.log_hour == target_hour]

        # convert to 0/1 (logged in / not logged in) data
        training_data = pd.pivot_table(df_train_for_target_hour,
                                       index='user_id',
                                       columns='log_date',
                                       values='count',
                                       fill_value=0, aggfunc=ifelse_func).reset_index()
        array_training_data = training_data.drop(u'user_id', axis=1).values

        gold_data = pd.pivot_table(df_gold_for_target_hour,
                                       index='user_id',
                                       columns='log_date',
                                       values='count',
                                       fill_value=0, aggfunc=ifelse_func).reset_index()
        array_gold_data = gold_data.drop(u'user_id', axis=1).values
        array_gold_data_1d = np.transpose(array_gold_data)[0, :]

        # record the user ids, array_training_data and array_gold_data_1d in the map
        user_ids = training_data[u'user_id'].values
        stack_map[target_hour] = (user_ids, array_training_data, array_gold_data_1d)

    return stack_map
Example #27
    def test_pivot_table_with_iterator_values(self):
        # GH 12017
        aggs = {"D": "sum", "E": "mean"}

        pivot_values_list = pd.pivot_table(self.data, index=["A"], values=list(aggs.keys()), aggfunc=aggs)

        pivot_values_keys = pd.pivot_table(self.data, index=["A"], values=aggs.keys(), aggfunc=aggs)
        tm.assert_frame_equal(pivot_values_keys, pivot_values_list)

        agg_values_gen = (value for value in aggs.keys())
        pivot_values_gen = pd.pivot_table(self.data, index=["A"], values=agg_values_gen, aggfunc=aggs)
        tm.assert_frame_equal(pivot_values_gen, pivot_values_list)
Example #28
    def generateGraphData(self):
        safePrint('Generating and uploading data files')

        allData = read_table(self.combinedFile, sep='\t', na_filter=False, parse_dates=[0], infer_datetime_format=True)
        xcsList = [xcs for xcs in allData.xcs.unique() if xcs != 'ERROR' and xcs[0:4] != 'TEST' and xcs != '000-00']

        # filter type==DATA and site==wikipedia
        allData = allData[(allData['xcs'].isin(xcsList)) & (allData['site'] == 'wikipedia')]

        # By "iszero+via", e.g.  a,b,aO,bO,..., where 'a' == zero-rated, 'b' == non-zero-rated, and 'O' == Opera
        data = DataFrame(pivot_table(allData, 'count', ['date', 'xcs', 'via', 'iszero'], aggfunc=np.sum))
        data.reset_index(inplace=True)
        data['via'] = data.apply(lambda r: ('a' if r['iszero'][:1] == 'y' else 'b') + r['via'][:1], axis=1)
        data.drop('iszero', axis=1, inplace=True)
        self.createClippedData('RawData:YearDailyViaIsZero', data)
        self.createPeriodData('RawData:WeeklyViaIsZero', data, weekly)
        self.createPeriodData('RawData:MonthlyViaIsZero', data, monthly)

        allowedSubdomains = ['m', 'zero']
        data = allData[(allData.ison == 'y') & (allData.iszero == 'y') & (allData.subdomain.isin(allowedSubdomains))]
        data = DataFrame(pivot_table(data, 'count', ['date', 'xcs', 'subdomain'], aggfunc=np.sum))
        data.reset_index(inplace=True)

        self.createClippedData('RawData:YearDailySubdomains', data)
        self.createPeriodData('RawData:WeeklySubdomains', data, weekly)
        self.createPeriodData('RawData:MonthlySubdomains', data, monthly)

        # create an artificial yes/no/opera sums
        opera = allData[(allData.via == 'OPERA') & (allData.iszero == 'y')]
        opera['str'] = 'o'
        yes = allData[allData.iszero == 'y']
        yes['str'] = 'y'
        no = allData[allData.iszero == 'n']
        no['str'] = 'n'
        combined = opera.append(yes).append(no)
        data = DataFrame(pivot_table(combined, 'count', ['date', 'xcs', 'str'], aggfunc=np.sum))
        data.reset_index(inplace=True)

        headerFields = 'date,xcs,iszero,count'  # Override "str" as "iszero"
        self.createClippedData('RawData:YearDailyTotals', data, headerFields)
        self.createPeriodData('RawData:MonthlyTotals', data, monthly, headerFields)

        data = []
        for xcsId in list(allData.xcs.unique()):
            byLang = pivot_table(allData[allData.xcs == xcsId], 'count', ['lang'], aggfunc=np.sum) \
                .order('count', ascending=False)
            top = byLang.head(5)
            vals = list(top.iteritems())
            vals.append(('other', byLang.sum() - top.sum()))
            valsTotal = sum([v[1] for v in vals]) / 100.0
            data.extend(['%s,%s,%.1f' % (l, xcsId, c / valsTotal) for l, c in vals])

        self.saveWikiPage('RawData:LangPercent', data, 'lang,xcs,count')
Example #29
def area_data(df,section=None):
    if section is not None:
        df=df.query(section)
    tmp_pvt = pd.pivot_table(df, values=['Area_2D'], index=['TripDate'], aggfunc=np.std)
    tmp_pvt = tmp_pvt.rename(columns={'Area_2D':'std_dev'})
    tmp_count = pd.pivot_table(df,values=['Area_2D'], index=['TripDate'], aggfunc='count')
    tmp_count = tmp_count.rename(columns={'Area_2D':'count'})
    tmp_pvt['std_error'] = tmp_pvt['std_dev']/np.sqrt(tmp_count['count'])
    yerr = tmp_pvt[['std_error']]
    tmp_pvt = pd.pivot_table(df, values=['Area_2D'], index=['TripDate'], aggfunc=np.average)
    tmp_pvt['y_err']=yerr
    return tmp_pvt
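The three pivots in area_data (std, count and mean over Area_2D) can also be computed in one pass with groupby(...).agg, up to the ddof convention used for the standard deviation. A hedged sketch on a toy frame:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'TripDate': ['2016-10-01', '2016-10-01', '2003-09-20', '2003-09-20'],
    'Area_2D': [10.0, 14.0, 9.0, 11.0],
})

stats = df.groupby('TripDate')['Area_2D'].agg(['mean', 'std', 'count'])
stats['std_error'] = stats['std'] / np.sqrt(stats['count'])
print(stats)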
Example #30
def fit_decay_time(track_dfs, plot=True, savename=None):

    def expon_func(x, a, b, c):
        return a * np.exp(-b * x) + c

    # assemble all dataframes into one big df.
    full_data = [df for bid, df in track_dfs.items()]
    fdf = pd.concat(full_data, axis=0)
    fdf['m'] = np.array(fdf['minutes'], dtype=int)

    avg = pd.pivot_table(fdf, values='bl / s', cols='m', aggfunc=np.mean)
    counts = pd.pivot_table(fdf, values='bl / s', cols='m', aggfunc=len)

    x = np.array(avg.index) + 0.5
    y = np.array(avg)

    params, pcov = curve_fit(expon_func, x, y)

    if plot:

        fig = plt.figure(figsize=(10,5))
        ax = plt.subplot(axisbg='white')
        #ax.plot(fdf['minutes'], fdf['bl / s'], '.', alpha=0.1, label="Original Data")
        ax.plot(x, y, 'o', label="Binned Data", color='steelblue', alpha=0.5)
        ax.plot(x, expon_func(x, *params), '-', color='k', label="Fitted Curve")
        ax.set_ylim([0, 0.2])
        ax.legend(fontsize=15)
        ax.set_xlabel('time (min)', size=20)
        ax.set_ylabel('speed (bl / s)', size=20)
        plt.yticks(fontsize=15)
        plt.xticks(fontsize=15)
        if savename is not None:
            plt.savefig(savename)
        plt.show()

        fig = plt.figure(figsize=(10,3))
        ax = plt.subplot(axisbg='white')
        #fig, ax = plt.subplots(figsize=(10,3))
        x = np.array(counts.index)
        y = np.array(counts)

        ax.plot(x, y, 'k.-', label="Binned Data")
        ax.legend(loc='best', fontsize=15)
        ax.set_xlabel('time (min)', size=20)
        ax.set_ylabel('counts', size=20)
        ax.set_ylim([0, max(y)*1.1])

        plt.yticks(fontsize=15)
        plt.xticks(fontsize=15)

        plt.show()
    return fdf, params, pcov
Example #31
def get_hourly_data(df, values):
    df_c = df.copy()
    df_c["day"] = df_c.index.dayofyear
    df_c["hour"] = df_c.index.hour
    return pd.pivot_table(df_c, index=["hour"], columns=["day"], values=values)
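A usage sketch for get_hourly_data, assuming a frame indexed by a DatetimeIndex (the column name 'load' is invented): the result has one row per hour of day and one column per day of year.

import numpy as np
import pandas as pd

idx = pd.date_range('2021-01-01', periods=48, freq='h')
df = pd.DataFrame({'load': np.arange(48.0)}, index=idx)

hourly = get_hourly_data(df, values='load')   # function defined above
print(hourly.shape)   # expected: (24, 2) -- 24 hours x 2 days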
Example #32
def merge_pgr_data():
    with open("data/source/mapas-data-concentrado.xlsx", "rb") as f:
        dfs = pd.read_excel(f, sheet_name=None)

    df = dfs['00 PGR']
    pivot = pd.pivot_table(df,
                           index=["state_code", "municipio_code", "year"],
                           fill_value=0,
                           aggfunc=np.sum,
                           values=PROPS)
    pivot_dict = pivot.reset_index().to_dict("records")
    lookup = {}
    for row in pivot_dict:
        if not lookup.get(row['state_code']):
            lookup[row['state_code']] = {}

        if not lookup[row['state_code']].get(row['municipio_code']):
            lookup[row['state_code']][row['municipio_code']] = {}

        lookup[row['state_code']][row['municipio_code']][int(
            row['year'])] = dict(
                (prop, int(row.get(prop, 0))) for prop in PROPS)

    centers = {
        'type': 'FeatureCollection',
        'name': 'municipalescentroids',
        'features': [],
    }
    with codecs.open('data/source-geojson/municipales.json',
                     encoding='utf-8',
                     errors="replace") as f:
        data = json.load(f)
        for feature in data['features']:

            totals = dict((prop, 0) for prop in PROPS)
            state_code = int(feature['properties']['CVE_ENT'])
            mun_code = int(feature['properties']['CVE_MUN'])
            try:
                mun_data = lookup[state_code].get(mun_code)
            except KeyError:
                continue

            cumulative_fosas_data = []

            for year in range(2006, 2017):
                cumulative_dict = {'year': year}

                for prop in PROPS:
                    if mun_data and mun_data.get(year) and mun_data[year].get(
                            prop, 0) > 0:
                        feature['properties'][prop + '_' + str(year)] = int(
                            mun_data[year].get(prop))
                        totals[prop] += int(mun_data[year].get(prop))
                    else:
                        feature['properties'][prop + '_' + str(year)] = 0

                    if len(cumulative_fosas_data):
                        cumulative_dict[prop] = cumulative_fosas_data[-1].get(
                            prop) + feature['properties'][prop + '_' +
                                                          str(year)]
                        feature['properties'][
                            prop + '_cumulative_' +
                            str(year)] = cumulative_fosas_data[-1].get(
                                prop) + feature['properties'][prop + '_' +
                                                              str(year)]
                    else:
                        cumulative_dict[prop] = feature['properties'][
                            prop + '_' + str(year)]
                        feature['properties'][
                            prop + '_cumulative_' +
                            str(year)] = feature['properties'][prop + '_' +
                                                               str(year)]
                cumulative_fosas_data.append(cumulative_dict)

            for prop in PROPS:
                feature['properties'][prop + '_total'] = totals[prop]
                # if totals[prop] > maxes[prop]:
                # maxes[prop] = totals[prop]

            make_centroid = False
            for prop in PROPS:
                if feature['properties'][prop + '_total'] > 0:
                    make_centroid = True
                    break

            if make_centroid:
                shp = shape(feature['geometry'])
                feature_centroid = mapping(shp.representative_point())
                center_feature = deepcopy(feature)
                center_feature['geometry'] = feature_centroid
                centers['features'].append(center_feature)

            # Just adding to the ugliness. Only centroids have data...
            feature['properties'] = {
                'CVE_ENT': int(feature['properties']['CVE_ENT'])
            }

    with open('data/processed-geojson/pgr-centroids.json', 'w') as f:
        json.dump(centers, f)
Example #33
def merge_municipality_data():
    lookup = {}

    with open("data/source/mapas-data-concentrado.xlsx", "rb") as f:
        dfs = pd.read_excel(f, sheet_name=None)

    for sheetname, df in dfs.items():
        try:
            state_code, state_name = sheetname.split(' ', 1)
            state_code = int(state_code)
        except ValueError:
            print("%s is not a valid sheet name" % sheetname, file=sys.stderr)
            continue

        pivot = pd.pivot_table(df,
                               index=["municipio_code", "year"],
                               fill_value=0,
                               aggfunc=np.sum,
                               values=PROPS)
        pivot_dict = pivot.reset_index().to_dict("records")

        final_dict = {}
        for row in pivot_dict:
            if not final_dict.get(row['municipio_code']):
                final_dict[row['municipio_code']] = {'state_name': state_name}

            try:
                final_dict[row['municipio_code']][int(row['year'])] = dict(
                    (prop, int(row.get(prop, 0))) for prop in PROPS)
            except ValueError:
                print(sheetname)

        lookup[state_code] = final_dict

    maxes = {prop: 0 for prop in PROPS}
    centers = {
        'type': 'FeatureCollection',
        'name': 'municipalescentroids',
        'features': [],
    }
    with codecs.open('data/source-geojson/municipales.json',
                     encoding='utf-8',
                     errors="replace") as f:
        data = json.load(f)
        for feature in data['features']:
            totals = dict((prop, 0) for prop in PROPS)
            state_code = int(feature['properties']['CVE_ENT'])
            mun_code = int(feature['properties']['CVE_MUN'])
            mun_data = lookup[state_code].get(mun_code)

            cumulative_fosas_data = []

            for year in range(2006, 2017):
                cumulative_dict = {'year': year}

                if mun_data:
                    feature['properties']['state_name'] = mun_data[
                        'state_name']

                for prop in PROPS:
                    if mun_data and mun_data.get(year) and mun_data[year].get(
                            prop, 0) > 0:
                        feature['properties'][prop + '_' + str(year)] = int(
                            mun_data[year].get(prop))
                        totals[prop] += int(mun_data[year].get(prop))
                    # elif mun_data and mun_data.get(year) and mun_data[year].get(prop) == 0:
                    # feature['properties'][prop + '_' + str(year)] = 0
                    else:
                        feature['properties'][prop + '_' + str(year)] = 0

                    if len(cumulative_fosas_data):
                        cumulative_dict[prop] = cumulative_fosas_data[-1].get(
                            prop) + feature['properties'][prop + '_' +
                                                          str(year)]
                        feature['properties'][
                            prop + '_cumulative_' +
                            str(year)] = cumulative_fosas_data[-1].get(
                                prop) + feature['properties'][prop + '_' +
                                                              str(year)]
                    else:
                        cumulative_dict[prop] = feature['properties'][
                            prop + '_' + str(year)]
                        feature['properties'][
                            prop + '_cumulative_' +
                            str(year)] = feature['properties'][prop + '_' +
                                                               str(year)]

                cumulative_fosas_data.append(cumulative_dict)

            for prop in PROPS:
                feature['properties'][prop + '_total'] = totals[prop]
                if totals[prop] > maxes[prop]:
                    maxes[prop] = totals[prop]

            make_centroid = False
            for prop in PROPS:
                if feature['properties'][prop + '_total'] > 0:
                    make_centroid = True
                    break

            if make_centroid:
                shp = shape(feature['geometry'])
                feature_centroid = mapping(shp.representative_point())
                center_feature = deepcopy(feature)
                center_feature['geometry'] = feature_centroid
                centers['features'].append(center_feature)

            # Just adding to the ugliness. Only centroids have data...
            feature['properties'] = {
                'CVE_ENT': int(feature['properties']['CVE_ENT'])
            }

    with open('data/processed-geojson/municipales.json', 'w') as f:
        json.dump(data, f)

    with open('data/processed-geojson/municipales-centroids.json', 'w') as f:
        json.dump(centers, f)
Example #34
        # Get the handles and labels. For this example it'll be 2 tuples
        # of length 4 each.
        handles, labels = ax.get_legend_handles_labels()
        # When creating the legend, only use the first two elements
        # to effectively remove the last two.
        # l = ax.legend(handles[0:2], labels[0:2], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        loc = 'best'
        if metric == 'pearson_r2_score':
            loc = 'lower right'
        l = ax.legend(handles[0:3], split_names, frameon=False, loc=loc)
        ax.set_xlabel('')
        ax.set_ylabel(metric_name)
        ax.set_title(
            # f'ACNN performance on PDBbind {version} core subset\ncontaining {component_name} '
            f'Performance on PDBbind {version} Core Set\n'
            f'Dataset: PDBbind ({component_name})')
        ax.set_yticks(np.linspace(0, 1, 11))
        ax.set_ylim([0.5, 0.9])
        fig.savefig(root / f"{version}.{component}.{metric}.png", dpi=300)

#%%
d = df.loc[df['set'] == 'test']
d = d.loc[d['metric'] == 'pearson_r2_score']
pt = pd.pivot_table(d,
                    index=['version', 'split', 'component'],
                    columns=['subset'],
                    values=['value'],
                    aggfunc=[np.mean, np.std])
pt.to_csv(root / 'pivot_table.csv')
pt
#%%
encoder = OneHotEncoder().fit(train[feaures_list])
train = pd.concat([
    train,
    pd.DataFrame(encoder.transform(train[feaures_list]).toarray(),
                 index=train.index,
                 columns=encoder.get_feature_names(feaures_list))
],
                  axis=1)
train.drop(feaures_list, axis=1, inplace=True)
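# Note: newer scikit-learn versions (>= 1.0) rename OneHotEncoder.get_feature_names
# to get_feature_names_out; the call above targets older releases.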

columns = ['Product_ID', 'Product_Category_1']
for column in columns:
    top_100 = data[column].value_counts().index[:100]
    user_purchase = pd.pivot_table(data[['User_ID', column, 'Purchase']],
                                   values='Purchase',
                                   index='User_ID',
                                   columns=column,
                                   aggfunc=np.sum).fillna(0)[top_100]
    train = train.join(user_purchase)

train = train.join(data[['User_ID', 'Purchase']].groupby('User_ID').agg('sum'))

train_scaled = StandardScaler().fit_transform(train)

k_values = np.arange(1, 11)
models = []
dists = []
for k in k_values:
    model = KMeans(k).fit(train_scaled)
    models.append(model)
    dists.append(model.inertia_)
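# A minimal follow-up sketch (assumes matplotlib.pyplot is already imported as plt):
# plot the inertia against k and pick the "elbow" where the curve flattens.
plt.plot(k_values, dists, marker='o')
plt.xlabel('k (number of clusters)')
plt.ylabel('within-cluster sum of squares (inertia)')
plt.show()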
y.drop(y.columns[1], axis=1, inplace=True)
y = y.set_index("Year")

z = y / x
z.rename(columns={'Number of Cases Solved': 'Ratio'}, inplace=True)

dataset_cases = dataset_cases.join(z)
dataset_cases = dataset_cases.reset_index()
dataset_cases = dataset_cases.set_index(['Year', 'Ratio', 'Arrest'])
dataset_cases

# In[136]:

plt.figure(figsize=(30, 15))
heatmap1_data = pd.pivot_table(dataset_cases,
                               values="Number of Cases Solved",
                               index=['Arrest'],
                               columns='Year')
plt.title("Solved and Unsolved Arrets per Year")
cases_heatmap = sns.heatmap(heatmap1_data,
                            annot=True,
                            cmap="coolwarm",
                            center=2000)
cases_heatmap
plt.show()

# In[137]:

plt.figure(figsize=(20, 15))
sns.lineplot(data=dataset_cases,
             x=dataset_cases.index.get_level_values('Year'),
             y=dataset_cases.index.get_level_values('Ratio'))
2. Month: month
3. Day: day
4. PM: PM2.5 value
'''

import pandas as pd
import matplotlib.pyplot as plt

filename_path = 'data/data_pd/pm2.csv'

data_df = pd.read_csv(filename_path)

print('Basic information about the data:')
print(data_df.head())
print(data_df.info())
print(data_df.describe())
print(data_df.shape)

year_month_data = data_df.groupby(by=['Year', 'Month'])['PM'].mean()

pivot_results = pd.pivot_table(data_df,
                               index='Year',
                               columns='Month',
                               values=['PM'],
                               aggfunc='mean')

pivot_results.plot(kind='bar')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
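# For reference, the groupby result above can be reshaped into the same layout
# as the pivot table by unstacking the month level (a minimal sketch):
print(year_month_data.unstack(level='Month').head())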
Пример #38
0
        '/Users/zhouxiaocong/Documents/pythoncode/text/duoqingjianke/多情剑客无情剑.txt',
        'r'):

    adj['content'].append(line)
    words = pseg.cut(line)
    adj_cnt = 0
    cnt = 0
    for word, flag in words:
        if flag in ['Ag', 'a', 'ad', 'an']:
            adj_cnt += 1
        cnt += 1
    adj['cnt'].append(adj_cnt / cnt)
adj2 = pd.DataFrame(adj)
adj2 = adj2.sort_values(by='cnt', axis=0, ascending=False)
print(adj2[:5])
c = pd.pivot_table(adj2, index=['content'], values=['cnt'], aggfunc=np.sum)
c.to_csv(
    '/Users/zhouxiaocong/Documents/pythoncode/text/duoqingjianke/duoqingjianke.csv',
    mode='w',
    encoding='utf8')

# Load the background image
cloud_mask = np.array(
    Image.open(
        "/Users/zhouxiaocong/Documents/pythoncode/text/duoqingjianke/lxh.jpg"))
# Words to exclude from the display
#st=set(["东西","这是"])
# Create the WordCloud object
wc = WordCloud(
    background_color="white",
    mask=cloud_mask,
Пример #39
0
        'winnerset4', 'winnerset5', 'loserset1', 'loserset2', 'loserset3',
        'loserset4', 'loserset5'
    ]
    mathch_melt = pd.melt(mathch,
                          id_vars=match_column,
                          value_vars=['winner1id', 'loser1id'],
                          var_name='type',
                          value_name='palyer_id')
    mathch_melt = mathch_melt.sort_values(
        ['palyer_id', 'resultdate'], ascending=False).reset_index(drop=True)
    mathch_melt = mathch_melt.groupby(['palyer_id']).head(30)

    player = mathch_melt['palyer_id'].drop_duplicates()

    mathch_pivot = pd.pivot_table(mathch_melt,
                                  index=match_column,
                                  columns='type',
                                  fill_value=0).reset_index()
    match_column.extend(['loser1id', 'winner1id'])
    mathch_pivot.columns = match_column
    mathch_pivot[['loser1id',
                  'winner1id']] = mathch_pivot[['loser1id',
                                                'winner1id']].astype("int64")

    data_match = mathch_pivot.copy()
    data_match = data_match.sort_values('resultid')

    player_score, data_match = match_player_rank(data_match)
    player_score = player_score.sort_values(
        'score', ascending=False).reset_index(drop=True)
    player_score['rank'] = player_score.index + 1
    player_score[['rank', 'playerid']].to_csv('playe_rank.csv', index=False)
Пример #40
0
    fillstyle='none',
    label='50 percentile')
plt.plot(
    guide_date,
    weight_guide.P25.values,
    'ro:',
    fillstyle='none',
    label='25 percentile')
plt.legend(loc='upper left')

# ----------- plot daily feeding quantity --------------------
ax3 = fig.add_subplot(gs[2], sharex=ax2)
by_date = pd.pivot_table(
    df1,
    values='Vol',
    index=df1.index.date,
    columns=['BM'],
    aggfunc=np.sum,
    fill_value=0)
# Pandas bar plot doesn't work out of box, see link below
# https://stackoverflow.com/questions/49269927/missing-bars-in-matplotlib-bar-chart
rects_bm = plt.bar(
    by_date.index,
    by_date[True].values,
    bottom=by_date[False].values,
    #    width=0.3,
    label='breast milk',
    color='g')
rects_f = plt.bar(
    by_date.index, by_date[False].values, label='formula', color='r')
utils.autolabel(rects_bm, rects_f)
Пример #41
0
today = datetime.today()
df['PassD'] = df['Issue Resolution Date'] < today
Passdue = {True: 'Over due', False: 'Not due'}
df['Pass Due?'] = df['PassD'].map(Passdue)

# calc date 12 month backward ---------------------------------------------------xxx
df['datetoday'] = today
N = 365
date_N_days_ago = today - timedelta(days=N)

# get year/month form date object
df['y.notificationdate'] = pd.DatetimeIndex(df['Notification Date']).year
df['y.issueindentifiedate'] = pd.DatetimeIndex(df['Date Issue Identified']).year

# pivoting
df1 = pd.pivot_table(df, values='Issue ID',
                     index=['Issue Status', 'Pass Due?', 'Classification'],
                     aggfunc=pd.Series.nunique)
df2 = df1.xs('Open', level=0)
df3 = pd.pivot_table(df, values='Issue ID',
                     index=['Division', 'Organisation Level'],
                     columns='Source', aggfunc=pd.Series.nunique)
df4 = pd.pivot_table(df, values='Issue ID',
                     index=['Division', 'Organisation Level'],
                     columns='Classification', aggfunc=pd.Series.nunique)

# df2 = pd.pivot_table(df, values='Issue ID', index='y.notificationdate', columns=['Pass Due?', 'Classification'],
#                    aggfunc=pd.Series.nunique, )  # pd.Series.nunique
# df0 = pd.pivot_table(df, values='Issue ID', index=['Source', 'Issue Status', 'Pass Due?'], columns='Classification',
#                  aggfunc=pd.Series.nunique, )  # pd.Series.nunique
# df3 = pd.pivot_table(df, values='Issue ID', index=['Issue Status', 'Pass Due?','Classification'],
#                 aggfunc=pd.Series.nunique, )  # pd.Series.nunique
# df4 = pd.pivot_table(df, values='Issue ID', index='y.notificationdate', columns=['Pass Due?', 'Classification'],
#                 aggfunc=pd.Series.nunique, )  # pd.Series.nunique
Пример #42
0
def user_id_feture(data, end_time, beforeoneday):
    # data = Data[(Data['daystime']<LabelDay) & (Data['daystime']>LabelDay-datetime.timedelta(days=FEATURE_EXTRACTION_SLOT))]
    user_count = pd.crosstab(data.user_id, data.behavior_type)
    user_count_before5 = None
    if (((end_time - datetime.timedelta(days=5)) < datetime.datetime(
            2014, 12, 13, 0, 0, 0)) &
        ((end_time - datetime.timedelta(days=5)) > datetime.datetime(
            2014, 12, 10, 0, 0, 0))):
        beforefiveday = data[data['daystime'] >= end_time -
                             datetime.timedelta(days=5 + 2)]
        user_count_before5 = pd.crosstab(beforefiveday.user_id,
                                         beforefiveday.behavior_type)
    else:
        beforefiveday = data[data['daystime'] >= end_time -
                             datetime.timedelta(days=5)]
        user_count_before5 = pd.crosstab(beforefiveday.user_id,
                                         beforefiveday.behavior_type)

    user_count_before_3 = None
    if (((end_time - datetime.timedelta(days=5)) < datetime.datetime(
            2014, 12, 13, 0, 0, 0)) &
        ((end_time - datetime.timedelta(days=5)) > datetime.datetime(
            2014, 12, 10, 0, 0, 0))):
        beforethreeday = data[data['daystime'] >= end_time -
                              datetime.timedelta(days=3 + 2)]
        user_count_before_3 = pd.crosstab(beforethreeday.user_id,
                                          beforethreeday.behavior_type)
    else:
        beforethreeday = data[data['daystime'] >= end_time -
                              datetime.timedelta(days=3)]
        user_count_before_3 = pd.crosstab(beforethreeday.user_id,
                                          beforethreeday.behavior_type)

    user_count_before_2 = None
    if (((end_time - datetime.timedelta(days=5)) < datetime.datetime(
            2014, 12, 13, 0, 0, 0)) &
        ((end_time - datetime.timedelta(days=5)) > datetime.datetime(
            2014, 12, 10, 0, 0, 0))):
        beforethreeday = data[data['daystime'] >= end_time -
                              datetime.timedelta(days=7 + 2)]
        user_count_before_2 = pd.crosstab(beforethreeday.user_id,
                                          beforethreeday.behavior_type)
    else:
        beforethreeday = data[data['daystime'] >= end_time -
                              datetime.timedelta(days=7)]
        user_count_before_2 = pd.crosstab(beforethreeday.user_id,
                                          beforethreeday.behavior_type)

    # beforeoneday = Data[Data['daystime'] == LabelDay-datetime.timedelta(days=1)]
    beforeonedayuser_count = pd.crosstab(beforeoneday.user_id,
                                         beforeoneday.behavior_type)
    countAverage = user_count / FEATURE_EXTRACTION_SLOT
    buyRate = pd.DataFrame()
    buyRate['click'] = user_count[1] / user_count[4]
    buyRate['skim'] = user_count[2] / user_count[4]
    buyRate['collect'] = user_count[3] / user_count[4]
    buyRate.index = user_count.index

    buyRate_2 = pd.DataFrame()
    buyRate_2['click'] = user_count_before5[1] / user_count_before5[4]
    buyRate_2['skim'] = user_count_before5[2] / user_count_before5[4]
    buyRate_2['collect'] = user_count_before5[3] / user_count_before5[4]
    buyRate_2.index = user_count_before5.index

    buyRate_3 = pd.DataFrame()
    buyRate_3['click'] = user_count_before_3[1] / user_count_before_3[4]
    buyRate_3['skim'] = user_count_before_3[2] / user_count_before_3[4]
    buyRate_3['collect'] = user_count_before_3[3] / user_count_before_3[4]
    buyRate_3.index = user_count_before_3.index

    buyRate = buyRate.replace([np.inf, -np.inf], 0)
    buyRate_2 = buyRate_2.replace([np.inf, -np.inf], 0)
    buyRate_3 = buyRate_3.replace([np.inf, -np.inf], 0)

    long_online = pd.pivot_table(beforeoneday,
                                 index=['user_id'],
                                 values=['hours'],
                                 aggfunc=[np.min, np.max, np.ptp])

    user_id_feture = pd.merge(user_count,
                              beforeonedayuser_count,
                              how='left',
                              right_index=True,
                              left_index=True)
    user_id_feture = pd.merge(user_id_feture,
                              countAverage,
                              how='left',
                              right_index=True,
                              left_index=True)
    user_id_feture = pd.merge(user_id_feture,
                              buyRate,
                              how='left',
                              right_index=True,
                              left_index=True)
    user_id_feture = pd.merge(user_id_feture,
                              user_count_before5,
                              how='left',
                              right_index=True,
                              left_index=True)
    user_id_feture = pd.merge(user_id_feture,
                              user_count_before_3,
                              how='left',
                              right_index=True,
                              left_index=True)
    user_id_feture = pd.merge(user_id_feture,
                              user_count_before_2,
                              how='left',
                              right_index=True,
                              left_index=True)
    user_id_feture = pd.merge(user_id_feture,
                              long_online,
                              how='left',
                              right_index=True,
                              left_index=True)
    #    user_id_feture = pd.merge(user_id_feture,buyRate_2,how='left',right_index=True,left_index=True)
    #    user_id_feture = pd.merge(user_id_feture,buyRate_3,how='left',right_index=True,left_index=True)
    user_id_feture.fillna(0, inplace=True)
    return user_id_feture
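# A minimal usage sketch (assumed, following the commented-out lines above;
# Data, LabelDay and FEATURE_EXTRACTION_SLOT are module-level names that are
# not defined in this snippet):
# end_time = LabelDay
# beforeoneday = Data[Data['daystime'] == LabelDay - datetime.timedelta(days=1)]
# features = user_id_feture(Data, end_time, beforeoneday)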
Пример #43
0
import pandas as pd
import numpy as np
import os
import webbrowser

df_critics = pd.read_json("../output/albums.json")

# TODO: Create unique ids
# need to differentiate based on URL -- we have too much overlap from albums with similar names
ratings_df = pd.pivot_table(df_critics,
                            index="publication name",
                            columns="album id")

html = ratings_df.to_html(na_rep="")

with open("review_matrix.html", "w") as f:
    f.write(html)
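# A hedged sketch of the TODO above (assumes df_critics carries the review URL
# in a column named "album url", which is an assumption, not part of the data):
# derive a unique album id from the last URL path segment so that albums with
# similar names do not collide in the pivot.
#
# df_critics["album id"] = (df_critics["album url"]
#                           .str.rstrip("/")
#                           .str.split("/")
#                           .str[-1])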
Пример #44
0
import matplotlib.pyplot as plt

dfx = pd.DataFrame({
    'nama': ['a', 'b', 'a', 'b', 'a', 'c'],
    'class': ['x', 'x', 'x', 'x', 'x', 'x'],
    'type': ['y', 'z', 'y', 'z', 'y', 'z'],
    'nilai': ['1', '1', '1', '1', '2', '2']
})
dfy = pd.DataFrame({'nama': ['a', 'b', 'c']})
#print(dfx.nilai.unique())

listNilai = pd.DataFrame({'nilai': dfx.nilai.unique()})
#print(listNilai)

dfNamaNilai = pd.merge(listNilai, dfx, on='nilai')
dfPrint = pd.pivot_table(dfx,
                         values='nilai',
                         index=['nama'],
                         columns=['class', 'type'],
                         aggfunc='max')
print(dfPrint.index[2])
print(list(dfPrint))
print(dfPrint.iloc[2])
#print (dfPrint)
print('-----------------')
nilaiList = []
nilaiList.append([None if x != x else int(x) for x in dfPrint[('x', 'z')]])
print(nilaiList)

listNow = [1, 2, 3, 99, 70]
print(listNow[len(listNow) - 1])
Пример #45
0
'''
The task is to reshape the DataFrame using the pivot_table function and plot it as a heatmap.
The year column of the dataset is set as the columns, the month as the row index, and the passenger counts as the values.
After reshaping, use seaborn's heatmap function to plot the visualization.
'''

#### Heatmap ###############

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = sns.load_dataset(
    'flights')  # Reading flights dataset from seaborn package
# Reshaping the DataFrame
df = pd.pivot_table(df, values='passengers', index=['month'], columns='year')

# Plotting the heatmap
sns.heatmap(
    df, annot=True, fmt=''
)  ## The fmt parameter keeps the annotated numbers concise and easier to read.

plt.show()
Пример #46
0
def recomendaciones(usuario, elecciones):
    users = pd.read_csv("users.csv")
    images = pd.read_csv("images.csv")
    print(usuarios_test(pd.read_csv("election.csv"), users))
    clasificacion = pd.read_csv("election.csv")
    print(usuarios_test(clasificacion, users))
    n_users = clasificacion.idUser.unique().shape[0]
    n_images = clasificacion.IdImage.unique().shape[0]
    print(str(n_users) + ' users')
    print(str(n_images) + ' images')
    #plt.hist(clasificacion.election,bins=8)
    #plt.show()
    print("prueba")
    print(clasificacion.groupby(["election"])["idUser"].count())
    print("prueba")
    matrix = pd.pivot_table(clasificacion,
                            values='election',
                            index='idUser',
                            columns='IdImage').fillna(0)
    print(matrix)
    ratings = matrix.values
    sparsity = float(len(ratings.nonzero()[0]))
    sparsity /= (ratings.shape[0] * ratings.shape[1])
    sparsity *= 100
    print('Sparsity: {:4.2f}%'.format(sparsity))
    ratings_train, ratings_test = train_test_split(ratings,
                                                   test_size=0.1,
                                                   random_state=42)
    print(ratings_train.shape)
    print(ratings_train.shape[0])
    print(ratings_test.shape)
    sim_matrix = 1 - sklearn.metrics.pairwise.cosine_distances(ratings)
    print(sim_matrix.shape)
    #plt.imshow(sim_matrix);
    #plt.colorbar()
    #plt.show()
    #split the rows and columns into train and test
    tam_sim_matrix_test = ratings_train.shape[0] + ratings_test.shape[0]
    sim_matrix_train = sim_matrix[0:ratings_train.shape[0],
                                  0:ratings_train.shape[0]]
    sim_matrix_test = sim_matrix[ratings_train.shape[0]:tam_sim_matrix_test,
                                 ratings_train.shape[0]:tam_sim_matrix_test]
    print("MATRICES DE SIMUILITUD")
    print(sim_matrix_train.shape)
    print(sim_matrix_test.shape)
    users_predictions = sim_matrix_train.dot(ratings_train) / np.array(
        [np.abs(sim_matrix_train).sum(axis=1)]).T
    print("PREDICCIONES")
    print(users_predictions.shape)
    #plt.rcParams['figure.figsize'] = (20.0, 5.0)
    #plt.imshow(users_predictions)
    #plt.colorbar()
    #plt.show()
    imagen = 0
    puntaje = 0
    user = usuario
    data = users[users['user'] == user]
    usuario_ver = data.iloc[0][
        'idUser'] - 1  # subtract 1 to get the zero-based pandas index.
    print("USUARIO" + repr(usuario_ver))
    user0 = users_predictions.argsort()[usuario_ver]
    print(user0)
    # Look at the recommendations with the highest predicted scores for this user
    for i, aImage in enumerate(user0[-10:]):
        selImage = images[images['idImage'] == (aImage + 1)]
        print("ANTES" + repr(imagen))
        imagen = selImage.iloc[0]['idImage']
        print("DESPUES" + repr(imagen))
        puntaje = users_predictions[usuario_ver][aImage]
        if puntaje >= 0.51:
            elecciones['election'].append({
                'imagen': repr(imagen),
                'puntaje': repr(puntaje)
            })
            with open('election.json', 'w') as file:
                json.dump(elecciones, file, indent=4)
    return "exito"
Пример #47
0
print("n_features: ", X.shape[1])

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {'max_depth': [3, 5, 8, 10, 15, 20, 30],
              'max_features': [4, 8, 16, 20, 25, 40]}
grid = GridSearchCV(RandomForestClassifier(),
                    param_grid=param_grid)

# use [::10] to subsample by a factor of 10 for impatience
# could also have used StratifiedShuffleSplit(train_size=.1)
grid.fit(X_train[::10], y_train[::10])

res = pd.DataFrame(grid.cv_results_)
print(res.keys())
res_piv = pd.pivot_table(
    res, values='mean_test_score', index='param_max_depth',
    columns='param_max_features')

display(res_piv)

import matplotlib.pyplot as plt
%matplotlib inline
plt.matshow(res_piv.values)
plt.xlabel(res_piv.columns.name)
plt.xticks(range(res_piv.shape[1]), res_piv.columns)
plt.ylabel(res_piv.index.name)
plt.yticks(range(res_piv.shape[0]), res_piv.index)
plt.colorbar()
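# The tuned model can also be inspected directly (a minimal follow-up sketch):
print(grid.best_params_)
print("best cross-validated accuracy: {:.3f}".format(grid.best_score_))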
Пример #48
0
print(banks.isnull().sum())

bank_mode = banks.mode()
#print(bank_mode)

print(banks.columns.values)
cols = banks.columns.values
banks[cols] = banks[cols].fillna(bank_mode.iloc[0])
print(banks.isnull().sum())
#code ends here

# --------------
# Code starts here
avg_loan_amount = pd.pivot_table(banks,
                                 index=['Gender', 'Married', 'Self_Employed'],
                                 values='LoanAmount',
                                 aggfunc='mean')

print(avg_loan_amount)

#Another way to achieve this
temp_var = banks.groupby(['Gender', 'Married',
                          'Self_Employed'])['LoanAmount'].mean()
#print(temp_var)
# code ends here

# --------------
# code starts here
loan_approved_se = banks[(banks['Self_Employed'] == 'Yes')
                         & (banks['Loan_Status'] == 'Y')].shape[0]
loan_approved_nse = banks[(banks['Self_Employed'] == 'No')
Пример #49
0
# Cross-validated performance distribution
facet_grid = sns.factorplot(x='l1_ratio',
                            y='score',
                            col='alpha',
                            data=cv_score_df,
                            kind='violin',
                            size=4,
                            aspect=1)
facet_grid.set_ylabels('AUROC')

# In[23]:

# Cross-validated performance heatmap
cv_score_mat = pd.pivot_table(cv_score_df,
                              values='score',
                              index='l1_ratio',
                              columns='alpha')
ax = sns.heatmap(cv_score_mat, annot=True, fmt='.1%')
ax.set_xlabel('Regularization strength multiplier (alpha)')
ax.set_ylabel('Elastic net mixing parameter (l1_ratio)')

# ## Use Optimal Hyperparameters to Output ROC Curve

# In[24]:

y_pred_train = pipeline.decision_function(X_train)
y_pred_test = pipeline.decision_function(X_test)


def get_threshold_metrics(y_true, y_pred):
    roc_columns = ['fpr', 'tpr', 'threshold']
Пример #50
0
import pandas as pd
import numpy as np
import csv

# Load prediction rules from data files
U = pickle.load(open("user_features.dat", "rb"))
M = pickle.load(open("product_features.dat", "rb"))
predicted_ratings = pickle.load(open("predicted_ratings.dat", "rb"))

#print np.shape(predicted_ratings)
raw_dataset_df = pd.read_csv('out.csv')

#print raw_dataset_df
#to know the index value of user_id
ratings_df = pd.pivot_table(raw_dataset_df,
                            index='comp_id',
                            columns='user_id',
                            aggfunc=np.max)
#print ratings_df

db = mdb.connect(host="localhost",
                 user="******",
                 passwd="pragya",
                 db="recommendation")
cursor = db.cursor()
cursor.execute("select * from company;")
desc = cursor.description

with open("company2.csv", "wb") as csv_file:  # Python 2 version
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow([i[0] for i in desc])  # write headers
    csv_writer.writerows(cursor)
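# Note: the "wb" mode above is Python 2 specific (as the comment says); under
# Python 3 the equivalent would be open("company2.csv", "w", newline="").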
         markersize=7,
         markerfacecolor='blue',
         markeredgecolor='teal',
         data=transaction
         )
plt.title('Linear Relationship between Price of Item and Units Sold', fontdict=title_font)
plt.xlabel('Price of Item', fontdict=secondary_font)
plt.ylabel('Units Sold', fontdict=secondary_font)
plt.show()
# Not a linear relationship



## Feature by units sold pivot table and bar plot
feature_pivot = pd.pivot_table(data=transaction, 
                               index=['FEATURE'],
                               values=['UNITS'])

feature_pivot_t = pd.pivot_table(data=transaction, 
                               index=['FEATURE'],
                               values=['UNITS']).plot(kind='bar', 
                                                      color='teal',
                                                      title='Do Units Purchased vary by Feature Status?'
                                                      )
# There are far more units sold when featured than not



## Display by units sold pivot table and bar plot
display_pivot = pd.pivot_table(data=transaction,
                               index=['DISPLAY'],
def call_KM(genre1, genre2, genre3):
    movies = pd.read_csv('mysite/movies.csv')
    ratings = pd.read_csv('mysite/ratings.csv')

    # genre1='Adventure'
    # genre2='Sci-Fi'
    # genre3='Action'
    my_clusters = 0
    helper.set_Variables(genre1, genre2, genre3)

    genre_ratings = helper.get_genre_ratings(ratings, movies, [genre1, genre2],
                                             [Dict[genre1], Dict[genre2]])
    biased_dataset = helper.bias_genre_rating_dataset(genre_ratings, 3.2, 2.5)

    print("Number of records: ", len(biased_dataset))
    biased_dataset.head()
    helper.draw_scatterplot(biased_dataset[Dict[genre2]], Dict[genre2],
                            biased_dataset[Dict[genre1]], Dict[genre1],
                            'mysite/static/mysite/Normal.png')
    # plt.savefig('mysite/static/mysite/Normal.png')
    #
    # plt.close('mysite/static/mysite/Normal.png')

    X = biased_dataset[[Dict[genre2], Dict[genre1]]].values

    # TODO: Create an instance of KMeans to find two clusters
    kmeans_1 = KMeans(n_clusters=2, random_state=0)
    predictions = kmeans_1.fit_predict(X)
    helper.draw_clusters(biased_dataset, predictions,
                         'mysite/static/mysite/TwoCluster.png')
    # plt.savefig('mysite/static/mysite/TwoCluster.png')
    # plt.close('TwoCluster.png')

    # TODO: Create an instance of KMeans to find three clusters
    kmeans_2 = KMeans(n_clusters=3, random_state=1)
    predictions_2 = kmeans_2.fit_predict(X)
    helper.draw_clusters(biased_dataset, predictions_2,
                         'mysite/static/mysite/ThreeCluster.png')
    # plt.savefig('mysite/static/mysite/ThreeCluster.png')
    # plt.close('ThreeCluster.png')

    # TODO: Create an instance of KMeans to find four clusters
    kmeans_3 = KMeans(n_clusters=4, random_state=3)
    predictions_3 = kmeans_3.fit_predict(X)
    helper.draw_clusters(biased_dataset, predictions_3,
                         'mysite/static/mysite/FourCluster.png')
    # plt.savefig('mysite/static/mysite/FourCluster.png')
    # plt.close('FourCluster.png')

    possible_k_values = range(2, len(X) + 1, 5)
    errors_per_k = [helper.clustering_errors(k, X) for k in possible_k_values]
    list(zip(possible_k_values, errors_per_k))
    fig, ax = plt.subplots(figsize=(16, 6))
    ax.set_xlabel('K - number of clusters')
    ax.set_ylabel('Silhouette Score (higher is better)')
    ax.plot(possible_k_values, errors_per_k)
    fig.savefig('mysite/static/mysite/score.png')
    plt.close(fig)

    # Ticks and grid
    xticks = np.arange(min(possible_k_values), max(possible_k_values) + 1, 5.0)
    ax.set_xticks(xticks, minor=False)
    ax.set_xticks(xticks, minor=True)
    ax.xaxis.grid(True, which='both')
    yticks = np.arange(round(min(errors_per_k), 2), max(errors_per_k), .05)
    ax.set_yticks(yticks, minor=False)
    ax.set_yticks(yticks, minor=True)
    ax.yaxis.grid(True, which='both')

    # TODO: Create an instance of KMeans to find seven clusters
    kmeans_4 = KMeans(n_clusters=7, random_state=6)
    predictions_4 = kmeans_4.fit_predict(X)
    helper.draw_clusters(biased_dataset,
                         predictions_4,
                         'mysite/static/mysite/BestCluster.png',
                         cmap='Accent')
    # plt.savefig('mysite/static/mysite/BestCluster.png')
    # plt.close('BestCluster.png')

    biased_dataset_3_genres = helper.get_genre_ratings(
        ratings, movies, [genre1, genre2, genre3],
        [Dict[genre1], Dict[genre2], Dict[genre3]])
    biased_dataset_3_genres = helper.bias_genre_rating_dataset(
        biased_dataset_3_genres, 3.2, 2.5).dropna()
    print("Number of records: ", len(biased_dataset_3_genres))

    X_with_action = biased_dataset_3_genres[[
        Dict[genre2], Dict[genre1], Dict[genre3]
    ]].values

    # TODO: Create an instance of KMeans to find seven clusters
    kmeans_5 = KMeans(n_clusters=7)
    predictions_5 = kmeans_5.fit_predict(X_with_action)
    helper.draw_clusters_3d(biased_dataset_3_genres, predictions_5,
                            'mysite/static/mysite/3DCluster.png')
    # plt.savefig('mysite/static/mysite/3DCluster.png')
    # plt.close('3DCluster.png')

    #Merge the two tables then pivot so we have Users X Movies dataframe
    ratings_title = pd.merge(ratings,
                             movies[['movieId', 'title']],
                             on='movieId')
    user_movie_ratings = pd.pivot_table(ratings_title,
                                        index='userId',
                                        columns='title',
                                        values='rating')
    user_movie_ratings.iloc[:6, :10]
    n_movies = 30
    n_users = 18
    most_rated_movies_users_selection = helper.sort_by_rating_density(
        user_movie_ratings, n_movies, n_users)
    most_rated_movies_users_selection.head()

    helper.draw_movies_heatmap(most_rated_movies_users_selection,
                               'mysite/static/mysite/HeatMap.png')
def test():
    """
    import data
    """
    print("* importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    players = players[players.year_start >= 1980] # only choose players who started after 1980
    players["player_id"] = range(0,len(players.name)) # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]

    # only after 1980
    stats = stats[stats.Year >= 1980]

    # without duplicated names --> to do: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("* preparing data")
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS","AST","TOV","TRB","STL","BLK","3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG","FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

    allMetricsDict = {**metricsPerGameDict, **metricsPerCentDict, **metricsWeightedDict}
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)
    allMetrics = list(allMetricsDict.keys())

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats, values="Year", index="Player", columns = "year_count")
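    # A minimal sketch of the masking idea (assumed; the actual masking happens
    # inside the Donor/Target helpers): keep a player's pivoted stat only for
    # career years in which a season actually exists.
    # masked_pts = allPivotedTableDict["PTS_G"].where(df_year.notna())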

    """
    experiment setup
    """
    activePlayers = getActivePlayers(stats, 2016, 4)
    activePlayers.sort()
    activePlayers.remove("Kevin Garnett")
    activePlayers.remove("Kobe Bryant")

    # offMetrics = ["PTS_G","AST_G","TOV_G","3P_G","PER_w", "FG%","FT%"]
    # defMetrics = ["TRB_G","STL_G","BLK_G"]
    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97
    metrics_to_use = ["PTS_G","AST_G","TOV_G","3P_G","PER_w", "FG%","FT%","TRB_G","STL_G","BLK_G"]

    print("* start experiment")

    print("*******************")
    print("one at once, 10 models in total")
    pred_all = pd.DataFrame()
    true_all = pd.DataFrame()
    for playerName in activePlayers:
        target = Target(playerName, allPivotedTableDict, df_year)
        donor = Donor(allPivotedTableDict, df_year)

        mrsc = mRSC(donor, target, probObservation=1)

        player_pred = pd.DataFrame()
        player_true = pd.DataFrame()
        for metric in metrics_to_use:
            mrsc.fit_threshold([metric], [1.], 2016, pred_length = 1, threshold = threshold, setup = expSetup)
          
            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]
            
            player_pred = pd.concat([player_pred, pred], axis=0)
            player_true = pd.concat([player_true, true], axis=0)
        
        pred_all = pd.concat([pred_all, player_pred], axis=1)
        true_all = pd.concat([true_all, player_true], axis=1)

    with open('pred_all.pkl', 'wb') as f:
        pickle.dump(pred_all, f, pickle.HIGHEST_PROTOCOL)
    with open('true_all.pkl', 'wb') as f:
        pickle.dump(true_all, f, pickle.HIGHEST_PROTOCOL)

    print()
    print("*** MAPE ***")
    mask = (true_all !=0 )
    mape = np.abs(pred_all - true_all) / true_all[mask]
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())
    rmse = utils.rmse_2d(true_all, pred_all)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())
Пример #54
0
def merge_state_data():
    lookup = {}

    with open("data/source/mapas-data-concentrado.xlsx", "rb") as f:
        dfs = pd.read_excel(f, sheet_name=None)

    for sheetname, df in dfs.items():
        try:
            sheet_state_code = int(sheetname.split(' ')[0])
        except ValueError:
            print("%s is not a valid sheet name" % sheetname, file=sys.stderr)
            continue
        pivot = pd.pivot_table(df,
                               index=["state_code", "year"],
                               fill_value=0,
                               aggfunc=np.sum,
                               values=PROPS)
        pivot_dict = pivot.reset_index().to_dict("records")
        final_data = [
            dict([k, int(v)] for k, v in row.items()) for row in pivot_dict
        ]
        lookup[sheet_state_code] = final_data

    maxes = {prop: 0 for prop in PROPS}
    centers = {
        'type': 'FeatureCollection',
        'name': 'estatalescentroids',
        'features': [],
    }

    state_meta = []
    with codecs.open('data/source-geojson/estatales.json',
                     encoding='utf-8',
                     errors="replace") as f:
        data = json.load(f)
        for feature in data['features']:
            state_code = int(feature['properties']['CVE_ENT'])
            state_data = lookup[state_code]

            yearlyData = []
            state_data_dict = dict([(row['year'], row) for row in state_data])
            for year in range(2006, 2017):
                row = state_data_dict.get(year, False)
                if not row:
                    row = {'year': year, 'state_code': state_code}
                    for prop in PROPS:
                        row[prop] = -1
                yearlyData.append(row)

            feature['properties']['yearlyFosasData'] = yearlyData

            for prop in PROPS:
                feature['properties'][prop + '_total'] = sum(
                    item.get(prop, 0) for item in state_data)
                try:
                    feature['properties'][prop + '_max'] = max(
                        item.get(prop, 0) for item in state_data)
                except ValueError:
                    feature['properties'][prop + '_max'] = 0

            feature['properties']['all_max'] = max(
                feature['properties'][prop + '_max'] for prop in PROPS)

            shp = shape(feature['geometry'])
            representative_point = mapping(shp.representative_point())
            centroid = mapping(shp.centroid)
            bounds = shp.bounds

            center_feature = deepcopy(feature)
            center_feature['geometry'] = representative_point
            centers['features'].append(center_feature)

            # Colima has islands
            if state_code == 6:
                bounds = [-104.8, 18.6, -103.4, 19.6]

            state_meta.append({
                'state_code': feature['properties']['CVE_ENT'],
                'state_name': feature['properties']['NOM_ENT'],
                'representative_point': representative_point,
                'centroid': centroid,
                'bounds': bounds,
                **feature['properties']
            })

    with open('data/processed-geojson/estatales.json', 'w') as f:
        json.dump(data, f)

    with open('data/processed-geojson/estatales-centroids.json', 'w') as f:
        json.dump(centers, f)

    with open('src/data/mxstates.json', 'w') as f:
        json.dump(state_meta, f)
Пример #55
0
def user_cate_feture(data, end_time, beforeoneday):
    # data = Data[(Data['daystime']<LabelDay) & (Data['daystime']>LabelDay-datetime.timedelta(days=FEATURE_EXTRACTION_SLOT))]
    user_item_count = pd.crosstab([data.user_id, data.item_category],
                                  data.behavior_type)
    # beforeoneday = Data[Data['daystime'] == LabelDay-datetime.timedelta(days=1)]

    user_cate_count_5 = None
    if (((end_time - datetime.timedelta(days=5)) < datetime.datetime(
            2014, 12, 13, 0, 0, 0)) &
        ((end_time - datetime.timedelta(days=5)) > datetime.datetime(
            2014, 12, 10, 0, 0, 0))):
        beforefiveday = data[data['daystime'] >= (
            end_time - datetime.timedelta(days=5 + 2))]
        user_cate_count_5 = pd.crosstab(
            [beforefiveday.user_id, beforefiveday.item_category],
            beforefiveday.behavior_type)
    else:
        beforefiveday = data[data['daystime'] >= (end_time -
                                                  datetime.timedelta(days=5))]
        user_cate_count_5 = pd.crosstab(
            [beforefiveday.user_id, beforefiveday.item_category],
            beforefiveday.behavior_type)
    user_cate_count_3 = None
    if (((end_time - datetime.timedelta(days=5)) < datetime.datetime(
            2014, 12, 13, 0, 0, 0)) &
        ((end_time - datetime.timedelta(days=5)) > datetime.datetime(
            2014, 12, 10, 0, 0, 0))):
        beforethreeday = data[data['daystime'] >= (
            end_time - datetime.timedelta(days=3 + 2))]
        user_cate_count_3 = pd.crosstab(
            [beforethreeday.user_id, beforethreeday.item_category],
            beforethreeday.behavior_type)
    else:
        beforethreeday = data[data['daystime'] >= (end_time -
                                                   datetime.timedelta(days=3))]
        user_cate_count_3 = pd.crosstab(
            [beforethreeday.user_id, beforethreeday.item_category],
            beforethreeday.behavior_type)

    user_cate_count_2 = None
    if (((end_time - datetime.timedelta(days=5)) < datetime.datetime(
            2014, 12, 13, 0, 0, 0)) &
        ((end_time - datetime.timedelta(days=5)) > datetime.datetime(
            2014, 12, 10, 0, 0, 0))):
        beforethreeday = data[data['daystime'] >= (
            end_time - datetime.timedelta(days=7 + 2))]
        user_cate_count_2 = pd.crosstab(
            [beforethreeday.user_id, beforethreeday.item_category],
            beforethreeday.behavior_type)
    else:
        beforethreeday = data[data['daystime'] >= (end_time -
                                                   datetime.timedelta(days=7))]
        user_cate_count_2 = pd.crosstab(
            [beforethreeday.user_id, beforethreeday.item_category],
            beforethreeday.behavior_type)


#    _live = user_cate_long_touch(data)
    beforeonedayuser_item_count = pd.crosstab(
        [beforeoneday.user_id, beforeoneday.item_category],
        beforeoneday.behavior_type)
    max_touchtime = pd.pivot_table(beforeoneday,
                                   index=['user_id', 'item_category'],
                                   values=['hours'],
                                   aggfunc=[np.min, np.max])
    max_touchtype = pd.pivot_table(beforeoneday,
                                   index=['user_id', 'item_category'],
                                   values=['behavior_type'],
                                   aggfunc=np.max)
    user_cate_feture = pd.merge(user_item_count,
                                beforeonedayuser_item_count,
                                how='left',
                                right_index=True,
                                left_index=True)
    user_cate_feture = pd.merge(user_cate_feture,
                                max_touchtime,
                                how='left',
                                right_index=True,
                                left_index=True)
    user_cate_feture = pd.merge(user_cate_feture,
                                max_touchtype,
                                how='left',
                                right_index=True,
                                left_index=True)
    #    user_cate_feture = pd.merge(user_cate_feture,_live,how='left',right_index=True,left_index=True)
    user_cate_feture = pd.merge(user_cate_feture,
                                user_cate_count_5,
                                how='left',
                                right_index=True,
                                left_index=True)
    user_cate_feture = pd.merge(user_cate_feture,
                                user_cate_count_3,
                                how='left',
                                right_index=True,
                                left_index=True)
    user_cate_feture = pd.merge(user_cate_feture,
                                user_cate_count_2,
                                how='left',
                                right_index=True,
                                left_index=True)
    user_cate_feture.fillna(0, inplace=True)
    return user_cate_feture
train_data, test_data, = sklearn.model_selection.train_test_split(
    raw,
    train_size=0.8  # % of data to use for training (rest used for testing)
    ,
    random_state=1,
    shuffle=True)

# Up-sample rare outcome variable in training data to match proportion made up by other variable values

print('Training data before up-sampling outcome variable:')

print(
    pd.pivot_table(
        data=train_data,
        index=outcome_variables,
        values=train_data.columns[0],
        aggfunc='count').rename(
            columns={train_data.columns[0]: 'Count'}))  # Before up-sampling

train_data_variables_no_upsampling = train_data[train_data[outcome_variables[
    0]].apply(lambda x: not (x in outcome_variable_values_to_upsample))]
train_data_variables_upsampling = train_data[train_data[outcome_variables[
    0]].apply(lambda x: x in outcome_variable_values_to_upsample)]

if len(outcome_variable_values_to_upsample) > 0:
    train_data_variables_upsampled = train_data_variables_upsampling.sample(
        n=int(
            np.ceil(
                len(train_data_variables_no_upsampling) *
                upsample_proportion_of_common_variable)),
        replace=True)
Пример #57
0
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# df = pd.read_csv("……")
# table = pd.pivot_table(df,index=['...','..'],values='..',aggfunc='sum',columns=['>>>'])    # choose the aggregation method (sum, mean, ...)
# the resulting layout is a bit like a primary-school class timetable




df = DataFrame(np.random.randint(10,size=[20,5]),index=list("AZXSAZXSDCXSAZXBSDCX"),columns=list("HJKLP"))
print(df)
df1 = pd.pivot_table(df,index=['H',"J"],values=["K",'L'],aggfunc="sum",columns="P",fill_value=0)
print(df1)
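# A small extension sketch (not in the original): margins=True appends
# row and column totals to the pivot table.
df2 = pd.pivot_table(df, index=['H', "J"], values=["K", 'L'], aggfunc="sum",
                     columns="P", fill_value=0, margins=True, margins_name="Total")
print(df2)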
Пример #58
0
def divvy_create_graph(station_id, station_name, weather):

    def haversine(row):
        # convert decimal degrees to radians     
        #convert lon, lat to floats
        row['FROM LONGITUDE'], row['FROM LATITUDE'], row['TO LONGITUDE'], row['TO LATITUDE'] = \
                  map(float, [row['FROM LONGITUDE'], row['FROM LATITUDE'], row['TO LONGITUDE'], row['TO LATITUDE']])
        lon1, lat1, lon2, lat2 = \
                  map(radians, [row['FROM LONGITUDE'], row['FROM LATITUDE'], row['TO LONGITUDE'], row['TO LATITUDE']])

        # haversine formula 
        dlon = lon2 - lon1 
        dlat = lat2 - lat1 
        a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
        
        c = 2 * asin(sqrt(a))    
        r = 3956  # radius of the earth in miles (use 6371 for kilometers)
        
        return c * r
    
    font = {'family' : 'arial', 'weight' : 'bold', 'size'   : 15}
    rc('font', **font); rc("figure", facecolor="white"); rc('axes', edgecolor='darkgray');

    def runplot(pvtdf, title, path):        
        pvtdf.plot(kind='bar', edgecolor='w',figsize=(20,5), width=0.5, fontsize = 10)
        locs, labels = plt.xticks()    
        plt.title(title, weight='bold', size=24)
        lgd = plt.legend(loc='right', ncol=14, frameon=True, shadow=False, prop={'size': 12}, bbox_to_anchor=(1, 0.95))
        for i in range(len(lgd.get_texts())): 
            text = lgd.get_texts()[i].get_text().replace('(DISTANCE_Miles, ', '(')
            lgd.get_texts()[i].set_text(text)
        plt.xlabel('Year', weight='bold', size=24)
        plt.ylabel('Distance', weight='bold', size=20)
        plt.tick_params(axis='x', bottom='off', top='off', labelsize=15)
        plt.tick_params(axis='y', left='off', right='off', labelsize=15)
        plt.grid(b=True)
        plt.setp(labels, rotation=0, rotation_mode="anchor", ha="center")
        plt.tight_layout()
        plt.savefig(path)

    def htmlpage(datastring, title, filename):
        html = '''<!DOCTYPE html>
                  <html>
                      <head>
                         <style type="text/css">
                              body{{ 
                                      margin:15px; padding: 20px 100px 20px 100px;
                                      font-family:Arial, Helvetica, sans-serif; font-size:88%; 
                              }}
                              h1, h2 {{
                                      font:Arial black; color: #383838;
                                      valign: top;
                              }}       
                              img {{
                                    float: right;
                              }}
                              table{{
                                      width:100%; font-size:13px;                  
                                      border-collapse:collapse;
                                      text-align: right;                  
                              }}
                              th{{ color: #383838 ; padding:2px; text-align:right; }}
                              td{{ padding: 2px 5px 2px 5px; }}
                              .footer{{
                                      text-align: right;
                                      color: #A8A8A8;
                                      font-size: 12px;
                                      margin: 5px;
                              }}                   
                         </style>
                      </head>                      
                      <body>
                        <h3>{0}<img src="../../divvylogo.svg" alt="divvy icon" height="50px"></h3>
                        {1}
                      </body>
                      <div class="footer">Source: City of Chicago Data Portal</div>
                  </html>'''.format(title, datastring)
        with open(os.path.join(cd, 'stations', station_name, filename), 'w') as f: 
            f.write(html)

    CONN = sql.connect("/Users/divya/projects/pandas/divvy-example/Divvy_Trips.db")
    stations = "SELECT * FROM temp WHERE [FROM STATION ID] = {0} OR [TO STATION ID] = {0}".format(station_id);
    df = pd.read_sql(stations, con = CONN)
    CONN.close()

    # create folder
    cd = os.path.dirname(os.path.abspath(__file__))
    if os.path.exists(os.path.join(cd, 'stations', station_name)) == False:
        os.makedirs(os.path.join(cd, 'stations', station_name))

    #NEW COLUMNS
    df['DISTANCE_Miles'] = df.apply(haversine, axis=1)
    df["START TIME"] = pd.to_datetime(df["START TIME"], format="%m/%d/%Y %I:%M:%S %p")
    df["STOP TIME"] = pd.to_datetime(df["STOP TIME"], format="%m/%d/%Y %I:%M:%S %p")
    
    df.loc[:, 'TRIP DURATION'] = df['TRIP DURATION'].astype(float)/60

    df.loc[:, 'START TIME MONTH'] = df['START TIME'].dt.month
    df.loc[:, 'START TIME YEAR'] = df['START TIME'].dt.year
    df.loc[:, 'STOP TIME MONTH'] = df['STOP TIME'].dt.month
    df.loc[:, 'STOP TIME YEAR'] = df['STOP TIME'].dt.year
    df.loc[:, 'START WEEKDAY'] = [cal.day_name[i] for i in df['START TIME'].dt.weekday]

    df['BIRTH YEAR'] = np.where(df["BIRTH YEAR"] == '', np.nan, df["BIRTH YEAR"])
    df['BIRTH YEAR'] = df['BIRTH YEAR'].astype('float')
    df.loc[:, 'AGE'] = df['START TIME YEAR'] - df['BIRTH YEAR']

    df['AGE_GROUP'] = np.where(df['AGE'].between(0,18, inclusive = True), '0 - 18',
                          np.where(df['AGE'].between(19,29, inclusive = True), '19 - 29',
                              np.where(df['AGE'].between(30,39, inclusive = True), '30 - 39',
                                  np.where(df['AGE'].between(40,49, inclusive = True), '40 - 49',
                                      np.where(df['AGE'].between(50,65, inclusive = True), '50 - 65',
                                          np.where(df['AGE'].between(65,80, inclusive = True), '65+', np.nan))))))

    df.loc[:, 'START TIME HOUR'] = df['START TIME'].dt.hour
    # Label each ride with its one-hour window, e.g. hour 9 -> '09:00-10:00'.
    # This replaces a 24-level nested np.where whose inclusive between() bounds
    # assigned the boundary hours to the wrong window.
    df['TIME FRAME'] = df['START TIME HOUR'].apply(
        lambda h: '{:02d}:00-{:02d}:00'.format(h, h + 1))

    #overall
    df.groupby(['START TIME YEAR', 'START TIME MONTH'])['TRIP DURATION'].count().reset_index().to_csv(os.path.join(cd, 'stations', station_name, 'tripCount.csv'))
    htmlpage(df.groupby(['START TIME YEAR', 'START TIME MONTH'])['TRIP DURATION'].count().reset_index().to_html(), '{} - Trip Count'.format(station_name), 'TripCount.html')
    #df.groupby(['START TIME YEAR', 'START TIME MONTH'])['TRIP DURATION'].count().reset_index().to_html(os.path.join(cd, 'stations', station_name, 'tripCount.html'))
    df.groupby(['START TIME YEAR', 'START TIME MONTH'])['TRIP DURATION'].mean().reset_index().to_csv(os.path.join(cd, 'stations', station_name, 'averageDuration.csv'))
    #df.groupby(['START TIME YEAR', 'START TIME MONTH'])['TRIP DURATION'].mean().reset_index().to_html(os.path.join(cd, 'stations', station_name, 'averageDuration.html'))
    htmlpage(df.groupby(['START TIME YEAR', 'START TIME MONTH'])['TRIP DURATION'].mean().reset_index().to_html(), '{} - Trip Duration'.format(station_name), 'AvgDuration.html')
    df.groupby(['START TIME YEAR', 'START TIME MONTH'])['DISTANCE_Miles'].mean().reset_index().to_csv(os.path.join(cd, 'stations', station_name, 'averageDistance.csv'))
    htmlpage(df.groupby(['START TIME YEAR', 'START TIME MONTH'])['DISTANCE_Miles'].mean().reset_index().to_html(), '{} - Avg Miles'.format(station_name), 'AvgDistance.html')
    #df.groupby(['START TIME YEAR', 'START TIME MONTH'])['DISTANCE_Miles'].mean().reset_index().to_html(os.path.join(cd, 'stations', station_name, 'averageDistance.html'))

    #distance by gender
    distanceByGender = pd.pivot_table(
        df[(df['USER TYPE'] == 'Subscriber') & (df['GENDER'] != '')],
        index=['START TIME YEAR', 'START TIME MONTH'],
        columns=['GENDER'], values=['DISTANCE_Miles'], aggfunc=len)
    distanceByGender.plot(title= "{} - Distance in miles by gender".format(station_name), figsize=(20,5))
    lgd = plt.legend()
    for i in range(len(lgd.get_texts())): 
        text = lgd.get_texts()[i].get_text().replace('(DISTANCE_Miles, ', '(')
        lgd.get_texts()[i].set_text(text)
    plt.savefig(os.path.join(cd, 'stations', station_name, 'distanceByGender.png'))

    miles_by_gender = pd.pivot_table(df[(df['USER TYPE'] == 'Subscriber') 
            & (df["AGE"] < 80) & (df["GENDER"].str.len() > 0)
            & (df["DISTANCE_Miles"] > 0)], index = ['START TIME YEAR'], 
        columns=['GENDER'], values=['DISTANCE_Miles'], aggfunc = len)

    miles_by_gender.reset_index().to_csv(os.path.join(cd, 'stations', station_name, 'milesByGender.csv'))
    htmlpage(miles_by_gender.reset_index().to_html(), '{} - Miles By Gender'.format(station_name), 'MilesByGender.html')
    #miles_by_gender.reset_index().to_html(os.path.join(cd, 'stations', station_name, 'milesByGender.html'))

    wes.set_palette('FantasticFox')    
    runplot(miles_by_gender,' {} - Divvy Data subscribers, distance by age and gender'.format(station_name), os.path.join(cd, 'stations', station_name, 'milesByGender.png'))

    mergelist = []

    for yr in list(range(df['START TIME YEAR'].min(), df['START TIME YEAR'].max() + 1)):
        miles_by_gender_pie = None
        miles_by_gender_pie = df[(df['USER TYPE'] == 'Subscriber') & (df['START TIME YEAR']==yr) & (df["GENDER"].str.len() > 0)].\
            groupby(["GENDER", "START TIME YEAR"])["DISTANCE_Miles"].apply(len).reset_index()
        
        mergelist.append(miles_by_gender_pie[['GENDER', 'DISTANCE_Miles']])
    
    miles_by_gender_pie = reduce(lambda left,right: pd.merge(left, right, on='GENDER'), mergelist)    
    miles_by_gender_pie.columns = ['GENDER'] + [str(i) for i in list(range(df['START TIME YEAR'].min(), df['START TIME YEAR'].max() + 1))]

    labels = 'Female', 'Male'
    #pie = plt.pie(miles_by_gender_pie, labels=labels)
    miles_by_gender_pie.plot.pie(subplots=True, labels=['', ''], figsize=(12,3.8), autopct='%.2f')
    plt.axis('equal')
    plt.legend(labels=labels)
    plt.savefig(os.path.join(cd, 'stations', station_name, 'milesByGenderPIE.png'))

    #Customer v Subscriber
    user_type_pie = df.groupby(["START TIME YEAR","USER TYPE"]).apply(len).reset_index()
    user_type_pie.columns = ['Year', 'User Type', 'People']

    customervsubscriber = pd.pivot_table(user_type_pie, index=['User Type'], columns=['Year'], values=['People'])
    labels = ['Customer', 'Subscriber']
    customervsubscriber.plot.pie(subplots=True, labels=['', ''], figsize=(12,3.8), autopct='%.2f')
    plt.axis('equal')
    plt.legend(labels=labels)
    plt.savefig(os.path.join(cd, 'stations', station_name, 'customervSubscriber.png'))
    
    # Miles by age
    miles_by_age = pd.pivot_table(df[(df['USER TYPE'] == 'Subscriber') 
            & (df["AGE"] < 80)
            & (df["GENDER"].str.len() > 0)
            & (df["DISTANCE_Miles"] > 0)], index = ['START TIME YEAR'], 
        columns=['AGE_GROUP'], values=['DISTANCE_Miles'], aggfunc = np.sum)

    runplot(miles_by_age, '{} - Divvy Data subscribers, total distance by age'.format(station_name), os.path.join(cd, 'stations', station_name, 'milesByAge.png'))
    htmlpage(miles_by_age.reset_index().to_html(), '{} - Miles By Age'.format(station_name), 'MilesByAge.html')

    # Average distance per month by gender, joined against weather below
    avDistanceMiles = pd.pivot_table(df[df['GENDER'] != ''],
                                        index=['START TIME MONTH'], columns=['GENDER'],
                                        values=['DISTANCE_Miles'], aggfunc=np.mean)
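    # `weather` (built earlier in the script) is assumed to be indexed by month so that this
    # join lines up with START TIME MONTH and contributes the ATF temperature column used below.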
    total = avDistanceMiles.join(weather)

    DistancevsWeather = total.sort_values(['ATF'])
    DistancevsWeather["START TIME MONTH"] = DistancevsWeather.index.values
    DistancevsWeather = DistancevsWeather.set_index(['ATF'])

    line = DistancevsWeather.iloc[:, [0, 1]].plot(title="Mean distance travelled by gender", figsize=(20, 5))
    line.set_ylabel("Distance(miles)")
    line.set_xlabel("Time")
    lgd = plt.legend()
    for legend_text in lgd.get_texts():
        legend_text.set_text(legend_text.get_text().replace('(DISTANCE_Miles, ', '('))
    plt.savefig(os.path.join(cd, 'stations', station_name, 'milesvsWeather.png'))
    htmlpage(DistancevsWeather.reset_index().to_html(), '{} - Distance vs Weather'.format(station_name), 'DistancevsWeather.html')
Пример #59
0
books = pd.read_csv(
    "C:\\Users\\trupti\\Desktop\\BookRack\\home\\book_data2.csv",
    engine="python")

# display(ratings.head())
# display(books.head())

# In[4]:

# Merge the two tables then pivot so we have Users X Books dataframe.
ratings_title = pd.merge(ratings,
                         books[['book_id', 'book_title']],
                         on='book_id')
user_book_ratings = pd.pivot_table(ratings_title,
                                   index='user_id',
                                   columns='book_title',
                                   values='rating')
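# pivot_table's default aggfunc is 'mean', so duplicate (user_id, book_title) ratings are
# averaged and every cell stays on the original rating scale; unrated books are NaN.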

print('dataset dimensions: ', user_book_ratings.shape, '\n\nSubset example:')
#user_book_ratings.iloc[:25, :10]

# In[5]:

# Drop users that have given fewer than 100 ratings of these most-rated books
user_book_ratings = user_book_ratings.dropna(thresh=100)

# print('dataset dimensions: ', user_book_ratings.shape, '\n\nSubset example:')
# user_book_ratings.iloc[:25, :10]

# In[6]:
# loop through offices and find the statewide ones
for office in offices_tot:
    # counties that held an election for this office
    counties = [elec_df.loc[elec_df['office'] == office, 'county']]
    counties_office[office] = counties
    # an office is statewide if it appears in every county in the state
    if len(set(counties[0].values)) == len(counties_tot):
        state_offices.append(office)
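
# Sketch (assumed equivalent, not the original approach): the same statewide test as a
# single pandas expression, comparing distinct counties per office against the total
# county count. 'statewide_mask' is a hypothetical name used only for this illustration.
statewide_mask = elec_df.groupby('office')['county'].nunique() == len(counties_tot)
# state_offices would then be statewide_mask[statewide_mask].index.tolist()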
        
state_elec = elec_df.loc[elec_df['office'].isin(state_offices)]

#get table of elections by precinct
prec_elec = pd.pivot_table(state_elec, index = ['loc_prec', 'county', 'precinct'], columns = ['office','party','candidate'], values = ['votes'], aggfunc = np.sum)

# flatten the ('votes', office, party, candidate) MultiIndex columns into single space-joined names
prec_elec.columns = prec_elec.columns.to_series().str.join(' ')

columns = prec_elec.columns.values

# print columns and assign each one a 10 character name for the shapefile
print(columns)
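
# Sketch (assumption, not the original workflow): a crude automatic fallback is to truncate
# each flattened name to the 10-character DBF/shapefile field-name limit; beware of name
# collisions after truncation. 'auto_names' is a hypothetical name used only here.
auto_names = {c: c[:10] for c in prec_elec.columns}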

# make dict for column name replacement using the columns printed above;
# shapefile (DBF) field names are limited to 10 characters
prec_elec_rn = prec_elec.rename(columns = {
        'votes Attorney General DEM Mike Lee': 'G18DATG1',
        'votes Attorney General Dem Mike Lee': 'G18DATG2',
        'votes Attorney General LIB Kerry Hicks': 'G18LATG1',
        'votes Attorney General Lib Kerry Hicks': 'G18LATG2',