# Assumed imports; Query_Mod, Data_Loop, Column_Adjust, and the Indicators
# lookup table are defined elsewhere in this project.
import pandas as pd
import numpy as np
from dfply import mutate


def Data_Pull(Brands, start_date, end_date, ReportType, conn):
    i = 0
    for ID in Brands.iloc[:, 1]:
        print(Brands.iloc[i, 0])
        List = Query_Mod(Brands.iloc[i, :].tolist(), ReportType)
        df = Data_Loop(List, ID, ReportType, start_date, end_date, conn)
        # Brand 119847739 tracks its KPI in goal 4; every other brand uses goal 16
        if Brands.iloc[i, 1] == 119847739:
            df = df.rename(columns={'goal4Completions': 'KPI'})
        else:
            df = df.rename(columns={'goal16Completions': 'KPI'})
        df = df >> mutate(Brand=Brands.iloc[i, 0])
        # Add an Indication column for Swoop, Yieldbot, and Medium reports
        if ReportType in ('Swoop', 'Yieldbot', 'Medium'):
            if ID == 120033970:
                df = pd.merge(df, Indicators, on='Campaign', how='left')
            else:
                df = df >> mutate(Indication='N/A')
        # Merge the data frames accumulated by the loop
        if i == 0:
            BE = df
        else:
            BE = pd.concat([BE, df])
        i = i + 1
    BE['Start Date'] = start_date
    BE['End Date'] = end_date
    df = Column_Adjust(BE, ReportType)
    '''
    if ReportType == 'Swoop':
        ReportType2 = 'Swoop_P23'
        List = Query_Mod(Brands.iloc[26, :].tolist(), ReportType2)
        P23 = Data_Loop(List, Brands.iloc[26, 1], ReportType2,
                        start_date, end_date, conn)
        P23 = (P23 >> mutate(Keyword=np.nan) >> mutate(Indication='N/A')
                   >> mutate(NewUsers=np.nan) >> mutate(Pageviews=np.nan)
                   >> mutate(Bounces=np.nan) >> mutate(pageviewsPerSession=np.nan))
        P23 = (P23 >> mutate(avgTimeOnPage=np.nan) >> mutate(KPI=np.nan)
                   >> mutate(StartDate=start_date) >> mutate(EndDate=end_date)
                   >> mutate(Brand=Brands.iloc[26, 0]))
        P23 = Column_Adjust(P23, ReportType2)
        BE = pd.concat([BE, P23])
    '''
    return df
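# A minimal refactor sketch of the loop above, assuming the same Query_Mod /
# Data_Loop / Column_Adjust helpers: iterating with iterrows() and collecting
# the per-brand frames in a list lets pandas concatenate once at the end,
# which avoids the quadratic cost of repeated pd.concat calls.
# Data_Pull_sketch is a hypothetical name, not part of the original code.
def Data_Pull_sketch(Brands, start_date, end_date, ReportType, conn):
    frames = []
    for _, brand in Brands.iterrows():
        query = Query_Mod(brand.tolist(), ReportType)
        df = Data_Loop(query, brand.iloc[1], ReportType, start_date, end_date, conn)
        goal = 'goal4Completions' if brand.iloc[1] == 119847739 else 'goal16Completions'
        frames.append(df.rename(columns={goal: 'KPI'}).assign(Brand=brand.iloc[0]))
    BE = pd.concat(frames, ignore_index=True)
    BE['Start Date'] = start_date
    BE['End Date'] = end_date
    return Column_Adjust(BE, ReportType)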
# Flask view; clean_df() and request are assumed from the surrounding app.
def create_task():
    df = clean_df(request.json)
    df_customer = (df
                   >> mutate(name=X.FullNameBilling.str.upper())
                   >> group_by(X.name)
                   >> summarize(contact=X.PhoneBilling.head(1),
                                email=X.EmailBilling.head(1),
                                address=X.Address2Billing.head(1),
                                num_items_purchased=X.name.count()))
    jsondf = df_customer.to_json(orient='records')
    return jsondf
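# For comparison, a plain-pandas sketch of the same customer roll-up, assuming
# the billing columns used above; 'first' plays the role of the head(1) calls
# and 'size' counts purchases per customer. summarize_customers is a
# hypothetical helper name.
def summarize_customers(df):
    out = (df.assign(name=df.FullNameBilling.str.upper())
             .groupby('name')
             .agg(contact=('PhoneBilling', 'first'),
                  email=('EmailBilling', 'first'),
                  address=('Address2Billing', 'first'),
                  num_items_purchased=('FullNameBilling', 'size'))
             .reset_index())
    return out.to_json(orient='records')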
def add_day(df, countriez):
    df_temp = df.copy()
    df = pd.DataFrame(index=range(0, 100000))
    for country in countriez:
        data = DplyFrame(df_temp) >> sift(X.country == country)
        df_filt = data >> mutate(day=range(1, len(data) + 1))
        df = pd.concat([df, df_filt], sort=False).dropna(how='all')
    return df
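# The same per-country day counter can be added without the Python loop.
# A vectorized sketch using pandas groupby().cumcount(), assuming the frame
# has a 'country' column and rows are already in chronological order:
def add_day_vectorized(df):
    df = df.copy()
    df['day'] = df.groupby('country').cumcount() + 1
    return df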
# Assumed setup: iris is available as a DplyFrame, with
# import dplython as dp; from dplython import X; import numpy as np

# dplyr::group_by(Species) %>%
#   summarise(media=mean(Petal.Length))
iris >> dp.group_by(X.Species) >> dp.summarize(media=X.PetalLength.mean())

# pandas equivalents
iris.groupby(['Species'])['PetalLength'].agg(['mean', 'sum', 'count'])
# note: dict-based column renaming in .agg() was deprecated and later removed
# in pandas 1.0; prefer named aggregation on modern versions
iris.groupby(['Species'])['PetalLength'].agg({'var1': 'mean', 'var2': 'sum', 'var3': 'count'})
iris.groupby(['Species'])['PetalLength'].agg({'var1': ['mean', 'sum']})

aggregations = {
    'dsuma': 'sum',
}

import math
iris.groupby(['Species'])['PetalLength'].agg({'dsuma': 'sum',
                                              'otro': lambda x: math.sqrt(x.mean()) - 1})

# data = iris %>%
#   mutate(total=Sepal.Length+Petal.Length, otro=ifelse(Petal.Length>2, "grande", "pequeño"))
iris >> dp.mutate(redondeado=X.PetalLength.round(), redondeado2=X.SepalLength.round())
iris.assign(redondeado=lambda x: x.PetalLength.round(),
            redondeado2=lambda x: x.SepalLength.round())

# ifelse(y == 0, 0, 1)
np.where(y == 0, 0, 1)

# data = iris %>%
#   distinct(Species, Sepal.Length, .keep_all = T)
iris >> dp.distinct(X.SepalLength)
iris.drop_duplicates()
iris.drop_duplicates(subset='PetalLength')

# sorting
# data = iris %>%
#   arrange(Sepal.Length, Sepal.Width)
iris >> dp.arrange(X.PetalLength)
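# The mutate(total=..., otro=ifelse(...)) line in the R comment above has no
# Python translation in the original; a sketch with pandas assign + np.where
# (column names assume the R-style dots replaced by CamelCase, as elsewhere here):
iris.assign(total=lambda d: d.SepalLength + d.PetalLength,
            otro=lambda d: np.where(d.PetalLength > 2, 'grande', 'pequeño'))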
diamonds >> sample(frac=0.0001, replace=False)  # fraction of df rows returned
diamonds >> sample(n=3, replace=True)           # number of rows returned
diamonds >> distinct(X.color)                   # selection of unique rows

# Filtering rows with logical criteria
diamonds >> mask(X.cut == 'Ideal') >> head(4)
# mask() can also be called using the alias filter_by()
diamonds >> filter_by(X.cut == 'Ideal', X.color == 'E',
                      X.table < 55, X.price < 500)

# pull() simply retrieves a column and returns it as a pandas Series, in case
# you only care about one particular column at the end of your pipeline
(diamonds >> filter_by(X.cut == 'Ideal', X.color == 'E',
                       X.table < 55, X.price < 500) >> pull('carat'))

# DataFrame transformation
diamonds >> mutate(x_plus_y=X.x + X.y) >> select(columns_from('x')) >> head(3)
diamonds >> mutate(x_plus_y=X.x + X.y, y_div_z=(X.y / X.z)) >> select(columns_from('x')) >> head(3)

# transmute() is a combination of a mutate and a selection of the created variables
diamonds >> transmute(x_plus_y=X.x + X.y, y_div_z=(X.y / X.z)) >> head(3)

# group_by() and ungroup()
diamonds >> head(5) >> group_by(X.color) >> mutate(avg_price=X.price.mean())
(diamonds >> group_by(X.cut)
          >> mutate(price_lead=lead(X.price), price_lag=lag(X.price))
          >> head(2)
          >> select(X.cut, X.price, X.price_lead, X.price_lag))
# ungroup()
(diamonds >> group_by(X.cut)
          >> arrange(X.price)
          >> head(3)
          >> ungroup()
          >> mask(X.carat < 0.23))

# Reshaping (see the sketch below)
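# The "Reshaping" heading above never got an example in the original; a sketch
# with dfply's gather() verb, which mirrors tidyr's wide-to-long reshape
# (spread() is its inverse), assuming the standard diamonds columns:
(diamonds >> head(3)
          >> gather('measurement', 'value', ['x', 'y', 'z']))  # wide -> long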
from collections import Counter
import pandas as pd
from dplython import DplyFrame, X, sift, select, rename, mutate, group_by, summarize
# collect_fragments() and the pipe-compatible helper crossprod() are assumed
# to be defined elsewhere in the project.


def czMatchmaker(data, Q, precursor_fasta):
    # NOTE: the hard-coded read_csv below overwrites the `data` argument;
    # kept as in the original.
    data = pd.read_csv(
        "/Users/matteo/Documents/czMatchmaker/data/examplaryData.csv")
    data = DplyFrame(data)
    precursors = (data
                  >> sift(X.tag == 'precursor')
                  >> select(X.active, X.neutral, X.estimates))
    fragments = (data
                 >> sift(X.tag != 'precursor')
                 >> group_by(X.tag, X.active, X.broken_bond)
                 >> summarize(estimates=X.estimates.sum()))

    I_on_fragments = {}
    optiminfos = {}
    for break_point, data in fragments.groupby('broken_bond'):
        pairing, optiminfo = collect_fragments(data, Q)
        I_on_fragments[break_point] = pairing
        optiminfos[break_point] = optiminfo

    cations_fragmented_I = sum(
        sum(I_on_fragments[bP][p] for p in I_on_fragments[bP])
        for bP in I_on_fragments)

    I_no_reactions = (precursors
                      >> sift(X.active == Q, X.neutral == 0)
                      >> select(X.estimates))
    I_no_reactions = I_no_reactions.values.flatten()[0]

    prec_ETnoD_PTR_I = (precursors
                        >> sift(X.active != Q)
                        >> rename(ETnoD=X.neutral, I=X.estimates)
                        >> mutate(PTR=Q - X.ETnoD - X.active)
                        >> select(X.ETnoD, X.PTR, X.I))
    I_prec_no_frag = prec_ETnoD_PTR_I >> summarize(I=X.I.sum())
    I_prec_no_frag = I_prec_no_frag.values.flatten()[0]

    precursorNoReactions = (precursors
                            >> sift(X.active == Q)
                            >> select(X.estimates))

    prec_ETnoD_PTR_I = (prec_ETnoD_PTR_I
                        >> mutate(I_PTR=crossprod(X.PTR, X.I),
                                  I_ETnoD=crossprod(X.ETnoD, X.I))
                        >> summarize(I_PTR=X.I_PTR.sum(),
                                     I_ETnoD=X.I_ETnoD.sum()))
    I_PTR_no_frag, I_ETnoD_no_frag = prec_ETnoD_PTR_I.values.flatten()

    prob_PTR = I_PTR_no_frag / (I_PTR_no_frag + I_ETnoD_no_frag)
    prob_ETnoD = 1. - prob_PTR

    I_frags = dict(
        (bP, sum(I_on_fragments[bP][pairing] for pairing in I_on_fragments[bP]))
        for bP in I_on_fragments)
    I_frag_total = sum(I_frags[bP] for bP in I_frags)

    prob_frag = Counter(
        dict((int(bP), I_frags[bP] / I_frag_total) for bP in I_frags))
    prob_frag = [prob_frag[i] for i in range(len(precursor_fasta))]

    I_frags_PTRETnoD_total = sum(
        (Q - 1 - sum(q for cz, q in pairing)) * I_on_fragments[bP][pairing]
        for bP in I_on_fragments for pairing in I_on_fragments[bP])

    anion_meets_cation = I_frags_PTRETnoD_total + I_PTR_no_frag + I_ETnoD_no_frag
    prob_fragmentation = I_frags_PTRETnoD_total / anion_meets_cation
    prob_no_fragmentation = 1 - prob_fragmentation

    prob_no_reaction = I_no_reactions / (
        I_no_reactions + I_frag_total + I_prec_no_frag)
    prob_reaction = 1. - prob_no_reaction

    res = {}
    res['reaction'] = (prob_reaction, prob_no_reaction)
    res['fragmentation'] = (prob_fragmentation, prob_no_fragmentation)
    res['fragmentation_amino_acids'] = tuple(prob_frag)
    return res
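# crossprod() is not shown in the original. In dplython, a custom helper used
# inside a pipe is wrapped with DelayFunction so its arguments are evaluated
# lazily; a plausible elementwise sketch (the body is an assumption):
from dplython import DelayFunction

@DelayFunction
def crossprod(x, y):
    # elementwise product; the pipeline above sums it via summarize()
    return x * y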
# pandas_ply — the opening of this pipeline is truncated in the source; only
# the trailing select arguments and the timing print survive:
#     slat=X.latitude_start * math.pi / 180,
#     elat=X.latitude_end * math.pi / 180,
#     slng=X.longitude_start * math.pi / 180,
#     elng=X.longitude_end * math.pi / 180)
# time.clock() in the original was removed in Python 3.8; time.perf_counter()
# is its modern replacement.
print('pandas_ply: ' + str(round(time.perf_counter() - start_time, 2)) + ' seconds.')

# dplython
import pandas
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
                      sample_frac, head, arrange, mutate, group_by,
                      summarize, DelayFunction)

start_time = time.perf_counter()
dt = (DplyFrame(d0)
      >> sift(X.usertype == 'Subscriber')  # 'Subscirber' in the original looks like a typo
      >> mutate(slat=X.latitude_start * math.pi / 180,
                elat=X.latitude_end * math.pi / 180,
                slng=X.longitude_start * math.pi / 180,
                elng=X.longitude_end * math.pi / 180))
print('dplython: ' + str(round(time.perf_counter() - start_time, 2)) + ' seconds.')

# dfply
from dfply import *
import pandas as pd

start_time = time.perf_counter()
dt = (d0
      >> mask(X.usertype == 'Subscriber')
      >> mutate(slat=X.latitude_start * math.pi / 180,
                elat=X.latitude_end * math.pi / 180,
                slng=X.longitude_start * math.pi / 180,
                elng=X.longitude_end * math.pi / 180))
# (the dfply timing print is cut off in the source)
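# For a baseline, the same transformation in plain pandas, assuming the d0
# frame and the math/time imports from the (truncated) top of this script:
start_time = time.perf_counter()
dt = d0[d0.usertype == 'Subscriber'].assign(
    slat=lambda d: d.latitude_start * math.pi / 180,
    elat=lambda d: d.latitude_end * math.pi / 180,
    slng=lambda d: d.longitude_start * math.pi / 180,
    elng=lambda d: d.longitude_end * math.pi / 180)
print('pandas: ' + str(round(time.perf_counter() - start_time, 2)) + ' seconds.')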
import altair as alt
import pandas as pd
from dplython import DplyFrame, X, mutate, group_by, summarize

firsts = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-06-09/firsts.csv'
)
firsts.to_csv('/Users/vivekparashar/Downloads/firsts.csv')

# Create/convert a pandas DataFrame to a dplython DplyFrame
firsts = DplyFrame(firsts)
firsts.columns
firsts.gender.unique()
firsts.category.unique()

# firsts df summary by category, binned by decade
t1 = (firsts
      >> mutate(year_grp=((X.year / 10).round()) * 10)
      >> group_by(X.year_grp, X.category)
      >> summarize(nrows=X.accomplishment.count()))
c1 = alt.Chart(t1).mark_circle().encode(x='year_grp:O', y='category:O', size='nrows:Q')
c3 = alt.Chart(t1).mark_bar().encode(x='year_grp', y='nrows', color='category')

# firsts df summary by gender
t2 = (firsts
      >> mutate(year_grp=((X.year / 10).round()) * 10)
      >> group_by(X.year_grp, X.gender)
      >> summarize(nrows=X.accomplishment.count()))
c2 = alt.Chart(t2).mark_circle().encode(x='year_grp:O', y='gender:O', size='nrows:Q')

chart = alt.vconcat(c2, c1, c3)
chart.save('firsts.html')  # the output path is truncated in the source; this filename is a placeholder
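# Note that round() maps e.g. 1968 to 1970; if you want the containing decade
# instead (1968 -> 1960), floor division is the usual pandas idiom. A sketch
# of the same category summary in plain pandas:
t1_pandas = (firsts.assign(year_grp=(firsts.year // 10) * 10)
                   .groupby(['year_grp', 'category'])
                   .agg(nrows=('accomplishment', 'count'))
                   .reset_index())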
import pandas
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
                      sample_frac, head, arrange, mutate, group_by,
                      summarize, DelayFunction)

diamonds >> head(5)
diamonds >> select(X.carat, X.cut, X.price) >> head(5)
d = (diamonds
     >> sift(X.carat > 4)
     >> select(X.carat, X.cut, X.depth, X.price)
     >> head(2))
(diamonds
 >> mutate(carat_bin=X.carat.round())
 >> group_by(X.cut, X.carat_bin)
 >> summarize(avg_price=X.price.mean()))

# Boolean-mask filtering in plain pandas; df with a 'deaths' column is assumed
# to be loaded earlier.
test = df['deaths'] < 0
less_than_zero = df[test]
print(less_than_zero.shape)
print(less_than_zero.head())
test
#df['deaths_fixed'] = df['deaths_new'].apply(lambda x: 'True' if x <= 0 else 'False')
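# The commented-out line above would build the strings 'True'/'False'; a sketch
# that keeps real booleans and stays vectorized (no apply), assuming the same
# deaths_new column:
df['deaths_fixed'] = df['deaths_new'] <= 0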
from numpy import mean  # assumed; the original does not show where mean() comes from


def difference_quantifier(esm_series, hector_run_series):
    # mean absolute percent difference between a Hector run and the ESM series
    calculate_df = DplyFrame({"hector": hector_run_series, "esm": esm_series})
    calculate_df = calculate_df >> mutate(percentdiff=(X.hector - X.esm) / X.esm)
    return mean(abs(calculate_df.percentdiff))
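# A quick usage sketch with made-up values; both inputs are pandas Series:
import pandas as pd

esm = pd.Series([1.0, 2.0, 4.0])
hector = pd.Series([1.1, 1.8, 4.4])
print(difference_quantifier(esm, hector))  # 0.1, i.e. a 10% mean absolute difference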
from dplython import X, mutate, group_by, diamonds

diamonds = diamonds >> mutate(bin=X["Unnamed: 0"] % 5000)
gbinp = diamonds.groupby("bin")
gbind = diamonds >> group_by(X.bin)

# Test 1: grouped mean via pandas transform vs. dplython grouped mutate.
# (The original assigned into the GroupBy object itself, which pandas does not
# allow; assigning into a copy of the frame is the working equivalent. The
# Python 2 print statements are also updated to print() calls.)
pdf = diamonds.copy()
pdf["foo"] = gbinp.x.transform('mean')
gbind = gbind >> mutate(foo=X.x.mean())
print(pdf["foo"].equals(gbind["foo"]))

# Test 2: sum of two grouped means
pdf["foo"] = gbinp.x.transform('mean') + gbinp.y.transform('mean')
gbind = gbind >> mutate(foo=X.x.mean() + X.y.mean())
print(pdf["foo"].equals(gbind["foo"]))
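# The same pattern extends to any grouped aggregate; a sketch checking a
# grouped max under the same setup as above:
pdf["bar"] = gbinp.price.transform('max')
gbind = gbind >> mutate(bar=X.price.max())
print(pdf["bar"].equals(gbind["bar"]))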