def add_day(df, countriez):
    # Number the rows for each country from 1..n, then stack the results back together.
    df_temp = df.copy()
    df = pd.DataFrame(index=range(0, 100000))
    for country in countriez:
        data = (DplyFrame(df_temp) >> sift(X.country == country))
        df_filt = (data >> mutate(day=range(1, len(data) + 1)))
        df = pd.concat([df, df_filt], sort=False).dropna(how='all')
    return df
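
# For comparison (not part of the original function), a minimal pandas-only sketch of
# the same per-country day numbering; groupby().cumcount() gives a 0-based running
# index within each group, so adding 1 reproduces the 1..n "day" column. The name
# add_day_pandas is made up for illustration.
def add_day_pandas(df):
    out = df.copy()
    out['day'] = out.groupby('country').cumcount() + 1
    return out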
def run_process(dat, cols, max_k, vis=0):
    tod_result = pd.DataFrame()
    info_result = pd.DataFrame()
    # Iterate over the day groups: 0 = weekday, 1 = Saturday, 2 = Sunday
    for i in range(3):
        temp = dat >> sift(X.DAY == i)
        tod_result = tod_result.append(estimate_tod(temp, max_k))
        info_result = info_result.append(tod_traffic_analyzer(i, temp, cols))
        # if vis == 1:
        #     print("Visualize the Silhouette score of day group (0: weekday, 1: sat, 2: sun)", i)
        #     visualize_tod(k)
    return tod_result, info_result
def process_data(longform_df):
    """
    Process data before beginning analysis.

    Focus only on the actual match. It doesn't make sense to consider the
    buildup and post-match periods here, since we're interested in the
    'interruptions' that match events cause, which should theoretically move
    search volume levels.
    """
    stage_2_df = longform_df >> sift(X.stage_2_ind == 1)
    stage_2_df = stage_2_df.reset_index(drop=True)
    stage_2_df["date"] = stage_2_df.match_id.apply(
        lambda x: "20" + x.split("20")[-1])
    stage_2_df['date_time'] = (stage_2_df['date'].astype(str) + " " +
                               stage_2_df['time'].astype(str))
    stage_2_df['date_time'] = pd.to_datetime(stage_2_df['date_time'],
                                             errors="coerce",
                                             infer_datetime_format=True)
    return stage_2_df
#                               longform_df.competitive_idx)
#
# model = ARIMA(endog=longform_df.shorthand_search_vol,
#               exog=x_mat,
#               dates=longform_df.date_time,
#               order=(2, 0, 2))
#
# model_fit = model.fit(disp=0)  # disp=0 turns off debug information
# with open('model2.txt', 'w') as f:
#     # print summary
#     print(model_fit.summary(), file=f)

# ARIMA MODEL #3: STAGE 2, MATCH EVENTS ONLY
# Let's only focus on stage 2: it makes more sense to consider match events in the
# context of the match itself, and not the buildup or post-match reaction time.
stage_2_df = longform_df >> sift(X.stage_2_ind == 1)
stage_2_df = stage_2_df.reset_index(drop=True)

# New, more thoughtful ARIMA model parameters:
# d = 1 ("first difference"): predict the delta between volumes at consecutive
#       intervals, i.e. "stationarizing" the time series.
# q = 1: a series displays moving-average behavior if it apparently undergoes random
#        "shocks" whose effects are felt in 2+ consecutive periods.
# TODO: what is the difference between q = 1 and q = 2?
x_mat = stage_2_df >> select(stage_2_df.home_goal, stage_2_df.away_goal,
                             stage_2_df.home_yellow, stage_2_df.away_yellow,
                             stage_2_df.home_red, stage_2_df.away_red,
                             stage_2_df.competitive_idx)

model = ARIMA(endog=stage_2_df.shorthand_search_vol,
              exog=x_mat,
              dates=stage_2_df.date_time,
              # d = 1 and q = 1 per the notes above; p = 2 carried over from model #2 (assumption)
              order=(2, 1, 1))
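
# To make the d = 1 note above concrete: a toy sketch (not from the original analysis)
# of what first differencing does to a series. pandas' Series.diff() computes the delta
# between consecutive intervals, which is what an ARIMA model with d = 1 models
# internally (an ARIMA(p, 1, q) on the levels behaves like an ARIMA(p, 0, q) on the
# differenced series, up to how the constant/trend is handled).
toy_vol = pd.Series([10, 12, 15, 14, 18])   # toy search-volume levels
toy_delta = toy_vol.diff()                  # NaN, 2.0, 3.0, -1.0, 4.0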
import dplython as dp
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
                      sample_frac, head, arrange, mutate, group_by,
                      summarize, DelayFunction)

iris = dp.DplyFrame(iris)

# data(iris)
# data = iris %>%
#   select(Petal.Length, Petal.Width, Sepal.Length, Sepal.Width, Species)
iris >> dp.select(X.Species) >> dp.head()
iris[['Species', 'PetalLength']]
iris.drop('SepalLength', axis=1)  # drop that column
iris.drop(5, axis=0)  # drop the sixth row

# data = iris %>%
#   filter(Petal.Length > 1 & Petal.Length < 100)
iris >> dp.sift(X.PetalLength > 5)
iris[(iris['PetalLength'] > 5) & (iris['PetalLength'] < 6)]

# data = iris %>%
#   dplyr::group_by(Species) %>%
#   summarise(media = mean(Petal.Length))
iris >> dp.group_by(X.Species) >> dp.summarize(media=X.PetalLength.mean())
iris.groupby(['Species'])['PetalLength'].agg(['mean', 'sum', 'count'])
iris.groupby(['Species'])['PetalLength'].agg({'var1': 'mean', 'var2': 'sum', 'var3': 'count'})
iris.groupby(['Species'])['PetalLength'].agg({'var1': ['mean', 'sum']})

aggregations = {
    'dsuma': 'sum',
}

import math
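
# Note (added): passing a renaming dict like {'var1': 'mean'} to a grouped Series was
# deprecated and later removed in pandas, so on recent versions the same result is
# written with named aggregation (output_name=(column, function)), e.g.:
iris.groupby('Species').agg(
    var1=('PetalLength', 'mean'),
    var2=('PetalLength', 'sum'),
    var3=('PetalLength', 'count'),
    dsuma=('PetalLength', 'sum'),   # same role as the 'aggregations' dict above
)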
def main(argv):
    yURL = None
    outdir = None
    maxFrames = 500
    yURL = input("Enter the youtube url:")
    outdir = input("Enter the output directory:")
    maxFrames = int(input("Enter the maximum number of frames to check:"))

    faceDet = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_default.xml")
    faceDet2 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt2.xml")
    faceDet3 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt.xml")
    faceDet4 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt_tree.xml")

    pdata, pframes, pfacedims = getNewInstances(yURL, faceDet, faceDet2,
                                                faceDet3, faceDet4,
                                                maxCount=maxFrames)

    headers = dict()
    headers['Ocp-Apim-Subscription-Key'] = ms_key1
    headers['Content-Type'] = 'application/octet-stream'

    resultsDf = pd.DataFrame()
    frameId = 0
    for image in pframes:
        print("posting frame %d of %d" % (frameId, len(pframes)))
        # send the frame image to MS cognitive services
        resultMS = processRequest(image, headers)
        # only parse list responses; each entry should be a dict per detected face
        if isinstance(resultMS, list):
            for result in resultMS:
                if isinstance(result, dict):
                    resFrameList = []
                    for res in result['scores'].items():
                        resFrameList.append(
                            (frameId, res[0], res[1],
                             result["faceRectangle"]['left'],
                             result["faceRectangle"]['top'],
                             result["faceRectangle"]['width'],
                             result["faceRectangle"]['height']))
                    appendDf = pd.DataFrame(resFrameList, columns=[
                        "frameId", "emotionLabel", "conf", "faceleft",
                        "facetop", "faceW", "faceH"
                    ])
                    resultsDf = resultsDf.append(appendDf)
        time.sleep(2)
        frameId += 1
    # print(resultsDf)

    # We append all the data to the dataframe
    # http://bluescreen.club/2017/06/18/import-pandas-as-pd/
    # Then we convert the dataframe to a DplyFrame object, which allows us to do
    # higher-level data analytics. Here we select the top-ranking face frame for each
    # emotion: Microsoft provides around 8 emotions, so we pick 8 faces for 8 emotions
    # and save them accordingly.
    dfFaces = DplyFrame(resultsDf)
    # print(dfFaces)
    topFaces = (
        dfFaces >> group_by(X.emotionLabel) >> sift(X.conf == X.conf.max())
        >> sift(X.frameId == X.frameId.min()) >> ungroup() >> group_by(
            X.frameId) >> sift(X.conf == X.conf.max()) >> ungroup() >> arrange(
                X.emotionLabel))
    topFaces = topFaces.drop_duplicates()
    # print(topFaces)

    i = 0
    for index, row in topFaces.iterrows():
        print("saving emotion frame %d of %d" % (i, len(topFaces.index)))

        emotion = row["emotionLabel"]
        confid = int(row["conf"] * 100)
        image = pframes[int(row["frameId"])]
        faceL = row["faceleft"]
        faceT = row["facetop"]
        faceW = row["faceW"]
        faceH = row["faceH"]

        # save cropped face
        imageW = image[faceT:faceT + faceH, faceL:faceL + faceW]
        cv2.imwrite(
            os.path.expanduser("%s/Cropped_%s.jpg" % (outdir, emotion)),
            imageW)

        # if you wish to put a rectangle on the faces, uncomment below
        # cv2.rectangle(image, (faceL, faceT),
        #               (faceL + faceW, faceT + faceH),
        #               color=(255, 0, 0), thickness=5)
        # cv2.putText(image, emotion, (faceL, faceT - 10),
        #             cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
        # cv2.imwrite(os.path.expanduser("%s/%s.jpg" % (outdir, emotion)), image)
        i += 1
# Extending dfply with custom functions
# https://github.com/kieferk/dfply/blob/master/examples/basics-extending-functionality.ipynb

############### dplython #################
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
                      sample_frac, head, arrange, mutate, group_by,
                      summarize, DelayFunction)

df = DplyFrame(df)
df >> head(5)
df >> sample_n(5)
df >> select(X.name, X.category, X.country, X.role, X.description)
df >> sift(X.category == 'Leadership')  # As in pandas, use bitwise logical operators like | and & (a comma between conditions acts like &)
df >> arrange(X.country)  # couldn't find a way to sort descending, so moved to the dfply library
df >> mutate(carat_bin=X.carat.round())
df >> group_by(X.category) >> summarize(num_of_people=X.name.count())

# It's possible to pass the entire dataframe using X._
# The special Later name "_" will refer to the entire DataFrame.

# Combine multiple verbs in one pipeline:
(df >> sift(X.name != 'Unsung hero') >> group_by(X.category) >> summarize(num_of_people=X.name.count())
 ).set_index('category').plot(title='# of Women Recognized by Category',
                              y='num_of_people', ylabel='', legend=False, kind='pie')
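
# The header above points at dfply's extension notebook; dplython's own hook for custom
# functions is the DelayFunction decorator already in the import list. A minimal sketch
# along the lines of the dplython README (PairwiseGreater is just an illustrative name):
import pandas as pd
from dplython import DelayFunction, diamonds, X, mutate, head

@DelayFunction
def PairwiseGreater(series1, series2):
    # element-wise max of two columns, preserving the original index
    return pd.Series([max(a, b) for a, b in zip(series1, series2)],
                     index=series1.index)

# The decorator delays evaluation, so the function can be used inside a pipe:
diamonds >> mutate(larger_dim=PairwiseGreater(X.x, X.y)) >> head(3)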
    denominator = np.max(data, 0) - np.min(data, 0)
    return numerator / (denominator + 1e-7)

# train Parameters
seq_length = 60
data_dim = 8
hidden_dim = 10
output_dim = 1
learning_rate = 0.01
iterations = 500

# last, diff_24h, diff_per_24h, bid, ask, low, high, volume
data = DplyFrame(pd.read_csv('./bitcoin_ticker.csv', delimiter=','))
data = data >> sift(X.rpt_key == 'btc_krw') >> select(
    X.last, X.diff_24h, X.diff_per_24h, X.bid, X.ask, X.low, X.high, X.volume)
data = np.asarray(data)
# data = MinMaxScaler(data)
data = tf.layers.batch_normalization(data)
x = data
y = data[:, [0]]  # last as label

# build a dataset
dataX = []
dataY = []
for i in range(0, len(y) - seq_length):
    _x = x[i:i + seq_length]
    _y = y[i + seq_length]  # next close price
    print(_x, "->", _y)
    dataX.append(_x)
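
# Note (added): the tf.layers.batch_normalization call above is TF1-style and normalizes
# a raw NumPy array at preprocessing time; the commented-out MinMaxScaler line suggests
# the intended scaling. A minimal sketch of that route, assuming the MinMaxScaler helper
# whose tail appears at the top of this snippet, kept commented out like the original:
# data = MinMaxScaler(data)   # scales every column into [0, 1]
# x = data
# y = data[:, [0]]            # 'last' price column as the prediction target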
plt.tight_layout(pad=0)
plt.show()

"""stacked bar plot"""
dfx = (dfr >> arrange(-X.tot))
dfx = dfx.head(50)
from plotnine import *
(ggplot(dfx, aes(x='word', y='tot', fill='source'))
 + geom_col()
 + theme(axis_text_x=element_text(rotation=45, hjust=1))
 )

"""each newspaper"""
dfr = DplyFrame(output)
df_tele = (dfr >> sift(X.source == "guardian"))
df_tele = (df_tele >> group_by(X.word, X.source) >> summarize(tot=X.count.sum()))
df_tele = (df_tele >> select(X.word, X.tot))

d = {}
for a, x in dff.values:
    d[a] = x

wordcloud = WordCloud(width=1000, height=1000,
                      background_color='white',
                      min_font_size=10,
                      max_font_size=150).generate_from_frequencies(frequencies=d)

# plot the WordCloud image
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
def czMatchmaker(data, Q, precursor_fasta):
    data = pd.read_csv(
        "/Users/matteo/Documents/czMatchmaker/data/examplaryData.csv")
    data = DplyFrame(data)
    precursors = data >> \
        sift(X.tag == 'precursor') >> \
        select(X.active, X.neutral, X.estimates)
    fragments = data >> sift(X.tag != 'precursor') >> \
        group_by(X.tag, X.active, X.broken_bond) >> \
        summarize(estimates=X.estimates.sum())

    I_on_fragments = {}
    optiminfos = {}
    for break_point, data in fragments.groupby('broken_bond'):
        pairing, optiminfo = collect_fragments(data, Q)
        I_on_fragments[break_point] = pairing
        optiminfos[break_point] = optiminfo

    cations_fragmented_I = sum(
        sum(I_on_fragments[bP][p] for p in I_on_fragments[bP])
        for bP in I_on_fragments)

    I_no_reactions = precursors >> \
        sift(X.active == Q, X.neutral == 0) >> \
        select(X.estimates)
    I_no_reactions = I_no_reactions.values.flatten()[0]

    prec_ETnoD_PTR_I = precursors >> \
        sift(X.active != Q) >> \
        rename(ETnoD=X.neutral, I=X.estimates) >> \
        mutate(PTR=Q - X.ETnoD - X.active) >> \
        select(X.ETnoD, X.PTR, X.I)

    I_prec_no_frag = prec_ETnoD_PTR_I >> \
        summarize(I=X.I.sum())
    I_prec_no_frag = I_prec_no_frag.values.flatten()[0]

    precursorNoReactions = precursors >> \
        sift(X.active == Q) >> \
        select(X.estimates)

    prec_ETnoD_PTR_I = prec_ETnoD_PTR_I >> \
        mutate(I_PTR=crossprod(X.PTR, X.I),
               I_ETnoD=crossprod(X.ETnoD, X.I)) >> \
        summarize(I_PTR=X.I_PTR.sum(), I_ETnoD=X.I_ETnoD.sum())

    I_PTR_no_frag, I_ETnoD_no_frag = prec_ETnoD_PTR_I.values.flatten()

    prob_PTR = I_PTR_no_frag / (I_PTR_no_frag + I_ETnoD_no_frag)
    prob_ETnoD = 1. - prob_PTR

    I_frags = dict(
        (bP, sum(I_on_fragments[bP][pairing] for pairing in I_on_fragments[bP]))
        for bP in I_on_fragments)
    I_frag_total = sum(I_frags[bP] for bP in I_frags)

    prob_frag = Counter(
        dict((int(bP), I_frags[bP] / I_frag_total) for bP in I_frags))
    prob_frag = [prob_frag[i] for i in range(len(precursor_fasta))]

    I_frags_PTRETnoD_total = sum(
        (Q - 1 - sum(q for cz, q in pairing)) * I_on_fragments[bP][pairing]
        for bP in I_on_fragments for pairing in I_on_fragments[bP])

    anion_meets_cation = I_frags_PTRETnoD_total + I_PTR_no_frag + I_ETnoD_no_frag
    prob_fragmentation = I_frags_PTRETnoD_total / anion_meets_cation
    prob_no_fragmentation = 1 - prob_fragmentation

    prob_no_reaction = I_no_reactions / (I_no_reactions + I_frag_total +
                                         I_prec_no_frag)
    prob_reaction = 1. - prob_no_reaction

    res = {}
    res['reaction'] = (prob_reaction, prob_no_reaction)
    res['fragmentation'] = (prob_fragmentation, prob_no_fragmentation)
    res['fragmentation_amino_acids'] = tuple(prob_frag)
    return res
# time.clock() was removed in Python 3.8; perf_counter() is the replacement.
start_time = time.perf_counter()
dt = d0.ply_where(X.usertype == 'Subscriber').ply_select(
    slat=X.latitude_start * math.pi / 180,
    elat=X.latitude_end * math.pi / 180,
    slng=X.longitude_start * math.pi / 180,
    elng=X.longitude_end * math.pi / 180)
print('pandas_ply: ' + str(round(time.perf_counter() - start_time, 2)) + ' seconds.')

# dplython
import pandas
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
                      sample_frac, head, arrange, mutate, group_by,
                      summarize, DelayFunction)

start_time = time.perf_counter()
dt = DplyFrame(d0) >> sift(X.usertype == 'Subscriber') >> mutate(
    slat=X.latitude_start * math.pi / 180,
    elat=X.latitude_end * math.pi / 180,
    slng=X.longitude_start * math.pi / 180,
    elng=X.longitude_end * math.pi / 180)
print('dplython: ' + str(round(time.perf_counter() - start_time, 2)) + ' seconds.')

# dfply
from dfply import *
import pandas as pd

start_time = time.perf_counter()
dt = d0 >> mask(X.usertype == 'Subscriber') >> mutate(
    slat=X.latitude_start * math.pi / 180,
    elat=X.latitude_end * math.pi / 180,
    slng=X.longitude_start * math.pi / 180,
    elng=X.longitude_end * math.pi / 180)
import pandas
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
                      sample_frac, head, arrange, mutate, group_by,
                      summarize, DelayFunction)

diamonds >> head(5)
diamonds >> select(X.carat, X.cut, X.price) >> head(5)
d = (diamonds >> sift(X.carat > 4) >>
     select(X.carat, X.cut, X.depth, X.price) >> head(2))
(diamonds >> mutate(carat_bin=X.carat.round()) >>
 group_by(X.cut, X.carat_bin) >> summarize(avg_price=X.price.mean()))

test = df['deaths'] < 0
less_than_zero = df[test]
print(less_than_zero.shape)
print(less_than_zero.head())
test
# df['deaths_fixed'] = df['deaths_new'].apply(lambda x: 'True' if x <= 0 else 'False')
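
# For comparison (added), the grouped-summarize diamonds pipeline earlier in this
# snippet written in plain pandas, assuming the same diamonds frame:
(diamonds.assign(carat_bin=diamonds.carat.round())
         .groupby(['cut', 'carat_bin'])['price']
         .mean()
         .reset_index(name='avg_price'))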
def load_data(input_dir, crsrd_id):
    cctv_log = pd.read_csv(input_dir + "/ORT_CCTV_5MIN_LOG.csv")
    cctv_mst = pd.read_csv(input_dir + "/ORT_CCTV_MST.csv")

    cctv_log['DATE'] = pd.DataFrame(pd.DatetimeIndex(cctv_log['REG_DT']).date)
    cctv_log['HOUR'] = pd.DataFrame(pd.DatetimeIndex(cctv_log['REG_DT']).hour)
    cctv_log['MINUTE'] = (
        pd.DataFrame(pd.DatetimeIndex(cctv_log['REG_DT']).minute) // 30) * 30
    cctv_log['temp_DAY'] = pd.to_datetime(cctv_log['DATE']).dt.dayofweek
    cctv_log.loc[cctv_log['temp_DAY'] < 5, 'DAY'] = int(0)   # Mon-Fri
    cctv_log.loc[cctv_log['temp_DAY'] == 5, 'DAY'] = int(1)  # Sat
    cctv_log.loc[cctv_log['temp_DAY'] == 6, 'DAY'] = int(2)  # Sun

    df0 = DplyFrame(cctv_log) >> group_by(
        X.DATE, X.DAY, X.HOUR, X.MINUTE, X.CCTV_ID) >> summarize(
            GO_TRF=X.GO_BIKE.sum() + X.GO_CAR.sum() + X.GO_SUV.sum() +
            X.GO_VAN.sum() + X.GO_TRUCK.sum() + X.GO_BUS.sum() +
            X.RIGHT_BIKE.sum() + X.RIGHT_CAR.sum() + X.RIGHT_SUV.sum() +
            X.RIGHT_VAN.sum() + X.RIGHT_TRUCK.sum() + X.RIGHT_BUS.sum(),
            LEFT_TRF=X.LEFT_BIKE.sum() + X.LEFT_CAR.sum() + X.LEFT_SUV.sum() +
            X.LEFT_VAN.sum() + X.LEFT_TRUCK.sum() + X.LEFT_BUS.sum())

    # Extract records of the selected crossroad
    cctv_mst = DplyFrame(cctv_mst) >> sift(X.CRSRD_ID == crsrd_id) >> select(
        X.CRSRD_ID, X.CCTV_ID)
    df0 = pd.merge(df0, cctv_mst, how="inner", on="CCTV_ID")
    df0 = df0.sort_values(['DATE', 'HOUR', 'MINUTE', 'CCTV_ID'])

    # Time frame from the existing dataset
    tf = DplyFrame(
        df0.drop_duplicates(['DATE', 'DAY', 'HOUR', 'MINUTE'],
                            keep='last')) >> select(X.DATE, X.DAY, X.HOUR,
                                                    X.MINUTE)

    # Pivot the data structure: one GO/LEFT column pair per CCTV
    cctv_list = sorted(cctv_mst['CCTV_ID'].unique())
    df1 = tf
    for cctv in cctv_list:
        a = df0 >> sift(X.CCTV_ID == cctv) >> select(
            X.DATE, X.DAY, X.HOUR, X.MINUTE, X.GO_TRF, X.LEFT_TRF)
        df1 = pd.merge(df1, a, how='left',
                       on=['DATE', 'DAY', 'HOUR', 'MINUTE'],
                       suffixes=('', '_' + str(cctv)))

    df1 = df1.set_index(['DATE', 'DAY', 'HOUR', 'MINUTE'])
    df1 = df1.fillna(df1.rolling(window=24, min_periods=1, center=True).mean())
    df1 = df1.fillna(0)
    df1 = df1.reset_index()

    df1['TOTAL_TRF'] = DplyFrame(df1.iloc[:, 4:3 + len(cctv_list) * 2].sum(
        axis=1, skipna=True))
    df1 = df1 >> sift(X.TOTAL_TRF > 0)
    print(df1)

    # Name the CCTV id and direction - for tod_traffic_analyzer
    cols = [cctv + '_GO_RATE' for cctv in cctv_list]
    cols.extend([cctv + '_LEFT_RATE' for cctv in cctv_list])
    cols = sorted(cols)
    cols = ['TOD'] + cols + ['TOTAL_TRF']

    return df1, cols
def main(argv):
    ytURL = None
    outdir = None
    maxFrames = 500
    try:
        opts, args = getopt.getopt(argv, "hy:o:m:",
                                   ["yturl=", "odir=", "maxframes="])
    except getopt.GetoptError:
        print('Error: shellScript.py -y <yturl> -o <odir> -m <maxframes>')
        sys.exit(2)
    # print(opts)
    for opt, arg in opts:
        if opt == '-h':
            print('help: shellScript.py -y <yturl> -o <odir> -m <maxframes>')
            sys.exit()
        elif opt in ("-y", "--yturl"):
            print("--yturl={}".format(arg))
            ytURL = arg
        elif opt in ("-o", "--odir"):
            print("--odir={}".format(arg))
            outdir = arg
        elif opt in ("-m", "--maxframes"):
            print("--maxframes={}".format(arg))
            maxFrames = int(arg)

    if ytURL is None:
        print('bad yt: shellScript.py -y <yturl> -o <odir> -m <maxframes>')
        sys.exit()

    if outdir is None:
        print('bad outdir: shellScript.py -y <yturl> -o <odir> -m <maxframes>')
        sys.exit()

    if not isinstance(maxFrames, int):  # Python 3 has no separate 'long' type
        print('bad maxFrames: shellScript.py -y <yturl> -o <odir> -m <maxframes>')
        sys.exit()

    faceDet = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_default.xml")
    faceDet2 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt2.xml")
    faceDet3 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt.xml")
    faceDet4 = cv2.CascadeClassifier(
        "haarcascade/haarcascade_frontalface_alt_tree.xml")

    pdata, pframes, pfacedims = getNewInstances(ytURL, faceDet, faceDet2,
                                                faceDet3, faceDet4,
                                                maxCount=maxFrames)

    headers = dict()
    headers['Ocp-Apim-Subscription-Key'] = ms_key1
    headers['Content-Type'] = 'application/octet-stream'

    resultsDf = pd.DataFrame()
    frameId = 0
    for image in pframes:
        print("posting frame %d of %d" % (frameId, len(pframes)))
        resultMS = processRequest(image, headers)

        if isinstance(resultMS, list):
            for result in resultMS:
                if isinstance(result, dict):
                    resFrameList = []
                    for res in result['scores'].items():
                        resFrameList.append(
                            (frameId, res[0], res[1],
                             result["faceRectangle"]['left'],
                             result["faceRectangle"]['top'],
                             result["faceRectangle"]['width'],
                             result["faceRectangle"]['height']))
                    appendDf = pd.DataFrame(resFrameList, columns=[
                        "frameId", "emotionLabel", "conf", "faceleft",
                        "facetop", "faceW", "faceH"
                    ])
                    resultsDf = resultsDf.append(appendDf)
        time.sleep(2)
        frameId += 1

    dfFaces = DplyFrame(resultsDf)

    topFaces = (
        dfFaces >> group_by(X.emotionLabel) >> sift(X.conf == X.conf.max())
        >> sift(X.frameId == X.frameId.min()) >> ungroup() >> group_by(
            X.frameId) >> sift(X.conf == X.conf.max()) >> ungroup() >> arrange(
                X.emotionLabel))
    topFaces = topFaces.drop_duplicates()
    # print(topFaces)

    i = 0
    for index, row in topFaces.iterrows():
        print("saving emotion frame %d of %d" % (i, len(topFaces.index)))

        emotion = row["emotionLabel"]
        confid = int(row["conf"] * 100)
        image = pframes[int(row["frameId"])]
        faceL = row["faceleft"]
        faceT = row["facetop"]
        faceW = row["faceW"]
        faceH = row["faceH"]

        # save cropped face
        imageW = image[faceT:faceT + faceH, faceL:faceL + faceW]
        cv2.imwrite(
            os.path.expanduser("%s/Cropped_%s.jpg" % (outdir, emotion)),
            imageW)

        # draw the face rectangle and emotion label on the full frame, then save it
        cv2.rectangle(image, (faceL, faceT), (faceL + faceW, faceT + faceH),
                      color=(255, 0, 0), thickness=5)
        cv2.putText(image, emotion, (faceL, faceT - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
        cv2.imwrite(os.path.expanduser("%s/box%s.jpg" % (outdir, emotion)),
                    image)
        i += 1
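
# Usage sketch for the option handling above (added; the URL and output directory are
# placeholders):
#   python shellScript.py -y "https://www.youtube.com/watch?v=<video-id>" -o ./frames_out -m 500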
                     'away_yellows': away_yellows,
                     'home_reds': home_reds,
                     'away_reds': away_reds})
    df_container.append(data)

# concatenate all into one master data frame & generate descriptive stats
master_df = DplyFrame(pd.concat(df_container))
print("home goals", "\n", master_df.home_goals.describe())
print("away goals", "\n", master_df.away_goals.describe())
print("home yellows", "\n", master_df.home_yellows.describe())
print("away yellows", "\n", master_df.away_yellows.describe())
print("home reds", "\n", master_df.home_reds.describe())
print("away reds", "\n", master_df.away_reds.describe())

print("frequency of home goals", len(master_df >> sift(X.home_goals > 0)))
print("frequency of away goals", len(master_df >> sift(X.away_goals > 0)))
print("frequency of home yellows", len(master_df >> sift(X.home_yellows > 0)))
print("frequency of away yellows", len(master_df >> sift(X.away_yellows > 0)))
print("frequency of home reds", len(master_df >> sift(X.home_reds > 0)))
print("frequency of away reds", len(master_df >> sift(X.away_reds > 0)))

goals = master_df.apply(lambda row: row.home_goals + row.away_goals, axis=1)
print("goals", goals.describe())
print("freq", (goals > 0).sum())  # Series.nonzero() was removed; count matches with a boolean mask

yellows = master_df.apply(lambda row: row.home_yellows + row.away_yellows,
                          axis=1)
print("yellows", yellows.describe())
print("yellows", (yellows > 0).sum())

reds = master_df.apply(lambda row: row.home_reds + row.away_reds, axis=1)