def to_player(repoRaw, repoPbP, repoPSt):
    """Split the per-season stat tables into one pickled table per player.

    Each season folder listed by ``ut_find_folders(repoRaw, True)`` must hold
    an ``all_data.p`` pickle containing a DataFrame with a ``playerName``
    column. For every name returned by ``get_player_names(repoPbP)``, the rows
    whose upper-cased ``playerName`` equals that name are gathered across
    seasons and pickled to ``<repoPSt>/<name, spaces replaced by '_'>.p``.
    A progress bar is written to ``stdout`` after each player.

    Args:
        repoRaw: folder holding one sub-folder per season.
        repoPbP: folder handed to ``get_player_names`` to list the players.
        repoPSt: destination folder for the per-player pickles.
    """
    # --- List all seasons
    allS = ut_find_folders(repoRaw, True)
    # --- Retrieve list of all players - across seasons
    plNames = get_player_names(repoPbP)

    # --- Load every season exactly once. The season tables do not depend on
    # the player, so reading and upper-casing them inside the player loop
    # repeated the same disk I/O for every single player.
    season_frames = []
    for isea in allS:
        with open(path.join(repoRaw, isea, 'all_data.p'), 'rb') as f:
            # NOTE(review): pickle.load runs arbitrary code from the file;
            # only use on files produced by this project.
            season = pickle.load(f)
        season['playerName'] = season['playerName'].apply(lambda x: x.upper())
        season_frames.append(season)

    # --- Retrieve stats for each player
    nPlayers = len(plNames)
    for count, pl in enumerate(plNames, start=1):
        # Rows of this player, season by season
        pieces = [fr[fr.loc[:, 'playerName'] == pl] for fr in season_frames]
        # pd.concat refuses an empty list: fall back to an empty frame
        plFrame = pd.concat(pieces, axis=0) if pieces else pd.DataFrame()
        # Save to new file
        svFile = path.join(repoPSt, pl.replace(' ', '_') + '.p')
        with open(svFile, 'wb') as f:
            pickle.dump(plFrame, f)
        # Print status bar ('\r' rewinds so the bar is redrawn in place)
        stdout.write('\r')
        stdout.write(
            "Player %i/%i - %s: [%-40s] %d%%, completed"
            % (count, nPlayers, pl, '=' * int(count / nPlayers * 40),
               100 * count / nPlayers))
        stdout.flush()
def get_player_names(repoPbP):
    """Return the unique player names listed in the season rosters.

    For every folder reported by ``ut_find_folders(repoPbP, True)`` the roster
    CSV (folder name with 'Season' replaced by 'roster', plus '.csv') is read
    and its ``firstlast`` column is kept. The columns are stacked, duplicates
    are dropped (first occurrence kept) and entries whose type is not exactly
    ``str`` are discarded.

    Args:
        repoPbP: folder holding one sub-folder per season.

    Returns:
        A pandas Series of names (the index is whatever survives the
        concatenation, it is not reset).
    """
    season_dirs = ut_find_folders(repoPbP, True)

    # One 'firstlast' column per season roster
    name_columns = []
    for season in season_dirs:
        roster_csv = path.join(
            repoPbP, season, season.replace('Season', 'roster') + '.csv')
        name_columns.append(pd.read_csv(roster_csv)['firstlast'])

    # Stack the seasons and keep each name once
    names = pd.concat(name_columns)
    names = names.drop_duplicates(keep='first')

    # Positional boolean mask: keep genuine strings only
    is_text = [type(entry) is str for entry in names]
    return names[is_text]
def _nearest_center_index(points, global_centers):
    """Return, for each row of ``points``, which global center is closest (0 selke, 1 ross, 2 poor)."""
    distances = [
        np.sqrt(np.sum(np.subtract(points, global_centers[key]) ** 2, axis=1))
        for key in ('selke', 'ross', 'poor')
    ]
    return np.argmin(np.array(distances), axis=0)


def do_assess_clustering_robustness(dtCols, normalizer, global_centers, pca, nGames=80):
    """Score how well the global centers separate the three player groups.

    For every season (2003-2004 excluded) and for 10, 20, ..., 80 games, the
    players are projected with ``get_data_for_clustering``; the Selke
    nominees, the Ross nominees and the lowest-ranked players are each
    assigned to the nearest entry of ``global_centers``. The score stored for
    that (season, number of games) cell is the trace of the 3x3 assignment
    matrix divided by its sum.

    Args:
        dtCols, normalizer, pca: forwarded to ``get_data_for_clustering``.
        global_centers: mapping with keys 'selke', 'ross' and 'poor'.
        nGames: accepted for compatibility; not used by the body, which
            always sweeps 10..80 games.

    Returns:
        DataFrame indexed by the concatenated season years (e.g. '20052006')
        with one column per number of games.

    NOTE(review): ``repoPbP``, ``repoModel`` and ``root`` are read from the
    enclosing module scope, not from the arguments - confirm they are
    defined where this function lives.
    NOTE(review): ``get_data_for_clustering`` is called here with four
    leading positional arguments, while ``do_clustering_multiyear`` passes
    six - confirm which signature is current.
    """
    # List years
    allS_p = ut_find_folders(repoPbP, True)
    years = [[x.split('_')[1][:4], x.split('_')[1][4:]] for x in allS_p]
    # Drop 2003-2004 when present (a bare pop(index(...)) raised when absent)
    years = [y for y in years if y != ['2003', '2004']]
    nGamesL = list(range(10, 81, 10))
    # Width follows nGamesL so the two can never drift apart
    accuracy = pd.DataFrame(np.zeros([len(years), len(nGamesL)]),
                            columns=nGamesL,
                            index=[''.join(x) for x in years])
    for iy in years:
        season = ''.join(iy)
        # Trophy nominees only depend on the season: read them once per year
        selke = to_pandas_selke(path.join(root, 'Databases/Hockey/PlayerStats/raw/' + season + '/trophy_selke_nominees.csv'))
        selke = selke[~selke.index.duplicated(keep='first')]
        ross = to_pandas_ross(path.join(root, 'Databases/Hockey/PlayerStats/raw/' + season + '/trophy_ross_nominees.csv'))
        ross = ross[~ross.index.duplicated(keep='first')]
        for nG in nGamesL:
            # Retrieve data
            data, classes = get_data_for_clustering(repoModel, dtCols, normalizer, pca,
                                                    upto=iy[1] + '-07-01',
                                                    asof=iy[0] + '-09-01',
                                                    nGames=nG)
            # Row positions of the nominees (rows marked 'D' are skipped)
            data_index = list(data.index)
            selke_id = [data_index.index(x) for x in selke[selke['Pos'] != 'D'].index]
            ross_id = [data_index.index(x) for x in ross[ross['pos'] != 'D'].index]
            # 30 players closest to the per-column minimum, minus the nominees
            seed = classes.min(axis=0)
            distance = np.sqrt(((classes - seed) ** 2).sum(axis=1)).sort_values()
            poor_id = [classes.index.get_loc(x) for x in distance.index[:30]]
            poor_id = ut_difference(ut_difference(poor_id, selke_id), ross_id)
            # --- Compute confusion
            slk_min = _nearest_center_index(classes.iloc[selke_id].values, global_centers)
            ros_min = _nearest_center_index(classes.iloc[ross_id].values, global_centers)
            por_min = _nearest_center_index(classes.iloc[poor_id].values, global_centers)
            # Row i = group i, column j = fraction assigned to center j
            MTX = [[np.sum(x == 0) / len(x), np.sum(x == 1) / len(x), np.sum(x == 2) / len(x)]
                   for x in [slk_min, ros_min, por_min]]
            # Single .loc[row, col] write: the chained form .loc[row][col] = v
            # may assign into a temporary copy and leave the frame untouched.
            accuracy.loc[season, nG] = np.sum(np.diag(MTX)) / np.sum(MTX)
    return accuracy
def do_ANN_training(repoPSt, repoPbP, repoCode, repoModel, allS_p=None, minGames=0, stats_fetcher='default'):
    """Build the training set, normalise/reduce it and train the classifier.

    Steps: fetch the data with ``get_training_data``, sanitize every matrix,
    keep players with at least ``minGames`` games, fit the normalizer and the
    18-component reduction on the rows whose position is not 'D', then train
    an ``ANN_classifier`` with one hidden layer of 15 nodes for 50 iterations.

    Args:
        repoPSt, repoPbP: data folders forwarded to the fetcher / data loader.
        repoCode: only referenced by the disabled caching snippet below.
        repoModel: passed to ``ann_train_network`` as ``svname``.
        allS_p: seasons to use; listed from ``repoPbP`` when None.
        minGames: minimum number of games; a value strictly between 0 and 1
            is replaced by a threshold derived from the maximum of PLD_all.
        stats_fetcher: fetcher instance, or 'default' to build a
            ``PlayerStatsFetcher``.

    Returns:
        Tuple ``(Nrm, pca, colNm, CLS)``: fitted normalizer, fitted reducer,
        column names and the trained classifier.
    """
    # --- GET TRAINING DATASET
    if allS_p is None:
        # List non-lockout seasons
        allS_p = ut_find_folders(repoPbP, True)
    if stats_fetcher == 'default':
        stats_fetcher = PlayerStatsFetcher(repoPSt, repoPbP, True)
    X, Y, X_all, POS_all, PLD_all, colNm = get_training_data(
        repoPSt, repoPbP, allS_p, stats_fetcher, minGames=minGames)

    # Disabled caching of the training set, kept for reference:
    # with open(path.join(repoCode, 'ReinforcementLearning/NHL/playerstats/offVSdef/Automatic_classification/trainingData.p'), 'wb') as f:
    #     pickle.dump({'X':X, 'Y':Y, 'X_all':X_all, 'colNm':colNm, 'POS_all':POS_all}, f)
    # with open(path.join(repoCode, 'ReinforcementLearning/NHL/playerstats/offVSdef/Automatic_classification/trainingData.p'), 'rb') as f:
    #     DT = pickle.load(f)
    # colNm = DT['colNm']
    # X = DT['X'][colNm]
    # Y = DT['Y']
    # X_all = DT['X_all']
    # POS_all = DT['POS_all']

    # --- PRE-PROCESS DATA
    # Every call receives the matrices as fetched above; nothing is rebound
    # until all five results are available.
    clean_Y = ut_sanitize_matrix(Y, X)
    clean_X = ut_sanitize_matrix(X)
    clean_POS = ut_sanitize_matrix(POS_all, X_all)
    clean_PLD = ut_sanitize_matrix(PLD_all, X_all)
    clean_X_all = ut_sanitize_matrix(X_all)
    Y, X, POS_all, PLD_all, X_all = clean_Y, clean_X, clean_POS, clean_PLD, clean_X_all

    # --- KEEP >N GAMES
    if 0 < minGames < 1:
        # NOTE(review): the fractional value itself is not used, the
        # threshold is always max / 5 - confirm this is intended.
        minGames = (np.max(PLD_all) / 5).astype('int')[0]
    enough_games = (PLD_all >= minGames).values
    X_all = X_all[enough_games]
    POS_all = POS_all[enough_games]

    # --- NORMALISE AND REDUCE
    # Normalizer and reducer are fitted on the rows whose position is not 'D'
    not_defense = (POS_all != 'D').values
    X_all_S, Nrm = do_normalize_data(X_all[not_defense])
    X_S, _ = do_normalize_data(X, normalizer=Nrm)
    _, pca = do_reduce_data(X_all_S, nComp=18)
    X_S_P, _ = do_reduce_data(X_S, pca=pca, nComp=18)

    # --- BUILD THE NETWORK
    layer_sizes = [X_S_P.shape[1], 15, Y.shape[1]]
    CLS = ANN_classifier(deepcopy(layer_sizes))

    # --- TRAIN THE NETWORK
    n_iterations = 50
    CLS.ann_train_network(n_iterations, X_S_P, Y.values, svname=repoModel)

    # --- DISPLAY NETWORK ACCURACY
    #CLS.ann_display_accuracy()
    return Nrm, pca, colNm, CLS
def to_pandas(repoRaw):
    """Convert each season's trophy nominee CSVs into a single pickle.

    For every season folder under ``repoRaw``, 'trophy_ross_nominees.csv' is
    parsed with ``to_pandas_ross`` and 'trophy_selke_nominees.csv' with
    ``to_pandas_selke``; both results are written to 'trophy_nominees.p' in
    the same folder as ``{'ross': ..., 'selke': ...}``.

    Args:
        repoRaw: folder holding one sub-folder per season.
    """
    for season in ut_find_folders(repoRaw, True):
        season_dir = path.join(repoRaw, season)
        # Art Ross nominees
        ross_table = to_pandas_ross(path.join(season_dir, 'trophy_ross_nominees.csv'))
        # Selke nominees
        selke_table = to_pandas_selke(path.join(season_dir, 'trophy_selke_nominees.csv'))
        # Store both tables side by side
        with open(path.join(season_dir, 'trophy_nominees.p'), 'wb') as out:
            pickle.dump({'ross': ross_table, 'selke': selke_table}, out)
def to_pandas(repoPbP):
    """Convert each season's play-by-play and roster CSVs into one pickle.

    For every season folder under ``repoPbP`` (named 'Season_<id>'), the files
    'playbyplay_<id>.csv' and 'roster_<id>.csv' are read with the python CSV
    engine and written to 'converted_data.p' in the same folder as
    ``{'playbyplay': ..., 'roster': ...}``.

    Args:
        repoPbP: folder holding one sub-folder per season.
    """
    for season in ut_find_folders(repoPbP, True):
        # Season identifier without its 'Season_' prefix
        season_id = season.replace('Season_', '')
        season_dir = path.join(repoPbP, season)
        # Play-by-play events
        events = pd.read_csv(
            path.join(season_dir, 'playbyplay_' + season_id + '.csv'),
            engine='python')
        # Team rosters
        roster = pd.read_csv(
            path.join(season_dir, 'roster_' + season_id + '.csv'),
            engine='python')
        # Store both tables side by side
        with open(path.join(season_dir, 'converted_data.p'), 'wb') as out:
            pickle.dump({'playbyplay': events, 'roster': roster}, out)
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os import path
from Utils.programming.ut_find_folders import *

# Visualize goal difference: show home advantage
# ==============================================
# Script: gathers the final score of every game of every season and plots the
# distribution of (home goals - away goals).

# list seasons
repoPbP = '/home/younesz/Documents/Databases/Hockey/PlayByPlay'
allS = ut_find_folders(repoPbP, True)

season_finals = []
for iS in allS:
    # Load data (the context manager closes the file; the previous
    # pickle.load(open(...)) left one handle open per season)
    with open(path.join(repoPbP, iS, 'converted_data.p'), 'rb') as f:
        dt = pickle.load(f)['playbyplay']
    # Keep only the last line of each game: it carries the final score
    dt = dt.drop_duplicates(subset=['gcode', 'season'], keep='last')[['away.score', 'home.score']]
    season_finals.append(dt)

# Concatenate once instead of growing the frame inside the loop
allG = pd.concat(season_finals, axis=0) if season_finals else pd.DataFrame()

# Viz
ann = "Win percentage for home team: %.1f %%" % (np.sum(allG['home.score'] > allG['away.score']) / len(allG) * 100)
plt.figure()
plt.hist(allG['home.score'] - allG['away.score'])
plt.xlabel('Home team goal diff')
plt.ylabel('Nb of games')
plt.annotate(ann, xy=(-9, 5000), xytext=(-9, 5000))
def _read_stat_table(csvF):
    """Parse one raw stat export into a DataFrame sorted by game, team and player."""
    import ast

    # Only the first line is read; the context manager closes the file
    with open(csvF, 'r') as f:
        first_line = f.readline()
    # SECURITY: this used to be a bare eval() on file content. literal_eval
    # accepts the same dict/list/str/number/None literals but cannot execute
    # code. 'null' is rewritten to 'None' exactly as before.
    raw = ast.literal_eval(first_line.replace('null', 'None'))
    records = raw['data']
    keys = list(records[0].keys())
    # Column order is taken from the first record
    rows = [list(x.values()) for x in records]
    table = pd.DataFrame(np.array(rows), columns=keys)
    # Sort by gameId, teamAbbrev, playerName, then renumber rows 0..n-1 so
    # that tables can be joined by row position
    table = table.sort_values(by=['gameId', 'teamAbbrev', 'playerName'], ascending=True)
    return table.set_index(np.arange(len(table)))


def to_pandas(repoRaw):
    """Merge each season's raw stat exports into a single pickled DataFrame.

    For every season folder under ``repoRaw`` the files 'summary.csv',
    'hits.csv', 'powerplay.csv' and 'penalty_kills.csv' are parsed, sorted by
    (gameId, teamAbbrev, playerName), joined column-wise, stripped of
    duplicated column names (first occurrence kept) and written to
    'all_data.p' in the same folder.

    The power-play table was previously left unsorted (its sorted copy was
    assigned to a misspelled, unused variable), so its rows did not line up
    with the other three tables in the column-wise join; all four tables are
    now sorted the same way.

    Args:
        repoRaw: folder holding one sub-folder per season.
    """
    stat_files = ('summary.csv', 'hits.csv', 'powerplay.csv', 'penalty_kills.csv')
    # --- List all seasons
    allS = ut_find_folders(repoRaw, True)
    # Loop on seasons
    for isea in allS:
        tables = [_read_stat_table(path.join(repoRaw, isea, name)) for name in stat_files]
        # --- Join the tables side by side (rows aligned on the 0..n-1 index)
        DF = pd.concat(tables, axis=1)
        # Drop duplicates
        DF = DF.loc[:, ~DF.columns.duplicated()]
        # Pickle the result
        with open(path.join(repoRaw, isea, 'all_data.p'), 'wb') as f:
            pickle.dump(DF, f)
def do_clustering_multiyear(repoModel, repoPSt, repoPbP, dtCols, normalizer, pca, root, plFetcher='default'):
    """Cluster players season by season, then cluster the per-season centers.

    For each season (2003-2004 excluded) the players are projected with
    ``get_data_for_clustering``; Selke nominees, Ross nominees and the 30
    lowest-ranked players are turned into pairwise constraints and a
    3-cluster ``cop_kmeans`` is run. The per-season centers are then
    clustered again into three global centers, which are labelled 'selke',
    'ross' and 'poor' and pickled to ``<repoModel>/baseVariables.p`` together
    with ``normalizer``, ``pca`` and ``dtCols``.

    Args:
        repoModel, repoPSt, repoPbP, dtCols, normalizer, pca, plFetcher:
            forwarded to ``get_data_for_clustering``; ``repoPbP`` is also
            used to list the seasons and ``repoModel`` receives the pickle.
        root: base folder containing 'Databases/Hockey/PlayerStats/raw/'.

    Returns:
        dict with keys 'selke', 'ross' and 'poor' mapping to the global
        centers.
    """
    # Make constraints
    allS_p = ut_find_folders(repoPbP, True)
    years = [[x.split('_')[1][:4], x.split('_')[1][4:]] for x in allS_p]
    allCla = pd.DataFrame()
    allCls = []
    allSLK = []
    allROS = []
    # Exclude year 2003-2004: problem with Fredrik Modin's data - named
    # Freddy Modin in the NHL stats page. Filtering (instead of
    # pop(index(...))) also works when that season is not on disk.
    years = [y for y in years if y != ['2003', '2004']]
    all_centers = []
    for iy in years:
        # Get data
        data, classes = get_data_for_clustering(repoModel, repoPSt, repoPbP, dtCols, normalizer, pca,
                                                upto=iy[1] + '-07-01', asof=iy[0] + '-09-01',
                                                nGames=81, plFetcher=plFetcher)
        data_index = list(data.index)
        # Get trophy nominees (rows marked 'D' are skipped)
        selke = to_pandas_selke(path.join(root, 'Databases/Hockey/PlayerStats/raw/' + ''.join(iy) + '/trophy_selke_nominees.csv'))
        selke = selke[~selke.index.duplicated(keep='first')]
        selke_id = [data_index.index(x) for x in selke[selke['Pos'] != 'D'].index]
        ross = to_pandas_ross(path.join(root, 'Databases/Hockey/PlayerStats/raw/' + ''.join(iy) + '/trophy_ross_nominees.csv'))
        ross = ross[~ross.index.duplicated(keep='first')]
        ross_id = [data_index.index(x) for x in ross[ross['pos'] != 'D'].index]
        # --- Clean constraints
        # Players nominated for both trophies stay in one list only: where
        # the argmax over the class columns is non-zero they are removed from
        # the Selke list, otherwise from the Ross list.
        torem = list(set(ross_id).intersection(selke_id))
        maxV = np.argmax(classes.iloc[torem].values, axis=1).astype(bool)
        selke_id = ut_difference(selke_id, list(compress(torem, maxV)))
        selke_wgt = selke.loc[data.iloc[selke_id].index]['WEIGHT_rank'].values
        selke_wgt = selke_wgt / np.max(selke_wgt)
        ross_id = ut_difference(ross_id, list(compress(torem, ~maxV)))
        ross_wgt = ross.loc[data.iloc[ross_id].index]['WEIGHT_rank'].values
        ross_wgt = ross_wgt / np.max(ross_wgt)
        # Get poorest ranked players: 30 closest to the per-column minimum,
        # nominees excluded
        seed = classes.min(axis=0)
        distance = np.sqrt(((classes - seed) ** 2).sum(axis=1)).sort_values()
        poor_id = [classes.index.get_loc(x) for x in distance.index[:30]]
        poor_id = ut_difference(ut_difference(poor_id, selke_id), ross_id)
        # Make the constraints
        constraints = ut_make_constraints((selke_id, selke_wgt), (ross_id, ross_wgt), poor_id)
        constraints = pd.DataFrame(constraints)
        constraints = constraints[constraints[0] != constraints[1]]
        # Make clusters
        cls_data = list(list(x) for x in classes.values)
        # Initial centers: weighted center of mass of each group
        cOm = [list(ut_center_of_mass(classes.iloc[x].values, np.reshape(y, [-1, 1])))
               for x, y in zip([selke_id, ross_id, poor_id],
                               [selke_wgt, ross_wgt, np.ones([1, len(poor_id)])])]
        # Last column > 0.5 goes to ml, < -0.5 goes to cl (passed to
        # cop_kmeans as its 3rd and 4th arguments)
        ml = [tuple(x[:2].astype('int')) for x in constraints.values if x[-1] > 0.5]
        cl = [tuple(x[:2].astype('int')) for x in constraints.values if x[-1] < -0.5]
        clusters, centers, cost = cop_kmeans(cls_data, 3, ml, cl, max_iter=1000, tol=1e-4, initialization=cOm)
        all_centers.append(centers)
        # Append (nominee positions are shifted by the rows already stacked)
        allSLK = allSLK + list(np.add(selke_id, len(allCla)))
        allROS = allROS + list(np.add(ross_id, len(allCla)))
        allCla = pd.concat((allCla, classes), axis=0)
        allCls = allCls + clusters
    # Mean of the per-season centers, used for the pooled display only
    allCtr = [list(x) for x in np.mean(np.array(all_centers), axis=0)]
    #display_clustering(classes, clusters, centers, ross_id, selke_id)
    #print('year: ', iy, 'cost: ', np.sum(cost))
    display_clustering(allCla, allCls, allCtr, allROS, allSLK)
    # Cluster the centers
    all_centers = np.concatenate(np.array(all_centers), axis=0)
    all_centers = list([list(x) for x in all_centers])
    # Each season contributed three consecutive centers: index, index+1 and
    # index+2 address the first, second and third center of every season
    index = np.array(range(int(len(all_centers) / 3))) * 3
    constraints = ut_make_constraints((list(index), list(np.ones([len(index), 1]))),
                                      (list(index + 1), list(np.ones([len(index), 1]))),
                                      list(index + 2))
    constraints = pd.DataFrame(constraints)
    constraints = constraints[constraints[0] != constraints[1]]
    ml = [tuple(x[:2].astype('int')) for x in constraints.values if x[-1] > 0.5]
    cl = [tuple(x[:2].astype('int')) for x in constraints.values if x[-1] < -0.5]
    glCL, glCT, _ = cop_kmeans(all_centers, 3, ml, cl, max_iter=1000, tol=1e-4, initialization='hockey')
    display_clustering(pd.DataFrame(all_centers, columns=['OFF', 'DEF']), glCL, glCT, list(index), list(index + 1))
    # Relate centers to trophies: closest to [0, 1] is labelled selke,
    # closest to [1, 0] is labelled ross, the remaining one poor
    iSlk = np.argmin([np.sqrt(np.sum(np.subtract([0, 1], x) ** 2)) for x in glCT])
    iRoss = np.argmin([np.sqrt(np.sum(np.subtract([1, 0], x) ** 2)) for x in glCT])
    iPoor = list(set(range(3)).difference([iSlk, iRoss]))[0]
    global_centers = {'selke': glCT[iSlk], 'ross': glCT[iRoss], 'poor': glCT[iPoor]}
    # Save result (the context manager flushes and closes the file; the
    # previous bare open() was never closed)
    with open(path.join(repoModel, 'baseVariables.p'), 'wb') as f:
        pickle.dump({'global_centers': global_centers, 'normalizer': normalizer, 'pca': pca, 'dtCols': dtCols}, f)
    return global_centers