Example #1
def to_player(repoRaw, repoPbP, repoPSt):
    # --- List all seasons
    allS = ut_find_folders(repoRaw, True)
    # --- Retrieve list of all players - across seasons
    plNames = get_player_names(repoPbP)
    # --- Retrieve stats for each player
    count = 0
    # Loop on players
    for pl in plNames:
        # Instantiate new player frame
        plFrame = pd.DataFrame()
        # Loop on seasons
        for isea in allS:
            # Load data
            datFile = path.join(repoRaw, isea, 'all_data.p')
            with open(datFile, 'rb') as f:
                datA = pickle.load(f)
            datA['playerName'] = datA['playerName'].apply(lambda x: x.upper())
            # Keep only this player's rows
            datPL = datA[datA.loc[:, 'playerName'] == pl]
            # Concatenate
            plFrame = pd.concat((plFrame, datPL), axis=0)
        # Save to new file
        svFile = path.join(repoPSt, pl.replace(' ', '_') + '.p')
        with open(svFile, 'wb') as f:
            pickle.dump(plFrame, f)
        # Print status bar
        count += 1
        stdout.write('\r')
        stdout.write(
            "Player %i/%i - %s: [%-40s] %d%% completed" %
            (count, len(plNames), pl, '=' * int(count / len(plNames) * 40),
             100 * count / len(plNames)))
        stdout.flush()
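A minimal driver for to_player, assuming the directory layout used by the other examples on this page; the repository paths are placeholders:

# Hypothetical repository roots: adjust to your own layout
repoRaw = '/home/user/Databases/Hockey/PlayerStats/raw'
repoPbP = '/home/user/Databases/Hockey/PlayByPlay'
repoPSt = '/home/user/Databases/Hockey/PlayerStats/player'

# Splits the per-season 'all_data.p' pickles into one pickle per player
to_player(repoRaw, repoPbP, repoPSt)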
Example #2
def get_player_names(repoPbP):
    # --- Retrieve list of all players - across seasons
    # Loop on rosters and get names
    allS_p = ut_find_folders(repoPbP, True)
    plNames = [
        pd.read_csv(
            path.join(repoPbP, x,
                      x.replace('Season', 'roster') + '.csv')) for x in allS_p
    ]
    plNames = [x['firstlast'] for x in plNames]
    plNames = pd.concat(plNames)
    plNames.drop_duplicates(inplace=True, keep='first')
    # Keep only string entries (drops NaN names)
    areS = [type(x) is str for x in plNames]
    plNames = plNames[areS]
    return plNames
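Usage sketch: get_player_names returns a deduplicated pandas Series of 'firstlast' roster names; the path below is a placeholder:

# Placeholder path to the play-by-play repository
plNames = get_player_names('/home/user/Databases/Hockey/PlayByPlay')
print('%i unique players' % len(plNames))
print(plNames.head())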
Example #3
def do_assess_clustering_robustness(dtCols, normalizer, global_centers, pca, nGames=80):
    # Computes the confusion of the global classification against the trophy groups
    # NOTE: relies on module-level repoModel, repoPbP and root being defined;
    # the nGames argument is superseded by the 10-80 sweep below
    # List years
    allS_p      =   ut_find_folders(repoPbP, True)
    years       =   [[x.split('_')[1][:4], x.split('_')[1][4:]] for x in allS_p]
    years.pop(years.index(['2003', '2004']))
    nGamesL     =   list( range(10,81,10) )
    accuracy    =   pd.DataFrame(np.zeros([len(years), 8]), columns=nGamesL, index=[''.join(x) for x in years])
    for iy in years:
        for nG in nGamesL:
            # Retrieve data
            data, classes   =   get_data_for_clustering(repoModel, dtCols, normalizer, pca, upto=iy[1]+'-07-01', asof=iy[0]+'-09-01', nGames=nG)
            # Get trophy nominees
            selke       =   to_pandas_selke(path.join(root, 'Databases/Hockey/PlayerStats/raw/' + ''.join(iy) + '/trophy_selke_nominees.csv'))
            selke       =   selke[~selke.index.duplicated(keep='first')]
            selke_id    =   [list(data.index).index(x) for x in selke[selke['Pos'] != 'D'].index]
            ross        =   to_pandas_ross(path.join(root, 'Databases/Hockey/PlayerStats/raw/' + ''.join(iy) + '/trophy_ross_nominees.csv'))
            ross        =   ross[~ross.index.duplicated(keep='first')]
            ross_id     =   [list(data.index).index(x) for x in ross[ross['pos'] != 'D'].index]
            seed        =   classes.min(axis=0)
            distance    =   np.sqrt(((classes - seed) ** 2).sum(axis=1)).sort_values()
            poor_id     =   [classes.index.get_loc(x) for x in distance.index[:30]]
            poor_id     =   ut_difference(ut_difference(poor_id, selke_id), ross_id)
            # --- Compute confusion
            # Selke players
            dst_slk     =   np.sqrt( np.sum(np.subtract(classes.iloc[selke_id].values, global_centers['selke'])**2, axis=1) )
            dst_ross    =   np.sqrt( np.sum(np.subtract(classes.iloc[selke_id].values, global_centers['ross']) ** 2, axis=1))
            dst_poor    =   np.sqrt( np.sum(np.subtract(classes.iloc[selke_id].values, global_centers['poor']) ** 2, axis=1))
            slk_min     =   np.argmin( np.array([dst_slk, dst_ross, dst_poor]), axis=0 )
            # Ross players
            dst_slk     =   np.sqrt(np.sum(np.subtract(classes.iloc[ross_id].values, global_centers['selke']) ** 2, axis=1))
            dst_ross    =   np.sqrt(np.sum(np.subtract(classes.iloc[ross_id].values, global_centers['ross']) ** 2, axis=1))
            dst_poor    =   np.sqrt(np.sum(np.subtract(classes.iloc[ross_id].values, global_centers['poor']) ** 2, axis=1))
            ros_min     =   np.argmin(np.array([dst_slk, dst_ross, dst_poor]), axis=0)
            # Poor players
            dst_slk     =   np.sqrt(np.sum(np.subtract(classes.iloc[poor_id].values, global_centers['selke']) ** 2, axis=1))
            dst_ross    =   np.sqrt(np.sum(np.subtract(classes.iloc[poor_id].values, global_centers['ross']) ** 2, axis=1))
            dst_poor    =   np.sqrt(np.sum(np.subtract(classes.iloc[poor_id].values, global_centers['poor']) ** 2, axis=1))
            por_min     =   np.argmin(np.array([dst_slk, dst_ross, dst_poor]), axis=0)
            # Make matrix
            MTX         =   [[np.sum(x==0)/len(x), np.sum(x==1)/len(x), np.sum(x==2)/len(x)] for x in [slk_min, ros_min, por_min]]
            accuracy.loc[''.join(iy), nG]   =   np.sum(np.diag(MTX))/np.sum(MTX)
    return accuracy
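A usage sketch, assuming normalizer/pca come out of do_ANN_training (Example #4), global_centers out of do_clustering_multiyear (Example #9), and that the module-level repoModel, repoPbP and root noted above point at your own copies of the data:

# dtCols, normalizer, global_centers and pca are produced by Examples #4 and #9
accuracy = do_assess_clustering_robustness(dtCols, normalizer, global_centers, pca)
print(accuracy)               # one row per season, one column per game budget (10..80)
print(accuracy.mean(axis=0))  # average accuracy for each game budget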
Example #4
def do_ANN_training(repoPSt, repoPbP, repoCode, repoModel, allS_p=None, minGames=0, stats_fetcher='default'):
    # --- GET TRAINING DATASET
    if allS_p is None:
        # List non-lockout seasons
        allS_p          =   ut_find_folders(repoPbP, True)

    if stats_fetcher=='default':
        stats_fetcher   =   PlayerStatsFetcher(repoPSt, repoPbP, True)

    # Get data
    X, Y, X_all, POS_all, PLD_all, colNm    =   get_training_data(repoPSt, repoPbP, allS_p, stats_fetcher, minGames=minGames)
    """
    with open( path.join(repoCode, 'ReinforcementLearning/NHL/playerstats/offVSdef/Automatic_classification/trainingData.p'), 'wb') as f:
        pickle.dump({'X':X, 'Y':Y, 'X_all':X_all, 'colNm':colNm, 'POS_all':POS_all}, f)
    with open( path.join(repoCode, 'ReinforcementLearning/NHL/playerstats/offVSdef/Automatic_classification/trainingData.p'), 'rb') as f:
        DT      =   pickle.load(f)
        colNm   =   DT['colNm']
        X       =   DT['X'][colNm]
        Y       =   DT['Y']
        X_all   =   DT['X_all']
        POS_all =   DT['POS_all']
    """

    # --- PRE-PROCESS DATA
    Y       =   ut_sanitize_matrix(Y, X)
    X       =   ut_sanitize_matrix(X)
    POS_all =   ut_sanitize_matrix(POS_all, X_all)
    PLD_all =   ut_sanitize_matrix(PLD_all, X_all)
    X_all   =   ut_sanitize_matrix(X_all)
    # --- KEEP >N GAMES
    if 0 < minGames < 1:  # Fractional value: replaced by one fifth of the max games played
        minGames    =   (np.max(PLD_all) / 5).astype('int')[0]
    X_all           =   X_all[(PLD_all>=minGames).values]
    POS_all         =   POS_all[(PLD_all>=minGames).values]
    X_all_S, Nrm    =   do_normalize_data(X_all[(POS_all!='D').values])
    X_S, _          =   do_normalize_data(X, normalizer=Nrm)
    _, pca          =   do_reduce_data(X_all_S, nComp=18)
    X_S_P, _        =   do_reduce_data(X_S, pca=pca, nComp=18)
    # --- BUILD THE NETWORK
    nNodes          =   [X_S_P.shape[1], 15, Y.shape[1]]
    CLS             =   ANN_classifier(deepcopy(nNodes))
    # --- TRAIN THE NETWORK
    nIter           =   50
    CLS.ann_train_network(nIter, X_S_P, Y.values, svname=repoModel)
    # --- DISPLAY NETWORK ACCURACY
    #CLS.ann_display_accuracy()
    return Nrm, pca, colNm, CLS
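A hypothetical call with placeholder paths; note that a fractional minGames is replaced inside the function by one fifth of the maximum games played:

# All four paths below are placeholders
Nrm, pca, colNm, CLS = do_ANN_training(
    '/home/user/Databases/Hockey/PlayerStats/player',  # repoPSt
    '/home/user/Databases/Hockey/PlayByPlay',          # repoPbP
    '/home/user/Code',                                 # repoCode
    '/home/user/Models/Hockey',                        # repoModel (checkpoint folder)
    minGames=0.2)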
Example #5
def to_pandas(repoRaw):
    # --- List all seasons
    allS = ut_find_folders(repoRaw, True)
    # Loop on seasons
    for isea in allS:
        # --- ART Ross nominees
        # Load csv file
        csvF    =   path.join( repoRaw, isea, 'trophy_ross_nominees.csv' )
        df_r    =   to_pandas_ross(csvF)

        # --- SELKE nominees
        # Load csv file
        csvF    =   path.join(repoRaw, isea, 'trophy_selke_nominees.csv')
        df_s    =   to_pandas_selke(csvF)

        # --- PICKLE IT OUT
        svname  =   path.join( repoRaw, isea, 'trophy_nominees.p')
        with open(svname, 'wb') as f:
            pickle.dump({'ross':df_r, 'selke':df_s}, f)
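Usage sketch (placeholder path and season folder name), followed by a read-back of one season's pickle:

repoRaw = '/home/user/Databases/Hockey/PlayerStats/raw'  # placeholder
to_pandas(repoRaw)
# Hypothetical season folder; raw seasons are named like '20152016' elsewhere on this page
with open(path.join(repoRaw, '20152016', 'trophy_nominees.p'), 'rb') as f:
    nominees = pickle.load(f)
print(nominees['ross'].head())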
Example #6
def to_pandas(repoPbP):
    # --- List all seasons
    allS = ut_find_folders(repoPbP, True)
    # Loop on seasons
    for isea in allS:
        # --- Play-by-play data
        isea2 = isea.replace('Season_', '')
        # Load csv file
        csvF = path.join(repoPbP, isea, 'playbyplay_' + isea2 + '.csv')
        df_pbp = pd.read_csv(csvF, engine='python')

        # --- Roster data
        # Load csv file
        csvF = path.join(repoPbP, isea, 'roster_' + isea2 + '.csv')
        df_rst = pd.read_csv(csvF, engine='python')

        # --- PICKLE IT OUT
        svname = path.join(repoPbP, isea, 'converted_data.p')
        with open(svname, 'wb') as f:
            pickle.dump({'playbyplay': df_pbp, 'roster': df_rst}, f)
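Usage sketch (placeholder path); each season folder gains a 'converted_data.p' holding the two dataframes:

repoPbP = '/home/user/Databases/Hockey/PlayByPlay'  # placeholder
to_pandas(repoPbP)
# Hypothetical season folder, matching the 'Season_YYYYYYYY' naming above
with open(path.join(repoPbP, 'Season_20152016', 'converted_data.p'), 'rb') as f:
    dt = pickle.load(f)
print(dt['roster'].head())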
Example #7
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os import path
from Utils.programming.ut_find_folders import *



# Visualize goal difference: show home advantage
# ==============================================
# list seasons
repoPbP =   '/home/younesz/Documents/Databases/Hockey/PlayByPlay'
allS    =   ut_find_folders(repoPbP, True)
allG    =   pd.DataFrame()

for iS in allS:
    # Load data
    with open(path.join(repoPbP, iS, 'converted_data.p'), 'rb') as f:
        dt  =   pickle.load(f)['playbyplay']
    # Keep only final lines for each game
    dt  =   dt.drop_duplicates(subset=['gcode', 'season'], keep='last')[['away.score', 'home.score']]
    # Concatenate
    allG=   pd.concat( (allG, dt), axis=0 )
# Viz
ann     =   "Win percentage for home team: %.1f %%" % (np.sum(allG['home.score'] > allG['away.score']) / len(allG) * 100)
plt.figure()
plt.hist(allG['home.score'] - allG['away.score'])
plt.xlabel('Home team goal diff')
plt.ylabel('Nb of games')
plt.annotate(ann, xy=(-9, 5000))
plt.show()
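The same final-score extraction also works season by season; a sketch reusing the variables defined above:

# Per-season home-win percentage, from the same 'converted_data.p' pickles
for iS in allS:
    with open(path.join(repoPbP, iS, 'converted_data.p'), 'rb') as f:
        dt  =   pickle.load(f)['playbyplay']
    dt  =   dt.drop_duplicates(subset=['gcode', 'season'], keep='last')
    pct =   np.sum(dt['home.score'] > dt['away.score']) / len(dt) * 100
    print('%s: home team won %.1f%% of games' % (iS, pct))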

Example #8
def to_pandas(repoRaw):
    # --- List all seasons
    allS = ut_find_folders(repoRaw, True)
    # Loop on seasons
    for isea in allS:
        # --- Process summary stats
        sumF = path.join(repoRaw, isea, 'summary.csv')
        # First line holds one JSON object; json.loads is safer than eval
        # (assumes 'import json' alongside the other module-level imports)
        with open(sumF, 'r') as f:
            sumR = json.loads(f.readline())
        # Process the file
        sEntries = sumR['total']
        sumRdt = sumR['data']
        sumKeys = sumRdt[0].keys()
        # Turn into a pandas dataframe
        sumdt = [list(x.values()) for x in sumRdt]
        sumDF = pd.DataFrame(np.array(sumdt), columns=sumKeys)
        # Sort by gameId, teamAbbrev, playerName
        sumDF = sumDF.sort_values(by=['gameId', 'teamAbbrev', 'playerName'],
                                  ascending=True)
        sumDF = sumDF.set_index(np.arange(len(sumDF)))

        # --- Process hits
        hitF = path.join(repoRaw, isea, 'hits.csv')
        # Read the file (same one-line JSON format)
        with open(hitF, 'r') as f:
            hitR = json.loads(f.readline())
        # Process the file
        hEntries = hitR['total']
        hitRdt = hitR['data']
        hitKeys = hitRdt[0].keys()
        # Turn into a pandas dataframe
        hitdt = [list(x.values()) for x in hitRdt]
        hitDF = pd.DataFrame(np.array(hitdt), columns=hitKeys)
        # Sort by gameId, teamAbbrev, playerName
        hitDF = hitDF.sort_values(by=['gameId', 'teamAbbrev', 'playerName'],
                                  ascending=True)
        hitDF = hitDF.set_index(np.arange(len(hitDF)))

        # --- Process powerplays
        ppF = path.join(repoRaw, isea, 'powerplay.csv')
        # Read the file (same one-line JSON format)
        with open(ppF, 'r') as f:
            ppR = json.loads(f.readline())
        # Process the file
        pEntries = ppR['total']
        ppRdt = ppR['data']
        ppKeys = ppRdt[0].keys()
        # Turn into a pandas dataframe
        ppdt = [list(x.values()) for x in ppRdt]
        ppDF = pd.DataFrame(np.array(ppdt), columns=ppKeys)
        # Sort by gameId, teamAbbrev, playerName
        ppDF = ppDF.sort_values(by=['gameId', 'teamAbbrev', 'playerName'],
                                ascending=True)
        ppDF = ppDF.set_index(np.arange(len(ppDF)))

        # --- Process penalty kills
        kilF = path.join(repoRaw, isea, 'penalty_kills.csv')
        # Read the file (same one-line JSON format)
        with open(kilF, 'r') as f:
            kilR = json.loads(f.readline())
        # Process the file
        kEntries = kilR['total']
        kilRdt = kilR['data']
        kilKeys = kilRdt[0].keys()
        # Turn into a pandas dataframe
        kildt = [list(x.values()) for x in kilRdt]
        kilDF = pd.DataFrame(np.array(kildt), columns=kilKeys)
        # Sort by gameId, teamAbbrev, playerName
        kilDF = kilDF.sort_values(by=['gameId', 'teamAbbrev', 'playerName'],
                                  ascending=True)
        kilDF = kilDF.set_index(np.arange(len(kilDF)))

        # --- Join the four tables into a single dataframe
        DF = pd.concat((sumDF, hitDF, ppDF, kilDF), axis=1)
        # Drop duplicates
        DF = DF.loc[:, ~DF.columns.duplicated()]
        # Pickle the result
        with open(path.join(repoRaw, isea, 'all_data.p'), 'wb') as f:
            pickle.dump(DF, f)
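Usage sketch (placeholder path); produces one 'all_data.p' per season folder, which Example #1 then splits per player:

repoRaw = '/home/user/Databases/Hockey/PlayerStats/raw'  # placeholder
to_pandas(repoRaw)
# Hypothetical season folder name
with open(path.join(repoRaw, '20152016', 'all_data.p'), 'rb') as f:
    DF = pickle.load(f)
print(DF.shape, list(DF.columns)[:5])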
Example #9
def do_clustering_multiyear(repoModel, repoPSt, repoPbP, dtCols, normalizer, pca, root, plFetcher='default'):
    # Make constraints
    allS_p      =   ut_find_folders(repoPbP, True)
    years       =   [[x.split('_')[1][:4], x.split('_')[1][4:]] for x in allS_p]
    count       =   0
    allCla      =   pd.DataFrame()
    allCON      =   pd.DataFrame()
    allCls      =   []
    allSLK      =   []
    allROS      =   []

    ###### EXCLUDE YEAR 2003-2004 : PROBLEM WITH FREDRIK MODIN'S DATA - NAMED AS FREDDY MODIN IN THE NHL STATS PAGE
    years.pop( years.index(['2003', '2004']) )
    all_centers =   []
    for iy in years:
        # Get data
        data, classes   =   get_data_for_clustering(repoModel, repoPSt, repoPbP, dtCols, normalizer, pca, upto=iy[1]+'-07-01', asof=iy[0]+'-09-01', nGames=81, plFetcher=plFetcher)
        # Get trophy nominees
        selke       =   to_pandas_selke( path.join(root, 'Databases/Hockey/PlayerStats/raw/' + ''.join(iy) + '/trophy_selke_nominees.csv') )
        selke       =   selke[~selke.index.duplicated(keep='first')]
        selke_id    =   [list(data.index).index(x) for x in selke[selke['Pos'] != 'D'].index]
        ross        =   to_pandas_ross( path.join(root, 'Databases/Hockey/PlayerStats/raw/' + ''.join(iy) + '/trophy_ross_nominees.csv') )
        ross        =   ross[~ross.index.duplicated(keep='first')]
        ross_id     =   [list(data.index).index(x) for x in ross[ross['pos'] != 'D'].index]
        # --- Clean constraints
        # Remove duplicates
        torem       =   list( set(ross_id).intersection(selke_id) )
        maxV        =   np.argmax(classes.iloc[torem].values, axis=1).astype(bool)
        selke_id    =   ut_difference( selke_id, list( compress(torem, maxV) ))
        selke_wgt   =   selke.loc[data.iloc[selke_id].index]['WEIGHT_rank'].values
        selke_wgt   =   selke_wgt/np.max(selke_wgt)
        ross_id     =   ut_difference( ross_id, list( compress(torem, maxV!=True) ))
        ross_wgt    =   ross.loc[data.iloc[ross_id].index]['WEIGHT_rank'].values
        ross_wgt    =   ross_wgt / np.max(ross_wgt)
        # Get poorest ranked players
        seed        =   classes.min(axis=0)
        distance    =   np.sqrt( ((classes - seed)**2).sum(axis=1) ).sort_values()
        poor_id     =   [classes.index.get_loc(x) for x in distance.index[:30]]
        poor_id     =   ut_difference( ut_difference( poor_id, selke_id ), ross_id )
        # Make the constraints
        constraints =   ut_make_constraints( (selke_id, selke_wgt), (ross_id, ross_wgt), poor_id )
        constraints =   pd.DataFrame(constraints)
        constraints =   constraints[constraints[0] != constraints[1]]

        # Make clusters
        cls_data        =   list(list(x) for x in classes.values)
        cOm             =   [list(ut_center_of_mass(classes.iloc[x].values, np.reshape(y, [-1, 1]))) for x, y in zip([selke_id, ross_id, poor_id], [selke_wgt, ross_wgt, np.ones([1, len(poor_id)])])]
        # Build must-link / cannot-link lists from the constraints
        ml, cl          =   [], []
        for x in constraints.values:
            if x[-1] > 0.5:
                ml.append(tuple(x[:2].astype('int')))
            elif x[-1] < -0.5:
                cl.append(tuple(x[:2].astype('int')))
        clusters, centers, cost   =   cop_kmeans(cls_data, 3, ml, cl, max_iter=1000, tol=1e-4, initialization=cOm)
        all_centers.append(centers)

        # Append
        allSLK  =   allSLK + list(np.add(selke_id, len(allCla)))
        allROS  =   allROS + list(np.add(ross_id, len(allCla)))
        allCla  =   pd.concat((allCla, classes), axis=0)
        allCls  =   allCls + clusters
        allCtr  =   [list(x) for x in np.mean(np.array(all_centers),axis=0)]
        #display_clustering(classes, clusters, centers, ross_id, selke_id)
    #print('year: ', iy, 'cost: ', np.sum(cost))
    display_clustering(allCla, allCls, allCtr, allROS, allSLK)

    # Cluster the centers
    all_centers     =   np.concatenate( np.array(all_centers), axis=0 )
    all_centers     =   list([list(x) for x in all_centers])
    index           =   np.array(range( int(len(all_centers)/3) ))*3
    constraints     =   ut_make_constraints( (list(index), list(np.ones([len(index),1]))), (list(index+1), list(np.ones([len(index),1]))), list(index+2))
    constraints     =   pd.DataFrame(constraints)
    constraints     =   constraints[constraints[0] != constraints[1]]
    # Build must-link / cannot-link lists from the constraints
    ml, cl          =   [], []
    for x in constraints.values:
        if x[-1] > 0.5:
            ml.append(tuple(x[:2].astype('int')))
        elif x[-1] < -0.5:
            cl.append(tuple(x[:2].astype('int')))
    cOm             =   [list(ut_center_of_mass(classes.iloc[x].values, np.reshape(y, [-1,1]) )) for x,y in zip([selke_id, ross_id, poor_id], [selke_wgt, ross_wgt, np.ones([1, len(poor_id)])])]
    glCL, glCT, _   =   cop_kmeans(all_centers, 3, ml, cl, max_iter=1000, tol=1e-4, initialization='hockey')
    display_clustering(pd.DataFrame(all_centers, columns=['OFF', 'DEF']), glCL, glCT, list(index), list(index+1))
    # Relate centers to trophies
    iSlk            =   np.argmin( [np.sqrt(np.sum(np.subtract([0,1], x)**2)) for x in glCT] )
    iRoss           =   np.argmin( [np.sqrt(np.sum(np.subtract([1,0], x)**2)) for x in glCT] )
    iPoor           =   list( set(range(3)).difference([iSlk,iRoss]) )[0]
    global_centers  =   {'selke':glCT[iSlk], 'ross':glCT[iRoss], 'poor':glCT[iPoor]}
    # Save result
    with open(path.join(repoModel, 'baseVariables.p'), 'wb') as f:
        pickle.dump({'global_centers':global_centers, 'normalizer':normalizer, 'pca':pca, 'dtCols':dtCols}, f)
    return global_centers
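A hypothetical end-to-end call, feeding in the normalizer and pca returned by do_ANN_training (Example #4); all paths are placeholders:

# Placeholder paths; Nrm and pca come from do_ANN_training (Example #4)
root      = '/home/user/Documents'
repoModel = '/home/user/Models/Hockey'
repoPSt   = path.join(root, 'Databases/Hockey/PlayerStats/player')
repoPbP   = path.join(root, 'Databases/Hockey/PlayByPlay')
global_centers = do_clustering_multiyear(repoModel, repoPSt, repoPbP,
                                         dtCols, Nrm, pca, root)
print(global_centers)   # {'selke': [...], 'ross': [...], 'poor': [...]}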