def test():
    """Run the offensive/defensive mRSC experiment and print its error metrics.

    Reads the player and season tables from ``../data/nba-players-stats``,
    builds one pivoted table per derived metric and then, for every active
    player, fits ``mRSC`` once on the offensive metrics and once on the
    defensive metrics with the weights returned by
    ``getWeitghts(..., method="mean")``.  The stacked predictions are compared
    with the true values; MAPE and RMSE are printed, followed by the players
    whose ``PTS_G`` error ratio exceeds 100.

    Returns:
        None.  All results are written to stdout.
    """
    # ---- import data ----
    print("* importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only keep players who started in 1980 or later; copy so the column added
    # below is written to an independent frame rather than to a filtered slice
    players = players[players.year_start >= 1980].copy()
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]
    # only seasons from 1980 on
    stats = stats[stats.Year >= 1980]
    # without duplicated names --> to do: how to distinguish multiple player with the same name
    stats = removeDuplicated(players, stats)
    # astype returns a new frame, so no column is assigned on a possible slice
    stats = stats.astype({"Year": int, "year_count": int})

    # ---- prepare data ----
    print("* preparing data")
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerCentColNames = ["FG", "FT"]
    metricsWeightedColNames = ["PER"]
    allMetricsDict = {
        **getMetricsPerGameDict(stats, metricsPerGameColNames),
        **getMetricsPerCentDict(stats, metricsPerCentColNames),
        **getMetricsWeightedDict(stats, metricsWeightedColNames),
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats, values="Year", index="Player", columns="year_count")

    # ---- experiment setup ----
    activePlayers = getActivePlayers(stats, 2016, 4)
    activePlayers.sort()
    activePlayers.remove("Kevin Garnett")
    activePlayers.remove("Kobe Bryant")

    offMetrics = ["PTS_G", "AST_G", "TOV_G", "3P_G", "PER_w", "FG%", "FT%"]
    defMetrics = ["TRB_G", "STL_G", "BLK_G"]
    metrics_list = [offMetrics, defMetrics]

    # Alternative fixed weights from earlier runs, kept for reference only:
    #### uniform weights
    # weightsOff = [1.,1.,1.,1.,1.,1.,1.]
    # weightsDef = [1.,1.,1.]
    #### mean-standardized weights
    # weightsOff = [0.12623068620631453, 0.55687314142618904, 0.82115849366536209, 0.080245455622805287, 2.2838580004246301, 1.4304474472757014, 4.7552939398878413]
    # weightsDef = [0.28744431242409424, 1.5323016513327052, 2.4985245915220626]
    #### variance-standardized weights (1/(x+1))

    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    # ---- experiment ----
    print("* start experiment")
    pred_frames = []
    true_frames = []
    for playerName in activePlayers:
        target = Target(playerName, allPivotedTableDict, df_year)
        donor = Donor(allPivotedTableDict, df_year)
        weights_list = getWeitghts(target, donor, metrics_list, expSetup, method="mean")
        mrsc = mRSC(donor, target, probObservation=1)

        # one fit per metric group (offensive first, then defensive)
        group_pred = []
        group_true = []
        for i, metrics in enumerate(metrics_list):
            mrsc.fit_threshold(metrics, weights_list[i], 2016, pred_length=1,
                               threshold=threshold, setup=expSetup)
            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]
            group_pred.append(pred)
            group_true.append(true)

        # stack the groups row-wise: one column per player, one row per metric
        pred_frames.append(pd.concat(group_pred, axis=0))
        true_frames.append(pd.concat(group_true, axis=0))

    # concatenate once instead of growing a frame inside the loop
    pred_all = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
    true_all = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

    # ---- evaluation ----
    # cells whose true value is 0 become NaN and are skipped by mean();
    # dividing by |true| keeps the error non-negative for negative true values
    mask = (true_all != 0)
    mape = np.abs(pred_all - true_all) / np.abs(true_all[mask])
    print()
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(true_all, pred_all)
    print()
    print(rmse)
    print("RMSE for all: ", rmse.mean())

    # players (columns) whose PTS_G error ratio is larger than 100
    weirdo = mape.loc[:, mape.loc["PTS_G"] > 100]
    print()
    print(weirdo)
    print(weirdo.shape)
def test():
    """Compare uniform, mean- and variance-standardized weights for mRSC.

    Reads the player and season tables from ``../data/nba-players-stats``,
    builds one pivoted table per derived metric and then runs the same
    experiment three times, once per weighting scheme: for every active player
    a single ``mRSC`` model is fitted on all ten metrics, and MAPE / RMSE over
    all players are printed.

    The standardized weights are computed per player from that player's own
    ``Target``.  Previously they were computed once, from whatever ``target``
    and ``donor`` the preceding loop happened to leave behind (i.e. the last
    player in the list).

    Returns:
        None.  All results are written to stdout.
    """
    # ---- import data ----
    print("* importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only keep players who started in 1980 or later; copy so the column added
    # below is written to an independent frame rather than to a filtered slice
    players = players[players.year_start >= 1980].copy()
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]
    # only seasons from 1980 on
    stats = stats[stats.Year >= 1980]
    # without duplicated names --> to do: how to distinguish multiple player with the same name
    stats = removeDuplicated(players, stats)
    # astype returns a new frame, so no column is assigned on a possible slice
    stats = stats.astype({"Year": int, "year_count": int})

    # ---- prepare data ----
    print("* preparing data")
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerCentColNames = ["FG", "FT"]
    metricsWeightedColNames = ["PER"]
    allMetricsDict = {
        **getMetricsPerGameDict(stats, metricsPerGameColNames),
        **getMetricsPerCentDict(stats, metricsPerCentColNames),
        **getMetricsWeightedDict(stats, metricsWeightedColNames),
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats, values="Year", index="Player", columns="year_count")

    # ---- experiment setup ----
    activePlayers = getActivePlayers(stats, 2016, 4)
    activePlayers.sort()
    activePlayers.remove("Kevin Garnett")
    activePlayers.remove("Kobe Bryant")

    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    # offensive metrics first, defensive metrics (TRB_G, STL_G, BLK_G) last
    metrics_to_use = [
        "PTS_G", "AST_G", "TOV_G", "3P_G", "PER_w", "FG%", "FT%",
        "TRB_G", "STL_G", "BLK_G"
    ]
    metrics_list = [metrics_to_use]

    def uniform_weights(target, donor):
        """Return weight 1 for every metric, independent of the player."""
        return [1.0] * len(metrics_to_use)

    def standardized_weights(method):
        """Return a per-player weight function backed by ``getWeitghts``."""
        def weights_for(target, donor):
            return getWeitghts(target, donor, metrics_list, expSetup, method=method)[0]
        return weights_for

    def run_experiment(title, weights_for):
        """Fit every active player with ``weights_for`` and print MAPE / RMSE."""
        print("*******************")
        print(title)

        pred_frames = []
        true_frames = []
        for playerName in activePlayers:
            target = Target(playerName, allPivotedTableDict, df_year)
            donor = Donor(allPivotedTableDict, df_year)
            weights = weights_for(target, donor)

            mrsc = mRSC(donor, target, probObservation=1)
            mrsc.fit_threshold(metrics_to_use, weights, 2016, pred_length=1,
                               threshold=threshold, setup=expSetup)
            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]
            pred_frames.append(pred)
            true_frames.append(true)

        # concatenate once instead of growing a frame inside the loop
        pred_all = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
        true_all = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

        print()
        print("*** MAPE ***")
        # cells whose true value is 0 become NaN and are skipped by mean();
        # dividing by |true| keeps the error non-negative for negative values
        mask = (true_all != 0)
        mape = np.abs(pred_all - true_all) / np.abs(true_all[mask])
        print(mape.mean(axis=1))
        print("MAPE for all: ", mape.mean().mean())

        rmse = utils.rmse_2d(true_all, pred_all)
        print()
        print("*** RMSE ***")
        print(rmse)
        print("RMSE for all: ", rmse.mean())

    # ---- experiments ----
    print("* start experiment")
    run_experiment("uniform weights", uniform_weights)
    run_experiment("mean - standardized weights", standardized_weights("mean"))
    run_experiment("var - standardized weights", standardized_weights("var"))
def test():
    """Run the mRSC experiment with a donor pool restricted to past seasons.

    Reads the player and season tables from ``../data/nba-players-stats``.
    The donor tables are built from seasons up to ``pred_year`` only, while the
    target tables additionally contain the ``pred_interval`` following seasons.
    For every active player one ``mRSC`` model is fitted per metric group; the
    predictions are compared with the true values and MAPE / RMSE are printed.

    Returns:
        None.  All results are written to stdout.
    """
    # ---- import data ----
    pred_year = 2015  # the year that we are living in
    # target data extends pred_interval seasons past pred_year (1 -> pred_year+1 only)
    pred_interval = 1

    print("*** importing data ***")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only keep players who started in 1980 or later
    players = players[players.year_start >= 1980]

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]
    # only seasons from 1980 on
    stats = stats[stats.Year >= 1980]
    # without duplicated names --> to do: how to distinguish multiple player with the same name
    stats = removeDuplicated(players, stats)
    # astype returns a new frame, so no column is assigned on a possible slice
    stats = stats.astype({"Year": int, "year_count": int})

    # ---- prepare data ----
    print("*** preparing data ***")
    ########### Donor ##########
    # donors only see seasons up to and including pred_year
    stats_donor = stats[stats.Year <= pred_year]
    allPivotedTableDict, allMetrics = prepareData(stats_donor)
    donor = Donor(allPivotedTableDict)

    ########### Target ##########
    # targets additionally contain the seasons that are being predicted;
    # from here on allPivotedTableDict / allMetrics refer to the target tables
    stats_target = stats[stats.Year <= pred_year + pred_interval]
    allPivotedTableDict, allMetrics = prepareData(stats_target)

    # ---- experiment setup ----
    # donorSetup = [weighting (None / "normalize"), mat_form_method ("fixed"), skipNan (bool)]
    donorSetup = [None, "fixed", True]
    # denoiseSetup = [denoise_method ("SVD"), denoise_mat_method ("all")]
    denoiseSetup = ["SVD", "all"]
    regression_method = "pinv"
    threshold = 0.97
    verbose = False

    ###################################################
    # switch to [offMetrics, defMetrics] to fit the two groups separately
    offMetrics = ["PTS_G", "AST_G", "TOV_G", "PER_w", "FG%", "FT%", "3P_G"]
    defMetrics = ["TRB_G", "STL_G", "BLK_G"]
    # metrics_list = [offMetrics, defMetrics]
    metrics_list = [allMetrics]
    ###################################################

    ##############################################################
    # test 1
    ##############################################################
    playerNames = getActivePlayers(stats, pred_year, buffer=4)
    playerNames.remove("Kevin Garnett")
    playerNames.remove("Kobe Bryant")
    # playerNames.remove("Jason Kidd")

    pred_frames = []
    true_frames = []
    for playerName in playerNames:
        target = Target(playerName, allPivotedTableDict)
        mrsc = mRSC(donor, target, pred_interval, probObservation=1)

        group_pred = []
        group_true = []
        for metrics in metrics_list:
            mrsc.fit_threshold(metrics, pred_interval, threshold, donorSetup,
                               denoiseSetup, regression_method, verbose)
            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]
            group_pred.append(pred)
            group_true.append(true)

        # stack the groups row-wise: one column per player, one row per metric
        pred_frames.append(pd.concat(group_pred, axis=0) if group_pred else pd.DataFrame())
        true_frames.append(pd.concat(group_true, axis=0) if group_true else pd.DataFrame())

    # concatenate once instead of growing a frame inside the loop
    all_pred = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
    all_true = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

    # ---- evaluation ----
    print(all_pred)
    print(all_pred.shape)

    # cells whose true value is 0 become NaN and are skipped by mean();
    # dividing by |true| keeps the error non-negative for negative true values
    mask = (all_true != 0)
    mape = np.abs(all_pred - all_true) / np.abs(all_true[mask])
    print("*** MAPE ***")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(all_true, all_pred)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())
def annual_predictions(playerNames, allPivotedTableDict, donor, pred_interval, metrics, pred_metrics,
                       threshold, donorSetup, denoiseSetup, regression_method, verbose, dir_name,
                       top_players):
    """Predict every player in ``playerNames`` and print MAPE, RMSE and R2.

    For each player a ``Target`` and an ``mRSC`` model are created, a benchmark
    is obtained from ``getBenchmark`` and the model is fitted once per entry of
    ``metrics``.  Of each prediction only the rows listed in ``pred_metrics``
    are kept.  Results of all players are concatenated column-wise.

    Args:
        playerNames: iterable of player names to predict.
        allPivotedTableDict: tables passed to ``Target``.
        donor: donor object passed to ``mRSC``.
        pred_interval: passed to ``mRSC`` / ``getBenchmark``; also the number
            of columns each prediction is expected to have.
        metrics: iterable of metric selections, one ``fit_threshold`` call each.
        pred_metrics: index labels of the prediction rows to keep.
        threshold, donorSetup, denoiseSetup, regression_method, verbose:
            forwarded unchanged to ``mRSC.fit_threshold``.
        dir_name: unused; kept for interface compatibility.
        top_players: unused; kept for interface compatibility.

    Returns:
        Tuple ``(all_pred, all_true, all_R2, all_bench)`` of DataFrames with
        one group of columns per player.
    """
    all_pred = pd.DataFrame()
    all_true = pd.DataFrame()
    all_bench = pd.DataFrame()
    all_R2 = pd.DataFrame()

    for playerName in playerNames:
        target = Target(playerName, allPivotedTableDict)
        mrsc = mRSC(donor, target, pred_interval, probObservation=1)
        # one column per predicted step: "<player> 0", "<player> 1", ...
        column_names = [playerName + " " + str(a) for a in range(pred_interval)]

        player_pred = pd.DataFrame()
        player_true = pd.DataFrame()

        # benchmark (the first return value is not needed here)
        _, benchmark = getBenchmark(target, pred_metrics, pred_interval)

        for metric in metrics:
            mrsc.fit_threshold(metric, threshold, donorSetup, denoiseSetup, regression_method, verbose)

            pred = mrsc.predict()
            pred = pred[pred.index.isin(pred_metrics)]
            # NOTE(review): only pred is restricted to pred_metrics; true keeps
            # every row returned by getTrue() — confirm this is intended
            true = mrsc.getTrue()

            pred.columns = column_names
            true.columns = column_names

            player_pred = pd.concat([player_pred, pred], axis=0)
            player_true = pd.concat([player_true, true], axis=0)

        # pairwise concat is kept here: the types returned by getBenchmark and
        # getR2 are defined elsewhere and are not visible in this block
        all_pred = pd.concat([all_pred, player_pred], axis=1)
        all_true = pd.concat([all_true, player_true], axis=1)
        all_bench = pd.concat([all_bench, benchmark], axis=1)

        R2 = getR2(player_true, player_pred, benchmark)
        all_R2 = pd.concat([all_R2, R2], axis=1)

    ###################
    print("Number of metrics: {}".format(all_pred.shape[0]))
    print("Number of players: {}".format(all_pred.shape[1]))
    print()

    # cells whose true value is 0 become NaN and are skipped by mean();
    # dividing by |true| keeps the error non-negative for negative true values
    mask = (all_true != 0)
    mape = np.abs(all_pred[mask] - all_true[mask]) / np.abs(all_true[mask])
    print("*** MAPE ***")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(all_true, all_pred)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())

    print()
    print("*** R2 ***")
    print(all_R2.mean(axis=1))
    print("R2 for all: ", all_R2.mean(axis=1).mean(axis=0))

    # negative R2 values are floored at 0; all_R2 itself is left untouched
    edited_R2 = all_R2.clip(lower=0)
    print()
    print("*** edited R2 ***")
    print(edited_R2.mean(axis=1))
    print("R2 for all: ", edited_R2.mean().mean())

    return all_pred, all_true, all_R2, all_bench
def test():
    """Run mRSC with a separately selected metric group for every metric.

    Reads the player and season tables from ``../data/nba-players-stats`` and
    builds one pivoted table per derived metric.  For every active player
    ``getMetrics`` returns one metric group per entry of ``allMetrics``; the
    model is fitted on each group and only the row of the metric the group was
    selected for is kept.  MAPE and RMSE over all players are printed and the
    selected groups of all players are pickled to ``metrics_all.pkl``.

    Returns:
        None.  Results go to stdout and to ``metrics_all.pkl``.
    """
    # ---- import data ----
    print("* importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only keep players who started in 1980 or later; copy so the column added
    # below is written to an independent frame rather than to a filtered slice
    players = players[players.year_start >= 1980].copy()
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]
    # only seasons from 1980 on
    stats = stats[stats.Year >= 1980]
    # without duplicated names --> to do: how to distinguish multiple player with the same name
    stats = removeDuplicated(players, stats)
    # astype returns a new frame, so no column is assigned on a possible slice
    stats = stats.astype({"Year": int, "year_count": int})

    # ---- prepare data ----
    print("* preparing data")
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerCentColNames = ["FG", "FT"]
    metricsWeightedColNames = ["PER"]
    allMetricsDict = {
        **getMetricsPerGameDict(stats, metricsPerGameColNames),
        **getMetricsPerCentDict(stats, metricsPerCentColNames),
        **getMetricsWeightedDict(stats, metricsWeightedColNames),
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)
    allMetrics = list(allMetricsDict.keys())

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats, values="Year", index="Player", columns="year_count")

    # ---- experiment setup ----
    pred_year = 2016

    # targets
    activePlayers = getActivePlayers(stats, pred_year, 4)
    activePlayers.sort()
    activePlayers.remove("Kevin Garnett")
    activePlayers.remove("Kobe Bryant")

    # overall setup
    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    # ---- experiment ----
    print("* start experiment")
    metrics_all = []
    # the (still empty) list is written before the long loop starts —
    # presumably to fail early if the file is not writable; confirm
    with open('metrics_all.pkl', 'wb') as f:
        pickle.dump(metrics_all, f, pickle.HIGHEST_PROTOCOL)

    pred_frames = []
    true_frames = []
    for playerName in activePlayers:
        target = Target(playerName, allPivotedTableDict, df_year)
        donor = Donor(allPivotedTableDict, df_year)

        metrics_list = getMetrics(target, donor, pred_year, allMetrics, threshold, expSetup,
                                  boundary="threshold")
        weights_list = getWeitghts(target, donor, metrics_list, expSetup, method="mean")
        metrics_all.append(metrics_list)

        mrsc = mRSC(donor, target, probObservation=1)

        row_pred = []
        row_true = []
        for i, metrics in enumerate(metrics_list):
            mrsc.fit_threshold(metrics, weights_list[i], pred_year, pred_length=1,
                               threshold=threshold, setup=expSetup)
            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]

            # group i was selected for allMetrics[i]; keep only that metric's row
            c = metrics.index(allMetrics[i])
            row_pred.append(pred.iloc[c:(c + 1), :])
            row_true.append(true.iloc[c:(c + 1), :])

        pred_frames.append(pd.concat(row_pred, axis=0) if row_pred else pd.DataFrame())
        true_frames.append(pd.concat(row_true, axis=0) if row_true else pd.DataFrame())

    # concatenate once instead of growing a frame inside the loop
    pred_all = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
    true_all = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

    # ---- evaluation ----
    print("******** RESULT ********")
    # cells whose true value is 0 become NaN and are skipped by mean();
    # dividing by |true| keeps the error non-negative for negative true values
    mask = (true_all != 0)
    mape = np.abs(pred_all - true_all) / np.abs(true_all[mask])
    print()
    print("*** MAPE ***")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    # NOTE(review): arguments are (pred, true) here; harmless only if
    # utils.rmse_2d is symmetric in its arguments — confirm
    rmse = utils.rmse_2d(pred_all, true_all)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())

    with open('metrics_all.pkl', 'wb') as f:
        pickle.dump(metrics_all, f, pickle.HIGHEST_PROTOCOL)
def test():
    """Fit mRSC for one player and plot true vs. fitted trajectories per metric.

    Reads the player and season tables from ``../data/nba-players-stats`` and
    builds one pivoted table per derived metric.  For the player
    "Ryan Anderson" the model is fitted once per metric group with the weights
    returned by ``getWeitghts(..., method="var")``.  Every metric gets its own
    subplot (row = group, column = position inside the group) showing the true
    trajectory in red and the fitted one in blue.  After the figure is shown,
    MAPE and RMSE of the predictions are printed.

    Returns:
        None.  Output is a matplotlib figure plus text on stdout.
    """
    # ---- import data ----
    print("importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only keep players who started in 1980 or later; copy so the column added
    # below is written to an independent frame rather than to a filtered slice
    players = players[players.year_start >= 1980].copy()
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]
    # only seasons from 1980 on
    stats = stats[stats.Year >= 1980]
    # without duplicated names --> to do: how to distinguish multiple player with the same name
    stats = removeDuplicated(players, stats)
    # astype returns a new frame, so no column is assigned on a possible slice
    stats = stats.astype({"Year": int, "year_count": int})

    # ---- prepare data ----
    print("preparing data")
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerCentColNames = ["FG", "FT"]
    metricsWeightedColNames = ["PER"]
    allMetricsDict = {
        **getMetricsPerGameDict(stats, metricsPerGameColNames),
        **getMetricsPerCentDict(stats, metricsPerCentColNames),
        **getMetricsWeightedDict(stats, metricsWeightedColNames),
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats, values="Year", index="Player", columns="year_count")

    # ---- experiment setup ----
    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    metrics1 = ["PTS_G", "PER_w", "TRB_G", "3P_G"]
    metrics2 = ["FG%", "FT%"]
    metrics3 = ["BLK_G", "AST_G", "TOV_G", "STL_G"]
    metrics_list = [metrics1, metrics2, metrics3]

    ###################################################
    playerName = "Ryan Anderson"

    target = Target(playerName, allPivotedTableDict, df_year)
    donor = Donor(allPivotedTableDict, df_year)
    weights_list = getWeitghts(target, donor, metrics_list, expSetup, method="var")
    mrsc = mRSC(donor, target, probObservation=1)

    # the grid needs at least len(metrics_list) rows and as many columns as
    # the longest metric group, because subplots are addressed as axs[i, j]
    _, axs = plt.subplots(3, 5)

    pred_frames = []
    true_frames = []
    for i, metrics in enumerate(metrics_list):
        mrsc.fit_threshold(metrics, weights_list[i], 2016, pred_length=1,
                           threshold=threshold, setup=expSetup)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName]
        true.columns = [playerName]
        pred_frames.append(pred)
        true_frames.append(true)

        model = mrsc.model
        width = model.total_index
        for j, metric in enumerate(metrics):
            true_trajectory = target.data[metric].dropna(
                axis='columns').iloc[:, :mrsc.total_index]

            # assumes donor_data holds one block of total_index columns per
            # metric, in the order of the fitted group — TODO confirm
            donor_block = model.donor_data.iloc[:, j * width:(j + 1) * width]
            pred_val = np.dot(donor_block.T, model.beta).T
            pred_trajectory = pd.DataFrame(pred_val,
                                           columns=true_trajectory.columns,
                                           index=true_trajectory.index)

            # draw a marker on the last point of the fitted line only
            markers_on = [true_trajectory.shape[1] - 1]

            axs[i, j].plot(true_trajectory.T, marker='o', color='red', label='true')
            axs[i, j].plot(pred_trajectory.T, marker='o', markevery=markers_on,
                           color='blue', label='prediction')
            axs[i, j].set_title(playerName + ": " + metric)

    for ax in axs.flat:
        ax.set(xlabel='years played in NBA')
    plt.subplots_adjust(wspace=0.5, hspace=0.5)
    plt.show()

    # ---- evaluation ----
    # stack the groups row-wise: a single column for the player, one row per metric
    player_pred = pd.concat(pred_frames, axis=0) if pred_frames else pd.DataFrame()
    player_true = pd.concat(true_frames, axis=0) if true_frames else pd.DataFrame()

    # cells whose true value is 0 become NaN and are skipped by mean();
    # dividing by |true| keeps the error non-negative for negative true values
    mask = (player_true != 0)
    mape = np.abs(player_pred - player_true) / np.abs(player_true[mask])
    print("*** MAPE ***")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(player_true, player_pred)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())