Пример #1
0
def mrscPredict(targetPlayer, allPivotedTableDict, donor, pred_interval,
                metric, pred_metrics, threshold, donorSetup, denoiseSetup,
                regression_method, verbose):
    """Fit an mRSC model for one target player and return selected rows.

    A ``Target`` is built for ``targetPlayer`` from ``allPivotedTableDict``
    and paired with ``donor`` in an ``mRSC`` model. The model is fitted with
    ``fit_threshold`` and asked for a prediction; only the prediction rows
    whose index label is listed in ``pred_metrics`` are returned.
    """
    model = mRSC(donor,
                 Target(targetPlayer, allPivotedTableDict),
                 pred_interval,
                 probObservation=1)
    model.fit_threshold(metric, threshold, donorSetup, denoiseSetup,
                        regression_method, verbose)

    prediction = model.predict()
    # keep only the metrics the caller asked for
    wanted_rows = prediction.index.isin(pred_metrics)
    return prediction[wanted_rows]
Пример #2
0
def test():
    """Run the 2016 experiment with separate offensive and defensive fits.

    Reads the player and season CSV files, builds the pivoted per-metric
    tables, and for every active player fits the mRSC model once per metric
    group (offensive, then defensive) using weights from
    ``getWeitghts(..., method="mean")``. Prints per-metric MAPE, RMSE, and the
    players whose PTS_G MAPE value is above 100. Returns nothing.
    """
    # ---- import data ----
    print("* importing data")
    player_info = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started after 1980
    player_info = player_info[player_info.year_start >= 1980]
    player_info["player_id"] = range(0, len(player_info.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(player_info.name)]
    # only seasons from 1980 onwards
    stats = stats[stats.Year >= 1980]

    # drop duplicated names
    # TODO: how to distinguish multiple players with the same name
    stats = removeDuplicated(player_info, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    # ---- prepare data ----
    print("* preparing data")
    # one DataFrame per stat, re-calculated to one value per year
    per_game = getMetricsPerGameDict(
        stats, ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"])
    per_cent = getMetricsPerCentDict(stats, ["FG", "FT"])
    weighted = getMetricsWeightedDict(stats, ["PER"])
    pivoted_tables = getPivotedTableDict({**per_game, **per_cent, **weighted})

    # this matrix will be used to mask the table
    year_table = pd.pivot_table(stats, values="Year", index="Player",
                                columns="year_count")

    # ---- experiment setup ----
    target_players = getActivePlayers(stats, 2016, 4)
    target_players.sort()
    for excluded in ("Kevin Garnett", "Kobe Bryant"):
        target_players.remove(excluded)

    offMetrics = ["PTS_G", "AST_G", "TOV_G", "3P_G", "PER_w", "FG%", "FT%"]
    defMetrics = ["TRB_G", "STL_G", "BLK_G"]
    metric_groups = [offMetrics, defMetrics]

    setup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    # ---- experiment ----
    print("* start experiment")
    pred_all = pd.DataFrame()
    true_all = pd.DataFrame()
    for name in target_players:
        target = Target(name, pivoted_tables, year_table)
        donor = Donor(pivoted_tables, year_table)
        group_weights = getWeitghts(target, donor, metric_groups, setup,
                                    method="mean")
        model = mRSC(donor, target, probObservation=1)

        # fit/predict once per metric group, offensive first
        pred_parts = []
        true_parts = []
        for idx, group in enumerate(metric_groups):
            model.fit_threshold(group,
                                group_weights[idx],
                                2016,
                                pred_length=1,
                                threshold=threshold,
                                setup=setup)
            part_pred = model.predict()
            part_true = model.getTrue()
            part_pred.columns = [name]
            part_true.columns = [name]
            pred_parts.append(part_pred)
            true_parts.append(part_true)

        # stack the groups into one column for this player ...
        player_pred = pd.concat(pred_parts, axis=0)
        player_true = pd.concat(true_parts, axis=0)
        # ... and append that column to the overall result
        pred_all = pd.concat([pred_all, player_pred], axis=1)
        true_all = pd.concat([true_all, player_true], axis=1)

    # ---- report ----
    # entries whose true value is 0 become NaN in the denominator
    nonzero = (true_all != 0)
    mape = np.abs(pred_all - true_all) / true_all[nonzero]
    print()
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(true_all, pred_all)
    print()
    print(rmse)
    print("RMSE for all: ", rmse.mean())

    # players whose PTS_G error is above 100
    mape_by_player = mape.T
    outliers = mape_by_player[mape_by_player.PTS_G > 100].T
    print()
    print(outliers)
    print(outliers.shape)
Пример #3
0
def test():
    """Compare uniform, mean- and var-standardized weights for 2016.

    Reads the player and season CSV files, builds the pivoted per-metric
    tables, and runs the same all-metrics experiment three times: with
    uniform weights, then with weights from ``getWeitghts`` using
    ``method="mean"`` and ``method="var"``. Per-metric MAPE and RMSE are
    printed after each run. Returns nothing.

    Raises:
        ValueError: if no active players remain after the exclusions.
    """
    # ---- import data ----
    print("* importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    players = players[players.year_start >=
                      1980]  # only choose players who started after 1980
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]

    # only after 1980
    stats = stats[stats.Year >= 1980]

    # without duplicated names
    # TODO: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    # ---- prepare data ----
    print("* preparing data")
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG", "FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats,
                                                 metricsWeightedColNames)

    allMetricsDict = {
        **metricsPerGameDict,
        **metricsPerCentDict,
        **metricsWeightedDict
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats,
                             values="Year",
                             index="Player",
                             columns="year_count")

    # ---- experiment setup ----
    activePlayers = getActivePlayers(stats, 2016, 4)
    activePlayers.sort()
    activePlayers.remove("Kevin Garnett")
    activePlayers.remove("Kobe Bryant")
    if not activePlayers:
        # the weighted runs need a target/donor from a previous run
        raise ValueError("no active players left to evaluate")

    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97
    metrics_to_use = [
        "PTS_G", "AST_G", "TOV_G", "3P_G", "PER_w", "FG%", "FT%", "TRB_G",
        "STL_G", "BLK_G"
    ]

    def run_all_players(weights):
        """Fit and predict every active player with ``weights``.

        Returns ``(pred_all, true_all, target, donor)``; ``target`` and
        ``donor`` are the objects built for the last player in the list.
        """
        pred_all = pd.DataFrame()
        true_all = pd.DataFrame()
        for playerName in activePlayers:
            target = Target(playerName, allPivotedTableDict, df_year)
            donor = Donor(allPivotedTableDict, df_year)

            mrsc = mRSC(donor, target, probObservation=1)
            mrsc.fit_threshold(metrics_to_use,
                               weights,
                               2016,
                               pred_length=1,
                               threshold=threshold,
                               setup=expSetup)

            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]

            pred_all = pd.concat([pred_all, pred], axis=1)
            true_all = pd.concat([true_all, true], axis=1)
        return pred_all, true_all, target, donor

    def report(pred_all, true_all):
        """Print per-metric and overall MAPE and RMSE for one run."""
        print()
        print("*** MAPE ***")
        # entries whose true value is 0 become NaN in the denominator
        mask = (true_all != 0)
        mape = np.abs(pred_all - true_all) / true_all[mask]
        print(mape.mean(axis=1))
        print("MAPE for all: ", mape.mean().mean())
        rmse = utils.rmse_2d(true_all, pred_all)
        print()
        print("*** RMSE ***")
        print(rmse)
        print("RMSE for all: ", rmse.mean())

    # ---- experiment ----
    print("* start experiment")

    print("*******************")
    print("uniform weights")
    weights = [1.] * len(metrics_to_use)
    pred_all, true_all, target, donor = run_all_players(weights)
    report(pred_all, true_all)

    # NOTE(review): as in the original code, the standardized weights are
    # computed once from the target/donor of the LAST player of the previous
    # run and then shared by every player -- confirm this is intended.
    weighted_runs = (("mean - standardized weights", "mean"),
                     ("var - standardized weights", "var"))
    for label, method in weighted_runs:
        print("*******************")
        print(label)
        metrics_list = [metrics_to_use]
        weights = getWeitghts(target, donor, metrics_list, expSetup,
                              method=method)[0]
        pred_all, true_all, target, donor = run_all_players(weights)
        report(pred_all, true_all)
Пример #4
0
def test():
    """Run the 2016 experiment with donor pools restricted by position.

    Reads the player and season CSV files, builds the pivoted per-metric
    tables, and for every active player restricts the donor pool to players
    who share the target's position group. The mRSC model is then fitted once
    per metric group with weights from ``getWeitghts(..., method="var")``.
    Per-metric and overall MAPE are printed. Returns nothing.

    Raises:
        Exception: if a player's position in the prediction year belongs to
            none of the position groups.
    """
    # ---- import data ----
    print("* importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    players = players[players.year_start >=
                      1980]  # only choose players who started after 1980
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]

    # only after 1980
    stats = stats[stats.Year >= 1980]

    # without duplicated names
    # TODO: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    # ---- prepare data ----
    print("* preparing data")
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG", "FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats,
                                                 metricsWeightedColNames)

    allMetricsDict = {
        **metricsPerGameDict,
        **metricsPerCentDict,
        **metricsWeightedDict
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats,
                             values="Year",
                             index="Player",
                             columns="year_count")

    # ---- experiment setup ----
    pred_year = 2016
    activePlayers = getActivePlayers(stats, pred_year, 4)
    activePlayers.sort()
    activePlayers.remove("Kevin Garnett")
    activePlayers.remove("Kobe Bryant")

    metrics1 = ["PTS_G", "PER_w"]
    metrics2 = ["3P_G", "FG%", "FT%"]
    metrics3 = ["TOV_G"]
    metrics4 = ["TRB_G", "STL_G"]
    metrics5 = ["AST_G", "BLK_G"]
    metrics_list = [metrics1, metrics2, metrics3, metrics4, metrics5]
    print(metrics_list)

    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    #### position groups
    # The groups overlap (e.g. "SF" is in all three); a player is assigned to
    # the FIRST group, in this order, that contains his position.
    group1 = [
        "C", "SF", "PF", "C-PF", "PF-C", "C-SF", "SF-C", "SF-PF", "PF-SF"
    ]
    # fixed: the list held "PG-SF" twice and lacked "PG-SG", so a "PG-SG"
    # player was rejected as an invalid position
    group2 = [
        "SG", "SF", "PG", "SG-SF", "SF-SG", "SF-PG", "PG-SF", "SG-PG", "PG-SG"
    ]
    group3 = [
        "SG", "SF", "PF", "SG-SF", "SF-SG", "SF-PF", "PF-SF", "SG-PF", "PF-SG"
    ]
    position_groups = [group1, group2, group3]

    # ---- experiment ----
    print("* start experiment")
    pred_all = pd.DataFrame()
    true_all = pd.DataFrame()
    for playerName in activePlayers:
        # position listed on the player's last row for the prediction year
        playerPos = stats.loc[(stats.Player == playerName) &
                              (stats.Year == pred_year), "Pos"].values[-1]

        # fixed: the group3 branch used to build its donor pool from group2
        for group in position_groups:
            if playerPos in group:
                players_in_group = stats[stats.Pos.isin(group)].Player.unique()
                break
        else:
            raise Exception("invalid position")

        donorPivotedTableDict = filterDonor(allPivotedTableDict,
                                            players_in_group)

        # the target uses the unfiltered tables, the donor the filtered ones
        target = Target(playerName, allPivotedTableDict, df_year)
        donor = Donor(donorPivotedTableDict, df_year)

        weights_list = getWeitghts(target,
                                   donor,
                                   metrics_list,
                                   expSetup,
                                   method="var")

        mrsc = mRSC(donor, target, probObservation=1)

        player_pred = pd.DataFrame()
        player_true = pd.DataFrame()
        for i, metric_group in enumerate(metrics_list):
            mrsc.fit_threshold(metric_group,
                               weights_list[i],
                               pred_year,
                               pred_length=1,
                               threshold=threshold,
                               setup=expSetup)

            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]

            player_pred = pd.concat([player_pred, pred], axis=0)
            player_true = pd.concat([player_true, true], axis=0)

        pred_all = pd.concat([pred_all, player_pred], axis=1)
        true_all = pd.concat([true_all, player_true], axis=1)

    # ---- report ----
    # entries whose true value is 0 become NaN in the denominator
    mask = (true_all != 0)
    mape = np.abs(pred_all - true_all) / true_all[mask]
    print()
    print("******* MAPE *******")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())
def test():
    """Predict the season(s) after 2015 using all metrics in a single fit.

    Reads the player and season CSV files and builds the donor from seasons
    up to ``pred_year`` and the target tables from seasons up to
    ``pred_year + pred_interval``. For every active player the mRSC model is
    fitted on each metric group (here a single group with all metrics). The
    full prediction table, per-metric MAPE and RMSE are printed. Returns
    nothing.
    """
    # ---- import data ----
    pred_year = 2015  # the year that we are living in
    # how many seasons past pred_year the target data extends
    pred_interval = 1

    print("*** importing data ***")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started after 1980
    players = players[players.year_start >= 1980]

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]

    # only after 1980
    stats = stats[stats.Year >= 1980]

    # without duplicated names
    # TODO: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("*** preparing data ***")

    ########### Donor ##########
    # donor data stops at pred_year
    stats_donor = stats[stats.Year <= pred_year]
    donorPivotedTableDict, _ = prepareData(stats_donor)
    donor = Donor(donorPivotedTableDict)

    ########### Target ##########
    # target data additionally covers the seasons to be predicted
    stats_target = stats[stats.Year <= pred_year + pred_interval]
    allPivotedTableDict, allMetrics = prepareData(stats_target)

    # ---- experiment setup ----
    # donorSetup = [weighting (None / "normalize"),
    #               mat_form_method ("fixed"),
    #               skipNan (bool)]
    donorSetup = [None, "fixed", True]
    # denoiseSetup = [denoise_method ("SVD"), denoise_mat_method ("all")]
    denoiseSetup = ["SVD", "all"]
    regression_method = "pinv"

    threshold = 0.97
    verbose = False

    # a single group containing every metric
    metrics_list = [allMetrics]

    ##############################################################
    # test 1
    ##############################################################
    playerNames = getActivePlayers(stats, pred_year, buffer=4)
    playerNames.remove("Kevin Garnett")
    playerNames.remove("Kobe Bryant")

    all_pred = pd.DataFrame()
    all_true = pd.DataFrame()
    for playerName in playerNames:
        target = Target(playerName, allPivotedTableDict)
        mrsc = mRSC(donor, target, pred_interval, probObservation=1)

        player_pred = pd.DataFrame()
        player_true = pd.DataFrame()
        for metric_group in metrics_list:
            mrsc.fit_threshold(metric_group, pred_interval, threshold,
                               donorSetup, denoiseSetup, regression_method,
                               verbose)
            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]
            player_pred = pd.concat([player_pred, pred], axis=0)
            player_true = pd.concat([player_true, true], axis=0)
        all_pred = pd.concat([all_pred, player_pred], axis=1)
        all_true = pd.concat([all_true, player_true], axis=1)

    # ---- report ----
    print(all_pred)
    print(all_pred.shape)
    # entries whose true value is 0 become NaN in the denominator
    mask = (all_true != 0)
    mape = np.abs(all_pred - all_true) / all_true[mask]
    print("*** MAPE ***")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(all_true, all_pred)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())
Пример #6
0
def test():
    """Sweep the ``singvals`` argument of ``mRSC.fit`` for 2016 predictions.

    Reads the player and season CSV files, builds the pivoted per-metric
    tables, and for each value in ``[1, 2, 4, 8, 16, 32]`` fits every active
    player with uniform weights on all ten metrics. After each sweep step the
    ``singvals`` value and the per-metric MAPE are printed. Returns nothing.
    """
    # ---- import data ----
    print("importing data")
    player_info = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started after 1980
    player_info = player_info[player_info.year_start >= 1980]
    player_info["player_id"] = range(0, len(player_info.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(player_info.name)]
    # only seasons from 1980 onwards
    stats = stats[stats.Year >= 1980]

    # drop duplicated names
    # TODO: how to distinguish multiple players with the same name
    stats = removeDuplicated(player_info, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    # ---- prepare data ----
    print("preparing data")
    # one DataFrame per stat, re-calculated to one value per year;
    # in this variant "3P" is computed as a percentage, not per game
    per_game = getMetricsPerGameDict(
        stats, ["PTS", "AST", "TOV", "TRB", "STL", "BLK"])
    per_cent = getMetricsPerCentDict(stats, ["FG", "FT", "3P"])
    weighted = getMetricsWeightedDict(stats, ["PER"])
    pivoted_tables = getPivotedTableDict({**per_game, **per_cent, **weighted})

    # this matrix will be used to mask the table
    year_table = pd.pivot_table(stats, values="Year", index="Player",
                                columns="year_count")

    # ---- experiment setup ----
    target_players = getActivePlayers(stats, 2016, 5)
    target_players.sort()

    metrics_to_use = [
        "PTS_G", "AST_G", "TOV_G", "PER_w", "FG%", "FT%", "3P%", "TRB_G",
        "STL_G", "BLK_G"
    ]
    uniform_weights = [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]
    setup = ["fixed", "SVD", "all", "pinv", False]

    # ---- experiment ----
    print("start experiment")
    for singvals in [1, 2, 4, 8, 16, 32]:
        pred_all = pd.DataFrame()
        true_all = pd.DataFrame()
        for name in target_players:
            target = Target(name, pivoted_tables, year_table)
            donor = Donor(pivoted_tables, year_table)

            model = mRSC(donor, target, probObservation=1)
            model.fit(metrics_to_use,
                      uniform_weights,
                      2016,
                      pred_length=1,
                      singvals=singvals,
                      setup=setup)

            player_pred = model.predict()
            player_true = model.getTrue()
            player_pred.columns = [name]
            player_true.columns = [name]

            pred_all = pd.concat([pred_all, player_pred], axis=1)
            true_all = pd.concat([true_all, player_true], axis=1)

        # entries whose true value is 0 become NaN in the denominator
        nonzero = (true_all != 0)
        mape = np.abs(pred_all - true_all) / true_all[nonzero]
        print(singvals)
        print(mape.mean(axis=1))
Пример #7
0
def annual_predictions(playerNames, allPivotedTableDict, donor, pred_interval, metrics, pred_metrics,
                      threshold, donorSetup, denoiseSetup, regression_method, verbose, dir_name, top_players):
    """Predict every player in ``playerNames`` and print summary scores.

    For each player a ``Target`` and an ``mRSC`` model are built, a benchmark
    is obtained from ``getBenchmark``, and the model is fitted once per entry
    of ``metrics``. Each prediction is restricted to the rows listed in
    ``pred_metrics``; the true values are taken from ``getTrue`` unfiltered.
    MAPE, RMSE, R2 and R2 with negatives set to 0 are printed.

    ``dir_name`` and ``top_players`` are accepted but not used here.

    Returns:
        Tuple ``(all_pred, all_true, all_R2, all_bench)`` of DataFrames with
        one column block per player.
    """
    all_pred = pd.DataFrame()
    all_true = pd.DataFrame()
    all_bench = pd.DataFrame()
    all_R2 = pd.DataFrame()

    for playerName in playerNames:
        target = Target(playerName, allPivotedTableDict)
        model = mRSC(donor, target, pred_interval, probObservation=1)

        # only the benchmark part is used; the true values come from the model
        _bench_true, benchmark = getBenchmark(target, pred_metrics, pred_interval)

        player_pred = pd.DataFrame()
        player_true = pd.DataFrame()
        for metric in metrics:
            model.fit_threshold(metric, threshold, donorSetup, denoiseSetup, regression_method, verbose)
            prediction = model.predict()
            prediction = prediction[prediction.index.isin(pred_metrics)]
            truth = model.getTrue()

            # one column per predicted step, labelled "<player> <step>"
            prediction.columns = [playerName + " " + str(step) for step in range(pred_interval)]
            truth.columns = [playerName + " " + str(step) for step in range(pred_interval)]

            player_pred = pd.concat([player_pred, prediction], axis=0)
            player_true = pd.concat([player_true, truth], axis=0)

        all_pred = pd.concat([all_pred, player_pred], axis=1)
        all_true = pd.concat([all_true, player_true], axis=1)
        all_bench = pd.concat([all_bench, benchmark], axis=1)

        player_R2 = getR2(player_true, player_pred, benchmark)
        all_R2 = pd.concat([all_R2, player_R2], axis=1)

    # ---- report ----
    n_metrics, n_players = all_pred.shape[0], all_pred.shape[1]
    print(f"Number of metrics: {n_metrics}")
    print(f"Number of players: {n_players}")
    print()

    # entries whose true value is 0 are excluded (become NaN)
    nonzero = (all_true != 0)
    masked_true = all_true[nonzero]
    mape = np.abs(all_pred[nonzero] - masked_true) / masked_true
    print("*** MAPE ***")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(all_true, all_pred)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())

    print()
    print("*** R2 ***")
    r2_per_metric = all_R2.mean(axis=1)
    print(r2_per_metric)
    print("R2 for all: ", r2_per_metric.mean(axis=0))

    # negative R2 values are replaced by 0 on a copy; all_R2 stays untouched
    edited_R2 = all_R2.mask(all_R2 < 0, 0)
    print()
    print("*** edited R2 ***")
    print(edited_R2.mean(axis=1))
    print("R2 for all: ", edited_R2.mean().mean())

    return all_pred, all_true, all_R2, all_bench
Пример #8
0
def test():
    """Run the 2016 experiment with per-player metric groups from getMetrics.

    Reads the player and season CSV files and builds the pivoted per-metric
    tables. For every active player it obtains metric groups from
    ``getMetrics`` and weights from ``getWeitghts(..., method="mean")``, fits
    the mRSC model once per group, and keeps from each fit only the row of the
    metric at the same position in ``allMetrics``. Per-metric MAPE and RMSE
    are printed, and the metric groups of all players are pickled to
    ``metrics_all.pkl``. Returns nothing.
    """
    # ---- import data ----
    print("* importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    players = players[players.year_start >=
                      1980]  # only choose players who started after 1980
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]

    # only after 1980
    stats = stats[stats.Year >= 1980]

    # without duplicated names
    # TODO: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    # ---- prepare data ----
    print("* preparing data")
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG", "FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats,
                                                 metricsWeightedColNames)

    allMetricsDict = {
        **metricsPerGameDict,
        **metricsPerCentDict,
        **metricsWeightedDict
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)
    allMetrics = list(allMetricsDict.keys())

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats,
                             values="Year",
                             index="Player",
                             columns="year_count")

    # ---- experiment setup ----
    pred_year = 2016
    # targets
    activePlayers = getActivePlayers(stats, pred_year, 4)
    activePlayers.sort()
    activePlayers.remove("Kevin Garnett")
    activePlayers.remove("Kobe Bryant")

    # overall setup
    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    # ---- experiment ----
    print("* start experiment")
    pred_all = pd.DataFrame()
    true_all = pd.DataFrame()
    metrics_all = []
    # The (still empty) list is written before the loop; presumably so that an
    # unwritable output path fails before the experiment runs -- TODO confirm.
    with open('metrics_all.pkl', 'wb') as f:
        pickle.dump(metrics_all, f, pickle.HIGHEST_PROTOCOL)
    for playerName in activePlayers:
        target = Target(playerName, allPivotedTableDict, df_year)
        donor = Donor(allPivotedTableDict, df_year)

        metrics_list = getMetrics(target,
                                  donor,
                                  pred_year,
                                  allMetrics,
                                  threshold,
                                  expSetup,
                                  boundary="threshold")
        weights_list = getWeitghts(target,
                                   donor,
                                   metrics_list,
                                   expSetup,
                                   method="mean")

        metrics_all.append(metrics_list)

        mrsc = mRSC(donor, target, probObservation=1)

        player_pred = pd.DataFrame()
        player_true = pd.DataFrame()
        for i, metric_group in enumerate(metrics_list):
            mrsc.fit_threshold(metric_group,
                               weights_list[i],
                               pred_year,
                               pred_length=1,
                               threshold=threshold,
                               setup=expSetup)

            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]

            # group i is used to predict allMetrics[i] only: keep just the
            # row at that metric's position inside the group
            c = metric_group.index(allMetrics[i])
            player_pred = pd.concat([player_pred, pred.iloc[c:(c + 1), :]],
                                    axis=0)
            player_true = pd.concat([player_true, true.iloc[c:(c + 1), :]],
                                    axis=0)

        pred_all = pd.concat([pred_all, player_pred], axis=1)
        true_all = pd.concat([true_all, player_true], axis=1)
        # (the per-iteration MAPE that used to be recomputed here was never
        # used; it is computed once after the loop)

    # ---- report ----
    print("******** RESULT ********")
    # entries whose true value is 0 become NaN in the denominator
    mask = (true_all != 0)
    mape = np.abs(pred_all - true_all) / true_all[mask]

    print()
    print("*** MAPE ***")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(pred_all, true_all)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())

    with open('metrics_all.pkl', 'wb') as f:
        pickle.dump(metrics_all, f, pickle.HIGHEST_PROTOCOL)
Пример #9
0
def test():
    """
    Single-player mRSC experiment for "Ryan Anderson": fit, plot, score.

    Reads the NBA player and season CSV files, builds one pivoted table per
    metric, fits the mRSC model once per metric group (2016 is passed as the
    year argument of ``fit_threshold``), plots the target's observed
    trajectory next to the donor-weighted reconstruction for every metric,
    and finally prints MAPE and RMSE of the collected predictions.

    Takes no arguments and returns nothing; results are printed to stdout
    and shown in a matplotlib figure.
    """
    # ---------------- import data ----------------
    print("importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # keep only the players whose first season is 1980 or later
    started_1980_or_later = players.year_start >= 1980
    players = players[started_1980_or_later]
    players["player_id"] = range(len(players.name))  # sequential id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    belongs_to_kept_player = stats.Player.isin(players.name)
    stats = stats[belongs_to_kept_player]

    # seasons from 1980 onwards only
    stats = stats[stats.Year >= 1980]

    # drop duplicated names
    # (open to-do: how to tell apart several players sharing one name)
    stats = removeDuplicated(players, stats)
    for int_column in ("Year", "year_count"):
        stats[int_column] = stats[int_column].astype(int)

    print("preparing data")
    # Turn stats into a dictionary holding one DataFrame per stat; the stats
    # are re-calculated so that there is one value per year.
    perGameDict = getMetricsPerGameDict(
        stats, ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"])
    perCentDict = getMetricsPerCentDict(stats, ["FG", "FT"])
    weightedDict = getMetricsWeightedDict(stats, ["PER"])

    # merge in this order so later dictionaries win on a key clash
    allMetricsDict = {}
    for partial_dict in (perGameDict, perCentDict, weightedDict):
        allMetricsDict.update(partial_dict)
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)
    allMetrics = list(allMetricsDict)  # not referenced again in this function

    # Player x year_count table of calendar years; used to mask the tables.
    df_year = pd.pivot_table(stats,
                             values="Year",
                             index="Player",
                             columns="year_count")

    # ---------------- experiment setup ----------------
    # overall setup
    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    # Metrics are fitted group by group. (An alternative split that was tried
    # before: offensive ["PTS_G","AST_G","TOV_G","PER_w","FG%","FT%","3P%"]
    # versus defensive ["TRB_G","STL_G","BLK_G"].)
    metrics_list = [
        ["PTS_G", "PER_w", "TRB_G", "3P_G"],
        ["FG%", "FT%"],
        ["BLK_G", "AST_G", "TOV_G", "STL_G"],
    ]

    playerName = "Ryan Anderson"

    target = Target(playerName, allPivotedTableDict, df_year)
    donor = Donor(allPivotedTableDict, df_year)

    weights_list = getWeitghts(target,
                               donor,
                               metrics_list,
                               expSetup,
                               method="var")

    mrsc = mRSC(donor, target, probObservation=1)

    # One subplot row per metric group, one column per metric in the group.
    # NOTE(review): the 3x5 grid is hard-coded; the groups above hold at most
    # four metrics, so some axes stay empty.
    _, axes = plt.subplots(3, 5)
    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for row, group in enumerate(metrics_list):
        mrsc.fit_threshold(group,
                           weights_list[row],
                           2016,
                           pred_length=1,
                           threshold=threshold,
                           setup=expSetup)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        # label the single column with the player so the frames stack cleanly
        pred.columns = [playerName]
        true.columns = [playerName]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)

        for col, metric in enumerate(group):
            # NOTE(review): total_index is read from `mrsc` here but from
            # `mrsc.model` below -- confirm both hold the same value.
            observed = target.data[metric].dropna(axis='columns')
            true_trajectory = observed.iloc[:, :mrsc.total_index]

            # Each metric occupies one block of `total_index` columns in the
            # donor matrix; weighting that block by beta gives the fitted
            # trajectory for this metric.
            block_start = col * mrsc.model.total_index
            block_stop = (col + 1) * mrsc.model.total_index
            donor_block = mrsc.model.donor_data.iloc[:, block_start:block_stop]
            pred_val = np.dot(donor_block.T, mrsc.model.beta).T
            pred_trajectory = pd.DataFrame(pred_val,
                                           columns=true_trajectory.columns,
                                           index=true_trajectory.index)

            # put a marker only on the last point of the fitted line
            last_point = [true_trajectory.shape[1] - 1]

            ax = axes[row, col]
            ax.plot(true_trajectory.T,
                    marker='o',
                    color='red',
                    label='true')
            ax.plot(pred_trajectory.T,
                    marker='o',
                    markevery=last_point,
                    color='blue',
                    label='prediction')
            ax.set_title(playerName + ": " + metric)

    for ax in axes.flat:
        ax.set(xlabel='years played in NBA')
    plt.subplots_adjust(wspace=0.5, hspace=0.5)
    plt.show()

    # ---------------- scores ----------------
    # Entries whose true value is 0 become NaN in the denominator and are
    # therefore left out of the means.
    # NOTE(review): the denominator is signed, so a negative true value
    # yields a negative term -- confirm that is intended.
    nonzero_true = player_true[player_true != 0]
    abs_error = np.abs(player_pred - player_true)
    mape = abs_error / nonzero_true
    print("*** MAPE ***")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(player_true, player_pred)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())