Exemplo n.º 1
0
    def __init__(self, train_data, test_data):
        """Keep the train/test data and prepare an untrained SPS regressor.

        Args:
            train_data: Stored unchanged on ``self.train_data``.
            test_data: Stored unchanged on ``self.test_data``.
        """
        # Collaborators are created in the same order as before.
        self.feature_creation = FeatureCreation()
        self.model = CatBoostRegressionModel(SPS_MODEL_PARAMS)

        self.train_data, self.test_data = train_data, test_data

        # Target column name and the (initially empty) list of feature names.
        self.regressand = 'SPS'
        self.regressors = []

        # Both progress flags start out False.
        self.created_features = self.trained_model = False
Exemplo n.º 2
0
    def __init__(self, train_data, test_data, site):
        """Keep the inputs and prepare an untrained ownership regressor.

        Args:
            train_data: Stored unchanged on ``self.train_data``.
            test_data: Stored unchanged on ``self.test_data``.
            site: Stored unchanged on ``self.site``.
        """
        # Collaborators are created in the same order as before.
        self.feature_creation = FeatureCreation()
        self.clean_data = CleanData()
        self.model = XGBoostRegressionModel(OWNERSHIP_MODEL_PARAMS)

        self.train_data, self.test_data = train_data, test_data
        self.site = site

        # Target column name and the (initially empty) list of feature names.
        self.regressand = 'OWNERSHIP'
        self.regressors = []

        # Both progress flags start out False.
        self.created_features = self.trained_model = False
Exemplo n.º 3
0
class CleanData(object):
    """Row filters and a name-normalisation helper for boxscore-style frames.

    The row filters read the ``SECONDSPLAYED`` and ``COMMENT`` columns and
    return a filtered frame; they do not modify the frame passed in.
    """

    # Comment value that marks a "did not play, coach's decision" row.
    _DNP_COACH = "DNP - Coach's Decision"

    def __init__(self):
        self.feature_creation = FeatureCreation()

    def drop_rows_player_inactive(self, df):
        """Return only the rows of ``df`` whose ``SECONDSPLAYED`` is positive."""
        df = df.loc[df['SECONDSPLAYED'] > 0]
        return df

    def drop_rows_player_injured(self, df):
        """Return rows with non-zero ``SECONDSPLAYED`` or a coach's-decision DNP.

        Every other zero-seconds row (any other ``COMMENT`` value, including
        a missing one) is removed.
        """
        df = df.loc[(df['SECONDSPLAYED'] != 0) |
                    (df['COMMENT'] == self._DNP_COACH)]
        return df

    def drop_rows_player_rest(self, df, thresh=1200):
        """Drop coach's-decision DNP rows for players averaging above ``thresh``.

        Args:
            df: Frame with ``SEASON``, ``PLAYERID``, ``SECONDSPLAYED`` and
                ``COMMENT`` columns.
            thresh: Rows are removed only when the helper column ``AVG_SP``
                is strictly greater than this value.

        Returns:
            The filtered frame, without the temporary ``AVG_SP`` column.
        """
        # AVG_SP is produced by FeatureCreation.expanding_mean, grouped per
        # season and player over SECONDSPLAYED.
        df = self.feature_creation.expanding_mean(
            df=df,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='SECONDSPLAYED',
            new_col_name='AVG_SP')
        df = df.loc[~((df['AVG_SP'] > thresh) &
                      (df['COMMENT'] == self._DNP_COACH))]
        df = df.drop(columns=['AVG_SP'])
        return df

    def roto_name_to_nba_name(self, name):
        """Reorder a comma-separated name and map it through the lookup table.

        The text after the last comma is moved to the front, so
        ``"Last, First"`` becomes ``"First Last"``. If the reordered name is a
        key of ``ROTO_NAME_TO_NBA_NAME`` the mapped value is returned,
        otherwise the reordered name itself.

        Args:
            name: Name string, usually in ``"Last, First"`` form.

        Returns:
            The mapped or reordered name, without surrounding whitespace.
        """
        name_list = name.split(',')
        name = "{} {}".format(name_list[-1].lstrip(), ' '.join(name_list[:-1]))
        # A name without a comma has nothing before the last part, which used
        # to leave a trailing space and make the table lookup miss.
        name = name.strip()
        if name in ROTO_NAME_TO_NBA_NAME:
            return ROTO_NAME_TO_NBA_NAME[name]
        return name
Exemplo n.º 4
0
    def generate_regressors(self, boxscores, start_date, end_date):
        """Add assists-per-possession features for games in a date window.

        Rows from every season that has at least one game inside
        ``[start_date, end_date]`` are used to compute the features; only the
        rows inside the window are returned.

        Args:
            boxscores: Frame with ``DATE``, ``SEASON``, ``TEAM``, ``PLAYERID``,
                ``AST`` and ``POSS`` columns.
            start_date: Inclusive lower bound compared against ``DATE``.
            end_date: Inclusive upper bound compared against ``DATE``.

        Returns:
            The windowed rows with ``ASSISTS/POSSESSION`` and
            ``AVG_ASSISTS/POSSESSION`` added.
        """
        feature_creation = FeatureCreation()

        in_window = (boxscores['DATE'] >= start_date) & (
            boxscores['DATE'] <= end_date)
        relevant_seasons = boxscores.loc[in_window, 'SEASON'].unique()
        # Take an explicit copy: the column added below would otherwise be
        # written onto a filtered slice (SettingWithCopyWarning).
        boxscores = boxscores.loc[
            boxscores['SEASON'].isin(relevant_seasons)].copy()

        boxscores['ASSISTS/POSSESSION'] = boxscores['AST'] / boxscores['POSS']

        # average player assists/possession, weighted by possessions
        boxscores = feature_creation.expanding_weighted_mean(
            df=boxscores,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='ASSISTS/POSSESSION',
            new_col_name='AVG_ASSISTS/POSSESSION',
            weight_col_name='POSS')

        # The mask is rebuilt because the helper returned a new frame.
        boxscores = boxscores.loc[(boxscores['DATE'] >= start_date)
                                  & (boxscores['DATE'] <= end_date)]

        return boxscores
Exemplo n.º 5
0
    def __init__(self, train_data, test_data):
        """Keep the train/test data and prepare an untrained top-score regressor.

        Args:
            train_data: Stored unchanged on ``self.train_data``.
            test_data: Stored unchanged on ``self.test_data``.
        """
        # Collaborators are created in the same order as before.
        self.feature_creation = FeatureCreation()
        self.clean_data = CleanData()
        self.model = XGBoostRegressionModel(TOPSCORE_MODEL_PARAMS)

        self.train_data, self.test_data = train_data, test_data

        # Target column name and the feature names that are known up front.
        self.regressand = 'TOPSCORE'
        self.regressors = ['GAMECOUNT', 'TOTALENTRIES', 'AVERAGE_TOTAL']

        # Both progress flags start out False.
        self.created_features = self.trained_model = False
Exemplo n.º 6
0
class VarianceModel(object):
    """Attach an expanding standard deviation column to the test data."""

    def __init__(self, test_data):
        """Remember ``test_data`` and the columns it had on arrival."""
        self.feature_creation = FeatureCreation()
        self.test_data = test_data
        self.original_columns = list(test_data.columns)

    def predict(self, y):
        """Compute ``STD_<y>`` and return it next to the original columns.

        Args:
            y: Name of the column whose expanding standard deviation is wanted.

        Returns:
            A ``(frame, column_name)`` tuple: the original columns plus the
            new ``STD_<y>`` column, and that column's name.
        """
        std_col = 'STD_{}'.format(y)

        # The helper's result replaces the stored test data, so columns from
        # earlier calls stay available on ``self.test_data``.
        with_std = self.feature_creation.expanding_standard_deviation(
            df=self.test_data,
            group_col_names=['SEASON', 'PLAYERID', 'START'],
            col_name=y,
            new_col_name=std_col,
            min_periods=4)
        self.test_data = with_std

        wanted = self.original_columns + [std_col]
        return with_std[wanted], std_col
Exemplo n.º 7
0
class RPSModel(object):
    def __init__(self, train_data, test_data):
        """Keep the train/test data and prepare an untrained RPS regressor.

        Args:
            train_data: Stored unchanged on ``self.train_data``.
            test_data: Stored unchanged on ``self.test_data``.
        """
        # Collaborators are created in the same order as before.
        self.feature_creation = FeatureCreation()
        self.model = CatBoostRegressionModel(RPS_MODEL_PARAMS)

        self.train_data, self.test_data = train_data, test_data

        # Target column name and the (initially empty) list of feature names.
        self.regressand = 'RPS'
        self.regressors = []

        # All three progress flags start out False.
        self.created_features = self.generated_weights = self.trained_model = False

    def create_features(self, odds_data, sp_threshold=60):
        """Build every regressor column and re-split the data into train/test.

        Train and test data are concatenated so the expanding/rolling features
        are computed over one frame, then split again by their original
        ``(GAMEID, PLAYERID)`` keys.

        Side effects: appends feature names to ``self.regressors``, replaces
        ``self.train_data`` and ``self.test_data`` with the enriched frames,
        sets ``self.weight`` (via ``generate_weights``) and sets
        ``self.created_features`` to True. The method is meant to be called
        once per instance; a second call would append the names again.

        Args:
            odds_data: Frame with at least ``PERIOD``, ``TOTAL``, ``DATE`` and
                ``TEAM`` columns; only ``PERIOD == 'Full Game'`` rows are used.
            sp_threshold: Games with ``SECONDSPLAYED`` at or below this value
                are blanked out in the ``CLEAN_*`` columns.
        """
        data = pd.concat([self.train_data, self.test_data])

        data['REB'] = data['DREB'] + data['OREB']
        data[self.regressand] = data['REB']/data['SECONDSPLAYED']
        data['ORPS'] = data['OREB']/data['SECONDSPLAYED']
        data['DRPS'] = data['DREB']/data['SECONDSPLAYED']

        # "Clean" rates are NaN for games at or below the playing-time threshold.
        data['CLEAN_DRPS'] = data['DRPS']
        data.loc[data['SECONDSPLAYED'] <= sp_threshold, 'CLEAN_DRPS'] = np.nan
        data['CLEAN_ORPS'] = data['ORPS']
        data.loc[data['SECONDSPLAYED'] <= sp_threshold, 'CLEAN_ORPS'] = np.nan

        # Keys used at the end to tell train rows from test rows again.
        train_index = self.train_data.set_index(['GAMEID', 'PLAYERID']).index
        test_index = self.test_data.set_index(['GAMEID', 'PLAYERID']).index

        # season averages
        data = self.feature_creation.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='DRPS', weight_col_name='SECONDSPLAYED',
            new_col_name='AVG_DRPS'
        )
        data = self.feature_creation.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='ORPS', weight_col_name='SECONDSPLAYED',
            new_col_name='AVG_ORPS'
        )
        self.regressors.append('AVG_DRPS')
        self.regressors.append('AVG_ORPS')

        data = self.feature_creation.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'OPP_TEAM', 'PLAYERID'], col_name=self.regressand,
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_Y_OPP_TEAM'
        )
        self.regressors.append('AVG_Y_OPP_TEAM')

        # 1 game lags
        data = self.feature_creation.lag(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='CLEAN_DRPS', new_col_name='L1_DRPS',
            n_shift=1
        )
        self.regressors.append('L1_DRPS')

        # exponentially weighted means
        data = self.feature_creation.expanding_ewm(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='DRPS', new_col_name='EWM_DRPS',
            alpha=0.90
        )
        data = self.feature_creation.expanding_ewm(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='ORPS', new_col_name='EWM_ORPS',
            alpha=0.90
        )
        self.regressors.append('EWM_DRPS')
        self.regressors.append('EWM_ORPS')

        # moving averages
        data = self.feature_creation.rolling_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='DRPS', new_col_name='MA2_DRPS',
            weight_col_name='SECONDSPLAYED', n_rolling=2, min_periods=1
        )
        data = self.feature_creation.rolling_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='DRPS', new_col_name='MA15_DRPS',
            weight_col_name='SECONDSPLAYED', n_rolling=15, min_periods=8
        )
        data = self.feature_creation.rolling_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='ORPS', new_col_name='MA6_ORPS',
            weight_col_name='SECONDSPLAYED', n_rolling=6, min_periods=3
        )
        data = self.feature_creation.rolling_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='ORPS', new_col_name='MA18_ORPS',
            weight_col_name='SECONDSPLAYED', n_rolling=18, min_periods=9
        )
        self.regressors.append('MA2_DRPS')
        self.regressors.append('MA15_DRPS')
        self.regressors.append('MA6_ORPS')
        self.regressors.append('MA18_ORPS')

        # start
        data = self.feature_creation.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID', 'START'], col_name=self.regressand,
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_Y_R'
        )
        self.regressors.append('AVG_Y_R')

        # position: keep only the part before the first '-'. The vectorised
        # string accessor leaves a missing position as NaN instead of raising.
        data['NORM_POS'] = data['POSITION'].str.split('-').str[0]
        data['GUARD'] = 0
        data.loc[data['NORM_POS'] == 'Guard', 'GUARD'] = 1
        self.regressors.append('GUARD')

        # defense
        data = self.feature_creation.expanding_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='DREB', new_col_name='AVG_DREB'
        )
        data = self.feature_creation.expanding_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='OREB', new_col_name='AVG_OREB'
        )
        data = self.feature_creation.expanding_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='SECONDSPLAYED', new_col_name='AVG_SP'
        )

        # Per game and opponent: actual defensive rebound rate minus the rate
        # implied by the players' own averages, then averaged per opponent.
        temp = data.dropna(subset=['DREB', 'SECONDSPLAYED', 'AVG_DREB', 'AVG_SP'])
        grouped_defensive_boxscores = temp.groupby(['SEASON', 'DATE', 'OPP_TEAM']).apply(
            lambda x: pd.Series({
                'TEAM_DRPS_ALLOWED': x['DREB'].sum()/x['SECONDSPLAYED'].sum(),
                'TEAM_DRPS_AVG': x['AVG_DREB'].sum()/x['AVG_SP'].sum()
            })
        ).reset_index()
        grouped_defensive_boxscores['TEAM_DRPS_DIFF_ALLOWED'] = grouped_defensive_boxscores['TEAM_DRPS_ALLOWED'] - \
            grouped_defensive_boxscores['TEAM_DRPS_AVG']
        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores, group_col_names=['SEASON', 'OPP_TEAM'], col_name='TEAM_DRPS_DIFF_ALLOWED',
            new_col_name='AVG_TEAM_DRPS_DIFF_ALLOWED', order_idx_name='DATE', min_periods=5
        )
        data = data.merge(grouped_defensive_boxscores, on=['SEASON', 'DATE', 'OPP_TEAM'], how='left')
        self.regressors.append('AVG_TEAM_DRPS_DIFF_ALLOWED')

        # Same comparison, additionally split by the START flag.
        temp = data.dropna(subset=['DREB', 'SECONDSPLAYED', 'AVG_DREB', 'AVG_SP'])
        grouped_defensive_boxscores = temp.groupby(['SEASON', 'DATE', 'START', 'OPP_TEAM']).apply(
            lambda x: pd.Series({
                'TEAM_DRPS_ALLOWED_R': x['DREB'].sum()/x['SECONDSPLAYED'].sum(),
                'TEAM_DRPS_AVG_R': x['AVG_DREB'].sum()/x['AVG_SP'].sum()
            })
        ).reset_index()
        grouped_defensive_boxscores['TEAM_DRPS_DIFF_ALLOWED_R'] = grouped_defensive_boxscores['TEAM_DRPS_ALLOWED_R'] - \
            grouped_defensive_boxscores['TEAM_DRPS_AVG_R']
        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores, group_col_names=['SEASON', 'START', 'OPP_TEAM'],
            col_name='TEAM_DRPS_DIFF_ALLOWED_R', new_col_name='AVG_TEAM_DRPS_DIFF_ALLOWED_R', order_idx_name='DATE',
            min_periods=5
        )
        data = data.merge(grouped_defensive_boxscores, on=['SEASON', 'DATE', 'START', 'OPP_TEAM'], how='left')
        self.regressors.append('AVG_TEAM_DRPS_DIFF_ALLOWED_R')

        # Same comparison split by normalised position, for both rebound types.
        temp = data.dropna(subset=['DREB', 'OREB', 'SECONDSPLAYED', 'AVG_DREB', 'AVG_OREB', 'AVG_SP'])
        grouped_defensive_boxscores = temp.groupby(['SEASON', 'DATE', 'NORM_POS', 'OPP_TEAM']).apply(
            lambda x: pd.Series({
                'TEAM_DRPS_ALLOWED_P': x['DREB'].sum()/x['SECONDSPLAYED'].sum(),
                'TEAM_DRPS_AVG_P': x['AVG_DREB'].sum()/x['AVG_SP'].sum(),
                'TEAM_ORPS_ALLOWED_P': x['OREB'].sum()/x['SECONDSPLAYED'].sum(),
                'TEAM_ORPS_AVG_P': x['AVG_OREB'].sum()/x['AVG_SP'].sum()
            })
        ).reset_index()
        grouped_defensive_boxscores['TEAM_DRPS_DIFF_ALLOWED_P'] = grouped_defensive_boxscores['TEAM_DRPS_ALLOWED_P'] - \
            grouped_defensive_boxscores['TEAM_DRPS_AVG_P']
        grouped_defensive_boxscores['TEAM_ORPS_DIFF_ALLOWED_P'] = grouped_defensive_boxscores['TEAM_ORPS_ALLOWED_P'] - \
            grouped_defensive_boxscores['TEAM_ORPS_AVG_P']
        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores, group_col_names=['SEASON', 'NORM_POS', 'OPP_TEAM'],
            col_name='TEAM_DRPS_DIFF_ALLOWED_P', new_col_name='AVG_TEAM_DRPS_DIFF_ALLOWED_P', order_idx_name='DATE',
            min_periods=5
        )
        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores, group_col_names=['SEASON', 'NORM_POS', 'OPP_TEAM'],
            col_name='TEAM_ORPS_DIFF_ALLOWED_P', new_col_name='AVG_TEAM_ORPS_DIFF_ALLOWED_P', order_idx_name='DATE',
            min_periods=5
        )
        data = data.merge(grouped_defensive_boxscores, on=['SEASON', 'DATE', 'NORM_POS', 'OPP_TEAM'], how='left')
        self.regressors.append('AVG_TEAM_DRPS_DIFF_ALLOWED_P')
        self.regressors.append('AVG_TEAM_ORPS_DIFF_ALLOWED_P')

        # total
        # Copy before editing TOTAL so the caller's odds frame is not written
        # through a filtered slice (SettingWithCopyWarning).
        full_game_odds = odds_data.loc[odds_data['PERIOD'] == 'Full Game'].copy()
        full_game_odds['TOTAL'] = full_game_odds['TOTAL'].replace(['PK', '-'], np.nan)
        data = data.merge(full_game_odds, on=['DATE', 'TEAM'], how='left')
        self.regressors.append('TOTAL')

        # injuries
        data = self.feature_creation.expanding_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='REB', new_col_name='AVG_REB'
        )

        # Per game and team: rate implied by the listed players' averages
        # versus the team's own running average of its actual rate.
        temp = data.dropna(subset=['DREB', 'AVG_DREB', 'SECONDSPLAYED', 'AVG_SP'])
        temp = temp.groupby(['SEASON', 'DATE', 'TEAM']).apply(
            lambda x: pd.Series({
                'TEAM_ACTIVE_AVG_DRPS': x['AVG_DREB'].sum()/x['AVG_SP'].sum(),
                'TEAM_DRPS': x['DREB'].sum()/x['SECONDSPLAYED'].sum(),
                'TEAM_ACTIVE_AVG_RPS': x['AVG_REB'].sum()/x['AVG_SP'].sum(),
                'TEAM_RPS': x['REB'].sum()/x['SECONDSPLAYED'].sum()
            })
        )
        temp = self.feature_creation.expanding_mean(
            df=temp, group_col_names=['SEASON', 'TEAM'], col_name='TEAM_DRPS', new_col_name='AVG_TEAM_DRPS'
        )
        temp['TEAM_ACTIVE_AVG_DRPS_DIFF'] = temp['TEAM_ACTIVE_AVG_DRPS'] - temp['AVG_TEAM_DRPS']
        data = data.merge(temp, on=['DATE', 'TEAM'], how='left')
        self.regressors.append('TEAM_ACTIVE_AVG_DRPS_DIFF')

        # regressand by lineup
        # Each (game, team) gets two string keys: the sorted ids of its
        # starters and the sorted ids of players with AVG_DREB >= 7.
        data['START_LINEUP'] = np.nan
        data['STARS'] = np.nan
        data = data.set_index(['GAMEID', 'TEAM'])
        for (game_id, team), temp in data.groupby(['GAMEID', 'TEAM']):
            start_lineup = list(temp.loc[temp['START'] == 1, 'PLAYERID'].values)
            start_lineup.sort()
            start_lineup = '_'.join(start_lineup)
            data.loc[(game_id, team), 'START_LINEUP'] = start_lineup

            stars = list(temp.loc[temp['AVG_DREB'] >= 7, 'PLAYERID'].values)
            stars.sort()
            stars = '_'.join(stars)
            data.loc[(game_id, team), 'STARS'] = stars
        data = data.reset_index()

        data = self.feature_creation.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'START_LINEUP', 'PLAYERID'], col_name=self.regressand,
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_Y_STARTERS'
        )

        data = self.feature_creation.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'STARS', 'PLAYERID'], col_name=self.regressand,
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_Y_STARS'
        )
        self.regressors.append('AVG_Y_STARTERS')
        self.regressors.append('AVG_Y_STARS')

        # misc
        data['GP'] = 1
        data = self.feature_creation.expanding_sum(
            df=data, group_col_names=['SEASON', 'PLAYERID'], col_name='GP', new_col_name='COUNT_GP'
            )
        self.regressors.append('COUNT_GP')
        self.regressors.append('AVG_SP')

        # to fill
        data = self.feature_creation.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name=self.regressand,
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_Y'
        )

        data = self.generate_weights(data)
        data = self.preprocess(data)
        data = data.set_index(['GAMEID', 'PLAYERID'])

        # Split by key membership so rows keep the frame's own order; going
        # through a Python set made the row order vary from run to run.
        self.train_data = data.loc[data.index.isin(train_index)].reset_index()
        self.test_data = data.loc[data.index.isin(test_index)].reset_index()

        self.created_features = True

    def preprocess(self, data):
        data['AVG_Y_R'] = data['AVG_Y_R'].fillna(data['AVG_Y'])
        data['AVG_Y_OPP_TEAM'] = data['AVG_Y_OPP_TEAM'].fillna(data['AVG_Y'])

        data['L1_DRPS'] = data['L1_DRPS'].fillna(data['AVG_DRPS'])

        data['EWM_DRPS'] = data['EWM_DRPS'].fillna(data['AVG_DRPS'])
        data['EWM_ORPS'] = data['EWM_ORPS'].fillna(data['AVG_ORPS'])

        data['MA2_DRPS'] = data['MA2_DRPS'].fillna(data['AVG_DRPS'])
        data['MA15_DRPS'] = data['MA15_DRPS'].fillna(data['MA2_DRPS'])
        data['MA6_ORPS'] = data['MA6_ORPS'].fillna(data['AVG_ORPS'])
        data['MA18_ORPS'] = data['MA18_ORPS'].fillna(data['MA6_ORPS'])

        data['AVG_TEAM_DRPS_DIFF_ALLOWED'] = data['AVG_TEAM_DRPS_DIFF_ALLOWED'].fillna(0)
        data['AVG_TEAM_DRPS_DIFF_ALLOWED_R'] = data['AVG_TEAM_DRPS_DIFF_ALLOWED_R'].fillna(0)
        data['AVG_TEAM_DRPS_DIFF_ALLOWED_P'] = data['AVG_TEAM_DRPS_DIFF_ALLOWED_P'].fillna(0)
        data['AVG_TEAM_ORPS_DIFF_ALLOWED_P'] = data['AVG_TEAM_ORPS_DIFF_ALLOWED_P'].fillna(0)

        data['TOTAL'] = data['TOTAL'].fillna(200)

        data['TEAM_ACTIVE_AVG_DRPS_DIFF'] = data['TEAM_ACTIVE_AVG_DRPS_DIFF'].fillna(0)
        data['AVG_Y_STARS'] = data['AVG_Y_STARS'].fillna(data['AVG_Y'])
        data['AVG_Y_STARTERS'] = data['AVG_Y_STARTERS'].fillna(data['AVG_Y_STARS'])

        data['COUNT_GP'] = data['COUNT_GP'].fillna(0)

        # we can predict Y for a player as long as AVG_Y is not nan
        data = data.dropna(subset=['AVG_Y'])

        return data

    def generate_weights(self, data):
        """Add the per-row sample weight column used when fitting the model.

        The weight is the product of two ``WeightFunctions`` values: one
        computed from the row's ``SECONDSPLAYED`` and one from ``SUM_SP``,
        which ``FeatureCreation.expanding_sum`` builds per season and player.

        Side effects: sets ``self.weight`` to the weight column name and marks
        ``self.generated_weights`` as True.

        Args:
            data: Frame with ``SEASON``, ``PLAYERID`` and ``SECONDSPLAYED``.

        Returns:
            The frame with ``SUM_SP`` and ``WEIGHT`` columns added.
        """
        data = self.feature_creation.expanding_sum(
            df=data, group_col_names=['SEASON', 'PLAYERID'], col_name='SECONDSPLAYED', new_col_name='SUM_SP'
        )

        self.weight = 'WEIGHT'
        game_weight = data['SECONDSPLAYED'].apply(WeightFunctions.game_seconds_played_weight)
        season_weight = data['SUM_SP'].apply(WeightFunctions.season_seconds_played_weight)
        data[self.weight] = game_weight * season_weight

        # The constructor declares this flag, but nothing in the class ever
        # switched it on, unlike created_features and trained_model.
        self.generated_weights = True

        return data

    def train_model(self):
        if not self.created_features:
            raise Exception('Must create features before training model')

        # drop games in which players played a minute or less
        self.train_data = self.train_data.loc[self.train_data['SECONDSPLAYED'] > 60]

        X = self.train_data[self.regressors]
        y = self.train_data[self.regressand]
        w = self.train_data[self.weight]
        self.model.fit(X, y, sample_weight=w, test_size=0.25, early_stopping_rounds=25)

        self.trained_model = True

    def predict(self):
        if not self.trained_model:
            raise Exception('Must train model before generating predictions')

        self.test_data['{}_HAT'.format(self.regressand)] = self.model.predict(self.test_data[self.regressors])

        return self.test_data[['GAMEID', 'PLAYERID', '{}_HAT'.format(self.regressand)]]
Exemplo n.º 8
0
    def generate_regressors(self, boxscores, start_date, end_date):
        """Add possessions-per-minute features for games in a date window.

        Rows from every season that has at least one game inside
        ``[start_date, end_date]`` are used to compute the features; only the
        rows inside the window are returned.

        Args:
            boxscores: Player-game frame with ``DATE``, ``SEASON``, ``TEAM``,
                ``OPP_TEAM``, ``PLAYERID``, ``SECONDSPLAYED`` and ``POSS``.
            start_date: Inclusive lower bound compared against ``DATE``.
            end_date: Inclusive upper bound compared against ``DATE``.

        Returns:
            The windowed rows with the intermediate team/season columns and
            the two final estimates ``PLAYER_POSSESSIONS/MINUTE`` and
            ``OPP_POSSESSIONS/MINUTE_ALLOWED``.
        """
        feature_creation = FeatureCreation()

        relevant_seasons = boxscores.loc[(boxscores['DATE'] >= start_date) & (
            boxscores['DATE'] <= end_date)]['SEASON'].unique()
        # Take an explicit copy: the columns added below would otherwise be
        # written onto a filtered slice (SettingWithCopyWarning).
        boxscores = boxscores.loc[
            boxscores['SEASON'].isin(relevant_seasons)].copy()

        boxscores['MP'] = boxscores['SECONDSPLAYED'] / 60
        boxscores['POSSESSIONS/MINUTE'] = boxscores['POSS'] / boxscores['MP']

        # average player possessions/minute
        boxscores = feature_creation.expanding_weighted_mean(
            df=boxscores,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='POSSESSIONS/MINUTE',
            new_col_name='AVG_POSSESSIONS/MINUTE',
            weight_col_name='MP')

        # average possessions/minute that opp team allowed
        team_boxscores = boxscores.groupby([
            'SEASON', 'DATE', 'TEAM', 'OPP_TEAM'
        ]).apply(lambda x: pd.Series({
            'TEAM_POSSESSIONS': x['POSS'].sum(),
            'TEAM_MP': x['MP'].sum()
        })).reset_index()
        team_boxscores['TEAM_POSSESSIONS/MINUTE'] = team_boxscores[
            'TEAM_POSSESSIONS'] / team_boxscores['TEAM_MP']

        # Swapping TEAM and OPP_TEAM lets the merge below put each team's row
        # next to its opponent's totals for the same game.
        opp_team_boxscores = team_boxscores.rename(
            columns={
                'TEAM': 'OPP_TEAM',
                'OPP_TEAM': 'TEAM',
                'TEAM_POSSESSIONS': 'OPP_TEAM_POSSESSIONS',
                'TEAM_MP': 'OPP_TEAM_MP',
                'TEAM_POSSESSIONS/MINUTE': 'OPP_TEAM_POSSESSIONS/MINUTE'
            })

        team_boxscores = team_boxscores.merge(
            opp_team_boxscores,
            on=['SEASON', 'DATE', 'TEAM', 'OPP_TEAM'],
            how='left')

        team_boxscores = feature_creation.expanding_weighted_mean(
            df=team_boxscores,
            group_col_names=['SEASON', 'OPP_TEAM'],
            col_name='TEAM_POSSESSIONS/MINUTE',
            new_col_name='AVG_POSSESSIONS/MINUTE_OPP_TEAM_ALLOWED',
            weight_col_name='TEAM_MP')

        # average possessions/minute that opp team played against
        season_stats = team_boxscores.groupby(
            ['SEASON', 'TEAM']).apply(lambda x: pd.Series({
                'TEAM_POSSESSIONS(SEASON)':
                x['TEAM_POSSESSIONS'].mean(),
                'TEAM_MP(SEASON)':
                x['TEAM_MP'].mean(),
                'TEAM_POSSESSIONS_ALLOWED(SEASON)':
                x['OPP_TEAM_POSSESSIONS'].mean(),
                'TEAM_MP_ALLOWED(SEASON)':
                x['OPP_TEAM_MP'].mean()
            })).reset_index()

        season_stats['TEAM_POSSESSIONS/MINUTE(SEASON)'] = season_stats[
            'TEAM_POSSESSIONS(SEASON)'] / season_stats['TEAM_MP(SEASON)']
        season_stats['TEAM_POSSESSIONS/MINUTE_ALLOWED(SEASON)'] = \
            season_stats['TEAM_POSSESSIONS_ALLOWED(SEASON)']/season_stats['TEAM_MP_ALLOWED(SEASON)']

        # The same season table, relabelled so it can be joined on OPP_TEAM.
        opp_season_stats = season_stats.rename(
            columns={
                'TEAM':
                'OPP_TEAM',
                'TEAM_POSSESSIONS(SEASON)':
                'OPP_TEAM_POSSESSIONS(SEASON)',
                'TEAM_MP(SEASON)':
                'OPP_TEAM_MP(SEASON)',
                'TEAM_POSSESSIONS_ALLOWED(SEASON)':
                'OPP_TEAM_POSSESSIONS_ALLOWED(SEASON)',
                'TEAM_MP_ALLOWED(SEASON)':
                'OPP_TEAM_MP_ALLOWED(SEASON)',
                'TEAM_POSSESSIONS/MINUTE(SEASON)':
                'OPP_TEAM_POSSESSIONS/MINUTE(SEASON)',
                'TEAM_POSSESSIONS/MINUTE_ALLOWED(SEASON)':
                'OPP_TEAM_POSSESSIONS/MINUTE_ALLOWED(SEASON)'
            })

        team_boxscores = team_boxscores.merge(season_stats,
                                              on=['SEASON', 'TEAM'],
                                              how='left')
        team_boxscores = team_boxscores.merge(opp_season_stats,
                                              on=['SEASON', 'OPP_TEAM'],
                                              how='left')

        team_boxscores = feature_creation.expanding_weighted_mean(
            df=team_boxscores,
            group_col_names=['SEASON', 'OPP_TEAM'],
            col_name='TEAM_POSSESSIONS/MINUTE(SEASON)',
            new_col_name='AVG_POSSESSIONS/MINUTE(SEASON)_OPP_TEAM_P.A.',
            weight_col_name='OPP_TEAM_MP')

        # possessions/minute allowed that player played against
        boxscores = boxscores.merge(team_boxscores,
                                    on=['SEASON', 'DATE', 'TEAM', 'OPP_TEAM'],
                                    how='left')

        boxscores = feature_creation.expanding_weighted_mean(
            df=boxscores,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='OPP_TEAM_POSSESSIONS/MINUTE_ALLOWED(SEASON)',
            new_col_name='AVG_POSSESSIONS/MINUTE_ALLOWED(SEASON)_PLAYER_P.A.',
            weight_col_name='MP')

        # player possessions/minute: twice the player's own average minus the
        # average rate allowed by the opponents the player faced
        boxscores['PLAYER_POSSESSIONS/MINUTE'] = \
            2*boxscores['AVG_POSSESSIONS/MINUTE'] - boxscores['AVG_POSSESSIONS/MINUTE_ALLOWED(SEASON)_PLAYER_P.A.']

        # opp possessions/minute allowed: twice what the opponent allowed minus
        # the season rate of the teams that opponent played against
        boxscores['OPP_POSSESSIONS/MINUTE_ALLOWED'] = \
            2*boxscores['AVG_POSSESSIONS/MINUTE_OPP_TEAM_ALLOWED'] - boxscores['AVG_POSSESSIONS/MINUTE(SEASON)_OPP_TEAM_P.A.']

        boxscores = boxscores.loc[(boxscores['DATE'] >= start_date)
                                  & (boxscores['DATE'] <= end_date)]

        return boxscores
Exemplo n.º 9
0
    def generate_regressors(self, boxscores, start_date, end_date):
        feature_creation = FeatureCreation()

        relevant_seasons = boxscores.loc[(boxscores['DATE'] >= start_date) & (
            boxscores['DATE'] <= end_date)]['SEASON'].unique()
        boxscores = boxscores.loc[boxscores['SEASON'].isin(relevant_seasons)]

        team_boxscores = boxscores.groupby([
            'SEASON', 'DATE', 'TEAM', 'OPP_TEAM'
        ]).apply(lambda x: pd.Series({
            'TEAM_POSSESSIONS': x['POSS'].sum() / 5,
            'TEAM_OREB': x['OREB'].sum(),
            'TEAM_DREB': x['DREB'].sum()
        })).reset_index()

        opp_team_boxscores = team_boxscores.drop(columns='OPP_TEAM')
        opp_team_boxscores = opp_team_boxscores.rename(
            columns={
                'TEAM': 'OPP_TEAM',
                'TEAM_POSSESSIONS': 'OPP_TEAM_POSSESSIONS',
                'TEAM_OREB': 'OPP_TEAM_OREB',
                'TEAM_DREB': 'OPP_TEAM_DREB'
            })
        team_boxscores = team_boxscores.merge(
            opp_team_boxscores, on=['SEASON', 'DATE', 'OPP_TEAM'], how='left')

        team_boxscores['TEAM_OREB_CHANCES'] = team_boxscores[
            'TEAM_OREB'] + team_boxscores['OPP_TEAM_DREB']
        team_boxscores['TEAM_DREB_CHANCES'] = team_boxscores[
            'TEAM_DREB'] + team_boxscores['OPP_TEAM_OREB']

        # average team oreb chances/possession
        team_boxscores['TEAM_OREB_CHANCES/POSSESSION'] = team_boxscores[
            'TEAM_OREB_CHANCES'] / team_boxscores['TEAM_POSSESSIONS']

        team_boxscores = feature_creation.expanding_weighted_mean(
            df=team_boxscores,
            group_col_names=['SEASON', 'TEAM'],
            col_name='TEAM_OREB_CHANCES/POSSESSION',
            new_col_name='AVG_TEAM_OREB_CHANCES/POSSESSION',
            weight_col_name='TEAM_POSSESSIONS')

        # average oreb chances/possession that opp team allowed
        team_boxscores = feature_creation.expanding_weighted_mean(
            df=team_boxscores,
            group_col_names=['SEASON', 'OPP_TEAM'],
            col_name='TEAM_OREB_CHANCES/POSSESSION',
            new_col_name='AVG_OREB_CHANCES/POSSESSION_OPP_TEAM_ALLOWED',
            weight_col_name='TEAM_POSSESSIONS')

        # average oreb chances/possession allowed that team played against
        season_stats = team_boxscores.groupby(
            ['SEASON', 'TEAM']).apply(lambda x: pd.Series({
                'TEAM_OREB_ALLOWED(SEASON)':
                x['OPP_TEAM_OREB'].mean(),
                'TEAM_OREB_CHANCES(SEASON)':
                x['TEAM_OREB_CHANCES'].mean(),
                'TEAM_OREB_CHANCES_ALLOWED(SEASON)':
                x['TEAM_DREB_CHANCES'].mean(),
                'TEAM_DREB_ALLOWED(SEASON)':
                x['OPP_TEAM_DREB'].mean(),
                'TEAM_DREB_CHANCES(SEASON)':
                x['TEAM_DREB_CHANCES'].mean(),
                'TEAM_DREB_CHANCES_ALLOWED(SEASON)':
                x['TEAM_DREB_CHANCES'].mean(),
                'TEAM_POSSESSIONS(SEASON)':
                x['TEAM_POSSESSIONS'].mean(),
                'TEAM_POSSESSIONS_ALLOWED(SEASON)':
                x['OPP_TEAM_POSSESSIONS'].mean()
            })).reset_index()

        opp_season_stats = season_stats.rename(
            columns={
                'TEAM':
                'OPP_TEAM',
                'TEAM_OREB_ALLOWED(SEASON)':
                'OPP_TEAM_OREB_ALLOWED(SEASON)',
                'TEAM_OREB_CHANCES(SEASON)':
                'OPP_TEAM_OREB_CHANCES(SEASON)',
                'TEAM_OREB_CHANCES_ALLOWED(SEASON)':
                'OPP_TEAM_OREB_CHANCES_ALLOWED(SEASON)',
                'TEAM_DREB_ALLOWED(SEASON)':
                'OPP_TEAM_DREB_ALLOWED(SEASON)',
                'TEAM_DREB_CHANCES(SEASON)':
                'OPP_TEAM_DREB_CHANCES(SEASON)',
                'TEAM_DREB_CHANCES_ALLOWED(SEASON)':
                'OPP_TEAM_DREB_CHANCES_ALLOWED(SEASON)',
                'TEAM_POSSESSIONS(SEASON)':
                'OPP_TEAM_POSSESSIONS(SEASON)',
                'TEAM_POSSESSIONS_ALLOWED(SEASON)':
                'OPP_TEAM_POSSESSIONS_ALLOWED(SEASON)'
            })

        team_boxscores = team_boxscores.merge(season_stats,
                                              on=['SEASON', 'TEAM'],
                                              how='left')
        team_boxscores = team_boxscores.merge(opp_season_stats,
                                              on=['SEASON', 'OPP_TEAM'],
                                              how='left')

        team_boxscores['OPP_TEAM_OREB_CHANCES/POSSESSION_ALLOWED(SEASON)'] = \
            team_boxscores['OPP_TEAM_OREB_CHANCES_ALLOWED(SEASON)']/team_boxscores['OPP_TEAM_POSSESSIONS_ALLOWED(SEASON)']

        team_boxscores = feature_creation.expanding_weighted_mean(
            df=team_boxscores,
            group_col_names=['SEASON', 'TEAM'],
            col_name='OPP_TEAM_OREB_CHANCES/POSSESSION_ALLOWED(SEASON)',
            new_col_name=
            'AVG_OREB_CHANCES/POSSESSION_ALLOWED(SEASON)_TEAM_P.A.',
            weight_col_name='TEAM_POSSESSIONS')

        # average oreb chances/possession that opp team played against
        team_boxscores['TEAM_OREB_CHANCES/POSSESSION(SEASON)'] = \
            team_boxscores['TEAM_OREB_CHANCES(SEASON)']/team_boxscores['TEAM_POSSESSIONS(SEASON)']

        team_boxscores = feature_creation.expanding_weighted_mean(
            df=team_boxscores,
            group_col_names=['SEASON', 'OPP_TEAM'],
            col_name='TEAM_OREB_CHANCES/POSSESSION(SEASON)',
            new_col_name='AVG_OREB_CHANCES/POSSESSION(SEASON)_OPP_TEAM_P.A.',
            weight_col_name='OPP_TEAM_POSSESSIONS')

        # team oreb chances/possession
        team_boxscores['TEAM_OREB_CHANCES/POSSESSION_HAT'] = \
            2*team_boxscores['AVG_TEAM_OREB_CHANCES/POSSESSION'] - \
                team_boxscores['AVG_OREB_CHANCES/POSSESSION_ALLOWED(SEASON)_TEAM_P.A.']

        # opp team oreb chances/possession allowed
        team_boxscores['OPP_TEAM_OREB_CHANCES/POSSESSION_ALLOWED_HAT'] = \
            2*team_boxscores['AVG_OREB_CHANCES/POSSESSION_OPP_TEAM_ALLOWED'] - \
                team_boxscores['AVG_OREB_CHANCES/POSSESSION(SEASON)_OPP_TEAM_P.A.']

        boxscores = boxscores.merge(team_boxscores,
                                    on=['SEASON', 'DATE', 'TEAM', 'OPP_TEAM'],
                                    how='left')

        boxscores['OREB_CHANCES'] = np.nan
        boxscores.loc[boxscores['OREB'] > 0, 'OREB_CHANCES'] = (
            boxscores.loc[boxscores['OREB'] > 0, 'OREB'] /
            boxscores.loc[boxscores['OREB'] > 0,
                          'OREB_PCT']).apply(lambda x: round(x))
        boxscores.loc[boxscores['OREB'] == 0, 'OREB_CHANCES'] = \
            boxscores.loc[boxscores['OREB'] == 0, 'TEAM_OREB_CHANCES/POSSESSION']*boxscores.loc[boxscores['OREB'] == 0, 'POSS']

        boxscores['TEAM_DREB_CHANCES/POSSESSION'] = boxscores[
            'TEAM_DREB_CHANCES'] / boxscores['TEAM_POSSESSIONS']

        boxscores['DREB_CHANCES'] = np.nan
        boxscores.loc[boxscores['DREB'] > 0, 'DREB_CHANCES'] = (
            boxscores.loc[boxscores['DREB'] > 0, 'DREB'] /
            boxscores.loc[boxscores['DREB'] > 0,
                          'DREB_PCT']).apply(lambda x: round(x))
        boxscores.loc[boxscores['DREB'] == 0, 'DREB_CHANCES'] = \
            boxscores.loc[boxscores['DREB'] == 0, 'TEAM_DREB_CHANCES/POSSESSION']*boxscores.loc[boxscores['DREB'] == 0, 'POSS']

        temp = boxscores.groupby(
            ['SEASON', 'DATE', 'TEAM', 'OPP_TEAM']).apply(lambda x: pd.Series(
                {
                    'IMPLIED_TEAM_OREB_CHANCES': x['OREB_CHANCES'].sum() / 5,
                    'IMPLIED_TEAM_DREB_CHANCES': x['DREB_CHANCES'].sum() / 5
                })).reset_index()
        boxscores = boxscores.merge(temp,
                                    on=['SEASON', 'DATE', 'TEAM', 'OPP_TEAM'],
                                    how='left')

        # average player oreb/chance
        boxscores['OREB_CHANCES'] = boxscores['OREB_CHANCES'] * (
            boxscores['TEAM_OREB_CHANCES'] /
            boxscores['IMPLIED_TEAM_OREB_CHANCES'])
        boxscores[
            'OREB/OREB_CHANCE'] = boxscores['OREB'] / boxscores['OREB_CHANCES']

        boxscores = feature_creation.expanding_weighted_mean(
            df=boxscores,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='OREB/OREB_CHANCE',
            new_col_name='AVG_OREB/OREB_CHANCE',
            weight_col_name='OREB_CHANCES')

        # average player dreb/chance
        boxscores['DREB_CHANCES'] = boxscores['DREB_CHANCES'] * (
            boxscores['TEAM_DREB_CHANCES'] /
            boxscores['IMPLIED_TEAM_DREB_CHANCES'])
        boxscores[
            'DREB/DREB_CHANCE'] = boxscores['DREB'] / boxscores['DREB_CHANCES']

        boxscores = feature_creation.expanding_weighted_mean(
            df=boxscores,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='DREB/DREB_CHANCE',
            new_col_name='AVG_DREB/DREB_CHANCE',
            weight_col_name='DREB_CHANCES')

        # average oreb/oreb chance that opp team allowed
        team_game_boxscores = boxscores.groupby(
            ['SEASON', 'DATE', 'TEAM', 'OPP_TEAM']).apply(lambda x: pd.Series(
                {
                    'TEAM_OREB': x['OREB'].sum(),
                    'TEAM_OREB_CHANCES': x['OREB_CHANCES'].sum() / 5,
                    'TEAM_DREB': x['DREB'].sum(),
                    'TEAM_DREB_CHANCES': x['DREB_CHANCES'].sum() / 5
                })).reset_index()

        opp_team_game_boxscores = team_game_boxscores.drop(columns='OPP_TEAM')
        opp_team_game_boxscores = opp_team_game_boxscores.rename(
            columns={
                'TEAM': 'OPP_TEAM',
                'TEAM_OREB': 'OPP_TEAM_OREB',
                'TEAM_OREB_CHANCES': 'OPP_TEAM_OREB_CHANCES',
                'TEAM_DREB': 'OPP_TEAM_DREB',
                'TEAM_DREB_CHANCES': 'OPP_TEAM_DREB_CHANCES'
            })
        team_game_boxscores = team_game_boxscores.merge(
            opp_team_game_boxscores,
            on=['SEASON', 'DATE', 'OPP_TEAM'],
            how='left')

        team_game_boxscores['TEAM_OREB/OREB_CHANCE'] = \
            team_game_boxscores['TEAM_OREB']/team_game_boxscores['TEAM_OREB_CHANCES']

        team_game_boxscores = feature_creation.expanding_weighted_mean(
            df=team_game_boxscores,
            group_col_names=['SEASON', 'OPP_TEAM'],
            col_name='TEAM_OREB/OREB_CHANCE',
            new_col_name='AVG_TEAM_OREB/OREB_CHANCE_OPP_ALLOWED',
            weight_col_name='TEAM_OREB_CHANCES')

        # average dreb/dreb chance that opp team allowed
        team_game_boxscores['TEAM_DREB/DREB_CHANCE'] = \
            team_game_boxscores['TEAM_DREB']/team_game_boxscores['TEAM_DREB_CHANCES']

        team_game_boxscores = feature_creation.expanding_weighted_mean(
            df=team_game_boxscores,
            group_col_names=['SEASON', 'OPP_TEAM'],
            col_name='TEAM_DREB/DREB_CHANCE',
            new_col_name='AVG_TEAM_DREB/DREB_CHANCE_OPP_ALLOWED',
            weight_col_name='TEAM_DREB_CHANCES')

        boxscores = boxscores.merge(team_game_boxscores,
                                    on=['SEASON', 'DATE', 'TEAM', 'OPP_TEAM'],
                                    how='left')

        # average oreb/oreb chance allowed that player played against
        boxscores['OPP_TEAM_OREB/OREB_CHANCE_ALLOWED(SEASON)'] = \
            boxscores['OPP_TEAM_OREB_ALLOWED(SEASON)'] / boxscores['OPP_TEAM_OREB_CHANCES_ALLOWED(SEASON)']

        boxscores = feature_creation.expanding_weighted_mean(
            df=boxscores,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='OPP_TEAM_OREB/OREB_CHANCE_ALLOWED(SEASON)',
            new_col_name='AVG_TEAM_OREB/OREB_CHANCE(SEASON)_ALLOWED_PLAYER_P.A',
            weight_col_name='OREB_CHANCES')

        # average dreb/dreb chance allowed that player played against
        boxscores['OPP_TEAM_DREB/DREB_CHANCE_ALLOWED(SEASON)'] = \
            boxscores['OPP_TEAM_DREB_ALLOWED(SEASON)'] / boxscores['OPP_TEAM_DREB_CHANCES_ALLOWED(SEASON)']

        boxscores = feature_creation.expanding_weighted_mean(
            df=boxscores,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='OPP_TEAM_DREB/DREB_CHANCE_ALLOWED(SEASON)',
            new_col_name='AVG_TEAM_DREB/DREB_CHANCE(SEASON)_ALLOWED_PLAYER_P.A',
            weight_col_name='DREB_CHANCES')

        # oreb/oreb chance defense
        boxscores['OREB/CH_DEF'] = \
            boxscores['AVG_TEAM_OREB/OREB_CHANCE_OPP_ALLOWED'] / \
                boxscores['AVG_TEAM_OREB/OREB_CHANCE(SEASON)_ALLOWED_PLAYER_P.A']

        # dreb/dreb chance defense
        boxscores['DREB/CH_DEF'] = \
            boxscores['AVG_TEAM_DREB/DREB_CHANCE_OPP_ALLOWED'] / \
                boxscores['AVG_TEAM_DREB/DREB_CHANCE(SEASON)_ALLOWED_PLAYER_P.A']

        boxscores = boxscores.loc[(boxscores['DATE'] >= start_date)
                                  & (boxscores['DATE'] <= end_date)]

        return boxscores
Exemplo n.º 10
0
 def __init__(self, test_data):
     """Keep the test frame and a snapshot of its column labels.

     Args:
         test_data: object exposing a ``columns`` attribute (a DataFrame
             in practice); stored as ``self.test_data``.
     """
     self.test_data = test_data
     # Plain-list copy of the column labels as they are at construction.
     self.original_columns = list(test_data.columns)
     self.feature_creation = FeatureCreation()
Exemplo n.º 11
0
    def generate_regressors(self, boxscores, start_date, end_date):
        """Add scoring-efficiency regressors and return the rows in a window.

        The frame is first widened to every season that has at least one
        row inside ``[start_date, end_date]`` so that the expanding
        statistics see each of those seasons in full; the result is cut
        back to the window just before returning.

        Columns added include ``AVG_ATTEMPTS/POSSESSION``,
        ``AVG_POINTS/ATTEMPT``, per-play-type ``*_PPP`` / ``*_POSS_PCT``
        (player) and ``*_PPP_ALLOWED`` / ``*_POSS_PCT_ALLOWED`` (opponent)
        columns, and ``POINTS/ATTEMPT_DEF_ADJ``.

        Args:
            boxscores: DataFrame; this method reads the columns ``DATE``,
                ``SEASON``, ``TEAM``, ``OPP_TEAM``, ``PLAYERID``,
                ``TOTAL_ATTEMPTS``, ``POSS`` and ``PTS``. The caller's
                frame is not modified.
            start_date: inclusive lower bound compared against ``DATE``.
            end_date: inclusive upper bound compared against ``DATE``.

        Returns:
            The augmented DataFrame restricted to rows whose ``DATE`` lies
            in ``[start_date, end_date]``.
        """
        feature_creation = FeatureCreation()
        helpers = Helpers()

        def _ratio_or_one(numerator, denominator):
            """Element-wise ratio; 1 where an operand is NaN or the divisor is 0."""
            usable = (numerator.notna() & denominator.notna()
                      & (denominator != 0))
            # Blank out unusable divisors first so the division itself never
            # sees a zero, then substitute the neutral factor 1.
            return (numerator / denominator.where(usable)).where(usable, 1.0)

        in_window = (boxscores['DATE'] >= start_date) & (
            boxscores['DATE'] <= end_date)
        relevant_seasons = boxscores.loc[in_window, 'SEASON'].unique()
        # Copy: the column assignments below must not write into a slice of
        # the caller's frame.
        boxscores = boxscores.loc[
            boxscores['SEASON'].isin(relevant_seasons)].copy()

        boxscores['ATTEMPTS'] = boxscores['TOTAL_ATTEMPTS']
        boxscores['ATTEMPTS/POSSESSION'] = (
            boxscores['ATTEMPTS'] / boxscores['POSS'])

        # average attempts per possession
        boxscores = feature_creation.expanding_weighted_mean(
            df=boxscores,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='ATTEMPTS/POSSESSION',
            new_col_name='AVG_ATTEMPTS/POSSESSION',
            weight_col_name='POSS')

        boxscores = feature_creation.expanding_sum(
            df=boxscores,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='POSS',
            new_col_name='SUM_POSS')

        # Rows whose expanding possession total is zero fall back to the
        # ratio computed over every row of the working frame.
        boxscores.loc[boxscores['SUM_POSS'] == 0,
                      'AVG_ATTEMPTS/POSSESSION'] = (
                          boxscores['ATTEMPTS'].sum() /
                          boxscores['POSS'].sum())

        boxscores['POINTS/ATTEMPT'] = boxscores['PTS'] / boxscores['ATTEMPTS']

        # average points per attempt
        boxscores = feature_creation.expanding_weighted_mean(
            df=boxscores,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='POINTS/ATTEMPT',
            new_col_name='AVG_POINTS/ATTEMPT',
            weight_col_name='ATTEMPTS')

        boxscores = feature_creation.expanding_sum(
            df=boxscores,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='ATTEMPTS',
            new_col_name='SUM_ATTEMPTS')

        # Same fallback as above, for rows with a zero expanding attempt total.
        boxscores.loc[boxscores['SUM_ATTEMPTS'] == 0,
                      'AVG_POINTS/ATTEMPT'] = (
                          boxscores['PTS'].sum() /
                          boxscores['ATTEMPTS'].sum())

        # adjustment for defense (points per attempt)
        for play_type in PLAY_TYPES:
            ppp_col = '{}_PPP'.format(play_type)
            poss_pct_col = '{}_POSS_PCT'.format(play_type)
            ppp_allowed_col = '{}_PPP_ALLOWED'.format(play_type)
            poss_pct_allowed_col = '{}_POSS_PCT_ALLOWED'.format(play_type)

            # Collect one frame per season and concatenate once;
            # DataFrame.append no longer exists in pandas >= 2.0.
            player_frames = []
            team_frames = []
            for season in relevant_seasons:
                player_data = helpers.get_play_type_breakdown(
                    play_type, season, 'player')

                player_data['SEASON'] = season
                player_data['PLAYER_ID'] = player_data['PLAYER_ID'].apply(str)
                player_data = player_data.rename(
                    columns={
                        'TEAM_ABBREVIATION': 'TEAM',
                        'PLAYER_ID': 'PLAYERID',
                        'PPP': ppp_col,
                        'POSS_PCT': poss_pct_col
                    })
                player_frames.append(player_data[[
                    'SEASON', 'PLAYERID', 'TEAM', ppp_col, poss_pct_col
                ]])

                team_data = helpers.get_play_type_breakdown(
                    play_type, season, 'team')

                team_data['SEASON'] = season
                team_data = team_data.rename(
                    columns={
                        'TEAM_ABBREVIATION': 'OPP_TEAM',
                        'PPP': ppp_allowed_col,
                        'POSS_PCT': poss_pct_allowed_col
                    })
                team_frames.append(team_data[[
                    'SEASON', 'OPP_TEAM', ppp_allowed_col,
                    poss_pct_allowed_col
                ]])

            # pd.concat rejects an empty list; an empty frame reproduces what
            # the former append-based accumulator held in that case.
            player_play_type_data = (pd.concat(player_frames)
                                     if player_frames else pd.DataFrame())
            team_play_type_data = (pd.concat(team_frames)
                                   if team_frames else pd.DataFrame())

            boxscores = boxscores.merge(player_play_type_data,
                                        on=['SEASON', 'PLAYERID', 'TEAM'],
                                        how='left')
            boxscores = boxscores.merge(team_play_type_data,
                                        on=['SEASON', 'OPP_TEAM'],
                                        how='left')

        poss_pct_cols = ['{}_POSS_PCT'.format(i) for i in PLAY_TYPES]
        poss_pct_allowed_cols = [
            '{}_POSS_PCT_ALLOWED'.format(i) for i in PLAY_TYPES
        ]
        # Nudge exact zeros to a small positive share, then rescale each row
        # so the shares across play types sum to 1.
        boxscores[poss_pct_cols] = boxscores[poss_pct_cols].replace([0], 0.001)
        boxscores[poss_pct_allowed_cols] = boxscores[
            poss_pct_allowed_cols].replace([0], 0.001)
        boxscores['TOTAL_POSS_PCT'] = boxscores[poss_pct_cols].sum(axis=1)
        boxscores['TOTAL_POSS_PCT_ALLOWED'] = boxscores[
            poss_pct_allowed_cols].sum(axis=1)
        boxscores[poss_pct_cols] = boxscores[poss_pct_cols].div(
            boxscores['TOTAL_POSS_PCT'], axis=0)
        boxscores[poss_pct_allowed_cols] = boxscores[
            poss_pct_allowed_cols].div(boxscores['TOTAL_POSS_PCT_ALLOWED'],
                                       axis=0)

        # Accumulators filled per play type below. TOTAL_POSS_PCT is reset
        # here on purpose: it was only a normalizer above.
        boxscores['NET_POINTS/ATTEMPT'] = 0
        boxscores['TOTAL_POSS_PCT'] = 0
        boxscores['IMPLIED_NET_POINTS/ATTEMPT'] = 0
        boxscores['TOTAL_IMPLIED_POSS_PCT'] = 0
        for play_type in PLAY_TYPES:
            ppp_col = '{}_PPP'.format(play_type)
            poss_pct_col = '{}_POSS_PCT'.format(play_type)
            ppp_allowed_col = '{}_PPP_ALLOWED'.format(play_type)
            poss_pct_allowed_col = '{}_POSS_PCT_ALLOWED'.format(play_type)
            avg_ppp_allowed_col = (
                'AVG_{}_PPP_ALLOWED_PLAYED_AGAINST'.format(play_type))
            avg_poss_pct_allowed_col = (
                'AVG_{}_POSS_PCT_ALLOWED_PLAYED_AGAINST'.format(play_type))

            boxscores = feature_creation.expanding_weighted_mean(
                df=boxscores,
                group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
                col_name=ppp_allowed_col,
                weight_col_name='ATTEMPTS',
                new_col_name=avg_ppp_allowed_col)
            boxscores = feature_creation.expanding_weighted_mean(
                df=boxscores,
                group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
                col_name=poss_pct_allowed_col,
                weight_col_name='POSS',
                new_col_name=avg_poss_pct_allowed_col)

            # Opponent's allowed value relative to the average allowed value
            # in the AVG_* column; 1 (no adjustment) when not computable.
            boxscores['PPP_ADJ'] = _ratio_or_one(
                boxscores[ppp_allowed_col], boxscores[avg_ppp_allowed_col])
            boxscores['POSS_PCT_ADJ'] = _ratio_or_one(
                boxscores[poss_pct_allowed_col],
                boxscores[avg_poss_pct_allowed_col])

            player_ppp = boxscores[ppp_col].fillna(0)
            player_share = boxscores[poss_pct_col].fillna(0)

            boxscores['NET_POINTS/ATTEMPT'] += player_ppp * player_share
            boxscores['TOTAL_POSS_PCT'] += player_share

            boxscores['IMPLIED_NET_POINTS/ATTEMPT'] += (
                (player_ppp * boxscores['PPP_ADJ']) *
                (player_share * boxscores['POSS_PCT_ADJ']))
            boxscores['TOTAL_IMPLIED_POSS_PCT'] += (
                player_share * boxscores['POSS_PCT_ADJ'])

        # Difference between the opponent-adjusted and unadjusted
        # share-weighted rates; 0 wherever either share total is not
        # positive (NaN totals compare False and therefore also give 0).
        has_both_totals = (boxscores['TOTAL_POSS_PCT'] > 0) & (
            boxscores['TOTAL_IMPLIED_POSS_PCT'] > 0)
        implied_rate = (boxscores['IMPLIED_NET_POINTS/ATTEMPT'] /
                        boxscores['TOTAL_IMPLIED_POSS_PCT'])
        actual_rate = (boxscores['NET_POINTS/ATTEMPT'] /
                       boxscores['TOTAL_POSS_PCT'])
        boxscores['POINTS/ATTEMPT_DEF_ADJ'] = (
            implied_rate - actual_rate).where(has_both_totals, 0.0)

        boxscores = boxscores.loc[(boxscores['DATE'] >= start_date)
                                  & (boxscores['DATE'] <= end_date)]

        return boxscores
Exemplo n.º 12
0
 def __init__(self):
     """Attach a fresh FeatureCreation instance as ``self.feature_creation``."""
     self.feature_creation = FeatureCreation()
Exemplo n.º 13
0
class OwnershipModel(object):
    """Wrapper around an XGBoost regressor that predicts ``OWNERSHIP``.

    Intended call order: ``create_features`` -> ``train_model`` ->
    ``predict``. ``train_model`` and ``predict`` raise when called before
    the preceding step has completed.
    """

    def __init__(self, train_data, test_data, site):
        """Store the raw frames and build the model and helper objects.

        Args:
            train_data: DataFrame of training rows.
            test_data: DataFrame of rows to predict for.
            site: value compared against the ``SITE`` column of the salary
                and contest data, and passed to ``FPCalculator``.
        """
        self.feature_creation = FeatureCreation()
        self.clean_data = CleanData()

        self.train_data = train_data
        self.test_data = test_data
        self.site = site
        self.model = XGBoostRegressionModel(OWNERSHIP_MODEL_PARAMS)

        # Feature column names, in the order they are handed to the model.
        self.regressors = []
        self.regressand = 'OWNERSHIP'

        self.created_features = False
        self.trained_model = False

    def create_features(self, salary_data, contest_data, ownership_data,
                        odds_data):
        """Build every regressor column and re-split into train/test frames.

        Train and test rows are concatenated so the expanding, lagged and
        rolling features are computed over one frame, then separated again
        by their ``(GAMEID, PLAYERID)`` keys. On return ``self.train_data``
        and ``self.test_data`` hold the processed rows and
        ``self.regressors`` lists the feature columns.

        None of the four argument frames is modified.

        Args:
            salary_data: DataFrame filtered on ``SITE`` and inner-merged
                on ``DATE``/``NAME``.
            contest_data: DataFrame inner-merged with the ownership data on
                ``SLATEID``/``CONTESTNAME``; its ``SITE``, ``DATE``,
                ``SLATEID`` and ``TEAMS`` columns describe the slates.
            ownership_data: DataFrame with at least ``PLAYERNAME``,
                ``SLATEID``, ``CONTESTNAME`` and ``OWNERSHIP``.
            odds_data: DataFrame with ``TOTAL``, ``POINTSPREAD`` and
                ``PERIOD``; left-merged on ``DATE``/``TEAM``.
        """
        data = pd.concat([self.train_data, self.test_data])

        # Remember which keys belong to which split before merging.
        train_index = self.train_data.set_index(['GAMEID', 'PLAYERID']).index
        test_index = self.test_data.set_index(['GAMEID', 'PLAYERID']).index

        salary_data = salary_data.loc[salary_data['SITE'] == self.site]
        data = data.merge(salary_data, on=['DATE', 'NAME'], how='inner')

        # player stat features
        fp_calculator = FPCalculator(self.site)

        data['REB'] = data['DREB'] + data['OREB']
        data['DKFP'] = data.apply(
            lambda x: fp_calculator.calculate_fantasy_points(
                x['SEASON'], x['PTS'], x['REB'], x['AST'], x['TOV'], x['BLK'],
                x['STL'], x['FG3M']),
            axis=1)

        data = self.feature_creation.expanding_mean(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP',
            new_col_name='AVG_DKFP')
        self.regressors.append('AVG_DKFP')

        data['VALUE'] = data['AVG_DKFP'] / data['SALARY']
        self.regressors.append('VALUE')

        data = self.feature_creation.lag(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP',
            new_col_name='L1_DKFP',
            n_shift=1)
        self.regressors.append('L1_DKFP')

        data = self.feature_creation.rolling_mean(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP',
            new_col_name='MA5_DKFP',
            n_rolling=5)
        self.regressors.append('MA5_DKFP')

        data = self.feature_creation.lag(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='SALARY',
            new_col_name='L1_SALARY',
            n_shift=1)
        data['SALARY_CHANGE'] = data['SALARY'] - data['L1_SALARY']
        self.regressors.append('SALARY')
        self.regressors.append('SALARY_CHANGE')

        data = self.feature_creation.expanding_standard_deviation(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP',
            new_col_name='STD_DKFP',
            min_periods=5)
        self.regressors.append('STD_DKFP')

        self.regressors.append('START')

        # DFS_POSITION is an underscore-joined string; non-strings
        # (missing values) propagate as NaN.
        data['DFS_POSITIONS'] = data['DFS_POSITION'].apply(
            lambda x: x.split('_') if isinstance(x, str) else np.nan)
        data['NUM_POSITIONS'] = data['DFS_POSITIONS'].apply(
            lambda x: len(x) if isinstance(x, list) else np.nan)
        self.regressors.append('NUM_POSITIONS')

        for position in ['SG', 'PG', 'C']:
            data[position] = 0
            # na=False: a missing DFS_POSITION would otherwise produce a NaN
            # in the mask, which .loc refuses to index with.
            data.loc[data['DFS_POSITION'].str.contains(position, na=False),
                     position] = 1
            self.regressors.append(position)

        # historical ownership of player
        # assign() yields a new frame, leaving the caller's ownership_data
        # untouched.
        ownership_data = ownership_data.assign(
            NAME=ownership_data['PLAYERNAME'].apply(
                lambda x: OWNERSHIP_NAME_TO_NBA_NAME.get(x, x)))
        ownership_data = ownership_data.merge(contest_data,
                                              on=['SLATEID', 'CONTESTNAME'],
                                              how='inner')
        # Entry-weighted mean ownership per player and slate.
        ownership_data = ownership_data.groupby(
            ['DATE', 'SLATEID', 'GAMECOUNT',
             'NAME']).apply(lambda x: pd.Series({
                 'OWNERSHIP': (x['OWNERSHIP'] * x['TOTALENTRIES']).sum() / x[
                     'TOTALENTRIES'].sum()
             })).reset_index()

        # Plain mean across a player's slates on the same date.
        aggregated_ownership = ownership_data.groupby(
            ['DATE', 'NAME']).apply(lambda x: pd.Series(
                {'TOTAL_OWNERSHIP': x['OWNERSHIP'].mean()})).reset_index()
        data = data.merge(aggregated_ownership,
                          on=['DATE', 'NAME'],
                          how='inner')

        data = self.feature_creation.expanding_mean(
            df=data,
            group_col_names=['SEASON', 'NAME'],
            col_name='TOTAL_OWNERSHIP',
            new_col_name='AVG_OWNERSHIP')
        self.regressors.append('AVG_OWNERSHIP')

        data = self.feature_creation.lag(df=data,
                                         group_col_names=['SEASON', 'NAME'],
                                         col_name='TOTAL_OWNERSHIP',
                                         new_col_name='L1_OWNERSHIP',
                                         n_shift=1)
        self.regressors.append('L1_OWNERSHIP')

        data = self.feature_creation.rolling_mean(
            df=data,
            group_col_names=['SEASON', 'NAME'],
            col_name='TOTAL_OWNERSHIP',
            new_col_name='MA5_OWNERSHIP',
            n_rolling=5)
        self.regressors.append('MA5_OWNERSHIP')

        # defense
        # Hyphenated positions are reduced to the part before the hyphen.
        data['NORM_POS'] = data['POSITION'].apply(
            lambda x: x if '-' not in x else x.split('-')[0])

        temp = data.dropna(subset=['DKFP', 'AVG_DKFP'])
        grouped_defensive_boxscores = temp.groupby([
            'SEASON', 'DATE', 'NORM_POS', 'OPP_TEAM'
        ]).apply(lambda x: pd.Series({
            'TEAM_DKFP_ALLOWED_P': x['DKFP'].sum(),
            'TEAM_DKFP_AVG_P': x['AVG_DKFP'].sum()
        })).reset_index()

        # Fantasy points scored against the opponent minus the summed
        # AVG_DKFP of the same players, per position and game date.
        grouped_defensive_boxscores['DvP'] = (
            grouped_defensive_boxscores['TEAM_DKFP_ALLOWED_P'] -
            grouped_defensive_boxscores['TEAM_DKFP_AVG_P'])

        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores,
            group_col_names=['SEASON', 'OPP_TEAM', 'NORM_POS'],
            col_name='DvP',
            new_col_name='AVG_DvP',
            order_idx_name='DATE',
            min_periods=5)
        self.regressors.append('AVG_DvP')

        data = data.merge(grouped_defensive_boxscores,
                          on=['SEASON', 'DATE', 'OPP_TEAM', 'NORM_POS'],
                          how='left')

        # vegas lines
        # assign() builds a cleaned copy instead of overwriting the columns
        # of the caller's odds_data.
        odds_data = odds_data.assign(
            TOTAL=odds_data['TOTAL'].replace(['PK', '-'], np.nan),
            POINTSPREAD=odds_data['POINTSPREAD'].replace(['PK', '-'], 0))
        full_game_odds = odds_data.loc[odds_data['PERIOD'] == 'Full Game']
        data = data.merge(full_game_odds, on=['DATE', 'TEAM'], how='left')
        self.regressors.append('TOTAL')
        self.regressors.append('POINTSPREAD')

        # slate info
        self.regressors.append('GAMECOUNT')

        slates = contest_data.loc[
            contest_data['SITE'] == self.site,
            ['DATE', 'SLATEID', 'TEAMS']].drop_duplicates()
        # TEAMS is an underscore-joined list: one row per (slate, team).
        slates['TEAMS'] = slates['TEAMS'].apply(lambda x: x.split('_'))
        slates = slates.explode('TEAMS').rename(columns={"TEAMS": "TEAM"})
        slates['TEAM'] = slates['TEAM'].apply(
            lambda x: DB_TEAM_TO_NBA_TEAM.get(x, x))

        slate_players = data[[
            'DATE', 'TEAM', 'NAME', 'DFS_POSITIONS', 'SALARY', 'VALUE'
        ]].merge(slates, on=['DATE', 'TEAM'], how='inner')
        slate_players['SALARY_BIN'] = pd.cut(slate_players['SALARY'],
                                             bins=list(range(
                                                 3000, 15000, 1000)),
                                             duplicates='drop',
                                             include_lowest=True)
        # One row per (player, eligible position).
        slate_players = slate_players.explode('DFS_POSITIONS').rename(
            columns={'DFS_POSITIONS': 'SINGLE_DFS_POSITION'})

        # Players at or below this VALUE are excluded from the counts.
        MIN_VALUE = 0.002

        all_temp = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION']).apply(lambda x: pd.Series(
                {'L1P_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()})
                                                      ).reset_index().dropna()
        slate_players = slate_players.merge(
            all_temp, on=['SLATEID', 'SINGLE_DFS_POSITION'], how='left')

        # NOTE(review): unlike the other counts, this one is not filtered by
        # VALUE > MIN_VALUE - confirm that this is intended.
        sb_temp = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION',
             'SALARY_BIN']).apply(lambda x: pd.Series(
                 {'L1P_SB_COUNT': x['NAME'].count()})).reset_index().dropna()
        slate_players = slate_players.merge(
            sb_temp,
            on=['SLATEID', 'SINGLE_DFS_POSITION', 'SALARY_BIN'],
            how='left')

        # Collapse the five positions into guard / forward / center.
        L1_TO_L2 = {'PG': 'G', 'SG': 'G', 'SF': 'F', 'PF': 'F', 'C': 'C'}
        slate_players['LEVEL2_DFS_POSITION'] = slate_players[
            'SINGLE_DFS_POSITION'].apply(lambda x: L1_TO_L2[x]
                                         if isinstance(x, str) else np.nan)

        all_temp = slate_players.groupby(
            ['SLATEID', 'LEVEL2_DFS_POSITION']).apply(lambda x: pd.Series(
                {'L2P_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()})
                                                      ).reset_index().dropna()
        slate_players = slate_players.merge(
            all_temp, on=['SLATEID', 'LEVEL2_DFS_POSITION'], how='left')

        all_temp = slate_players.groupby(
            ['SLATEID']).apply(lambda x: pd.Series(
                {'L3P_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()})
                               ).reset_index().dropna()
        slate_players = slate_players.merge(all_temp,
                                            on=['SLATEID'],
                                            how='left')

        sb_temp = slate_players.groupby([
            'SLATEID', 'SALARY_BIN'
        ]).apply(lambda x: pd.Series(
            {'L3P_SB_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()})
                 ).reset_index().dropna()
        slate_players = slate_players.merge(sb_temp,
                                            on=['SLATEID', 'SALARY_BIN'],
                                            how='left')

        # Lower edge of the salary bin, used as the grouping key for ranks.
        slate_players['SALARY_FLOOR'] = slate_players['SALARY_BIN'].apply(
            lambda x: x.left)

        # Rank 1 = highest VALUE within the group.
        slate_players['L1P_RANK'] = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION'])['VALUE'].rank(method='min',
                                                              ascending=False)

        slate_players['L1P_SB_RANK'] = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION',
             'SALARY_FLOOR'])['VALUE'].rank(method='min', ascending=False)

        slate_players['L3P_RANK'] = slate_players.groupby(
            ['SLATEID'])['VALUE'].rank(method='min', ascending=False)

        slate_players['L3P_SB_RANK'] = slate_players.groupby(
            ['SLATEID', 'SALARY_FLOOR'])['VALUE'].rank(method='min',
                                                       ascending=False)

        # Average over a player's eligible positions to get back to one row
        # per (date, slate, player).
        slate_data = slate_players.groupby([
            'DATE', 'SLATEID', 'NAME'
        ]).apply(lambda x: pd.Series({
            'L1P_COUNT': x['L1P_COUNT'].mean(),
            'L1P_RANK': x['L1P_RANK'].mean(),
            'L1P_SB_COUNT': x['L1P_SB_COUNT'].mean(),
            'L1P_SB_RANK': x['L1P_SB_RANK'].mean(),
            'L2P_COUNT': x['L2P_COUNT'].mean(),
            'L3P_COUNT': x['L3P_COUNT'].mean(),
            'L3P_RANK': x['L3P_RANK'].mean(),
            'L3P_SB_COUNT': x['L3P_SB_COUNT'].mean(),
            'L3P_SB_RANK': x['L3P_SB_RANK'].mean()
        })).reset_index()

        self.regressors.extend([
            'L1P_COUNT',
            'L1P_RANK',
            'L1P_SB_COUNT',
            'L1P_SB_RANK',
            'L2P_COUNT',
            'L3P_COUNT',
            'L3P_RANK',
            'L3P_SB_COUNT',
            'L3P_SB_RANK',
        ])

        data['GP'] = 1
        data = self.feature_creation.expanding_sum(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='GP',
            new_col_name='COUNT_GP')
        self.regressors.append('COUNT_GP')

        data = self.preprocess(data, slate_data, ownership_data)
        data = data.set_index(['GAMEID', 'PLAYERID'])

        # Select by membership rather than through a set intersection: the
        # same rows are kept, but in the frame's own order, so the result
        # no longer depends on hash ordering.
        self.train_data = data.loc[data.index.isin(train_index)].reset_index()
        self.test_data = data.loc[data.index.isin(test_index)].reset_index()

        self.created_features = True

    def preprocess(self, data, slate_data, ownership_data):
        """Join targets and slate features onto ``data`` and fill gaps.

        Args:
            data: player-level feature frame built by ``create_features``.
            slate_data: per ``DATE``/``SLATEID``/``NAME`` slate features.
            ownership_data: per-slate ownership rows; joined to
                ``slate_data`` on ``DATE``/``SLATEID``/``NAME`` and to
                ``data`` on ``DATE``/``NAME``.

        Returns:
            The merged frame with missing feature values filled and rows
            lacking ``AVG_OWNERSHIP`` removed.
        """
        ownership_data = ownership_data.merge(slate_data,
                                              on=['DATE', 'SLATEID', 'NAME'],
                                              how='inner')
        data = ownership_data.merge(data, on=['DATE', 'NAME'], how='inner')

        # Missing recent-form values fall back to the expanding average.
        data['L1_DKFP'] = data['L1_DKFP'].fillna(data['AVG_DKFP'])
        data['MA5_DKFP'] = data['MA5_DKFP'].fillna(data['AVG_DKFP'])

        data['SALARY_CHANGE'] = data['SALARY_CHANGE'].fillna(0)

        # Missing deviation is approximated as a fixed fraction of the mean.
        data['STD_DKFP'] = data['STD_DKFP'].fillna(DEFAULT_STD *
                                                   data['AVG_DKFP'])

        data['L1_OWNERSHIP'] = data['L1_OWNERSHIP'].fillna(
            data['AVG_OWNERSHIP'])
        data['MA5_OWNERSHIP'] = data['MA5_OWNERSHIP'].fillna(
            data['AVG_OWNERSHIP'])

        data['AVG_DvP'] = data['AVG_DvP'].fillna(0)

        data['TOTAL'] = data['TOTAL'].fillna(data['TOTAL'].mean())
        data['POINTSPREAD'] = data['POINTSPREAD'].fillna(0)

        data['L1P_SB_COUNT'] = data['L1P_SB_COUNT'].fillna(0)
        data['L3P_SB_COUNT'] = data['L3P_SB_COUNT'].fillna(0)

        # we can predict Y for a player as long as AVG_Y is not nan
        data = data.dropna(subset=['AVG_OWNERSHIP'])

        return data

    def train_model(self):
        """Fit the model on ``self.train_data``.

        Raises:
            Exception: if ``create_features`` has not been run yet.
        """
        if not self.created_features:
            raise Exception('Must create features before training model')

        X = self.train_data[self.regressors]
        y = self.train_data[self.regressand]
        self.model.fit(X, y, test_size=0.25, early_stopping_rounds=25)

        self.trained_model = True

    def predict(self):
        """Predict the regressand for ``self.test_data``.

        The predictions are also stored on ``self.test_data`` in a column
        named ``<regressand>_HAT``.

        Returns:
            A tuple ``(frame, output_column)`` where ``frame`` holds the
            ``DATE``, ``SLATEID``, ``NAME`` and prediction columns and
            ``output_column`` is the prediction column's name.

        Raises:
            Exception: if ``train_model`` has not been run yet.
        """
        if not self.trained_model:
            raise Exception('Must train model before generating predictions')

        output_column = '{}_HAT'.format(self.regressand)

        self.test_data[output_column] = self.model.predict(
            self.test_data[self.regressors])

        return self.test_data[['DATE', 'SLATEID', 'NAME',
                               output_column]], output_column