def __init__(self, train_data, test_data):
    """Initialise the model wrapper whose target column is 'SPS'.

    Args:
        train_data: data used for fitting; stored as-is.
        test_data: data to predict on; stored as-is.
    """
    # Collaborators are built in the same order as before: the feature
    # helper first, then the regression model.
    self.feature_creation = FeatureCreation()
    self.model = CatBoostRegressionModel(SPS_MODEL_PARAMS)

    # Inputs handed in by the caller.
    self.train_data, self.test_data = train_data, test_data

    # Target column name and the (initially empty) list of feature columns.
    self.regressand = 'SPS'
    self.regressors = []

    # Workflow flags; both start out unset.
    self.created_features = self.trained_model = False
def __init__(self, train_data, test_data, site):
    """Initialise the model wrapper whose target column is 'OWNERSHIP'.

    Args:
        train_data: data used for fitting; stored as-is.
        test_data: data to predict on; stored as-is.
        site: stored on the instance unchanged (not interpreted here).
    """
    # Collaborators, constructed in the original order.
    self.feature_creation = FeatureCreation()
    self.clean_data = CleanData()
    self.model = XGBoostRegressionModel(OWNERSHIP_MODEL_PARAMS)

    # Inputs handed in by the caller.
    self.train_data, self.test_data = train_data, test_data
    self.site = site

    # Target column name and the (initially empty) list of feature columns.
    self.regressand = 'OWNERSHIP'
    self.regressors = []

    # Workflow flags; both start out unset.
    self.created_features = self.trained_model = False
class CleanData(object):
    """Row filters for box-score frames plus a name-format converter."""

    def __init__(self):
        self.feature_creation = FeatureCreation()

    def drop_rows_player_inactive(self, df):
        """Return only the rows of *df* with SECONDSPLAYED strictly above zero."""
        played = df['SECONDSPLAYED'] > 0
        return df.loc[played]

    def drop_rows_player_injured(self, df):
        """Return rows where the player played or was a coach's-decision DNP.

        A row is kept when SECONDSPLAYED is non-zero, or when COMMENT is
        exactly "DNP - Coach's Decision"; every other row is dropped.
        """
        saw_court = df['SECONDSPLAYED'] != 0
        coach_dnp = df['COMMENT'] == "DNP - Coach's Decision"
        return df.loc[saw_court | coach_dnp]

    def drop_rows_player_rest(self, df, thresh=1200):
        """Drop coach's-decision DNP rows whose AVG_SP exceeds *thresh*.

        AVG_SP is produced by FeatureCreation.expanding_mean over
        SECONDSPLAYED grouped by (SEASON, PLAYERID); it is a temporary
        column and is removed again before returning.
        """
        with_avg = self.feature_creation.expanding_mean(
            df=df,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='SECONDSPLAYED',
            new_col_name='AVG_SP')
        heavy_minutes = with_avg['AVG_SP'] > thresh
        coach_dnp = with_avg['COMMENT'] == "DNP - Coach's Decision"
        kept = with_avg.loc[~(heavy_minutes & coach_dnp)]
        return kept.drop(columns=['AVG_SP'])

    def roto_name_to_nba_name(self, name):
        """Convert a comma-separated name to '<last piece> <earlier pieces>'.

        The text after the final comma (left-stripped) is moved to the front
        and the remaining pieces are space-joined behind it. The result is
        then translated through ROTO_NAME_TO_NBA_NAME when it is a key there.
        """
        pieces = name.split(',')
        leading = pieces[-1].lstrip()
        trailing = ' '.join(pieces[:-1])
        candidate = "{} {}".format(leading, trailing)
        return ROTO_NAME_TO_NBA_NAME[candidate] if candidate in ROTO_NAME_TO_NBA_NAME else candidate
def generate_regressors(self, boxscores, start_date, end_date):
    """Add assists-per-possession features and return the rows in the date window.

    Every season that has at least one game inside [start_date, end_date] is
    kept in full while the features are computed, so the expanding statistic
    can see that season's games from before the window. The result is cut
    back to the window at the end.

    Args:
        boxscores: DataFrame with at least DATE, SEASON, TEAM, PLAYERID,
            AST and POSS columns. It is not modified.
        start_date: inclusive lower bound compared against DATE.
        end_date: inclusive upper bound compared against DATE.

    Returns:
        DataFrame restricted to the date window, with the added columns
        'ASSISTS/POSSESSION' and 'AVG_ASSISTS/POSSESSION'.
    """
    feature_creation = FeatureCreation()

    in_window = (boxscores['DATE'] >= start_date) & (boxscores['DATE'] <= end_date)
    relevant_seasons = boxscores.loc[in_window, 'SEASON'].unique()

    # Explicit copy: a column is assigned on the next line, and doing that on
    # a filtered slice of the caller's frame triggers SettingWithCopyWarning.
    boxscores = boxscores.loc[boxscores['SEASON'].isin(relevant_seasons)].copy()
    boxscores['ASSISTS/POSSESSION'] = boxscores['AST'] / boxscores['POSS']

    # average player assists/possession (weighted by POSS)
    boxscores = feature_creation.expanding_weighted_mean(
        df=boxscores,
        group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
        col_name='ASSISTS/POSSESSION',
        new_col_name='AVG_ASSISTS/POSSESSION',
        weight_col_name='POSS')

    # The mask is rebuilt because the frame (and its index) has been replaced.
    in_window = (boxscores['DATE'] >= start_date) & (boxscores['DATE'] <= end_date)
    return boxscores.loc[in_window]
def __init__(self, train_data, test_data):
    """Initialise the model wrapper whose target column is 'TOPSCORE'.

    Args:
        train_data: data used for fitting; stored as-is.
        test_data: data to predict on; stored as-is.
    """
    # Collaborators, constructed in the original order.
    self.feature_creation = FeatureCreation()
    self.clean_data = CleanData()
    self.model = XGBoostRegressionModel(TOPSCORE_MODEL_PARAMS)

    # Inputs handed in by the caller.
    self.train_data, self.test_data = train_data, test_data

    # Target column and the fixed set of feature columns for this model.
    self.regressand = 'TOPSCORE'
    self.regressors = ['GAMECOUNT', 'TOTALENTRIES', 'AVERAGE_TOTAL']

    # Workflow flags; both start out unset.
    self.created_features = self.trained_model = False
class VarianceModel(object):
    """Attaches an expanding standard deviation of a column to the test data."""

    def __init__(self, test_data):
        """Store *test_data* and remember the columns it arrived with."""
        self.feature_creation = FeatureCreation()
        self.test_data = test_data
        # Snapshot of the incoming columns; predict() returns these plus the
        # newly created one.
        self.original_columns = list(test_data.columns)

    def predict(self, y):
        """Compute the 'STD_<y>' column for column *y*.

        The column is produced by FeatureCreation.expanding_standard_deviation
        grouped by (SEASON, PLAYERID, START) with min_periods=4, and the
        enriched frame replaces self.test_data.

        Returns:
            A 2-tuple: the frame limited to the original columns plus the
            new column, and the new column's name.
        """
        std_name = 'STD_{}'.format(y)
        enriched = self.feature_creation.expanding_standard_deviation(
            df=self.test_data,
            group_col_names=['SEASON', 'PLAYERID', 'START'],
            col_name=y,
            new_col_name=std_name,
            min_periods=4)
        self.test_data = enriched
        wanted = self.original_columns + [std_name]
        return enriched[wanted], std_name
class RPSModel(object):
    """Feature pipeline and model wrapper for 'RPS' (rebounds per second played).

    Workflow: create_features() -> train_model() -> predict(). The regression
    itself is delegated to CatBoostRegressionModel(RPS_MODEL_PARAMS).
    """

    # (column, fallback column) pairs used by preprocess(), applied in order.
    # Order matters: MA15_DRPS falls back on the already-filled MA2_DRPS,
    # MA18_ORPS on MA6_ORPS, and AVG_Y_STARTERS on AVG_Y_STARS.
    _COLUMN_FALLBACKS = (
        ('AVG_Y_R', 'AVG_Y'),
        ('AVG_Y_OPP_TEAM', 'AVG_Y'),
        ('L1_DRPS', 'AVG_DRPS'),
        ('EWM_DRPS', 'AVG_DRPS'),
        ('EWM_ORPS', 'AVG_ORPS'),
        ('MA2_DRPS', 'AVG_DRPS'),
        ('MA15_DRPS', 'MA2_DRPS'),
        ('MA6_ORPS', 'AVG_ORPS'),
        ('MA18_ORPS', 'MA6_ORPS'),
        ('AVG_Y_STARS', 'AVG_Y'),
        ('AVG_Y_STARTERS', 'AVG_Y_STARS'),
    )

    # (column, constant) pairs used by preprocess() for missing values.
    _CONSTANT_FILLS = (
        ('AVG_TEAM_DRPS_DIFF_ALLOWED', 0),
        ('AVG_TEAM_DRPS_DIFF_ALLOWED_R', 0),
        ('AVG_TEAM_DRPS_DIFF_ALLOWED_P', 0),
        ('AVG_TEAM_ORPS_DIFF_ALLOWED_P', 0),
        ('TOTAL', 200),
        ('TEAM_ACTIVE_AVG_DRPS_DIFF', 0),
        ('COUNT_GP', 0),
    )

    def __init__(self, train_data, test_data):
        """Store the train/test frames and set up an untrained model."""
        self.feature_creation = FeatureCreation()
        self.train_data = train_data
        self.test_data = test_data
        self.model = CatBoostRegressionModel(RPS_MODEL_PARAMS)
        self.regressors = []
        self.regressand = 'RPS'
        self.created_features = False
        self.generated_weights = False
        self.trained_model = False

    @staticmethod
    def _select_rows(data, keys):
        """Return rows of *data* whose index is in *keys*, in *data*'s own order.

        The index levels are moved back into columns before returning.
        """
        return data.loc[data.index.isin(keys)].reset_index()

    def create_features(self, odds_data, sp_threshold=60):
        """Build all regressors on train+test combined, then split them again.

        Train and test are concatenated so every feature is computed over one
        frame. Afterwards self.train_data and self.test_data are replaced by
        the enriched rows whose (GAMEID, PLAYERID) key came from the
        respective input, self.regressors lists the feature columns, and
        self.created_features is set.

        Args:
            odds_data: DataFrame with PERIOD, TOTAL, DATE and TEAM columns;
                only rows with PERIOD == 'Full Game' are used. Not modified.
            sp_threshold: rows with SECONDSPLAYED at or below this value get
                NaN in the CLEAN_DRPS / CLEAN_ORPS columns.
        """
        fc = self.feature_creation
        player_keys = ['SEASON', 'TEAM', 'PLAYERID']

        data = pd.concat([self.train_data, self.test_data])
        data['REB'] = data['DREB'] + data['OREB']
        data[self.regressand] = data['REB']/data['SECONDSPLAYED']
        data['ORPS'] = data['OREB']/data['SECONDSPLAYED']
        data['DRPS'] = data['DREB']/data['SECONDSPLAYED']

        data['CLEAN_DRPS'] = data['DRPS']
        data.loc[data['SECONDSPLAYED'] <= sp_threshold, 'CLEAN_DRPS'] = np.nan
        data['CLEAN_ORPS'] = data['ORPS']
        data.loc[data['SECONDSPLAYED'] <= sp_threshold, 'CLEAN_ORPS'] = np.nan

        # Remember which (GAMEID, PLAYERID) keys belong to which input frame.
        train_index = self.train_data.set_index(['GAMEID', 'PLAYERID']).index
        test_index = self.test_data.set_index(['GAMEID', 'PLAYERID']).index

        # season averages
        data = fc.expanding_weighted_mean(
            df=data, group_col_names=player_keys, col_name='DRPS',
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_DRPS'
        )
        data = fc.expanding_weighted_mean(
            df=data, group_col_names=player_keys, col_name='ORPS',
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_ORPS'
        )
        self.regressors.extend(['AVG_DRPS', 'AVG_ORPS'])

        data = fc.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'OPP_TEAM', 'PLAYERID'],
            col_name=self.regressand,
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_Y_OPP_TEAM'
        )
        self.regressors.append('AVG_Y_OPP_TEAM')

        # 1 game lags
        data = fc.lag(
            df=data, group_col_names=player_keys, col_name='CLEAN_DRPS',
            new_col_name='L1_DRPS', n_shift=1
        )
        self.regressors.append('L1_DRPS')

        # exponentially weighted means
        data = fc.expanding_ewm(
            df=data, group_col_names=player_keys, col_name='DRPS',
            new_col_name='EWM_DRPS', alpha=0.90
        )
        data = fc.expanding_ewm(
            df=data, group_col_names=player_keys, col_name='ORPS',
            new_col_name='EWM_ORPS', alpha=0.90
        )
        self.regressors.extend(['EWM_DRPS', 'EWM_ORPS'])

        # moving averages
        data = fc.rolling_weighted_mean(
            df=data, group_col_names=player_keys, col_name='DRPS',
            new_col_name='MA2_DRPS', weight_col_name='SECONDSPLAYED',
            n_rolling=2, min_periods=1
        )
        data = fc.rolling_weighted_mean(
            df=data, group_col_names=player_keys, col_name='DRPS',
            new_col_name='MA15_DRPS', weight_col_name='SECONDSPLAYED',
            n_rolling=15, min_periods=8
        )
        data = fc.rolling_weighted_mean(
            df=data, group_col_names=player_keys, col_name='ORPS',
            new_col_name='MA6_ORPS', weight_col_name='SECONDSPLAYED',
            n_rolling=6, min_periods=3
        )
        data = fc.rolling_weighted_mean(
            df=data, group_col_names=player_keys, col_name='ORPS',
            new_col_name='MA18_ORPS', weight_col_name='SECONDSPLAYED',
            n_rolling=18, min_periods=9
        )
        self.regressors.extend(['MA2_DRPS', 'MA15_DRPS', 'MA6_ORPS', 'MA18_ORPS'])

        # start
        data = fc.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID', 'START'],
            col_name=self.regressand,
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_Y_R'
        )
        self.regressors.append('AVG_Y_R')

        # position: keep only the part before the first '-' of POSITION
        data['NORM_POS'] = data['POSITION'].apply(lambda x: x if '-' not in x else x.split('-')[0])
        data['GUARD'] = 0
        data.loc[data['NORM_POS'] == 'Guard', 'GUARD'] = 1
        self.regressors.append('GUARD')

        # defense
        data = fc.expanding_mean(
            df=data, group_col_names=player_keys, col_name='DREB',
            new_col_name='AVG_DREB'
        )
        data = fc.expanding_mean(
            df=data, group_col_names=player_keys, col_name='OREB',
            new_col_name='AVG_OREB'
        )
        data = fc.expanding_mean(
            df=data, group_col_names=player_keys, col_name='SECONDSPLAYED',
            new_col_name='AVG_SP'
        )

        # Per game and opponent: actual DRPS allowed versus the DRPS implied
        # by the involved players' own running averages.
        temp = data.dropna(subset=['DREB', 'SECONDSPLAYED', 'AVG_DREB', 'AVG_SP'])
        grouped_defensive_boxscores = temp.groupby(['SEASON', 'DATE', 'OPP_TEAM']).apply(
            lambda x: pd.Series({
                'TEAM_DRPS_ALLOWED': x['DREB'].sum()/x['SECONDSPLAYED'].sum(),
                'TEAM_DRPS_AVG': x['AVG_DREB'].sum()/x['AVG_SP'].sum()
            })
        ).reset_index()
        grouped_defensive_boxscores['TEAM_DRPS_DIFF_ALLOWED'] = \
            grouped_defensive_boxscores['TEAM_DRPS_ALLOWED'] - grouped_defensive_boxscores['TEAM_DRPS_AVG']
        grouped_defensive_boxscores = fc.expanding_mean(
            df=grouped_defensive_boxscores, group_col_names=['SEASON', 'OPP_TEAM'],
            col_name='TEAM_DRPS_DIFF_ALLOWED',
            new_col_name='AVG_TEAM_DRPS_DIFF_ALLOWED', order_idx_name='DATE', min_periods=5
        )
        data = data.merge(grouped_defensive_boxscores, on=['SEASON', 'DATE', 'OPP_TEAM'], how='left')
        self.regressors.append('AVG_TEAM_DRPS_DIFF_ALLOWED')

        # Same comparison, additionally split by START.
        temp = data.dropna(subset=['DREB', 'SECONDSPLAYED', 'AVG_DREB', 'AVG_SP'])
        grouped_defensive_boxscores = temp.groupby(['SEASON', 'DATE', 'START', 'OPP_TEAM']).apply(
            lambda x: pd.Series({
                'TEAM_DRPS_ALLOWED_R': x['DREB'].sum()/x['SECONDSPLAYED'].sum(),
                'TEAM_DRPS_AVG_R': x['AVG_DREB'].sum()/x['AVG_SP'].sum()
            })
        ).reset_index()
        grouped_defensive_boxscores['TEAM_DRPS_DIFF_ALLOWED_R'] = \
            grouped_defensive_boxscores['TEAM_DRPS_ALLOWED_R'] - grouped_defensive_boxscores['TEAM_DRPS_AVG_R']
        grouped_defensive_boxscores = fc.expanding_mean(
            df=grouped_defensive_boxscores, group_col_names=['SEASON', 'START', 'OPP_TEAM'],
            col_name='TEAM_DRPS_DIFF_ALLOWED_R',
            new_col_name='AVG_TEAM_DRPS_DIFF_ALLOWED_R', order_idx_name='DATE', min_periods=5
        )
        data = data.merge(grouped_defensive_boxscores, on=['SEASON', 'DATE', 'START', 'OPP_TEAM'], how='left')
        self.regressors.append('AVG_TEAM_DRPS_DIFF_ALLOWED_R')

        # Same comparison split by normalised position, for DREB and OREB.
        temp = data.dropna(subset=['DREB', 'OREB', 'SECONDSPLAYED', 'AVG_DREB', 'AVG_OREB', 'AVG_SP'])
        grouped_defensive_boxscores = temp.groupby(['SEASON', 'DATE', 'NORM_POS', 'OPP_TEAM']).apply(
            lambda x: pd.Series({
                'TEAM_DRPS_ALLOWED_P': x['DREB'].sum()/x['SECONDSPLAYED'].sum(),
                'TEAM_DRPS_AVG_P': x['AVG_DREB'].sum()/x['AVG_SP'].sum(),
                'TEAM_ORPS_ALLOWED_P': x['OREB'].sum()/x['SECONDSPLAYED'].sum(),
                'TEAM_ORPS_AVG_P': x['AVG_OREB'].sum()/x['AVG_SP'].sum()
            })
        ).reset_index()
        grouped_defensive_boxscores['TEAM_DRPS_DIFF_ALLOWED_P'] = \
            grouped_defensive_boxscores['TEAM_DRPS_ALLOWED_P'] - grouped_defensive_boxscores['TEAM_DRPS_AVG_P']
        grouped_defensive_boxscores['TEAM_ORPS_DIFF_ALLOWED_P'] = \
            grouped_defensive_boxscores['TEAM_ORPS_ALLOWED_P'] - grouped_defensive_boxscores['TEAM_ORPS_AVG_P']
        grouped_defensive_boxscores = fc.expanding_mean(
            df=grouped_defensive_boxscores, group_col_names=['SEASON', 'NORM_POS', 'OPP_TEAM'],
            col_name='TEAM_DRPS_DIFF_ALLOWED_P',
            new_col_name='AVG_TEAM_DRPS_DIFF_ALLOWED_P', order_idx_name='DATE', min_periods=5
        )
        grouped_defensive_boxscores = fc.expanding_mean(
            df=grouped_defensive_boxscores, group_col_names=['SEASON', 'NORM_POS', 'OPP_TEAM'],
            col_name='TEAM_ORPS_DIFF_ALLOWED_P',
            new_col_name='AVG_TEAM_ORPS_DIFF_ALLOWED_P', order_idx_name='DATE', min_periods=5
        )
        data = data.merge(grouped_defensive_boxscores, on=['SEASON', 'DATE', 'NORM_POS', 'OPP_TEAM'], how='left')
        self.regressors.extend(['AVG_TEAM_DRPS_DIFF_ALLOWED_P', 'AVG_TEAM_ORPS_DIFF_ALLOWED_P'])

        # total
        # Explicit copy: TOTAL is rewritten below, and assigning on a filtered
        # slice of the caller's odds frame raises SettingWithCopyWarning.
        full_game_odds = odds_data.loc[odds_data['PERIOD'] == 'Full Game'].copy()
        full_game_odds['TOTAL'] = full_game_odds['TOTAL'].replace(['PK', '-'], np.nan)
        data = data.merge(full_game_odds, on=['DATE', 'TEAM'], how='left')
        self.regressors.append('TOTAL')

        # injuries
        data = fc.expanding_mean(
            df=data, group_col_names=player_keys, col_name='REB',
            new_col_name='AVG_REB'
        )
        temp = data.dropna(subset=['DREB', 'AVG_DREB', 'SECONDSPLAYED', 'AVG_SP'])
        # NOTE(review): unlike the defense aggregates above there is no
        # reset_index() here, so SEASON/DATE/TEAM stay in the index of `temp`.
        # Confirm expanding_mean and the merge below are meant to work on
        # index levels.
        temp = temp.groupby(['SEASON', 'DATE', 'TEAM']).apply(
            lambda x: pd.Series({
                'TEAM_ACTIVE_AVG_DRPS': x['AVG_DREB'].sum()/x['AVG_SP'].sum(),
                'TEAM_DRPS': x['DREB'].sum()/x['SECONDSPLAYED'].sum(),
                'TEAM_ACTIVE_AVG_RPS': x['AVG_REB'].sum()/x['AVG_SP'].sum(),
                'TEAM_RPS': x['REB'].sum()/x['SECONDSPLAYED'].sum()
            })
        )
        temp = fc.expanding_mean(
            df=temp, group_col_names=['SEASON', 'TEAM'], col_name='TEAM_DRPS',
            new_col_name='AVG_TEAM_DRPS'
        )
        temp['TEAM_ACTIVE_AVG_DRPS_DIFF'] = temp['TEAM_ACTIVE_AVG_DRPS'] - temp['AVG_TEAM_DRPS']
        data = data.merge(temp, on=['DATE', 'TEAM'], how='left')
        self.regressors.append('TEAM_ACTIVE_AVG_DRPS_DIFF')

        # regressand by lineup
        # The two key columns hold '_'-joined PLAYERID strings, so create them
        # with object dtype up front instead of relying on a float column
        # being upcast when the first string is written (silent upcasting on
        # assignment is deprecated in recent pandas).
        for key_column in ('START_LINEUP', 'STARS'):
            data[key_column] = np.nan
            data[key_column] = data[key_column].astype(object)
        data = data.set_index(['GAMEID', 'TEAM'])
        for (game_id, team), group in data.groupby(['GAMEID', 'TEAM']):
            # PLAYERIDs of rows with START == 1, sorted so the key does not
            # depend on row order.
            start_lineup = '_'.join(sorted(group.loc[group['START'] == 1, 'PLAYERID'].values))
            data.loc[(game_id, team), 'START_LINEUP'] = start_lineup

            # PLAYERIDs of rows whose AVG_DREB is at least 7.
            stars = '_'.join(sorted(group.loc[group['AVG_DREB'] >= 7, 'PLAYERID'].values))
            data.loc[(game_id, team), 'STARS'] = stars
        data = data.reset_index()

        data = fc.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'START_LINEUP', 'PLAYERID'],
            col_name=self.regressand,
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_Y_STARTERS'
        )
        data = fc.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'STARS', 'PLAYERID'],
            col_name=self.regressand,
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_Y_STARS'
        )
        self.regressors.extend(['AVG_Y_STARTERS', 'AVG_Y_STARS'])

        # misc
        data['GP'] = 1
        data = fc.expanding_sum(
            df=data, group_col_names=['SEASON', 'PLAYERID'], col_name='GP',
            new_col_name='COUNT_GP'
        )
        self.regressors.extend(['COUNT_GP', 'AVG_SP'])

        # to fill (AVG_Y is the fallback used by preprocess, not a regressor)
        data = fc.expanding_weighted_mean(
            df=data, group_col_names=player_keys, col_name=self.regressand,
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_Y'
        )

        data = self.generate_weights(data)
        data = self.preprocess(data)

        # Split back into train/test by key. Rows keep the order they have in
        # `data`; the previous set()-based selection produced an arbitrary
        # order that could differ between runs.
        data = data.set_index(['GAMEID', 'PLAYERID'])
        self.train_data = self._select_rows(data, train_index)
        self.test_data = self._select_rows(data, test_index)

        self.created_features = True

    def preprocess(self, data):
        """Fill missing feature values and drop rows without AVG_Y.

        Columns are filled from their fallback column or constant (see
        _COLUMN_FALLBACKS and _CONSTANT_FILLS). *data* is modified in place
        for the filled columns; the returned frame additionally has rows
        with a missing AVG_Y removed.
        """
        for column, fallback in self._COLUMN_FALLBACKS:
            data[column] = data[column].fillna(data[fallback])
        for column, constant in self._CONSTANT_FILLS:
            data[column] = data[column].fillna(constant)

        # we can predict Y for a player as long as AVG_Y is not nan
        return data.dropna(subset=['AVG_Y'])

    def generate_weights(self, data):
        """Add the sample-weight column 'WEIGHT' and record its name.

        The weight is the product of
        WeightFunctions.game_seconds_played_weight(SECONDSPLAYED) and
        WeightFunctions.season_seconds_played_weight(SUM_SP), where SUM_SP
        comes from FeatureCreation.expanding_sum over SECONDSPLAYED grouped
        by (SEASON, PLAYERID).
        """
        data = self.feature_creation.expanding_sum(
            df=data, group_col_names=['SEASON', 'PLAYERID'], col_name='SECONDSPLAYED',
            new_col_name='SUM_SP'
        )
        self.weight = 'WEIGHT'
        data[self.weight] = data['SECONDSPLAYED'].apply(WeightFunctions.game_seconds_played_weight) * \
            data['SUM_SP'].apply(WeightFunctions.season_seconds_played_weight)
        # The flag is initialised in __init__ but was never updated before.
        self.generated_weights = True
        return data

    def train_model(self):
        """Fit the model on self.train_data using the generated sample weights.

        Raises:
            Exception: if create_features() has not been run yet.
        """
        if not self.created_features:
            raise Exception('Must create features before training model')

        # drop games in which players played a minute or less
        self.train_data = self.train_data.loc[self.train_data['SECONDSPLAYED'] > 60]

        X = self.train_data[self.regressors]
        y = self.train_data[self.regressand]
        w = self.train_data[self.weight]
        self.model.fit(X, y, sample_weight=w, test_size=0.25, early_stopping_rounds=25)
        self.trained_model = True

    def predict(self):
        """Write '<regressand>_HAT' into self.test_data and return it with the keys.

        Returns:
            DataFrame with the columns GAMEID, PLAYERID and 'RPS_HAT'.

        Raises:
            Exception: if train_model() has not been run yet.
        """
        if not self.trained_model:
            raise Exception('Must train model before generating predictions')

        prediction_column = '{}_HAT'.format(self.regressand)
        self.test_data[prediction_column] = self.model.predict(self.test_data[self.regressors])
        return self.test_data[['GAMEID', 'PLAYERID', prediction_column]]
def generate_regressors(self, boxscores, start_date, end_date):
    """Add possessions-per-minute features and return the rows in the date window.

    Every season with at least one game inside [start_date, end_date] is kept
    in full while the features are computed; the result is cut back to the
    window at the end.

    Args:
        boxscores: DataFrame with at least DATE, SEASON, TEAM, OPP_TEAM,
            PLAYERID, SECONDSPLAYED and POSS columns. It is not modified.
        start_date: inclusive lower bound compared against DATE.
        end_date: inclusive upper bound compared against DATE.

    Returns:
        DataFrame restricted to the date window. Besides the intermediate
        team and season columns it carries 'PLAYER_POSSESSIONS/MINUTE' and
        'OPP_POSSESSIONS/MINUTE_ALLOWED'.
    """
    feature_creation = FeatureCreation()
    game_keys = ['SEASON', 'DATE', 'TEAM', 'OPP_TEAM']

    in_window = (boxscores['DATE'] >= start_date) & (boxscores['DATE'] <= end_date)
    relevant_seasons = boxscores.loc[in_window, 'SEASON'].unique()

    # Explicit copy: columns are assigned right below, and doing that on a
    # filtered slice of the caller's frame triggers SettingWithCopyWarning.
    boxscores = boxscores.loc[boxscores['SEASON'].isin(relevant_seasons)].copy()

    boxscores['MP'] = boxscores['SECONDSPLAYED'] / 60
    boxscores['POSSESSIONS/MINUTE'] = boxscores['POSS'] / boxscores['MP']

    # average player possessions/minute
    boxscores = feature_creation.expanding_weighted_mean(
        df=boxscores,
        group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
        col_name='POSSESSIONS/MINUTE',
        new_col_name='AVG_POSSESSIONS/MINUTE',
        weight_col_name='MP')

    # average possessions/minute that opp team allowed
    team_boxscores = boxscores.groupby(game_keys).apply(lambda x: pd.Series({
        'TEAM_POSSESSIONS': x['POSS'].sum(),
        'TEAM_MP': x['MP'].sum()
    })).reset_index()
    team_boxscores['TEAM_POSSESSIONS/MINUTE'] = \
        team_boxscores['TEAM_POSSESSIONS'] / team_boxscores['TEAM_MP']

    # Mirror each game row (TEAM <-> OPP_TEAM) so the opponent's totals can
    # be joined onto the team's own row.
    opp_team_boxscores = team_boxscores.rename(
        columns={
            'TEAM': 'OPP_TEAM',
            'OPP_TEAM': 'TEAM',
            'TEAM_POSSESSIONS': 'OPP_TEAM_POSSESSIONS',
            'TEAM_MP': 'OPP_TEAM_MP',
            'TEAM_POSSESSIONS/MINUTE': 'OPP_TEAM_POSSESSIONS/MINUTE'
        })
    team_boxscores = team_boxscores.merge(opp_team_boxscores, on=game_keys, how='left')

    team_boxscores = feature_creation.expanding_weighted_mean(
        df=team_boxscores,
        group_col_names=['SEASON', 'OPP_TEAM'],
        col_name='TEAM_POSSESSIONS/MINUTE',
        new_col_name='AVG_POSSESSIONS/MINUTE_OPP_TEAM_ALLOWED',
        weight_col_name='TEAM_MP')

    # average possessions/minute that opp team played against
    # NOTE(review): these are whole-season means, so rows early in a season
    # see values computed from later games - confirm this is intended.
    season_stats = team_boxscores.groupby(['SEASON', 'TEAM']).apply(lambda x: pd.Series({
        'TEAM_POSSESSIONS(SEASON)': x['TEAM_POSSESSIONS'].mean(),
        'TEAM_MP(SEASON)': x['TEAM_MP'].mean(),
        'TEAM_POSSESSIONS_ALLOWED(SEASON)': x['OPP_TEAM_POSSESSIONS'].mean(),
        'TEAM_MP_ALLOWED(SEASON)': x['OPP_TEAM_MP'].mean()
    })).reset_index()
    season_stats['TEAM_POSSESSIONS/MINUTE(SEASON)'] = \
        season_stats['TEAM_POSSESSIONS(SEASON)'] / season_stats['TEAM_MP(SEASON)']
    season_stats['TEAM_POSSESSIONS/MINUTE_ALLOWED(SEASON)'] = \
        season_stats['TEAM_POSSESSIONS_ALLOWED(SEASON)']/season_stats['TEAM_MP_ALLOWED(SEASON)']

    # The same season table, relabelled so it can be joined on OPP_TEAM.
    opp_season_stats = season_stats.rename(
        columns={
            'TEAM': 'OPP_TEAM',
            'TEAM_POSSESSIONS(SEASON)': 'OPP_TEAM_POSSESSIONS(SEASON)',
            'TEAM_MP(SEASON)': 'OPP_TEAM_MP(SEASON)',
            'TEAM_POSSESSIONS_ALLOWED(SEASON)': 'OPP_TEAM_POSSESSIONS_ALLOWED(SEASON)',
            'TEAM_MP_ALLOWED(SEASON)': 'OPP_TEAM_MP_ALLOWED(SEASON)',
            'TEAM_POSSESSIONS/MINUTE(SEASON)': 'OPP_TEAM_POSSESSIONS/MINUTE(SEASON)',
            'TEAM_POSSESSIONS/MINUTE_ALLOWED(SEASON)': 'OPP_TEAM_POSSESSIONS/MINUTE_ALLOWED(SEASON)'
        })
    team_boxscores = team_boxscores.merge(season_stats, on=['SEASON', 'TEAM'], how='left')
    team_boxscores = team_boxscores.merge(opp_season_stats, on=['SEASON', 'OPP_TEAM'], how='left')

    team_boxscores = feature_creation.expanding_weighted_mean(
        df=team_boxscores,
        group_col_names=['SEASON', 'OPP_TEAM'],
        col_name='TEAM_POSSESSIONS/MINUTE(SEASON)',
        new_col_name='AVG_POSSESSIONS/MINUTE(SEASON)_OPP_TEAM_P.A.',
        weight_col_name='OPP_TEAM_MP')

    # possessions/minute allowed that player played against
    boxscores = boxscores.merge(team_boxscores, on=game_keys, how='left')
    boxscores = feature_creation.expanding_weighted_mean(
        df=boxscores,
        group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
        col_name='OPP_TEAM_POSSESSIONS/MINUTE_ALLOWED(SEASON)',
        new_col_name='AVG_POSSESSIONS/MINUTE_ALLOWED(SEASON)_PLAYER_P.A.',
        weight_col_name='MP')

    # player possessions/minute
    boxscores['PLAYER_POSSESSIONS/MINUTE'] = \
        2*boxscores['AVG_POSSESSIONS/MINUTE'] - boxscores['AVG_POSSESSIONS/MINUTE_ALLOWED(SEASON)_PLAYER_P.A.']

    # opp possessions/minute allowed
    boxscores['OPP_POSSESSIONS/MINUTE_ALLOWED'] = \
        2*boxscores['AVG_POSSESSIONS/MINUTE_OPP_TEAM_ALLOWED'] - boxscores['AVG_POSSESSIONS/MINUTE(SEASON)_OPP_TEAM_P.A.']

    # The mask is rebuilt because the merges above replaced the frame.
    in_window = (boxscores['DATE'] >= start_date) & (boxscores['DATE'] <= end_date)
    return boxscores.loc[in_window]
def generate_regressors(self, boxscores, start_date, end_date):
    """Add rebound-chance features and return the rows in the date window.

    Every season with at least one game inside [start_date, end_date] is kept
    in full while the features are computed; the result is cut back to the
    window at the end.

    Terminology used by the column names: a team's OREB chances in a game are
    its own OREB plus the opponent's DREB, and its DREB chances are its own
    DREB plus the opponent's OREB.

    Args:
        boxscores: DataFrame with at least DATE, SEASON, TEAM, OPP_TEAM,
            PLAYERID, POSS, OREB, DREB, OREB_PCT and DREB_PCT columns.
        start_date: inclusive lower bound compared against DATE.
        end_date: inclusive upper bound compared against DATE.

    Returns:
        DataFrame restricted to the date window. Besides the intermediate
        columns it carries 'AVG_OREB/OREB_CHANCE', 'AVG_DREB/DREB_CHANCE',
        'OREB/CH_DEF' and 'DREB/CH_DEF'.
    """
    feature_creation = FeatureCreation()
    game_keys = ['SEASON', 'DATE', 'TEAM', 'OPP_TEAM']

    in_window = (boxscores['DATE'] >= start_date) & (boxscores['DATE'] <= end_date)
    relevant_seasons = boxscores.loc[in_window, 'SEASON'].unique()
    boxscores = boxscores.loc[boxscores['SEASON'].isin(relevant_seasons)]

    # Team totals per game. POSS is summed over players and divided by 5.
    team_boxscores = boxscores.groupby(game_keys).apply(lambda x: pd.Series({
        'TEAM_POSSESSIONS': x['POSS'].sum() / 5,
        'TEAM_OREB': x['OREB'].sum(),
        'TEAM_DREB': x['DREB'].sum()
    })).reset_index()

    # Relabel each team's row as "the opponent" and join it back on OPP_TEAM
    # so both sides of a game sit on one row.
    opp_team_boxscores = team_boxscores.drop(columns='OPP_TEAM')
    opp_team_boxscores = opp_team_boxscores.rename(
        columns={
            'TEAM': 'OPP_TEAM',
            'TEAM_POSSESSIONS': 'OPP_TEAM_POSSESSIONS',
            'TEAM_OREB': 'OPP_TEAM_OREB',
            'TEAM_DREB': 'OPP_TEAM_DREB'
        })
    team_boxscores = team_boxscores.merge(
        opp_team_boxscores, on=['SEASON', 'DATE', 'OPP_TEAM'], how='left')

    team_boxscores['TEAM_OREB_CHANCES'] = \
        team_boxscores['TEAM_OREB'] + team_boxscores['OPP_TEAM_DREB']
    team_boxscores['TEAM_DREB_CHANCES'] = \
        team_boxscores['TEAM_DREB'] + team_boxscores['OPP_TEAM_OREB']

    # average team oreb chances/possession
    team_boxscores['TEAM_OREB_CHANCES/POSSESSION'] = \
        team_boxscores['TEAM_OREB_CHANCES'] / team_boxscores['TEAM_POSSESSIONS']
    team_boxscores = feature_creation.expanding_weighted_mean(
        df=team_boxscores,
        group_col_names=['SEASON', 'TEAM'],
        col_name='TEAM_OREB_CHANCES/POSSESSION',
        new_col_name='AVG_TEAM_OREB_CHANCES/POSSESSION',
        weight_col_name='TEAM_POSSESSIONS')

    # average oreb chances/possession that opp team allowed
    team_boxscores = feature_creation.expanding_weighted_mean(
        df=team_boxscores,
        group_col_names=['SEASON', 'OPP_TEAM'],
        col_name='TEAM_OREB_CHANCES/POSSESSION',
        new_col_name='AVG_OREB_CHANCES/POSSESSION_OPP_TEAM_ALLOWED',
        weight_col_name='TEAM_POSSESSIONS')

    # average oreb chances/possession allowed that team played against
    season_stats = team_boxscores.groupby(['SEASON', 'TEAM']).apply(lambda x: pd.Series({
        'TEAM_OREB_ALLOWED(SEASON)': x['OPP_TEAM_OREB'].mean(),
        'TEAM_OREB_CHANCES(SEASON)': x['TEAM_OREB_CHANCES'].mean(),
        # The opponent's OREB chances equal this team's DREB chances.
        'TEAM_OREB_CHANCES_ALLOWED(SEASON)': x['TEAM_DREB_CHANCES'].mean(),
        'TEAM_DREB_ALLOWED(SEASON)': x['OPP_TEAM_DREB'].mean(),
        'TEAM_DREB_CHANCES(SEASON)': x['TEAM_DREB_CHANCES'].mean(),
        # Mirror of the line above: the opponent's DREB chances equal this
        # team's OREB chances. This previously averaged TEAM_DREB_CHANCES
        # (a copy of the preceding entry), which made DREB/CH_DEF divide
        # opponent rebounds by the wrong side's chances.
        'TEAM_DREB_CHANCES_ALLOWED(SEASON)': x['TEAM_OREB_CHANCES'].mean(),
        'TEAM_POSSESSIONS(SEASON)': x['TEAM_POSSESSIONS'].mean(),
        'TEAM_POSSESSIONS_ALLOWED(SEASON)': x['OPP_TEAM_POSSESSIONS'].mean()
    })).reset_index()

    # The same season table, relabelled so it can be joined on OPP_TEAM.
    opp_season_stats = season_stats.rename(
        columns={
            'TEAM': 'OPP_TEAM',
            'TEAM_OREB_ALLOWED(SEASON)': 'OPP_TEAM_OREB_ALLOWED(SEASON)',
            'TEAM_OREB_CHANCES(SEASON)': 'OPP_TEAM_OREB_CHANCES(SEASON)',
            'TEAM_OREB_CHANCES_ALLOWED(SEASON)': 'OPP_TEAM_OREB_CHANCES_ALLOWED(SEASON)',
            'TEAM_DREB_ALLOWED(SEASON)': 'OPP_TEAM_DREB_ALLOWED(SEASON)',
            'TEAM_DREB_CHANCES(SEASON)': 'OPP_TEAM_DREB_CHANCES(SEASON)',
            'TEAM_DREB_CHANCES_ALLOWED(SEASON)': 'OPP_TEAM_DREB_CHANCES_ALLOWED(SEASON)',
            'TEAM_POSSESSIONS(SEASON)': 'OPP_TEAM_POSSESSIONS(SEASON)',
            'TEAM_POSSESSIONS_ALLOWED(SEASON)': 'OPP_TEAM_POSSESSIONS_ALLOWED(SEASON)'
        })
    team_boxscores = team_boxscores.merge(season_stats, on=['SEASON', 'TEAM'], how='left')
    team_boxscores = team_boxscores.merge(opp_season_stats, on=['SEASON', 'OPP_TEAM'], how='left')

    team_boxscores['OPP_TEAM_OREB_CHANCES/POSSESSION_ALLOWED(SEASON)'] = \
        team_boxscores['OPP_TEAM_OREB_CHANCES_ALLOWED(SEASON)']/team_boxscores['OPP_TEAM_POSSESSIONS_ALLOWED(SEASON)']
    team_boxscores = feature_creation.expanding_weighted_mean(
        df=team_boxscores,
        group_col_names=['SEASON', 'TEAM'],
        col_name='OPP_TEAM_OREB_CHANCES/POSSESSION_ALLOWED(SEASON)',
        new_col_name='AVG_OREB_CHANCES/POSSESSION_ALLOWED(SEASON)_TEAM_P.A.',
        weight_col_name='TEAM_POSSESSIONS')

    # average oreb chances/possession that opp team played against
    team_boxscores['TEAM_OREB_CHANCES/POSSESSION(SEASON)'] = \
        team_boxscores['TEAM_OREB_CHANCES(SEASON)']/team_boxscores['TEAM_POSSESSIONS(SEASON)']
    team_boxscores = feature_creation.expanding_weighted_mean(
        df=team_boxscores,
        group_col_names=['SEASON', 'OPP_TEAM'],
        col_name='TEAM_OREB_CHANCES/POSSESSION(SEASON)',
        new_col_name='AVG_OREB_CHANCES/POSSESSION(SEASON)_OPP_TEAM_P.A.',
        weight_col_name='OPP_TEAM_POSSESSIONS')

    # team oreb chances/possession
    team_boxscores['TEAM_OREB_CHANCES/POSSESSION_HAT'] = \
        2*team_boxscores['AVG_TEAM_OREB_CHANCES/POSSESSION'] - \
        team_boxscores['AVG_OREB_CHANCES/POSSESSION_ALLOWED(SEASON)_TEAM_P.A.']

    # opp team oreb chances/possession allowed
    team_boxscores['OPP_TEAM_OREB_CHANCES/POSSESSION_ALLOWED_HAT'] = \
        2*team_boxscores['AVG_OREB_CHANCES/POSSESSION_OPP_TEAM_ALLOWED'] - \
        team_boxscores['AVG_OREB_CHANCES/POSSESSION(SEASON)_OPP_TEAM_P.A.']

    boxscores = boxscores.merge(team_boxscores, on=game_keys, how='left')

    # Player-level OREB chances: for players with at least one OREB, back
    # them out of OREB / OREB_PCT (rounded); for players with none, use the
    # team's chances per possession times the player's POSS.
    has_oreb = boxscores['OREB'] > 0
    no_oreb = boxscores['OREB'] == 0
    boxscores['OREB_CHANCES'] = np.nan
    boxscores.loc[has_oreb, 'OREB_CHANCES'] = (
        boxscores.loc[has_oreb, 'OREB'] / boxscores.loc[has_oreb, 'OREB_PCT']
    ).apply(lambda x: round(x))
    boxscores.loc[no_oreb, 'OREB_CHANCES'] = \
        boxscores.loc[no_oreb, 'TEAM_OREB_CHANCES/POSSESSION']*boxscores.loc[no_oreb, 'POSS']

    # Same construction for DREB chances.
    boxscores['TEAM_DREB_CHANCES/POSSESSION'] = \
        boxscores['TEAM_DREB_CHANCES'] / boxscores['TEAM_POSSESSIONS']
    has_dreb = boxscores['DREB'] > 0
    no_dreb = boxscores['DREB'] == 0
    boxscores['DREB_CHANCES'] = np.nan
    boxscores.loc[has_dreb, 'DREB_CHANCES'] = (
        boxscores.loc[has_dreb, 'DREB'] / boxscores.loc[has_dreb, 'DREB_PCT']
    ).apply(lambda x: round(x))
    boxscores.loc[no_dreb, 'DREB_CHANCES'] = \
        boxscores.loc[no_dreb, 'TEAM_DREB_CHANCES/POSSESSION']*boxscores.loc[no_dreb, 'POSS']

    # Team chances implied by summing the player-level estimates.
    temp = boxscores.groupby(game_keys).apply(lambda x: pd.Series({
        'IMPLIED_TEAM_OREB_CHANCES': x['OREB_CHANCES'].sum() / 5,
        'IMPLIED_TEAM_DREB_CHANCES': x['DREB_CHANCES'].sum() / 5
    })).reset_index()
    boxscores = boxscores.merge(temp, on=game_keys, how='left')

    # average player oreb/chance
    # Rescale the player estimates so they are consistent with the actual
    # team chances.
    boxscores['OREB_CHANCES'] = boxscores['OREB_CHANCES'] * (
        boxscores['TEAM_OREB_CHANCES'] / boxscores['IMPLIED_TEAM_OREB_CHANCES'])
    boxscores['OREB/OREB_CHANCE'] = boxscores['OREB'] / boxscores['OREB_CHANCES']
    boxscores = feature_creation.expanding_weighted_mean(
        df=boxscores,
        group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
        col_name='OREB/OREB_CHANCE',
        new_col_name='AVG_OREB/OREB_CHANCE',
        weight_col_name='OREB_CHANCES')

    # average player dreb/chance
    boxscores['DREB_CHANCES'] = boxscores['DREB_CHANCES'] * (
        boxscores['TEAM_DREB_CHANCES'] / boxscores['IMPLIED_TEAM_DREB_CHANCES'])
    boxscores['DREB/DREB_CHANCE'] = boxscores['DREB'] / boxscores['DREB_CHANCES']
    boxscores = feature_creation.expanding_weighted_mean(
        df=boxscores,
        group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
        col_name='DREB/DREB_CHANCE',
        new_col_name='AVG_DREB/DREB_CHANCE',
        weight_col_name='DREB_CHANCES')

    # average oreb/oreb chance that opp team allowed
    team_game_boxscores = boxscores.groupby(game_keys).apply(lambda x: pd.Series({
        'TEAM_OREB': x['OREB'].sum(),
        'TEAM_OREB_CHANCES': x['OREB_CHANCES'].sum() / 5,
        'TEAM_DREB': x['DREB'].sum(),
        'TEAM_DREB_CHANCES': x['DREB_CHANCES'].sum() / 5
    })).reset_index()
    opp_team_game_boxscores = team_game_boxscores.drop(columns='OPP_TEAM')
    opp_team_game_boxscores = opp_team_game_boxscores.rename(
        columns={
            'TEAM': 'OPP_TEAM',
            'TEAM_OREB': 'OPP_TEAM_OREB',
            'TEAM_OREB_CHANCES': 'OPP_TEAM_OREB_CHANCES',
            'TEAM_DREB': 'OPP_TEAM_DREB',
            'TEAM_DREB_CHANCES': 'OPP_TEAM_DREB_CHANCES'
        })
    team_game_boxscores = team_game_boxscores.merge(
        opp_team_game_boxscores, on=['SEASON', 'DATE', 'OPP_TEAM'], how='left')

    team_game_boxscores['TEAM_OREB/OREB_CHANCE'] = \
        team_game_boxscores['TEAM_OREB']/team_game_boxscores['TEAM_OREB_CHANCES']
    team_game_boxscores = feature_creation.expanding_weighted_mean(
        df=team_game_boxscores,
        group_col_names=['SEASON', 'OPP_TEAM'],
        col_name='TEAM_OREB/OREB_CHANCE',
        new_col_name='AVG_TEAM_OREB/OREB_CHANCE_OPP_ALLOWED',
        weight_col_name='TEAM_OREB_CHANCES')

    # average dreb/dreb chance that opp team allowed
    team_game_boxscores['TEAM_DREB/DREB_CHANCE'] = \
        team_game_boxscores['TEAM_DREB']/team_game_boxscores['TEAM_DREB_CHANCES']
    team_game_boxscores = feature_creation.expanding_weighted_mean(
        df=team_game_boxscores,
        group_col_names=['SEASON', 'OPP_TEAM'],
        col_name='TEAM_DREB/DREB_CHANCE',
        new_col_name='AVG_TEAM_DREB/DREB_CHANCE_OPP_ALLOWED',
        weight_col_name='TEAM_DREB_CHANCES')

    # NOTE(review): team_game_boxscores re-uses column names that are already
    # on boxscores (TEAM_OREB, TEAM_OREB_CHANCES, OPP_TEAM_OREB, ...), so this
    # merge will suffix both copies with _x/_y. Nothing below reads them.
    boxscores = boxscores.merge(team_game_boxscores, on=game_keys, how='left')

    # average oreb/oreb chance allowed that player played against
    boxscores['OPP_TEAM_OREB/OREB_CHANCE_ALLOWED(SEASON)'] = \
        boxscores['OPP_TEAM_OREB_ALLOWED(SEASON)'] / boxscores['OPP_TEAM_OREB_CHANCES_ALLOWED(SEASON)']
    boxscores = feature_creation.expanding_weighted_mean(
        df=boxscores,
        group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
        col_name='OPP_TEAM_OREB/OREB_CHANCE_ALLOWED(SEASON)',
        new_col_name='AVG_TEAM_OREB/OREB_CHANCE(SEASON)_ALLOWED_PLAYER_P.A',
        weight_col_name='OREB_CHANCES')

    # average dreb/dreb chance allowed that player played against
    boxscores['OPP_TEAM_DREB/DREB_CHANCE_ALLOWED(SEASON)'] = \
        boxscores['OPP_TEAM_DREB_ALLOWED(SEASON)'] / boxscores['OPP_TEAM_DREB_CHANCES_ALLOWED(SEASON)']
    boxscores = feature_creation.expanding_weighted_mean(
        df=boxscores,
        group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
        col_name='OPP_TEAM_DREB/DREB_CHANCE_ALLOWED(SEASON)',
        new_col_name='AVG_TEAM_DREB/DREB_CHANCE(SEASON)_ALLOWED_PLAYER_P.A',
        weight_col_name='DREB_CHANCES')

    # oreb/oreb chance defense
    boxscores['OREB/CH_DEF'] = \
        boxscores['AVG_TEAM_OREB/OREB_CHANCE_OPP_ALLOWED'] / \
        boxscores['AVG_TEAM_OREB/OREB_CHANCE(SEASON)_ALLOWED_PLAYER_P.A']

    # dreb/dreb chance defense
    boxscores['DREB/CH_DEF'] = \
        boxscores['AVG_TEAM_DREB/DREB_CHANCE_OPP_ALLOWED'] / \
        boxscores['AVG_TEAM_DREB/DREB_CHANCE(SEASON)_ALLOWED_PLAYER_P.A']

    # The mask is rebuilt because the merges above replaced the frame.
    in_window = (boxscores['DATE'] >= start_date) & (boxscores['DATE'] <= end_date)
    return boxscores.loc[in_window]
def __init__(self, test_data):
    """Store *test_data* and remember the columns it arrived with.

    Args:
        test_data: frame-like object exposing a ``columns`` attribute.
    """
    self.feature_creation = FeatureCreation()
    self.test_data = test_data
    # Snapshot of the incoming column names, taken before anything is added.
    self.original_columns = list(test_data.columns)
def generate_regressors(self, boxscores, start_date, end_date):
    """Add shot-attempt, scoring-efficiency and defensive-adjustment columns.

    The method keeps every row belonging to a season that intersects
    ``[start_date, end_date]`` while the features are computed, and only
    trims to the requested window right before returning.

    Args:
        boxscores: DataFrame that must provide the columns ``DATE``,
            ``SEASON``, ``TEAM``, ``OPP_TEAM``, ``PLAYERID``,
            ``TOTAL_ATTEMPTS``, ``POSS`` and ``PTS``.
        start_date: inclusive lower bound compared against ``DATE``.
        end_date: inclusive upper bound compared against ``DATE``.

    Returns:
        The rows of ``boxscores`` whose ``DATE`` lies in the window, with
        (among others) ``AVG_ATTEMPTS/POSSESSION``, ``AVG_POINTS/ATTEMPT``
        and ``POINTS/ATTEMPT_DEF_ADJ`` added. The caller's frame is not
        modified; the work happens on a copy.
    """
    feature_creation = FeatureCreation()
    helpers = Helpers()
    player_keys = ['SEASON', 'TEAM', 'PLAYERID']

    def _ratio_or_one(numerator, denominator):
        # numerator / denominator where both are present and the denominator
        # is non-zero; otherwise the neutral multiplier 1.
        usable = numerator.notna() & denominator.notna() & (denominator != 0)
        return np.where(usable, numerator / denominator, 1)

    in_window = (boxscores['DATE'] >= start_date) & (boxscores['DATE'] <= end_date)
    relevant_seasons = boxscores.loc[in_window, 'SEASON'].unique()
    # Explicit copy: columns are added below, and writing into a filtered
    # slice of the caller's frame is what triggers SettingWithCopy issues.
    boxscores = boxscores.loc[boxscores['SEASON'].isin(relevant_seasons)].copy()

    boxscores['ATTEMPTS'] = boxscores['TOTAL_ATTEMPTS']
    boxscores['ATTEMPTS/POSSESSION'] = boxscores['ATTEMPTS'] / boxscores['POSS']

    # average attempts per possession
    boxscores = feature_creation.expanding_weighted_mean(
        df=boxscores, group_col_names=player_keys,
        col_name='ATTEMPTS/POSSESSION',
        new_col_name='AVG_ATTEMPTS/POSSESSION',
        weight_col_name='POSS')
    boxscores = feature_creation.expanding_sum(
        df=boxscores, group_col_names=player_keys,
        col_name='POSS', new_col_name='SUM_POSS')
    # No accumulated possessions: fall back to the ratio over the whole frame.
    boxscores.loc[boxscores['SUM_POSS'] == 0, 'AVG_ATTEMPTS/POSSESSION'] = (
        boxscores['ATTEMPTS'].sum() / boxscores['POSS'].sum())

    boxscores['POINTS/ATTEMPT'] = boxscores['PTS'] / boxscores['ATTEMPTS']

    # average points per attempt
    boxscores = feature_creation.expanding_weighted_mean(
        df=boxscores, group_col_names=player_keys,
        col_name='POINTS/ATTEMPT',
        new_col_name='AVG_POINTS/ATTEMPT',
        weight_col_name='ATTEMPTS')
    boxscores = feature_creation.expanding_sum(
        df=boxscores, group_col_names=player_keys,
        col_name='ATTEMPTS', new_col_name='SUM_ATTEMPTS')
    # No accumulated attempts: fall back to the ratio over the whole frame.
    boxscores.loc[boxscores['SUM_ATTEMPTS'] == 0, 'AVG_POINTS/ATTEMPT'] = (
        boxscores['PTS'].sum() / boxscores['ATTEMPTS'].sum())

    # adjustment for defense (points per attempt)
    for play_type in PLAY_TYPES:
        ppp_col = '{}_PPP'.format(play_type)
        poss_pct_col = '{}_POSS_PCT'.format(play_type)
        ppp_allowed_col = '{}_PPP_ALLOWED'.format(play_type)
        poss_pct_allowed_col = '{}_POSS_PCT_ALLOWED'.format(play_type)
        player_columns = ['SEASON', 'PLAYERID', 'TEAM', ppp_col, poss_pct_col]
        team_columns = ['SEASON', 'OPP_TEAM', ppp_allowed_col, poss_pct_allowed_col]

        # DataFrame.append no longer exists in pandas >= 2.0, so the
        # per-season frames are collected and concatenated once.
        player_frames = []
        team_frames = []
        for season in relevant_seasons:
            player_data = helpers.get_play_type_breakdown(
                play_type, season, 'player')
            player_data['SEASON'] = season
            player_data['PLAYER_ID'] = player_data['PLAYER_ID'].apply(str)
            player_data = player_data.rename(
                columns={
                    'TEAM_ABBREVIATION': 'TEAM',
                    'PLAYER_ID': 'PLAYERID',
                    'PPP': ppp_col,
                    'POSS_PCT': poss_pct_col
                })
            player_frames.append(player_data[player_columns])

            team_data = helpers.get_play_type_breakdown(
                play_type, season, 'team')
            team_data['SEASON'] = season
            team_data = team_data.rename(
                columns={
                    'TEAM_ABBREVIATION': 'OPP_TEAM',
                    'PPP': ppp_allowed_col,
                    'POSS_PCT': poss_pct_allowed_col
                })
            team_frames.append(team_data[team_columns])

        # pd.concat refuses an empty list; keep the expected columns so the
        # merges below still find their keys when no season is in range.
        player_play_type_data = (
            pd.concat(player_frames) if player_frames
            else pd.DataFrame(columns=player_columns))
        team_play_type_data = (
            pd.concat(team_frames) if team_frames
            else pd.DataFrame(columns=team_columns))

        boxscores = boxscores.merge(player_play_type_data,
                                    on=['SEASON', 'PLAYERID', 'TEAM'],
                                    how='left')
        boxscores = boxscores.merge(team_play_type_data,
                                    on=['SEASON', 'OPP_TEAM'],
                                    how='left')

    poss_pct_cols = ['{}_POSS_PCT'.format(i) for i in PLAY_TYPES]
    poss_pct_allowed_cols = [
        '{}_POSS_PCT_ALLOWED'.format(i) for i in PLAY_TYPES
    ]

    # Exact zeros are nudged to a small positive share before each row is
    # renormalised so that its play-type shares sum to one.
    boxscores[poss_pct_cols] = boxscores[poss_pct_cols].replace([0], 0.001)
    boxscores[poss_pct_allowed_cols] = boxscores[
        poss_pct_allowed_cols].replace([0], 0.001)
    boxscores['TOTAL_POSS_PCT'] = boxscores[poss_pct_cols].sum(axis=1)
    boxscores['TOTAL_POSS_PCT_ALLOWED'] = boxscores[
        poss_pct_allowed_cols].sum(axis=1)
    boxscores[poss_pct_cols] = boxscores[poss_pct_cols].div(
        boxscores['TOTAL_POSS_PCT'], axis=0)
    boxscores[poss_pct_allowed_cols] = boxscores[
        poss_pct_allowed_cols].div(boxscores['TOTAL_POSS_PCT_ALLOWED'],
                                   axis=0)

    # Accumulators filled play type by play type in the loop below.
    # TOTAL_POSS_PCT is deliberately reset: it is re-accumulated from the
    # normalised, NaN-filled shares.
    boxscores['NET_POINTS/ATTEMPT'] = 0
    boxscores['TOTAL_POSS_PCT'] = 0
    boxscores['IMPLIED_NET_POINTS/ATTEMPT'] = 0
    boxscores['TOTAL_IMPLIED_POSS_PCT'] = 0

    for play_type in PLAY_TYPES:
        ppp_col = '{}_PPP'.format(play_type)
        poss_pct_col = '{}_POSS_PCT'.format(play_type)
        ppp_allowed_col = '{}_PPP_ALLOWED'.format(play_type)
        poss_pct_allowed_col = '{}_POSS_PCT_ALLOWED'.format(play_type)
        avg_ppp_allowed_col = 'AVG_{}_PPP_ALLOWED_PLAYED_AGAINST'.format(
            play_type)
        avg_poss_pct_allowed_col = (
            'AVG_{}_POSS_PCT_ALLOWED_PLAYED_AGAINST'.format(play_type))

        boxscores = feature_creation.expanding_weighted_mean(
            df=boxscores, group_col_names=player_keys,
            col_name=ppp_allowed_col,
            weight_col_name='ATTEMPTS',
            new_col_name=avg_ppp_allowed_col)
        boxscores = feature_creation.expanding_weighted_mean(
            df=boxscores, group_col_names=player_keys,
            col_name=poss_pct_allowed_col,
            weight_col_name='POSS',
            new_col_name=avg_poss_pct_allowed_col)

        # Opponent's allowed figure relative to the average the player has
        # faced; vectorised replacement for the former row-wise apply().
        boxscores['PPP_ADJ'] = _ratio_or_one(
            boxscores[ppp_allowed_col], boxscores[avg_ppp_allowed_col])
        boxscores['POSS_PCT_ADJ'] = _ratio_or_one(
            boxscores[poss_pct_allowed_col],
            boxscores[avg_poss_pct_allowed_col])

        player_ppp = boxscores[ppp_col].fillna(0)
        player_share = boxscores[poss_pct_col].fillna(0)

        boxscores['NET_POINTS/ATTEMPT'] += player_ppp * player_share
        boxscores['TOTAL_POSS_PCT'] += player_share
        boxscores['IMPLIED_NET_POINTS/ATTEMPT'] += \
            (player_ppp * boxscores['PPP_ADJ']) * \
            (player_share * boxscores['POSS_PCT_ADJ'])
        boxscores['TOTAL_IMPLIED_POSS_PCT'] += \
            player_share * boxscores['POSS_PCT_ADJ']

    # Difference between the opponent-adjusted and the unadjusted
    # share-weighted points per attempt; 0 when either total share is not
    # strictly positive (NaN totals also land on 0).
    has_shares = (boxscores['TOTAL_POSS_PCT'] > 0) & \
        (boxscores['TOTAL_IMPLIED_POSS_PCT'] > 0)
    boxscores['POINTS/ATTEMPT_DEF_ADJ'] = np.where(
        has_shares,
        (boxscores['IMPLIED_NET_POINTS/ATTEMPT'] /
         boxscores['TOTAL_IMPLIED_POSS_PCT']) -
        (boxscores['NET_POINTS/ATTEMPT'] / boxscores['TOTAL_POSS_PCT']),
        0)

    boxscores = boxscores.loc[(boxscores['DATE'] >= start_date) &
                              (boxscores['DATE'] <= end_date)]
    return boxscores
def __init__(self):
    """Create the FeatureCreation helper and keep it on the instance."""
    self.feature_creation = FeatureCreation()
class OwnershipModel(object):
    """Regression model for the ``OWNERSHIP`` column.

    Typical use is ``create_features(...)`` -> ``train_model()`` ->
    ``predict()``; the latter two refuse to run out of order.

    Attributes set by ``__init__``:
        train_data / test_data: frames replaced by their feature-enriched
            versions once ``create_features`` has run.
        site: value compared against the ``SITE`` column of the salary and
            contest data.
        regressors: ordered list of feature column names fed to the model.
        regressand: name of the target column (``'OWNERSHIP'``).
    """

    def __init__(self, train_data, test_data, site):
        """Store the inputs and build the helpers and the XGBoost wrapper."""
        self.feature_creation = FeatureCreation()
        self.clean_data = CleanData()
        self.train_data = train_data
        self.test_data = test_data
        self.site = site
        self.model = XGBoostRegressionModel(OWNERSHIP_MODEL_PARAMS)
        self.regressors = []
        self.regressand = 'OWNERSHIP'
        self.created_features = False
        self.trained_model = False

    def create_features(self, salary_data, contest_data, ownership_data,
                        odds_data):
        """Build every regressor and replace ``train_data`` / ``test_data``.

        Args:
            salary_data: frame with at least ``SITE``, ``DATE``, ``NAME``,
                ``SALARY`` and ``DFS_POSITION``.
            contest_data: frame with at least ``SITE``, ``DATE``,
                ``SLATEID``, ``CONTESTNAME`` and ``TEAMS``
                ('_'-separated team codes).
            ownership_data: frame with at least ``PLAYERNAME``,
                ``SLATEID``, ``CONTESTNAME`` and ``OWNERSHIP``. A ``NAME``
                column is written onto the frame that is passed in.
            odds_data: frame with at least ``DATE``, ``TEAM``, ``PERIOD``,
                ``TOTAL`` and ``POINTSPREAD``. Its ``TOTAL`` and
                ``POINTSPREAD`` columns are cleaned on the frame that is
                passed in.

        Side effects:
            Appends to ``self.regressors`` (order matters: it is the
            feature order given to the model), overwrites
            ``self.train_data`` / ``self.test_data`` and sets
            ``self.created_features``.
        """
        data = pd.concat([self.train_data, self.test_data])
        # Remember which (GAMEID, PLAYERID) pairs belong to which split so
        # the combined frame can be separated again at the end.
        train_index = self.train_data.set_index(['GAMEID', 'PLAYERID']).index
        test_index = self.test_data.set_index(['GAMEID', 'PLAYERID']).index

        salary_data = salary_data.loc[salary_data['SITE'] == self.site]
        data = data.merge(salary_data, on=['DATE', 'NAME'], how='inner')

        # player stat features
        fp_calculator = FPCalculator(self.site)
        data['REB'] = data['DREB'] + data['OREB']
        data['DKFP'] = data.apply(
            lambda x: fp_calculator.calculate_fantasy_points(
                x['SEASON'], x['PTS'], x['REB'], x['AST'], x['TOV'],
                x['BLK'], x['STL'], x['FG3M']),
            axis=1)

        data = self.feature_creation.expanding_mean(
            df=data, group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP', new_col_name='AVG_DKFP')
        self.regressors.append('AVG_DKFP')

        data['VALUE'] = data['AVG_DKFP'] / data['SALARY']
        self.regressors.append('VALUE')

        data = self.feature_creation.lag(
            df=data, group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP', new_col_name='L1_DKFP', n_shift=1)
        self.regressors.append('L1_DKFP')

        data = self.feature_creation.rolling_mean(
            df=data, group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP', new_col_name='MA5_DKFP', n_rolling=5)
        self.regressors.append('MA5_DKFP')

        data = self.feature_creation.lag(
            df=data, group_col_names=['SEASON', 'PLAYERID'],
            col_name='SALARY', new_col_name='L1_SALARY', n_shift=1)
        data['SALARY_CHANGE'] = data['SALARY'] - data['L1_SALARY']
        self.regressors.extend(['SALARY', 'SALARY_CHANGE'])

        data = self.feature_creation.expanding_standard_deviation(
            df=data, group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP', new_col_name='STD_DKFP', min_periods=5)
        self.regressors.extend(['STD_DKFP', 'START'])

        data['DFS_POSITIONS'] = data['DFS_POSITION'].apply(
            lambda x: x.split('_') if isinstance(x, str) else np.nan)
        data['NUM_POSITIONS'] = data['DFS_POSITIONS'].apply(
            lambda x: len(x) if isinstance(x, list) else np.nan)
        self.regressors.append('NUM_POSITIONS')

        for position in ['SG', 'PG', 'C']:
            data[position] = 0
            # na=False: a missing DFS_POSITION would otherwise put NaN into
            # the mask, and .loc rejects a boolean mask containing NaN.
            eligible = data['DFS_POSITION'].str.contains(position, na=False)
            data.loc[eligible, position] = 1
            self.regressors.append(position)

        # historical ownership of player
        ownership_data['NAME'] = ownership_data['PLAYERNAME'].apply(
            lambda x: OWNERSHIP_NAME_TO_NBA_NAME.get(x, x))
        ownership_data = ownership_data.merge(
            contest_data, on=['SLATEID', 'CONTESTNAME'], how='inner')
        # One row per player and slate: ownership averaged over the slate's
        # contests, weighted by contest size.
        ownership_data = ownership_data.groupby(
            ['DATE', 'SLATEID', 'GAMECOUNT', 'NAME']).apply(
                lambda x: pd.Series({
                    'OWNERSHIP':
                    (x['OWNERSHIP'] * x['TOTALENTRIES']).sum() /
                    x['TOTALENTRIES'].sum()
                })).reset_index()
        aggregated_ownership = ownership_data.groupby(
            ['DATE', 'NAME']).apply(lambda x: pd.Series(
                {'TOTAL_OWNERSHIP': x['OWNERSHIP'].mean()})).reset_index()
        data = data.merge(aggregated_ownership, on=['DATE', 'NAME'],
                          how='inner')

        data = self.feature_creation.expanding_mean(
            df=data, group_col_names=['SEASON', 'NAME'],
            col_name='TOTAL_OWNERSHIP', new_col_name='AVG_OWNERSHIP')
        self.regressors.append('AVG_OWNERSHIP')

        data = self.feature_creation.lag(
            df=data, group_col_names=['SEASON', 'NAME'],
            col_name='TOTAL_OWNERSHIP', new_col_name='L1_OWNERSHIP',
            n_shift=1)
        self.regressors.append('L1_OWNERSHIP')

        data = self.feature_creation.rolling_mean(
            df=data, group_col_names=['SEASON', 'NAME'],
            col_name='TOTAL_OWNERSHIP', new_col_name='MA5_OWNERSHIP',
            n_rolling=5)
        self.regressors.append('MA5_OWNERSHIP')

        # defense
        data['NORM_POS'] = data['POSITION'].apply(
            lambda x: x if '-' not in x else x.split('-')[0])
        temp = data.dropna(subset=['DKFP', 'AVG_DKFP'])
        grouped_defensive_boxscores = temp.groupby(
            ['SEASON', 'DATE', 'NORM_POS', 'OPP_TEAM']).apply(
                lambda x: pd.Series({
                    'TEAM_DKFP_ALLOWED_P': x['DKFP'].sum(),
                    'TEAM_DKFP_AVG_P': x['AVG_DKFP'].sum()
                })).reset_index()
        # Fantasy points actually allowed to a position minus what those
        # players average.
        grouped_defensive_boxscores['DvP'] = \
            grouped_defensive_boxscores['TEAM_DKFP_ALLOWED_P'] - \
            grouped_defensive_boxscores['TEAM_DKFP_AVG_P']
        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores,
            group_col_names=['SEASON', 'OPP_TEAM', 'NORM_POS'],
            col_name='DvP', new_col_name='AVG_DvP',
            order_idx_name='DATE', min_periods=5)
        self.regressors.append('AVG_DvP')
        data = data.merge(grouped_defensive_boxscores,
                          on=['SEASON', 'DATE', 'OPP_TEAM', 'NORM_POS'],
                          how='left')

        # vegas lines
        odds_data['TOTAL'] = odds_data['TOTAL'].replace(['PK', '-'], np.nan)
        odds_data['POINTSPREAD'] = odds_data['POINTSPREAD'].replace(
            ['PK', '-'], 0)
        full_game_odds = odds_data.loc[odds_data['PERIOD'] == 'Full Game']
        data = data.merge(full_game_odds, on=['DATE', 'TEAM'], how='left')
        self.regressors.extend(['TOTAL', 'POINTSPREAD'])

        # slate info
        self.regressors.append('GAMECOUNT')

        slates = contest_data.loc[
            contest_data['SITE'] == self.site,
            ['DATE', 'SLATEID', 'TEAMS']].drop_duplicates()
        slates['TEAMS'] = slates['TEAMS'].apply(lambda x: x.split('_'))
        slates = slates.explode('TEAMS').rename(columns={"TEAMS": "TEAM"})
        slates['TEAM'] = slates['TEAM'].apply(
            lambda x: DB_TEAM_TO_NBA_TEAM.get(x, x))

        slate_players = data[[
            'DATE', 'TEAM', 'NAME', 'DFS_POSITIONS', 'SALARY', 'VALUE'
        ]].merge(slates, on=['DATE', 'TEAM'], how='inner')
        slate_players['SALARY_BIN'] = pd.cut(
            slate_players['SALARY'],
            bins=list(range(3000, 15000, 1000)),
            duplicates='drop',
            include_lowest=True)
        # One row per (player, eligible position).
        slate_players = slate_players.explode('DFS_POSITIONS').rename(
            columns={'DFS_POSITIONS': 'SINGLE_DFS_POSITION'})

        MIN_VALUE = 0.002

        def count_above_min(group):
            # Number of names in the group whose VALUE exceeds MIN_VALUE.
            return group.loc[group['VALUE'] > MIN_VALUE, 'NAME'].count()

        all_temp = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION']).apply(
                lambda x: pd.Series({'L1P_COUNT': count_above_min(x)})
            ).reset_index().dropna()
        slate_players = slate_players.merge(
            all_temp, on=['SLATEID', 'SINGLE_DFS_POSITION'], how='left')

        # NOTE: unlike the other counts, this one is not filtered by
        # MIN_VALUE (kept as in the original implementation).
        sb_temp = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION', 'SALARY_BIN']).apply(
                lambda x: pd.Series({'L1P_SB_COUNT': x['NAME'].count()})
            ).reset_index().dropna()
        slate_players = slate_players.merge(
            sb_temp,
            on=['SLATEID', 'SINGLE_DFS_POSITION', 'SALARY_BIN'],
            how='left')

        L1_TO_L2 = {'PG': 'G', 'SG': 'G', 'SF': 'F', 'PF': 'F', 'C': 'C'}
        slate_players['LEVEL2_DFS_POSITION'] = slate_players[
            'SINGLE_DFS_POSITION'].apply(
                lambda x: L1_TO_L2[x] if isinstance(x, str) else np.nan)

        all_temp = slate_players.groupby(
            ['SLATEID', 'LEVEL2_DFS_POSITION']).apply(
                lambda x: pd.Series({'L2P_COUNT': count_above_min(x)})
            ).reset_index().dropna()
        slate_players = slate_players.merge(
            all_temp, on=['SLATEID', 'LEVEL2_DFS_POSITION'], how='left')

        all_temp = slate_players.groupby(['SLATEID']).apply(
            lambda x: pd.Series({'L3P_COUNT': count_above_min(x)})
        ).reset_index().dropna()
        slate_players = slate_players.merge(all_temp, on=['SLATEID'],
                                            how='left')

        sb_temp = slate_players.groupby(['SLATEID', 'SALARY_BIN']).apply(
            lambda x: pd.Series({'L3P_SB_COUNT': count_above_min(x)})
        ).reset_index().dropna()
        slate_players = slate_players.merge(sb_temp,
                                            on=['SLATEID', 'SALARY_BIN'],
                                            how='left')

        slate_players['SALARY_FLOOR'] = slate_players['SALARY_BIN'].apply(
            lambda x: x.left)

        # Rank of each player's VALUE (1 = highest) within progressively
        # wider peer groups.
        slate_players['L1P_RANK'] = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION'])['VALUE'].rank(
                method='min', ascending=False)
        slate_players['L1P_SB_RANK'] = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION', 'SALARY_FLOOR'])['VALUE'].rank(
                method='min', ascending=False)
        slate_players['L3P_RANK'] = slate_players.groupby(
            ['SLATEID'])['VALUE'].rank(method='min', ascending=False)
        slate_players['L3P_SB_RANK'] = slate_players.groupby(
            ['SLATEID', 'SALARY_FLOOR'])['VALUE'].rank(
                method='min', ascending=False)

        # Collapse the per-position rows back to one row per player and
        # slate by averaging; this list is also the regressor order.
        slate_feature_cols = [
            'L1P_COUNT', 'L1P_RANK', 'L1P_SB_COUNT', 'L1P_SB_RANK',
            'L2P_COUNT', 'L3P_COUNT', 'L3P_RANK', 'L3P_SB_COUNT',
            'L3P_SB_RANK'
        ]
        slate_data = slate_players.groupby(
            ['DATE', 'SLATEID', 'NAME']).apply(
                lambda x: pd.Series(
                    {col: x[col].mean() for col in slate_feature_cols})
            ).reset_index()
        self.regressors.extend(slate_feature_cols)

        data['GP'] = 1
        data = self.feature_creation.expanding_sum(
            df=data, group_col_names=['SEASON', 'PLAYERID'],
            col_name='GP', new_col_name='COUNT_GP')
        self.regressors.append('COUNT_GP')

        data = self.preprocess(data, slate_data, ownership_data)

        # Split back into train / test. A membership mask selects the same
        # rows as intersecting sets of index tuples did, but keeps the
        # frame's own row order instead of depending on set iteration order.
        data = data.set_index(['GAMEID', 'PLAYERID'])
        self.train_data = data.loc[data.index.isin(train_index)].reset_index()
        self.test_data = data.loc[data.index.isin(test_index)].reset_index()

        self.created_features = True

    def preprocess(self, data, slate_data, ownership_data):
        """Join the targets and slate features onto ``data`` and fill gaps.

        Args:
            data: player-game feature frame built by ``create_features``.
            slate_data: per (DATE, SLATEID, NAME) slate features.
            ownership_data: per-slate ownership rows (the target).

        Returns:
            One row per ownership record that matched a slate row and a
            feature row, with missing lag/rolling values replaced by
            fallbacks and rows without ``AVG_OWNERSHIP`` removed.
        """
        ownership_data = ownership_data.merge(
            slate_data, on=['DATE', 'SLATEID', 'NAME'], how='inner')
        data = ownership_data.merge(data, on=['DATE', 'NAME'], how='inner')

        # Missing lag / rolling values fall back to the matching average.
        data['L1_DKFP'] = data['L1_DKFP'].fillna(data['AVG_DKFP'])
        data['MA5_DKFP'] = data['MA5_DKFP'].fillna(data['AVG_DKFP'])
        data['SALARY_CHANGE'] = data['SALARY_CHANGE'].fillna(0)
        data['STD_DKFP'] = data['STD_DKFP'].fillna(
            DEFAULT_STD * data['AVG_DKFP'])
        data['L1_OWNERSHIP'] = data['L1_OWNERSHIP'].fillna(
            data['AVG_OWNERSHIP'])
        data['MA5_OWNERSHIP'] = data['MA5_OWNERSHIP'].fillna(
            data['AVG_OWNERSHIP'])
        data['AVG_DvP'] = data['AVG_DvP'].fillna(0)
        data['TOTAL'] = data['TOTAL'].fillna(data['TOTAL'].mean())
        data['POINTSPREAD'] = data['POINTSPREAD'].fillna(0)
        data['L1P_SB_COUNT'] = data['L1P_SB_COUNT'].fillna(0)
        data['L3P_SB_COUNT'] = data['L3P_SB_COUNT'].fillna(0)

        # we can predict Y for a player as long as AVG_Y is not nan
        data = data.dropna(subset=['AVG_OWNERSHIP'])
        return data

    def train_model(self):
        """Fit the model on ``train_data``.

        Raises:
            Exception: if ``create_features`` has not been run.
        """
        if not self.created_features:
            raise Exception('Must create features before training model')

        X = self.train_data[self.regressors]
        y = self.train_data[self.regressand]
        self.model.fit(X, y, test_size=0.25, early_stopping_rounds=25)

        self.trained_model = True

    def predict(self):
        """Write predictions onto ``test_data`` and return them.

        Returns:
            A tuple ``(frame, column_name)`` where ``frame`` holds ``DATE``,
            ``SLATEID``, ``NAME`` and the prediction column, and
            ``column_name`` is ``'<regressand>_HAT'``.

        Raises:
            Exception: if ``train_model`` has not been run.
        """
        if not self.trained_model:
            raise Exception('Must train model before generating predictions')

        output_column = '{}_HAT'.format(self.regressand)
        self.test_data[output_column] = self.model.predict(
            self.test_data[self.regressors])
        return self.test_data[['DATE', 'SLATEID', 'NAME',
                               output_column]], output_column