class CleanData(object):
    """Row filters and name normalisation helpers for player box score frames.

    The row filters expect a DataFrame with a ``SECONDSPLAYED`` column and,
    where noted, a ``COMMENT`` column.
    """

    # Comment value that marks a healthy player who was not used.
    _COACH_DECISION = "DNP - Coach's Decision"

    def __init__(self):
        self.feature_creation = FeatureCreation()

    def drop_rows_player_inactive(self, df):
        """Return the rows of ``df`` whose ``SECONDSPLAYED`` is greater than 0."""
        return df.loc[df['SECONDSPLAYED'] > 0]

    def drop_rows_player_injured(self, df):
        """Return rows that were played or were a coach's-decision DNP.

        A row is kept when ``SECONDSPLAYED`` is non-zero or when ``COMMENT``
        equals "DNP - Coach's Decision"; every other zero-second row is
        dropped.
        """
        played = df['SECONDSPLAYED'] != 0
        coach_decision = df['COMMENT'] == self._COACH_DECISION
        return df.loc[played | coach_decision]

    def drop_rows_player_rest(self, df, thresh=1200):
        """Drop coach's-decision DNP rows of players with a high ``AVG_SP``.

        ``AVG_SP`` is computed per (SEASON, PLAYERID) by
        ``FeatureCreation.expanding_mean`` over ``SECONDSPLAYED`` and is
        removed again before returning.

        Args:
            df: box score frame with SEASON, PLAYERID, SECONDSPLAYED and
                COMMENT columns.
            thresh: rows with ``AVG_SP`` above this value and a coach's
                decision comment are dropped.

        Returns:
            The filtered frame without the temporary ``AVG_SP`` column.
        """
        df = self.feature_creation.expanding_mean(
            df=df,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='SECONDSPLAYED',
            new_col_name='AVG_SP',
        )
        resting = (df['AVG_SP'] > thresh) & (df['COMMENT'] == self._COACH_DECISION)
        df = df.loc[~resting]
        return df.drop(columns=['AVG_SP'])

    @staticmethod
    def _flip_roto_name(name):
        """Turn ``"Last, First"`` into ``"First Last"``.

        The text after the last comma is moved to the front. A name that has
        no comma is returned unchanged; surrounding whitespace is stripped so
        such a name does not pick up a trailing space from the join.
        """
        parts = name.split(',')
        flipped = "{} {}".format(parts[-1].lstrip(), ' '.join(parts[:-1]))
        return flipped.strip()

    def roto_name_to_nba_name(self, name):
        """Convert a ``"Last, First"`` name and apply the known alias table.

        Returns the ``ROTO_NAME_TO_NBA_NAME`` entry for the reordered name
        when one exists, otherwise the reordered name itself.
        """
        flipped = CleanData._flip_roto_name(name)
        return ROTO_NAME_TO_NBA_NAME.get(flipped, flipped)
class RPSModel(object):
    """Regression model for ``RPS`` (``REB / SECONDSPLAYED``).

    Workflow: ``create_features`` -> ``train_model`` -> ``predict``.
    ``train_data`` and ``test_data`` are box score frames; after
    ``create_features`` they are replaced by their feature-enriched versions.
    """

    def __init__(self, train_data, test_data):
        self.feature_creation = FeatureCreation()
        self.train_data = train_data
        self.test_data = test_data
        self.model = CatBoostRegressionModel(RPS_MODEL_PARAMS)
        self.regressors = []
        self.regressand = 'RPS'
        # Name of the sample-weight column written by generate_weights.
        self.weight = 'WEIGHT'
        self.created_features = False
        self.generated_weights = False
        self.trained_model = False

    @staticmethod
    def _split_train_test(data, train_index, test_index):
        """Split ``data`` back into train and test frames by (GAMEID, PLAYERID).

        Rows keep the order they have in ``data``. Selecting through a
        ``set`` of keys would make the row order depend on hash order and
        therefore differ between runs.

        Args:
            data: frame with ``GAMEID`` and ``PLAYERID`` columns.
            train_index: MultiIndex of (GAMEID, PLAYERID) keys of train rows.
            test_index: MultiIndex of (GAMEID, PLAYERID) keys of test rows.

        Returns:
            ``(train, test)`` frames with ``GAMEID``/``PLAYERID`` restored as
            the leading columns.
        """
        data = data.set_index(['GAMEID', 'PLAYERID'])
        train = data.loc[data.index.isin(train_index)].reset_index()
        test = data.loc[data.index.isin(test_index)].reset_index()
        return train, test

    def create_features(self, odds_data, sp_threshold=60):
        """Build every regressor column and re-split into train/test.

        Train and test rows are concatenated so history-based features are
        computed over the combined frame, then split again by
        (GAMEID, PLAYERID). Each feature name is appended to
        ``self.regressors``.

        Args:
            odds_data: frame with at least PERIOD, TOTAL, DATE and TEAM
                columns; only rows with ``PERIOD == 'Full Game'`` are used.
            sp_threshold: games with ``SECONDSPLAYED`` at or below this value
                get NaN in the ``CLEAN_*`` rate columns.

        NOTE(review): the exact semantics of the ``FeatureCreation`` helpers
        (e.g. whether the current game is excluded from expanding/rolling
        statistics) are not visible here - confirm before relying on them.
        """
        data = pd.concat([self.train_data, self.test_data])
        data['REB'] = data['DREB'] + data['OREB']
        data[self.regressand] = data['REB'] / data['SECONDSPLAYED']
        data['ORPS'] = data['OREB'] / data['SECONDSPLAYED']
        data['DRPS'] = data['DREB'] / data['SECONDSPLAYED']

        # Rates from very short appearances are blanked out in CLEAN_* columns.
        short_game = data['SECONDSPLAYED'] <= sp_threshold
        data['CLEAN_DRPS'] = data['DRPS']
        data.loc[short_game, 'CLEAN_DRPS'] = np.nan
        data['CLEAN_ORPS'] = data['ORPS']
        data.loc[short_game, 'CLEAN_ORPS'] = np.nan

        # Remember which keys belong to which split before rows are merged.
        train_index = self.train_data.set_index(['GAMEID', 'PLAYERID']).index
        test_index = self.test_data.set_index(['GAMEID', 'PLAYERID']).index

        # season averages
        data = self.feature_creation.expanding_weighted_mean(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='DRPS',
            weight_col_name='SECONDSPLAYED',
            new_col_name='AVG_DRPS'
        )
        data = self.feature_creation.expanding_weighted_mean(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='ORPS',
            weight_col_name='SECONDSPLAYED',
            new_col_name='AVG_ORPS'
        )
        self.regressors.append('AVG_DRPS')
        self.regressors.append('AVG_ORPS')

        data = self.feature_creation.expanding_weighted_mean(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'OPP_TEAM', 'PLAYERID'],
            col_name=self.regressand,
            weight_col_name='SECONDSPLAYED',
            new_col_name='AVG_Y_OPP_TEAM'
        )
        self.regressors.append('AVG_Y_OPP_TEAM')

        # 1 game lags
        data = self.feature_creation.lag(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='CLEAN_DRPS',
            new_col_name='L1_DRPS',
            n_shift=1
        )
        self.regressors.append('L1_DRPS')

        # exponentially weighted means
        data = self.feature_creation.expanding_ewm(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='DRPS',
            new_col_name='EWM_DRPS',
            alpha=0.90
        )
        data = self.feature_creation.expanding_ewm(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='ORPS',
            new_col_name='EWM_ORPS',
            alpha=0.90
        )
        self.regressors.append('EWM_DRPS')
        self.regressors.append('EWM_ORPS')

        # moving averages
        data = self.feature_creation.rolling_weighted_mean(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='DRPS',
            new_col_name='MA2_DRPS',
            weight_col_name='SECONDSPLAYED',
            n_rolling=2,
            min_periods=1
        )
        data = self.feature_creation.rolling_weighted_mean(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='DRPS',
            new_col_name='MA15_DRPS',
            weight_col_name='SECONDSPLAYED',
            n_rolling=15,
            min_periods=8
        )
        data = self.feature_creation.rolling_weighted_mean(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='ORPS',
            new_col_name='MA6_ORPS',
            weight_col_name='SECONDSPLAYED',
            n_rolling=6,
            min_periods=3
        )
        data = self.feature_creation.rolling_weighted_mean(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='ORPS',
            new_col_name='MA18_ORPS',
            weight_col_name='SECONDSPLAYED',
            n_rolling=18,
            min_periods=9
        )
        self.regressors.append('MA2_DRPS')
        self.regressors.append('MA15_DRPS')
        self.regressors.append('MA6_ORPS')
        self.regressors.append('MA18_ORPS')

        # start
        data = self.feature_creation.expanding_weighted_mean(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID', 'START'],
            col_name=self.regressand,
            weight_col_name='SECONDSPLAYED',
            new_col_name='AVG_Y_R'
        )
        self.regressors.append('AVG_Y_R')

        # position: keep only the part before the first '-'
        data['NORM_POS'] = data['POSITION'].apply(
            lambda x: x if '-' not in x else x.split('-')[0])
        data['GUARD'] = 0
        data.loc[data['NORM_POS'] == 'Guard', 'GUARD'] = 1
        self.regressors.append('GUARD')

        # defense
        data = self.feature_creation.expanding_mean(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='DREB',
            new_col_name='AVG_DREB'
        )
        data = self.feature_creation.expanding_mean(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='OREB',
            new_col_name='AVG_OREB'
        )
        data = self.feature_creation.expanding_mean(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='SECONDSPLAYED',
            new_col_name='AVG_SP'
        )

        # Defensive rebound rate allowed by each opponent versus what the
        # same players average, per game.
        temp = data.dropna(subset=['DREB', 'SECONDSPLAYED', 'AVG_DREB', 'AVG_SP'])
        grouped_defensive_boxscores = temp.groupby(['SEASON', 'DATE', 'OPP_TEAM']).apply(
            lambda x: pd.Series({
                'TEAM_DRPS_ALLOWED': x['DREB'].sum() / x['SECONDSPLAYED'].sum(),
                'TEAM_DRPS_AVG': x['AVG_DREB'].sum() / x['AVG_SP'].sum()
            })
        ).reset_index()
        grouped_defensive_boxscores['TEAM_DRPS_DIFF_ALLOWED'] = (
            grouped_defensive_boxscores['TEAM_DRPS_ALLOWED']
            - grouped_defensive_boxscores['TEAM_DRPS_AVG']
        )
        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores,
            group_col_names=['SEASON', 'OPP_TEAM'],
            col_name='TEAM_DRPS_DIFF_ALLOWED',
            new_col_name='AVG_TEAM_DRPS_DIFF_ALLOWED',
            order_idx_name='DATE',
            min_periods=5
        )
        data = data.merge(
            grouped_defensive_boxscores, on=['SEASON', 'DATE', 'OPP_TEAM'], how='left')
        self.regressors.append('AVG_TEAM_DRPS_DIFF_ALLOWED')

        # Same comparison, additionally split by the START flag.
        temp = data.dropna(subset=['DREB', 'SECONDSPLAYED', 'AVG_DREB', 'AVG_SP'])
        grouped_defensive_boxscores = temp.groupby(
            ['SEASON', 'DATE', 'START', 'OPP_TEAM']
        ).apply(
            lambda x: pd.Series({
                'TEAM_DRPS_ALLOWED_R': x['DREB'].sum() / x['SECONDSPLAYED'].sum(),
                'TEAM_DRPS_AVG_R': x['AVG_DREB'].sum() / x['AVG_SP'].sum()
            })
        ).reset_index()
        grouped_defensive_boxscores['TEAM_DRPS_DIFF_ALLOWED_R'] = (
            grouped_defensive_boxscores['TEAM_DRPS_ALLOWED_R']
            - grouped_defensive_boxscores['TEAM_DRPS_AVG_R']
        )
        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores,
            group_col_names=['SEASON', 'START', 'OPP_TEAM'],
            col_name='TEAM_DRPS_DIFF_ALLOWED_R',
            new_col_name='AVG_TEAM_DRPS_DIFF_ALLOWED_R',
            order_idx_name='DATE',
            min_periods=5
        )
        data = data.merge(
            grouped_defensive_boxscores,
            on=['SEASON', 'DATE', 'START', 'OPP_TEAM'],
            how='left')
        self.regressors.append('AVG_TEAM_DRPS_DIFF_ALLOWED_R')

        # Same comparison split by normalised position, for both DREB and OREB.
        temp = data.dropna(
            subset=['DREB', 'OREB', 'SECONDSPLAYED', 'AVG_DREB', 'AVG_OREB', 'AVG_SP'])
        grouped_defensive_boxscores = temp.groupby(
            ['SEASON', 'DATE', 'NORM_POS', 'OPP_TEAM']
        ).apply(
            lambda x: pd.Series({
                'TEAM_DRPS_ALLOWED_P': x['DREB'].sum() / x['SECONDSPLAYED'].sum(),
                'TEAM_DRPS_AVG_P': x['AVG_DREB'].sum() / x['AVG_SP'].sum(),
                'TEAM_ORPS_ALLOWED_P': x['OREB'].sum() / x['SECONDSPLAYED'].sum(),
                'TEAM_ORPS_AVG_P': x['AVG_OREB'].sum() / x['AVG_SP'].sum()
            })
        ).reset_index()
        grouped_defensive_boxscores['TEAM_DRPS_DIFF_ALLOWED_P'] = (
            grouped_defensive_boxscores['TEAM_DRPS_ALLOWED_P']
            - grouped_defensive_boxscores['TEAM_DRPS_AVG_P']
        )
        grouped_defensive_boxscores['TEAM_ORPS_DIFF_ALLOWED_P'] = (
            grouped_defensive_boxscores['TEAM_ORPS_ALLOWED_P']
            - grouped_defensive_boxscores['TEAM_ORPS_AVG_P']
        )
        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores,
            group_col_names=['SEASON', 'NORM_POS', 'OPP_TEAM'],
            col_name='TEAM_DRPS_DIFF_ALLOWED_P',
            new_col_name='AVG_TEAM_DRPS_DIFF_ALLOWED_P',
            order_idx_name='DATE',
            min_periods=5
        )
        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores,
            group_col_names=['SEASON', 'NORM_POS', 'OPP_TEAM'],
            col_name='TEAM_ORPS_DIFF_ALLOWED_P',
            new_col_name='AVG_TEAM_ORPS_DIFF_ALLOWED_P',
            order_idx_name='DATE',
            min_periods=5
        )
        data = data.merge(
            grouped_defensive_boxscores,
            on=['SEASON', 'DATE', 'NORM_POS', 'OPP_TEAM'],
            how='left')
        self.regressors.append('AVG_TEAM_DRPS_DIFF_ALLOWED_P')
        self.regressors.append('AVG_TEAM_ORPS_DIFF_ALLOWED_P')

        # total
        # Explicit copy: assigning into a .loc slice triggers
        # SettingWithCopyWarning and obscures which frame is modified.
        full_game_odds = odds_data.loc[odds_data['PERIOD'] == 'Full Game'].copy()
        full_game_odds['TOTAL'] = full_game_odds['TOTAL'].replace(['PK', '-'], np.nan)
        data = data.merge(full_game_odds, on=['DATE', 'TEAM'], how='left')
        self.regressors.append('TOTAL')

        # injuries
        data = self.feature_creation.expanding_mean(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name='REB',
            new_col_name='AVG_REB'
        )
        temp = data.dropna(subset=['DREB', 'AVG_DREB', 'SECONDSPLAYED', 'AVG_SP'])
        # NOTE: the grouped frame deliberately keeps (SEASON, DATE, TEAM) as
        # its index; the helper call and the merge below refer to those
        # levels by name.
        temp = temp.groupby(['SEASON', 'DATE', 'TEAM']).apply(
            lambda x: pd.Series({
                'TEAM_ACTIVE_AVG_DRPS': x['AVG_DREB'].sum() / x['AVG_SP'].sum(),
                'TEAM_DRPS': x['DREB'].sum() / x['SECONDSPLAYED'].sum(),
                'TEAM_ACTIVE_AVG_RPS': x['AVG_REB'].sum() / x['AVG_SP'].sum(),
                'TEAM_RPS': x['REB'].sum() / x['SECONDSPLAYED'].sum()
            })
        )
        temp = self.feature_creation.expanding_mean(
            df=temp,
            group_col_names=['SEASON', 'TEAM'],
            col_name='TEAM_DRPS',
            new_col_name='AVG_TEAM_DRPS'
        )
        temp['TEAM_ACTIVE_AVG_DRPS_DIFF'] = (
            temp['TEAM_ACTIVE_AVG_DRPS'] - temp['AVG_TEAM_DRPS'])
        data = data.merge(temp, on=['DATE', 'TEAM'], how='left')
        self.regressors.append('TEAM_ACTIVE_AVG_DRPS_DIFF')

        # regressand by lineup
        # Object dtype from the start: these columns receive strings below,
        # and writing strings into a float column is an incompatible-dtype
        # assignment that newer pandas versions reject.
        data['START_LINEUP'] = np.full(len(data), np.nan, dtype=object)
        data['STARS'] = np.full(len(data), np.nan, dtype=object)
        data = data.set_index(['GAMEID', 'TEAM'])
        # NOTE(review): one .loc write per (game, team) is slow on large
        # frames; kept as-is to preserve the exact row/column layout.
        for (game_id, team), temp in data.groupby(['GAMEID', 'TEAM']):
            # '_'.join requires PLAYERID values to be strings.
            start_lineup = sorted(temp.loc[temp['START'] == 1, 'PLAYERID'].values)
            data.loc[(game_id, team), 'START_LINEUP'] = '_'.join(start_lineup)

            stars = sorted(temp.loc[temp['AVG_DREB'] >= 7, 'PLAYERID'].values)
            data.loc[(game_id, team), 'STARS'] = '_'.join(stars)
        data = data.reset_index()

        data = self.feature_creation.expanding_weighted_mean(
            df=data,
            group_col_names=['SEASON', 'START_LINEUP', 'PLAYERID'],
            col_name=self.regressand,
            weight_col_name='SECONDSPLAYED',
            new_col_name='AVG_Y_STARTERS'
        )
        data = self.feature_creation.expanding_weighted_mean(
            df=data,
            group_col_names=['SEASON', 'STARS', 'PLAYERID'],
            col_name=self.regressand,
            weight_col_name='SECONDSPLAYED',
            new_col_name='AVG_Y_STARS'
        )
        self.regressors.append('AVG_Y_STARTERS')
        self.regressors.append('AVG_Y_STARS')

        # misc
        data['GP'] = 1
        data = self.feature_creation.expanding_sum(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='GP',
            new_col_name='COUNT_GP'
        )
        self.regressors.append('COUNT_GP')
        self.regressors.append('AVG_SP')

        # to fill
        data = self.feature_creation.expanding_weighted_mean(
            df=data,
            group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
            col_name=self.regressand,
            weight_col_name='SECONDSPLAYED',
            new_col_name='AVG_Y'
        )

        data = self.generate_weights(data)
        data = self.preprocess(data)

        self.train_data, self.test_data = self._split_train_test(
            data, train_index, test_index)
        self.created_features = True

    def preprocess(self, data):
        """Fill missing feature values and drop rows without ``AVG_Y``.

        Missing values are first filled from a fallback column, then from a
        constant. The fallback order matters: ``MA15_DRPS`` falls back to the
        already filled ``MA2_DRPS``, ``MA18_ORPS`` to ``MA6_ORPS`` and
        ``AVG_Y_STARTERS`` to ``AVG_Y_STARS``.

        Args:
            data: frame holding every feature column; its columns are
                updated in place.

        Returns:
            The frame restricted to rows with a non-null ``AVG_Y``.
        """
        column_fallbacks = (
            ('AVG_Y_R', 'AVG_Y'),
            ('AVG_Y_OPP_TEAM', 'AVG_Y'),
            ('L1_DRPS', 'AVG_DRPS'),
            ('EWM_DRPS', 'AVG_DRPS'),
            ('EWM_ORPS', 'AVG_ORPS'),
            ('MA2_DRPS', 'AVG_DRPS'),
            ('MA15_DRPS', 'MA2_DRPS'),
            ('MA6_ORPS', 'AVG_ORPS'),
            ('MA18_ORPS', 'MA6_ORPS'),
            ('AVG_Y_STARS', 'AVG_Y'),
            ('AVG_Y_STARTERS', 'AVG_Y_STARS'),
        )
        constant_fallbacks = (
            ('AVG_TEAM_DRPS_DIFF_ALLOWED', 0),
            ('AVG_TEAM_DRPS_DIFF_ALLOWED_R', 0),
            ('AVG_TEAM_DRPS_DIFF_ALLOWED_P', 0),
            ('AVG_TEAM_ORPS_DIFF_ALLOWED_P', 0),
            ('TOTAL', 200),
            ('TEAM_ACTIVE_AVG_DRPS_DIFF', 0),
            ('COUNT_GP', 0),
        )
        for target, source in column_fallbacks:
            data[target] = data[target].fillna(data[source])
        for target, value in constant_fallbacks:
            data[target] = data[target].fillna(value)

        # we can predict Y for a player as long as AVG_Y is not nan
        return data.dropna(subset=['AVG_Y'])

    def generate_weights(self, data):
        """Add the sample-weight column named by ``self.weight``.

        The weight is the product of a per-game weight of ``SECONDSPLAYED``
        and a weight of ``SUM_SP``, the ``expanding_sum`` of
        ``SECONDSPLAYED`` per (SEASON, PLAYERID); both weights come from
        ``WeightFunctions``.
        """
        data = self.feature_creation.expanding_sum(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='SECONDSPLAYED',
            new_col_name='SUM_SP'
        )
        self.weight = 'WEIGHT'
        game_weight = data['SECONDSPLAYED'].apply(
            WeightFunctions.game_seconds_played_weight)
        season_weight = data['SUM_SP'].apply(
            WeightFunctions.season_seconds_played_weight)
        data[self.weight] = game_weight * season_weight
        self.generated_weights = True
        return data

    def train_model(self):
        """Fit the model on train rows with more than 60 seconds played.

        Raises:
            Exception: if ``create_features`` has not been run.
        """
        if not self.created_features:
            raise Exception('Must create features before training model')

        # drop games in which players played a minute or less
        self.train_data = self.train_data.loc[self.train_data['SECONDSPLAYED'] > 60]
        X = self.train_data[self.regressors]
        y = self.train_data[self.regressand]
        w = self.train_data[self.weight]
        self.model.fit(X, y, sample_weight=w, test_size=0.25, early_stopping_rounds=25)
        self.trained_model = True

    def predict(self):
        """Write ``RPS_HAT`` into ``test_data`` and return it with the keys.

        Returns:
            Frame with ``GAMEID``, ``PLAYERID`` and the prediction column.

        Raises:
            Exception: if ``train_model`` has not been run.
        """
        if not self.trained_model:
            raise Exception('Must train model before generating predictions')

        output_column = '{}_HAT'.format(self.regressand)
        self.test_data[output_column] = self.model.predict(
            self.test_data[self.regressors])
        return self.test_data[['GAMEID', 'PLAYERID', output_column]]
class OwnershipModel(object):
    """Regression model for contest ``OWNERSHIP`` of players on one site.

    Workflow: ``create_features`` -> ``train_model`` -> ``predict``.
    After ``create_features`` the train/test frames hold one row per
    ownership record joined with that player's game features.
    """

    def __init__(self, train_data, test_data, site):
        self.feature_creation = FeatureCreation()
        self.clean_data = CleanData()
        self.train_data = train_data
        self.test_data = test_data
        self.site = site
        self.model = XGBoostRegressionModel(OWNERSHIP_MODEL_PARAMS)
        self.regressors = []
        self.regressand = 'OWNERSHIP'
        self.created_features = False
        self.trained_model = False

    @staticmethod
    def _split_train_test(data, train_index, test_index):
        """Split ``data`` back into train and test frames by (GAMEID, PLAYERID).

        Rows keep the order they have in ``data``, and a key that occurs on
        several rows keeps all of them. Selecting through a ``set`` of keys
        would make the row order depend on hash order and therefore differ
        between runs.

        Args:
            data: frame with ``GAMEID`` and ``PLAYERID`` columns.
            train_index: MultiIndex of (GAMEID, PLAYERID) keys of train rows.
            test_index: MultiIndex of (GAMEID, PLAYERID) keys of test rows.

        Returns:
            ``(train, test)`` frames with ``GAMEID``/``PLAYERID`` restored as
            the leading columns.
        """
        data = data.set_index(['GAMEID', 'PLAYERID'])
        train = data.loc[data.index.isin(train_index)].reset_index()
        test = data.loc[data.index.isin(test_index)].reset_index()
        return train, test

    def create_features(self, salary_data, contest_data, ownership_data, odds_data):
        """Build every regressor column and re-split into train/test.

        Train and test rows are concatenated, joined with salaries,
        historical ownership, odds and slate information, and split again by
        (GAMEID, PLAYERID). Each feature name is appended to
        ``self.regressors``. The input frames are not modified.

        Args:
            salary_data: frame with SITE, DATE, NAME, SALARY and DFS_POSITION.
            contest_data: frame with SITE, DATE, SLATEID, CONTESTNAME, TEAMS
                plus the GAMECOUNT and TOTALENTRIES columns used after the
                join with ``ownership_data``.
            ownership_data: frame with PLAYERNAME, SLATEID, CONTESTNAME and
                OWNERSHIP.
            odds_data: frame with PERIOD, TOTAL, POINTSPREAD, DATE and TEAM.

        NOTE(review): the exact semantics of the ``FeatureCreation`` helpers
        are not visible here - confirm before relying on them.
        """
        data = pd.concat([self.train_data, self.test_data])

        # Remember which keys belong to which split before rows are merged.
        train_index = self.train_data.set_index(['GAMEID', 'PLAYERID']).index
        test_index = self.test_data.set_index(['GAMEID', 'PLAYERID']).index

        salary_data = salary_data.loc[salary_data['SITE'] == self.site]
        data = data.merge(salary_data, on=['DATE', 'NAME'], how='inner')

        # player stat features
        fp_calculator = FPCalculator(self.site)
        data['REB'] = data['DREB'] + data['OREB']
        data['DKFP'] = data.apply(
            lambda x: fp_calculator.calculate_fantasy_points(
                x['SEASON'], x['PTS'], x['REB'], x['AST'], x['TOV'],
                x['BLK'], x['STL'], x['FG3M']),
            axis=1)

        data = self.feature_creation.expanding_mean(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP',
            new_col_name='AVG_DKFP')
        self.regressors.append('AVG_DKFP')

        data['VALUE'] = data['AVG_DKFP'] / data['SALARY']
        self.regressors.append('VALUE')

        data = self.feature_creation.lag(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP',
            new_col_name='L1_DKFP',
            n_shift=1)
        self.regressors.append('L1_DKFP')

        data = self.feature_creation.rolling_mean(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP',
            new_col_name='MA5_DKFP',
            n_rolling=5)
        self.regressors.append('MA5_DKFP')

        data = self.feature_creation.lag(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='SALARY',
            new_col_name='L1_SALARY',
            n_shift=1)
        data['SALARY_CHANGE'] = data['SALARY'] - data['L1_SALARY']
        self.regressors.append('SALARY')
        self.regressors.append('SALARY_CHANGE')

        data = self.feature_creation.expanding_standard_deviation(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP',
            new_col_name='STD_DKFP',
            min_periods=5)
        self.regressors.append('STD_DKFP')

        self.regressors.append('START')

        # DFS_POSITION is an underscore-separated list of eligible positions.
        data['DFS_POSITIONS'] = data['DFS_POSITION'].apply(
            lambda x: x.split('_') if isinstance(x, str) else np.nan)
        data['NUM_POSITIONS'] = data['DFS_POSITIONS'].apply(
            lambda x: len(x) if isinstance(x, list) else np.nan)
        self.regressors.append('NUM_POSITIONS')

        for position in ['SG', 'PG', 'C']:
            data[position] = 0
            # na=False: a missing DFS_POSITION (tolerated above) would
            # otherwise put NaN in the mask and make .loc raise.
            has_position = data['DFS_POSITION'].str.contains(position, na=False)
            data.loc[has_position, position] = 1
            self.regressors.append(position)

        # historical ownership of player
        # assign() returns a new frame so the caller's ownership_data is not
        # modified.
        ownership_data = ownership_data.assign(
            NAME=ownership_data['PLAYERNAME'].apply(
                lambda x: OWNERSHIP_NAME_TO_NBA_NAME.get(x, x)))
        ownership_data = ownership_data.merge(
            contest_data, on=['SLATEID', 'CONTESTNAME'], how='inner')
        # Entry-weighted ownership per player and slate.
        ownership_data = ownership_data.groupby(
            ['DATE', 'SLATEID', 'GAMECOUNT', 'NAME']
        ).apply(lambda x: pd.Series({
            'OWNERSHIP': (
                (x['OWNERSHIP'] * x['TOTALENTRIES']).sum()
                / x['TOTALENTRIES'].sum())
        })).reset_index()
        aggregated_ownership = ownership_data.groupby(
            ['DATE', 'NAME']
        ).apply(lambda x: pd.Series(
            {'TOTAL_OWNERSHIP': x['OWNERSHIP'].mean()}
        )).reset_index()

        data = data.merge(aggregated_ownership, on=['DATE', 'NAME'], how='inner')

        data = self.feature_creation.expanding_mean(
            df=data,
            group_col_names=['SEASON', 'NAME'],
            col_name='TOTAL_OWNERSHIP',
            new_col_name='AVG_OWNERSHIP')
        self.regressors.append('AVG_OWNERSHIP')

        data = self.feature_creation.lag(
            df=data,
            group_col_names=['SEASON', 'NAME'],
            col_name='TOTAL_OWNERSHIP',
            new_col_name='L1_OWNERSHIP',
            n_shift=1)
        self.regressors.append('L1_OWNERSHIP')

        data = self.feature_creation.rolling_mean(
            df=data,
            group_col_names=['SEASON', 'NAME'],
            col_name='TOTAL_OWNERSHIP',
            new_col_name='MA5_OWNERSHIP',
            n_rolling=5)
        self.regressors.append('MA5_OWNERSHIP')

        # defense: keep only the part of POSITION before the first '-'
        data['NORM_POS'] = data['POSITION'].apply(
            lambda x: x if '-' not in x else x.split('-')[0])

        temp = data.dropna(subset=['DKFP', 'AVG_DKFP'])
        grouped_defensive_boxscores = temp.groupby(
            ['SEASON', 'DATE', 'NORM_POS', 'OPP_TEAM']
        ).apply(lambda x: pd.Series({
            'TEAM_DKFP_ALLOWED_P': x['DKFP'].sum(),
            'TEAM_DKFP_AVG_P': x['AVG_DKFP'].sum()
        })).reset_index()

        grouped_defensive_boxscores['DvP'] = (
            grouped_defensive_boxscores['TEAM_DKFP_ALLOWED_P']
            - grouped_defensive_boxscores['TEAM_DKFP_AVG_P']
        )
        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores,
            group_col_names=['SEASON', 'OPP_TEAM', 'NORM_POS'],
            col_name='DvP',
            new_col_name='AVG_DvP',
            order_idx_name='DATE',
            min_periods=5)
        self.regressors.append('AVG_DvP')

        data = data.merge(
            grouped_defensive_boxscores,
            on=['SEASON', 'DATE', 'OPP_TEAM', 'NORM_POS'],
            how='left')

        # vegas lines
        # Work on a copy of the full-game rows so the caller's odds_data is
        # left untouched.
        full_game_odds = odds_data.loc[odds_data['PERIOD'] == 'Full Game'].copy()
        full_game_odds['TOTAL'] = full_game_odds['TOTAL'].replace(
            ['PK', '-'], np.nan)
        full_game_odds['POINTSPREAD'] = full_game_odds['POINTSPREAD'].replace(
            ['PK', '-'], 0)
        data = data.merge(full_game_odds, on=['DATE', 'TEAM'], how='left')
        self.regressors.append('TOTAL')
        self.regressors.append('POINTSPREAD')

        # slate info
        self.regressors.append('GAMECOUNT')

        # One row per (slate, team): TEAMS is an underscore-separated list.
        slates = contest_data.loc[
            contest_data['SITE'] == self.site,
            ['DATE', 'SLATEID', 'TEAMS']
        ].drop_duplicates()
        slates['TEAMS'] = slates['TEAMS'].apply(lambda x: x.split('_'))
        slates = slates.explode('TEAMS').rename(columns={"TEAMS": "TEAM"})
        slates['TEAM'] = slates['TEAM'].apply(
            lambda x: DB_TEAM_TO_NBA_TEAM.get(x, x))

        slate_players = data[
            ['DATE', 'TEAM', 'NAME', 'DFS_POSITIONS', 'SALARY', 'VALUE']
        ].merge(slates, on=['DATE', 'TEAM'], how='inner')
        slate_players['SALARY_BIN'] = pd.cut(
            slate_players['SALARY'],
            bins=list(range(3000, 15000, 1000)),
            duplicates='drop',
            include_lowest=True)

        # One row per (player, eligible position).
        slate_players = slate_players.explode('DFS_POSITIONS').rename(
            columns={'DFS_POSITIONS': 'SINGLE_DFS_POSITION'})

        # Players at or below this VALUE are left out of the *_COUNT
        # features (except L1P_SB_COUNT, which counts every player).
        MIN_VALUE = 0.002

        all_temp = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION']
        ).apply(lambda x: pd.Series(
            {'L1P_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()}
        )).reset_index().dropna()
        slate_players = slate_players.merge(
            all_temp, on=['SLATEID', 'SINGLE_DFS_POSITION'], how='left')

        sb_temp = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION', 'SALARY_BIN']
        ).apply(lambda x: pd.Series(
            {'L1P_SB_COUNT': x['NAME'].count()}
        )).reset_index().dropna()
        slate_players = slate_players.merge(
            sb_temp,
            on=['SLATEID', 'SINGLE_DFS_POSITION', 'SALARY_BIN'],
            how='left')

        # Collapse the five positions into guard / forward / center.
        L1_TO_L2 = {'PG': 'G', 'SG': 'G', 'SF': 'F', 'PF': 'F', 'C': 'C'}
        slate_players['LEVEL2_DFS_POSITION'] = slate_players[
            'SINGLE_DFS_POSITION'
        ].apply(lambda x: L1_TO_L2[x] if isinstance(x, str) else np.nan)

        all_temp = slate_players.groupby(
            ['SLATEID', 'LEVEL2_DFS_POSITION']
        ).apply(lambda x: pd.Series(
            {'L2P_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()}
        )).reset_index().dropna()
        slate_players = slate_players.merge(
            all_temp, on=['SLATEID', 'LEVEL2_DFS_POSITION'], how='left')

        all_temp = slate_players.groupby(
            ['SLATEID']
        ).apply(lambda x: pd.Series(
            {'L3P_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()}
        )).reset_index().dropna()
        slate_players = slate_players.merge(all_temp, on=['SLATEID'], how='left')

        sb_temp = slate_players.groupby(
            ['SLATEID', 'SALARY_BIN']
        ).apply(lambda x: pd.Series(
            {'L3P_SB_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()}
        )).reset_index().dropna()
        slate_players = slate_players.merge(
            sb_temp, on=['SLATEID', 'SALARY_BIN'], how='left')

        # Rank by VALUE (1 = highest) within position / salary bin / slate.
        slate_players['SALARY_FLOOR'] = slate_players['SALARY_BIN'].apply(
            lambda x: x.left)
        slate_players['L1P_RANK'] = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION']
        )['VALUE'].rank(method='min', ascending=False)
        slate_players['L1P_SB_RANK'] = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION', 'SALARY_FLOOR']
        )['VALUE'].rank(method='min', ascending=False)
        slate_players['L3P_RANK'] = slate_players.groupby(
            ['SLATEID']
        )['VALUE'].rank(method='min', ascending=False)
        slate_players['L3P_SB_RANK'] = slate_players.groupby(
            ['SLATEID', 'SALARY_FLOOR']
        )['VALUE'].rank(method='min', ascending=False)

        # Average over a player's eligible positions to get one row per
        # (date, slate, player).
        slate_data = slate_players.groupby(
            ['DATE', 'SLATEID', 'NAME']
        ).apply(lambda x: pd.Series({
            'L1P_COUNT': x['L1P_COUNT'].mean(),
            'L1P_RANK': x['L1P_RANK'].mean(),
            'L1P_SB_COUNT': x['L1P_SB_COUNT'].mean(),
            'L1P_SB_RANK': x['L1P_SB_RANK'].mean(),
            'L2P_COUNT': x['L2P_COUNT'].mean(),
            'L3P_COUNT': x['L3P_COUNT'].mean(),
            'L3P_RANK': x['L3P_RANK'].mean(),
            'L3P_SB_COUNT': x['L3P_SB_COUNT'].mean(),
            'L3P_SB_RANK': x['L3P_SB_RANK'].mean()
        })).reset_index()

        self.regressors.append('L1P_COUNT')
        self.regressors.append('L1P_RANK')
        self.regressors.append('L1P_SB_COUNT')
        self.regressors.append('L1P_SB_RANK')
        self.regressors.append('L2P_COUNT')
        self.regressors.append('L3P_COUNT')
        self.regressors.append('L3P_RANK')
        self.regressors.append('L3P_SB_COUNT')
        self.regressors.append('L3P_SB_RANK')

        data['GP'] = 1
        data = self.feature_creation.expanding_sum(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='GP',
            new_col_name='COUNT_GP')
        self.regressors.append('COUNT_GP')

        data = self.preprocess(data, slate_data, ownership_data)

        self.train_data, self.test_data = self._split_train_test(
            data, train_index, test_index)
        self.created_features = True

    def preprocess(self, data, slate_data, ownership_data):
        """Join ownership, slate and player features and fill missing values.

        Args:
            data: per-game player features keyed by DATE and NAME.
            slate_data: per (DATE, SLATEID, NAME) slate features.
            ownership_data: per-slate ownership keyed by DATE, SLATEID, NAME.

        Returns:
            The joined frame restricted to rows with a non-null
            ``AVG_OWNERSHIP``.
        """
        ownership_data = ownership_data.merge(
            slate_data, on=['DATE', 'SLATEID', 'NAME'], how='inner')
        data = ownership_data.merge(data, on=['DATE', 'NAME'], how='inner')

        data['L1_DKFP'] = data['L1_DKFP'].fillna(data['AVG_DKFP'])
        data['MA5_DKFP'] = data['MA5_DKFP'].fillna(data['AVG_DKFP'])
        data['SALARY_CHANGE'] = data['SALARY_CHANGE'].fillna(0)
        data['STD_DKFP'] = data['STD_DKFP'].fillna(
            DEFAULT_STD * data['AVG_DKFP'])
        data['L1_OWNERSHIP'] = data['L1_OWNERSHIP'].fillna(
            data['AVG_OWNERSHIP'])
        data['MA5_OWNERSHIP'] = data['MA5_OWNERSHIP'].fillna(
            data['AVG_OWNERSHIP'])
        data['AVG_DvP'] = data['AVG_DvP'].fillna(0)
        data['TOTAL'] = data['TOTAL'].fillna(data['TOTAL'].mean())
        data['POINTSPREAD'] = data['POINTSPREAD'].fillna(0)
        data['L1P_SB_COUNT'] = data['L1P_SB_COUNT'].fillna(0)
        data['L3P_SB_COUNT'] = data['L3P_SB_COUNT'].fillna(0)

        # we can predict ownership for a player as long as AVG_OWNERSHIP is
        # not nan
        return data.dropna(subset=['AVG_OWNERSHIP'])

    def train_model(self):
        """Fit the model on the feature-enriched train rows.

        Raises:
            Exception: if ``create_features`` has not been run.
        """
        if not self.created_features:
            raise Exception('Must create features before training model')

        X = self.train_data[self.regressors]
        y = self.train_data[self.regressand]
        self.model.fit(X, y, test_size=0.25, early_stopping_rounds=25)
        self.trained_model = True

    def predict(self):
        """Write ``OWNERSHIP_HAT`` into ``test_data`` and return predictions.

        Returns:
            Tuple ``(frame, output_column)`` where ``frame`` holds ``DATE``,
            ``SLATEID``, ``NAME`` and the prediction column, and
            ``output_column`` is that column's name.

        Raises:
            Exception: if ``train_model`` has not been run.
        """
        if not self.trained_model:
            raise Exception('Must train model before generating predictions')

        output_column = '{}_HAT'.format(self.regressand)
        self.test_data[output_column] = self.model.predict(
            self.test_data[self.regressors])
        return self.test_data[['DATE', 'SLATEID', 'NAME', output_column]], output_column