class VarianceModel(object):
    """Computes an expanding standard deviation of a stat as a variance proxy.

    Wraps ``FeatureCreation.expanding_standard_deviation`` over the test set,
    grouping by season, player, and starter status.
    """

    def __init__(self, test_data):
        """Store the test frame and remember its original column set.

        :param test_data: DataFrame of player rows; must contain 'SEASON',
            'PLAYERID', 'START' plus the stat column passed to predict().
        """
        self.feature_creation = FeatureCreation()
        self.test_data = test_data
        # Snapshot the columns so predict() can return a frame limited to the
        # caller's original columns plus the single new output column.
        self.original_columns = list(self.test_data.columns)

    def predict(self, y):
        """Append an expanding std-dev column 'STD_<y>' for stat column *y*.

        Note: mutates ``self.test_data`` in place (reassigns it with the new
        column), so repeated calls accumulate columns on the stored frame.

        :param y: name of the stat column to compute the deviation for.
        :returns: (frame restricted to original columns + new column,
                   name of the new column)
        """
        std_col = 'STD_{}'.format(y)
        enriched = self.feature_creation.expanding_standard_deviation(
            df=self.test_data,
            group_col_names=['SEASON', 'PLAYERID', 'START'],
            col_name=y,
            new_col_name=std_col,
            min_periods=4)
        self.test_data = enriched
        selected = self.original_columns + [std_col]
        return enriched[selected], std_col
class OwnershipModel(object):
    """Predicts DFS contest ownership percentage per player via XGBoost.

    Workflow: ``create_features`` -> ``train_model`` -> ``predict``.
    Feature construction mixes player box-score stats, salaries, historical
    ownership, opponent defense (DvP), Vegas lines, and slate composition.
    """

    def __init__(self, train_data, test_data, site):
        # train_data / test_data: player-game DataFrames — assumed to carry at
        # least GAMEID, PLAYERID, DATE, NAME, SEASON, TEAM, OPP_TEAM, POSITION
        # and box-score columns (see create_features) — TODO confirm schema.
        # site: DFS site identifier used to filter salary/contest rows.
        self.feature_creation = FeatureCreation()
        self.clean_data = CleanData()
        self.train_data = train_data
        self.test_data = test_data
        self.site = site
        self.model = XGBoostRegressionModel(OWNERSHIP_MODEL_PARAMS)
        # Feature column names, appended as each feature is created below.
        self.regressors = []
        self.regressand = 'OWNERSHIP'
        # Guard flags enforcing the create_features -> train -> predict order.
        self.created_features = False
        self.trained_model = False

    def create_features(self, salary_data, contest_data, ownership_data,
                        odds_data):
        """Build all regressor columns and split back into train/test frames.

        Mutates ``ownership_data`` and ``odds_data`` in place (adds/replaces
        columns) and replaces ``self.train_data`` / ``self.test_data``.
        """
        # Combine train+test so expanding/rolling features are computed over
        # the full history; original membership is recovered via the indices.
        data = pd.concat([self.train_data, self.test_data])
        train_index = self.train_data.set_index(['GAMEID', 'PLAYERID']).index
        test_index = self.test_data.set_index(['GAMEID', 'PLAYERID']).index
        salary_data = salary_data.loc[salary_data['SITE'] == self.site]
        # Inner merge: rows without a salary entry for this site are dropped.
        data = data.merge(salary_data, on=['DATE', 'NAME'], how='inner')
        # player stat features
        CustomFPCalculator = FPCalculator(self.site)
        data['REB'] = data['DREB'] + data['OREB']
        # Row-wise fantasy-point calculation (site-specific scoring).
        data['DKFP'] = data.apply(
            lambda x: CustomFPCalculator.calculate_fantasy_points(
                x['SEASON'], x['PTS'], x['REB'], x['AST'], x['TOV'],
                x['BLK'], x['STL'], x['FG3M']),
            axis=1)
        # Season-to-date average fantasy points per player.
        data = self.feature_creation.expanding_mean(
            df=data, group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP', new_col_name='AVG_DKFP')
        self.regressors.append('AVG_DKFP')
        # Points-per-salary-dollar value metric.
        data['VALUE'] = data['AVG_DKFP'] / data['SALARY']
        self.regressors.append('VALUE')
        # Previous game's fantasy points.
        data = self.feature_creation.lag(
            df=data, group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP', new_col_name='L1_DKFP', n_shift=1)
        self.regressors.append('L1_DKFP')
        # 5-game moving average of fantasy points.
        data = self.feature_creation.rolling_mean(
            df=data, group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP', new_col_name='MA5_DKFP', n_rolling=5)
        self.regressors.append('MA5_DKFP')
        # Salary movement since the previous game.
        data = self.feature_creation.lag(
            df=data, group_col_names=['SEASON', 'PLAYERID'],
            col_name='SALARY', new_col_name='L1_SALARY', n_shift=1)
        data['SALARY_CHANGE'] = data['SALARY'] - data['L1_SALARY']
        self.regressors.append('SALARY')
        self.regressors.append('SALARY_CHANGE')
        # Season-to-date volatility of fantasy points (needs >= 5 games).
        data = self.feature_creation.expanding_standard_deviation(
            df=data, group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP', new_col_name='STD_DKFP', min_periods=5)
        self.regressors.append('STD_DKFP')
        self.regressors.append('START')
        # DFS_POSITION is an underscore-joined multi-position string
        # (e.g. 'PG_SG'); split it into a list for later explode().
        data['DFS_POSITIONS'] = data['DFS_POSITION'].apply(
            lambda x: x.split('_') if isinstance(x, str) else np.nan)
        data['NUM_POSITIONS'] = data['DFS_POSITIONS'].apply(
            lambda x: len(x) if isinstance(x, list) else np.nan)
        self.regressors.append('NUM_POSITIONS')
        # One-hot flags for position eligibility via substring match.
        # NOTE(review): only SG/PG/C are encoded — SF/PF appear intentionally
        # omitted (perhaps to avoid collinearity), but confirm. Also
        # str.contains yields NaN for NaN positions — verify the boolean
        # indexing tolerates that on the pandas version in use.
        for position in ['SG', 'PG', 'C']:
            data[position] = 0
            data.loc[data['DFS_POSITION'].str.contains(position),
                     position] = 1
            self.regressors.append(position)
        # historical ownership of player
        # Normalize provider player names to NBA names via lookup table.
        ownership_data['NAME'] = ownership_data['PLAYERNAME'].apply(
            lambda x: x if x not in OWNERSHIP_NAME_TO_NBA_NAME
            else OWNERSHIP_NAME_TO_NBA_NAME[x])
        ownership_data = ownership_data.merge(
            contest_data, on=['SLATEID', 'CONTESTNAME'], how='inner')
        # Entry-weighted average ownership across contests in a slate.
        ownership_data = ownership_data.groupby(
            ['DATE', 'SLATEID', 'GAMECOUNT', 'NAME']).apply(
                lambda x: pd.Series({
                    'OWNERSHIP': (x['OWNERSHIP'] * x['TOTALENTRIES']).sum() /
                    x['TOTALENTRIES'].sum()
                })).reset_index()
        # Mean ownership per player per date across slates.
        aggregated_ownership = ownership_data.groupby(
            ['DATE', 'NAME']).apply(lambda x: pd.Series(
                {'TOTAL_OWNERSHIP': x['OWNERSHIP'].mean()})).reset_index()
        data = data.merge(aggregated_ownership, on=['DATE', 'NAME'],
                          how='inner')
        data = self.feature_creation.expanding_mean(
            df=data, group_col_names=['SEASON', 'NAME'],
            col_name='TOTAL_OWNERSHIP', new_col_name='AVG_OWNERSHIP')
        self.regressors.append('AVG_OWNERSHIP')
        data = self.feature_creation.lag(df=data,
                                         group_col_names=['SEASON', 'NAME'],
                                         col_name='TOTAL_OWNERSHIP',
                                         new_col_name='L1_OWNERSHIP',
                                         n_shift=1)
        self.regressors.append('L1_OWNERSHIP')
        data = self.feature_creation.rolling_mean(
            df=data, group_col_names=['SEASON', 'NAME'],
            col_name='TOTAL_OWNERSHIP', new_col_name='MA5_OWNERSHIP',
            n_rolling=5)
        self.regressors.append('MA5_OWNERSHIP')
        # defense
        # Collapse hyphenated positions (e.g. 'PG-SG') to the primary one.
        data['NORM_POS'] = data['POSITION'].apply(
            lambda x: x if '-' not in x else x.split('-')[0])
        temp = data.dropna(subset=['DKFP', 'AVG_DKFP'])
        # Per game/position: actual vs expected fantasy points allowed by
        # the opposing team.
        grouped_defensive_boxscores = temp.groupby([
            'SEASON', 'DATE', 'NORM_POS', 'OPP_TEAM'
        ]).apply(lambda x: pd.Series({
            'TEAM_DKFP_ALLOWED_P': x['DKFP'].sum(),
            'TEAM_DKFP_AVG_P': x['AVG_DKFP'].sum()
        })).reset_index()
        # DvP = points allowed above expectation ("Defense vs Position").
        grouped_defensive_boxscores['DvP'] = \
            grouped_defensive_boxscores['TEAM_DKFP_ALLOWED_P'] - \
            grouped_defensive_boxscores['TEAM_DKFP_AVG_P']
        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores,
            group_col_names=['SEASON', 'OPP_TEAM', 'NORM_POS'],
            col_name='DvP', new_col_name='AVG_DvP',
            order_idx_name='DATE', min_periods=5)
        self.regressors.append('AVG_DvP')
        data = data.merge(grouped_defensive_boxscores,
                          on=['SEASON', 'DATE', 'OPP_TEAM', 'NORM_POS'],
                          how='left')
        # vegas lines
        # 'PK' (pick'em) / '-' placeholders: totals become NaN (filled later
        # with the mean), spreads become 0.
        odds_data['TOTAL'] = odds_data['TOTAL'].replace(['PK', '-'], np.nan)
        odds_data['POINTSPREAD'] = odds_data['POINTSPREAD'].replace(
            ['PK', '-'], 0)
        full_game_odds = odds_data.loc[odds_data['PERIOD'] == 'Full Game']
        data = data.merge(full_game_odds, on=['DATE', 'TEAM'], how='left')
        self.regressors.append('TOTAL')
        self.regressors.append('POINTSPREAD')
        # slate info
        self.regressors.append('GAMECOUNT')
        # Expand each slate's underscore-joined TEAMS string into one row
        # per team, normalizing team codes to NBA codes.
        slates = contest_data.loc[
            contest_data['SITE'] == self.site,
            ['DATE', 'SLATEID', 'TEAMS']].drop_duplicates()
        slates['TEAMS'] = slates['TEAMS'].apply(lambda x: x.split('_'))
        slates = slates.explode('TEAMS').rename(columns={"TEAMS": "TEAM"})
        slates['TEAM'] = slates['TEAM'].apply(
            lambda x: x if x not in DB_TEAM_TO_NBA_TEAM
            else DB_TEAM_TO_NBA_TEAM[x])
        slate_players = data[[
            'DATE', 'TEAM', 'NAME', 'DFS_POSITIONS', 'SALARY', 'VALUE'
        ]].merge(slates, on=['DATE', 'TEAM'], how='inner')
        # NOTE(review): bins stop at 14000, so salaries >= 14000 fall outside
        # every bin and get a NaN SALARY_BIN — confirm that is intended.
        slate_players['SALARY_BIN'] = pd.cut(slate_players['SALARY'],
                                             bins=list(range(
                                                 3000, 15000, 1000)),
                                             duplicates='drop',
                                             include_lowest=True)
        # One row per (player, eligible position).
        slate_players = slate_players.explode('DFS_POSITIONS').rename(
            columns={'DFS_POSITIONS': 'SINGLE_DFS_POSITION'})
        # Minimum value (AVG_DKFP per salary dollar) for a player to count
        # as a viable alternative at a roster slot.
        MIN_VALUE = 0.002
        # L1P_* = competition at the exact position ("level 1").
        all_temp = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION']).apply(lambda x: pd.Series(
                {'L1P_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()})
            ).reset_index().dropna()
        slate_players = slate_players.merge(
            all_temp, on=['SLATEID', 'SINGLE_DFS_POSITION'], how='left')
        # *_SB_* = within the same salary bin.
        sb_temp = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION',
             'SALARY_BIN']).apply(lambda x: pd.Series(
                 {'L1P_SB_COUNT': x['NAME'].count()})).reset_index().dropna()
        slate_players = slate_players.merge(
            sb_temp, on=['SLATEID', 'SINGLE_DFS_POSITION', 'SALARY_BIN'],
            how='left')
        # L2 = coarser position groups (guard/forward/center).
        L1_TO_L2 = {'PG': 'G', 'SG': 'G', 'SF': 'F', 'PF': 'F', 'C': 'C'}
        slate_players['LEVEL2_DFS_POSITION'] = slate_players[
            'SINGLE_DFS_POSITION'].apply(lambda x: L1_TO_L2[x]
                                         if isinstance(x, str) else np.nan)
        all_temp = slate_players.groupby(
            ['SLATEID', 'LEVEL2_DFS_POSITION']).apply(lambda x: pd.Series(
                {'L2P_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()})
            ).reset_index().dropna()
        slate_players = slate_players.merge(
            all_temp, on=['SLATEID', 'LEVEL2_DFS_POSITION'], how='left')
        # L3 = whole slate (any position, e.g. utility slots).
        all_temp = slate_players.groupby(
            ['SLATEID']).apply(lambda x: pd.Series(
                {'L3P_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()})
            ).reset_index().dropna()
        slate_players = slate_players.merge(all_temp, on=['SLATEID'],
                                            how='left')
        sb_temp = slate_players.groupby([
            'SLATEID', 'SALARY_BIN'
        ]).apply(lambda x: pd.Series(
            {'L3P_SB_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()})
        ).reset_index().dropna()
        slate_players = slate_players.merge(sb_temp,
                                            on=['SLATEID', 'SALARY_BIN'],
                                            how='left')
        # Interval left edge as a scalar, usable as a groupby key.
        slate_players['SALARY_FLOOR'] = slate_players['SALARY_BIN'].apply(
            lambda x: x.left)
        # Value ranks within each competition scope (1 = best value).
        slate_players['L1P_RANK'] = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION'])['VALUE'].rank(
                method='min', ascending=False)
        slate_players['L1P_SB_RANK'] = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION',
             'SALARY_FLOOR'])['VALUE'].rank(method='min', ascending=False)
        slate_players['L3P_RANK'] = slate_players.groupby(
            ['SLATEID'])['VALUE'].rank(method='min', ascending=False)
        slate_players['L3P_SB_RANK'] = slate_players.groupby(
            ['SLATEID', 'SALARY_FLOOR'])['VALUE'].rank(method='min',
                                                       ascending=False)
        # Collapse per-position rows back to one row per player per slate
        # (mean over a player's eligible positions).
        slate_data = slate_players.groupby([
            'DATE', 'SLATEID', 'NAME'
        ]).apply(lambda x: pd.Series({
            'L1P_COUNT': x['L1P_COUNT'].mean(),
            'L1P_RANK': x['L1P_RANK'].mean(),
            'L1P_SB_COUNT': x['L1P_SB_COUNT'].mean(),
            'L1P_SB_RANK': x['L1P_SB_RANK'].mean(),
            'L2P_COUNT': x['L2P_COUNT'].mean(),
            'L3P_COUNT': x['L3P_COUNT'].mean(),
            'L3P_RANK': x['L3P_RANK'].mean(),
            'L3P_SB_COUNT': x['L3P_SB_COUNT'].mean(),
            'L3P_SB_RANK': x['L3P_SB_RANK'].mean()
        })).reset_index()
        self.regressors.append('L1P_COUNT')
        self.regressors.append('L1P_RANK')
        self.regressors.append('L1P_SB_COUNT')
        self.regressors.append('L1P_SB_RANK')
        self.regressors.append('L2P_COUNT')
        self.regressors.append('L3P_COUNT')
        self.regressors.append('L3P_RANK')
        self.regressors.append('L3P_SB_COUNT')
        self.regressors.append('L3P_SB_RANK')
        # Games played so far this season (expanding sum over a constant 1).
        data['GP'] = 1
        data = self.feature_creation.expanding_sum(
            df=data, group_col_names=['SEASON', 'PLAYERID'],
            col_name='GP', new_col_name='COUNT_GP')
        self.regressors.append('COUNT_GP')
        data = self.preprocess(data, slate_data, ownership_data)
        # Split back into train/test using the (GAMEID, PLAYERID) sets
        # captured before feature creation; rows dropped by inner merges
        # above simply fall out of the intersection.
        data = data.set_index(['GAMEID', 'PLAYERID'])
        train_index = list(
            set(data.index.values).intersection(set(train_index.values)))
        self.train_data = data.loc[train_index].reset_index()
        test_index = list(
            set(data.index.values).intersection(set(test_index.values)))
        self.test_data = data.loc[test_index].reset_index()
        self.created_features = True

    def preprocess(self, data, slate_data, ownership_data):
        """Join slate/ownership labels onto the feature frame and fill NaNs.

        Returns one row per (player, slate) with the OWNERSHIP regressand
        attached; rows with no AVG_OWNERSHIP history are dropped.
        """
        ownership_data = ownership_data.merge(slate_data,
                                              on=['DATE', 'SLATEID', 'NAME'],
                                              how='inner')
        data = ownership_data.merge(data, on=['DATE', 'NAME'], how='inner')
        # Lag/rolling features are NaN early in a season; fall back to the
        # expanding averages (or neutral constants) instead of dropping rows.
        data['L1_DKFP'] = data['L1_DKFP'].fillna(data['AVG_DKFP'])
        data['MA5_DKFP'] = data['MA5_DKFP'].fillna(data['AVG_DKFP'])
        data['SALARY_CHANGE'] = data['SALARY_CHANGE'].fillna(0)
        # DEFAULT_STD is presumably a fraction of the mean used as a prior
        # volatility estimate — confirm against its definition.
        data['STD_DKFP'] = data['STD_DKFP'].fillna(DEFAULT_STD *
                                                   data['AVG_DKFP'])
        data['L1_OWNERSHIP'] = data['L1_OWNERSHIP'].fillna(
            data['AVG_OWNERSHIP'])
        data['MA5_OWNERSHIP'] = data['MA5_OWNERSHIP'].fillna(
            data['AVG_OWNERSHIP'])
        data['AVG_DvP'] = data['AVG_DvP'].fillna(0)
        data['TOTAL'] = data['TOTAL'].fillna(data['TOTAL'].mean())
        data['POINTSPREAD'] = data['POINTSPREAD'].fillna(0)
        data['L1P_SB_COUNT'] = data['L1P_SB_COUNT'].fillna(0)
        data['L3P_SB_COUNT'] = data['L3P_SB_COUNT'].fillna(0)
        # we can predict Y for a player as long as AVG_Y is not nan
        data = data.dropna(subset=['AVG_OWNERSHIP'])
        return data

    def train_model(self):
        """Fit the XGBoost model on the training regressors/regressand.

        :raises Exception: if create_features() has not been run.
        """
        if not self.created_features:
            raise Exception('Must create features before training model')
        X = self.train_data[self.regressors]
        y = self.train_data[self.regressand]
        self.model.fit(X, y, test_size=0.25, early_stopping_rounds=25)
        self.trained_model = True

    def predict(self):
        """Predict ownership for the test set.

        Adds an 'OWNERSHIP_HAT' column to ``self.test_data`` in place.

        :raises Exception: if train_model() has not been run.
        :returns: (frame with DATE/SLATEID/NAME/prediction, prediction
                   column name)
        """
        if not self.trained_model:
            raise Exception('Must train model before generating predictions')
        output_column = '{}_HAT'.format(self.regressand)
        self.test_data[output_column] = self.model.predict(
            self.test_data[self.regressors])
        return self.test_data[['DATE', 'SLATEID', 'NAME',
                               output_column]], output_column