def main(pred_week, vegas_adjustment=False, run_query=False, expert_projections=False):
    """Score per-stat models on historical data and write next week's predictions.

    For every target stat a gradient boosting, random forest, linear (or
    logistic, for 'played') and dummy model is fit and scored on a 10%
    held-out split.  All but the dummy model are then refit on every row,
    and the gradient boosting model predicts the rows selected by
    ``prediction_feature_set``.  The predictions are written as JSON records
    to ``../results/predictions_<year>_<week>.json``.

    Args:
        pred_week: week to predict; also used as the upper bound
            ``to_yr_wk=(2015, pred_week)`` when loading the feature set.
        vegas_adjustment: when true, additionally fit and print scores for
            models trained on the frame built by ``build_vegas_dataframe``
            (those scores are printed only, not recorded).
        run_query: when true, ``load_feature_set`` is called with
            ``load_cached=False``.
        expert_projections: when true, every stat except 'played' is passed
            through ``add_expert_projections`` after the historical
            prediction has been stored.
    """
    db = nfldb.connect()
    result_path = '../results'
    full_train, pipe, stats = load_feature_set(
        db, load_cached=not run_query, to_yr_wk=(2015, pred_week))

    # picks columns to model
    lag_cols = [stat + '_lag' for stat in stats]
    mean_cols = [stat + '_mean' for stat in stats]
    other_cols = ['same_year_lag', 'played_lag']

    infoColumns = ExtractColumns(
        like=[], exact=['year', 'week', 'time', 'player_id', 'full_name'])
    row_info = infoColumns.fit_transform(X=full_train)

    pred_data, predict_i, pred_info, pred_yr_wk = prediction_feature_set(
        db, pipe, infoColumns, pred_week=pred_week)

    X_all = full_train
    pred_all = pred_data.iloc[predict_i]
    # Copy so the per-stat column assignments below land in an independent
    # frame instead of a slice of pred_info.
    pred_results = pred_info.iloc[predict_i].copy()

    # which rows did players play (positions, because every consumer
    # below indexes with .iloc)
    played_bool = full_train['played'] == 1
    played_index = [i for i, played in enumerate(played_bool) if played]

    # random split train and test
    train_index, test_index = train_test_split_index(
        X_all.shape[0], test_size=0.1, seed=0)

    # The played-only variants of the split do not depend on the target
    # stat, so build them once instead of once per loop iteration.
    played_rows = set(played_index)
    played_train_i = list(set(train_index) & played_rows)
    played_test_i = list(set(test_index) & played_rows)

    feature_cols = lag_cols + mean_cols + other_cols
    XColumns = ExtractColumns(like=feature_cols)
    X = XColumns.fit_transform(X=X_all)
    X_pred = XColumns.fit_transform(X=pred_all)

    played_only = True

    y_cols = ['played', 'receiving_rec', 'receiving_tds', 'receiving_yds',
              'rushing_att', 'rushing_tds', 'rushing_yds']

    # added for saving test results for model evaluation
    rows = []

    for y_col in y_cols:
        y = X_all[y_col]
        # 'played' is the only classification target
        predict_proba = y_col == 'played'

        if played_only and not predict_proba:
            train_i, test_i = played_train_i, played_test_i
        else:
            train_i, test_i = train_index, test_index

        X_train = X.iloc[train_i]
        y_train = y.iloc[train_i]
        X_test = X.iloc[test_i]
        y_test = y.iloc[test_i]

        # get player info for train and test data
        X_train_info = row_info.iloc[train_i]
        X_test_info = row_info.iloc[test_i]

        ### Test Predictions
        if predict_proba:
            models = {
                'gb': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1),
                'rf': RandomForestClassifier(),
                'lin': LogisticRegression(),
                'dum': DummyClassifier()
            }
        else:
            models = {
                'gb': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1),
                'rf': RandomForestRegressor(),
                'lin': LinearRegression(),
                'dum': DummyRegressor()
            }

        fit_args = dict(X_train=X_train, y_train=y_train,
                        X_test=X_test, y_test=y_test,
                        predict_proba=predict_proba)
        gb, _, gb_scores = fit_predict(model=models['gb'], **fit_args)
        rf, _, rf_scores = fit_predict(model=models['rf'], **fit_args)
        lin, _, lin_scores = fit_predict(model=models['lin'], **fit_args)
        _, _, dum_scores = fit_predict(model=models['dum'], **fit_args)

        # Each print below passes one pre-formatted string, which produces
        # the same output as a Python 2 print statement and as the
        # Python 3 print function.
        if vegas_adjustment and not predict_proba:
            print('-' * 50)
            print('Vegas Adjusted: %s' % (y_col,))
            X_train_all = build_vegas_dataframe(
                X=X_train, y=y_train, row_info=X_train_info,
                model=gb, db=db, y_col=y_col)
            X_test_all = build_vegas_dataframe(
                X=X_test, y=y_test, row_info=X_test_info,
                model=gb, db=db, y_col=y_col)

            features = [y_col, 'Total', 'is_favorite', 'spread_x_favorite']
            X_cols = ExtractColumns(exact=features)
            vegas_args = dict(X_train=X_cols.fit_transform(X=X_train_all),
                              y_train=y_train,
                              X_test=X_cols.fit_transform(X=X_test_all),
                              y_test=y_test)

            _, _, gb_scores_a = fit_predict(model=models['gb'], **vegas_args)
            _, _, rf_scores_a = fit_predict(model=models['rf'], **vegas_args)
            lin_a, _, lin_scores_a = fit_predict(model=models['lin'], **vegas_args)

            print('Predicting %s' % (y_col,))
            print(lin_a.coef_)
            print('Gradient Boosting: RMSE %.2f | MAE %.2f' %
                  (gb_scores_a['rmse'], gb_scores_a['mae']))
            print('Random Forest: RMSE %.2f | MAE %.2f' %
                  (rf_scores_a['rmse'], rf_scores_a['mae']))
            print('%s Regression: RMSE %.2f | MAE %.2f' %
                  ('Linear', lin_scores_a['rmse'], lin_scores_a['mae']))

        lin_name = 'Logistic' if predict_proba else 'Linear'

        print('-' * 50)
        print('Historical Prediction: %s' % (y_col,))
        # Print Results
        print('Gradient Boosting: RMSE %.2f | MAE %.2f' %
              (gb_scores['rmse'], gb_scores['mae']))
        print('Random Forest: RMSE %.2f | MAE %.2f' %
              (rf_scores['rmse'], rf_scores['mae']))
        print('%s Regression: RMSE %.2f | MAE %.2f' %
              (lin_name, lin_scores['rmse'], lin_scores['mae']))
        print('Baseline: RMSE %.2f | MAE %.2f' %
              (dum_scores['rmse'], dum_scores['mae']))

        result_row(method='Historical Only', results=gb_scores, stat=y_col,
                   learner='Gradient Boosting', rows=rows)
        result_row(method='Historical Only', results=rf_scores, stat=y_col,
                   learner='Random Forest', rows=rows)
        result_row(method='Historical Only', results=lin_scores, stat=y_col,
                   learner=lin_name, rows=rows)
        result_row(method='Baseline', results=dum_scores, stat=y_col,
                   learner='Stratified' if predict_proba else 'Mean', rows=rows)
        # Vegas-adjusted scores are printed above but not recorded in rows.

        # Build full models on all data
        gb = gb.fit(X, y)
        rf = rf.fit(X, y)
        lin = lin.fit(X, y)

        #### Next week's predictions
        # Make prediction, just gbr for now
        if predict_proba:
            preds = gb.predict_proba(X_pred)[:, 1]
        else:
            preds = gb.predict(X_pred)

        if expert_projections:
            # create dataframe with predictions for all weeks of current
            # year to use with the expert prediction
            mask = (row_info.year == 2015) & (row_info.week <= pred_week)
            X_2015 = X[mask]
            y_2015 = X_all[mask][y_col]
            preds_2015 = cross_val_predict(gb, X_2015, y_2015)
            info_2015 = row_info[mask][['full_name', 'week', 'year']].copy()
            info_2015.loc[:, y_col] = preds_2015
            info_2015.loc[:, 'position'] = 'RB'
            # NOTE(review): this replaces the cross-validated predictions
            # stored two lines above with the observed values, so preds_2015
            # never reaches add_expert_projections.  Kept as in the original;
            # confirm whether the predictions were meant to be a separate
            # feature column.
            info_2015.loc[:, y_col] = y_2015

        # add our prediction based on historical data to output
        pred_results.loc[:, y_col] = preds

        # add expert projections, then make a final prediction
        if expert_projections and not predict_proba:
            pred_results = add_expert_projections(
                pred_results, pred_week, y_col, info_2015, result_rows=rows)

    pred_results.replace(0, np.nan, inplace=True)
    out_path = '%s/predictions_%s_%s.json' % (
        result_path, int(pred_yr_wk[0]), int(pred_yr_wk[1]))
    pred_results.to_json(path_or_buf=out_path, orient='records')
def main(y_col, predict_week):
    """Regress one observed stat on the projection columns and print a report.

    Weekly 2015 data for weeks before ``predict_week`` is inner-joined with
    the rows produced by ``ProjectedPlayerData`` on name key, week and year.
    A linear regression is fit on a 90% split of the joined rows, scored on
    the remaining 10%, and then used to print a prediction for every
    projection row of ``predict_week``.

    Args:
        y_col: name of the historical stat column to predict (one of the
            ``stats`` listed below).
        predict_week: week whose projections are predicted; only earlier
            weeks are used for fitting.
    """
    # get historical data
    db = nfldb.connect()
    yr_wk = [(2015, week) for week in range(1, predict_week)]
    stats = ['receiving_rec', 'receiving_tar', 'receiving_tds',
             'receiving_yac_yds', 'receiving_yds', 'rushing_att',
             'rushing_tds', 'rushing_yds']
    player_info = ['player_id', 'full_name', 'position']
    position = 'RB'

    playerdata = WeeklyPlayerData(db=db, yr_wk=yr_wk, stats=stats,
                                  player_info=player_info, fill_time=True,
                                  position=position)
    pipe = Pipeline(steps=[('data', playerdata),
                           ('key', AddNameKey()),
                           ('nan', HandleNaN(method='fill'))])
    hist_data = pipe.fit_transform(X=None)

    client = MongoClient()
    mdb = client.data
    include_stats = ['fumbles', 'receptions', 'rec_yds', 'rec_tds',
                     'rush_attempts', 'rush_yds', 'rush_tds']
    data = ProjectedPlayerData(db=mdb, stats=include_stats, position='RB')
    pipe = Pipeline(steps=[('data', data), ('nan', HandleNaN())])
    proj_data = pipe.fit_transform(X=None)

    # get stats for next week then remove player info
    next_week_rows = proj_data.loc[proj_data.week == predict_week]
    proj_info_cols = ['name', 'name_key', 'team', 'week', 'year', 'position']
    pred_info = next_week_rows[proj_info_cols]
    # drop() without inplace returns a new frame, so nothing is written
    # into a slice of proj_data
    next_week_proj = next_week_rows.drop(proj_info_cols, axis=1)

    # join the two dataframes
    merged = pd.merge(proj_data, hist_data, how="inner",
                      on=["name_key", "week", "year"])

    # drop the week you want to predict from the training data, then
    # remove the player-info columns
    merged_info_cols = ['name', 'name_key', 'position_x', 'position_y',
                        'team', 'week', 'year', 'full_name', 'played',
                        'player_id']
    df = merged[merged.week < predict_week].drop(merged_info_cols, axis=1)

    hist_cols = ['rushing_tds', 'rushing_att', 'receiving_yds',
                 'receiving_yac_yds', 'receiving_tds', 'receiving_tar',
                 'receiving_rec', 'rushing_yds']
    y = df[y_col]
    X = df.drop(hist_cols, axis=1)

    train, test = train_test_split_index(df.shape[0], test_size=0.1, seed=0)
    X_train = X.iloc[train]
    y_train = y.iloc[train]
    X_test = X.iloc[test]
    y_test = y.iloc[test]

    regr = LinearRegression()
    regr.fit(X_train, y_train)
    pred = regr.predict(X_test)

    # Each print passes one pre-formatted string, which produces the same
    # output as a Python 2 print statement and as the Python 3 print function.
    print('Predicting  %s' % (y_col,))
    print('Historical summary statistics:')
    print(y_test.describe())
    print('')
    print('Projected summary statistics:')
    print(sp.stats.describe(pred))
    print('')
    print('RMSE:  %s' % (mean_squared_error(y_test, pred) ** 0.5,))
    print('MAE:  %s' % (mean_absolute_error(y_test, pred),))
    print('intercept:  %s' % (regr.intercept_,))
    print('coefficients:')
    for col, coef in zip(X_train.columns, regr.coef_):
        print('  %s %s' % (col, coef))
    print('')

    pred = regr.predict(next_week_proj)

    # print out predicted data for inspection; the last column is labelled
    # with the stat that was actually predicted
    print('Player\tWeek\t%s' % (y_col,))
    for player, week, prediction in zip(pred_info['name'],
                                        pred_info['week'], pred):
        print("\t".join([player, str(week), str(prediction)]))
def add_expert_projections(pred_results, pred_week, y_col, info_2015, result_rows=None):
    """Append projection-based predictions for ``y_col`` to ``pred_results``.

    Two rounds of models are fit on the columns contributed by
    ``ProjectedPlayerData``:

    1. "Expert Only": the target is ``y_col`` from the weekly 2015 data
       loaded here, for weeks before ``pred_week``.
    2. "Historical + Expert": the target is ``y_col`` taken from
       ``info_2015``.

    In each round the models are scored on a 10% hold-out, the scores are
    printed and recorded through ``result_row``, and the gradient boosting
    model is refit on all rows to predict ``pred_week``.

    NOTE(review): both rounds appear to train on the same projection
    columns; no historical prediction is among the features -- confirm
    this is intended.

    Args:
        pred_results: frame of predictions so far.  A 'position' column set
            to 'RB' is written into it in place before the name key is
            added.
        pred_week: week being predicted; earlier weeks are used for fitting.
        y_col: name of the stat column to predict.
        info_2015: frame merged with the projections on name key, week and
            year; must contain ``y_col``, 'full_name' and 'position'.
        result_rows: list passed to ``result_row`` as ``rows``.  Defaults to
            a fresh list per call (previously a single default list was
            shared by every call).

    Returns:
        ``pred_results`` with a name key, left-merged with the columns
        ``'expert_' + y_col`` and ``'hist_expert_' + y_col`` plus any
        projection columns not already present.
    """
    rows = [] if result_rows is None else result_rows
    join_keys = ["name_key", "week", "year"]

    yr_wk = [(2015, week) for week in range(1, pred_week + 1)]
    stats = ['receiving_rec', 'receiving_tar', 'receiving_tds',
             'receiving_yac_yds', 'receiving_yds', 'rushing_att',
             'rushing_tds', 'rushing_yds']
    player_info = ['player_id', 'full_name', 'position']
    position = 'RB'

    # This function is called once per stat, so release each connection as
    # soon as its data has been loaded into a frame.
    db = nfldb.connect()
    try:
        playerdata = WeeklyPlayerData(db=db, yr_wk=yr_wk, stats=stats,
                                      player_info=player_info,
                                      fill_time=True, position=position)
        pipe = Pipeline(steps=[('data', playerdata),
                               ('key', AddNameKey()),
                               ('nan', HandleNaN(method='fill'))])
        hist_data = pipe.fit_transform(X=None)
    finally:
        db.close()

    include_stats = ['fumbles', 'receptions', 'rec_yds', 'rec_tds',
                     'rush_attempts', 'rush_yds', 'rush_tds']
    client = MongoClient()
    try:
        data = ProjectedPlayerData(db=client.data, stats=include_stats,
                                   position='RB')
        # data already has name_key, so don't need to add it to pipeline
        pipe = Pipeline(steps=[('data', data), ('nan', HandleNaN())])
        proj_data = pipe.fit_transform(X=None)
    finally:
        client.close()

    # ---- round 1: expert only -------------------------------------------
    # join the two datasets
    merged_df = pd.merge(proj_data, hist_data, how="inner", on=join_keys)

    # drop the week you want to predict from the training data and remove
    # the player-info columns (non-inplace, so no write into a slice)
    info_cols = ['name', 'name_key', 'position_x', 'position_y', 'team',
                 'week', 'year', 'full_name', 'played', 'player_id']
    train_df = merged_df[merged_df.week < pred_week].drop(info_cols, axis=1)

    hist_cols = ['rushing_tds', 'rushing_att', 'receiving_yds',
                 'receiving_yac_yds', 'receiving_tds', 'receiving_tar',
                 'receiving_rec', 'rushing_yds']
    y = train_df[y_col]
    X = train_df.drop(hist_cols, axis=1)

    train, test = train_test_split_index(train_df.shape[0], test_size=0.1, seed=0)
    fit_args = dict(X_train=X.iloc[train], y_train=y.iloc[train],
                    X_test=X.iloc[test], y_test=y.iloc[test])

    models = {
        'gb': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1),
        'rf': RandomForestRegressor(),
        'lin': LinearRegression(),
        'dum': DummyRegressor()
    }

    gb, _, gb_scores = fit_predict(model=models['gb'], **fit_args)
    _, _, rf_scores = fit_predict(model=models['rf'], **fit_args)
    _, _, lin_scores = fit_predict(model=models['lin'], **fit_args)
    _, _, dum_scores = fit_predict(model=models['dum'], predict_proba=False,
                                   **fit_args)

    # Each print passes one pre-formatted string, which produces the same
    # output as a Python 2 print statement and as the Python 3 print function.
    print("-" * 50)
    print("Expert prediction:  %s" % (y_col,))
    print('Gradient Boosting: RMSE %.2f | MAE %.2f' %
          (gb_scores['rmse'], gb_scores['mae']))
    print('Random Forest: RMSE %.2f | MAE %.2f' %
          (rf_scores['rmse'], rf_scores['mae']))
    print('%s Regression: RMSE %.2f | MAE %.2f' %
          ('Linear', lin_scores['rmse'], lin_scores['mae']))

    result_row(method='Expert Only', results=gb_scores, stat=y_col,
               learner='Gradient Boosting', rows=rows)
    result_row(method='Expert Only', results=rf_scores, stat=y_col,
               learner='Random Forest', rows=rows)
    result_row(method='Expert Only', results=lin_scores, stat=y_col,
               learner='Linear', rows=rows)
    result_row(method='Baseline For Expert Adjusted', results=dum_scores,
               stat=y_col, learner='Mean', rows=rows)

    # get stats for the prediction week then remove player info; the same
    # two frames serve both rounds
    week_rows = proj_data[proj_data.week == pred_week]
    proj_info_cols = ['name', 'name_key', 'team', 'position', 'year', 'week']
    pred_info = week_rows[proj_info_cols]
    next_week_proj = week_rows.drop(proj_info_cols, axis=1)

    # make prediction for next week
    gb.fit(X, y)
    expert_df = pd.DataFrame(pred_info['name_key'])
    expert_df.columns = ['name_key']
    expert_df.loc[:, 'expert_' + y_col] = gb.predict(next_week_proj)

    # before adding name_key, need to add position
    pred_results.loc[:, 'position'] = 'RB'

    for column in next_week_proj.columns:
        if column not in pred_results.columns:
            expert_df.loc[:, column] = next_week_proj[column]

    # add name_key to data predicted from historical data
    key_pipe = Pipeline(steps=[('key', AddNameKey())])
    results_with_key = key_pipe.fit_transform(X=pred_results)
    results_with_expert = pd.merge(results_with_key, expert_df,
                                   how="left", on="name_key")

    # ---- round 2: make a combined prediction ----------------------------
    pred_with_y = key_pipe.fit_transform(X=info_2015)
    df_all = pd.merge(proj_data, pred_with_y, how="inner", on=join_keys)
    combined = df_all[df_all.week < pred_week]
    y = combined[y_col]
    combined_info_cols = ['name', 'name_key', 'position_x', 'position_y',
                          'team', 'week', 'year', 'full_name']
    X = combined.drop(combined_info_cols + [y_col], axis=1)

    train, test = train_test_split_index(combined.shape[0], test_size=0.1, seed=0)
    fit_args = dict(X_train=X.iloc[train], y_train=y.iloc[train],
                    X_test=X.iloc[test], y_test=y.iloc[test])

    gb, _, gb_scores = fit_predict(model=models['gb'], **fit_args)
    _, _, rf_scores = fit_predict(model=models['rf'], **fit_args)
    _, _, lin_scores = fit_predict(model=models['lin'], **fit_args)

    print("-" * 50)
    print("Historical expert-adjusted prediction: %s" % (y_col,))
    print('Gradient Boosting: RMSE %.2f | MAE %.2f' %
          (gb_scores['rmse'], gb_scores['mae']))
    print('Random Forest: RMSE %.2f | MAE %.2f' %
          (rf_scores['rmse'], rf_scores['mae']))
    print('%s Regression: RMSE %.2f | MAE %.2f' %
          ('Linear', lin_scores['rmse'], lin_scores['mae']))
    print('')
    print('')

    result_row(method='Historical + Expert', results=gb_scores, stat=y_col,
               learner='Gradient Boosting', rows=rows)
    result_row(method='Historical + Expert', results=rf_scores, stat=y_col,
               learner='Random Forest', rows=rows)
    result_row(method='Historical + Expert', results=lin_scores, stat=y_col,
               learner='Linear', rows=rows)

    # make prediction for next week
    gb.fit(X, y)
    combined_df = pd.DataFrame(pred_info['name_key'])
    combined_df.columns = ['name_key']
    combined_df.loc[:, 'hist_expert_' + y_col] = gb.predict(next_week_proj)

    # projection columns merged in round 1 are already present and are
    # skipped here
    for column in next_week_proj.columns:
        if column not in results_with_expert.columns:
            combined_df.loc[:, column] = next_week_proj[column]

    results_with_expert_2 = pd.merge(results_with_expert, combined_df,
                                     how="left", on="name_key")

    return results_with_expert_2
def main(pred_week=14, k=100, played_only=True, result_path='../results'):
    """Find and plot each predicted player's nearest historical neighbours.

    The lag/mean feature columns are rescaled by a ``CoefScaler`` fit with a
    ridge model against standard fantasy points, a nearest-neighbour index
    is built on the rescaled training rows, and for every prediction row the
    ``k`` nearest training rows are scored, plotted with ``plot_knn`` and
    collected into one dict that is saved with ``save_plot_data_json``.

    Args:
        pred_week: week passed to ``prediction_feature_set``.
        k: number of neighbours retrieved per prediction row.
        played_only: when true, only rows with ``played == 1`` are used to
            fit the scaler and the neighbour index.
        result_path: directory passed to ``plot_image_path`` and
            ``save_plot_data_json``.

    All parameters default to the values that used to be hard-coded, so
    ``main()`` behaves as before.
    """
    db = nfldb.connect()

    ### LOAD DATA
    # load train data
    full_train, pipe, stats = load_feature_set(db)

    # picks columns to model
    lag_cols = [stat + '_lag' for stat in stats]
    mean_cols = [stat + '_mean' for stat in stats]
    other_cols = ['same_year_lag', 'played_lag']

    # fit_transform is what fits infoColumns before it is handed to
    # prediction_feature_set; the returned frame itself is not needed here
    infoColumns = ExtractColumns(
        like=[], exact=['year', 'week', 'time', 'player_id', 'full_name'])
    infoColumns.fit_transform(X=full_train)

    # load prediction data
    pred_data, predict_i, pred_info, pred_yr_wk = prediction_feature_set(
        db, pipe, infoColumns, pred_week=pred_week)

    ### PREPARE DATA FOR TRAIN AND PREDICT
    # train data with all columns
    X_all = full_train
    # prediction data with all columns
    pred_all = pred_data.iloc[predict_i]

    # which rows did players play (positions, because they are used with
    # .iloc below)
    played_bool = full_train['played'] == 1
    played_index = [i for i, played in enumerate(played_bool) if played]

    # NOTE(review): the split is not used in this function; the call is
    # kept in case train_test_split_index touches shared random state --
    # confirm and remove.
    train_test_split_index(X_all.shape[0], test_size=0.1, seed=0)

    feature_cols = lag_cols + mean_cols + other_cols
    XColumns = ExtractColumns(like=feature_cols)
    X = XColumns.fit_transform(X=X_all)
    X_pred = XColumns.fit_transform(X=pred_all)

    ### SET UP & TRAIN KNN
    i_knn = played_index if played_only else list(range(X.shape[0]))
    # the candidate rows are the same for every prediction row, so select
    # them once
    X_knn = X.iloc[i_knn]
    knn_rows = X_all.iloc[i_knn]

    # regularization: rescale the features before measuring distances
    standard_points = score_stats(X_all, make_scorer(base_type='standard'))
    reg = CoefScaler(linear_model=Ridge())
    reg = reg.fit(X=X_knn, y=standard_points.iloc[i_knn])
    X_reg = reg.transform(X_knn)

    # fit k nearest neighbors
    nn = NearestNeighbors(n_neighbors=k).fit(X_reg)

    # returns tuple of (distances, indices of neighbors) for prediction set
    X_reg_pred = reg.transform(X=X_pred)
    _, neighbor = nn.kneighbors(X=X_reg_pred)

    ### READ AND PLOT KNN RESULTS
    image_path = plot_image_path(result_path, pred_yr_wk)
    nn_dict = {}
    for check_i in range(pred_all.shape[0]):
        # check_nn is a data frame where the first row is the player
        # and the rest of the rows are the nearest neighbors
        check_nn = pred_all.iloc[[check_i], :].append(
            knn_rows.iloc[neighbor[check_i, :]])
        check_nn['StandardPoints'] = score_stats(
            check_nn, make_scorer(base_type='standard'))
        check_nn['PPRPoints'] = score_stats(
            check_nn, make_scorer(base_type='ppr'))

        nn_i = plot_knn(check_nn, save_image=True, plot_stat='StandardPoints',
                        pred_yr_wk=pred_yr_wk, result_path=image_path,
                        n_bins=25, bandwidth=2.5)
        nn_dict.update(nn_i)

    save_plot_data_json(nn_dict, result_path, pred_yr_wk)