Python train_test_split_index示例

编程语言: Python

命名空间/包名称: ml.helpers.testing_helpers

方法/功能: train_test_split_index

hotexamples.com的示例: 4

Python train_test_split_index - 共找到4个示例。以下是从开源项目中提取的、评价最高的 ml.helpers.testing_helpers.train_test_split_index 真实 Python 代码示例。您可以为示例评分，帮助我们提高示例质量。

示例#1

显示文件

文件： rb_stats.py 项目： kevinallen/waiver_coach

def main(pred_week, vegas_adjustment=False, run_query=False, expert_projections=False):
    """Evaluate per-stat models on a hold-out split and write next-week predictions.

    For every target column in ``y_cols`` this fits gradient boosting, random
    forest, linear/logistic and dummy models on the split returned by
    ``train_test_split_index(..., test_size=0.1, seed=0)``, prints their
    RMSE/MAE, records the scores through ``result_row``, refits the first
    three models on all rows and stores the gradient-boosting prediction for
    the rows selected by ``predict_i``.  The collected predictions are written
    as JSON records below ``../results``.

    Args:
        pred_week: Week to predict; also forwarded to ``load_feature_set`` as
            ``to_yr_wk=(2015, pred_week)``.
        vegas_adjustment: When true, additionally fit and report models on the
            frames returned by ``build_vegas_dataframe`` (non-'played' targets
            only).
        run_query: When true, ``load_feature_set`` is called with
            ``load_cached=False``.
        expert_projections: When true, each non-'played' prediction is passed
            through ``add_expert_projections``.

    Returns:
        None.  The score rows accumulated in ``rows`` are handed to
        ``result_row`` / ``add_expert_projections`` but are not written or
        returned by this function itself.
    """
    db = nfldb.connect()
    result_path = '../results'

    full_train, pipe, stats = load_feature_set(
        db, load_cached=not run_query, to_yr_wk=(2015, pred_week))

    # picks columns to model
    lag_cols = [stat + '_lag' for stat in stats]
    mean_cols = [stat + '_mean' for stat in stats]
    other_cols = ['same_year_lag', 'played_lag']

    infoColumns = ExtractColumns(like=[], exact=['year', 'week', 'time', 'player_id', 'full_name'])
    row_info = infoColumns.fit_transform(X=full_train)

    pred_data, predict_i, pred_info, pred_yr_wk = prediction_feature_set(
        db, pipe, infoColumns, pred_week=pred_week)

    X_all = full_train
    pred_all = pred_data.iloc[predict_i]
    # Explicit copy: columns are assigned onto this frame below, which would
    # otherwise be a chained assignment on a slice of pred_info.
    pred_results = pred_info.iloc[predict_i].copy()

    # Positions (not index labels) of the rows where the player played.  The
    # values are used with .iloc below, so they must be positional even when
    # full_train does not carry a default 0..n-1 index.
    played_bool = full_train['played'] == 1
    played_index = [pos for pos, played in enumerate(played_bool) if played]

    # random split train and test
    train_index, test_index = train_test_split_index(X_all.shape[0], test_size=0.1, seed=0)

    # The played-only subsets do not depend on the target column, so they are
    # built once here instead of once per loop iteration.
    played_set = set(played_index)
    train_played = list(set(train_index) & played_set)
    test_played = list(set(test_index) & played_set)

    feature_cols = lag_cols + mean_cols + other_cols
    XColumns = ExtractColumns(like=feature_cols)
    X = XColumns.fit_transform(X=X_all)
    X_pred = XColumns.fit_transform(X=pred_all)

    played_only = True

    y_cols = ['played', 'receiving_rec', 'receiving_tds', 'receiving_yds',
              'rushing_att', 'rushing_tds', 'rushing_yds']

    # added for saving test results for model evaluation
    rows = []

    for y_col in y_cols:
        y = X_all[y_col]

        # 'played' is modelled as a classification problem, everything else
        # as a regression.
        predict_proba = y_col == 'played'

        if played_only and not predict_proba:
            train_i, test_i = train_played, test_played
        else:
            train_i, test_i = train_index, test_index

        X_train = X.iloc[train_i]
        y_train = y.iloc[train_i]
        X_test = X.iloc[test_i]
        y_test = y.iloc[test_i]

        # get player info for train and test data
        X_train_info = row_info.iloc[train_i]
        X_test_info = row_info.iloc[test_i]

        ### Test Predictions
        if predict_proba:
            models = {
                'gb': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1),
                'rf': RandomForestClassifier(),
                'lin': LogisticRegression(),
                'dum': DummyClassifier(),
            }
        else:
            models = {
                'gb': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1),
                'rf': RandomForestRegressor(),
                'lin': LinearRegression(),
                'dum': DummyRegressor(),
            }

        # Fit and score each learner on the hold-out split (same order as
        # before: gb, rf, lin, dum).
        fitted = {}
        scores = {}
        for key in ('gb', 'rf', 'lin', 'dum'):
            fitted[key], _, scores[key] = fit_predict(
                model=models[key],
                X_train=X_train,
                y_train=y_train,
                X_test=X_test,
                y_test=y_test,
                predict_proba=predict_proba)

        if vegas_adjustment and not predict_proba:
            print('-' * 50)
            print('Vegas Adjusted: %s' % (y_col,))

            X_train_all = build_vegas_dataframe(X=X_train, y=y_train,
                row_info=X_train_info, model=fitted['gb'], db=db, y_col=y_col)
            X_test_all = build_vegas_dataframe(X=X_test, y=y_test,
                row_info=X_test_info, model=fitted['gb'], db=db, y_col=y_col)

            features = [y_col, 'Total', 'is_favorite', 'spread_x_favorite']
            X_cols = ExtractColumns(exact=features)
            X_train_vegas = X_cols.fit_transform(X=X_train_all)
            X_test_vegas = X_cols.fit_transform(X=X_test_all)

            # The same model instances are passed to fit_predict again here.
            vegas_fitted = {}
            vegas_scores = {}
            for key in ('gb', 'rf', 'lin'):
                vegas_fitted[key], _, vegas_scores[key] = fit_predict(
                    model=models[key],
                    X_train=X_train_vegas,
                    y_train=y_train,
                    X_test=X_test_vegas,
                    y_test=y_test)

            print('Predicting %s' % (y_col,))
            print(vegas_fitted['lin'].coef_)
            print('Gradient Boosting: RMSE %.2f | MAE %.2f' % (vegas_scores['gb']['rmse'], vegas_scores['gb']['mae']))
            print('Random Forest: RMSE %.2f | MAE %.2f' % (vegas_scores['rf']['rmse'], vegas_scores['rf']['mae']))
            print('%s Regression: RMSE %.2f | MAE %.2f' % ('Linear', vegas_scores['lin']['rmse'], vegas_scores['lin']['mae']))

        lin_name = 'Logistic' if predict_proba else 'Linear'

        print('-' * 50)
        print('Historical Prediction: %s' % (y_col,))
        # Print Results
        print('Gradient Boosting: RMSE %.2f | MAE %.2f' % (scores['gb']['rmse'], scores['gb']['mae']))
        print('Random Forest: RMSE %.2f | MAE %.2f' % (scores['rf']['rmse'], scores['rf']['mae']))
        print('%s Regression: RMSE %.2f | MAE %.2f' % (lin_name, scores['lin']['rmse'], scores['lin']['mae']))
        print('Baseline: RMSE %.2f | MAE %.2f' % (scores['dum']['rmse'], scores['dum']['mae']))

        result_row(method='Historical Only', results=scores['gb'], stat=y_col, learner='Gradient Boosting', rows=rows)
        result_row(method='Historical Only', results=scores['rf'], stat=y_col, learner='Random Forest', rows=rows)
        result_row(method='Historical Only', results=scores['lin'], stat=y_col, learner=lin_name, rows=rows)

        result_row(method='Baseline', results=scores['dum'], stat=y_col, learner='Stratified' if predict_proba else 'Mean', rows=rows)

        # Build full models on all data.  Only the gradient-boosting model is
        # used for the prediction below; the other two fits are kept so the
        # sequence of fit calls is unchanged.
        gb = fitted['gb'].fit(X, y)
        fitted['rf'].fit(X, y)
        fitted['lin'].fit(X, y)

        #### Next week's predictions
        # Make prediction, just gbr for now
        if predict_proba:
            preds = gb.predict_proba(X_pred)[:, 1]
        else:
            preds = gb.predict(X_pred)
            if expert_projections:
                # create dataframe with predictions for all weeks of current
                # year to use with the expert prediction
                mask = (row_info.year == 2015) & (row_info.week <= pred_week)
                X_2015 = X[mask]
                y_2015 = X_all[mask][y_col]
                preds_2015 = cross_val_predict(gb, X_2015, y_2015)
                # Explicit copy so the column assignments below act on an
                # independent frame rather than a slice of row_info.
                info_2015 = row_info[mask][['full_name', 'week', 'year']].copy()
                info_2015.loc[:, y_col] = preds_2015
                info_2015.loc[:, 'position'] = 'RB'
                # NOTE(review): this overwrites the cross-validated predictions
                # assigned two lines above with the observed values, so
                # preds_2015 never reaches add_expert_projections -- confirm
                # whether that is intended.
                info_2015.loc[:, y_col] = X_all[mask][y_col]

        # add our prediction based on historical data to output
        pred_results.loc[:, y_col] = preds

        # add expert projections, then make a final prediction
        if expert_projections and not predict_proba:
            pred_results = add_expert_projections(pred_results, pred_week, y_col, info_2015, result_rows=rows)

    pred_results.replace(0, np.nan, inplace=True)
    out_path = '%s/predictions_%s_%s.json' % (
        result_path, int(pred_yr_wk[0]), int(pred_yr_wk[1]))
    pred_results.to_json(path_or_buf=out_path, orient='records')

示例#2

显示文件

文件： rb_web_data.py 项目： kevinallen/waiver_coach

def main(y_col, predict_week):
    """Fit a linear regression from projected stats to ``y_col`` and print a report.

    Historical weekly data (weeks 1 .. ``predict_week`` - 1 of 2015) is joined
    with projected player data on ``name_key``/``week``/``year``.  A
    ``LinearRegression`` is fitted on the training part of the split returned
    by ``train_test_split_index`` and evaluated on the test part; summary
    statistics, RMSE, MAE and coefficients are printed.  Finally the model's
    predictions for the ``predict_week`` projections are printed, one line
    per player.

    Args:
        y_col: Name of the historical column to use as regression target.
        predict_week: Week whose projections are predicted; rows with
            ``week >= predict_week`` are excluded from training.
    """
    # get historical data
    db = nfldb.connect()

    yr_wk = [(2015, i) for i in range(1, predict_week)]
    stats = ['receiving_rec', 'receiving_tar', 'receiving_tds', 'receiving_yac_yds',
             'receiving_yds', 'rushing_att', 'rushing_tds', 'rushing_yds']
    player_info = ['player_id', 'full_name', 'position']
    position = 'RB'

    playerdata = WeeklyPlayerData(db=db, yr_wk=yr_wk, stats=stats,
                                  player_info=player_info, fill_time=True,
                                  position=position)

    pipe = Pipeline(steps=[('data', playerdata), ('key', AddNameKey()),
                           ('nan', HandleNaN(method='fill'))])
    hist_data = pipe.fit_transform(X=None)

    client = MongoClient()
    mdb = client.data

    include_stats = ['fumbles', 'receptions', 'rec_yds', 'rec_tds',
                     'rush_attempts', 'rush_yds', 'rush_tds']

    data = ProjectedPlayerData(db=mdb, stats=include_stats, position='RB')
    pipe = Pipeline(steps=[('data', data), ('nan', HandleNaN())])
    proj_data = pipe.fit_transform(X=None)

    # get stats for next week then remove player info
    next_week_proj = proj_data.loc[proj_data.week == predict_week]
    proj_info_cols = ['name', 'name_key', 'team', 'week', 'year', 'position']
    pred_info = next_week_proj[proj_info_cols]
    # Non-inplace drop: next_week_proj is a filtered slice of proj_data, and
    # dropping in place on a slice triggers pandas' SettingWithCopy handling.
    next_week_proj = next_week_proj.drop(proj_info_cols, axis=1)

    # join the two dataframes
    df = pd.merge(proj_data, hist_data, how="inner", on=["name_key", "week", "year"])
    # drop the week you want to predict from the training data
    df = df[df.week < predict_week]

    # remove the player-info columns from the training data
    merged_info_cols = ['name', 'name_key', 'position_x', 'position_y', 'team', 'week', 'year',
                        'full_name', 'played', 'player_id']
    df = df.drop(merged_info_cols, axis=1)

    hist_cols = ['rushing_tds', 'rushing_att', 'receiving_yds', 'receiving_yac_yds',
                 'receiving_tds', 'receiving_tar', 'receiving_rec', 'rushing_yds']

    y = df[y_col]
    X = df.drop(hist_cols, axis=1)

    train, test = train_test_split_index(df.shape[0], test_size=0.1, seed=0)

    X_train = X.iloc[train]
    y_train = y.iloc[train]
    X_test = X.iloc[test]
    y_test = y.iloc[test]

    regr = LinearRegression()
    regr.fit(X_train, y_train)
    pred = regr.predict(X_test)

    # Each print passes one pre-formatted string so the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    print('Predicting  %s' % (y_col,))
    print('Historical summary statistics:')
    print(y_test.describe())
    print('')
    print('Projected summary statistics:')
    print(sp.stats.describe(pred))
    print('')
    print('RMSE:  %s' % (mean_squared_error(y_test, pred) ** 0.5,))
    print('MAE:  %s' % (mean_absolute_error(y_test, pred),))
    print('intercept:  %s' % (regr.intercept_,))
    print('coefficients:')
    for col, coef in zip(X_train.columns, regr.coef_):
        print('    %s %s' % (col, coef))
    print('')

    pred_labels = [(player['name'], player['week'])
                   for _, player in pred_info.iterrows()]

    pred = regr.predict(next_week_proj)

    # print out predicted data for inspection; the last header column names
    # the target actually predicted rather than a fixed 'Rushing yards'
    print('Player\tWeek\t%s' % (y_col,))
    for (player, week), prediction in zip(pred_labels, pred):
        print("\t".join([player, str(week), str(prediction)]))

示例#3

显示文件

文件： rb_stats.py 项目： kevinallen/waiver_coach

def add_expert_projections(pred_results, pred_week, y_col, info_2015, result_rows=None):
    """Add expert-based predictions for ``y_col`` to ``pred_results``.

    Two model families are evaluated and then used for the ``pred_week``
    projections:

    1. "Expert Only": projected player data joined with historical weekly
       data, target ``y_col`` taken from the historical data.
    2. "Historical + Expert": projected player data joined with
       ``info_2015``, target ``y_col`` taken from ``info_2015``.

    For both, gradient boosting, random forest and linear regression are
    scored via ``fit_predict`` on the split from ``train_test_split_index``,
    the scores are printed and recorded through ``result_row``, and the
    gradient-boosting model is refitted on all rows to predict ``pred_week``.

    Args:
        pred_results: Frame of predictions made from historical data.  A
            ``'position'`` column set to ``'RB'`` is written onto it in place.
        pred_week: Week to predict; rows with ``week >= pred_week`` are
            excluded from training.
        y_col: Name of the target statistic.
        info_2015: Frame with ``full_name``, ``week``, ``year``, ``position``
            and ``y_col`` columns as built by the caller.
        result_rows: Optional list that score rows are appended to via
            ``result_row``.  A new list is used when omitted.

    Returns:
        ``pred_results`` (with a name key added) left-merged on ``name_key``
        with the ``'expert_' + y_col`` and ``'hist_expert_' + y_col``
        predictions plus any projection columns not already present.
    """
    # A fresh list per call avoids the shared-mutable-default pitfall; a list
    # supplied by the caller (even an empty one) is still appended to in place.
    rows = [] if result_rows is None else result_rows

    db = nfldb.connect()

    yr_wk = [(2015, i) for i in range(1, pred_week + 1)]
    stats = ['receiving_rec', 'receiving_tar', 'receiving_tds', 'receiving_yac_yds',
             'receiving_yds', 'rushing_att', 'rushing_tds', 'rushing_yds']
    player_info = ['player_id', 'full_name', 'position']
    position = 'RB'

    playerdata = WeeklyPlayerData(db=db, yr_wk=yr_wk, stats=stats,
                                  player_info=player_info, fill_time=True,
                                  position=position)

    pipe = Pipeline(steps=[('data', playerdata), ('key', AddNameKey()),
                           ('nan', HandleNaN(method='fill'))])
    hist_data = pipe.fit_transform(X=None)

    client = MongoClient()
    mdb = client.data

    include_stats = ['fumbles', 'receptions', 'rec_yds', 'rec_tds',
                     'rush_attempts', 'rush_yds', 'rush_tds']

    data = ProjectedPlayerData(db=mdb, stats=include_stats, position='RB')
    # data already has name_key, so don't need to add it to pipeline
    pipe = Pipeline(steps=[('data', data), ('nan', HandleNaN())])
    proj_data = pipe.fit_transform(X=None)

    # ---- Stage 1: expert projections vs. historical outcome ----

    # join the two datasets
    merged_df = pd.merge(proj_data, hist_data, how="inner", on=["name_key", "week", "year"])

    # drop the week you want to predict from the training data
    df = merged_df[merged_df.week < pred_week]

    # remove the player-info columns from the training data (non-inplace:
    # df is a filtered slice of merged_df)
    merged_info_cols = ['name', 'name_key', 'position_x', 'position_y', 'team', 'week',
                        'year', 'full_name', 'played', 'player_id']
    df = df.drop(merged_info_cols, axis=1)

    hist_cols = ['rushing_tds', 'rushing_att', 'receiving_yds', 'receiving_yac_yds',
                 'receiving_tds', 'receiving_tar', 'receiving_rec', 'rushing_yds']

    y = df[y_col]
    X = df.drop(hist_cols, axis=1)

    train, test = train_test_split_index(df.shape[0], test_size=0.1, seed=0)

    X_train = X.iloc[train]
    y_train = y.iloc[train]
    X_test = X.iloc[test]
    y_test = y.iloc[test]

    models = {
        'gb': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1),
        'rf': RandomForestRegressor(),
        'lin': LinearRegression(),
        'dum': DummyRegressor(),
    }

    fitted = {}
    scores = {}
    for key in ('gb', 'rf', 'lin'):
        fitted[key], _, scores[key] = fit_predict(
            model=models[key],
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test)

    _, _, dum_scores = fit_predict(
        model=models['dum'],
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        predict_proba=False)

    # Each print passes one pre-formatted string so the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    print("-" * 50)
    print('Expert prediction:  %s' % (y_col,))
    print('Gradient Boosting: RMSE %.2f | MAE %.2f' % (scores['gb']['rmse'], scores['gb']['mae']))
    print('Random Forest: RMSE %.2f | MAE %.2f' % (scores['rf']['rmse'], scores['rf']['mae']))
    print('%s Regression: RMSE %.2f | MAE %.2f' % ('Linear', scores['lin']['rmse'], scores['lin']['mae']))

    result_row(method='Expert Only', results=scores['gb'], stat=y_col, learner='Gradient Boosting', rows=rows)
    result_row(method='Expert Only', results=scores['rf'], stat=y_col, learner='Random Forest', rows=rows)
    result_row(method='Expert Only', results=scores['lin'], stat=y_col, learner='Linear', rows=rows)

    result_row(method='Baseline For Expert Adjusted', results=dum_scores, stat=y_col, learner='Mean', rows=rows)

    # get stats for the prediction week then remove player info.  These two
    # frames are only read from below, so they serve both stages.
    next_week_proj = proj_data[proj_data.week == pred_week]
    proj_info_cols = ['name', 'name_key', 'team', 'position', 'year', 'week']
    pred_info = next_week_proj[proj_info_cols]
    next_week_proj = next_week_proj.drop(proj_info_cols, axis=1)

    # make prediction for next week
    gb = fitted['gb']
    gb.fit(X, y)
    pred = gb.predict(next_week_proj)

    expert_df = pd.DataFrame(pred_info['name_key'])
    expert_df.columns = ['name_key']
    expert_df.loc[:, 'expert_' + y_col] = pred

    # before adding name_key, need to add position
    pred_results.loc[:, 'position'] = 'RB'

    for column in next_week_proj.columns:
        if column not in pred_results.columns:
            expert_df.loc[:, column] = next_week_proj[column]

    # add name_key to data predicted from historical data
    pipe = Pipeline(steps=[('key', AddNameKey())])
    results_with_key = pipe.fit_transform(X=pred_results)
    results_with_expert = pd.merge(results_with_key, expert_df, how="left", on="name_key")

    # ---- Stage 2: combined prediction from projections + info_2015 ----

    pred_with_y = pipe.fit_transform(X=info_2015)
    df_all = pd.merge(proj_data, pred_with_y, how="inner", on=["name_key", "week", "year"])
    df = df_all[df_all.week < pred_week]

    y = df[y_col]
    combined_info_cols = ['name', 'name_key', 'position_x', 'position_y', 'team', 'week', 'year', 'full_name']
    X = df.drop(combined_info_cols + [y_col], axis=1)

    train, test = train_test_split_index(df.shape[0], test_size=0.1, seed=0)

    X_train = X.iloc[train]
    y_train = y.iloc[train]
    X_test = X.iloc[test]
    y_test = y.iloc[test]

    # The same model instances from stage 1 are passed to fit_predict again.
    for key in ('gb', 'rf', 'lin'):
        fitted[key], _, scores[key] = fit_predict(
            model=models[key],
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test)

    print("-" * 50)
    print('Historical expert-adjusted prediction: %s' % (y_col,))
    print('Gradient Boosting: RMSE %.2f | MAE %.2f' % (scores['gb']['rmse'], scores['gb']['mae']))
    print('Random Forest: RMSE %.2f | MAE %.2f' % (scores['rf']['rmse'], scores['rf']['mae']))
    print('%s Regression: RMSE %.2f | MAE %.2f' % ('Linear', scores['lin']['rmse'], scores['lin']['mae']))
    print('')
    print('')

    result_row(method='Historical + Expert', results=scores['gb'], stat=y_col, learner='Gradient Boosting', rows=rows)
    result_row(method='Historical + Expert', results=scores['rf'], stat=y_col, learner='Random Forest', rows=rows)
    result_row(method='Historical + Expert', results=scores['lin'], stat=y_col, learner='Linear', rows=rows)

    # make prediction for next week
    gb = fitted['gb']
    gb.fit(X, y)
    pred = gb.predict(next_week_proj)

    combined_df = pd.DataFrame(pred_info['name_key'])
    combined_df.columns = ['name_key']
    combined_df.loc[:, 'hist_expert_' + y_col] = pred

    for column in next_week_proj.columns:
        if column not in results_with_expert.columns:
            combined_df.loc[:, column] = next_week_proj[column]

    return pd.merge(results_with_expert, combined_df, how="left", on="name_key")

示例#4

显示文件

文件： rb_nn.py 项目： kevinallen/waiver_coach

def main():
	"""Find nearest historical neighbours for each week-14 prediction row and plot them.

	Loads the training feature set, rescales the feature columns with a
	``CoefScaler`` wrapping ``Ridge``, fits ``NearestNeighbors`` (k = 100) on
	the rows where the player played, and for every prediction row passes the
	row plus its neighbours to ``plot_knn``.  The collected plot data is
	written with ``save_plot_data_json``.
	"""
	################################
	### CONFIGURE
	pred_week = 14 #None
	db = nfldb.connect()
	result_path = '../results'

	### LOAD DATA
	# load train data
	full_train, pipe, stats = load_feature_set(db)

	# picks columns to model
	lag_cols = [stat + '_lag' for stat in stats]
	mean_cols = [stat + '_mean' for stat in stats]
	other_cols = ['same_year_lag', 'played_lag']

	infoColumns = ExtractColumns(like=[], exact=['year', 'week', 'time', 'player_id', 'full_name'])
	row_info = infoColumns.fit_transform(X=full_train)

	# load prediction data
	pred_data, predict_i, pred_info, pred_yr_wk = prediction_feature_set(db, pipe, infoColumns, pred_week=pred_week)

	##################################
	### PREPARE DATA FOR TRAIN AND PREDICT
	# train data with all columns
	X_all = full_train

	# prediction data with all columns
	pred_all = pred_data.iloc[predict_i]

	# Positions (not index labels) of the rows where the player played.  The
	# values are used with .iloc below, so they must be positional even when
	# full_train does not carry a default 0..n-1 index.
	played_bool = full_train['played'] == 1
	played_index = [pos for pos, played in enumerate(played_bool) if played]

	# random split train and test
	# NOTE(review): the returned indices are not used in this function; the
	# call is kept in case the helper has side effects -- confirm and remove.
	train_index, test_index = train_test_split_index(X_all.shape[0], test_size=0.1, seed=0)

	feature_cols = lag_cols + mean_cols + other_cols
	XColumns = ExtractColumns(like=feature_cols)
	X = XColumns.fit_transform(X=X_all)
	X_pred = XColumns.fit_transform(X=pred_all)

	##################################
	### SET UP & TRAIN KNN
	# fit k nearest neighbors
	k = 100
	played_only = True
	i_knn = played_index if played_only else range(X.shape[0])

	# regularization: scale the features before the neighbour search
	reg = CoefScaler(linear_model=Ridge())
	reg = reg.fit(X=X.iloc[i_knn], y=score_stats(X_all, make_scorer(base_type='standard')).iloc[i_knn])
	X_reg = reg.transform(X.iloc[i_knn])
	nn = NearestNeighbors(n_neighbors=k).fit(X_reg)

	# kneighbors returns a tuple of (distances, indices of neighbors)
	# for the prediction set; only the indices are used below
	X_reg_pred = reg.transform(X=X_pred)
	distance, neighbor = nn.kneighbors(X=X_reg_pred)

	##################################
	### READ AND PLOT KNN RESULTS
	# Rows the neighbour indices refer to; selected once instead of once per
	# prediction row.
	knn_pool = X_all.iloc[i_knn]

	nn_dict = {}
	for check_i in range(pred_all.shape[0]):
		# check neighbors
		# check_nn is a data frame where the first row is the player
		# and the rest of the rows are the nearest neighbors
		check_nn = pred_all.iloc[[check_i], :].append(knn_pool.iloc[neighbor[check_i, :]])
		check_nn['StandardPoints'] = score_stats(check_nn, make_scorer(base_type='standard'))
		check_nn['PPRPoints'] = score_stats(check_nn, make_scorer(base_type='ppr'))

		nn_i = plot_knn(check_nn, save_image=True, plot_stat='StandardPoints', pred_yr_wk=pred_yr_wk, result_path=plot_image_path(result_path, pred_yr_wk), n_bins=25, bandwidth=2.5)
		nn_dict.update(nn_i)

	save_plot_data_json(nn_dict, result_path, pred_yr_wk)