def main(dist_file, pred_file, scoring_type):
    """Score predictions and merge them into a distribution JSON file.

    Parameters
    ----------
    dist_file : str
        Path to the distribution JSON (read first, then overwritten
        in place with the merged result).
    pred_file : str
        Path to a JSON list of prediction dicts, each carrying a
        "player_id" key used to join against ``dist_file``.
    scoring_type : str
        Base scoring type forwarded to ``make_scorer``
        (e.g. "standard").
    """
    # Load dist data file
    with open(dist_file) as f:
        distdata = json.load(f)

    # Index predictions by player id
    with open(pred_file) as f:
        preddata = {d["player_id"]: d for d in json.load(f)}

    # Build the scorer once, then score every prediction dict.
    # .items() (not the Python-2-only .iteritems()) keeps this
    # consistent with the Python-3 print() call below.
    scorer = make_scorer(base_type=scoring_type)
    preddata = {k: score_stats_dict(v, scorer=scorer) for k, v in preddata.items()}

    # Attach each scored prediction to its distribution entry;
    # print ids that have no matching prediction so they can be audited.
    for k in distdata:
        if k in preddata:
            distdata[k]["player_info"] = preddata[k]
        else:
            print(k)

    # Overwrite the dist file with the merged data
    with open(dist_file, "w") as f:
        json.dump(distdata, f)
def score_stats_dict(stat_dict, scorer=None):
    """Add a fantasy score to *stat_dict* under the key ``u"standard_score"``.

    The score is the dot product of the scorer weights with the matching
    stats in ``stat_dict``; weights whose stat is absent are ignored.

    Parameters
    ----------
    stat_dict : dict
        Stat name -> value. Mutated in place: gains a
        ``u"standard_score"`` entry.
    scorer : dict, optional
        Stat name -> point weight. Defaults to the standard scorer.
        The ``None`` sentinel replaces the old
        ``scorer=make_scorer(base_type="standard")`` default, which was
        evaluated once at import time (shared-default anti-pattern) and
        forced ``make_scorer`` to run on module import.

    Returns
    -------
    dict
        The same ``stat_dict`` object, for chaining.
    """
    if scorer is None:
        scorer = make_scorer(base_type="standard")
    score = sum(weight * stat_dict[stat] for stat, weight in scorer.items() if stat in stat_dict)
    stat_dict[u"standard_score"] = score
    return stat_dict
# ---- Example #3 (score: 0) ----
def main():
	"""Run the weekly KNN fantasy-projection pipeline.

	Loads the training feature set, fits a Ridge-coefficient-scaled
	nearest-neighbor model on rows where the player actually played,
	finds the k nearest historical rows for each player in the
	prediction week, scores and plots the neighbors, and saves the plot
	data as JSON.

	Depends on project helpers not visible in this file
	(load_feature_set, prediction_feature_set, ExtractColumns,
	CoefScaler, score_stats, make_scorer, train_test_split_index,
	plot_knn, plot_image_path, save_plot_data_json) plus nfldb and
	scikit-learn.
	"""
	################################
	### CONFIGURE
	pred_week = 14 #None  # week to predict; None presumably selects a default week -- TODO confirm in prediction_feature_set
	db = nfldb.connect()
	result_path='../results'

	### LOAD DATA
	# load train data
	full_train, pipe, stats = load_feature_set(db)

	# picks columns to model
	lag_cols = [stat + '_lag' for stat in stats]
	mean_cols = [stat + '_mean' for stat in stats]
	other_cols = ['same_year_lag', 'played_lag']

	# NOTE(review): row_info is computed here but never used below;
	# infoColumns itself is reused for the prediction set.
	infoColumns = ExtractColumns(like=[], exact=['year','week','time','player_id','full_name'])
	row_info = infoColumns.fit_transform(X=full_train)


	# load prediction data
	pred_data, predict_i, pred_info, pred_yr_wk = prediction_feature_set(db, pipe, infoColumns, pred_week=pred_week)

	##################################
	### PREPARE DATA FOR TRAIN AND PREDICT
	# train data with all columns
	X_all = full_train

	# prediction data with all columns
	pred_all = pred_data.iloc[predict_i]

	# which rows did players play
	played_bool = full_train['played'] == 1
	played_index = [i for i in range(X_all.shape[0]) if played_bool[i]]

	# random split train and test
	# NOTE(review): train_index/test_index are not used afterwards.
	train_index, test_index = train_test_split_index(X_all.shape[0], test_size=0.1, seed=0)

	# reduce both train and prediction frames to the modeling features
	feature_cols = lag_cols + mean_cols + other_cols
	XColumns = ExtractColumns(like=feature_cols)
	X = XColumns.fit_transform(X=X_all)
	X_pred = XColumns.fit_transform(X=pred_all)

	##################################
	### SET UP & TRAIN KNN
	# fit k nearest neighbors
	k = 100
	played_only = True  # restrict the neighbor pool to rows where the player played
	i_knn = played_index if played_only else range(X.shape[0])

	#nn = NearestNeighbors(n_neighbors=k).fit(X.iloc[i_knn])
	# regularization
	# Scale features by Ridge coefficients fit against standard fantasy
	# points, so KNN distance weights features by scoring relevance
	# -- presumably what CoefScaler does; TODO confirm its transform.
	reg = CoefScaler(linear_model=Ridge())
	reg = reg.fit(X=X.iloc[i_knn], y = score_stats(X_all, make_scorer(base_type='standard')).iloc[i_knn])
	X_reg = reg.transform(X.iloc[i_knn])
	nn = NearestNeighbors(n_neighbors=k).fit(X_reg)

	# returns tuple of (distances, indices of neighbors)
	# for prediction set
	#distance, neighbor = nn.kneighbors(X=X_pred)
	X_reg_pred = reg.transform(X=X_pred)
	distance, neighbor = nn.kneighbors(X=X_reg_pred)

	##################################
	### READ AND PLOT KNN RESULTS
	nn_dict = {}
	for check_i in range(pred_all.shape[0]):
	    # check neighbors
	    # check_nn is a data frame where the first row is the player
	    # and the rest of the rows are the nearest neighbors
	    # (neighbor indices are positions within X.iloc[i_knn], hence
	    # the double .iloc)
	    check_nn = pred_all.iloc[[check_i],:].append(X_all.iloc[i_knn].iloc[neighbor[check_i,:]])
	    check_nn['StandardPoints'] = score_stats(check_nn, make_scorer(base_type='standard'))
	    check_nn['PPRPoints'] = score_stats(check_nn, make_scorer(base_type='ppr'))

	    # plot this player's neighbor distribution and collect its plot data
	    nn_i = plot_knn(check_nn, save_image=True, plot_stat='StandardPoints', pred_yr_wk=pred_yr_wk, result_path=plot_image_path(result_path, pred_yr_wk), n_bins=25, bandwidth=2.5)
	    nn_dict.update(nn_i)

	# persist all collected plot data for downstream consumption
	save_plot_data_json(nn_dict, result_path, pred_yr_wk)