Example #1
# Excerpt: only the 2010 tournament entry of this list is visible here.
tournament_parameters = [('data/original/wc_2010_games_real.csv',
                          'data/original/wc_2010_bets.csv', "2010-06-11")]
feature_sets = [("all_features", all_features),
                ("general_features", other_features),
                ("player_features", player_features)]

file_name = "outcome_report_full.txt"

reports = []
for (name, feature_set) in feature_sets:
    write_log(file_name, str(datetime.datetime.now()))
    write_log(file_name,
              f"Running test for feature set: {name}",
              print_text=True)

    data_loader = DataLoader(feature_set)
    X, y = data_loader.get_all_data("home_win")

    arguments = get_grid_search_arguments(X)
    results = run_grid_search(arguments, X, y)
    results.to_csv(f"gboost_hyperparam_optimization_{name}.csv")
    best_params_dict = get_best_params(results)
    optimal_params = {'n_estimators': 250}
    optimal_params["learning_rate"] = best_params_dict["learning_rate"]
    optimal_params["max_depth"] = best_params_dict["max_depth"]
    optimal_params["min_samples_leaf"] = best_params_dict["min_samples_leaf"]
    optimal_params["max_features"] = best_params_dict["max_features"]

    write_log(file_name, str(optimal_params), print_text=True)

    for (tt_file, bet_file, filter_start) in tournament_parameters:
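The excerpt stops at the start of the per-tournament evaluation loop, and the helpers it relies on (get_grid_search_arguments, run_grid_search, get_best_params) are not shown. As a rough, hypothetical sketch of the pattern they presumably wrap, a scikit-learn grid search over the same gradient-boosting hyperparameters could look like this (function names, search space, and scoring are assumptions, not the project's code):

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

def run_grid_search_sketch(X, y):
    # Assumed search space; the project's actual grid may differ.
    param_grid = {
        "learning_rate": [0.01, 0.05, 0.1],
        "max_depth": [2, 3, 5],
        "min_samples_leaf": [1, 3, 10],
        "max_features": ["sqrt", "log2", None],
    }
    search = GridSearchCV(GradientBoostingClassifier(n_estimators=250),
                          param_grid, scoring="neg_log_loss", cv=5, n_jobs=-1)
    search.fit(X, y)
    return pd.DataFrame(search.cv_results_)

def get_best_params_sketch(results):
    # Return the parameter dict of the best-scoring grid point.
    return results.loc[results["mean_test_score"].idxmax(), "params"]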
Example #2
]
feature_sets = [
    # ("all_features", all_features, "score_hyperparam_optimization_all_features.csv"),
    # ("general_features", other_features, "score_hyperparam_optimization_general_features.csv"),
    # ("player_features", player_features, "score_hyperparam_optimization_player_features.csv")
    ("rfe_features", rfe_feature, "score_hyperparam_optimization_rfe.csv"))
]

file_name = "score_report_full.txt"

reports = []
for (name, feature_set, fname) in feature_sets:
    write_log(file_name, str(datetime.datetime.now()))
    write_log(file_name, f"Running test for feature set: {name}", print_text=True)

    data_loader = DataLoader(feature_set)
    params = get_default_parameters()

    if os.path.isfile(fname):
        write_log(file_name, f"Hyperparameters found for: {name}", print_text=True)
        results = pd.read_csv(fname)
    else:
        Xhome, yhome, Xaway, yaway = data_loader.get_all_data(["home_score", "away_score"])
        _, outcomes = data_loader.get_all_data("home_win")

        arguments = get_cv_grid_search_arguments(params, Xhome)
        results = run_grid_search_for_score(arguments, Xhome, yhome, Xaway, yaway, outcomes)
        results.to_csv(f"score_hyperparam_optimization_{name}.csv")

    best_params_dict = get_best_params(results)
    write_log(file_name, str(best_params_dict), print_text=True)
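One caveat with the caching branch above: when results come back through pd.read_csv, any column that originally held Python dicts is read as plain strings, so a best-parameter helper has to parse it before use. A minimal sketch of that handling, assuming hypothetical column names "params" and "mean_test_score":

import ast

def best_params_from_results_sketch(results):
    # Handles both freshly computed results and results round-tripped
    # through CSV, where the params column comes back as a string.
    best = results.loc[results["mean_test_score"].idxmax(), "params"]
    return ast.literal_eval(best) if isinstance(best, str) else best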
Example #3
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss

# Fixed random-forest hyperparameters reused in every elimination round.
params = {
    "oob_score": True,
    "bootstrap": True,
    "n_jobs": -1,
    "n_estimators": 1000,
    "max_features": "sqrt",
    "max_depth": 8,
    "min_samples_leaf": 3
}

avg_accuracies = []
avg_log_losses = []
features = []

while len(feature_set) > 0:
    data_loader = DataLoader(feature_set)

    accuracies = []
    log_losses = []
    feature_values = {}
    for i in range(100):
        model = RandomForestClassifier(**params)

        X_train, y_train, X_test, y_test = data_loader.get_train_and_test_dataset("home_win", random_state=None)
        model.fit(X_train, y_train)

        y_true, y_pred = y_test, model.predict(X_test)
        accuracies.append(accuracy_score(y_true, y_pred))
        y_true, y_prob = y_test, model.predict_proba(X_test)
        log_losses.append(log_loss(y_true, y_prob))
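        # --- Hypothetical continuation (not part of the original excerpt) ---
        # The while loop above only terminates if feature_set shrinks, so the
        # truncated tail presumably averages the 100 runs and drops the weakest
        # feature. The lines below sketch that pattern, assuming the entries of
        # feature_set match the columns of X_train.
        for feature, importance in zip(X_train.columns, model.feature_importances_):
            feature_values[feature] = feature_values.get(feature, 0.0) + importance

    # Average the metrics over the 100 runs and record the surviving features.
    avg_accuracies.append(sum(accuracies) / len(accuracies))
    avg_log_losses.append(sum(log_losses) / len(log_losses))
    features.append(list(feature_set))

    # Remove the least important feature before the next elimination round.
    weakest = min(feature_values, key=feature_values.get)
    feature_set = [f for f in feature_set if f != weakest]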
Example #4
    if args.y == 2010:
        tt_file = 'data/original/wc_2010_games_real.csv'
        mb_file = 'data/original/wc_2010_bets.csv'
        filter_start = "2010-06-11"
    elif args.y == 2014:
        tt_file = 'data/original/wc_2014_games_real.csv'
        mb_file = 'data/original/wc_2014_bets.csv'
        filter_start = "2014-06-12"
    else:
        tt_file = 'data/original/wc_2018_games_real.csv'
        mb_file = 'data/original/wc_2018_bets.csv'
        filter_start = "2018-06-13"

    prefix = f"{args.f}_{args.y}"

    dl = DataLoader(all_features, filter_start=filter_start)
    model_parameters = get_default_parameters()
    model_parameters["max_depth"] = 8
    model_parameters["max_features"] = "sqrt"
    model_parameters["min_samples_leaf"] = 1
    af_data = simulate(tt_file, mb_file, dl, model_parameters,
                       f"{prefix}_all_features")

    dl = DataLoader(other_features, filter_start=filter_start)
    model_parameters["max_depth"] = 8
    model_parameters["max_features"] = "log2"
    model_parameters["min_samples_leaf"] = 10
    gf_data = simulate(tt_file, mb_file, dl, model_parameters,
                       f"{prefix}_general_features")

    dl = DataLoader(player_features, filter_start=filter_start)
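The excerpt reads args.y and args.f without showing how they are parsed; a hypothetical argparse setup that would provide them (flag names are inferred from the attribute access, defaults are assumptions):

import argparse

parser = argparse.ArgumentParser(description="Simulate betting on one World Cup.")
parser.add_argument("-y", type=int, choices=[2010, 2014, 2018], default=2018,
                    help="tournament year (2018 is the fallback branch above)")
parser.add_argument("-f", type=str, default="sim",
                    help="prefix for the output files")
args = parser.parse_args()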