示例#1
0
def end_to_end_test():
    """Prepare data, train (or reload) a network, then back-test betting strategies.

    Takes no arguments and returns nothing: results are printed and, when
    DISPLAY_GRAPH is set, the loss curves are shown. Relies on the
    module-level settings MODEL_PATH, SAVE_MODEL, VERBOSE and DISPLAY_GRAPH.
    """
    np.random.seed(0)

    # load data (the first returned element is not used in this function)
    _match_data, match_features, match_labels, bkm_quotes = simple_data_prep(
        verbose=1,
        fable_observed_matches=40,
        padding=False,
        fable="match_hist",
        label_format="hot_vectors")

    # split data; the returned indices are reused so that labels and quotes
    # stay aligned with the feature rows
    validation_share = 0.15
    X_train, X_val, (train_idx, val_idx) = split_input(
        match_features,
        1. - validation_share,
        random=True,
        return_indices=True)
    Y_train, Y_val = match_labels.iloc[train_idx], match_labels.iloc[val_idx]
    bkm_quotes_train = bkm_quotes.iloc[train_idx]
    bkm_quotes_val = bkm_quotes.iloc[val_idx]

    display_shapes(X_train, X_val, Y_train, Y_val)

    # hyper-parameters shared by both architectures
    epochs = 200
    activation_fct = "relu"  # "sigmoid" was the previously tried alternative
    dropout = 0.45
    input_shape = X_train.shape[1:]

    # define and configure model
    use_convolution = False
    if use_convolution:
        add_tag = "conv"
        n_activations = 16  # 64 was tried before
        l2_reg = 0.07  # 0.003 was tried before
        model = prepare_simple_nn_model_conv(
            input_shape,
            n_activations=n_activations,
            n_conv_filter=4,
            activation_fct=activation_fct,
            base_dropout=dropout,
            l2_regularization_factor=l2_reg)
    else:
        add_tag = "simple"
        n_activations = 50  # relu setting; 128 was used with sigmoid
        l2_reg = 0.05  # relu setting; 0.002 was used with sigmoid
        model = prepare_simple_nn_model(
            input_shape,
            n_activations=n_activations,
            activation_fct=activation_fct,
            base_dropout=dropout,
            l2_regularization_factor=l2_reg)

    # The file name encodes most hyper-parameters so that a previously
    # trained model with the same settings can be found again.
    shape_tag = ''.join('{}_'.format(dim) for dim in input_shape if dim > 1)
    model_label = '{}_model_{}_{}_d{}_reg{}_shape_{}e{}'.format(
        add_tag, n_activations, activation_fct, dropout, l2_reg, shape_tag,
        epochs)
    model_label = model_label.replace('.', '') + '.h5py'
    model_file = MODEL_PATH + model_label

    # Learning-rate schedule: watch the validation loss with a patience of
    # 60 epochs, scale the rate by 0.6 on a plateau, never below 1e-4.
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss',
                                                patience=60,
                                                verbose=1,
                                                factor=0.6,
                                                min_lr=0.0001)

    # Define the optimizer (RMSprop was the alternative considered)
    optimizer = Adam(lr=0.001,
                     beta_1=0.9,
                     beta_2=0.999,
                     epsilon=1e-08,
                     decay=0)

    batch_size = 256
    try:
        # reuse a saved model when one exists for this exact configuration
        model = load_model(model_file)
    except OSError:
        # nothing to reload: compile and train the freshly built model
        # (no accuracy metric is requested, only the loss is tracked)
        model.compile(optimizer=optimizer, loss="categorical_crossentropy")

        if VERBOSE:
            model.summary()
        history = model.fit(x=X_train,
                            y=Y_train,
                            epochs=epochs,
                            batch_size=batch_size,
                            validation_data=(X_val, Y_val),
                            verbose=2,
                            callbacks=[learning_rate_reduction])
        if SAVE_MODEL:
            model.save(model_file)

        # Plot the training and validation loss, skipping the first 5 epochs.
        # The second subplot stays empty: it was meant for accuracy curves,
        # which are not recorded by the compile call above.
        fig, ax = plt.subplots(2, 1)
        loss_axis = ax[0]
        loss_axis.plot(history.history['loss'][5:],
                       color='b',
                       label="Training loss")
        loss_axis.plot(history.history['val_loss'][5:],
                       color='r',
                       label="validation loss",
                       axes=loss_axis)
        loss_axis.legend(loc='best', shadow=True)
        if DISPLAY_GRAPH:
            plt.show()

    # model predictions (to get percentages)
    predictions_val = model.predict(X_val)
    predictions_train = model.predict(X_train)

    if VERBOSE:
        print("\n --- TRAIN ANALYSIS --- ")
        analyze_predictions(Y_train,
                            predictions_train,
                            bkm_quotes_train,
                            nb_max_matchs_displayed=0)
        print("\n --- VAL ANALYSIS --- ")
        analyze_predictions(Y_val,
                            predictions_val,
                            bkm_quotes_val,
                            nb_max_matchs_displayed=0)

    # from here on, reduce the universe to matches that have quotes
    has_quotes = [
        not contain_nan(bkm_quotes_val.iloc[row])
        for row in range(len(bkm_quotes_val))
    ]
    bkm_quotes_val_r = bkm_quotes_val.iloc[has_quotes]
    Y_val_r = Y_val.iloc[has_quotes]
    predictions_val_r = predictions_val[has_quotes]

    strategies = [
        # invest 1 in each match (if expected return > 1% actually)
        ConstantAmountInvestStrategy(1.),
        # stdDev of each bet is 1% of wealth
        ConstantStdDevInvestStrategy(0.01),
        # Kelly's ratio investment to maximize wealth long term return
        KellyInvestStrategy(),
        # invest 1% of money each time
        ConstantPercentInvestStrategy(0.01),
    ]

    init_wealth = 100
    summary_columns = ['invested_amounts', 'exp_gain_amounts', 'gain_amounts']
    for invest_stgy in strategies:
        print("\n#### results for ", invest_stgy.__class__.__name__, "####")
        df_recap_stgy = invest_stgy.apply_invest_strategy(
            predictions_val_r,
            bkm_quotes_val_r,
            Y_val_r,
            init_wealth=init_wealth)

        print(df_recap_stgy[summary_columns].sum())
        final_wealth = round(df_recap_stgy['wealth'].iloc[-1], 4)
        print('wealth: from', init_wealth, 'to', final_wealth)
示例#2
0
def stacking_predictions():
    """Train a stacking network on bookmaker, neural-net and Dixon-Coles predictions.

    Reads the prediction, quote and result CSV files from the hard-coded
    predictions directory, keeps only matches whose bookmaker probabilities
    contain no NaN, fits the stacking model and prints the log loss of the
    stacked predictions next to that of each individual input.
    Takes no arguments and returns nothing.
    """
    np.random.seed(2)

    predictions_dir = "D:/Football_betting/predictions/"
    nn_pred = np.genfromtxt(predictions_dir + 'conv_nn_predictions.csv', delimiter=',')
    dixon_pred = np.genfromtxt(predictions_dir + 'dixon_coles_predictions.csv', delimiter=',')
    bkm_quotes = pd.read_csv(predictions_dir + 'bookmaker_quotes.csv', header=0)
    result_labels = pd.read_csv(predictions_dir + 'actual_results.csv', header=0)
    bkm_probas = bkm_quote_to_probas(bkm_quotes)

    # from here on, reduce the universe to matches that have quotes
    n_matches = bkm_probas.shape[0]
    has_quotes = [not contain_nan(bkm_probas[row]) for row in range(n_matches)]
    bkm_probas = bkm_probas[has_quotes]
    nn_pred = nn_pred[has_quotes]
    dixon_pred = dixon_pred[has_quotes]
    result_labels = result_labels.iloc[has_quotes]

    # split the labels and reuse the returned indices for every input source
    y_train, y_val, (train_idx, val_idx) = split_input(result_labels, split_ratio=0.8,
                                                       random=True, return_indices=True)

    bkm_probas_train, bkm_probas_val = bkm_probas[train_idx], bkm_probas[val_idx]
    nn_pred_train, nn_pred_val = nn_pred[train_idx], nn_pred[val_idx]
    dixon_pred_train, dixon_pred_val = dixon_pred[train_idx], dixon_pred[val_idx]

    # stack the three prediction sources side by side as model inputs
    x_train = np.concatenate((bkm_probas_train, nn_pred_train, dixon_pred_train), axis=1)
    x_val = np.concatenate((bkm_probas_val, nn_pred_val, dixon_pred_val), axis=1)

    print('inputs shapes')
    for tag, values in (('x_train', x_train), ('x_val', x_val),
                        ('y_train', y_train), ('y_val', y_val)):
        print(tag, values.shape)

    n_activations = 20
    model = simple_stacking_nn_model(n_activations, x_train.shape[1:], l2_regularization_factor=0.00005,
                                     dropout_factor=0.3)

    # Define the optimizer (RMSprop was the alternative considered)
    optimizer = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0)
    model.compile(optimizer=optimizer, loss="categorical_crossentropy")

    # Learning-rate schedule: watch the validation loss with a patience of
    # 60 epochs, scale the rate by 0.6 on a plateau, never below 1e-4.
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=60, verbose=1, factor=0.6, min_lr=0.0001)

    epochs = 800
    batch_size = 512  # all ?

    history = model.fit(x=x_train, y=y_train, epochs=epochs, batch_size=batch_size, validation_data=(x_val, y_val),
                        verbose=2, callbacks=[learning_rate_reduction])

    # Plot the training and validation loss, skipping the first 5 epochs.
    # The second subplot stays empty (it was meant for accuracy curves).
    fig, ax = plt.subplots(2, 1)
    loss_axis = ax[0]
    loss_axis.plot(history.history['loss'][5:], color='b', label="Training loss")
    loss_axis.plot(history.history['val_loss'][5:], color='r', label="validation loss", axes=loss_axis)
    loss_axis.legend(loc='best', shadow=True)
    if DISPLAY_GRAPH:
        plt.show()

    # model predictions (to get percentages)
    predictions_val = model.predict(x_val)
    predictions_train = model.predict(x_train)

    # log loss of the stacked model and of the bookmaker baseline
    for tag, truth, probas in (('predictions_train  ; ', y_train, predictions_train),
                               ('predictions_val    ; ', y_val, predictions_val),
                               ('bkm_train    ; ', y_train, bkm_probas_train),
                               ('bkm_val      ; ', y_val, bkm_probas_val)):
        print(tag, log_loss(truth, probas))
    print('--')
    # log loss of the two individual models fed into the stack
    for tag, truth, probas in (('nn_train    ; ', y_train, nn_pred_train),
                               ('nn_val      ; ', y_val, nn_pred_val),
                               ('dx_train    ; ', y_train, dixon_pred_train),
                               ('dx_val      ; ', y_val, dixon_pred_val)):
        print(tag, log_loss(truth, probas))
示例#3
0
def test_train_classifier():
    """Grid-search a PCA + random-forest pipeline on generated data, then calibrate and score it.

    Builds the data set with full_data_creation (20 teams, 20 seasons,
    index labels), carves a calibration set out of the training rows, runs
    train_classifier with a grid search and finally train_calibrate_predict.
    The resulting pipeline, classifier, reducer and scores are printed.
    Takes no arguments and returns nothing.
    """
    nb_teams = 20
    nb_seasons = 20

    # load everything
    data = full_data_creation(nb_teams,
                              nb_seasons,
                              dynamic_tag="dynamic",
                              nb_seasons_val=2,
                              fable_observed_seasons=1,
                              bkm_noise=0.03,
                              label_format="indices",
                              horizontal_fable_features=True)
    # split data
    X_train, X_val, Y_train, Y_val, actual_probas_train, actual_probas_val, bkm_quotes_train, bkm_quotes_val = data

    # carve a calibration set out of the training rows; the returned indices
    # keep the labels aligned with the features
    X_train, X_calib, [indices_train,
                       indices_calib] = split_input(X_train,
                                                    split_ratio=0.7,
                                                    random=True,
                                                    return_indices=True)
    # Y_calib must be taken before Y_train is overwritten by its own subset
    Y_calib = Y_train.iloc[indices_calib]
    Y_train = Y_train.iloc[indices_train]

    # LOG_clf / parameters_LOG are kept for the alternative logistic-regression
    # run: pass them to train_classifier in place of RF_clf / parameters_RF.
    LOG_clf = linear_model.LogisticRegression(multi_class="ovr",
                                              solver="sag",
                                              class_weight='balanced')

    dm_reduction = PCA()
    RF_clf = RandomForestClassifier(n_estimators=200,
                                    random_state=1,
                                    class_weight='balanced')

    # Creating cross validation data splits
    cv_sets = model_selection.StratifiedShuffleSplit(n_splits=5,
                                                     test_size=0.20,
                                                     random_state=5)

    # Candidate numbers of PCA components: from 5 upwards in steps of about a
    # fifth of the feature count. The step is forced to an integer >= 1 so the
    # grid holds integer component counts and the step can never be zero.
    n_features = X_train.shape[1]
    component_step = max(1, int(np.around(n_features / 5)))
    n_components_grid = np.arange(5, n_features, component_step)

    parameters_RF = {
        'clf__max_features': ['auto', 'log2'],
        'dm_reduce__n_components': n_components_grid
    }
    parameters_LOG = {
        # np.logspace takes exponents: 10**0 .. 10**3 gives C from 1 to 1000.
        # (The former logspace(1, 1000, 5) overflowed to inf after the first
        # value.)
        'clf__C': np.logspace(0, 3, 5),
        'dm_reduce__n_components': n_components_grid
    }

    # NOTE(review): make_scorer(log_loss) keeps the default
    # greater_is_better=True and scores hard class predictions, so a search
    # driven by it would prefer a HIGHER log loss. The intended scorer is
    # probably make_scorer(log_loss, greater_is_better=False, needs_proba=True)
    # (possibly with explicit labels) - confirm how train_classifier and
    # train_calibrate_predict use the scorer before switching.
    scorer = make_scorer(log_loss)

    # computations core to use
    jobs = 1

    best_pipe = train_classifier(RF_clf,
                                 dm_reduction,
                                 X_train,
                                 Y_train,
                                 cv_sets,
                                 parameters_RF,
                                 scorer,
                                 jobs,
                                 use_grid_search=True,
                                 best_components=None,
                                 best_params=None)

    print(best_pipe)

    clf, dm_reduce, train_score, test_score = train_calibrate_predict(
        RF_clf,
        dm_reduction,
        X_train,
        Y_train,
        X_calib,
        Y_calib,
        X_val,
        Y_val,
        cv_sets,
        parameters_RF,
        scorer,
        jobs,
        use_grid_search=True)
    print(clf)
    print(dm_reduce)
    print(train_score)
    print(test_score)
示例#4
0
def test_stacking_model():
    """Check the stacking network on prepared perfect / noisy / wrong predictions.

    Loads labels and three prediction sets from PATH, trains the stacking
    model on the noisy and wrong predictions only, and prints log losses so
    the stacked result can be compared with the individual inputs and with
    the perfect predictions. Takes no arguments and returns nothing.
    """

    # load data
    y_hot_vectors = pd.read_csv(PATH + "labels.csv")
    perfect_preds = pd.read_csv(PATH + "perfect_pred.csv")
    noisy_preds = pd.read_csv(PATH + "noisy_pred.csv")
    wrong_preds = pd.read_csv(PATH + "wrong_pred.csv")

    def print_baselines():
        # log loss of each raw prediction set over the whole data set
        print('perfect; ', log_loss(y_hot_vectors, perfect_preds))
        print('noisy  ; ', log_loss(y_hot_vectors, noisy_preds))
        print('wrong  ; ', log_loss(y_hot_vectors, wrong_preds))

    print_baselines()

    np.random.seed(2)

    # split once and reuse the indices so every frame is cut the same way
    perfect_preds_train, perfect_preds_val, (train_idx, val_idx) = split_input(
        perfect_preds, split_ratio=0.8, random=True, return_indices=True)
    y_train, y_val = y_hot_vectors.iloc[train_idx], y_hot_vectors.iloc[val_idx]
    noisy_preds_train, noisy_preds_val = noisy_preds.iloc[train_idx], noisy_preds.iloc[val_idx]
    wrong_preds_train, wrong_preds_val = wrong_preds.iloc[train_idx], wrong_preds.iloc[val_idx]

    # only the noisy and wrong predictions are fed to the stacking model
    x_train = np.concatenate((noisy_preds_train, wrong_preds_train), axis=1)
    x_val = np.concatenate((noisy_preds_val, wrong_preds_val), axis=1)

    n_activations = 20
    model = simple_stacking_nn_model(n_activations, x_train.shape[1:], l2_regularization_factor=0.00005,
                                     dropout_factor=0.3)

    # Define the optimizer (RMSprop was the alternative considered)
    optimizer = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0)
    model.compile(optimizer=optimizer, loss="categorical_crossentropy")

    # Learning-rate schedule: watch the validation loss with a patience of
    # 60 epochs, scale the rate by 0.6 on a plateau, never below 1e-4.
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=60, verbose=1, factor=0.6, min_lr=0.0001)

    epochs = 800
    batch_size = 512  # all ?

    history = model.fit(x=x_train, y=y_train, epochs=epochs, batch_size=batch_size, validation_data=(x_val, y_val),
                        verbose=2, callbacks=[learning_rate_reduction])

    # Plot the training and validation loss, skipping the first 5 epochs.
    # The second subplot stays empty (it was meant for accuracy curves).
    fig, ax = plt.subplots(2, 1)
    loss_axis = ax[0]
    loss_axis.plot(history.history['loss'][5:], color='b', label="Training loss")
    loss_axis.plot(history.history['val_loss'][5:], color='r', label="validation loss", axes=loss_axis)
    loss_axis.legend(loc='best', shadow=True)
    if DISPLAY_GRAPH:
        plt.show()

    # model predictions (to get percentages)
    predictions_val = model.predict(x_val)
    predictions_train = model.predict(x_train)

    print_baselines()

    print('predictions_train  ; ', log_loss(y_train, predictions_train))
    print('predictions_val    ; ', log_loss(y_val, predictions_val))
    print('perfect_val    ; ', log_loss(y_val, perfect_preds_val))
示例#5
0
def full_data_creation(nb_teams, nb_seasons, dynamic_tag="dynamic", nb_seasons_val=2, fable_observed_seasons=1,
                       bkm_noise=0.03, bkm_fees=0.05, nb_fixed_seasons=0, fable='match_hist',
                       label_format="hot_vectors", horizontal_fable_features=False, verbose=1, data_path=DATA_PATH):
    """Load or generate Poisson match results and turn them into train / validation sets.

    The results and their outcome probabilities are read from CSV files in
    ``data_path``; when the files are missing they are generated (and
    exported) according to ``dynamic_tag``. Bookmaker quotes, match features
    ("fables") and labels are then derived and split into a training and a
    validation part.

    Side effect: seeds NumPy's global random generator with 0.

    Args:
        nb_teams: number of teams; used in the file name, by the generators
            and to size the observation window of the fables.
        nb_seasons: total number of seasons.
        dynamic_tag: part of the CSV file names; must be "dynamic" or
            "stationary" when the data has to be generated.
        nb_seasons_val: number of seasons reserved for validation.
        fable_observed_seasons: number of seasons observed by a fable; the
            matching leading share of the training rows is dropped.
        bkm_noise: ``std_dev`` passed to create_noisy_bookmaker_quotes.
        bkm_fees: ``fees`` passed to create_noisy_bookmaker_quotes.
        nb_fixed_seasons: forwarded to the dynamic generator only.
        fable: "match_hist" or "stats", selects the feature builder.
        label_format: "hot_vectors", "indices" or "labels".
        horizontal_fable_features: forwarded to simple_fable
            ("match_hist" only).
        verbose: when truthy, progress, shapes and reference log losses are
            printed.
        data_path: prefix (directory) of the CSV files.

    Returns:
        Tuple ``(X_train, X_val, Y_train, Y_val, actual_probas_train,
        actual_probas_val, bkm_quotes_train, bkm_quotes_val)``.

    Raises:
        AssertionError: if the season counts, ``label_format`` or ``fable``
            are invalid.
        ValueError: if the data files are missing and ``dynamic_tag`` names
            no known generator.
    """

    # Check inputs
    assert nb_seasons_val + fable_observed_seasons < nb_seasons
    assert label_format in ("hot_vectors", "indices", "labels")
    assert fable in ("match_hist", "stats")

    params_str = 't' + str(nb_teams) + '_s' + str(nb_seasons) + '_'
    results_file = data_path + params_str + dynamic_tag + "_poisson_results.csv"
    probas_file = data_path + params_str + dynamic_tag + "_poisson_results_probabilities.csv"

    np.random.seed(0)
    try:
        match_results = pd.read_csv(results_file)
        actual_probas = pd.read_csv(probas_file)
        print(" ... data files have been loaded ...")
    except FileNotFoundError:
        print("no data files found: ... creating data ...")
        # the third value returned by the generators is not needed here
        if dynamic_tag == "dynamic":
            match_results, actual_probas, _ = create_dynamic_poisson_match_results(
                nb_teams, nb_seasons, nb_fixed_seasons=nb_fixed_seasons, export=True)
        elif dynamic_tag == "stationary":
            match_results, actual_probas, _ = create_stationary_poisson_match_results(
                nb_teams, nb_seasons, export=True)
        else:
            # Without this guard the function would fail further down with an
            # unrelated UnboundLocalError on match_results / actual_probas.
            raise ValueError("no data files found and cannot create data for dynamic_tag=%r "
                             "(expected 'dynamic' or 'stationary')" % (dynamic_tag,))

    bkm_quotes = create_noisy_bookmaker_quotes(actual_probas, std_dev=bkm_noise, fees=bkm_fees)

    match_results['date'] = create_time_feature_from_season_and_stage(match_results, base=100)

    if verbose:
        print(" ... creating fables ...")
    # size of the observation window handed to the fable builders
    nb_observed_match = (nb_teams - 1) * fable_observed_seasons * 2
    if fable == "match_hist":
        match_fables = simple_fable(match_results, nb_observed_match=nb_observed_match,
                                    horizontal_features=horizontal_fable_features)
    elif fable == "stats":
        match_fables = simple_stats_fable(match_results, nb_observed_match=nb_observed_match)

    if label_format == "hot_vectors":
        match_labels = match_outcomes_hot_vectors(match_results)
    elif label_format == "indices":
        match_labels = match_outcomes_indices(match_results)
    elif label_format == "labels":
        match_labels = match_results.apply(get_match_label, axis=1)

    # Split the train and the validation set for the fitting: the validation
    # share is nb_seasons_val out of nb_seasons (non-random split).
    split_ratio_1 = 1. - nb_seasons_val / nb_seasons
    X_train, X_val, (indices90, indices10) = split_input(match_fables, split_ratio=split_ratio_1,
                                                         random=False, return_indices=True)

    # eliminate first season (no fable): drop the leading share of the
    # training rows that corresponds to the observed seasons
    split_ratio_2 = fable_observed_seasons / (nb_seasons - nb_seasons_val)
    _, X_train, (_, remaining_train_indices) = split_input(X_train, split_ratio=split_ratio_2, random=False,
                                                           return_indices=True)

    # apply the same two-step selection to labels, quotes and probabilities
    Y_train = match_labels.iloc[indices90].iloc[remaining_train_indices]
    Y_val = match_labels.iloc[indices10]
    bkm_quotes_train = bkm_quotes.iloc[indices90].iloc[remaining_train_indices]
    bkm_quotes_val = bkm_quotes.iloc[indices10]

    if verbose:
        display_shapes(X_train, X_val, Y_train, Y_val)

    # get actual probabilities of issues for both sets of matches
    actual_probas_train = actual_probas.iloc[indices90].iloc[remaining_train_indices]
    actual_probas_val = actual_probas.iloc[indices10]
    if verbose:
        print("best possible honest score on train set:", log_loss(Y_train, actual_probas_train))
        print("best possible honest score on validation set:", log_loss(Y_val, actual_probas_val))

    return X_train, X_val, Y_train, Y_val, actual_probas_train, actual_probas_val, bkm_quotes_train, bkm_quotes_val