Example #1
def dixon_coles_predictions(matches_to_predict,
                            full_match_history,
                            nb_obs_years=3,
                            dixon_coles_params=None,
                            verbose=1,
                            intermediary_analysis=True,
                            home_team_key='home_team_id',
                            away_team_key='away_team_id',
                            home_goals_key='home_team_goal',
                            away_goals_key='away_team_goal',
                            time_key='date',
                            season_key='season',
                            stage_key='stage',
                            bkm_home_win_key='B365H',
                            bkm_draw_key='B365D',
                            bkm_away_win_key='B365A'):

    # default model params
    if dixon_coles_params is None:
        dixon_coles_params = dict()

    # create an index so predictions can be returned in the order of the
    # input (not the order in which they were computed)
    matches_to_predict['tmp_index'] = range(len(matches_to_predict))
    countries = list(matches_to_predict['league_country'].unique())
    all_predictions = None
    for country in countries:
        printv(1, verbose, "\n ####  WORKING WITH DATA FROM", country,
               " #### ")
        match_data = matches_to_predict[
            matches_to_predict['league_country'] == country]
        match_history = full_match_history[
            full_match_history['league_country'] == country]

        # define our team universe (the teams we calibrate parameters on)
        team_universe = set(match_history[home_team_key].unique()) | set(
            match_history[away_team_key].unique())
        printv(1, verbose, ' ...', len(team_universe), ' teams involved:',
               *team_universe, '...')
        printv(1, verbose, ' ...', match_data.shape[0],
               'matches to predict ...')

        model = DixonColes(team_universe, **dixon_coles_params)
        printv(
            1, verbose,
            " ... fit dixon coles parameters and predict match outcomes ... ")
        predictions, param_histo = model.fit_and_predict(
            match_data,
            match_history,
            nb_obs_years=nb_obs_years,
            verbose=verbose,
            home_team_key=home_team_key,
            away_team_key=away_team_key,
            home_goals_key=home_goals_key,
            away_goals_key=away_goals_key,
            time_key=time_key,
            season_key=season_key,
            stage_key=stage_key)
        printv(1, verbose, " ... match outcomes predicted ... ")

        # display intermediary per-country prediction quality if requested
        if len(countries) > 1 and intermediary_analysis:
            match_outcomes = match_outcomes_hot_vectors(match_data)
            bkm_quotes = pd.DataFrame()
            bkm_quotes['W'] = match_data[bkm_home_win_key]
            bkm_quotes['D'] = match_data[bkm_draw_key]
            bkm_quotes['L'] = match_data[bkm_away_win_key]
            analysis = analyze_predictions(match_outcomes,
                                           predictions,
                                           bkm_quotes,
                                           nb_max_matchs_displayed=40,
                                           fully_labelled_matches=match_data,
                                           verbose=verbose,
                                           home_team_key=home_team_key,
                                           away_team_key=away_team_key,
                                           home_goals_key=home_goals_key,
                                           away_goals_key=away_goals_key)

            model_log_loss, model_rps, (log_loss_comparison_l,
                                        rps_comparison_l) = analysis

        # add predictions to those already made
        predictions_with_index = np.append(
            match_data['tmp_index'].values.reshape((-1, 1)),
            predictions,
            axis=1)
        if all_predictions is not None:
            all_predictions = np.append(all_predictions,
                                        predictions_with_index,
                                        axis=0)
        else:
            all_predictions = predictions_with_index

    # extract all predictions, re-sort them by their index, then drop the index
    all_predictions = all_predictions[all_predictions[:, 0].argsort()][:, 1:]

    # perform a global analysis
    all_match_outcomes = match_outcomes_hot_vectors(matches_to_predict)
    all_bkm_quotes = pd.DataFrame()
    all_bkm_quotes['W'] = matches_to_predict[bkm_home_win_key]
    all_bkm_quotes['D'] = matches_to_predict[bkm_draw_key]
    all_bkm_quotes['L'] = matches_to_predict[bkm_away_win_key]
    analysis = analyze_predictions(all_match_outcomes,
                                   all_predictions,
                                   all_bkm_quotes,
                                   nb_max_matchs_displayed=40,
                                   fully_labelled_matches=matches_to_predict,
                                   verbose=verbose,
                                   home_team_key=home_team_key,
                                   away_team_key=away_team_key,
                                   home_goals_key=home_goals_key,
                                   away_goals_key=away_goals_key)
    print("final_pred shape", all_predictions.shape)
    return all_predictions
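
A minimal usage sketch (hypothetical file names; any DataFrames exposing the default column keys above, plus a 'league_country' column, would work):

import pandas as pd

history = pd.read_csv('full_match_history.csv')    # hypothetical path
upcoming = pd.read_csv('matches_to_predict.csv')   # hypothetical path
preds = dixon_coles_predictions(upcoming, history, nb_obs_years=3, verbose=1)
# preds comes back in the row order of `upcoming`, one row of outcome
# probabilities per match (presumably [home win, draw, away win])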
Example #2
def end_to_end_test_2():

    player_data, player_stats_data, team_data, match_data = \
        first_data_preparation()
    countries = ['France', 'England', 'Germany', 'Spain', 'Italy']
    min_date_str = '2013-07-31'

    # filter by countries
    mask_countries = match_data['league_country'].isin(countries)
    match_data = match_data.loc[mask_countries]

    # convert date input string to actual python date
    match_data['date'] = match_data.apply(
        lambda x: datetime.strptime(x['date'], "%Y-%m-%d %H:%M:%S").date(),
        axis=1)
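    # (equivalently and vectorized: pd.to_datetime(match_data['date']).dt.date)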

    # filter on recent matches only (to make predictions on them)
    min_date = datetime.strptime(min_date_str, "%Y-%m-%d").date()
    mask_date = match_data['date'] >= min_date
    matches_to_predict = match_data[mask_date]
    id_matches_to_predict = matches_to_predict['id']

    # load data
    match_data, match_features, match_labels, bkm_quotes = simple_data_prep(
        verbose=1,
        fable_observed_matches=40,
        padding=True,
        remove_nan=False,
        fable="match_hist",
        label_format="hot_vectors")

    matches_to_predict_mask = match_data['id'].isin(id_matches_to_predict)
    X_train = match_features[~matches_to_predict_mask]
    X_val = match_features[matches_to_predict_mask]
    Y_train = match_labels[~matches_to_predict_mask]
    Y_val = match_labels[matches_to_predict_mask]
    bkm_quotes_train = bkm_quotes[~matches_to_predict_mask]
    bkm_quotes_val = bkm_quotes[matches_to_predict_mask]
    display_shapes(X_train, X_val, Y_train, Y_val)

    # bkm_quotes_val.to_csv("D:/Football_betting/predictions/" + "bookmaker_quotes.csv", index=False)
    # Y_val.to_csv("D:/Football_betting/predictions/" + "actual_results.csv", index=False)
    # input('export done')

    # # split data
    # split_ratio = 0.15
    # X_train, X_val, (indices_train, indices_val) = split_input(match_features, 1.-split_ratio, random=True,
    #                                                            return_indices=True)
    # Y_train, Y_val = match_labels.iloc[indices_train], match_labels.iloc[indices_val]
    # bkm_quotes_train, bkm_quotes_val = bkm_quotes.iloc[indices_train], bkm_quotes.iloc[indices_val]

    display_shapes(X_train, X_val, Y_train, Y_val)

    epochs = 200
    convolution_model = True
    # define and configure model
    if convolution_model:
        add_tag = "conv"
        # n_activations = 64
        n_activations = 16
        n_conv_filter = 4
        # activation_fct = "sigmoid"
        activation_fct = "relu"
        dropout = 0.45
        # l2_reg = 0.003
        l2_reg = 0.07
        model = prepare_simple_nn_model_conv(X_train.shape[1:],
                                             n_activations=n_activations,
                                             n_conv_filter=n_conv_filter,
                                             activation_fct=activation_fct,
                                             base_dropout=dropout,
                                             l2_regularization_factor=l2_reg)
    else:
        add_tag = "simple"
        # n_activations = 128  # sigmoid
        n_activations = 50  # relu
        # activation_fct = "sigmoid"
        activation_fct = "relu"
        dropout = 0.45
        # l2_reg = 0.002  # sigmoid
        l2_reg = 0.05  # relu
        model = prepare_simple_nn_model(X_train.shape[1:],
                                        n_activations=n_activations,
                                        activation_fct=activation_fct,
                                        base_dropout=dropout,
                                        l2_regularization_factor=l2_reg)

    # create a model label containing most of its params
    # (used to load / save the model)
    model_label = (add_tag + '_model_' + str(n_activations) + '_'
                   + activation_fct + '_d' + str(dropout) + '_reg'
                   + str(l2_reg) + '_shape_'
                   + ''.join(str(e) + '_' for e in X_train.shape[1:] if e > 1)
                   + 'e' + str(epochs))
    model_label = model_label.replace('.', '')
    model_label += '.h5py'
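    # e.g. 'conv_model_16_relu_d045_reg007_shape_10_40_e200.h5py'
    # (the shape digits depend on X_train and are illustrative here)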

    # It's better to have a decreasing learning rate during training, to
    # reach the global minimum of the loss function more efficiently.
    # To keep the speed advantage of a high initial LR, the LR is reduced
    # dynamically when the monitored metric stops improving: with the
    # ReduceLROnPlateau callback from keras.callbacks, the LR is multiplied
    # by 0.6 whenever the validation loss has not improved for 60 epochs.
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss',
                                                patience=60,
                                                verbose=1,
                                                factor=0.6,
                                                min_lr=0.0001)

    # Define the optimizer
    # optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    optimizer = Adam(lr=0.001,
                     beta_1=0.9,
                     beta_2=0.999,
                     epsilon=1e-08,
                     decay=0)

    batch_size = 256
    try:
        model = load_model(MODEL_PATH + model_label)
    except OSError:
        # Compile the model
        # model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])
        model.compile(optimizer=optimizer, loss="categorical_crossentropy")

        if VERBOSE: model.summary()
        history = model.fit(x=X_train,
                            y=Y_train,
                            epochs=epochs,
                            batch_size=batch_size,
                            validation_data=(X_val, Y_val),
                            verbose=2,
                            callbacks=[learning_rate_reduction])
        if SAVE_MODEL: model.save(MODEL_PATH + model_label)

        # Plot the loss and accuracy curves for training and validation
        fig, ax = plt.subplots(2, 1)
        ax[0].plot(history.history['loss'][5:],
                   color='b',
                   label="Training loss")
        ax[0].plot(history.history['val_loss'][5:],
                   color='r',
                   label="Validation loss")
        legend = ax[0].legend(loc='best', shadow=True)

        # ax[1].plot(history.history['acc'], color='b', label="Training accuracy")
        # ax[1].plot(history.history['val_acc'], color='r', label="Validation accuracy")
        # legend = ax[1].legend(loc='best', shadow=True)
        if DISPLAY_GRAPH: plt.show()

    # model predictions
    predictions_val = model.predict(X_val)  # outcome probabilities
    predictions_train = model.predict(X_train)  # outcome probabilities

    # np.savetxt("D:/Football_betting/predictions/" + "conv_nn_predictions.csv", predictions_val, delimiter=',',
    #            fmt='%.6e')

    if VERBOSE:
        print("\n --- TRAIN ANALYSIS --- ")
        analyze_predictions(Y_train,
                            predictions_train,
                            bkm_quotes_train,
                            nb_max_matchs_displayed=0)
        print("\n --- VAL ANALYSIS --- ")
        analyze_predictions(Y_val,
                            predictions_val,
                            bkm_quotes_val,
                            nb_max_matchs_displayed=0)

    # below: reduce the universe to matches with (non-NaN) bookmaker quotes
    remove_nan_mask_val = [
        not contain_nan(bkm_quotes_val.iloc[i])
        for i in range(bkm_quotes_val.shape[0])
    ]
    bkm_quotes_val_r = bkm_quotes_val.iloc[remove_nan_mask_val]
    Y_val_r = Y_val.iloc[remove_nan_mask_val]
    predictions_val_r = predictions_val[remove_nan_mask_val]

    constant_invest_stgy = ConstantAmountInvestStrategy(
        1.)  # invest 1 in each match (when expected return > 1%)
    constant_sigma_invest_stgy = ConstantStdDevInvestStrategy(
        0.01)  # stdDev of each bet is 1% of wealth
    kelly_invest_stgy = KellyInvestStrategy(
    )  # Kelly-ratio investment, maximizing the long-term growth of wealth
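    # (for reference, the classic Kelly fraction for a binary bet is
    #  f* = (b * p - q) / b, with b the net quote, p the estimated win
    #  probability and q = 1 - p; KellyInvestStrategy's exact formula may
    #  differ for the three-outcome case)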
    constant_percent_stgy = ConstantPercentInvestStrategy(
        0.01)  # invest 1% of money each time

    for invest_stgy in [
            constant_invest_stgy, constant_sigma_invest_stgy,
            kelly_invest_stgy, constant_percent_stgy
    ]:
        print("\n#### results for ", invest_stgy.__class__.__name__, "####")
        init_wealth = 100
        df_recap_stgy = invest_stgy.apply_invest_strategy(
            predictions_val_r,
            bkm_quotes_val_r,
            Y_val_r,
            init_wealth=init_wealth)

        print(df_recap_stgy[[
            'invested_amounts', 'exp_gain_amounts', 'gain_amounts'
        ]].sum())
        print('wealth: from', init_wealth, 'to',
              round(df_recap_stgy['wealth'].iloc[-1], 4))


def test_case_dixon_coles_one_country_predictions():

    player_data, player_stats_data, team_data, match_data = \
        first_data_preparation()
    countries = ['France']
    # countries = ['England']
    min_date = datetime.strptime('2016-04-30', "%Y-%m-%d").date()

    mask_countries = match_data['league_country'].isin(countries)
    match_data = match_data[mask_countries]
    # input(match_data['league_country'].unique())

    # convert date input string to actual python date
    match_data['date'] = match_data.apply(
        lambda x: datetime.strptime(x['date'], "%Y-%m-%d %H:%M:%S").date(),
        axis=1)

    # inefficient way to use team names as ids (easier for human checking
    # and debugging)
    team_id_to_name, team_name_to_id = create_dict_involved_teams(
        match_data, team_data)
    match_data['home_team_id'] = match_data.apply(
        lambda x: team_id_to_name[x['home_team_api_id']], axis=1)
    match_data['away_team_id'] = match_data.apply(
        lambda x: team_id_to_name[x['away_team_api_id']], axis=1)

    # define our team universe (the teams we calibrate parameters on)
    mask_home = team_data['team_api_id'].isin(match_data['home_team_api_id'])
    mask_away = team_data['team_api_id'].isin(match_data['away_team_api_id'])
    team_universe = list(team_data[mask_home | mask_away]['team_long_name'])
    printv(1, VERBOSE, len(team_universe), team_universe)
    printv(1, VERBOSE, 'nb matches', match_data.shape[0])

    # save full_history before selecting recent matches to predict
    full_history = match_data

    mask_date = match_data['date'] >= min_date
    match_data = match_data[mask_date]

    exp_weight_fct = lambda t1, t2: np.exp(-0.3 * (t2 - t1).days / 365.25)
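    # with xi = 0.3, a match's weight halves roughly every 2.3 years
    # (ln(2) / 0.3 ≈ 2.31), so recent seasons dominate the fit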
    model = DixonColes(team_universe, weight_fct=exp_weight_fct)
    printv(1, VERBOSE,
           " ... fit dixon coles parameters and predict match outcomes ... ")
    predictions, param_histo = model.fit_and_predict(
        match_data,
        full_history,
        nb_obs_years=1,
        verbose=VERBOSE,
        home_goals_key='home_team_goal',
        away_goals_key='away_team_goal')
    printv(1, VERBOSE, " ... match outcomes predicted ... ")

    match_outcomes = match_outcomes_hot_vectors(match_data)
    bkm_quotes = pd.DataFrame()
    bkm_quotes['W'] = match_data['B365H']
    bkm_quotes['D'] = match_data['B365D']
    bkm_quotes['L'] = match_data['B365A']

    analysis = analyze_predictions(match_outcomes,
                                   predictions,
                                   bkm_quotes,
                                   verbose=VERBOSE,
                                   nb_max_matchs_displayed=40,
                                   fully_labelled_matches=match_data)
    model_log_loss, model_rps, (log_loss_comparison_l,
                                rps_comparison_l) = analysis

    remove_nan_mask = [
        not contain_nan(bkm_quotes.iloc[i]) for i in range(bkm_quotes.shape[0])
    ]
    bkm_quotes_r = bkm_quotes.iloc[remove_nan_mask]
    match_outcomes_r = match_outcomes.iloc[remove_nan_mask]
    predictions_r = predictions[remove_nan_mask]

    constant_invest_stgy = ConstantAmountInvestStrategy(
        1.)  # invest 1 in each match (when expected return > 1%)
    constant_sigma_invest_stgy = ConstantStdDevInvestStrategy(
        0.01)  # stdDev of each bet is 1% of wealth
    kelly_invest_stgy = KellyInvestStrategy(
    )  # Kelly-ratio investment, maximizing the long-term growth of wealth
    constant_percent_stgy = ConstantPercentInvestStrategy(
        0.01)  # invest 1% of money each time

    for invest_stgy in [
            constant_invest_stgy, constant_sigma_invest_stgy,
            kelly_invest_stgy, constant_percent_stgy
    ]:
        printv(1, VERBOSE, "\n#### results for ",
               invest_stgy.__class__.__name__, "####")
        init_wealth = 100
        df_recap_stgy = invest_stgy.apply_invest_strategy(
            predictions_r,
            bkm_quotes_r,
            match_outcomes_r,
            init_wealth=init_wealth)

        printv(
            1, VERBOSE, df_recap_stgy[[
                'invested_amounts', 'exp_gain_amounts', 'gain_amounts'
            ]].sum())
        printv(1, VERBOSE, 'wealth: from', init_wealth, 'to',
               round(df_recap_stgy['wealth'].iloc[-1], 4))
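
For reference, the ranked probability score (RPS) reported by analyze_predictions has a standard closed form for three-way (W/D/L) outcomes. Below is a minimal sketch, assuming the same [W, D, L] ordering as the bkm_quotes columns; it is not necessarily the exact implementation inside analyze_predictions:

import numpy as np

def rps_3way(pred, outcome):
    # ranked probability score for one 3-outcome match; pred and outcome are
    # length-3 arrays ([W, D, L] probabilities / one-hot label); lower is
    # better, 0 is a perfect forecast
    cdf_pred = np.cumsum(pred)
    cdf_obs = np.cumsum(outcome)
    # mean squared difference of the cumulative distributions over the
    # K - 1 = 2 non-trivial thresholds
    return np.sum((cdf_pred - cdf_obs) ** 2) / 2.0

print(rps_3way([0.8, 0.15, 0.05], [1, 0, 0]))  # ≈ 0.0213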