def dixon_coles_predictions(matches_to_predict, full_match_history, nb_obs_years=3, dixon_coles_params=None,
                            verbose=1, intermediary_analysis=True,
                            home_team_key='home_team_id', away_team_key='away_team_id',
                            home_goals_key='home_team_goal', away_goals_key='away_team_goal',
                            time_key='date', season_key='season', stage_key='stage',
                            bkm_home_win_key='B365H', bkm_draw_key='B365D', bkm_away_win_key='B365A'):
    """Fit one Dixon-Coles model per league country and predict match outcomes.

    Matches are grouped by 'league_country'; a DixonColes model is calibrated on each
    country's history and used to predict that country's matches. Predictions are then
    re-sorted back into the order of the input `matches_to_predict`.

    Args:
        matches_to_predict: DataFrame of matches to predict; must contain 'league_country'
            plus the team/goal/time/bookmaker columns named by the *_key parameters.
        full_match_history: DataFrame of historical matches used for calibration.
        nb_obs_years: number of years of history the model observes.
        dixon_coles_params: optional dict of extra kwargs forwarded to DixonColes().
        verbose: verbosity level passed to printv and to the model.
        intermediary_analysis: if True (and more than one country), run a per-country
            analysis of prediction quality against bookmaker quotes.
        *_key: column names in the input DataFrames.

    Returns:
        np.ndarray of outcome predictions, one row per input match, in input order.
    """
    # default model params
    if dixon_coles_params is None:
        dixon_coles_params = dict()

    # Work on a copy: the original code added the 'tmp_index' column directly on the
    # caller's DataFrame, mutating the input argument (and risking a
    # SettingWithCopyWarning if the caller passed a slice).
    matches_to_predict = matches_to_predict.copy()

    # index used to return predictions in input order (not computation order)
    matches_to_predict['tmp_index'] = range(len(matches_to_predict))
    countries = list(matches_to_predict['league_country'].unique())

    all_predictions = None
    for country in countries:
        printv(1, verbose, "\n #### WORKING WITH DATA FROM", country, " #### ")
        match_data = matches_to_predict[matches_to_predict['league_country'].isin([country, ])]
        match_history = full_match_history[full_match_history['league_country'].isin([country, ])]

        # on the below: define our team universe (teams we calibrate parameters on)
        team_universe = set(match_history[home_team_key].unique()) | set(match_history[away_team_key].unique())
        printv(1, verbose, ' ...', len(team_universe), ' teams involved:', *team_universe, '...')
        printv(1, verbose, ' ...', match_data.shape[0], 'matches to predict ...')

        model = DixonColes(team_universe, **dixon_coles_params)
        printv(1, verbose, " ... fit dixon coles parameters and predict match outcomes ... ")
        predictions, param_histo = model.fit_and_predict(
            match_data, match_history, nb_obs_years=nb_obs_years, verbose=verbose,
            home_team_key=home_team_key, away_team_key=away_team_key,
            home_goals_key=home_goals_key, away_goals_key=away_goals_key,
            time_key=time_key, season_key=season_key, stage_key=stage_key)
        printv(1, verbose, " ... match outcomes predicted ... \n")

        # display or not intermediary (per-country) predictions quality
        if len(countries) > 1 and intermediary_analysis:
            match_outcomes = match_outcomes_hot_vectors(match_data)
            bkm_quotes = pd.DataFrame()
            bkm_quotes['W'] = match_data[bkm_home_win_key]
            bkm_quotes['D'] = match_data[bkm_draw_key]
            bkm_quotes['L'] = match_data[bkm_away_win_key]
            analysis = analyze_predictions(
                match_outcomes, predictions, bkm_quotes, nb_max_matchs_displayed=40,
                fully_labelled_matches=match_data, verbose=verbose,
                home_team_key=home_team_key, away_team_key=away_team_key,
                home_goals_key=home_goals_key, away_goals_key=away_goals_key)
            model_log_loss, model_rps, (log_loss_comparison_l, rps_comparison_l) = analysis

        # prepend the tmp_index as first column so we can re-sort afterwards,
        # then accumulate with predictions already made
        predictions_with_index = np.append(
            match_data['tmp_index'].values.reshape((-1, 1)), predictions, axis=1)
        if all_predictions is not None:
            all_predictions = np.append(all_predictions, predictions_with_index, axis=0)
        else:
            all_predictions = predictions_with_index

    # guard: with no country at all the original code crashed with a TypeError on None
    if all_predictions is None:
        raise ValueError("no matches to predict: 'league_country' yielded no groups")

    # extract all predictions, re-sort them by their index, and remove the index column
    all_predictions = all_predictions[all_predictions[:, 0].argsort()][:, 1:]

    # perform a global analysis (all countries together)
    all_match_outcomes = match_outcomes_hot_vectors(matches_to_predict)
    all_bkm_quotes = pd.DataFrame()
    all_bkm_quotes['W'] = matches_to_predict[bkm_home_win_key]
    all_bkm_quotes['D'] = matches_to_predict[bkm_draw_key]
    all_bkm_quotes['L'] = matches_to_predict[bkm_away_win_key]
    analysis = analyze_predictions(
        all_match_outcomes, all_predictions, all_bkm_quotes, nb_max_matchs_displayed=40,
        fully_labelled_matches=matches_to_predict, verbose=verbose,
        home_team_key=home_team_key, away_team_key=away_team_key,
        home_goals_key=home_goals_key, away_goals_key=away_goals_key)

    print("final_pred shape", all_predictions.shape)
    return all_predictions
def end_to_end_test_2():
    """End-to-end test: train (or load) a neural net on match features, analyze its
    predictions against bookmaker quotes, then simulate several betting strategies
    on the validation set.

    Relies on module-level config constants: MODEL_PATH, VERBOSE, SAVE_MODEL,
    DISPLAY_GRAPH.
    """
    player_data, player_stats_data, team_data, match_data = first_data_preparation()
    countries = ['France', 'England', 'Germany', 'Spain', 'Italy']
    min_date_str = '2013-07-31'

    # filter by countries
    mask_countries = match_data['league_country'].isin(countries)
    match_data = match_data.loc[mask_countries]

    # convert date input string to actual python date
    match_data['date'] = match_data.apply(
        lambda x: datetime.strptime(x['date'], "%Y-%m-%d %H:%M:%S").date(), axis=1)

    # filter on recent matchs only (to make predictions on them)
    min_date = datetime.strptime(min_date_str, "%Y-%m-%d").date()
    mask_date = match_data['date'] >= min_date
    matches_to_predict = match_data[mask_date]
    id_matches_to_predict = matches_to_predict['id']

    # load features/labels/quotes; matches kept for validation are those whose id
    # falls in the "recent" window selected above
    match_data, match_features, match_labels, bkm_quotes = simple_data_prep(
        verbose=1, fable_observed_matches=40, padding=True, remove_nan=False,
        fable="match_hist", label_format="hot_vectors")
    matches_to_predict_mask = match_data['id'].isin(id_matches_to_predict)
    X_train, X_val = match_features[~matches_to_predict_mask], match_features[matches_to_predict_mask]
    Y_train, Y_val = match_labels[~matches_to_predict_mask], match_labels[matches_to_predict_mask]
    bkm_quotes_train, bkm_quotes_val = bkm_quotes[~matches_to_predict_mask], bkm_quotes[matches_to_predict_mask]
    # NOTE: the original called display_shapes twice in a row (duplicate left around a
    # commented-out random-split experiment); the redundant second call was removed.
    display_shapes(X_train, X_val, Y_train, Y_val)

    epochs = 200
    convolution_model = True

    # define and configure model (hyperparameters below were hand-tuned; relu variants kept)
    if convolution_model:
        add_tag = "conv"
        n_activations = 16
        n_conv_filter = 4
        activation_fct = "relu"
        dropout = 0.45
        l2_reg = 0.07
        model = prepare_simple_nn_model_conv(
            X_train.shape[1:], n_activations=n_activations, n_conv_filter=n_conv_filter,
            activation_fct=activation_fct, base_dropout=dropout,
            l2_regularization_factor=l2_reg)
    else:
        add_tag = "simple"
        n_activations = 50  # relu
        activation_fct = "relu"
        dropout = 0.45
        l2_reg = 0.05  # relu
        model = prepare_simple_nn_model(
            X_train.shape[1:], n_activations=n_activations, activation_fct=activation_fct,
            base_dropout=dropout, l2_regularization_factor=l2_reg)

    # creates a model label containing most of its param (used to load / save it)
    model_label = add_tag + '_model_' + str(n_activations) + '_' + activation_fct + '_d' + str(dropout) + '_reg' + \
        str(l2_reg) + '_shape_' + ''.join(str(e) + '_' for e in X_train.shape[1:] if e > 1) + 'e' + \
        str(epochs)
    model_label = model_label.replace('.', '')
    model_label += '.h5py'

    # Decrease the learning rate dynamically when the monitored loss plateaus, so a
    # high initial LR keeps training fast without preventing convergence.
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=60,
                                                verbose=1, factor=0.6, min_lr=0.0001)

    # Define the optimizer
    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0)

    batch_size = 256
    try:
        # reuse an already-trained model if one was saved under this label
        model = load_model(MODEL_PATH + model_label)
    except OSError:
        # Compile and train the model from scratch
        model.compile(optimizer=optimizer, loss="categorical_crossentropy")
        if VERBOSE:
            model.summary()
        history = model.fit(x=X_train, y=Y_train, epochs=epochs, batch_size=batch_size,
                            validation_data=(X_val, Y_val), verbose=2,
                            callbacks=[learning_rate_reduction])
        if SAVE_MODEL:
            model.save(MODEL_PATH + model_label)

        # Plot the loss curves for training and validation (skip first 5 noisy epochs)
        fig, ax = plt.subplots(2, 1)
        ax[0].plot(history.history['loss'][5:], color='b', label="Training loss")
        ax[0].plot(history.history['val_loss'][5:], color='r', label="validation loss", axes=ax[0])
        legend = ax[0].legend(loc='best', shadow=True)
        if DISPLAY_GRAPH:
            plt.show()

    # model predictions (class probabilities)
    predictions_val = model.predict(X_val)
    predictions_train = model.predict(X_train)

    if VERBOSE:
        print("\n --- TRAIN ANALYSIS --- ")
        analyze_predictions(Y_train, predictions_train, bkm_quotes_train, nb_max_matchs_displayed=0)
        print("\n --- VAL ANALYSIS --- ")
        analyze_predictions(Y_val, predictions_val, bkm_quotes_val, nb_max_matchs_displayed=0)

    # on the below, reduce universe to matches with quotes
    remove_nan_mask_val = [not contain_nan(bkm_quotes_val.iloc[i])
                           for i in range(bkm_quotes_val.shape[0])]
    bkm_quotes_val_r = bkm_quotes_val.iloc[remove_nan_mask_val]
    Y_val_r = Y_val.iloc[remove_nan_mask_val]
    predictions_val_r = predictions_val[remove_nan_mask_val]

    constant_invest_stgy = ConstantAmountInvestStrategy(1.)  # invest 1 in each match (if expected return > 1% actually)
    constant_sigma_invest_stgy = ConstantStdDevInvestStrategy(0.01)  # stdDev of each bet is 1% of wealth
    kelly_invest_stgy = KellyInvestStrategy()  # Kelly's ratio investment to maximize's wealth long term return
    constant_percent_stgy = ConstantPercentInvestStrategy(0.01)  # invest 1% of money each time

    for invest_stgy in [constant_invest_stgy, constant_sigma_invest_stgy,
                        kelly_invest_stgy, constant_percent_stgy]:
        print("\n#### results for ", invest_stgy.__class__.__name__, "####")
        init_wealth = 100
        df_recap_stgy = invest_stgy.apply_invest_strategy(
            predictions_val_r, bkm_quotes_val_r, Y_val_r, init_wealth=init_wealth)
        print(df_recap_stgy[['invested_amounts', 'exp_gain_amounts', 'gain_amounts']].sum())
        print('wealth: from', init_wealth, 'to', round(df_recap_stgy['wealth'].iloc[-1], 4))
def test_case_dixon_coles_one_country_predictions():
    """Test case: fit a Dixon-Coles model on one country's matches, predict recent
    match outcomes, compare them to bookmaker quotes, and simulate several betting
    strategies.

    Relies on the module-level VERBOSE constant for output verbosity.
    """
    player_data, player_stats_data, team_data, match_data = first_data_preparation()
    countries = ['France', ]
    # countries = ['England', ]
    # matches on/after this date are the ones predicted; earlier ones are history only
    min_date = datetime.strptime('2016-04-30', "%Y-%m-%d").date()

    # keep only matches from the selected countries
    mask_countries = match_data['league_country'].isin(countries)
    match_data = match_data[mask_countries]
    # input(match_data['league_country'].unique())

    # convert date input string to actual python date
    match_data['date'] = match_data.apply(
        lambda x: datetime.strptime(x['date'], "%Y-%m-%d %H:%M:%S").date(), axis=1)

    # on the below: non effective way to use team names as id (easier for
    # human-checking and debugging)
    team_id_to_name, team_name_to_id = create_dict_involved_teams(match_data, team_data)
    # NOTE(review): despite the dict's name, it is keyed by api_id here — the resulting
    # 'home_team_id'/'away_team_id' columns presumably hold team *names*; confirm
    # against create_dict_involved_teams.
    match_data['home_team_id'] = match_data.apply(
        lambda x: team_id_to_name[x['home_team_api_id']], axis=1)
    match_data['away_team_id'] = match_data.apply(
        lambda x: team_id_to_name[x['away_team_api_id']], axis=1)

    # on the below: we define our team universe (teams we calibrate parameters on)
    mask_home = team_data['team_api_id'].isin(match_data['home_team_api_id'])
    mask_away = team_data['team_api_id'].isin(match_data['away_team_api_id'])
    team_universe = list(team_data[mask_home | mask_away]['team_long_name'])
    printv(1, VERBOSE, len(team_universe), team_universe)
    printv(1, VERBOSE, 'nb matches', match_data.shape[0])

    # save full_history before selecting recent matches to predict
    # (alias, not a copy — full_history shares data with match_data at this point)
    full_history = match_data
    mask_date = match_data['date'] >= min_date
    match_data = match_data[mask_date]

    # exponential time-decay weighting: older matches weigh less in the calibration
    exp_weight_fct = lambda t1, t2: np.exp(-0.3 * (t2 - t1).days / 365.25)
    model = DixonColes(team_universe, weight_fct=exp_weight_fct)
    printv(1, VERBOSE, " ... fit dixon coles parameters and predict match outcomes ... ")
    predictions, param_histo = model.fit_and_predict(
        match_data, full_history, nb_obs_years=1, verbose=VERBOSE,
        home_goals_key='home_team_goal', away_goals_key='away_team_goal')
    printv(1, VERBOSE, " ... match outcomes predicted ... \n")

    # one-hot actual outcomes (W/D/L) and the corresponding Bet365 quotes
    match_outcomes = match_outcomes_hot_vectors(match_data)
    bkm_quotes = pd.DataFrame()
    bkm_quotes['W'], bkm_quotes['D'], bkm_quotes['L'] = match_data[
        'B365H'], match_data['B365D'], match_data['B365A']

    # compare model predictions with actual outcomes and bookmaker quotes
    analysis = analyze_predictions(match_outcomes, predictions, bkm_quotes,
                                   verbose=VERBOSE, nb_max_matchs_displayed=40,
                                   fully_labelled_matches=match_data)
    model_log_loss, model_rps, (log_loss_comparison_l, rps_comparison_l) = analysis

    # reduce universe to matches that actually have bookmaker quotes (drop NaN rows)
    remove_nan_mask = [
        not contain_nan(bkm_quotes.iloc[i]) for i in range(bkm_quotes.shape[0])
    ]
    bkm_quotes_r = bkm_quotes.iloc[remove_nan_mask]
    match_outcomes_r = match_outcomes.iloc[remove_nan_mask]
    predictions_r = predictions[remove_nan_mask]

    constant_invest_stgy = ConstantAmountInvestStrategy(
        1.)  # invest 1 in each match (if expected return > 1% actually)
    constant_sigma_invest_stgy = ConstantStdDevInvestStrategy(
        0.01)  # stdDev of each bet is 1% of wealth
    kelly_invest_stgy = KellyInvestStrategy(
    )  # Kelly's ratio investment to maximize's wealth long term return
    constant_percent_stgy = ConstantPercentInvestStrategy(
        0.01)  # invest 1% of money each time

    # simulate each betting strategy on the quote-complete validation matches
    for invest_stgy in [
            constant_invest_stgy, constant_sigma_invest_stgy, kelly_invest_stgy,
            constant_percent_stgy
    ]:
        printv(1, VERBOSE, "\n#### results for ", invest_stgy.__class__.__name__, "####")
        init_wealth = 100
        df_recap_stgy = invest_stgy.apply_invest_strategy(
            predictions_r, bkm_quotes_r, match_outcomes_r, init_wealth=init_wealth)
        printv(
            1, VERBOSE, df_recap_stgy[[
                'invested_amounts', 'exp_gain_amounts', 'gain_amounts'
            ]].sum())
        printv(1, VERBOSE, 'wealth: from', init_wealth, 'to',
               round(df_recap_stgy['wealth'].iloc[-1], 4))