示例#1
0
    def optimize_parameters(self,
                            matches,
                            current_time,
                            init_params=None,
                            padding=True,
                            verbose=1,
                            home_team_key='home_team_id',
                            away_team_key='away_team_id',
                            home_goals_key='home_goals',
                            away_goals_key='away_goals',
                            time_key='date',
                            control_dates=True):
        """Calibrate the model parameters by maximising ``self._likelihood``.

        The optimisation minimises the negated likelihood, first with
        Newton-CG and, if that reports a failure, with TNC.

        Args:
            matches: match history used for calibration; must support
                ``matches[time_key].max()`` (presumably a pandas DataFrame).
            current_time: calibration date, forwarded to the likelihood.
            init_params: optional starting point; any array-like holding
                ``self.nb_params`` values. Defaults to a vector of ones.
            padding: forwarded untouched to the likelihood functions.
            verbose: verbosity level forwarded to ``printv``.
            home_team_key, away_team_key, home_goals_key, away_goals_key,
            time_key: column names forwarded to the likelihood functions.
            control_dates: when true, assert that every match is strictly
                older than ``current_time``.

        Returns:
            The optimised parameter vector (``res.x``), or ``None`` when both
            optimisation attempts fail.
        """
        if control_dates:  # control we calibrate params on past data
            assert (matches[time_key].max() < current_time)

        # scipy requires a one-dimensional starting point; a (n, 1) column
        # vector is rejected by recent versions of `minimize`.
        if init_params is None:
            init_params = np.ones(self.nb_params)
        else:
            init_params = np.asarray(init_params, dtype=float).ravel()

        bounds = ((0.2, 5), ) * self.nb_params
        nb_teams = self.nb_teams

        # define local functions involved in optimization
        def constraint_fct(params):  # avg of alphas and betas must be one
            alpha_gap = np.sum(params[:nb_teams]) - nb_teams
            beta_gap = np.sum(params[nb_teams:2 * nb_teams]) - nb_teams
            return alpha_gap ** 2 + beta_gap ** 2

        def constraint_fct_der(params):
            # d/dp_i of (sum(p) - n)**2 is 2 * (sum(p) - n): it does not
            # depend on p_i itself.
            jac = np.zeros_like(params, dtype=float)
            jac[:nb_teams] = 2. * (np.sum(params[:nb_teams]) - nb_teams)
            jac[nb_teams:2 * nb_teams] = 2. * (
                np.sum(params[nb_teams:2 * nb_teams]) - nb_teams)
            return jac

        likelihood_kwargs = dict(padding=padding,
                                 home_team_key=home_team_key,
                                 away_team_key=away_team_key,
                                 home_goals_key=home_goals_key,
                                 away_goals_key=away_goals_key,
                                 time_key=time_key)

        def likelihood_m(params):
            return -self._likelihood(matches, params, current_time,
                                     **likelihood_kwargs)

        def likelihood_jac_m(params):
            return -self._likelihood_jac(matches, params, current_time,
                                         **likelihood_kwargs)

        def run_minimizer(method):
            # NOTE(review): per the scipy documentation Newton-CG supports
            # neither bounds nor constraints, and TNC supports bounds only;
            # scipy warns and ignores the unsupported arguments. They are
            # still passed so that behaviour is unchanged; switching to a
            # constrained solver (e.g. SLSQP) should be evaluated separately.
            return minimize(likelihood_m,
                            init_params,
                            jac=likelihood_jac_m,
                            method=method,
                            bounds=bounds,
                            options={
                                'xtol': 10e-3,
                                'disp': False
                            },
                            constraints=({
                                'type': 'eq',
                                'fun': constraint_fct,
                                'jac': constraint_fct_der
                            }, ))

        # other ok methods; TNC or L-BFGS-B
        res = run_minimizer('Newton-CG')
        if not res.success:
            printv(
                1, verbose,
                " fail to calibrate parameters with method Newton-CG. trying another method (TNC)"
            )
            res = run_minimizer('TNC')
            if not res.success:
                print('\033[91m' + "fail to calibrate parameters on date " +
                      str(current_time) + '\033[0m')
                return None
        return res.x
示例#2
0
    def fit_and_predict(self,
                        matches_to_predict,
                        full_matches_history,
                        nb_obs_years=3,
                        padding=True,
                        verbose=1,
                        home_team_key='home_team_id',
                        away_team_key='away_team_id',
                        home_goals_key='home_goals',
                        away_goals_key='away_goals',
                        time_key='date',
                        season_key='season',
                        stage_key='stage'):
        """Calibrate parameters stage by stage and predict match outcomes.

        Parameters are re-calibrated (via ``self.optimize_parameters``) each
        time the (season, stage) pair changes while walking the matches in
        chronological order. Each calibration only uses the history that lies
        in ``[t - nb_obs_years, t)`` where ``t`` is the date of the first
        match of the stage. Every match is then predicted with the most
        recent calibration whose date is not after the match date (or the
        earliest calibration when none qualifies).

        Args:
            matches_to_predict: DataFrame of matches to predict.
            full_matches_history: DataFrame of known matches used for
                calibration.
            nb_obs_years: length, in years, of the calibration window.
            padding: forwarded to ``optimize_parameters``.
            verbose: verbosity level forwarded to ``printv``.
            home_team_key, away_team_key, home_goals_key, away_goals_key,
            time_key, season_key, stage_key: column names.

        Returns:
            A tuple ``(predictions, params_history)``: ``predictions`` is an
            array with one ``[W, D, L]`` row per input match, in the order of
            ``matches_to_predict``; ``params_history`` is the list of
            ``[calibration_time, params]`` pairs sorted by time.
        """
        start_time = time()
        sorted_matches = matches_to_predict.sort_values(
            by=[season_key, stage_key, time_key])

        # nothing to predict: avoid indexing the first row of an empty frame
        if sorted_matches.empty:
            return pd.DataFrame(columns=['W', 'D', 'L']).values, []

        def calibrate(season, stage, calib_time, init_params):
            """Fit parameters on the history window preceding calib_time."""
            printv(2, verbose, "current calibration;   season", season,
                   "  day", stage, "  time", calib_time)
            min_hist_time = calib_time - relativedelta(years=nb_obs_years)
            history_times = full_matches_history[time_key]
            relevant_match_history = full_matches_history[
                (min_hist_time <= history_times)
                & (history_times < calib_time)]
            params = self.optimize_parameters(relevant_match_history,
                                              calib_time,
                                              init_params=init_params,
                                              padding=padding,
                                              verbose=verbose,
                                              home_team_key=home_team_key,
                                              away_team_key=away_team_key,
                                              home_goals_key=home_goals_key,
                                              away_goals_key=away_goals_key,
                                              time_key=time_key)
            # a failed calibration returns None: nothing to display then
            if verbose >= 3 and params is not None:
                self.print_params(params)
            return params

        # first parameters calibration
        pred_season = sorted_matches[season_key].iloc[0]
        pred_stage = sorted_matches[stage_key].iloc[0]
        pred_time = sorted_matches[time_key].iloc[0]
        opti_params = calibrate(pred_season, pred_stage, pred_time, None)
        # NOTE(review): the first entry is recorded even when the calibration
        # failed (params is None), as before; predict_match_outcome will then
        # receive None for the earliest matches.
        params_history = [
            [pred_time, opti_params],
        ]

        for _, match in sorted_matches.iterrows():
            cur_season = match[season_key]
            cur_stage = match[stage_key]
            if cur_season == pred_season and cur_stage == pred_stage:
                continue
            pred_season, pred_stage = cur_season, cur_stage
            pred_time = match[time_key]
            # we start optimization by last most relevant params
            new_params = calibrate(pred_season, pred_stage, pred_time,
                                   opti_params)
            if new_params is not None:
                # keep the last successful fit as warm start for the next
                # stage; a failure must not discard it
                opti_params = new_params
                params_history.append([pred_time, new_params])

        # make prediction by finding adapted param and use it to predict outcome
        sorted_params_history = sorted(params_history,
                                       key=lambda entry: entry[0])
        outcome_rows = []
        for _, match in matches_to_predict.iterrows():

            # find adapted params for given match
            match_t = match[time_key]
            params_t = sorted_params_history[0][1]
            for calib_t, calib_params in sorted_params_history:
                if calib_t > match_t:
                    break  # params to use have been found ! --> params_t
                params_t = calib_params

            # predictions using params
            p_w, p_d, p_l = self.predict_match_outcome(match[home_team_key],
                                                       match[away_team_key],
                                                       params_t)
            outcome_rows.append((p_w, p_d, p_l))
            printv(3, verbose, "prediction;", match[home_team_key],
                   match[away_team_key], " --> ", round(p_w, 4), round(p_d, 4),
                   round(p_l, 4))

        # build the frame once: DataFrame.append was removed in pandas 2.0
        # and appending row by row is quadratic
        df_predictions = pd.DataFrame(outcome_rows, columns=['W', 'D', 'L'])

        end_time = time()
        printv(1, verbose, " ... fit_and_predict computations performed in",
               round(end_time - start_time, 2), "seconds ...")

        return df_predictions.values, sorted_params_history
示例#3
0
def dixon_coles_predictions(matches_to_predict,
                            full_match_history,
                            nb_obs_years=3,
                            dixon_coles_params=None,
                            verbose=1,
                            intermediary_analysis=True,
                            home_team_key='home_team_id',
                            away_team_key='away_team_id',
                            home_goals_key='home_team_goal',
                            away_goals_key='away_team_goal',
                            time_key='date',
                            season_key='season',
                            stage_key='stage',
                            bkm_home_win_key='B365H',
                            bkm_draw_key='B365D',
                            bkm_away_win_key='B365A'):
    """Predict match outcomes with one Dixon-Coles model per country.

    Matches are grouped by their ``'league_country'`` column; for each
    country a ``DixonColes`` model is built on the teams found in that
    country's history and ``fit_and_predict`` is run. Predictions are then
    put back in the order of ``matches_to_predict`` and analysed globally
    with ``analyze_predictions``.

    Args:
        matches_to_predict: DataFrame of matches to predict. It is not
            modified; a working copy carries the temporary ordering column.
        full_match_history: DataFrame of known matches used for calibration.
        nb_obs_years: calibration window forwarded to ``fit_and_predict``.
        dixon_coles_params: optional keyword arguments for ``DixonColes``.
        verbose: verbosity level forwarded to ``printv`` and callees.
        intermediary_analysis: when true and several countries are present,
            also run ``analyze_predictions`` on each country separately.
        home_team_key, away_team_key, home_goals_key, away_goals_key,
        time_key, season_key, stage_key: column names.
        bkm_home_win_key, bkm_draw_key, bkm_away_win_key: bookmaker quote
            columns used for the analysis.

    Returns:
        Array with one prediction row per input match, in input order; an
        empty ``(0, 3)`` array when there is no match to predict.
    """
    # default model params
    if dixon_coles_params is None:
        dixon_coles_params = dict()

    def bookmaker_quotes(frame):
        """Extract the W/D/L bookmaker quotes of frame, keeping its index."""
        return pd.DataFrame({
            'W': frame[bkm_home_win_key],
            'D': frame[bkm_draw_key],
            'L': frame[bkm_away_win_key],
        })

    analysis_kwargs = dict(nb_max_matchs_displayed=40,
                           verbose=verbose,
                           home_team_key=home_team_key,
                           away_team_key=away_team_key,
                           home_goals_key=home_goals_key,
                           away_goals_key=away_goals_key)

    # create an index to be able to return predictions in the order of the
    # input (not the order it s been computed); `assign` returns a new frame
    # so the caller's DataFrame does not get an extra column
    matches_to_predict = matches_to_predict.assign(
        tmp_index=np.arange(len(matches_to_predict)))
    countries = list(matches_to_predict['league_country'].unique())
    indexed_predictions = []
    for country in countries:
        printv(1, verbose, "\n ####  WORKING WITH DATA FROM", country,
               " #### ")
        match_data = matches_to_predict[
            matches_to_predict['league_country'].isin([
                country,
            ])]
        match_history = full_match_history[
            full_match_history['league_country'].isin([
                country,
            ])]

        # on the below: define our team universe (teams we calibrate parameters on)
        team_universe = set(match_history[home_team_key].unique()) | set(
            match_history[away_team_key].unique())
        printv(1, verbose, ' ...', len(team_universe), ' teams involved:',
               *team_universe, '...')
        printv(1, verbose, ' ...', match_data.shape[0],
               'matches to predict ...')

        model = DixonColes(team_universe, **dixon_coles_params)
        printv(
            1, verbose,
            " ... fit dixon coles parameters and predict match outcomes ... ")
        predictions, _ = model.fit_and_predict(
            match_data,
            match_history,
            nb_obs_years=nb_obs_years,
            verbose=verbose,
            home_team_key=home_team_key,
            away_team_key=away_team_key,
            home_goals_key=home_goals_key,
            away_goals_key=away_goals_key,
            time_key=time_key,
            season_key=season_key,
            stage_key=stage_key)
        printv(1, verbose, " ... match outcomes predicted ... ")

        # display or not intermediary predictions quality
        if len(countries) > 1 and intermediary_analysis:
            analyze_predictions(match_outcomes_hot_vectors(match_data),
                                predictions,
                                bookmaker_quotes(match_data),
                                fully_labelled_matches=match_data,
                                **analysis_kwargs)

        # prefix each prediction row with the input position of its match
        indexed_predictions.append(
            np.concatenate(
                (match_data['tmp_index'].values.reshape((-1, 1)),
                 predictions),
                axis=1))

    # no match at all: nothing to sort or analyse
    if not indexed_predictions:
        return np.empty((0, 3))

    # extract all predictions, resort them by their index, and remove the index
    all_predictions = np.concatenate(indexed_predictions, axis=0)
    all_predictions = all_predictions[all_predictions[:, 0].argsort()][:, 1:]

    # perform a global analysis
    analyze_predictions(match_outcomes_hot_vectors(matches_to_predict),
                        all_predictions,
                        bookmaker_quotes(matches_to_predict),
                        fully_labelled_matches=matches_to_predict,
                        **analysis_kwargs)
    print("final_pred shape", all_predictions.shape)
    return all_predictions
示例#4
0
def analyze_predictions(y,
                        pred,
                        bkm_quotes,
                        nb_max_matchs_displayed=10,
                        compare_to_dummy_pred=True,
                        fully_labelled_matches=None,
                        verbose=1,
                        home_team_key='home_team_id',
                        away_team_key='away_team_id',
                        home_goals_key='home_team_goal',
                        away_goals_key='away_team_goal'):
    """Score model predictions against outcomes and bookmaker quotes.

    Prints a few prediction examples, then the log loss and the rank
    probability score of the model, both on all matches and on the subset of
    matches whose bookmaker probabilities contain no NaN, together with the
    bookmaker's own scores on that subset.

    Args:
        y: match outcomes, one row per match; indexed with ``.iloc`` (so
            presumably a pandas DataFrame of one-hot vectors).
        pred: model predictions, one row per match; indexed with a boolean
            mask (presumably a numpy array).
        bkm_quotes: bookmaker quotes, one row per match (DataFrame).
        nb_max_matchs_displayed: maximum number of example matches printed;
            ``None`` or ``0`` prints none.
        compare_to_dummy_pred: when true, also print the log loss of a
            uniform 1/3 prediction.
        fully_labelled_matches: optional DataFrame aligned with ``y`` used to
            print team names and scores next to the examples.
        verbose: verbosity level forwarded to ``printv``.
        home_team_key, away_team_key, home_goals_key, away_goals_key:
            column names in ``fully_labelled_matches``.

    Returns:
        ``(model_log_loss, model_rps, [[model_log_loss_on_bkm_subset,
        bkm_log_loss], [model_rps_on_bkm_subset, bkm_rps]])``.
    """
    assert (y.shape[0] == pred.shape[0] == bkm_quotes.shape[0])
    if fully_labelled_matches is not None:
        assert (y.shape[0] == fully_labelled_matches.shape[0])

    bkm_probas = bkm_quote_to_probas(bkm_quotes)
    if nb_max_matchs_displayed:
        printv(2, verbose, "--- on the below, few prediction examples")
    # `None` used to pass the guard above and then crash in min()
    nb_displayed = min(y.shape[0], nb_max_matchs_displayed or 0)
    for i in range(nb_displayed):
        if fully_labelled_matches is not None:
            printv(2, verbose, fully_labelled_matches[home_team_key].iloc[i],
                   '  ', fully_labelled_matches[home_goals_key].iloc[i],
                   fully_labelled_matches[away_goals_key].iloc[i], '  ',
                   fully_labelled_matches[away_team_key].iloc[i])
        printv(2, verbose, '\nmodel predictions :', pred[i])
        printv(2, verbose, 'bkm probas:', list(bkm_probas[i]))
        printv(2, verbose, 'bkm quote:', list(bkm_quotes.iloc[i]))

    # restrict the bookmaker comparison to matches with complete bkm data
    remove_nan_mask = [
        not contain_nan(bkm_probas[i]) for i in range(bkm_probas.shape[0])
    ]
    y_with_bkm = y.iloc[remove_nan_mask]
    pred_with_bkm = pred[remove_nan_mask]
    bkm_probas_clean = bkm_probas[remove_nan_mask]

    # log loss analysis
    model_log_loss = log_loss(y, pred)
    bkm_log_loss = log_loss(y_with_bkm, bkm_probas_clean)
    model_log_loss_bkm_comparison = log_loss(y_with_bkm, pred_with_bkm)
    printv(1, verbose, "\ntotal model log loss score                   :",
           round(model_log_loss, 4))
    printv(1, verbose, "model log loss score on matches with bkm data:",
           round(model_log_loss_bkm_comparison, 4))
    printv(1, verbose, "bkm log loss score (on matches with bkm data):",
           round(bkm_log_loss, 4))

    # rank probability score analysis
    model_rps = rank_prob_score(pred, y)
    bkm_rps = rank_prob_score(bkm_probas_clean, y_with_bkm)
    model_rps_bkm_comparison = rank_prob_score(pred_with_bkm, y_with_bkm)
    printv(1, verbose, "total model rps score                   :",
           round(model_rps, 4))
    printv(1, verbose, "model rps score on matches with bkm data:",
           round(model_rps_bkm_comparison, 4))
    printv(1, verbose, "bkm rps score (on matches with bkm data):",
           round(bkm_rps, 4))
    if compare_to_dummy_pred:
        printv(2, verbose, "score of equiprobability prediction :",
               round(log_loss(y, np.full(y.shape, 1. / 3)), 4))

    return model_log_loss, model_rps, [[
        model_log_loss_bkm_comparison, bkm_log_loss
    ], [model_rps_bkm_comparison, bkm_rps]]
def test_case_dixon_coles_one_country_predictions():
    """Run the Dixon-Coles pipeline on one country and back-test strategies.

    Loads the data with ``first_data_preparation``, keeps French matches,
    predicts every match played on or after 2016-04-30 using a one-year
    calibration window and an exponential time weighting, prints the
    prediction analysis and finally the result of several investment
    strategies applied to matches that have bookmaker quotes.
    """
    player_data, player_stats_data, team_data, match_data = first_data_preparation(
    )
    countries = [
        'France',
    ]
    # countries = ['England', ]
    min_date = datetime.strptime('2016-04-30', "%Y-%m-%d").date()

    # explicit copy: columns are added below and must not be written onto a
    # filtered view of the loaded data
    match_data = match_data[match_data['league_country'].isin(
        countries)].copy()

    # convert date input string to actual python date
    match_data['date'] = match_data['date'].map(
        lambda raw: datetime.strptime(raw, "%Y-%m-%d %H:%M:%S").date())

    # on the below: non effective way to use team names as id (easier for human-checking and debugging)
    team_id_to_name, _ = create_dict_involved_teams(match_data, team_data)
    match_data['home_team_id'] = match_data['home_team_api_id'].map(
        lambda api_id: team_id_to_name[api_id])
    match_data['away_team_id'] = match_data['away_team_api_id'].map(
        lambda api_id: team_id_to_name[api_id])

    # on the below: we define our team universe (teams we calibrate parameters on)
    mask_home = team_data['team_api_id'].isin(match_data['home_team_api_id'])
    mask_away = team_data['team_api_id'].isin(match_data['away_team_api_id'])
    team_universe = list(team_data[mask_home | mask_away]['team_long_name'])
    printv(1, VERBOSE, len(team_universe), team_universe)
    printv(1, VERBOSE, 'nb matches', match_data.shape[0])

    # save full_history before selecting recent matches to predict
    full_history = match_data
    match_data = match_data[match_data['date'] >= min_date]

    def exp_weight_fct(t1, t2):
        """Exponential decay of a match weight with its age in years."""
        return np.exp(-0.3 * (t2 - t1).days / 365.25)

    model = DixonColes(team_universe, weight_fct=exp_weight_fct)
    printv(1, VERBOSE,
           " ... fit dixon coles parameters and predict match outcomes ... ")
    predictions, _ = model.fit_and_predict(
        match_data,
        full_history,
        nb_obs_years=1,
        verbose=VERBOSE,
        home_goals_key='home_team_goal',
        away_goals_key='away_team_goal')
    printv(1, VERBOSE, " ... match outcomes predicted ... ")

    match_outcomes = match_outcomes_hot_vectors(match_data)
    bkm_quotes = pd.DataFrame({
        'W': match_data['B365H'],
        'D': match_data['B365D'],
        'L': match_data['B365A'],
    })

    # called for its printed report; the returned scores are not used here
    analyze_predictions(match_outcomes,
                        predictions,
                        bkm_quotes,
                        verbose=VERBOSE,
                        nb_max_matchs_displayed=40,
                        fully_labelled_matches=match_data)

    # strategies can only be evaluated on matches with bookmaker quotes
    remove_nan_mask = [
        not contain_nan(bkm_quotes.iloc[i]) for i in range(bkm_quotes.shape[0])
    ]
    bkm_quotes_r = bkm_quotes.iloc[remove_nan_mask]
    match_outcomes_r = match_outcomes.iloc[remove_nan_mask]
    predictions_r = predictions[remove_nan_mask]

    invest_strategies = [
        # invest 1 in each match (if expected return > 1% actually)
        ConstantAmountInvestStrategy(1.),
        # stdDev of each bet is 1% of wealth
        ConstantStdDevInvestStrategy(0.01),
        # Kelly's ratio investment to maximize's wealth long term return
        KellyInvestStrategy(),
        # invest 1% of money each time
        ConstantPercentInvestStrategy(0.01),
    ]

    init_wealth = 100
    for invest_stgy in invest_strategies:
        printv(1, VERBOSE, "\n#### results for ",
               invest_stgy.__class__.__name__, "####")
        df_recap_stgy = invest_stgy.apply_invest_strategy(
            predictions_r,
            bkm_quotes_r,
            match_outcomes_r,
            init_wealth=init_wealth)

        printv(
            1, VERBOSE, df_recap_stgy[[
                'invested_amounts', 'exp_gain_amounts', 'gain_amounts'
            ]].sum())
        printv(1, VERBOSE, 'wealth: from', init_wealth, 'to',
               round(df_recap_stgy['wealth'].iloc[-1], 4))
示例#6
0
    def optimize_parameters(self,
                            matches,
                            current_time,
                            init_params=None,
                            padding=True,
                            verbose=1,
                            home_team_key='home_team_id',
                            away_team_key='away_team_id',
                            home_goals_key='home_goals',
                            away_goals_key='away_goals',
                            time_key='date',
                            control_dates=True):
        """Fit the model parameters by maximising ``self._likelihood``.

        The negated likelihood is minimised with Newton-CG; when that solver
        reports a failure, TNC is tried from the same starting point.

        Args:
            matches: match history used for the fit; must support
                ``matches[time_key].max()``.
            current_time: calibration date forwarded to the likelihood.
            init_params: optional starting vector. By default the first
                ``self.nb_teams`` entries and the last entry are 0, all the
                others are 1.
            padding: forwarded untouched to the likelihood functions.
            verbose: verbosity level forwarded to ``printv``.
            home_team_key, away_team_key, home_goals_key, away_goals_key,
            time_key: column names forwarded to the likelihood functions.
            control_dates: when true, assert that all matches are strictly
                older than ``current_time``.

        Returns:
            The fitted parameter vector, or ``None`` if both solvers fail.
        """
        # Only past matches may be used for calibration.
        if control_dates:
            assert (matches[time_key].max() < current_time)

        def is_signed_slot(idx):
            # first nb_teams entries and the very last one: these start at 0
            # and get the symmetric (-3, 3) range
            return idx < self.nb_teams or idx == self.nb_params - 1

        if init_params is None:
            init_params = np.array([
                0. if is_signed_slot(idx) else 1.
                for idx in range(self.nb_params)
            ])

        bounds = tuple((-3, 3) if is_signed_slot(idx) else (0.01, 3)
                       for idx in range(self.nb_params))

        def constraint_fct(params):
            # sum of alphas must be 0
            return np.sum(params[0:self.nb_teams])

        def constraint_fct_der(params):
            gradient = np.zeros_like(params)
            gradient[:self.nb_teams] = 1.
            return gradient

        shared_kwargs = {
            'padding': padding,
            'home_team_key': home_team_key,
            'away_team_key': away_team_key,
            'home_goals_key': home_goals_key,
            'away_goals_key': away_goals_key,
            'time_key': time_key,
        }

        def likelihood_m(params):
            value = self._likelihood(matches, params, current_time,
                                     **shared_kwargs)
            return -value

        def likelihood_jac_m(params):
            gradient = self._likelihood_jac(matches, params, current_time,
                                            **shared_kwargs)
            return -gradient

        def solve_with(method):
            equality_constraint = {
                'type': 'eq',
                'fun': constraint_fct,
                'jac': constraint_fct_der
            }
            solver_options = {'xtol': 10e-3, 'disp': False}
            return minimize(likelihood_m,
                            init_params,
                            jac=likelihood_jac_m,
                            method=method,
                            bounds=bounds,
                            options=solver_options,
                            constraints=(equality_constraint, ))

        # other ok methods; TNC or L-BFGS-B
        outcome = solve_with('Newton-CG')
        if outcome.success:
            return outcome.x

        printv(
            1, verbose,
            " fail to calibrate parameters with method Newton-CG. trying another method (TNC)"
        )
        outcome = solve_with('TNC')
        if outcome.success:
            return outcome.x

        print('\033[91m' + "fail to calibrate parameters on date " +
              str(current_time) + '\033[0m')
        return None