def rate(self, unstacked):
    """
    Run a Markov Chain Monte Carlo (MCMC) simulation on the defined
    directed graphical model (aka Bayesian Network).

    The possessions of each game are modelled as normally distributed
    around ``intercept + pace[home] + pace[away]``, where the per-team
    pace terms are re-centred so that they have zero mean.

    References
    ----------
    TODO

    Parameters
    ----------
    unstacked : dataframe
        Unstacked dataframe containing game and stat information. The
        columns `dt`, `i_hteam`, `i_ateam` and `poss` are read here.

    Returns
    -------
    model : pymc.Model
        The model assembled from the nodes defined in this method.
    mcmc : pymc.MCMC
        The sampler built on `model`, after `sample` has been called with
        `self.n_samples` iterations and `self.n_samples * self.burn_rate`
        burn-in iterations.

    Raises
    ------
    AssertionError
        If `self.burn_rate` is not between 0 and 1 (inclusive).
    """
    util.validate_games(unstacked, ['poss'])

    # Raised explicitly (instead of via an `assert` statement) so that the
    # check is not stripped when Python runs with -O. The exception type
    # and message are unchanged.
    if not 0 <= self.burn_rate <= 1:
        raise AssertionError(
            "burn rate must be between 0 and 1, but was %s" % self.burn_rate)

    # DataFrame.sort was replaced by sort_values in newer pandas releases;
    # use whichever one this pandas version provides.
    if hasattr(unstacked, 'sort_values'):
        unstacked = unstacked.sort_values('dt')
    else:
        unstacked = unstacked.sort('dt')

    teams = Pace._get_teams(unstacked)
    num_teams = teams.shape[0]
    home_team_idx = unstacked.i_hteam.values
    away_team_idx = unstacked.i_ateam.values
    observed_pace = unstacked.poss.values

    pace_initial = self._initial_guess()

    # Alternative prior, kept for reference:
    # tau = 1. / pymc.Uniform('sigma', 3, 20)**2
    tau = pymc.Uniform('tau', 1. / 40**2, 1. / 20**2)
    pace_prior = pymc.Normal("pace_prior", mu=0, tau=tau, size=num_teams,
                             value=pace_initial)
    pace_intercept = pymc.Normal('intercept', 66, 1 / 1**2, value=66)

    @pymc.deterministic
    def pace_rtg(pace=pace_prior):
        # Centre the team paces on zero; the subtraction yields a new
        # array, so the parent value is left untouched.
        return pace - np.mean(pace)

    @pymc.deterministic
    def mu_pace(home_team=home_team_idx,
                away_team=away_team_idx,
                paces=pace_rtg,
                pace_intercept=pace_intercept):
        # Expected possessions per game: intercept plus both teams' paces.
        return pace_intercept + paces[home_team] + paces[away_team]

    tau_poss = 1. / pymc.Uniform('sigma_poss', 1., 10.)**2
    # Observed possessions, plus an unobserved node with the same
    # parameters.
    poss = pymc.Normal('poss', mu=mu_pace, tau=tau_poss,
                       value=observed_pace, observed=True)
    poss_pred = pymc.Normal('poss_pred', mu=mu_pace, tau=tau_poss)

    model = pymc.Model([mu_pace, pace_prior, tau, pace_rtg, poss,
                        pace_intercept, tau_poss, poss_pred])

    # Optional MAP initialisation, kept for reference:
    # map_ = pymc.MAP(model)
    # map_.fit(method='fmin_powell')
    mcmc = pymc.MCMC(model)
    mcmc.sample(self.n_samples, self.n_samples * self.burn_rate)

    return model, mcmc
def rate(self, unstacked):
    r"""
    Run an adjusted stat model rating for the games data provided.

    By default, provides incremental ratings throughout the season, running
    the rating algorithm every `game_skip` games. For example, if
    `game_skip` is 1, then the algorithm provides updated ratings after
    each game played during the season, and there will be `num_games` sets
    of ratings. The run time gets progressively slower as the data included
    grows throughout the season.

    Stats are adjusted according to:

        adj = \sum raw_stat / adj_opp_stat * avg_stat * w_i * loc_i
              + w_pre * stat_pre

    As a side effect, `self.offensive_ratings`, `self.defensive_ratings`,
    `self.results` and `self.team_index` are set from the rating history
    (not set here when the multi-season path is taken).

    References
    ----------
    -Kenpom's own ratings explanation
        http://kenpom.com/blog/index.php/weblog/entry/ratings_explanation
    -Kenpom's explanation of margin of victory adjustment:
        http://kenpom.com/blog/index.php/weblog/entry/pomeroy_ratings_version_2.0
    -Kenpom's adjusted stats calculations explanation:
        http://kenpom.com/blog/index.php/weblog/entry/national_efficiency/

    Parameters
    ----------
    unstacked : dataframe
        Unstacked dataframe containing game and stat information.

    Returns
    -------
    unstacked : dataframe
        Original unstacked dataframe with ratings columns appended. When
        the data spans multiple seasons, the result of
        `self._rate_multiple(unstacked)` is returned instead.
    """
    util.validate_games(unstacked, ['pts', 'poss', 'ppp'])

    if AdjustedStat._is_multiple_seasons(unstacked):
        return self._rate_multiple(unstacked)

    # Need games to be in sequential order. DataFrame.sort was replaced by
    # sort_values in newer pandas releases; use whichever one is available.
    if hasattr(unstacked, 'sort_values'):
        unstacked = unstacked.sort_values('dt')
    else:
        unstacked = unstacked.sort('dt')

    teams, team_index = AdjustedStat._get_teams(unstacked)
    num_teams = teams.shape[0]
    num_games = unstacked.shape[0]
    unstacked = AdjustedStat._add_team_index(unstacked, team_index)

    idx, loc, oraw, draw = self._initialize(unstacked, teams)
    o_pre, d_pre = self._preseason_rank(teams)

    # Add the preseason rank as a starting point
    adj_o_history = [o_pre]
    adj_d_history = [d_pre]

    game_indices = unstacked[['i_hteam', 'i_ateam']].values
    # Number of games seen so far for every team index.
    current_index = dict.fromkeys(range(num_teams), 0)
    dates = unstacked['dt'].values
    games_included = [0]
    zero_summary = AdjustedStat._empty_iteration_summary(date=dates[0])

    cumulative_home_o = np.zeros(num_games)
    cumulative_home_d = np.zeros(num_games)
    cumulative_away_o = np.zeros(num_games)
    cumulative_away_d = np.zeros(num_games)

    results = [zero_summary]
    prev_idx = 0
    # No ratings exist until the first rated iteration. Tracking this
    # explicitly (rather than testing `gidx == 0`) avoids referencing
    # unbound ratings if the first game happens not to be rated.
    adj_o = adj_d = None
    for gidx, (hidx, aidx) in enumerate(game_indices):
        # increment team vector indices to include new game
        current_index[hidx] += 1
        current_index[aidx] += 1

        if not self._should_rate(gidx, num_games):
            continue

        if self.verbose:
            # Single-argument parenthesised form prints the same text
            # whether `print` is a statement or a function.
            print('No. of games included: %s' % gidx)

        avg_o, avg_d = self._average_stats(oraw, draw, current_index)

        if adj_o is None:
            adj_o, adj_d = self._initial_guess(unstacked, teams, gidx)
        else:
            # the initial guess is simply the ratings from the previous
            # iteration
            # TODO: some weird convergence issues for this method
            adj_o = adj_o.copy()
            adj_d = adj_d.copy()

        adj_o, adj_d, iter_results = \
            self._rate_one(oraw, draw, avg_o, avg_d, loc, idx,
                           current_index, o_pre, d_pre,
                           start_o=adj_o, start_d=adj_d)

        self._update_cumulative_ratings(cumulative_home_o,
                                        cumulative_home_d,
                                        cumulative_away_o,
                                        cumulative_away_d,
                                        adj_o, adj_d,
                                        game_indices[:, 0],
                                        game_indices[:, 1],
                                        gidx, prev_idx)

        adj_o_history.append(adj_o.copy())
        adj_d_history.append(adj_d.copy())
        results.append(iter_results)
        games_included.append(gidx + 1)
        prev_idx = gidx

    # add a rating column to include ratings for each game in the dataframe
    unstacked = AdjustedStat._rate_for_games(unstacked, games_included,
                                             adj_o_history, adj_d_history,
                                             self.stat)

    self.offensive_ratings = np.array(adj_o_history)
    self.defensive_ratings = np.array(adj_d_history)
    self.results = results
    self.team_index = team_index

    return unstacked