def test_value_iteration(gamma, S, A):
    for epsilon in np.logspace(-1, -6, num=5):
        for sim in range(5):
            # generate random MDP
            R, P = get_random_mdp(S, A)
            # run value iteration
            Q, V, n_it = value_iteration(R, P, gamma, epsilon)
            # check precision
            TQ = bellman_operator(Q, R, P, gamma)
            assert np.abs(TQ - Q).max() <= epsilon
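# The test above assumes a few helpers: get_random_mdp(S, A), taken to return a
# reward array R of shape (S, A) and a stochastic transition array P of shape
# (S, A, S), plus value_iteration and bellman_operator. Below is a minimal
# sketch of the latter two, consistent with the signatures used in the test;
# the library's actual implementations may differ.
import numpy as np


def bellman_operator(Q, R, P, gamma):
    # TQ(s, a) = R(s, a) + gamma * sum_s' P(s' | s, a) * max_a' Q(s', a')
    return R + gamma * P @ Q.max(axis=1)


def value_iteration(R, P, gamma, epsilon):
    # iterate the Bellman operator until the sup-norm residual drops below epsilon
    Q = np.zeros(R.shape)
    n_it = 0
    while True:
        TQ = bellman_operator(Q, R, P, gamma)
        n_it += 1
        if np.abs(TQ - Q).max() < epsilon:
            break
        Q = TQ
    # returning TQ keeps the residual of the output below epsilon
    # (the Bellman operator is a gamma-contraction for gamma < 1)
    return TQ, TQ.max(axis=1), n_it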
def fit(self, budget=None, **kwargs):
    """Build empirical MDP and run value iteration."""
    del kwargs
    S = self.env.observation_space.n
    A = self.env.action_space.n
    self.N_sa = np.zeros((S, A))
    self.N_sas = np.zeros((S, A, S))
    self.S_sa = np.zeros((S, A))

    # collect data
    total_samples = S * A * self.n_samples
    count = 0
    logger.debug(
        f"[{self.name}] collecting {self.n_samples} samples per (s,a)"
        f", total = {total_samples} samples.")
    for ss in range(S):
        for aa in range(A):
            for _ in range(self.n_samples):
                next_state, reward, _, _ = self.env.sample(ss, aa)
                self._update(ss, aa, next_state, reward)
                count += 1
                if count % 10000 == 0:
                    completed = 100 * count / total_samples
                    logger.debug("[{}] ... {}/{} ({:0.0f}%)".format(
                        self.name, count, total_samples, completed))

    # build model and run VI
    logger.debug(
        f"{self.name} building model and running backward induction...")
    N_sa = np.maximum(self.N_sa, 1)
    self.R_hat = self.S_sa / N_sa
    self.P_hat = np.zeros((S, A, S))
    for ss in range(S):
        self.P_hat[:, :, ss] = self.N_sas[:, :, ss] / N_sa

    info = {}
    info["n_samples"] = self.n_samples
    info["total_samples"] = total_samples
    if self.horizon is None:
        assert self.gamma < 1.0, \
            "The discounted setting requires gamma < 1.0"
        self.Q, self.V, n_it = value_iteration(self.R_hat, self.P_hat,
                                               self.gamma, self.epsilon)
        info["n_iterations"] = n_it
        info["precision"] = self.epsilon
    else:
        self.Q, self.V = backward_induction(self.R_hat, self.P_hat,
                                            self.horizon, self.gamma)
        info["n_iterations"] = self.horizon
        info["precision"] = 0.0
    return info
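# The sampling loop above relies on a _update helper that is not shown in this
# section. A minimal sketch of what it is assumed to do, namely accumulate the
# sufficient statistics used later to build R_hat and P_hat (visit counts N_sa,
# transition counts N_sas, and summed rewards S_sa):
def _update(self, state, action, next_state, reward):
    # count the visit to (s, a), the observed transition (s, a, s'),
    # and accumulate the reward, so that afterwards
    #   R_hat = S_sa / N_sa   and   P_hat[:, :, s'] = N_sas[:, :, s'] / N_sa
    self.N_sa[state, action] += 1
    self.N_sas[state, action, next_state] += 1
    self.S_sa[state, action] += reward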
def fit(self, **kwargs):
    """Run value iteration."""
    info = {}
    if self.horizon is None:
        assert self.gamma < 1.0, \
            "The discounted setting requires gamma < 1.0"
        self.Q, self.V, n_it = value_iteration(self.env.R, self.env.P,
                                               self.gamma, self.epsilon)
        info["n_iterations"] = n_it
        info["precision"] = self.epsilon
    else:
        self.Q, self.V = backward_induction(self.env.R, self.env.P,
                                            self.horizon, self.gamma)
        info["n_iterations"] = self.horizon
        info["precision"] = 0.0
    return info
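# For the finite-horizon branch, backward_induction(R, P, horizon, gamma) is
# assumed to perform standard finite-horizon dynamic programming. A minimal
# sketch under the assumption that R has shape (S, A), P has shape (S, A, S),
# and Q / V are indexed by the stage h; the library's actual return convention
# may differ.
import numpy as np


def backward_induction(R, P, horizon, gamma=1.0):
    S, A = R.shape
    Q = np.zeros((horizon, S, A))
    V = np.zeros((horizon, S))
    # last stage: only the immediate reward remains
    Q[horizon - 1] = R
    V[horizon - 1] = R.max(axis=1)
    # propagate optimal values backwards in time
    for hh in range(horizon - 2, -1, -1):
        Q[hh] = R + gamma * P @ V[hh + 1]
        V[hh] = Q[hh].max(axis=1)
    return Q, V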