示例#1
0
def test_value_iteration(gamma, S, A):
    """Check that value iteration reaches the requested Bellman residual.

    For five tolerances spaced logarithmically from 1e-1 down to 1e-6, and
    five freshly generated MDPs per tolerance, run ``value_iteration`` and
    assert that one more application of ``bellman_operator`` to the returned
    Q changes no entry by more than the tolerance.

    Parameters
    ----------
    gamma
        Discount factor forwarded to the solver and to the Bellman operator.
    S, A
        Forwarded to ``get_random_mdp`` (presumably the numbers of states
        and actions -- confirm against that helper).
    """
    n_mdps_per_tolerance = 5
    tolerances = np.logspace(-1, -6, num=5)

    for tol in tolerances:
        for _ in range(n_mdps_per_tolerance):
            # A new random MDP for every trial.
            rewards, transitions = get_random_mdp(S, A)

            # Only Q is checked; the other two outputs are ignored.
            q_values, _values, _n_iter = value_iteration(
                rewards, transitions, gamma, tol)

            # Sup-norm distance between Q and its one-step Bellman backup.
            q_backup = bellman_operator(q_values, rewards, transitions, gamma)
            residual = np.abs(q_backup - q_values).max()
            assert residual <= tol
示例#2
0
    def fit(self, budget=None, **kwargs):
        """Sample every (state, action) pair, estimate the MDP and solve it.

        ``self.n_samples`` transitions are drawn from ``self.env.sample`` for
        each pair and passed to ``self._update`` (presumably the method that
        accumulates ``N_sa``, ``N_sas`` and ``S_sa`` -- confirm there). The
        estimates ``R_hat`` and ``P_hat`` are then solved by value iteration
        when ``self.horizon`` is None, or by backward induction otherwise.

        Parameters
        ----------
        budget
            Not used in this method body.
        **kwargs
            Accepted and discarded.

        Returns
        -------
        dict
            Keys ``"n_samples"``, ``"total_samples"``, ``"n_iterations"``
            and ``"precision"``.
        """
        del kwargs
        n_states = self.env.observation_space.n
        n_actions = self.env.action_space.n

        # Reset the statistics before collecting data.
        self.N_sa = np.zeros((n_states, n_actions))
        self.N_sas = np.zeros((n_states, n_actions, n_states))
        self.S_sa = np.zeros((n_states, n_actions))

        total_samples = n_states * n_actions * self.n_samples
        n_drawn = 0
        logger.debug(
            f"[{self.name}] collecting {self.n_samples} samples per (s,a)"
            f", total = {total_samples} samples.")

        for state in range(n_states):
            for action in range(n_actions):
                for _ in range(self.n_samples):
                    next_state, reward, _done, _extra = self.env.sample(
                        state, action)
                    self._update(state, action, next_state, reward)

                    n_drawn += 1
                    if n_drawn % 10000:
                        continue
                    # Progress report once every 10000 samples.
                    percent = 100 * n_drawn / total_samples
                    logger.debug("[{}] ... {}/{} ({:0.0f}%)".format(
                        self.name, n_drawn, total_samples, percent))

        logger.debug(
            f"{self.name} building model and running backward induction...")

        # Clamp the counts to at least 1 so unvisited pairs do not divide by zero.
        visits = np.maximum(self.N_sa, 1)
        self.R_hat = self.S_sa / visits
        # Broadcast the (S, A) counts over the next-state axis.
        self.P_hat = self.N_sas / visits[:, :, np.newaxis]

        info = {
            "n_samples": self.n_samples,
            "total_samples": total_samples,
        }
        if self.horizon is not None:
            # Finite horizon: exact backward induction over `horizon` steps.
            self.Q, self.V = backward_induction(self.R_hat, self.P_hat,
                                                self.horizon, self.gamma)
            info["n_iterations"] = self.horizon
            info["precision"] = 0.0
            return info

        assert self.gamma < 1.0, "The discounted setting requires gamma < 1.0"
        self.Q, self.V, n_it = value_iteration(self.R_hat, self.P_hat,
                                               self.gamma, self.epsilon)
        info["n_iterations"] = n_it
        info["precision"] = self.epsilon
        return info
示例#3
0
 def fit(self, **kwargs):
     """
     Solve the environment's MDP from its own ``R`` and ``P``.

     With ``self.horizon`` set, backward induction is run over that many
     steps; otherwise value iteration is run to tolerance ``self.epsilon``
     (asserting ``gamma < 1.0``). The results are stored in ``self.Q`` and
     ``self.V``. ``kwargs`` is not used.

     Returns a dict with keys ``"n_iterations"`` and ``"precision"``.
     """
     if self.horizon is not None:
         # Finite horizon: reported precision is 0.0.
         self.Q, self.V = backward_induction(self.env.R, self.env.P,
                                             self.horizon, self.gamma)
         return {"n_iterations": self.horizon, "precision": 0.0}

     assert self.gamma < 1.0, \
         "The discounted setting requires gamma < 1.0"
     self.Q, self.V, n_it = value_iteration(self.env.R, self.env.P,
                                            self.gamma, self.epsilon)
     return {"n_iterations": n_it, "precision": self.epsilon}