Example #1
import numpy as np

# Import path assumed from the rlberry project layout; the backward-induction
# routines live in rlberry.agents.dynprog.utils.
from rlberry.agents.dynprog.utils import (
    backward_induction,
    backward_induction_sd,
    backward_induction_reward_sd,
)

# get_random_mdp(S, A) is assumed to return a random reward array of shape
# (S, A) and a transition kernel of shape (S, A, S).


def test_backward_induction_sd(horizon, S, A):
    """
    Check that backward induction on a stage-dependent MDP matches the
    stationary case when every stage shares the same rewards and transitions.
    """
    for sim in range(5):
        # generate random MDP
        Rstat, Pstat = get_random_mdp(S, A)
        R = np.zeros((horizon, S, A))
        P = np.zeros((horizon, S, A, S))
        for ii in range(horizon):
            R[ii, :, :] = Rstat
            P[ii, :, :, :] = Pstat

        # run backward induction in stationary MDP
        Qstat, Vstat = backward_induction(Rstat, Pstat, horizon)

        # run backward induction in stage-dependent MDP
        Q = np.zeros((horizon, S, A))
        V = np.zeros((horizon, S))
        backward_induction_sd(Q, V, R, P)

        # run backward induction with stage-dependent rewards
        Q2 = np.zeros((horizon, S, A))
        V2 = np.zeros((horizon, S))
        backward_induction_reward_sd(Q2, V2, R, Pstat)

        assert np.array_equal(Q, Qstat)
        assert np.array_equal(V, Vstat)
        assert np.array_equal(Q2, Qstat)
        assert np.array_equal(V2, Vstat)
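For reference, here is a minimal NumPy sketch of the recursion these routines perform: a finite-horizon Bellman backup over stage-dependent rewards R[h, s, a] and transitions P[h, s, a, s'], with values truncated at a cap vmax (the agents below pass self.v_max[0] for that argument). This is an illustration only, not rlberry's actual implementation.

import numpy as np

def backward_induction_sd_sketch(Q, V, R, P, gamma=1.0, vmax=np.inf):
    """Illustrative backward induction; fills Q (H, S, A) and V (H, S) in place."""
    H, S, A = R.shape
    for hh in range(H - 1, -1, -1):
        for ss in range(S):
            for aa in range(A):
                q = R[hh, ss, aa]
                if hh < H - 1:
                    q += gamma * P[hh, ss, aa, :] @ V[hh + 1, :]
                Q[hh, ss, aa] = min(q, vmax)  # truncate at the value cap
            V[hh, ss] = Q[hh, ss, :].max()

Because the test copies the same stationary Rstat and Pstat into every stage, the stage-dependent and stationary runs must produce identical Q and V, which is exactly what the asserts check.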
Example #2
    def fit(self, budget: int, **kwargs):
        del kwargs
        n_episodes_to_run = budget
        count = 0
        while count < n_episodes_to_run:
            self._run_episode()
            count += 1

        # compute Q function for the recommended policy
        if self.stage_dependent:
            backward_induction_sd(
                self.Q_policy,
                self.V_policy,
                self.R_hat,
                self.P_hat,
                self.gamma,
                self.v_max[0],
            )
        else:
            backward_induction_in_place(
                self.Q_policy,
                self.V_policy,
                self.R_hat,
                self.P_hat,
                self.horizon,
                self.gamma,
                self.v_max[0],
            )
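In fit, Q_policy and V_policy hold the Q and value functions of the recommended greedy policy under the empirical model. A minimal sketch of reading an action from them; the helper name is hypothetical (rlberry agents typically expose this through their policy method):

import numpy as np

# Hypothetical helper: greedy action w.r.t. the recommended Q function.
# Q_policy has shape (horizon, S, A), as filled by the backward-induction calls above.
def recommended_action(Q_policy, state, hh=0):
    return int(np.argmax(Q_policy[hh, state, :]))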
Example #3
    def _run_episode(self):
        # interact for H steps
        episode_rewards = 0
        # std = scale / sqrt(n) + (H - h + 1) / n
        std_sa = self.std1_sa / np.sqrt(self.N_sa) + self.std2_sa / self.N_sa
        # zero-mean Gaussian noise, added to the empirical rewards R_hat below
        noise_sa = self.rng.normal(0.0, std_sa)
        # run backward induction on the noisy rewards
        if self.stage_dependent:
            backward_induction_sd(
                self.Q,
                self.V,
                self.R_hat + noise_sa,
                self.P_hat,
                self.gamma,
                self.v_max[0],
            )
        else:
            backward_induction_reward_sd(
                self.Q,
                self.V,
                self.R_hat + noise_sa,
                self.P_hat,
                self.gamma,
                self.v_max[0],
            )

        state = self.env.reset()
        for hh in range(self.horizon):
            action = self._get_action(state, hh)
            next_state, reward, done, _ = self.env.step(action)
            episode_rewards += reward  # used for logging only

            self.counter.update(state, action)

            if self.reward_free:
                reward = 0.0  # set to zero before update if reward_free

            self._update(state, action, next_state, reward, hh)

            state = next_state
            if done:
                break

        # update info
        self.episode += 1

        # writer
        if self.writer is not None:
            self.writer.add_scalar("episode_rewards", episode_rewards,
                                   self.episode)
            self.writer.add_scalar("n_visited_states",
                                   self.counter.get_n_visited_states(),
                                   self.episode)

        # return sum of rewards collected in the episode
        return episode_rewards
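In the non-stage-dependent branch above, the empirical model (R_hat, P_hat) is stationary, but the injected noise is stage-dependent, so R_hat + noise_sa has shape (H, S, A); that is why backward_induction_reward_sd (stage-dependent rewards, stationary transitions) is used there. A hypothetical initialization of the noise scales, consistent with the comment std = scale / sqrt(n) + (H - h + 1) / n; the agent's actual constructor may differ, and H, S, A, scale are placeholders.

import numpy as np

H, S, A = 10, 5, 2
scale = 0.1

hh = np.arange(H)                                        # stage index, 0-based
std1_sa = scale * np.ones((H, S, A))                     # "scale" term, divided by sqrt(n)
std2_sa = (H - hh)[:, None, None] * np.ones((H, S, A))   # "(H - h + 1)" term (1-based h), divided by n
N_sa = np.ones((S, A))                                   # stationary visit counts, kept >= 1

# same formula as in _run_episode; broadcasting yields shape (H, S, A)
std_sa = std1_sa / np.sqrt(N_sa) + std2_sa / N_sa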
Example #4
File: ucbvi.py Project: omardrwch/rlberry
    def _run_episode(self):
        # interact for H steps
        episode_rewards = 0
        state = self.env.reset()
        for hh in range(self.horizon):
            action = self._get_action(state, hh)
            next_state, reward, done, _ = self.env.step(action)
            episode_rewards += reward  # used for logging only

            self.counter.update(state, action)

            if self.reward_free:
                reward = 0.0  # set to zero before update if reward_free

            self._update(state, action, next_state, reward, hh)

            state = next_state
            if done:
                break

        # run backward induction
        if not self.real_time_dp:
            if self.stage_dependent:
                backward_induction_sd(
                    self.Q,
                    self.V,
                    self.R_hat + self.B_sa,
                    self.P_hat,
                    self.gamma,
                    self.v_max[0],
                )
            else:
                backward_induction_reward_sd(
                    self.Q,
                    self.V,
                    self.R_hat + self.B_sa,
                    self.P_hat,
                    self.gamma,
                    self.v_max[0],
                )

        # update info
        self.episode += 1

        # writer
        if self.writer is not None:
            self.writer.add_scalar("episode_rewards", episode_rewards,
                                   self.episode)
            self.writer.add_scalar("n_visited_states",
                                   self.counter.get_n_visited_states(),
                                   self.episode)

        # return sum of rewards collected in the episode
        return episode_rewards
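In the UCBVI snippet, B_sa is an exploration bonus added to the empirical rewards before backward induction, making the backed-up Q values optimistic and driving exploration toward under-visited state-action pairs. The exact bonus is configured in the agent; the sketch below is a generic Hoeffding-style form, not necessarily rlberry's formula, with delta a hypothetical confidence parameter and v_max assumed to be an array of per-stage value upper bounds.

import numpy as np

def hoeffding_bonus(N_sa, v_max, delta=0.1):
    """B[h, s, a] ~= v_max[h] * sqrt(log(1 / delta) / max(1, N(s, a)))."""
    n = np.maximum(1, N_sa)  # avoid division by zero before any visit
    return v_max[:, None, None] * np.sqrt(np.log(1.0 / delta) / n)

With this shape, R_hat + B_sa is stage-dependent even when the model is stationary, which matches the backward_induction_reward_sd branch above.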
Example #5
File: psrl.py Project: omardrwch/rlberry
    def _run_episode(self):
        # sample rewards (Beta posterior) and transitions (Dirichlet posterior,
        # obtained by normalizing independent Gamma draws)
        self.R_sample = self.rng.beta(self.M_sa[..., 0], self.M_sa[..., 1])
        self.P_sample = self.rng.gamma(self.N_sas)
        self.P_sample = self.P_sample / self.P_sample.sum(-1, keepdims=True)
        # run backward induction
        if self.stage_dependent:
            backward_induction_sd(self.Q, self.V, self.R_sample, self.P_sample,
                                  self.gamma, self.v_max[0])
        else:
            backward_induction_in_place(
                self.Q,
                self.V,
                self.R_sample,
                self.P_sample,
                self.horizon,
                self.gamma,
                self.v_max[0],
            )
        # interact for H steps
        episode_rewards = 0
        state = self.env.reset()
        for hh in range(self.horizon):
            action = self._get_action(state, hh)
            next_state, reward, done, _ = self.env.step(action)
            episode_rewards += reward  # used for logging only

            self.counter.update(state, action)

            if self.reward_free:
                reward = 0.0  # set to zero before update if reward_free

            self._update(state, action, next_state, reward, hh)

            state = next_state
            if done:
                break

        # update info
        self.episode += 1

        # writer
        if self.writer is not None:
            self.writer.add_scalar("episode_rewards", episode_rewards,
                                   self.episode)
            self.writer.add_scalar("n_visited_states",
                                   self.counter.get_n_visited_states(),
                                   self.episode)

        # return sum of rewards collected in the episode
        return episode_rewards
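The PSRL snippet draws a model from conjugate posteriors kept in M_sa (Beta parameters per state-action) and N_sas (Dirichlet counts per transition). A hedged sketch of counter updates consistent with that sampling code, assuming rewards in [0, 1]; the agent's actual _update may differ.

import numpy as np

# Hypothetical posterior update for Bernoulli-like rewards in [0, 1].
# M_sa[s, a] holds Beta parameters (alpha, beta); N_sas[s, a, s'] holds Dirichlet counts.
def update_posterior(M_sa, N_sas, state, action, next_state, reward):
    M_sa[state, action, 0] += reward          # pseudo-count of "success" mass
    M_sa[state, action, 1] += 1.0 - reward    # pseudo-count of "failure" mass
    N_sas[state, action, next_state] += 1.0   # Dirichlet count for the observed transition

With M_sa initialized to ones (uniform Beta prior) and N_sas to a small positive constant (Dirichlet prior), the Beta and normalized-Gamma draws in _run_episode correspond to posterior samples under this conjugate model.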
Example #6
    def partial_fit(self, fraction, **kwargs):
        assert 0.0 < fraction <= 1.0
        n_episodes_to_run = int(np.ceil(fraction * self.n_episodes))
        count = 0
        while count < n_episodes_to_run and self.episode < self.n_episodes:
            self._run_episode()
            count += 1

        # compute Q function for the recommended policy
        if self.stage_dependent:
            backward_induction_sd(self.Q_policy, self.V_policy, self.R_hat,
                                  self.P_hat, self.gamma, self.v_max[0])
        else:
            backward_induction_in_place(self.Q_policy, self.V_policy,
                                        self.R_hat, self.P_hat, self.horizon,
                                        self.gamma, self.v_max[0])

        info = {
            "n_episodes": self.episode,
            "episode_rewards": self._rewards[:self.episode]
        }
        return info
Example #7
File: ucbvi.py Project: omardrwch/rlberry
    def fit(self, budget: int, **kwargs):
        """

        Train the agent using the provided environment.

        Parameters
        ----------
        budget: int
            number of episodes
        **kwargs
            Extra arguments. Not used for this agent.
        """
        del kwargs
        n_episodes_to_run = budget
        count = 0
        while count < n_episodes_to_run:
            self._run_episode()
            count += 1

        # compute Q function for the recommended policy
        if self.stage_dependent:
            backward_induction_sd(
                self.Q_policy,
                self.V_policy,
                self.R_hat,
                self.P_hat,
                self.gamma,
                self.v_max[0],
            )
        else:
            backward_induction_in_place(
                self.Q_policy,
                self.V_policy,
                self.R_hat,
                self.P_hat,
                self.horizon,
                self.gamma,
                self.v_max[0],
            )
Example #8
File: psrl.py Project: omardrwch/rlberry
    def fit(self, budget: int, **kwargs):
        del kwargs
        n_episodes_to_run = budget
        count = 0
        while count < n_episodes_to_run:
            self._run_episode()
            count += 1

        # compute Q function for the recommended policy
        # use the posterior means: Beta mean for rewards, Dirichlet mean for transitions
        R_hat = self.M_sa[..., 0] / (self.M_sa[..., 0] + self.M_sa[..., 1])
        P_hat = self.N_sas / self.N_sas.sum(-1, keepdims=True)
        if self.stage_dependent:
            backward_induction_sd(self.Q_policy, self.V_policy, R_hat, P_hat,
                                  self.gamma, self.v_max[0])
        else:
            backward_induction_in_place(
                self.Q_policy,
                self.V_policy,
                R_hat,
                P_hat,
                self.horizon,
                self.gamma,
                self.v_max[0],
            )