Example #1
    def fit(self, budget: int, **kwargs):
        del kwargs
        n_episodes_to_run = budget
        count = 0
        while count < n_episodes_to_run:
            self._run_episode()
            count += 1

        # compute Q function for the recommended policy
        if self.stage_dependent:
            backward_induction_sd(
                self.Q_policy,
                self.V_policy,
                self.R_hat,
                self.P_hat,
                self.gamma,
                self.v_max[0],
            )
        else:
            backward_induction_in_place(
                self.Q_policy,
                self.V_policy,
                self.R_hat,
                self.P_hat,
                self.horizon,
                self.gamma,
                self.v_max[0],
            )
Example #2
    def _run_episode(self):
        # interact for H steps
        episode_rewards = 0
        state = self.env.reset()
        for hh in range(self.horizon):
            action = self._get_action(state, hh)
            next_state, reward, done, _ = self.env.step(action)
            self._update(state, action, next_state, reward)
            state = next_state
            episode_rewards += reward

            if done:
                break

        # run backward induction
        backward_induction_in_place(
            self.Q[:, :self.M, :],
            self.V[:, :self.M],
            self.R_hat[:self.M, :] + self.B_sa[:self.M, :],
            self.P_hat[:self.M, :, :self.M],
            self.horizon,
            self.gamma,
            self.v_max,
        )

        self.episode += 1
        # writer
        if self.writer is not None:
            self.writer.add_scalar("episode_rewards", episode_rewards,
                                   self.episode)
            self.writer.add_scalar("representative states", self.M,
                                   self.episode)

        # return sum of rewards collected in the episode
        return episode_rewards
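Note that the call above passes slices such as self.Q[:, :self.M, :] so that backward induction runs only over the first M "representative" states while still writing its results back into the full arrays. This relies on NumPy basic slicing returning a view rather than a copy. A minimal sketch of that behaviour (array names here are illustrative, not part of rlberry):

import numpy as np

horizon, S, A, M = 3, 5, 2, 2
Q = np.zeros((horizon, S, A))

# Basic slicing (slices only, no fancy indexing) returns a view of Q.
Q_view = Q[:, :M, :]

# In-place writes through the view update the original array ...
Q_view += 1.0
assert Q[:, :M, :].sum() == horizon * M * A

# ... while the non-representative entries remain untouched.
assert Q[:, M:, :].sum() == 0.0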
Example #3
File: psrl.py  Project: omardrwch/rlberry
    def _run_episode(self):
        # sample reward and transitions from posterior
        self.R_sample = self.rng.beta(self.M_sa[..., 0], self.M_sa[..., 1])
        self.P_sample = self.rng.gamma(self.N_sas)
        self.P_sample = self.P_sample / self.P_sample.sum(-1, keepdims=True)
        # run backward induction
        if self.stage_dependent:
            backward_induction_sd(self.Q, self.V, self.R_sample, self.P_sample,
                                  self.gamma, self.v_max[0])
        else:
            backward_induction_in_place(
                self.Q,
                self.V,
                self.R_sample,
                self.P_sample,
                self.horizon,
                self.gamma,
                self.v_max[0],
            )
        # interact for H steps
        episode_rewards = 0
        state = self.env.reset()
        for hh in range(self.horizon):
            action = self._get_action(state, hh)
            next_state, reward, done, _ = self.env.step(action)
            episode_rewards += reward  # used for logging only

            self.counter.update(state, action)

            if self.reward_free:
                reward = 0.0  # set to zero before update if reward_free

            self._update(state, action, next_state, reward, hh)

            state = next_state
            if done:
                break

        # update info
        self.episode += 1

        # writer
        if self.writer is not None:
            self.writer.add_scalar("episode_rewards", episode_rewards,
                                   self.episode)
            self.writer.add_scalar("n_visited_states",
                                   self.counter.get_n_visited_states(),
                                   self.episode)

        # return sum of rewards collected in the episode
        return episode_rewards
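The posterior sampling step at the top of _run_episode draws the reward table from a Beta posterior and each transition row from a Dirichlet posterior, the latter via the standard trick of normalizing independent Gamma draws. A self-contained sketch of that sampling scheme (the count arrays and RNG below are stand-ins for the agent's attributes, not rlberry code):

import numpy as np

rng = np.random.default_rng(0)
S, A = 4, 2

# Beta pseudo-counts for Bernoulli rewards: one (alpha, beta) pair per (s, a).
M_sa = np.ones((S, A, 2))
# Dirichlet pseudo-counts for transitions: one vector over next states per (s, a).
N_sas = np.ones((S, A, S))

# Sample a reward table from the Beta posterior.
R_sample = rng.beta(M_sa[..., 0], M_sa[..., 1])

# Independent Gamma draws, normalized over next states, are a Dirichlet sample.
P_sample = rng.gamma(N_sas)
P_sample = P_sample / P_sample.sum(-1, keepdims=True)

assert R_sample.shape == (S, A)
assert np.allclose(P_sample.sum(-1), 1.0)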
Example #4
    def _run_episode(self):
        # interact for H steps
        episode_rewards = 0
        state = self.env.reset()
        for hh in range(self.horizon):
            action = self._get_action(state, hh)
            next_state, reward, done, _ = self.env.step(action)
            episode_rewards += reward  # used for logging only

            self.counter.update(state, action)

            if self.reward_free:
                reward = 0.0  # set to zero before update if reward_free

            self._update(state, action, next_state, reward, hh)

            state = next_state
            if done:
                break

        # run backward induction
        if not self.real_time_dp:
            if self.stage_dependent:
                backward_induction_sd(self.Q, self.V, self.R_hat + self.B_sa,
                                      self.P_hat, self.gamma, self.v_max[0])
            else:
                backward_induction_in_place(self.Q, self.V,
                                            self.R_hat + self.B_sa, self.P_hat,
                                            self.horizon, self.gamma,
                                            self.v_max[0])

        # update info
        ep = self.episode
        self._rewards[ep] = episode_rewards
        self.episode += 1

        # writer
        if self.writer is not None:
            self.writer.add_scalar("ep reward", episode_rewards, self.episode)
            self.writer.add_scalar("n_visited_states",
                                   self.counter.get_n_visited_states(),
                                   self.episode)

        # return sum of rewards collected in the episode
        return episode_rewards
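Here the rewards passed to backward induction are the empirical estimates plus an exploration bonus, self.R_hat + self.B_sa, which is what makes the resulting Q optimistic. The bonus itself is computed elsewhere; a common Hoeffding-style choice, shown purely for illustration and not necessarily the formula this agent uses, scales with v_max / sqrt(N(s, a)):

import numpy as np

def hoeffding_bonus(N_sa, v_max, bonus_scale=1.0):
    # Illustrative optimism bonus: larger for rarely visited (s, a) pairs.
    return bonus_scale * v_max * np.sqrt(1.0 / np.maximum(1, N_sa))

# Example: three states, two actions, with very different visit counts.
N_sa = np.array([[100, 1], [5, 1], [1, 1]])
print(hoeffding_bonus(N_sa, v_max=10.0))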
Example #5
    def _run_episode(self):
        # interact for H steps
        episode_rewards = 0
        state = self.env.reset()
        for hh in range(self.horizon):
            action = self._get_action(state, hh)
            next_state, reward, done, _ = self.env.step(action)
            episode_rewards += reward  # used for logging only

            if self.reward_free:
                reward = 0.0  # set to zero before update if reward_free

            self._update(state, action, next_state, reward)

            state = next_state
            if done:
                break

        # run backward induction
        backward_induction_in_place(
            self.Q[:, :self.M, :], self.V[:, :self.M],
            self.R_hat[:self.M, :] + self.B_sa[:self.M, :],
            self.P_hat[:self.M, :, :self.M], self.horizon, self.gamma,
            self.v_max)

        ep = self.episode
        self._rewards[ep] = episode_rewards
        self._cumul_rewards[ep] = episode_rewards \
            + self._cumul_rewards[max(0, ep - 1)]

        self.episode += 1
        # writer
        if self.writer is not None:
            avg_reward = self._cumul_rewards[ep] / max(1, ep)

            self.writer.add_scalar("ep reward", episode_rewards, self.episode)
            self.writer.add_scalar("avg reward", avg_reward, self.episode)
            self.writer.add_scalar("representative states", self.M,
                                   self.episode)

        # return sum of rewards collected in the episode
        return episode_rewards
Example #6
def test_backward_induction(horizon, S, A):
    for sim in range(5):
        # generate random MDP
        R, P = get_random_mdp(S, A)

        # run backward induction
        Q, V = backward_induction(R, P, horizon)

        assert Q.max() <= horizon
        assert V.max() <= horizon

        # run backward induction, clipping V at 1.0
        Q, V = backward_induction(R, P, horizon, vmax=1.0)
        assert V.max() <= 1.0

        # run backward induction in place
        Q2 = np.zeros((horizon, S, A))
        V2 = np.zeros((horizon, S))
        backward_induction_in_place(Q2, V2, R, P, horizon, vmax=1.0)
        assert np.array_equal(Q, Q2)
        assert np.array_equal(V, V2)
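The test pins down the expected interface: Q has shape (horizon, S, A), V has shape (horizon, S), and vmax caps the value function. A rough pure-NumPy reference for finite-horizon backward induction that is consistent with these checks (a sketch only, not rlberry's implementation):

import numpy as np

def backward_induction_ref(R, P, horizon, gamma=1.0, vmax=np.inf):
    # R: rewards of shape (S, A); P: transitions of shape (S, A, S).
    # Returns Q of shape (horizon, S, A) and V of shape (horizon, S),
    # with V clipped at vmax at every stage.
    S, A = R.shape
    Q = np.zeros((horizon, S, A))
    V = np.zeros((horizon, S))
    for hh in range(horizon - 1, -1, -1):
        if hh == horizon - 1:
            Q[hh] = R                            # last stage: immediate reward only
        else:
            Q[hh] = R + gamma * (P @ V[hh + 1])  # Bellman backup
        V[hh] = np.minimum(Q[hh].max(axis=-1), vmax)
    return Q, V

# With rewards in [0, 1] and gamma = 1, Q and V are bounded by the horizon.
rng = np.random.default_rng(0)
S, A, horizon = 4, 2, 5
R = rng.uniform(size=(S, A))
P = rng.uniform(size=(S, A, S))
P = P / P.sum(-1, keepdims=True)
Q, V = backward_induction_ref(R, P, horizon)
assert Q.max() <= horizon and V.max() <= horizon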
Example #7
    def partial_fit(self, fraction, **kwargs):
        assert 0.0 < fraction <= 1.0
        n_episodes_to_run = int(np.ceil(fraction * self.n_episodes))
        count = 0
        while count < n_episodes_to_run and self.episode < self.n_episodes:
            self._run_episode()
            count += 1

        # compute Q function for the recommended policy
        if self.stage_dependent:
            backward_induction_sd(self.Q_policy, self.V_policy, self.R_hat,
                                  self.P_hat, self.gamma, self.v_max[0])
        else:
            backward_induction_in_place(self.Q_policy, self.V_policy,
                                        self.R_hat, self.P_hat, self.horizon,
                                        self.gamma, self.v_max[0])

        info = {
            "n_episodes": self.episode,
            "episode_rewards": self._rewards[:self.episode]
        }
        return info
Example #8
    def fit(self, budget: int, **kwargs):
        """

        Train the agent using the provided environment.

        Parameters
        ----------
        budget: int
            number of episodes
        **kwargs
            Extra arguments. Not used for this agent.
        """
        del kwargs
        n_episodes_to_run = budget
        count = 0
        while count < n_episodes_to_run:
            self._run_episode()
            count += 1

        # compute Q function for the recommended policy
        if self.stage_dependent:
            backward_induction_sd(
                self.Q_policy,
                self.V_policy,
                self.R_hat,
                self.P_hat,
                self.gamma,
                self.v_max[0],
            )
        else:
            backward_induction_in_place(
                self.Q_policy,
                self.V_policy,
                self.R_hat,
                self.P_hat,
                self.horizon,
                self.gamma,
                self.v_max[0],
            )
Example #9
File: psrl.py  Project: omardrwch/rlberry
    def fit(self, budget: int, **kwargs):
        del kwargs
        n_episodes_to_run = budget
        count = 0
        while count < n_episodes_to_run:
            self._run_episode()
            count += 1

        # compute Q function for the recommended policy
        R_hat = self.M_sa[..., 0] / (self.M_sa[..., 0] + self.M_sa[..., 1])
        P_hat = self.N_sas / self.N_sas.sum(-1, keepdims=True)
        if self.stage_dependent:
            backward_induction_sd(self.Q_policy, self.V_policy, R_hat, P_hat,
                                  self.gamma, self.v_max[0])
        else:
            backward_induction_in_place(
                self.Q_policy,
                self.V_policy,
                R_hat,
                P_hat,
                self.horizon,
                self.gamma,
                self.v_max[0],
            )
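Unlike _run_episode, which samples from the posterior, fit builds point estimates for the recommended policy: R_hat is the Beta posterior mean of each reward and P_hat is the normalized transition count (the Dirichlet posterior mean). A short sanity-check sketch of those two estimators (the count arrays below are stand-ins, not rlberry code):

import numpy as np

S, A = 3, 2
M_sa = np.ones((S, A, 2))      # Beta pseudo-counts: (successes, failures) per (s, a)
N_sas = np.ones((S, A, S))     # transition counts (including the prior)

# Mean of Beta(a, b) is a / (a + b); mean of Dirichlet(n) is n / n.sum().
R_hat = M_sa[..., 0] / (M_sa[..., 0] + M_sa[..., 1])
P_hat = N_sas / N_sas.sum(-1, keepdims=True)

assert np.allclose(R_hat, 0.5)           # uniform prior -> mean reward 0.5
assert np.allclose(P_hat.sum(-1), 1.0)   # each row is a probability distribution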