def test_backward_induction_sd(horizon, S, A):
    """
    Test stage-dependent MDPs.
    """
    for sim in range(5):
        # generate random MDP
        Rstat, Pstat = get_random_mdp(S, A)

        R = np.zeros((horizon, S, A))
        P = np.zeros((horizon, S, A, S))
        for ii in range(horizon):
            R[ii, :, :] = Rstat
            P[ii, :, :, :] = Pstat

        # run backward induction in stationary MDP
        Qstat, Vstat = backward_induction(Rstat, Pstat, horizon)

        # run backward induction in stage-dependent MDP
        Q = np.zeros((horizon, S, A))
        V = np.zeros((horizon, S))
        backward_induction_sd(Q, V, R, P)

        # run backward induction with stage-dependent rewards
        Q2 = np.zeros((horizon, S, A))
        V2 = np.zeros((horizon, S))
        backward_induction_reward_sd(Q2, V2, R, Pstat)

        assert np.array_equal(Q, Qstat)
        assert np.array_equal(V, Vstat)
        assert np.array_equal(Q2, Qstat)
        assert np.array_equal(V2, Vstat)
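
# For reference, a minimal NumPy sketch of what a stationary backward induction such as
# backward_induction(Rstat, Pstat, horizon) is expected to compute. This is NOT the library
# implementation; it assumes R of shape (S, A), P of shape (S, A, S), and gamma = 1 by default.
import numpy as np


def backward_induction_sketch(R, P, horizon, gamma=1.0):
    S, A = R.shape
    Q = np.zeros((horizon, S, A))
    V = np.zeros((horizon + 1, S))  # V[horizon] = 0 acts as the terminal value
    for hh in range(horizon - 1, -1, -1):
        # Bellman backup: Q[h](s, a) = R(s, a) + gamma * sum_s' P(s, a, s') * V[h+1](s')
        Q[hh] = R + gamma * P @ V[hh + 1]
        V[hh] = Q[hh].max(axis=1)
    return Q, V[:horizon]
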
def fit(self, budget: int, **kwargs):
    del kwargs
    n_episodes_to_run = budget
    count = 0
    while count < n_episodes_to_run:
        self._run_episode()
        count += 1

    # compute Q function for the recommended policy
    if self.stage_dependent:
        backward_induction_sd(
            self.Q_policy,
            self.V_policy,
            self.R_hat,
            self.P_hat,
            self.gamma,
            self.v_max[0],
        )
    else:
        backward_induction_in_place(
            self.Q_policy,
            self.V_policy,
            self.R_hat,
            self.P_hat,
            self.horizon,
            self.gamma,
            self.v_max[0],
        )
def _run_episode(self):
    # interact for H steps
    episode_rewards = 0

    # noise stds: scale / sqrt(n) + (H - h + 1) / n
    std_sa = self.std1_sa / np.sqrt(self.N_sa) + self.std2_sa / self.N_sa
    # zero-mean perturbation, so that R_hat + noise_sa has mean R_hat
    noise_sa = self.rng.normal(0.0, std_sa)

    # run backward induction on the perturbed rewards
    if self.stage_dependent:
        backward_induction_sd(
            self.Q,
            self.V,
            self.R_hat + noise_sa,
            self.P_hat,
            self.gamma,
            self.v_max[0],
        )
    else:
        backward_induction_reward_sd(
            self.Q,
            self.V,
            self.R_hat + noise_sa,
            self.P_hat,
            self.gamma,
            self.v_max[0],
        )

    state = self.env.reset()
    for hh in range(self.horizon):
        action = self._get_action(state, hh)
        next_state, reward, done, _ = self.env.step(action)
        episode_rewards += reward  # used for logging only

        self.counter.update(state, action)

        if self.reward_free:
            reward = 0.0  # set to zero before update if reward_free

        self._update(state, action, next_state, reward, hh)

        state = next_state
        if done:
            break

    # update info
    self.episode += 1

    # writer
    if self.writer is not None:
        self.writer.add_scalar("episode_rewards", episode_rewards, self.episode)
        self.writer.add_scalar(
            "n_visited_states", self.counter.get_n_visited_states(), self.episode
        )

    # return sum of rewards collected in the episode
    return episode_rewards
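
# Illustration only (standalone, hypothetical values, not agent attributes): the RLSVI-style
# perturbation used above shrinks as the visit count n grows, since its std is
# std1 / sqrt(n) + std2 / n.
import numpy as np

rng = np.random.default_rng(0)
std1, std2 = 1.0, 5.0
for n in [1, 10, 100, 1000]:
    std = std1 / np.sqrt(n) + std2 / n
    noise = rng.normal(0.0, std, size=5)
    print(f"n = {n:4d}, std = {std:.3f}, sample noise = {np.round(noise, 3)}")
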
def _run_episode(self):
    # interact for H steps
    episode_rewards = 0

    state = self.env.reset()
    for hh in range(self.horizon):
        action = self._get_action(state, hh)
        next_state, reward, done, _ = self.env.step(action)
        episode_rewards += reward  # used for logging only

        self.counter.update(state, action)

        if self.reward_free:
            reward = 0.0  # set to zero before update if reward_free

        self._update(state, action, next_state, reward, hh)

        state = next_state
        if done:
            break

    # run backward induction
    if not self.real_time_dp:
        if self.stage_dependent:
            backward_induction_sd(
                self.Q,
                self.V,
                self.R_hat + self.B_sa,
                self.P_hat,
                self.gamma,
                self.v_max[0],
            )
        else:
            backward_induction_reward_sd(
                self.Q,
                self.V,
                self.R_hat + self.B_sa,
                self.P_hat,
                self.gamma,
                self.v_max[0],
            )

    # update info
    self.episode += 1

    # writer
    if self.writer is not None:
        self.writer.add_scalar("episode_rewards", episode_rewards, self.episode)
        self.writer.add_scalar(
            "n_visited_states", self.counter.get_n_visited_states(), self.episode
        )

    # return sum of rewards collected in the episode
    return episode_rewards
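
# The exploration bonus B_sa added to R_hat above is not defined in this snippet. A common
# (Hoeffding-style) choice, shown here only as a hedged sketch with hypothetical names and
# parameters, scales like v_max * sqrt(log(1/delta) / n) and is clipped to v_max.
import numpy as np


def hoeffding_bonus_sketch(N_sa, v_max, delta=0.1):
    """Hypothetical per-(state, action) bonus; not the agent's actual formula."""
    n = np.maximum(N_sa, 1)  # avoid division by zero for unvisited pairs
    bonus = v_max * np.sqrt(np.log(1.0 / delta) / n)
    return np.minimum(bonus, v_max)
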
def _run_episode(self):
    # sample rewards and transitions from the posterior
    self.R_sample = self.rng.beta(self.M_sa[..., 0], self.M_sa[..., 1])
    # Dirichlet sample obtained by normalizing independent Gamma draws
    self.P_sample = self.rng.gamma(self.N_sas)
    self.P_sample = self.P_sample / self.P_sample.sum(-1, keepdims=True)

    # run backward induction
    if self.stage_dependent:
        backward_induction_sd(
            self.Q,
            self.V,
            self.R_sample,
            self.P_sample,
            self.gamma,
            self.v_max[0],
        )
    else:
        backward_induction_in_place(
            self.Q,
            self.V,
            self.R_sample,
            self.P_sample,
            self.horizon,
            self.gamma,
            self.v_max[0],
        )

    # interact for H steps
    episode_rewards = 0
    state = self.env.reset()
    for hh in range(self.horizon):
        action = self._get_action(state, hh)
        next_state, reward, done, _ = self.env.step(action)
        episode_rewards += reward  # used for logging only

        self.counter.update(state, action)

        if self.reward_free:
            reward = 0.0  # set to zero before update if reward_free

        self._update(state, action, next_state, reward, hh)

        state = next_state
        if done:
            break

    # update info
    self.episode += 1

    # writer
    if self.writer is not None:
        self.writer.add_scalar("episode_rewards", episode_rewards, self.episode)
        self.writer.add_scalar(
            "n_visited_states", self.counter.get_n_visited_states(), self.episode
        )

    # return sum of rewards collected in the episode
    return episode_rewards
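
# Why the Gamma draw above yields a valid transition sample: normalizing independent
# Gamma(alpha_i, 1) variables gives a Dirichlet(alpha) sample. Standalone check with
# illustrative values (alpha plays the role of the posterior counts N_sas for one (s, a)):
import numpy as np

rng = np.random.default_rng(0)
alpha = np.array([1.0, 2.0, 3.0])
g = rng.gamma(alpha)      # independent Gamma(alpha_i, 1) draws
p = g / g.sum()           # a Dirichlet(alpha) sample: p >= 0 and p.sum() == 1
print(p, p.sum())
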
def partial_fit(self, fraction, **kwargs):
    assert 0.0 < fraction <= 1.0
    n_episodes_to_run = int(np.ceil(fraction * self.n_episodes))
    count = 0
    while count < n_episodes_to_run and self.episode < self.n_episodes:
        self._run_episode()
        count += 1

    # compute Q function for the recommended policy
    if self.stage_dependent:
        backward_induction_sd(
            self.Q_policy,
            self.V_policy,
            self.R_hat,
            self.P_hat,
            self.gamma,
            self.v_max[0],
        )
    else:
        backward_induction_in_place(
            self.Q_policy,
            self.V_policy,
            self.R_hat,
            self.P_hat,
            self.horizon,
            self.gamma,
            self.v_max[0],
        )

    info = {
        "n_episodes": self.episode,
        "episode_rewards": self._rewards[: self.episode],
    }
    return info
def fit(self, budget: int, **kwargs):
    """
    Train the agent using the provided environment.

    Parameters
    ----------
    budget: int
        Number of episodes to run.
    **kwargs
        Extra arguments. Not used for this agent.
    """
    del kwargs
    n_episodes_to_run = budget
    count = 0
    while count < n_episodes_to_run:
        self._run_episode()
        count += 1

    # compute Q function for the recommended policy
    if self.stage_dependent:
        backward_induction_sd(
            self.Q_policy,
            self.V_policy,
            self.R_hat,
            self.P_hat,
            self.gamma,
            self.v_max[0],
        )
    else:
        backward_induction_in_place(
            self.Q_policy,
            self.V_policy,
            self.R_hat,
            self.P_hat,
            self.horizon,
            self.gamma,
            self.v_max[0],
        )
def fit(self, budget: int, **kwargs):
    del kwargs
    n_episodes_to_run = budget
    count = 0
    while count < n_episodes_to_run:
        self._run_episode()
        count += 1

    # compute Q function for the recommended policy, using the posterior means
    # of the rewards (Beta) and transitions (Dirichlet)
    R_hat = self.M_sa[..., 0] / (self.M_sa[..., 0] + self.M_sa[..., 1])
    P_hat = self.N_sas / self.N_sas.sum(-1, keepdims=True)

    if self.stage_dependent:
        backward_induction_sd(
            self.Q_policy,
            self.V_policy,
            R_hat,
            P_hat,
            self.gamma,
            self.v_max[0],
        )
    else:
        backward_induction_in_place(
            self.Q_policy,
            self.V_policy,
            R_hat,
            P_hat,
            self.horizon,
            self.gamma,
            self.v_max[0],
        )
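
# The point estimates above are posterior means: for a Beta(a, b) reward posterior the mean is
# a / (a + b), and for a Dirichlet(counts) transition posterior the mean is counts / counts.sum().
# Standalone check with illustrative counts only:
import numpy as np

M = np.array([3.0, 1.0])        # (success counts + prior, failure counts + prior)
N = np.array([5.0, 2.0, 1.0])   # next-state visit counts + prior for one (s, a)
print(M[0] / M.sum())           # Beta posterior mean reward
print(N / N.sum())              # Dirichlet posterior mean transition row
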