def fit(self, budget: int, **kwargs):
    del kwargs
    n_episodes_to_run = budget
    count = 0
    while count < n_episodes_to_run:
        self._run_episode()
        count += 1

    # compute Q function for the recommended policy
    if self.stage_dependent:
        backward_induction_sd(
            self.Q_policy,
            self.V_policy,
            self.R_hat,
            self.P_hat,
            self.gamma,
            self.v_max[0],
        )
    else:
        backward_induction_in_place(
            self.Q_policy,
            self.V_policy,
            self.R_hat,
            self.P_hat,
            self.horizon,
            self.gamma,
            self.v_max[0],
        )
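# Sketch of the finite-horizon dynamic programming recursion that the
# backward_induction_in_place / backward_induction_sd calls in this file are
# assumed to perform (gamma-discounted Bellman backups with the greedy value
# clipped at vmax). This is an illustration under those assumptions, not the
# library's implementation.
import numpy as np

def backward_induction_sketch(R, P, horizon, gamma=1.0, vmax=np.inf):
    # R: (S, A) rewards, P: (S, A, S) transition probabilities
    S, A = R.shape
    Q = np.zeros((horizon, S, A))
    V = np.zeros((horizon, S))
    for hh in range(horizon - 1, -1, -1):
        if hh == horizon - 1:
            Q[hh] = R                                    # no future value at the last stage
        else:
            Q[hh] = R + gamma * P @ V[hh + 1]            # Bellman backup
        V[hh] = np.minimum(Q[hh].max(axis=-1), vmax)     # greedy value, clipped at vmax
    return Q, V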
def _run_episode(self):
    # interact for H steps
    episode_rewards = 0
    state = self.env.reset()
    for hh in range(self.horizon):
        action = self._get_action(state, hh)
        next_state, reward, done, _ = self.env.step(action)
        self._update(state, action, next_state, reward)

        state = next_state
        episode_rewards += reward

        if done:
            break

    # run backward induction restricted to the M representative states
    backward_induction_in_place(
        self.Q[:, :self.M, :],
        self.V[:, :self.M],
        self.R_hat[:self.M, :] + self.B_sa[:self.M, :],
        self.P_hat[:self.M, :, :self.M],
        self.horizon,
        self.gamma,
        self.v_max,
    )

    self.episode += 1

    # writer
    if self.writer is not None:
        self.writer.add_scalar("episode_rewards", episode_rewards, self.episode)
        self.writer.add_scalar("representative states", self.M, self.episode)

    # return sum of rewards collected in the episode
    return episode_rewards
def _run_episode(self):
    # sample reward and transitions from posterior
    self.R_sample = self.rng.beta(self.M_sa[..., 0], self.M_sa[..., 1])
    self.P_sample = self.rng.gamma(self.N_sas)
    self.P_sample = self.P_sample / self.P_sample.sum(-1, keepdims=True)

    # run backward induction
    if self.stage_dependent:
        backward_induction_sd(
            self.Q,
            self.V,
            self.R_sample,
            self.P_sample,
            self.gamma,
            self.v_max[0],
        )
    else:
        backward_induction_in_place(
            self.Q,
            self.V,
            self.R_sample,
            self.P_sample,
            self.horizon,
            self.gamma,
            self.v_max[0],
        )

    # interact for H steps
    episode_rewards = 0
    state = self.env.reset()
    for hh in range(self.horizon):
        action = self._get_action(state, hh)
        next_state, reward, done, _ = self.env.step(action)
        episode_rewards += reward  # used for logging only
        self.counter.update(state, action)
        if self.reward_free:
            reward = 0.0  # set to zero before update if reward_free
        self._update(state, action, next_state, reward, hh)
        state = next_state
        if done:
            break

    # update info
    self.episode += 1

    # writer
    if self.writer is not None:
        self.writer.add_scalar("episode_rewards", episode_rewards, self.episode)
        self.writer.add_scalar(
            "n_visited_states", self.counter.get_n_visited_states(), self.episode
        )

    # return sum of rewards collected in the episode
    return episode_rewards
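# Note on the posterior sampling above: normalizing independent Gamma(alpha_i, 1)
# draws along the last axis is equivalent to drawing a Dirichlet(alpha) sample,
# which is why each row of P_sample is a valid transition distribution.
# Minimal standalone check under that assumption (names below are illustrative only):
import numpy as np

_rng = np.random.default_rng(0)
_alpha = np.array([1.0, 2.0, 3.0])   # pseudo-counts for one (state, action) pair
_g = _rng.gamma(_alpha)              # one Gamma(alpha_i, scale=1) draw per entry
_p = _g / _g.sum()                   # distributed as Dirichlet(_alpha)
assert np.isclose(_p.sum(), 1.0)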
def _run_episode(self):
    # interact for H steps
    episode_rewards = 0
    state = self.env.reset()
    for hh in range(self.horizon):
        action = self._get_action(state, hh)
        next_state, reward, done, _ = self.env.step(action)
        episode_rewards += reward  # used for logging only
        self.counter.update(state, action)
        if self.reward_free:
            reward = 0.0  # set to zero before update if reward_free
        self._update(state, action, next_state, reward, hh)
        state = next_state
        if done:
            break

    # run backward induction
    if not self.real_time_dp:
        if self.stage_dependent:
            backward_induction_sd(
                self.Q,
                self.V,
                self.R_hat + self.B_sa,
                self.P_hat,
                self.gamma,
                self.v_max[0],
            )
        else:
            backward_induction_in_place(
                self.Q,
                self.V,
                self.R_hat + self.B_sa,
                self.P_hat,
                self.horizon,
                self.gamma,
                self.v_max[0],
            )

    # update info
    ep = self.episode
    self._rewards[ep] = episode_rewards
    self.episode += 1

    # writer
    if self.writer is not None:
        self.writer.add_scalar("ep reward", episode_rewards, self.episode)
        self.writer.add_scalar(
            "n_visited_states", self.counter.get_n_visited_states(), self.episode
        )

    # return sum of rewards collected in the episode
    return episode_rewards
def _run_episode(self):
    # interact for H steps
    episode_rewards = 0
    state = self.env.reset()
    for hh in range(self.horizon):
        action = self._get_action(state, hh)
        next_state, reward, done, _ = self.env.step(action)
        episode_rewards += reward  # used for logging only
        if self.reward_free:
            reward = 0.0  # set to zero before update if reward_free
        self._update(state, action, next_state, reward)
        state = next_state
        if done:
            break

    # run backward induction restricted to the M representative states
    backward_induction_in_place(
        self.Q[:, :self.M, :],
        self.V[:, :self.M],
        self.R_hat[:self.M, :] + self.B_sa[:self.M, :],
        self.P_hat[:self.M, :, :self.M],
        self.horizon,
        self.gamma,
        self.v_max,
    )

    # update info
    ep = self.episode
    self._rewards[ep] = episode_rewards
    self._cumul_rewards[ep] = episode_rewards + self._cumul_rewards[max(0, ep - 1)]
    self.episode += 1

    # writer
    if self.writer is not None:
        avg_reward = self._cumul_rewards[ep] / max(1, ep)
        self.writer.add_scalar("ep reward", episode_rewards, self.episode)
        self.writer.add_scalar("avg reward", avg_reward, self.episode)
        self.writer.add_scalar("representative states", self.M, self.episode)

    # return sum of rewards collected in the episode
    return episode_rewards
def test_backward_induction(horizon, S, A):
    for sim in range(5):
        # generate random MDP
        R, P = get_random_mdp(S, A)

        # run backward induction
        Q, V = backward_induction(R, P, horizon)
        assert Q.max() <= horizon
        assert V.max() <= horizon

        # run backward induction with V clipped to 1.0
        Q, V = backward_induction(R, P, horizon, vmax=1.0)
        assert V.max() <= 1.0

        # run backward induction in place
        Q2 = np.zeros((horizon, S, A))
        V2 = np.zeros((horizon, S))
        backward_induction_in_place(Q2, V2, R, P, horizon, vmax=1.0)
        assert np.array_equal(Q, Q2)
        assert np.array_equal(V, V2)
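# The helper get_random_mdp used by the test above is assumed to return a reward
# array R of shape (S, A) with entries in [0, 1] and a transition array P of
# shape (S, A, S) whose last axis sums to one. A minimal sketch under that
# assumption (not necessarily the library's actual implementation):
import numpy as np

def get_random_mdp_sketch(S, A, seed=None):
    rng = np.random.default_rng(seed)
    R = rng.uniform(0.0, 1.0, size=(S, A))       # bounded rewards in [0, 1]
    P = rng.uniform(0.0, 1.0, size=(S, A, S))
    P = P / P.sum(axis=-1, keepdims=True)        # normalize transition rows
    return R, P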
def partial_fit(self, fraction, **kwargs):
    assert 0.0 < fraction <= 1.0
    n_episodes_to_run = int(np.ceil(fraction * self.n_episodes))
    count = 0
    while count < n_episodes_to_run and self.episode < self.n_episodes:
        self._run_episode()
        count += 1

    # compute Q function for the recommended policy
    if self.stage_dependent:
        backward_induction_sd(
            self.Q_policy,
            self.V_policy,
            self.R_hat,
            self.P_hat,
            self.gamma,
            self.v_max[0],
        )
    else:
        backward_induction_in_place(
            self.Q_policy,
            self.V_policy,
            self.R_hat,
            self.P_hat,
            self.horizon,
            self.gamma,
            self.v_max[0],
        )

    info = {
        "n_episodes": self.episode,
        "episode_rewards": self._rewards[:self.episode],
    }
    return info
def fit(self, budget: int, **kwargs): """ Train the agent using the provided environment. Parameters ---------- budget: int number of episodes **kwargs Extra arguments. Not used for this agent. """ del kwargs n_episodes_to_run = budget count = 0 while count < n_episodes_to_run: self._run_episode() count += 1 # compute Q function for the recommended policy if self.stage_dependent: backward_induction_sd( self.Q_policy, self.V_policy, self.R_hat, self.P_hat, self.gamma, self.v_max[0], ) else: backward_induction_in_place( self.Q_policy, self.V_policy, self.R_hat, self.P_hat, self.horizon, self.gamma, self.v_max[0], )
def fit(self, budget: int, **kwargs):
    del kwargs
    n_episodes_to_run = budget
    count = 0
    while count < n_episodes_to_run:
        self._run_episode()
        count += 1

    # compute Q function for the recommended policy
    R_hat = self.M_sa[..., 0] / (self.M_sa[..., 0] + self.M_sa[..., 1])
    P_hat = self.N_sas / self.N_sas.sum(-1, keepdims=True)

    if self.stage_dependent:
        backward_induction_sd(
            self.Q_policy,
            self.V_policy,
            R_hat,
            P_hat,
            self.gamma,
            self.v_max[0],
        )
    else:
        backward_induction_in_place(
            self.Q_policy,
            self.V_policy,
            R_hat,
            P_hat,
            self.horizon,
            self.gamma,
            self.v_max[0],
        )
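# Note on the point estimates above: R_hat is the Beta posterior mean
# alpha / (alpha + beta), and each row of P_hat is the Dirichlet posterior mean
# (pseudo-counts divided by their sum). Tiny standalone check (illustrative
# values only):
import numpy as np

_alpha, _beta = 3.0, 1.0
assert np.isclose(_alpha / (_alpha + _beta), 0.75)          # Beta(3, 1) mean
_counts = np.array([2.0, 6.0])
assert np.allclose(_counts / _counts.sum(), [0.25, 0.75])   # Dirichlet posterior mean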