def reset(self):
    """Reset the agent to a freshly-initialized state.

    Clears base-class state, empties the replay buffer, zeroes the
    target-update counter, then rebuilds both Q networks with the
    target synchronized to the online network's weights.
    """
    Agent.reset(self)
    # Independent bookkeeping: wipe stored transitions and the
    # counter tracking updates since the last target sync.
    self.buffer.reset()
    self.updates_since_target_updated = 0
    # Fresh online and target networks; the target starts as an
    # exact copy of the online network.
    self.Q = self.model_lambda()
    self.target_Q = self.model_lambda()
    self.target_Q.set_weights(self.Q.get_weights())
def train(nb_steps: int, env: Env, agent: Agent, start_obs: Arrayable):
    """Trains for one epoch.

    :args nb_steps: number of interaction steps
    :args env: environment
    :args agent: interacting agent
    :args start_obs: starting observation

    :return: final observation
    """
    agent.train()
    agent.reset()
    # Roll the environment forward nb_steps times, threading the
    # observation through successive interactions.
    obs = start_obs
    remaining = nb_steps
    while remaining > 0:
        obs, _, _ = interact(env, agent, obs)
        remaining -= 1
    return obs
def evaluate(
        dt: float,
        epoch: int,
        env: Env,
        agent: Agent,
        eval_gap: float,  # noqa: C901
        time_limit: Optional[float] = None,
        eval_return: bool = False,
        progress_bar: bool = False,
        video: bool = False,
        no_log: bool = False,
        test: bool = False,
        eval_policy: bool = True) -> Optional[float]:
    """Evaluate agent in environment.

    :args dt: time discretization
    :args epoch: index of the current epoch
    :args env: environment
    :args agent: interacting agent
    :args eval_gap: number of normalized epochs (epochs divided by dt) between
        training steps
    :args time_limit: maximal physical time (number of steps divided by dt)
        spent in the environment
    :args eval_return: do we only perform specific evaluation?
    :args progress_bar: use a progress bar?
    :args video: log a video of the interaction?
    :args no_log: do we log results
    :args test: log to a different test summary
    :args eval_policy: if the exploitation policy is noisy, remove the noise
        before evaluating

    :return: return evaluated, None if no return is evaluated
    """
    # Number of epochs between specific_evaluation logging points.
    log_gap = int(eval_gap / dt)
    agent.eval()
    # Keep exploration noise on during evaluation when explicitly requested.
    if not eval_policy and isinstance(agent, OnlineAgent):
        agent.noisy_eval()
    agent.reset()
    R = None
    if eval_return:
        rewards, dones = [], []
        imgs = []
        # Default to 10 units of physical time when no limit is given.
        # NOTE(review): a caller-passed time_limit of 0 is also replaced
        # by 10 here (falsy check), presumably intentional.
        time_limit = time_limit if time_limit else 10
        nb_steps = int(time_limit / dt)
        info(f"eval> evaluating on a physical time {time_limit}"
             f" ({nb_steps} steps in total)")
        obs = env.reset()
        iter_range = tqdm(range(nb_steps)) if progress_bar else \
            range(nb_steps)
        for _ in iter_range:
            # Step the environment, recording per-step rewards and
            # termination flags for the return computation below.
            obs, reward, done = interact(env, agent, obs)
            rewards.append(reward)
            dones.append(done)
            if video:
                imgs.append(env.render(mode='rgb_array'))
        R = compute_return(np.stack(rewards, axis=0),
                           np.stack(dones, axis=0))
        tag = "noisy" if not eval_policy else ""
        info(f"eval> At epoch {epoch}, {tag} return: {R}")
        if not no_log:
            if not eval_policy:
                log("Return_noisy", R, epoch)
            elif not video:
                # don't log scalar return when outputting video
                if not test:
                    log("Return", R, epoch)
                else:
                    log("Return_test", R, epoch)
            if video:
                log_video("demo",
                          epoch, np.stack(imgs, axis=0))
    # Environment-specific diagnostics run regardless of eval_return.
    if not no_log:
        specific_evaluation(epoch, log_gap, dt, env, agent)
    return R