def evaluate_policy(self, policy: AbstractGaussianPolicy, render: bool = False, deterministic: bool = True):
    """
    Evaluate a given policy.

    Args:
        policy: policy to evaluate
        render: render policy behavior
        deterministic: choose deterministic actions (distribution mean) instead of sampling

    Returns:
        Dict with performance metrics.
    """
    if self.n_test_envs == 0:
        return

    n_runs = 1
    ep_rewards = np.zeros((n_runs, self.n_test_envs))
    ep_lengths = np.zeros((n_runs, self.n_test_envs))

    for i in range(n_runs):
        not_dones = np.ones((self.n_test_envs,), dtype=bool)
        obs = self.envs.reset_test()
        while np.any(not_dones):
            ep_lengths[i, not_dones] += 1
            if render:
                self.envs.render_test(mode="human")
            with ch.no_grad():
                p = policy(tensorize(obs, self.cpu, self.dtype))
                actions = p[0] if deterministic else policy.sample(p)
                actions = policy.squash(actions)
            obs, rews, dones, infos = self.envs.step_test(get_numpy(actions))
            ep_rewards[i, not_dones] += rews[not_dones]
            # An env stays marked as done once it has terminated; otherwise envs that
            # terminate (and reset) earlier would keep accumulating reward.
            not_dones = np.logical_and(~dones, not_dones)

    return self.get_reward_dict(ep_rewards, ep_lengths)
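# Illustration of the deterministic/stochastic branch above for a diagonal Gaussian
# policy: the deterministic action is the distribution mean, the stochastic one a
# sample from it. This is a minimal standalone sketch using plain torch (`ch` is the
# torch alias used in this file) and is not this repository's policy implementation.
def _select_action_sketch(mean: ch.Tensor, std: ch.Tensor, deterministic: bool) -> ch.Tensor:
    # mean/std: distribution parameters as returned by the policy, shape (n_envs, act_dim)
    raw_action = mean if deterministic else mean + std * ch.randn_like(mean)
    # tanh squashing keeps the action inside [-1, 1], analogous in spirit to policy.squash(...)
    return ch.tanh(raw_action)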
def run(self, rollout_steps, policy: AbstractGaussianPolicy, vf_model: Union[VFNet, None] = None,
        reset_envs: bool = False) -> TrajectoryOnPolicyRaw:
    """
    Generate trajectories of the environment.

    Args:
        rollout_steps: Number of steps to generate
        policy: Policy model to generate samples for
        vf_model: vf model to generate value estimates for all states.
        reset_envs: Whether to reset all envs in the beginning.

    Returns:
        Trajectory with the respective data as torch tensors.
    """
    # Initialize the buffers that will hold the minibatch (mb) of experience
    num_envs = self.n_envs
    base_shape = (rollout_steps, num_envs)
    base_shape_p1 = (rollout_steps + 1, num_envs)
    base_action_shape = base_shape + self.envs.action_space.shape

    mb_obs = ch.zeros(base_shape_p1 + self.envs.observation_space.shape, dtype=self.dtype)
    mb_actions = ch.zeros(base_action_shape, dtype=self.dtype)
    mb_rewards = ch.zeros(base_shape, dtype=self.dtype)
    mb_dones = ch.zeros(base_shape, dtype=ch.bool)

    ep_infos = []

    mb_time_limit_dones = ch.zeros(base_shape, dtype=ch.bool)
    mb_means = ch.zeros(base_action_shape, dtype=self.dtype)
    mb_stds = ch.zeros(base_action_shape + self.envs.action_space.shape, dtype=self.dtype)

    # Continue from the last state: before the first step we already have self.obs,
    # because the env wrapper calls self.obs = env.reset() on init.
    obs = self.envs.reset() if reset_envs else self.envs.last_obs
    obs = tensorize(obs, self.cpu, self.dtype)

    for i in range(rollout_steps):
        # Given the observations, query the policy for its distribution parameters and sample actions
        pds = policy(obs, train=False)
        actions = policy.sample(pds)
        squashed_actions = policy.squash(actions)

        mb_obs[i] = obs
        mb_actions[i] = squashed_actions

        obs, rewards, dones, infos = self.envs.step(squashed_actions.cpu().numpy())
        obs = tensorize(obs, self.cpu, self.dtype)

        mb_means[i] = pds[0]
        mb_stds[i] = pds[1]

        mb_time_limit_dones[i] = tensorize(infos["horizon"], self.cpu, ch.bool)

        if infos.get("done"):
            ep_infos.extend(infos.get("done"))

        mb_rewards[i] = tensorize(rewards, self.cpu, self.dtype)
        mb_dones[i] = tensorize(dones, self.cpu, ch.bool)

    # Need the value prediction for the last obs in the rollout to bootstrap the loss estimate
    mb_obs[-1] = obs

    # Compute all logpacs and value estimates at once --> less computation
    mb_logpacs = policy.log_probability((mb_means, mb_stds), mb_actions)
    mb_values = (vf_model if vf_model else policy.get_value)(mb_obs, train=False)

    out = (mb_obs[:-1], mb_actions, mb_logpacs, mb_rewards, mb_values, mb_dones,
           mb_time_limit_dones, mb_means, mb_stds)

    if not self.cpu:
        out = tuple(map(to_gpu, out))

    if ep_infos:
        ep_infos = np.array(ep_infos)
        ep_length, ep_reward = ep_infos[:, 0], ep_infos[:, 1]
        self.total_rewards.extend(ep_reward)
        self.total_steps.extend(ep_length)

    return TrajectoryOnPolicyRaw(*out)
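# The extra observation stored at mb_obs[-1] exists so that mb_values contains
# rollout_steps + 1 entries and the final transition can be bootstrapped. The function
# below is a minimal standalone sketch of how the returned trajectory could be turned
# into GAE advantages; it assumes the field shapes produced by run() above and is not
# necessarily this repository's actual advantage computation (`ch` is the torch alias
# used in this file). The mb_time_limit_dones flag would typically be used on top of
# this to keep bootstrapping through time-limit truncations; that correction is
# omitted here for brevity.
def _gae_sketch(rewards: ch.Tensor, values: ch.Tensor, dones: ch.Tensor,
                gamma: float = 0.99, lam: float = 0.95):
    # rewards, dones: (rollout_steps, n_envs); values: (rollout_steps + 1, n_envs)
    advantages = ch.zeros_like(rewards)
    last_gae = ch.zeros_like(rewards[0])
    for t in reversed(range(rewards.shape[0])):
        non_terminal = (~dones[t]).to(rewards.dtype)  # 0 at true episode ends, 1 otherwise
        delta = rewards[t] + gamma * values[t + 1] * non_terminal - values[t]
        last_gae = delta + gamma * lam * non_terminal * last_gae
        advantages[t] = last_gae
    returns = advantages + values[:-1]
    return advantages, returns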