def _compute_loss(self, obs, actions, rewards, valids, baselines):
    r"""Compute mean value of loss.

    Notes: P is the maximum episode length (self.max_episode_length)

    Args:
        obs (torch.Tensor): Observation from the environment
            with shape :math:`(N, P, O*)`.
        actions (torch.Tensor): Actions fed to the environment
            with shape :math:`(N, P, A*)`.
        rewards (torch.Tensor): Acquired rewards
            with shape :math:`(N, P)`.
        valids (list[int]): Numbers of valid steps in each episode.
        baselines (torch.Tensor): Value function estimation at each step
            with shape :math:`(N, P)`.

    Returns:
        torch.Tensor: Calculated negative mean scalar value of objective
            (float).

    """
    obs_flat = torch.cat(filter_valids(obs, valids))
    actions_flat = torch.cat(filter_valids(actions, valids))
    rewards_flat = torch.cat(filter_valids(rewards, valids))
    advantages_flat = self._compute_advantage(rewards, valids, baselines)

    return self._compute_loss_with_adv(obs_flat, actions_flat, rewards_flat,
                                        advantages_flat)
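
# The flattening above relies on garage's `filter_valids` helper. The sketch
# below is a hypothetical standalone re-implementation, shown only to
# illustrate how padded (N, P, ...) tensors are reduced to their valid steps
# before the loss is computed; it is not part of the class above.
import torch


def filter_valids_sketch(tensor, valids):
    """Keep only the first `valid` (non-padded) steps of each episode."""
    return [tensor[i, :valid] for i, valid in enumerate(valids)]


# Example: two episodes padded to length P=4, with 3 and 2 valid steps.
padded_rewards = torch.tensor([[1., 1., 1., 0.],
                               [2., 2., 0., 0.]])
flat = torch.cat(filter_valids_sketch(padded_rewards, [3, 2]))
assert flat.shape == (5,)  # 3 + 2 valid steps
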
def _compute_log_probs(self, obs, actions, rewards, valids, baselines):
    r"""Compute log likelihoods.

    Notes: P is the maximum episode length (self.max_episode_length)

    Args:
        obs (torch.Tensor): Observation from the environment
            with shape :math:`(N, P, O*)`.
        actions (torch.Tensor): Actions fed to the environment
            with shape :math:`(N, P, A*)`.
        rewards (torch.Tensor): Acquired rewards
            with shape :math:`(N, P)`.
        valids (list[int]): Numbers of valid steps in each episode.
        baselines (torch.Tensor): Value function estimation at each step
            with shape :math:`(N, P)`.

    Returns:
        torch.Tensor: Mean log likelihood of the valid actions under the
            current policy (float).

    """
    # Drop padding so only valid (in-episode) steps are scored.
    obs_flat = torch.cat(filter_valids(obs, valids))
    actions_flat = torch.cat(filter_valids(actions, valids))

    log_likelihoods = self.policy(obs_flat)[0].log_prob(actions_flat)

    return log_likelihoods.mean()
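
# Hedged sketch of the log-likelihood term above, assuming the policy's
# forward pass returns a torch.distributions.Distribution as its first
# element (as `self.policy(obs_flat)[0]` suggests). The tensors and the
# diagonal-Gaussian stand-in below are illustrative assumptions, not the
# actual policy class.
import torch
from torch.distributions import Independent, Normal

obs_flat = torch.randn(5, 3)      # 5 flattened steps, 3-dim observations
actions_flat = torch.randn(5, 2)  # 2-dim actions

# Stand-in for the policy's action distribution at each observation.
dist = Independent(Normal(torch.zeros(5, 2), torch.ones(5, 2)), 1)

log_likelihoods = dist.log_prob(actions_flat)  # one log-likelihood per step
print(log_likelihoods.mean())
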
def _compute_advantage(self, rewards, valids, baselines):
    r"""Compute flattened advantage values from rewards and baselines.

    Notes: P is the maximum episode length (self.max_episode_length)

    Args:
        rewards (torch.Tensor): Acquired rewards
            with shape :math:`(N, P)`.
        valids (list[int]): Numbers of valid steps in each episode.
        baselines (torch.Tensor): Value function estimation at each step
            with shape :math:`(N, P)`.

    Returns:
        torch.Tensor: Calculated advantage values given rewards and
            baselines with shape :math:`(N \cdot [T], )`.

    """
    advantages = compute_advantages(self.discount, self._gae_lambda,
                                    self.max_episode_length, baselines,
                                    rewards)
    # Drop padded steps before normalizing.
    advantage_flat = torch.cat(filter_valids(advantages, valids))

    if self._center_adv:
        # Center and rescale the advantages to stabilize policy updates.
        means = advantage_flat.mean()
        variance = advantage_flat.var()
        advantage_flat = (advantage_flat - means) / (variance + 1e-8)

    if self._positive_adv:
        # Shift advantages so the smallest value is zero.
        advantage_flat -= advantage_flat.min()

    return advantage_flat
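
# Reference sketch of generalized advantage estimation (GAE), which
# `compute_advantages` above is assumed to implement given the
# `self._gae_lambda` argument: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
# and A_t = sum_k (gamma * lambda)^k * delta_{t+k}. Single-episode,
# unbatched version for illustration only.
import torch


def gae_sketch(rewards, baselines, discount, gae_lambda):
    """Compute GAE advantages for one episode (1-D tensors of equal length)."""
    # Bootstrap with V(s_T) = 0 after the final step.
    next_baselines = torch.cat([baselines[1:], baselines.new_zeros(1)])
    deltas = rewards + discount * next_baselines - baselines
    advantages = torch.zeros_like(rewards)
    running = torch.zeros(())
    for t in reversed(range(len(rewards))):
        running = deltas[t] + discount * gae_lambda * running
        advantages[t] = running
    return advantages
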
def train_once(self, itr, paths):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Calculated mean value of undiscounted returns.

    """
    obs, actions, rewards, returns, valids, baselines = \
        self.process_samples(paths)

    if self._maximum_entropy:
        policy_entropies = self._compute_policy_entropy(obs)
        rewards += self._policy_ent_coeff * policy_entropies

    obs_flat = torch.cat(filter_valids(obs, valids))
    actions_flat = torch.cat(filter_valids(actions, valids))
    rewards_flat = torch.cat(filter_valids(rewards, valids))
    returns_flat = torch.cat(filter_valids(returns, valids))
    advs_flat = self._compute_advantage(rewards, valids, baselines)

    with torch.no_grad():
        policy_loss_before = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_before = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_before = self._compute_kl_constraint(obs)

    self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                advs_flat)

    with torch.no_grad():
        policy_loss_after = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_after = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_after = self._compute_kl_constraint(obs)
        policy_entropy = self._compute_policy_entropy(obs)

    with tabular.prefix(self.policy.name):
        tabular.record('/LossBefore', policy_loss_before.item())
        tabular.record('/LossAfter', policy_loss_after.item())
        tabular.record('/dLoss',
                       (policy_loss_before - policy_loss_after).item())
        tabular.record('/KLBefore', kl_before.item())
        tabular.record('/KL', kl_after.item())
        tabular.record('/Entropy', policy_entropy.mean().item())

    with tabular.prefix(self._value_function.name):
        tabular.record('/LossBefore', vf_loss_before.item())
        tabular.record('/LossAfter', vf_loss_after.item())
        tabular.record('/dLoss',
                       vf_loss_before.item() - vf_loss_after.item())

    self._old_policy.load_state_dict(self.policy.state_dict())

    undiscounted_returns = log_performance(itr,
                                           EpisodeBatch.from_list(
                                               self._env_spec, paths),
                                           discount=self.discount)
    return np.mean(undiscounted_returns)
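
# Minimal numeric sketch of the maximum-entropy reward augmentation used in
# `train_once` above: each step's reward is increased by the scaled entropy
# of the policy at that step, which encourages exploration. The coefficient
# and tensors below are illustrative values only.
import torch

policy_ent_coeff = 0.01
rewards = torch.tensor([[1.0, 0.5, 0.0]])            # shape (N=1, P=3)
policy_entropies = torch.tensor([[1.4, 1.2, 1.1]])   # same shape as rewards

rewards = rewards + policy_ent_coeff * policy_entropies
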
def _train_once(self, itr, eps):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        eps (EpisodeBatch): A batch of collected paths.

    Returns:
        numpy.float64: Calculated mean value of undiscounted returns.

    """
    obs = torch.Tensor(eps.padded_observations)
    rewards = torch.Tensor(eps.padded_rewards)
    returns = torch.Tensor(
        np.stack([
            discount_cumsum(reward, self.discount)
            for reward in eps.padded_rewards
        ]))
    valids = eps.lengths
    with torch.no_grad():
        baselines = self._value_function(obs)

    if self._maximum_entropy:
        policy_entropies = self._compute_policy_entropy(obs)
        rewards += self._policy_ent_coeff * policy_entropies

    obs_flat = torch.Tensor(eps.observations)
    actions_flat = torch.Tensor(eps.actions)
    rewards_flat = torch.Tensor(eps.rewards)
    returns_flat = torch.cat(filter_valids(returns, valids))
    advs_flat = self._compute_advantage(rewards, valids, baselines)

    with torch.no_grad():
        policy_loss_before = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_before = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_before = self._compute_kl_constraint(obs)

    self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                advs_flat)

    with torch.no_grad():
        policy_loss_after = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_after = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_after = self._compute_kl_constraint(obs)
        policy_entropy = self._compute_policy_entropy(obs)

    with tabular.prefix(self.policy.name):
        tabular.record('/LossBefore', policy_loss_before.item())
        tabular.record('/LossAfter', policy_loss_after.item())
        tabular.record('/dLoss',
                       (policy_loss_before - policy_loss_after).item())
        tabular.record('/KLBefore', kl_before.item())
        tabular.record('/KL', kl_after.item())
        tabular.record('/Entropy', policy_entropy.mean().item())

    with tabular.prefix(self._value_function.name):
        tabular.record('/LossBefore', vf_loss_before.item())
        tabular.record('/LossAfter', vf_loss_after.item())
        tabular.record('/dLoss',
                       vf_loss_before.item() - vf_loss_after.item())

    self._old_policy.load_state_dict(self.policy.state_dict())

    undiscounted_returns = log_performance(itr,
                                           eps,
                                           discount=self._discount)
    return np.mean(undiscounted_returns)
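
# Hypothetical driver loop showing how `_train_once` is typically invoked
# from a garage-style Trainer. The `step_epochs`, `obtain_episodes`, and
# `step_itr` names follow garage's Trainer conventions but are assumptions
# here, as is the `self._n_samples` attribute.
def train(self, trainer):
    """Sample a batch of episodes per iteration and train on it."""
    last_return = None
    for _ in trainer.step_epochs():
        for _ in range(self._n_samples):
            eps = trainer.obtain_episodes(trainer.step_itr)
            last_return = self._train_once(trainer.step_itr, eps)
            trainer.step_itr += 1
    return last_return
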