def sample_actions(
    self,
    vec_obs: List[torch.Tensor],
    vis_obs: List[torch.Tensor],
    masks: Optional[torch.Tensor] = None,
    memories: Optional[torch.Tensor] = None,
    seq_len: int = 1,
    all_log_probs: bool = False,
) -> Tuple[
    torch.Tensor, torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:
    """
    :param all_log_probs: Returns (for discrete actions) a tensor of log probs, one for each action.
    """
    dists, value_heads, memories = self.actor_critic.get_dist_and_value(
        vec_obs, vis_obs, masks, memories, seq_len
    )
    action_list = self.actor_critic.sample_action(dists)
    log_probs, entropies, all_logs = ModelUtils.get_probs_and_entropy(
        action_list, dists
    )
    actions = torch.stack(action_list, dim=-1)
    if self.use_continuous_act:
        actions = actions[:, :, 0]
    else:
        actions = actions[:, 0, :]
    return (
        actions,
        all_logs if all_log_probs else log_probs,
        entropies,
        value_heads,
        memories,
    )
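# A minimal standalone sketch (not part of the policy class) of what the
# stack-and-index step above does. For continuous actions each sampled tensor
# has shape (batch, act_size), so stacking on dim=-1 and taking [:, :, 0]
# recovers (batch, act_size); for discrete actions each branch sample has
# shape (batch, 1), so [:, 0, :] yields (batch, num_branches). The tensor
# values here are dummies chosen only to show the shapes.
import torch

continuous_samples = [torch.zeros((4, 2))]  # one Gaussian dist, act_size=2
stacked = torch.stack(continuous_samples, dim=-1)  # (4, 2, 1)
assert stacked[:, :, 0].shape == (4, 2)

discrete_samples = [torch.zeros((4, 1)), torch.zeros((4, 1))]  # two branches
stacked = torch.stack(discrete_samples, dim=-1)  # (4, 1, 2)
assert stacked[:, 0, :].shape == (4, 2)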
def test_get_probs_and_entropy():
    # Test continuous
    # Add two dists to the list. This isn't done in the code but we'd like to support it.
    dist_list = [
        GaussianDistInstance(torch.zeros((1, 2)), torch.ones((1, 2))),
        GaussianDistInstance(torch.zeros((1, 2)), torch.ones((1, 2))),
    ]
    action_list = [torch.zeros((1, 2)), torch.zeros((1, 2))]
    log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
        action_list, dist_list
    )
    assert log_probs.shape == (1, 2, 2)
    assert entropies.shape == (1, 2, 2)
    assert all_probs is None

    for log_prob in log_probs.flatten():
        # Log prob of standard normal at 0
        assert log_prob == pytest.approx(-0.919, abs=0.01)

    for ent in entropies.flatten():
        # Entropy of a standard normal distribution
        assert ent == pytest.approx(1.42, abs=0.01)

    # Test discrete
    # Add two dists to the list.
    act_size = 2
    test_prob = torch.tensor(
        [[1.0 - 0.1 * (act_size - 1)] + [0.1] * (act_size - 1)]
    )  # High prob for first action
    dist_list = [
        CategoricalDistInstance(test_prob),
        CategoricalDistInstance(test_prob),
    ]
    action_list = [torch.tensor([0]), torch.tensor([1])]
    log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
        action_list, dist_list
    )
    assert all_probs.shape == (1, len(dist_list) * act_size)
    assert entropies.shape == (1, len(dist_list))
    # Make sure the first action has higher log probability than the second.
    assert log_probs.flatten()[0] > log_probs.flatten()[1]
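# Where the magic numbers in the test above come from: for a standard normal,
# log p(0) = -0.5 * ln(2*pi) ≈ -0.919, and the differential entropy is
# 0.5 * ln(2*pi*e) ≈ 1.42 regardless of the sampled point. A quick check:
import math

assert math.isclose(-0.5 * math.log(2 * math.pi), -0.919, abs_tol=0.01)
assert math.isclose(0.5 * math.log(2 * math.pi * math.e), 1.42, abs_tol=0.01)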
def sample_actions(
    self,
    vec_obs: List[torch.Tensor],
    vis_obs: List[torch.Tensor],
    masks: Optional[torch.Tensor] = None,
    memories: Optional[torch.Tensor] = None,
    seq_len: int = 1,
    all_log_probs: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    :param vec_obs: List of vector observations.
    :param vis_obs: List of visual observations.
    :param masks: Loss masks for RNN, else None.
    :param memories: Input memories when using RNN, else None.
    :param seq_len: Sequence length when using RNN.
    :param all_log_probs: Returns (for discrete actions) a tensor of log probs, one for each action.
    :return: Tuple of actions, actions clipped to [-1, 1], log probabilities
        (dependent on all_log_probs), entropies, and output memories, all as
        torch Tensors.
    """
    if memories is None:
        dists, memories = self.actor_critic.get_dists(
            vec_obs, vis_obs, masks, memories, seq_len
        )
    else:
        # If we're using LSTM, we need to execute the value heads as well
        # to get the critic's memories.
        dists, _, memories = self.actor_critic.get_dist_and_value(
            vec_obs, vis_obs, masks, memories, seq_len
        )
    action_list = self.actor_critic.sample_action(dists)
    log_probs, entropies, all_logs = ModelUtils.get_probs_and_entropy(
        action_list, dists
    )
    actions = torch.stack(action_list, dim=-1)
    if self.use_continuous_act:
        actions = actions[:, :, 0]
    else:
        actions = actions[:, 0, :]
    # Use the sum of entropy across actions, not the mean
    entropy_sum = torch.sum(entropies, dim=1)
    if self._clip_action and self.use_continuous_act:
        clipped_action = torch.clamp(actions, -3, 3) / 3
    else:
        clipped_action = actions
    return (
        actions,
        clipped_action,
        all_logs if all_log_probs else log_probs,
        entropy_sum,
        memories,
    )
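# A standalone sketch of the action-clipping step above: continuous actions
# are clamped to [-3, 3] (roughly three standard deviations of the Gaussian
# policy) and then rescaled into [-1, 1] for the environment. The input
# values below are illustrative, not outputs of a trained policy.
import torch

raw_actions = torch.tensor([[-4.0, 0.3, 3.5]])
clipped = torch.clamp(raw_actions, -3, 3) / 3
print(clipped)  # tensor([[-1.0000,  0.1000,  1.0000]])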
def evaluate_actions(
    self,
    vec_obs: torch.Tensor,
    vis_obs: torch.Tensor,
    actions: torch.Tensor,
    masks: Optional[torch.Tensor] = None,
    memories: Optional[torch.Tensor] = None,
    seq_len: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
    dists, value_heads, _ = self.actor_critic.get_dist_and_value(
        vec_obs, vis_obs, masks, memories, seq_len
    )
    # Split the stacked action tensor back into a per-distribution list so
    # log probs can be evaluated against each distribution.
    action_list = [actions[..., i] for i in range(actions.shape[-1])]
    log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(
        action_list, dists
    )
    return log_probs, entropies, value_heads
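# A hedged sketch of how the log probs returned by evaluate_actions are
# typically consumed in a PPO-style update: the ratio of new to old log probs
# forms the clipped surrogate objective. The tensors below are dummy values,
# and this is the generic PPO formulation, not necessarily the exact loss
# computation used elsewhere in this codebase.
import torch

old_log_probs = torch.tensor([-0.9, -1.2, -0.7])  # stored at collection time
new_log_probs = torch.tensor([-0.8, -1.5, -0.7])  # from evaluate_actions
advantages = torch.tensor([1.0, -0.5, 0.2])
epsilon = 0.2  # PPO clipping parameter

ratio = torch.exp(new_log_probs - old_log_probs)
surrogate = torch.min(
    ratio * advantages,
    torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages,
)
policy_loss = -surrogate.mean()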