def forward_tail(self, core_output, with_action_distribution=False):
    """Map the recurrent core's output to actions, values and termination signals.

    Stores ``termination_prob`` and ``termination_mask`` on ``self`` as a side
    effect (callers elsewhere appear to read them), and also returns both in
    the result dict.

    :param core_output: core (e.g. RNN) output features for the current batch
    :param with_action_distribution: if True, attach the full distribution
        object to the result (useful for e.g. entropy calculation)
    :return: AttrDict with actions, action_logits, log_prob_actions, values,
        termination_prob and termination_mask
    """
    self.termination_prob = self.termination(core_output)
    # Sample a Bernoulli termination mask: 1.0 where the termination head
    # fires, 0.0 otherwise. Comparing against uniform noise and casting the
    # boolean to float is equivalent to the previous
    # torch.where(cond, torch.ones(1), torch.zeros(1)) formulation (same
    # float32 result, same shape) but avoids allocating two extra tensors.
    self.termination_mask = (
        self.termination_prob > torch.rand_like(self.termination_prob)
    ).float()

    values = self.critic_linear(core_output)
    action_distribution_params, action_distribution = self.action_parameterization(core_output)

    # for non-trivial action spaces it is faster to do these together
    actions, log_prob_actions = sample_actions_log_probs(action_distribution)

    # perhaps `action_logits` is not the best name here since we now support continuous actions
    result = AttrDict(dict(
        actions=actions,  # (B * O) x (num_actions/D)
        # B x num_action_logits x O -> (B * O) x num_action_logits
        action_logits=action_distribution_params.reshape(-1, action_distribution.num_actions),
        log_prob_actions=log_prob_actions,  # (B * O) x 1
        values=values,
        termination_prob=self.termination_prob,
        termination_mask=self.termination_mask,
    ))

    if with_action_distribution:
        result.action_distribution = action_distribution

    return result
def forward_tail(self, core_output, with_action_distribution=False):
    """Split the concatenated multi-core output and produce actions and values.

    The core output is assumed to be the dim-1 concatenation of
    ``len(self.cores)`` per-core feature vectors: chunk 0 feeds the actor,
    chunk 1 feeds the critic.

    :param core_output: concatenated outputs of all cores
    :param with_action_distribution: if True, attach the distribution object
        to the returned dict
    :return: AttrDict with actions, action_logits, log_prob_actions, values
    """
    chunks = core_output.chunk(len(self.cores), dim=1)

    # critic head runs on the second chunk (deterministic, so ordering it
    # before the sampling step below does not perturb the RNG stream)
    values = self.critic_linear(chunks[1])

    # actor head runs on the first chunk
    dist_params, dist = self.action_parameterization(chunks[0])
    # sampling actions together with their log-probs is faster for
    # non-trivial action spaces
    actions, log_probs = sample_actions_log_probs(dist)

    result = AttrDict(dict(
        actions=actions,
        action_logits=dist_params,
        log_prob_actions=log_probs,
        values=values,
    ))
    if with_action_distribution:
        result.action_distribution = dist
    return result
def forward_tail(self, core_output, with_action_distribution=False):
    """Produce actions and value estimates from a single shared core output.

    :param core_output: core (e.g. RNN) output features for the current batch
    :param with_action_distribution: if True, attach the distribution object
        to the returned dict
    :return: AttrDict with actions, action_logits, log_prob_actions, values
    """
    dist_params, dist = self.action_parameterization(core_output)
    # sampling actions together with their log-probs is faster for
    # non-trivial action spaces
    actions, log_probs = sample_actions_log_probs(dist)
    values = self.critic_linear(core_output)

    # `action_logits` is arguably a misnomer now that continuous actions
    # are supported, but the key is kept for compatibility
    result = AttrDict(dict(
        actions=actions,
        action_logits=dist_params,
        log_prob_actions=log_probs,
        values=values,
    ))
    if with_action_distribution:
        result.action_distribution = dist
    return result