def forward(
    self,
    obs,
    reparameterize=True,
    deterministic=False,
    return_log_prob=False,
):
    """
    :param obs: Observation
    :param deterministic: If True, do not sample
    :param return_log_prob: If True, return a sample and its log probability
    """
    h = obs
    for i, fc in enumerate(self.fcs):
        h = self.hidden_activation(fc(h))
    mean = self.last_fc(h)
    if self.std is None:
        log_std = self.last_fc_log_std(h)
        log_std = torch.clamp(log_std, LOG_SIG_MIN, LOG_SIG_MAX)
        std = torch.exp(log_std)
    else:
        std = self.std
        log_std = self.log_std

    log_prob = None
    entropy = None
    mean_action_log_prob = None
    pre_tanh_value = None
    if deterministic:
        action = torch.tanh(mean)
    else:
        tanh_normal = TanhNormal(mean, std)
        if return_log_prob:
            if reparameterize is True:
                action, pre_tanh_value = tanh_normal.rsample(
                    return_pretanh_value=True)
            else:
                action, pre_tanh_value = tanh_normal.sample(
                    return_pretanh_value=True)
            log_prob = tanh_normal.log_prob(
                action,
                pre_tanh_value=pre_tanh_value,
            )
            log_prob = log_prob.sum(dim=1, keepdim=True)
        else:
            if reparameterize is True:
                action = tanh_normal.rsample()
            else:
                action = tanh_normal.sample()
    return (
        action,
        mean,
        log_std,
        log_prob,
        entropy,
        std,
        mean_action_log_prob,
        pre_tanh_value,
    )

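# All of these forward() variants rely on a TanhNormal distribution with an
# rlkit-style interface: (r)sample(return_pretanh_value=True) and
# log_prob(value, pre_tanh_value=...). Below is a minimal, self-contained
# sketch of that interface, assuming a diagonal Gaussian squashed by tanh;
# the class name and epsilon default are illustrative, not the upstream code.
import torch
from torch.distributions import Normal


class TanhNormalSketch:
    """tanh(Normal(mean, std)) with the change-of-variables log-density."""

    def __init__(self, normal_mean, normal_std, epsilon=1e-6):
        self.normal = Normal(normal_mean, normal_std)
        self.epsilon = epsilon

    def rsample(self, return_pretanh_value=False):
        # Reparameterized: gradients flow through z back to mean/std.
        z = self.normal.rsample()
        return (torch.tanh(z), z) if return_pretanh_value else torch.tanh(z)

    def sample(self, return_pretanh_value=False):
        # Non-reparameterized: the sample is detached from the graph.
        z = self.normal.sample()
        return (torch.tanh(z), z) if return_pretanh_value else torch.tanh(z)

    def log_prob(self, value, pre_tanh_value):
        # log p(a) = log p(z) - log|d tanh(z)/dz| = log p(z) - log(1 - a^2).
        return self.normal.log_prob(pre_tanh_value) - torch.log(
            1 - value * value + self.epsilon
        )
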
def forward(
    self,
    obs,
    reparameterize=False,
    deterministic=False,
    return_log_prob=False,
):
    """
    :param obs: Observation
    :param deterministic: If True, do not sample
    :param return_log_prob: If True, return a sample and its log probability
    """
    # Unpacking asserts a 3-D (time, batch, feature) input; t and b are
    # otherwise unused here.
    t, b, _ = obs.size()
    h = obs
    h = self.inner_forward(h)
    mean = self.last_fc(h)
    if self.std is None:
        log_std = self.last_fc_log_std(h)
        log_std = torch.clamp(log_std, LOG_SIG_MIN, LOG_SIG_MAX)
        std = torch.exp(log_std)
    else:
        std = self.std
        log_std = self.log_std

    log_prob = None
    expected_log_prob = None
    mean_action_log_prob = None
    pre_tanh_value = None
    if deterministic:
        action = torch.tanh(mean)
    else:
        tanh_normal = TanhNormal(mean, std)
        if return_log_prob:
            if reparameterize:
                action, pre_tanh_value = tanh_normal.rsample(
                    return_pretanh_value=True)
            else:
                action, pre_tanh_value = tanh_normal.sample(
                    return_pretanh_value=True)
            log_prob = tanh_normal.log_prob(
                action,
                pre_tanh_value=pre_tanh_value,
            )
            log_prob = log_prob.sum(dim=1, keepdim=True)
        else:
            if reparameterize:
                action = tanh_normal.rsample()
            else:
                action = tanh_normal.sample()
    return (
        action,
        mean,
        log_std,
        log_prob,
        expected_log_prob,
        std,
        mean_action_log_prob,
        pre_tanh_value,
    )

def forward(
    self,
    obs,
    reparameterize=True,
    deterministic=False,
):
    """
    :param obs: Observation
    :param deterministic: If True, do not sample (unused: this variant
        always samples and always returns the log probability)
    """
    h = obs
    for i, fc in enumerate(self.fcs):
        h = self.hidden_activation(fc(h))
    mean = self.last_fc(h)
    if self.std is None:
        log_std = self.last_fc_log_std(h)
        log_std = torch.clamp(log_std, LOG_SIG_MIN, LOG_SIG_MAX)
        std = torch.exp(log_std)
    else:
        std = self.std
        log_std = self.log_std

    tanh_normal = TanhNormal(mean, std)
    if reparameterize:
        action, pre_tanh_value = tanh_normal.rsample(
            return_pretanh_value=True)
    else:
        action, pre_tanh_value = tanh_normal.sample(
            return_pretanh_value=True)
    log_prob = tanh_normal.log_prob(
        action,
        pre_tanh_value=pre_tanh_value,
    )
    log_prob = log_prob.sum(dim=1, keepdim=True)
    return (
        action.cuda(),
        log_prob.cuda(),
        pre_tanh_value.cuda(),
        mean,
        log_std,
    )

def forward(
    self,
    obs,
    reparameterize=True,
    deterministic=False,
    return_log_prob=False,
):
    """
    :param obs: Observation
    :param deterministic: If True, do not sample
    :param return_log_prob: If True, return a sample and its log probability
    """
    h = obs
    for i, fc in enumerate(self.fcs):
        h = self.hidden_activation(fc(h))
    mean = self.last_fc(h)  # (N, heads * action_dim)
    if self.std is None:
        log_std = self.last_fc_log_std(h)
        log_std = torch.clamp(log_std, LOG_SIG_MIN, LOG_SIG_MAX)
        std = torch.exp(log_std)
    else:
        std = self.std
        log_std = self.log_std

    log_prob = None
    entropy = None
    mean_action_log_prob = None
    pre_tanh_value = None
    log_stds = None
    log_probs = None
    if deterministic:
        means = mean.view(-1, self.action_dim)
        actions = torch.tanh(means)
        actions = actions.view(-1, self.heads, self.action_dim)
    else:
        # Fold the heads into the batch: (N, heads * A) -> (N * heads, A).
        means = mean.view(-1, self.action_dim)
        stds = std.view(-1, self.action_dim)
        log_stds = log_std.view(-1, self.action_dim)
        tanh_normal = TanhNormal(means, stds)
        if return_log_prob:
            if reparameterize is True:
                action, pre_tanh_value = tanh_normal.rsample(
                    return_pretanh_value=True)
            else:
                action, pre_tanh_value = tanh_normal.sample(
                    return_pretanh_value=True)
            log_prob = tanh_normal.log_prob(
                action,
                pre_tanh_value=pre_tanh_value,
            )
            log_prob = log_prob.sum(dim=1, keepdim=True)
            # One summed log-prob per head: (N * heads, 1) -> (N, heads, 1).
            log_probs = log_prob.view(-1, self.heads, 1)
        else:
            if reparameterize is True:
                action = tanh_normal.rsample()
            else:
                action = tanh_normal.sample()
        actions = action.view(-1, self.heads, self.action_dim)
    return (
        actions,
        means,
        log_stds,
        log_probs,
        entropy,
        std,
        mean_action_log_prob,
        pre_tanh_value,
    )

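# A quick standalone shape check of the head-folding pattern used above
# (hypothetical sizes, not values from the original code):
import torch

N, heads, action_dim = 4, 3, 2
mean = torch.randn(N, heads * action_dim)   # output of last_fc
means = mean.view(-1, action_dim)           # (N * heads, A): one row per head
log_prob = torch.randn(N * heads, 1)        # summed log-prob per head
assert means.shape == (N * heads, action_dim)
assert log_prob.view(-1, heads, 1).shape == (N, heads, 1)
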
def forward(
    self,
    obs,
    reparameterize=True,
    deterministic=False,
    return_log_prob=False,
):
    """
    :param obs: Observation
    :param deterministic: If True, do not sample
    :param return_log_prob: If True, return a sample and its log probability
    """
    h = obs
    for i, fc in enumerate(self.fcs):
        h = self.hidden_activation(fc(h))
    # One output slot per mixture component: log-weight, mean, and log-std.
    out = self.last_fc(h).view(-1, self.k, (2 * self.action_dim + 1))
    log_w = out[..., 0]
    mean = out[..., 1:1 + self.action_dim]
    log_std = out[..., 1 + self.action_dim:]
    log_std = torch.clamp(log_std, LOG_SIG_MIN, LOG_SIG_MAX)
    log_w = torch.clamp(log_w, min=LOG_W_MIN)
    self.log_w = log_w
    self.mean = mean
    self.log_std = log_std
    std = torch.exp(log_std)

    log_prob = None
    entropy = None
    mean_action_log_prob = None
    pre_tanh_value = None
    arange = torch.arange(out.shape[0])
    if deterministic:
        # Act with the mean of the most likely component.
        ks = log_w.view(-1, self.k).argmax(1)
        action = torch.tanh(mean[arange, ks])
    else:
        sample_ks = Categorical(logits=log_w.view(-1, self.k)).sample()
        tanh_normal = TanhNormal(mean[arange, sample_ks],
                                 std[arange, sample_ks])
        if return_log_prob:
            action, pre_tanh_value = tanh_normal.sample(
                return_pretanh_value=True)
            # (NxKxA), (NxKxA), (Nx1xA) => (NxK)
            log_p_xz_t = log_gaussian(mean, log_std,
                                      pre_tanh_value[:, None, :].data)
            # Mixture density under the (unnormalized) log weights.
            log_p_x_t = (torch.logsumexp(log_p_xz_t + log_w, 1)
                         - torch.logsumexp(log_w, 1))
            # Squash correction for the tanh change of variables.
            log_prob = log_p_x_t - torch.log(1 - action**2 + 1e-6).sum(1)
            log_prob = log_prob[:, None]
        else:
            if reparameterize is True:
                action = tanh_normal.rsample()
            else:
                action = tanh_normal.sample()
    return (
        action,
        mean,
        log_std,
        log_prob,
        entropy,
        std,
        mean_action_log_prob,
        pre_tanh_value,
    )

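# The mixture variant above needs a diagonal-Gaussian log density that
# broadcasts (N, K, A) parameters against an (N, 1, A) point and sums over
# the action axis. A sketch of such a log_gaussian helper; the signature is
# an assumption inferred from the call site, not the original implementation:
import math
import torch


def log_gaussian(mean, log_std, x):
    # log N(x; mean, exp(log_std)^2), summed over the last (action) axis:
    # -0.5 * (x - mu)^2 / sigma^2 - log sigma - 0.5 * log(2 * pi).
    var = torch.exp(2 * log_std)
    return (
        -0.5 * ((x - mean) ** 2 / var)
        - log_std
        - 0.5 * math.log(2 * math.pi)
    ).sum(-1)
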
def forward(
    self,
    obs,
    reparameterize=True,
    deterministic=False,
    return_log_prob=False,
):
    """
    :param obs: Observation
    :param deterministic: If True, do not sample
    :param return_log_prob: If True, return a sample and its log probability
    """
    # TODO: clean up this flattening logic.
    if obs.shape[0] == 1:
        # A single image: flatten it into a (1, -1) row.
        h = torch.flatten(obs)
        h = h.view(1, -1)
    else:
        # Batches from the replay buffer are already flat; do not flatten.
        h = obs
    h = super().forward(h, None, complete=False)
    mean = self.last_fc(h)
    if self.std is None:
        log_std = self.last_fc_log_std(h)
        log_std = torch.clamp(log_std, LOG_SIG_MIN, LOG_SIG_MAX)
        std = torch.exp(log_std)
    else:
        std = self.std
        log_std = self.log_std

    log_prob = None
    entropy = None
    mean_action_log_prob = None
    pre_tanh_value = None
    if deterministic:
        action = torch.tanh(mean)
    else:
        tanh_normal = TanhNormal(mean, std)
        if return_log_prob:
            if reparameterize is True:
                action, pre_tanh_value = tanh_normal.rsample(
                    return_pretanh_value=True)
            else:
                action, pre_tanh_value = tanh_normal.sample(
                    return_pretanh_value=True)
            log_prob = tanh_normal.log_prob(
                action,
                pre_tanh_value=pre_tanh_value,
            )
            log_prob = log_prob.sum(dim=1, keepdim=True)
        else:
            if reparameterize is True:
                action = tanh_normal.rsample()
            else:
                action = tanh_normal.sample()
    return (
        action,
        mean,
        log_std,
        log_prob,
        entropy,
        std,
        mean_action_log_prob,
        pre_tanh_value,
    )

def forward(
    self,
    meta_size,
    batch_size,
    obs,
    reparameterize=False,
    deterministic=False,
    return_log_prob=False,
):
    """
    :param obs: Observation
    :param deterministic: If True, do not sample
    :param return_log_prob: If True, return a sample and its log probability
    """
    # meta_size and batch_size are part of the call signature but unused here.
    h = obs
    for i, fc in enumerate(self.fcs):
        h = fc(h)
        if self.layer_norm and i < len(self.fcs) - 1:
            h = self.layer_norms[i](h)
        h = self.hidden_activation(h)
        if self.use_dropout and i < len(self.fcs) - 1:
            h = self.dropouts[i](h)
    mean = self.last_fc(h)
    if self.std is None:
        log_std = self.last_fc_log_std(h)
        log_std = torch.clamp(log_std, LOG_SIG_MIN, LOG_SIG_MAX)
        std = torch.exp(log_std)
    else:
        std = self.std
        log_std = self.log_std

    log_prob = None
    expected_log_prob = None
    mean_action_log_prob = None
    pre_tanh_value = None
    if deterministic:
        action = torch.tanh(mean)
    else:
        tanh_normal = TanhNormal(mean, std)
        if return_log_prob:
            if reparameterize:
                action, pre_tanh_value = tanh_normal.rsample(
                    return_pretanh_value=True)
            else:
                action, pre_tanh_value = tanh_normal.sample(
                    return_pretanh_value=True)
            log_prob = tanh_normal.log_prob(
                action,
                pre_tanh_value=pre_tanh_value,
            )
            log_prob = log_prob.sum(dim=1, keepdim=True)
        else:
            if reparameterize:
                action = tanh_normal.rsample()
            else:
                action = tanh_normal.sample()
    return (
        action,
        mean,
        log_std,
        log_prob,
        expected_log_prob,
        std,
        mean_action_log_prob,
        pre_tanh_value,
    )

def forward(
    self,
    obs,
    reparameterize=True,
    deterministic=False,
    return_log_prob=False,
    return_entropy=False,
    return_log_prob_of_mean=False,
):
    """
    :param obs: Observation
    :param deterministic: If True, do not sample
    :param return_log_prob: If True, return a sample and its log probability
    :param return_entropy: If True, return the entropy of the pre-tanh
        Gaussian. Will not need to be differentiated through, so this can
        be a number.
    :param return_log_prob_of_mean: If True, return the log probability of
        the mean action. Will not need to be differentiated through, so
        this can be a number.
    """
    h = self.obs_processor(obs)
    h = self.mean_and_log_std_net(h)
    mean, log_std = torch.split(h, self.action_dim, dim=1)
    log_std = torch.clamp(log_std, LOG_SIG_MIN, LOG_SIG_MAX)
    std = torch.exp(log_std)

    log_prob = None
    entropy = None
    mean_action_log_prob = None
    pre_tanh_value = None
    if deterministic:
        action = torch.tanh(mean)
    else:
        tanh_normal = TanhNormal(mean, std)
        if return_log_prob:
            if reparameterize is True:
                action, pre_tanh_value = tanh_normal.rsample(
                    return_pretanh_value=True)
            else:
                action, pre_tanh_value = tanh_normal.sample(
                    return_pretanh_value=True)
            log_prob = tanh_normal.log_prob(
                action,
                pre_tanh_value=pre_tanh_value,
            )
            log_prob = log_prob.sum(dim=1, keepdim=True)
        else:
            if reparameterize is True:
                action = tanh_normal.rsample()
            else:
                action = tanh_normal.sample()
    if return_entropy:
        # Differential entropy of the pre-tanh Gaussian:
        # 0.5 * log(2 * pi * e * sigma^2) = log_std + 0.5 + log(2 * pi) / 2.
        entropy = log_std + 0.5 + np.log(2 * np.pi) / 2
        entropy = entropy.sum(dim=1, keepdim=True)
        # The entropy of the squashed tanh(Gaussian) has no known closed
        # form, so this path is intentionally unsupported.
        raise NotImplementedError()
    if return_log_prob_of_mean:
        tanh_normal = TanhNormal(mean, std)
        mean_action_log_prob = tanh_normal.log_prob(
            torch.tanh(mean),
            pre_tanh_value=mean,
        )
        mean_action_log_prob = mean_action_log_prob.sum(dim=1, keepdim=True)
    return (
        action,
        mean,
        log_std,
        log_prob,
        entropy,
        std,
        mean_action_log_prob,
        pre_tanh_value,
    )

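# How these forward() returns are typically consumed in an SAC-style actor
# update. A hedged sketch: `policy`, `q_network`, and `alpha` are stand-in
# names, not identifiers from this file.
import torch


def actor_loss(policy, q_network, obs, alpha=0.2):
    # Reparameterized sample so gradients reach the policy parameters.
    action, _, _, log_prob, *_ = policy(
        obs, reparameterize=True, return_log_prob=True
    )
    q_value = q_network(obs, action)
    # Maximize Q plus entropy bonus <=> minimize alpha * log_prob - Q.
    return (alpha * log_prob - q_value).mean()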