def forward(self, observation):
    """Map an observation to a clipped log-lambda estimate.

    The observation is first decoded into a state when a decoder is
    configured, pushed through the feature extractor, flattened, and
    projected by ``lambda_layer``. The result is bounded to [-10, 6]
    with a clip that still passes gradients outside the range.
    """
    state = observation if self.decoder is None else self.decoder(observation)
    flat_feature = self.extractor(state).view(-1, self.feature_size)
    raw_log_lambda = self.lambda_layer(flat_feature)
    return clip_but_pass_gradient(raw_log_lambda, lower=-10., upper=6.)
def forward(self, observation, action):
    """Return the Q-value for an (observation, action) pair.

    The observation is decoded into a state when a decoder is present,
    concatenated with the action along the last dimension, featurized,
    and projected by ``value_layer``. The output is clipped to [0, 1]
    while letting gradients pass through the clip.
    """
    state = observation if self.decoder is None else self.decoder(observation)
    # Q-value is computed from the joint state-action input.
    joint_input = torch.cat((state, action), dim=-1)
    flat_feature = self.extractor(joint_input).view(-1, self.feature_size)
    raw_q = self.value_layer(flat_feature)
    return clip_but_pass_gradient(raw_q, 0., 1.)
def log_prob(self, observation, action):
    """Log-probability of ``action`` under the tanh-squashed Gaussian policy.

    The squashed action is inverted via atanh to recover the pre-squash
    sample, its Gaussian likelihood is evaluated against the predicted
    mean/logstd, and the tanh change-of-variables correction
    ``log(1 - action^2 + EPS)`` is subtracted per action dimension.
    """
    state = observation if self.decoder is None else self.decoder(observation)
    flat_feature = self.extractor(state).view(-1, self.feature_size)
    mean = self.mean_layer(flat_feature)
    # Keep logstd in a sane range without killing gradients at the edges.
    logstd = clip_but_pass_gradient(self.logstd_layer(flat_feature), -6., 2.)
    raw_action = atanh(action)
    gaussian_logp = normal_likelihood(raw_action, mean, logstd)
    squash_correction = torch.log(-action**2 + 1. + EPS).sum(1, keepdim=True)
    return gaussian_logp - squash_correction
def sample(self, observation, deterministic=False):
    """Draw an action from the tanh-squashed Gaussian policy.

    When ``deterministic`` is True the squashed mean ``tanh(mean)`` is
    returned; otherwise a Gaussian sample ``mean + std * eps`` is drawn
    and squashed with tanh.
    """
    state = observation if self.decoder is None else self.decoder(observation)
    flat_feature = self.extractor(state).view(-1, self.feature_size)
    mean = self.mean_layer(flat_feature)
    logstd = clip_but_pass_gradient(self.logstd_layer(flat_feature), -6., 2.)
    std = torch.exp(logstd)
    if deterministic:
        return torch.tanh(mean)
    noise = torch.randn(mean.size(), dtype=mean.dtype, device=mean.device)
    return torch.tanh(mean + std * noise)
def forward(self, observation):
    """Sample an action with the reparameterization trick.

    Returns a tuple ``(sample, deterministic_action, log_prob)`` where
    ``sample`` is ``tanh(mean + std * eps)``, ``deterministic_action``
    is ``tanh(mean)``, and ``log_prob`` is the Gaussian likelihood of
    the pre-squash sample minus the tanh change-of-variables correction.
    """
    state = observation if self.decoder is None else self.decoder(observation)
    flat_feature = self.extractor(state).view(-1, self.feature_size)
    mean = self.mean_layer(flat_feature)
    logstd = clip_but_pass_gradient(self.logstd_layer(flat_feature), -6., 2.)
    std = torch.exp(logstd)
    # Reparameterization trick: sample = mean + std * eps, eps ~ N(0, I),
    # so gradients flow through mean and std.
    noise = torch.randn(mean.size(), dtype=mean.dtype, device=mean.device)
    raw_sample = mean + std * noise
    squashed = torch.tanh(raw_sample)
    correction = torch.log(-squashed**2 + 1. + EPS).sum(1, keepdim=True)
    log_prob = normal_likelihood(raw_sample, mean, logstd) - correction
    return squashed, torch.tanh(mean), log_prob