class WishartNormal:
    def __init__(self, variables):
        d = variables['loc'].shape[0]
        self.wishart = SqrtWishart({'df': variables['df'], 'W': variables['W']})
        self.normal = MVN_torch(
            loc=variables['loc'],
            covariance_matrix=variables['alpha'] * torch.eye(d, dtype=torch.double))

    def sample(self, sample_shape=(1,)):
        P_samples = self.wishart.sample(sample_shape)
        mu_samples = self.normal.sample(sample_shape)
        return tl2lt((P_samples, mu_samples))

    def sample_pos_neg(self, sample_shape=(1,), eps=None):
        if eps is None:
            # Default perturbation: twice the dimension of the Wishart scale matrix.
            eps = 2 * self.wishart.W.shape[0]
        P_neg, P, P_pos = self.wishart.sample_pos_neg(sample_shape, float(eps))
        mus = self.normal.sample(sample_shape)
        samples_neg = tl2lt((P_neg, mus))
        samples = tl2lt((P, mus))
        samples_pos = tl2lt((P_pos, mus))
        return samples_neg, samples, samples_pos

    def log_prob(self, Pmu):
        P, mu = Pmu
        return self.wishart.log_prob(P) + self.normal.log_prob(mu)

    def entropy(self):
        return self.wishart.entropy() + self.normal.entropy()
def evaluate(self, state, action):
    state = torch.from_numpy(state).float().to(device)
    action = torch.from_numpy(action).float().to(device)
    state_value = self.critic(state)
    action_feats = self.actor(state)
    if self.continious:
        dist = MultivariateNormal(torch.squeeze(action_feats),
                                  torch.diag(self.action_var))
        action_logprobs = dist.log_prob(torch.squeeze(action))
        dist_entropy = dist.entropy()
    else:
        action_probs = F.softmax(action_feats, dim=1)
        dist = Categorical(action_probs)
        action_logprobs = dist.log_prob(torch.squeeze(action))
        dist_entropy = dist.entropy()
    return action_logprobs, torch.squeeze(state_value), dist_entropy
def act(self, state, memory, inverse_action=None):
    state = torch.from_numpy(state).float().to(self.device)
    action_mean = self.actor(state)
    cov_mat = torch.diag(self.action_var).to(self.device)
    dist = MultivariateNormal(action_mean, cov_mat)
    action = dist.sample()
    if inverse_action is not None:
        # Blend the sampled action with the (detached) inverse-model action.
        action = (1 - self.alpha) * action + self.alpha * inverse_action.detach()
    if (action.abs() > 1).any():
        # Rescale so every component stays within [-1, 1].
        action = action / action.abs().max()
    action_logprob = dist.log_prob(action)
    entropy = dist.entropy()
    memory.states.append(state)
    memory.actions.append(action)
    memory.logprobs.append(action_logprob)
    return (action.detach(), action_mean.detach().cpu().numpy(), entropy.item())
def evaluate(self, depth, goal, vel, action):
    # Coerce inputs to tensors of the expected shapes.
    if type(depth) is not torch.Tensor:
        depth = torch.Tensor(list(depth)).view(-1, 10, 64, 80)
    if type(goal) is not torch.Tensor:
        goal = torch.Tensor(goal).view(-1, 2)
    if type(vel) is not torch.Tensor:
        vel = torch.Tensor(vel).view(-1, 2)
    torch.save(depth, "last_depth.pt")  # debug: persist the most recent depth batch
    # Convolve the depth image stack and concatenate with the goal and last velocity.
    conv = self.conv(depth)
    catted = torch.cat((conv, goal, vel), dim=1)
    # Get the means for the two actions.
    action_means = self.action_prediction(catted).view(-1, 2)
    action_var = self.action_var.expand_as(action_means)
    cov_mat = torch.diag_embed(action_var).to(self.device)
    dist = MultivariateNormal(action_means, cov_mat)
    action_logprobs = dist.log_prob(action)
    dist_entropy = dist.entropy()
    state_value = self.state_prediction(catted)
    return action_logprobs, torch.squeeze(state_value), dist_entropy
def forward(self, state, tensor_cv):
    # CV branch: conv + maxpool twice, flatten, then a linear layer.
    x = F.relu(self.maxp1(self.conv1(tensor_cv.unsqueeze(0))))
    x = F.relu(self.maxp2(self.conv2(x)))
    x = x.view(x.size(0), -1)  # flatten
    x = F.relu(self.linear_CNN(x))
    # Numeric state branch.
    output_1 = F.relu(self.linear1(state))
    output_2 = F.relu(self.linear2(output_1))
    # LSTM over the concatenated features.
    output_2 = torch.cat((x, output_2), 1)
    output_2 = output_2.unsqueeze(0)
    output_3, self.hidden_cell = self.LSTM_layer_3(output_2)
    a, b, c = output_3.shape
    output_4 = F.relu(self.linear4(output_3.view(-1, c)))
    mu = torch.tanh(self.mu(output_4))            # tanh keeps the mean in (-1, 1)
    sigma = F.relu(self.sigma(output_4)) + 0.001  # keep variances strictly positive
    cov = torch.diag_embed(sigma).to(device)      # (batch, d) -> (batch, d, d) diagonal covariance
    dist = MultivariateNormal(mu.to(device), cov)  # N(mu, diag(sigma)); the mean stays a vector
    entropy = dist.entropy().mean()
    action = dist.sample()
    action_logprob = dist.log_prob(action)
    return action, action_logprob, entropy
def forward(self, state, tensor_cv):
    # CV branch.
    x = F.relu(self.maxp1(self.conv1(tensor_cv)))
    x = F.relu(self.maxp2(self.conv2(x)))
    x = x.view(x.size(0), -1)  # flatten
    x = F.relu(self.linear_CNN_1(x)).reshape(1, 768)
    x = F.relu(self.linear_CNN_2(x)).reshape(1, 256)
    # Numeric state branch.
    output_1 = F.relu(self.linear1(state))
    output_2 = F.relu(self.linear2(output_1)).reshape(1, 255)
    # Merge the two branches.
    output_2 = torch.cat((x, output_2), 1)
    output_3 = F.relu(self.linear3(output_2))
    output_4 = F.relu(self.linear4(output_3))
    mu = torch.tanh(self.mu(output_4))            # tanh keeps the mean in (-1, 1)
    sigma = F.relu(self.sigma(output_4)) + 0.001  # keep variances strictly positive
    cov = torch.diag_embed(sigma).to(device)      # diagonal covariance matrix
    dist = MultivariateNormal(mu.to(device), cov)  # N(mu, diag(sigma)); the mean stays a vector
    entropy = dist.entropy().mean()
    action = dist.sample()
    action_logprob = dist.log_prob(action)
    action = torch.clamp(action.detach(), -0.8, 0.6)
    return action, action_logprob, entropy
def sample_action(self, state, eval: bool = False, **kwargs) -> Tuple[npTT, npTT, npTT]:
    """
    Sample an action from the Gaussian policy distribution.

    Return: action, log_prob, entropy
    """
    mean, std = self._get_mean_std(state)
    # The second positional argument of MultivariateNormal is the covariance
    # matrix, so `std` is expected to be a full covariance here despite the name.
    distribution = MultivariateNormal(mean, std)
    if not eval:
        action = torch.clamp(distribution.sample(), -1, 1).detach()
    else:
        # Evaluation mode: act deterministically with the mean.
        action = torch.clamp(mean, -1, 1).detach()
    log_prob = distribution.log_prob(action).view((-1, 1))
    entropy = distribution.entropy()
    return (
        process_kwargs(action, **kwargs),
        process_kwargs(log_prob, **kwargs),
        process_kwargs(entropy, **kwargs),
    )
def evaluate(self, state, opponent_state, action):
    if self.has_continuous_action_space:
        # Predict the opponent's action distribution and sample from it.
        pre_mean, pre_sigma = self.om(opponent_state)
        pre_var = (pre_sigma**2).repeat(1, 2).to(device)
        pre_mat = torch.diag_embed(pre_var).to(device)
        pre_dist = MultivariateNormal(pre_mean, pre_mat)
        pre_action = pre_dist.sample().clamp(-1, 1)
        # Condition the actor on the predicted opponent action.
        action_mean, action_sigma = self.actor(state, pre_action)
        action_var = (action_sigma**2).repeat(1, 2).to(device)
        cov_mat = torch.diag_embed(action_var).to(device)
        dist = MultivariateNormal(action_mean, cov_mat)
        # For single-action environments.
        if self.action_dim == 1:
            action = action.reshape(-1, self.action_dim)
    else:
        action_probs = self.actor(state)
        dist = Categorical(action_probs)
    action_logprobs = dist.log_prob(action)
    dist_entropy = dist.entropy()
    state_values = self.critic(state, action)
    return action_logprobs, state_values, dist_entropy
def evaluate(self, inputs, logits, outputs):
    # Diagonal covariance with per-dimension variance exp(log_std)**2.
    covariance = torch.diag(self.log_std.exp() * self.log_std.exp())
    distribution = MultivariateNormal(logits, covariance)
    actions_log_prob = distribution.log_prob(outputs)
    entropy = distribution.entropy()
    return actions_log_prob, entropy
def _entropy(self, s, a):
    mean, std = self.actor(s)
    std = torch.stack([std] * mean.shape[0], dim=0)
    cov = torch.diag_embed(std)
    dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
    entropy = dist.entropy()
    return entropy
def evaluate(self, state, action):
    action_mean, value = self.forward(state)
    # Fixed diagonal covariance: variance 0.5**0.5 in every action dimension.
    cov_mat = torch.diag(torch.ones(self.action_space).to(device) * 0.5**0.5)
    dist = MultivariateNormal(action_mean, cov_mat)
    return action, dist.log_prob(action), value, dist.entropy()
def evaluate(self, state, action):
    action_mean = self.actor(state)
    dist = MultivariateNormal(action_mean, torch.diag(self.action_var))
    action_logprobs = dist.log_prob(action)
    dist_entropy = dist.entropy()
    state_value = self.critic(state)
    return action_logprobs, state_value, dist_entropy
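# A hedged aside on the two diagonal constructors used across these snippets
# (shapes below are illustrative, not taken from any snippet): torch.diag turns
# a vector into a single (d, d) matrix that broadcasts over the batch of means,
# while torch.diag_embed maps a (batch, d) variance tensor to (batch, d, d).
import torch
from torch.distributions import MultivariateNormal

mean = torch.zeros(8, 3)      # batch of 8 three-dimensional means
var = torch.full((3,), 0.36)  # shared per-dimension variance

# One shared (3, 3) covariance, broadcast across the batch.
dist_shared = MultivariateNormal(mean, torch.diag(var))
# Per-sample (8, 3, 3) covariances from a batched variance tensor.
dist_batched = MultivariateNormal(mean, torch.diag_embed(var.expand(8, 3)))

assert dist_shared.batch_shape == dist_batched.batch_shape == torch.Size([8])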
def estimate_action(self, state, action) -> Tuple[TT, TT]:
    """
    Create the distribution from the state and compute the given action's log_prob.

    Return: log_prob, entropy
    """
    _action = make_it_batched_torch_tensor(action, self.device)
    distribution = MultivariateNormal(*self._get_mean_std(state))
    return (distribution.log_prob(_action).view((-1, 1)),
            distribution.entropy().view((-1, 1)))
def evaluate(self, state, action):
    _, _, value, action_mean = self.forward(state)
    cov_mat = torch.diag(self.action_var).to(self.device)
    dist = MultivariateNormal(action_mean, cov_mat)
    action_logprobs = dist.log_prob(action)
    dist_entropy = dist.entropy()
    return action_logprobs, torch.squeeze(value), dist_entropy
def evaluate(self, state, action):
    latent = self.encoder(state)
    action_mean = self.actor(latent)
    dist = MultivariateNormal(torch.squeeze(action_mean),
                              torch.diag(self.action_var))
    action_logprobs = dist.log_prob(torch.squeeze(action))
    dist_entropy = dist.entropy()
    state_value = self.critic(latent)
    return action_logprobs, torch.squeeze(state_value), dist_entropy
def evaluate(self, state, action):
    if self.policy_model == 'Gaussian':
        action_mean = self.actor(state)
        action_var = self.action_var.expand_as(action_mean)
        cov_mat = torch.diag_embed(action_var).to(device)
        dist = MultivariateNormal(action_mean, cov_mat)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
    elif self.policy_model == 'Beta':
        # Shift the outputs by +1 so both Beta concentrations are > 1.
        action_alpha = self.alpha(state) + 1
        action_beta = self.beta(state) + 1
        dist = Beta(action_alpha, action_beta)
        # Beta is per-dimension: sum log-probs and entropies over action dims.
        action_logprobs = torch.sum(dist.log_prob(action), 1)
        dist_entropy = torch.sum(dist.entropy(), 1)
    state_value = self.critic(state)
    return action_logprobs, torch.squeeze(state_value), dist_entropy
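# The Beta branch above sums per-dimension log-probs and entropies by hand.
# torch.distributions.Independent performs the same reduction; a minimal
# sketch with made-up concentrations:
import torch
from torch.distributions import Beta, Independent

alpha = torch.full((4, 2), 2.0)   # hypothetical batch of 4, action dim 2
beta = torch.full((4, 2), 3.0)

per_dim = Beta(alpha, beta)       # independent per-dimension Betas
joint = Independent(per_dim, 1)   # reinterpret the last dim as one event

a = joint.sample()                # (4, 2)
assert torch.allclose(joint.log_prob(a), per_dim.log_prob(a).sum(-1))
assert torch.allclose(joint.entropy(), per_dim.entropy().sum(-1))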
def evaluate(self, state, action):
    action_mean = self.actor(state)
    cov_mat = torch.diag(torch.exp(self.log_std)).to(self.device)
    distrib = MultivariateNormal(action_mean, cov_mat)
    action_log_probs = distrib.log_prob(action).unsqueeze(1)
    dist_entropy = distrib.entropy()
    value = self.critic(state)
    return action_log_probs, value, dist_entropy
def evaluate(self, state, action):
    action_mean = self.actor(state)
    action_var = self.action_var.expand_as(action_mean)
    cov_mat = torch.diag_embed(action_var).to(self.device)
    dist = MultivariateNormal(action_mean, cov_mat)
    action_logprobs = dist.log_prob(action)
    dist_entropy = dist.entropy()
    state_value = self.critic(state)
    return action_logprobs, torch.squeeze(state_value), dist_entropy
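# The snippet above is the pattern most of this collection repeats: a learned
# mean plus a state-independent variance vector lifted to batched diagonal
# covariances. A minimal self-contained sketch of just the distribution
# mechanics (all shapes and values here are illustrative):
import torch
from torch.distributions import MultivariateNormal

mean = torch.zeros(4, 2)                      # batch of 4 actions, dim 2
var = torch.full((2,), 0.25).expand_as(mean)  # per-dimension variances
cov = torch.diag_embed(var)                   # (4, 2, 2) diagonal covariances
dist = MultivariateNormal(mean, cov)

action = dist.sample()            # (4, 2)
log_prob = dist.log_prob(action)  # (4,): one joint log-density per sample
entropy = dist.entropy()          # (4,): one entropy per sample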
def select_action(self, x):
    mu, cov = self.forward(x)
    tril = self.reshape_output(mu, cov)
    dist = MultivariateNormal(mu, scale_tril=tril)
    if self.pwd:
        # Reparameterized (pathwise) sample: gradients flow through it.
        action = dist.rsample()
    else:
        action = dist.sample()
    log_prob = dist.log_prob(action)
    entropy = dist.entropy()
    return action, log_prob, entropy
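# On the `pwd` branch above: rsample() draws via the reparameterization trick,
# so gradients flow from the sample back to the distribution parameters,
# whereas sample() blocks them. A minimal sketch (values are illustrative):
import torch
from torch.distributions import MultivariateNormal

mu = torch.zeros(2, requires_grad=True)
tril = torch.diag(torch.tensor([0.5, 0.5]))  # lower-triangular scale, cov = L @ L.T

dist = MultivariateNormal(mu, scale_tril=tril)
action = dist.rsample()   # differentiable w.r.t. mu
action.sum().backward()   # the gradient reaches mu through the sample
assert mu.grad is not None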
def evaluate_action(self, mu, actions, sigma):
    n_batch = len(mu)
    if self.n_ctrl > 1:
        cov = torch.eye(self.n_ctrl).double() * sigma**2
        cov = cov.repeat(n_batch, 1, 1)
        dist = MultivariateNormal(mu, cov)
    else:
        dist = Normal(mu, torch.ones_like(mu) * sigma)
    log_prob = dist.log_prob(actions.double())
    entropy = dist.entropy()
    return log_prob, entropy
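# Note the shape difference between the two branches above: MultivariateNormal
# returns one log-prob/entropy per batch element, while the univariate Normal
# returns per-dimension values. A sketch with illustrative shapes:
import torch
from torch.distributions import MultivariateNormal, Normal

mu = torch.zeros(5, 3, dtype=torch.double)
sigma = 0.1
cov = (torch.eye(3, dtype=torch.double) * sigma**2).repeat(5, 1, 1)

mvn = MultivariateNormal(mu, cov)
uni = Normal(mu, torch.ones_like(mu) * sigma)

x = torch.zeros(5, 3, dtype=torch.double)
print(mvn.log_prob(x).shape, mvn.entropy().shape)  # torch.Size([5]) torch.Size([5])
print(uni.log_prob(x).shape, uni.entropy().shape)  # torch.Size([5, 3]) torch.Size([5, 3])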
def evaluate(self, states, actions):
    mean = self.actor(states)
    cov_vec = self.action_var.expand_as(mean)
    cov = torch.diag_embed(cov_vec)
    dist = MultivariateNormal(mean, cov)
    if self.action_size == 1:
        actions = actions.reshape(-1, self.action_size)
    log_prob = dist.log_prob(actions)
    dist_entropy = dist.entropy()
    state_values = self.critic(states)
    return log_prob, state_values, dist_entropy
def evaluate(self, state, action):
    # state: (4000, 24); action: (4000, 4)
    state_value = self.critic(state)  # (4000, 1)
    # Compute the action log-probabilities and the distribution entropy.
    action_mean = self.actor(state)                       # (4000, 4)
    action_var = self.action_var.expand_as(action_mean)   # (4000, 4)
    cov_mat = torch.diag_embed(action_var).to(device)     # (4000, 4, 4)
    dist = MultivariateNormal(action_mean, cov_mat)
    action_logprobs = dist.log_prob(action)
    dist_entropy = dist.entropy()
    return action_logprobs, torch.squeeze(state_value), dist_entropy
def evaluate(self, state, action):
    """
    Note that 'state' and 'action' here can be batches of states and actions.
    """
    mu_batch = self.block(state)
    variance_batch = self.variance.expand_as(mu_batch)
    cov_batch = torch.diag_embed(variance_batch)
    dist = MultivariateNormal(mu_batch, cov_batch)
    action_logprobs = dist.log_prob(action)
    dist_entropy = dist.entropy()
    return action_logprobs, dist_entropy
def evaluate(self, observations, action):
    action_mean = torch.squeeze(self.actor(observations))
    action_var = self.action_var.expand_as(action_mean)
    cov_mat = torch.diag_embed(action_var).to(device)
    dist = MultivariateNormal(action_mean, cov_mat)
    action_logprobs = dist.log_prob(torch.squeeze(action))
    dist_entropy = dist.entropy()
    observation_value = self.critic(observations)
    return action_logprobs, torch.squeeze(observation_value), dist_entropy
def evaluate(self, state, action):
    '''Evaluate action for a given state.'''
    action_mean, _, state_value = self.forward(state)
    action_var = self.action_var.expand_as(action_mean)
    cov_mat = torch.diag_embed(action_var)
    dist = MultivariateNormal(action_mean, cov_mat)
    action_logprobs = dist.log_prob(action)
    dist_entropy = dist.entropy()
    return action_logprobs, torch.squeeze(state_value), dist_entropy
def evaluate(self, states, action):
    states = torch.stack(states)
    states = states.view(-1, *states.shape[-3:])
    actor_critic_input = self.conv(states).view(-1, self.size)
    action_mean = self.actor(actor_critic_input)
    action_var = self.action_var.repeat(states.shape[0], 1)
    cov_mat = torch.diag_embed(action_var).to(device)
    dist = MultivariateNormal(action_mean, cov_mat)
    # `action_size` is assumed to be a module-level constant in the source.
    action = action.view(-1, action_size)
    action_logprobs = dist.log_prob(action).view(states.shape[:-3])
    dist_entropy = dist.entropy().view(states.shape[:-3])
    state_value = self.critic(actor_critic_input).view(states.shape[:-3])
    return action_logprobs, torch.squeeze(state_value), dist_entropy
def evaluate(self, states, actions):
    action_means = self.agent(states)
    action_var = torch.full((action_dim,), self.sigma)
    action_var = action_var.expand_as(action_means)
    cov_mat = torch.diag_embed(action_var).to(device)
    dist = MultivariateNormal(action_means, cov_mat)
    action_logprobs = dist.log_prob(actions)
    dist_entropy = dist.entropy()
    return action_logprobs, dist_entropy
def get_training_params(self, frame, mes, action):
    frame = torch.squeeze(torch.stack(frame))
    mes = torch.squeeze(torch.stack(mes))
    action = torch.stack(action)
    mean = self.actor_(frame, mes)
    action_expanded = self.action_var.expand_as(mean)
    cov_matrix = torch.diag_embed(action_expanded).to(device)
    gauss_dist = MultivariateNormal(mean, cov_matrix)
    action_log_prob = gauss_dist.log_prob(action).to(device)
    entropy = gauss_dist.entropy().to(device)
    state_value = torch.squeeze(self.critic_(frame, mes)).to(device)
    return action_log_prob, state_value, entropy
def evaluate(self, old_state, old_action):
    action_mean = self.actor(old_state)
    action_var = self.action_var.expand_as(action_mean)
    cov_mat = torch.diag_embed(action_var).to(self.device)
    dist = MultivariateNormal(action_mean, cov_mat)
    # Probability of the old action under the new policy.
    action_log_probs = dist.log_prob(old_action)
    state_value = self.critic(old_state)
    dist_entropy = dist.entropy()
    return torch.squeeze(state_value), action_log_probs, dist_entropy
def forward(self, state):
    output_1 = F.relu(self.linear1(state))
    output_2 = F.relu(self.linear2(output_1))
    mu = 2 * torch.sigmoid(self.mu(output_2))     # mean in (0, 2)
    sigma = F.relu(self.sigma(output_2)) + 0.001  # avoid zero variance
    cov = torch.diag_embed(sigma).to(device)      # diagonal covariance matrix
    dist = MultivariateNormal(mu.to(device), cov)  # N(mu, diag(sigma)); the mean stays a vector
    entropy = dist.entropy().mean()
    action = dist.sample()
    action_logprob = dist.log_prob(action)
    return action.detach(), action_logprob, entropy