Example #1
class WishartNormal:
    def __init__(self, variables):
        d = variables['loc'].shape[0]
        self.wishart = SqrtWishart({
            'df': variables['df'],
            'W': variables['W']
        })
        self.normal = MVN_torch(loc=variables['loc'],
                                covariance_matrix=variables['alpha'] *
                                torch.eye(d, dtype=torch.double))

    def sample(self, sample_shape=(1, )):
        P_samples = self.wishart.sample(sample_shape)
        mu_samples = self.normal.sample(sample_shape)
        return tl2lt((P_samples, mu_samples))

    def sample_pos_neg(self, sample_shape=(1, ), eps=None):
        if eps is None:
            eps = {'W_df': 2 * self.wishart.W.shape[0]}
        P_neg, P, P_pos = self.wishart.sample_pos_neg(sample_shape, float(eps['W_df']))
        mus = self.normal.sample(sample_shape)
        samples_neg = tl2lt((P_neg, mus))
        samples = tl2lt((P, mus))
        samples_pos = tl2lt((P_pos, mus))
        return samples_neg, samples, samples_pos

    def log_prob(self, Pmu):
        P, mu = Pmu
        return self.wishart.log_prob(P) + self.normal.log_prob(mu)

    def entropy(self):
        return self.wishart.entropy() + self.normal.entropy()
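In Example #1 the joint log_prob and entropy are sums of the Wishart term and the Normal term because the precision matrix and the mean are drawn independently. A minimal self-contained sketch of that factorization using only torch.distributions (Wishart is available in recent PyTorch; the SqrtWishart, MVN_torch, and tl2lt helpers above come from the surrounding module and are assumed, not reproduced):

import torch
from torch.distributions import MultivariateNormal, Wishart

d = 3
# independent priors: P ~ Wishart(df, W), mu ~ N(loc, alpha * I)
wishart = Wishart(df=torch.tensor(2.0 * d), covariance_matrix=torch.eye(d))
normal = MultivariateNormal(loc=torch.zeros(d), covariance_matrix=2.0 * torch.eye(d))

P = wishart.sample()
mu = normal.sample()

# independence => the joint log-density is the sum of the component log-densities
joint_log_prob = wishart.log_prob(P) + normal.log_prob(mu)
joint_entropy = wishart.entropy() + normal.entropy()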
Example #2
 def evaluate(self, state, action):
     state = torch.from_numpy(state).float().to(device)
     action = torch.from_numpy(action).float().to(device)
     state_value = self.critic(state)
     action_feats = self.actor(state)
     if self.continious:
         dist = MultivariateNormal(torch.squeeze(action_feats),
                                   torch.diag(self.action_var))
         action_logprobs = dist.log_prob(torch.squeeze(action))
         dist_entropy = dist.entropy()
     else:
         action_probs = F.softmax(action_feats, dim=1)
         dist = Categorical(action_probs)
         action_logprobs = dist.log_prob(torch.squeeze(action))
         dist_entropy = dist.entropy()
     return action_logprobs, torch.squeeze(state_value), dist_entropy
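Example #2 branches between a MultivariateNormal for continuous actions and a Categorical over softmax probabilities for discrete ones. A minimal sketch of the discrete branch with placeholder shapes:

import torch
import torch.nn.functional as F
from torch.distributions import Categorical

action_feats = torch.randn(4, 6)                  # (batch, n_actions) raw actor outputs
dist = Categorical(F.softmax(action_feats, dim=1))

actions = dist.sample()                           # (4,) integer action indices
action_logprobs = dist.log_prob(actions)          # (4,)
dist_entropy = dist.entropy()                     # (4,)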
Example #3
    def act(self, state, memory, inverse_action=None):
        state = torch.from_numpy(state).float().to(self.device)
        action_mean = self.actor(state)
        # print(action_mean)
        cov_mat = torch.diag(self.action_var).to(self.device)

        dist = MultivariateNormal(action_mean, cov_mat)
        action = dist.sample()

        if inverse_action is not None:
            action = (1 - self.alpha
                      ) * action + self.alpha * inverse_action.detach()
            if (action.abs() > 1).any():
                action = action / action.abs().max()
        action_logprob = dist.log_prob(action)
        entropy = dist.entropy()

        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(action_logprob)

        # action += torch.randn(self.action_dim) * 0.1

        return (action.detach(), action_mean.detach().cpu().numpy(),
                entropy.item())
Example #4
    def evaluate(self, depth, goal, vel, action):
        # Translate everything to tensors of the correct shape
        if type(depth) is not torch.Tensor:
            depth = torch.Tensor(list(depth)).view(-1, 10, 64, 80)
        if type(goal) is not torch.Tensor:
            goal = torch.Tensor(goal).view(-1, 2)
        if type(vel) is not torch.Tensor:
            vel = torch.Tensor(vel).view(-1, 2)

        # Convolve the depth image stack and concat with the goal and last velocity
        torch.save(depth, "last_depth.pt")
        conv = self.conv(depth)
        catted = torch.cat((conv, goal, vel), dim=1)
        # print(catted.shape)

        # Get the means for the two actions
        action_means = self.action_prediction(catted).view(-1, 2)
        # print("Action Means: ", action_means)

        action_var = self.action_var.expand_as(action_means)
        cov_mat = torch.diag_embed(action_var).to(self.device)
        dist = MultivariateNormal(action_means, cov_mat)

        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_value = self.state_prediction(catted)

        return action_logprobs, torch.squeeze(state_value), dist_entropy
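The expand_as + diag_embed pattern in Example #4 (and in most of the evaluate snippets below) turns a single variance vector into a batch of diagonal covariance matrices. A shape-level sketch with placeholder sizes:

import torch
from torch.distributions import MultivariateNormal

batch, action_dim = 5, 2
action_means = torch.randn(batch, action_dim)     # (B, d)
action_var = torch.full((action_dim,), 0.25)      # (d,) shared across the batch

var_batched = action_var.expand_as(action_means)  # (B, d)
cov_mat = torch.diag_embed(var_batched)           # (B, d, d) diagonal covariances
dist = MultivariateNormal(action_means, cov_mat)

actions = dist.sample()                           # (B, d)
log_probs = dist.log_prob(actions)                # (B,) one log-density per sample
entropy = dist.entropy()                          # (B,)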
Example #5
 def forward(self, state, tensor_cv):
     # CV
     x = F.relu(self.maxp1(self.conv1(tensor_cv.unsqueeze(0))))
     x = F.relu(self.maxp2(self.conv2(x)))
     x = x.view(x.size(0), -1)  # flatten
     x = F.relu(self.linear_CNN(x))
     # num
     output_1 = F.relu(self.linear1(state))
     output_2 = F.relu(self.linear2(output_1))
     # LSTM
     output_2 = torch.cat((x, output_2), 1)
     output_2 = output_2.unsqueeze(0)
     output_3, self.hidden_cell = self.LSTM_layer_3(
         output_2)  #,self.hidden_cell
     a, b, c = output_3.shape
     #
     output_4 = F.relu(self.linear4(output_3.view(-1, c)))  #
     mu = torch.tanh(self.mu(output_4)).to(device)  # tanh keeps the mean in (-1, 1)
     sigma = F.relu(self.sigma(output_4)) + 0.001  # strictly positive variances
     sigma = torch.diag_embed(sigma).to(device)  # (batch, d, d) diagonal covariance
     dist = MultivariateNormal(mu, sigma)  # N(mu, diag(sigma))
     entropy = dist.entropy().mean()
     action = dist.sample()
     action_logprob = dist.log_prob(action)
     return action, action_logprob, entropy
Example #6
 def forward(self, state, tensor_cv):
     # CV
     x = F.relu(self.maxp1(self.conv1(tensor_cv)))
     x = F.relu(self.maxp2(self.conv2(x)))
     x = x.view(x.size(0), -1)  # flatten
     x = F.relu(self.linear_CNN_1(x)).reshape(1, 768)
     x = F.relu(self.linear_CNN_2(x)).reshape(1, 256)
     # num
     output_1 = F.relu(self.linear1(state))
     output_2 = F.relu(self.linear2(output_1)).reshape(1, 255)
     # merge
     output_2 = torch.cat((x, output_2), 1)
     output_3 = F.relu(self.linear3(output_2))
     #
     output_4 = F.relu(self.linear4(
         output_3))  #F.relu(self.linear4(output_3.view(-1,c))) #
     mu = torch.tanh(self.mu(output_4)).to(device)  # tanh keeps the mean in (-1, 1)
     sigma = F.relu(self.sigma(output_4)) + 0.001  # strictly positive variances
     sigma = torch.diag_embed(sigma).to(device)  # (batch, d, d) diagonal covariance
     dist = MultivariateNormal(mu, sigma)  # N(mu, diag(sigma))
     entropy = dist.entropy().mean()
     action = dist.sample()
     action_logprob = dist.log_prob(action)
     action = torch.clamp(action.detach(), -0.8, 0.6)
     return action, action_logprob, entropy
Example #7
    def sample_action(self,
                      state,
                      eval: bool = False,
                      **kwargs) -> Tuple[npTT, npTT, npTT]:
        """
        Sample an action from the Gaussian distribution.
        Return: action, log_prob, entropy
        """
        mean, std = self._get_mean_std(state)
        distribution = MultivariateNormal(mean, std)

        if not eval:
            action = torch.clamp(distribution.sample(), -1, 1).detach()
        else:
            action = torch.clamp(mean, -1, 1).detach()

        log_prob = distribution.log_prob(action)
        log_prob = log_prob.view((-1, 1))

        entropy = distribution.entropy()

        return (
            process_kwargs(action, **kwargs),
            process_kwargs(log_prob, **kwargs),
            process_kwargs(entropy, **kwargs),
        )
Example #8
    def evaluate(self, state, opponent_state, action):

        if self.has_continuous_action_space:
            pre_mean, pre_sigma = self.om(opponent_state)
            pre_var = pre_sigma**2
            pre_var = pre_var.repeat(1, 2).to(device)
            pre_mat = torch.diag_embed(pre_var).to(device)
            pre_dist = MultivariateNormal(pre_mean, pre_mat)
            pre_action = pre_dist.sample()
            pre_action = pre_action.clamp(-1, 1)

            action_mean, action_sigma = self.actor(state, pre_action)
            action_var = action_sigma**2
            action_var = action_var.repeat(1, 2).to(device)
            cov_mat = torch.diag_embed(action_var).to(device)
            dist = MultivariateNormal(action_mean, cov_mat)

            # For Single Action Environments.
            if self.action_dim == 1:
                action = action.reshape(-1, self.action_dim)
        else:
            action_probs = self.actor(state)
            dist = Categorical(action_probs)

        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_values = self.critic(state, action)

        return action_logprobs, state_values, dist_entropy
Example #9
    def evaluate(self, inputs, logits, outputs):
        covariance = torch.diag(self.log_std.exp() * self.log_std.exp())
        distribution = MultivariateNormal(logits, covariance)

        actions_log_prob = distribution.log_prob(outputs)
        entropy = distribution.entropy()

        return actions_log_prob, entropy
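Example #9 builds the covariance from a state-independent log_std, so only the mean comes from the network. A hedged sketch of how such a head is commonly declared; the class and its names are illustrative, not the original module:

import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal

class GaussianHead(nn.Module):
    """Diagonal Gaussian policy head with a learnable, state-independent log_std."""

    def __init__(self, action_dim: int):
        super().__init__()
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def evaluate(self, logits, outputs):
        covariance = torch.diag(self.log_std.exp() ** 2)  # std^2 on the diagonal
        distribution = MultivariateNormal(logits, covariance)
        return distribution.log_prob(outputs), distribution.entropy()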
Example #10
    def _entropy(self, s, a):
        mean, std = self.actor(s)
        std = torch.stack([std] * mean.shape[0], dim=0)

        cov = torch.diag_embed(std)
        dist = MultivariateNormal(loc=mean, covariance_matrix=cov)
        entropy = dist.entropy()
        return entropy
Example #11
    def evaluate(self, state, action):
        action_mean, value = self.forward(state)
        cov_mat = torch.diag(
            torch.ones(self.action_space).to(device) * 0.5**0.5)

        dist = MultivariateNormal(action_mean, cov_mat)

        return action, dist.log_prob(action), value, dist.entropy()
Example #12
    def evaluate(self, state, action):
        action_mean = self.actor(state)
        dist = MultivariateNormal(action_mean, torch.diag(self.action_var))
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_value = self.critic(state)

        return action_logprobs, state_value, dist_entropy
Example #13
 def estimate_action(self, state, action) -> Tuple[TT, TT]:
     """
     Create the distribution from the state and compute the given action's log_prob.
     Return: log_prob, entropy
     """
     _action = make_it_batched_torch_tensor(action, self.device)
     distribution = MultivariateNormal(*self._get_mean_std(state))
     return distribution.log_prob(_action).view(
         (-1, 1)), distribution.entropy().view((-1, 1))
Example #14
    def evaluate(self, state, action):
        _, _, value, action_mean = self.forward(state)
        cov_mat = torch.diag(self.action_var).to(self.device)
        dist = MultivariateNormal(action_mean, cov_mat)

        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()

        return action_logprobs, torch.squeeze(value), dist_entropy
Example #15
    def evaluate(self, state, action):
        latent = self.encoder(state)
        action_mean = self.actor(latent)
        dist = MultivariateNormal(torch.squeeze(action_mean), torch.diag(self.action_var))

        action_logprobs = dist.log_prob(torch.squeeze(action))
        dist_entropy = dist.entropy()
        state_value = self.critic(latent)

        return action_logprobs, torch.squeeze(state_value), dist_entropy
Example #16
    def evaluate(self, state, action):
        if self.policy_model == 'Gaussian':
            action_mean = self.actor(state)
            action_var = self.action_var.expand_as(action_mean)
            cov_mat = torch.diag_embed(action_var).to(device)
            dist = MultivariateNormal(action_mean, cov_mat)
            action_logprobs = dist.log_prob(action)
            dist_entropy = dist.entropy()
        elif self.policy_model == 'Beta':
            action_alpha = self.alpha(state) + 1
            action_beta = self.beta(state) + 1
            dist = Beta(action_alpha, action_beta)
            action_logprobs = dist.log_prob(action)
            action_logprobs = torch.sum(action_logprobs, 1)
            dist_entropy = dist.entropy()
            dist_entropy = torch.sum(dist_entropy, 1)

        state_value = self.critic(state)

        return action_logprobs, torch.squeeze(state_value), dist_entropy
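The Beta branch in Example #16 adds 1 to both parameters so they stay above 1, and sums per-dimension log-probs and entropies because Beta is univariate. A minimal sketch, assuming the alpha/beta heads end in softplus (that activation is an assumption, not shown in the original):

import torch
import torch.nn.functional as F
from torch.distributions import Beta

raw_alpha = torch.randn(4, 2)              # placeholder head outputs, (batch, action_dim)
raw_beta = torch.randn(4, 2)

action_alpha = F.softplus(raw_alpha) + 1   # > 1 keeps the density unimodal on (0, 1)
action_beta = F.softplus(raw_beta) + 1
dist = Beta(action_alpha, action_beta)

actions = dist.sample()                    # values in (0, 1), shape (4, 2)
log_probs = dist.log_prob(actions).sum(1)  # joint log-prob over independent dimensions
entropy = dist.entropy().sum(1)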
Example #17
    def evaluate(self, state, action):
        action_mean = self.actor(state)
        cov_mat = torch.diag(torch.exp(self.log_std)).to(self.device)
        distrib = MultivariateNormal(action_mean, cov_mat)

        action_log_probs = distrib.log_prob(action).unsqueeze(1)
        dist_entropy = distrib.entropy()

        value = self.critic(state)

        return action_log_probs, value, dist_entropy
Example #18
    def evaluate(self, state, action):
        action_mean = self.actor(state)
        action_var = self.action_var.expand_as(action_mean)
        cov_mat = torch.diag_embed(action_var).to(self.device)
        dist = MultivariateNormal(action_mean, cov_mat)

        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_value = self.critic(state)

        return action_logprobs, torch.squeeze(state_value), dist_entropy
Example #19
 def select_action(self, x):
     mu, cov = self.forward(x)
     tril = self.reshape_output(mu, cov)
     dist = MultivariateNormal(mu, scale_tril=tril)
     if self.pwd:
         action = dist.rsample()
     else:
         action = dist.sample()
     log_prob = dist.log_prob(action)
     entropy = dist.entropy()
     return action, log_prob, entropy
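Example #19 is the only snippet that parameterizes the distribution by a Cholesky factor (scale_tril) and can draw reparameterized samples with rsample(). A minimal sketch of the sample()/rsample() difference; values here are placeholders:

import torch
from torch.distributions import MultivariateNormal

mu = torch.zeros(2, requires_grad=True)
tril = torch.eye(2)            # lower-triangular Cholesky factor of the covariance
dist = MultivariateNormal(mu, scale_tril=tril)

a_r = dist.rsample()           # reparameterized draw: gradients flow back to mu
a_r.sum().backward()
print(mu.grad)                 # tensor([1., 1.])

a_s = dist.sample()            # plain draw: no gradient path to the parameters
print(a_s.requires_grad)       # False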
Example #20
 def evaluate_action(self, mu, actions, sigma):
     n_batch = len(mu)
     if self.n_ctrl > 1:
         cov = torch.eye(self.n_ctrl).double() * sigma**2
         cov = cov.repeat(n_batch, 1, 1)
         dist = MultivariateNormal(mu, cov)
     else:
         dist = Normal(mu, torch.ones_like(mu) * sigma)
     log_prob = dist.log_prob(actions.double())
     entropy = dist.entropy()
     return log_prob, entropy
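Example #20 switches between MultivariateNormal for n_ctrl > 1 and a univariate Normal otherwise; with an isotropic covariance the two agree once the independent Normal's log-probs and entropies are summed over the action dimensions. A small self-contained check:

import torch
from torch.distributions import MultivariateNormal, Normal

mu = torch.zeros(4, 3)
sigma = 0.5
actions = torch.randn(4, 3)

mvn = MultivariateNormal(mu, torch.eye(3) * sigma ** 2)
indep = Normal(mu, torch.full_like(mu, sigma))

assert torch.allclose(mvn.log_prob(actions), indep.log_prob(actions).sum(-1), atol=1e-5)
assert torch.allclose(mvn.entropy(), indep.entropy().sum(-1), atol=1e-5)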
Example #21
    def evaluate(self, states, actions):
        mean = self.actor(states)
        cov_vec = self.action_var.expand_as(mean)
        cov = torch.diag_embed(cov_vec)
        dist = MultivariateNormal(mean, cov)
        if self.action_size == 1:
            actions = actions.reshape(-1, self.action_size)
        log_prob = dist.log_prob(actions)
        dist_entropy = dist.entropy()
        state_values = self.critic(states)

        return log_prob, state_values, dist_entropy
Example #22
    def evaluate(self, state, action):  # state (4000, 24); action (4000, 4)
        state_value = self.critic(state)  # (4000, 1)

        # to calculate action score(logprobs) and distribution entropy
        action_mean = self.actor(state)  # (4000,4)
        action_var = self.action_var.expand_as(action_mean)  # (4000,4)
        cov_mat = torch.diag_embed(action_var).to(device)  # (4000,4,4)
        dist = MultivariateNormal(action_mean, cov_mat)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()

        return action_logprobs, torch.squeeze(state_value), dist_entropy
Example #23
 def evaluate(self, state, action):
     """
     Note that 'state' and 'action' here can be
     batches of states and actions
     """
     mu_batch = self.block(state)
     variance_batch = self.variance.expand_as(mu_batch)
     cov_batch = torch.diag_embed(variance_batch)
     dist = MultivariateNormal(mu_batch, cov_batch)
     action_logprobs = dist.log_prob(action)
     dist_entropy = dist.entropy()
     return action_logprobs, dist_entropy
Example #24
    def evaluate(self, observations, action):
        action_mean = torch.squeeze(self.actor(observations))

        action_var = self.action_var.expand_as(action_mean)
        cov_mat = torch.diag_embed(action_var).to(device)

        dist = MultivariateNormal(action_mean, cov_mat)

        action_logprobs = dist.log_prob(torch.squeeze(action))
        dist_entropy = dist.entropy()
        observation_value = self.critic(observations)

        return action_logprobs, torch.squeeze(observation_value), dist_entropy
Example #25
 def evaluate(self, state, action):
     '''Evaluate action for a given state.'''   
     action_mean, _, state_value = self.forward(state)
     
     action_var = self.action_var.expand_as(action_mean)
     cov_mat = torch.diag_embed(action_var)
     
     dist = MultivariateNormal(action_mean, cov_mat)
     
     action_logprobs = dist.log_prob(action)
     dist_entropy = dist.entropy()
     
     return action_logprobs, torch.squeeze(state_value), dist_entropy
Example #26
 def evaluate(self, states, action):
     states = torch.stack(states)
     states = states.view(-1, *states.shape[-3:])
     actor_critic_input = self.conv(states).view(-1, self.size)
     action_mean = self.actor(actor_critic_input)
     action_var = self.action_var.repeat(states.shape[0], 1)
     cov_mat = torch.diag_embed(action_var).to(device)
     dist = MultivariateNormal(action_mean, cov_mat)
     action = action.view(-1, action_size)
     action_logprobs = dist.log_prob(action).view(states.shape[:-3])
     dist_entropy = dist.entropy().view(states.shape[:-3])
     state_value = self.critic(actor_critic_input).view(states.shape[:-3])
     return action_logprobs, torch.squeeze(state_value), dist_entropy
Example #27
    def evaluate(self, states, actions):
        action_means = self.agent(states)

        action_var = torch.full((action_dim, ), self.sigma)
        action_var = action_var.expand_as(action_means)
        cov_mat = torch.diag_embed(action_var).to(device)

        dist = MultivariateNormal(action_means, cov_mat)

        action_logprobs = dist.log_prob(actions)
        dist_entropy = dist.entropy()

        return action_logprobs, dist_entropy
Example #28
    def get_training_params(self, frame, mes, action):
        frame = torch.squeeze(torch.stack(frame))
        mes = torch.squeeze(torch.stack(mes))
        action = torch.stack(action)

        mean = self.actor_(frame, mes)
        action_expanded = self.action_var.expand_as(mean)
        cov_matrix = torch.diag_embed(action_expanded).to(device)

        gauss_dist = MultivariateNormal(mean, cov_matrix)
        action_log_prob = gauss_dist.log_prob(action).to(device)
        entropy = gauss_dist.entropy().to(device)
        state_value = torch.squeeze(self.critic_(frame, mes)).to(device)
        return action_log_prob, state_value, entropy
Example #29
 def evaluate(self, old_state, old_action): 
     action_mean = self.actor(old_state)
     
     action_var = self.action_var.expand_as(action_mean)
     cov_mat = torch.diag_embed(action_var).to(self.device)
     dist = MultivariateNormal(action_mean, cov_mat)
     
     # probability of the old action under the new policy
     action_log_probs = dist.log_prob(old_action)
     state_value = self.critic(old_state)
     dist_entropy = dist.entropy()
     
     return torch.squeeze(state_value), action_log_probs, dist_entropy
Example #30
 def forward(self, state):
     output_1 = F.relu(self.linear1(state))
     output_2 = F.relu(self.linear2(output_1))
     mu = 2 * torch.sigmoid(self.mu(output_2)).to(device)  # mean scaled to (0, 2)
     sigma = F.relu(self.sigma(output_2)) + 0.001  # keep variances strictly positive
     #cov_mat = torch.diag(self.action_var).to(device)
     sigma = torch.diag_embed(sigma).to(device)  # (batch, d, d) diagonal covariance
     dist = MultivariateNormal(mu, sigma)  # N(mu, diag(sigma))
     #distribution = Categorical(F.softmax(output, dim=-1))
     entropy = dist.entropy().mean()
     action = dist.sample()
     action_logprob = dist.log_prob(action)
     return action.detach(), action_logprob, entropy