def calculate_log_probability_of_actions(self, policy, states, actions):
     """Calculates the log probability of an action occuring given a policy and starting state"""
     policy_output = policy.forward(states).to(self.device)
     policy_distribution = create_actor_distribution(
         self.action_types, policy_output, self.action_size)
     policy_distribution_log_prob = policy_distribution.log_prob(actions)
     return policy_distribution_log_prob
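
The helper create_actor_distribution appears in every example here but is never shown. The sketch below is an assumption of what it does, based only on how it is called above: it wraps the actor's raw output in a torch.distributions object so that .sample() and .log_prob() are available (a Categorical for "DISCRETE" action types, a Normal for "CONTINUOUS" ones; the split of the continuous output into means and standard deviations is assumed).

import torch
from torch.distributions import Categorical, Normal

def create_actor_distribution(action_types, actor_output, action_size):
    """Builds a distribution over actions from the actor network's output (illustrative sketch)."""
    if action_types == "DISCRETE":
        # actor_output is assumed to hold one probability per discrete action
        return Categorical(probs=actor_output)
    # assumed layout for continuous actions: first action_size columns are means,
    # the remaining columns are standard deviations
    means = actor_output[:, :action_size].squeeze(0)
    stds = actor_output[:, action_size:].squeeze(0)
    return Normal(means, torch.abs(stds))
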
Example #2
 def calculate_log_probability_of_actions(self, policy, states, actions):
     """Calculates the log probability of an action occuring given a policy and starting state"""
     policy_output = policy.forward(states).to(self.device)
     policy_distribution = create_actor_distribution(
         "DISCRETE", policy_output,
         self.config.hyperparameters["action_space"])
     policy_distribution_log_prob = policy_distribution.log_prob(actions)
     return policy_distribution_log_prob
 def produce_action_and_action_info(self, state):
     """Given the state, produces an action, the probability of the action, the log probability of the action, and
     the argmax action"""
     action_probabilities = self.actor_local(state)
     max_probability_action = torch.argmax(action_probabilities).unsqueeze(0)
     action_distribution = create_actor_distribution(self.action_types, action_probabilities, self.action_size)
     action = action_distribution.sample().cpu()
     log_action_probabilities = torch.log(action_probabilities)
     return action, (action_probabilities, log_action_probabilities), max_probability_action
Example #4
 def produce_action_and_action_info(self, state):
     """Given the state, produces an action, the probability of the action, the log probability of the action, and
     the argmax action"""
     action_probabilities = self.actor_local(state)
     max_probability_action = torch.argmax(action_probabilities, dim=-1)
     action_distribution = create_actor_distribution(self.action_types, action_probabilities, self.action_size)
     action = action_distribution.sample().cpu()
     # Have to deal with situation of 0.0 probabilities because we can't do log 0
     z = action_probabilities == 0.0
     z = z.float() * 1e-8
     log_action_probabilities = torch.log(action_probabilities + z)
     return action, (action_probabilities, log_action_probabilities), max_probability_action
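
The masking in the example above exists because torch.log of an exact zero returns -inf, which then poisons any gradients or sums computed from the log probabilities. A quick standalone check (the probability values are made up for illustration):

import torch

probs = torch.tensor([0.0, 0.3, 0.7])
print(torch.log(probs))             # tensor([   -inf, -1.2040, -0.3567])
z = (probs == 0.0).float() * 1e-8   # tiny offset only where the probability is exactly 0
print(torch.log(probs + z))         # tensor([-18.4207, -1.2040, -0.3567])
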
Example #5
    def pick_action(self, state, exploration_epsilon):
        """Picks an action for the state, acting randomly with probability exploration_epsilon when random_policy is enabled"""
        if self.config.hyperparameters['random_policy'] and random.random() <= exploration_epsilon:
            action = random.randint(0, self.config.hyperparameters['action_space'] - 1)
            return action

        state = torch.from_numpy(state).float()
        actor_output = self.policy_new.forward(state)
        action_distribution = create_actor_distribution(
            "DISCRETE", actor_output,
            self.config.hyperparameters['action_space'])
        action = action_distribution.sample().cpu()
        return action.item()
Example #6
 def pick_action_and_get_critic_values(self, policy, state, epsilon_exploration=None):
     """Picks an action using the policy"""
     state = torch.from_numpy(state).float().unsqueeze(0)
     model_output = policy.forward(state)
     actor_output = model_output[:, list(range(self.action_size))]  # only the first action_size columns decide the action; the last column is the state value
     critic_output = model_output[:, -1]
     action_distribution = create_actor_distribution(self.action_types, actor_output, self.action_size)
     action = action_distribution.sample().cpu().numpy()
     if self.action_types == "CONTINUOUS": action += self.noise.sample()
     if self.action_types == "DISCRETE":
         if epsilon_exploration is not None and random.random() <= epsilon_exploration:
             action = random.randint(0, self.action_size - 1)
         else:
             action = action[0]
     action_log_prob = self.calculate_log_action_probability(action, action_distribution)
     return action, action_log_prob, critic_output
Example #7
    def pick_action(self, policy, state, epsilon_exploration=None):
        """Picks an action using the policy"""
        if self.action_types == "DISCRETE":
            if epsilon_exploration is not None and random.random() <= epsilon_exploration:
                action = random.randint(0, self.action_size - 1)
                return action

        state = torch.from_numpy(state).float().unsqueeze(0)
        actor_output = policy.forward(state)
        if self.action_choice_output_columns is not None:
            actor_output = actor_output[:, self.action_choice_output_columns]
        action_distribution = create_actor_distribution(
            self.action_types, actor_output, self.action_size)
        action = action_distribution.sample().cpu()

        if self.action_types == "CONTINUOUS":
            action += torch.Tensor(self.noise.sample())
        else:
            action = action.item()
        return action
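
The self.noise object used for continuous actions in Examples #6 and #7 is not defined in these snippets; a common choice for this kind of agent, assumed here, is Ornstein-Uhlenbeck noise, sketched below:

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise for continuous actions (illustrative sketch)."""
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Resets the internal state back to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Drifts the state towards the mean, adds Gaussian noise and returns the new state."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state
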