Example #1
    def get_action(self, state: torch.Tensor, online_actor: torch.nn.Module, training: bool = False) -> Action:
        """Returns actions for given state as per current policy."""

        def get_actions_():
            online_actor.eval()
            with torch.no_grad():
                actions_ = online_actor(state)
            online_actor.train()
            return actions_

        if training:
            r = np.random.random()
            if r <= self.epsilon:
                action = self.random_action_generator.sample()
            else:
                action = get_actions_().cpu().data.numpy()
                if self.random_action_generator.continuous_actions:
                    action = np.clip(
                        action,
                        self.random_action_generator.continuous_action_range[0],
                        self.random_action_generator.continuous_action_range[1],
                    )  # epsilon greedy policy
        else:
            action = get_actions_().cpu().data.numpy()

        action = Action(value=action)
        return action
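Example #1 (and the examples that follow) wraps its result in an Action container. The class itself is not shown in this listing; the sketch below is only a minimal, hypothetical shape consistent with the constructor calls seen here (value plus optional log_probs and critic_values, all names inferred from usage), not the project's actual definition:

from dataclasses import dataclass
from typing import Optional, Union

import numpy as np
import torch


@dataclass
class Action:
    """Hypothetical container matching the Action(...) calls in these examples."""
    value: Union[np.ndarray, int]                 # action(s) passed to the environment
    log_probs: Optional[torch.Tensor] = None      # log-probability of the sampled action(s)
    critic_values: Optional[torch.Tensor] = None  # critic's value estimate for the state(s)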
Example #2
    def get_action(self, state: torch.Tensor, online_actor: torch.nn.Module) -> Action:
        """Returns actions for given state as per current policy."""
        def get_actions_():
            online_actor.eval()
            with torch.no_grad():
                actions_ = online_actor(state)
            online_actor.train()
            return actions_

        if self.epsilon_scheduler:
            if self.training:
                r = np.random.random()
                if r <= self.epsilon:
                    action = self.random_brain_action_generator.sample()
                else:
                    action = get_actions_().cpu().data.numpy()
                    if self.random_brain_action_generator.continuous_actions:
                        action = np.clip(
                            action,
                            self.random_brain_action_generator.continuous_action_range[0],
                            self.random_brain_action_generator.continuous_action_range[1],
                        )  # epsilon greedy policy
            else:
                action = get_actions_().cpu().data.numpy()
        elif self.noise:
            action = get_actions_().cpu().data.numpy()
            if self.training:
                action += self.noise.sample(action)
            action = np.clip(action, self.action_range[0], self.action_range[1])
        else:
            raise ValueError('Must provide either epsilon_scheduler or noise')

        return Action(value=action)
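The noise branch in Example #2 only requires an object with a sample(action) method whose output matches the action's shape. A common choice for this kind of additive exploration noise in continuous control is an Ornstein-Uhlenbeck process; the sketch below is an illustrative stand-in with typical default parameters, not the project's actual implementation:

import numpy as np


class OrnsteinUhlenbeckNoise:
    """Illustrative temporally correlated exploration noise."""

    def __init__(self, mu: float = 0.0, theta: float = 0.15, sigma: float = 0.2):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = None

    def sample(self, action: np.ndarray) -> np.ndarray:
        # Lazily match the internal state to the action's shape.
        if self.state is None or self.state.shape != action.shape:
            self.state = np.full(action.shape, self.mu)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(action.shape)
        self.state = self.state + dx
        return self.state

Resetting state to None at the start of each episode keeps the noise from drifting across episode boundaries.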
Example #3
    def get_action(self, state: np.ndarray, model: torch.nn.Module) -> Action:
        """Return the greedy action (argmax over the model's action values) for the given state."""
        model.eval()
        with torch.no_grad():
            action_values = model.forward(state, act=True)
        model.train()

        action = action_values.max(1)[1].cpu().numpy()
        action = Action(value=action)
        return action
Example #4
    def get_action(self, state: np.ndarray, model: torch.nn.Module) -> Action:
        """Sample an action from the softmax distribution over the model's action values."""

        model.eval()
        with torch.no_grad():
            action_values = model.forward(state, act=True)
        model.train()

        probs = torch.nn.functional.softmax(action_values, dim=-1)
        action = np.array([
            np.random.choice(np.arange(0, self.action_size),
                             p=probs.view(-1).numpy())
        ])
        action = Action(value=action)
        return action
Example #5
def step_agents_fn(brain_set: BrainSet, next_brain_environment: dict, t: int):
    """Build a per-agent Experience from each brain's environment step and feed it to that brain's agent."""
    for brain_name, brain_environment in next_brain_environment.items():
        agent = brain_set[brain_name].agents[0]
        for i in range(NUM_AGENTS):
            action = brain_environment['actions'][0].value[i]
            action = action[np.newaxis, ...]

            brain_agent_experience = Experience(
                state=brain_environment['states'][i].unsqueeze(0),
                action=Action(value=action),
                reward=brain_environment['rewards'][i],
                next_state=brain_environment['next_states'][i].unsqueeze(0),
                done=brain_environment['dones'][i],
                t_step=t,
            )
            agent.step(brain_agent_experience)
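Example #5 assumes an Experience record with the fields passed to its constructor. The real class is not included in this listing; a minimal sketch of such a container (field names taken from the call above, types assumed) could be:

from dataclasses import dataclass

import torch


@dataclass
class Experience:
    """Hypothetical per-step transition record matching the constructor call in Example #5."""
    state: torch.Tensor
    action: "Action"
    reward: float
    next_state: torch.Tensor
    done: bool
    t_step: int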
Example #6
    def get_action(self,
                   agent_state: torch.FloatTensor,
                   joint_state: torch.FloatTensor,
                   joint_action: Optional[torch.FloatTensor] = None,
                   action: Optional[torch.FloatTensor] = None,
                   *args,
                   **kwargs) -> Action:
        """Returns actions for given states as per target policy.
        :param agent_state: States for this agent
        :param joint_state: States for all agents
        :param joint_action: Actions for all agents
        :param action: Action for this agent

        :return: Action containing:
            - action (Tensor): predicted action
            - log_prob (Tensor): log probability of current action distribution
            - value (Tensor): estimated value from the critic
        """
        other_agent_states = self.get_other_agent_attributes(
            joint_state, self.map_agent_to_state_slice, flatten=False)
        other_agent_actions = self.get_other_agent_attributes(
            joint_action, self.map_agent_to_action_slice,
            flatten=False) if joint_action is not None else None

        self.target_actor_critic.eval()
        with torch.no_grad():
            actions, log_probs, _, values = self.target_actor_critic(
                agent_state=agent_state,
                other_agent_states=other_agent_states,
                other_agent_actions=other_agent_actions,
                action=action,
                scale=self.std_scale)
            if actions.dim() == 1:
                actions = actions.unsqueeze(0)
            actions = actions.cpu().data.numpy()
        self.target_actor_critic.train()
        if self.continuous_actions and self.continuous_action_range_clip:
            actions = actions.clip(self.continuous_action_range_clip[0],
                                   self.continuous_action_range_clip[1])

        return Action(value=actions, log_probs=log_probs, critic_values=values)
Example #7
    def get_action(self, state: np.ndarray, model: torch.nn.Module) -> Action:
        """Epsilon-greedy selection: take the greedy action, otherwise sample from the softmax over action values."""
        def _get_action_values():
            model.eval()
            with torch.no_grad():
                action_values = model.forward(state, act=True)
            model.train()
            return action_values

        if self.training:
            action_values_ = _get_action_values()
            if random.random() > self.epsilon:
                action = action_values_.max(1)[1].data[0]
            else:
                probs = torch.nn.functional.softmax(action_values_, dim=-1)
                action = np.random.choice(np.arange(0, self.action_size),
                                          p=probs.view(-1).numpy())
        else:
            action_values_ = _get_action_values()
            action = action_values_.max(1)[1].data[0]

        return Action(value=action)
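Examples #1, #2, and #7 read self.epsilon but do not show how it is decayed; Example #2 checks for an epsilon_scheduler that presumably handles this. Since its interface is not shown, the following is only a typical shape for such a scheduler, with made-up names and defaults:

class EpsilonDecayScheduler:
    """Illustrative multiplicative epsilon decay for epsilon-greedy exploration."""

    def __init__(self, initial: float = 1.0, decay: float = 0.995, minimum: float = 0.01):
        self.epsilon = initial
        self.decay = decay
        self.minimum = minimum

    def step(self) -> float:
        # Decay epsilon once per episode, never dropping below the floor.
        self.epsilon = max(self.minimum, self.epsilon * self.decay)
        return self.epsilon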
Example #8
 def get_action(self, states, *args, **kwargs) -> Action:
     """Returns actions for given states as per target policy.
     :param states: States from environment
     :return: Action containing:
         - action (Tensor): predicted action
         - log_prob (Tensor): log probability of current action distribution
         - value (Tensor): estimated value from the critic
     """
     # Use the target_actor_critic to get new actions
     states = states.to(device)
     self.target_actor_critic.eval()
     with torch.no_grad():
         actions, log_probs, _, values = self.target_actor_critic(
             state=states, scale=self.std_scale)
         if actions.dim() == 1:
             actions = actions.unsqueeze(0)
         actions = actions.cpu().data.numpy()
     self.target_actor_critic.train()
     if self.continuous_actions and self.continuous_action_range_clip:
         actions = actions.clip(self.continuous_action_range_clip[0],
                                self.continuous_action_range_clip[1])
     return Action(value=actions, log_probs=log_probs, critic_values=values)
Example #9
 def get_random_action(self, *args) -> Action:
     """ Get a random action (used for warmup) """
     action = self.random_action_generator.sample()
     action = Action(value=action)
     return action
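Examples #1, #2, and #9 rely on a random action generator exposing sample(), continuous_actions, and continuous_action_range. The real class is not included in this listing; a minimal sketch consistent with that usage (shapes, constructor parameters, and defaults are assumptions) is:

import numpy as np


class RandomActionGenerator:
    """Illustrative uniform random action sampler for warmup and epsilon-greedy exploration."""

    def __init__(self, action_size: int, num_agents: int = 1,
                 continuous_actions: bool = False,
                 continuous_action_range: tuple = (-1.0, 1.0)):
        self.action_size = action_size
        self.num_agents = num_agents
        self.continuous_actions = continuous_actions
        self.continuous_action_range = continuous_action_range

    def sample(self) -> np.ndarray:
        if self.continuous_actions:
            low, high = self.continuous_action_range
            return np.random.uniform(low, high, size=(self.num_agents, self.action_size))
        return np.random.randint(0, self.action_size, size=(self.num_agents, 1))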
Example #10
 def get_random_action(self, state: torch.Tensor, *args,
                       **kwargs) -> Action:
     action = np.array(
         np.random.randint(0, self.action_size, (1, )))
     action = Action(value=action)
     return action
Example #11
 def get_random_action(self, *args, **kwargs) -> Action:
     # action = torch.rand(1, self.action_size)
     action = torch.randint(0, self.action_size, (1, 1))
     action = action.cpu().data.numpy()
     return Action(value=action)
Example #12
 def get_action(self, state, training=True, *args, **kwargs) -> Action:
     """Return a uniformly random discrete action, ignoring the state."""
     # action = torch.rand(1, self.action_size)
     action = torch.randint(0, self.action_size, (1, 1))  # indices in [0, action_size)
     action = action.cpu().data.numpy()
     return Action(value=action)