Example #1
    def action(self, state):
        """
        Arguments:
            - state (overcooked_mdp.OvercookedState): object encoding the global view of the environment
        Returns:
            - the argmax action for a single observation state
            - action_info (dict) that stores action probabilities under the 'action_probs' key
        """
        # Preprocess the environment state
        obs = self.featurize(state)
        my_obs = obs[self.agent_index]

        # Use the RLlib Policy class to compute the action argmax and action probabilities
        [action_idx], rnn_state, info = self.policy.compute_actions(
            np.array([my_obs]), self.rnn_state)
        agent_action = Action.INDEX_TO_ACTION[action_idx]

        # Softmax in numpy to convert logits to normalized probabilities
        logits = info['action_dist_inputs']
        action_probabilities = softmax(logits)

        agent_action_info = {'action_probs': action_probabilities}
        self.rnn_state = rnn_state

        return agent_action, agent_action_info
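A minimal usage sketch of the method above (hypothetical names: env, agent_0, and agent_1 are assumed to be constructed elsewhere; only the action(state) contract shown here is relied on):

    # Hypothetical rollout loop: each agent returns (action, {'action_probs': ...})
    state = env.reset()
    done = False
    while not done:
        a0, info0 = agent_0.action(state)
        a1, info1 = agent_1.action(state)
        state, reward, done, env_info = env.step((a0, a1))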
Example #2
    def test_softmax(self):
        logits = np.array([[0.1, 0.1, 0.1],
                           [-0.1, 0.0, 0.1],
                           [0.5, -1.2, 3.2],
                           [-1.6, -2.0, -1.5]])
        expected = np.array([[0.33333333, 0.33333333, 0.33333333],
                             [0.30060961, 0.33222499, 0.3671654 ],
                             [0.06225714, 0.01137335, 0.92636951],
                             [0.36029662, 0.24151404, 0.39818934]])

        actual = softmax(logits)

        self.assertTrue(np.allclose(expected, actual))
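The test above assumes a row-wise softmax applied over the last axis. A minimal numpy implementation consistent with those expected values (a sketch, not necessarily the project's own definition) might look like:

    import numpy as np

    def softmax(logits):
        # Row-wise softmax; subtracting the per-row max keeps np.exp numerically
        # stable without changing the resulting probabilities.
        logits = np.asarray(logits, dtype=np.float64)
        shifted = logits - logits.max(axis=-1, keepdims=True)
        exps = np.exp(shifted)
        return exps / exps.sum(axis=-1, keepdims=True)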
Example #3
    def action_probabilities(self, state):
        """
        Arguments:
            - state (overcooked_mdp.OvercookedState): object encoding the global view of the environment
        Returns:
            - Normalized action probabilities determined by self.policy
        """
        # Preprocess the environment state
        obs = self.featurize(state, debug=False)
        my_obs = obs[self.agent_index]

        # Compute non-normalized log probabilities from the underlying model
        logits = self.policy.compute_actions(
            np.array([my_obs]), self.rnn_state)[2]['action_dist_inputs']

        # Softmax in numpy to convert logits to normalized probabilities
        return softmax(logits)
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        """
        Computes sampled actions for each of the corresponding OvercookedEnv states in obs_batch

        Args:
            obs_batch (np.array): batch of preprocessed (lossless-state-encoded) observations

        Returns:
            actions (list|np.array): batch of output actions, shape [BATCH_SIZE, ACTION_SHAPE]
            state_outs (list): only needed for RNN hidden states
            infos (dict): dictionary of extra feature batches { "action_dist_inputs": [BATCH_SIZE, ...] }
        """
        # Cast to np.array if list (no-op if already np.array)
        obs_batch = np.array(obs_batch)

        # Run the model
        with self.context:
            action_logits, states, orders_logits = self._forward(
                obs_batch, state_batches)

        # Softmax in numpy to convert logits to probabilities
        action_probs = softmax(action_logits)
        if self.stochastic:
            # Sample according to action_probs for each row in the output
            actions = np.array([
                np.random.choice(self.action_shape[0], p=action_probs[i])
                for i in range(len(action_probs))
            ])
        else:
            actions = np.argmax(action_logits, axis=1)

        return actions, states, {"action_dist_inputs": action_logits}
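A short usage sketch for compute_actions (assumption: policy is an instance of the class above, obs_batch is a correctly shaped observation batch, and softmax/np are defined as in the earlier examples):

    # Hypothetical usage: run a batch through the policy and normalize the logits
    actions, state_outs, infos = policy.compute_actions(obs_batch)
    probs = softmax(infos["action_dist_inputs"])
    assert np.allclose(probs.sum(axis=-1), 1.0)  # each row sums to 1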