def action(self, state):
    """
    Arguments:
        - state (Overcooked_mdp.OvercookedState) object encoding the global view of the environment
    returns:
        - the argmax action for a single observation state
        - action_info (dict) that stores action probabilities under 'action_probs' key
    """
    # Preprocess the environment state
    obs = self.featurize(state)
    my_obs = obs[self.agent_index]

    # Use Rllib.Policy class to compute action argmax and action probabilities
    [action_idx], rnn_state, info = self.policy.compute_actions(
        np.array([my_obs]), self.rnn_state)
    agent_action = Action.INDEX_TO_ACTION[action_idx]

    # Softmax in numpy to convert logits to normalized probabilities
    logits = info['action_dist_inputs']
    action_probabilities = softmax(logits)
    agent_action_info = {'action_probs': action_probabilities}

    self.rnn_state = rnn_state
    return agent_action, agent_action_info
def test_softmax(self):
    logits = np.array([[0.1, 0.1, 0.1],
                       [-0.1, 0.0, 0.1],
                       [0.5, -1.2, 3.2],
                       [-1.6, -2.0, -1.5]])
    expected = np.array([[0.33333333, 0.33333333, 0.33333333],
                         [0.30060961, 0.33222499, 0.3671654],
                         [0.06225714, 0.01137335, 0.92636951],
                         [0.36029662, 0.24151404, 0.39818934]])
    actual = softmax(logits)

    self.assertTrue(np.allclose(expected, actual))
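# The snippets here call a `softmax` helper that is not shown in this excerpt. A minimal,
# numerically stable sketch (an assumption, not the repository's actual implementation)
# that is consistent with the expected values in test_softmax above:
import numpy as np

def softmax(logits):
    """Convert a [BATCH_SIZE, NUM_ACTIONS] array of logits into row-wise probabilities."""
    # Subtract the per-row max before exponentiating for numerical stability
    e_x = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
    return e_x / np.sum(e_x, axis=-1, keepdims=True)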
def action_probabilities(self, state):
    """
    Arguments:
        - state (Overcooked_mdp.OvercookedState) object encoding the global view of the environment
    returns:
        - Normalized action probabilities determined by self.policy
    """
    # Preprocess the environment state
    obs = self.featurize(state, debug=False)
    my_obs = obs[self.agent_index]

    # Compute non-normalized log probabilities from the underlying model
    logits = self.policy.compute_actions(
        np.array([my_obs]), self.rnn_state)[2]['action_dist_inputs']

    # Softmax in numpy to convert logits to normalized probabilities
    return softmax(logits)
def compute_actions(self, obs_batch,
                    state_batches=None,
                    prev_action_batch=None,
                    prev_reward_batch=None,
                    info_batch=None,
                    episodes=None,
                    **kwargs):
    """
    Computes sampled actions for each of the corresponding OvercookedEnv states in obs_batch

    Args:
        obs_batch (np.array): batch of pre-processed (lossless state encoded) observations

    Returns:
        actions (list|np.array): batch of output actions shape [BATCH_SIZE, ACTION_SHAPE]
        state_outs (list): only necessary for rnn hidden states
        infos (dict): dictionary of extra feature batches { "action_dist_inputs" : [BATCH_SIZE, ...] }
    """
    # Cast to np.array if list (no-op if already np.array)
    obs_batch = np.array(obs_batch)

    # Run the model
    with self.context:
        action_logits, states, orders_logits = self._forward(obs_batch, state_batches)

    # Softmax in numpy to convert logits to probabilities
    action_probs = softmax(action_logits)

    if self.stochastic:
        # Sample according to action_probs for each row in the output
        actions = np.array([
            np.random.choice(self.action_shape[0], p=action_probs[i])
            for i in range(len(action_probs))
        ])
    else:
        actions = np.argmax(action_logits, axis=1)

    return actions, states, {"action_dist_inputs": action_logits}
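# A standalone toy illustration of the stochastic vs. argmax branches in compute_actions.
# The logits below are hypothetical values, not real model output; `softmax` refers to the
# sketch above and `np` is numpy.
rng = np.random.default_rng(0)
toy_logits = np.array([[0.5, -1.2, 3.2],
                       [-1.6, -2.0, -1.5]])
toy_probs = softmax(toy_logits)

# Stochastic branch: sample one action index per row according to that row's probabilities
sampled = np.array([rng.choice(toy_probs.shape[1], p=toy_probs[i])
                    for i in range(len(toy_probs))])

# Deterministic branch: take the highest-logit action per row
greedy = np.argmax(toy_logits, axis=1)  # -> array([2, 2])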