def _compute_actions(policy, obs_batch, add_noise=False, update=True,
                     **kwargs):
    # Batch is given as list -> Try converting to numpy first.
    if isinstance(obs_batch, list) and len(obs_batch) == 1:
        obs_batch = obs_batch[0]
    observation = policy.preprocessor.transform(obs_batch)
    observation = policy.observation_filter(observation[None], update=update)

    observation = convert_to_torch_tensor(observation, policy.device)
    dist_inputs, _ = policy.model({SampleBatch.CUR_OBS: observation}, [], None)
    dist = policy.dist_class(dist_inputs, policy.model)
    action = dist.sample()

    def _add_noise(single_action, single_action_space):
        single_action = single_action.detach().cpu().numpy()
        if add_noise and isinstance(single_action_space, gym.spaces.Box):
            single_action += np.random.randn(*single_action.shape) * \
                policy.action_noise_std
        return single_action

    action = tree.map_structure(_add_noise, action, policy.action_space_struct)
    action = unbatch(action)
    return action, [], {}

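# --- Illustrative sketch (not part of the snippet above) ---
# A minimal, self-contained example of the noise-adding pattern used by
# `_add_noise`: Gaussian noise scaled by an (assumed) noise std is mapped
# over a nested action structure, and only Box-space components are
# perturbed. The spaces, values, and the 0.01 std below are made-up.
import gym
import numpy as np
import tree  # dm-tree

example_space_struct = {
    "move": gym.spaces.Box(-1.0, 1.0, shape=(2,)),
    "jump": gym.spaces.Discrete(2),
}
example_action = {"move": np.array([0.5, -0.2]), "jump": 1}
example_noise_std = 0.01

def _add_noise_example(component, space):
    # Only continuous (Box) components receive exploration noise.
    if isinstance(space, gym.spaces.Box):
        return component + np.random.randn(*component.shape) * example_noise_std
    return component

noisy_action = tree.map_structure(
    _add_noise_example, example_action, example_space_struct)
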
def compute_actions(self, observation, add_noise=False, update=True,
                    **kwargs):
    # Squeeze batch dimension (we always calculate actions for only a
    # single obs).
    observation = observation[0]
    observation = self.preprocessor.transform(observation)
    observation = self.observation_filter(observation[None], update=update)

    # `actions` is a list of (component) batches.
    # Eager mode.
    if not self.sess:
        dist_inputs, _ = self.model({SampleBatch.CUR_OBS: observation})
        dist = self.dist_class(dist_inputs, self.model)
        actions = dist.sample()
        actions = tree.map_structure(lambda a: a.numpy(), actions)
    # Graph mode.
    else:
        actions = self.sess.run(
            self.sampler, feed_dict={self.inputs: observation})

    if add_noise:
        actions = tree.map_structure(self._add_noise, actions,
                                     self.action_space_struct)
    # Convert `flat_actions` to a list of lists of action components
    # (list of single actions).
    actions = unbatch(actions)
    return actions, [], {}

def _process_policy_eval_results(to_eval, eval_results, active_episodes,
                                 active_envs, off_policy_actions, policies,
                                 clip_actions):
    """Process the output of policy neural network evaluation.

    Records policy evaluation results into the given episode objects and
    returns replies to send back to agents in the env.

    Returns:
        actions_to_send: nested dict of env id -> agent id -> agent replies.
    """

    actions_to_send = defaultdict(dict)
    for env_id in active_envs:
        actions_to_send[env_id] = {}  # at minimum send empty dict

    for policy_id, eval_data in to_eval.items():
        rnn_in_cols = _to_column_format([t.rnn_state for t in eval_data])

        actions = eval_results[policy_id][0]
        rnn_out_cols = eval_results[policy_id][1]
        pi_info_cols = eval_results[policy_id][2]

        # In case actions is a list (representing the 0th dim of a batch of
        # primitive actions), try to convert it first.
        if isinstance(actions, list):
            actions = np.array(actions)

        if len(rnn_in_cols) != len(rnn_out_cols):
            raise ValueError("Length of RNN in did not match RNN out, got: "
                             "{} vs {}".format(rnn_in_cols, rnn_out_cols))
        # Add RNN state info.
        for f_i, column in enumerate(rnn_in_cols):
            pi_info_cols["state_in_{}".format(f_i)] = column
        for f_i, column in enumerate(rnn_out_cols):
            pi_info_cols["state_out_{}".format(f_i)] = column

        policy = _get_or_raise(policies, policy_id)
        # Clip if necessary (while action components are still batched).
        if clip_actions:
            actions = clip_action(actions, policy.action_space_struct)
        # Split action-component batches into single action rows.
        actions = unbatch(actions)

        for i, action in enumerate(actions):
            env_id = eval_data[i].env_id
            agent_id = eval_data[i].agent_id
            actions_to_send[env_id][agent_id] = action
            episode = active_episodes[env_id]
            episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
            episode._set_last_pi_info(
                agent_id, {k: v[i] for k, v in pi_info_cols.items()})
            if env_id in off_policy_actions and \
                    agent_id in off_policy_actions[env_id]:
                episode._set_last_action(agent_id,
                                         off_policy_actions[env_id][agent_id])
            else:
                episode._set_last_action(agent_id, action)

    return actions_to_send

def compute_actions(self, observation, add_noise=False, update=True,
                    **kwargs):
    # Batch is given as list of one.
    if isinstance(observation, list) and len(observation) == 1:
        observation = observation[0]
    observation = self.preprocessor.transform(observation)
    observation = self.observation_filter(observation[None], update=update)

    # `actions` is a list of (component) batches.
    # Eager mode.
    if not self.sess:
        dist_inputs, _ = self.model({SampleBatch.CUR_OBS: observation})
        dist = self.dist_class(dist_inputs, self.model)
        actions = dist.sample()
        actions = tree.map_structure(lambda a: a.numpy(), actions)
    # Graph mode.
    else:
        actions = self.sess.run(
            self.sampler, feed_dict={self.inputs: observation})

    actions = unbatch(actions)
    if add_noise and isinstance(self.action_space, gym.spaces.Box):
        actions += np.random.randn(*actions.shape) * self.action_noise_std
    return actions, [], {}

def compute_actions(self, observation, add_noise=False, update=True):
    observation = self.preprocessor.transform(observation)
    observation = self.observation_filter(observation[None], update=update)
    action = self.sess.run(self.sampler,
                           feed_dict={self.inputs: observation})
    action = unbatch(action)
    if add_noise and isinstance(self.action_space, gym.spaces.Box):
        action += np.random.randn(*action.shape) * self.action_noise_std
    return action

def compute_actions(self, observation, add_noise=False, update=True):
    observation = self.preprocessor.transform(observation)
    observation = self.observation_filter(observation[None], update=update)
    # `actions` is a list of (component) batches.
    actions = self.sess.run(self.sampler,
                            feed_dict={self.inputs: observation})
    if add_noise:
        actions = tree.map_structure(self._add_noise, actions,
                                     self.action_space_struct)
    # Convert `flat_actions` to a list of lists of action components
    # (list of single actions).
    actions = unbatch(actions)
    return actions

def compute_actions(self, observation, add_noise=False, update=True,
                    **kwargs):
    # Batch is given as list of one.
    if isinstance(observation, list) and len(observation) == 1:
        observation = observation[0]
    observation = self.preprocessor.transform(observation)
    observation = self.observation_filter(observation[None], update=update)
    action = self.sess.run(self.sampler,
                           feed_dict={self.inputs: observation})
    action = unbatch(action)
    if add_noise and isinstance(self.action_space, gym.spaces.Box):
        action += np.random.randn(*action.shape) * self.action_noise_std
    return action

def _process_policy_eval_results(
        *,
        to_eval: Dict[PolicyID, List[PolicyEvalData]],
        eval_results: Dict[PolicyID, Tuple[TensorStructType, StateBatch,
                                           dict]],
        active_episodes: Dict[str, MultiAgentEpisode],
        active_envs: Set[int],
        off_policy_actions: MultiEnvDict,
        policies: Dict[PolicyID, Policy],
        clip_actions: bool,
) -> Dict[EnvID, Dict[AgentID, EnvActionType]]:
    """Process the output of policy neural network evaluation.

    Records policy evaluation results into the given episode objects and
    returns replies to send back to agents in the env.

    Args:
        to_eval (Dict[PolicyID, List[PolicyEvalData]]): Mapping of policy
            IDs to lists of PolicyEvalData objects.
        eval_results (Dict[PolicyID, List]): Mapping of policy IDs to list
            of actions, rnn-out states, extra-action-fetches dicts.
        active_episodes (Dict[str, MultiAgentEpisode]): Mapping from
            episode ID to currently ongoing MultiAgentEpisode object.
        active_envs (Set[int]): Set of non-terminated env ids.
        off_policy_actions (dict): Doubly keyed dict of env-ids -> agent ids
            -> off-policy-action, returned by a `BaseEnv.poll()` call.
        policies (Dict[PolicyID, Policy]): Mapping from policy ID to Policy.
        clip_actions (bool): Whether to clip actions to the action space's
            bounds.

    Returns:
        actions_to_send: Nested dict of env id -> agent id -> actions to be
            sent to Env (np.ndarrays).
    """

    actions_to_send: Dict[EnvID, Dict[AgentID, EnvActionType]] = \
        defaultdict(dict)

    # type: int
    for env_id in active_envs:
        actions_to_send[env_id] = {}  # at minimum send empty dict

    # type: PolicyID, List[PolicyEvalData]
    for policy_id, eval_data in to_eval.items():
        actions: TensorStructType = eval_results[policy_id][0]
        actions = convert_to_numpy(actions)

        rnn_out_cols: StateBatch = eval_results[policy_id][1]
        pi_info_cols: dict = eval_results[policy_id][2]

        # In case actions is a list (representing the 0th dim of a batch of
        # primitive actions), try to convert it first.
        if isinstance(actions, list):
            actions = np.array(actions)

        # Store RNN state ins/outs and extra-action fetches to episode.
        for f_i, column in enumerate(rnn_out_cols):
            pi_info_cols["state_out_{}".format(f_i)] = column

        policy: Policy = _get_or_raise(policies, policy_id)

        # Split action-component batches into single action rows.
        actions: List[EnvActionType] = unbatch(actions)

        # type: int, EnvActionType
        for i, action in enumerate(actions):
            # Clip if necessary.
            if clip_actions:
                clipped_action = clip_action(action,
                                             policy.action_space_struct)
            else:
                clipped_action = action

            env_id: int = eval_data[i].env_id
            agent_id: AgentID = eval_data[i].agent_id
            episode: MultiAgentEpisode = active_episodes[env_id]
            episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
            episode._set_last_pi_info(
                agent_id, {k: v[i] for k, v in pi_info_cols.items()})
            if env_id in off_policy_actions and \
                    agent_id in off_policy_actions[env_id]:
                episode._set_last_action(agent_id,
                                         off_policy_actions[env_id][agent_id])
            else:
                episode._set_last_action(agent_id, action)

            assert agent_id not in actions_to_send[env_id]
            actions_to_send[env_id][agent_id] = clipped_action

    return actions_to_send

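# --- Illustrative sketch (not the actual RLlib helper) ---
# `unbatch` splits a struct of batched action components into a list of
# per-row action structs, which is why the loop above can index
# `actions[i]` for each agent. The keys and values below are made-up.
import numpy as np
import tree  # dm-tree

def unbatch_sketch(batches_struct):
    # Flatten the struct, slice out row i from every component, then
    # restore the original structure for that single row.
    flat = tree.flatten(batches_struct)
    return [
        tree.unflatten_as(batches_struct, [f[i] for f in flat])
        for i in range(len(flat[0]))
    ]

batched_actions = {
    "steer": np.array([[0.1], [0.4]]),  # batch of 2
    "gear": np.array([1, 3]),           # batch of 2
}
rows = unbatch_sketch(batched_actions)
# rows[0] -> {"steer": array([0.1]), "gear": 1}
# rows[1] -> {"steer": array([0.4]), "gear": 3}
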
def compute_single_action(
    self,
    obs: Optional[TensorStructType] = None,
    state: Optional[List[TensorType]] = None,
    *,
    prev_action: Optional[TensorStructType] = None,
    prev_reward: Optional[TensorStructType] = None,
    info: dict = None,
    input_dict: Optional[SampleBatch] = None,
    episode: Optional["Episode"] = None,
    explore: Optional[bool] = None,
    timestep: Optional[int] = None,
    # Kwargs placeholder for future compatibility.
    **kwargs,
) -> Tuple[TensorStructType, List[TensorType], Dict[str, TensorType]]:
    """Computes and returns a single (B=1) action value.

    Takes an input dict (usually a SampleBatch) as its main data input.
    This allows for using this method in case a more complex input pattern
    (view requirements) is needed, for example when the Model requires the
    last n observations, the last m actions/rewards, or a combination of
    any of these.

    Alternatively, in case no complex inputs are required, takes a single
    `obs` value (and possibly single state values, prev-action/reward
    values, etc.).

    Args:
        obs: Single observation.
        state: List of RNN state inputs, if any.
        prev_action: Previous action value, if any.
        prev_reward: Previous reward, if any.
        info: Info object, if any.
        input_dict: A SampleBatch or input dict containing the single
            (unbatched) Tensors to compute actions. If given, it'll be used
            instead of `obs`, `state`, `prev_action|reward`, and `info`.
        episode: This provides access to all of the internal episode state,
            which may be useful for model-based or multi-agent algorithms.
        explore: Whether to pick an exploitation or exploration action
            (default: None -> use self.config["explore"]).
        timestep: The current (sampling) time step.

    Keyword Args:
        kwargs: Forward compatibility placeholder.

    Returns:
        Tuple consisting of the action, the list of RNN state outputs (if
        any), and a dictionary of extra features (if any).
    """
    # Build the input-dict used for the call to
    # `self.compute_actions_from_input_dict()`.
    if input_dict is None:
        input_dict = {SampleBatch.OBS: obs}
        if state is not None:
            for i, s in enumerate(state):
                input_dict[f"state_in_{i}"] = s
        if prev_action is not None:
            input_dict[SampleBatch.PREV_ACTIONS] = prev_action
        if prev_reward is not None:
            input_dict[SampleBatch.PREV_REWARDS] = prev_reward
        if info is not None:
            input_dict[SampleBatch.INFOS] = info

    # Batch all data in input dict.
    input_dict = tree.map_structure_with_path(
        lambda p, s: (s if p == "seq_lens" else s.unsqueeze(0)
                      if torch and isinstance(s, torch.Tensor) else
                      np.expand_dims(s, 0)),
        input_dict,
    )

    episodes = None
    if episode is not None:
        episodes = [episode]

    out = self.compute_actions_from_input_dict(
        input_dict=SampleBatch(input_dict),
        episodes=episodes,
        explore=explore,
        timestep=timestep,
    )

    # Some policies don't return a tuple, but always just a single action.
    # E.g. ES and ARS.
    if not isinstance(out, tuple):
        single_action = out
        state_out = []
        info = {}
    # Normal case: Policy should return (action, state, info) tuple.
    else:
        batched_action, state_out, info = out
        single_action = unbatch(batched_action)
    assert len(single_action) == 1
    single_action = single_action[0]

    # Return action, internal state(s), infos.
    return (
        single_action,
        [s[0] for s in state_out],
        {k: v[0] for k, v in info.items()},
    )

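# --- Illustrative sketch (assumed keys and values, not from the source) ---
# The batching step above gives every leaf of the single-row input dict a
# leading batch dimension of size 1, so the batched
# `compute_actions_from_input_dict()` code path can be reused unchanged.
import numpy as np
import tree  # dm-tree

single_row = {"obs": np.array([0.1, 0.2]), "prev_rewards": np.array(1.0)}
batched_row = tree.map_structure(lambda s: np.expand_dims(s, 0), single_row)
# batched_row["obs"].shape == (1, 2)
# batched_row["prev_rewards"].shape == (1,)
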
def compute_single_action(self,
                          obs,
                          state=None,
                          prev_action=None,
                          prev_reward=None,
                          info=None,
                          episode=None,
                          clip_actions=False,
                          explore=None,
                          timestep=None,
                          **kwargs):
    """Unbatched version of compute_actions.

    Arguments:
        obs (obj): Single observation.
        state (list): List of RNN state inputs, if any.
        prev_action (obj): Previous action value, if any.
        prev_reward (float): Previous reward, if any.
        info (dict): info object, if any
        episode (MultiAgentEpisode): this provides access to all of the
            internal episode state, which may be useful for model-based or
            multi-agent algorithms.
        clip_actions (bool): Should actions be clipped?
        explore (bool): Whether to pick an exploitation or exploration
            action (default: None -> use self.config["explore"]).
        timestep (int): The current (sampling) time step.
        kwargs: forward compatibility placeholder

    Returns:
        actions (obj): single action
        state_outs (list): list of RNN state outputs, if any
        info (dict): dictionary of extra features, if any
    """
    prev_action_batch = None
    prev_reward_batch = None
    info_batch = None
    episodes = None
    state_batch = None
    if prev_action is not None:
        prev_action_batch = [prev_action]
    if prev_reward is not None:
        prev_reward_batch = [prev_reward]
    if info is not None:
        info_batch = [info]
    if episode is not None:
        episodes = [episode]
    if state is not None:
        state_batch = [
            s.unsqueeze(0) if torch and isinstance(s, torch.Tensor) else [s]
            for s in state
        ]

    batched_action, state_out, info = self.compute_actions(
        [obs],
        state_batch,
        prev_action_batch=prev_action_batch,
        prev_reward_batch=prev_reward_batch,
        info_batch=info_batch,
        episodes=episodes,
        explore=explore,
        timestep=timestep)

    single_action = unbatch(batched_action)
    assert len(single_action) == 1
    single_action = single_action[0]

    if clip_actions:
        single_action = clip_action(single_action,
                                    self.action_space_struct)

    # Return action, internal state(s), infos.
    return single_action, [s[0] for s in state_out], \
        {k: v[0] for k, v in info.items()}

def _process_policy_eval_results(
    self,
    to_eval: Dict[PolicyID, List[_PolicyEvalData]],
    eval_results: Dict[PolicyID, PolicyOutputType],
    off_policy_actions: MultiEnvDict,
):
    """Process the output of policy neural network evaluation.

    Records policy evaluation results into agent connectors and returns
    replies to send back to agents in the env.

    Args:
        to_eval: Mapping of policy IDs to lists of _PolicyEvalData objects.
        eval_results: Mapping of policy IDs to list of actions, rnn-out
            states, extra-action-fetches dicts.
        off_policy_actions: Doubly keyed dict of env-ids -> agent ids ->
            off-policy-action, returned by a `BaseEnv.poll()` call.

    Returns:
        Nested dict of env id -> agent id -> actions to be sent to Env
        (np.ndarrays).
    """
    actions_to_send: Dict[EnvID, Dict[AgentID, EnvActionType]] = \
        defaultdict(dict)

    for eval_data in to_eval.values():
        for d in eval_data:
            actions_to_send[d.env_id] = {}  # at minimum send empty dict

    # types: PolicyID, List[_PolicyEvalData]
    for policy_id, eval_data in to_eval.items():
        actions: TensorStructType = eval_results[policy_id][0]
        actions = convert_to_numpy(actions)

        rnn_out: StateBatches = eval_results[policy_id][1]
        extra_action_out: dict = eval_results[policy_id][2]

        # In case actions is a list (representing the 0th dim of a batch of
        # primitive actions), try converting it first.
        if isinstance(actions, list):
            actions = np.array(actions)

        # Split action-component batches into single action rows.
        actions: List[EnvActionType] = unbatch(actions)

        policy: Policy = _get_or_raise(self._worker.policy_map, policy_id)
        assert (
            policy.agent_connectors and policy.action_connectors
        ), "EnvRunnerV2 requires action connectors to work."

        # types: int, EnvActionType
        for i, action in enumerate(actions):
            env_id: int = eval_data[i].env_id
            agent_id: AgentID = eval_data[i].agent_id

            rnn_states: List[StateBatches] = [c[i] for c in rnn_out]
            fetches: Dict = {k: v[i] for k, v in extra_action_out.items()}

            # Post-process policy output by running them through action
            # connectors.
            ac_data = ActionConnectorDataType(
                env_id, agent_id, (action, rnn_states, fetches))
            action_to_send, rnn_states, fetches = policy.action_connectors(
                ac_data).output

            action_to_buffer = (
                action_to_send
                if env_id not in off_policy_actions
                or agent_id not in off_policy_actions[env_id]
                else off_policy_actions[env_id][agent_id])

            # Notify agent connectors with this new policy output.
            # Necessary for state buffering agent connectors, for example.
            ac_data: AgentConnectorDataType = ActionConnectorDataType(
                env_id, agent_id, (action_to_buffer, rnn_states, fetches))
            policy.agent_connectors.on_policy_output(ac_data)

            assert agent_id not in actions_to_send[env_id]
            actions_to_send[env_id][agent_id] = action_to_send

    return actions_to_send

def compute_single_action(
        self,
        obs: TensorType,
        state: Optional[List[TensorType]] = None,
        prev_action: Optional[TensorType] = None,
        prev_reward: Optional[TensorType] = None,
        info: dict = None,
        episode: Optional["MultiAgentEpisode"] = None,
        clip_actions: bool = False,
        explore: Optional[bool] = None,
        timestep: Optional[int] = None,
        **kwargs) -> \
        Tuple[TensorType, List[TensorType], Dict[str, TensorType]]:
    """Unbatched version of compute_actions.

    Args:
        obs (TensorType): Single observation.
        state (Optional[List[TensorType]]): List of RNN state inputs, if
            any.
        prev_action (Optional[TensorType]): Previous action value, if any.
        prev_reward (Optional[TensorType]): Previous reward, if any.
        info (dict): Info object, if any.
        episode (Optional[MultiAgentEpisode]): this provides access to all
            of the internal episode state, which may be useful for
            model-based or multi-agent algorithms.
        clip_actions (bool): Should actions be clipped?
        explore (Optional[bool]): Whether to pick an exploitation or
            exploration action
            (default: None -> use self.config["explore"]).
        timestep (Optional[int]): The current (sampling) time step.

    Keyword Args:
        kwargs: Forward compatibility.

    Returns:
        Tuple:
            - actions (TensorType): Single action.
            - state_outs (List[TensorType]): List of RNN state outputs,
              if any.
            - info (dict): Dictionary of extra features, if any.
    """
    prev_action_batch = None
    prev_reward_batch = None
    info_batch = None
    episodes = None
    state_batch = None
    if prev_action is not None:
        prev_action_batch = [prev_action]
    if prev_reward is not None:
        prev_reward_batch = [prev_reward]
    if info is not None:
        info_batch = [info]
    if episode is not None:
        episodes = [episode]
    if state is not None:
        state_batch = [
            s.unsqueeze(0) if torch and isinstance(s, torch.Tensor) else
            np.expand_dims(s, 0) for s in state
        ]

    out = self.compute_actions(
        [obs],
        state_batch,
        prev_action_batch=prev_action_batch,
        prev_reward_batch=prev_reward_batch,
        info_batch=info_batch,
        episodes=episodes,
        explore=explore,
        timestep=timestep)

    # Some policies don't return a tuple, but always just a single action.
    # E.g. ES and ARS.
    if not isinstance(out, tuple):
        single_action = out
        state_out = []
        info = {}
    # Normal case: Policy should return (action, state, info) tuple.
    else:
        batched_action, state_out, info = out
        single_action = unbatch(batched_action)
    assert len(single_action) == 1
    single_action = single_action[0]

    if clip_actions:
        single_action = clip_action(single_action,
                                    self.action_space_struct)

    # Return action, internal state(s), infos.
    return single_action, [s[0] for s in state_out], \
        {k: v[0] for k, v in info.items()}

def compute_single_action(
        self,
        obs: TensorStructType,
        state: Optional[List[TensorType]] = None,
        prev_action: Optional[TensorStructType] = None,
        prev_reward: Optional[TensorStructType] = None,
        info: dict = None,
        episode: Optional["MultiAgentEpisode"] = None,
        clip_actions: bool = None,
        explore: Optional[bool] = None,
        timestep: Optional[int] = None,
        unsquash_actions: bool = None,
        **kwargs) -> \
        Tuple[TensorStructType, List[TensorType], Dict[str, TensorType]]:
    """Unbatched version of compute_actions.

    Args:
        obs: Single observation.
        state: List of RNN state inputs, if any.
        prev_action: Previous action value, if any.
        prev_reward: Previous reward, if any.
        info (dict): Info object, if any.
        episode: this provides access to all of the internal episode
            state, which may be useful for model-based or multi-agent
            algorithms.
        unsquash_actions: Should actions be unsquashed according to the
            Policy's action space?
        clip_actions: Should actions be clipped according to the Policy's
            action space?
        explore: Whether to pick an exploitation or exploration action
            (default: None -> use self.config["explore"]).
        timestep: The current (sampling) time step.

    Keyword Args:
        kwargs: Forward compatibility.

    Returns:
        - actions: Single action.
        - state_outs: List of RNN state outputs, if any.
        - info: Dictionary of extra features, if any.
    """
    # If policy works in normalized space, we should unsquash the action.
    # Use value of config.normalize_actions, if None.
    unsquash_actions = \
        unsquash_actions if unsquash_actions is not None \
        else self.config["normalize_actions"]
    clip_actions = clip_actions if clip_actions is not None else \
        self.config["clip_actions"]

    prev_action_batch = None
    prev_reward_batch = None
    info_batch = None
    episodes = None
    state_batch = None
    if prev_action is not None:
        prev_action_batch = [prev_action]
    if prev_reward is not None:
        prev_reward_batch = [prev_reward]
    if info is not None:
        info_batch = [info]
    if episode is not None:
        episodes = [episode]
    if state is not None:
        state_batch = [
            s.unsqueeze(0) if torch and isinstance(s, torch.Tensor) else
            np.expand_dims(s, 0) for s in state
        ]

    out = self.compute_actions(
        tree.map_structure(lambda s: np.array([s]), obs),
        state_batch,
        prev_action_batch=prev_action_batch,
        prev_reward_batch=prev_reward_batch,
        info_batch=info_batch,
        episodes=episodes,
        explore=explore,
        timestep=timestep)

    # Some policies don't return a tuple, but always just a single action.
    # E.g. ES and ARS.
    if not isinstance(out, tuple):
        single_action = out
        state_out = []
        info = {}
    # Normal case: Policy should return (action, state, info) tuple.
    else:
        batched_action, state_out, info = out
        single_action = unbatch(batched_action)
    assert len(single_action) == 1
    single_action = single_action[0]

    # If we work in normalized action space (normalize_actions=True),
    # we re-translate here into the env's action space.
    if unsquash_actions:
        single_action = unsquash_action(single_action,
                                        self.action_space_struct)
    # Clip, according to env's action space.
    elif clip_actions:
        single_action = clip_action(single_action,
                                    self.action_space_struct)

    # Return action, internal state(s), infos.
    return single_action, [s[0] for s in state_out], \
        {k: v[0] for k, v in info.items()}

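# --- Illustrative numpy sketch (not RLlib's unsquash_action/clip_action) ---
# The difference between unsquashing and clipping for a Box(-2.0, 2.0)
# space: unsquashing maps a normalized [-1, 1] action into the env's
# bounds, while clipping only cuts off values outside the bounds. The
# bounds and the sample action below are made-up.
import numpy as np

low, high = -2.0, 2.0
normalized_action = np.array([0.5, -1.5])
unsquashed = low + (np.clip(normalized_action, -1.0, 1.0) + 1.0) * \
    (high - low) / 2.0
clipped = np.clip(normalized_action, low, high)
# unsquashed -> array([ 1., -2.]); clipped -> array([ 0.5, -1.5])
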
def compute_single_action(
        self,
        obs: Optional[TensorStructType] = None,
        state: Optional[List[TensorType]] = None,
        *,
        prev_action: Optional[TensorStructType] = None,
        prev_reward: Optional[TensorStructType] = None,
        info: dict = None,
        input_dict: Optional[SampleBatch] = None,
        episode: Optional["MultiAgentEpisode"] = None,
        explore: Optional[bool] = None,
        timestep: Optional[int] = None,
        # Kwargs placeholder for future compatibility.
        **kwargs) -> \
        Tuple[TensorStructType, List[TensorType], Dict[str, TensorType]]:
    """Unbatched version of compute_actions.

    Args:
        obs: Single observation.
        state: List of RNN state inputs, if any.
        prev_action: Previous action value, if any.
        prev_reward: Previous reward, if any.
        info: Info object, if any.
        input_dict: A SampleBatch or input dict containing the single
            (unbatched) Tensors to compute actions. If given, it'll be used
            instead of `obs`, `state`, `prev_action|reward`, and `info`.
        episode: This provides access to all of the internal episode state,
            which may be useful for model-based or multi-agent algorithms.
        explore: Whether to pick an exploitation or exploration action
            (default: None -> use self.config["explore"]).
        timestep: The current (sampling) time step.

    Keyword Args:
        kwargs: Forward compatibility.

    Returns:
        - actions: Single action.
        - state_outs: List of RNN state outputs, if any.
        - info: Dictionary of extra features, if any.
    """
    # Build the input-dict used for the call to
    # `self.compute_actions_from_input_dict()`.
    if input_dict is None:
        input_dict = {SampleBatch.OBS: obs}
        if state is not None:
            for i, s in enumerate(state):
                input_dict[f"state_in_{i}"] = s
        if prev_action is not None:
            input_dict[SampleBatch.PREV_ACTIONS] = prev_action
        if prev_reward is not None:
            input_dict[SampleBatch.PREV_REWARDS] = prev_reward
        if info is not None:
            input_dict[SampleBatch.INFOS] = info

    # Batch all data in input dict.
    input_dict = tree.map_structure_with_path(
        lambda p, s: (s if p == "seq_lens" else s.unsqueeze(0)
                      if torch and isinstance(s, torch.Tensor) else
                      np.expand_dims(s, 0)),
        input_dict)

    episodes = None
    if episode is not None:
        episodes = [episode]

    out = self.compute_actions_from_input_dict(
        input_dict=SampleBatch(input_dict),
        episodes=episodes,
        explore=explore,
        timestep=timestep,
    )

    # Some policies don't return a tuple, but always just a single action.
    # E.g. ES and ARS.
    if not isinstance(out, tuple):
        single_action = out
        state_out = []
        info = {}
    # Normal case: Policy should return (action, state, info) tuple.
    else:
        batched_action, state_out, info = out
        single_action = unbatch(batched_action)
    assert len(single_action) == 1
    single_action = single_action[0]

    # Return action, internal state(s), infos.
    return single_action, [s[0] for s in state_out], \
        {k: v[0] for k, v in info.items()}

        def to_config(self):
            return name, None

        @staticmethod
        def from_config(ctx: ConnectorContext, params: List[Any]):
            return LambdaActionConnector(ctx)

    LambdaActionConnector.__name__ = name
    LambdaActionConnector.__qualname__ = name

    register_connector(name, LambdaActionConnector)

    return LambdaActionConnector


# Convert actions and states into numpy arrays if necessary.
ConvertToNumpyConnector = register_lambda_action_connector(
    "ConvertToNumpyConnector",
    lambda actions, states, fetches: (
        convert_to_numpy(actions),
        convert_to_numpy(states),
        fetches,
    ),
)

# Split action-component batches into single action rows.
UnbatchActionsConnector = register_lambda_action_connector(
    "UnbatchActionsConnector",
    lambda actions, states, fetches: (unbatch(actions), states, fetches),
)

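# --- Hypothetical example (not from the source) ---
# The same `register_lambda_action_connector` factory can register further
# one-off action connectors; the name and behavior below are made-up and
# only follow the pattern of the two registrations above (here: converting
# just the extra-action fetches dict to numpy).
ConvertFetchesToNumpyConnector = register_lambda_action_connector(
    "ConvertFetchesToNumpyConnector",
    lambda actions, states, fetches: (
        actions,
        states,
        convert_to_numpy(fetches),
    ),
)
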
def _process_policy_eval_results(*, to_eval, eval_results, active_episodes,
                                 active_envs, off_policy_actions, policies,
                                 clip_actions):
    """Process the output of policy neural network evaluation.

    Records policy evaluation results into the given episode objects and
    returns replies to send back to agents in the env.

    Args:
        to_eval (Dict[str,List[PolicyEvalData]]): Mapping of policy IDs to
            lists of PolicyEvalData objects.
        eval_results (Dict[str,List]): Mapping of policy IDs to list of
            actions, rnn-out states, extra-action-fetches dicts.
        active_episodes (defaultdict[str,MultiAgentEpisode]): Mapping from
            episode ID to currently ongoing MultiAgentEpisode object.
        active_envs (Set[int]): Set of non-terminated env ids.
        off_policy_actions (dict): Doubly keyed dict of env-ids -> agent ids
            -> off-policy-action, returned by a `BaseEnv.poll()` call.
        policies (Dict[str,Policy]): Mapping from policy ID to Policy obj.
        clip_actions (bool): Whether to clip actions to the action space's
            bounds.

    Returns:
        actions_to_send: Nested dict of env id -> agent id -> agent replies.
    """

    actions_to_send = defaultdict(dict)
    for env_id in active_envs:
        actions_to_send[env_id] = {}  # at minimum send empty dict

    for policy_id, eval_data in to_eval.items():
        rnn_in_cols = _to_column_format([t.rnn_state for t in eval_data])

        actions = eval_results[policy_id][0]
        rnn_out_cols = eval_results[policy_id][1]
        pi_info_cols = eval_results[policy_id][2]

        # In case actions is a list (representing the 0th dim of a batch of
        # primitive actions), try to convert it first.
        if isinstance(actions, list):
            actions = np.array(actions)

        if len(rnn_in_cols) != len(rnn_out_cols):
            raise ValueError("Length of RNN in did not match RNN out, got: "
                             "{} vs {}".format(rnn_in_cols, rnn_out_cols))
        # Add RNN state info.
        for f_i, column in enumerate(rnn_in_cols):
            pi_info_cols["state_in_{}".format(f_i)] = column
        for f_i, column in enumerate(rnn_out_cols):
            pi_info_cols["state_out_{}".format(f_i)] = column

        policy = _get_or_raise(policies, policy_id)
        # Split action-component batches into single action rows.
        actions = unbatch(actions)

        for i, action in enumerate(actions):
            env_id = eval_data[i].env_id
            agent_id = eval_data[i].agent_id
            # Clip if necessary.
            if clip_actions:
                clipped_action = clip_action(action,
                                             policy.action_space_struct)
            else:
                clipped_action = action
            actions_to_send[env_id][agent_id] = clipped_action
            episode = active_episodes[env_id]
            episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
            episode._set_last_pi_info(
                agent_id, {k: v[i] for k, v in pi_info_cols.items()})
            if env_id in off_policy_actions and \
                    agent_id in off_policy_actions[env_id]:
                episode._set_last_action(agent_id,
                                         off_policy_actions[env_id][agent_id])
            else:
                episode._set_last_action(agent_id, action)

    return actions_to_send