def _process_policy_eval_results(to_eval, eval_results, active_episodes,
                                 active_envs, off_policy_actions, policies,
                                 clip_actions):
    """Process the output of policy neural network evaluation.

    Records policy evaluation results into the given episode objects and
    returns replies to send back to agents in the env.

    Returns:
        actions_to_send: nested dict of env id -> agent id -> agent replies.
    """

    actions_to_send = defaultdict(dict)
    for env_id in active_envs:
        actions_to_send[env_id] = {}  # at minimum send empty dict

    for policy_id, eval_data in to_eval.items():
        rnn_in_cols = _to_column_format([t.rnn_state for t in eval_data])

        actions = eval_results[policy_id][0]
        rnn_out_cols = eval_results[policy_id][1]
        pi_info_cols = eval_results[policy_id][2]

        # In case actions is a list (representing the 0th dim of a batch of
        # primitive actions), try to convert it first.
        if isinstance(actions, list):
            actions = np.array(actions)

        if len(rnn_in_cols) != len(rnn_out_cols):
            raise ValueError("Length of RNN in did not match RNN out, got: "
                             "{} vs {}".format(rnn_in_cols, rnn_out_cols))
        # Add RNN state info.
        for f_i, column in enumerate(rnn_in_cols):
            pi_info_cols["state_in_{}".format(f_i)] = column
        for f_i, column in enumerate(rnn_out_cols):
            pi_info_cols["state_out_{}".format(f_i)] = column

        policy = _get_or_raise(policies, policy_id)
        # Clip if necessary (while action components are still batched).
        if clip_actions:
            actions = clip_action(actions, policy.action_space_struct)
        # Split action-component batches into single action rows.
        actions = unbatch(actions)
        for i, action in enumerate(actions):
            env_id = eval_data[i].env_id
            agent_id = eval_data[i].agent_id
            actions_to_send[env_id][agent_id] = action
            episode = active_episodes[env_id]
            episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
            episode._set_last_pi_info(
                agent_id, {k: v[i] for k, v in pi_info_cols.items()})
            if env_id in off_policy_actions and \
                    agent_id in off_policy_actions[env_id]:
                episode._set_last_action(agent_id,
                                         off_policy_actions[env_id][agent_id])
            else:
                episode._set_last_action(agent_id, action)

    return actions_to_send
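# A minimal sketch (not RLlib's implementation) of what clipping a *batched*
# Box action looks like in the variant above, which clips before unbatching.
# The `low`/`high` scalars stand in for a gym.spaces.Box's per-dimension
# bounds; the helper name is hypothetical.
import numpy as np


def _clip_box_sketch(batched_actions, low, high):
    # Clamp every action component to [low, high]; broadcasting applies the
    # bounds across the whole batch at once, which is why clipping can happen
    # while the action components are still batched.
    return np.clip(batched_actions, low, high)


# Example: a batch of 2 actions in a Box with bounds [-1, 1].
_batch = np.array([[1.7, -0.2], [-3.0, 0.4]])
assert np.allclose(
    _clip_box_sketch(_batch, -1.0, 1.0), [[1.0, -0.2], [-1.0, 0.4]])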
def _process_policy_eval_results(to_eval, eval_results, active_episodes,
                                 active_envs, off_policy_actions, policies,
                                 clip_actions):
    """Process the output of policy neural network evaluation.

    Records policy evaluation results into the given episode objects and
    returns replies to send back to agents in the env.

    Returns:
        actions_to_send: nested dict of env id -> agent id -> agent replies.
    """

    actions_to_send = defaultdict(dict)
    for env_id in active_envs:
        actions_to_send[env_id] = {}  # at minimum send empty dict

    for policy_id, eval_data in to_eval.items():
        rnn_in_cols = _to_column_format([t.rnn_state for t in eval_data])

        actions = eval_results[policy_id][0]
        rnn_out_cols = eval_results[policy_id][1]
        pi_info_cols = eval_results[policy_id][2]
        if len(rnn_in_cols) != len(rnn_out_cols):
            raise ValueError("Length of RNN in did not match RNN out, got: "
                             "{} vs {}".format(rnn_in_cols, rnn_out_cols))
        # Add RNN state info.
        for f_i, column in enumerate(rnn_in_cols):
            pi_info_cols["state_in_{}".format(f_i)] = column
        for f_i, column in enumerate(rnn_out_cols):
            pi_info_cols["state_out_{}".format(f_i)] = column

        # Save output rows.
        actions = _unbatch_tuple_actions(actions)
        policy = _get_or_raise(policies, policy_id)
        for i, action in enumerate(actions):
            env_id = eval_data[i].env_id
            agent_id = eval_data[i].agent_id
            if clip_actions:
                actions_to_send[env_id][agent_id] = clip_action(
                    action, policy.action_space)
            else:
                actions_to_send[env_id][agent_id] = action
            episode = active_episodes[env_id]
            episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
            episode._set_last_pi_info(
                agent_id, {k: v[i] for k, v in pi_info_cols.items()})
            if env_id in off_policy_actions and \
                    agent_id in off_policy_actions[env_id]:
                episode._set_last_action(agent_id,
                                         off_policy_actions[env_id][agent_id])
            else:
                episode._set_last_action(agent_id, action)

    return actions_to_send
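# Hedged sketch of the row-to-column transposition a helper like
# `_to_column_format` performs (the real helper is RLlib-internal; this is
# an illustration only). Each eval-data row carries one entry per RNN state
# component; the policy consumes one batched column per component instead.
def _to_column_format_sketch(rnn_state_rows):
    num_cols = len(rnn_state_rows[0])
    return [[row[i] for row in rnn_state_rows] for i in range(num_cols)]


# Two agents, each with a 2-component RNN state (e.g. an LSTM's h and c):
_rows = [["h0", "c0"], ["h1", "c1"]]
assert _to_column_format_sketch(_rows) == [["h0", "h1"], ["c0", "c1"]]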
def _try_parse(self, line: str) -> Optional[SampleBatchType]:
    line = line.strip()
    if not line:
        return None
    try:
        batch = _from_json(line)
    except Exception:
        logger.exception("Ignoring corrupt json record in {}: {}".format(
            self.cur_file, line))
        return None

    # Clip actions, if necessary.
    if self.ioctx.config.get("clip_actions"):
        if isinstance(batch, SampleBatch):
            batch[SampleBatch.ACTIONS] = clip_action(
                batch[SampleBatch.ACTIONS], self.ioctx.worker.
                policy_map["default_policy"].action_space_struct)
        else:
            for pid, b in batch.policy_batches.items():
                b[SampleBatch.ACTIONS] = clip_action(
                    b[SampleBatch.ACTIONS],
                    self.ioctx.worker.policy_map[pid].action_space_struct)
    return batch
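# Hedged sketch of the kind of input `_try_parse` consumes: one JSON object
# per line, holding SampleBatch columns. The exact serialization is
# RLlib-internal (`_from_json`); the column names and bounds here are
# illustrative, not the actual on-disk format.
import json
import numpy as np

_record = json.loads('{"actions": [[2.5], [-0.1]], "rewards": [1.0, 0.0]}')
# With `clip_actions` enabled, only the actions column gets clamped to the
# policy's action-space bounds; all other columns pass through untouched.
_clipped = np.clip(np.array(_record["actions"]), -1.0, 1.0)
assert np.allclose(_clipped, [[1.0], [-0.1]])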
def _process_policy_eval_results(
        *,
        to_eval: Dict[PolicyID, List[PolicyEvalData]],
        eval_results: Dict[PolicyID, Tuple[TensorStructType, StateBatch,
                                           dict]],
        active_episodes: Dict[str, MultiAgentEpisode],
        active_envs: Set[int],
        off_policy_actions: MultiEnvDict,
        policies: Dict[PolicyID, Policy],
        clip_actions: bool,
) -> Dict[EnvID, Dict[AgentID, EnvActionType]]:
    """Process the output of policy neural network evaluation.

    Records policy evaluation results into the given episode objects and
    returns replies to send back to agents in the env.

    Args:
        to_eval (Dict[PolicyID, List[PolicyEvalData]]): Mapping of policy
            IDs to lists of PolicyEvalData objects.
        eval_results (Dict[PolicyID, List]): Mapping of policy IDs to list
            of actions, rnn-out states, extra-action-fetches dicts.
        active_episodes (Dict[str, MultiAgentEpisode]): Mapping from
            episode ID to currently ongoing MultiAgentEpisode object.
        active_envs (Set[int]): Set of non-terminated env ids.
        off_policy_actions (dict): Doubly keyed dict of env-ids -> agent ids
            -> off-policy-action, returned by a `BaseEnv.poll()` call.
        policies (Dict[PolicyID, Policy]): Mapping from policy ID to Policy.
        clip_actions (bool): Whether to clip actions to the action space's
            bounds.

    Returns:
        actions_to_send: Nested dict of env id -> agent id -> actions to be
            sent to Env (np.ndarrays).
    """

    actions_to_send: Dict[EnvID, Dict[AgentID, EnvActionType]] = \
        defaultdict(dict)

    # type: int
    for env_id in active_envs:
        actions_to_send[env_id] = {}  # at minimum send empty dict

    # type: PolicyID, List[PolicyEvalData]
    for policy_id, eval_data in to_eval.items():
        actions: TensorStructType = eval_results[policy_id][0]
        actions = convert_to_numpy(actions)

        rnn_out_cols: StateBatch = eval_results[policy_id][1]
        pi_info_cols: dict = eval_results[policy_id][2]

        # In case actions is a list (representing the 0th dim of a batch of
        # primitive actions), try to convert it first.
        if isinstance(actions, list):
            actions = np.array(actions)

        # Store RNN state ins/outs and extra-action fetches to episode.
        for f_i, column in enumerate(rnn_out_cols):
            pi_info_cols["state_out_{}".format(f_i)] = column

        policy: Policy = _get_or_raise(policies, policy_id)

        # Split action-component batches into single action rows.
        actions: List[EnvActionType] = unbatch(actions)

        # type: int, EnvActionType
        for i, action in enumerate(actions):
            # Clip if necessary.
            if clip_actions:
                clipped_action = clip_action(action,
                                             policy.action_space_struct)
            else:
                clipped_action = action

            env_id: int = eval_data[i].env_id
            agent_id: AgentID = eval_data[i].agent_id
            episode: MultiAgentEpisode = active_episodes[env_id]
            episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
            episode._set_last_pi_info(
                agent_id, {k: v[i] for k, v in pi_info_cols.items()})
            if env_id in off_policy_actions and \
                    agent_id in off_policy_actions[env_id]:
                episode._set_last_action(agent_id,
                                         off_policy_actions[env_id][agent_id])
            else:
                episode._set_last_action(agent_id, action)

            assert agent_id not in actions_to_send[env_id]
            actions_to_send[env_id][agent_id] = clipped_action

    return actions_to_send
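# Hedged sketch of what `unbatch` does to component-batched actions (the
# real helper lives in RLlib's space utilities; this illustration covers
# only the tuple case). A Tuple action space arrives as one batch per
# component; unbatching zips those batches back into one action tuple per
# row, ready to send to each agent.
def _unbatch_sketch(component_batches):
    return list(zip(*component_batches))


# A batch of 2 actions from a Tuple(space_a, space_b) action space:
_batched = ([0, 1], ["x", "y"])  # component 0 batch, component 1 batch
assert _unbatch_sketch(_batched) == [(0, "x"), (1, "y")]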
def _process_policy_eval_results(*, to_eval, eval_results, active_episodes,
                                 active_envs, off_policy_actions, policies,
                                 clip_actions):
    """Process the output of policy neural network evaluation.

    Records policy evaluation results into the given episode objects and
    returns replies to send back to agents in the env.

    Args:
        to_eval (Dict[str, List[PolicyEvalData]]): Mapping of policy IDs to
            lists of PolicyEvalData objects.
        eval_results (Dict[str, List]): Mapping of policy IDs to list of
            actions, rnn-out states, extra-action-fetches dicts.
        active_episodes (defaultdict[str, MultiAgentEpisode]): Mapping from
            episode ID to currently ongoing MultiAgentEpisode object.
        active_envs (Set[int]): Set of non-terminated env ids.
        off_policy_actions (dict): Doubly keyed dict of env-ids -> agent ids
            -> off-policy-action, returned by a `BaseEnv.poll()` call.
        policies (Dict[str, Policy]): Mapping from policy ID to Policy obj.
        clip_actions (bool): Whether to clip actions to the action space's
            bounds.

    Returns:
        actions_to_send: Nested dict of env id -> agent id -> agent replies.
    """

    actions_to_send = defaultdict(dict)
    for env_id in active_envs:
        actions_to_send[env_id] = {}  # at minimum send empty dict

    for policy_id, eval_data in to_eval.items():
        rnn_in_cols = _to_column_format([t.rnn_state for t in eval_data])

        actions = eval_results[policy_id][0]
        rnn_out_cols = eval_results[policy_id][1]
        pi_info_cols = eval_results[policy_id][2]

        # In case actions is a list (representing the 0th dim of a batch of
        # primitive actions), try to convert it first.
        if isinstance(actions, list):
            actions = np.array(actions)

        if len(rnn_in_cols) != len(rnn_out_cols):
            raise ValueError("Length of RNN in did not match RNN out, got: "
                             "{} vs {}".format(rnn_in_cols, rnn_out_cols))
        # Add RNN state info.
        for f_i, column in enumerate(rnn_in_cols):
            pi_info_cols["state_in_{}".format(f_i)] = column
        for f_i, column in enumerate(rnn_out_cols):
            pi_info_cols["state_out_{}".format(f_i)] = column

        policy = _get_or_raise(policies, policy_id)
        # Split action-component batches into single action rows.
        actions = unbatch(actions)
        for i, action in enumerate(actions):
            env_id = eval_data[i].env_id
            agent_id = eval_data[i].agent_id
            # Clip if necessary.
            if clip_actions:
                clipped_action = clip_action(action,
                                             policy.action_space_struct)
            else:
                clipped_action = action
            actions_to_send[env_id][agent_id] = clipped_action
            episode = active_episodes[env_id]
            episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
            episode._set_last_pi_info(
                agent_id, {k: v[i] for k, v in pi_info_cols.items()})
            if env_id in off_policy_actions and \
                    agent_id in off_policy_actions[env_id]:
                episode._set_last_action(agent_id,
                                         off_policy_actions[env_id][agent_id])
            else:
                episode._set_last_action(agent_id, action)

    return actions_to_send
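# Hedged sketch of the off-policy override in all variants above: when the
# env itself already executed an action for an agent (reported via
# `BaseEnv.poll()`), the episode records that action instead of the one the
# policy computed. All ids and values below are made up for illustration.
_off_policy_actions = {0: {"agent_1": 3}}
_computed = {(0, "agent_0"): 1, (0, "agent_1"): 2}
_recorded = {
    key: _off_policy_actions.get(key[0], {}).get(key[1], action)
    for key, action in _computed.items()
}
# agent_0 keeps its computed action; agent_1's recorded action is overridden.
assert _recorded == {(0, "agent_0"): 1, (0, "agent_1"): 3}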