def _make_batched_step(
    self, name: str, done: bool, reward: float
) -> Tuple[DecisionSteps, TerminalSteps]:
    """
    Build the (DecisionSteps, TerminalSteps) pair for a single agent.

    The observation is built from the agent's goal only while its step count
    is at most ``self.num_show_steps``; after that it is built from 0.

    :param name: Key used to look up the agent's goal, step count, id and
        rewards on ``self``.
    :param done: Whether the agent finished on this step.
    :param reward: Reward reported for this step.
    :return: ``(decision_step, terminal_step)``. When ``done`` is False the
        terminal step is empty. When ``done`` is True the agent is reset, the
        decision step is rebuilt from the post-reset state, and the terminal
        step carries the pre-reset observation, reward and agent id with
        ``interrupted`` set to False.
    """

    def _visible_goal():
        # Evaluated lazily because _reset_agent may change the state it reads.
        if self.step_count[name] <= self.num_show_steps:
            return self.goal[name]
        return 0

    m_vector_obs = self._make_obs(_visible_goal())
    m_reward = np.array([reward], dtype=np.float32)
    m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
    action_mask = self._generate_mask()
    decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id, action_mask)
    terminal_step = TerminalSteps.empty(self.behavior_spec)
    if done:
        # Record the episode's reward before the reset runs.
        self.final_rewards[name].append(self.rewards[name])
        self._reset_agent(name)
        new_vector_obs = self._make_obs(_visible_goal())
        # The "done" flag returned by the reset step is not needed here.
        new_reward, _, new_agent_id, new_action_mask = self._construct_reset_step(
            name
        )
        decision_step = DecisionSteps(
            new_vector_obs, new_reward, new_agent_id, new_action_mask
        )
        # np.bool_ instead of the removed np.bool alias (NumPy >= 1.24).
        terminal_step = TerminalSteps(
            m_vector_obs,
            m_reward,
            np.array([False], dtype=np.bool_),
            m_agent_id,
        )
    return (decision_step, terminal_step)
def _make_batched_step(
    self, name: str, done: bool, reward: float
) -> Tuple[DecisionSteps, TerminalSteps]:
    """
    Build the (DecisionSteps, TerminalSteps) pair for a single agent.

    :param name: Key used to look up the agent's goal and id on ``self``.
    :param done: Whether the agent finished on this step.
    :param reward: Reward reported for this step.
    :return: ``(decision_step, terminal_step)``. When ``done`` is False the
        terminal step is empty. When ``done`` is True the agent is reset, the
        decision step is rebuilt from the post-reset state, and the terminal
        step carries the pre-reset observation, reward and agent id with
        ``interrupted`` set to False.
    """
    m_vector_obs = self._make_obs(self.goal[name])
    m_reward = np.array([reward], dtype=np.float32)
    m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
    action_mask = self._generate_mask()
    decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id, action_mask)
    terminal_step = TerminalSteps.empty(self.behavior_spec)
    if done:
        self._reset_agent(name)
        # The goal is read again after the reset to build the new observation.
        new_vector_obs = self._make_obs(self.goal[name])
        # The "done" flag returned by the reset step is not needed here.
        new_reward, _, new_agent_id, new_action_mask = self._construct_reset_step(
            name
        )
        decision_step = DecisionSteps(
            new_vector_obs, new_reward, new_agent_id, new_action_mask
        )
        # np.bool_ instead of the removed np.bool alias (NumPy >= 1.24).
        terminal_step = TerminalSteps(
            m_vector_obs,
            m_reward,
            np.array([False], dtype=np.bool_),
            m_agent_id,
        )
    return (decision_step, terminal_step)
def add_experiences(
    self,
    decision_steps: DecisionSteps,
    terminal_steps: TerminalSteps,
    worker_id: int,
    previous_action: ActionInfo,
) -> None:
    """
    Adds experiences to each agent's experience history.

    :param decision_steps: current DecisionSteps.
    :param terminal_steps: current TerminalSteps.
    :param worker_id: id of the worker the steps came from; combined with the
        local agent ids to form global agent ids.
    :param previous_action: The outputs of the Policy's get_action method.
    """
    action_outputs = previous_action.outputs
    if action_outputs:
        for entropy_value in action_outputs["entropy"]:
            self._stats_reporter.add_stat("Policy/Entropy", entropy_value)

    # Agent ids are only unique per worker; make them global across workers.
    acting_agent_ids = [
        get_global_agent_id(worker_id, local_agent_id)
        for local_agent_id in previous_action.agent_ids
    ]
    for acting_id in acting_agent_ids:
        # An agent without a last step result has just reset: store nothing.
        if acting_id in self._last_step_result:
            self._last_take_action_outputs[acting_id] = action_outputs

    # Terminal steps: first record the group status/obs of every step, then
    # process each step.
    for step in terminal_steps.values():
        self._add_group_status_and_obs(step, worker_id)
    for step in terminal_steps.values():
        agent_local_id = step.agent_id
        agent_global_id = get_global_agent_id(worker_id, agent_local_id)
        batch_index = terminal_steps.agent_id_to_index[agent_local_id]
        self._process_step(step, worker_id, batch_index)
        # Clear the last seen group obs when agents die.
        self._clear_group_status_and_obs(agent_global_id)

    # Decision steps: same two passes, without the clean-up.
    for step in decision_steps.values():
        self._add_group_status_and_obs(step, worker_id)
    for step in decision_steps.values():
        batch_index = decision_steps.agent_id_to_index[step.agent_id]
        self._process_step(step, worker_id, batch_index)

    for acting_id in acting_agent_ids:
        # If the ID doesn't have a last step result, the agent just reset,
        # don't store the action.
        if acting_id in self._last_step_result and "action" in action_outputs:
            self.policy.save_previous_action([acting_id], action_outputs["action"])
def create_mock_steps(
    num_agents: int = 1,
    num_vector_observations: int = 0,
    num_vis_observations: int = 0,
    action_shape: List[int] = None,
    discrete: bool = False,
    done: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:
    """
    Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations.
    Imitates constant vector/visual observations, rewards, dones, and agents.

    :int num_agents: Number of "agents" to imitate.
    :int num_vector_observations: Length of the vector observation.
    :int num_vis_observations: Number of 84x84x3 visual observations.
    :List[int] action_shape: Action branch sizes when discrete, otherwise
        only the first entry is used. Defaults to [2].
    :bool discrete: Whether or not action space is discrete
    :bool done: Whether all the agents in the batch are done
    :return: (empty DecisionSteps, populated TerminalSteps) when ``done``,
        otherwise (populated DecisionSteps, empty TerminalSteps).
    """
    if action_shape is None:
        action_shape = [2]

    obs_list = [
        np.ones((num_agents, 84, 84, 3), dtype=np.float32)
        for _ in range(num_vis_observations)
    ]
    # NOTE(review): a vector observation of length 1 is skipped by this check
    # although the BehaviorSpec below always lists a vector shape -- confirm
    # whether `> 0` was intended.
    if num_vector_observations > 1:
        obs_list.append(
            np.array(num_agents * [num_vector_observations * [1]], dtype=np.float32)
        )

    action_mask = None
    if discrete:
        # One all-False mask per action branch.
        action_mask = [
            np.array(num_agents * [action_size * [False]])
            for action_size in action_shape
        ]

    reward = np.array(num_agents * [1.0], dtype=np.float32)
    # np.bool_ instead of the removed np.bool alias (NumPy >= 1.24).
    interrupted = np.array(num_agents * [False], dtype=np.bool_)
    agent_id = np.arange(num_agents, dtype=np.int32)
    behavior_spec = BehaviorSpec(
        [(84, 84, 3)] * num_vis_observations + [(num_vector_observations, 0, 0)],
        ActionType.DISCRETE if discrete else ActionType.CONTINUOUS,
        action_shape if discrete else action_shape[0],
    )
    if done:
        return (
            DecisionSteps.empty(behavior_spec),
            TerminalSteps(obs_list, reward, interrupted, agent_id),
        )
    return (
        DecisionSteps(obs_list, reward, agent_id, action_mask),
        TerminalSteps.empty(behavior_spec),
    )
def create_mock_steps(
    num_agents: int,
    observation_shapes: List[Tuple],
    action_shape: Union[int, Tuple[int]] = None,
    discrete: bool = False,
    done: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:
    """
    Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations.
    Imitates constant vector/visual observations, rewards, dones, and agents.

    :int num_agents: Number of "agents" to imitate.
    :List observation_shapes: A List of the observation spaces in your steps
    :action_shape: Action size (int) or per-branch sizes (tuple). Defaults
        to 2.
    :bool discrete: Whether or not action space is discrete
    :bool done: Whether all the agents in the batch are done
    :return: (empty DecisionSteps, populated TerminalSteps) when ``done``,
        otherwise (populated DecisionSteps, empty TerminalSteps).
    """
    if action_shape is None:
        action_shape = 2

    obs_list = [
        np.ones((num_agents,) + _shape, dtype=np.float32)
        for _shape in observation_shapes
    ]

    action_mask = None
    # Masks are only built when the branch sizes are iterable; a plain int
    # action_shape leaves the mask as None even for discrete actions.
    if discrete and isinstance(action_shape, Iterable):
        action_mask = [
            np.array(num_agents * [action_size * [False]])
            for action_size in action_shape  # type: ignore
        ]  # type: ignore

    reward = np.array(num_agents * [1.0], dtype=np.float32)
    # np.bool_ instead of the removed np.bool alias (NumPy >= 1.24).
    interrupted = np.array(num_agents * [False], dtype=np.bool_)
    agent_id = np.arange(num_agents, dtype=np.int32)
    behavior_spec = BehaviorSpec(
        observation_shapes,
        ActionType.DISCRETE if discrete else ActionType.CONTINUOUS,
        action_shape,
    )
    if done:
        return (
            DecisionSteps.empty(behavior_spec),
            TerminalSteps(obs_list, reward, interrupted, agent_id),
        )
    return (
        DecisionSteps(obs_list, reward, agent_id, action_mask),
        TerminalSteps.empty(behavior_spec),
    )
def create_mock_vector_steps(specs, num_agents=1, number_visual_observations=0):
    """
    Build a mock (DecisionSteps, TerminalSteps) pair with constant values.

    Every agent gets the vector observation [1, 2, 3], a reward of 1.0,
    group id 0 and a group reward of 0.0. The TerminalSteps is empty.

    :BehaviorSpecs specs: The BehaviorSpecs used to create the empty
        TerminalSteps.
    :int num_agents: Number of "agents" to imitate.
    :int number_visual_observations: Number of all-zero 8x8x3 visual
        observations appended after the vector observation.
    """
    vector_obs = np.array([[1, 2, 3] * num_agents], dtype=np.float32)
    observations = [vector_obs.reshape(num_agents, 3)]
    if number_visual_observations:
        visual_obs = np.zeros(shape=(num_agents, 8, 8, 3), dtype=np.float32)
        # The same array object is repeated for every visual observation.
        observations.extend([visual_obs] * number_visual_observations)

    agent_ids = np.array(range(0, num_agents))
    agent_rewards = np.array([1.0] * num_agents)
    agent_group_ids = np.array([0] * num_agents)
    agent_group_rewards = np.array([0.0] * num_agents)

    decision = DecisionSteps(
        observations,
        agent_rewards,
        agent_ids,
        None,
        agent_group_ids,
        agent_group_rewards,
    )
    return (decision, TerminalSteps.empty(specs))
def test_empty_terminal_steps():
    """An empty TerminalSteps has one zero-agent array per observation shape."""
    behavior = BehaviorSpec(
        observation_shapes=[(3, 2), (5,)],
        action_spec=ActionSpec.create_continuous(3),
    )
    empty_steps = TerminalSteps.empty(behavior)
    assert len(empty_steps.obs) == 2
    first_obs, second_obs = empty_steps.obs[0], empty_steps.obs[1]
    # The leading (agent) dimension is 0; the rest matches the spec.
    assert first_obs.shape == (0, 3, 2)
    assert second_obs.shape == (0, 5)
def create_mock_steps(
    num_agents: int,
    observation_specs: List[ObservationSpec],
    action_spec: ActionSpec,
    done: bool = False,
    grouped: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:
    """
    Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations.
    Imitates constant vector/visual observations, rewards, dones, and agents.

    :int num_agents: Number of "agents" to imitate.
    :List observation_specs: A List of the observation specs in your steps
    :ActionSpec action_spec: ActionSpec for the agent
    :bool done: Whether all the agents in the batch are done
    :bool grouped: When True every agent gets group id 1, otherwise 0.
    :return: (empty DecisionSteps, populated TerminalSteps) when ``done``,
        otherwise (populated DecisionSteps, empty TerminalSteps).
    """
    obs_list = [
        np.ones((num_agents,) + obs_spec.shape, dtype=np.float32)
        for obs_spec in observation_specs
    ]

    action_mask = None
    if action_spec.is_discrete():
        # One all-False mask per discrete branch.
        action_mask = [
            np.array(num_agents * [action_size * [False]])
            for action_size in action_spec.discrete_branches  # type: ignore
        ]  # type: ignore

    reward = np.array(num_agents * [1.0], dtype=np.float32)
    # np.bool_ instead of the removed np.bool alias (NumPy >= 1.24).
    interrupted = np.array(num_agents * [False], dtype=np.bool_)
    agent_id = np.arange(num_agents, dtype=np.int32)
    _gid = 1 if grouped else 0
    group_id = np.array(num_agents * [_gid], dtype=np.int32)
    group_reward = np.array(num_agents * [0.0], dtype=np.float32)
    behavior_spec = BehaviorSpec(observation_specs, action_spec)
    if done:
        return (
            DecisionSteps.empty(behavior_spec),
            TerminalSteps(
                obs_list, reward, interrupted, agent_id, group_id, group_reward
            ),
        )
    return (
        DecisionSteps(
            obs_list, reward, agent_id, action_mask, group_id, group_reward
        ),
        TerminalSteps.empty(behavior_spec),
    )
def test_empty_terminal_steps():
    """An empty TerminalSteps has one zero-agent array per sensor spec."""
    sensor_specs = create_sensor_specs_with_shapes([(3, 2), (5,)])
    continuous_actions = ActionSpec.create_continuous(3)
    behavior = BehaviorSpec(
        sensor_specs=sensor_specs,
        action_spec=continuous_actions,
    )
    empty_steps = TerminalSteps.empty(behavior)
    assert len(empty_steps.obs) == 2
    first_obs, second_obs = empty_steps.obs[0], empty_steps.obs[1]
    # The leading (agent) dimension is 0; the rest matches the sensor shape.
    assert first_obs.shape == (0, 3, 2)
    assert second_obs.shape == (0, 5)
def test_empty_terminal_steps():
    """An empty TerminalSteps has one zero-agent array per observation shape."""
    behavior = BehaviorSpec(
        observation_shapes=[(3, 2), (5,)],
        action_type=ActionType.CONTINUOUS,
        action_shape=3,
    )
    empty_steps = TerminalSteps.empty(behavior)
    assert len(empty_steps.obs) == 2
    first_obs, second_obs = empty_steps.obs[0], empty_steps.obs[1]
    # The leading (agent) dimension is 0; the rest matches the spec.
    assert first_obs.shape == (0, 3, 2)
    assert second_obs.shape == (0, 5)
def add_experiences(
    self,
    decision_steps: DecisionSteps,
    terminal_steps: TerminalSteps,
    worker_id: int,
    previous_action: ActionInfo,
) -> None:
    """
    Adds experiences to each agent's experience history.

    :param decision_steps: current DecisionSteps.
    :param terminal_steps: current TerminalSteps.
    :param worker_id: id of the worker the steps came from; combined with the
        local agent ids to form global agent ids.
    :param previous_action: The outputs of the Policy's get_action method.
    """
    action_outputs = previous_action.outputs
    if action_outputs:
        for entropy_value in action_outputs["entropy"]:
            self.stats_reporter.add_stat("Policy/Entropy", entropy_value)

    # Agent ids are only unique per worker; make them global across workers.
    acting_agent_ids = [
        get_global_agent_id(worker_id, local_agent_id)
        for local_agent_id in previous_action.agent_ids
    ]
    for acting_id in acting_agent_ids:
        # An agent without a last step result has just reset: store nothing.
        if acting_id in self.last_step_result:
            self.last_take_action_outputs[acting_id] = action_outputs

    # Terminal steps are processed before decision steps.
    for batch in (terminal_steps, decision_steps):
        for step in batch.values():
            agent_local_id = step.agent_id
            agent_global_id = get_global_agent_id(worker_id, agent_local_id)
            self._process_step(
                step, agent_global_id, batch.agent_id_to_index[agent_local_id]
            )

    for acting_id in acting_agent_ids:
        # If the ID doesn't have a last step result, the agent just reset,
        # don't store the action.
        if acting_id in self.last_step_result and "action" in action_outputs:
            self.policy.save_previous_action([acting_id], action_outputs["action"])
def _update_state(self, output: UnityRLOutputProto) -> None:
    """
    Refresh the stored step state of every known behavior from ``output``.

    Behaviors present in ``output.agentInfos`` are converted with
    ``steps_from_proto``; the others get empty DecisionSteps/TerminalSteps.
    The side channel payload of ``output`` is then handed to the side
    channel manager.
    """
    for brain_name in self._env_specs:
        spec = self._env_specs[brain_name]
        if brain_name not in output.agentInfos:
            # No agent reported for this behavior on this step.
            self._env_state[brain_name] = (
                DecisionSteps.empty(spec),
                TerminalSteps.empty(spec),
            )
            continue
        agent_infos = output.agentInfos[brain_name].value
        self._env_state[brain_name] = steps_from_proto(agent_infos, spec)
    self._side_channel_manager.process_side_channel_message(output.side_channel)
def test_terminal_steps():
    """Check id-to-index mapping, per-agent lookup and iteration of TerminalSteps."""
    ts = TerminalSteps(
        obs=[np.array(range(12), dtype=np.float32).reshape(3, 4)],
        reward=np.array(range(3), dtype=np.float32),
        agent_id=np.array(range(10, 13), dtype=np.int32),
        # np.bool_ instead of the removed np.bool alias (NumPy >= 1.24).
        interrupted=np.array([1, 0, 1], dtype=np.bool_),
    )

    # Agent ids 10..12 map to batch indices 0..2.
    assert ts.agent_id_to_index[10] == 0
    assert ts.agent_id_to_index[11] == 1
    assert ts.agent_id_to_index[12] == 2

    # Indexing by agent id exposes that agent's interrupted flag.
    assert ts[10].interrupted
    assert not ts[11].interrupted
    assert ts[12].interrupted

    # Unknown agent ids raise KeyError.
    with pytest.raises(KeyError):
        assert ts.agent_id_to_index[-1] == -1

    # Iterating yields agent ids that the index mapping knows.
    for agent_id in ts:
        assert ts.agent_id_to_index[agent_id] in range(3)
def get_steps(self, behavior_name):
    """
    Merge the per-agent step results into one batched pair of steps.

    This gets the individual DecisionSteps and TerminalSteps from the envs
    and merges them into a batch to be sent to the AgentProcessor.

    Agents not marked done contribute to the DecisionSteps. Agents whose
    terminal step has a reward and that are listed in ``self.just_died``
    contribute to the TerminalSteps and are removed from ``self.just_died``.
    Every batched agent gets group id 1 and terminal entries are never
    marked interrupted.

    :param behavior_name: Behavior name; ``behavior_name + str(i)`` is the
        key of agent ``i`` in ``self.envs`` and ``self.dones``.
    :return: ``(decision_step, terminal_step)`` for the whole batch.
    """

    def _stack_obs(batched, incoming):
        # The first contribution seeds one entry per observation; later ones
        # are concatenated onto it along axis 0.
        if len(batched) > 0:
            for obs_index, obs in enumerate(incoming):
                batched[obs_index] = np.concatenate(
                    (batched[obs_index], obs), axis=0
                )
        else:
            batched.extend(incoming)

    dec_vec_obs = []
    dec_reward = []
    dec_group_reward = []
    dec_agent_id = []
    dec_group_id = []
    ter_vec_obs = []
    ter_reward = []
    ter_group_reward = []
    ter_agent_id = []
    ter_group_id = []
    interrupted = []
    action_mask = None

    for i in range(self.num_agents):
        name_and_num = behavior_name + str(i)
        env = self.envs[name_and_num]
        _dec, _term = env.step_result[behavior_name]

        if not self.dones[name_and_num]:
            dec_agent_id.append(i)
            dec_group_id.append(1)
            _stack_obs(dec_vec_obs, _dec.obs)
            dec_reward.append(_dec.reward[0])
            dec_group_reward.append(_dec.group_reward[0])
            if _dec.action_mask is not None:
                # NOTE(review): only the first mask entry (index 0) is
                # batched -- confirm that a single branch is intended.
                if action_mask is None:
                    action_mask = []
                if len(action_mask) > 0:
                    action_mask[0] = np.concatenate(
                        (action_mask[0], _dec.action_mask[0]), axis=0
                    )
                else:
                    action_mask.append(_dec.action_mask[0])

        if len(_term.reward) > 0 and name_and_num in self.just_died:
            ter_agent_id.append(i)
            ter_group_id.append(1)
            _stack_obs(ter_vec_obs, _term.obs)
            ter_reward.append(_term.reward[0])
            ter_group_reward.append(_term.group_reward[0])
            interrupted.append(False)
            # Each death is reported only once.
            self.just_died.remove(name_and_num)

    decision_step = DecisionSteps(
        dec_vec_obs,
        dec_reward,
        dec_agent_id,
        action_mask,
        dec_group_id,
        dec_group_reward,
    )
    terminal_step = TerminalSteps(
        ter_vec_obs,
        ter_reward,
        interrupted,
        ter_agent_id,
        ter_group_id,
        ter_group_reward,
    )
    return (decision_step, terminal_step)
def steps_from_proto(
    agent_info_list: Collection[AgentInfoProto],  # pylint: disable=unsubscriptable-object
    behavior_spec: BehaviorSpec,
) -> Tuple[DecisionSteps, TerminalSteps]:
    """
    Convert a collection of agent infos into DecisionSteps and TerminalSteps.

    Agent infos with ``done`` unset go to the DecisionSteps, those with
    ``done`` set go to the TerminalSteps.

    :param agent_info_list: Agent infos reported for one behavior.
    :param behavior_spec: Spec providing the sensor specs and action spec.
    :return: ``(decision_steps, terminal_steps)``.
    :raises: whatever ``_raise_on_nan_and_inf`` raises for the reward arrays.
    """
    decision_agent_info_list = [
        agent_info for agent_info in agent_info_list if not agent_info.done
    ]
    terminal_agent_info_list = [
        agent_info for agent_info in agent_info_list if agent_info.done
    ]

    decision_obs_list: List[np.ndarray] = []
    terminal_obs_list: List[np.ndarray] = []
    for obs_index, sensor_specs in enumerate(behavior_spec.sensor_specs):
        # Rank-3 shapes are treated as visual observations, all others as
        # vector observations.
        is_visual = len(sensor_specs.shape) == 3
        if is_visual:
            obs_shape = cast(Tuple[int, int, int], sensor_specs.shape)
            decision_obs_list.append(
                _process_visual_observation(
                    obs_index, obs_shape, decision_agent_info_list
                )
            )
            terminal_obs_list.append(
                _process_visual_observation(
                    obs_index, obs_shape, terminal_agent_info_list
                )
            )
        else:
            decision_obs_list.append(
                _process_vector_observation(
                    obs_index, sensor_specs.shape, decision_agent_info_list
                )
            )
            terminal_obs_list.append(
                _process_vector_observation(
                    obs_index, sensor_specs.shape, terminal_agent_info_list
                )
            )

    decision_rewards = np.array(
        [agent_info.reward for agent_info in decision_agent_info_list],
        dtype=np.float32,
    )
    terminal_rewards = np.array(
        [agent_info.reward for agent_info in terminal_agent_info_list],
        dtype=np.float32,
    )
    _raise_on_nan_and_inf(decision_rewards, "rewards")
    _raise_on_nan_and_inf(terminal_rewards, "rewards")

    # np.bool_ instead of the removed np.bool alias (NumPy >= 1.24).
    max_step = np.array(
        [agent_info.max_step_reached for agent_info in terminal_agent_info_list],
        dtype=np.bool_,
    )
    decision_agent_id = np.array(
        [agent_info.id for agent_info in decision_agent_info_list], dtype=np.int32
    )
    terminal_agent_id = np.array(
        [agent_info.id for agent_info in terminal_agent_info_list], dtype=np.int32
    )

    action_mask = None
    if behavior_spec.action_spec.discrete_size > 0:
        # Fixed: the original wrapped the predicate in a one-element list,
        # which is always truthy, so this only tested "at least one agent".
        if any(
            agent_info.action_mask is not None
            for agent_info in decision_agent_info_list
        ):
            n_agents = len(decision_agent_info_list)
            a_size = np.sum(behavior_spec.action_spec.discrete_branches)
            # True in mask_matrix means the action is allowed. Rows of agents
            # without a mask of the expected length stay all-allowed.
            mask_matrix = np.ones((n_agents, a_size), dtype=np.bool_)
            for agent_index, agent_info in enumerate(decision_agent_info_list):
                if agent_info.action_mask is not None:
                    if len(agent_info.action_mask) == a_size:
                        mask_matrix[agent_index, :] = [
                            not agent_info.action_mask[k] for k in range(a_size)
                        ]
            # Invert back so that True means masked, then split per branch.
            action_mask = np.logical_not(mask_matrix)
            indices = _generate_split_indices(
                behavior_spec.action_spec.discrete_branches
            )
            action_mask = np.split(action_mask, indices, axis=1)

    return (
        DecisionSteps(
            decision_obs_list, decision_rewards, decision_agent_id, action_mask
        ),
        TerminalSteps(
            terminal_obs_list, terminal_rewards, max_step, terminal_agent_id
        ),
    )