def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1], [0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1], [0.1]])),
    }
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=2,
        observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        value=[0.1, 0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_steps.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(5):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

    # Test empty steps
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=0,
        observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
        action_spec=ActionSpec.create_continuous(2),
    )
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    # Assert that the AgentProcessor is still empty
    assert len(processor.experience_buffers[0]) == 0
def test_end_episode():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
    }
    mock_decision_step, mock_terminal_step = mb.create_mock_steps(
        num_agents=1,
        observation_shapes=[(8,)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1]])),
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_step.agent_id,
    )

    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
    )
    # Run 3 trajectories, with different workers (to simulate different agents)
    remove_calls = []
    for _ep in range(3):
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))
        for _ in range(5):
            processor.add_experiences(
                mock_decision_step, mock_terminal_step, _ep, fake_action_info
            )
            # Make sure we don't add experiences from the prior agents after the done

    # Call end episode
    processor.end_episode()
    # Check that we removed every agent
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
def get_action(
    self, decision_requests: DecisionSteps, worker_id: int = 0
) -> ActionInfo:
    """
    Decides actions given observation information, and takes them in the environment.
    :param decision_requests: A dictionary of brain names and DecisionSteps from environment.
    :param worker_id: In parallel environment training, the unique id of the environment worker
        that the DecisionSteps came from. Used to construct a globally unique id for each agent.
    :return: an ActionInfo containing action, memories, values and an object
        to be passed to add experiences
    """
    if len(decision_requests) == 0:
        return ActionInfo.empty()

    global_agent_ids = [
        get_global_agent_id(worker_id, int(agent_id))
        for agent_id in decision_requests.agent_id
    ]  # For 1-D array, the iterator order is correct.

    run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
        decision_requests, global_agent_ids
    )
    self.save_memories(global_agent_ids, run_out.get("memory_out"))

    # For compatibility with buffer changes for hybrid action support
    if "log_probs" in run_out:
        log_probs_tuple = LogProbsTuple()
        if self.behavior_spec.action_spec.is_continuous():
            log_probs_tuple.add_continuous(run_out["log_probs"])
        else:
            log_probs_tuple.add_discrete(run_out["log_probs"])
        run_out["log_probs"] = log_probs_tuple
    if "action" in run_out:
        action_tuple = ActionTuple()
        env_action_tuple = ActionTuple()
        if self.behavior_spec.action_spec.is_continuous():
            action_tuple.add_continuous(run_out["pre_action"])
            env_action_tuple.add_continuous(run_out["action"])
        else:
            action_tuple.add_discrete(run_out["action"])
            env_action_tuple.add_discrete(run_out["action"])
        run_out["action"] = action_tuple
        run_out["env_action"] = env_action_tuple

    self.check_nan_action(run_out.get("action"))
    return ActionInfo(
        action=run_out.get("action"),
        env_action=run_out.get("env_action"),
        value=run_out.get("value"),
        outputs=run_out,
        agent_ids=decision_requests.agent_id,
    )
def run(self):
    # reset unity environment before start
    self.unity_env.reset()

    while True:
        decision_steps, terminal_steps = self.unity_env.get_steps(
            self._behavior_name)

        # actions collected from all envs/"agents" in decision steps
        actions = []

        # process all envs/"agents" that finished their episodes in terminal
        # steps
        for agent_id in terminal_steps.agent_id:
            # first check if a new episode needs to be started
            if agent_id not in self.agentID_to_episodeID.keys():
                episode_id = self.start_episode()
                self.agentID_to_episodeID[agent_id] = episode_id
            episode_id = self.agentID_to_episodeID[agent_id]

            # get observation, reward and info
            obs = terminal_steps[agent_id].obs
            obs = obs[0] if len(obs) == 1 else obs
            reward = terminal_steps[agent_id].reward
            info = {"interrupted": terminal_steps[agent_id].interrupted}
            self.log_returns(episode_id, reward, info)

            # end episode and remove agent_id from self.agentID_to_episodeID
            self.end_episode(episode_id, obs)
            self.agentID_to_episodeID.pop(agent_id)

        # process all envs/"agents" in decision steps
        for agent_id in decision_steps.agent_id:
            # first check if a new episode needs to be started
            if agent_id not in self.agentID_to_episodeID.keys():
                episode_id = self.start_episode()
                self.agentID_to_episodeID[agent_id] = episode_id
            episode_id = self.agentID_to_episodeID[agent_id]

            # get observation and reward
            obs = decision_steps[agent_id].obs
            obs = obs[0] if len(obs) == 1 else obs
            reward = decision_steps[agent_id].reward

            # log reward and request action
            self.log_returns(episode_id, reward)
            actions.append(self.get_action(episode_id, obs))

        # set actions in Unity environment
        if actions:
            if actions[0].dtype == np.float32:
                action_tuple = ActionTuple(continuous=np.array(actions))
            else:
                action_tuple = ActionTuple(discrete=np.array(actions))
            self.unity_env.set_actions(self._behavior_name, action_tuple)

        self.unity_env.step()
def evaluate_population(pop, env, net_shapes, inference=False):
    """Evaluation of a whole population"""
    behavior_name = list(env.behavior_specs)[0]
    spec = env.behavior_specs[behavior_name]
    action_spec = spec.action_spec

    fitnesses = [0.] * len(pop)
    env.reset()
    decision_steps, terminal_steps = env.get_steps(behavior_name)

    # Play in the environment for a fixed number of steps
    episode_length = 1000
    while episode_length > 1:
        for agent_id in terminal_steps.agent_id:
            fitnesses[agent_id] += terminal_steps[agent_id].reward
        if len(terminal_steps) == len(pop):
            break

        # Generate an action for all agents
        actions = np.empty((len(decision_steps), action_spec.continuous_size))
        for agent_id in decision_steps.agent_id:
            state = decision_steps[agent_id].obs[0]
            fitnesses[agent_id] += decision_steps[agent_id].reward
            individual = params_reshape(net_shapes, pop[agent_id])
            if action_spec.is_discrete():
                # returns probability for each action
                action_discrete_probs = get_action(individual, state)
                # choose action with highest probability
                actions_discrete = np.argmax(action_discrete_probs)
                actions[agent_id] = actions_discrete
            elif action_spec.is_continuous():
                # returns probability for each action
                action_continuous_probs = get_action(individual, state, continuous=True)
                actions[agent_id] = action_continuous_probs

        if episode_length % 251 == 0 and not inference:
            # reset to change initial position from time to time
            env.reset()
        else:
            if action_spec.is_discrete():
                action = ActionTuple(discrete=actions)
            elif action_spec.is_continuous():
                action = ActionTuple(continuous=actions)
            env.set_actions(behavior_name, action)
            env.step()

        decision_steps, terminal_steps = env.get_steps(behavior_name)
        episode_length -= 1

    return fitnesses
def step(self, action: List[Any]) -> GymStepResult:
    """Run one timestep of the environment's dynamics. When end of
    episode is reached, you are responsible for calling `reset()`
    to reset this environment's state.
    Accepts an action and returns a tuple (observation, reward, done, info).
    Args:
        action (object/list): an action provided by the agent
    Returns:
        observation (object/list): agent's observation of the current environment
        reward (float/list) : amount of reward returned after previous action
        done (boolean/list): whether the episode has ended.
        info (dict): contains auxiliary diagnostic information.
    """
    if self._flattener is not None:
        # Translate action into list
        action = self._flattener.lookup_action(action)

    action = np.array(action).reshape((1, self.action_size))

    action_tuple = ActionTuple()
    if self.group_spec.action_spec.is_continuous():
        action_tuple.add_continuous(action)
    else:
        action_tuple.add_discrete(action)
    self._env.set_actions(self.name, action_tuple)

    self._env.step()
    decision_step, terminal_step = self._env.get_steps(self.name)
    self._check_agents(max(len(decision_step), len(terminal_step)))
    if len(terminal_step) != 0:
        # The agent is done
        self.game_over = True
        return self._single_step(terminal_step)
    else:
        return self._single_step(decision_step)
def test_step(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0
    )
    env = UnityEnvironment(" ")
    spec = env.behavior_specs["RealFakeBrain"]
    env.step()
    decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
    n_agents = len(decision_steps)
    env.set_actions("RealFakeBrain", spec.action_spec.empty_action(n_agents))
    env.step()
    with pytest.raises(UnityActionException):
        env.set_actions("RealFakeBrain", spec.action_spec.empty_action(n_agents - 1))
    decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
    n_agents = len(decision_steps)
    _empty_act = spec.action_spec.empty_action(n_agents)
    next_action = ActionTuple(_empty_act.continuous - 1, _empty_act.discrete - 1)
    env.set_actions("RealFakeBrain", next_action)
    env.step()

    env.close()
    assert isinstance(decision_steps, DecisionSteps)
    assert isinstance(terminal_steps, TerminalSteps)
    assert len(spec.observation_specs) == len(decision_steps.obs)
    assert len(spec.observation_specs) == len(terminal_steps.obs)
    for spec, obs in zip(spec.observation_specs, decision_steps.obs):
        assert (n_agents,) + spec.shape == obs.shape
    assert 0 in decision_steps
    assert 2 in terminal_steps
def solve(self) -> None:
    self.reset()
    for _ in range(self.n_demos):
        for name in self.names:
            if self.action_spec.discrete_size > 0:
                self.action[name] = ActionTuple(
                    np.array([], dtype=np.float32),
                    np.array(
                        [[1]] if self.goal[name] > 0 else [[0]], dtype=np.int32
                    ),
                )
            else:
                self.action[name] = ActionTuple(
                    np.array([[float(self.goal[name])]], dtype=np.float32),
                    np.array([], dtype=np.int32),
                )
        self.step()
def _create_action_info(num_agents: int, agent_ids: List[str]) -> ActionInfo:
    fake_action_outputs = {
        "action": ActionTuple(
            continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
        ),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(
            continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
        ),
    }
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]] * num_agents, dtype=np.float32)),
        env_action=ActionTuple(
            continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
        ),
        outputs=fake_action_outputs,
        agent_ids=agent_ids,
    )
    return fake_action_info
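# A minimal usage sketch of the helper above (the agent ids are made up for
# illustration, not taken from a real environment): build a fake ActionInfo for
# two agents, the way the surrounding AgentProcessor tests would consume it.
fake_info = _create_action_info(num_agents=2, agent_ids=["agent_0", "agent_1"])
assert fake_info.action.continuous.shape == (2, 1)
assert fake_info.outputs["log_probs"].continuous.shape == (2, 1)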
def ConvertList2DiscreteAction(self, arr, behavior_name):
    '''
    Input: a list or 1-D array holding one discrete action, e.g. [3].
    Do NOT pass a 2-D array or a list such as [(0, 2)].
    Output: an ActionTuple with only the discrete branch set.
    '''
    actionList = []
    actionList.append(arr)
    _discrete = np.array(actionList, dtype=np.int32)
    action = ActionTuple(discrete=_discrete)
    return action
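# A hypothetical usage sketch of the converter above; `agent_wrapper`, `env`
# (a connected UnityEnvironment) and `behavior_name` are assumptions, and the
# behavior is assumed to have a single agent with one discrete branch.
action = agent_wrapper.ConvertList2DiscreteAction([3], behavior_name)
env.set_actions(behavior_name, action)  # expects one action row per active agent
env.step()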
def step(self, action):
    # Reshape to (10, 9) as needed for the wrapper
    action = action.reshape((10, 9))
    # ActionTuple's first positional argument is the continuous action array
    act = ActionTuple(action)
    self.env.set_actions(self.behavior_name, act)
    self.env.step()
    decision_steps, terminal_steps = self.env.get_steps(self.behavior_name)
    observation, reward = self._decision_to_observation(decision_steps)
    done = len(decision_steps) == 0
    info = {}
    return observation, reward, done, info
def to_action_tuple(self, clip: bool = False) -> ActionTuple:
    """
    Returns an ActionTuple
    """
    action_tuple = ActionTuple()
    if self.continuous_tensor is not None:
        _continuous_tensor = self.continuous_tensor
        if clip:
            _continuous_tensor = torch.clamp(_continuous_tensor, -3, 3) / 3
        continuous = ModelUtils.to_numpy(_continuous_tensor)
        action_tuple.add_continuous(continuous)
    if self.discrete_list is not None:
        discrete = ModelUtils.to_numpy(self.discrete_tensor[:, 0, :])
        action_tuple.add_discrete(discrete)
    return action_tuple
def step(
    self, actions: List[np.ndarray]
) -> GymResult:  # todo add support for ActionTuple(continuous, discrete)
    curr_action_idx = 0
    for team in self.team_names:
        # print(f'start idx={curr_action_idx}. End idx = {curr_action_idx + self.agent_per_team[team]}')
        # print(f'len action list:{len(actions[curr_action_idx:curr_action_idx + self.agent_per_team[team]])}')
        action = np.vstack(
            actions[curr_action_idx:curr_action_idx + self.agent_per_team[team]]
        )
        # print(f'actions shape:{action.shape}')
        self._e.set_actions(team, ActionTuple(action))
        curr_action_idx += self.agent_per_team[team]
    self._e.step()
    self.n += 1
    return self.collect_obs()
def set_actions(self, behavior_name, action):
    # The ActionTuple contains the actions for all n_agents. This
    # slices the ActionTuple into an action tuple for each environment
    # and sets it. The index j is used to ignore agents that have already
    # reached done.
    j = 0
    for i in range(self.num_agents):
        _act = ActionTuple()
        name_and_num = behavior_name + str(i)
        env = self.envs[name_and_num]
        if not self.dones[name_and_num]:
            if self.action_spec.continuous_size > 0:
                _act.add_continuous(action.continuous[j:j + 1])
            if self.action_spec.discrete_size > 0:
                _disc_list = [action.discrete[j, :]]
                _act.add_discrete(np.array(_disc_list))
            j += 1
        env.action[behavior_name] = _act
def test_set_action_multi_agent():
    engine_config_channel = EngineConfigurationChannel()
    env = default_registry[BALL_ID].make(
        base_port=6001,
        worker_id=0,
        no_graphics=True,
        side_channels=[engine_config_channel],
    )
    engine_config_channel.set_configuration_parameters(time_scale=100)
    for _ in range(3):
        env.reset()
        behavior_name = list(env.behavior_specs.keys())[0]
        d, t = env.get_steps(behavior_name)
        for _ in range(50):
            action = np.ones((len(d), 2))
            action_tuple = ActionTuple()
            action_tuple.add_continuous(action)
            env.set_actions(behavior_name, action_tuple)
            env.step()
            d, t = env.get_steps(behavior_name)
    env.close()
def step(self, action):
    """Runs one timestep of the environment's dynamics.
    Once an episode is done, reset() has to be called manually.

    Arguments:
        action {List} -- A list of at least one discrete action to be executed by the agent

    Returns:
        {numpy.ndarray} -- Visual observation
        {numpy.ndarray} -- Vector observation
        {float} -- (Total) Scalar reward signaled by the environment
        {bool} -- Whether the episode of the environment terminated
        {dict} -- Further episode information (e.g. cumulative reward) retrieved from the environment once an episode completed
    """
    # Carry out the agent's action
    action_tuple = ActionTuple()
    action_tuple.add_discrete(np.asarray(action).reshape([1, -1]))
    self._env.set_actions(self._behavior_name, action_tuple)
    self._env.step()
    info, terminal_info = self._env.get_steps(self._behavior_name)

    # Process step results
    vis_obs, vec_obs, reward, done = self._process_agent_info(info, terminal_info)
    self._rewards.append(reward)

    # Record trajectory data
    if self._record:
        self._trajectory["vis_obs"].append(vis_obs * 255)
        self._trajectory["vec_obs"].append(vec_obs)
        self._trajectory["rewards"].append(reward)
        self._trajectory["actions"].append(action)

    # Episode information
    if done:
        info = {"reward": sum(self._rewards), "length": len(self._rewards)}
    else:
        info = None

    return vis_obs, vec_obs, reward, done, info
def _add_group_status_and_obs(
    self, step: Union[TerminalStep, DecisionStep], worker_id: int
) -> None:
    """
    Takes a TerminalStep or DecisionStep and adds the information in it
    to self.group_status. This information can then be retrieved
    when constructing trajectories to get the status of group mates. Also stores the current
    observation into current_group_obs, to be used to get the next group observations
    for bootstrapping.
    :param step: TerminalStep or DecisionStep
    :param worker_id: Worker ID of this particular environment. Used to generate a
        global group id.
    """
    global_agent_id = get_global_agent_id(worker_id, step.agent_id)
    stored_decision_step, idx = self._last_step_result.get(
        global_agent_id, (None, None)
    )
    stored_take_action_outputs = self._last_take_action_outputs.get(
        global_agent_id, None
    )
    if stored_decision_step is not None and stored_take_action_outputs is not None:
        # 0, the default group_id, means that the agent doesn't belong to an agent group.
        # If 0, don't add any groupmate information.
        if step.group_id > 0:
            global_group_id = get_global_group_id(worker_id, step.group_id)
            stored_actions = stored_take_action_outputs["action"]
            action_tuple = ActionTuple(
                continuous=stored_actions.continuous[idx],
                discrete=stored_actions.discrete[idx],
            )
            group_status = AgentStatus(
                obs=stored_decision_step.obs,
                reward=step.reward,
                action=action_tuple,
                done=isinstance(step, TerminalStep),
            )
            self._group_status[global_group_id][global_agent_id] = group_status
            self._current_group_obs[global_group_id][global_agent_id] = step.obs
def step(self, action: List[Any]) -> GymStepResult:
    """Run one timestep of the environment's dynamics. When end of
    episode is reached, you are responsible for calling `reset()`
    to reset this environment's state.
    Accepts an action and returns a tuple (observation, reward, done, info).
    Args:
        action (object/list): an action provided by the agent
    Returns:
        observation (object/list): agent's observation of the current environment
        reward (float/list) : amount of reward returned after previous action
        done (boolean/list): whether the episode has ended.
        info (dict): contains auxiliary diagnostic information.
    """
    if self._flattener is not None:
        # Translate action into list
        action = self._flattener.lookup_action(action)

    action = np.array(action).reshape((-1, self.action_size))

    action_tuple = ActionTuple()
    if self.group_spec.action_spec.is_continuous():
        action_tuple.add_continuous(action)
    else:
        action_tuple.add_discrete(action)
    self._env.set_actions(self.name, action_tuple)

    self._env.step()
    decision_step, terminal_step = self._env.get_steps(self.name)
    try:
        return self.combine_steps(decision_step, terminal_step)
    except KeyError:
        self.key_error_counter += 1
        # print(f"{self.key_error_counter}th KeyError in UnityToMultiGymWrapper. Previous step returned.")
        return self.last_stepreturn
def make_fake_trajectory(
    length: int,
    observation_specs: List[ObservationSpec],
    action_spec: ActionSpec,
    max_step_complete: bool = False,
    memory_size: int = 10,
    num_other_agents_in_group: int = 0,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.
    """
    steps_list = []
    action_size = action_spec.discrete_size + action_spec.continuous_size
    for _i in range(length - 1):
        obs = []
        for obs_spec in observation_specs:
            obs.append(np.ones(obs_spec.shape, dtype=np.float32))
        reward = 1.0
        done = False
        action = ActionTuple(
            continuous=np.zeros(action_spec.continuous_size, dtype=np.float32),
            discrete=np.zeros(action_spec.discrete_size, dtype=np.int32),
        )
        action_probs = LogProbsTuple(
            continuous=np.ones(action_spec.continuous_size, dtype=np.float32),
            discrete=np.ones(action_spec.discrete_size, dtype=np.float32),
        )
        action_mask = (
            [
                [False for _ in range(branch)]
                for branch in action_spec.discrete_branches
            ]  # type: ignore
            if action_spec.is_discrete()
            else None
        )
        if action_spec.is_discrete():
            prev_action = np.ones(action_size, dtype=np.int32)
        else:
            prev_action = np.ones(action_size, dtype=np.float32)
        max_step = False
        memory = np.ones(memory_size, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        group_status = []
        for _ in range(num_other_agents_in_group):
            group_status.append(AgentStatus(obs, reward, action, done))
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=max_step,
            memory=memory,
            group_status=group_status,
            group_reward=0,
        )
        steps_list.append(experience)
    obs = []
    for obs_spec in observation_specs:
        obs.append(np.ones(obs_spec.shape, dtype=np.float32))
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_mask=action_mask,
        prev_action=prev_action,
        interrupted=max_step_complete,
        memory=memory,
        group_status=group_status,
        group_reward=0,
    )
    steps_list.append(last_experience)
    return Trajectory(
        steps=steps_list,
        agent_id=agent_id,
        behavior_id=behavior_id,
        next_obs=obs,
        next_group_obs=[obs] * num_other_agents_in_group,
    )
"""
Created on Fri May 7 11:06:32 2021

@author: Win10
"""
import numpy as np
import matplotlib.pyplot as plt

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import ActionTuple

# Run this script in Spyder first, then hit the Play button in the Unity editor.
print('Loading environment, prepare to hit the Play button!')
env = UnityEnvironment(file_name=None, side_channels=[])

# Start interacting with the environment.
env.reset()
behavior_name = list(env.behavior_specs)[0]
spec = env.behavior_specs[behavior_name]
# vis_obs = any(len(spec.shape) == 3 for spec in spec.observation_specs)
decision_steps, terminal_steps = env.get_steps(behavior_name)

action = ActionTuple(np.array([[1.0, 1.0]], dtype=np.float32))
action = spec.action_spec.random_action(len(decision_steps))
env.set_actions(behavior_name, action)
env.step()
decision_steps2, terminal_steps2 = env.get_steps(behavior_name)
env.close()
def _process_step(
    self, step: Union[TerminalStep, DecisionStep], worker_id: int, index: int
) -> None:
    terminated = isinstance(step, TerminalStep)
    global_agent_id = get_global_agent_id(worker_id, step.agent_id)
    global_group_id = get_global_group_id(worker_id, step.group_id)
    stored_decision_step, idx = self._last_step_result.get(
        global_agent_id, (None, None)
    )
    stored_take_action_outputs = self._last_take_action_outputs.get(
        global_agent_id, None
    )
    if not terminated:
        # Index is needed to grab from last_take_action_outputs
        self._last_step_result[global_agent_id] = (step, index)

    # This state is the consequence of a past action
    if stored_decision_step is not None and stored_take_action_outputs is not None:
        obs = stored_decision_step.obs
        if self.policy.use_recurrent:
            memory = self.policy.retrieve_previous_memories([global_agent_id])[0, :]
        else:
            memory = None
        done = terminated  # Since this is an ongoing step
        interrupted = step.interrupted if terminated else False
        # Add the outputs of the last eval
        stored_actions = stored_take_action_outputs["action"]
        action_tuple = ActionTuple(
            continuous=stored_actions.continuous[idx],
            discrete=stored_actions.discrete[idx],
        )
        stored_action_probs = stored_take_action_outputs["log_probs"]
        log_probs_tuple = LogProbsTuple(
            continuous=stored_action_probs.continuous[idx],
            discrete=stored_action_probs.discrete[idx],
        )
        action_mask = stored_decision_step.action_mask
        prev_action = self.policy.retrieve_previous_action([global_agent_id])[0, :]

        # Assemble teammate_obs. If none saved, then it will be an empty list.
        group_statuses = []
        for _id, _mate_status in self._group_status[global_group_id].items():
            if _id != global_agent_id:
                group_statuses.append(_mate_status)

        experience = AgentExperience(
            obs=obs,
            reward=step.reward,
            done=done,
            action=action_tuple,
            action_probs=log_probs_tuple,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=interrupted,
            memory=memory,
            group_status=group_statuses,
            group_reward=step.group_reward,
        )
        # Add the value outputs if needed
        self._experience_buffers[global_agent_id].append(experience)
        self._episode_rewards[global_agent_id] += step.reward
        if not terminated:
            self._episode_steps[global_agent_id] += 1

        # Add a trajectory segment to the buffer if terminal or the length has reached the time horizon
        if (
            len(self._experience_buffers[global_agent_id])
            >= self._max_trajectory_length
            or terminated
        ):
            next_obs = step.obs
            next_group_obs = []
            for _id, _obs in self._current_group_obs[global_group_id].items():
                if _id != global_agent_id:
                    next_group_obs.append(_obs)

            trajectory = Trajectory(
                steps=self._experience_buffers[global_agent_id],
                agent_id=global_agent_id,
                next_obs=next_obs,
                next_group_obs=next_group_obs,
                behavior_id=self._behavior_id,
            )
            for traj_queue in self._trajectory_queues:
                traj_queue.put(trajectory)
            self._experience_buffers[global_agent_id] = []
    if terminated:
        # Record episode length.
        self._stats_reporter.add_stat(
            "Environment/Episode Length",
            self._episode_steps.get(global_agent_id, 0),
        )
        self._clean_agent_data(global_agent_id)
def stepDiscreteAction(behavior_name, arrlist):
    _discrete = np.array(arrlist, dtype=np.int32)
    action = ActionTuple(discrete=_discrete)
    env.set_actions(behavior_name, action)
    env.step()
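# A hypothetical usage sketch of the helper above; it relies on a module-level
# `env` (a connected UnityEnvironment) already existing, as the function body assumes.
decision_steps, terminal_steps = env.get_steps(behavior_name)
# One row per agent currently requesting a decision, one column per discrete branch.
stepDiscreteAction(behavior_name, [[1]] * len(decision_steps))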
print("Is there a visual observation ?", vis_obs_bool) #print action is_discrete print("Is action is discrete ?", spec.action_spec.is_discrete()) #print action is_continuous print("Is action is continus ?", spec.action_spec.is_continuous()) #make continuous action and discrete action with 0 array, and step! decision_steps, terminal_steps = env.get_steps(behavior_name) n_agents = len(decision_steps) _continuous = np.zeros((n_agents, spec.action_spec.continuous_size), dtype=np.float32) _discrete = np.zeros((n_agents, spec.action_spec.discrete_size), dtype=np.int32) action = ActionTuple(continuous=_continuous, discrete=_discrete) env.set_actions(behavior_name, action) env.step() #make custom discrete action, and step! actionarr = [[1]] #list shape(1<num_agents>,1<discrete_size>) _discrete = np.array(actionarr, dtype=np.int32) action = ActionTuple(discrete=_discrete) env.set_actions(behavior_name, action) env.step() #Get step information to get observation decision_steps, terminal_steps = env.get_steps(behavior_name) #visual observation for index, shape in enumerate(spec.observation_shapes): if len(shape) == 3:
def step(
    self, action_dict: MultiAgentDict
) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
    """Performs one multi-agent step through the game.

    Args:
        action_dict (dict): Multi-agent action dict with:
            keys=agent identifier consisting of
            [MLagents behavior name, e.g. "Goalie?team=1"] + "_" +
            [Agent index, a unique MLAgent-assigned index per single agent]

    Returns:
        tuple:
            - obs: Multi-agent observation dict.
                Only those observations for which to get new actions are
                returned.
            - rewards: Rewards dict matching `obs`.
            - dones: Done dict with only an __all__ multi-agent entry in
                it. __all__=True, if episode is done for all agents.
            - infos: An (empty) info dict.
    """
    from mlagents_envs.base_env import ActionTuple

    # Set only the required actions (from the DecisionSteps) in Unity3D.
    all_agents = []
    for behavior_name in self.unity_env.behavior_specs:
        # New ML-Agents API: Set all agents actions at the same time
        # via an ActionTuple. Since API v1.4.0.
        if self.api_version[0] > 1 or (self.api_version[0] == 1
                                       and self.api_version[1] >= 4):
            actions = []
            for agent_id in self.unity_env.get_steps(
                    behavior_name)[0].agent_id:
                key = behavior_name + "_{}".format(agent_id)
                all_agents.append(key)
                actions.append(action_dict[key])
            if actions:
                if actions[0].dtype == np.float32:
                    action_tuple = ActionTuple(
                        continuous=np.array(actions))
                else:
                    action_tuple = ActionTuple(discrete=np.array(actions))
                self.unity_env.set_actions(behavior_name, action_tuple)
        # Old behavior: Do not use an ActionTuple and set each agent's
        # action individually.
        else:
            for agent_id in self.unity_env.get_steps(
                    behavior_name)[0].agent_id_to_index.keys():
                key = behavior_name + "_{}".format(agent_id)
                all_agents.append(key)
                self.unity_env.set_action_for_agent(
                    behavior_name, agent_id, action_dict[key])
    # Do the step.
    self.unity_env.step()

    obs, rewards, dones, infos = self._get_step_results()

    # Global horizon reached? -> Return __all__ done=True, so user
    # can reset. Set all agents' individual `done` to True as well.
    self.episode_timesteps += 1
    if self.episode_timesteps > self.episode_horizon:
        return obs, rewards, dict({"__all__": True}, **{
            agent_id: True
            for agent_id in all_agents
        }), infos

    return obs, rewards, dones, infos