def step(self, action: List[Any]) -> GymStepResult:
    """Run one timestep of the environment's dynamics.

    When the end of an episode is reached, you are responsible for calling
    `reset()` to reset this environment's state. Accepts an action and returns
    a tuple (observation, reward, done, info).

    Args:
        action (object/list): an action provided by the agent
    Returns:
        observation (object/list): agent's observation of the current environment
        reward (float/list): amount of reward returned after the previous action
        done (boolean/list): whether the episode has ended
        info (dict): contains auxiliary diagnostic information
    """
    if self._flattener is not None:
        # Translate the flattened discrete action index back into a branched action list
        action = self._flattener.lookup_action(action)

    action = np.array(action).reshape((1, self.action_size))

    # Wrap the action as continuous or discrete, depending on the behavior's action spec
    action_tuple = ActionTuple()
    if self.group_spec.action_spec.is_continuous():
        action_tuple.add_continuous(action)
    else:
        action_tuple.add_discrete(action)
    self._env.set_actions(self.name, action_tuple)

    self._env.step()
    decision_step, terminal_step = self._env.get_steps(self.name)
    self._check_agents(max(len(decision_step), len(terminal_step)))
    if len(terminal_step) != 0:
        # The agent is done
        self.game_over = True
        return self._single_step(terminal_step)
    else:
        return self._single_step(decision_step)
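# A minimal sketch of the shape contract the wrapper above relies on: a single
# agent's action is packed into an ActionTuple whose arrays are shaped
# (num_agents, action_size). The sizes below are illustrative values, not taken
# from the wrapper itself; only numpy and mlagents_envs are assumed.
import numpy as np
from mlagents_envs.base_env import ActionTuple

# Continuous case: one agent, three continuous action dimensions.
continuous_tuple = ActionTuple()
continuous_tuple.add_continuous(np.array([[0.1, -0.5, 0.9]], dtype=np.float32))
assert continuous_tuple.continuous.shape == (1, 3)

# Discrete case: one agent, two discrete branches (one chosen index per branch).
discrete_tuple = ActionTuple()
discrete_tuple.add_discrete(np.array([[2, 0]], dtype=np.int32))
assert discrete_tuple.discrete.shape == (1, 2)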
def get_action(
    self, decision_requests: DecisionSteps, worker_id: int = 0
) -> ActionInfo:
    """
    Decides actions given observation information, and takes them in the environment.
    :param decision_requests: The DecisionSteps to be acted on, as provided by the environment.
    :param worker_id: In parallel environment training, the unique id of the environment worker
        that the DecisionSteps came from. Used to construct a globally unique id for each agent.
    :return: an ActionInfo containing the action, memories, values and an object
        to be passed to add experiences.
    """
    if len(decision_requests) == 0:
        return ActionInfo.empty()

    # For a 1-D array, the iterator order is correct.
    global_agent_ids = [
        get_global_agent_id(worker_id, int(agent_id))
        for agent_id in decision_requests.agent_id
    ]

    run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
        decision_requests, global_agent_ids
    )
    self.save_memories(global_agent_ids, run_out.get("memory_out"))

    # For compatibility with buffer changes for hybrid action support
    if "log_probs" in run_out:
        log_probs_tuple = LogProbsTuple()
        if self.behavior_spec.action_spec.is_continuous():
            log_probs_tuple.add_continuous(run_out["log_probs"])
        else:
            log_probs_tuple.add_discrete(run_out["log_probs"])
        run_out["log_probs"] = log_probs_tuple
    if "action" in run_out:
        # The pre-clip action goes into the buffer; the (possibly clipped) action
        # is the one sent to the environment.
        action_tuple = ActionTuple()
        env_action_tuple = ActionTuple()
        if self.behavior_spec.action_spec.is_continuous():
            action_tuple.add_continuous(run_out["pre_action"])
            env_action_tuple.add_continuous(run_out["action"])
        else:
            action_tuple.add_discrete(run_out["action"])
            env_action_tuple.add_discrete(run_out["action"])
        run_out["action"] = action_tuple
        run_out["env_action"] = env_action_tuple
    self.check_nan_action(run_out.get("action"))

    return ActionInfo(
        action=run_out.get("action"),
        env_action=run_out.get("env_action"),
        value=run_out.get("value"),
        outputs=run_out,
        agent_ids=decision_requests.agent_id,
    )
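# A minimal sketch of the action / env_action split performed above: the raw
# (pre-clip) continuous action is kept for the training buffer, while a clipped
# copy is the one actually handed to the environment. The sample values and the
# [-1, 1] clipping here are illustrative assumptions, not taken from the policy.
import numpy as np
from mlagents_envs.base_env import ActionTuple

raw_action = np.array([[2.4, -0.3, -1.7]], dtype=np.float32)  # shape (1, act_size)
clipped_action = np.clip(raw_action, -1.0, 1.0)               # what the env receives

buffer_tuple = ActionTuple()  # stored with the experience (pre-clip action)
buffer_tuple.add_continuous(raw_action)

env_tuple = ActionTuple()     # passed on to env.set_actions(...)
env_tuple.add_continuous(clipped_action)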
def to_action_tuple(self, clip: bool = False) -> ActionTuple:
    """
    Returns an ActionTuple built from the stored continuous and discrete tensors,
    optionally clipping the continuous actions.
    """
    action_tuple = ActionTuple()
    if self.continuous_tensor is not None:
        _continuous_tensor = self.continuous_tensor
        if clip:
            # Clamp raw continuous outputs to [-3, 3], then rescale into [-1, 1]
            _continuous_tensor = torch.clamp(_continuous_tensor, -3, 3) / 3
        continuous = ModelUtils.to_numpy(_continuous_tensor)
        action_tuple.add_continuous(continuous)
    if self.discrete_list is not None:
        discrete = ModelUtils.to_numpy(self.discrete_tensor[:, 0, :])
        action_tuple.add_discrete(discrete)
    return action_tuple
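# The `clip=True` branch above maps raw continuous outputs into [-1, 1] by
# clamping to [-3, 3] and then dividing by 3. A standalone check of that
# transform; the input values are made up.
import torch

raw = torch.tensor([[-5.0, -1.5, 0.0, 1.5, 5.0]])
clipped = torch.clamp(raw, -3, 3) / 3
print(clipped)  # tensor([[-1.0000, -0.5000,  0.0000,  0.5000,  1.0000]])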
def set_actions(self, behavior_name, action):
    # The ActionTuple contains the actions for all n_agents. This slices the
    # ActionTuple into an action tuple for each environment and sets it. The
    # index j is used to ignore agents that have already reached done.
    j = 0
    for i in range(self.num_agents):
        _act = ActionTuple()
        name_and_num = behavior_name + str(i)
        env = self.envs[name_and_num]
        if not self.dones[name_and_num]:
            if self.action_spec.continuous_size > 0:
                _act.add_continuous(action.continuous[j : j + 1])
            if self.action_spec.discrete_size > 0:
                _disc_list = [action.discrete[j, :]]
                _act.add_discrete(np.array(_disc_list))
            j += 1
            env.action[behavior_name] = _act
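# A standalone sketch of the per-agent slicing done in `set_actions` above: a
# batched ActionTuple with one row per agent is split into single-row tuples.
# The sizes are made up, and the per-environment bookkeeping (envs, dones) is
# omitted; only numpy and mlagents_envs are assumed.
import numpy as np
from mlagents_envs.base_env import ActionTuple

n_agents = 3
batched = ActionTuple()
batched.add_continuous(np.zeros((n_agents, 2), dtype=np.float32))  # 2 continuous dims
batched.add_discrete(np.zeros((n_agents, 1), dtype=np.int32))      # 1 discrete branch

per_agent = []
for j in range(n_agents):
    _act = ActionTuple()
    _act.add_continuous(batched.continuous[j : j + 1])     # slice keeps the 2-D shape (1, 2)
    _act.add_discrete(np.array([batched.discrete[j, :]]))  # rebuilt as shape (1, 1)
    per_agent.append(_act)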
def step(self, action):
    """Runs one timestep of the environment's dynamics.
    Once an episode is done, reset() has to be called manually.

    Arguments:
        action {List} -- A list of at least one discrete action to be executed by the agent

    Returns:
        {numpy.ndarray} -- Visual observation
        {numpy.ndarray} -- Vector observation
        {float} -- (Total) Scalar reward signaled by the environment
        {bool} -- Whether the episode of the environment terminated
        {dict} -- Further episode information (e.g. cumulative reward) retrieved from
            the environment once an episode is completed
    """
    # Carry out the agent's action
    action_tuple = ActionTuple()
    action_tuple.add_discrete(np.asarray(action).reshape([1, -1]))
    self._env.set_actions(self._behavior_name, action_tuple)
    self._env.step()
    info, terminal_info = self._env.get_steps(self._behavior_name)

    # Process step results
    vis_obs, vec_obs, reward, done = self._process_agent_info(info, terminal_info)
    self._rewards.append(reward)

    # Record trajectory data
    if self._record:
        self._trajectory["vis_obs"].append(vis_obs * 255)
        self._trajectory["vec_obs"].append(vec_obs)
        self._trajectory["rewards"].append(reward)
        self._trajectory["actions"].append(action)

    # Episode information
    if done:
        info = {"reward": sum(self._rewards), "length": len(self._rewards)}
    else:
        info = None

    return vis_obs, vec_obs, reward, done, info
def step(self, action: List[Any]) -> GymStepResult:
    """Run one timestep of the environment's dynamics.

    When the end of an episode is reached, you are responsible for calling
    `reset()` to reset this environment's state. Accepts an action and returns
    a tuple (observation, reward, done, info).

    Args:
        action (object/list): an action provided by the agent
    Returns:
        observation (object/list): agent's observation of the current environment
        reward (float/list): amount of reward returned after the previous action
        done (boolean/list): whether the episode has ended
        info (dict): contains auxiliary diagnostic information
    """
    if self._flattener is not None:
        # Translate the flattened discrete action index back into a branched action list
        action = self._flattener.lookup_action(action)

    action = np.array(action).reshape((-1, self.action_size))

    # Wrap the actions as continuous or discrete, depending on the behavior's action spec
    action_tuple = ActionTuple()
    if self.group_spec.action_spec.is_continuous():
        action_tuple.add_continuous(action)
    else:
        action_tuple.add_discrete(action)
    self._env.set_actions(self.name, action_tuple)

    self._env.step()
    decision_step, terminal_step = self._env.get_steps(self.name)
    try:
        return self.combine_steps(decision_step, terminal_step)
    except KeyError:
        # An agent id was missing from the step results; count the occurrence and
        # fall back to the previous step's return value.
        self.key_error_counter += 1
        # print(f"{self.key_error_counter}th KeyError in UnityToMultiGymWrapper. Previous step returned.")
        return self.last_stepreturn