Example #1
    def step(self, action: List[Any]) -> GymStepResult:
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        Args:
            action (object/list): an action provided by the agent
        Returns:
            observation (object/list): agent's observation of the current environment
            reward (float/list): amount of reward returned after previous action
            done (boolean/list): whether the episode has ended.
            info (dict): contains auxiliary diagnostic information.
        """
        if self._flattener is not None:
            # Translate action into list
            action = self._flattener.lookup_action(action)

        action = np.array(action).reshape((1, self.action_size))

        action_tuple = ActionTuple()
        if self.group_spec.action_spec.is_continuous():
            action_tuple.add_continuous(action)
        else:
            action_tuple.add_discrete(action)
        self._env.set_actions(self.name, action_tuple)

        self._env.step()
        decision_step, terminal_step = self._env.get_steps(self.name)
        self._check_agents(max(len(decision_step), len(terminal_step)))
        if len(terminal_step) != 0:
            # The agent is done
            self.game_over = True
            return self._single_step(terminal_step)
        else:
            return self._single_step(decision_step)
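
A minimal sketch of the shape convention the wrapper above relies on: ActionTuple.add_continuous and add_discrete expect a batch-first array of shape (n_agents, action_size), which is why the single-agent action is reshaped to (1, self.action_size) before being added. The sizes below are illustrative assumptions, not taken from the original code.

    import numpy as np
    from mlagents_envs.base_env import ActionTuple

    # One agent with two continuous action dimensions -> shape (1, 2)
    continuous = np.random.uniform(-1.0, 1.0, size=(1, 2)).astype(np.float32)
    action_tuple = ActionTuple()
    action_tuple.add_continuous(continuous)

    # Discrete actions use one column per branch, e.g. one branch -> shape (1, 1)
    # action_tuple.add_discrete(np.array([[2]], dtype=np.int32))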
Example #2
    def get_action(
        self, decision_requests: DecisionSteps, worker_id: int = 0
    ) -> ActionInfo:
        """
        Decides actions given observations, and takes them in the environment.
        :param decision_requests: A dictionary of brain names and DecisionSteps from environment.
        :param worker_id: In parallel environment training, the unique id of the environment worker that
            the DecisionSteps came from. Used to construct a globally unique id for each agent.
        :return: an ActionInfo containing action, memories, values and an object
        to be passed to add experiences
        """
        if len(decision_requests) == 0:
            return ActionInfo.empty()

        global_agent_ids = [
            get_global_agent_id(worker_id, int(agent_id))
            for agent_id in decision_requests.agent_id
        ]  # For 1-D array, the iterator order is correct.

        run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
            decision_requests, global_agent_ids
        )

        self.save_memories(global_agent_ids, run_out.get("memory_out"))
        # For Compatibility with buffer changes for hybrid action support
        if "log_probs" in run_out:
            log_probs_tuple = LogProbsTuple()
            if self.behavior_spec.action_spec.is_continuous():
                log_probs_tuple.add_continuous(run_out["log_probs"])
            else:
                log_probs_tuple.add_discrete(run_out["log_probs"])
            run_out["log_probs"] = log_probs_tuple
        if "action" in run_out:
            action_tuple = ActionTuple()
            env_action_tuple = ActionTuple()
            if self.behavior_spec.action_spec.is_continuous():
                action_tuple.add_continuous(run_out["pre_action"])
                env_action_tuple.add_continuous(run_out["action"])
            else:
                action_tuple.add_discrete(run_out["action"])
                env_action_tuple.add_discrete(run_out["action"])
            run_out["action"] = action_tuple
            run_out["env_action"] = env_action_tuple
        self.check_nan_action(run_out.get("action"))
        return ActionInfo(
            action=run_out.get("action"),
            env_action=run_out.get("env_action"),
            value=run_out.get("value"),
            outputs=run_out,
            agent_ids=decision_requests.agent_id,
        )
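
Two details worth noting in the snippet above: when no agents requested a decision, the method short-circuits with ActionInfo.empty(), and for continuous control it builds two tuples because `action` stores the raw "pre_action" output kept for training while `env_action` carries the processed actions that are presumably sent to the environment. A tiny sketch of the empty path (the import path is an assumption based on the ml-agents trainers package):

    from mlagents.trainers.action_info import ActionInfo

    # Returned when decision_requests is empty (see the early return above).
    empty = ActionInfo.empty()
    print(empty.action, empty.agent_ids)  # both empty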
Example #3
    def to_action_tuple(self, clip: bool = False) -> ActionTuple:
        """
        Returns an ActionTuple
        """
        action_tuple = ActionTuple()
        if self.continuous_tensor is not None:
            _continuous_tensor = self.continuous_tensor
            if clip:
                _continuous_tensor = torch.clamp(_continuous_tensor, -3, 3) / 3
            continuous = ModelUtils.to_numpy(_continuous_tensor)
            action_tuple.add_continuous(continuous)
        if self.discrete_list is not None:
            discrete = ModelUtils.to_numpy(self.discrete_tensor[:, 0, :])
            action_tuple.add_discrete(discrete)
        return action_tuple
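
The clamp-and-divide step above is a squashing choice: continuous outputs are clipped to [-3, 3] and rescaled so the values handed to the environment lie in [-1, 1]. A standalone illustration with a synthetic tensor (not from the original code):

    import torch

    x = torch.tensor([[-4.2, 0.5, 3.7]])
    print(torch.clamp(x, -3, 3) / 3)  # tensor([[-1.0000,  0.1667,  1.0000]])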
Example #4
    def set_actions(self, behavior_name, action):
        # The ActionTuple contains the actions for all n_agents. This
        # slices the ActionTuple into an action tuple for each environment
        # and sets it. The index j is used to ignore agents that have already
        # reached done.
        j = 0
        for i in range(self.num_agents):
            _act = ActionTuple()
            name_and_num = behavior_name + str(i)
            env = self.envs[name_and_num]
            if not self.dones[name_and_num]:
                if self.action_spec.continuous_size > 0:
                    _act.add_continuous(action.continuous[j:j + 1])
                if self.action_spec.discrete_size > 0:
                    _disc_list = [action.discrete[j, :]]
                    _act.add_discrete(np.array(_disc_list))
                j += 1
                env.action[behavior_name] = _act
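
A small, self-contained illustration of the per-agent slicing used above, with synthetic shapes (the real sizes come from the environment's action_spec): indexing with j:j + 1 keeps the leading batch dimension, so each per-environment ActionTuple still has shape (1, action_size).

    import numpy as np
    from mlagents_envs.base_env import ActionTuple

    # Batched continuous actions for 3 agents, 2 dimensions each.
    batch = ActionTuple(continuous=np.zeros((3, 2), dtype=np.float32))

    single = ActionTuple()
    single.add_continuous(batch.continuous[0:1])  # shape (1, 2), not (2,)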
Example #5
    def step(self, action):
        """Runs one timestep of the environment"s dynamics.
        Once an episode is done, reset() has to be called manually.
                
        Arguments:
            action {List} -- A list of at least one discrete action to be executed by the agent

        Returns:
            {numpy.ndarray} -- Visual observation
            {numpy.ndarray} -- Vector observation
            {float} -- (Total) Scalar reward signaled by the environment
            {bool} -- Whether the episode of the environment terminated
            {dict} -- Further episode information (e.g. cumulative reward) retrieved from the environment once an episode is completed
        """
        # Carry out the agent's action
        action_tuple = ActionTuple()
        action_tuple.add_discrete(np.asarray(action).reshape([1, -1]))
        self._env.set_actions(self._behavior_name, action_tuple)
        self._env.step()
        info, terminal_info = self._env.get_steps(self._behavior_name)

        # Process step results
        vis_obs, vec_obs, reward, done = self._process_agent_info(info, terminal_info)
        self._rewards.append(reward)

        # Record trajectory data
        if self._record:
            self._trajectory["vis_obs"].append(vis_obs * 255)
            self._trajectory["vec_obs"].append(vec_obs)
            self._trajectory["rewards"].append(reward)
            self._trajectory["actions"].append(action)

        # Episode information
        if done:
            info = {"reward": sum(self._rewards),
                    "length": len(self._rewards)}
        else:
            info = None

        return vis_obs, vec_obs, reward, done, info
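
Because the wrapper above only adds a discrete tuple, the incoming action list is reshaped to (1, n_branches) to satisfy the single-agent batch convention. A quick, synthetic check of that reshape (the two-branch action is an assumption for illustration):

    import numpy as np

    action = [2, 0]  # hypothetical two-branch discrete action
    print(np.asarray(action).reshape([1, -1]).shape)  # (1, 2)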
Example #6
File: __init__.py Project: batu/ml-agents
    def step(self, action: List[Any]) -> GymStepResult:
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        Args:
            action (object/list): an action provided by the agent
        Returns:
            observation (object/list): agent's observation of the current environment
            reward (float/list): amount of reward returned after previous action
            done (boolean/list): whether the episode has ended.
            info (dict): contains auxiliary diagnostic information.
        """
        if self._flattener is not None:
            # Translate action into list
            action = self._flattener.lookup_action(action)

        action = np.array(action).reshape((-1, self.action_size))

        action_tuple = ActionTuple()
        if self.group_spec.action_spec.is_continuous():
            action_tuple.add_continuous(action)
        else:
            action_tuple.add_discrete(action)

        self._env.set_actions(self.name, action_tuple)
        self._env.step()

        decision_step, terminal_step = self._env.get_steps(self.name)

        try:
            return self.combine_steps(decision_step, terminal_step)
        except KeyError:
            self.key_error_counter += 1
            # print(f"{self.key_error_counter}th KeyError in UnityToMultiGymWrapper. Previous step returned.")
            return self.last_stepreturn
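
Unlike Example #1, this multi-agent variant reshapes the flat action list to (-1, self.action_size), producing one row per agent. A synthetic illustration of that reshape (three action values per agent is an assumption):

    import numpy as np

    # Two agents, three action values each, passed as one flat list.
    action = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
    print(np.array(action).reshape((-1, 3)).shape)  # (2, 3)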