Example #1
def make_fake_trajectory(
    length: int,
    max_step_complete: bool = False,
    vec_obs_size: int = VEC_OBS_SIZE,
    num_vis_obs: int = 1,
    action_space: int = ACTION_SIZE,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.
    """
    steps_list = []
    for _i in range(length - 1):
        obs = []
        for _j in range(num_vis_obs):
            obs.append(np.ones((84, 84, 3), dtype=np.float32))
        obs.append(np.ones(vec_obs_size, dtype=np.float32))
        reward = 1.0
        done = False
        action = np.zeros(action_space, dtype=np.float32)
        action_probs = np.ones(action_space, dtype=np.float32)
        action_pre = np.zeros(action_space, dtype=np.float32)
        action_mask = np.ones(action_space, dtype=np.float32)
        prev_action = np.ones(action_space, dtype=np.float32)
        max_step = False
        memory = np.ones(10, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_pre=action_pre,
            action_mask=action_mask,
            prev_action=prev_action,
            max_step=max_step,
            memory=memory,
        )
        steps_list.append(experience)
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_pre=action_pre,
        action_mask=action_mask,
        prev_action=prev_action,
        max_step=max_step_complete,
        memory=memory,
    )
    steps_list.append(last_experience)
    return Trajectory(steps=steps_list,
                      agent_id=agent_id,
                      behavior_id=behavior_id,
                      next_obs=obs)
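
A minimal usage sketch for this helper, assuming the surrounding test module supplies the VEC_OBS_SIZE and ACTION_SIZE defaults and numpy as np; the call itself is illustrative, not taken from the excerpt:

# Build a 15-step trajectory that ends with a normal done rather than an interruption.
traj = make_fake_trajectory(length=15, max_step_complete=False)
assert len(traj.steps) == 15
assert traj.steps[-1].done
assert not traj.steps[-1].max_step
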
Example #2
def make_fake_trajectory(
    length: int,
    observation_specs: List[ObservationSpec],
    action_spec: ActionSpec,
    max_step_complete: bool = False,
    memory_size: int = 10,
    num_other_agents_in_group: int = 0,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.
    """
    steps_list = []

    action_size = action_spec.discrete_size + action_spec.continuous_size
    for _i in range(length - 1):
        obs = []
        for obs_spec in observation_specs:
            obs.append(np.ones(obs_spec.shape, dtype=np.float32))
        reward = 1.0
        done = False
        action = ActionTuple(
            continuous=np.zeros(action_spec.continuous_size, dtype=np.float32),
            discrete=np.zeros(action_spec.discrete_size, dtype=np.int32),
        )
        action_probs = LogProbsTuple(
            continuous=np.ones(action_spec.continuous_size, dtype=np.float32),
            discrete=np.ones(action_spec.discrete_size, dtype=np.float32),
        )
        action_mask = (
            [
                [False for _ in range(branch)]
                for branch in action_spec.discrete_branches
            ]  # type: ignore
            if action_spec.is_discrete()
            else None
        )
        if action_spec.is_discrete():
            prev_action = np.ones(action_size, dtype=np.int32)
        else:
            prev_action = np.ones(action_size, dtype=np.float32)

        max_step = False
        memory = np.ones(memory_size, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        group_status = []
        for _ in range(num_other_agents_in_group):
            group_status.append(AgentStatus(obs, reward, action, done))
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=max_step,
            memory=memory,
            group_status=group_status,
            group_reward=0,
        )
        steps_list.append(experience)
    obs = []
    for obs_spec in observation_specs:
        obs.append(np.ones(obs_spec.shape, dtype=np.float32))
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_mask=action_mask,
        prev_action=prev_action,
        interrupted=max_step_complete,
        memory=memory,
        group_status=group_status,
        group_reward=0,
    )
    steps_list.append(last_experience)
    return Trajectory(
        steps=steps_list,
        agent_id=agent_id,
        behavior_id=behavior_id,
        next_obs=obs,
        next_group_obs=[obs] * num_other_agents_in_group,
    )
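
A hypothetical call for this newer-API variant. ActionSpec.create_continuous comes from mlagents_envs.base_env, while create_observation_specs_with_shapes is assumed here to be a test-utility helper; neither line below comes from the excerpt:

obs_specs = create_observation_specs_with_shapes([(8,)])  # assumed helper
traj = make_fake_trajectory(
    length=10,
    observation_specs=obs_specs,
    action_spec=ActionSpec.create_continuous(2),
    max_step_complete=True,
    num_other_agents_in_group=2,
)
# Interrupted by max step: the final experience is not marked done.
assert traj.steps[-1].interrupted and not traj.steps[-1].done
assert len(traj.next_group_obs) == 2
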
Example #3
def make_fake_trajectory(
    length: int,
    max_step_complete: bool = False,
    vec_obs_size: int = 1,
    num_vis_obs: int = 1,
    action_space: List[int] = None,
    memory_size: int = 10,
    is_discrete: bool = True,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.
    """
    if action_space is None:
        action_space = [2]
    steps_list = []
    for _i in range(length - 1):
        obs = []
        for _j in range(num_vis_obs):
            obs.append(np.ones((84, 84, 3), dtype=np.float32))
        obs.append(np.ones(vec_obs_size, dtype=np.float32))
        reward = 1.0
        done = False
        if is_discrete:
            action_size = len(action_space)
            action_probs = np.ones(np.sum(action_space), dtype=np.float32)
        else:
            action_size = action_space[0]
            action_probs = np.ones(action_size, dtype=np.float32)
        action = np.zeros(action_size, dtype=np.float32)
        action_pre = np.zeros(action_size, dtype=np.float32)
        action_mask = ([[False for _ in range(branch)]
                        for branch in action_space] if is_discrete else None)
        prev_action = np.ones(action_size, dtype=np.float32)
        max_step = False
        memory = np.ones(memory_size, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_pre=action_pre,
            action_mask=action_mask,
            prev_action=prev_action,
            max_step=max_step,
            memory=memory,
        )
        steps_list.append(experience)
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_pre=action_pre,
        action_mask=action_mask,
        prev_action=prev_action,
        max_step=max_step_complete,
        memory=memory,
    )
    steps_list.append(last_experience)
    return Trajectory(steps=steps_list,
                      agent_id=agent_id,
                      behavior_id=behavior_id,
                      next_obs=obs)
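
An illustrative call for this branch-based variant; the asserted shapes follow from the construction above:

# Two discrete branches of size 3: one chosen action per branch, one logit per branch entry.
traj = make_fake_trajectory(length=20, action_space=[3, 3], is_discrete=True)
assert len(traj.steps) == 20
assert traj.steps[0].action.shape == (2,)
assert traj.steps[0].action_probs.shape == (6,)
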
Example #4
    def add_experiences(
        self,
        batched_step_result: BatchedStepResult,
        worker_id: int,
        previous_action: ActionInfo,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param batched_step_result: current BatchedStepResult.
        :param worker_id: the id of the worker that produced the step.
        :param previous_action: The outputs of the Policy's get_action method.
        """
        take_action_outputs = previous_action.outputs
        if take_action_outputs:
            for _entropy in take_action_outputs["entropy"]:
                self.stats_reporter.add_stat("Policy/Entropy", _entropy)
            self.stats_reporter.add_stat("Policy/Learning Rate",
                                         take_action_outputs["learning_rate"])

        # Make unique agent_ids that are global across workers
        action_global_agent_ids = [
            get_global_agent_id(worker_id, ag_id)
            for ag_id in previous_action.agent_ids
        ]
        for global_id in action_global_agent_ids:
            self.last_take_action_outputs[global_id] = take_action_outputs

        for _id in batched_step_result.agent_id:  # Assume agent_id is 1-D
            local_id = int(
                _id
            )  # Needed for mypy to pass since ndarray has no content type
            curr_agent_step = batched_step_result.get_agent_step_result(
                local_id)
            global_id = get_global_agent_id(worker_id, local_id)
            stored_step = self.last_step_result.get(global_id, None)
            stored_take_action_outputs = self.last_take_action_outputs.get(
                global_id, None)
            if stored_step is not None and stored_take_action_outputs is not None:
                # We know the step is from the same worker, so use the local agent id.
                stored_agent_step = stored_step.get_agent_step_result(local_id)
                idx = stored_step.agent_id_to_index[local_id]
                obs = stored_agent_step.obs
                if not stored_agent_step.done:
                    if self.policy.use_recurrent:
                        memory = self.policy.retrieve_memories([global_id])[0, :]
                    else:
                        memory = None

                    done = curr_agent_step.done
                    max_step = curr_agent_step.max_step

                    # Add the outputs of the last eval
                    action = stored_take_action_outputs["action"][idx]
                    if self.policy.use_continuous_act:
                        action_pre = stored_take_action_outputs["pre_action"][
                            idx]
                    else:
                        action_pre = None
                    action_probs = stored_take_action_outputs["log_probs"][idx]
                    action_mask = stored_agent_step.action_mask
                    prev_action = self.policy.retrieve_previous_action(
                        [global_id])[0, :]

                    experience = AgentExperience(
                        obs=obs,
                        reward=curr_agent_step.reward,
                        done=done,
                        action=action,
                        action_probs=action_probs,
                        action_pre=action_pre,
                        action_mask=action_mask,
                        prev_action=prev_action,
                        max_step=max_step,
                        memory=memory,
                    )
                    # Add the value outputs if needed
                    self.experience_buffers[global_id].append(experience)
                    self.episode_rewards[global_id] += curr_agent_step.reward
                if (curr_agent_step.done or
                    (len(self.experience_buffers[global_id]) >=
                     self.max_trajectory_length)) and len(
                         self.experience_buffers[global_id]) > 0:
                    # Make next AgentExperience
                    next_obs = curr_agent_step.obs
                    trajectory = Trajectory(
                        steps=self.experience_buffers[global_id],
                        agent_id=global_id,
                        next_obs=next_obs,
                        behavior_id=self.behavior_id,
                    )
                    for traj_queue in self.trajectory_queues:
                        traj_queue.put(trajectory)
                    self.experience_buffers[global_id] = []
                    if curr_agent_step.done:
                        self.stats_reporter.add_stat(
                            "Environment/Cumulative Reward",
                            self.episode_rewards.get(global_id, 0),
                        )
                        self.stats_reporter.add_stat(
                            "Environment/Episode Length",
                            self.episode_steps.get(global_id, 0),
                        )
                        del self.episode_steps[global_id]
                        del self.episode_rewards[global_id]
                elif not curr_agent_step.done:
                    self.episode_steps[global_id] += 1

            self.last_step_result[global_id] = batched_step_result

        if "action" in take_action_outputs:
            self.policy.save_previous_action(previous_action.agent_ids,
                                             take_action_outputs["action"])
Example #5
    def _process_step(
        self, step: Union[TerminalStep, DecisionStep], worker_id: int, index: int
    ) -> None:
        terminated = isinstance(step, TerminalStep)
        global_agent_id = get_global_agent_id(worker_id, step.agent_id)
        global_group_id = get_global_group_id(worker_id, step.group_id)
        stored_decision_step, idx = self._last_step_result.get(
            global_agent_id, (None, None)
        )
        stored_take_action_outputs = self._last_take_action_outputs.get(
            global_agent_id, None
        )
        if not terminated:
            # Index is needed to grab from last_take_action_outputs
            self._last_step_result[global_agent_id] = (step, index)

        # This state is the consequence of a past action
        if stored_decision_step is not None and stored_take_action_outputs is not None:
            obs = stored_decision_step.obs
            if self.policy.use_recurrent:
                memory = self.policy.retrieve_previous_memories([global_agent_id])[0, :]
            else:
                memory = None
            done = terminated  # done is True only when this step is a TerminalStep
            interrupted = step.interrupted if terminated else False
            # Add the outputs of the last eval
            stored_actions = stored_take_action_outputs["action"]
            action_tuple = ActionTuple(
                continuous=stored_actions.continuous[idx],
                discrete=stored_actions.discrete[idx],
            )
            stored_action_probs = stored_take_action_outputs["log_probs"]
            log_probs_tuple = LogProbsTuple(
                continuous=stored_action_probs.continuous[idx],
                discrete=stored_action_probs.discrete[idx],
            )
            action_mask = stored_decision_step.action_mask
            prev_action = self.policy.retrieve_previous_action([global_agent_id])[0, :]

            # Assemble teammate_obs. If none saved, then it will be an empty list.
            group_statuses = []
            for _id, _mate_status in self._group_status[global_group_id].items():
                if _id != global_agent_id:
                    group_statuses.append(_mate_status)

            experience = AgentExperience(
                obs=obs,
                reward=step.reward,
                done=done,
                action=action_tuple,
                action_probs=log_probs_tuple,
                action_mask=action_mask,
                prev_action=prev_action,
                interrupted=interrupted,
                memory=memory,
                group_status=group_statuses,
                group_reward=step.group_reward,
            )
            # Add the value outputs if needed
            self._experience_buffers[global_agent_id].append(experience)
            self._episode_rewards[global_agent_id] += step.reward
            if not terminated:
                self._episode_steps[global_agent_id] += 1

            # Add a trajectory segment to the buffer if terminal or the length has reached the time horizon
            if (
                len(self._experience_buffers[global_agent_id])
                >= self._max_trajectory_length
                or terminated
            ):
                next_obs = step.obs
                next_group_obs = []
                for _id, _obs in self._current_group_obs[global_group_id].items():
                    if _id != global_agent_id:
                        next_group_obs.append(_obs)

                trajectory = Trajectory(
                    steps=self._experience_buffers[global_agent_id],
                    agent_id=global_agent_id,
                    next_obs=next_obs,
                    next_group_obs=next_group_obs,
                    behavior_id=self._behavior_id,
                )
                for traj_queue in self._trajectory_queues:
                    traj_queue.put(trajectory)
                self._experience_buffers[global_agent_id] = []
            if terminated:
                # Record episode length.
                self._stats_reporter.add_stat(
                    "Environment/Episode Length",
                    self._episode_steps.get(global_agent_id, 0),
                )
                self._clean_agent_data(global_agent_id)
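
The group bookkeeping above keeps every teammate's saved status except the agent's own entry. A generic restatement of that filter, with placeholder names rather than anything from the excerpt:

def collect_teammate_statuses(group_status: dict, own_global_id: str) -> list:
    # Everything stored for the group except this agent's own status.
    return [
        status
        for agent_id, status in group_status.items()
        if agent_id != own_global_id
    ]
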
Example #6
    def add_experiences(
        self,
        curr_info: BrainInfo,
        next_info: BrainInfo,
        take_action_outputs: ActionInfoOutputs,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param curr_info: current BrainInfo.
        :param next_info: next BrainInfo.
        :param take_action_outputs: The outputs of the Policy's get_action method.
        """
        if take_action_outputs:
            self.stats_reporter.add_stat("Policy/Entropy",
                                         take_action_outputs["entropy"].mean())
            self.stats_reporter.add_stat("Policy/Learning Rate",
                                         take_action_outputs["learning_rate"])

        for agent_id in curr_info.agents:
            self.last_brain_info[agent_id] = curr_info
            self.last_take_action_outputs[agent_id] = take_action_outputs

        # Store the environment reward
        tmp_environment_reward = next_info.rewards

        for next_idx, agent_id in enumerate(next_info.agents):
            stored_info = self.last_brain_info.get(agent_id, None)
            if stored_info is not None:
                stored_take_action_outputs = self.last_take_action_outputs[
                    agent_id]
                idx = stored_info.agents.index(agent_id)
                obs = []
                if not stored_info.local_done[idx]:
                    for i, _ in enumerate(stored_info.visual_observations):
                        obs.append(stored_info.visual_observations[i][idx])
                    if self.policy.use_vec_obs:
                        obs.append(stored_info.vector_observations[idx])
                    if self.policy.use_recurrent:
                        memory = self.policy.retrieve_memories([agent_id])[0, :]
                    else:
                        memory = None

                    done = next_info.local_done[next_idx]
                    max_step = next_info.max_reached[next_idx]

                    # Add the outputs of the last eval
                    action = stored_take_action_outputs["action"][idx]
                    if self.policy.use_continuous_act:
                        action_pre = stored_take_action_outputs["pre_action"][
                            idx]
                    else:
                        action_pre = None
                    action_probs = stored_take_action_outputs["log_probs"][idx]
                    action_masks = stored_info.action_masks[idx]
                    prev_action = self.policy.retrieve_previous_action(
                        [agent_id])[0, :]

                    experience = AgentExperience(
                        obs=obs,
                        reward=tmp_environment_reward[next_idx],
                        done=done,
                        action=action,
                        action_probs=action_probs,
                        action_pre=action_pre,
                        action_mask=action_masks,
                        prev_action=prev_action,
                        max_step=max_step,
                        memory=memory,
                    )
                    # Add the value outputs if needed
                    self.experience_buffers[agent_id].append(experience)
                    self.episode_rewards[agent_id] += tmp_environment_reward[
                        next_idx]
                if (next_info.local_done[next_idx] or
                    (len(self.experience_buffers[agent_id]) >=
                     self.max_trajectory_length)) and len(
                         self.experience_buffers[agent_id]) > 0:
                    # Make next AgentExperience
                    next_obs = []
                    for i, _ in enumerate(next_info.visual_observations):
                        next_obs.append(
                            next_info.visual_observations[i][next_idx])
                    if self.policy.use_vec_obs:
                        next_obs.append(
                            next_info.vector_observations[next_idx])
                    trajectory = Trajectory(
                        steps=self.experience_buffers[agent_id],
                        agent_id=agent_id,
                        next_obs=next_obs,
                        behavior_id=self.behavior_id,
                    )
                    # This will eventually be replaced with a queue
                    self.trainer.process_trajectory(trajectory)
                    self.experience_buffers[agent_id] = []
                    if next_info.local_done[next_idx]:
                        self.stats_reporter.add_stat(
                            "Environment/Cumulative Reward",
                            self.episode_rewards.get(agent_id, 0),
                        )
                        self.stats_reporter.add_stat(
                            "Environment/Episode Length",
                            self.episode_steps.get(agent_id, 0),
                        )
                        del self.episode_steps[agent_id]
                        del self.episode_rewards[agent_id]
                elif not next_info.local_done[next_idx]:
                    self.episode_steps[agent_id] += 1
        if "action" in take_action_outputs:
            self.policy.save_previous_action(curr_info.agents,
                                             take_action_outputs["action"])
Example #7
def make_fake_trajectory(
    length: int,
    observation_shapes: List[Tuple],
    action_spec: ActionSpec,
    max_step_complete: bool = False,
    memory_size: int = 10,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.
    """
    steps_list = []

    action_size = action_spec.discrete_size + action_spec.continuous_size
    action_probs = np.ones(
        int(
            np.sum(action_spec.discrete_branches) +
            action_spec.continuous_size),
        dtype=np.float32,
    )
    for _i in range(length - 1):
        obs = []
        for _shape in observation_shapes:
            obs.append(np.ones(_shape, dtype=np.float32))
        reward = 1.0
        done = False
        action = np.zeros(action_size, dtype=np.float32)
        action_pre = np.zeros(action_size, dtype=np.float32)
        action_mask = ([[False for _ in range(branch)]
                        for branch in action_spec.discrete_branches
                        ]  # type: ignore
                       if action_spec.is_discrete() else None)
        prev_action = np.ones(action_size, dtype=np.float32)
        max_step = False
        memory = np.ones(memory_size, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_pre=action_pre,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=max_step,
            memory=memory,
        )
        steps_list.append(experience)
    obs = []
    for _shape in observation_shapes:
        obs.append(np.ones(_shape, dtype=np.float32))
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_pre=action_pre,
        action_mask=action_mask,
        prev_action=prev_action,
        interrupted=max_step_complete,
        memory=memory,
    )
    steps_list.append(last_experience)
    return Trajectory(steps=steps_list,
                      agent_id=agent_id,
                      behavior_id=behavior_id,
                      next_obs=obs)
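
A hypothetical call for this shape-based variant, assuming ActionSpec.create_discrete is imported from mlagents_envs.base_env:

traj = make_fake_trajectory(
    length=8,
    observation_shapes=[(6,), (84, 84, 3)],
    action_spec=ActionSpec.create_discrete((2, 3)),
)
# Default max_step_complete=False: the trajectory ends with a normal done.
assert traj.steps[-1].done and not traj.steps[-1].interrupted
assert len(traj.next_obs) == 2
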
Example #8
    def _process_step(
        self, step: Union[TerminalStep, DecisionStep], global_id: str, index: int
    ) -> None:
        terminated = isinstance(step, TerminalStep)
        stored_decision_step, idx = self.last_step_result.get(global_id, (None, None))
        stored_take_action_outputs = self.last_take_action_outputs.get(global_id, None)
        if not terminated:
            # Index is needed to grab from last_take_action_outputs
            self.last_step_result[global_id] = (step, index)

        # This state is the consequence of a past action
        if stored_decision_step is not None and stored_take_action_outputs is not None:
            obs = stored_decision_step.obs
            if self.policy.use_recurrent:
                memory = self.policy.retrieve_memories([global_id])[0, :]
            else:
                memory = None
            done = terminated  # done is True only when this step is a TerminalStep
            interrupted = step.interrupted if terminated else False
            # Add the outputs of the last eval
            action = stored_take_action_outputs["action"][idx]
            if self.policy.use_continuous_act:
                action_pre = stored_take_action_outputs["pre_action"][idx]
            else:
                action_pre = None
            action_probs = stored_take_action_outputs["log_probs"][idx]
            action_mask = stored_decision_step.action_mask
            prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
            experience = AgentExperience(
                obs=obs,
                reward=step.reward,
                done=done,
                action=action,
                action_probs=action_probs,
                action_pre=action_pre,
                action_mask=action_mask,
                prev_action=prev_action,
                interrupted=interrupted,
                memory=memory,
            )
            # Add the value outputs if needed
            self.experience_buffers[global_id].append(experience)
            self.episode_rewards[global_id] += step.reward
            if not terminated:
                self.episode_steps[global_id] += 1

            # Add a trajectory segment to the buffer if terminal or the length has reached the time horizon
            if (
                len(self.experience_buffers[global_id]) >= self.max_trajectory_length
                or terminated
            ):
                # Make next AgentExperience
                next_obs = step.obs
                trajectory = Trajectory(
                    steps=self.experience_buffers[global_id],
                    agent_id=global_id,
                    next_obs=next_obs,
                    behavior_id=self.behavior_id,
                )
                for traj_queue in self.trajectory_queues:
                    traj_queue.put(trajectory)
                self.experience_buffers[global_id] = []
            if terminated:
                # Record episode length.
                self.stats_reporter.add_stat(
                    "Environment/Episode Length", self.episode_steps.get(global_id, 0)
                )
                self._clean_agent_data(global_id)
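
The done/interrupted handling above separates a natural episode end from a max-step cutoff. A minimal restatement of that rule, illustrative only:

def done_and_interrupted(step) -> tuple:
    # Only a TerminalStep ends the episode, and only a TerminalStep carries the
    # interrupted flag; a DecisionStep always yields (False, False).
    terminated = isinstance(step, TerminalStep)
    return terminated, (step.interrupted if terminated else False)
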
Example #9
    def add_experiences(
        self,
        batched_step_result: BatchedStepResult,
        worker_id: int,
        previous_action: ActionInfo,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param batched_step_result: current BatchedStepResult.
        :param worker_id: the id of the worker that produced the step.
        :param previous_action: The outputs of the Policy's get_action method.
        """
        take_action_outputs = previous_action.outputs
        if take_action_outputs:
            for _entropy in take_action_outputs["entropy"]:
                self.stats_reporter.add_stat("Policy/Entropy", _entropy)

        # Make unique agent_ids that are global across workers
        action_global_agent_ids = [
            get_global_agent_id(worker_id, ag_id)
            for ag_id in previous_action.agent_ids
        ]
        for global_id in action_global_agent_ids:
            if global_id in self.last_step_result:  # Don't store if agent just reset
                self.last_take_action_outputs[global_id] = take_action_outputs

        for _id in batched_step_result.agent_id:  # Assume agent_id is 1-D
            local_id = int(
                _id
            )  # Needed for mypy to pass since ndarray has no content type
            curr_agent_step = batched_step_result.get_agent_step_result(
                local_id)
            global_id = get_global_agent_id(worker_id, local_id)
            stored_agent_step, idx = self.last_step_result.get(
                global_id, (None, None))
            stored_take_action_outputs = self.last_take_action_outputs.get(
                global_id, None)

            if stored_agent_step is not None and stored_take_action_outputs is not None:
                # We know the step is from the same worker, so use the local agent id.
                obs = stored_agent_step.obs
                if not stored_agent_step.done:
                    if self.policy.use_recurrent:
                        memory = self.policy.retrieve_memories([global_id])[0, :]
                    else:
                        memory = None

                    done = curr_agent_step.done
                    max_step = curr_agent_step.max_step

                    # Add the outputs of the last eval
                    action = stored_take_action_outputs["action"][idx]
                    if self.policy.use_continuous_act:
                        action_pre = stored_take_action_outputs["pre_action"][
                            idx]
                    else:
                        action_pre = None
                    action_probs = stored_take_action_outputs["log_probs"][idx]
                    action_mask = stored_agent_step.action_mask
                    prev_action = self.policy.retrieve_previous_action(
                        [global_id])[0, :]

                    experience = AgentExperience(
                        obs=obs,
                        reward=curr_agent_step.reward,
                        done=done,
                        action=action,
                        action_probs=action_probs,
                        action_pre=action_pre,
                        action_mask=action_mask,
                        prev_action=prev_action,
                        max_step=max_step,
                        memory=memory,
                    )
                    # Add the value outputs if needed
                    self.experience_buffers[global_id].append(experience)
                    self.episode_rewards[global_id] += curr_agent_step.reward
                if (curr_agent_step.done or
                    (len(self.experience_buffers[global_id]) >=
                     self.max_trajectory_length)) and len(
                         self.experience_buffers[global_id]) > 0:
                    # Make next AgentExperience
                    next_obs = curr_agent_step.obs
                    trajectory = Trajectory(
                        steps=self.experience_buffers[global_id],
                        agent_id=global_id,
                        next_obs=next_obs,
                        behavior_id=self.behavior_id,
                    )
                    for traj_queue in self.trajectory_queues:
                        traj_queue.put(trajectory)
                    self.experience_buffers[global_id] = []
                    if curr_agent_step.done:
                        # Record episode length for agents which have had at least
                        # 1 step. Done after reset ignored.
                        self.stats_reporter.add_stat(
                            "Environment/Episode Length",
                            self.episode_steps.get(global_id, 0),
                        )
                elif not curr_agent_step.done:
                    self.episode_steps[global_id] += 1

            # Index is needed to grab from last_take_action_outputs
            self.last_step_result[global_id] = (
                curr_agent_step,
                batched_step_result.agent_id_to_index[_id],
            )
            # Delete all done agents, regardless of if they had a 0-length episode.
            if curr_agent_step.done:
                self._clean_agent_data(global_id)

        for _gid in action_global_agent_ids:
            # If the ID doesn't have a last step result, the agent just reset,
            # don't store the action.
            if _gid in self.last_step_result:
                if "action" in take_action_outputs:
                    self.policy.save_previous_action(
                        [_gid], take_action_outputs["action"])
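
The processor keys its buffers on a worker-global agent id. The real get_global_agent_id helper lives elsewhere in the ML-Agents codebase; the stand-in below only illustrates the idea of combining the worker index with the environment-local id:

def fake_global_agent_id(worker_id: int, agent_id: int) -> str:
    # Any encoding works as long as it is unique across workers and stable for
    # the lifetime of the agent; this particular format is purely illustrative.
    return f"agent_{worker_id}_{agent_id}"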