Example #1
    def get_value_estimates(self, next_obs: List[np.ndarray], agent_id: str,
                            done: bool) -> Dict[str, float]:
        """
        Generates value estimates for bootstrapping.
        :param next_obs: Observations of the next (final) step, used for bootstrapping.
        :param agent_id: ID of the agent whose stored memories and previous action are used.
        :param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
        :return: The value estimate dictionary with key being the name of the reward signal and the value the
        corresponding value estimate.
        """

        feed_dict: Dict[tf.Tensor, Any] = {
            self.model.batch_size: 1,
            self.model.sequence_length: 1,
        }
        vec_vis_obs = SplitObservations.from_observations(next_obs)
        for i in range(len(vec_vis_obs.visual_observations)):
            feed_dict[self.model.visual_in[i]] = [
                vec_vis_obs.visual_observations[i]
            ]

        if self.use_vec_obs:
            feed_dict[self.model.vector_in] = [vec_vis_obs.vector_observations]
        if self.use_recurrent:
            feed_dict[self.model.memory_in] = self.retrieve_memories(
                [agent_id])
        if not self.use_continuous_act and self.use_recurrent:
            feed_dict[self.model.prev_action] = self.retrieve_previous_action(
                [agent_id])
        value_estimates = self.sess.run(self.model.value_heads, feed_dict)

        value_estimates = {k: float(v) for k, v in value_estimates.items()}

        # If we're done, reassign all of the value estimates that need terminal states.
        if done:
            for k in value_estimates:
                if self.reward_signals[k].use_terminal_states:
                    value_estimates[k] = 0.0

        return value_estimates
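
A minimal call sketch (not from the source): policy is assumed to be an already-initialized policy object exposing this method, and trajectory follows the Trajectory structure used in Example #2; all names are illustrative.

# Hypothetical usage: bootstrap the value heads from the final observation of a trajectory.
value_estimates = policy.get_value_estimates(
    next_obs=trajectory.next_obs,   # List[np.ndarray] for the step after the last stored one
    agent_id=trajectory.agent_id,
    done=trajectory.done_reached and not trajectory.steps[-1].interrupted,
)
for signal_name, estimate in value_estimates.items():
    print(f"{signal_name} bootstrap value: {estimate:.3f}")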
Example #2
    def _process_trajectory(self, trajectory: Trajectory) -> None:
        """
        Takes a trajectory and processes it, putting it into the replay buffer.
        """
        super()._process_trajectory(trajectory)
        last_step = trajectory.steps[-1]
        agent_id = trajectory.agent_id  # All the agents should have the same ID

        agent_buffer_trajectory = trajectory.to_agentbuffer()

        # Update the normalization
        if self.is_training:
            self.policy.update_normalization(
                agent_buffer_trajectory["vector_obs"])

        # Evaluate all reward functions for reporting purposes
        self.collected_rewards["environment"][agent_id] += np.sum(
            agent_buffer_trajectory["environment_rewards"])
        for name, reward_signal in self.optimizer.reward_signals.items():
            # BaseRewardProvider is a PyTorch-based reward signal
            if isinstance(reward_signal, BaseRewardProvider):
                evaluate_result = (
                    reward_signal.evaluate(agent_buffer_trajectory) *
                    reward_signal.strength)
            else:  # reward_signal uses TensorFlow
                evaluate_result = reward_signal.evaluate_batch(
                    agent_buffer_trajectory).scaled_reward

            # Report the reward signals
            self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

        # Get all value estimates for reporting purposes
        value_estimates, _ = self.optimizer.get_trajectory_value_estimates(
            agent_buffer_trajectory, trajectory.next_obs,
            trajectory.done_reached)
        for name, v in value_estimates.items():
            # BaseRewardProvider is a PyTorch-based reward signal
            if isinstance(self.optimizer.reward_signals[name],
                          BaseRewardProvider):
                self._stats_reporter.add_stat(
                    f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",
                    np.mean(v),
                )
            else:  # TensorFlow reward signal
                self._stats_reporter.add_stat(
                    self.optimizer.reward_signals[name].value_name, np.mean(v))

        # If the episode was interrupted (e.g. max steps reached), bootstrap from the last
        # step: copy the final observation into the next-obs fields and clear the done flag.
        if last_step.interrupted:
            vec_vis_obs = SplitObservations.from_observations(last_step.obs)
            for i, obs in enumerate(vec_vis_obs.visual_observations):
                agent_buffer_trajectory["next_visual_obs%d" % i][-1] = obs
            if vec_vis_obs.vector_observations.size > 1:
                agent_buffer_trajectory["next_vector_in"][
                    -1] = vec_vis_obs.vector_observations
            agent_buffer_trajectory["done"][-1] = False

        # Append to update buffer
        agent_buffer_trajectory.resequence_and_append(
            self.update_buffer, training_length=self.policy.sequence_length)

        if trajectory.done_reached:
            self._update_end_episode_stats(agent_id, self.optimizer)
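
The PyTorch-versus-TensorFlow branch above is a small dispatch rule; the helper below restates it in isolation as a sketch (the function name is hypothetical, the logic mirrors the example, and the BaseRewardProvider import path is an assumption based on ML-Agents of this era).

import numpy as np
# Import path is an assumption; BaseRewardProvider is the PyTorch reward-provider base class.
from mlagents.trainers.torch.components.reward_providers import BaseRewardProvider


def evaluate_reward_signal(reward_signal, agent_buffer_trajectory) -> np.ndarray:
    """Hypothetical helper restating the dispatch used in _process_trajectory."""
    if isinstance(reward_signal, BaseRewardProvider):
        # PyTorch-based providers: evaluate() returns raw rewards, scaled by strength.
        return reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
    # TensorFlow-based signals: evaluate_batch() returns an object carrying scaled_reward.
    return reward_signal.evaluate_batch(agent_buffer_trajectory).scaled_reward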
Example #3
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    behavior_spec: BehaviorSpec,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    for idx, current_pair_info in enumerate(pair_infos):
        # The final pair has no successor, so stop one element short of the end.
        if idx > len(pair_infos) - 2:
            break
        next_pair_info = pair_infos[idx + 1]
        current_decision_step, current_terminal_step = steps_from_proto(
            [current_pair_info.agent_info], behavior_spec)
        next_decision_step, next_terminal_step = steps_from_proto(
            [next_pair_info.agent_info], behavior_spec)
        # Default to a zero previous action for the first experience of an agent.
        previous_action = (
            np.array(pair_infos[idx].action_info.vector_actions_deprecated,
                     dtype=np.float32) * 0)
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions_deprecated,
                dtype=np.float32,
            )

        next_done = len(next_terminal_step) == 1
        next_reward = 0
        if len(next_terminal_step) == 1:
            next_reward = next_terminal_step.reward[0]
        else:
            next_reward = next_decision_step.reward[0]
        current_obs = None
        if len(current_terminal_step) == 1:
            current_obs = list(current_terminal_step.values())[0].obs
        else:
            current_obs = list(current_decision_step.values())[0].obs

        demo_raw_buffer["done"].append(next_done)
        demo_raw_buffer["rewards"].append(next_reward)
        split_obs = SplitObservations.from_observations(current_obs)
        for i, obs in enumerate(split_obs.visual_observations):
            demo_raw_buffer["visual_obs%d" % i].append(obs)
        demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
        # TODO: update the demonstration files and read from the new proto format
        if behavior_spec.action_spec.continuous_size > 0:
            demo_raw_buffer["continuous_action"].append(
                current_pair_info.action_info.vector_actions_deprecated)
        if behavior_spec.action_spec.discrete_size > 0:
            demo_raw_buffer["discrete_action"].append(
                current_pair_info.action_info.vector_actions_deprecated)
        demo_raw_buffer["prev_action"].append(previous_action)
        if next_done:
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer,
                batch_size=None,
                training_length=sequence_length)
            demo_raw_buffer.reset_agent()
    demo_raw_buffer.resequence_and_append(demo_processed_buffer,
                                          batch_size=None,
                                          training_length=sequence_length)
    return demo_processed_buffer
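
A minimal usage sketch, assuming the companion load_demonstration() helper in mlagents.trainers.demo_loader (the module this function appears to come from) returns the behavior spec, the recorded AgentInfoActionPairProto list, and the expected step count; the file path and sequence length are placeholders.

from mlagents.trainers.demo_loader import load_demonstration, make_demo_buffer

# Placeholder .demo path; the return signature of load_demonstration is an assumption.
behavior_spec, pair_infos, total_expected = load_demonstration("expert.demo")
demo_buffer = make_demo_buffer(pair_infos, behavior_spec, sequence_length=64)
print(f"Buffered {demo_buffer.num_experiences} of ~{total_expected} recorded steps")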