Example #1
def create_agent_buffer(behavior_spec: BehaviorSpec,
                        number: int,
                        reward: float = 0.0) -> AgentBuffer:
    buffer = AgentBuffer()
    curr_observations = [
        np.random.normal(size=shape).astype(np.float32)
        for shape in behavior_spec.observation_shapes
    ]
    next_observations = [
        np.random.normal(size=shape).astype(np.float32)
        for shape in behavior_spec.observation_shapes
    ]
    action = behavior_spec.action_spec.random_action(1)[0, :]
    for _ in range(number):
        curr_split_obs = SplitObservations.from_observations(curr_observations)
        next_split_obs = SplitObservations.from_observations(next_observations)
        for i, _ in enumerate(curr_split_obs.visual_observations):
            buffer["visual_obs%d" % i].append(
                curr_split_obs.visual_observations[i])
            buffer["next_visual_obs%d" % i].append(
                next_split_obs.visual_observations[i])
        buffer["vector_obs"].append(curr_split_obs.vector_observations)
        buffer["next_vector_in"].append(next_split_obs.vector_observations)
        buffer["actions"].append(action)
        buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
        buffer["masks"].append(np.ones(1, dtype=np.float32))
    buffer["done"] = np.zeros(number, dtype=np.float32)
    return buffer
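A minimal usage sketch for the helper above. make_behavior_spec() is a hypothetical stand-in for whatever fixture builds the BehaviorSpec in your test suite; it is not part of the snippet, and the buffer keys match those appended above.

# Hypothetical usage sketch (make_behavior_spec() is assumed, not defined above).
behavior_spec = make_behavior_spec()
buffer = create_agent_buffer(behavior_spec, number=16, reward=1.0)

# 16 experiences were appended; "done" was set to a length-16 array of zeros.
assert len(buffer["actions"]) == 16
assert buffer["done"].shape == (16,)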
Example #2
def create_agent_buffer(behavior_spec: BehaviorSpec,
                        number: int,
                        reward: float = 0.0) -> AgentBuffer:
    buffer = AgentBuffer()
    curr_observations = [
        np.random.normal(size=shape).astype(np.float32)
        for shape in behavior_spec.observation_shapes
    ]
    next_observations = [
        np.random.normal(size=shape).astype(np.float32)
        for shape in behavior_spec.observation_shapes
    ]
    action_buffer = behavior_spec.action_spec.random_action(1)
    action = {}
    if behavior_spec.action_spec.continuous_size > 0:
        action["continuous_action"] = action_buffer.continuous
    if behavior_spec.action_spec.discrete_size > 0:
        action["discrete_action"] = action_buffer.discrete

    for _ in range(number):
        curr_split_obs = SplitObservations.from_observations(curr_observations)
        next_split_obs = SplitObservations.from_observations(next_observations)
        for i, _ in enumerate(curr_split_obs.visual_observations):
            buffer["visual_obs%d" % i].append(
                curr_split_obs.visual_observations[i])
            buffer["next_visual_obs%d" % i].append(
                next_split_obs.visual_observations[i])
        buffer["vector_obs"].append(curr_split_obs.vector_observations)
        buffer["next_vector_in"].append(next_split_obs.vector_observations)
        for _act_type, _act in action.items():
            buffer[_act_type].append(_act[0, :])
        buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
        buffer["masks"].append(np.ones(1, dtype=np.float32))
    buffer["done"] = np.zeros(number, dtype=np.float32)
    return buffer
Example #3
    def _process_trajectory(self, trajectory: Trajectory) -> None:
        """
        Takes a trajectory and processes it, putting it into the replay buffer.
        """
        super()._process_trajectory(trajectory)
        last_step = trajectory.steps[-1]
        agent_id = trajectory.agent_id  # All the agents should have the same ID

        agent_buffer_trajectory = trajectory.to_agentbuffer()

        # Update the normalization
        if self.is_training:
            self.policy.update_normalization(
                agent_buffer_trajectory["vector_obs"])

        # Evaluate all reward functions for reporting purposes
        self.collected_rewards["environment"][agent_id] += np.sum(
            agent_buffer_trajectory["environment_rewards"])
        for name, reward_signal in self.optimizer.reward_signals.items():
            if isinstance(reward_signal, RewardSignal):
                evaluate_result = reward_signal.evaluate_batch(
                    agent_buffer_trajectory).scaled_reward
            else:
                evaluate_result = (
                    reward_signal.evaluate(agent_buffer_trajectory) *
                    reward_signal.strength)
            # Report the reward signals
            self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

        # Get all value estimates for reporting purposes
        value_estimates, _ = self.optimizer.get_trajectory_value_estimates(
            agent_buffer_trajectory, trajectory.next_obs,
            trajectory.done_reached)
        for name, v in value_estimates.items():
            if isinstance(self.optimizer.reward_signals[name], RewardSignal):
                self._stats_reporter.add_stat(
                    self.optimizer.reward_signals[name].value_name, np.mean(v))
            else:
                self._stats_reporter.add_stat(
                    f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",
                    np.mean(v),
                )

        # Bootstrap using the last step rather than the bootstrap step if max step is reached.
        # Set last element to duplicate obs and remove dones.
        if last_step.interrupted:
            vec_vis_obs = SplitObservations.from_observations(last_step.obs)
            for i, obs in enumerate(vec_vis_obs.visual_observations):
                agent_buffer_trajectory["next_visual_obs%d" % i][-1] = obs
            if vec_vis_obs.vector_observations.size > 1:
                agent_buffer_trajectory["next_vector_in"][
                    -1] = vec_vis_obs.vector_observations
            agent_buffer_trajectory["done"][-1] = False

        # Append to update buffer
        agent_buffer_trajectory.resequence_and_append(
            self.update_buffer, training_length=self.policy.sequence_length)

        if trajectory.done_reached:
            self._update_end_episode_stats(agent_id, self.optimizer)
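A note on the interrupted-episode branch above: duplicating the final observation and clearing the done flag keeps the value bootstrap alive when an episode is cut off at max steps rather than genuinely ending. A minimal illustrative sketch of the downstream effect (not the trainer's actual return computation):

def td_target(reward, next_value, done, gamma=0.99):
    # A true terminal state drops the bootstrap term; an interrupted episode
    # keeps done=False so the next-state value still contributes.
    return reward + gamma * next_value * (1.0 - float(done))

print(td_target(1.0, 5.0, done=True))   # 1.0  -> true terminal state
print(td_target(1.0, 5.0, done=False))  # 5.95 -> interrupted, bootstrapped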
Example #4
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    behavior_spec: BehaviorSpec,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    for idx, current_pair_info in enumerate(pair_infos):
        if idx > len(pair_infos) - 2:
            break
        next_pair_info = pair_infos[idx + 1]
        current_decision_step, current_terminal_step = steps_from_proto(
            [current_pair_info.agent_info], behavior_spec
        )
        next_decision_step, next_terminal_step = steps_from_proto(
            [next_pair_info.agent_info], behavior_spec
        )
        previous_action = (
            np.array(pair_infos[idx].action_info.vector_actions, dtype=np.float32) * 0
        )
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
            )

        next_done = len(next_terminal_step) == 1
        next_reward = 0
        if len(next_terminal_step) == 1:
            next_reward = next_terminal_step.reward[0]
        else:
            next_reward = next_decision_step.reward[0]
        current_obs = None
        if len(current_terminal_step) == 1:
            current_obs = list(current_terminal_step.values())[0].obs
        else:
            current_obs = list(current_decision_step.values())[0].obs

        demo_raw_buffer["done"].append(next_done)
        demo_raw_buffer["rewards"].append(next_reward)
        split_obs = SplitObservations.from_observations(current_obs)
        for i, obs in enumerate(split_obs.visual_observations):
            demo_raw_buffer["visual_obs%d" % i].append(obs)
        demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
        demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
        demo_raw_buffer["prev_action"].append(previous_action)
        if next_done:
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer, batch_size=None, training_length=sequence_length
            )
            demo_raw_buffer.reset_agent()
    demo_raw_buffer.resequence_and_append(
        demo_processed_buffer, batch_size=None, training_length=sequence_length
    )
    return demo_processed_buffer
Example #5
    def _split_decision_step(
        self, decision_requests: DecisionSteps
    ) -> Tuple[SplitObservations, np.ndarray]:
        vec_vis_obs = SplitObservations.from_observations(
            decision_requests.obs)
        mask = None
        if not self.use_continuous_act:
            mask = torch.ones([len(decision_requests), np.sum(self.act_size)])
            if decision_requests.action_mask is not None:
                mask = torch.as_tensor(
                    1 - np.concatenate(decision_requests.action_mask, axis=1))
        return vec_vis_obs, mask
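The 1 - np.concatenate(...) step above turns the environment's per-branch action masks into multipliers for the policy logits. A small numpy-only sketch of that inversion, assuming (as the subtraction suggests) that a mask entry of 1 marks a disallowed action:

import numpy as np

# Two discrete branches for a single agent: branch 0 has 3 actions, branch 1 has 2.
# An entry of 1 marks an action the environment forbids this step (assumed convention).
action_mask = [
    np.array([[0, 1, 0]], dtype=np.float32),  # branch 0: action 1 forbidden
    np.array([[1, 0]], dtype=np.float32),     # branch 1: action 0 forbidden
]

allowed = 1 - np.concatenate(action_mask, axis=1)
print(allowed)  # [[1. 0. 1. 0. 1.]] -> 1 keeps a logit, 0 masks it out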
Example #6
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    group_spec: AgentGroupSpec,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    for idx, current_pair_info in enumerate(pair_infos):
        if idx > len(pair_infos) - 2:
            break
        next_pair_info = pair_infos[idx + 1]
        current_step_info = batched_step_result_from_proto(
            [current_pair_info.agent_info], group_spec)
        next_step_info = batched_step_result_from_proto(
            [next_pair_info.agent_info], group_spec)
        previous_action = (np.array(pair_infos[idx].action_info.vector_actions,
                                    dtype=np.float32) * 0)
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions,
                dtype=np.float32)
        curr_agent_id = current_step_info.agent_id[0]
        current_agent_step_info = current_step_info.get_agent_step_result(
            curr_agent_id)
        next_agent_id = next_step_info.agent_id[0]
        next_agent_step_info = next_step_info.get_agent_step_result(
            next_agent_id)

        demo_raw_buffer["done"].append(next_agent_step_info.done)
        demo_raw_buffer["rewards"].append(next_agent_step_info.reward)
        split_obs = SplitObservations.from_observations(
            current_agent_step_info.obs)
        for i, obs in enumerate(split_obs.visual_observations):
            demo_raw_buffer["visual_obs%d" % i].append(obs)
        demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
        demo_raw_buffer["actions"].append(
            current_pair_info.action_info.vector_actions)
        demo_raw_buffer["prev_action"].append(previous_action)
        if next_step_info.done:
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer,
                batch_size=None,
                training_length=sequence_length)
            demo_raw_buffer.reset_agent()
    demo_raw_buffer.resequence_and_append(demo_processed_buffer,
                                          batch_size=None,
                                          training_length=sequence_length)
    return demo_processed_buffer
Example #7
def test_split_obs(num_visual_obs, num_vec_obs):
    obs = []
    for _ in range(num_visual_obs):
        obs.append(np.ones((84, 84, 3), dtype=np.float32))
    for _ in range(num_vec_obs):
        obs.append(np.ones(VEC_OBS_SIZE, dtype=np.float32))
    split_observations = SplitObservations.from_observations(obs)

    if num_vec_obs == 1:
        assert len(split_observations.vector_observations) == VEC_OBS_SIZE
    else:
        assert len(split_observations.vector_observations) == 0

    # Assert the number of visual observations.
    assert len(split_observations.visual_observations) == num_visual_obs
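The test above pins down the behavior these examples rely on: SplitObservations.from_observations routes observations by rank. A self-contained sketch of that split rule (illustrative only, not the ml-agents implementation):

from typing import List, NamedTuple
import numpy as np

class SplitObs(NamedTuple):
    vector_observations: np.ndarray
    visual_observations: List[np.ndarray]

def split_by_rank(obs: List[np.ndarray]) -> SplitObs:
    # 3-D arrays (H, W, C) are treated as visual observations; 1-D arrays are
    # concatenated into one vector (the test only checks the 0- and 1-vector cases).
    visual = [o for o in obs if o.ndim == 3]
    vectors = [o for o in obs if o.ndim == 1]
    vector = np.concatenate(vectors) if vectors else np.zeros(0, dtype=np.float32)
    return SplitObs(vector_observations=vector, visual_observations=visual)

obs = [np.ones((84, 84, 3), dtype=np.float32), np.ones(8, dtype=np.float32)]
split = split_by_rank(obs)
print(split.vector_observations.shape, len(split.visual_observations))  # (8,) 1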
Example #8
    def fill_eval_dict(self, feed_dict, batched_step_result):
        vec_vis_obs = SplitObservations.from_observations(batched_step_result.obs)
        for i, _ in enumerate(vec_vis_obs.visual_observations):
            feed_dict[self.visual_in[i]] = vec_vis_obs.visual_observations[i]
        if self.use_vec_obs:
            feed_dict[self.vector_in] = vec_vis_obs.vector_observations
        if not self.use_continuous_act:
            mask = np.ones(
                (len(batched_step_result), np.sum(self.brain.vector_action_space_size)),
                dtype=np.float32,
            )
            if batched_step_result.action_mask is not None:
                mask = 1 - np.concatenate(batched_step_result.action_mask, axis=1)
            feed_dict[self.action_masks] = mask
        return feed_dict
Example #9
    def _get_value_estimates(
        self,
        next_obs: List[np.ndarray],
        done: bool,
        policy_memory: np.ndarray = None,
        value_memory: np.ndarray = None,
        prev_action: np.ndarray = None,
    ) -> Dict[str, float]:
        """
        Generates value estimates for bootstrapping.
        :param experience: AgentExperience to be used for bootstrapping.
        :param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
        :return: The value estimate dictionary with key being the name of the reward signal and the value the
        corresponding value estimate.
        """

        feed_dict: Dict[tf.Tensor, Any] = {
            self.policy.batch_size_ph: 1,
            self.policy.sequence_length_ph: 1,
        }
        vec_vis_obs = SplitObservations.from_observations(next_obs)
        for i in range(len(vec_vis_obs.visual_observations)):
            feed_dict[self.policy.visual_in[i]] = [
                vec_vis_obs.visual_observations[i]
            ]

        if self.policy.vec_obs_size > 0:
            feed_dict[self.policy.vector_in] = [
                vec_vis_obs.vector_observations
            ]
        if policy_memory is not None:
            feed_dict[self.policy.memory_in] = policy_memory
        if value_memory is not None:
            feed_dict[self.memory_in] = value_memory
        if prev_action is not None:
            feed_dict[self.policy.prev_action] = [prev_action]
        value_estimates = self.sess.run(self.value_heads, feed_dict)

        value_estimates = {k: float(v) for k, v in value_estimates.items()}

        # If we're done, reassign all of the value estimates that need terminal states.
        if done:
            for k in value_estimates:
                if self.reward_signals[k].use_terminal_states:
                    value_estimates[k] = 0.0

        return value_estimates
Example #10
    def get_trajectory_value_estimates(
            self, batch: AgentBuffer, next_obs: List[np.ndarray],
            done: bool) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
        vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
        if self.policy.use_vis_obs:
            visual_obs = []
            for idx, _ in enumerate(
                    self.policy.actor_critic.network_body.visual_processors):
                visual_ob = ModelUtils.list_to_tensor(
                    batch["visual_obs%d" % idx])
                visual_obs.append(visual_ob)
        else:
            visual_obs = []

        memory = torch.zeros([1, 1, self.policy.m_size])

        vec_vis_obs = SplitObservations.from_observations(next_obs)
        next_vec_obs = [
            ModelUtils.list_to_tensor(
                vec_vis_obs.vector_observations).unsqueeze(0)
        ]
        next_vis_obs = [
            ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0)
            for _vis_ob in vec_vis_obs.visual_observations
        ]

        value_estimates, next_memory = self.policy.actor_critic.critic_pass(
            vector_obs,
            visual_obs,
            memory,
            sequence_length=batch.num_experiences)

        next_value_estimate, _ = self.policy.actor_critic.critic_pass(
            next_vec_obs, next_vis_obs, next_memory, sequence_length=1)

        for name, estimate in value_estimates.items():
            value_estimates[name] = ModelUtils.to_numpy(estimate)
            next_value_estimate[name] = ModelUtils.to_numpy(
                next_value_estimate[name])

        if done:
            for k in next_value_estimate:
                if not self.reward_signals[k].ignore_done:
                    next_value_estimate[k] = 0.0

        return value_estimates, next_value_estimate
Example #11
    def get_value_estimates(self, next_obs: List[np.ndarray], agent_id: str,
                            done: bool) -> Dict[str, float]:
        """
        Generates value estimates for bootstrapping.
        :param experience: AgentExperience to be used for bootstrapping.
        :param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
        :return: The value estimate dictionary with key being the name of the reward signal and the value the
        corresponding value estimate.
        """

        feed_dict: Dict[tf.Tensor, Any] = {
            self.model.batch_size: 1,
            self.model.sequence_length: 1,
        }
        vec_vis_obs = SplitObservations.from_observations(next_obs)
        for i in range(len(vec_vis_obs.visual_observations)):
            feed_dict[self.model.visual_in[i]] = [
                vec_vis_obs.visual_observations[i]
            ]

        if self.use_vec_obs:
            feed_dict[self.model.vector_in] = [vec_vis_obs.vector_observations]
        if self.use_recurrent:
            feed_dict[self.model.memory_in] = self.retrieve_memories(
                [agent_id])
        if not self.use_continuous_act and self.use_recurrent:
            feed_dict[self.model.prev_action] = self.retrieve_previous_action(
                [agent_id])
        value_estimates = self.sess.run(self.model.value_heads, feed_dict)

        value_estimates = {k: float(v) for k, v in value_estimates.items()}

        # If we're done, reassign all of the value estimates that need terminal states.
        if done:
            for k in value_estimates:
                if self.reward_signals[k].use_terminal_states:
                    value_estimates[k] = 0.0

        return value_estimates