Example #1
def postprocess_trajectory(
        policy: Policy,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
        episode: Optional[MultiAgentEpisode] = None) -> SampleBatch:
    """Postprocesses a trajectory and returns the processed trajectory.

    The trajectory contains only data from one episode and from one agent.
    - If `config.batch_mode=truncate_episodes` (default), sample_batch may
    contain a truncated (at-the-end) episode, in case the
    `config.rollout_fragment_length` was reached by the sampler.
    - If `config.batch_mode=complete_episodes`, sample_batch will contain
    exactly one episode (no matter how long).
    New columns can be added to sample_batch and existing ones may be altered.

    Args:
        policy (Policy): The Policy used to generate the trajectory
            (`sample_batch`).
        sample_batch (SampleBatch): The SampleBatch to postprocess.
        other_agent_batches (Optional[Dict[AgentID, SampleBatch]]): Optional
            dict of AgentIDs mapping to other agents' trajectory data (from the
            same episode). NOTE: The other agents use the same policy.
        episode (Optional[MultiAgentEpisode]): Optional multi-agent episode
            object in which the agents operated.

    Returns:
        SampleBatch: The postprocessed, modified SampleBatch (or a new one).
    """
    return postprocess_nstep_and_prio(policy, sample_batch)
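The docstring above notes that new columns can be added to `sample_batch` and existing ones altered. A minimal standalone sketch of that pattern, assuming only NumPy and RLlib's `SampleBatch` (the `bonus` column and its values are made up for illustration):

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch

# Build a tiny batch by hand; in practice the sampler provides this.
batch = SampleBatch({
    SampleBatch.CUR_OBS: np.zeros((3, 4), dtype=np.float32),
    SampleBatch.REWARDS: np.array([0.0, 1.0, 0.0], dtype=np.float32),
})

# Add a new per-timestep column (hypothetical reward-shaping bonus).
batch["bonus"] = 0.1 * np.ones_like(batch[SampleBatch.REWARDS])

# Existing columns may be altered in place as well.
batch[SampleBatch.REWARDS] = batch[SampleBatch.REWARDS] + batch["bonus"]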
Example #2
    def postprocess_with_HER(policy,
                             sample_batch,
                             _other_agent_batches=None,
                             _episode=None):
        """
            postprocess the sampled batch, inject modified trajectory with modified goal condition
        """

        # Hindsight Experience Replay trajectory augmentation
        if isinstance(sample_batch, SampleBatch):
            # init list of new trajectories
            augmented_trajs = [sample_batch]
            # init HER sampling strategy
            her_sampler = SamplingStrategy(policy, sample_batch)
            # sample n new trajectories using sampling strategy
            for _ in range(policy.config['num_her_traj']):
                augmented_trajs.append(her_sampler.sample_trajectory())
            # concatenate sampled trajectories
            sample_batch = SampleBatch.concat_samples(augmented_trajs)

        # RLlib Original DQN postprocess_fn Implementation
        sample_batch = postprocess_nstep_and_prio(policy, sample_batch,
                                                  _other_agent_batches,
                                                  _episode)

        return sample_batch
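The `SamplingStrategy` class used above is project-specific and not shown here. Purely as an illustration of the underlying idea (hindsight relabeling with the episode's final achieved state as the goal), here is a hypothetical sketch; it assumes goal-conditioned observations laid out as [achieved-state, goal] slices and a sparse 0/-1 reward, which will not match an arbitrary environment:

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch

def relabel_with_final_goal(batch, goal_dim, tol=0.05):
    """Hypothetical 'final' HER strategy: overwrite the goal slice of every
    observation with the last achieved state and recompute sparse rewards."""
    obs = batch[SampleBatch.CUR_OBS].copy()
    new_obs = batch[SampleBatch.NEXT_OBS].copy()
    # Assumed layout: first goal_dim entries = achieved state, last goal_dim = goal.
    achieved_final = new_obs[-1, :goal_dim]
    obs[:, -goal_dim:] = achieved_final
    new_obs[:, -goal_dim:] = achieved_final
    # Sparse reward: 0 when the (relabeled) goal is reached, else -1.
    reached = np.linalg.norm(new_obs[:, :goal_dim] - achieved_final, axis=1) < tol
    relabeled = batch.copy()
    relabeled[SampleBatch.CUR_OBS] = obs
    relabeled[SampleBatch.NEXT_OBS] = new_obs
    relabeled[SampleBatch.REWARDS] = np.where(reached, 0.0, -1.0).astype(np.float32)
    return relabeled

A batch relabeled this way could be appended to `augmented_trajs` before `SampleBatch.concat_samples` is called, mirroring what `her_sampler.sample_trajectory()` does above.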
Example #3
def postprocess_trajectory(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    if 'infos' not in sample_batch:
        sample_batch['members'] = np.ones_like(
            sample_batch[SampleBatch.REWARDS]).astype(np.int32)
        print("infos field not in sample_batch !!!")
    else:
        sample_batch['members'] = np.array(
            [info['active_member'] for info in sample_batch['infos']],
            dtype=np.int32)
    return postprocess_nstep_and_prio(policy, sample_batch)
Example #4
File: ddpg_policy.py  Project: yosagi/ray
    def postprocess_trajectory(self,
                               sample_batch,
                               other_agent_batches=None,
                               episode=None):
        if self.config["parameter_noise"]:
            # adjust the sigma of parameter space noise
            states, noisy_actions = [
                list(x) for x in sample_batch.columns(
                    [SampleBatch.CUR_OBS, SampleBatch.ACTIONS])
            ]
            self.sess.run(self.remove_parameter_noise_op)

            # TODO(sven): This won't work if exploration != Noise, which is
            #  probably fine as parameter_noise will soon be its own
            #  Exploration class.
            clean_actions, cur_noise_scale = self.sess.run(
                [self.output_actions,
                 self.exploration.get_info()],
                feed_dict={
                    self.cur_observations: states,
                    self._is_exploring: False,
                    self._timestep: self.global_timestep,
                })
            distance_in_action_space = np.sqrt(
                np.mean(np.square(clean_actions - noisy_actions)))
            self.pi_distance = distance_in_action_space
            if distance_in_action_space < (
                    self.config["exploration_config"].get("ou_sigma", 0.2) *
                    cur_noise_scale):
                # multiplying the sampled OU noise by noise scale is
                # equivalent to multiplying the sigma of OU by noise scale
                self.parameter_noise_sigma_val *= 1.01
            else:
                self.parameter_noise_sigma_val /= 1.01
            self.parameter_noise_sigma.load(self.parameter_noise_sigma_val,
                                            session=self.sess)

        return postprocess_nstep_and_prio(self, sample_batch)
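The adaptation rule above can be read in isolation: the parameter-noise sigma is nudged up as long as the action-space distance it induces stays below a threshold (here `ou_sigma * cur_noise_scale`), and nudged down otherwise. A minimal NumPy sketch of just that rule (the 1.01 factor mirrors the code above; the action arrays are made up):

import numpy as np

def adapt_parameter_noise_sigma(sigma, clean_actions, noisy_actions,
                                threshold, factor=1.01):
    # Mean L2 distance between actions computed with and without parameter noise.
    distance = np.sqrt(np.mean(np.square(clean_actions - noisy_actions)))
    # Grow sigma while the perturbation is "too small" in action space, else shrink it.
    return sigma * factor if distance < threshold else sigma / factor

# Made-up example: the noisy actions barely differ from the clean ones,
# so sigma is increased (0.1 * 1.01).
clean = np.zeros((32, 2))
noisy = clean + 0.01
print(adapt_parameter_noise_sigma(0.1, clean, noisy, threshold=0.2))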
Example #5
def postprocess_trajectory(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    return postprocess_nstep_and_prio(policy, sample_batch)
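All five snippets follow the same pattern: do any custom per-trajectory work, then delegate to `postprocess_nstep_and_prio` for DQN/DDPG's n-step and priority handling. How such a function gets attached to a policy depends on the RLlib version; a minimal sketch assuming the older `ray.rllib.agents.dqn` layout these examples are written against (module path and `with_updates` usage may differ in newer releases):

from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy

# Derive a policy class whose trajectories are run through the custom
# postprocess_trajectory defined above instead of the default one.
MyDQNPolicy = DQNTFPolicy.with_updates(
    name="MyDQNPolicy",
    postprocess_fn=postprocess_trajectory,
)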