Пример #1
0
    def postprocess_trajectory(self,
                               sample_batch,
                               other_agent_batches=None,
                               episode=None):
        """Adapt the parameter-noise sigma, then post-process the batch.

        When ``self.config["parameter_noise"]`` is truthy, the actions stored
        in ``sample_batch`` are compared against the actions produced for the
        same observations after running ``self.remove_noise_op``, and
        ``self.parameter_noise_sigma_val`` is scaled up or down by 1.01
        depending on how far apart they are. The new value is then loaded
        into ``self.parameter_noise_sigma``.

        Args:
            sample_batch: Batch providing the ``SampleBatch.CUR_OBS`` and
                ``SampleBatch.ACTIONS`` columns.
            other_agent_batches: Not used by this method.
            episode: Not used by this method.

        Returns:
            The result of ``postprocess_nstep_and_prio(self, sample_batch)``.
        """
        if not self.config["parameter_noise"]:
            return postprocess_nstep_and_prio(self, sample_batch)

        # Adjust the sigma of the parameter space noise.
        obs_column, action_column = sample_batch.columns(
            [SampleBatch.CUR_OBS, SampleBatch.ACTIONS])
        states = list(obs_column)
        noisy_actions = list(action_column)

        # Remove the noise first so the next run yields the clean actions.
        self.sess.run(self.remove_noise_op)
        noise_free_feed = {
            self.cur_observations: states,
            self.stochastic: False,
            self.noise_scale: .0,
            self.pure_exploration_phase: False,
        }
        clean_actions = self.sess.run(
            self.output_actions, feed_dict=noise_free_feed)

        # Root-mean-square distance between clean and noisy actions.
        squared_diff = np.square(clean_actions - noisy_actions)
        rms_distance = np.sqrt(np.mean(squared_diff))
        self.pi_distance = rms_distance

        # Multiplying the sampled OU noise by the noise scale is equivalent
        # to multiplying the sigma of OU by the noise scale.
        target_distance = (
            self.config["exploration_ou_sigma"] * self.cur_noise_scale)
        if rms_distance < target_distance:
            self.parameter_noise_sigma_val *= 1.01
        else:
            self.parameter_noise_sigma_val /= 1.01

        self.parameter_noise_sigma.load(
            self.parameter_noise_sigma_val, session=self.sess)

        return postprocess_nstep_and_prio(self, sample_batch)
Пример #2
0
    def postprocess_trajectory(self,
                               sample_batch,
                               other_agent_batches=None,
                               episode=None):
        """Adapt the parameter-noise sigma, then post-process the batch.

        When ``self.config["parameter_noise"]`` is truthy, the actions stored
        in ``sample_batch`` are compared against the actions produced for the
        same observations after running ``self.remove_parameter_noise_op``
        with ``self._is_exploring`` fed as False.
        ``self.parameter_noise_sigma_val`` is then scaled up or down by 1.01
        depending on that distance and loaded into
        ``self.parameter_noise_sigma``.

        Args:
            sample_batch: Batch providing the ``SampleBatch.CUR_OBS`` and
                ``SampleBatch.ACTIONS`` columns.
            other_agent_batches: Not used by this method.
            episode: Not used by this method.

        Returns:
            The result of ``postprocess_nstep_and_prio(self, sample_batch)``.
        """
        if not self.config["parameter_noise"]:
            return postprocess_nstep_and_prio(self, sample_batch)

        # Adjust the sigma of the parameter space noise.
        obs_column, action_column = sample_batch.columns(
            [SampleBatch.CUR_OBS, SampleBatch.ACTIONS])
        states = list(obs_column)
        noisy_actions = list(action_column)

        # Remove the noise first so the next run yields the clean actions.
        self.sess.run(self.remove_parameter_noise_op)

        # TODO(sven): This won't work if exploration != Noise, which is
        #  probably fine as parameter_noise will soon be its own
        #  Exploration class.
        clean_actions, cur_noise_scale = self.sess.run(
            [self.output_actions, self.exploration.get_info()],
            feed_dict={
                self.cur_observations: states,
                self._is_exploring: False,
                self._timestep: self.global_timestep,
            })

        # Root-mean-square distance between clean and noisy actions.
        squared_diff = np.square(clean_actions - noisy_actions)
        rms_distance = np.sqrt(np.mean(squared_diff))
        self.pi_distance = rms_distance

        # Multiplying the sampled OU noise by the noise scale is equivalent
        # to multiplying the sigma of OU by the noise scale.
        ou_sigma = self.config["exploration_config"].get("ou_sigma", 0.2)
        target_distance = ou_sigma * cur_noise_scale
        if rms_distance < target_distance:
            self.parameter_noise_sigma_val *= 1.01
        else:
            self.parameter_noise_sigma_val /= 1.01

        self.parameter_noise_sigma.load(
            self.parameter_noise_sigma_val, session=self.sess)

        return postprocess_nstep_and_prio(self, sample_batch)
Пример #3
0
def postprocess_trajectory(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    """Post-process a batch by delegating to `postprocess_nstep_and_prio`.

    Args:
        policy: Forwarded as the first argument of
            `postprocess_nstep_and_prio`.
        sample_batch: Forwarded as the second argument of
            `postprocess_nstep_and_prio`.
        other_agent_batches: Not used by this function.
        episode: Not used by this function.

    Returns:
        Whatever `postprocess_nstep_and_prio(policy, sample_batch)` returns.
    """
    processed_batch = postprocess_nstep_and_prio(policy, sample_batch)
    return processed_batch