Example #1
    def __init__(self,
                 action_space: gym.spaces.Space,
                 *,
                 framework: str,
                 model: ModelV2,
                 random_timesteps: int = 0,
                 **kwargs):
        """Initializes a StochasticSampling Exploration object.

        Args:
            action_space (gym.spaces.Space): The gym action space used by the
                environment.
            framework (str): One of None, "tf", "torch".
            model (ModelV2): The ModelV2 used by the owning Policy.
            random_timesteps (int): The number of timesteps for which to act
                completely randomly. Only after this number of timesteps,
                actual samples will be drawn to get exploration actions.
        """
        assert framework is not None
        super().__init__(
            action_space, model=model, framework=framework, **kwargs)

        # Create the Random exploration module (used for the first n
        # timesteps).
        self.random_timesteps = random_timesteps
        self.random_exploration = Random(
            action_space, model=self.model, framework=self.framework, **kwargs)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(
            np.array(0, np.int64),
            framework=self.framework,
            tf_name="timestep",
            dtype=np.int64)
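
The `random_timesteps` argument splits exploration into two phases: completely random actions first, actual distribution samples afterwards. A tiny pure-Python sketch of that gating (illustration only, not RLlib code; `exploration_branch` is a made-up name):

def exploration_branch(timestep, random_timesteps):
    # Before `random_timesteps` is reached, act completely randomly;
    # afterwards, draw actual samples from the action distribution.
    return "random" if timestep < random_timesteps else "sample"

assert exploration_branch(0, random_timesteps=10) == "random"
assert exploration_branch(10, random_timesteps=10) == "sample"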
Example #2
    def __init__(self,
                 action_space: Space,
                 *,
                 framework: str,
                 model: ModelV2,
                 random_timesteps: int = 1000,
                 stddev: float = 0.1,
                 initial_scale: float = 1.0,
                 final_scale: float = 0.02,
                 scale_timesteps: int = 10000,
                 scale_schedule: Optional[Schedule] = None,
                 **kwargs):
        """Initializes a GaussianNoise Exploration object.

        Args:
            random_timesteps (int): The number of timesteps for which to act
                completely randomly. Only after this number of timesteps, the
                `self.scale` annealing process will start (see below).
            stddev (float): The stddev (sigma) to use for the
                Gaussian noise to be added to the actions.
            initial_scale (float): The initial scaling weight to multiply
                the noise with.
            final_scale (float): The final scaling weight to multiply
                the noise with.
            scale_timesteps (int): The timesteps over which to linearly anneal
                the scaling factor (after(!) having used random actions for
                `random_timesteps` steps).
            scale_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
        """
        assert framework is not None
        super().__init__(action_space,
                         model=model,
                         framework=framework,
                         **kwargs)

        # Create the Random exploration module (used for the first n
        # timesteps).
        self.random_timesteps = random_timesteps
        self.random_exploration = Random(action_space,
                                         model=self.model,
                                         framework=self.framework,
                                         **kwargs)

        self.stddev = stddev
        # The `scale` annealing schedule.
        self.scale_schedule = scale_schedule or PiecewiseSchedule(
            endpoints=[(random_timesteps, initial_scale),
                       (random_timesteps + scale_timesteps, final_scale)],
            outside_value=final_scale,
            framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=self.framework,
                                          tf_name="timestep")

        # Build the tf-info-op.
        if self.framework in ["tf2", "tf", "tfe"]:
            self._tf_info_op = self.get_info()
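
The PiecewiseSchedule built above anneals `scale` linearly from `initial_scale` at timestep `random_timesteps` down to `final_scale` at `random_timesteps + scale_timesteps`, and returns `final_scale` (the `outside_value`) beyond that; during the first `random_timesteps` steps actions are fully random, so the scale is not used. A standalone plain-Python sketch of that annealing (not RLlib's PiecewiseSchedule; `annealed_scale` is a made-up helper):

def annealed_scale(ts, random_timesteps=1000, scale_timesteps=10000,
                   initial_scale=1.0, final_scale=0.02):
    # Scale used once the purely random phase is over (ts >= random_timesteps).
    if ts >= random_timesteps + scale_timesteps:
        return final_scale  # the schedule's `outside_value`
    frac = max(0.0, (ts - random_timesteps) / float(scale_timesteps))
    return initial_scale + frac * (final_scale - initial_scale)

assert annealed_scale(1000) == 1.0
assert abs(annealed_scale(6000) - 0.51) < 1e-6
assert annealed_scale(11000) == 0.02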
Example #3
    def __init__(self,
                 action_space,
                 *,
                 random_timesteps=1000,
                 stddev=0.1,
                 initial_scale=1.0,
                 final_scale=0.02,
                 scale_timesteps=10000,
                 scale_schedule=None,
                 framework="tf",
                 **kwargs):
        """Initializes a GaussianNoise Exploration object.

        Args:
            action_space (Space): The gym action space used by the environment.
            random_timesteps (int): The number of timesteps for which to act
                completely randomly. Only after this number of timesteps, the
                `self.scale` annealing process will start (see below).
            stddev (float): The stddev (sigma) to use for the
                Gaussian noise to be added to the actions.
            initial_scale (float): The initial scaling weight to multiply
                the noise with.
            final_scale (float): The final scaling weight to multiply
                the noise with.
            scale_timesteps (int): The timesteps over which to linearly anneal
                the scaling factor (after(!) having used random actions for
                `random_timesteps` steps).
            scale_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
            framework (Optional[str]): One of None, "tf", "torch".
        """
        assert framework is not None
        super().__init__(action_space, framework=framework, **kwargs)

        self.random_timesteps = random_timesteps
        self.random_exploration = Random(action_space,
                                         framework=self.framework,
                                         **kwargs)
        self.stddev = stddev
        # The `scale` annealing schedule.
        self.scale_schedule = scale_schedule or PiecewiseSchedule(
            endpoints=[(random_timesteps, initial_scale),
                       (random_timesteps + scale_timesteps, final_scale)],
            outside_value=final_scale,
            framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=self.framework,
                                          tf_name="timestep")
Example #4
def setup_early_mixins(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> None:
    """Call mixin classes' constructors before Policy's initialization.

    Adds the necessary optimizers to the given Policy.

    Args:
        policy (Policy): The Policy object.
        obs_space (gym.spaces.Space): The Policy's observation space.
        action_space (gym.spaces.Space): The Policy's action space.
        config (TrainerConfigDict): The Policy's config.
    """
    policy.cur_iter = 0
    ActorCriticOptimizerMixin.__init__(policy, config)
    if config["lagrangian"]:
        policy.model.log_alpha_prime = get_variable(0.0,
                                                    framework="tf",
                                                    trainable=True,
                                                    tf_name="log_alpha_prime")
        policy.alpha_prime_optim = tf.keras.optimizers.Adam(
            learning_rate=config["optimization"]["critic_learning_rate"])
    # Generic random action generator for calculating CQL-loss.
    policy._random_action_generator = Random(
        action_space,
        model=None,
        framework="tf2",
        policy_config=config,
        num_workers=0,
        worker_index=0,
    )
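
In the `lagrangian` branch above, `get_variable(0.0, framework="tf", trainable=True, ...)` plays the role of a trainable TF variable that the extra Adam optimizer updates. A rough plain-TensorFlow-2 sketch of that pattern (the dummy penalty term is invented for illustration and is not CQL's actual loss):

import tensorflow as tf

config = {"optimization": {"critic_learning_rate": 3e-4}}

# Trainable scalar standing in for `log_alpha_prime`.
log_alpha_prime = tf.Variable(0.0, trainable=True, name="log_alpha_prime")
alpha_prime_optim = tf.keras.optimizers.Adam(
    learning_rate=config["optimization"]["critic_learning_rate"])

# One illustrative gradient step on a stand-in penalty term.
with tf.GradientTape() as tape:
    dummy_penalty = tf.exp(log_alpha_prime) * 0.5
grads = tape.gradient(dummy_penalty, [log_alpha_prime])
alpha_prime_optim.apply_gradients(zip(grads, [log_alpha_prime]))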
Example #5
class StochasticSampling(Exploration):
    """An exploration that simply samples from a distribution.

    The sampling can be made deterministic by passing explore=False into
    the call to `get_exploration_action`.
    Also allows for scheduled parameters for the distributions, such as
    lowering stddev, temperature, etc. over time.
    """
    def __init__(self,
                 action_space: gym.spaces.Space,
                 *,
                 framework: str,
                 model: ModelV2,
                 random_timesteps: int = 0,
                 **kwargs):
        """Initializes a StochasticSampling Exploration object.

        Args:
            action_space (gym.spaces.Space): The gym action space used by the
                environment.
            framework (str): One of None, "tf", "torch".
            model (ModelV2): The ModelV2 used by the owning Policy.
            random_timesteps (int): The number of timesteps for which to act
                completely randomly. Only after this number of timesteps,
                actual samples will be drawn to get exploration actions.
        """
        assert framework is not None
        super().__init__(action_space,
                         model=model,
                         framework=framework,
                         **kwargs)

        # Create the Random exploration module (used for the first n
        # timesteps).
        self.random_timesteps = random_timesteps
        self.random_exploration = Random(action_space,
                                         model=self.model,
                                         framework=self.framework,
                                         **kwargs)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=self.framework,
                                          tf_name="timestep")

    @override(Exploration)
    def get_exploration_action(self,
                               *,
                               action_distribution: ActionDistribution,
                               timestep: Union[int, TensorType],
                               explore: bool = True):
        if self.framework == "torch":
            return self._get_torch_exploration_action(action_distribution,
                                                      timestep, explore)
        else:
            return self._get_tf_exploration_action_op(action_distribution,
                                                      timestep, explore)

    def _get_tf_exploration_action_op(self, action_dist, timestep, explore):
        ts = timestep if timestep is not None else self.last_timestep + 1

        stochastic_actions = tf.cond(
            pred=tf.convert_to_tensor(ts < self.random_timesteps),
            true_fn=lambda:
            (self.random_exploration.get_tf_exploration_action_op(
                action_dist, explore=True)[0]),
            false_fn=lambda: action_dist.sample(),
        )
        deterministic_actions = action_dist.deterministic_sample()

        action = tf.cond(
            tf.constant(explore) if isinstance(explore, bool) else explore,
            true_fn=lambda: stochastic_actions,
            false_fn=lambda: deterministic_actions)

        def logp_false_fn():
            batch_size = tf.shape(tree.flatten(action)[0])[0]
            return tf.zeros(shape=(batch_size, ), dtype=tf.float32)

        logp = tf.cond(tf.math.logical_and(
            explore, tf.convert_to_tensor(ts >= self.random_timesteps)),
                       true_fn=lambda: action_dist.sampled_action_logp(),
                       false_fn=logp_false_fn)

        # Increment `last_timestep` by 1 (or set to `timestep`).
        if self.framework in ["tf2", "tfe"]:
            if timestep is None:
                self.last_timestep.assign_add(1)
            else:
                self.last_timestep.assign(timestep)
            return action, logp
        else:
            assign_op = (tf1.assign_add(self.last_timestep, 1)
                         if timestep is None else tf1.assign(
                             self.last_timestep, timestep))
            with tf1.control_dependencies([assign_op]):
                return action, logp

    def _get_torch_exploration_action(self, action_dist: ActionDistribution,
                                      timestep: Union[TensorType, int],
                                      explore: Union[TensorType, bool]):
        # Set last timestep or (if not given) increase by one.
        self.last_timestep = timestep if timestep is not None else \
            self.last_timestep + 1

        # Apply exploration.
        if explore:
            # Random exploration phase.
            if self.last_timestep < self.random_timesteps:
                action, logp = \
                    self.random_exploration.get_torch_exploration_action(
                        action_dist, explore=True)
            # Take a sample from our distribution.
            else:
                action = action_dist.sample()
                logp = action_dist.sampled_action_logp()

        # No exploration -> Return deterministic actions.
        else:
            action = action_dist.deterministic_sample()
            logp = torch.zeros_like(action_dist.sampled_action_logp())

        return action, logp
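
Stripped of RLlib types, the torch branch above boils down to: sample from the action distribution and keep its log-prob when exploring, otherwise take the distribution's mode with a zero log-prob. A self-contained sketch using `torch.distributions.Normal` in place of RLlib's ActionDistribution (`sample_or_mode` is a made-up name):

import torch
from torch.distributions import Normal

def sample_or_mode(dist, explore):
    # Mirrors the explore / no-explore split in _get_torch_exploration_action.
    if explore:
        action = dist.sample()
        logp = dist.log_prob(action).sum(-1)  # per-sample log-likelihood
    else:
        action = dist.mean                    # deterministic "sample" (the mode)
        logp = torch.zeros(action.shape[0])   # logp is defined as zero here
    return action, logp

dist = Normal(torch.zeros(4, 2), torch.ones(4, 2))
explore_action, explore_logp = sample_or_mode(dist, explore=True)
greedy_action, greedy_logp = sample_or_mode(dist, explore=False)
assert torch.all(greedy_action == 0.0) and torch.all(greedy_logp == 0.0)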
Example #6
class GaussianNoise(Exploration):
    """An exploration that adds white noise to continuous actions.

    If explore=True, returns the deterministic action plus Gaussian noise
        scaled by `scale` (annealed over time). An initial phase of completely
        random actions is also possible.
    If explore=False, returns the deterministic action.
    """
    def __init__(self,
                 action_space: Space,
                 *,
                 framework: str,
                 model: ModelV2,
                 random_timesteps: int = 1000,
                 stddev: float = 0.1,
                 initial_scale: float = 1.0,
                 final_scale: float = 0.02,
                 scale_timesteps: int = 10000,
                 scale_schedule: Optional[Schedule] = None,
                 **kwargs):
        """Initializes a GaussianNoise Exploration object.

        Args:
            random_timesteps (int): The number of timesteps for which to act
                completely randomly. Only after this number of timesteps, the
                `self.scale` annealing process will start (see below).
            stddev (float): The stddev (sigma) to use for the
                Gaussian noise to be added to the actions.
            initial_scale (float): The initial scaling weight to multiply
                the noise with.
            final_scale (float): The final scaling weight to multiply
                the noise with.
            scale_timesteps (int): The timesteps over which to linearly anneal
                the scaling factor (after(!) having used random actions for
                `random_timesteps` steps).
            scale_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
        """
        assert framework is not None
        super().__init__(action_space,
                         model=model,
                         framework=framework,
                         **kwargs)

        # Create the Random exploration module (used for the first n
        # timesteps).
        self.random_timesteps = random_timesteps
        self.random_exploration = Random(action_space,
                                         model=self.model,
                                         framework=self.framework,
                                         **kwargs)

        self.stddev = stddev
        # The `scale` annealing schedule.
        self.scale_schedule = scale_schedule or PiecewiseSchedule(
            endpoints=[(random_timesteps, initial_scale),
                       (random_timesteps + scale_timesteps, final_scale)],
            outside_value=final_scale,
            framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(np.array(0, np.int64),
                                          framework=self.framework,
                                          tf_name="timestep",
                                          dtype=np.int64)

        # Build the tf-state-op.
        if self.framework == "tf":
            self._tf_state_op = self.get_state()

    @override(Exploration)
    def get_exploration_action(self,
                               *,
                               action_distribution: ActionDistribution,
                               timestep: Union[int, TensorType],
                               explore: bool = True):
        # Adds IID Gaussian noise for exploration, TD3-style.
        if self.framework == "torch":
            return self._get_torch_exploration_action(action_distribution,
                                                      explore, timestep)
        else:
            return self._get_tf_exploration_action_op(action_distribution,
                                                      explore, timestep)

    def _get_tf_exploration_action_op(self, action_dist: ActionDistribution,
                                      explore: bool,
                                      timestep: Union[int, TensorType]):
        ts = timestep if timestep is not None else self.last_timestep

        # The deterministic actions (if explore=False).
        deterministic_actions = action_dist.deterministic_sample()

        # Take a Gaussian sample with our stddev (mean=0.0) and scale it.
        gaussian_sample = self.scale_schedule(ts) * tf.random.normal(
            tf.shape(deterministic_actions), stddev=self.stddev)

        # Stochastic actions could either be: random OR action + noise.
        random_actions, _ = \
            self.random_exploration.get_tf_exploration_action_op(
                action_dist, explore)
        stochastic_actions = tf.cond(
            pred=tf.convert_to_tensor(ts < self.random_timesteps),
            true_fn=lambda: random_actions,
            false_fn=lambda: tf.clip_by_value(
                deterministic_actions + gaussian_sample,
                self.action_space.low * tf.ones_like(deterministic_actions),
                self.action_space.high * tf.ones_like(deterministic_actions)))

        # Choose by `explore` (main exploration switch).
        action = tf.cond(pred=tf.constant(explore, dtype=tf.bool)
                         if isinstance(explore, bool) else explore,
                         true_fn=lambda: stochastic_actions,
                         false_fn=lambda: deterministic_actions)
        # Logp=always zero.
        logp = zero_logps_from_actions(deterministic_actions)

        # Increment `last_timestep` by 1 (or set to `timestep`).
        if self.framework in ["tf2", "tfe"]:
            if timestep is None:
                self.last_timestep.assign_add(1)
            else:
                self.last_timestep.assign(tf.cast(timestep, tf.int64))
            return action, logp
        else:
            assign_op = (tf1.assign_add(self.last_timestep, 1)
                         if timestep is None else tf1.assign(
                             self.last_timestep, timestep))
            with tf1.control_dependencies([assign_op]):
                return action, logp

    def _get_torch_exploration_action(self, action_dist: ActionDistribution,
                                      explore: bool,
                                      timestep: Union[int, TensorType]):
        # Set last timestep or (if not given) increase by one.
        self.last_timestep = timestep if timestep is not None else \
            self.last_timestep + 1

        # Apply exploration.
        if explore:
            # Random exploration phase.
            if self.last_timestep < self.random_timesteps:
                action, _ = \
                    self.random_exploration.get_torch_exploration_action(
                        action_dist, explore=True)
            # Take a Gaussian sample with our stddev (mean=0.0) and scale it.
            else:
                det_actions = action_dist.deterministic_sample()
                scale = self.scale_schedule(self.last_timestep)
                gaussian_sample = scale * torch.normal(
                    mean=torch.zeros(det_actions.size()), std=self.stddev).to(
                        self.device)
                action = torch.min(
                    torch.max(
                        det_actions + gaussian_sample,
                        torch.tensor(self.action_space.low,
                                     dtype=torch.float32,
                                     device=self.device)),
                    torch.tensor(self.action_space.high,
                                 dtype=torch.float32,
                                 device=self.device))
        # No exploration -> Return deterministic actions.
        else:
            action = action_dist.deterministic_sample()

        # Logp=always zero.
        logp = torch.zeros((action.size()[0], ),
                           dtype=torch.float32,
                           device=self.device)

        return action, logp

    @override(Exploration)
    def get_state(self, sess: Optional["tf.Session"] = None):
        """Returns the current scale value.

        Returns:
            Union[float,tf.Tensor[float]]: The current scale value.
        """
        if sess:
            return sess.run(self._tf_state_op)
        scale = self.scale_schedule(self.last_timestep)
        return {
            "cur_scale":
            convert_to_numpy(scale) if self.framework != "tf" else scale,
            "last_timestep":
            convert_to_numpy(self.last_timestep)
            if self.framework != "tf" else self.last_timestep,
        }

    @override(Exploration)
    def set_state(self,
                  state: dict,
                  sess: Optional["tf.Session"] = None) -> None:
        if self.framework == "tf":
            self.last_timestep.load(state["last_timestep"], session=sess)
        elif isinstance(self.last_timestep, int):
            self.last_timestep = state["last_timestep"]
        else:
            self.last_timestep.assign(state["last_timestep"])
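
The heart of the torch exploration branch above is: take the deterministic action, add zero-mean Gaussian noise scaled by the current schedule value, and clip to the action-space bounds. A minimal standalone torch sketch of exactly that step (hard-coded bounds instead of a gym space; `noisy_action` is a made-up helper):

import torch

def noisy_action(det_action, scale, stddev, low, high):
    # det_action + scale * N(0, stddev), clipped element-wise to [low, high].
    noise = scale * torch.normal(mean=torch.zeros_like(det_action), std=stddev)
    return torch.clamp(det_action + noise, min=low, max=high)

det = torch.tensor([[0.9, -0.9]])
action = noisy_action(det, scale=0.5, stddev=0.1, low=-1.0, high=1.0)
assert torch.all(action >= -1.0) and torch.all(action <= 1.0)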
Example #7
class GaussianNoise(Exploration):
    """An exploration that adds white noise to continuous actions.

    If explore=True, returns the deterministic action plus Gaussian noise
        scaled by `scale` (annealed over time). An initial phase of completely
        random actions is also possible.
    If explore=False, returns the deterministic action.
    """
    def __init__(self,
                 action_space,
                 *,
                 random_timesteps=1000,
                 stddev=0.1,
                 initial_scale=1.0,
                 final_scale=0.02,
                 scale_timesteps=10000,
                 scale_schedule=None,
                 framework="tf",
                 **kwargs):
        """Initializes a GaussianNoise Exploration object.

        Args:
            action_space (Space): The gym action space used by the environment.
            random_timesteps (int): The number of timesteps for which to act
                completely randomly. Only after this number of timesteps, the
                `self.scale` annealing process will start (see below).
            stddev (float): The stddev (sigma) to use for the
                Gaussian noise to be added to the actions.
            initial_scale (float): The initial scaling weight to multiply
                the noise with.
            final_scale (float): The final scaling weight to multiply
                the noise with.
            scale_timesteps (int): The timesteps over which to linearly anneal
                the scaling factor (after(!) having used random actions for
                `random_timesteps` steps).
            scale_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
            framework (Optional[str]): One of None, "tf", "torch".
        """
        assert framework is not None
        super().__init__(action_space, framework=framework, **kwargs)

        self.random_timesteps = random_timesteps
        self.random_exploration = Random(action_space,
                                         framework=self.framework)
        self.stddev = stddev
        # The `scale` annealing schedule.
        self.scale_schedule = scale_schedule or PiecewiseSchedule(
            endpoints=[(random_timesteps, initial_scale),
                       (random_timesteps + scale_timesteps, final_scale)],
            outside_value=final_scale,
            framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=self.framework,
                                          tf_name="timestep")

    @override(Exploration)
    def get_exploration_action(self,
                               distribution_inputs,
                               action_dist_class,
                               model=None,
                               explore=True,
                               timestep=None):
        # Adds IID Gaussian noise for exploration, TD3-style.
        action_dist = action_dist_class(distribution_inputs, model)

        if self.framework == "torch":
            return self._get_torch_exploration_action(action_dist, explore,
                                                      timestep)
        else:
            return self._get_tf_exploration_action_op(action_dist, explore,
                                                      timestep)

    def _get_tf_exploration_action_op(self, action_dist, explore, timestep):
        ts = timestep if timestep is not None else self.last_timestep

        # The deterministic actions (if explore=False).
        deterministic_actions = action_dist.deterministic_sample()

        # Take a Gaussian sample with our stddev (mean=0.0) and scale it.
        gaussian_sample = self.scale_schedule(ts) * tf.random_normal(
            tf.shape(deterministic_actions), stddev=self.stddev)

        # Stochastic actions could either be: random OR action + noise.
        random_actions, _ = \
            self.random_exploration.get_tf_exploration_action_op(
                action_dist, explore)
        stochastic_actions = tf.cond(
            pred=ts <= self.random_timesteps,
            true_fn=lambda: random_actions,
            false_fn=lambda: tf.clip_by_value(
                deterministic_actions + gaussian_sample,
                self.action_space.low * tf.ones_like(deterministic_actions),
                self.action_space.high * tf.ones_like(deterministic_actions)))

        # Choose by `explore` (main exploration switch).
        batch_size = tf.shape(deterministic_actions)[0]
        action = tf.cond(pred=tf.constant(explore, dtype=tf.bool)
                         if isinstance(explore, bool) else explore,
                         true_fn=lambda: stochastic_actions,
                         false_fn=lambda: deterministic_actions)
        # Logp=always zero.
        logp = tf.zeros(shape=(batch_size, ), dtype=tf.float32)

        # Increment `last_timestep` by 1 (or set to `timestep`).
        assign_op = \
            tf.assign_add(self.last_timestep, 1) if timestep is None else \
            tf.assign(self.last_timestep, timestep)
        with tf.control_dependencies([assign_op]):
            return action, logp

    def _get_torch_exploration_action(self, action_dist, explore, timestep):
        # Set last timestep or (if not given) increase by one.
        self.last_timestep = timestep if timestep is not None else \
            self.last_timestep + 1

        # Apply exploration.
        if explore:
            # Random exploration phase.
            if self.last_timestep <= self.random_timesteps:
                action, _ = \
                    self.random_exploration.get_torch_exploration_action(
                        action_dist, True)
            # Take a Gaussian sample with our stddev (mean=0.0) and scale it.
            else:
                det_actions = action_dist.deterministic_sample()
                scale = self.scale_schedule(self.last_timestep)
                gaussian_sample = scale * torch.normal(
                    mean=0.0, std=self.stddev, size=det_actions.size())
                action = torch.clamp(
                    det_actions + gaussian_sample,
                    self.action_space.low * torch.ones_like(det_actions),
                    self.action_space.high * torch.ones_like(det_actions))
        # No exploration -> Return deterministic actions.
        else:
            action = action_dist.deterministic_sample()

        # Logp=always zero.
        logp = torch.zeros((action.size()[0], ), dtype=torch.float32)

        return action, logp

    @override(Exploration)
    def get_info(self):
        """Returns the current scale value.

        Returns:
            Union[float,tf.Tensor[float]]: The current scale value.
        """
        return self.scale_schedule(self.last_timestep)