Example #1
    def __init__(self,
                 action_space,
                 *,
                 framework: str,
                 ou_theta: float = 0.15,
                 ou_sigma: float = 0.2,
                 ou_base_scale: float = 0.1,
                 random_timesteps: int = 1000,
                 initial_scale: float = 1.0,
                 final_scale: float = 0.02,
                 scale_timesteps: int = 10000,
                 scale_schedule: Optional[Schedule] = None,
                 **kwargs):
        """Initializes an Ornstein-Uhlenbeck Exploration object.

        Args:
            action_space: The gym action space used by the environment.
            ou_theta: The theta parameter of the Ornstein-Uhlenbeck process.
            ou_sigma: The sigma parameter of the Ornstein-Uhlenbeck process.
            ou_base_scale: A fixed scaling factor, by which all OU-
                noise is multiplied. NOTE: This is on top of the parent
                GaussianNoise's scaling.
            random_timesteps: The number of timesteps for which to act
                completely randomly. Only after this number of timesteps, the
                `self.scale` annealing process will start (see below).
            initial_scale: The initial scaling weight to multiply the
                noise with.
            final_scale: The final scaling weight to multiply the noise with.
            scale_timesteps: The timesteps over which to linearly anneal the
                scaling factor (after(!) having used random actions for
                `random_timesteps` steps).
            scale_schedule: An optional Schedule object to use (instead
                of constructing one from the given parameters).
            framework: One of None, "tf", "torch".
        """
        # The current OU-state value (gets updated each time an exploration
        # action is computed).
        self.ou_state = get_variable(np.array(action_space.low.size * [.0],
                                              dtype=np.float32),
                                     framework=framework,
                                     tf_name="ou_state",
                                     torch_tensor=True,
                                     device=None)

        super().__init__(
            action_space,
            framework=framework,
            random_timesteps=random_timesteps,
            initial_scale=initial_scale,
            final_scale=final_scale,
            scale_timesteps=scale_timesteps,
            scale_schedule=scale_schedule,
            stddev=1.0,  # Force `self.stddev` to 1.0.
            **kwargs)
        self.ou_theta = ou_theta
        self.ou_sigma = ou_sigma
        self.ou_base_scale = ou_base_scale
        # Now that we know the device, move ou_state there, in case of PyTorch.
        if self.framework == "torch" and self.device is not None:
            self.ou_state = self.ou_state.to(self.device)
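For context, the `ou_theta` and `ou_sigma` arguments above parameterize a mean-reverting noise process. The following is a minimal NumPy sketch of a zero-mean Ornstein-Uhlenbeck update (a generic illustration, not the exact RLlib internals): each step pulls the state back toward zero by a factor of `theta` and adds Gaussian noise scaled by `sigma`.

import numpy as np

def ou_step(ou_state, theta=0.15, sigma=0.2, rng=np.random):
    # Mean reversion toward 0, plus a Gaussian perturbation.
    return ou_state + theta * (0.0 - ou_state) + sigma * rng.normal(size=ou_state.shape)

state = np.zeros(3, dtype=np.float32)
for _ in range(5):
    state = ou_step(state)  # temporally correlated noise values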
Example #2
    def __init__(self,
                 action_space,
                 initial_epsilon=1.0,
                 final_epsilon=0.05,
                 epsilon_timesteps=int(1e5),
                 epsilon_schedule=None,
                 framework="tf",
                 **kwargs):
        """Create an EpsilonGreedy exploration class.

        Args:
            action_space (Space): The gym action space used by the environment.
            initial_epsilon (float): The initial epsilon value to use.
            final_epsilon (float): The final epsilon value to use.
            epsilon_timesteps (int): The time step after which epsilon should
                always be `final_epsilon`.
            epsilon_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
            framework (Optional[str]): One of None, "tf", "torch".
        """
        assert framework is not None
        super().__init__(action_space=action_space,
                         framework=framework,
                         **kwargs)

        self.epsilon_schedule = epsilon_schedule or PiecewiseSchedule(
            endpoints=[(0, initial_epsilon),
                       (epsilon_timesteps, final_epsilon)],
            outside_value=final_epsilon,
            framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=framework,
                                          tf_name="timestep")
Example #3
 def __init__(self, config):
     # - Create global step for counting the number of update operations.
     # - Use separate optimizers for actor & critic.
     if config["framework"] in ["tf2", "tfe"]:
         self.global_step = get_variable(0, tf_name="global_step")
         self._actor_optimizer = tf.keras.optimizers.Adam(
             learning_rate=config["optimization"]["actor_learning_rate"])
         self._critic_optimizer = [
             tf.keras.optimizers.Adam(learning_rate=config["optimization"]
                                      ["critic_learning_rate"])
         ]
         if config["twin_q"]:
             self._critic_optimizer.append(
                 tf.keras.optimizers.Adam(
                     learning_rate=config["optimization"]
                     ["critic_learning_rate"]))
         self._alpha_optimizer = tf.keras.optimizers.Adam(
             learning_rate=config["optimization"]["entropy_learning_rate"])
     else:
         self.global_step = tf1.train.get_or_create_global_step()
         self._actor_optimizer = tf1.train.AdamOptimizer(
             learning_rate=config["optimization"]["actor_learning_rate"])
         self._critic_optimizer = [
             tf1.train.AdamOptimizer(learning_rate=config["optimization"]
                                     ["critic_learning_rate"])
         ]
         if config["twin_q"]:
             self._critic_optimizer.append(
                 tf1.train.AdamOptimizer(
                     learning_rate=config["optimization"]
                     ["critic_learning_rate"]))
         self._alpha_optimizer = tf1.train.AdamOptimizer(
             learning_rate=config["optimization"]["entropy_learning_rate"])
Example #4
def setup_early_mixins(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> None:
    """Call mixin classes' constructors before Policy's initialization.

    Adds the necessary optimizers to the given Policy.

    Args:
        policy (Policy): The Policy object.
        obs_space (gym.spaces.Space): The Policy's observation space.
        action_space (gym.spaces.Space): The Policy's action space.
        config (TrainerConfigDict): The Policy's config.
    """
    policy.cur_iter = 0
    ActorCriticOptimizerMixin.__init__(policy, config)
    if config["lagrangian"]:
        policy.model.log_alpha_prime = get_variable(0.0,
                                                    framework="tf",
                                                    trainable=True,
                                                    tf_name="log_alpha_prime")
        policy.alpha_prime_optim = tf.keras.optimizers.Adam(
            learning_rate=config["optimization"]["critic_learning_rate"], )
    # Generic random action generator for calculating CQL-loss.
    policy._random_action_generator = Random(
        action_space,
        model=None,
        framework="tf2",
        policy_config=config,
        num_workers=0,
        worker_index=0,
    )
Example #5
 def __init__(self, config):
     # KL Coefficient
     self.kl_coeff_val = config["kl_coeff"]
     self.kl_target = config["kl_target"]
     self.kl_coeff = get_variable(float(self.kl_coeff_val),
                                  tf_name="kl_coeff",
                                  trainable=False)
Example #6
File: sac_tf_policy.py Project: smorad/ray
 def __init__(self, config):
     # Eager mode.
     if config["framework"] in ["tf2", "tfe"]:
         self.global_step = get_variable(0, tf_name="global_step")
         self._actor_optimizer = tf.keras.optimizers.Adam(
             learning_rate=config["optimization"]["actor_learning_rate"])
         self._critic_optimizer = [
             tf.keras.optimizers.Adam(learning_rate=config["optimization"]
                                      ["critic_learning_rate"])
         ]
         if config["twin_q"]:
             self._critic_optimizer.append(
                 tf.keras.optimizers.Adam(
                     learning_rate=config["optimization"]
                     ["critic_learning_rate"]))
         self._alpha_optimizer = tf.keras.optimizers.Adam(
             learning_rate=config["optimization"]["entropy_learning_rate"])
     # Static graph mode.
     else:
         self.global_step = tf1.train.get_or_create_global_step()
         self._actor_optimizer = tf1.train.AdamOptimizer(
             learning_rate=config["optimization"]["actor_learning_rate"])
         self._critic_optimizer = [
             tf1.train.AdamOptimizer(learning_rate=config["optimization"]
                                     ["critic_learning_rate"])
         ]
         if config["twin_q"]:
             self._critic_optimizer.append(
                 tf1.train.AdamOptimizer(
                     learning_rate=config["optimization"]
                     ["critic_learning_rate"]))
         self._alpha_optimizer = tf1.train.AdamOptimizer(
             learning_rate=config["optimization"]["entropy_learning_rate"])
Example #7
    def __init__(self,
                 action_space: gym.spaces.Space,
                 *,
                 framework: str,
                 model: ModelV2,
                 random_timesteps: int = 0,
                 **kwargs):
        """Initializes a StochasticSampling Exploration object.

        Args:
            action_space (gym.spaces.Space): The gym action space used by the
                environment.
            framework (str): One of None, "tf", "torch".
            model (ModelV2): The ModelV2 used by the owning Policy.
            random_timesteps (int): The number of timesteps for which to act
                completely randomly. Only after this number of timesteps,
                actual samples will be drawn to get exploration actions.
        """
        assert framework is not None
        super().__init__(
            action_space, model=model, framework=framework, **kwargs)

        # Create the Random exploration module (used for the first n
        # timesteps).
        self.random_timesteps = random_timesteps
        self.random_exploration = Random(
            action_space, model=self.model, framework=self.framework, **kwargs)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(
            np.array(0, np.int64),
            framework=self.framework,
            tf_name="timestep",
            dtype=np.int64)
Example #8
    def __init__(self,
                 action_space,
                 *,
                 framework,
                 initial_temperature=1.0,
                 final_temperature=0.0,
                 temperature_timesteps=int(1e5),
                 temperature_schedule=None,
                 **kwargs):
        """Initializes a SoftQ Exploration object.

        Args:
            action_space (Space): The gym action space used by the environment.
            framework (str): One of None, "tf", "torch".
            initial_temperature (float): The initial temperature to divide
                model outputs by before creating the Categorical distribution
                to sample from.
            final_temperature (float): The final temperature value to anneal to.
            temperature_timesteps (int): The timesteps over which to linearly
                anneal from `initial_temperature` to `final_temperature`.
            temperature_schedule (Optional[Schedule]): An optional Schedule
                object to use (instead of constructing one from the given
                parameters).
        """
        assert isinstance(action_space, Discrete)
        super().__init__(action_space, framework=framework, **kwargs)

        self.temperature_schedule = \
            from_config(Schedule, temperature_schedule, framework=framework) or \
            PiecewiseSchedule(
                endpoints=[
                    (0, initial_temperature), (temperature_timesteps, final_temperature)],
                outside_value=final_temperature,
                framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=framework,
                                          tf_name="timestep")
        self.temperature = self.temperature_schedule(self.last_timestep)
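The temperature computed above controls how peaked the sampling distribution is: the model's outputs are divided by the temperature before the Categorical distribution is built. A small NumPy sketch of that sampling step (a generic softmax-with-temperature illustration, not RLlib's actual distribution code):

import numpy as np

def softq_sample(logits, temperature, rng=np.random):
    # Divide the logits by the temperature (higher temperature -> closer to
    # uniform), then sample from the resulting categorical distribution.
    z = np.asarray(logits, dtype=np.float64) / max(temperature, 1e-8)
    z = z - z.max()  # numerical stability
    probs = np.exp(z) / np.exp(z).sum()
    return rng.choice(len(probs), p=probs)

action = softq_sample([2.0, 1.0, 0.1], temperature=1.0)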
Example #9
    def __init__(self,
                 action_space,
                 *,
                 framework: str,
                 model: ModelV2,
                 initial_epsilon=1.0,
                 final_epsilon=0.05,
                 epsilon_timesteps=int(1e5),
                 epsilon_schedule=None,
                 **kwargs):
        """Initializes a StochasticSampling Exploration object.

        Args:
            action_space (Space): The gym action space used by the environment.
            framework (str): One of None, "tf", "torch".
            model (ModelV2): The ModelV2 used by the owning Policy.
            initial_epsilon (float): The initial epsilon value to use.
            final_epsilon (float): The final epsilon value to use.
            epsilon_timesteps (int): The time step after which epsilon should
                always be `final_epsilon`.
            epsilon_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
        """
        assert framework is not None
        super().__init__(action_space,
                         model=model,
                         framework=framework,
                         **kwargs)

        self.epsilon_schedule = \
            from_config(Schedule, epsilon_schedule, framework=framework) or \
            PiecewiseSchedule(
                endpoints=[
                    (0, initial_epsilon), (epsilon_timesteps, final_epsilon)],
                outside_value=final_epsilon,
                framework=self.framework)

        self.last_timestep = get_variable(0,
                                          framework=framework,
                                          tf_name="timestep")
Example #10
    def __init__(self,
                 action_space: Space,
                 *,
                 framework: str,
                 model: ModelV2,
                 random_timesteps: int = 1000,
                 stddev: float = 0.1,
                 initial_scale: float = 1.0,
                 final_scale: float = 0.02,
                 scale_timesteps: int = 10000,
                 scale_schedule: Optional[Schedule] = None,
                 **kwargs):
        """Initializes a GaussianNoise Exploration object.

        Args:
            random_timesteps (int): The number of timesteps for which to act
                completely randomly. Only after this number of timesteps, the
                `self.scale` annealing process will start (see below).
            stddev (float): The stddev (sigma) to use for the
                Gaussian noise to be added to the actions.
            initial_scale (float): The initial scaling weight to multiply
                the noise with.
            final_scale (float): The final scaling weight to multiply
                the noise with.
            scale_timesteps (int): The timesteps over which to linearly anneal
                the scaling factor (after(!) having used random actions for
                `random_timesteps` steps).
            scale_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
        """
        assert framework is not None
        super().__init__(action_space,
                         model=model,
                         framework=framework,
                         **kwargs)

        # Create the Random exploration module (used for the first n
        # timesteps).
        self.random_timesteps = random_timesteps
        self.random_exploration = Random(action_space,
                                         model=self.model,
                                         framework=self.framework,
                                         **kwargs)

        self.stddev = stddev
        # The `scale` annealing schedule.
        self.scale_schedule = scale_schedule or PiecewiseSchedule(
            endpoints=[(random_timesteps, initial_scale),
                       (random_timesteps + scale_timesteps, final_scale)],
            outside_value=final_scale,
            framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=self.framework,
                                          tf_name="timestep")

        # Build the tf-info-op.
        if self.framework in ["tf2", "tf", "tfe"]:
            self._tf_info_op = self.get_info()
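During exploration, the annealed `scale` multiplies Gaussian noise of stddev `stddev`, which is then added to the model's action (after the initial purely random phase). A hedged, self-contained sketch of that flow, with hypothetical argument names and not the exact RLlib update:

import numpy as np

def gaussian_explore(action, t, low, high, stddev=0.1, random_timesteps=1000,
                     scale_timesteps=10000, initial_scale=1.0, final_scale=0.02,
                     rng=np.random):
    # Phase 1: act uniformly at random for the first `random_timesteps` steps.
    if t < random_timesteps:
        return rng.uniform(low, high)
    # Phase 2: anneal the noise scale linearly, then add scaled Gaussian noise.
    frac = min((t - random_timesteps) / scale_timesteps, 1.0)
    scale = initial_scale + frac * (final_scale - initial_scale)
    noise = rng.normal(scale=stddev, size=np.shape(action))
    return np.clip(action + scale * noise, low, high)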
Example #11
    def __init__(self, in_size, out_size, sigma0=0.5, activation="relu"):
        """Initializes a NoisyLayer object.
        Args:
            in_size:
            out_size:
            sigma0:
            non_linear:
        """
        super().__init__()

        self.in_size = in_size
        self.out_size = out_size
        self.sigma0 = sigma0
        self.activation = get_activation_fn(activation, framework="torch")
        if self.activation is not None:
            self.activation = self.activation()

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.sigma_w = get_variable(np.random.uniform(
            low=-1.0 / np.sqrt(float(self.in_size)),
            high=1.0 / np.sqrt(float(self.in_size)),
            size=[self.in_size, out_size]),
                                    framework="torch",
                                    torch_tensor=True,
                                    trainable=True,
                                    device=self.device).float()
        self.sigma_b = get_variable(np.full(shape=[out_size],
                                            fill_value=sigma0 /
                                            np.sqrt(float(self.in_size))),
                                    framework="torch",
                                    torch_tensor=True,
                                    trainable=True,
                                    device=self.device).float()
        self.w = get_variable(np.full(
            shape=[self.in_size, self.out_size],
            fill_value=6 / np.sqrt(float(in_size) + float(out_size))),
                              framework="torch",
                              torch_tensor=True,
                              trainable=True,
                              device=self.device).float()
        self.b = get_variable(np.zeros([out_size]),
                              framework="torch",
                              torch_tensor=True,
                              trainable=True,
                              device=self.device).float()
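The sigma_w/sigma_b tensors created above are the learned noise magnitudes of a NoisyNet-style layer (Fortunato et al.): at each forward pass, fresh Gaussian noise is scaled by them and added to the ordinary weights and biases. A minimal PyTorch sketch of that forward computation (a generic NoisyNet-style pass, not necessarily RLlib's exact forward()):

import torch

def noisy_forward(x, w, b, sigma_w, sigma_b, activation=None):
    # Sample fresh Gaussian noise, perturb the parameters, then apply a
    # standard affine transform: x @ (w + sigma_w*eps_w) + (b + sigma_b*eps_b).
    eps_w = torch.randn_like(sigma_w)
    eps_b = torch.randn_like(sigma_b)
    out = x @ (w + sigma_w * eps_w) + (b + sigma_b * eps_b)
    return activation(out) if activation is not None else out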
Example #12
def before_init_fn(policy: Policy, obs_space: gym.spaces.Space,
                   action_space: gym.spaces.Space,
                   config: TrainerConfigDict) -> None:
    # Create global step for counting the number of update operations.
    if config["framework"] in ["tf2", "tfe"]:
        policy.global_step = get_variable(0, tf_name="global_step")
    else:
        policy.global_step = tf1.train.get_or_create_global_step()
Example #13
File: ppo_tf_policy.py Project: zzmcdc/ray
 def __init__(self, config):
     # The current KL value (as python float).
     self.kl_coeff_val = config["kl_coeff"]
     # The current KL value (as tf Variable for in-graph operations).
     self.kl_coeff = get_variable(float(self.kl_coeff_val),
                                  tf_name="kl_coeff",
                                  trainable=False)
     # Constant target value.
     self.kl_target = config["kl_target"]
Example #14
def setup_mixins(policy, obs_space, action_space, config):
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    # Set up a tf-var for the moving avg (do this here to make it work with
    # eager mode); "c^2" in the paper.
    policy._moving_average_sqd_adv_norm = get_variable(
        100.0,
        framework="tf",
        tf_name="moving_average_of_advantage_norm",
        trainable=False)
Example #15
    def __init__(self,
                 action_space,
                 *,
                 ou_theta=0.15,
                 ou_sigma=0.2,
                 ou_base_scale=0.1,
                 random_timesteps=1000,
                 initial_scale=1.0,
                 final_scale=0.02,
                 scale_timesteps=10000,
                 scale_schedule=None,
                 framework="tf",
                 **kwargs):
        """Initializes an Ornstein-Uhlenbeck Exploration object.

        Args:
            action_space (Space): The gym action space used by the environment.
            ou_theta (float): The theta parameter of the Ornstein-Uhlenbeck
                process.
            ou_sigma (float): The sigma parameter of the Ornstein-Uhlenbeck
                process.
            ou_base_scale (float): A fixed scaling factor, by which all OU-
                noise is multiplied. NOTE: This is on top of the parent
                GaussianNoise's scaling.
            random_timesteps (int): The number of timesteps for which to act
                completely randomly. Only after this number of timesteps, the
                `self.scale` annealing process will start (see below).
            initial_scale (float): The initial scaling weight to multiply
                the noise with.
            final_scale (float): The final scaling weight to multiply
                the noise with.
            scale_timesteps (int): The timesteps over which to linearly anneal
                the scaling factor (after(!) having used random actions for
                `random_timesteps` steps).
            scale_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
            framework (Optional[str]): One of None, "tf", "torch".
        """
        super().__init__(
            action_space,
            random_timesteps=random_timesteps,
            initial_scale=initial_scale,
            final_scale=final_scale,
            scale_timesteps=scale_timesteps,
            scale_schedule=scale_schedule,
            stddev=1.0,  # Force `self.stddev` to 1.0.
            framework=framework,
            **kwargs)
        self.ou_theta = ou_theta
        self.ou_sigma = ou_sigma
        self.ou_base_scale = ou_base_scale

        # The current OU-state value (gets updated each time an exploration
        # action is computed).
        self.ou_state = get_variable(self.action_space.low.size * [.0],
                                     framework=self.framework,
                                     tf_name="ou_state")
Example #16
    def __init__(
        self,
        action_space: gym.spaces.Space,
        *,
        framework: str,
        initial_epsilon: float = 1.0,
        final_epsilon: float = 0.05,
        warmup_timesteps: int = 0,
        epsilon_timesteps: int = int(1e5),
        epsilon_schedule: Optional[Schedule] = None,
        **kwargs,
    ):
        """Create an EpsilonGreedy exploration class.

        Args:
            action_space: The action space the exploration should occur in.
            framework: The framework specifier.
            initial_epsilon: The initial epsilon value to use.
            final_epsilon: The final epsilon value to use.
            warmup_timesteps: The timesteps over which to not change epsilon in the
                beginning.
            epsilon_timesteps: The timesteps (additional to `warmup_timesteps`)
                after which epsilon should always be `final_epsilon`.
                E.g.: warmup_timesteps=20k, epsilon_timesteps=50k -> After 70k
                timesteps, epsilon will have reached its final value.
            epsilon_schedule: An optional Schedule object
                to use (instead of constructing one from the given parameters).
        """
        assert framework is not None
        super().__init__(action_space=action_space,
                         framework=framework,
                         **kwargs)

        self.epsilon_schedule = from_config(
            Schedule, epsilon_schedule,
            framework=framework) or PiecewiseSchedule(
                endpoints=[
                    (0, initial_epsilon),
                    (warmup_timesteps, initial_epsilon),
                    (warmup_timesteps + epsilon_timesteps, final_epsilon),
                ],
                outside_value=final_epsilon,
                framework=self.framework,
            )

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(
            np.array(0, np.int64),
            framework=framework,
            tf_name="timestep",
            dtype=np.int64,
        )

        # Build the tf-info-op.
        if self.framework == "tf":
            self._tf_state_op = self.get_state()
Example #17
    def build(self, input_shape: TensorShape):
        in_size = int(input_shape[1])

        self.sigma_w = get_variable(
            value=tf.keras.initializers.RandomUniform(
                minval=-1.0 / np.sqrt(float(in_size)),
                maxval=1.0 / np.sqrt(float(in_size)),
            ),
            trainable=True,
            tf_name=self.prefix + "_sigma_w",
            shape=[in_size, self.out_size],
            dtype=tf.float32,
        )

        self.sigma_b = get_variable(
            value=tf.keras.initializers.Constant(self.sigma0 /
                                                 np.sqrt(float(in_size))),
            trainable=True,
            tf_name=self.prefix + "_sigma_b",
            shape=[self.out_size],
            dtype=tf.float32,
        )

        self.w = get_variable(
            value=tf.keras.initializers.GlorotUniform(),
            tf_name=self.prefix + "_fc_w",
            trainable=True,
            shape=[in_size, self.out_size],
            dtype=tf.float32,
        )

        self.b = get_variable(
            value=tf.keras.initializers.Zeros(),
            tf_name=self.prefix + "_fc_b",
            trainable=True,
            shape=[self.out_size],
            dtype=tf.float32,
        )
Example #18
    def __init__(self,
                 action_space,
                 *,
                 random_timesteps=1000,
                 stddev=0.1,
                 initial_scale=1.0,
                 final_scale=0.02,
                 scale_timesteps=10000,
                 scale_schedule=None,
                 framework="tf",
                 **kwargs):
        """Initializes a GaussianNoise Exploration object.

        Args:
            action_space (Space): The gym action space used by the environment.
            random_timesteps (int): The number of timesteps for which to act
                completely randomly. Only after this number of timesteps, the
                `self.scale` annealing process will start (see below).
            stddev (float): The stddev (sigma) to use for the
                Gaussian noise to be added to the actions.
            initial_scale (float): The initial scaling weight to multiply
                the noise with.
            final_scale (float): The final scaling weight to multiply
                the noise with.
            scale_timesteps (int): The timesteps over which to linearly anneal
                the scaling factor (after(!) having used random actions for
                `random_timesteps` steps).
            scale_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
            framework (Optional[str]): One of None, "tf", "torch".
        """
        assert framework is not None
        super().__init__(action_space, framework=framework, **kwargs)

        self.random_timesteps = random_timesteps
        self.random_exploration = Random(action_space,
                                         framework=self.framework,
                                         **kwargs)
        self.stddev = stddev
        # The `scale` annealing schedule.
        self.scale_schedule = scale_schedule or PiecewiseSchedule(
            endpoints=[(random_timesteps, initial_scale),
                       (random_timesteps + scale_timesteps, final_scale)],
            outside_value=final_scale,
            framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=self.framework,
                                          tf_name="timestep")
Example #19
 def __init__(self, config):
     # The current KL value (as python float).
     self.kl_coeff_val = config["kl_coeff"]
     # The current KL value (as tf Variable for in-graph operations).
     self.kl_coeff = get_variable(float(self.kl_coeff_val),
                                  tf_name="kl_coeff",
                                  trainable=False,
                                  framework=config["framework"])
     # Constant target value.
     self.kl_target = config["kl_target"]
     if self.framework == "tf":
         self._kl_coeff_placeholder = \
             tf1.placeholder(dtype=tf.float32, name="kl_coeff")
         self._kl_coeff_update = self.kl_coeff.assign(
             self._kl_coeff_placeholder, read_value=False)
Example #20
def setup_mixins(policy: Policy, obs_space: gym.spaces.Space,
                 action_space: gym.spaces.Space,
                 config: TrainerConfigDict) -> None:
    # Setup Value branch of our NN.
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)

    # Not needed for pure BC.
    if policy.config["beta"] != 0.0:
        # Set up a tf-var for the moving avg (do this here to make it work
        # with eager mode); "c^2" in the paper.
        policy._moving_average_sqd_adv_norm = get_variable(
            policy.config["moving_average_sqd_adv_norm_start"],
            framework="tf",
            tf_name="moving_average_of_advantage_norm",
            trainable=False)
Example #21
 def __init__(self, config):
     # Eager mode.
     if config["framework"] in ["tf2", "tfe"]:
         self.global_step = get_variable(0, tf_name="global_step")
         self._actor_optimizer = tf.keras.optimizers.Adam(
             learning_rate=config["actor_lr"])
         self._critic_optimizer = \
             tf.keras.optimizers.Adam(learning_rate=config["critic_lr"])
     # Static graph mode.
     else:
         self.global_step = tf1.train.get_or_create_global_step()
         self._actor_optimizer = tf1.train.AdamOptimizer(
             learning_rate=config["actor_lr"])
         self._critic_optimizer = \
             tf1.train.AdamOptimizer(learning_rate=config["critic_lr"])
Example #22
 def optimizer(self) -> List["tf.keras.optimizers.Optimizer"]:
     """Create separate optimizers for actor & critic losses."""
     if self.config["framework"] in ["tf2", "tfe"]:
         self.global_step = get_variable(0, tf_name="global_step")
         self._actor_optimizer = tf.keras.optimizers.Adam(
             learning_rate=self.config["actor_lr"])
         self._critic_optimizer = tf.keras.optimizers.Adam(
             learning_rate=self.config["critic_lr"])
     # Static graph mode.
     else:
         self.global_step = tf1.train.get_or_create_global_step()
         self._actor_optimizer = tf1.train.AdamOptimizer(
             learning_rate=self.config["actor_lr"])
         self._critic_optimizer = tf1.train.AdamOptimizer(
             learning_rate=self.config["critic_lr"])
     return [self._actor_optimizer, self._critic_optimizer]
Example #23
    def __init__(self,
                 action_space,
                 initial_epsilon=1.0,
                 final_epsilon=0.05,
                 epsilon_timesteps=int(1e5),
                 num_workers=None,
                 worker_index=None,
                 epsilon_schedule=None,
                 framework="tf"):
        """

        Args:
            action_space (Space): The gym action space used by the environment.
            initial_epsilon (float): The initial epsilon value to use.
            final_epsilon (float): The final epsilon value to use.
            epsilon_timesteps (int): The time step after which epsilon should
                always be `final_epsilon`.
            num_workers (Optional[int]): The overall number of workers used.
            worker_index (Optional[int]): The index of the Worker using this
                Exploration.
            epsilon_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
            framework (Optional[str]): One of None, "tf", "torch".
        """
        # For now, require Discrete action space (may loosen this restriction
        # in the future).
        assert isinstance(action_space, gym.spaces.Discrete)
        assert framework is not None
        super().__init__(action_space=action_space,
                         num_workers=num_workers,
                         worker_index=worker_index,
                         framework=framework)

        self.epsilon_schedule = epsilon_schedule or PiecewiseSchedule(
            endpoints=[(0, initial_epsilon),
                       (epsilon_timesteps, final_epsilon)],
            outside_value=final_epsilon,
            framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=framework,
                                          tf_name="timestep")
Example #24
        def __init__(
            self,
            obs_space,
            action_space,
            config,
            existing_model=None,
            existing_inputs=None,
        ):
            # First thing first, enable eager execution if necessary.
            base.enable_eager_execution_if_necessary()

            config = dict(
                ray.rllib.algorithms.marwil.marwil.MARWILConfig().to_dict(), **config
            )

            # Initialize base class.
            base.__init__(
                self,
                obs_space,
                action_space,
                config,
                existing_inputs=existing_inputs,
                existing_model=existing_model,
            )

            ValueNetworkMixin.__init__(self, config)
            PostprocessAdvantages.__init__(self)

            # Not needed for pure BC.
            if config["beta"] != 0.0:
                # Set up a tf-var for the moving avg (do this here to make it work
                # with eager mode); "c^2" in the paper.
                self._moving_average_sqd_adv_norm = get_variable(
                    config["moving_average_sqd_adv_norm_start"],
                    framework="tf",
                    tf_name="moving_average_of_advantage_norm",
                    trainable=False,
                )

            # Note: this is a bit ugly, but loss and optimizer initialization must
            # happen after all the MixIns are initialized.
            self.maybe_initialize_optimizer_and_loss()
Example #25
    def __init__(self,
                 action_space,
                 *,
                 framework: str,
                 initial_epsilon: float = 1.0,
                 final_epsilon: float = 0.05,
                 epsilon_timesteps: int = int(1e5),
                 epsilon_schedule: Optional[Schedule] = None,
                 **kwargs):
        """Create an EpsilonGreedy exploration class.

        Args:
            initial_epsilon (float): The initial epsilon value to use.
            final_epsilon (float): The final epsilon value to use.
            epsilon_timesteps (int): The time step after which epsilon should
                always be `final_epsilon`.
            epsilon_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
        """
        assert framework is not None
        super().__init__(action_space=action_space,
                         framework=framework,
                         **kwargs)

        self.epsilon_schedule = \
            from_config(Schedule, epsilon_schedule, framework=framework) or \
            PiecewiseSchedule(
                endpoints=[
                    (0, initial_epsilon), (epsilon_timesteps, final_epsilon)],
                outside_value=final_epsilon,
                framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(np.array(0, np.int64),
                                          framework=framework,
                                          tf_name="timestep",
                                          dtype=np.int64)

        # Build the tf-info-op.
        if self.framework in ["tf2", "tf", "tfe"]:
            self._tf_info_op = self.get_info()
Example #26
    def __init__(self,
                 action_space,
                 *,
                 framework: str,
                 initial_epsilon=1.0,
                 final_epsilon=0.05,
                 epsilon_timesteps=int(1e5),
                 epsilon_schedule=None,
                 **kwargs):
        """Create an EpsilonGreedy exploration class.

        Args:
            initial_epsilon (float): The initial epsilon value to use.
            final_epsilon (float): The final epsilon value to use.
            epsilon_timesteps (int): The time step after which epsilon should
                always be `final_epsilon`.
            epsilon_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
        """
        assert framework is not None
        super().__init__(action_space=action_space,
                         framework=framework,
                         **kwargs)

        self.epsilon_schedule = \
            from_config(Schedule, epsilon_schedule, framework=framework) or \
            PiecewiseSchedule(
                endpoints=[
                    (0, initial_epsilon), (epsilon_timesteps, final_epsilon)],
                outside_value=final_epsilon,
                framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=framework,
                                          tf_name="timestep")

        # Build the tf-info-op.
        if self.framework == "tf":
            raise ValueError("Torch version does not support "
                             "multiobj episilon-greedy yet!")
Example #27
    def __init__(self, entropy_coeff, entropy_coeff_schedule):
        self.entropy_coeff = get_variable(entropy_coeff,
                                          framework="tf",
                                          tf_name="entropy_coeff",
                                          trainable=False)

        if entropy_coeff_schedule is None:
            self.entropy_coeff_schedule = ConstantSchedule(entropy_coeff,
                                                           framework=None)
        else:
            # Allows for custom schedule similar to lr_schedule format
            if isinstance(entropy_coeff_schedule, list):
                self.entropy_coeff_schedule = PiecewiseSchedule(
                    entropy_coeff_schedule,
                    outside_value=entropy_coeff_schedule[-1][-1],
                    framework=None)
            else:
                # Implements previous version but enforces outside_value
                self.entropy_coeff_schedule = PiecewiseSchedule(
                    [[0, entropy_coeff], [entropy_coeff_schedule, 0.0]],
                    outside_value=0.0,
                    framework=None)
Example #28
    def __init__(self,
                 action_space,
                 *,
                 framework: str,
                 policy_config: dict,
                 model: ModelV2,
                 initial_stddev: float = 1.0,
                 random_timesteps: int = 10000,
                 sub_exploration: Optional[dict] = None,
                 **kwargs):
        """Initializes a ParameterNoise Exploration object.

        Args:
            initial_stddev (float): The initial stddev to use for the noise.
            random_timesteps (int): The number of timesteps to act completely
                randomly (see [1]).
            sub_exploration (Optional[dict]): Optional sub-exploration config.
                None for auto-detection/setup.
        """
        assert framework is not None
        super().__init__(action_space,
                         policy_config=policy_config,
                         model=model,
                         framework=framework,
                         **kwargs)

        self.stddev = get_variable(initial_stddev,
                                   framework=self.framework,
                                   tf_name="stddev")
        self.stddev_val = initial_stddev  # Out-of-graph tf value holder.

        # The weight variables of the Model to which noise should be applied.
        # This excludes any variable whose name contains "LayerNorm" (layer
        # normalization variables should not be perturbed).
        self.model_variables = [
            v for k, v in self.model.trainable_variables(as_dict=True).items()
            if "LayerNorm" not in k
        ]
        # Our noise to be added to the weights. Each item in `self.noise`
        # corresponds to one Model variable and holds the Gaussian noise to
        # be added to that variable (weight).
        self.noise = []
        for var in self.model_variables:
            name_ = var.name.split(":")[0] + "_noisy" if var.name else ""
            self.noise.append(
                get_variable(np.zeros(var.shape, dtype=np.float32),
                             framework=self.framework,
                             tf_name=name_,
                             torch_tensor=True,
                             device=self.device))

        # tf-specific ops to sample, assign and remove noise.
        if self.framework == "tf" and not tf.executing_eagerly():
            self.tf_sample_new_noise_op = \
                self._tf_sample_new_noise_op()
            self.tf_add_stored_noise_op = \
                self._tf_add_stored_noise_op()
            self.tf_remove_noise_op = \
                self._tf_remove_noise_op()
            # Create convenience sample+add op for tf.
            with tf1.control_dependencies([self.tf_sample_new_noise_op]):
                add_op = self._tf_add_stored_noise_op()
            with tf1.control_dependencies([add_op]):
                self.tf_sample_new_noise_and_add_op = tf.no_op()

        # Whether the Model's weights currently have noise added or not.
        self.weights_are_currently_noisy = False

        # Auto-detection of underlying exploration functionality.
        if sub_exploration is None:
            # For discrete action spaces, use an underlying EpsilonGreedy with
            # a special schedule.
            if isinstance(self.action_space, Discrete):
                sub_exploration = {
                    "type": "EpsilonGreedy",
                    "epsilon_schedule": {
                        "type":
                        "PiecewiseSchedule",
                        # Step function (see [2]).
                        "endpoints": [(0, 1.0), (random_timesteps + 1, 1.0),
                                      (random_timesteps + 2, 0.01)],
                        "outside_value":
                        0.01
                    }
                }
            elif isinstance(self.action_space, Box):
                sub_exploration = {
                    "type": "OrnsteinUhlenbeckNoise",
                    "random_timesteps": random_timesteps,
                }
            # TODO(sven): Implement for any action space.
            else:
                raise NotImplementedError

        self.sub_exploration = from_config(Exploration,
                                           sub_exploration,
                                           framework=self.framework,
                                           action_space=self.action_space,
                                           policy_config=self.policy_config,
                                           model=self.model,
                                           **kwargs)

        # Whether we need to call `self._delayed_on_episode_start` before
        # the forward pass.
        self.episode_started = False
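The stddev variable and the per-variable noise tensors created above implement parameter-space noise: before acting, one Gaussian sample per weight tensor is added to the model's weights, and the same samples are subtracted again afterwards. A small NumPy sketch of that perturb/restore idea (an illustration with hypothetical helper names, not the policy's actual tf/torch ops):

import numpy as np

def perturb_weights(weights, stddev, rng=np.random):
    # One Gaussian noise sample per weight tensor; keep the samples so the
    # caller can subtract them again to restore the original weights.
    noise = [rng.normal(scale=stddev, size=w.shape).astype(w.dtype) for w in weights]
    perturbed = [w + n for w, n in zip(weights, noise)]
    return perturbed, noise

weights = [np.zeros((4, 2), dtype=np.float32), np.zeros(2, dtype=np.float32)]
noisy_weights, noise = perturb_weights(weights, stddev=0.1)
restored = [w - n for w, n in zip(noisy_weights, noise)]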
Example #29
def before_init_fn(policy, obs_space, action_space, config):
    # Create global step for counting the number of update operations.
    if tfv == 2 and config["framework"] == "tfe":
        policy.global_step = get_variable(0, tf_name="global_step")
    else:
        policy.global_step = tf1.train.get_or_create_global_step()