def __init__(self,
             action_space,
             *,
             framework: str,
             ou_theta: float = 0.15,
             ou_sigma: float = 0.2,
             ou_base_scale: float = 0.1,
             random_timesteps: int = 1000,
             initial_scale: float = 1.0,
             final_scale: float = 0.02,
             scale_timesteps: int = 10000,
             scale_schedule: Optional[Schedule] = None,
             **kwargs):
    """Sets up an Ornstein-Uhlenbeck noise Exploration object.

    Args:
        action_space: The gym action space used by the environment.
        framework: One of None, "tf", "torch".
        ou_theta: The theta parameter of the Ornstein-Uhlenbeck process.
        ou_sigma: The sigma parameter of the Ornstein-Uhlenbeck process.
        ou_base_scale: A fixed factor by which all OU-noise is multiplied.
            NOTE: This comes on top of the parent's own scaling.
        random_timesteps: Number of timesteps for which to act completely
            randomly. Only afterwards does the scale annealing start.
        initial_scale: The initial scaling weight for the noise.
        final_scale: The final scaling weight for the noise.
        scale_timesteps: The timesteps over which to linearly anneal the
            scaling factor (after(!) the `random_timesteps` random steps).
        scale_schedule: An optional Schedule object to use (instead of
            constructing one from the given parameters).
        **kwargs: Forwarded unchanged to the parent constructor.
    """
    # The current OU-state value (updated each time an exploration action
    # is computed): one float32 zero per entry of `action_space.low`.
    # It is created before the parent constructor runs, hence no device yet.
    initial_state = np.zeros(action_space.low.size, dtype=np.float32)
    self.ou_state = get_variable(
        initial_state,
        framework=framework,
        tf_name="ou_state",
        torch_tensor=True,
        device=None)

    super().__init__(
        action_space,
        framework=framework,
        random_timesteps=random_timesteps,
        initial_scale=initial_scale,
        final_scale=final_scale,
        scale_timesteps=scale_timesteps,
        scale_schedule=scale_schedule,
        # Force the parent's `stddev` to 1.0.
        stddev=1.0,
        **kwargs)

    self.ou_base_scale = ou_base_scale
    self.ou_sigma = ou_sigma
    self.ou_theta = ou_theta

    # Now that the device is known, move ou_state there (PyTorch only).
    if self.framework == "torch" and self.device is not None:
        moved_state = self.ou_state.to(self.device)
        self.ou_state = moved_state
def __init__(self,
             action_space,
             initial_epsilon=1.0,
             final_epsilon=0.05,
             epsilon_timesteps=int(1e5),
             epsilon_schedule=None,
             framework="tf",
             **kwargs):
    """Builds an EpsilonGreedy exploration component.

    Args:
        action_space (Space): The gym action space used by the environment.
        initial_epsilon (float): Epsilon value at timestep 0.
        final_epsilon (float): Epsilon value at `epsilon_timesteps` and
            outside the schedule's endpoints.
        epsilon_timesteps (int): The time step after which epsilon should
            always be `final_epsilon`.
        epsilon_schedule (Optional[Schedule]): A ready-made schedule. When
            it is falsy, a PiecewiseSchedule is built from the values above.
        framework (Optional[str]): One of "tf", "torch"; must not be None.
        **kwargs: Forwarded to the parent constructor.
    """
    assert framework is not None
    super().__init__(
        action_space=action_space, framework=framework, **kwargs)

    # Fall back to a two-point schedule if no usable schedule was handed in.
    if not epsilon_schedule:
        epsilon_schedule = PiecewiseSchedule(
            endpoints=[(0, initial_epsilon),
                       (epsilon_timesteps, final_epsilon)],
            outside_value=final_epsilon,
            framework=self.framework)
    self.epsilon_schedule = epsilon_schedule

    # The current timestep value (tf-var or python int).
    self.last_timestep = get_variable(
        0, framework=framework, tf_name="timestep")
def __init__(self, config):
    """Creates the global step plus actor, critic and alpha optimizers.

    Args:
        config: Dict-like config. Reads `framework`, `twin_q` and, under
            `optimization`, `actor_learning_rate`, `critic_learning_rate`
            and `entropy_learning_rate`.
    """
    # Pick the step counter and the Adam implementation that match the
    # execution mode; everything below is identical for both modes.
    if config["framework"] in ["tf2", "tfe"]:
        self.global_step = get_variable(0, tf_name="global_step")
        adam_cls = tf.keras.optimizers.Adam
    else:
        self.global_step = tf1.train.get_or_create_global_step()
        adam_cls = tf1.train.AdamOptimizer

    learning_rates = config["optimization"]

    # Separate optimizers for the actor and the critic(s).
    self._actor_optimizer = adam_cls(
        learning_rate=learning_rates["actor_learning_rate"])
    self._critic_optimizer = [
        adam_cls(learning_rate=learning_rates["critic_learning_rate"])
    ]
    # A twin Q-network gets a second critic optimizer.
    if config["twin_q"]:
        second_critic = adam_cls(
            learning_rate=learning_rates["critic_learning_rate"])
        self._critic_optimizer.append(second_critic)
    self._alpha_optimizer = adam_cls(
        learning_rate=learning_rates["entropy_learning_rate"])
def setup_early_mixins(
        policy: Policy,
        obs_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        config: TrainerConfigDict,
) -> None:
    """Runs mixin constructors that must precede the Policy's own init.

    Attaches the optimizers, an optional Lagrangian variable with its own
    optimizer, and a random action generator to `policy`.

    Args:
        policy (Policy): The Policy object to set up.
        obs_space (gym.spaces.Space): The Policy's observation space
            (not used in this function).
        action_space (gym.spaces.Space): The Policy's action space.
        config (TrainerConfigDict): The Policy's config.
    """
    policy.cur_iter = 0
    ActorCriticOptimizerMixin.__init__(policy, config)

    # Extra trainable variable and optimizer for the Lagrangian variant.
    if config["lagrangian"]:
        log_alpha_prime = get_variable(
            0.0,
            framework="tf",
            trainable=True,
            tf_name="log_alpha_prime")
        policy.model.log_alpha_prime = log_alpha_prime
        prime_lr = config["optimization"]["critic_learning_rate"]
        policy.alpha_prime_optim = tf.keras.optimizers.Adam(
            learning_rate=prime_lr)

    # Generic random action generator for calculating CQL-loss.
    random_generator = Random(
        action_space,
        model=None,
        framework="tf2",
        policy_config=config,
        num_workers=0,
        worker_index=0)
    policy._random_action_generator = random_generator
def __init__(self, config):
    """Stores the KL coefficient (raw value and variable) and KL target.

    Args:
        config: Dict-like config providing `kl_coeff` and `kl_target`.
    """
    starting_coeff = config["kl_coeff"]
    # Raw coefficient value as read from the config.
    self.kl_coeff_val = starting_coeff
    self.kl_target = config["kl_target"]
    # Non-trainable variable holding the same coefficient as a float.
    self.kl_coeff = get_variable(
        float(starting_coeff), tf_name="kl_coeff", trainable=False)
def __init__(self, config):
    """Sets up `global_step` and the actor/critic/alpha Adam optimizers.

    Args:
        config: Dict-like config. Reads `framework`, `twin_q` and the
            `optimization` sub-dict (`actor_learning_rate`,
            `critic_learning_rate`, `entropy_learning_rate`).
    """
    eager_mode = config["framework"] in ["tf2", "tfe"]
    if eager_mode:
        # Eager mode.
        self.global_step = get_variable(0, tf_name="global_step")
        new_adam = tf.keras.optimizers.Adam
    else:
        # Static graph mode.
        self.global_step = tf1.train.get_or_create_global_step()
        new_adam = tf1.train.AdamOptimizer

    opt_settings = config["optimization"]
    self._actor_optimizer = new_adam(
        learning_rate=opt_settings["actor_learning_rate"])

    # One critic optimizer, plus a second one when twin Q is enabled.
    critic_optimizers = [
        new_adam(learning_rate=opt_settings["critic_learning_rate"])
    ]
    self._critic_optimizer = critic_optimizers
    if config["twin_q"]:
        critic_optimizers.append(
            new_adam(learning_rate=opt_settings["critic_learning_rate"]))

    self._alpha_optimizer = new_adam(
        learning_rate=opt_settings["entropy_learning_rate"])
def __init__(self,
             action_space: gym.spaces.Space,
             *,
             framework: str,
             model: ModelV2,
             random_timesteps: int = 0,
             **kwargs):
    """Sets up a StochasticSampling Exploration object.

    Args:
        action_space (gym.spaces.Space): The gym action space used by the
            environment.
        framework (str): One of "tf", "torch"; must not be None.
        model (ModelV2): The ModelV2 used by the owning Policy.
        random_timesteps (int): The number of timesteps for which to act
            completely randomly. Only after this number of timesteps,
            actual samples will be drawn to get exploration actions.
        **kwargs: Forwarded to the parent constructor and to the internal
            Random exploration module.
    """
    assert framework is not None
    super().__init__(
        action_space, model=model, framework=framework, **kwargs)

    self.random_timesteps = random_timesteps
    # Random exploration module, used for the first n timesteps.
    random_module = Random(
        action_space,
        model=self.model,
        framework=self.framework,
        **kwargs)
    self.random_exploration = random_module

    # The current timestep value (tf-var or python int), kept as int64.
    start_step = np.array(0, np.int64)
    self.last_timestep = get_variable(
        start_step,
        framework=self.framework,
        tf_name="timestep",
        dtype=np.int64)
def __init__(self,
             action_space,
             *,
             framework,
             initial_temperature=1.0,
             final_temperature=0.0,
             temperature_timesteps=int(1e5),
             temperature_schedule=None,
             **kwargs):
    """Sets up a SoftQ Exploration object.

    Args:
        action_space (Space): The gym action space used by the
            environment. Must be a Discrete space.
        framework (str): One of None, "tf", "torch".
        initial_temperature (float): Temperature at timestep 0 of the
            default schedule.
        final_temperature (float): Temperature at `temperature_timesteps`
            and outside the default schedule's endpoints.
        temperature_timesteps (int): Timestep of the default schedule's
            second endpoint.
        temperature_schedule (Optional[Schedule]): An optional Schedule
            object or config to use (instead of constructing one from the
            given parameters).
        **kwargs: Forwarded to the parent constructor.
    """
    assert isinstance(action_space, Discrete)
    super().__init__(action_space, framework=framework, **kwargs)

    # Prefer the user-provided schedule; otherwise anneal from the initial
    # to the final temperature.
    schedule = from_config(
        Schedule, temperature_schedule, framework=framework)
    if not schedule:
        schedule = PiecewiseSchedule(
            endpoints=[(0, initial_temperature),
                       (temperature_timesteps, final_temperature)],
            outside_value=final_temperature,
            framework=self.framework)
    self.temperature_schedule = schedule

    # The current timestep value (tf-var or python int).
    self.last_timestep = get_variable(
        0, framework=framework, tf_name="timestep")
    # Temperature obtained by evaluating the schedule at that timestep.
    self.temperature = self.temperature_schedule(self.last_timestep)
def __init__(self,
             action_space,
             *,
             framework: str,
             model: ModelV2,
             initial_epsilon=1.0,
             final_epsilon=0.05,
             epsilon_timesteps=int(1e5),
             epsilon_schedule=None,
             **kwargs):
    """Sets up an epsilon-schedule based Exploration object.

    Args:
        action_space (Space): The gym action space used by the environment.
        framework (str): One of "tf", "torch"; must not be None.
        model (ModelV2): Model handed on to the parent constructor.
        initial_epsilon: Epsilon at timestep 0 of the default schedule.
        final_epsilon: Epsilon at `epsilon_timesteps` and outside the
            default schedule's endpoints.
        epsilon_timesteps: Timestep of the default schedule's second
            endpoint.
        epsilon_schedule: Optional schedule (or config for one); replaces
            the default schedule when it resolves to something truthy.
        **kwargs: Forwarded to the parent constructor.
    """
    assert framework is not None
    super().__init__(
        action_space, model=model, framework=framework, **kwargs)

    resolved = from_config(Schedule, epsilon_schedule, framework=framework)
    if not resolved:
        # No usable schedule given: build the default two-point one.
        resolved = PiecewiseSchedule(
            endpoints=[(0, initial_epsilon),
                       (epsilon_timesteps, final_epsilon)],
            outside_value=final_epsilon,
            framework=self.framework)
    self.epsilon_schedule = resolved

    # Timestep counter, starting at 0.
    self.last_timestep = get_variable(
        0, framework=framework, tf_name="timestep")
def __init__(self,
             action_space: Space,
             *,
             framework: str,
             model: ModelV2,
             random_timesteps: int = 1000,
             stddev: float = 0.1,
             initial_scale: float = 1.0,
             final_scale: float = 0.02,
             scale_timesteps: int = 10000,
             scale_schedule: Optional[Schedule] = None,
             **kwargs):
    """Sets up a GaussianNoise Exploration object.

    Args:
        action_space (Space): The action space handed to the parent and
            to the internal Random exploration module.
        framework (str): Framework specifier; must not be None.
        model (ModelV2): Model handed on to the parent constructor.
        random_timesteps (int): The number of timesteps for which to act
            completely randomly. Only after this number of timesteps, the
            scale annealing process will start (see below).
        stddev (float): The stddev (sigma) to use for the Gaussian noise
            to be added to the actions.
        initial_scale (float): The initial scaling weight to multiply the
            noise with.
        final_scale (float): The final scaling weight to multiply the
            noise with.
        scale_timesteps (int): The timesteps over which to linearly anneal
            the scaling factor (after(!) having used random actions for
            `random_timesteps` steps).
        scale_schedule (Optional[Schedule]): An optional Schedule object
            to use (instead of constructing one from the given parameters).
        **kwargs: Forwarded to the parent constructor and to the Random
            exploration module.
    """
    assert framework is not None
    super().__init__(
        action_space, model=model, framework=framework, **kwargs)

    # Random exploration module, used for the first n timesteps.
    self.random_timesteps = random_timesteps
    self.random_exploration = Random(
        action_space,
        model=self.model,
        framework=self.framework,
        **kwargs)

    self.stddev = stddev

    # The `scale` annealing schedule; starts once the random phase is over.
    if not scale_schedule:
        anneal_start = random_timesteps
        anneal_end = random_timesteps + scale_timesteps
        scale_schedule = PiecewiseSchedule(
            endpoints=[(anneal_start, initial_scale),
                       (anneal_end, final_scale)],
            outside_value=final_scale,
            framework=self.framework)
    self.scale_schedule = scale_schedule

    # The current timestep value (tf-var or python int).
    self.last_timestep = get_variable(
        0, framework=self.framework, tf_name="timestep")

    # Build the tf-info-op.
    runs_on_tf = self.framework in ["tf2", "tf", "tfe"]
    if runs_on_tf:
        self._tf_info_op = self.get_info()
def __init__(self, in_size, out_size, sigma0=0.5, activation="relu"):
    """Sets up a NoisyLayer with its four torch tensors.

    Args:
        in_size: Number of input units (first dim of the weight tensors).
        out_size: Number of output units.
        sigma0: Numerator of the constant used to fill `sigma_b`
            (`sigma0 / sqrt(in_size)`).
        activation: Activation specifier, resolved through
            `get_activation_fn(..., framework="torch")`.
    """
    super().__init__()

    self.in_size = in_size
    self.out_size = out_size
    self.sigma0 = sigma0

    # Resolve the activation and instantiate it, if there is one.
    activation_factory = get_activation_fn(activation, framework="torch")
    if activation_factory is None:
        self.activation = None
    else:
        self.activation = activation_factory()

    if torch.cuda.is_available():
        self.device = 'cuda'
    else:
        self.device = 'cpu'

    sqrt_in = np.sqrt(float(self.in_size))
    # Keyword arguments shared by all four tensors below.
    tensor_kwargs = dict(
        framework="torch",
        torch_tensor=True,
        trainable=True,
        device=self.device)

    # NOTE(review): `.float()` is applied to whatever get_variable returns;
    # whether the result is still a leaf tensor depends on get_variable,
    # which is not visible here - confirm.
    sigma_w_init = np.random.uniform(
        low=-1.0 / sqrt_in,
        high=1.0 / sqrt_in,
        size=[self.in_size, out_size])
    self.sigma_w = get_variable(sigma_w_init, **tensor_kwargs).float()

    sigma_b_init = np.full(shape=[out_size], fill_value=sigma0 / sqrt_in)
    self.sigma_b = get_variable(sigma_b_init, **tensor_kwargs).float()

    # NOTE(review): every weight starts at the same constant
    # 6 / sqrt(in_size + out_size) - verify this is the intended init.
    w_init = np.full(
        shape=[self.in_size, self.out_size],
        fill_value=6 / np.sqrt(float(in_size) + float(out_size)))
    self.w = get_variable(w_init, **tensor_kwargs).float()

    b_init = np.zeros([out_size])
    self.b = get_variable(b_init, **tensor_kwargs).float()
def before_init_fn(policy: Policy, obs_space: gym.spaces.Space,
                   action_space: gym.spaces.Space,
                   config: TrainerConfigDict) -> None:
    """Attaches a global step (counting update operations) to `policy`.

    Args:
        policy: The Policy that receives the `global_step` attribute.
        obs_space: Observation space (not used in this function).
        action_space: Action space (not used in this function).
        config: Config; only `config["framework"]` is read.
    """
    uses_eager = config["framework"] in ("tf2", "tfe")
    if uses_eager:
        step = get_variable(0, tf_name="global_step")
    else:
        step = tf1.train.get_or_create_global_step()
    policy.global_step = step
def __init__(self, config):
    """Reads the KL settings from `config` and creates the KL variable.

    Args:
        config: Dict-like config providing `kl_coeff` and `kl_target`.
    """
    coeff = config["kl_coeff"]
    # The current KL coefficient as read from the config.
    self.kl_coeff_val = coeff
    # The same coefficient as a non-trainable variable for in-graph
    # operations.
    self.kl_coeff = get_variable(
        float(coeff), tf_name="kl_coeff", trainable=False)
    # Constant target value.
    self.kl_target = config["kl_target"]
def setup_mixins(policy, obs_space, action_space, config):
    """Initializes the value-network mixin and the advantage-norm variable.

    Args:
        policy: The policy object to set up.
        obs_space: Observation space, passed to ValueNetworkMixin.
        action_space: Action space, passed to ValueNetworkMixin.
        config: Config, passed to ValueNetworkMixin.
    """
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)

    # Non-trainable tf-var for the moving average ("c^2" in the paper),
    # starting at 100.0. Created here to make it work with eager mode.
    moving_avg = get_variable(
        100.0,
        framework="tf",
        tf_name="moving_average_of_advantage_norm",
        trainable=False)
    policy._moving_average_sqd_adv_norm = moving_avg
def __init__(self,
             action_space,
             *,
             ou_theta=0.15,
             ou_sigma=0.2,
             ou_base_scale=0.1,
             random_timesteps=1000,
             initial_scale=1.0,
             final_scale=0.02,
             scale_timesteps=10000,
             scale_schedule=None,
             framework="tf",
             **kwargs):
    """Sets up an Ornstein-Uhlenbeck noise Exploration object.

    Args:
        action_space (Space): The gym action space used by the environment.
        ou_theta (float): The theta parameter of the Ornstein-Uhlenbeck
            process.
        ou_sigma (float): The sigma parameter of the Ornstein-Uhlenbeck
            process.
        ou_base_scale (float): A fixed scaling factor, by which all
            OU-noise is multiplied. NOTE: This is on top of the parent's
            scaling.
        random_timesteps (int): The number of timesteps for which to act
            completely randomly. Only after this number of timesteps, the
            scale annealing process will start (see below).
        initial_scale (float): The initial scaling weight to multiply the
            noise with.
        final_scale (float): The final scaling weight to multiply the
            noise with.
        scale_timesteps (int): The timesteps over which to linearly anneal
            the scaling factor (after(!) having used random actions for
            `random_timesteps` steps).
        scale_schedule (Optional[Schedule]): An optional Schedule object
            to use (instead of constructing one from the given parameters).
        framework (Optional[str]): One of None, "tf", "torch".
        **kwargs: Forwarded to the parent constructor.
    """
    super().__init__(
        action_space,
        random_timesteps=random_timesteps,
        initial_scale=initial_scale,
        final_scale=final_scale,
        scale_timesteps=scale_timesteps,
        scale_schedule=scale_schedule,
        # Force the parent's `stddev` to 1.0.
        stddev=1.0,
        framework=framework,
        **kwargs)

    self.ou_base_scale = ou_base_scale
    self.ou_sigma = ou_sigma
    self.ou_theta = ou_theta

    # The current OU-state value (updated each time an exploration action
    # is computed): a list with one 0.0 per entry of `action_space.low`.
    num_entries = self.action_space.low.size
    zero_state = [.0] * num_entries
    self.ou_state = get_variable(
        zero_state, framework=self.framework, tf_name="ou_state")
def __init__(
        self,
        action_space: gym.spaces.Space,
        *,
        framework: str,
        initial_epsilon: float = 1.0,
        final_epsilon: float = 0.05,
        warmup_timesteps: int = 0,
        epsilon_timesteps: int = int(1e5),
        epsilon_schedule: Optional[Schedule] = None,
        **kwargs,
):
    """Builds an EpsilonGreedy exploration component.

    Args:
        action_space: The action space the exploration should occur in.
        framework: The framework specifier; must not be None.
        initial_epsilon: The initial epsilon value to use.
        final_epsilon: The final epsilon value to use.
        warmup_timesteps: The timesteps over which to not change epsilon
            in the beginning.
        epsilon_timesteps: The timesteps (additional to
            `warmup_timesteps`) after which epsilon should always be
            `final_epsilon`. E.g.: warmup_timesteps=20k
            epsilon_timesteps=50k -> After 70k timesteps, epsilon will
            reach its final value.
        epsilon_schedule: An optional Schedule object to use (instead of
            constructing one from the given parameters).
        **kwargs: Forwarded to the parent constructor.
    """
    assert framework is not None
    super().__init__(
        action_space=action_space, framework=framework, **kwargs)

    schedule = from_config(Schedule, epsilon_schedule, framework=framework)
    if not schedule:
        # Default: hold `initial_epsilon` during warmup, then move towards
        # `final_epsilon` over `epsilon_timesteps`.
        anneal_end = warmup_timesteps + epsilon_timesteps
        schedule = PiecewiseSchedule(
            endpoints=[
                (0, initial_epsilon),
                (warmup_timesteps, initial_epsilon),
                (anneal_end, final_epsilon),
            ],
            outside_value=final_epsilon,
            framework=self.framework,
        )
    self.epsilon_schedule = schedule

    # The current timestep value (tf-var or python int), kept as int64.
    self.last_timestep = get_variable(
        np.array(0, np.int64),
        framework=framework,
        tf_name="timestep",
        dtype=np.int64,
    )

    # Build the tf state op (only for framework "tf").
    if self.framework == "tf":
        state_op = self.get_state()
        self._tf_state_op = state_op
def build(self, input_shape: TensorShape):
    """Creates the layer's four trainable float32 variables.

    Args:
        input_shape: Shape of the layer input; entry 1 is taken as the
            input size.
    """
    in_size = int(input_shape[1])
    sqrt_in = np.sqrt(float(in_size))
    weight_shape = [in_size, self.out_size]
    bias_shape = [self.out_size]

    # Noise scale for the weights: uniform in +-1/sqrt(in_size).
    sigma_w_init = tf.keras.initializers.RandomUniform(
        minval=-1.0 / sqrt_in,
        maxval=1.0 / sqrt_in,
    )
    self.sigma_w = get_variable(
        value=sigma_w_init,
        trainable=True,
        tf_name=self.prefix + "_sigma_w",
        shape=weight_shape,
        dtype=tf.float32,
    )

    # Noise scale for the bias: constant sigma0/sqrt(in_size).
    sigma_b_init = tf.keras.initializers.Constant(self.sigma0 / sqrt_in)
    self.sigma_b = get_variable(
        value=sigma_b_init,
        trainable=True,
        tf_name=self.prefix + "_sigma_b",
        shape=bias_shape,
        dtype=tf.float32,
    )

    # Weights (Glorot-uniform) and bias (zeros).
    w_init = tf.keras.initializers.GlorotUniform()
    self.w = get_variable(
        value=w_init,
        tf_name=self.prefix + "_fc_w",
        trainable=True,
        shape=weight_shape,
        dtype=tf.float32,
    )

    b_init = tf.keras.initializers.Zeros()
    self.b = get_variable(
        value=b_init,
        tf_name=self.prefix + "_fc_b",
        trainable=True,
        shape=bias_shape,
        dtype=tf.float32,
    )
def __init__(self,
             action_space,
             *,
             random_timesteps=1000,
             stddev=0.1,
             initial_scale=1.0,
             final_scale=0.02,
             scale_timesteps=10000,
             scale_schedule=None,
             framework="tf",
             **kwargs):
    """Sets up a GaussianNoise Exploration object.

    Args:
        action_space (Space): The gym action space used by the environment.
        random_timesteps (int): The number of timesteps for which to act
            completely randomly. Only after this number of timesteps, the
            scale annealing process will start (see below).
        stddev (float): The stddev (sigma) to use for the Gaussian noise
            to be added to the actions.
        initial_scale (float): The initial scaling weight to multiply the
            noise with.
        final_scale (float): The final scaling weight to multiply the
            noise with.
        scale_timesteps (int): The timesteps over which to linearly anneal
            the scaling factor (after(!) having used random actions for
            `random_timesteps` steps).
        scale_schedule (Optional[Schedule]): An optional Schedule object
            to use (instead of constructing one from the given parameters).
        framework (Optional[str]): One of "tf", "torch"; must not be None.
        **kwargs: Forwarded to the parent constructor and to the Random
            exploration module.
    """
    assert framework is not None
    super().__init__(action_space, framework=framework, **kwargs)

    # Module producing the random actions of the initial phase.
    self.random_timesteps = random_timesteps
    self.random_exploration = Random(
        action_space, framework=self.framework, **kwargs)

    self.stddev = stddev

    # The `scale` annealing schedule.
    if not scale_schedule:
        scale_schedule = PiecewiseSchedule(
            endpoints=[
                (random_timesteps, initial_scale),
                (random_timesteps + scale_timesteps, final_scale),
            ],
            outside_value=final_scale,
            framework=self.framework)
    self.scale_schedule = scale_schedule

    # The current timestep value (tf-var or python int).
    self.last_timestep = get_variable(
        0, framework=self.framework, tf_name="timestep")
def __init__(self, config):
    """Creates the KL coefficient variable and, for "tf", its update op.

    Args:
        config: Dict-like config providing `kl_coeff`, `kl_target` and
            `framework`.
    """
    coeff = config["kl_coeff"]
    # The current KL coefficient as read from the config.
    self.kl_coeff_val = coeff
    # The same coefficient as a non-trainable variable for in-graph
    # operations.
    self.kl_coeff = get_variable(
        float(coeff),
        tf_name="kl_coeff",
        trainable=False,
        framework=config["framework"])
    # Constant target value.
    self.kl_target = config["kl_target"]

    # For framework "tf": a placeholder plus an assign op, so that a new
    # coefficient can be fed into the variable.
    if self.framework == "tf":
        placeholder = tf1.placeholder(dtype=tf.float32, name="kl_coeff")
        self._kl_coeff_placeholder = placeholder
        self._kl_coeff_update = self.kl_coeff.assign(
            self._kl_coeff_placeholder, read_value=False)
def setup_mixins(policy: Policy, obs_space: gym.spaces.Space,
                 action_space: gym.spaces.Space,
                 config: TrainerConfigDict) -> None:
    """Initializes the value branch and, if needed, the advantage norm.

    Args:
        policy: The Policy to set up.
        obs_space: Observation space, passed to ValueNetworkMixin.
        action_space: Action space, passed to ValueNetworkMixin.
        config: Config, passed to ValueNetworkMixin.
    """
    # Setup Value branch of our NN.
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)

    # Not needed for pure BC (beta == 0.0).
    if policy.config["beta"] != 0.0:
        # Non-trainable tf-var for the moving avg ("c^2" in the paper);
        # created here to make it work with eager mode.
        start_value = policy.config["moving_average_sqd_adv_norm_start"]
        policy._moving_average_sqd_adv_norm = get_variable(
            start_value,
            framework="tf",
            tf_name="moving_average_of_advantage_norm",
            trainable=False)
def __init__(self, config):
    """Creates the global step and one Adam optimizer each for actor/critic.

    Args:
        config: Dict-like config providing `framework`, `actor_lr` and
            `critic_lr`.
    """
    if config["framework"] in ["tf2", "tfe"]:
        # Eager mode.
        self.global_step = get_variable(0, tf_name="global_step")
        adam = tf.keras.optimizers.Adam
    else:
        # Static graph mode.
        self.global_step = tf1.train.get_or_create_global_step()
        adam = tf1.train.AdamOptimizer

    # Same construction in both modes; only the Adam class differs.
    self._actor_optimizer = adam(learning_rate=config["actor_lr"])
    self._critic_optimizer = adam(learning_rate=config["critic_lr"])
def optimizer(self) -> List["tf.keras.optimizers.Optimizer"]:
    """Creates separate optimizers for actor & critic losses.

    Also (re)sets `self.global_step` on every call.

    Returns:
        The list `[actor_optimizer, critic_optimizer]`.
    """
    settings = self.config
    if settings["framework"] in ["tf2", "tfe"]:
        # Eager mode.
        self.global_step = get_variable(0, tf_name="global_step")
        adam = tf.keras.optimizers.Adam
    else:
        # Static graph mode.
        self.global_step = tf1.train.get_or_create_global_step()
        adam = tf1.train.AdamOptimizer

    actor_opt = adam(learning_rate=self.config["actor_lr"])
    self._actor_optimizer = actor_opt
    critic_opt = adam(learning_rate=self.config["critic_lr"])
    self._critic_optimizer = critic_opt
    return [self._actor_optimizer, self._critic_optimizer]
def __init__(self,
             action_space,
             initial_epsilon=1.0,
             final_epsilon=0.05,
             epsilon_timesteps=int(1e5),
             num_workers=None,
             worker_index=None,
             epsilon_schedule=None,
             framework="tf"):
    """Builds an epsilon-greedy exploration component.

    Args:
        action_space (Space): The gym action space used by the
            environment. Must be a `gym.spaces.Discrete`.
        initial_epsilon (float): The initial epsilon value to use.
        final_epsilon (float): The final epsilon value to use.
        epsilon_timesteps (int): The time step after which epsilon should
            always be `final_epsilon`.
        num_workers (Optional[int]): The overall number of workers used.
        worker_index (Optional[int]): The index of the Worker using this
            Exploration.
        epsilon_schedule (Optional[Schedule]): An optional Schedule object
            to use (instead of constructing one from the given parameters).
        framework (Optional[str]): One of "tf", "torch"; must not be None.
    """
    # For now, require Discrete action space (may loosen this restriction
    # in the future).
    assert isinstance(action_space, gym.spaces.Discrete)
    assert framework is not None
    super().__init__(
        action_space=action_space,
        num_workers=num_workers,
        worker_index=worker_index,
        framework=framework)

    if not epsilon_schedule:
        # Default two-point schedule from initial to final epsilon.
        epsilon_schedule = PiecewiseSchedule(
            endpoints=[(0, initial_epsilon),
                       (epsilon_timesteps, final_epsilon)],
            outside_value=final_epsilon,
            framework=self.framework)
    self.epsilon_schedule = epsilon_schedule

    # The current timestep value (tf-var or python int).
    self.last_timestep = get_variable(
        0, framework=framework, tf_name="timestep")
def __init__(
        self,
        obs_space,
        action_space,
        config,
        existing_model=None,
        existing_inputs=None,
):
    """Initializes the policy: base class, mixins, then optimizer/loss.

    Args:
        obs_space: Observation space, handed to the base class.
        action_space: Action space, handed to the base class.
        config: Overrides applied on top of the MARWILConfig defaults.
        existing_model: Passed through to the base class.
        existing_inputs: Passed through to the base class.
    """
    # First thing first, enable eager execution if necessary.
    base.enable_eager_execution_if_necessary()

    # Start from the algorithm defaults; user-supplied keys take priority.
    defaults = ray.rllib.algorithms.marwil.marwil.MARWILConfig().to_dict()
    config = dict(defaults, **config)

    # Initialize base class.
    base.__init__(
        self,
        obs_space,
        action_space,
        config,
        existing_inputs=existing_inputs,
        existing_model=existing_model,
    )

    ValueNetworkMixin.__init__(self, config)
    PostprocessAdvantages.__init__(self)

    # Not needed for pure BC (beta == 0.0).
    if config["beta"] != 0.0:
        # Non-trainable tf-var for the moving avg ("c^2" in the paper);
        # created here to make it work with eager mode.
        start_value = config["moving_average_sqd_adv_norm_start"]
        self._moving_average_sqd_adv_norm = get_variable(
            start_value,
            framework="tf",
            tf_name="moving_average_of_advantage_norm",
            trainable=False,
        )

    # Note: this is a bit ugly, but loss and optimizer initialization must
    # happen after all the MixIns are initialized.
    self.maybe_initialize_optimizer_and_loss()
def __init__(self,
             action_space,
             *,
             framework: str,
             initial_epsilon: float = 1.0,
             final_epsilon: float = 0.05,
             epsilon_timesteps: int = int(1e5),
             epsilon_schedule: Optional[Schedule] = None,
             **kwargs):
    """Builds an EpsilonGreedy exploration component.

    Args:
        action_space: Action space handed to the parent constructor.
        framework: Framework specifier; must not be None.
        initial_epsilon (float): The initial epsilon value to use.
        final_epsilon (float): The final epsilon value to use.
        epsilon_timesteps (int): The time step after which epsilon should
            always be `final_epsilon`.
        epsilon_schedule (Optional[Schedule]): An optional Schedule object
            to use (instead of constructing one from the given parameters).
        **kwargs: Forwarded to the parent constructor.
    """
    assert framework is not None
    super().__init__(
        action_space=action_space, framework=framework, **kwargs)

    schedule = from_config(Schedule, epsilon_schedule, framework=framework)
    if not schedule:
        # Nothing usable was passed in: build the default schedule.
        schedule = PiecewiseSchedule(
            endpoints=[(0, initial_epsilon),
                       (epsilon_timesteps, final_epsilon)],
            outside_value=final_epsilon,
            framework=self.framework)
    self.epsilon_schedule = schedule

    # The current timestep value (tf-var or python int), kept as int64.
    initial_step = np.array(0, np.int64)
    self.last_timestep = get_variable(
        initial_step,
        framework=framework,
        tf_name="timestep",
        dtype=np.int64)

    # Build the tf-info-op.
    if self.framework in ["tf2", "tf", "tfe"]:
        info_op = self.get_info()
        self._tf_info_op = info_op
def __init__(self,
             action_space,
             *,
             framework: str,
             initial_epsilon=1.0,
             final_epsilon=0.05,
             epsilon_timesteps=int(1e5),
             epsilon_schedule=None,
             **kwargs):
    """Create an EpsilonGreedy exploration class.

    Args:
        action_space: Action space handed to the parent constructor.
        framework: Framework specifier; must not be None.
        initial_epsilon (float): The initial epsilon value to use.
        final_epsilon (float): The final epsilon value to use.
        epsilon_timesteps (int): The time step after which epsilon should
            always be `final_epsilon`.
        epsilon_schedule (Optional[Schedule]): An optional Schedule object
            to use (instead of constructing one from the given parameters).
        **kwargs: Forwarded to the parent constructor.

    Raises:
        ValueError: If the resolved framework is "tf".
    """
    assert framework is not None
    super().__init__(
        action_space=action_space, framework=framework, **kwargs)

    self.epsilon_schedule = \
        from_config(Schedule, epsilon_schedule, framework=framework) or \
        PiecewiseSchedule(
            endpoints=[
                (0, initial_epsilon), (epsilon_timesteps, final_epsilon)],
            outside_value=final_epsilon,
            framework=self.framework)

    # The current timestep value (tf-var or python int).
    self.last_timestep = get_variable(
        0, framework=framework, tf_name="timestep")

    # The "tf" framework is rejected here. The message used to blame the
    # "Torch version", which contradicted the condition that triggers it.
    if self.framework == "tf":
        raise ValueError("TF version does not support "
                         "multiobj epsilon-greedy yet!")
def __init__(self, entropy_coeff, entropy_coeff_schedule):
    """Creates the entropy-coefficient variable and its schedule.

    Args:
        entropy_coeff: Initial coefficient; also the constant value used
            when no schedule is given.
        entropy_coeff_schedule: None for a constant schedule, a list of
            schedule endpoints, or any other value, which is used as the
            timestep at which the coefficient reaches 0.0.
    """
    self.entropy_coeff = get_variable(
        entropy_coeff,
        framework="tf",
        tf_name="entropy_coeff",
        trainable=False)

    if entropy_coeff_schedule is None:
        schedule = ConstantSchedule(entropy_coeff, framework=None)
    elif isinstance(entropy_coeff_schedule, list):
        # Custom schedule, similar to the lr_schedule format. The value of
        # the last endpoint is used outside the endpoints.
        last_value = entropy_coeff_schedule[-1][-1]
        schedule = PiecewiseSchedule(
            entropy_coeff_schedule,
            outside_value=last_value,
            framework=None)
    else:
        # Implements the previous version but enforces outside_value.
        schedule = PiecewiseSchedule(
            [[0, entropy_coeff], [entropy_coeff_schedule, 0.0]],
            outside_value=0.0,
            framework=None)
    self.entropy_coeff_schedule = schedule
def __init__(self,
             action_space,
             *,
             framework: str,
             policy_config: dict,
             model: ModelV2,
             initial_stddev: float = 1.0,
             random_timesteps: int = 10000,
             sub_exploration: Optional[dict] = None,
             **kwargs):
    """Sets up a ParameterNoise Exploration object.

    Args:
        action_space: Action space; selects the default sub-exploration.
        framework: Framework specifier; must not be None.
        policy_config: Policy config, passed to the parent constructor.
        model: Model whose trainable variables receive the noise.
        initial_stddev (float): The initial stddev to use for the noise.
        random_timesteps (int): The number of timesteps to act completely
            randomly (see [1]).
        sub_exploration (Optional[dict]): Optional sub-exploration config.
            None for auto-detection/setup.
        **kwargs: Forwarded to the parent constructor and to the
            sub-exploration.

    Raises:
        NotImplementedError: If `sub_exploration` is None and the action
            space is neither Discrete nor Box.
    """
    assert framework is not None
    super().__init__(
        action_space,
        policy_config=policy_config,
        model=model,
        framework=framework,
        **kwargs)

    self.stddev = get_variable(
        initial_stddev, framework=self.framework, tf_name="stddev")
    # Out-of-graph tf value holder.
    self.stddev_val = initial_stddev

    # The Model variables to perturb: every trainable variable whose name
    # does not contain "LayerNorm".
    named_weights = self.model.trainable_variables(as_dict=True)
    self.model_variables = [
        weight for key, weight in named_weights.items()
        if "LayerNorm" not in key
    ]

    # One zero-initialized noise tensor per Model variable; item i of
    # `self.noise` holds the noise to be added to model variable i.
    self.noise = []
    for weight in self.model_variables:
        if weight.name:
            noise_name = weight.name.split(":")[0] + "_noisy"
        else:
            noise_name = ""
        noise_var = get_variable(
            np.zeros(weight.shape, dtype=np.float32),
            framework=self.framework,
            tf_name=noise_name,
            torch_tensor=True,
            device=self.device)
        self.noise.append(noise_var)

    # tf-specific ops to sample, assign and remove noise.
    if self.framework == "tf" and not tf.executing_eagerly():
        self.tf_sample_new_noise_op = self._tf_sample_new_noise_op()
        self.tf_add_stored_noise_op = self._tf_add_stored_noise_op()
        self.tf_remove_noise_op = self._tf_remove_noise_op()
        # Convenience sample+add op: the add runs after the sampling, the
        # final no-op after the add.
        with tf1.control_dependencies([self.tf_sample_new_noise_op]):
            add_op = self._tf_add_stored_noise_op()
        with tf1.control_dependencies([add_op]):
            self.tf_sample_new_noise_and_add_op = tf.no_op()

    # Whether the Model's weights currently have noise added or not.
    self.weights_are_currently_noisy = False

    # Auto-detection of underlying exploration functionality.
    if sub_exploration is None:
        if isinstance(self.action_space, Discrete):
            # Discrete spaces: EpsilonGreedy with a step-function schedule
            # (see [2]).
            step_schedule = {
                "type": "PiecewiseSchedule",
                "endpoints": [(0, 1.0), (random_timesteps + 1, 1.0),
                              (random_timesteps + 2, 0.01)],
                "outside_value": 0.01
            }
            sub_exploration = {
                "type": "EpsilonGreedy",
                "epsilon_schedule": step_schedule
            }
        elif isinstance(self.action_space, Box):
            sub_exploration = {
                "type": "OrnsteinUhlenbeckNoise",
                "random_timesteps": random_timesteps,
            }
        # TODO(sven): Implement for any action space.
        else:
            raise NotImplementedError

    self.sub_exploration = from_config(
        Exploration,
        sub_exploration,
        framework=self.framework,
        action_space=self.action_space,
        policy_config=self.policy_config,
        model=self.model,
        **kwargs)

    # Whether we need to call `self._delayed_on_episode_start` before
    # the forward pass.
    self.episode_started = False
def before_init_fn(policy, obs_space, action_space, config):
    """Attaches a global step (counting update operations) to `policy`.

    Args:
        policy: The policy that receives the `global_step` attribute.
        obs_space: Observation space (not used in this function).
        action_space: Action space (not used in this function).
        config: Config; only `config["framework"]` is read.
    """
    # A plain variable is used only under TF version 2 with the "tfe"
    # framework; otherwise the TF1 global step is fetched or created.
    eager_tf2 = tfv == 2 and config["framework"] == "tfe"
    if eager_tf2:
        step = get_variable(0, tf_name="global_step")
    else:
        step = tf1.train.get_or_create_global_step()
    policy.global_step = step