@classmethod
def get_default_config(cls) -> TrainerConfigDict:
    return Trainer.merge_trainer_configs(
        DEFAULT_CONFIG,
        {
            # Use UpperConfidenceBound exploration.
            "exploration_config": {"type": "UpperConfidenceBound"}
        },
    )
@classmethod
def get_default_config(cls) -> TrainerConfigDict:
    config = Trainer.merge_trainer_configs(
        DEFAULT_CONFIG,
        {
            # Use ThompsonSampling exploration.
            "exploration_config": {"type": "ThompsonSampling"}
        },
    )
    return config
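# The two `get_default_config` overrides above both rely on
# `Trainer.merge_trainer_configs` overlaying a small override dict onto a
# base config. Below is a minimal sketch of that deep-merge behavior, under
# the assumption that nested dicts are merged recursively; the helper
# `deep_merge_sketch` is hypothetical and not part of RLlib.

def deep_merge_sketch(base: dict, overrides: dict) -> dict:
    """Recursively overlay `overrides` onto a copy of `base`."""
    merged = dict(base)
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge_sketch(merged[key], value)
        else:
            merged[key] = value
    return merged

# Overriding only the exploration type keeps all other base-config keys
# intact:
merged = deep_merge_sketch(
    {"exploration_config": {"type": "StochasticSampling"}, "lr": 5e-4},
    {"exploration_config": {"type": "UpperConfidenceBound"}},
)
assert merged == {
    "exploration_config": {"type": "UpperConfidenceBound"},
    "lr": 5e-4,
}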
def add_policy(self, policy_id: PolicyID, policy_spec: PolicySpec):
    # Merge the policy's config overrides with the main config.
    # Also, adjust `num_gpus` (to indicate an individual policy's
    # num_gpus, not the total number of GPUs).
    cfg = Trainer.merge_trainer_configs(
        self.config,
        dict(policy_spec.config, **{"num_gpus": self.num_gpus_per_policy}),
    )
    # Need to create the replay actor first. Then add the first policy.
    if self.replay_actor is None:
        return self._add_replay_buffer_and_policy(policy_id, policy_spec, cfg)

    # Replay actor already exists -> Just add a new policy here.
    assert len(self.policy_actors) < self.max_num_policies

    actual_policy_class = get_tf_eager_cls_if_necessary(
        policy_spec.policy_class, cfg
    )

    colocated = create_colocated_actors(
        actor_specs=[
            (
                ray.remote(
                    num_cpus=1,
                    num_gpus=self.num_gpus_per_policy
                    if not cfg["_fake_gpus"]
                    else 0,
                )(actual_policy_class),
                # Policy c'tor args.
                (policy_spec.observation_space, policy_spec.action_space, cfg),
                # Policy c'tor kwargs={}.
                {},
                # Count=1.
                1,
            )
        ],
        # Force co-locate on the already existing replay actor's node.
        node=ray.get(self.replay_actor.get_host.remote()),
    )
    self.policy_actors[policy_id] = colocated[0][0]
    return self.policy_actors[policy_id]
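# `add_policy` above uses `ray.remote(...)(actual_policy_class)` to turn a
# plain policy class into a Ray actor class at runtime. A minimal,
# self-contained illustration of that pattern (the `Counter` class is just a
# stand-in for the policy class):

import ray


class Counter:
    def __init__(self, start: int):
        self.value = start

    def increment(self) -> int:
        self.value += 1
        return self.value


ray.init(ignore_reinit_error=True)

# Wrap the plain class as a remote actor class with a resource request,
# then instantiate and call it as a remote actor.
RemoteCounter = ray.remote(num_cpus=1)(Counter)
counter = RemoteCounter.remote(10)
assert ray.get(counter.increment.remote()) == 11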
DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    SIMPLEQ_DEFAULT_CONFIG,
    {
        # === Model ===
        # Number of atoms for representing the distribution of return. When
        # this is greater than 1, distributional Q-learning is used.
        # The discrete supports are bounded by `v_min` and `v_max`.
        "num_atoms": 1,
        "v_min": -10.0,
        "v_max": 10.0,
        # Whether to use a noisy network.
        "noisy": False,
        # Controls the initial value of noisy nets.
        "sigma0": 0.5,
        # Whether to use dueling DQN.
        "dueling": True,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": [256],
        # Whether to use double DQN.
        "double_q": True,
        # N-step Q-learning.
        "n_step": 1,

        # === Prioritized replay buffer ===
        # If True, a prioritized replay buffer will be used.
        "prioritized_replay": True,
        # Alpha parameter for the prioritized replay buffer.
        "prioritized_replay_alpha": 0.6,
        # Beta parameter for sampling from the prioritized replay buffer.
        "prioritized_replay_beta": 0.4,
        # Final value of beta (by default, we use constant beta=0.4).
        "final_prioritized_replay_beta": 0.4,
        # Time steps over which the beta parameter is annealed.
        "prioritized_replay_beta_annealing_timesteps": 20000,
        # Epsilon to add to the TD errors when updating priorities.
        "prioritized_replay_eps": 1e-6,

        # Callback to run before learning on a multi-agent batch of
        # experiences.
        "before_learn_on_batch": None,

        # The intensity with which to update the model (vs collecting samples
        # from the env). If None, uses the "natural" value of:
        # `train_batch_size` / (`rollout_fragment_length` x `num_workers` x
        # `num_envs_per_worker`).
        # If provided, will make sure that the ratio between timesteps
        # inserted into and sampled from the buffer matches the given value.
        # Example:
        #   training_intensity=1000.0
        #   train_batch_size=250; rollout_fragment_length=1
        #   num_workers=1 (or 0); num_envs_per_worker=1
        #   -> natural value = 250 / 1 = 250.0
        #   -> will make sure that replay+train op will be executed 4x as
        #      often as rollout+insert op (4 * 250 = 1000).
        # See rllib/agents/dqn/dqn.py::calculate_rr_weights for further
        # details.
        "training_intensity": None,

        # === Parallelism ===
        # Whether to compute priorities on workers.
        "worker_side_prioritization": False,
    },
    _allow_unknown_configs=True,
)
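# Working through the `training_intensity` example from the comment above
# as plain arithmetic (the variable names `natural_value` and `weight` are
# illustrative, not RLlib internals):

train_batch_size = 250
rollout_fragment_length = 1
num_workers = 1
num_envs_per_worker = 1
training_intensity = 1000.0

# "Natural" ratio of timesteps trained on vs timesteps collected.
natural_value = train_batch_size / (
    rollout_fragment_length * num_workers * num_envs_per_worker
)
assert natural_value == 250.0

# Replay+train ops per rollout+insert op needed to reach the requested
# intensity (4 * 250 = 1000).
weight = training_intensity / natural_value
assert weight == 4.0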
R2D2_DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    DQN_DEFAULT_CONFIG,  # See keys in impala.py, which are also supported.
    {
        # Learning rate for the Adam optimizer.
        "lr": 1e-4,
        # Discount factor.
        "gamma": 0.997,
        # Train batch size (in number of single timesteps).
        "train_batch_size": 64 * 20,
        # Adam epsilon hyperparameter.
        "adam_epsilon": 1e-3,
        # Run in parallel by default.
        "num_workers": 2,
        # Batch mode must be complete_episodes.
        "batch_mode": "complete_episodes",

        # === Replay buffer ===
        "replay_buffer_config": {
            # For now, we don't use the new ReplayBuffer API here.
            "_enable_replay_buffer_api": False,
            "type": "MultiAgentReplayBuffer",
            "prioritized_replay": False,
            "prioritized_replay_alpha": 0.6,
            # Beta parameter for sampling from the prioritized replay buffer.
            "prioritized_replay_beta": 0.4,
            # Epsilon to add to the TD errors when updating priorities.
            "prioritized_replay_eps": 1e-6,
            # Size of the replay buffer (in sequences, not timesteps).
            "capacity": 100000,
            # Set automatically: The number of contiguous environment steps to
            # replay at once. Will be calculated via
            # model->max_seq_len + burn_in.
            # Do not set this to any valid value!
            "replay_sequence_length": -1,
        },

        # If True, assume a zero-initialized state input (no matter where in
        # the episode the sequence is located).
        # If False, store the initial states along with each SampleBatch, use
        # it (as initial state when running through the network for training),
        # and update that initial state during training (from the internal
        # state outputs of the immediately preceding sequence).
        "zero_init_states": True,
        # If > 0, use the `burn_in` first steps of each replay-sampled
        # sequence (starting either from all 0.0-values if
        # `zero_init_states=True` or from the already stored values) to
        # calculate an even more accurate initial state for the actual
        # sequence (starting after this burn-in window). In the burn-in case,
        # the actual length of the sequence used for loss calculation is
        # `n - burn_in` time steps (n=LSTM's/attention net's max_seq_len).
        "burn_in": 0,
        # Whether to use the h-function from the paper [1] to scale target
        # values in the R2D2 loss function:
        # h(x) = sign(x)(√(|x| + 1) − 1) + εx
        "use_h_function": True,
        # The epsilon parameter from the R2D2 loss function (only used
        # if `use_h_function`=True).
        "h_function_epsilon": 1e-3,

        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": 2500,

        # Experimental flag.
        # If True, the execution plan API will not be used. Instead,
        # a Trainer's `training_iteration` method will be called as-is each
        # training iteration.
        "_disable_execution_plan_api": False,
    },
    _allow_unknown_configs=True,
)
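# The invertible rescaling function referenced above,
# h(x) = sign(x)(√(|x| + 1) − 1) + εx, together with its closed-form
# inverse, sketched in NumPy (function names are illustrative):

import numpy as np


def h(x: np.ndarray, epsilon: float = 1e-3) -> np.ndarray:
    # h(x) = sign(x) * (sqrt(|x| + 1) - 1) + epsilon * x
    return np.sign(x) * (np.sqrt(np.abs(x) + 1.0) - 1.0) + epsilon * x


def h_inverse(x: np.ndarray, epsilon: float = 1e-3) -> np.ndarray:
    # Exact algebraic inverse of h.
    return np.sign(x) * (
        (
            (np.sqrt(1.0 + 4.0 * epsilon * (np.abs(x) + 1.0 + epsilon)) - 1.0)
            / (2.0 * epsilon)
        )
        ** 2
        - 1.0
    )


x = np.linspace(-100.0, 100.0, 11)
assert np.allclose(h_inverse(h(x)), x)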
DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    PPO_DEFAULT_CONFIG,
    {
        # During the sampling phase, each rollout worker will collect a batch
        # of `rollout_fragment_length * num_envs_per_worker` steps.
        "rollout_fragment_length": 100,
        # Vectorize the env (should be enabled by default, since each worker
        # has a GPU).
        "num_envs_per_worker": 5,
        # During the SGD phase, workers iterate over minibatches of this size.
        # The effective minibatch size will be:
        # `sgd_minibatch_size * num_workers`.
        "sgd_minibatch_size": 50,
        # Number of SGD epochs per optimization round.
        "num_sgd_iter": 10,
        # Download weights between each training step. This adds a bit of
        # overhead but allows the user to access the weights from the trainer.
        "keep_local_weights_in_sync": True,

        # *** WARNING: configs below are DDPPO overrides over PPO; you
        # shouldn't need to adjust them. ***
        # DDPPO requires PyTorch distributed.
        "framework": "torch",
        # The communication backend for PyTorch distributed.
        "torch_distributed_backend": "gloo",
        # Learning is no longer done on the driver process, so
        # giving GPUs to the driver does not make sense!
        "num_gpus": 0,
        # Each rollout worker gets a GPU.
        "num_gpus_per_worker": 1,
        # Require evenly sized batches. Otherwise,
        # collective allreduce could fail.
        "truncate_episodes": True,
        # This is auto-set based on sample batch size.
        "train_batch_size": -1,
        # The KL divergence penalty should be fixed to 0 in DDPPO, because
        # in order to use it as a penalty, we would have to un-decentralize
        # DDPPO.
        "kl_coeff": 0.0,
        "kl_target": 0.0,
    },
    _allow_unknown_configs=True,
)
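# The batch-size relationships spelled out in the comments above, as plain
# arithmetic (a sketch; `num_workers` here is an example value, not one of
# the overrides):

rollout_fragment_length = 100
num_envs_per_worker = 5
sgd_minibatch_size = 50
num_workers = 2

# Steps each rollout worker collects during one sampling phase.
per_worker_batch = rollout_fragment_length * num_envs_per_worker
assert per_worker_batch == 500

# Effective minibatch size across all workers during the SGD phase.
effective_minibatch = sgd_minibatch_size * num_workers
assert effective_minibatch == 100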
DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    appo.DEFAULT_CONFIG,  # See keys in appo.py, which are also supported.
    {
        # TODO: Unify the buffer API, then clean up our existing
        #  implementations of different buffers.

        # This is the number of batches held at any time for each policy.
        "replay_buffer_capacity": 20,
        # E.g. ratio=0.2 -> 20% of samples in each train batch are
        # old (replayed) ones.
        "replay_buffer_replay_ratio": 0.5,

        # Timeout to use for `ray.wait()` when waiting for samplers to have
        # placed new data into the buffers. If no samples are ready within
        # the timeout, the buffers used for mixin-sampling will return only
        # older samples.
        "sample_wait_timeout": 0.0,
        # Timeout to use for `ray.wait()` when waiting for the policy learner
        # actors to have performed an update and returned learning stats. If
        # no learner actors have produced any learning results in the
        # meantime, their learner-stats in the results will be empty for that
        # iteration.
        "learn_wait_timeout": 0.0,

        # League-building parameters.
        # The LeagueBuilder class to be used for league building logic.
        "league_builder_config": {
            "type": AlphaStarLeagueBuilder,
            # The number of random policies to add to the league. This must
            # be an even number (including 0), as these will be evenly
            # distributed amongst league- and main-exploiters.
            "num_random_policies": 2,
            # The number of initially learning league-exploiters to create.
            "num_learning_league_exploiters": 4,
            # The number of initially learning main-exploiters to create.
            "num_learning_main_exploiters": 4,
            # Minimum win-rate (between 0.0 = 0% and 1.0 = 100%) of any
            # policy to be considered for snapshotting (cloning). The cloned
            # copy may then be frozen (no further learning) or keep learning
            # (independent of its ancestor policy).
            # Set this to lower values to speed up league growth.
            "win_rate_threshold_for_new_snapshot": 0.9,
            # If we took a new snapshot of any given policy, what's the
            # probability that this snapshot will continue to be trainable
            # (rather than become frozen/non-trainable)? By default, only
            # keep those policies trainable that have been trainable from the
            # very beginning.
            "keep_new_snapshot_training_prob": 0.0,
            # Probabilities of the different match-types:
            # LE: Learning league_exploiter vs any.
            # ME: Learning main exploiter vs any main.
            # M: Main self-play (p=1.0 - LE - ME).
            "prob_league_exploiter_match": 0.33,
            "prob_main_exploiter_match": 0.33,
            # Only for ME matches: Prob to play against a learning
            # main (vs a snapshot main).
            "prob_main_exploiter_playing_against_learning_main": 0.5,
        },

        # The maximum number of trainable policies for this Trainer.
        # Each trainable policy will exist as an independent remote actor,
        # co-located with a replay buffer. This is besides its existence
        # inside the RolloutWorkers for training and evaluation.
        # Set to None to automatically infer this value from the number of
        # trainable policies found in the `multiagent` config.
        "max_num_policies_to_train": None,

        # By default, don't drop the last timestep.
        # TODO: We should do the same for IMPALA and APPO at some point.
        "vtrace_drop_last_ts": False,

        # Reporting interval.
        "min_time_s_per_reporting": 2,

        # Use the `training_iteration` method instead of an execution plan.
        "_disable_execution_plan_api": True,
    },
    _allow_unknown_configs=True,
)
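# A hedged sketch of how the three match-type probabilities above could be
# turned into a sampling decision; `sample_match_type` is hypothetical and
# not the actual LeagueBuilder logic.

import random


def sample_match_type(
    prob_league_exploiter_match: float = 0.33,
    prob_main_exploiter_match: float = 0.33,
) -> str:
    # Main self-play gets the remaining probability mass
    # (p = 1.0 - LE - ME).
    r = random.random()
    if r < prob_league_exploiter_match:
        return "league_exploiter_vs_any"
    elif r < prob_league_exploiter_match + prob_main_exploiter_match:
        return "main_exploiter_vs_main"
    return "main_self_play"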
DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    SIMPLEQ_DEFAULT_CONFIG,
    {
        # === Model ===
        # Number of atoms for representing the distribution of return. When
        # this is greater than 1, distributional Q-learning is used.
        # The discrete supports are bounded by `v_min` and `v_max`.
        "num_atoms": 1,
        "v_min": -10.0,
        "v_max": 10.0,
        # Whether to use a noisy network.
        "noisy": False,
        # Controls the initial value of noisy nets.
        "sigma0": 0.5,
        # Whether to use dueling DQN.
        "dueling": True,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": [256],
        # Whether to use double DQN.
        "double_q": True,
        # N-step Q-learning.
        "n_step": 1,

        # === Replay buffer ===
        # Deprecated; use `capacity` in `replay_buffer_config` instead.
        "buffer_size": DEPRECATED_VALUE,
        "replay_buffer_config": {
            # Use the new ReplayBuffer API here.
            "_enable_replay_buffer_api": True,
            "type": "MultiAgentPrioritizedReplayBuffer",
            # Size of the replay buffer. Note that if `async_updates` is set,
            # then each worker will have a replay buffer of this size.
            "capacity": 50000,
            "prioritized_replay_alpha": 0.6,
            # Beta parameter for sampling from the prioritized replay buffer.
            "prioritized_replay_beta": 0.4,
            # Epsilon to add to the TD errors when updating priorities.
            "prioritized_replay_eps": 1e-6,
            # The number of contiguous environment steps to replay at once.
            # This may be set to greater than 1 to support recurrent models.
            "replay_sequence_length": 1,
        },
        # Set this to True if you want the contents of your buffer(s) to be
        # stored in any saved checkpoints as well.
        # Warnings will be created if:
        # - This is True AND restoring from a checkpoint that contains no
        #   buffer data.
        # - This is False AND restoring from a checkpoint that does contain
        #   buffer data.
        "store_buffer_in_checkpoints": False,

        # Callback to run before learning on a multi-agent batch of
        # experiences.
        "before_learn_on_batch": None,

        # The intensity with which to update the model (vs collecting samples
        # from the env). If None, uses the "natural" value of:
        # `train_batch_size` / (`rollout_fragment_length` x `num_workers` x
        # `num_envs_per_worker`).
        # If provided, will make sure that the ratio between timesteps
        # inserted into and sampled from the buffer matches the given value.
        # Example:
        #   training_intensity=1000.0
        #   train_batch_size=250; rollout_fragment_length=1
        #   num_workers=1 (or 0); num_envs_per_worker=1
        #   -> natural value = 250 / 1 = 250.0
        #   -> will make sure that replay+train op will be executed 4x as
        #      often as rollout+insert op (4 * 250 = 1000).
        # See rllib/agents/dqn/dqn.py::calculate_rr_weights for further
        # details.
        "training_intensity": None,

        # === Parallelism ===
        # Whether to compute priorities on workers.
        "worker_side_prioritization": False,

        # Experimental flag.
        # If True, the execution plan API will not be used. Instead,
        # a Trainer's `training_iteration` method will be called as-is each
        # training iteration.
        "_disable_execution_plan_api": True,
    },
    _allow_unknown_configs=True,
)
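# A minimal sketch of how `prioritized_replay_alpha` and
# `prioritized_replay_beta` above enter prioritized experience replay
# (standard PER formulas from Schaul et al.; the helpers are illustrative,
# not RLlib internals):

import numpy as np


def per_sample_probs(td_errors, alpha=0.6, eps=1e-6):
    # Priority p_i = (|TD-error_i| + eps)^alpha; sampling probability is
    # P(i) = p_i / sum_k p_k.
    priorities = (np.abs(td_errors) + eps) ** alpha
    return priorities / priorities.sum()


def per_is_weights(probs, beta=0.4):
    # Importance-sampling weights w_i = (N * P(i))^(-beta), normalized by
    # max(w) for stability.
    weights = (len(probs) * probs) ** (-beta)
    return weights / weights.max()


probs = per_sample_probs(np.array([0.5, 0.1, 2.0]))
weights = per_is_weights(probs)
assert np.isclose(probs.sum(), 1.0) and weights.max() == 1.0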
R2D2_DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    DQN_DEFAULT_CONFIG,  # See keys in dqn.py, which are also supported.
    {
        # Learning rate for the Adam optimizer.
        "lr": 1e-4,
        # Discount factor.
        "gamma": 0.997,
        # Train batch size (in number of single timesteps).
        "train_batch_size": 64,
        # Adam epsilon hyperparameter.
        "adam_epsilon": 1e-3,
        # Run in parallel by default.
        "num_workers": 2,
        # Batch mode must be complete_episodes.
        "batch_mode": "complete_episodes",

        # === Replay buffer ===
        "replay_buffer_config": {
            "type": "MultiAgentReplayBuffer",
            # Specify prioritized replay by supplying a buffer type that
            # supports prioritization, for example:
            # MultiAgentPrioritizedReplayBuffer.
            "prioritized_replay": DEPRECATED_VALUE,
            # Size of the replay buffer (in sequences, not timesteps).
            "capacity": 100000,
            "storage_unit": "sequences",
            # Set automatically: The number of contiguous environment steps to
            # replay at once. Will be calculated via
            # model->max_seq_len + burn_in.
            # Do not set this to any valid value!
            "replay_sequence_length": -1,
            # If > 0, use the `replay_burn_in` first steps of each
            # replay-sampled sequence (starting either from all 0.0-values if
            # `zero_init_states=True` or from the already stored values) to
            # calculate an even more accurate initial state for the actual
            # sequence (starting after this burn-in window). In the burn-in
            # case, the actual length of the sequence used for loss
            # calculation is `n - replay_burn_in` time steps
            # (n=LSTM's/attention net's max_seq_len).
            "replay_burn_in": 0,
        },

        # If True, assume a zero-initialized state input (no matter where in
        # the episode the sequence is located).
        # If False, store the initial states along with each SampleBatch, use
        # it (as initial state when running through the network for training),
        # and update that initial state during training (from the internal
        # state outputs of the immediately preceding sequence).
        "zero_init_states": True,
        # Whether to use the h-function from the paper [1] to scale target
        # values in the R2D2 loss function:
        # h(x) = sign(x)(√(|x| + 1) − 1) + εx
        "use_h_function": True,
        # The epsilon parameter from the R2D2 loss function (only used
        # if `use_h_function`=True).
        "h_function_epsilon": 1e-3,

        # Update the target network every `target_network_update_freq` sample
        # steps.
        "target_network_update_freq": 2500,

        # Deprecated keys:
        # Use config["replay_buffer_config"]["replay_burn_in"] instead.
        "burn_in": DEPRECATED_VALUE,
    },
    _allow_unknown_configs=True,
)
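# The auto-calculated `replay_sequence_length` described above, as plain
# arithmetic (example values; illustrative names):

max_seq_len = 20     # model->max_seq_len (example value).
replay_burn_in = 40  # From `replay_buffer_config` (example value).

# Set automatically: the replayed sequence covers the burn-in window (used
# only to warm up the recurrent state) plus the portion entering the loss.
replay_sequence_length = max_seq_len + replay_burn_in
assert replay_sequence_length == 60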