import numpy as np


class RandomAgent(Agent):
    """Agent that takes random actions and never learns."""

    _agent_name = "RandomAgent"
    _default_config = with_common_config({
        "rollouts_per_iteration": 10,
    })

    @override(Agent)
    def _init(self, config, env_creator):
        self.env = env_creator(config["env_config"])

    @override(Agent)
    def _train(self):
        rewards = []
        steps = 0
        for _ in range(self.config["rollouts_per_iteration"]):
            obs = self.env.reset()
            done = False
            reward = 0.0
            while not done:
                action = self.env.action_space.sample()
                obs, r, done, info = self.env.step(action)
                reward += r
                steps += 1
            rewards.append(reward)
        return {
            "episode_reward_mean": np.mean(rewards),
            "timesteps_this_iter": steps,
        }
class _SigmoidFakeData(_MockAgent):
    """Agent that returns sigmoid learning curves.

    This can be helpful for evaluating early stopping algorithms."""

    _agent_name = "SigmoidFakeData"
    _default_config = with_common_config({
        "width": 100,
        "height": 100,
        "offset": 0,
        "iter_time": 10,
        "iter_timesteps": 1,
        "num_workers": 0,
    })

    def _train(self):
        i = max(0, self.iteration - self.config["offset"])
        v = np.tanh(float(i) / self.config["width"])
        v *= self.config["height"]
        return dict(
            episode_reward_mean=v,
            episode_len_mean=v,
            timesteps_this_iter=self.config["iter_timesteps"],
            time_this_iter_s=self.config["iter_time"],
            info={})
class MaxAgent(Agent):
    """Agent that always takes the maximum available action."""

    _agent_name = "MaxAgent"
    _default_config = with_common_config({})

    def _init(self):
        self.env = self.env_creator(self.config["env_config"])

    def _train(self):
        steps = 0
        done = False
        reward = 0.0
        while not done:
            action = self.env.action_space.high
            obs, r, done, info = self.env.step(action)
            reward += r
            steps += 1
            if steps >= self.config["env_config"]["iterations"]:
                done = True
        return {
            "episode_reward_mean": reward,
            "timesteps_this_iter": steps,
        }
import os
import pickle


class _MockAgent(Agent):
    """Mock agent for use in tests."""

    _agent_name = "MockAgent"
    _default_config = with_common_config({
        "mock_error": False,
        "persistent_error": False,
        "test_variable": 1,
        "num_workers": 0,
    })

    def _init(self):
        self.info = None
        self.restored = False

    def _train(self):
        if self.config["mock_error"] and self.iteration == 1 \
                and (self.config["persistent_error"] or not self.restored):
            raise Exception("mock error")
        return dict(
            episode_reward_mean=10,
            episode_len_mean=10,
            timesteps_this_iter=10,
            info={})

    def _save(self, checkpoint_dir):
        path = os.path.join(checkpoint_dir, "mock_agent.pkl")
        with open(path, "wb") as f:
            pickle.dump(self.info, f)
        return path

    def _restore(self, checkpoint_path):
        with open(checkpoint_path, "rb") as f:
            self.info = pickle.load(f)
        self.restored = True

    def _register_if_needed(self, env_object):
        pass

    def set_info(self, info):
        self.info = info
        return info

    def get_info(self):
        return self.info
class _ParameterTuningAgent(_MockAgent):

    _agent_name = "ParameterTuningAgent"
    _default_config = with_common_config({
        "reward_amt": 10,
        "dummy_param": 10,
        "dummy_param2": 15,
        "iter_time": 10,
        "iter_timesteps": 1,
        "num_workers": 0,
    })

    def _train(self):
        return dict(
            episode_reward_mean=self.config["reward_amt"] * self.iteration,
            episode_len_mean=self.config["reward_amt"],
            timesteps_this_iter=self.config["iter_timesteps"],
            time_this_iter_s=self.config["iter_time"],
            info={})
class MaxAgent(Agent):
    """Agent that always takes the maximum available action."""

    _agent_name = "MaxAgent"
    _default_config = with_common_config({})

    def _init(self):
        self.env = self.env_creator(self.config["env_config"])
        self.env.reset()

    def _train(self):
        steps = 0
        done = False
        reward = 0.0
        max_iterations = self.config["env_config"]["iterations"]
        while steps < max_iterations:
            action = self.env.action_space.high
            obs, r, done, info = self.env.step(action)
            reward += r
            steps += 1
        return {
            "episode_reward_mean": reward,
            "timesteps_this_iter": steps,
        }
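# The custom agents above all follow the same pattern: roll out an environment
# with a fixed (non-learned) policy and report the aggregate episode reward.
# Below is a minimal standalone sketch of that rollout loop, assuming the
# classic Gym API where env.step() returns (obs, reward, done, info); the
# "CartPole-v0" environment is only an illustrative choice, not taken from the
# code above.
import gym
import numpy as np


def random_rollout(env, max_steps=1000):
    """Run one episode with uniformly random actions and return its reward."""
    obs = env.reset()
    done = False
    total_reward = 0.0
    for _ in range(max_steps):
        if done:
            break
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        total_reward += reward
    return total_reward


if __name__ == "__main__":
    env = gym.make("CartPole-v0")
    rewards = [random_rollout(env) for _ in range(10)]
    print("episode_reward_mean:", np.mean(rewards))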
DEFAULT_CONFIG = with_common_config({
    # V-trace params (see vtrace.py).
    "vtrace": True,
    "vtrace_clip_rho_threshold": 1.0,
    "vtrace_clip_pg_rho_threshold": 1.0,
    # System params.
    #
    # == Overview of data flow in IMPALA ==
    # 1. Policy evaluation in parallel across `num_workers` actors produces
    #    batches of size `sample_batch_size * num_envs_per_worker`.
    # 2. If enabled, the replay buffer stores and produces batches of size
    #    `sample_batch_size * num_envs_per_worker`.
    # 3. If enabled, the minibatch ring buffer stores and replays batches of
    #    size `train_batch_size` up to `num_sgd_iter` times per batch.
    # 4. The learner thread executes data parallel SGD across `num_gpus` GPUs
    #    on batches of size `train_batch_size`.
    #
    "sample_batch_size": 50,
    "train_batch_size": 500,
    "min_iter_time_s": 10,
    "num_workers": 2,
    # number of GPUs the learner should use.
    "num_gpus": 1,
    # set >1 to load data into GPUs in parallel. Increases GPU memory usage
    # proportionally with the number of buffers.
    "num_data_loader_buffers": 1,
    # how many train batches should be retained for minibatching. This conf
    # only has an effect if `num_sgd_iter > 1`.
    "minibatch_buffer_size": 1,
    # number of passes to make over each train batch
    "num_sgd_iter": 1,
    # set >0 to enable experience replay. Saved samples will be replayed with
    # a p:1 proportion to new data samples.
    "replay_proportion": 0.0,
    # number of sample batches to store for replay. The number of transitions
    # saved total will be (replay_buffer_num_slots * sample_batch_size).
    "replay_buffer_num_slots": 0,
    # max queue size for train batches feeding into the learner
    "learner_queue_size": 16,
    # level of queuing for sampling.
    "max_sample_requests_in_flight_per_worker": 2,
    # max number of workers to broadcast one set of weights to
    "broadcast_interval": 1,
    # Learning params.
    "grad_clip": 40.0,
    # either "adam" or "rmsprop"
    "opt_type": "adam",
    "lr": 0.0005,
    "lr_schedule": None,
    # rmsprop considered
    "decay": 0.99,
    "momentum": 0.0,
    "epsilon": 0.1,
    # balancing the three losses
    "vf_loss_coeff": 0.5,
    "entropy_coeff": -0.01,
})
DEFAULT_CONFIG = with_common_config({
    # === Model ===
    # Number of atoms for representing the distribution of return. When
    # this is greater than 1, distributional Q-learning is used.
    # the discrete supports are bounded by v_min and v_max
    "num_atoms": 1,
    "v_min": -10.0,
    "v_max": 10.0,
    # Whether to use noisy network
    "noisy": False,
    # control the initial value of noisy nets
    "sigma0": 0.5,
    # Whether to use dueling dqn
    "dueling": True,
    # Whether to use double dqn
    "double_q": True,
    # Hidden layer sizes of the state and action value networks
    "hiddens": [256],
    # N-step Q learning
    "n_step": 1,
    # Whether to use rllib or deepmind preprocessors
    "preprocessor_pref": "deepmind",
    # === Exploration ===
    # Max num timesteps for annealing schedules. Exploration is annealed from
    # 1.0 to exploration_final_eps over this number of timesteps scaled by
    # exploration_fraction
    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
    "exploration_fraction": 0.1,
    # Final value of random action probability
    "exploration_final_eps": 0.02,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 500,
    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    "buffer_size": 50000,
    # If True prioritized replay buffer will be used.
    "prioritized_replay": True,
    # Alpha parameter for prioritized replay buffer.
    "prioritized_replay_alpha": 0.6,
    # Beta parameter for sampling from prioritized replay buffer.
    "prioritized_replay_beta": 0.4,
    # Fraction of entire training period over which the beta parameter is
    # annealed
    "beta_annealing_fraction": 0.2,
    # Final value of beta
    "final_prioritized_replay_beta": 0.4,
    # Epsilon to add to the TD errors when updating priorities.
    "prioritized_replay_eps": 1e-6,
    # Whether to LZ4 compress observations
    "compress_observations": True,
    # === Optimization ===
    # Learning rate for adam optimizer
    "lr": 5e-4,
    # Adam epsilon hyperparameter
    "adam_epsilon": 1e-8,
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": 40,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1000,
    # Update the replay buffer with this many samples at once. Note that
    # this setting applies per-worker if num_workers > 1.
    "sample_batch_size": 4,
    # Size of a batch sampled from the replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 32,
    # === Parallelism ===
    # Whether to use a GPU for local optimization.
    "gpu": False,
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Whether to allocate GPUs for workers (if > 0).
    "num_gpus_per_worker": 0,
    # Whether to allocate CPUs for workers (if > 0).
    "num_cpus_per_worker": 1,
    # Optimizer class to use.
    "optimizer_class": "SyncReplayOptimizer",
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span
    "min_iter_time_s": 1,
})
DEFAULT_CONFIG = with_common_config({
    # === Model ===
    # Number of atoms for representing the distribution of return. When
    # this is greater than 1, distributional Q-learning is used.
    # the discrete supports are bounded by v_min and v_max
    "num_atoms": 1,
    "v_min": -10.0,
    "v_max": 10.0,
    # Whether to use noisy network
    "noisy": False,
    # control the initial value of noisy nets
    "sigma0": 0.5,
    # Whether to use dueling dqn
    "dueling": True,
    # Whether to use double dqn
    "double_q": True,
    # Hidden layer sizes of the state and action value networks
    "hiddens": [256],
    # N-step Q learning
    "n_step": 1,
    # === Evaluation ===
    # Evaluate with epsilon=0 every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    "evaluation_interval": None,
    # Number of episodes to run per evaluation period.
    "evaluation_num_episodes": 10,
    # === Exploration ===
    # Max num timesteps for annealing schedules. Exploration is annealed from
    # 1.0 to exploration_final_eps over this number of timesteps scaled by
    # exploration_fraction
    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
    "exploration_fraction": 0.1,
    # Final value of random action probability
    "exploration_final_eps": 0.02,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 500,
    # Use softmax for sampling actions.
    "soft_q": False,
    # Softmax temperature. Q values are divided by this value prior to softmax.
    # Softmax approaches argmax as the temperature drops to zero.
    "softmax_temp": 1.0,
    # If True parameter space noise will be used for exploration
    # See https://blog.openai.com/better-exploration-with-parameter-noise/
    "parameter_noise": False,
    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    "buffer_size": 50000,
    # If True prioritized replay buffer will be used.
    "prioritized_replay": True,
    # Alpha parameter for prioritized replay buffer.
    "prioritized_replay_alpha": 0.6,
    # Beta parameter for sampling from prioritized replay buffer.
    "prioritized_replay_beta": 0.4,
    # Fraction of entire training period over which the beta parameter is
    # annealed
    "beta_annealing_fraction": 0.2,
    # Final value of beta
    "final_prioritized_replay_beta": 0.4,
    # Epsilon to add to the TD errors when updating priorities.
    "prioritized_replay_eps": 1e-6,
    # Whether to LZ4 compress observations
    "compress_observations": True,
    # === Optimization ===
    # Learning rate for adam optimizer
    "lr": 5e-4,
    # Adam epsilon hyperparameter
    "adam_epsilon": 1e-8,
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": 40,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1000,
    # Update the replay buffer with this many samples at once. Note that
    # this setting applies per-worker if num_workers > 1.
    "sample_batch_size": 4,
    # Size of a batch sampled from the replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 32,
    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Optimizer class to use.
    "optimizer_class": "SyncReplayOptimizer",
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span
    "min_iter_time_s": 1,
})
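# When `soft_q` is enabled in the config above, actions are sampled from a
# softmax over the Q values instead of taking the argmax, with `softmax_temp`
# controlling how peaked the distribution is. A small illustrative sketch of
# that sampling rule (not the library's implementation):
import numpy as np


def soft_q_sample(q_values, softmax_temp=1.0):
    """Sample an action index from softmax(Q / temperature)."""
    logits = np.asarray(q_values, dtype=np.float64) / softmax_temp
    logits -= logits.max()  # for numerical stability
    probs = np.exp(logits) / np.exp(logits).sum()
    return np.random.choice(len(probs), p=probs)


# As softmax_temp -> 0 this approaches argmax; large temperatures approach
# uniform random exploration.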
DEFAULT_CONFIG = with_common_config({
    # === Model ===
    # Number of atoms for representing the distribution of return. When
    # this is greater than 1, distributional Q-learning is used.
    # the discrete supports are bounded by v_min and v_max
    "num_atoms": 1,
    "v_min": -10.0,
    "v_max": 10.0,
    # Whether to use noisy network
    "noisy": False,
    # control the initial value of noisy nets
    "sigma0": 0.5,
    # Whether to use dueling dqn
    "dueling": True,
    # Whether to use double dqn
    "double_q": True,
    # Postprocess model outputs with these hidden layers to compute the
    # state and action values. See also the model config in catalog.py.
    "hiddens": [256],
    # N-step Q learning
    "n_step": 1,
    # === Evaluation ===
    # Evaluate with epsilon=0 every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    "evaluation_interval": None,
    # Number of episodes to run per evaluation period.
    "evaluation_num_episodes": 10,
    # === Exploration ===
    # Max num timesteps for annealing schedules. Exploration is annealed from
    # 1.0 to exploration_final_eps over this number of timesteps scaled by
    # exploration_fraction
    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
    "exploration_fraction": 0.1,
    # Final value of random action probability
    "exploration_final_eps": 0.02,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 500,
    # Use softmax for sampling actions. Required for off policy estimation.
    "soft_q": False,
    # Softmax temperature. Q values are divided by this value prior to softmax.
    # Softmax approaches argmax as the temperature drops to zero.
    "softmax_temp": 1.0,
    # If True parameter space noise will be used for exploration
    # See https://blog.openai.com/better-exploration-with-parameter-noise/
    "parameter_noise": False,
    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    "buffer_size": 50000,
    # If True prioritized replay buffer will be used.
    "prioritized_replay": True,
    # Alpha parameter for prioritized replay buffer.
    "prioritized_replay_alpha": 0.6,
    # Beta parameter for sampling from prioritized replay buffer.
    "prioritized_replay_beta": 0.4,
    # Fraction of entire training period over which the beta parameter is
    # annealed
    "beta_annealing_fraction": 0.2,
    # Final value of beta
    "final_prioritized_replay_beta": 0.4,
    # Epsilon to add to the TD errors when updating priorities.
    "prioritized_replay_eps": 1e-6,
    # Whether to LZ4 compress observations
    "compress_observations": True,
    # === Optimization ===
    # Learning rate for adam optimizer
    "lr": 5e-4,
    # Learning rate schedule
    "lr_schedule": None,
    # Adam epsilon hyperparameter
    "adam_epsilon": 1e-8,
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": 40,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1000,
    # Update the replay buffer with this many samples at once. Note that
    # this setting applies per-worker if num_workers > 1.
    "sample_batch_size": 4,
    # Size of a batch sampled from the replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 32,
    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Optimizer class to use.
    "optimizer_class": "SyncReplayOptimizer",
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span
    "min_iter_time_s": 1,
})
from ray.rllib.utils.annotations import override

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # Scaling of advantages in exponential terms.
    # When beta is 0, MARWIL is reduced to imitation learning.
    "beta": 1.0,
    # Balancing value estimation loss and policy optimization loss
    "vf_coeff": 1.0,
    # Whether to calculate cumulative rewards
    "postprocess_inputs": True,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "complete_episodes",
    # Use importance sampling estimators for reward
    "input_evaluation": ["is", "wis"],
    # Learning rate for adam optimizer
    "lr": 1e-4,
    # Number of timesteps collected for each SGD round
    "train_batch_size": 2000,
    # Max number of steps to keep in the batch replay buffer
    "replay_buffer_size": 100000,
    # Number of steps to read before learning starts
    "learning_starts": 0,
    # === Parallelism ===
    "num_workers": 0,
})
# __sphinx_doc_end__
# yapf: enable
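# With `beta` = 0 the exponential advantage weight below collapses to 1 for
# every sample, so the policy loss reduces to plain imitation (behavior
# cloning) of the logged actions, as the comment above notes. Illustrative
# sketch of the weighting term only, not the full MARWIL loss:
import numpy as np


def marwil_weights(advantages, beta=1.0):
    """Exponential advantage weights; beta=0 gives uniform weights."""
    advantages = np.asarray(advantages, dtype=np.float64)
    return np.exp(beta * advantages)


# marwil_weights([1.0, -1.0], beta=0.0) -> array([1., 1.])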
class _AgentImportFailed(Agent):
    _agent_name = "AgentImportFailed"
    _default_config = with_common_config({})

    def _setup(self, config):
        # `trace` is the import traceback captured by the enclosing scope
        # that constructs this placeholder class.
        raise ImportError(trace)
DEFAULT_CONFIG = with_common_config({
    # === Twin Delayed DDPG (TD3) and Soft Actor-Critic (SAC) tricks ===
    # TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html
    # twin Q-net
    "twin_q": False,
    # delayed policy update
    "policy_delay": 1,
    # target policy smoothing
    # this also forces the use of gaussian instead of OU noise for exploration
    "smooth_target_policy": False,
    # gaussian stddev of act noise
    "act_noise": 0.1,
    # gaussian stddev of target noise
    "target_noise": 0.2,
    # target noise limit (bound)
    "noise_clip": 0.5,
    # === Model ===
    # Hidden layer sizes of the policy network
    "actor_hiddens": [64, 64],
    # Hidden layers activation of the policy network
    "actor_hidden_activation": "relu",
    # Hidden layer sizes of the critic network
    "critic_hiddens": [64, 64],
    # Hidden layers activation of the critic network
    "critic_hidden_activation": "relu",
    # N-step Q learning
    "n_step": 1,
    # === Exploration ===
    # Max num timesteps for annealing schedules. Exploration is annealed from
    # 1.0 to exploration_final_eps over this number of timesteps scaled by
    # exploration_fraction
    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
    "exploration_fraction": 0.1,
    # Final value of random action probability
    "exploration_final_eps": 0.02,
    # OU-noise scale
    "noise_scale": 0.1,
    # theta
    "exploration_theta": 0.15,
    # sigma
    "exploration_sigma": 0.2,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 0,
    # Update the target by \tau * policy + (1-\tau) * target_policy
    "tau": 0.002,
    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    "buffer_size": 50000,
    # If True prioritized replay buffer will be used.
    "prioritized_replay": True,
    # Alpha parameter for prioritized replay buffer.
    "prioritized_replay_alpha": 0.6,
    # Beta parameter for sampling from prioritized replay buffer.
    "prioritized_replay_beta": 0.4,
    # Epsilon to add to the TD errors when updating priorities.
    "prioritized_replay_eps": 1e-6,
    # Whether to LZ4 compress observations
    "compress_observations": False,
    # === Optimization ===
    # Learning rate for adam optimizer.
    # Instead of using two optimizers, we use two different loss coefficients
    "lr": 1e-3,
    "actor_loss_coeff": 0.1,
    "critic_loss_coeff": 1.0,
    # If True, use huber loss instead of squared loss for critic network
    # Conventionally, no need to clip gradients if using a huber loss
    "use_huber": False,
    # Threshold of a huber loss
    "huber_threshold": 1.0,
    # Weights for L2 regularization
    "l2_reg": 1e-6,
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": None,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1500,
    # Update the replay buffer with this many samples at once. Note that this
    # setting applies per-worker if num_workers > 1.
    "sample_batch_size": 1,
    # Size of a batch sampled from the replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 256,
    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Optimizer class to use.
    "optimizer_class": "SyncReplayOptimizer",
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span
    "min_iter_time_s": 1,
})
# (Reconstructed context: the fragment below is the tail of a custom
# preprocessor example; the `Preprocessor` base class, the `transform` method
# name, and `input_filter` come from that surrounding example, not from this
# excerpt.)
class MyPreprocessorClass(Preprocessor):
    def transform(self, observation):
        return input_filter(observation)


ModelCatalog.register_custom_preprocessor("my_prep", MyPreprocessorClass)

DEFAULT_CONFIG = with_common_config({
    # Number of workers (excluding master)
    "num_workers": 0,
    # Size of rollout batch
    "batch_size": 100,
    # Max global norm for each gradient calculated by worker
    "grad_clip": 40.0,
    # Learning rate
    "lr": 0.0001,
    # Whether to use a GPU for local optimization.
    "gpu": False,
    # Whether to place workers on GPUs
    "use_gpu_for_workers": False,
    # Model and preprocessor options
    "model": {
        "custom_preprocessor": "my_prep",
        # "custom_options": {},  # extra options to pass to your classes
    },
    # Arguments to pass to the env creator
    "env_config": {},
})


class BCOAgent(Agent):
    """Behavioral cloning from observation agent."""
DEFAULT_CONFIG = with_common_config({
    # V-trace params (see vtrace.py).
    "vtrace": True,
    "vtrace_clip_rho_threshold": 1.0,
    "vtrace_clip_pg_rho_threshold": 1.0,
    # System params.
    "sample_batch_size": 50,
    "train_batch_size": 500,
    "min_iter_time_s": 10,
    "num_workers": 2,
    "num_cpus_per_worker": 1,
    "num_gpus_per_worker": 0,
    # number of GPUs the learner should use.
    "num_gpus": 1,
    # set >1 to load data into GPUs in parallel. Increases GPU memory usage
    # proportionally with the number of loaders.
    "num_parallel_data_loaders": 1,
    # level of queuing for sampling.
    "max_sample_requests_in_flight_per_worker": 2,
    # set >0 to enable experience replay. Saved samples will be replayed with
    # a p:1 proportion to new data samples.
    "replay_proportion": 0.0,
    # number of sample batches to store for replay. The number of transitions
    # saved total will be (replay_buffer_num_slots * sample_batch_size).
    "replay_buffer_num_slots": 100,
    # Learning params.
    "grad_clip": 40.0,
    # either "adam" or "rmsprop"
    "opt_type": "adam",
    "lr": 0.0005,
    "lr_schedule": None,
    # rmsprop considered
    "decay": 0.99,
    "momentum": 0.0,
    "epsilon": 0.1,
    # balancing the three losses
    "vf_loss_coeff": 0.5,
    "entropy_coeff": -0.01,
})
DEFAULT_CONFIG = with_common_config({
    # V-trace params (see vtrace.py).
    "vtrace": True,
    "vtrace_clip_rho_threshold": 1.0,
    "vtrace_clip_pg_rho_threshold": 1.0,
    # System params.
    "sample_batch_size": 50,
    "train_batch_size": 500,
    "min_iter_time_s": 10,
    "gpu": True,
    "num_workers": 2,
    "num_cpus_per_worker": 1,
    "num_gpus_per_worker": 0,
    # Learning params.
    "grad_clip": 40.0,
    # either "adam" or "rmsprop"
    "opt_type": "adam",
    "lr": 0.0005,
    "lr_schedule": None,
    # rmsprop considered
    "decay": 0.99,
    "momentum": 0.0,
    "epsilon": 0.1,
    # balancing the three losses
    "vf_loss_coeff": 0.5,
    "entropy_coeff": -0.01,
    # Model and preprocessor options.
    "model": {
        "use_lstm": False,
        "max_seq_len": 20,
        "dim": 84,
    },
})
DEFAULT_CONFIG = with_common_config({
    # === QMix ===
    # Mixing network. Either "qmix", "vdn", or None
    "mixer": "qmix",
    # Size of the mixing network embedding
    "mixing_embed_dim": 32,
    # Whether to use Double_Q learning
    "double_q": True,
    # Optimize over complete episodes by default.
    "batch_mode": "complete_episodes",
    # === Exploration ===
    # Max num timesteps for annealing schedules. Exploration is annealed from
    # 1.0 to exploration_final_eps over this number of timesteps scaled by
    # exploration_fraction
    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
    "exploration_fraction": 0.1,
    # Final value of random action probability
    "exploration_final_eps": 0.02,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 500,
    # === Replay buffer ===
    # Size of the replay buffer in steps.
    "buffer_size": 10000,
    # === Optimization ===
    # Learning rate for adam optimizer
    "lr": 0.0005,
    # RMSProp alpha
    "optim_alpha": 0.99,
    # RMSProp epsilon
    "optim_eps": 0.00001,
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": 10,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1000,
    # Update the replay buffer with this many samples at once. Note that
    # this setting applies per-worker if num_workers > 1.
    "sample_batch_size": 4,
    # Size of a batch sampled from the replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 32,
    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Optimizer class to use.
    "optimizer_class": "SyncBatchReplayOptimizer",
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span
    "min_iter_time_s": 1,
    # === Model ===
    "model": {
        "lstm_cell_size": 64,
        "max_seq_len": 999999,
    },
})
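# The `mixer` key above selects how per-agent Q values are combined into a
# joint value: "vdn" simply sums them, "qmix" uses a monotonic mixing network
# conditioned on the global state, and None trains the agents independently.
# Sketch of the VDN case only (the QMIX hypernetwork is omitted here):
import numpy as np


def vdn_mix(agent_q_values):
    """VDN: joint Q is the sum of the individual agents' chosen-action Qs."""
    return np.sum(agent_q_values, axis=-1)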
from __future__ import division
from __future__ import print_function

from ray.rllib.agents.agent import Agent, with_common_config
from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph
from ray.rllib.optimizers import SyncSamplesOptimizer
from ray.rllib.utils import merge_dicts
from ray.tune.trial import Resources

DEFAULT_CONFIG = with_common_config({
    # No remote workers by default
    "num_workers": 0,
    # Learning rate
    "lr": 0.0004,
    # Override model config
    "model": {
        # Use LSTM model.
        "use_lstm": False,
        # Max seq length for LSTM training.
        "max_seq_len": 20,
    },
})


class PGAgent(Agent):
    """Simple policy gradient agent.

    This is an example agent to show how to implement algorithms in RLlib.
    In most cases, you will probably want to use the PPO agent instead.
    """
DEFAULT_CONFIG = with_common_config({
    # === Twin Delayed DDPG (TD3) and Soft Actor-Critic (SAC) tricks ===
    # TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html
    # twin Q-net
    "twin_q": False,
    # delayed policy update
    "policy_delay": 1,
    # target policy smoothing
    # this also forces the use of gaussian instead of OU noise for exploration
    "smooth_target_policy": False,
    # gaussian stddev of act noise
    "act_noise": 0.1,
    # gaussian stddev of target noise
    "target_noise": 0.2,
    # target noise limit (bound)
    "noise_clip": 0.5,
    # === Evaluation ===
    # Evaluate with epsilon=0 every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    "evaluation_interval": None,
    # Number of episodes to run per evaluation period.
    "evaluation_num_episodes": 10,
    # === Model ===
    # Hidden layer sizes of the policy network
    "actor_hiddens": [64, 64],
    # Hidden layers activation of the policy network
    "actor_hidden_activation": "relu",
    # Hidden layer sizes of the critic network
    "critic_hiddens": [64, 64],
    # Hidden layers activation of the critic network
    "critic_hidden_activation": "relu",
    # N-step Q learning
    "n_step": 1,
    # === Exploration ===
    # Max num timesteps for annealing schedules. Exploration is annealed from
    # 1.0 to exploration_final_eps over this number of timesteps scaled by
    # exploration_fraction
    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
    "exploration_fraction": 0.1,
    # Final value of random action probability
    "exploration_final_eps": 0.02,
    # OU-noise scale
    "noise_scale": 0.1,
    # theta
    "exploration_theta": 0.15,
    # sigma
    "exploration_sigma": 0.2,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 0,
    # Update the target by \tau * policy + (1-\tau) * target_policy
    "tau": 0.002,
    # If True parameter space noise will be used for exploration
    # See https://blog.openai.com/better-exploration-with-parameter-noise/
    "parameter_noise": False,
    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    "buffer_size": 50000,
    # If True prioritized replay buffer will be used.
    "prioritized_replay": True,
    # Alpha parameter for prioritized replay buffer.
    "prioritized_replay_alpha": 0.6,
    # Beta parameter for sampling from prioritized replay buffer.
    "prioritized_replay_beta": 0.4,
    # Epsilon to add to the TD errors when updating priorities.
    "prioritized_replay_eps": 1e-6,
    # Whether to LZ4 compress observations
    "compress_observations": False,
    # === Optimization ===
    # Learning rate for adam optimizer.
    # Instead of using two optimizers, we use two different loss coefficients
    "lr": 1e-3,
    "actor_loss_coeff": 0.1,
    "critic_loss_coeff": 1.0,
    # If True, use huber loss instead of squared loss for critic network
    # Conventionally, no need to clip gradients if using a huber loss
    "use_huber": False,
    # Threshold of a huber loss
    "huber_threshold": 1.0,
    # Weights for L2 regularization
    "l2_reg": 1e-6,
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": None,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1500,
    # Update the replay buffer with this many samples at once. Note that this
    # setting applies per-worker if num_workers > 1.
    "sample_batch_size": 1,
    # Size of a batch sampled from the replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 256,
    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Optimizer class to use.
    "optimizer_class": "SyncReplayOptimizer",
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span
    "min_iter_time_s": 1,
})
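# `tau` above controls the soft (Polyak) update of the target networks:
# target <- tau * policy + (1 - tau) * target. Illustrative sketch on plain
# Python weight lists, not the library's TF update op:
def soft_update(policy_weights, target_weights, tau=0.002):
    return [tau * p + (1.0 - tau) * t
            for p, t in zip(policy_weights, target_weights)]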
DEFAULT_CONFIG = with_common_config({
    # === Model ===
    # Hidden layer sizes of the policy network
    "actor_hiddens": [64, 64],
    # Hidden layers activation of the policy network
    "actor_hidden_activation": "relu",
    # Hidden layer sizes of the critic network
    "critic_hiddens": [64, 64],
    # Hidden layers activation of the critic network
    "critic_hidden_activation": "relu",
    # N-step Q learning
    "n_step": 1,
    # === Exploration ===
    # Max num timesteps for annealing schedules. Exploration is annealed from
    # 1.0 to exploration_final_eps over this number of timesteps scaled by
    # exploration_fraction
    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
    "exploration_fraction": 0.1,
    # Final value of random action probability
    "exploration_final_eps": 0.02,
    # OU-noise scale
    "noise_scale": 0.1,
    # theta
    "exploration_theta": 0.15,
    # sigma
    "exploration_sigma": 0.2,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 0,
    # Update the target by \tau * policy + (1-\tau) * target_policy
    "tau": 0.002,
    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    "buffer_size": 50000,
    # If True prioritized replay buffer will be used.
    "prioritized_replay": True,
    # Alpha parameter for prioritized replay buffer.
    "prioritized_replay_alpha": 0.6,
    # Beta parameter for sampling from prioritized replay buffer.
    "prioritized_replay_beta": 0.4,
    # Epsilon to add to the TD errors when updating priorities.
    "prioritized_replay_eps": 1e-6,
    # Whether to clip rewards to [-1, 1] prior to adding to the replay buffer.
    "clip_rewards": True,
    # Whether to LZ4 compress observations
    "compress_observations": False,
    # === Optimization ===
    # Learning rate for adam optimizer
    "actor_lr": 1e-4,
    "critic_lr": 1e-3,
    # If True, use huber loss instead of squared loss for critic network
    # Conventionally, no need to clip gradients if using a huber loss
    "use_huber": False,
    # Threshold of a huber loss
    "huber_threshold": 1.0,
    # Weights for L2 regularization
    "l2_reg": 1e-6,
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": None,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1500,
    # Update the replay buffer with this many samples at once. Note that this
    # setting applies per-worker if num_workers > 1.
    "sample_batch_size": 1,
    # Size of a batch sampled from the replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 256,
    # === Parallelism ===
    # Whether to use a GPU for local optimization.
    "gpu": False,
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Whether to allocate GPUs for workers (if > 0).
    "num_gpus_per_worker": 0,
    # Whether to allocate CPUs for workers (if > 0).
    "num_cpus_per_worker": 1,
    # Optimizer class to use.
    "optimizer_class": "SyncReplayOptimizer",
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
})
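# When `use_huber` is set above, the critic's TD error goes through a Huber
# loss with threshold `huber_threshold`: quadratic for small errors, linear
# for large ones, which is why the comment notes that gradient clipping is
# usually unnecessary. Illustrative sketch of the standard Huber formula:
import numpy as np


def huber_loss(td_error, threshold=1.0):
    abs_err = np.abs(td_error)
    quadratic = np.minimum(abs_err, threshold)
    linear = abs_err - quadratic
    return 0.5 * quadratic ** 2 + threshold * linear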
DEFAULT_CONFIG = with_common_config({
    # Size of rollout batch
    "sample_batch_size": 10,
    # Use PyTorch as backend - no LSTM support
    "use_pytorch": False,
    # GAE(gamma) parameter
    "lambda": 1.0,
    # Max global norm for each gradient calculated by worker
    "grad_clip": 40.0,
    # Learning rate
    "lr": 0.0001,
    # Value Function Loss coefficient
    "vf_loss_coeff": 0.5,
    # Entropy coefficient
    "entropy_coeff": -0.01,
    # Whether to place workers on GPUs
    "use_gpu_for_workers": False,
    # Whether to emit extra summary stats
    "summarize": False,
    # Workers sample async
    "sample_async": True,
    # Model and preprocessor options
    "model": {
        # Use LSTM model. Requires TF.
        "use_lstm": False,
        # Max seq length for LSTM training.
        "max_seq_len": 20,
        # (Image statespace) - Converts image to grayscale (channels = 1)
        "grayscale": True,
        # (Image statespace) - Zero-mean the pixels (scale to [-1, 1]
        # instead of [0, 1])
        "zero_mean": False,
        # (Image statespace) - Converts image to (dim, dim, C)
        "dim": 80,
        # (Image statespace) - Converts image shape to (C, dim, dim)
        "channel_major": False,
    },
    # Configure TF for single-process operation
    "tf_session_args": {
        "intra_op_parallelism_threads": 1,
        "inter_op_parallelism_threads": 1,
        "gpu_options": {
            "allow_growth": True,
        },
    },
    # Arguments to pass to the rllib optimizer
    "optimizer": {
        # Number of gradients applied for each `train` step
        "grads_per_step": 100,
    },
})
DEFAULT_CONFIG = with_common_config({
    # Size of rollout batch
    "sample_batch_size": 10,
    # Use PyTorch as backend - no LSTM support
    "use_pytorch": False,
    # GAE(gamma) parameter
    "lambda": 1.0,
    # Max global norm for each gradient calculated by worker
    "grad_clip": 40.0,
    # Learning rate
    "lr": 0.0001,
    # Learning rate schedule
    "lr_schedule": None,
    # Value Function Loss coefficient
    "vf_loss_coeff": 0.5,
    # Entropy coefficient
    "entropy_coeff": -0.01,
    # Whether to place workers on GPUs
    "use_gpu_for_workers": False,
    # Min time per iteration
    "min_iter_time_s": 5,
    # Workers sample async. Note that this increases the effective
    # sample_batch_size by up to 5x due to async buffering of batches.
    "sample_async": True,
    # Model and preprocessor options
    "model": {
        # Use LSTM model. Requires TF.
        "use_lstm": False,
        # Max seq length for LSTM training.
        "max_seq_len": 20,
        # (Image statespace) - Converts image to grayscale (channels = 1)
        "grayscale": True,
        # (Image statespace) - Zero-mean the pixels (scale to [-1, 1]
        # instead of [0, 1])
        "zero_mean": False,
        # (Image statespace) - Converts image to (dim, dim, C)
        "dim": 84,
        # (Image statespace) - Converts image shape to (C, dim, dim)
        "channel_major": False,
    },
    # Configure TF for single-process operation
    "tf_session_args": {
        "intra_op_parallelism_threads": 1,
        "inter_op_parallelism_threads": 1,
        "gpu_options": {
            "allow_growth": True,
        },
    },
})
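# The coefficients above weight the three A3C loss terms. With the sign
# convention assumed here, the entropy bonus is added directly to the total
# loss, which is why `entropy_coeff` defaults to a negative value: minimizing
# the total loss then rewards higher policy entropy. Illustrative combination
# only, not the library's loss graph:
def a3c_total_loss(pg_loss, vf_loss, entropy,
                   vf_loss_coeff=0.5, entropy_coeff=-0.01):
    return pg_loss + vf_loss_coeff * vf_loss + entropy_coeff * entropy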
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from ray.rllib.agents.agent import Agent, with_common_config
from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph
from ray.rllib.optimizers import SyncSamplesOptimizer

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # No remote workers by default
    "num_workers": 0,
    # Learning rate
    "lr": 0.0004,
})
# __sphinx_doc_end__
# yapf: enable


class PGAgent(Agent):
    """Simple policy gradient agent.

    This is an example agent to show how to implement algorithms in RLlib.
    In most cases, you will probably want to use the PPO agent instead.
    """

    _agent_name = "PG"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PGPolicyGraph
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from ray.rllib.agents.agent import Agent, with_common_config
from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph
from ray.rllib.optimizers import SyncSamplesOptimizer
from ray.rllib.utils.annotations import override

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # No remote workers by default
    "num_workers": 0,
    # Learning rate
    "lr": 0.0004,
})
# __sphinx_doc_end__
# yapf: enable


class PGAgent(Agent):
    """Simple policy gradient agent.

    This is an example agent to show how to implement algorithms in RLlib.
    In most cases, you will probably want to use the PPO agent instead.
    """

    _agent_name = "PG"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PGPolicyGraph
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from ray.rllib.agents.agent import Agent, with_common_config
from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph
from ray.rllib.optimizers import SyncSamplesOptimizer
from ray.rllib.utils.annotations import override

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # No remote workers by default
    "num_workers": 0,
    # Learning rate
    "lr": 0.0004,
    # Use PyTorch as backend
    "use_pytorch": False,
})
# __sphinx_doc_end__
# yapf: enable


class PGAgent(Agent):
    """Simple policy gradient agent.

    This is an example agent to show how to implement algorithms in RLlib.
    In most cases, you will probably want to use the PPO agent instead.
    """

    _agent_name = "PG"
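# A policy-gradient agent like the one above weights action log-probabilities
# by the discounted return of the trajectory. Minimal sketch of computing
# discounted returns for a reward list; `gamma` here stands in for the
# common-config discount factor and is only an assumed example value:
def discounted_returns(rewards, gamma=0.99):
    returns = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


# discounted_returns([1, 1, 1], gamma=0.5) -> [1.75, 1.5, 1.0]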
DEFAULT_CONFIG = with_common_config({
    # V-trace params (see vtrace.py).
    "vtrace": True,
    "vtrace_clip_rho_threshold": 1.0,
    "vtrace_clip_pg_rho_threshold": 1.0,
    # System params.
    "sample_batch_size": 50,
    "train_batch_size": 500,
    "min_iter_time_s": 10,
    "summarize": False,
    "gpu": True,
    "num_workers": 2,
    "num_cpus_per_worker": 1,
    "num_gpus_per_worker": 0,
    # Learning params.
    "grad_clip": 40.0,
    "lr": 0.0001,
    "vf_loss_coeff": 0.5,
    "entropy_coeff": -0.01,
    # Model and preprocessor options.
    "clip_rewards": True,
    "preprocessor_pref": "deepmind",
    "model": {
        "use_lstm": False,
        "max_seq_len": 20,
        "dim": 80,
    },
})
from ray.rllib.utils.annotations import override

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # Size of rollout batch
    "sample_batch_size": 10,
    # Use PyTorch as backend - no LSTM support
    "use_pytorch": False,
    # GAE(gamma) parameter
    "lambda": 1.0,
    # Max global norm for each gradient calculated by worker
    "grad_clip": 40.0,
    # Learning rate
    "lr": 0.0001,
    # Learning rate schedule
    "lr_schedule": None,
    # Value Function Loss coefficient
    "vf_loss_coeff": 0.5,
    # Entropy coefficient
    "entropy_coeff": 0.01,
    # Min time per iteration
    "min_iter_time_s": 5,
    # Workers sample async. Note that this increases the effective
    # sample_batch_size by up to 5x due to async buffering of batches.
    "sample_async": True,
})
# __sphinx_doc_end__
# yapf: enable
DEFAULT_CONFIG = with_common_config({
    # V-trace params (see vtrace.py).
    "vtrace": True,
    "vtrace_clip_rho_threshold": 1.0,
    "vtrace_clip_pg_rho_threshold": 1.0,
    # System params.
    #
    # == Overview of data flow in IMPALA ==
    # 1. Policy evaluation in parallel across `num_workers` actors produces
    #    batches of size `sample_batch_size * num_envs_per_worker`.
    # 2. If enabled, the replay buffer stores and produces batches of size
    #    `sample_batch_size * num_envs_per_worker`.
    # 3. If enabled, the minibatch ring buffer stores and replays batches of
    #    size `train_batch_size` up to `num_sgd_iter` times per batch.
    # 4. The learner thread executes data parallel SGD across `num_gpus` GPUs
    #    on batches of size `train_batch_size`.
    #
    "sample_batch_size": 50,
    "train_batch_size": 500,
    "min_iter_time_s": 10,
    "num_workers": 2,
    # number of GPUs the learner should use.
    "num_gpus": 1,
    # set >1 to load data into GPUs in parallel. Increases GPU memory usage
    # proportionally with the number of buffers.
    "num_data_loader_buffers": 1,
    # how many train batches should be retained for minibatching. This conf
    # only has an effect if `num_sgd_iter > 1`.
    "minibatch_buffer_size": 1,
    # number of passes to make over each train batch
    "num_sgd_iter": 1,
    # set >0 to enable experience replay. Saved samples will be replayed with
    # a p:1 proportion to new data samples.
    "replay_proportion": 0.0,
    # number of sample batches to store for replay. The number of transitions
    # saved total will be (replay_buffer_num_slots * sample_batch_size).
    "replay_buffer_num_slots": 100,
    # level of queuing for sampling.
    "max_sample_requests_in_flight_per_worker": 2,
    # max number of workers to broadcast one set of weights to
    "broadcast_interval": 1,
    # Learning params.
    "grad_clip": 40.0,
    # either "adam" or "rmsprop"
    "opt_type": "adam",
    "lr": 0.0005,
    "lr_schedule": None,
    # rmsprop considered
    "decay": 0.99,
    "momentum": 0.0,
    "epsilon": 0.1,
    # balancing the three losses
    "vf_loss_coeff": 0.5,
    "entropy_coeff": -0.01,
})
DQN_CONFIG = with_common_config({
    # === Model ===
    # Number of atoms for representing the distribution of return. When
    # this is greater than 1, distributional Q-learning is used.
    # the discrete supports are bounded by v_min and v_max
    "num_atoms": 1,
    "v_min": -10.0,
    "v_max": 10.0,
    # Whether to use noisy network
    "noisy": False,
    # control the initial value of noisy nets
    "sigma0": 0.5,
    # Whether to use dueling dqn
    "dueling": True,
    # Whether to use double dqn
    "double_q": True,
    # Hidden layer sizes of the state and action value networks
    "hiddens": [256],
    # N-step Q learning
    "n_step": 1,
    # === Evaluation ===
    # Evaluate with epsilon=0 every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    #"evaluation_interval": None,
    # Number of episodes to run per evaluation period.
    #"evaluation_num_episodes": 10,
    # === Exploration ===
    # Max num timesteps for annealing schedules. Exploration is annealed from
    # 1.0 to exploration_final_eps over this number of timesteps scaled by
    # exploration_fraction
    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
    "exploration_fraction": 0.1,
    # Final value of random action probability
    "exploration_final_eps": 0.02,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 500,
    # Use softmax for sampling actions.
    #"soft_q": False,
    # Softmax temperature. Q values are divided by this value prior to softmax.
    # Softmax approaches argmax as the temperature drops to zero.
    #"softmax_temp": 1.0,
    # If True parameter space noise will be used for exploration
    # See https://blog.openai.com/better-exploration-with-parameter-noise/
    #"parameter_noise": False,
    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    "buffer_size": 50000,
    # If True prioritized replay buffer will be used.
    #"prioritized_replay": True,
    # Alpha parameter for prioritized replay buffer.
    #"prioritized_replay_alpha": 0.6,
    # Beta parameter for sampling from prioritized replay buffer.
    #"prioritized_replay_beta": 0.4,
    # Fraction of entire training period over which the beta parameter is
    # annealed
    "beta_annealing_fraction": 0.2,
    # Final value of beta
    "final_prioritized_replay_beta": 0.4,
    # Epsilon to add to the TD errors when updating priorities.
    "prioritized_replay_eps": 1e-6,
    # Whether to LZ4 compress observations
    "compress_observations": True,
    # === Optimization ===
    # Learning rate for adam optimizer
    "lr": 5e-4,
    # Adam epsilon hyperparameter
    "adam_epsilon": 1e-8,
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": 40,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1000,
    # Update the replay buffer with this many samples at once. Note that
    # this setting applies per-worker if num_workers > 1.
    "sample_batch_size": 4,
    # Size of a batch sampled from the replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 32,
    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Optimizer class to use.
    "optimizer_class": "SyncReplayOptimizer",
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span
    "min_iter_time_s": 1,
})
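# `prioritized_replay_beta` above is annealed toward
# `final_prioritized_replay_beta` over
# `beta_annealing_fraction * schedule_max_timesteps` steps, analogous to the
# epsilon schedule. Illustrative sketch of that schedule under a linear
# annealing assumption; with the defaults shown (0.4 -> 0.4) it is flat, and
# raising the final value strengthens the importance-sampling correction over
# training:
def per_beta(t, schedule_max_timesteps=100000, beta_annealing_fraction=0.2,
             initial_beta=0.4, final_beta=0.4):
    anneal_steps = beta_annealing_fraction * schedule_max_timesteps
    frac = min(float(t) / anneal_steps, 1.0)
    return initial_beta + frac * (final_beta - initial_beta)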