def __init__(self):
    super().__init__(
        with_common_config({
            # PPO specific keys:
            "use_critic": True,
            "use_gae": True,
            "lambda": 1.0,
            "kl_coeff": 0.2,
            "sgd_minibatch_size": 128,
            "shuffle_sequences": True,
            "num_sgd_iter": 30,
            "lr_schedule": None,
            "vf_loss_coeff": 1.0,
            "entropy_coeff": 0.0,
            "entropy_coeff_schedule": None,
            "clip_param": 0.3,
            "vf_clip_param": 10.0,
            "grad_clip": None,
            "kl_target": 0.01,
            "rollout_fragment_length": 200,
            # TrainerConfig overrides:
            "train_batch_size": 4000,
            "lr": 5e-5,
            "model": {
                "vf_share_layers": False,
            },
            "_disable_execution_plan_api": True,
        }))
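# --- Illustrative sketch (not from the original source): what
# `with_common_config` does with the dict above. It merges the
# algorithm-specific keys over RLlib's COMMON_CONFIG defaults and returns a
# plain config dict. Assumes an RLlib 1.x install where both symbols live in
# `ray.rllib.agents.trainer`.
from ray.rllib.agents.trainer import COMMON_CONFIG, with_common_config

cfg = with_common_config({"lr": 5e-5, "train_batch_size": 4000})
assert cfg["lr"] == 5e-5  # explicitly overridden key wins
assert cfg["num_workers"] == COMMON_CONFIG["num_workers"]  # inherited default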
def get_default_config(cls) -> TrainerConfigDict:
    # Run this Trainer with the new `training_iteration` API and set some
    # PPO-specific parameters.
    return with_common_config({
        "num_sgd_iter": 10,
        "sgd_minibatch_size": 128,
    })
def rand_func():
    wandb.init(config={})
    config = with_common_config(wandb.config)
    config['evaluation_num_episodes'] = 10
    config['num_workers'] = mp.cpu_count() - 1
    config['num_envs_per_worker'] = 4
    config['metrics_smoothing_episodes'] = 2000
    config['observation_filter'] = 'NoFilter'
    config['env'] = default_config['env']
    config['env_config'] = {
        'env_config': {
            'instance_path': config['instance_path']
        }
    }
    config.pop('instance_path', None)
    config['callbacks'] = CustomCallbacks
    ray.init()
    stop = {
        "time_total_s": 600,
    }
    analysis = tune.run(RandomMaskedTrainer, config=config, stop=stop,
                        name="ppo-jss")
    result = analysis.results_df.to_dict('index')
    last_run_id = list(result.keys())[0]
    result = result[last_run_id]
    wandb.log({'time_step_min': result['custom_metrics.time_step_min']})
    if result['custom_metrics.time_step_max'] != float('inf'):
        wandb.log({'time_step_max': result['custom_metrics.time_step_max']})
    wandb.log({'time_step_mean': result['custom_metrics.time_step_mean']})
    wandb.log({'episode_reward_max': result['episode_reward_max']})
    wandb.log({'episode_reward_min': result['episode_reward_min']})
    wandb.log({'episode_reward_mean': result['episode_reward_mean']})
    wandb.log({'episodes_total': result['episodes_total']})
    wandb.log({'training_iteration': result['training_iteration']})
    ray.shutdown()
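# --- Illustrative sketch (not from the original source): a function like
# rand_func is typically driven by a W&B sweep agent, which injects the
# sampled hyperparameters into wandb.config after the empty wandb.init above.
# The sweep path string below is hypothetical.
import wandb

wandb.agent("entity/project/sweep_id", function=rand_func)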
class MinimalTrainer(Trainer):
    _name = "MinimalTrainer"
    _default_config = with_common_config(
        {"workers": False, "optimizer": False, "tracker": False}
    )
    _policy = DummyPolicy

    def _init(self, config, env_creator):
        def make_workers():
            return self._make_workers(
                env_creator, self._policy, config,
                num_workers=config["num_workers"]
            )

        if config["tracker"]:
            self.tracker = StatsTracker(make_workers())
        elif config["optimizer"]:
            self.optimizer = PolicyOptimizer(make_workers())
        elif config["workers"]:
            self.workers = make_workers()

    def _train(self):
        return self._log_metrics({})
DEFAULT_CONFIG = with_common_config({
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter
    "lambda": 1.0,
    # Initial coefficient for KL divergence
    "kl_coeff": 0.0005,
    # Size of batches collected from each worker
    "rollout_fragment_length": 200,
    # Do create an actual env on the local worker (worker-idx=0).
    "create_env_on_driver": True,
    # Stepsize of SGD
    "lr": 1e-3,
    "model": {
        # Share layers for value function.
        "vf_share_layers": False,
    },
    # Coefficient of the value function loss
    "vf_loss_coeff": 0.5,
    # Coefficient of the entropy regularizer
    "entropy_coeff": 0.0,
    # PPO clip parameter
    "clip_param": 0.3,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # If specified, clip the global norm of gradients by this amount
    "grad_clip": None,
    # Target value for KL divergence
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "complete_episodes",
    # Which observation filter to apply to the observation
    "observation_filter": "NoFilter",
    # Number of Inner adaptation steps for the MAML algorithm
    "inner_adaptation_steps": 1,
    # Number of MAML steps per meta-update iteration (PPO steps)
    "maml_optimizer_steps": 5,
    # Inner Adaptation Step size
    "inner_lr": 0.1,
    # Use Meta Env Template
    "use_meta_env": True,

    # Deprecated keys:
    # Share layers for value function. If you set this to True, it's important
    # to tune vf_loss_coeff.
    # Use config.model.vf_share_layers instead.
    "vf_share_layers": DEPRECATED_VALUE,
})
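# --- Illustrative usage sketch (not from the original source): these keys
# match RLlib 1.x's MAML trainer defaults, which are typically consumed via
# Tune. "MAML" is the trainer name registered by RLlib 1.x; the env string
# below is a hypothetical placeholder for a meta-RL env registered via
# tune.register_env.
import ray
from ray import tune

ray.init()
tune.run(
    "MAML",
    config=dict(DEFAULT_CONFIG, **{
        "env": "my_meta_env",  # hypothetical registered meta-RL env
        "num_workers": 2,
    }),
    stop={"training_iteration": 10},
)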
logger = logging.getLogger(__name__)

Result = namedtuple("Result", [
    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
    "eval_returns", "eval_lengths"
])

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    "noise_stdev": 0.02,  # std deviation of parameter noise
    "num_rollouts": 32,  # number of perturbs to try
    "rollouts_used": 32,  # number of perturbs to keep in gradient estimate
    "num_workers": 2,
    "sgd_stepsize": 0.01,  # sgd step-size
    "observation_filter": "MeanStdFilter",
    "noise_size": 250000000,
    "eval_prob": 0.03,  # probability of evaluating the parameter rewards
    "report_length": 10,  # how many of the last rewards we average over
    "offset": 0,
})
# __sphinx_doc_end__
# yapf: enable


@ray.remote
def create_shared_noise(count):
    """Create a large array of noise to be shared by all workers."""
    seed = 123
    noise = np.random.RandomState(seed).randn(count).astype(np.float32)
    return noise
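# --- Illustrative sketch (not from the original source): how workers are
# meant to consume the shared noise block created above. Each perturbation is
# a contiguous slice addressed by (index, dim). This minimal stand-in mirrors
# the shape of RLlib's SharedNoiseTable but is not its exact implementation.
import numpy as np


class SharedNoiseTable:
    def __init__(self, noise):
        self.noise = noise
        self.rng = np.random.RandomState(42)

    def get(self, i, dim):
        # Slice out one `dim`-sized perturbation vector.
        return self.noise[i:i + dim]

    def sample_index(self, dim):
        # Any offset that leaves room for a full slice is valid.
        return self.rng.randint(0, len(self.noise) - dim + 1)


noise = np.random.RandomState(123).randn(10000).astype(np.float32)
table = SharedNoiseTable(noise)
idx = table.sample_index(dim=32)
delta = table.get(idx, dim=32)  # one 32-dim parameter perturbation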
DEFAULT_CONFIG = with_common_config({
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter
    "lambda": 1.0,
    # Initial coefficient for KL divergence
    "kl_coeff": 0.2,
    # Number of timesteps collected for each SGD round
    "timesteps_per_batch": 4000,
    # Number of SGD iterations in each outer loop
    "num_sgd_iter": 30,
    # Stepsize of SGD
    "sgd_stepsize": 5e-5,
    # Total SGD batch size across all devices for SGD
    "sgd_batchsize": 128,
    # Coefficient of the value function loss
    "vf_loss_coeff": 1.0,
    # Coefficient of the entropy regularizer
    "entropy_coeff": 0.0,
    # PPO clip parameter
    "clip_param": 0.3,
    # Target value for KL divergence
    "kl_target": 0.01,
    # Number of GPUs to use for SGD
    "num_gpus": 0,
    # Whether to allocate GPUs for workers (if > 0).
    "num_gpus_per_worker": 0,
    # Whether to allocate CPUs for workers (if > 0).
    "num_cpus_per_worker": 1,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "complete_episodes",
    # Which observation filter to apply to the observation
    "observation_filter": "MeanStdFilter",
})
from ray.rllib.agents.ars import policies
from ray.rllib.agents.es import tabular_logger as tlogger
from ray.rllib.agents.ars import utils

Result = namedtuple("Result", [
    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
    "eval_returns", "eval_lengths"
])

DEFAULT_CONFIG = with_common_config({
    'noise_stdev': 0.02,  # std deviation of parameter noise
    'num_deltas': 4,  # number of perturbations to try
    'deltas_used': 4,  # number of perturbations to keep in gradient estimate
    'num_workers': 2,
    'stepsize': 0.01,  # sgd step-size
    'observation_filter': "MeanStdFilter",
    'noise_size': 250000000,
    'eval_prob': 0.03,  # probability of evaluating the parameter rewards
    'env_config': {},
    'offset': 0,
    'policy_type': "LinearPolicy",  # ["LinearPolicy", "MLPPolicy"]
    "fcnet_hiddens": [32, 32],  # fcnet structure of MLPPolicy
})


@ray.remote
def create_shared_noise(count):
    """Create a large array of noise to be shared by all workers."""
    seed = 123
    noise = np.random.RandomState(seed).randn(count).astype(np.float32)
    return noise
DEFAULT_CONFIG = with_common_config({
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter.
    "lambda": 1.0,
    # Initial coefficient for KL divergence.
    "kl_coeff": 0.0005,
    # Size of batches collected from each worker.
    "rollout_fragment_length": 200,
    # Do create an actual env on the local worker (worker-idx=0).
    "create_env_on_driver": True,
    # Step size of SGD.
    "lr": 1e-3,
    # Coefficient of the value function loss.
    "vf_loss_coeff": 0.5,
    # Coefficient of the entropy regularizer.
    "entropy_coeff": 0.0,
    # PPO clip parameter.
    "clip_param": 0.5,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # If specified, clip the global norm of gradients by this amount.
    "grad_clip": None,
    # Target value for KL divergence.
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes".
    "batch_mode": "complete_episodes",
    # Which observation filter to apply to the observation.
    "observation_filter": "NoFilter",
    # Number of Inner adaptation steps for the MAML algorithm.
    "inner_adaptation_steps": 1,
    # Number of MAML steps per meta-update iteration (PPO steps).
    "maml_optimizer_steps": 8,
    # Inner adaptation step size.
    "inner_lr": 1e-3,
    # Horizon of the environment (200 in MB-MPO paper).
    "horizon": 200,
    # Dynamics ensemble hyperparameters.
    "dynamics_model": {
        "custom_model": DynamicsEnsembleCustomModel,
        # Number of Transition-Dynamics (TD) models in the ensemble.
        "ensemble_size": 5,
        # Hidden layers for each model in the TD-model ensemble.
        "fcnet_hiddens": [512, 512, 512],
        # Model learning rate.
        "lr": 1e-3,
        # Max number of training epochs per MBMPO iter.
        "train_epochs": 500,
        # Model batch size.
        "batch_size": 500,
        # Training/validation split.
        "valid_split_ratio": 0.2,
        # Normalize data (obs, action, and deltas).
        "normalize_data": True,
    },
    # Exploration for MB-MPO is based on StochasticSampling, but uses 8000
    # random timesteps up-front for worker=0.
    "exploration_config": {
        "type": MBMPOExploration,
        "random_timesteps": 8000,
    },
    # Workers sample from dynamics models, not from actual envs.
    "custom_vector_env": model_vector_env,
    # How many iterations through MAML per MBMPO iteration.
    "num_maml_steps": 10,

    # Deprecated keys:
    # Share layers for value function. If you set this to True, it's important
    # to tune vf_loss_coeff.
    # Use config.model.vf_share_layers instead.
    "vf_share_layers": DEPRECATED_VALUE,
})
DEFAULT_CONFIG = with_common_config({
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter
    "lambda": 1.0,
    # Initial coefficient for KL divergence
    "kl_coeff": 0.2,
    # Size of batches collected from each worker
    "sample_batch_size": 200,
    # Number of timesteps collected for each SGD round
    "train_batch_size": 4000,
    # Total SGD batch size across all devices for SGD
    "sgd_minibatch_size": 128,
    # Number of SGD iterations in each outer loop
    "num_sgd_iter": 30,
    # Stepsize of SGD
    "lr": 5e-5,
    # Learning rate schedule
    "lr_schedule": None,
    # Share layers for value function
    "vf_share_layers": False,
    # Coefficient of the value function loss
    "vf_loss_coeff": 1.0,
    # Coefficient of the entropy regularizer
    "entropy_coeff": 0.0,
    # PPO clip parameter
    "clip_param": 0.3,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # Target value for KL divergence
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "truncate_episodes",
    # Which observation filter to apply to the observation
    "observation_filter": "MeanStdFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This does
    # not support minibatches.
    "simple_optimizer": False,
})
import ray
import numpy as np
from abc import ABC  # needed for the ABC base class used below

from ray.rllib import Policy
from ray.rllib.agents import with_common_config
from ray.rllib.agents.trainer import Trainer
from ray.rllib.execution.rollout_ops import synchronous_parallel_sample
from ray.rllib.examples.env.parametric_actions_cartpole import \
    ParametricActionsCartPole
from ray.rllib.models.modelv2 import restore_original_dimensions
from ray.rllib.utils import override
from ray.rllib.utils.typing import ResultDict
from ray.tune.registry import register_env

DEFAULT_CONFIG = with_common_config({
    # Run with new `training_iteration` API.
    "_disable_execution_plan_api": True,
})


class RandomParametricPolicy(Policy, ABC):
    """Just pick a random legal action.

    The outputted state of the environment needs to be a dictionary with an
    'action_mask' key containing the legal actions for the agent.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.exploration = self._create_exploration()

    @override(Policy)
    def compute_actions(self,
DEFAULT_CONFIG = with_common_config({
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # The GAE(lambda) parameter.
    "lambda": 1.0,
    # Initial coefficient for KL divergence.
    "kl_coeff": 0.2,
    # Size of batches collected from each worker.
    "sample_batch_size": 200,
    # Number of timesteps collected for each SGD round. This defines the size
    # of each SGD epoch.
    "train_batch_size": 4000,
    # Total SGD batch size across all devices for SGD. This defines the
    # minibatch size within each epoch.
    "sgd_minibatch_size": 128,
    # Whether to shuffle sequences in the batch when training (recommended).
    "shuffle_sequences": True,
    # Number of SGD iterations in each outer loop (i.e., number of epochs to
    # execute per train batch).
    "num_sgd_iter": 30,
    # Stepsize of SGD.
    "lr": 5e-5,
    # Learning rate schedule.
    "lr_schedule": None,
    # Share layers for value function. If you set this to True, it's important
    # to tune vf_loss_coeff.
    "vf_share_layers": False,
    # Coefficient of the value function loss. IMPORTANT: you must tune this if
    # you set vf_share_layers: True.
    "vf_loss_coeff": 1.0,
    # Coefficient of the entropy regularizer.
    "entropy_coeff": 0.0,
    # Decay schedule for the entropy regularizer.
    "entropy_coeff_schedule": None,
    # PPO clip parameter.
    "clip_param": 0.3,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # If specified, clip the global norm of gradients by this amount.
    "grad_clip": None,
    # Target value for KL divergence.
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes".
    "batch_mode": "truncate_episodes",
    # Which observation filter to apply to the observation.
    "observation_filter": "NoFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This is
    # usually slower, but you might want to try it if you run into issues with
    # the default optimizer.
    "simple_optimizer": False,
    # Use the experimental torch multi-node SGD optimizer.
    "distributed_data_parallel_optimizer": False,
    # Use PyTorch as framework?
    "use_pytorch": False,
})
DEFAULT_CONFIG = with_common_config({
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter
    "lambda": 1.0,
    # Initial coefficient for KL divergence
    "kl_coeff": 0.2,
    # Number of timesteps collected for each SGD round
    "timesteps_per_batch": 4000,
    # Number of SGD iterations in each outer loop
    "num_sgd_iter": 30,
    # Stepsize of SGD
    "sgd_stepsize": 5e-5,
    # Learning rate schedule
    "lr_schedule": None,
    # Share layers for value function
    "vf_share_layers": False,
    # Total SGD batch size across all devices for SGD (multi-gpu only)
    "sgd_batchsize": 128,
    # Coefficient of the value function loss
    "vf_loss_coeff": 1.0,
    # Coefficient of the entropy regularizer
    "entropy_coeff": 0.0,
    # PPO clip parameter
    "clip_param": 0.3,
    # Target value for KL divergence
    "kl_target": 0.01,
    # Number of GPUs to use for SGD
    "num_gpus": 0,
    # Whether to allocate GPUs for workers (if > 0).
    "num_gpus_per_worker": 0,
    # Whether to allocate CPUs for workers (if > 0).
    "num_cpus_per_worker": 1,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "complete_episodes",
    # Which observation filter to apply to the observation
    "observation_filter": "MeanStdFilter",
    # Use the sync samples optimizer instead of the multi-gpu one
    "simple_optimizer": False,
    # Override model config
    "model": {
        # Whether to use LSTM model
        "use_lstm": False,
        # Max seq length for LSTM training.
        "max_seq_len": 20,
    },
})
from ray.rllib.agents import with_common_config
from ray.rllib.utils.deprecation import DEPRECATED_VALUE

DEFAULT_CONFIG = with_common_config({
    "use_critic": True,
    "use_gae": True,
    "lambda": 1.0,
    "kl_coeff": 0.2,
    "rollout_fragment_length": 200,
    "train_batch_size": 4000,
    "sgd_minibatch_size": 128,
    "shuffle_sequences": True,
    "num_sgd_iter": 30,
    "lr": 5e-5,
    "lr_schedule": None,
    "vf_loss_coeff": 1.0,
    "model": {
        "vf_share_layers": False,
    },
    "entropy_coeff": 0.0,
    "entropy_coeff_schedule": None,
    "clip_param": 0.3,
    "vf_clip_param": 10.0,
    "grad_clip": None,
    "kl_target": 0.01,
    "batch_mode": "truncate_episodes",
    "observation_filter": "NoFilter",
    "vf_share_layers": DEPRECATED_VALUE,
})
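# --- Illustrative usage sketch (not from the original source): consuming the
# PPO DEFAULT_CONFIG above with RLlib 1.x's PPOTrainer on a Gym env. The env
# name and override values are arbitrary examples.
import ray
from ray.rllib.agents.ppo import PPOTrainer

ray.init()
config = dict(DEFAULT_CONFIG, **{"env": "CartPole-v0", "num_workers": 1})
trainer = PPOTrainer(config=config)
for _ in range(3):
    result = trainer.train()
    print(result["episode_reward_mean"])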
from abc import ABC  # needed for the ABC base class used below

from ray.rllib import Policy
from ray.rllib.agents import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.evaluation.worker_set import WorkerSet
from ray.rllib.execution.metric_ops import StandardMetricsReporting
from ray.rllib.execution.rollout_ops import ParallelRollouts, SelectExperiences
from ray.rllib.examples.env.parametric_actions_cartpole import \
    ParametricActionsCartPole
from ray.rllib.models.modelv2 import restore_original_dimensions
from ray.rllib.utils import override
from ray.rllib.utils.typing import TrainerConfigDict
from ray.util.iter import LocalIterator
from ray.tune.registry import register_env

DEFAULT_CONFIG = with_common_config({})


class RandomParametriclPolicy(Policy, ABC):
    """Just pick a random legal action.

    The outputted state of the environment needs to be a dictionary with an
    'action_mask' key containing the legal actions for the agent.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.exploration = self._create_exploration()

    @override(Policy)
    def compute_actions(self,
                        obs_batch,
from pathlib import Path

import numpy as np
import ray
import yaml

from ray.rllib.agents import with_common_config

from algorithms.trainer_ea import EATrainer
from utils.chromosome import VBNChromosome

# safe_load avoids the deprecated bare yaml.load() call, which requires an
# explicit Loader in modern PyYAML.
DEFAULT_CONFIG = with_common_config(yaml.safe_load(
    Path('configs/config_ga_default.yaml').read_text()
))


class GATrainer(EATrainer):
    _name = "GA"
    _default_config = DEFAULT_CONFIG

    def _init(self, config, env_creator):
        """Trainer class for the Coevolutionary Genetic Algorithm.

        This class distributes the mutation and evaluation workload over a
        number of workers and updates and maintains the population."""
        super(GATrainer, self)._init(config, env_creator)

        self.elites = [
            VBNChromosome(number_actions=self.config['number_actions'],
                          input_channels=self.config['input_channels'])
            for _ in range(config['number_elites'])
        ]
        samples = self.collect_samples()
        for chrom in self.elites:
logger = logging.getLogger(__name__)

Result = namedtuple("Result", [
    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
    "eval_returns", "eval_lengths"
])

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    "l2_coeff": 0.005,
    "noise_stdev": 0.02,
    "episodes_per_batch": 1000,
    "train_batch_size": 10000,
    "eval_prob": 0.003,
    "return_proc_mode": "centered_rank",
    "num_workers": 10,
    "stepsize": 0.01,
    "observation_filter": "MeanStdFilter",
    "noise_size": 250000000,
    "report_length": 10,
})
# __sphinx_doc_end__
# yapf: enable


@ray.remote
def create_shared_noise(count):
    """Create a large array of noise to be shared by all workers."""
    seed = 123
    noise = np.random.RandomState(seed).randn(count).astype(np.float32)
"eval_lengths", "novelty", "policy_weights" ]) # yapf: disable # __sphinx_doc_begin__ DEFAULT_CONFIG = with_common_config({ "l2_coeff": 0.005, "noise_stdev": 0.02, "noise_stdevGA": 0.02, "episodes_per_batch": 7, "train_batch_size": 1000, "eval_prob": 0.003, "return_proc_mode": "centered_rank", "num_workers": 7,#10 "stepsize": 0.01, "observation_filter": "MeanStdFilter", "noise_size": 250000000, "report_length": 10, "pop_size": 1, "population_size":20, "bf_sz":2000, "k":10, "epsilon":0.5, "neinum":10, "eta_c":20 }) # __sphinx_doc_end__ # yapf: enable @ray.remote
from ray.rllib.utils.annotations import override
from ray.rllib.utils import FilterManager

logger = logging.getLogger(__name__)

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # 'theta_decay': 0.001,
    # 'alpha': .2,
    # 'noise_stdev': 0.02,
    # 'candidates_per_iteration': 144,
    'timestep_limit': None,
    'num_evals_per_iteration': 1,
    # 'return_proc_mode': 'centered_rank',
    'num_workers': 4,
    'request_interleaving': 2,
    # 'stepsize': 0.01,
    # 'observation_filter': 'MeanStdFilter',
    'noise_size': 33554432,
    'random_seed': None,
    # 'report_length': 10,
    "action_noise_std": 0.0,
})
# __sphinx_doc_end__
# yapf: enable


class Common:
DEFAULT_CONFIG = with_common_config({
    # Should use a critic as a baseline (otherwise don't use value baseline;
    # required for using GAE).
    "use_critic": True,
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # The GAE (lambda) parameter.
    "lambda": 1.0,
    # Initial coefficient for KL divergence.
    "kl_coeff": 0.2,
    # Size of batches collected from each worker.
    "rollout_fragment_length": 200,
    # Number of timesteps collected for each SGD round. This defines the size
    # of each SGD epoch.
    "train_batch_size": 4000,
    # Total SGD batch size across all devices for SGD. This defines the
    # minibatch size within each epoch.
    "sgd_minibatch_size": 128,
    # Whether to shuffle sequences in the batch when training (recommended).
    "shuffle_sequences": True,
    # Number of SGD iterations in each outer loop (i.e., number of epochs to
    # execute per train batch).
    "num_sgd_iter": 30,
    # Stepsize of SGD.
    "lr": 5e-5,
    # Learning rate schedule.
    "lr_schedule": None,
    # Coefficient of the value function loss. IMPORTANT: you must tune this if
    # you set vf_share_layers=True inside your model's config.
    "vf_loss_coeff": 1.0,
    "model": {
        # Share layers for value function. If you set this to True, it's
        # important to tune vf_loss_coeff.
        "vf_share_layers": False,
    },
    # Coefficient of the entropy regularizer.
    "entropy_coeff": 0.0,
    # Decay schedule for the entropy regularizer.
    "entropy_coeff_schedule": None,
    # PPO clip parameter.
    "clip_param": 0.3,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # If specified, clip the global norm of gradients by this amount.
    "grad_clip": None,
    # Target value for KL divergence.
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes".
    "batch_mode": "truncate_episodes",
    # Which observation filter to apply to the observation.
    "observation_filter": "NoFilter",

    # Deprecated keys:
    # Share layers for value function. If you set this to True, it's important
    # to tune vf_loss_coeff.
    # Use config.model.vf_share_layers instead.
    "vf_share_layers": DEPRECATED_VALUE,
})
GENERAL_CONFIGS = with_common_config({
    # Buffers
    "replay_buffer_size": int(2e4),
    "reservoir_buffer_size": int(1e5),
    # Train specs
    "replay_train_batch_size": 128,
    "reservoir_train_batch_size": 128,
    "replay_train_every": 10,
    "reservoir_train_every": 66,
    "reservoir_train_every_sims": 300,
    "replay_min_size_to_learn": 10,
    "reservoir_min_size_to_learn": 1000,
    "reservoir_min_size_to_learn_sims": 1000,
    "replay_num_episodes": 10,
    "replay_min_episodes_to_learn": 100,
    # Models
    "model": {
        "lstm_cell_size": 128,
        "max_seq_len": 999999,
        "fcnet_activation": 'relu',
        "fcnet_hiddens": [128, 128, 128]
    },
    "sig_model": {
        "fcnet_hiddens": [64],
        "fcnet_activation": 'relu',
        "max_seq_len": 20
    },
    # Generals
    "framework": "torch",
    "use_exec_api": True,
    # keys in obs space to be used at inference time
    "test_obs_keys": ["obs"],
    # keys in obs space to be used at training time
    "train_obs_keys": ["obs"],
    # Train statistics flag
    "log_stats": False,
    "stats_fn": None,
    # Wandb integration configs
    "logger_config": {
        "wandb": {
            "project": None,
            "api_key_file": "/home/coordination/wandb_api",
            "log_config": True
        }
    },
    # Param to support cases in which training environment is different than
    # execution environment (e.g. SIMS)
    "train_obs_space": None,
    # Param for hyperparameter tuning
    "model_struc": None,  # TODO (fede): remove once tuning is done
    # Debugging purposes, log probabilities for specific actions when training
    "relevant_obs": None,
})
from ray.rllib.agents import Trainer, with_common_config
from ray.rllib.optimizers import SyncSamplesOptimizer

from agents.ppo.ppo_torch_policy import PPOTorchPolicy

DEFAULT_CONFIG = with_common_config({
    'alpha': 0.1,
    'clip_ratio': 0.2,
    'gamma': 0.99,
    'lambda': 0.97,
    'lr_pi': 3e-4,
    'lr_vf': 1e-3,
    'max_episode_len': 1000,
    'model_hidden_sizes': (256, 128, 64),
    'policy': 'default',
    'num_workers': 4,
    'num_sgd_iter': 80,
    'num_skills': 10,
    'rollout_fragment_length': 200,
    'seed': 123,
    'sgd_minibatch_size': 128,
    'skill_input': None,
    'target_kl': 0.01,
    'train_batch_size': 4000,
    'use_diayn': True,
    'use_env_rewards': True,
    'use_gae': True,
})

policy_options = {
    'default': PPOTorchPolicy,
    'neuroblast': PPONeuroblastPolicy,
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
numpy.random.seed(random_seed)
torch.manual_seed(random_seed)

CHECKPOINT_DIR = f"./rllib_temp_{gpu_id}"
CHECKPOINT_FILE = f"last_checkpoint_{gpu_id}.out"

config = with_common_config({
    'gamma': 0.99,
    'lr': 1e-5,
    'num_workers': 4,
    # 'framework': 'torch',
    'num_gpus': 1,
    'sgd_minibatch_size': 256,
    'num_sgd_iter': (2515 * 8) * 2**0 // 256,
    'train_batch_size': 2515 * 8,
    'entropy_coeff': 0.02,
    'vf_loss_coeff': 0.01,
    'model': {
        'fcnet_hiddens': [256, 256, 256]
    }
})


def train_it():
    ray.init()
    # Configure RLlib with The Roadwork Environment
    agent = ppo.PPOTrainer(env=StockEnvDOW30, config=config)
    # agent = ppo.PPOTrainer(env=StockEnvDOW30)
DEFAULT_CONFIG = with_common_config({
    # PlaNET Model LR
    "td_model_lr": 6e-4,
    # Actor LR
    "actor_lr": 8e-5,
    # Critic LR
    "critic_lr": 8e-5,
    # Grad Clipping
    "grad_clip": 100.0,
    # Discount
    "discount": 0.99,
    # Lambda
    "lambda": 0.95,
    # Training iterations per data collection from real env
    "dreamer_train_iters": 100,
    # Horizon for Environment (1000 for Mujoco/DMC)
    "horizon": 1000,
    # Number of episodes to sample for Loss Calculation
    "batch_size": 50,
    # Length of each episode to sample for Loss Calculation
    "batch_length": 50,
    # Imagination Horizon for Training Actor and Critic
    "imagine_horizon": 15,
    # Free Nats
    "free_nats": 3.0,
    # KL Coeff for the Model Loss
    "kl_coeff": 1.0,
    # Distributed Dreamer not implemented yet
    "num_workers": 0,
    # Prefill Timesteps
    "prefill_timesteps": 5000,
    # This should be kept at 1 to preserve sample efficiency
    "num_envs_per_worker": 1,
    # Exploration Gaussian
    "explore_noise": 0.3,
    # Batch mode
    "batch_mode": "complete_episodes",
    # Custom Model
    "dreamer_model": {
        "custom_model": DreamerModel,
        # RSSM/PlaNET parameters
        "deter_size": 200,
        "stoch_size": 30,
        # CNN Decoder Encoder
        "depth_size": 32,
        # General Network Parameters
        "hidden_size": 400,
        # Action STD
        "action_init_std": 5.0,
    },
    "env_config": {
        # Repeats the action sent by the policy for frame_skip steps in env
        "frame_skip": 2,
    },
})
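# --- Illustrative usage sketch (not from the original source): these keys
# match RLlib 1.x's Dreamer defaults. "Dreamer" is assumed to be the
# registered trainer string in RLlib 1.x; the env name below is a
# hypothetical placeholder for a pixel-based continuous-control env
# registered via tune.register_env.
import ray
from ray import tune

ray.init()
tune.run(
    "Dreamer",
    config=dict(DEFAULT_CONFIG, **{
        "env": "my_pixel_env",  # hypothetical registered env
    }),
    stop={"timesteps_total": 100000},
)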
import time

from ray.rllib.agents import Agent, with_common_config
from ray.rllib.evaluation.tf_policy_graph import LearningRateSchedule
from ray.rllib.optimizers import AsyncGradientsOptimizer
from ray.rllib.utils.annotations import override

DEFAULT_CONFIG = with_common_config(dict())


class A3CPolicyGraph:
    pass  # stub; the actual policy graph is not implemented yet


class A3CAgent(Agent):
    _agent_name = "A3C-Per"
    _policy_graph = None  # TODO

    @override(Agent)
    def _init(self, config, env_creator):
        policy_cls = self._policy_graph
        self.local_evaluator = self.make_local_evaluator(
            env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(
            env_creator, policy_cls, config["num_workers"])
        self.optimizer = self._make_optimizer()

    @override(Agent)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
DEFAULT_CONFIG = with_common_config({
    # Size of batches collected from each worker
    "rollout_fragment_length": 200,
    # Number of timesteps collected for each SGD round
    "train_batch_size": 4000,
    # Total SGD batch size across all devices for SGD
    "sgd_minibatch_size": 128,
    # Whether to shuffle sequences in the batch when training (recommended)
    "shuffle_sequences": True,
    # Number of SGD iterations in each outer loop
    "num_sgd_iter": 30,
    # In case a buffer optimizer is used
    "learning_starts": 1000,
    # Size of the replay buffer in batches (not timesteps!).
    "buffer_size": 1000,
    # Stepsize of SGD
    "lr": 5e-5,
    # Learning rate schedule
    "lr_schedule": None,
    # Share layers for value function. If you set this to True, it's important
    # to tune vf_loss_coeff.
    "vf_share_layers": False,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "complete_episodes",
    # Which observation filter to apply to the observation
    "observation_filter": "NoFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This does
    # not support minibatches.
    "simple_optimizer": True,

    # === MCTS ===
    "mcts_config": {
        "puct_coefficient": 1.0,
        "num_simulations": 30,
        "temperature": 1.5,
        "dirichlet_epsilon": 0.25,
        "dirichlet_noise": 0.03,
        "argmax_tree_policy": False,
        "add_dirichlet_noise": True,
    },

    # === Ranked Rewards ===
    # Implement the ranked reward (r2) algorithm
    # from: https://arxiv.org/pdf/1807.01672.pdf
    "ranked_rewards": {
        "enable": True,
        "percentile": 75,
        "buffer_max_length": 1000,
        # Add rewards obtained from a random policy to
        # "warm start" the buffer.
        "initialize_buffer": True,
        "num_init_rewards": 100,
    },

    # === Evaluation ===
    # Extra configuration that disables exploration.
    "evaluation_config": {
        "mcts_config": {
            "argmax_tree_policy": True,
            "add_dirichlet_noise": False,
        },
    },

    # === Callbacks ===
    "callbacks": AlphaZeroDefaultCallbacks,

    "framework": "torch",  # Only PyTorch supported so far.
})
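# --- Illustrative sketch (not from the original source): the nested
# `evaluation_config` above already shows the intended pattern (greedy,
# noise-free MCTS at evaluation time). Per-experiment overrides follow the
# same nested shape; the numbers below are arbitrary examples.
import copy

config = copy.deepcopy(DEFAULT_CONFIG)
config["mcts_config"]["num_simulations"] = 100  # deeper search while training
config["evaluation_config"]["mcts_config"]["argmax_tree_policy"] = True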
"eval_returns", "eval_lengths" ]) # yapf: disable # __sphinx_doc_begin__ DEFAULT_CONFIG = with_common_config({ "action_noise_std": 0.0, "noise_stdev": 0.02, # std deviation of parameter noise "num_rollouts": 32, # number of perturbs to try "rollouts_used": 32, # number of perturbs to keep in gradient estimate "num_workers": 2, "sgd_stepsize": 0.01, # sgd step-size "observation_filter": "MeanStdFilter", "noise_size": 250000000, "eval_prob": 0.03, # probability of evaluating the parameter rewards "report_length": 10, # how many of the last rewards we average over "offset": 0, # ARS will use Trainer's evaluation WorkerSet (if evaluation_interval > 0). # Therefore, we must be careful not to use more than 1 env per eval worker # (would break ARSPolicy's compute_single_action method) and to not do # obs-filtering. "evaluation_config": { "num_envs_per_worker": 1, "observation_filter": "NoFilter" }, }) # __sphinx_doc_end__ # yapf: enable @ray.remote
logger = logging.getLogger(__name__)

Result = namedtuple("Result", [
    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
    "eval_returns", "eval_lengths"
])

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    "action_noise_std": 0.01,
    "l2_coeff": 0.005,
    "noise_stdev": 0.02,
    "episodes_per_batch": 1000,
    "train_batch_size": 10000,
    "eval_prob": 0.003,
    "return_proc_mode": "centered_rank",
    "num_workers": 10,
    "stepsize": 0.01,
    "observation_filter": "MeanStdFilter",
    "noise_size": 250000000,
    "report_length": 10,
})
# __sphinx_doc_end__
# yapf: enable


@ray.remote
def create_shared_noise(count):
    """Create a large array of noise to be shared by all workers."""
    seed = 123
    noise = np.random.RandomState(seed).randn(count).astype(np.float32)
DEFAULT_CONFIG = with_common_config({
    # Should use a critic as a baseline (otherwise don't use value baseline;
    # required for using GAE).
    "use_critic": True,
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # The GAE (lambda) parameter.
    "lambda": 1.0,
    # Initial coefficient for KL divergence.
    "kl_coeff": 0.2,
    # Size of batches collected from each worker.
    "rollout_fragment_length": 200,
    # Number of timesteps collected for each SGD round. This defines the size
    # of each SGD epoch.
    "train_batch_size": 4000,
    # Total SGD batch size across all devices for SGD. This defines the
    # minibatch size within each epoch.
    "sgd_minibatch_size": 128,
    # Whether to shuffle sequences in the batch when training (recommended).
    "shuffle_sequences": True,
    # Number of SGD iterations in each outer loop (i.e., number of epochs to
    # execute per train batch).
    "ppo_epochs": 8,
    # Use dynamics model or not
    "use_dynamics": True,
    # Dynamics epochs
    "dynamics_epochs": 2,
    # Stepsize of SGD.
    "lr": 3e-4,
    # Learning rate schedule.
    "lr_schedule": None,
    # Skill dynamics learning rate.
    "dynamics_lr": 3e-4,
    # dynamics orth reg
    "dynamics_orth_reg": True,
    # dynamics l2 reg
    "dynamics_l2_reg": False,
    # dynamics spectral norm
    "dynamics_spectral_norm": False,
    # dynamics apply reg to hidden or not
    "dynamics_reg_hiddens": False,
    # dads reward scale
    "dads_reward_scale": 1.0,
    # Coefficient of the value function loss. IMPORTANT: you must tune this if
    # you set vf_share_layers=True inside your model's config.
    "vf_loss_coeff": 1.0,
    "model": {
        # Share layers for value function. If you set this to True, it's
        # important to tune vf_loss_coeff.
        "vf_share_layers": True,
    },
    # Coefficient of the entropy regularizer.
    "entropy_coeff": 0.0,
    # Decay schedule for the entropy regularizer.
    "entropy_coeff_schedule": None,
    # PPO clip parameter.
    "clip_param": 0.3,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # If specified, clip the global norm of gradients by this amount.
    "grad_clip": None,
    # Target value for KL divergence.
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes".
    "batch_mode": "truncate_episodes",
    # Which observation filter to apply to the observation.
    "observation_filter": "NoFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This is
    # usually slower, but you might want to try it if you run into issues with
    # the default optimizer.
    "simple_optimizer": False,
    # Whether to fake GPUs (using CPUs).
    # Set this to True for debugging on non-GPU machines (set `num_gpus` > 0).
    "_fake_gpus": False,

    # Deprecated keys:
    # Share layers for value function. If you set this to True, it's important
    # to tune vf_loss_coeff.
    # Use config.model.vf_share_layers instead.
    "vf_share_layers": DEPRECATED_VALUE,
})
import logging

from ray.rllib.agents import with_common_config
from ray.rllib.agents.trainer_template import build_trainer

from dqn.dqn_policy import DQNPolicy
from dqn.prey_policy import PreyPolicy

logger = logging.getLogger(__name__)

DEFAULT_CONFIG = with_common_config({
    # Agent parameters.
    "lr": 0.001,
    "gamma": 0.9,
    "eps_start": 1,
    "eps_end": 0.05,
    "eps_decay": 0.9995,
    "replay_memory_size": 10000,
    "target_update_frequency": 10,
    "dqn_model": {
        "custom_model": "DQNModel",
        # Extra options to pass to your model (e.g. network of model).
        "custom_model_config": {},
    },
})

# Custom trainer.
DQNTrainer = build_trainer(name="DQNAlgorithm",
                           default_policy=DQNPolicy,
                           default_config=DEFAULT_CONFIG)
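# --- Illustrative usage sketch (not from the original source): running the
# custom DQNTrainer built above through Tune. The env name is hypothetical
# and would first need to be registered via tune.register_env.
import ray
from ray import tune

ray.init()
tune.run(
    DQNTrainer,
    config=dict(DEFAULT_CONFIG, **{"env": "predator_prey_env"}),
    stop={"training_iteration": 5},
)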
"eval_returns", "eval_lengths" ]) # yapf: disable # __sphinx_doc_begin__ DEFAULT_CONFIG = with_common_config({ "action_noise_std": 0.01, "l2_coeff": 0.005, "noise_stdev": 0.02, "episodes_per_batch": 1000, "train_batch_size": 10000, "eval_prob": 0.003, "return_proc_mode": "centered_rank", "num_workers": 10, "stepsize": 0.01, "observation_filter": "MeanStdFilter", "noise_size": 250000000, "report_length": 10, # ARS will use Trainer's evaluation WorkerSet (if evaluation_interval > 0). # Therefore, we must be careful not to use more than 1 env per eval worker # (would break ESPolicy's compute_single_action method) and to not do # obs-filtering. "evaluation_config": { "num_envs_per_worker": 1, "observation_filter": "NoFilter" }, }) # __sphinx_doc_end__ # yapf: enable @ray.remote
DEFAULT_CONFIG = with_common_config({
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter
    "lambda": 1.0,
    # Initial coefficient for KL divergence
    "kl_coeff": 0.2,
    # Size of batches collected from each worker
    "sample_batch_size": 200,
    # Number of timesteps collected for each SGD round
    "train_batch_size": 4000,
    # Total SGD batch size across all devices for SGD
    "sgd_minibatch_size": 128,
    # Number of SGD iterations in each outer loop
    "num_sgd_iter": 30,
    # Stepsize of SGD
    "lr": 5e-5,
    # Learning rate schedule
    "lr_schedule": None,
    # Share layers for value function
    "vf_share_layers": False,
    # Coefficient of the value function loss
    "vf_loss_coeff": 1.0,
    # Coefficient of the entropy regularizer
    "entropy_coeff": 0.0,
    # PPO clip parameter
    "clip_param": 0.3,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # If specified, clip the global norm of gradients by this amount
    "grad_clip": None,
    # Target value for KL divergence
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "truncate_episodes",
    # Which observation filter to apply to the observation
    "observation_filter": "NoFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This does
    # not support minibatches.
    "simple_optimizer": False,
    # (Deprecated) Use the sampling behavior as of 0.6, which launches extra
    # sampling tasks for performance but can waste a large portion of samples.
    "straggler_mitigation": False,
})