        DEPRECATED_VALUE:
    deprecation_warning("exploration_final_scale",
                        "exploration_config.final_scale")
    if isinstance(config["exploration_config"], dict):
        config["exploration_config"]["final_scale"] = \
            config.pop("exploration_final_scale")

if config.get("exploration_fraction", DEPRECATED_VALUE) != \
        DEPRECATED_VALUE:
    assert schedule_max_timesteps is not None
    deprecation_warning("exploration_fraction",
                        "exploration_config.scale_timesteps")
    if isinstance(config["exploration_config"], dict):
        config["exploration_config"]["scale_timesteps"] = config.pop(
            "exploration_fraction") * schedule_max_timesteps

if config.get("per_worker_exploration", DEPRECATED_VALUE) != \
        DEPRECATED_VALUE:
    deprecation_warning(
        "per_worker_exploration",
        "exploration_config.type=PerWorkerOrnsteinUhlenbeckNoise")
    if isinstance(config["exploration_config"], dict):
        config["exploration_config"]["type"] = \
            PerWorkerOrnsteinUhlenbeckNoise


DDPGTrainer = GenericOffPolicyTrainer.with_updates(
    name="DDPG",
    default_config=DEFAULT_CONFIG,
    default_policy=DDPGTFPolicy,
    validate_config=validate_config,
)
def execution_plan(workers, config):
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    replay_buffer = SimpleReplayBuffer(config["buffer_size"])

    store_op = rollouts \
        .for_each(StoreToReplayBuffer(local_buffer=replay_buffer))

    train_op = Replay(local_buffer=replay_buffer) \
        .combine(
            ConcatBatches(min_batch_size=config["train_batch_size"])) \
        .for_each(TrainOneStep(workers)) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    merged_op = Concurrently(
        [store_op, train_op], mode="round_robin", output_indexes=[1])

    return StandardMetricsReporting(merged_op, workers, config)


QMixTrainer = GenericOffPolicyTrainer.with_updates(
    name="QMIX",
    default_config=DEFAULT_CONFIG,
    default_policy=QMixTorchPolicy,
    get_policy_class=None,
    validate_config=validate_config,
    execution_plan=execution_plan)
"critic_tau": 0.01, "encoder_tau": 0.05, "learning_starts": 1000, "train_batch_size": 32, "gamma": 0.99, "initial_alpha": 0.1, # customs "embed_dim": 128, "encoder_type": "pixel", "num_layers": 4, "num_filters": 32, "cropped_image_size": 54, } SAC_CONFIG = DEFAULT_CONFIG.copy() SAC_CONFIG.update(new_config) CurlSACTrainer = GenericOffPolicyTrainer.with_updates( name="CurlSAC", default_config=SAC_CONFIG, validate_config=validate_config, default_policy=CurlSACTorchPolicy, get_policy_class=get_sac_policy_class )
"redo_invalid_games": False, "wandb": {}, "ed": None, "policy_catalog": None, "eq_iters": None, "adaptive_pval_test": False, "br_thres": None, "eq_thres": None, "br_eval_against_policy": None, "thres_is_pval": None, "adaptive_pval": None } PG_CUSTOM_EVAL_TRAINER_DEFAULT_CONFIG = with_base_config( base_config=DEFAULT_CONFIG, extra_config=ppo_custom_eval_trainer_added_config_items) mixins = [CustomEvaluationsTrainerMixin, WeightsUtilsTrainerMixin] SACTrainer = GenericOffPolicyTrainer.with_updates( name="SACDiscrete", default_config=PG_CUSTOM_EVAL_TRAINER_DEFAULT_CONFIG, default_policy=SACTFPolicy, validate_config=validate_config, before_init=pg_custom_eval_trainer_before_init, after_init=pg_custom_eval_trainer_after_init, after_optimizer_step=after_optimizer_step, collect_metrics_fn=collect_metrics, make_policy_optimizer=make_optimizer, mixins=mixins)
"num_workers": 32, "buffer_size": 2000000, "learning_starts": 50000, "train_batch_size": 512, "rollout_fragment_length": 50, "target_network_update_freq": 500000, "timesteps_per_iteration": 1000, "exploration_config": { "type": "PerWorkerEpsilonGreedy" }, "worker_side_prioritization": True, "min_iter_time_s": 30, "training_intensity": None, "prioritized_replay": True, "prioritized_replay_alpha": 0.6, "prioritized_replay_beta": 0.4, "final_prioritized_replay_beta": 0.4, "prioritized_replay_beta_annealing_timesteps": 20000, "prioritized_replay_eps": 1e-6, }, ) QMixTrainer = GenericOffPolicyTrainer.with_updates( name="QMIXApex", default_config=QMIX_APEX_DEFAULT_CONFIG, default_policy=QMixTorchPolicy, get_policy_class=None, execution_plan=apex_execution_plan) register_trainable("QMIXApex", QMixTrainer)
def setup_ddpg_exploration(trainer):
    trainer.exploration0 = make_exploration_schedule(trainer.config, -1)
    trainer.explorations = [
        make_exploration_schedule(trainer.config, i)
        for i in range(trainer.config["num_workers"])
    ]


def add_pure_exploration_phase(trainer):
    global_timestep = trainer.optimizer.num_steps_sampled
    pure_expl_steps = trainer.config["pure_exploration_steps"]
    if pure_expl_steps:
        # Tell workers whether they should do pure exploration.
        only_explore = global_timestep < pure_expl_steps
        trainer.workers.local_worker().foreach_trainable_policy(
            lambda p, _: p.set_pure_exploration_phase(only_explore))
        for e in trainer.workers.remote_workers():
            e.foreach_trainable_policy.remote(
                lambda p, _: p.set_pure_exploration_phase(only_explore))
    update_worker_explorations(trainer)


DDPGTrainer = GenericOffPolicyTrainer.with_updates(
    name="DDPG",
    default_config=DEFAULT_CONFIG,
    default_policy=DDPGTFPolicy,
    before_init=setup_ddpg_exploration,
    before_train_step=add_pure_exploration_phase)
    else:
        return SACTFPolicy


def validate_config(config):
    if config.get("grad_norm_clipping", DEPRECATED_VALUE) != DEPRECATED_VALUE:
        deprecation_warning("grad_norm_clipping", "grad_clip")
        config["grad_clip"] = config.pop("grad_norm_clipping")

    # Use same keys as for standard Trainer "model" config.
    for model in ["Q_model", "policy_model"]:
        if config[model].get("hidden_activation", DEPRECATED_VALUE) != \
                DEPRECATED_VALUE:
            deprecation_warning(
                "{}.hidden_activation".format(model),
                "{}.fcnet_activation".format(model),
                error=True)
        if config[model].get("hidden_layer_sizes", DEPRECATED_VALUE) != \
                DEPRECATED_VALUE:
            deprecation_warning(
                "{}.hidden_layer_sizes".format(model),
                "{}.fcnet_hiddens".format(model),
                error=True)


DreamerTrainer = GenericOffPolicyTrainer.with_updates(
    name="Dreamer",
    default_config=DEFAULT_CONFIG,
    validate_config=validate_config,
    default_policy=DreamerTorchPolicy,
    get_policy_class=get_policy_class,
)
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 0,

    # === Parallelism ===
    # Whether to use a GPU for local optimization.
    "num_gpus": 0,
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Whether to allocate GPUs for workers (if > 0).
    "num_gpus_per_worker": 0,
    # Whether to allocate CPUs for workers (if > 0).
    "num_cpus_per_worker": 1,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span.
    "min_iter_time_s": 1,

    # TODO(ekl) these are unused; remove them from sac config
    "per_worker_exploration": False,
    "exploration_fraction": 0.1,
    "schedule_max_timesteps": 100000,
    "exploration_final_eps": 0.02,
})
# __sphinx_doc_end__
# yapf: enable

SACTrainer = GenericOffPolicyTrainer.with_updates(
    name="SAC", default_config=DEFAULT_CONFIG, default_policy=SACTFPolicy)
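# Usage sketch (an assumption, not part of the original file): a minimal way
# to launch the SACTrainer defined above via Tune. "Pendulum-v0" is only an
# assumed example env id and the stop criterion is arbitrary.
if __name__ == "__main__":
    import ray
    from ray import tune

    ray.init()
    tune.run(
        SACTrainer,
        stop={"timesteps_total": 100_000},
        config={
            "env": "Pendulum-v0",
            "num_workers": 0,  # matches the single-process default above
        },
    )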
class CustomStdOut(object):
    def _log_result(self, result):
        if result["training_iteration"] % 50 == 0:
            try:
                print(
                    "steps: {}, episodes: {}, mean episode reward: {}, "
                    "agent episode reward: {}, time: {}".format(
                        result["timesteps_total"],
                        result["episodes_total"],
                        result["episode_reward_mean"],
                        result["policy_reward_mean"],
                        round(result["time_total_s"] - self.cur_time, 3)))
            except Exception:
                # On the first call, `self.cur_time` is not set yet.
                pass
            self.cur_time = result["time_total_s"]


MADDPGTrainer = GenericOffPolicyTrainer.with_updates(
    name="MADDPG",
    default_config=DEFAULT_CONFIG,
    default_policy=MADDPGTFPolicy,
    before_init=None,
    before_train_step=set_global_timestep,
    make_policy_optimizer=make_optimizer,
    after_train_result=add_trainer_metrics,
    collect_metrics_fn=collect_metrics,
    before_evaluate_fn=None,
    mixins=[CustomStdOut])
"explore": True, "exploration_config": { # Exploration sub-class by name or full path to module+class # (e.g. “ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy”) "type": "EpsilonGreedy", # Parameters for the Exploration class' constructor: "initial_epsilon": 1.0, "final_epsilon": 0.1, "epsilon_timesteps": 20000, # Timesteps over which to anneal epsilon. }, }) HERRainbowTrainer = GenericOffPolicyTrainer.with_updates( name="HER_RainbowDQN", default_policy=DQNTorchPolicy, # .with_updates( # postprocess_fn=postprocess_with_HER # ), default_config=HER_RAINBOW_DQN_CONFIG) if __name__ == "__main__": ray.init() parser = argparse.ArgumentParser() parser.add_argument("--steps", type=int, default=1_000_000) args = parser.parse_args() tune.run( HERRainbowTrainer, config={ "env": "CartPole-v1", "num_workers": 1, # "num_gpus": 1,
"dueling": True, "n_step": 20, "learning_starts": 1600, # "critic_learning_rate": 1e-3, "lr": 1e-3, "critic_beta": 0.9, "encoder_learning_rate": 1e-3, # "adam_epsilon": 1e-7, "cpc_update_freq": 1, "target_network_update_freq": 2, "critic_tau": 0.01, "encoder_tau": 0.05, "train_batch_size": 32, "gamma": 0.99, # customs "embed_dim": 128, "encoder_type": "impala", } RAINBOW_CONFIG = DEFAULT_CONFIG.copy() RAINBOW_CONFIG.update(new_config) BaselineRainbowTrainer = GenericOffPolicyTrainer.with_updates( name="BaselineRainbow", default_config=RAINBOW_CONFIG, validate_config=validate_config, default_policy=BaselineRainbowTorchPolicy, get_policy_class=get_rainbow_policy_class, )
    type=str,
    choices=['simple_adversary', 'simple_crypto', 'simple_push', 'simple_tag'],
    default='simple_push',
    help="Scenario name of the multiagent-particle-envs")
args = parser.parse_args()


# Define the trainer.
def make_sync_batch_optimizer(workers, config):
    return SyncBatchReplayOptimizer(
        workers,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        train_batch_size=config["train_batch_size"])


MADDPGTrainer = GenericOffPolicyTrainer.with_updates(
    name="MADDPG",
    default_config=MADDPG_CONFIG,
    default_policy=MADDPGTorchPolicy,
    make_policy_optimizer=make_sync_batch_optimizer)

# Register the environment.
register_env("simple_multiagent", lambda config: SimpleMultiAgentEnv(config))
single_env = SimpleMultiAgentEnv(
    env_config={"scenario_name": args.scenario_name})

# Policy mapping: one policy entry per agent id.
policies = {
    agent: (None, single_env.observation_space[agent],
            single_env.action_space[agent], {
                "observation_spaces": single_env.observation_space,
                "action_spaces": single_env.action_space,
                "agent_id": agent
            })
    for agent in single_env.agent_ids
}
# Experimental distributed execution impl; enable with "use_exec_api": True.
def execution_plan(workers, config):
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    replay_buffer = SimpleReplayBuffer(config["buffer_size"])

    store_op = rollouts \
        .for_each(StoreToReplayBuffer(local_buffer=replay_buffer))

    train_op = Replay(local_buffer=replay_buffer) \
        .combine(
            ConcatBatches(min_batch_size=config["train_batch_size"])) \
        .for_each(TrainOneStep(workers)) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    merged_op = Concurrently(
        [store_op, train_op], mode="round_robin", output_indexes=[1])

    return StandardMetricsReporting(merged_op, workers, config)


QMixTrainer = GenericOffPolicyTrainer.with_updates(
    name="QMIX",
    default_config=DEFAULT_CONFIG,
    default_policy=QMixTorchPolicy,
    get_policy_class=None,
    make_policy_optimizer=make_sync_batch_optimizer,
    execution_plan=execution_plan)
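# Usage sketch (an assumption, not part of the original file): per the comment
# above, the execution_plan only takes effect when the experimental exec API
# is switched on in the config. "grouped_twostep" is a hypothetical,
# already-registered grouped multi-agent env id.
if __name__ == "__main__":
    import ray

    ray.init()
    trainer = QMixTrainer(config={
        "env": "grouped_twostep",
        "use_exec_api": True,  # route training through execution_plan()
    })
    print(trainer.train())  # one training iteration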
if config["grad_clip"] is not None and config["grad_clip"] <= 0.0: raise ValueError("`grad_clip` value must be > 0.0!") def get_policy_class(config: TrainerConfigDict) -> Optional[Type[Policy]]: """Policy class picker function. Class is chosen based on DL-framework. Args: config (TrainerConfigDict): The trainer's configuration dict. Returns: Optional[Type[Policy]]: The Policy class to use with PPOTrainer. If None, use `default_policy` provided in build_trainer(). """ if config["framework"] == "torch": from ray.rllib.agents.sac.sac_torch_policy import SACTorchPolicy return SACTorchPolicy # Build a child class of `Trainer` (based on the kwargs used to create the # GenericOffPolicyTrainer class and the kwargs used in the call below), which # uses the framework specific Policy determined in `get_policy_class()` above. SACTrainer = GenericOffPolicyTrainer.with_updates( name="SAC", default_config=DEFAULT_CONFIG, validate_config=validate_config, default_policy=SACTFPolicy, get_policy_class=get_policy_class, )
def add_maddpg_postprocessing(config):
    """Add the before learn on batch hook.

    This hook is called explicitly prior to TrainOneStep() in the execution
    setups for DQN and APEX.
    """

    def f(batch, workers, config):
        policies = dict(workers.local_worker()
                        .foreach_trainable_policy(lambda p, i: (i, p)))
        return before_learn_on_batch(batch, policies,
                                     config["train_batch_size"])

    config["before_learn_on_batch"] = f
    return config


MADDPGTrainer = GenericOffPolicyTrainer.with_updates(
    name="MADDPG",
    default_config=DEFAULT_CONFIG,
    default_policy=MADDPGTFPolicy,
    validate_config=add_maddpg_postprocessing,
    get_policy_class=None,
    before_init=None,
    before_train_step=set_global_timestep,
    make_policy_optimizer=make_optimizer,
    after_train_result=add_trainer_metrics,
    collect_metrics_fn=collect_metrics,
    before_evaluate_fn=None)
logger.warning("`simple_optimizer` must be True (or unset) for SAC!") config["simple_optimizer"] = True def get_policy_class(config: TrainerConfigDict) -> Optional[Type[Policy]]: """Policy class picker function. Class is chosen based on DL-framework. Args: config (TrainerConfigDict): The trainer's configuration dict. Returns: Optional[Type[Policy]]: The Policy class to use with PPOTrainer. If None, use `default_policy` provided in build_trainer(). """ if config["framework"] == "torch": from ray.rllib.agents.sac.sac_torch_policy import SACTorchPolicy return SACTorchPolicy # Build a child class of `Trainer` (based on the kwargs used to create the # GenericOffPolicyTrainer class and the kwargs used in the call below), which # uses the framework specific Policy determined in `get_policy_class()` above. SACTrainer = GenericOffPolicyTrainer.with_updates( name="SAC", default_config=DEFAULT_CONFIG, validate_config=validate_config, default_policy=SACTFPolicy, get_policy_class=get_policy_class, allow_unknown_subkeys=["Q_model", "policy_model"], )
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span.
    "min_iter_time_s": 1,

    # === Model ===
    "model": {
        "lstm_cell_size": 64,
        "max_seq_len": 999999,
    },
})
# __sphinx_doc_end__
# yapf: enable


def make_sync_batch_optimizer(workers, config):
    return SyncBatchReplayOptimizer(
        workers,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        train_batch_size=config["train_batch_size"])


QMixTrainer = GenericOffPolicyTrainer.with_updates(
    name="QMIX",
    default_config=DEFAULT_CONFIG,
    default_policy=QMixTorchPolicy,
    make_policy_optimizer=make_sync_batch_optimizer)
                    for i, new_act in enumerate(new_act_n)})

    # Share samples among agents.
    policy_batches = {pid: SampleBatch(samples) for pid in policies.keys()}
    return MultiAgentBatch(policy_batches, train_batch_size)


def add_maddpg_postprocessing(config):
    """Add the before learn on batch hook.

    This hook is called explicitly prior to TrainOneStep() in the execution
    setups for DQN and APEX.
    """

    def f(batch, workers, config):
        policies = dict(workers.local_worker().foreach_trainable_policy(
            lambda p, i: (i, p)))
        return before_learn_on_batch(batch, policies,
                                     config["train_batch_size"])

    config["before_learn_on_batch"] = f
    return config


MADDPGTrainer = GenericOffPolicyTrainer.with_updates(
    name="MADDPG2",
    default_config=DEFAULT_CONFIG,
    default_policy=MADDPG2TFPolicy,
    get_policy_class=None,
    validate_config=add_maddpg_postprocessing,
)
raise ValueError("Prioritized replay is not supported when " "replay_mode=lockstep.") elif config["replay_sequence_length"] > 1: raise ValueError("Prioritized replay is not supported when " "replay_sequence_length > 1.") def get_dqn_policy_class(config): return BaselineDQNTorchPolicy ####################################################################################################### ##################################### Trainers ##################################################### ####################################################################################################### new_config = { # customs "embed_dim": 256, "encoder_type": "impala", } DQN_CONFIG = DEFAULT_CONFIG.copy() DQN_CONFIG.update(new_config) BaselineDQNTrainer = GenericOffPolicyTrainer.with_updates( name="BaselineDQN", default_config=DQN_CONFIG, validate_config=validate_config, default_policy=BaselineDQNTorchPolicy, get_policy_class=get_dqn_policy_class, )