import logging

from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.contrib.bandits.agents.policy import BanditPolicy

logger = logging.getLogger(__name__)

# yapf: disable
# __sphinx_doc_begin__
TS_CONFIG = with_common_config({
    # No remote workers by default.
    "num_workers": 0,
    "use_pytorch": True,

    # Do online learning one step at a time.
    "rollout_fragment_length": 1,
    "train_batch_size": 1,

    # Bandits can't afford to do one timestep per iteration, as this is
    # extremely slow because of the metrics-collection overhead. With this
    # setting, the agent is trained 100 times per RLlib iteration.
    "timesteps_per_iteration": 100,

    "exploration_config": {
        "type": "ray.rllib.contrib.bandits.exploration.ThompsonSampling"
    }
})
# __sphinx_doc_end__
# yapf: enable

LinTSTrainer = build_trainer(
    name="LinTS",
    default_config=TS_CONFIG,
    default_policy=BanditPolicy)
        return [1, 1]

    # e.g., 32 / 4 -> native ratio of 8.0.
    native_ratio = (
        config["train_batch_size"] / config["rollout_fragment_length"])

    # Training intensity is specified in terms of
    # (steps_replayed / steps_sampled), so adjust for the native ratio.
    weights = [1, config["training_intensity"] / native_ratio]
    return weights


def get_policy_class(config: TrainerConfigDict) -> Type[Policy]:
    """Policy class picker function.

    Args:
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        Type[Policy]: The Policy class to use with SlateQTrainer.
    """
    if config["slateq_strategy"] == "RANDOM":
        return RandomPolicy
    else:
        return SlateQTorchPolicy


SlateQTrainer = build_trainer(
    name="SlateQ",
    get_policy_class=get_policy_class,
    default_config=DEFAULT_CONFIG,
    validate_config=validate_config,
    execution_plan=execution_plan)
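A short worked example of the weighting math above (the concrete numbers are illustrative, not defaults):

# With train_batch_size=32 and rollout_fragment_length=4, the native ratio
# is 32 / 4 = 8.0 trained steps per sampled step. A training_intensity of
# 16 then yields weights [1, 16 / 8.0] = [1, 2.0], i.e. the replay/train
# op runs twice for every sampling op in the round-robin schedule.
native_ratio = 32 / 4             # -> 8.0
weights = [1, 16 / native_ratio]  # -> [1, 2.0]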
def ray_train(save_in_sub_folder=None,
              available_cluster_cpus=None,
              available_cluster_gpus=None,
              LOCAL_MODE=None,
              config=None,
              **mainkwargs):
    # config = gym.make(train_env_id).config
    subprocess.run(["chmod", "-R", "a+rwx", save_in_sub_folder + "/"])

    # Postprocess the perturbed config to ensure it's still valid.
    s3pathname = 's3://datastore-s3/groups/Behavior/Pinaki'
    upload_dir_path = s3pathname + "/" + ray_folder + '/' + InceptcurrentDT
    if save_in_sub_folder is not None:
        local_dir_path = save_in_sub_folder
        # makedirpath(upload_dir_path)

    from ray.rllib.agents.impala.vtrace_policy import VTraceTFPolicy
    if is_predict_only() or LOCAL_MODE:
        delegated_cpus = 1
        delegated_gpus = 0
    else:
        delegated_cpus = available_cluster_cpus - 2
        delegated_gpus = available_cluster_gpus

    impala_config = impala.DEFAULT_CONFIG.copy()
    impala_config["num_gpus"] = 0
    ImpalaTrainer = build_trainer(
        name="IMPALA",
        default_config=impala_config,
        default_policy=VTraceTFPolicy,
        validate_config=impala.impala.validate_config,
        get_policy_class=impala.impala.choose_policy,
        make_workers=impala.impala.defer_make_workers,
        make_policy_optimizer=impala.impala.make_aggregators_and_optimizer,
        mixins=[impala.impala.OverrideDefaultResourceRequest])

    def make_async_optimizer(workers, config):
        return AsyncGradientsOptimizer(workers, grads_per_step=100)

    CustomTrainer = PPOTrainer.with_updates(
        make_policy_optimizer=make_async_optimizer)

    restore_folder = None
    algo = "PPO"  # RL algorithm of choice.
    # Location of a previous model (if needed) for training.
    LOAD_MODEL_FOLDER = config["LOAD_MODEL_FOLDER"]
    # RESTORE: Use a previous model to start new training.
    # RESTORE_AND_RESUME: Use a previous model to finish previously
    #     unfinished training.
    # NONE: Start fresh.
    # RESTORE_COND = config["RESTORE_COND"]
    RESTORE_COND = "NONE"
    if RESTORE_COND == "RESTORE_AND_RESUME":
        restore_folder, local_restore_path, _ = retrieve_ray_folder_info(
            LOAD_MODEL_FOLDER)
        local_dir = local_restore_path
        resume = True
    elif RESTORE_COND == "RESTORE":
        restore_folder, local_restore_path, _ = retrieve_ray_folder_info(
            LOAD_MODEL_FOLDER)
        local_dir = local_dir_path
        resume = False
    else:
        local_dir = local_dir_path
        resume = False

    checkpoint_freq = int(num_timesteps) // min(int(num_timesteps), 20)
    retrieved_agent_policy = settings.retrieved_agent_policy
    model = config["MODEL"]
    print("delegated_cpus ", delegated_cpus, " delegated_gpus ",
          delegated_gpus)

    ray_trials = ray.tune.run(
        PPOTrainer,
        name="pygame-ray",
        stop={"training_iteration": int(num_timesteps)},
        checkpoint_freq=checkpoint_freq,
        checkpoint_at_end=True,
        local_dir=local_dir,
        # upload_dir=upload_dir_path,
        verbose=True,
        queue_trials=False,
        resume=resume,
        # scheduler=pbt,
        # trial_executor=RayTrialExecutor(),
        # resources_per_trial={"cpu": delegated_cpus, "gpu": 0},
        restore=restore_folder,
        # **es.DEFAULT_CONFIG,
        **{
            "num_samples": 1,
            "config": {
                "num_gpus_per_worker": 0,
                # "num_cpus_per_worker": 1,
                "num_gpus": delegated_gpus,
                "gamma": 0.85,
                "num_workers": delegated_cpus,
                "num_envs_per_worker": 2,
                "env": train_env_id,
                "remote_worker_envs": False,
                "model": model,
                "ignore_worker_failures": True,
                # "env_config": {
                #     "retrieved_agent_policy": 1,
                # },
                # "callbacks": {
                #     "on_episode_start": ray.tune.function(on_episode_start),
                # },
                # These params are tuned from a fixed starting value.
                # "lambda": 0.95,
                # "clip_param": 0.2,
                # "lr": 1e-4,
                # These params start off randomly drawn from a set.
# "num_sgd_iter": sample_from(lambda spec: random.choice([10, 20, 30])), # "sgd_minibatch_size": sample_from(lambda spec: random.choice([128, 512, 2048])), # "train_batch_size": sample_from(lambda spec: random.choice([10000, 20000, 40000])), }, }) copy_terminal_output_file( save_folder=local_dir_path, terminal_output_file_name=terminal_output_file_name) subprocess.run(["chmod", "-R", "a+rwx", ray_folder + "/"])
    return A3CTFPolicy


def validate_config(config):
    if config["entropy_coeff"] < 0:
        raise ValueError("`entropy_coeff` must be >= 0.")
    if config["sample_async"] and config["use_pytorch"]:
        config["sample_async"] = False
        logger.warning(
            "The `sample_async` option is not supported with `use_pytorch`: "
            "Multithreading can lead to crashes if used with PyTorch.")


def execution_plan(workers, config):
    # For A3C, compute policy gradients remotely on the rollout workers.
    grads = AsyncGradients(workers)

    # Apply the gradients as they arrive. We set update_all to False so that
    # only the worker sending the gradient is updated with new weights.
    train_op = grads.for_each(ApplyGradients(workers, update_all=False))

    return StandardMetricsReporting(train_op, workers, config)


A3CTrainer = build_trainer(
    name="A3C",
    default_config=DEFAULT_CONFIG,
    default_policy=A3CTFPolicy,
    get_policy_class=get_policy_class,
    validate_config=validate_config,
    execution_plan=execution_plan)
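A minimal launch sketch for the trainer above (a hedged example assuming the standard Gym CartPole env; since the execution plan computes gradients remotely via AsyncGradients, num_workers should be at least 1):

import ray
from ray import tune

ray.init()
tune.run(
    A3CTrainer,
    stop={"training_iteration": 10},
    config={"env": "CartPole-v0", "num_workers": 2})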
EXPERIMENT_NAME = "{scenario}-{algorithm}-{n_agent}" scenario_root = (Path(__file__).parent / "../dataset_public").resolve() scenario_paths = [ scenario for scenario_dir in scenario_root.iterdir() for scenario in scenario_dir.iterdir() if scenario.is_dir() ] print(f"training on {scenario_paths}") from ray.rllib.agents.trainer_template import build_trainer from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG, execution_plan, validate_config PPOTrainer = build_trainer(name="PPO_TORCH", default_config=DEFAULT_CONFIG, default_policy=PPOTorchPolicy, execution_plan=execution_plan, validate_config=validate_config) def parse_args(): parser = argparse.ArgumentParser("train on multi scenarios") # env setting parser.add_argument("--scenario", type=str, default=None, help="Scenario name") parser.add_argument("--headless", default=False, action="store_true",
    # Do online learning one step at a time.
    "rollout_fragment_length": 1,
    "train_batch_size": 1,

    # Bandits can't afford to do one timestep per iteration, as this is
    # extremely slow because of the metrics-collection overhead. With this
    # setting, the agent is trained 100 times per RLlib iteration.
    "timesteps_per_iteration": 100,

    "exploration_config": {
        "type": "ray.rllib.contrib.bandits.exploration.UCB"
    }
})
# __sphinx_doc_end__
# yapf: enable


def get_stats(trainer):
    env_metrics = trainer.collect_metrics()
    stats = trainer.optimizer.stats()
    # Uncomment if the regret at each time step is needed.
    # stats.update({"all_regrets": trainer.get_policy().regrets})
    return dict(env_metrics, **stats)


LinUCBTrainer = build_trainer(
    name="LinUCB",
    default_config=UCB_CONFIG,
    default_policy=BanditPolicy,
    collect_metrics_fn=get_stats)
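A minimal usage sketch for LinUCBTrainer. This is hedged: it assumes a Ray version whose contrib bandit envs live under ray.rllib.contrib.bandits.envs (LinearDiscreteEnv is one of the bundled example envs there); the import path may differ in your version:

import ray
from ray.rllib.contrib.bandits.envs import LinearDiscreteEnv  # assumed path

ray.init()
trainer = LinUCBTrainer(env=LinearDiscreteEnv)
for _ in range(5):
    # Each train() call runs timesteps_per_iteration (100) online steps.
    print(trainer.train()["episode_reward_mean"])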
        .for_each(BroadcastUpdateLearnerWeights(
            learner_thread, workers,
            broadcast_interval=config["broadcast_interval"]))

    # This sub-flow updates the steps-trained counter based on learner
    # output.
    dequeue_op = Dequeue(
        learner_thread.outqueue, check=learner_thread.is_alive) \
        .for_each(record_steps_trained)

    merged_op = Concurrently(
        [enqueue_op, dequeue_op], mode="async", output_indexes=[1])

    # Callback for APPO to use to update KL, target network periodically.
    # The input to the callback is the learner fetches dict.
    if config["after_train_step"]:
        merged_op = merged_op.for_each(lambda t: t[1]).for_each(
            config["after_train_step"](workers, config))

    return StandardMetricsReporting(merged_op, workers, config) \
        .for_each(learner_thread.add_learner_metrics)


ImpalaTrainer = build_trainer(
    name="IMPALA",
    default_config=DEFAULT_CONFIG,
    default_policy=VTraceTFPolicy,
    validate_config=validate_config,
    get_policy_class=get_policy_class,
    execution_plan=execution_plan,
    mixins=[OverrideDefaultResourceRequest])
    # No remote workers by default.
    "num_workers": 0,
    # Learning rate.
    "lr": 0.0004,
    # Use the execution plan API instead of policy optimizers.
    "use_exec_api": True,
    "callbacks": MyCallbacks,
})

# Define the trainer.
# The _setup() function in trainer.py shows how the env is set up; the main
# entry point is _train() in trainer_template.py, which is where the
# execution_plan (or other training logic) gets called.
PGTrainer = build_trainer(
    name="PolicyGradientTrainer",
    default_config=DEFAULT_CONFIG,
    default_policy=PolicyGradient,
    execution_plan=execution_plan,
)


class InfoNumberRounds:
    def __init__(self, min_, max_, step):
        self.min = min_
        self.max = max_
        self.step = step


def self_play_workflow(config):
    """Expects in config: checkpoint
parser = argparse.ArgumentParser()
parser.add_argument("--iters", type=int, default=200)


def policy_gradient_loss(policy, model, dist_class, train_batch):
    logits, _ = model({SampleBatch.CUR_OBS: train_batch[SampleBatch.CUR_OBS]})
    action_dist = dist_class(logits, model)
    log_probs = action_dist.logp(train_batch[SampleBatch.ACTIONS])
    return -train_batch[SampleBatch.REWARDS].dot(log_probs)


# <class 'ray.rllib.policy.torch_policy_template.MyTorchPolicy'>
MyTorchPolicy = build_torch_policy(
    name="MyTorchPolicy", loss_fn=policy_gradient_loss)

# <class 'ray.rllib.agents.trainer_template.MyCustomTrainer'>
MyTrainer = build_trainer(
    name="MyCustomTrainer",
    default_policy=MyTorchPolicy,
)

if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
    tune.run(
        MyTrainer,
        stop={"training_iteration": args.iters},
        config={
            "env": "CartPole-v0",
            "num_workers": 2,
        })
num_sgd_iter=config["num_sgd_iter"], sgd_minibatch_size=config["sgd_minibatch_size"])) else: train_op = rollouts.for_each( TrainTFMultiGPU(workers=workers, sgd_minibatch_size=config["sgd_minibatch_size"], num_sgd_iter=config["num_sgd_iter"], num_gpus=config["num_gpus"], shuffle_sequences=config["shuffle_sequences"], _fake_gpus=config["_fake_gpus"], framework=config.get("framework"))) # Update KL after each round of training. train_op = train_op.for_each(lambda t: t[1]).for_each(UpdateKL(workers)) # Warn about bad reward scales and return training metrics. return StandardMetricsReporting(train_op, workers, config) \ .for_each(lambda result: warn_about_bad_reward_scales(config, result)) # Build a child class of `Trainer`, which uses the framework specific Policy # determined in `get_policy_class()` above. PPOTrainer = build_trainer( name="PPO", default_config=DEFAULT_CONFIG, validate_config=validate_config, default_policy=PPOTFPolicy, get_policy_class=get_policy_class, execution_plan=execution_plan, )
"use_gae": False, "vf_loss_coeff": 0.5, "entropy_coeff": 0.01, "truncate_episodes": True, "use_critic": True, "grad_clip": 40.0, "lr": 0.0001, "min_iter_time_s": 5, "sample_async": True, "lr_schedule": None, } ) CA2CTFPolicy = build_tf_policy( name="CA2CTFPolicy", stats_fn=stats, grad_stats_fn=central_vf_stats, loss_fn=ac_loss_func, postprocess_fn=postprocess_trajectory, before_loss_init=setup_mixins, make_model=build_cac_model, mixins=[CentralizedValueMixin], get_default_config=lambda: DEFAULT_CONFIG, ) CA2CTrainer = build_trainer( name="CA2C", default_policy=CA2CTFPolicy, default_config=DEFAULT_CONFIG )
# policy configs, we have to explicitly set it in the multiagent config:
policies = {
    "ppo_policy": (PPOTorchPolicy if args.torch or args.mixed_torch_tf else
                   PPOTFPolicy, obs_space, act_space, PPO_CONFIG),
    "dqn_policy": (DQNTorchPolicy if args.torch else DQNTFPolicy, obs_space,
                   act_space, DQN_CONFIG),
}


def policy_mapping_fn(agent_id):
    if agent_id % 2 == 0:
        return "ppo_policy"
    else:
        return "dqn_policy"


MyTrainer = build_trainer(
    name="PPO_DQN_MultiAgent",
    default_policy=None,
    execution_plan=custom_training_workflow)

config = {
    "rollout_fragment_length": 50,
    "num_workers": 0,
    "env": "multi_agent_cartpole",
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
        "policies_to_train": ["dqn_policy", "ppo_policy"],
    },
    # Use GPUs iff `RLLIB_NUM_GPUS` env var is set to > 0.
    "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
    "framework": "torch" if args.torch else "tf",
    "_use_trajectory_view_api": True,
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    rollouts = ParallelRollouts(workers, mode="async")

    # Collect batches for the trainable policies.
    rollouts = rollouts.for_each(
        SelectExperiences(local_worker=workers.local_worker()))

    # Return training metrics.
    return StandardMetricsReporting(rollouts, workers, config)


RandomParametricTrainer = build_trainer(
    name="RandomParametric",
    default_config=DEFAULT_CONFIG,
    default_policy=RandomParametriclPolicy,
    execution_plan=execution_plan)


def main():
    register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10))
    trainer = RandomParametricTrainer(env="pa_cartpole")
    result = trainer.train()
    assert result["episode_reward_mean"] > 10, result
    print("Test: OK")


if __name__ == "__main__":
    ray.init()
    main()
sgd_minibatch_size=config["sgd_minibatch_size"])) # Update KL after each round of training. train_op = train_op.for_each(lambda t: t[1]).for_each(UpdateKL(workers)) return StandardMetricsReporting(train_op, workers, config) \ .for_each(lambda result: warn_about_bad_reward_scales(config, result)) ####################################################################################################### ##################################### Trainer ##################################################### ####################################################################################################### new_config = { # customs "embed_dim": 256, "encoder_type": "impala", "augmentation": True, "aug_num": 2, "max_shift": 4, } PPO_CONFIG = DEFAULT_CONFIG.copy() PPO_CONFIG.update(new_config) DrqPPOTrainer = build_trainer(name="DrqPPO", default_config=PPO_CONFIG, default_policy=DrqPPOTorchPolicy, get_policy_class=get_policy_class, execution_plan=execution_plan, validate_config=validate_config)
def get_policy_class(config):
    if config["framework"] == "torch":
        from ray.rllib.agents.dqn.dqn_torch_policy import DQNTorchPolicy
        return DQNTorchPolicy
    else:
        return DQNTFPolicy


def get_simple_policy_class(config):
    if config["framework"] == "torch":
        from ray.rllib.agents.dqn.simple_q_torch_policy import \
            SimpleQTorchPolicy
        return SimpleQTorchPolicy
    else:
        return SimpleQTFPolicy


GenericOffPolicyTrainer = build_trainer(
    name="GenericOffPolicyAlgorithm",
    default_policy=None,
    get_policy_class=get_policy_class,
    default_config=DEFAULT_CONFIG,
    validate_config=validate_config,
    execution_plan=execution_plan)

DQNTrainer = GenericOffPolicyTrainer.with_updates(
    name="DQN", default_policy=DQNTFPolicy, default_config=DEFAULT_CONFIG)

SimpleQTrainer = DQNTrainer.with_updates(
    default_policy=SimpleQTFPolicy,
    get_policy_class=get_simple_policy_class)
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.pg.pg_policy import PGTFPolicy

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # No remote workers by default.
    "num_workers": 0,
    # Learning rate.
    "lr": 0.0004,
    # Use PyTorch as backend?
    "use_pytorch": False,
})
# __sphinx_doc_end__
# yapf: enable


def get_policy_class(config):
    if config["use_pytorch"]:
        from ray.rllib.agents.pg.torch_pg_policy import PGTorchPolicy
        return PGTorchPolicy
    else:
        return PGTFPolicy


PGTrainer = build_trainer(
    name="PGTrainer",
    default_config=DEFAULT_CONFIG,
    default_policy=PGTFPolicy,
    get_policy_class=get_policy_class)
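A minimal training-loop sketch for the PGTrainer built above (assumes Ray and the Gym CartPole env are available):

import ray

ray.init()
trainer = PGTrainer(config={"num_workers": 0}, env="CartPole-v0")
for i in range(3):
    result = trainer.train()
    print(i, result["episode_reward_mean"])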
raise ValueError("Must have an actual Env created on the driver " "(local) worker! Set `create_env_on_driver` to True.") def validate_env(env: EnvType, env_context: EnvContext): """Validates the local_worker's env object (after creation). Args: env (EnvType): The env object to check (for worker=0 only). env_context (EnvContext): The env context used for the instantiation of the local worker's env (worker=0). Raises: ValueError: In case something is wrong with the config. """ if not hasattr(env, "reward") or not callable(env.reward): raise ValueError("Env {} doest not have a `reward()` method, needed " "for MB-MPO!".format(env)) # Build a child class of `Trainer`, which uses the default policy, # MBMPOTorchPolicy. A TensorFlow version is not available yet. MBMPOTrainer = build_trainer( name="MBMPO", default_config=DEFAULT_CONFIG, default_policy=MBMPOTorchPolicy, execution_plan=execution_plan, validate_config=validate_config, validate_env=validate_env, )
        return DQNTFPolicy


def get_simple_policy_class(config):
    if config["use_pytorch"]:
        from ray.rllib.agents.dqn.simple_q_torch_policy import \
            SimpleQTorchPolicy
        return SimpleQTorchPolicy
    else:
        return SimpleQTFPolicy


GenericOffPolicyTrainer = build_trainer(
    name="GenericOffPolicyAlgorithm",
    default_policy=None,
    get_policy_class=get_policy_class,
    default_config=DEFAULT_CONFIG,
    validate_config=validate_config,
    get_initial_state=get_initial_state,
    make_policy_optimizer=make_policy_optimizer,
    before_train_step=update_worker_exploration,
    after_optimizer_step=update_target_if_needed,
    after_train_result=after_train_result,
    execution_plan=execution_plan)

DQNTrainer = GenericOffPolicyTrainer.with_updates(
    name="DQN", default_policy=DQNTFPolicy, default_config=DEFAULT_CONFIG)

SimpleQTrainer = DQNTrainer.with_updates(
    default_policy=SimpleQTFPolicy,
    get_policy_class=get_simple_policy_class)
            selected_workers=trainer.workers.remote_workers()[
                -len(trainer.workers.remote_workers()) // 3:])
    else:
        result = trainer.collect_metrics()
    return result


def disable_exploration(trainer):
    trainer.evaluation_workers.local_worker().foreach_policy(
        lambda p, _: p.set_epsilon(0))


GenericOffPolicyTrainer = build_trainer(
    name="GenericOffPolicyAlgorithm",
    default_policy=None,
    default_config=DEFAULT_CONFIG,
    validate_config=check_config_and_setup_param_noise,
    get_initial_state=get_initial_state,
    make_policy_optimizer=make_optimizer,
    before_init=setup_exploration,
    before_train_step=update_worker_explorations,
    after_optimizer_step=update_target_if_needed,
    after_train_result=add_trainer_metrics,
    collect_metrics_fn=collect_metrics,
    before_evaluate_fn=disable_exploration)

DQNTrainer = GenericOffPolicyTrainer.with_updates(
    name="NAF", default_policy=DQNTFPolicy, default_config=DEFAULT_CONFIG)

SimpleQTrainer = DQNTrainer.with_updates(default_policy=SimpleQPolicy)
sgd_minibatch_size=config["train_batch_size"], num_sgd_iter=1, num_gpus=config["num_gpus"], shuffle_sequences=True, _fake_gpus=config["_fake_gpus"], framework=config.get("framework")) # (2) Read and train on experiences from the replay buffer. replay_op = Replay(local_buffer=local_replay_buffer) \ .for_each(train_step_op) \ .for_each(UpdateTargetNetwork( workers, config["target_network_update_freq"])) # Alternate deterministically between (1) and (2). train_op = Concurrently([store_op, replay_op], mode="round_robin", output_indexes=[1]) return StandardMetricsReporting(train_op, workers, config) # Build a child class of `Trainer`, which uses the framework specific Policy # determined in `get_policy_class()` above. SimpleQTrainer = build_trainer( name="SimpleQTrainer", default_policy=SimpleQTFPolicy, get_policy_class=get_policy_class, execution_plan=execution_plan, default_config=DEFAULT_CONFIG, )
class OverrideDefaultResourceRequest:
    @classmethod
    @override(Trainable)
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)
        Trainer._validate_config(cf)
        return Resources(
            cpu=cf["num_cpus_for_driver"],
            gpu=cf["num_gpus"],
            memory=cf["memory"],
            object_store_memory=cf["object_store_memory"],
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] +
            cf["num_aggregation_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"],
            extra_memory=cf["memory_per_worker"] * cf["num_workers"],
            extra_object_store_memory=cf["object_store_memory_per_worker"] *
            cf["num_workers"])


ImpalaTrainer = build_trainer(
    name="IMPALA",
    default_config=DEFAULT_CONFIG,
    default_policy=VTraceTFPolicy,
    validate_config=validate_config,
    get_policy_class=choose_policy,
    make_workers=defer_make_workers,
    make_policy_optimizer=make_aggregators_and_optimizer,
    mixins=[OverrideDefaultResourceRequest])
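A worked example of the arithmetic in default_resource_request above (the numbers are illustrative, not defaults):

# With num_cpus_for_driver=1, num_workers=4, num_cpus_per_worker=1,
# num_aggregation_workers=2, and num_gpus_per_worker=0:
cpu = 1                # driver CPU
extra_cpu = 1 * 4 + 2  # rollout workers + aggregation workers = 6
extra_gpu = 0 * 4      # = 0
# Tune therefore reserves 1 driver CPU plus 6 worker CPUs for the trial
# before it is allowed to start.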
def get_policy_class(config):
    if config["use_pytorch"]:
        from ray.rllib.agents.pg.pg_torch_policy import PGTorchPolicy
        return PGTorchPolicy
    else:
        return PGTFPolicy


# Experimental pipeline-based impl; enable with "use_pipeline_impl": True.
def training_pipeline(workers, config):
    # Collects experiences in parallel from multiple RolloutWorker actors.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Combine experience batches until we hit `train_batch_size` in size.
    # Then, train the policy on those experiences and update the workers.
    train_op = rollouts \
        .combine(ConcatBatches(
            min_batch_size=config["train_batch_size"])) \
        .for_each(TrainOneStep(workers))

    # Add on the standard episode reward, etc. metrics reporting. This
    # returns a LocalIterator[metrics_dict] representing metrics for each
    # train step.
    return StandardMetricsReporting(train_op, workers, config)


PGTrainer = build_trainer(
    name="PG",
    default_config=DEFAULT_CONFIG,
    default_policy=PGTFPolicy,
    get_policy_class=get_policy_class,
    training_pipeline=training_pipeline)
    reservoir_buffers = MultiAgentReservoirBuffer(
        reservoir_size, config["multiagent"]["policies"])

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # 2. Define store operations (sampling).
    store_op = rollouts.for_each(
        StoreToBuffers(replay_buffers, reservoir_buffers,
                       config["multiagent"]["policies_to_train"]))

    # 3. Define replay/reservoir operations.
    replay_op = SimpleLocalReplayMultiagent(
        replay_buffers,
        config["replay_train_batch_size"],
        config["replay_min_size_to_learn"],
        config["replay_train_every"]) \
        .for_each(TrainOneStep(workers)) \
        .for_each(UpdateTargetNetwork(
            workers, config["dqn_policy"]["target_network_update_freq"]))

    reservoir_op = LocalReservoirMultiagent(
        reservoir_buffers,
        config["reservoir_train_batch_size"],
        config["reservoir_min_size_to_learn"],
        config["reservoir_train_every"]) \
        .for_each(TrainOneStep(workers))

    # 4. Define the main train loop.
    train_op = Concurrently(
        [replay_op, reservoir_op, store_op], mode="round_robin")

    return LowMemoryMetricsReporting(train_op, workers, config)


NFSPTrainer = build_trainer(
    name="NFSPTrainer",
    default_policy=NFSPPolicy,
    default_config=NFSP_CONFIG,
    execution_plan=execution_plan_nfsp)
"prioritized_replay_beta": config["prioritized_replay_beta"], "prioritized_replay_beta_annealing_timesteps": config["prioritized_replay_beta_annealing_timesteps"], "final_prioritized_replay_beta": config["final_prioritized_replay_beta"], "prioritized_replay_eps": config["prioritized_replay_eps"], }) return SyncReplayOptimizer(workers, learning_starts=config["learning_starts"], buffer_size=config["buffer_size"], train_batch_size=config["train_batch_size"], before_learn_on_batch=before_learn_on_batch, **kwargs) DataAugmentingDQNTrainer = build_trainer( name="data_augmenting_dqn_trainer", default_policy=DataAugmentingDQNTFPolicy, get_policy_class=get_policy_class, default_config=DEFAULT_CONFIG, validate_config=validate_config, get_initial_state=get_initial_state, make_policy_optimizer=make_data_augmenting_policy_optimizer, before_train_step=update_worker_exploration, after_optimizer_step=update_target_if_needed, after_train_result=after_train_result, ) # execution_plan=execution_plan)
from benchmark.networks.communicate import NetworkedMixin, \
    postprocess_trajectory


def networked_pg_loss(policy, model, dist_class, train_batch):
    # Make sure the gradients for these keys are accessible.
    for k in train_batch.keys():
        if "var" in k or "gamma" in k:
            _ = train_batch[k].shape
    return pg_tf_loss(policy, model, dist_class, train_batch)


def setupmixin(policy, obs_space, action_space, config):
    NetworkedMixin.__init__(policy)


NetworkedPG = build_tf_policy(
    name="NetworkedPG",
    get_default_config=lambda: PG_DEFAULT_CONFIG,
    postprocess_fn=postprocess_trajectory,
    loss_fn=networked_pg_loss,
    mixins=[NetworkedMixin],
    after_init=setupmixin,
)

NetworkedPGTrainer = build_trainer(
    name="NetworkedPGTrainer",
    default_policy=NetworkedPG,
)
# above.
MultiPPOTorchPolicy = build_policy_class(
    name="MultiPPOTorchPolicy",
    framework="torch",
    get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG,
    loss_fn=ppo_surrogate_loss,
    stats_fn=kl_and_loss_stats,
    extra_action_out_fn=vf_preds_fetches,
    postprocess_fn=compute_gae_for_sample_batch,
    extra_grad_process_fn=apply_grad_clipping,
    before_init=setup_config,
    before_loss_init=setup_mixins_override,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin
    ],
)


def get_policy_class(config):
    return MultiPPOTorchPolicy


MultiPPOTrainer = build_trainer(
    name="MultiPPO",
    default_config=ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG,
    validate_config=ray.rllib.agents.ppo.ppo.validate_config,
    default_policy=MultiPPOTorchPolicy,
    get_policy_class=get_policy_class,
    execution_plan=ray.rllib.agents.ppo.ppo.execution_plan)
config["microbatch_size"]) # In microbatch mode, we want to compute gradients on experience # microbatches, average a number of these microbatches, and then apply # the averaged gradient in one SGD step. This conserves GPU memory, # allowing for extremely large experience batches to be used. train_op = ( rollouts.combine( ConcatBatches( min_batch_size=config["microbatch_size"])).for_each( ComputeGradients(workers)) # (grads, info) .batch(num_microbatches) # List[(grads, info)] .for_each(AverageGradients()) # (avg_grads, info) .for_each(ApplyGradients(workers))) else: # In normal mode, we execute one SGD step per each train batch. train_op = rollouts \ .combine(ConcatBatches( min_batch_size=config["train_batch_size"])) \ .for_each(TrainOneStep(workers)) return StandardMetricsReporting(train_op, workers, config) A2CTrainer = build_trainer(name="A2C", default_config=A2C_DEFAULT_CONFIG, default_policy=A3CTFPolicy, get_policy_class=get_policy_class, make_policy_optimizer=choose_policy_optimizer, validate_config=validate_config, training_pipeline=training_pipeline)
if config["simple_optimizer"]: train_op = rollouts \ .combine(ConcatBatches( min_batch_size=config["train_batch_size"])) \ .for_each(TrainOneStep( workers, num_sgd_iter=config["num_sgd_iter"])) else: replay_buffer = SimpleReplayBuffer(config["buffer_size"]) store_op = rollouts \ .for_each(StoreToReplayBuffer(local_buffer=replay_buffer)) replay_op = Replay(local_buffer=replay_buffer) \ .filter(WaitUntilTimestepsElapsed(config["learning_starts"])) \ .combine( ConcatBatches(min_batch_size=config["train_batch_size"])) \ .for_each(TrainOneStep( workers, num_sgd_iter=config["num_sgd_iter"])) train_op = Concurrently([store_op, replay_op], mode="round_robin", output_indexes=[1]) return StandardMetricsReporting(train_op, workers, config) AlphaZeroTrainer = build_trainer(name="AlphaZero", default_config=DEFAULT_CONFIG, default_policy=AlphaZeroPolicyWrapperClass, execution_plan=execution_plan)
        return MARWILTorchPolicy


def execution_plan(workers, config):
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    replay_buffer = SimpleReplayBuffer(config["replay_buffer_size"])

    store_op = rollouts \
        .for_each(StoreToReplayBuffer(local_buffer=replay_buffer))

    replay_op = Replay(local_buffer=replay_buffer) \
        .combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            )) \
        .for_each(TrainOneStep(workers))

    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)


MARWILTrainer = build_trainer(
    name="MARWIL",
    default_config=DEFAULT_CONFIG,
    default_policy=MARWILTFPolicy,
    get_policy_class=get_policy_class,
    execution_plan=execution_plan)
"observation_filter": "NoFilter", # Uses the sync samples optimizer instead of the multi-gpu one. This is # usually slower, but you might want to try it if you run into issues with # the default optimizer. "simple_optimizer": False, # Use PyTorch as framework? "use_pytorch": False }) # __sphinx_doc_end__ # yapf: enable from ray.rllib.agents.ppo.ppo import choose_policy_optimizer, update_kl,\ warn_about_bad_reward_scales, validate_config def get_policy_class(config): if config.get("use_pytorch") is True: from algorithms.master_agent.master_policy import PPOTorchPolicy return PPOTorchPolicy else: return PPOTFPolicy MasterAgent = build_trainer(name="MasterAgent", default_config=DEFAULT_CONFIG, default_policy=PPOTFPolicy, get_policy_class=get_policy_class, make_policy_optimizer=choose_policy_optimizer, validate_config=validate_config, after_optimizer_step=update_kl, after_train_result=warn_about_bad_reward_scales)