def setup_mixins(policy, obs_space, action_space, config):
    """Copied from PPO."""
    KLCoeffMixin.__init__(policy, config)
    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
                                  config["entropy_coeff_schedule"])
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])


def central_vf_stats(policy, train_batch, grads):
    """Report the explained variance of the central value function."""
    return {
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.central_value_out),
    }


CCPPO = PPOTFPolicy.with_updates(
    name="CCPPO",
    postprocess_fn=centralized_critic_postprocessing,
    loss_fn=loss_with_central_critic,
    before_loss_init=setup_mixins,
    grad_stats_fn=central_vf_stats,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        CentralizedValueMixin
    ])

CCTrainer = PPOTrainer.with_updates(name="CCPPOTrainer", default_policy=CCPPO)
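# Hedged sketch of the CentralizedValueMixin that CCPPO assumes, modeled on
# the upstream RLlib centralized-critic example; the model method name
# central_value_function is an assumption about the custom model registered
# with this policy.
from ray.rllib.utils.tf_ops import make_tf_callable


class ExampleCentralizedValueMixin:
    def __init__(self):
        # Expose a session-backed callable so postprocessing can query the
        # central critic on (own obs, opponent obs, opponent action).
        self.compute_central_vf = make_tf_callable(self.get_session())(
            self.model.central_value_function)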
    values = values[:len(values) - max(upper, 0)]
    values = np.pad(
        values,
        pad_width=[
            (-min(lower, 0), -min(0, upper)),
            *[(0, 0) for k in range(values.ndim - 1)],
        ],
        mode="constant",
    )
    return values


CCPPOPolicy = PPOTFPolicy.with_updates(
    name="CCPPOPolicy",
    postprocess_fn=centralized_critic_postprocessing,
    loss_fn=loss_with_central_critic,
    before_loss_init=setup_mixins,
    grad_stats_fn=central_vf_stats,
    mixins=[
        LearningRateSchedule,
        EntropyCoeffSchedule,
        KLCoeffMixin,
        CentralizedValueMixin,
    ],
)

register_trainable(
    "CcTransformer",
    PPOTrainer.with_updates(
        name="CCPPOTrainer", get_policy_class=lambda c: CCPPOPolicy),
)
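# Hedged worked example of the shift-and-pad tail above (the earlier part
# of the helper is not shown here). With lower=-1, upper=1 on a 1-D array,
# the slice drops the trailing element and the pad prepends a zero, i.e.
# the sequence shifts one step to the right at constant length:
import numpy as np

vals = np.arange(5.0)                   # [0., 1., 2., 3., 4.]
lower, upper = -1, 1
out = vals[:len(vals) - max(upper, 0)]  # [0., 1., 2., 3.]
out = np.pad(out, [(-min(lower, 0), -min(0, upper))], mode="constant")
# out == [0., 0., 1., 2., 3.]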
    postprocess_fn=centralized_critic_postprocessing,
    loss_fn=loss_with_central_critic,
    before_init=setup_mixins,
    mixins=[
        TorchLR, TorchEntropyCoeffSchedule, TorchKLCoeffMixin,
        CentralizedValueMixin
    ])


def get_policy_class(config):
    return CCPPOTorchPolicy if config["use_pytorch"] else CCPPOTFPolicy


CCTrainer = PPOTrainer.with_updates(
    name="CCPPOTrainer",
    default_policy=CCPPOTFPolicy,
    get_policy_class=get_policy_class,
)

if __name__ == "__main__":
    ray.init(local_mode=True)
    args = parser.parse_args()

    ModelCatalog.register_custom_model(
        "cc_model", TorchCentralizedCriticModel
        if args.torch else CentralizedCriticModel)

    config = {
        "env": TwoStepGame,
        "batch_mode": "complete_episodes",
        "eager": False,
    # Only update the policies pool if DELAY_UPDATE is used; otherwise the
    # policies_pool in each policy is simply not used, so there is no need
    # to update it.
    if trainer.config[DELAY_UPDATE]:
        if trainer.workers.remote_workers():
            weights = ray.put(trainer.workers.local_worker().get_weights())
            for e in trainer.workers.remote_workers():
                e.set_weights.remote(weights)

        def _delay_update_for_worker(worker, worker_index):
            worker.foreach_policy(lambda p, _: p.update_target())

        trainer.workers.foreach_worker_with_index(_delay_update_for_worker)


def get_policy_class(config):
    return DiCEPolicy


DiCETrainer = PPOTrainer.with_updates(
    name="DiCETrainer",
    default_config=dice_default_config,
    default_policy=DiCEPolicy,
    get_policy_class=get_policy_class,
    validate_config=validate_config,
    make_policy_optimizer=make_policy_optimizer_tnbes,
    after_init=setup_policies_pool,
    after_optimizer_step=after_optimizer_iteration,
)
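# Hedged sketch of what the per-policy update_target() broadcast above might
# trigger (hypothetical; the actual DiCE implementation lives elsewhere in
# this repo): sync the policy's snapshot of the policies pool toward its
# live weights, with tau=1.0 being a hard copy and tau<1.0 a polyak update.
def _example_update_target(policy, tau=1.0):
    live = policy.get_weights()
    old = getattr(policy, "_pool_weights", live)
    policy._pool_weights = {
        k: tau * live[k] + (1.0 - tau) * old[k]
        for k in live
    }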
import time

# RLlib imports implied by the fragment below (ray 0.8-era paths).
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy

from toolbox import train
from toolbox.evolution import GaussianESTrainer
from toolbox.evolution_plugin.evolution_plugin import choose_optimzier, \
    merge_dicts, DEFAULT_CONFIG
from toolbox.train import get_train_parser

ppo_sgd_config = merge_dicts(DEFAULT_CONFIG, dict(master_optimizer_type="sgd"))

PPOSGDPolicy = PPOTFPolicy.with_updates(
    name="PPOSGDPolicy",
    get_default_config=lambda: ppo_sgd_config,
    optimizer_fn=choose_optimzier)

PPOSGDTrainer = PPOTrainer.with_updates(
    name="PPOSGD",
    default_config=ppo_sgd_config,
    default_policy=PPOSGDPolicy,
    get_policy_class=lambda _: PPOSGDPolicy)

if __name__ == '__main__':
    parser = get_train_parser()
    parser.add_argument("--ppo", action="store_true")
    parser.add_argument("--es", action="store_true")
    # parser.add_argument("--optimizer", type=str, default="sgd")  # [adam, sgd]
    parser.add_argument("--stop", type=float, default=1e7)
    parser.add_argument("--local-mode", "-lm", action="store_true")
    args = parser.parse_args()
    print(args)
    local_mode = args.local_mode
    now = time.time()
    assert int(args.ppo) + int(args.es) == 1
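# Hedged sketch of the optimizer_fn contract used by PPOSGDPolicy above
# (an assumption about what the imported choose_optimzier does, not its
# source): it receives (policy, config) and returns a TF1 optimizer, here
# switching on "master_optimizer_type".
import tensorflow as tf


def _example_choose_optimizer(policy, config):
    if config["master_optimizer_type"] == "sgd":
        return tf.train.GradientDescentOptimizer(config["lr"])
    return tf.train.AdamOptimizer(config["lr"])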
            policy.central_value_out),
    }


CCPPO = PPOTFPolicy.with_updates(
    name="CCPPO",
    postprocess_fn=centralized_critic_postprocessing,
    loss_fn=loss_with_central_critic,
    before_loss_init=setup_mixins,
    grad_stats_fn=central_vf_stats,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        CentralizedValueMixin
    ])

CCTrainer = PPOTrainer.with_updates(
    name="CCPPOTrainer",
    default_policy=CCPPO,
    get_policy_class=None)

if __name__ == "__main__":
    args = parser.parse_args()

    ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel)
    tune.run(
        CCTrainer,
        stop={
            "timesteps_total": args.stop,
            "episode_reward_mean": 7.99,
        },
        config={
            "env": TwoStepGame,
            "batch_mode": "complete_episodes",
            "eager": False,
            "num_workers": 0,
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.model.value_function())
    }


def setup_mixins_without_kl(policy, obs_space, action_space, config):
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    # EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
    #                               config["entropy_coeff_schedule"])
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])


PPOTFPolicyWithoutKL = PPOTFPolicy.with_updates(
    name="PPOTFPolicyWithoutKL",
    loss_fn=ppo_surrogate_loss_without_kl,
    stats_fn=loss_stats,
    before_loss_init=setup_mixins_without_kl,
    mixins=[
        LearningRateSchedule,
        # EntropyCoeffSchedule,
        ValueNetworkMixin
    ])

PPOTrainerWithoutKL = PPOTrainer.with_updates(
    name="PPOWithoutKL",
    default_policy=PPOTFPolicyWithoutKL,
    after_optimizer_step=None)
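# Hedged sketch of the clipped surrogate term a KL-free loss such as
# ppo_surrogate_loss_without_kl would center on (illustrative names, not
# the file's actual implementation):
import tensorflow as tf


def _example_clipped_surrogate(logp_ratio, advantages, clip_param=0.3):
    return -tf.reduce_mean(
        tf.minimum(
            logp_ratio * advantages,
            tf.clip_by_value(logp_ratio, 1.0 - clip_param,
                             1.0 + clip_param) * advantages))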
"redo_invalid_games": False, "wandb": {}, "ed": None, "policy_catalog": None, "eq_iters": None, "adaptive_pval_test": False, "br_thres": None, "eq_thres": None, "br_eval_against_policy": None, "thres_is_pval": None, "adaptive_pval": None } PPO_CUSTOM_EVAL_TRAINER_DEFAULT_CONFIG = with_base_config( base_config=DEFAULT_CONFIG, extra_config=ppo_custom_eval_trainer_added_config_items) ppo_custom_eval_trainer_mixins = [ CustomEvaluationsTrainerMixin, WeightsUtilsTrainerMixin ] # Add custom evaluation logic to PPOTrainer PPOCustomEvalTrainer = PPOTrainer.with_updates( name="PPOCustomEvalTrainer", default_config=PPO_CUSTOM_EVAL_TRAINER_DEFAULT_CONFIG, before_init=ppo_custom_eval_trainer_before_init, after_init=ppo_custom_eval_trainer_after_init, validate_config=ppo_custom_eval_trainer_validate_config, after_optimizer_step=after_optimizer_step, collect_metrics_fn=collect_metrics, mixins=ppo_custom_eval_trainer_mixins)
        tf.shape(policy.get_placeholder(SampleBatch.CUR_OBS))[0])


def grad_stats(policy, train_batch, grads):
    return {
        "grad_gnorm": tf.global_norm(grads),
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.central_value_function),
    }


ImitationCentralizedPolicy = PPOTFPolicy.with_updates(
    name="ImitationCentralizedPolicy",
    before_loss_init=setup_mixins,
    postprocess_fn=centralized_critic_postprocessing,
    stats_fn=loss_stats,
    grad_stats_fn=grad_stats,
    loss_fn=new_ppo_surrogate_loss,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        CentralizedValueMixin, ImitationLearningRateSchedule
    ])

ImitationCentralizedTrainer = PPOTrainer.with_updates(
    name="ImitationCentralizedPPOTrainer",
    default_policy=ImitationCentralizedPolicy,
    after_optimizer_step=update_kl)

CCImitationTrainer = PPOTrainer.with_updates(
    name="CCImitationPPOTrainer",
    default_policy=ImitationCentralizedPolicy)
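# For intuition on the "vf_explained_var" stat above: explained variance is
# 1 - Var[targets - predictions] / Var[targets], so values near 1 mean the
# critic tracks its value targets closely. A tiny numpy illustration with
# made-up numbers:
import numpy as np

targets = np.array([1.0, 2.0, 3.0])
predictions = np.array([1.1, 1.9, 3.2])
ev = 1.0 - np.var(targets - predictions) / np.var(targets)  # ~0.98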
train_batch_size=config["train_batch_size"], standardize_fields=["advantages"], shuffle_sequences=config["shuffle_sequences"]) def setup_mixins_modified(policy, obs_space, action_space, config): AddLossMixin.__init__(policy, config) setup_mixins(policy, obs_space, action_space, config) ExtraLossPPOTFPolicy = PPOTFPolicy.with_updates( name="ExtraLossPPOTFPolicy", get_default_config=lambda: extra_loss_ppo_default_config, postprocess_fn=postprocess_ppo_gae_modified, stats_fn=kl_and_loss_stats_modified, loss_fn=extra_loss_ppo_loss, before_loss_init=setup_mixins_modified, mixins=mixin_list + [AddLossMixin]) ExtraLossPPOTrainer = PPOTrainer.with_updates( name="ExtraLossPPO", default_config=extra_loss_ppo_default_config, validate_config=validate_config_modified, default_policy=ExtraLossPPOTFPolicy, make_policy_optimizer=choose_policy_optimizer) if __name__ == '__main__': from toolbox.marl.test_extra_loss import test_extra_loss_ppo_trainer1 test_extra_loss_ppo_trainer1(True)
    ImitationLearningRateSchedule.__init__(
        policy, config["model"]["custom_options"]["num_imitation_iters"],
        config["model"]["custom_options"]["imitation_weight"], config)


def grad_stats(policy, train_batch, grads):
    return {
        "grad_gnorm": tf.global_norm(grads),
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.model.value_function()),
    }


ImitationPolicy = PPOTFPolicy.with_updates(
    name="ImitationPolicy",
    before_loss_init=setup_mixins,
    stats_fn=loss_stats,
    grad_stats_fn=grad_stats,
    loss_fn=new_ppo_surrogate_loss,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin, ImitationLearningRateSchedule
    ])

ImitationTrainer = PPOTrainer.with_updates(
    name="ImitationPPOTrainer",
    default_policy=ImitationPolicy,
    after_optimizer_step=update_kl)
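# Hedged sketch of what a mixin like ImitationLearningRateSchedule might
# look like, inferred only from the __init__ call above (hypothetical; the
# real class is defined elsewhere): it keeps an imitation-loss weight on the
# policy and is expected to anneal it after a number of warm-up iterations.
class ExampleImitationLRSchedule:
    def __init__(self, num_imitation_iters, imitation_weight, config):
        # `self` is the policy here, per the unbound-mixin-init convention
        # used in setup_mixins above.
        self.num_imitation_iters = num_imitation_iters
        self.imitation_weight = imitation_weight
        self.imitation_iter = 0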
    # Which observation filter to apply to the observation.
    "observation_filter": "NoFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This is
    # usually slower, but you might want to try it if you run into issues
    # with the default optimizer.
    "simple_optimizer": False,
    # Whether to fake GPUs (using CPUs).
    # Set this to True for debugging on non-GPU machines (set `num_gpus` > 0).
    "_fake_gpus": False,
    # Use PyTorch as framework?
    "use_pytorch": False
})
# __sphinx_doc_end__
# yapf: enable


def get_policy_class(config):
    if config["use_pytorch"]:
        from algorithms.custom_ppo.custom_ppo_torch_policy import \
            CustomPPOTorchPolicy
        return CustomPPOTorchPolicy
    else:
        return PPOTFPolicy


CustomPPOTrainer = PPOTrainer.with_updates(
    name="CustomPPO",
    default_config=DEFAULT_CONFIG,
    default_policy=PPOTFPolicy,
    get_policy_class=get_policy_class,
)
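# Hedged usage sketch (stop criterion and env are illustrative): with
# get_policy_class wired up above, the framework can be toggled per trial
# through the "use_pytorch" config key.
from ray import tune

tune.run(
    CustomPPOTrainer,
    stop={"timesteps_total": 100000},
    config={
        "env": "CartPole-v0",
        "use_pytorch": tune.grid_search([False, True]),
    })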
        EntropyCoeffSchedule, KLCoeffMixin, ValueNetworkMixin,
        FIMEmbeddingMixin
    ])


def get_policy_class(config):
    if config.get("use_pytorch") is True:
        raise NotImplementedError()
    else:
        return PPOFIMTFPolicy


PPOFIMTrainer = PPOTrainer.with_updates(
    name="PPOFIM",
    default_policy=PPOFIMTFPolicy,
    get_policy_class=get_policy_class,
)


def agent_to_vector(target_agent, probe_agent):
    # Step 1: sample a dataset from the given target agent.
    dataset = []
    for i in range(20):
        dataset.append(target_agent.workers.local_worker().sample())
    # Concatenate all sampled batches into one SampleBatch.
    dataset = SampleBatch.concat_samples(dataset)
    dataset.shuffle()
    # TODO: the samples may not be uniformly spread, since each batch comes
    #  from a single episode.

    # Step 2: compute the embedding for the target agent via the probe
    # agent.
def validate_config(config):
    tmp_env = MultiAgentEnvWrapper(config["env_config"])
    config["multiagent"]["policies"] = {
        "agent{}".format(i): (
            None, tmp_env.observation_space, tmp_env.action_space, {})
        for i in range(num_agents)
    }
    # Identity mapping: agent "agentN" is trained by policy "agentN".
    config["multiagent"]["policy_mapping_fn"] = lambda x: x
    original_validate(config)


PPOESTrainer = PPOTrainer.with_updates(
    name="PPOES",
    default_config=ppo_es_default_config,
    after_train_result=run_evolution_strategies,
    validate_config=validate_config)

if __name__ == '__main__':
    env_name = "CartPole-v0"
    num_agents = 3
    config = {
        "num_sgd_iter": 2,
        "train_batch_size": 400,
        "update_steps": 1000,
        **get_marl_env_config(env_name, num_agents)
    }
    initialize_ray(test_mode=True, local_mode=True)
    train(PPOESTrainer, config,
def before_train_step(trainer):
    policy = trainer.get_policy()
    if not policy.initialized_policies_pool:

        # Function to call for each worker (both remote and local workers).
        def init_novelty(worker):
            # Function for each policy within one worker.
            def _init_novelty_policy(policy, _):
                policy._lazy_initialize()

            worker.foreach_policy(_init_novelty_policy)

        trainer.workers.foreach_worker(init_novelty)


def validate_config(config):
    validate_config_original(config)
    assert config['model']['custom_model'] == "ActorDoubleCriticNetwork"
    config['model']['custom_options'] = {
        "use_novelty_value_network": config['use_novelty_value_network']
    }


TNBTrainer = PPOTrainer.with_updates(
    name="TNBPPO",
    validate_config=validate_config,
    make_policy_optimizer=choose_policy_optimizer,
    default_config=tnb_default_config,
    before_train_step=before_train_step,
    default_policy=TNBPolicy,
    get_policy_class=lambda _: TNBPolicy)
    trainer.workers.foreach_worker(_init_pool)


IPDPolicy = PPOTFPolicy.with_updates(
    name="IPDPolicy",
    get_default_config=lambda: ipd_default_config,
    before_loss_init=setup_mixins_tnb,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin, AgentPoolMixin
    ])

IPDTrainer = PPOTrainer.with_updates(
    name="IPD",
    default_config=ipd_default_config,
    after_init=after_init,
    default_policy=IPDPolicy)

if __name__ == '__main__':
    from ray import tune
    from toolbox import initialize_ray

    initialize_ray(test_mode=True, local_mode=False)
    env_name = "CartPole-v0"
    config = {
        "num_sgd_iter": 2,
        "env": IPDEnv,
        "env_config": {
            "env_name": env_name,