def custom_ppo():
    return PPOTrainer.with_updates(default_policy=CustomPPOTFPolicy)
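# Hedged usage sketch (assumption, not from the source): with_updates returns a
# new Trainer class, so the result of custom_ppo() is used like PPOTrainer
# itself. CustomPPOTFPolicy must be defined in the enclosing module; the env id
# and config below are placeholders.
import ray

ray.init(ignore_reinit_error=True)
TrainerCls = custom_ppo()
trainer = TrainerCls(env="CartPole-v0", config={"num_workers": 1})
print(trainer.train()["episode_reward_mean"])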
def ray_train(save_in_sub_folder=None, available_cluster_cpus=None,
              available_cluster_gpus=None, LOCAL_MODE=None, config=None,
              **mainkwargs):
    # config = gym.make(train_env_id).config
    subprocess.run(["chmod", "-R", "a+rwx", save_in_sub_folder + "/"])
    # Postprocess the perturbed config to ensure it's still valid
    s3pathname = 's3://datastore-s3/groups/Behavior/Pinaki'
    upload_dir_path = s3pathname + "/" + ray_folder + '/' + InceptcurrentDT
    if save_in_sub_folder is not None:
        local_dir_path = save_in_sub_folder
        # makedirpath(upload_dir_path)
    from ray.rllib.agents.impala.vtrace_policy import VTraceTFPolicy
    if is_predict_only() or LOCAL_MODE:
        delegated_cpus = 1
        delegated_gpus = 0
    else:
        delegated_cpus = available_cluster_cpus - 2
        delegated_gpus = available_cluster_gpus

    impala_config = impala.DEFAULT_CONFIG.copy()
    impala_config["num_gpus"] = 0
    ImpalaTrainer = build_trainer(
        name="IMPALA",
        default_config=impala_config,
        default_policy=VTraceTFPolicy,
        validate_config=impala.impala.validate_config,
        get_policy_class=impala.impala.choose_policy,
        make_workers=impala.impala.defer_make_workers,
        make_policy_optimizer=impala.impala.make_aggregators_and_optimizer,
        mixins=[impala.impala.OverrideDefaultResourceRequest])

    def make_async_optimizer(workers, config):
        return AsyncGradientsOptimizer(workers, grads_per_step=100)

    CustomTrainer = PPOTrainer.with_updates(
        make_policy_optimizer=make_async_optimizer)

    restore_folder = None
    algo = "PPO"  # RL Algorithm of choice
    LOAD_MODEL_FOLDER = config["LOAD_MODEL_FOLDER"]  # Location of previous model (if needed) for training
    # RESTORE_COND = "NONE"
    # RESTORE: Use a previous model to start new training
    # RESTORE_AND_RESUME: Use a previous model to finish previous unfinished training
    # NONE: Start fresh
    # RESTORE_COND = config["RESTORE_COND"]
    RESTORE_COND = "NONE"
    if RESTORE_COND == "RESTORE_AND_RESUME":
        restore_folder, local_restore_path, _ = retrieve_ray_folder_info(
            LOAD_MODEL_FOLDER)
        local_dir = local_restore_path
        resume = True
    elif RESTORE_COND == "RESTORE":
        restore_folder, local_restore_path, _ = retrieve_ray_folder_info(
            LOAD_MODEL_FOLDER)
        local_dir = local_dir_path
        resume = False
    else:
        local_dir = local_dir_path
        resume = False

    checkpoint_freq = int(num_timesteps) // min(int(num_timesteps), 20)
    retrieved_agent_policy = settings.retrieved_agent_policy
    model = config["MODEL"]
    print("delegated_cpus ", delegated_cpus, " delegated_gpus ", delegated_gpus)

    ray_trials = ray.tune.run(
        PPOTrainer,
        name="pygame-ray",
        stop={"training_iteration": int(num_timesteps)},
        checkpoint_freq=checkpoint_freq,
        checkpoint_at_end=True,
        local_dir=local_dir,
        # upload_dir=upload_dir_path,
        verbose=True,
        queue_trials=False,
        resume=resume,
        # scheduler=pbt,
        # trial_executor=RayTrialExecutor(),
        # resources_per_trial={"cpu": delegated_cpus, "gpu": 0},
        restore=restore_folder,
        # **es.DEFAULT_CONFIG,
        **{
            "num_samples": 1,
            "config": {
                "num_gpus_per_worker": 0,
                # "num_cpus_per_worker": 1,
                "num_gpus": delegated_gpus,
                "gamma": 0.85,
                "num_workers": delegated_cpus,
                "num_envs_per_worker": 2,
                "env": train_env_id,
                "remote_worker_envs": False,
                "model": model,
                "ignore_worker_failures": True,
                # "env_config": {
                #     "retrieved_agent_policy": 1,
                # },
                # "callbacks": {
                #     "on_episode_start": ray.tune.function(on_episode_start),
                # },
                # These params are tuned from a fixed starting value.
                # "lambda": 0.95,
                # "clip_param": 0.2,
                # "lr": 1e-4,
                # These params start off randomly drawn from a set.
                # "num_sgd_iter": sample_from(lambda spec: random.choice([10, 20, 30])),
                # "sgd_minibatch_size": sample_from(lambda spec: random.choice([128, 512, 2048])),
                # "train_batch_size": sample_from(lambda spec: random.choice([10000, 20000, 40000])),
            },
        })

    copy_terminal_output_file(
        save_folder=local_dir_path,
        terminal_output_file_name=terminal_output_file_name)
    subprocess.run(["chmod", "-R", "a+rwx", ray_folder + "/"])
# Body of the trainer-level mask-setting helper (presumably contributed by
# AddMaskInfoMixin below): validate mask_dict, then push it to the local
# policy and to every rollout worker.
# Check the input is correct.
exist_mask = self.get_mask()
for name, arr in mask_dict.items():
    assert name in exist_mask
    assert list(arr.shape) == exist_mask[name]
self.get_policy().set_default(mask_dict)
if hasattr(self, "workers"):
    self.workers.foreach_worker(
        lambda w: w.get_policy().set_default(mask_dict))
logger.info("Successfully set mask: {}".format([
    "layer: {}, shape: {}, mean {:.4f}, std {:.4f}.".format(
        name, arr.shape, arr.mean(), arr.std())
    for name, arr in mask_dict.items()
]))
print("Successfully set mask: {}".format([
    "layer: {}, shape: {}, mean {:.4f}, std {:.4f}.".format(
        name, arr.shape, arr.mean(), arr.std())
    for name, arr in mask_dict.items()
]))

# PPOTrainer.with_updates
PPOAgentWithMask = PPOTrainer.with_updates(
    name="PPOWithMask",
    default_config=ppo_agent_default_config_with_mask,
    default_policy=PPOTFPolicyWithMask,
    mixins=[AddMaskInfoMixin])
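# Hedged usage sketch (assumption, not from the source): set_mask is assumed to
# be the mixin method whose body is shown above, and get_mask() is assumed to
# return {layer_name: shape}. The env id and all-ones masks are placeholders.
import numpy as np

agent = PPOAgentWithMask(env="CartPole-v0",
                         config=ppo_agent_default_config_with_mask)
mask_dict = {name: np.ones(shape) for name, shape in agent.get_mask().items()}
agent.set_mask(mask_dict)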
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy, postprocess_ppo_gae


def my_postprocess_ppo_gae(policy, sample_batch, *args, **kwargs):
    # Split the rollout at every intermediate "done" reported in infos and run
    # the stock GAE postprocessing on each completed sub-trajectory.
    if sample_batch.get('infos') is not None:
        idx = [i for i, x in enumerate(sample_batch['infos']) if x['done']]
        if idx:
            idx.append(sample_batch.count)
            sbatch = sample_batch.slice(0, idx[0] + 1)
            sbatch['dones'][-1] = True
            batch = postprocess_ppo_gae(policy, sbatch, *args, **kwargs)
            for s, t in zip(idx[:-1], idx[1:]):
                sbatch = sample_batch.slice(s, t + 1)
                sbatch['dones'][-1] = True
                batch.concat(
                    postprocess_ppo_gae(policy, sbatch, *args, **kwargs))
            return batch
    return postprocess_ppo_gae(policy, sample_batch, *args, **kwargs)


MyPpoPolicy = PPOTFPolicy.with_updates(name="MyPpoTFPolicy",
                                       postprocess_fn=my_postprocess_ppo_gae)

MyPpoTrainer = PPOTrainer.with_updates(name="MyPpoTrainer",
                                       default_policy=MyPpoPolicy)
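# Hedged usage sketch (assumption, not from the source): MyPpoTrainer drops
# into ray.tune exactly like the stock PPOTrainer; the env id, stopping
# criterion, and worker count below are placeholders.
import ray
from ray import tune

ray.init(ignore_reinit_error=True)
tune.run(MyPpoTrainer,
         stop={"training_iteration": 5},
         config={"env": "CartPole-v0", "num_workers": 1})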
# Tail of the postprocess_fn registered below as add_intrinsic_rewards_to_gae:
# bootstrap the value of the last step, add clipped intrinsic rewards, then
# compute GAE advantages.
completed = sample_batch[SampleBatch.DONES][-1]
if completed:
    last_r = 0.0
else:
    next_state = []
    for i in range(policy.num_state_tensors()):
        next_state.append(sample_batch["state_out_{}".format(i)][-1])
    last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1],
                           sample_batch[SampleBatch.ACTIONS][-1],
                           sample_batch[SampleBatch.REWARDS][-1],
                           *next_state)
# print(np.mean(sample_batch['int_rew']), np.std(sample_batch['int_rew']), np.max(sample_batch['int_rew']))
sample_batch[SampleBatch.REWARDS] = sample_batch[SampleBatch.REWARDS] + np.clip(
    sample_batch['int_rew'], 0, 2)
batch = compute_advantages(
    sample_batch,
    last_r,
    policy.config["gamma"],
    policy.config["lambda"],
    use_gae=policy.config["use_gae"])
return batch


PPOTFPolicy_RND = PPOTFPolicy.with_updates(
    name="PPOTFRND",
    postprocess_fn=add_intrinsic_rewards_to_gae,
    extra_action_fetches_fn=add_intrinsic_rews_to_batch)

PPOTrainer_RND = PPOTrainer.with_updates(name="PPO_RND",
                                         default_policy=PPOTFPolicy_RND,
                                         get_policy_class=None)
if policy.config["caps_temporal_reg"] > 0.0: stats_dict["temporal_smoothness"] = policy._mean_temporal_caps_loss if policy.config["caps_spatial_reg"] > 0.0: stats_dict["spatial_smoothness"] = policy._mean_spatial_caps_loss if policy.config["caps_global_reg"] > 0.0: stats_dict["global_smoothness"] = policy._mean_global_caps_loss return stats_dict PPOTorchPolicy = PPOTorchPolicy.with_updates( before_loss_init=ppo_init, loss_fn=ppo_loss, stats_fn=ppo_stats, get_default_config=lambda: DEFAULT_CONFIG, ) def get_policy_class(config: TrainerConfigDict) -> Optional[Type[Policy]]: """ TODO: Write documentation. """ if config["framework"] == "torch": return PPOTorchPolicy return None PPOTrainer = PPOTrainer.with_updates(default_config=DEFAULT_CONFIG, get_policy_class=get_policy_class) __all__ = ["DEFAULT_CONFIG", "PPOTorchPolicy", "PPOTrainer"]
#     # single-agent
#     trainer.workers.local_worker().for_policy(
#         lambda pi: pi.update_kl(fetches["kl"]))
# else:
#
#     def update(pi, pi_id):
#         if pi_id in fetches:
#             pi.update_kl(fetches[pi_id]["kl"])
#         else:
#             logger.debug("No data for {}, not updating kl".format(pi_id))
#
#     # multi-agent
#     trainer.workers.local_worker().foreach_trainable_policy(update)

CustomPPOPolicy = PPOTFPolicy.with_updates(
    name="POPO",
    loss_fn=new_ppo_surrogate_loss,
    postprocess_fn=new_postprocess_ppo_gae,
    stats_fn=new_kl_and_loss_stats,
    extra_action_fetches_fn=new_vf_preds_and_logits_fetches,
    mixins=[
        SetUpConfig, LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin, KLDiffMixin
    ],
    before_loss_init=special_setup_mixins)

KLPPOTrainer = PPOTrainer.with_updates(default_policy=CustomPPOPolicy,
                                       default_config=DEFAULT_CONFIG,
                                       after_optimizer_step=update_kl)
config["entropy_coeff_schedule"]) warmup_steps = config["model"]["custom_options"].get( "warmup_steps", 100000) TransformerLearningRateSchedule.__init__( policy, config["model"]["custom_options"]["transformer"]["num_heads"], warmup_steps) TTFPPOPolicy = PPOTFPolicy.with_updates(name="TTFPPOPolicy", before_loss_init=setup_mixins, mixins=[ TransformerLearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin, ValueNetworkMixin ]) TTFPPOPolicyInfer = PPOTFPolicy.with_updates(name="TTFPPOPolicyInfer", before_loss_init=setup_mixins, mixins=[ LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin, ValueNetworkMixin ]) register_trainable( "TTFPPO", PPOTrainer.with_updates(name="TTFPPOTrainer", get_policy_class=lambda c: TTFPPOPolicy), )