Example #1
def custom_ppo():
    # CustomPPOTFPolicy is a custom TF policy defined elsewhere in the source module.
    return PPOTrainer.with_updates(default_policy=CustomPPOTFPolicy)
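A minimal usage sketch of the trainer class this helper returns (assumptions: CustomPPOTFPolicy is built with PPOTFPolicy.with_updates, as in the later examples, and "CartPole-v0" merely stands in for the real environment id):

import ray
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy

# Hypothetical stand-in for the custom policy referenced above.
CustomPPOTFPolicy = PPOTFPolicy.with_updates(name="CustomPPOTFPolicy")

ray.init()
CustomPPO = custom_ppo()  # the patched trainer class
trainer = CustomPPO(env="CartPole-v0", config={"num_workers": 0})
result = trainer.train()  # one training iteration; returns a result dict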
Example #2
def ray_train(save_in_sub_folder=None,
              available_cluster_cpus=None,
              available_cluster_gpus=None,
              LOCAL_MODE=None,
              config=None,
              **mainkwargs):
    #config = gym.make(train_env_id).config

    s3pathname = 's3://datastore-s3/groups/Behavior/Pinaki'
    upload_dir_path = s3pathname + "/" + ray_folder + '/' + InceptcurrentDT
    if save_in_sub_folder is not None:
        local_dir_path = save_in_sub_folder
        # Make the results folder readable/writable for other cluster users.
        subprocess.run(["chmod", "-R", "a+rwx", save_in_sub_folder + "/"])
        # makedirpath(upload_dir_path)

    from ray.rllib.agents.impala.vtrace_policy import VTraceTFPolicy

    if is_predict_only() or LOCAL_MODE:
        delegated_cpus = 1
        delegated_gpus = 0
    else:
        delegated_cpus = available_cluster_cpus - 2
        delegated_gpus = available_cluster_gpus

    impala_config = impala.DEFAULT_CONFIG.copy()
    impala_config["num_gpus"] = 0
    ImpalaTrainer = build_trainer(
        name="IMPALA",
        default_config=impala_config,
        default_policy=VTraceTFPolicy,
        validate_config=impala.impala.validate_config,
        get_policy_class=impala.impala.choose_policy,
        make_workers=impala.impala.defer_make_workers,
        make_policy_optimizer=impala.impala.make_aggregators_and_optimizer,
        mixins=[impala.impala.OverrideDefaultResourceRequest])

    def make_async_optimizer(workers, config):
        return AsyncGradientsOptimizer(workers, grads_per_step=100)

    CustomTrainer = PPOTrainer.with_updates(
        make_policy_optimizer=make_async_optimizer)

    restore_folder = None
    algo = "PPO"  # RL Algorithm of choice
    # Location of the previous model (if needed) for training
    LOAD_MODEL_FOLDER = config["LOAD_MODEL_FOLDER"]
    # RESTORE_COND options:
    #   RESTORE: use a previous model to start a new training run
    #   RESTORE_AND_RESUME: use a previous model to finish a previous unfinished run
    #   NONE: start fresh

    # RESTORE_COND = config["RESTORE_COND"]
    RESTORE_COND = "NONE"
    if RESTORE_COND == "RESTORE_AND_RESUME":
        restore_folder, local_restore_path, _ = retrieve_ray_folder_info(
            LOAD_MODEL_FOLDER)
        local_dir = local_restore_path
        resume = True
    elif RESTORE_COND == "RESTORE":
        restore_folder, local_restore_path, _ = retrieve_ray_folder_info(
            LOAD_MODEL_FOLDER)
        local_dir = local_dir_path
        resume = False
    else:
        local_dir = local_dir_path
        resume = False

    checkpoint_freq = int(num_timesteps) // min(int(num_timesteps), 20)

    retrieved_agent_policy = settings.retrieved_agent_policy

    model = config["MODEL"]
    print("delegated_cpus ", delegated_cpus, " delegated_gpus ",
          delegated_gpus)

    ray_trials = ray.tune.run(
        PPOTrainer,
        name="pygame-ray",
        stop={"training_iteration": int(num_timesteps)},
        checkpoint_freq=checkpoint_freq,
        checkpoint_at_end=True,
        local_dir=local_dir,
        # upload_dir=upload_dir_path,
        verbose=True,
        queue_trials=False,
        resume=resume,
        # scheduler=pbt,
        # trial_executor=RayTrialExecutor(),
        # resources_per_trial={"cpu": delegated_cpus, "gpu": 0},
        restore=restore_folder,
        #**es.DEFAULT_CONFIG,
        **{
            "num_samples": 1,
            "config": {
                "num_gpus_per_worker": 0,
                #"num_cpus_per_worker": 1,
                "num_gpus": delegated_gpus,
                "gamma": 0.85,
                "num_workers": delegated_cpus,
                "num_envs_per_worker": 2,
                "env": train_env_id,
                "remote_worker_envs": False,
                "model": model,
                "ignore_worker_failures": True,
                #"env_config": {
                #                "retrieved_agent_policy": 1,
                #              },
                #"callbacks": {
                #  "on_episode_start": ray.tune.function(on_episode_start),
                #             },
                # These params are tuned from a fixed starting value.
                # "lambda": 0.95,
                # "clip_param": 0.2,
                # "lr": 1e-4,
                # These params start off randomly drawn from a set.
                # "num_sgd_iter": sample_from(lambda spec: random.choice([10, 20, 30])),
                # "sgd_minibatch_size": sample_from(lambda spec: random.choice([128, 512, 2048])),
                # "train_batch_size": sample_from(lambda spec: random.choice([10000, 20000, 40000])),
            },
        })
    copy_terminal_output_file(
        save_folder=local_dir_path,
        terminal_output_file_name=terminal_output_file_name)
    subprocess.run(["chmod", "-R", "a+rwx", ray_folder + "/"])
Example #3
        # Check the input is correct.
        exist_mask = self.get_mask()
        for name, arr in mask_dict.items():
            assert name in exist_mask
            assert list(arr.shape) == exist_mask[name]

        self.get_policy().set_default(mask_dict)
        if hasattr(self, "workers"):
            self.workers.foreach_worker(
                lambda w: w.get_policy().set_default(mask_dict))

        logger.info("Successfully set mask: {}".format([
            "layer: {}, shape: {}, mean {:.4f}, std {:.4f}.".format(
                name, arr.shape, arr.mean(), arr.std())
            for name, arr in mask_dict.items()
        ]))
        print("Successfully set mask: {}".format([
            "layer: {}, shape: {}, mean {:.4f}, std {:.4f}.".format(
                name, arr.shape, arr.mean(), arr.std())
            for name, arr in mask_dict.items()
        ]))


# PPOTrainer.with_updates

PPOAgentWithMask = PPOTrainer.with_updates(
    name="PPOWithMask",
    default_config=ppo_agent_default_config_with_mask,
    default_policy=PPOTFPolicyWithMask,
    mixins=[AddMaskInfoMixin])
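Classes passed via mixins are plain Python classes whose methods get grafted onto the trainer that with_updates generates; the fragment above is the body of one such method. A sketch of how an AddMaskInfoMixin of this shape could be structured (illustrative only; the policy-side get_mask_info helper is an assumption, while set_default is taken from the fragment above):

class AddMaskInfoMixin:
    """Exposes mask helpers on the trainer built by PPOTrainer.with_updates."""

    def get_mask(self):
        # The policy owns the mask layout: a dict mapping layer name -> shape.
        return self.get_policy().get_mask_info()

    def set_mask(self, mask_dict):
        # Check the input is correct (as in the fragment above).
        exist_mask = self.get_mask()
        for name, arr in mask_dict.items():
            assert name in exist_mask
            assert list(arr.shape) == exist_mask[name]

        # Push the mask to the local policy and to every rollout worker.
        self.get_policy().set_default(mask_dict)
        if hasattr(self, "workers"):
            self.workers.foreach_worker(
                lambda w: w.get_policy().set_default(mask_dict))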
Example #4
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy, postprocess_ppo_gae


def my_postprocess_ppo_gae(policy, sample_batch, *args, **kwargs):
    if sample_batch.get('infos') is not None:
        idx = [i for i, x in enumerate(sample_batch['infos']) if x['done']]
        if idx:
            idx.append(sample_batch.count)
            sbatch = sample_batch.slice(0, idx[0] + 1)
            sbatch['dones'][-1] = True
            batch = postprocess_ppo_gae(policy, sbatch, *args, **kwargs)
            for s, t in zip(idx[:-1], idx[1:]):
                sbatch = sample_batch.slice(s, t + 1)
                sbatch['dones'][-1] = True
                batch = batch.concat(
                    postprocess_ppo_gae(policy, sbatch, *args, **kwargs))
            return batch
    return postprocess_ppo_gae(policy, sample_batch, *args, **kwargs)


MyPpoPolicy = PPOTFPolicy.with_updates(name="MyPpoTFPolicy",
                                       postprocess_fn=my_postprocess_ppo_gae)

MyPpoTrainer = PPOTrainer.with_updates(name="MyPpoTrainer",
                                       default_policy=MyPpoPolicy)
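A usage sketch for the patched trainer (assumes the environment writes a 'done' flag into its info dict, which is what the custom postprocessor keys on; the env id below is only a stand-in):

import ray
from ray import tune

ray.init()
tune.run(
    MyPpoTrainer,
    stop={"training_iteration": 10},
    config={
        "env": "CartPole-v0",  # stand-in; the real env must emit info["done"]
        "num_workers": 1,
        "batch_mode": "complete_episodes",  # keep whole episodes per sample batch
    })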
Example #5
    completed = sample_batch[SampleBatch.DONES][-1]
    if completed:
        last_r = 0.0
    else:
        next_state = []
        for i in range(policy.num_state_tensors()):
            next_state.append(sample_batch["state_out_{}".format(i)][-1])
        last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1],
                               sample_batch[SampleBatch.ACTIONS][-1],
                               sample_batch[SampleBatch.REWARDS][-1],
                               *next_state)

    # print(np.mean(sample_batch['int_rew']), np.std(sample_batch['int_rew']), np.max(sample_batch['int_rew']))
    # Add the clipped intrinsic (RND) reward on top of the environment reward.
    sample_batch[SampleBatch.REWARDS] = (
        sample_batch[SampleBatch.REWARDS] + np.clip(sample_batch['int_rew'], 0, 2))
    batch = compute_advantages(
        sample_batch,
        last_r,
        policy.config["gamma"],
        policy.config["lambda"],
        use_gae=policy.config["use_gae"])
    return batch

PPOTFPolicy_RND = PPOTFPolicy.with_updates(
    name="PPOTFRND",
    postprocess_fn=add_intrinsic_rewards_to_gae,
    extra_action_fetches_fn=add_intrinsic_rews_to_batch
)

PPOTrainer_RND = PPOTrainer.with_updates(
    name="PPO_RND",
    default_policy=PPOTFPolicy_RND,
    get_policy_class=None)
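The 'int_rew' column consumed by the postprocessor has to be produced at action-computation time by the extra_action_fetches_fn; a minimal sketch of what add_intrinsic_rews_to_batch could look like, assuming the policy's model exposes a hypothetical intrinsic-reward tensor:

def add_intrinsic_rews_to_batch(policy):
    # Fetches returned here are evaluated with every action and show up as
    # columns of the sample batch, i.e. sample_batch["int_rew"] above.
    return {"int_rew": policy.model.intrinsic_reward}  # hypothetical attribute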
Example #6
    if policy.config["caps_temporal_reg"] > 0.0:
        stats_dict["temporal_smoothness"] = policy._mean_temporal_caps_loss
    if policy.config["caps_spatial_reg"] > 0.0:
        stats_dict["spatial_smoothness"] = policy._mean_spatial_caps_loss
    if policy.config["caps_global_reg"] > 0.0:
        stats_dict["global_smoothness"] = policy._mean_global_caps_loss

    return stats_dict


PPOTorchPolicy = PPOTorchPolicy.with_updates(
    before_loss_init=ppo_init,
    loss_fn=ppo_loss,
    stats_fn=ppo_stats,
    get_default_config=lambda: DEFAULT_CONFIG,
)


def get_policy_class(config: TrainerConfigDict) -> Optional[Type[Policy]]:
    """Return the patched torch policy when the torch framework is selected,
    otherwise fall back to the trainer's default policy."""
    if config["framework"] == "torch":
        return PPOTorchPolicy
    return None


PPOTrainer = PPOTrainer.with_updates(default_config=DEFAULT_CONFIG,
                                     get_policy_class=get_policy_class)

__all__ = ["DEFAULT_CONFIG", "PPOTorchPolicy", "PPOTrainer"]
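Since get_policy_class only resolves the torch case, the patched trainer is meant to be run with "framework": "torch"; a brief usage sketch (the env id is a stand-in):

import ray
from ray import tune

ray.init()
tune.run(
    PPOTrainer,  # the patched trainer exported above
    stop={"training_iteration": 5},
    config={
        "env": "Pendulum-v0",  # stand-in env id
        "framework": "torch",
        "num_workers": 1,
    })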
Example #7
    #     # single-agent
    #     trainer.workers.local_worker().for_policy(
    #         lambda pi: pi.update_kl(fetches["kl"]))
    # else:
    #
    #     def update(pi, pi_id):
    #         if pi_id in fetches:
    #             pi.update_kl(fetches[pi_id]["kl"])
    #         else:
    #             logger.debug("No data for {}, not updating kl".format(pi_id))
    #
    #     # multi-agent
    #     trainer.workers.local_worker().foreach_trainable_policy(update)


CustomPPOPolicy = PPOTFPolicy.with_updates(
    name="POPO",
    loss_fn=new_ppo_surrogate_loss,
    postprocess_fn=new_postprocess_ppo_gae,
    stats_fn=new_kl_and_loss_stats,
    extra_action_fetches_fn=new_vf_preds_and_logits_fetches,
    mixins=[
        SetUpConfig, LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin, KLDiffMixin
    ],
    before_loss_init=special_setup_mixins)

KLPPOTrainer = PPOTrainer.with_updates(default_policy=CustomPPOPolicy,
                                       default_config=DEFAULT_CONFIG,
                                       after_optimizer_step=update_kl)
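The commented-out lines above are the body of the update_kl hook passed as after_optimizer_step; restored into a complete callback (matching the stock RLlib PPO hook of this era) it reads:

def update_kl(trainer, fetches):
    if "kl" in fetches:
        # single-agent
        trainer.workers.local_worker().for_policy(
            lambda pi: pi.update_kl(fetches["kl"]))
    else:

        def update(pi, pi_id):
            if pi_id in fetches:
                pi.update_kl(fetches[pi_id]["kl"])
            else:
                logger.debug("No data for {}, not updating kl".format(pi_id))

        # multi-agent
        trainer.workers.local_worker().foreach_trainable_policy(update)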
Example #8
                                  config["entropy_coeff_schedule"])
    warmup_steps = config["model"]["custom_options"].get(
        "warmup_steps", 100000)
    TransformerLearningRateSchedule.__init__(
        policy, config["model"]["custom_options"]["transformer"]["num_heads"],
        warmup_steps)


TTFPPOPolicy = PPOTFPolicy.with_updates(name="TTFPPOPolicy",
                                        before_loss_init=setup_mixins,
                                        mixins=[
                                            TransformerLearningRateSchedule,
                                            EntropyCoeffSchedule, KLCoeffMixin,
                                            ValueNetworkMixin
                                        ])

TTFPPOPolicyInfer = PPOTFPolicy.with_updates(name="TTFPPOPolicyInfer",
                                             before_loss_init=setup_mixins,
                                             mixins=[
                                                 LearningRateSchedule,
                                                 EntropyCoeffSchedule,
                                                 KLCoeffMixin,
                                                 ValueNetworkMixin
                                             ])

register_trainable(
    "TTFPPO",
    PPOTrainer.with_updates(name="TTFPPOTrainer",
                            get_policy_class=lambda c: TTFPPOPolicy),
)
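Once registered via register_trainable, the trainer can be launched by name (a sketch; the env id and model options are assumptions, though setup_mixins above does expect model.custom_options.transformer.num_heads and, optionally, warmup_steps):

from ray import tune

tune.run(
    "TTFPPO",
    stop={"training_iteration": 100},
    config={
        "env": "CartPole-v0",  # stand-in env id
        "model": {
            "custom_options": {
                "warmup_steps": 100000,
                "transformer": {"num_heads": 8},  # hypothetical value
            },
        },
    })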