Пример #1
0
    dp_gt = dict(m=2., k=20., d=0.8)  # ground truth
    dp_init = dict(m=1.0, k=24., d=0.4)  # initial guess
    dt = 1 / 50.
    env = OneMassOscillatorSim(dt=dt, max_steps=400)
    env.reset(domain_param=dp_gt)

    # Set up policy
    policy = DummyPolicy(env.spec)

    # Sample
    sampler = ParallelRolloutSampler(env,
                                     policy,
                                     num_workers=1,
                                     min_rollouts=50,
                                     seed=1)
    ros = sampler.sample()

    # Pyro
    pyro.set_rng_seed(1001)
    pyro.enable_validation(True)

    train(
        SVI(
            model=model,
            guide=guide,
            optim=optim.Adam({'lr': 0.01}),
            # optim=optim.SGD({'lr': 0.001, 'momentum': 0.1}),
            loss=Trace_ELBO()),
        rollouts=ros,
        prior=dp_init)
Пример #2
0
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 100.0, max_steps=600)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical("exp_lr_scheduler_gamma",
                                          [None, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy_hparam = dict(
        hidden_sizes=trial.suggest_categorical("hidden_sizes_policy",
                                               [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("hidden_nonlin_policy",
                                      ["to_tanh", "to_relu"])),
    )  # FNN
    # policy_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_policy', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_policy', [1, 2]),
    # )  # LSTM & GRU
    policy = FNNPolicy(spec=env.spec, **policy_hparam)
    # policy = GRUPolicy(spec=env.spec, **policy_hparam)

    # Critic
    vfcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical("hidden_sizes_critic",
                                               [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(
            trial.suggest_categorical("hidden_nonlin_critic",
                                      ["to_tanh", "to_relu"])),
    )
    # vfcn_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_critic', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_critic', [1, 2]),
    # )  # LSTM & GRU
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    # vfcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        batch_size=500,
        gamma=trial.suggest_uniform("gamma_critic", 0.98, 1.0),
        lamda=trial.suggest_uniform("lamda_critic", 0.95, 1.0),
        num_epoch=trial.suggest_int("num_epoch_critic", 1, 10),
        lr=trial.suggest_loguniform("lr_critic", 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical("standardize_adv_critic",
                                                  [False]),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_critic",
                                                [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=250,
        batch_size=500,
        min_steps=trial.suggest_int("num_rollouts_algo", 10, 30) *
        env.max_steps,
        num_epoch=trial.suggest_int("num_epoch_algo", 1, 10),
        eps_clip=trial.suggest_uniform("eps_clip_algo", 0.05, 0.2),
        std_init=trial.suggest_uniform("std_init_algo", 0.5, 1.0),
        lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_algo",
                                                [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    csv_logger = create_csv_step_logger(
        osp.join(study_dir, f"trial_{trial.number}"))
    algo = PPO(osp.join(study_dir, f"trial_{trial.number}"),
               env,
               policy,
               critic,
               **algo_hparam,
               logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env, policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret