예제 #1
0
def create_qqsu_setup():
    """
    Assemble the environments and policies for the `QQubeSwingUpSim` system identification setup.

    The target ("real") environment is a simulation whose pole masses and lengths deviate from the nominal values
    by -10 % or +10 %. The simulated environment is wrapped such that the mean and the standard deviation of these
    four domain parameters can be set via the indices given in the returned mapping.

    :return: tuple of the wrapped simulation environment, the target environment, the environments' hyper-parameters,
             the mapping from indices to domain distribution parameters, the behavioral policy, and the policy
             holding the domain distribution parameters
    """
    # Nominal values of the four domain parameters, and the factors used to perturb them in the target domain
    nominal = {
        "mass_rot_pole": 0.095,
        "mass_pend_pole": 0.024,
        "length_rot_pole": 0.085,
        "length_pend_pole": 0.129,
    }
    real_factor = {
        "mass_rot_pole": 0.9,  # 0.095*0.9 = 0.0855
        "mass_pend_pole": 1.1,  # 0.024*1.1 = 0.0264
        "length_rot_pole": 0.9,  # 0.085*0.9 = 0.0765
        "length_pend_pole": 1.1,  # 0.129*1.1 = 0.1419
    }
    names = list(nominal)

    # Target environment
    env_hparams = {"dt": 1 / 100.0, "max_steps": 600}
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = {name: nominal[name] * real_factor[name] for name in names}

    # Simulated environment with one normally distributed domain parameter per entry of the table above
    env_sim = QQubeSwingUpSim(**env_hparams)
    sim_distr_params = [
        NormalDomainParam(name=name, mean=0.0, std=1e-9, clip_lo=1e-3)
        for name in names
    ]
    env_sim = DomainRandWrapperLive(env_sim, DomainRandomizer(*sim_distr_params))

    # Two consecutive indices per domain parameter: first the mean, then the standard deviation
    dp_map = {}
    for name in names:
        for distr_param in ("mean", "std"):
            dp_map[len(dp_map)] = (name, distr_param)

    # All entries are flagged; an alternative would be to flag only the std entries, i.e.
    # [False, True, False, True, False, True, False, True]
    trafo_mask = [True] * len(dp_map)
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec)

    # Prior over the domain parameters: centered at the nominal values with a std of 10 % of the nominal value
    prior_distr_params = [
        NormalDomainParam(name=name, mean=nominal[name], std=nominal[name] / 10)
        for name in names
    ]
    prior = DomainRandomizer(*prior_distr_params)
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map,
        trafo_mask=trafo_mask,
        prior=prior,
        scale_params=False,
    )

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy
예제 #2
0
def create_bob_setup():
    """
    Assemble the environments and policies for the `BallOnBeamSim` system identification setup.

    Only the domain parameter `g` is considered: the target environment uses `g=10.81`, while the prior is
    centered at `8.81`. The simulated environment is wrapped such that the mean and the standard deviation of `g`
    can be set via the indices given in the returned mapping.

    .. note::
        The original author left disabled entries for `l_beam` (normal distribution) and `ang_offset` (uniform
        distribution); they would have to be added consistently to the target domain, the randomizer, the mapping,
        the prior, and the transformation mask.

    :return: tuple of the wrapped simulation environment, the target environment, the environments' hyper-parameters,
             the mapping from indices to domain distribution parameters, the behavioral policy, and the policy
             holding the domain distribution parameters
    """
    # Target environment
    env_hparams = {'dt': 1 / 100., 'max_steps': 500}
    env_real = BallOnBeamSim(**env_hparams)
    env_real.domain_param = {'g': 10.81}

    # Simulated environment with a normally distributed `g`
    env_sim = BallOnBeamSim(**env_hparams)
    sim_randomizer = DomainRandomizer(
        NormalDomainParam(name='g', mean=0, std=1e-12),
    )
    env_sim = DomainRandWrapperLive(env_sim, sim_randomizer)

    # Index 0 addresses the mean of `g`, index 1 its standard deviation
    dp_map = {0: ('g', 'mean'), 1: ('g', 'std')}
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    feats = FeatureStack([identity_feat, sin_feat])
    behavior_policy = LinearPolicy(env_sim.spec, feats=feats)
    # Fixed policy parameter values
    behavior_policy.param_values = to.tensor([
        3.8090, -3.8036, -1.0786, -2.4510,
        -0.9875, -1.3252, 3.1503, 1.4443,
    ])

    # Prior over `g` with a std of 10 % of its mean
    prior = DomainRandomizer(
        NormalDomainParam(name='g', mean=8.81, std=8.81 / 10),
    )
    # Both entries are flagged; an alternative would be to flag only the std entry
    trafo_mask = [True, True]
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map,
        trafo_mask=trafo_mask,
        prior=prior,
        scale_params=True,
    )

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy
예제 #3
0
        lr_scheduler_hparam=dict(gamma=0.999),
    )
    subrtn_policy = PPO(ex_dir, env_sim, behav_policy, critic,
                        **subrtn_policy_hparam)

    # Subroutine for system identification
    prior = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.095, std=0.095 / 10),
        NormalDomainParam(name="mass_pend_pole", mean=0.024, std=0.024 / 10),
        NormalDomainParam(name="length_rot_pole", mean=0.085, std=0.085 / 10),
        NormalDomainParam(name="length_pend_pole", mean=0.129, std=0.129 / 10),
    )
    ddp_policy_hparam = dict(mapping=dp_map,
                             trafo_mask=trafo_mask,
                             scale_params=False)
    ddp_policy = DomainDistrParamPolicy(prior=prior, **ddp_policy_hparam)
    subsubrtn_distr_hparam = dict(
        max_iter=10,
        pop_size=None,
        eta_mean=1.0,
        num_init_states_per_domain=1,
        expl_std_init=5e-2,
        expl_std_min=1e-4,
        num_workers=num_workers,
    )
    subsubrtn_distr = NES(ex_dir, env_sim, ddp_policy,
                          **subsubrtn_distr_hparam)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1.0, 1.0, 1.0, 1.0, 2.0, 2.0],
        num_rollouts_per_distr=len(dp_map) * 10,
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    One trial sets up a `SimOpt` run on the `QQubeSwingUpSim` environment (policy improvement via `PPO`, system
    identification via `SysIdViaEpisodicRL` wrapping `CEM`), trains it, and evaluates the resulting policy on the
    target environment.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value, i.e. the mean undiscounted return over the evaluation rollouts
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environments
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    # The target domain deviates from the values used as prior means below by -10 % or +10 %
    env_real.domain_param = dict(
        Mr=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        Mp=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        Lr=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        Lp=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    # NOTE(review): mean and std given here are presumably placeholders that get overwritten through the
    # MetaDomainRandWrapper below -- confirm
    randomizer = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Mp', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lp', mean=0., std=1e6, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    # Two consecutive indices per domain parameter: first the mean, then the standard deviation
    dp_map = {
        0: ('Mr', 'mean'),
        1: ('Mr', 'std'),
        2: ('Mp', 'mean'),
        3: ('Mp', 'std'),
        4: ('Lr', 'mean'),
        5: ('Lr', 'std'),
        6: ('Lp', 'mean'),
        7: ('Lp', 'std')
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9885,
        lamda=0.9648,
        num_epoch=2,
        batch_size=500,
        standardize_adv=False,
        lr=5.792e-4,
        max_grad_norm=1.,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=200,
        min_steps=3 * 23 * env_sim.max_steps,
        num_epoch=7,
        eps_clip=0.0744,
        batch_size=500,
        std_init=0.9074,
        lr=3.446e-04,
        max_grad_norm=1.,
        num_workers=1,
    )
    subrtn_policy = PPO(study_dir, env_sim, behav_policy, critic,
                        **subrtn_policy_hparam)

    # Subroutine for system identification
    # The prior's std is the prior's mean divided by a tuned denominator
    prior_std_denom = trial.suggest_uniform('prior_std_denom', 5, 20)
    prior = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0.095, std=0.095 / prior_std_denom),
        NormalDomainParam(name='Mp', mean=0.024, std=0.024 / prior_std_denom),
        NormalDomainParam(name='Lr', mean=0.085, std=0.085 / prior_std_denom),
        NormalDomainParam(name='Lp', mean=0.129, std=0.129 / prior_std_denom),
    )
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map,
        trafo_mask=trafo_mask,
        prior=prior,
        scale_params=trial.suggest_categorical('ddp_policy_scale_params',
                                               [True, False]),
    )
    # Single-element categorical suggestions are not searched over, but still show up in the trial's parameters
    subsubrtn_distr_hparam = dict(
        max_iter=trial.suggest_categorical('subsubrtn_distr_max_iter', [20]),
        pop_size=trial.suggest_int('pop_size', 50, 500),
        num_rollouts=1,
        num_is_samples=trial.suggest_int('num_is_samples', 5, 20),
        expl_std_init=trial.suggest_loguniform('expl_std_init', 1e-3, 1e-1),
        expl_std_min=trial.suggest_categorical('expl_std_min', [1e-4]),
        # Needs its own parameter name: suggesting under an already used name within the same trial yields the
        # previously suggested value, i.e. this would always be equal to `expl_std_init`
        extra_expl_std_init=trial.suggest_loguniform('extra_expl_std_init',
                                                     1e-3, 1e-1),
        extra_expl_decay_iter=trial.suggest_int('extra_expl_decay_iter', 0,
                                                10),
        num_workers=1,
    )
    # The same logger instance is shared by the system identification subroutine and the overall algorithm
    csv_logger = create_csv_step_logger(
        osp.join(study_dir, f'trial_{trial.number}'))
    subsubrtn_distr = CEM(study_dir,
                          env_sim,
                          ddp_policy,
                          **subsubrtn_distr_hparam,
                          logger=csv_logger)
    # The last two observation dimensions share one tuned weight, the first four are weighted with 1
    obs_vel_weight = trial.suggest_loguniform('obs_vel_weight', 1, 100)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, obs_vel_weight, obs_vel_weight],
        num_rollouts_per_distr=trial.suggest_int('num_rollouts_per_distr', 20,
                                                 100),
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behav_policy,
                                      **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=trial.suggest_categorical('algo_max_iter', [10]),
        num_eval_rollouts=trial.suggest_categorical('algo_num_eval_rollouts',
                                                    [5]),
        warmstart=trial.suggest_categorical('algo_warmstart', [True]),
        thold_succ_subrtn=trial.suggest_categorical('algo_thold_succ_subrtn',
                                                    [50]),
        subrtn_snapshot_mode='latest',
    )
    algo = SimOpt(study_dir,
                  env_sim,
                  env_real,
                  subrtn_policy,
                  subrtn_distr,
                  **algo_hparam,
                  logger=csv_logger)

    # Jeeeha
    # Use the seed passed to this function instead of relying on a module-level `args` object
    algo.train(seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env_real, algo.policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    # Average over the rollouts that were actually returned; `min_rollouts` is presumably only a lower bound on
    # their number -- verify against the sampler
    mean_ret = sum(r.undiscounted_return() for r in ros) / len(ros)

    return mean_ret