Пример #1
0
def test_dr_wrapper_live_bob(env):
    param_init = env.domain_param
    randomizer = create_default_randomizer(env)
    wrapper = DomainRandWrapperLive(env, randomizer)
    # So far no randomization happened, thus the parameter should be equal
    assert env.domain_param == param_init

    # Randomize 10 times 1 new parameter set
    for _ in range(10):
        param_old = wrapper.domain_param
        wrapper.reset()
        assert param_old != wrapper.domain_param
Пример #2
0
def eval_randomized_domain(
        pool: SamplerPool, env: SimEnv, randomizer: DomainRandomizer,
        policy: Policy, init_states: List[np.ndarray]) -> List[StepSequence]:
    """
    Evaluate a policy in a randomized domain.

    :param pool: parallel sampler
    :param env: environment to evaluate in
    :param randomizer: randomizer used to sample random domain instances, inherited from `DomainRandomizer`
    :param policy: policy to evaluate
    :param init_states: initial states of the environment which will be fixed if not set to `None`
    :return: list of rollouts
    """
    # Randomize the environments
    env = remove_all_dr_wrappers(env)
    env = DomainRandWrapperLive(env, randomizer)

    pool.invoke_all(_ps_init, pickle.dumps(env), pickle.dumps(policy))

    # Run with progress bar
    with tqdm(leave=False, file=sys.stdout, unit="rollouts",
              desc="Sampling") as pb:
        return pool.run_map(
            functools.partial(_ps_run_one_init_state, eval=True), init_states,
            pb)
Пример #3
0
def create_qqsu_setup():
    # Environments
    env_hparams = dict(dt=1 / 100.0, max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        mass_rot_pole=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        mass_pend_pole=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        length_rot_pole=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        length_pend_pole=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    randomizer = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
        NormalDomainParam(name="mass_pend_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
        NormalDomainParam(name="length_rot_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
        NormalDomainParam(name="length_pend_pole",
                          mean=0.0,
                          std=1e-9,
                          clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ("mass_rot_pole", "mean"),
        1: ("mass_rot_pole", "std"),
        2: ("mass_pend_pole", "mean"),
        3: ("mass_pend_pole", "std"),
        4: ("length_rot_pole", "mean"),
        5: ("length_rot_pole", "std"),
        6: ("length_pend_pole", "mean"),
        7: ("length_pend_pole", "std"),
    }
    # trafo_mask = [False, True, False, True, False, True, False, True]
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec)
    prior = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.095, std=0.095 / 10),
        NormalDomainParam(name="mass_pend_pole", mean=0.024, std=0.024 / 10),
        NormalDomainParam(name="length_rot_pole", mean=0.085, std=0.085 / 10),
        NormalDomainParam(name="length_pend_pole", mean=0.129, std=0.129 / 10),
    )
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map,
                                        trafo_mask=trafo_mask,
                                        prior=prior,
                                        scale_params=False)

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy
Пример #4
0
def test_param_expl_sampler(
    env: SimEnv,
    policy: Policy,
    num_init_states_per_domain: int,
    fixed_init_state: bool,
    num_domains: int,
    num_workers: int,
):
    pyrado.set_seed(0)

    # Add randomizer
    pert = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, pert)

    # Create the sampler
    sampler = ParameterExplorationSampler(env,
                                          policy,
                                          num_init_states_per_domain,
                                          num_domains,
                                          num_workers=num_workers)

    # Use some random parameters
    num_ps = 7
    params = to.rand(num_ps, policy.num_param)

    if fixed_init_state:
        # Sample a custom init state
        init_states = [env.init_space.sample_uniform()
                       ] * num_init_states_per_domain
    else:
        # Let the sampler forward to the env to randomly sample an init state
        init_states = None

    # Do the sampling
    samples = sampler.sample(param_sets=params, init_states=init_states)

    # Check if the correct number of rollouts has been sampled
    assert num_ps == len(samples)
    num_rollouts_per_param = num_init_states_per_domain * num_domains
    assert num_ps * num_rollouts_per_param == samples.num_rollouts
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for idx in range(num_rollouts_per_param):
        # Use the first parameter set as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[idx]

        # Iterate through others
        for ops in piter:
            other_ro = ops.rollouts[idx]
            # Compare domain params
            assert pivot.rollout_info["domain_param"] == other_ro.rollout_info[
                "domain_param"]
            # Compare first observation a.k.a. init state
            assert pivot[0].observation == pytest.approx(
                other_ro[0].observation)
Пример #5
0
def test_sprl(ex_dir, env: SimEnv, optimize_mean: bool):
    pyrado.set_seed(0)

    env = ActNormWrapper(env)
    env_sprl_params = [
        dict(
            name="gravity_const",
            target_mean=to.tensor([9.81]),
            target_cov_chol_flat=to.tensor([1.0]),
            init_mean=to.tensor([9.81]),
            init_cov_chol_flat=to.tensor([0.05]),
        )
    ]
    radnomizer = DomainRandomizer(
        *[SelfPacedDomainParam(**p) for p in env_sprl_params])
    env = DomainRandWrapperLive(env, randomizer=radnomizer)

    policy = FNNPolicy(env.spec, hidden_sizes=[64, 64], hidden_nonlin=to.tanh)

    vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.relu)
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9844534412010116,
        lamda=0.9710614403461155,
        num_epoch=10,
        batch_size=150,
        standardize_adv=False,
        lr=0.00016985313083236645,
    )
    critic = GAE(vfcn, **critic_hparam)

    subrtn_hparam = dict(
        max_iter=1,
        eps_clip=0.12648736789309026,
        min_steps=10 * env.max_steps,
        num_epoch=3,
        batch_size=150,
        std_init=0.7573286998997557,
        lr=6.999956625305722e-04,
        max_grad_norm=1.0,
        num_workers=1,
    )

    algo_hparam = dict(
        kl_constraints_ub=8000,
        performance_lower_bound=500,
        std_lower_bound=0.4,
        kl_threshold=200,
        max_iter=1,
        optimize_mean=optimize_mean,
    )

    algo = SPRL(env, PPO(ex_dir, env, policy, critic, **subrtn_hparam),
                **algo_hparam)
    algo.train(snapshot_mode="latest")
    assert algo.curr_iter == algo.max_iter
Пример #6
0
def test_cuda_sampling_w_dr(env: SimEnv, policy: Policy, num_workers: int):
    randomizer = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, randomizer)

    sampler = ParallelRolloutSampler(env,
                                     policy,
                                     num_workers=num_workers,
                                     min_rollouts=4)
    samples = sampler.sample()

    assert samples is not None
Пример #7
0
def test_cuda_sampling_w_dr(default_bob, bob_pert):
    # Add randomizer
    env = DomainRandWrapperLive(default_bob, bob_pert)

    # Use a simple policy
    policy = FNNPolicy(env.spec, hidden_sizes=[8], hidden_nonlin=to.tanh, use_cuda=True)

    # Create the sampler
    sampler = ParallelSampler(env, policy, num_envs=2, min_rollouts=10)

    samples = sampler.sample()
    assert samples is not None
Пример #8
0
def test_bayrn_power(ex_dir, env: SimEnv, bayrn_hparam: dict):
    pyrado.set_seed(0)

    # Environments and domain randomization
    env_real = deepcopy(env)
    env_sim = DomainRandWrapperLive(env, create_zero_var_randomizer(env))
    dp_map = create_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)
    env_real.domain_param = dict(mass_pend_pole=0.024 * 1.1,
                                 mass_rot_pole=0.095 * 1.1)
    env_real = wrap_like_other_env(env_real, env_sim)

    # Policy and subroutine
    policy_hparam = dict(energy_gain=0.587, ref_energy=0.827)
    policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec, **policy_hparam)
    subrtn_hparam = dict(
        max_iter=1,
        pop_size=8,
        num_init_states_per_domain=1,
        num_is_samples=4,
        expl_std_init=0.1,
        num_workers=1,
    )
    subrtn = PoWER(ex_dir, env_sim, policy, **subrtn_hparam)

    # Set the boundaries for the GP
    dp_nom = inner_env(env_sim).get_nominal_domain_param()
    ddp_space = BoxSpace(
        bound_lo=np.array([
            0.8 * dp_nom["mass_pend_pole"], 1e-8,
            0.8 * dp_nom["mass_rot_pole"], 1e-8
        ]),
        bound_up=np.array([
            1.2 * dp_nom["mass_pend_pole"], 1e-7,
            1.2 * dp_nom["mass_rot_pole"], 1e-7
        ]),
    )

    # Create algorithm and train
    algo = BayRn(ex_dir,
                 env_sim,
                 env_real,
                 subrtn,
                 ddp_space,
                 **bayrn_hparam,
                 num_workers=1)
    algo.train()

    assert algo.curr_iter == algo.max_iter or algo.stopping_criterion_met()
Пример #9
0
def test_pddr(ex_dir, env: SimEnv, policy, algo_hparam):
    pyrado.set_seed(0)

    # Create algorithm and train
    teacher_policy = deepcopy(policy)
    critic = GAE(
        vfcn=FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                       hidden_sizes=[16, 16],
                       hidden_nonlin=to.tanh))
    teacher_algo_hparam = dict(critic=critic, min_steps=1500, max_iter=2)
    teacher_algo = PPO

    # Wrapper
    randomizer = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, randomizer)

    # Subroutine
    algo_hparam = dict(
        max_iter=2,
        min_steps=env.max_steps,
        std_init=0.15,
        num_epochs=10,
        num_teachers=2,
        teacher_policy=teacher_policy,
        teacher_algo=teacher_algo,
        teacher_algo_hparam=teacher_algo_hparam,
        num_workers=1,
    )

    algo = PDDR(ex_dir, env, policy, **algo_hparam)

    algo.train()

    assert algo.curr_iter == algo.max_iter

    # Save and load
    algo.save_snapshot(meta_info=None)
    algo_loaded = Algorithm.load_snapshot(load_dir=ex_dir)
    assert isinstance(algo_loaded, Algorithm)
    policy_loaded = algo_loaded.policy

    # Check
    assert all(algo.policy.param_values == policy_loaded.param_values)

    # Load the experiment. Since we did not save any hyper-parameters, we ignore the errors when loading.
    env, policy, extra = load_experiment(ex_dir)
    assert isinstance(env, Env)
    assert isinstance(policy, Policy)
    assert isinstance(extra, dict)
Пример #10
0
def create_bob_setup():
    # Environments
    env_hparams = dict(dt=1 / 100., max_steps=500)
    env_real = BallOnBeamSim(**env_hparams)
    env_real.domain_param = dict(
        # l_beam=1.95,
        # ang_offset=-0.03,
        g=10.81)

    env_sim = BallOnBeamSim(**env_hparams)
    randomizer = DomainRandomizer(
        # NormalDomainParam(name='l_beam', mean=0, std=1e-12, clip_lo=1.5, clip_up=3.5),
        # UniformDomainParam(name='ang_offset', mean=0, halfspan=1e-12),
        NormalDomainParam(name='g', mean=0, std=1e-12), )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        # 0: ('l_beam', 'mean'), 1: ('l_beam', 'std'),
        # 2: ('ang_offset', 'mean'), 3: ('ang_offset', 'halfspan')
        0: ('g', 'mean'),
        1: ('g', 'std')
    }
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = LinearPolicy(env_sim.spec,
                                   feats=FeatureStack(
                                       [identity_feat, sin_feat]))
    behavior_policy.param_values = to.tensor(
        [3.8090, -3.8036, -1.0786, -2.4510, -0.9875, -1.3252, 3.1503, 1.4443])
    prior = DomainRandomizer(
        # NormalDomainParam(name='l_beam', mean=2.05, std=2.05/10),
        # UniformDomainParam(name='ang_offset', mean=0.03, halfspan=0.03/10),
        NormalDomainParam(name='g', mean=8.81, std=8.81 / 10), )
    # trafo_mask = [False, True, False, True]
    trafo_mask = [True, True]
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map,
                                        trafo_mask=trafo_mask,
                                        prior=prior,
                                        scale_params=True)

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy
Пример #11
0
def test_param_expl_sampler(default_bob, bob_pert):
    # Add randomizer
    env = DomainRandWrapperLive(default_bob, bob_pert)

    # Use a simple policy
    policy = FNNPolicy(env.spec, hidden_sizes=[8], hidden_nonlin=to.tanh)

    # Create the sampler
    num_rollouts_per_param = 12
    sampler = ParameterExplorationSampler(
        env,
        policy,
        num_envs=1,
        num_rollouts_per_param=num_rollouts_per_param,
    )

    # Use some random parameters
    num_ps = 12
    params = to.rand(num_ps, policy.num_param)

    # Do the sampling
    samples = sampler.sample(params)

    assert num_ps == len(samples)
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for ri in range(num_rollouts_per_param):
        # Use the first paramset as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[ri]
        # Iterate through others
        for ops in piter:
            ro = ops.rollouts[ri]

            # Compare domain params
            assert pivot.rollout_info['domain_param'] == ro.rollout_info['domain_param']
            # Compare first observation a.k.a. init state
            assert pivot[0].observation == pytest.approx(ro[0].observation)
Пример #12
0
def eval_randomized_domain(pool: SamplerPool, env: SimEnv,
                           randomizer: DomainRandomizer, policy: Policy,
                           init_states: list) -> list:
    """
    Evaluate a policy in a randomized domain.

    :param pool: parallel sampler
    :param env: environment to evaluate in
    :param randomizer: randomizer used to sample random domain instances, inherited from `DomainRandomizer`
    :param policy: policy to evaluate
    :param init_states: initial states of the environment which will be fixed if not set to None
    :return: list of rollouts
    """
    # Randomize the environments
    env = DomainRandWrapperLive(env, randomizer)

    pool.invoke_all(_setup_env_policy, env, policy)

    # Run with progress bar
    with tqdm(leave=False, file=sys.stdout, unit='rollouts',
              desc='Sampling') as pb:
        return pool.run_map(_run_rollout_nom, init_states, pb)
Пример #13
0
def test_param_expl_sampler(env: SimEnv, policy: Policy):
    # Add randomizer
    pert = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, pert)

    # Create the sampler
    num_rollouts_per_param = 12
    sampler = ParameterExplorationSampler(
        env,
        policy,
        num_workers=1,
        num_rollouts_per_param=num_rollouts_per_param)

    # Use some random parameters
    num_ps = 12
    params = to.rand(num_ps, policy.num_param)

    # Do the sampling
    samples = sampler.sample(params)

    assert num_ps == len(samples)
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for ri in range(num_rollouts_per_param):
        # Use the first paramset as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[ri]
        # Iterate through others
        for ops in piter:
            ro = ops.rollouts[ri]

            # Compare domain params
            assert pivot.rollout_info['domain_param'] == ro.rollout_info[
                'domain_param']
            # Compare first observation a.k.a. init state
            assert pivot[0].observation == pytest.approx(ro[0].observation)
Пример #14
0
        max_grad_norm=1.0,
        num_workers=8,
        lr_scheduler=lr_scheduler.ExponentialLR,
        lr_scheduler_hparam=dict(gamma=0.999),
    )
    env_sprl_params = [
        dict(
            name="gravity_const",
            target_mean=to.tensor([9.81]),
            target_cov_chol_flat=to.tensor([1.0]),
            init_mean=to.tensor([9.81]),
            init_cov_chol_flat=to.tensor([0.05]),
        )
    ]
    env = DomainRandWrapperLive(
        env,
        randomizer=DomainRandomizer(
            *[SelfPacedDomainParam(**p) for p in env_sprl_params]))

    sprl_hparam = dict(
        kl_constraints_ub=8000,
        performance_lower_bound=500,
        std_lower_bound=0.4,
        kl_threshold=200,
        max_iter=args.sprl_iterations,
        optimize_mean=not args.cov_only,
    )
    algo = SPRL(env, PPO(ex_dir, env, policy, critic, **algo_hparam),
                **sprl_hparam)

    # Save the hyper-parameters
    save_dicts_to_yaml(
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.
    
    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environments
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        Mr=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        Mp=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        Lr=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        Lp=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    randomizer = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Mp', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lp', mean=0., std=1e6, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ('Mr', 'mean'),
        1: ('Mr', 'std'),
        2: ('Mp', 'mean'),
        3: ('Mp', 'std'),
        4: ('Lr', 'mean'),
        5: ('Lr', 'std'),
        6: ('Lp', 'mean'),
        7: ('Lp', 'std')
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9885,
        lamda=0.9648,
        num_epoch=2,
        batch_size=500,
        standardize_adv=False,
        lr=5.792e-4,
        max_grad_norm=1.,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=200,
        min_steps=3 * 23 * env_sim.max_steps,
        num_epoch=7,
        eps_clip=0.0744,
        batch_size=500,
        std_init=0.9074,
        lr=3.446e-04,
        max_grad_norm=1.,
        num_workers=1,
    )
    subrtn_policy = PPO(study_dir, env_sim, behav_policy, critic,
                        **subrtn_policy_hparam)

    # Subroutine for system identification
    prior_std_denom = trial.suggest_uniform('prior_std_denom', 5, 20)
    prior = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0.095, std=0.095 / prior_std_denom),
        NormalDomainParam(name='Mp', mean=0.024, std=0.024 / prior_std_denom),
        NormalDomainParam(name='Lr', mean=0.085, std=0.085 / prior_std_denom),
        NormalDomainParam(name='Lp', mean=0.129, std=0.129 / prior_std_denom),
    )
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map,
        trafo_mask=trafo_mask,
        prior=prior,
        scale_params=trial.suggest_categorical('ddp_policy_scale_params',
                                               [True, False]),
    )
    subsubrtn_distr_hparam = dict(
        max_iter=trial.suggest_categorical('subsubrtn_distr_max_iter', [20]),
        pop_size=trial.suggest_int('pop_size', 50, 500),
        num_rollouts=1,
        num_is_samples=trial.suggest_int('num_is_samples', 5, 20),
        expl_std_init=trial.suggest_loguniform('expl_std_init', 1e-3, 1e-1),
        expl_std_min=trial.suggest_categorical('expl_std_min', [1e-4]),
        extra_expl_std_init=trial.suggest_loguniform('expl_std_init', 1e-3,
                                                     1e-1),
        extra_expl_decay_iter=trial.suggest_int('extra_expl_decay_iter', 0,
                                                10),
        num_workers=1,
    )
    csv_logger = create_csv_step_logger(
        osp.join(study_dir, f'trial_{trial.number}'))
    subsubrtn_distr = CEM(study_dir,
                          env_sim,
                          ddp_policy,
                          **subsubrtn_distr_hparam,
                          logger=csv_logger)
    obs_vel_weight = trial.suggest_loguniform('obs_vel_weight', 1, 100)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, obs_vel_weight, obs_vel_weight],
        num_rollouts_per_distr=trial.suggest_int('num_rollouts_per_distr', 20,
                                                 100),
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behav_policy,
                                      **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=trial.suggest_categorical('algo_max_iter', [10]),
        num_eval_rollouts=trial.suggest_categorical('algo_num_eval_rollouts',
                                                    [5]),
        warmstart=trial.suggest_categorical('algo_warmstart', [True]),
        thold_succ_subrtn=trial.suggest_categorical('algo_thold_succ_subrtn',
                                                    [50]),
        subrtn_snapshot_mode='latest',
    )
    algo = SimOpt(study_dir,
                  env_sim,
                  env_real,
                  subrtn_policy,
                  subrtn_distr,
                  **algo_hparam,
                  logger=csv_logger)

    # Jeeeha
    algo.train(seed=args.seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env_real, algo.policy, num_workers=1,
        min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
Пример #16
0
            R=np.diag([0, 0, 1e-1, 2e-1]),
            Q_dev=np.diag([0.0, 0.0, 5]),
            # R_dev=np.diag([0., 0., 1e-3, 1e-3])
        ),
    )
    env = WAMBallInCupSim(**env_hparams)

    # Randomizer
    randomizer = DomainRandomizer(
        UniformDomainParam(name="cup_scale", mean=1.0, halfspan=0.2),
        NormalDomainParam(name="rope_length", mean=0.3, std=0.005),
        NormalDomainParam(name="ball_mass", mean=0.021, std=0.001),
        UniformDomainParam(name="joint_2_damping", mean=0.05, halfspan=0.05),
        UniformDomainParam(name="joint_2_dryfriction", mean=0.1, halfspan=0.1),
    )
    env = DomainRandWrapperLive(env, randomizer)

    # Policy
    bounds = ([0.0, 0.25, 0.5], [1.0, 1.5, 2.5])
    policy_hparam = dict(rbf_hparam=dict(num_feat_per_dim=9, bounds=bounds, scale=None), dim_mask=2)
    policy = DualRBFLinearPolicy(env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=15,
        pop_size=100,
        num_is_samples=10,
        num_init_states_per_domain=2,
        num_domains=10,
        expl_std_init=np.pi / 12,
        expl_std_min=0.02,
Пример #17
0
                          std=1e6,
                          clip_lo=1e-3),
        NormalDomainParam(name="mass_pend_pole",
                          mean=0.0,
                          std=1e6,
                          clip_lo=1e-3),
        NormalDomainParam(name="length_rot_pole",
                          mean=0.0,
                          std=1e6,
                          clip_lo=1e-3),
        NormalDomainParam(name="length_pend_pole",
                          mean=0.0,
                          std=1e6,
                          clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ("mass_rot_pole", "mean"),
        1: ("mass_rot_pole", "std"),
        2: ("mass_pend_pole", "mean"),
        3: ("mass_pend_pole", "std"),
        4: ("length_rot_pole", "mean"),
        5: ("length_rot_pole", "std"),
        6: ("length_pend_pole", "mean"),
        7: ("length_pend_pole", "std"),
    }
    trafo_mask = [False, True, False, True, False, True, False, True]
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
Пример #18
0
            lr_scheduler=lr_scheduler.ExponentialLR,
            lr_scheduler_hparam=dict(gamma=0.999),
            max_iter=args.max_iter_teacher,
            critic=critic,
        )

        teacher_algo = PPO

    else:
        teacher_policy = None
        teacher_algo = None
        teacher_algo_hparam = None

    # Wrapper
    randomizer = create_default_randomizer(env_real)
    env_real = DomainRandWrapperLive(env_real, randomizer)
    env_real = ActNormWrapper(env_real)

    # Policy
    policy_hparam = dict(hidden_sizes=[64, 64],
                         hidden_nonlin=to.relu,
                         output_nonlin=to.tanh)
    policy = FNNPolicy(spec=env_real.spec, **policy_hparam)

    # Subroutine
    algo_hparam = dict(
        max_iter=args.max_iter,
        min_steps=args.max_steps,
        std_init=0.15,
        num_epochs=args.num_epochs,
        num_teachers=args.num_teachers,
Пример #19
0
def test_sysidasrl_reps(ex_dir, env: SimEnv, num_eval_rollouts: int):
    pyrado.set_seed(0)

    def eval_ddp_policy(rollouts_real):
        init_states_real = np.array([ro.states[0, :] for ro in rollouts_real])
        rollouts_sim = []
        for i, _ in enumerate(range(num_eval_rollouts)):
            rollouts_sim.append(
                rollout(env_sim,
                        behavior_policy,
                        eval=True,
                        reset_kwargs=dict(init_state=init_states_real[i, :])))

        # Clip the rollouts rollouts yielding two lists of pairwise equally long rollouts
        ros_real_tr, ros_sim_tr = algo.truncate_rollouts(rollouts_real,
                                                         rollouts_sim,
                                                         replicate=False)
        assert len(ros_real_tr) == len(ros_sim_tr)
        assert all([
            np.allclose(r.states[0, :], s.states[0, :])
            for r, s in zip(ros_real_tr, ros_sim_tr)
        ])

        # Return the average the loss
        losses = [
            algo.loss_fcn(ro_r, ro_s)
            for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr)
        ]
        return float(np.mean(np.asarray(losses)))

    # Environments
    env_real = deepcopy(env)
    env_real.domain_param = dict(ang_offset=-2 * np.pi / 180)

    env_sim = deepcopy(env)
    randomizer = DomainRandomizer(
        UniformDomainParam(name="ang_offset", mean=0, halfspan=1e-6), )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {0: ("ang_offset", "mean"), 1: ("ang_offset", "halfspan")}
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    assert env_real is not env_sim

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = LinearPolicy(env_sim.spec,
                                   feats=FeatureStack(identity_feat))
    prior = DomainRandomizer(
        UniformDomainParam(name="ang_offset",
                           mean=1 * np.pi / 180,
                           halfspan=1 * np.pi / 180), )
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map,
                                        trafo_mask=[False, True],
                                        prior=prior)

    # Subroutine
    subrtn_hparam = dict(
        max_iter=2,
        eps=1.0,
        pop_size=100,
        num_init_states_per_domain=1,
        expl_std_init=5e-2,
        expl_std_min=1e-4,
        num_workers=1,
    )
    subrtn = REPS(ex_dir, env_sim, ddp_policy, **subrtn_hparam)

    algo_hparam = dict(metric=None,
                       obs_dim_weight=np.ones(env_sim.obs_space.shape),
                       num_rollouts_per_distr=5,
                       num_workers=1)
    algo = SysIdViaEpisodicRL(subrtn, behavior_policy, **algo_hparam)

    rollouts_real_tst = []
    for _ in range(num_eval_rollouts):
        rollouts_real_tst.append(rollout(env_real, behavior_policy, eval=True))
    loss_pre = eval_ddp_policy(rollouts_real_tst)

    # Mimic training
    while algo.curr_iter < algo.max_iter and not algo.stopping_criterion_met():
        algo.logger.add_value(algo.iteration_key, algo.curr_iter)

        # Creat fake real-world data
        rollouts_real = []
        for _ in range(num_eval_rollouts):
            rollouts_real.append(rollout(env_real, behavior_policy, eval=True))

        algo.step(snapshot_mode="latest",
                  meta_info=dict(rollouts_real=rollouts_real))

        algo.logger.record_step()
        algo._curr_iter += 1

    loss_post = eval_ddp_policy(rollouts_real_tst)
    assert loss_post <= loss_pre  # don't have to be better every step
Пример #20
0
    # Experiment (set seed before creating the modules)
    ex_dir = setup_experiment(
        QQubeSwingUpSim.name,
        f"{BayRn.name}-{PPO.name}_{FNNPolicy.name}",
        "rand-mass_pend_pole-mass_rot_pole-length_pend_pole-length_rot_pole_lower-std",
    )

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environments
    env_sim_hparams = dict(dt=1 / 100.0, max_steps=600)
    env_sim = QQubeSwingUpSim(**env_sim_hparams)
    env_sim = ActNormWrapper(env_sim)
    env_sim = DomainRandWrapperLive(env_sim,
                                    create_zero_var_randomizer(env_sim))
    dp_map = create_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    env_real_hparams = dict(dt=1 / 500.0, max_steps=3000)
    env_real = QQubeSwingUpReal(**env_real_hparams)
    env_real = wrap_like_other_env(env_real, env_sim)

    # Policy
    policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    policy = FNNPolicy(spec=env_sim.spec, **policy_hparam)

    # Critic
    vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
Пример #21
0
def test_simopt_cem_ppo(ex_dir, env: SimEnv):
    pyrado.set_seed(0)

    # Environments
    env_real = deepcopy(env)
    env_real = ActNormWrapper(env_real)
    env_sim = ActNormWrapper(env)
    randomizer = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole",
                          mean=0.0,
                          std=1e6,
                          clip_lo=1e-3),
        NormalDomainParam(name="mass_pend_pole",
                          mean=0.0,
                          std=1e6,
                          clip_lo=1e-3),
        NormalDomainParam(name="length_rot_pole",
                          mean=0.0,
                          std=1e6,
                          clip_lo=1e-3),
        NormalDomainParam(name="length_pend_pole",
                          mean=0.0,
                          std=1e6,
                          clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ("mass_rot_pole", "mean"),
        1: ("mass_rot_pole", "std"),
        2: ("mass_pend_pole", "mean"),
        3: ("mass_pend_pole", "std"),
        4: ("length_rot_pole", "mean"),
        5: ("length_rot_pole", "std"),
        6: ("length_pend_pole", "mean"),
        7: ("length_pend_pole", "std"),
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.relu)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.99,
        lamda=0.98,
        num_epoch=2,
        batch_size=128,
        standardize_adv=True,
        lr=8e-4,
        max_grad_norm=5.0,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=2,
        eps_clip=0.13,
        min_steps=4 * env_sim.max_steps,
        num_epoch=3,
        batch_size=128,
        std_init=0.75,
        lr=3e-04,
        max_grad_norm=1.0,
        num_workers=1,
    )
    subrtn_policy = PPO(ex_dir, env_sim, behav_policy, critic,
                        **subrtn_policy_hparam)

    prior = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.095, std=0.095 / 10),
        NormalDomainParam(name="mass_pend_pole", mean=0.024, std=0.024 / 10),
        NormalDomainParam(name="length_rot_pole", mean=0.085, std=0.085 / 10),
        NormalDomainParam(name="length_pend_pole", mean=0.129, std=0.129 / 10),
    )
    ddp_policy_hparam = dict(mapping=dp_map,
                             trafo_mask=trafo_mask,
                             scale_params=True)
    ddp_policy = DomainDistrParamPolicy(prior=prior, **ddp_policy_hparam)
    subsubrtn_distr_hparam = dict(
        max_iter=2,
        pop_size=10,
        num_init_states_per_domain=1,
        num_is_samples=8,
        expl_std_init=1e-2,
        expl_std_min=1e-5,
        extra_expl_std_init=1e-2,
        extra_expl_decay_iter=5,
        num_workers=1,
    )
    subsubrtn_distr = CEM(ex_dir, env_sim, ddp_policy,
                          **subsubrtn_distr_hparam)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, 10, 10],
        num_rollouts_per_distr=3,
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr,
                                      behavior_policy=behav_policy,
                                      **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=1,
        num_eval_rollouts=5,
        warmstart=True,
    )
    algo = SimOpt(ex_dir, env_sim, env_real, subrtn_policy, subrtn_distr,
                  **algo_hparam)
    algo.train()

    assert algo.curr_iter == algo.max_iter
Пример #22
0
from pyrado.policies.linear import LinearPolicy
from pyrado.utils.experiments import wrap_like_other_env

if __name__ == '__main__':
    # Experiment (set seed before creating the modules)
    # ex_dir = setup_experiment(QQubeSim.name, f'{BayRn.name}_{PoWER.name}-sim2sim', '100Hz_lin_dr-Mp+', seed=111)
    ex_dir = setup_experiment(
        QQubeSim.name,
        f'{BayRn.name}_{PoWER.name}-sim2sim',
        f'{QQubeSwingUpAndBalanceCtrl.name}_100Hz_dr-Mp+Mr+',
        seed=1111)

    # Environments
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env_sim = QQubeSim(**env_hparams)
    env_sim = DomainRandWrapperLive(env_sim, get_zero_var_randomizer(env_sim))
    dp_map = get_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    env_real = QQubeSim(**env_hparams)
    env_real.domain_param = dict(Mp=0.026, Mr=0.097)
    # env_real = QQubeReal(**env_hparams)
    env_real = wrap_like_other_env(env_real, env_sim)

    # Policy
    # policy_hparam = dict(
    #     feats=FeatureStack([identity_feat, sign_feat, abs_feat, squared_feat, qubic_feat,
    #                         MultFeat([2, 5]), MultFeat([3, 5]), MultFeat([4, 5])])
    # )
    # policy = LinearPolicy(spec=env_sim.spec, **policy_hparam)
    policy_hparam = dict(energy_gain=0.587, ref_energy=0.827, acc_max=10.)
Пример #23
0
def test_basic_meta(ex_dir, policy, env: SimEnv, algo, algo_hparam: dict):
    pyrado.set_seed(0)

    # Policy and subroutine
    env = GaussianObsNoiseWrapper(
        env,
        noise_std=[
            1 / 180 * np.pi,
            1 / 180 * np.pi,
            0.0025,
            0.0025,
            2 / 180 * np.pi,
            2 / 180 * np.pi,
            0.05,
            0.05,
        ],
    )
    env = ActNormWrapper(env)
    env = ActDelayWrapper(env)
    randomizer = create_default_randomizer_qbb()
    randomizer.add_domain_params(
        UniformDomainParam(name="act_delay",
                           mean=15,
                           halfspan=15,
                           clip_lo=0,
                           roundint=True))
    env = DomainRandWrapperLive(env, randomizer)

    # Policy
    policy_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh)  # FNN
    policy = FNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    vfcn_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh)  # FNN
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                     **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9995,
        lamda=0.98,
        num_epoch=2,
        batch_size=64,
        lr=5e-4,
        standardize_adv=False,
    )
    critic = GAE(vfcn, **critic_hparam)

    subrtn_hparam = dict(
        max_iter=3,
        min_rollouts=5,
        num_epoch=2,
        eps_clip=0.1,
        batch_size=64,
        std_init=0.8,
        lr=2e-4,
        num_workers=1,
    )
    subrtn = PPO(ex_dir, env, policy, critic, **subrtn_hparam)
    algo = algo(env, subrtn, **algo_hparam)

    algo.train()

    assert algo.curr_iter == algo.max_iter