Example No. 1
def test_training_parameter_exploring(ex_dir, env: SimEnv, algo, algo_hparam):
    # Environment and policy
    env = ActNormWrapper(env)
    policy_hparam = dict(feats=FeatureStack(const_feat, identity_feat))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Get initial return for comparison
    rets_before = np.zeros(5)
    for i in range(rets_before.size):
        rets_before[i] = rollout(env, policy, eval=True,
                                 seed=i).undiscounted_return()

    # Create the algorithm and train
    algo_hparam["num_workers"] = 1
    algo = algo(ex_dir, env, policy, **algo_hparam)
    algo.train()
    policy.param_values = algo.best_policy_param  # mimic saving and loading

    # Compare returns before and after training for max_iter iterations
    rets_after = np.zeros_like(rets_before)
    for i in range(rets_before.size):
        rets_after[i] = rollout(env, policy, eval=True,
                                seed=i).undiscounted_return()

    assert all(rets_after > rets_before)
Example No. 2
def create_nonrecurrent_policy():
    return LinearPolicy(
        EnvSpec(
            BoxSpace(-1, 1, 4),
            BoxSpace(-1, 1, 3),
        ),
        FeatureStack(const_feat, identity_feat, squared_feat),
    )
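
For intuition only, here is a self-contained NumPy sketch (not the pyrado API) of what such a linear policy computes: the observation is expanded by constant, identity, and squared features, and the action is a linear map of that feature vector; the weights below are placeholders.

import numpy as np

def feature_stack(obs: np.ndarray) -> np.ndarray:
    # constant, identity, and squared features (cf. const_feat, identity_feat, squared_feat)
    return np.concatenate([np.ones(1), obs, obs ** 2])

obs_dim, act_dim = 4, 3  # matches BoxSpace(-1, 1, 4) -> BoxSpace(-1, 1, 3) above
rng = np.random.default_rng(0)
weights = rng.normal(size=(act_dim, 1 + 2 * obs_dim))  # placeholder policy parameters
obs = rng.uniform(-1.0, 1.0, size=obs_dim)
act = weights @ feature_stack(obs)  # linear policy: act = W @ phi(obs)
print(act.shape)  # (3,)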
Example No. 3
def create_lin_setup(physicsEngine: str, dt: float, max_steps: int,
                     checkJointLimits: bool):
    # Set up environment
    env = MiniGolfIKSim(
        usePhysicsNode=True,
        physicsEngine=physicsEngine,
        dt=dt,
        max_steps=max_steps,
        checkJointLimits=checkJointLimits,
        fixedInitState=True,
    )

    # Set up policy
    policy = LinearPolicy(env.spec, FeatureStack([const_feat]))
    policy.param_values = to.tensor([0.6, 0.0, 0.03])  # X (abs), Y (rel), Z (abs), C (abs)

    return env, policy
Example No. 4
def test_rfb_policy_serial(env: Env, num_feat_per_dim: int):
    rbf = RBFFeat(num_feat_per_dim=num_feat_per_dim,
                  bounds=env.obs_space.bounds)
    fs = FeatureStack(rbf)
    policy = LinearPolicy(env.spec, fs)
    for _ in range(10):
        obs = env.obs_space.sample_uniform()
        obs = to.from_numpy(obs).to(dtype=to.get_default_dtype())
        act = policy(obs)
        assert act.shape == (env.act_space.flat_dim, )
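
As a rough standalone illustration of the idea behind RBF features (plain NumPy, not the pyrado RBFFeat implementation; the centers and the width are chosen here purely for demonstration):

import numpy as np

def rbf_features(x: float, lo: float, hi: float, num_feat: int) -> np.ndarray:
    # evenly spaced Gaussian bumps over [lo, hi] for one observation dimension
    centers = np.linspace(lo, hi, num_feat)
    width = (hi - lo) / num_feat  # illustrative bandwidth choice
    return np.exp(-0.5 * ((x - centers) / width) ** 2)

print(rbf_features(0.3, lo=-1.0, hi=1.0, num_feat=10).round(2))  # one activation per center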
Example No. 5
def test_rff_policy_serial(env: Env, num_feat_per_dim: int):
    rff = RFFeat(inp_dim=env.obs_space.flat_dim,
                 num_feat_per_dim=num_feat_per_dim,
                 bandwidth=env.obs_space.bound_up)
    policy = LinearPolicy(env.spec, FeatureStack(rff))
    for _ in range(10):
        obs = env.obs_space.sample_uniform()
        obs = to.from_numpy(obs).to(dtype=to.get_default_dtype())
        act = policy(obs)
        assert act.shape == (env.act_space.flat_dim, )
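
Similarly, a minimal sketch of random Fourier features in plain NumPy; the sampling convention for the frequencies below is one common choice and is not taken from pyrado's RFFeat:

import numpy as np

def rff(x: np.ndarray, num_feat: int, bandwidth: float, seed: int = 0) -> np.ndarray:
    # phi(x) = sqrt(2/D) * cos(W x + b) approximates an RBF kernel with the given bandwidth
    rng = np.random.default_rng(seed)
    freqs = rng.normal(scale=1.0 / bandwidth, size=(num_feat, x.size))
    shifts = rng.uniform(0.0, 2.0 * np.pi, size=num_feat)
    return np.sqrt(2.0 / num_feat) * np.cos(freqs @ x + shifts)

print(rff(np.array([0.2, -0.5]), num_feat=16, bandwidth=1.0).shape)  # (16,)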
Example No. 6
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1 / 250.0, max_steps=1500)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        feats=FeatureStack(
            [identity_feat, sign_feat, abs_feat, squared_feat, cubic_feat, ATan2Feat(1, 2), MultFeat((4, 5))]
        )
    )
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=50,
        pop_size=trial.suggest_int("pop_size", 50, 200),
        num_init_states_per_domain=trial.suggest_int("num_init_states_per_domain", 4, 10),
        num_is_samples=trial.suggest_int("num_is_samples", 5, 40),
        expl_std_init=trial.suggest_uniform("expl_std_init", 0.1, 0.5),
        symm_sampling=trial.suggest_categorical("symm_sampling", [True, False]),
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f"trial_{trial.number}"))
    algo = PoWER(osp.join(study_dir, f"trial_{trial.number}"), env, policy, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env, policy, num_workers=1, min_rollouts=min_rollouts
    )  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
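
The docstring note above mentions functools.partial; a minimal sketch of how this objective could be handed to an Optuna study (the study direction is from the docstring, while the directory, seed, and trial counts below are placeholders, not from the source):

import functools

import optuna

study = optuna.create_study(direction="maximize")
study.optimize(
    functools.partial(train_and_eval, study_dir="hparam_search/qq-su_power", seed=0),  # placeholder dir/seed
    n_trials=100,
    n_jobs=8,  # parallelize over trials, matching num_workers=1 inside the objective
)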
Example No. 7
def create_bob_setup():
    # Environments
    env_hparams = dict(dt=1 / 100.0, max_steps=500)
    env_real = BallOnBeamSim(**env_hparams)
    env_real.domain_param = dict(
        # l_beam=1.95,
        # ang_offset=-0.03,
        gravity_const=10.81)

    env_sim = BallOnBeamSim(**env_hparams)
    randomizer = DomainRandomizer(
        # NormalDomainParam(name="beam_length", mean=0, std=1e-6, clip_lo=1.5, clip_up=3.5),
        # UniformDomainParam(name="ang_offset", mean=0, halfspan=1e-6),
        NormalDomainParam(name="gravity_const", mean=0, std=1e-6), )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        # 0: ("beam_length", "mean"), 1: ("beam_length", "std"),
        # 2: ("ang_offset", "mean"), 3: ("ang_offset", "halfspan")
        0: ("gravity_const", "mean"),
        1: ("gravity_const", "std"),
    }
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = LinearPolicy(env_sim.spec,
                                   feats=FeatureStack(identity_feat, sin_feat))
    behavior_policy.param_values = to.tensor(
        [3.8090, -3.8036, -1.0786, -2.4510, -0.9875, -1.3252, 3.1503, 1.4443])
    prior = DomainRandomizer(
        # NormalDomainParam(name="beam_length", mean=2.05, std=2.05/10),
        # UniformDomainParam(name="ang_offset", mean=0.03, halfspan=0.03/10),
        NormalDomainParam(name="gravity_const", mean=8.81, std=8.81 / 10), )
    # trafo_mask = [False, True, False, True]
    trafo_mask = [True, True]
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map,
                                        trafo_mask=trafo_mask,
                                        prior=prior,
                                        scale_params=True)

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy
Example No. 8
def test_rff_regression(ex_dir, num_feat_per_dim: int, loss_fcn: Callable,
                        algo_hparam: dict):
    # Generate some data
    inputs = to.linspace(-4.0, 4.0, 8001).view(-1, 1)
    targets = noisy_nonlin_fcn(inputs, f=3.0, noise_std=0).view(-1, 1)

    # Create the policy
    rff = RFFeat(inp_dim=1,
                 num_feat_per_dim=num_feat_per_dim,
                 bandwidth=1 / 20)
    policy = LinearPolicy(
        EnvSpec(InfBoxSpace(shape=(1, )), InfBoxSpace(shape=(1, ))),
        FeatureStack(rff))

    # Create the algorithm, and train
    loss_before = loss_fcn(policy(inputs), targets)
    algo = NonlinRegression(ex_dir, inputs, targets, policy, **algo_hparam)
    algo.train()
    loss_after = loss_fcn(policy(inputs), targets)
    assert loss_after < loss_before
    assert algo.curr_iter >= algo_hparam["max_iter_no_improvement"]
Example No. 9
                              f"{REPS.name}_{LinearPolicy.name}")

    # Set seed if desired
    pyrado.set_seed(args.seed, verbose=True)

    # Environment
    env_hparams = dict(dt=1 / 100.0, max_steps=500)
    env = BallOnBeamSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        # feats=FeatureStack(RFFeat(env.obs_space.flat_dim, num_feat=1000, bandwidth=1/env.obs_space.bound_up))
        # feats=FeatureStack(RBFFeat(num_feat_per_dim=20, bounds=env.obs_space.bounds, scale=0.8)),
        feats=FeatureStack(identity_feat, sin_feat))
    policy = LinearPolicy(spec=env.spec, **policy_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=500,
        eps=0.2,
        pop_size=10 * policy.num_param,
        num_init_states_per_domain=10,
        expl_std_init=0.2,
        expl_std_min=0.02,
        num_epoch_dual=1000,
        optim_mode="scipy",
        lr_dual=1e-3,
        use_map=True,
        num_workers=8,
    )
Example No. 10
    env_hparams = dict(dt=1 / 100.0, max_steps=500)
    env = BallOnBeamSim(**env_hparams)
    env = ActNormWrapper(env)

    # Policy
    policy_hparam = dict(
        # feats=FeatureStack(
        #     [
        #         RFFeat(
        #             env.obs_space.flat_dim, num_feat_per_dim=500, bandwidth=1/env.obs_space.bound_up, use_cuda=True
        #         )
        #     ]
        # )
        # feats=FeatureStack(RBFFeat(num_feat_per_dim=20, bounds=env.obs_space.bounds, scale=None, use_cuda=True))
        feats=FeatureStack(identity_feat, sin_feat))
    policy = LinearPolicy(spec=env.spec, **policy_hparam, use_cuda=True)

    # Critic
    vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                     **vfcn_hparam,
                     use_cuda=True)
    critic_hparam = dict(
        gamma=0.99,
        lamda=0.95,
        batch_size=100,
        standardize_adv=True,
        lr_scheduler=lr_scheduler.ExponentialLR,
        lr_scheduler_hparam=dict(gamma=0.99),
    )
    critic = GAE(vfcn, **critic_hparam)
Example No. 11
def test_sysidasrl_reps(ex_dir, env: SimEnv, num_eval_rollouts: int):
    pyrado.set_seed(0)

    def eval_ddp_policy(rollouts_real):
        init_states_real = np.array([ro.states[0, :] for ro in rollouts_real])
        rollouts_sim = []
        for i in range(num_eval_rollouts):
            rollouts_sim.append(
                rollout(env_sim,
                        behavior_policy,
                        eval=True,
                        reset_kwargs=dict(init_state=init_states_real[i, :])))

        # Clip the rollouts, yielding two lists of pairwise equally long rollouts
        ros_real_tr, ros_sim_tr = algo.truncate_rollouts(rollouts_real,
                                                         rollouts_sim,
                                                         replicate=False)
        assert len(ros_real_tr) == len(ros_sim_tr)
        assert all([
            np.allclose(r.states[0, :], s.states[0, :])
            for r, s in zip(ros_real_tr, ros_sim_tr)
        ])

        # Return the average loss
        losses = [
            algo.loss_fcn(ro_r, ro_s)
            for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr)
        ]
        return float(np.mean(np.asarray(losses)))

    # Environments
    env_real = deepcopy(env)
    env_real.domain_param = dict(ang_offset=-2 * np.pi / 180)

    env_sim = deepcopy(env)
    randomizer = DomainRandomizer(
        UniformDomainParam(name="ang_offset", mean=0, halfspan=1e-6), )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {0: ("ang_offset", "mean"), 1: ("ang_offset", "halfspan")}
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    assert env_real is not env_sim

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = LinearPolicy(env_sim.spec,
                                   feats=FeatureStack(identity_feat))
    prior = DomainRandomizer(
        UniformDomainParam(name="ang_offset",
                           mean=1 * np.pi / 180,
                           halfspan=1 * np.pi / 180), )
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map,
                                        trafo_mask=[False, True],
                                        prior=prior)

    # Subroutine
    subrtn_hparam = dict(
        max_iter=2,
        eps=1.0,
        pop_size=100,
        num_init_states_per_domain=1,
        expl_std_init=5e-2,
        expl_std_min=1e-4,
        num_workers=1,
    )
    subrtn = REPS(ex_dir, env_sim, ddp_policy, **subrtn_hparam)

    algo_hparam = dict(metric=None,
                       obs_dim_weight=np.ones(env_sim.obs_space.shape),
                       num_rollouts_per_distr=5,
                       num_workers=1)
    algo = SysIdViaEpisodicRL(subrtn, behavior_policy, **algo_hparam)

    rollouts_real_tst = []
    for _ in range(num_eval_rollouts):
        rollouts_real_tst.append(rollout(env_real, behavior_policy, eval=True))
    loss_pre = eval_ddp_policy(rollouts_real_tst)

    # Mimic training
    while algo.curr_iter < algo.max_iter and not algo.stopping_criterion_met():
        algo.logger.add_value(algo.iteration_key, algo.curr_iter)

        # Create fake real-world data
        rollouts_real = []
        for _ in range(num_eval_rollouts):
            rollouts_real.append(rollout(env_real, behavior_policy, eval=True))

        algo.step(snapshot_mode="latest",
                  meta_info=dict(rollouts_real=rollouts_real))

        algo.logger.record_step()
        algo._curr_iter += 1

    loss_post = eval_ddp_policy(rollouts_real_tst)
    assert loss_post <= loss_pre  # don't have to be better every step
Example No. 12
def linear_policy_cuda(env: Env):
    return LinearPolicy(env.spec, DefaultPolicies.default_fs(), use_cuda=True)
Example No. 13
def linear_policy(env: Env):
    return LinearPolicy(env.spec, DefaultPolicies.default_fs())
Example No. 14
from tabulate import tabulate

from pyrado.environment_wrappers.action_normalization import ActNormWrapper
from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim
from pyrado.policies.features import FeatureStack, identity_feat, squared_feat
from pyrado.policies.feed_back.linear import LinearPolicy
from pyrado.sampling.parallel_rollout_sampler import ParallelRolloutSampler

if __name__ == "__main__":
    # Set up environment
    env = BallOnBeamSim(dt=0.02, max_steps=500)
    env = ActNormWrapper(env)

    # Set up policy
    feats = FeatureStack(identity_feat, squared_feat)
    policy = LinearPolicy(env.spec, feats)

    # Set up sampler
    sampler = ParallelRolloutSampler(env,
                                     policy,
                                     num_workers=2,
                                     min_rollouts=2000)

    # Sample and print
    ros = sampler.sample()
    print(
        tabulate({
            "StepSequence count": len(ros),
            "Step count": sum(map(len, ros)),
        }.items()))