Example #1
    init_policy_param_values = to.tensor([
        -14., 0, -14 * 3.45, 0, 0, 0, -14 * 2.11, 0, 0, -14., 0, -14 * 3.45, 0,
        0, 0, -14 * 2.11
    ])

    # Algorithm
    subrtn_hparam_cand = dict(
        max_iter=100,
        num_rollouts=1,  # will be overwritten by SPOTA
        pop_size=50,
        expl_factor=1.1,
        expl_std_init=0.5,
        num_workers=8)
    subrtn_hparam_refs = deepcopy(subrtn_hparam_cand)

    sr_cand = HCNormal(ex_dir, env, policy, **subrtn_hparam_cand)
    sr_refs = HCNormal(ex_dir, env, deepcopy(policy), **subrtn_hparam_refs)

    algo_hparam = dict(
        max_iter=10,
        alpha=0.05,
        beta=0.1,
        nG=20,
        nJ=120,
        ntau=5,
        nc_init=5,
        nr_init=1,
        sequence_cand=sequence_add_init,
        sequence_refs=sequence_const,
        warmstart_cand=True,
        warmstart_refs=True,
Example #2
        conv_padding_mode='circular',
        init_param_kwargs=dict(bell=True),
        activation_nonlin=to.sigmoid,
        tau_init=10.,
        tau_learnable=True,
        kappa_init=1e-3,
        kappa_learnable=True,
        potential_init_learnable=True,
    )
    policy = NFPolicy(spec=env.spec, **policy_hparam)

    algo_hparam = dict(
        max_iter=100,
        pop_size=5 * policy.num_param,
        expl_factor=1.05,
        num_rollouts=1,
        expl_std_init=1.0,
        num_workers=6,
    )
    algo = HCNormal(ex_dir, env, policy, **algo_hparam)

    # Save the hyper-parameters
    save_list_of_dicts_to_yaml([
        dict(env=env_hparams, seed=args.seed),
        dict(policy=policy_hparam),
        dict(algo=algo_hparam, algo_name=algo.name)
    ], ex_dir)

    # Jeeeha
    algo.train(seed=args.seed)
Example #3
def test_npdr_and_bayessim(
    ex_dir,
    algo_name: str,
    env: SimEnv,
    num_segments: int,
    len_segments: int,
    num_real_rollouts: int,
    num_sbi_rounds: int,
    use_rec_act: bool,
):
    pyrado.set_seed(0)

    # Create a fake ground truth target domain
    env_real = deepcopy(env)
    dp_nom = env.get_nominal_domain_param()
    env_real.domain_param = dict(
        mass_pend_pole=dp_nom["mass_pend_pole"] * 1.2,
        length_pend_pole=dp_nom["length_pend_pole"] * 0.8,
    )

    # Reduce the number of steps to make this test run faster
    env.max_steps = 40
    env_real.max_steps = 40

    # Policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Define a mapping: index - domain parameter
    dp_mapping = {1: "mass_pend_pole", 2: "length_pend_pole"}

    # Prior
    prior_hparam = dict(
        low=to.tensor([dp_nom["mass_pend_pole"] * 0.5, dp_nom["length_pend_pole"] * 0.5]),
        high=to.tensor([dp_nom["mass_pend_pole"] * 1.5, dp_nom["length_pend_pole"] * 1.5]),
    )
    prior = sbiutils.BoxUniform(**prior_hparam)

    # Time series embedding
    embedding = BayesSimEmbedding(
        env.spec,
        RolloutSamplerForSBI.get_dim_data(env.spec),
        downsampling_factor=3,
    )

    # Posterior (normalizing flow)
    posterior_hparam = dict(
        model="maf",
        embedding_net=nn.Identity(),
        hidden_features=20,
        num_transforms=3,
    )

    # Policy optimization subroutine
    subrtn_policy_hparam = dict(
        max_iter=1,
        pop_size=2,
        num_init_states_per_domain=1,
        num_domains=2,
        expl_std_init=0.1,
        expl_factor=1.1,
        num_workers=1,
    )
    subrtn_policy = HCNormal(ex_dir, env, policy, **subrtn_policy_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=1,
        num_sim_per_round=20,
        num_real_rollouts=num_real_rollouts,
        num_sbi_rounds=num_sbi_rounds,
        simulation_batch_size=1,
        normalize_posterior=False,
        num_eval_samples=2,
        num_segments=num_segments,
        len_segments=len_segments,
        use_rec_act=use_rec_act,
        stop_on_done=True,
        subrtn_sbi_training_hparam=dict(max_num_epochs=1),  # only train for 1 epoch to keep the test fast
        # subrtn_sbi_sampling_hparam=dict(sample_with_mcmc=True, mcmc_parameters=dict(warmup_steps=20)),
        num_workers=1,
    )
    skip = False
    if algo_name == NPDR.name:
        algo = NPDR(
            save_dir=ex_dir,
            env_sim=env,
            env_real=env_real,
            policy=policy,
            dp_mapping=dp_mapping,
            prior=prior,
            embedding=embedding,
            subrtn_sbi_class=SNPE_C,
            posterior_hparam=posterior_hparam,
            subrtn_policy=subrtn_policy,
            **algo_hparam,
        )
    elif algo_name == BayesSim.name:
        # We are not checking multi-round SNPE-A since it has known issues
        if algo_hparam["num_sbi_rounds"] > 1:
            skip = True
        algo = BayesSim(
            save_dir=ex_dir,
            env_sim=env,
            env_real=env_real,
            policy=policy,
            dp_mapping=dp_mapping,
            embedding=embedding,
            prior=prior,
            subrtn_policy=subrtn_policy,
            **algo_hparam,
        )
    else:
        raise NotImplementedError

    if not skip:
        algo.train()
        # Just checking the interface here
        assert algo.curr_iter == algo.max_iter
Example #4
i.e. even for the same random seed, you will get different results. Moreover, it is advised to set `num_workers` to 1
if you want to debug your code.
The algorithms can be categorized into two types: one type randomizes the action at every step (its exploration
strategy inherits from `StochasticActionExplStrat`), and the other type randomizes the policy parameters once per
rollout (its exploration strategy inherits from `StochasticParamExplStrat`). It goes without saying that every
algorithm has different hyper-parameters. However, they all use the same `rollout()` function to generate their data.
"""
algo_hparam = dict(
    max_iter=8,
    pop_size=20,
    num_init_states_per_domain=10,
    expl_factor=1.1,
    expl_std_init=1.0,
    num_workers=4,
)
algo = HCNormal(ex_dir, env, policy, **algo_hparam)
"""
Save the hyper-parameters to a YAML file before starting the training. This step is not strictly necessary, but it
helps you to later see which hyper-parameters you used, i.e. which settings led to a successfully trained policy.
"""
save_dicts_to_yaml(
    dict(env=env_hparams, seed=0),
    dict(policy=policy_hparam),
    dict(algo=algo_hparam, algo_name=algo.name),
    save_dir=ex_dir,
)
"""
Finally, start the training. The `train()` function is the same for all algorithms inheriting from the `Algorithm`
base class. It repeatedly calls the algorithm's custom `step()` and `update()` functions.
You can load and continue a previous experiment using the Algorithm's `load()` method. The `snapshot_mode` argument
of `train()` determines when to save the current training state, e.g. 'latest' saves after every step of the
algorithm, and 'best' only saves when the policy achieves a new best return.
"""
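
# A hedged sketch of the final call, assuming `train()` accepts the `snapshot_mode` keyword described above in
# addition to the `seed` keyword used in the other examples; adjust it to the signature of your pyrado version.
algo.train(snapshot_mode="latest", seed=0)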
Example #5
    policy_hparam = dict(
        feats=FeatureStack(const_feat, identity_feat, sign_feat, squared_feat, MultFeat((0, 2)), MultFeat((1, 2)))
    )
    policy = LinearPolicy(spec=env_sim.spec, **policy_hparam)

    # Policy optimization subroutine
    subrtn_policy_hparam = dict(
        max_iter=5,
        pop_size=5 * policy.num_param,
        num_domains=20,
        num_init_states_per_domain=1,
        expl_factor=1.05,
        expl_std_init=1.0,
        num_workers=args.num_workers,
    )
    subrtn_policy = HCNormal(ex_dir, env_sim, policy, **subrtn_policy_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=5,
        num_real_rollouts=num_real_rollouts,
        num_sim_per_round=200,
        num_sbi_rounds=3,
        simulation_batch_size=10,
        normalize_posterior=False,
        num_eval_samples=100,
        # num_segments=1,
        len_segments=100,
        stop_on_done=False,
        posterior_hparam=posterior_hparam,
        subrtn_sbi_training_hparam=dict(