init_policy_param_values = to.tensor(
    [-14., 0, -14 * 3.45, 0, 0, 0, -14 * 2.11, 0, 0, -14., 0, -14 * 3.45, 0, 0, 0, -14 * 2.11]
)

# Algorithm
subrtn_hparam_cand = dict(
    max_iter=100,
    num_rollouts=1,  # will be overwritten by SPOTA
    pop_size=50,
    expl_factor=1.1,
    expl_std_init=0.5,
    num_workers=8,
)
subrtn_hparam_refs = deepcopy(subrtn_hparam_cand)

sr_cand = HCNormal(ex_dir, env, policy, **subrtn_hparam_cand)
sr_refs = HCNormal(ex_dir, env, deepcopy(policy), **subrtn_hparam_refs)

algo_hparam = dict(
    max_iter=10,
    alpha=0.05,
    beta=0.1,
    nG=20,
    nJ=120,
    ntau=5,
    nc_init=5,
    nr_init=1,
    sequence_cand=sequence_add_init,
    sequence_refs=sequence_const,
    warmstart_cand=True,
    warmstart_refs=True,
    conv_padding_mode='circular',
    init_param_kwargs=dict(bell=True),
    activation_nonlin=to.sigmoid,
    tau_init=10.,
    tau_learnable=True,
    kappa_init=1e-3,
    kappa_learnable=True,
    potential_init_learnable=True,
)
policy = NFPolicy(spec=env.spec, **policy_hparam)

algo_hparam = dict(
    max_iter=100,
    pop_size=5 * policy.num_param,
    expl_factor=1.05,
    num_rollouts=1,
    expl_std_init=1.0,
    num_workers=6,
)
algo = HCNormal(ex_dir, env, policy, **algo_hparam)

# Save the hyper-parameters
save_list_of_dicts_to_yaml(
    [dict(env=env_hparams, seed=args.seed), dict(policy=policy_hparam), dict(algo=algo_hparam, algo_name=algo.name)],
    ex_dir,
)

# Jeeeha
algo.train(seed=args.seed)
def test_npdr_and_bayessim(
    ex_dir,
    algo_name: str,
    env: SimEnv,
    num_segments: int,
    len_segments: int,
    num_real_rollouts: int,
    num_sbi_rounds: int,
    use_rec_act: bool,
):
    pyrado.set_seed(0)

    # Create a fake ground truth target domain
    env_real = deepcopy(env)
    dp_nom = env.get_nominal_domain_param()
    env_real.domain_param = dict(
        mass_pend_pole=dp_nom["mass_pend_pole"] * 1.2, length_pend_pole=dp_nom["length_pend_pole"] * 0.8
    )

    # Reduce the number of steps to make this test run faster
    env.max_steps = 40
    env_real.max_steps = 40

    # Policy
    policy = QQubeSwingUpAndBalanceCtrl(env.spec)

    # Define a mapping: index - domain parameter
    dp_mapping = {1: "mass_pend_pole", 2: "length_pend_pole"}

    # Prior
    prior_hparam = dict(
        low=to.tensor([dp_nom["mass_pend_pole"] * 0.5, dp_nom["length_pend_pole"] * 0.5]),
        high=to.tensor([dp_nom["mass_pend_pole"] * 1.5, dp_nom["length_pend_pole"] * 1.5]),
    )
    prior = sbiutils.BoxUniform(**prior_hparam)

    # Time series embedding
    embedding = BayesSimEmbedding(
        env.spec,
        RolloutSamplerForSBI.get_dim_data(env.spec),
        downsampling_factor=3,
    )

    # Posterior (normalizing flow)
    posterior_hparam = dict(model="maf", embedding_net=nn.Identity(), hidden_features=20, num_transforms=3)

    # Policy optimization subroutine
    subrtn_policy_hparam = dict(
        max_iter=1,
        pop_size=2,
        num_init_states_per_domain=1,
        num_domains=2,
        expl_std_init=0.1,
        expl_factor=1.1,
        num_workers=1,
    )
    subrtn_policy = HCNormal(ex_dir, env, policy, **subrtn_policy_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=1,
        num_sim_per_round=20,
        num_real_rollouts=num_real_rollouts,
        num_sbi_rounds=num_sbi_rounds,
        simulation_batch_size=1,
        normalize_posterior=False,
        num_eval_samples=2,
        num_segments=num_segments,
        len_segments=len_segments,
        use_rec_act=use_rec_act,
        stop_on_done=True,
        subrtn_sbi_training_hparam=dict(max_num_epochs=1),  # only train the posterior for 1 epoch
        # subrtn_sbi_sampling_hparam=dict(sample_with_mcmc=True, mcmc_parameters=dict(warmup_steps=20)),
        num_workers=1,
    )

    skip = False
    if algo_name == NPDR.name:
        algo = NPDR(
            save_dir=ex_dir,
            env_sim=env,
            env_real=env_real,
            policy=policy,
            dp_mapping=dp_mapping,
            prior=prior,
            embedding=embedding,
            subrtn_sbi_class=SNPE_C,
            posterior_hparam=posterior_hparam,
            subrtn_policy=subrtn_policy,
            **algo_hparam,
        )
    elif algo_name == BayesSim.name:
        # We are not checking multi-round SNPE-A since it has known issues
        if algo_hparam["num_sbi_rounds"] > 1:
            skip = True
        algo = BayesSim(
            save_dir=ex_dir,
            env_sim=env,
            env_real=env_real,
            policy=policy,
            dp_mapping=dp_mapping,
            embedding=embedding,
            prior=prior,
            subrtn_policy=subrtn_policy,
            **algo_hparam,
        )
    else:
        raise NotImplementedError

    if not skip:
        algo.train()
        # Just checking the interface here
        assert algo.curr_iter == algo.max_iter
i.e. even for the same random seed, you will get different results. Moreover, it is advisable to set
`num_workers` to 1 if you want to debug your code.

The algorithms can be categorized into two different types: one type randomizes the action at every step
(its exploration strategy inherits from `StochasticActionExplStrat`), and the other type randomizes the
policy parameters once per rollout (its exploration strategy inherits from `StochasticParamExplStrat`).
It goes without saying that every algorithm has different hyper-parameters. However, they all use the
same `rollout()` function to generate their data.
"""
algo_hparam = dict(
    max_iter=8,
    pop_size=20,
    num_init_states_per_domain=10,
    expl_factor=1.1,
    expl_std_init=1.0,
    num_workers=4,
)
algo = HCNormal(ex_dir, env, policy, **algo_hparam)

"""
Save the hyper-parameters to a YAML file before starting the training. This step is not strictly
necessary, but it helps you to later see which hyper-parameters you used, i.e. which setting leads to a
successfully trained policy.
"""
save_dicts_to_yaml(
    dict(env=env_hparams, seed=0),
    dict(policy=policy_hparam),
    dict(algo=algo_hparam, algo_name=algo.name),
    save_dir=ex_dir,
)

"""
Finally, start the training. The `train()` function is the same for all algorithms inheriting from the
`Algorithm` base class. It repeatedly calls the algorithm's custom `step()` and `update()` functions. You
can load and continue a previous experiment using the Algorithm's `load()` method. The snapshot mode
determines when to save the current training state, e.g. 'latest' saves after every step of the
algorithm, and 'best' only saves if the current policy is the best so far.
"""
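# A minimal sketch of the training call described above. The `seed` keyword matches the calls used
# elsewhere in this section (e.g. `algo.train(seed=args.seed)`); passing the snapshot mode as the
# `snapshot_mode` keyword is an assumption based on the description of the snapshot modes.
algo.train(snapshot_mode="latest", seed=0)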
policy_hparam = dict(
    feats=FeatureStack(const_feat, identity_feat, sign_feat, squared_feat, MultFeat((0, 2)), MultFeat((1, 2)))
)
policy = LinearPolicy(spec=env_sim.spec, **policy_hparam)

# Policy optimization subroutine
subrtn_policy_hparam = dict(
    max_iter=5,
    pop_size=5 * policy.num_param,
    num_domains=20,
    num_init_states_per_domain=1,
    expl_factor=1.05,
    expl_std_init=1.0,
    num_workers=args.num_workers,
)
subrtn_policy = HCNormal(ex_dir, env_sim, policy, **subrtn_policy_hparam)

# Algorithm
algo_hparam = dict(
    max_iter=5,
    num_real_rollouts=num_real_rollouts,
    num_sim_per_round=200,
    num_sbi_rounds=3,
    simulation_batch_size=10,
    normalize_posterior=False,
    num_eval_samples=100,
    # num_segments=1,
    len_segments=100,
    stop_on_done=False,
    posterior_hparam=posterior_hparam,
    subrtn_sbi_training_hparam=dict(