def create_qqsu_setup():
    # Environments
    env_hparams = dict(dt=1 / 100.0, max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        mass_rot_pole=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        mass_pend_pole=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        length_rot_pole=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        length_pend_pole=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    # The means and stds below are dummies; they are overwritten through the MetaDomainRandWrapper
    # during system identification
    randomizer = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.0, std=1e-9, clip_lo=1e-3),
        NormalDomainParam(name="mass_pend_pole", mean=0.0, std=1e-9, clip_lo=1e-3),
        NormalDomainParam(name="length_rot_pole", mean=0.0, std=1e-9, clip_lo=1e-3),
        NormalDomainParam(name="length_pend_pole", mean=0.0, std=1e-9, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ("mass_rot_pole", "mean"),
        1: ("mass_rot_pole", "std"),
        2: ("mass_pend_pole", "mean"),
        3: ("mass_pend_pole", "std"),
        4: ("length_rot_pole", "mean"),
        5: ("length_rot_pole", "std"),
        6: ("length_pend_pole", "mean"),
        7: ("length_pend_pole", "std"),
    }
    # trafo_mask = [False, True, False, True, False, True, False, True]
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec)
    prior = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.095, std=0.095 / 10),
        NormalDomainParam(name="mass_pend_pole", mean=0.024, std=0.024 / 10),
        NormalDomainParam(name="length_rot_pole", mean=0.085, std=0.085 / 10),
        NormalDomainParam(name="length_pend_pole", mean=0.129, std=0.129 / 10),
    )
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map, trafo_mask=trafo_mask, prior=prior, scale_params=False)

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy
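
# A quick smoke test (sketch, not part of the original script): roll out the nominal
# swing-up controller once on the perturbed "real" simulator returned above. It assumes
# pyrado's `rollout` helper; `_check_qqsu_setup` is a hypothetical name.
def _check_qqsu_setup():
    from pyrado.sampling.rollout import rollout

    env_sim, env_real, _, _, behavior_policy, _ = create_qqsu_setup()
    ro = rollout(env_real, behavior_policy, eval=True)
    print(f"Return over {ro.length} steps: {ro.undiscounted_return():.2f}")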
def create_bob_setup():
    # Environments
    env_hparams = dict(dt=1 / 100.0, max_steps=500)
    env_real = BallOnBeamSim(**env_hparams)
    env_real.domain_param = dict(
        # l_beam=1.95,
        # ang_offset=-0.03,
        g=10.81,
    )

    env_sim = BallOnBeamSim(**env_hparams)
    randomizer = DomainRandomizer(
        # NormalDomainParam(name="l_beam", mean=0, std=1e-12, clip_lo=1.5, clip_up=3.5),
        # UniformDomainParam(name="ang_offset", mean=0, halfspan=1e-12),
        NormalDomainParam(name="g", mean=0, std=1e-12),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        # 0: ("l_beam", "mean"), 1: ("l_beam", "std"),
        # 2: ("ang_offset", "mean"), 3: ("ang_offset", "halfspan"),
        0: ("g", "mean"),
        1: ("g", "std"),
    }
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = LinearPolicy(env_sim.spec, feats=FeatureStack([identity_feat, sin_feat]))
    behavior_policy.param_values = to.tensor(
        [3.8090, -3.8036, -1.0786, -2.4510, -0.9875, -1.3252, 3.1503, 1.4443]
    )
    prior = DomainRandomizer(
        # NormalDomainParam(name="l_beam", mean=2.05, std=2.05 / 10),
        # UniformDomainParam(name="ang_offset", mean=0.03, halfspan=0.03 / 10),
        NormalDomainParam(name="g", mean=8.81, std=8.81 / 10),
    )
    # trafo_mask = [False, True, False, True]
    trafo_mask = [True, True]
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map, trafo_mask=trafo_mask, prior=prior, scale_params=True)

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy
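
# Analogous sanity check for the ball-on-beam setup (sketch): the distribution-parameter
# policy should expose one scalar per dp_map entry (here: mean and std of g). `num_param`
# is assumed to be the parameter count from pyrado's Policy base class; adapt if the API differs.
def _check_bob_setup():
    env_sim, env_real, _, dp_map, behavior_policy, ddp_policy = create_bob_setup()
    assert ddp_policy.num_param == len(dp_map)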
    lr_scheduler_hparam=dict(gamma=0.999),
)
subrtn_policy = PPO(ex_dir, env_sim, behav_policy, critic, **subrtn_policy_hparam)

# Subroutine for system identification
prior = DomainRandomizer(
    NormalDomainParam(name="mass_rot_pole", mean=0.095, std=0.095 / 10),
    NormalDomainParam(name="mass_pend_pole", mean=0.024, std=0.024 / 10),
    NormalDomainParam(name="length_rot_pole", mean=0.085, std=0.085 / 10),
    NormalDomainParam(name="length_pend_pole", mean=0.129, std=0.129 / 10),
)
ddp_policy_hparam = dict(mapping=dp_map, trafo_mask=trafo_mask, scale_params=False)
ddp_policy = DomainDistrParamPolicy(prior=prior, **ddp_policy_hparam)
subsubrtn_distr_hparam = dict(
    max_iter=10,
    pop_size=None,
    eta_mean=1.0,
    num_init_states_per_domain=1,
    expl_std_init=5e-2,
    expl_std_min=1e-4,
    num_workers=num_workers,
)
subsubrtn_distr = NES(ex_dir, env_sim, ddp_policy, **subsubrtn_distr_hparam)
subrtn_distr_hparam = dict(
    metric=None,
    obs_dim_weight=[1.0, 1.0, 1.0, 1.0, 2.0, 2.0],
    num_rollouts_per_distr=len(dp_map) * 10,
    num_workers=num_workers,
)
subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behav_policy, **subrtn_distr_hparam)
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environments
    env_hparams = dict(dt=1 / 100.0, max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        Mr=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        Mp=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        Lr=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        Lp=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    # Placeholder distributions; the distribution parameters are adapted through the MetaDomainRandWrapper
    randomizer = DomainRandomizer(
        NormalDomainParam(name="Mr", mean=0.0, std=1e6, clip_lo=1e-3),
        NormalDomainParam(name="Mp", mean=0.0, std=1e6, clip_lo=1e-3),
        NormalDomainParam(name="Lr", mean=0.0, std=1e6, clip_lo=1e-3),
        NormalDomainParam(name="Lp", mean=0.0, std=1e6, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ("Mr", "mean"),
        1: ("Mr", "std"),
        2: ("Mp", "mean"),
        3: ("Mp", "std"),
        4: ("Lr", "mean"),
        5: ("Lr", "std"),
        6: ("Lp", "mean"),
        7: ("Lp", "std"),
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9885,
        lamda=0.9648,
        num_epoch=2,
        batch_size=500,
        standardize_adv=False,
        lr=5.792e-4,
        max_grad_norm=1.0,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=200,
        min_steps=3 * 23 * env_sim.max_steps,
        num_epoch=7,
        eps_clip=0.0744,
        batch_size=500,
        std_init=0.9074,
        lr=3.446e-04,
        max_grad_norm=1.0,
        num_workers=1,
    )
    subrtn_policy = PPO(study_dir, env_sim, behav_policy, critic, **subrtn_policy_hparam)

    # Subroutine for system identification
    prior_std_denom = trial.suggest_uniform("prior_std_denom", 5, 20)
    prior = DomainRandomizer(
        NormalDomainParam(name="Mr", mean=0.095, std=0.095 / prior_std_denom),
        NormalDomainParam(name="Mp", mean=0.024, std=0.024 / prior_std_denom),
        NormalDomainParam(name="Lr", mean=0.085, std=0.085 / prior_std_denom),
        NormalDomainParam(name="Lp", mean=0.129, std=0.129 / prior_std_denom),
    )
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map,
        trafo_mask=trafo_mask,
        prior=prior,
        scale_params=trial.suggest_categorical("ddp_policy_scale_params", [True, False]),
    )
    subsubrtn_distr_hparam = dict(
        max_iter=trial.suggest_categorical("subsubrtn_distr_max_iter", [20]),
        pop_size=trial.suggest_int("pop_size", 50, 500),
        num_rollouts=1,
        num_is_samples=trial.suggest_int("num_is_samples", 5, 20),
        expl_std_init=trial.suggest_loguniform("expl_std_init", 1e-3, 1e-1),
        expl_std_min=trial.suggest_categorical("expl_std_min", [1e-4]),
        extra_expl_std_init=trial.suggest_loguniform("extra_expl_std_init", 1e-3, 1e-1),
        extra_expl_decay_iter=trial.suggest_int("extra_expl_decay_iter", 0, 10),
        num_workers=1,
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f"trial_{trial.number}"))
    subsubrtn_distr = CEM(study_dir, env_sim, ddp_policy, **subsubrtn_distr_hparam, logger=csv_logger)
    obs_vel_weight = trial.suggest_loguniform("obs_vel_weight", 1, 100)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, obs_vel_weight, obs_vel_weight],
        num_rollouts_per_distr=trial.suggest_int("num_rollouts_per_distr", 20, 100),
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behav_policy, **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=trial.suggest_categorical("algo_max_iter", [10]),
        num_eval_rollouts=trial.suggest_categorical("algo_num_eval_rollouts", [5]),
        warmstart=trial.suggest_categorical("algo_warmstart", [True]),
        thold_succ_subrtn=trial.suggest_categorical("algo_thold_succ_subrtn", [50]),
        subrtn_snapshot_mode="latest",
    )
    algo = SimOpt(study_dir, env_sim, env_real, subrtn_policy, subrtn_distr, **algo_hparam, logger=csv_logger)

    # Jeeeha
    algo.train(seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env_real, algo.policy, num_workers=1, min_rollouts=min_rollouts
    )  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum(r.undiscounted_return() for r in ros) / min_rollouts

    return mean_ret
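
# A minimal launch sketch (assumed driver code, not part of the objective itself): Optuna
# only passes `trial`, so the custom arguments are bound via `functools.partial`, as noted
# in the docstring. The study directory, seed, and trial counts are illustrative.
if __name__ == "__main__":
    import functools

    study_dir = osp.join("data", "temp", "optuna_simopt_qq-su")  # hypothetical path
    study = optuna.create_study(direction="maximize")
    study.optimize(
        functools.partial(train_and_eval, study_dir=study_dir, seed=0),
        n_trials=50,
        n_jobs=4,  # the sampler above uses num_workers=1; parallelism comes from concurrent trials
    )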