def test_dr_wrapper_live_bob(env):
    param_init = env.domain_param
    randomizer = create_default_randomizer(env)
    wrapper = DomainRandWrapperLive(env, randomizer)
    # So far no randomization happened, thus the domain parameters should equal the initial ones
    assert env.domain_param == param_init

    # Reset 10 times, each reset should sample one new parameter set
    for _ in range(10):
        param_old = wrapper.domain_param
        wrapper.reset()
        assert param_old != wrapper.domain_param

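# A minimal sketch (not the Pyrado implementation) of the contract the test above relies
# on: the live wrapper draws a fresh set of domain parameters on every reset.
class LiveRandWrapperSketch:
    def __init__(self, wrapped_env, sample_params):
        self._env = wrapped_env
        self._sample_params = sample_params  # hypothetical callable returning a dict of domain parameters

    def reset(self, **kwargs):
        self._env.domain_param = self._sample_params()  # new physics for the upcoming rollout
        return self._env.reset(**kwargs)
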
def eval_randomized_domain(
    pool: SamplerPool,
    env: SimEnv,
    randomizer: DomainRandomizer,
    policy: Policy,
    init_states: List[np.ndarray],
) -> List[StepSequence]:
    """
    Evaluate a policy in a randomized domain.

    :param pool: parallel sampler
    :param env: environment to evaluate in
    :param randomizer: randomizer used to sample random domain instances, inherited from `DomainRandomizer`
    :param policy: policy to evaluate
    :param init_states: initial states of the environment which will be fixed if not set to `None`
    :return: list of rollouts
    """
    # Randomize the environments
    env = remove_all_dr_wrappers(env)
    env = DomainRandWrapperLive(env, randomizer)

    pool.invoke_all(_ps_init, pickle.dumps(env), pickle.dumps(policy))

    # Run with progress bar
    with tqdm(leave=False, file=sys.stdout, unit="rollouts", desc="Sampling") as pb:
        return pool.run_map(functools.partial(_ps_run_one_init_state, eval=True), init_states, pb)

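# Hypothetical call site for the helper above; the pool size, the number of initial
# states, and the surrounding setup are assumptions, not part of the original code.
pool = SamplerPool(4)  # number of parallel workers
init_states = [env.init_space.sample_uniform() for _ in range(50)]
rollouts = eval_randomized_domain(pool, env, randomizer, policy, init_states)
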
def create_qqsu_setup():
    # Environments
    env_hparams = dict(dt=1 / 100.0, max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        mass_rot_pole=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        mass_pend_pole=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        length_rot_pole=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        length_pend_pole=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    randomizer = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.0, std=1e-9, clip_lo=1e-3),
        NormalDomainParam(name="mass_pend_pole", mean=0.0, std=1e-9, clip_lo=1e-3),
        NormalDomainParam(name="length_rot_pole", mean=0.0, std=1e-9, clip_lo=1e-3),
        NormalDomainParam(name="length_pend_pole", mean=0.0, std=1e-9, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ("mass_rot_pole", "mean"),
        1: ("mass_rot_pole", "std"),
        2: ("mass_pend_pole", "mean"),
        3: ("mass_pend_pole", "std"),
        4: ("length_rot_pole", "mean"),
        5: ("length_rot_pole", "std"),
        6: ("length_pend_pole", "mean"),
        7: ("length_pend_pole", "std"),
    }
    # trafo_mask = [False, True, False, True, False, True, False, True]
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec)
    prior = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.095, std=0.095 / 10),
        NormalDomainParam(name="mass_pend_pole", mean=0.024, std=0.024 / 10),
        NormalDomainParam(name="length_rot_pole", mean=0.085, std=0.085 / 10),
        NormalDomainParam(name="length_pend_pole", mean=0.129, std=0.129 / 10),
    )
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map, trafo_mask=trafo_mask, prior=prior, scale_params=False)

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy

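# For orientation: with the `dp_map` above, the flat 8-dimensional vector produced by
# `ddp_policy` is read entry-wise as the (mean, std) pairs of the four randomized
# parameters. A hypothetical decoding helper illustrating the mapping:
def decode_ddp_params(flat_params, dp_map):
    """Group a flat parameter vector into {domain_param: {distr_param: value}} (illustrative only)."""
    decoded = {}
    for idx, (dp_name, distr_param) in dp_map.items():
        decoded.setdefault(dp_name, {})[distr_param] = float(flat_params[idx])
    return decoded
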
def test_param_expl_sampler(
    env: SimEnv,
    policy: Policy,
    num_init_states_per_domain: int,
    fixed_init_state: bool,
    num_domains: int,
    num_workers: int,
):
    pyrado.set_seed(0)

    # Add randomizer
    pert = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, pert)

    # Create the sampler
    sampler = ParameterExplorationSampler(env, policy, num_init_states_per_domain, num_domains, num_workers=num_workers)

    # Use some random parameters
    num_ps = 7
    params = to.rand(num_ps, policy.num_param)

    if fixed_init_state:
        # Sample a custom init state
        init_states = [env.init_space.sample_uniform()] * num_init_states_per_domain
    else:
        # Let the sampler forward to the env to randomly sample an init state
        init_states = None

    # Do the sampling
    samples = sampler.sample(param_sets=params, init_states=init_states)

    # Check if the correct number of rollouts has been sampled
    assert num_ps == len(samples)
    num_rollouts_per_param = num_init_states_per_domain * num_domains
    assert num_ps * num_rollouts_per_param == samples.num_rollouts
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for idx in range(num_rollouts_per_param):
        # Use the first parameter set as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[idx]
        # Iterate through others
        for ops in piter:
            other_ro = ops.rollouts[idx]
            # Compare domain params
            assert pivot.rollout_info["domain_param"] == other_ro.rollout_info["domain_param"]
            # Compare first observation a.k.a. init state
            assert pivot[0].observation == pytest.approx(other_ro[0].observation)

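# The matching assertions above hold because the sampler evaluates every candidate
# parameter set under the same grid of domains and initial states. A hypothetical
# sketch of that pairing logic (`run_rollout` is an assumed callable, not a Pyrado function):
def paired_eval(param_sets, domains, init_states, run_rollout):
    results = []
    for ps in param_sets:
        # Identical conditions for each parameter set, so rollouts at the same index match
        results.append([run_rollout(ps, dom, init) for dom in domains for init in init_states])
    return results
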
def test_sprl(ex_dir, env: SimEnv, optimize_mean: bool):
    pyrado.set_seed(0)

    env = ActNormWrapper(env)
    env_sprl_params = [
        dict(
            name="gravity_const",
            target_mean=to.tensor([9.81]),
            target_cov_chol_flat=to.tensor([1.0]),
            init_mean=to.tensor([9.81]),
            init_cov_chol_flat=to.tensor([0.05]),
        )
    ]
    randomizer = DomainRandomizer(*[SelfPacedDomainParam(**p) for p in env_sprl_params])
    env = DomainRandWrapperLive(env, randomizer=randomizer)

    policy = FNNPolicy(env.spec, hidden_sizes=[64, 64], hidden_nonlin=to.tanh)

    vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.relu)
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9844534412010116,
        lamda=0.9710614403461155,
        num_epoch=10,
        batch_size=150,
        standardize_adv=False,
        lr=0.00016985313083236645,
    )
    critic = GAE(vfcn, **critic_hparam)

    subrtn_hparam = dict(
        max_iter=1,
        eps_clip=0.12648736789309026,
        min_steps=10 * env.max_steps,
        num_epoch=3,
        batch_size=150,
        std_init=0.7573286998997557,
        lr=6.999956625305722e-04,
        max_grad_norm=1.0,
        num_workers=1,
    )

    algo_hparam = dict(
        kl_constraints_ub=8000,
        performance_lower_bound=500,
        std_lower_bound=0.4,
        kl_threshold=200,
        max_iter=1,
        optimize_mean=optimize_mean,
    )

    algo = SPRL(env, PPO(ex_dir, env, policy, critic, **subrtn_hparam), **algo_hparam)
    algo.train(snapshot_mode="latest")

    assert algo.curr_iter == algo.max_iter

def test_cuda_sampling_w_dr(env: SimEnv, policy: Policy, num_workers: int):
    randomizer = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, randomizer)
    sampler = ParallelRolloutSampler(env, policy, num_workers=num_workers, min_rollouts=4)
    samples = sampler.sample()

    assert samples is not None

def test_cuda_sampling_w_dr(default_bob, bob_pert):
    # Add randomizer
    env = DomainRandWrapperLive(default_bob, bob_pert)

    # Use a simple policy
    policy = FNNPolicy(env.spec, hidden_sizes=[8], hidden_nonlin=to.tanh, use_cuda=True)

    # Create the sampler
    sampler = ParallelSampler(env, policy, num_envs=2, min_rollouts=10)

    samples = sampler.sample()
    assert samples is not None

def test_bayrn_power(ex_dir, env: SimEnv, bayrn_hparam: dict):
    pyrado.set_seed(0)

    # Environments and domain randomization
    env_real = deepcopy(env)
    env_sim = DomainRandWrapperLive(env, create_zero_var_randomizer(env))
    dp_map = create_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)
    env_real.domain_param = dict(mass_pend_pole=0.024 * 1.1, mass_rot_pole=0.095 * 1.1)
    env_real = wrap_like_other_env(env_real, env_sim)

    # Policy and subroutine
    policy_hparam = dict(energy_gain=0.587, ref_energy=0.827)
    policy = QQubeSwingUpAndBalanceCtrl(env_sim.spec, **policy_hparam)
    subrtn_hparam = dict(
        max_iter=1,
        pop_size=8,
        num_init_states_per_domain=1,
        num_is_samples=4,
        expl_std_init=0.1,
        num_workers=1,
    )
    subrtn = PoWER(ex_dir, env_sim, policy, **subrtn_hparam)

    # Set the boundaries for the GP
    dp_nom = inner_env(env_sim).get_nominal_domain_param()
    ddp_space = BoxSpace(
        bound_lo=np.array([0.8 * dp_nom["mass_pend_pole"], 1e-8, 0.8 * dp_nom["mass_rot_pole"], 1e-8]),
        bound_up=np.array([1.2 * dp_nom["mass_pend_pole"], 1e-7, 1.2 * dp_nom["mass_rot_pole"], 1e-7]),
    )

    # Create algorithm and train
    algo = BayRn(ex_dir, env_sim, env_real, subrtn, ddp_space, **bayrn_hparam, num_workers=1)
    algo.train()

    assert algo.curr_iter == algo.max_iter or algo.stopping_criterion_met()

def test_pddr(ex_dir, env: SimEnv, policy, algo_hparam):
    pyrado.set_seed(0)

    # Create algorithm and train
    teacher_policy = deepcopy(policy)
    critic = GAE(
        vfcn=FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
    )
    teacher_algo_hparam = dict(critic=critic, min_steps=1500, max_iter=2)
    teacher_algo = PPO

    # Wrapper
    randomizer = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, randomizer)

    # Subroutine
    algo_hparam = dict(
        max_iter=2,
        min_steps=env.max_steps,
        std_init=0.15,
        num_epochs=10,
        num_teachers=2,
        teacher_policy=teacher_policy,
        teacher_algo=teacher_algo,
        teacher_algo_hparam=teacher_algo_hparam,
        num_workers=1,
    )
    algo = PDDR(ex_dir, env, policy, **algo_hparam)
    algo.train()

    assert algo.curr_iter == algo.max_iter

    # Save and load
    algo.save_snapshot(meta_info=None)
    algo_loaded = Algorithm.load_snapshot(load_dir=ex_dir)
    assert isinstance(algo_loaded, Algorithm)
    policy_loaded = algo_loaded.policy

    # Check
    assert all(algo.policy.param_values == policy_loaded.param_values)

    # Load the experiment. Since we did not save any hyper-parameters, we ignore the errors when loading.
    env, policy, extra = load_experiment(ex_dir)
    assert isinstance(env, Env)
    assert isinstance(policy, Policy)
    assert isinstance(extra, dict)

def create_bob_setup():
    # Environments
    env_hparams = dict(dt=1 / 100., max_steps=500)
    env_real = BallOnBeamSim(**env_hparams)
    env_real.domain_param = dict(
        # l_beam=1.95,
        # ang_offset=-0.03,
        g=10.81
    )

    env_sim = BallOnBeamSim(**env_hparams)
    randomizer = DomainRandomizer(
        # NormalDomainParam(name='l_beam', mean=0, std=1e-12, clip_lo=1.5, clip_up=3.5),
        # UniformDomainParam(name='ang_offset', mean=0, halfspan=1e-12),
        NormalDomainParam(name='g', mean=0, std=1e-12),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        # 0: ('l_beam', 'mean'), 1: ('l_beam', 'std'),
        # 2: ('ang_offset', 'mean'), 3: ('ang_offset', 'halfspan')
        0: ('g', 'mean'), 1: ('g', 'std')
    }
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = LinearPolicy(env_sim.spec, feats=FeatureStack([identity_feat, sin_feat]))
    behavior_policy.param_values = to.tensor([3.8090, -3.8036, -1.0786, -2.4510, -0.9875, -1.3252, 3.1503, 1.4443])
    prior = DomainRandomizer(
        # NormalDomainParam(name='l_beam', mean=2.05, std=2.05/10),
        # UniformDomainParam(name='ang_offset', mean=0.03, halfspan=0.03/10),
        NormalDomainParam(name='g', mean=8.81, std=8.81 / 10),
    )
    # trafo_mask = [False, True, False, True]
    trafo_mask = [True, True]
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map, trafo_mask=trafo_mask, prior=prior, scale_params=True)

    return env_sim, env_real, env_hparams, dp_map, behavior_policy, ddp_policy

def test_param_expl_sampler(default_bob, bob_pert):
    # Add randomizer
    env = DomainRandWrapperLive(default_bob, bob_pert)

    # Use a simple policy
    policy = FNNPolicy(env.spec, hidden_sizes=[8], hidden_nonlin=to.tanh)

    # Create the sampler
    num_rollouts_per_param = 12
    sampler = ParameterExplorationSampler(
        env,
        policy,
        num_envs=1,
        num_rollouts_per_param=num_rollouts_per_param,
    )

    # Use some random parameters
    num_ps = 12
    params = to.rand(num_ps, policy.num_param)

    # Do the sampling
    samples = sampler.sample(params)
    assert num_ps == len(samples)
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for ri in range(num_rollouts_per_param):
        # Use the first paramset as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[ri]
        # Iterate through others
        for ops in piter:
            ro = ops.rollouts[ri]
            # Compare domain params
            assert pivot.rollout_info['domain_param'] == ro.rollout_info['domain_param']
            # Compare first observation a.k.a. init state
            assert pivot[0].observation == pytest.approx(ro[0].observation)

def eval_randomized_domain(pool: SamplerPool,
                           env: SimEnv,
                           randomizer: DomainRandomizer,
                           policy: Policy,
                           init_states: list) -> list:
    """
    Evaluate a policy in a randomized domain.

    :param pool: parallel sampler
    :param env: environment to evaluate in
    :param randomizer: randomizer used to sample random domain instances, inherited from `DomainRandomizer`
    :param policy: policy to evaluate
    :param init_states: initial states of the environment which will be fixed if not set to None
    :return: list of rollouts
    """
    # Randomize the environments
    env = DomainRandWrapperLive(env, randomizer)

    pool.invoke_all(_setup_env_policy, env, policy)

    # Run with progress bar
    with tqdm(leave=False, file=sys.stdout, unit='rollouts', desc='Sampling') as pb:
        return pool.run_map(_run_rollout_nom, init_states, pb)

def test_param_expl_sampler(env: SimEnv, policy: Policy):
    # Add randomizer
    pert = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, pert)

    # Create the sampler
    num_rollouts_per_param = 12
    sampler = ParameterExplorationSampler(env, policy, num_workers=1, num_rollouts_per_param=num_rollouts_per_param)

    # Use some random parameters
    num_ps = 12
    params = to.rand(num_ps, policy.num_param)

    # Do the sampling
    samples = sampler.sample(params)
    assert num_ps == len(samples)
    for ps in samples:
        assert len(ps.rollouts) == num_rollouts_per_param

    # Compare rollouts that should be matching
    for ri in range(num_rollouts_per_param):
        # Use the first paramset as pivot
        piter = iter(samples)
        pivot = next(piter).rollouts[ri]
        # Iterate through others
        for ops in piter:
            ro = ops.rollouts[ri]
            # Compare domain params
            assert pivot.rollout_info['domain_param'] == ro.rollout_info['domain_param']
            # Compare first observation a.k.a. init state
            assert pivot[0].observation == pytest.approx(ro[0].observation)

    max_grad_norm=1.0,
    num_workers=8,
    lr_scheduler=lr_scheduler.ExponentialLR,
    lr_scheduler_hparam=dict(gamma=0.999),
)

env_sprl_params = [
    dict(
        name="gravity_const",
        target_mean=to.tensor([9.81]),
        target_cov_chol_flat=to.tensor([1.0]),
        init_mean=to.tensor([9.81]),
        init_cov_chol_flat=to.tensor([0.05]),
    )
]
env = DomainRandWrapperLive(
    env, randomizer=DomainRandomizer(*[SelfPacedDomainParam(**p) for p in env_sprl_params])
)

sprl_hparam = dict(
    kl_constraints_ub=8000,
    performance_lower_bound=500,
    std_lower_bound=0.4,
    kl_threshold=200,
    max_iter=args.sprl_iterations,
    optimize_mean=not args.cov_only,
)
algo = SPRL(env, PPO(ex_dir, env, policy, critic, **algo_hparam), **sprl_hparam)

# Save the hyper-parameters
save_dicts_to_yaml(

def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environments
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        Mr=0.095 * 0.9,  # 0.095*0.9 = 0.0855
        Mp=0.024 * 1.1,  # 0.024*1.1 = 0.0264
        Lr=0.085 * 0.9,  # 0.085*0.9 = 0.0765
        Lp=0.129 * 1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    randomizer = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Mp', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lp', mean=0., std=1e6, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ('Mr', 'mean'), 1: ('Mr', 'std'),
        2: ('Mp', 'mean'), 3: ('Mp', 'std'),
        4: ('Lr', 'mean'), 5: ('Lr', 'std'),
        6: ('Lp', 'mean'), 7: ('Lp', 'std')
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9885,
        lamda=0.9648,
        num_epoch=2,
        batch_size=500,
        standardize_adv=False,
        lr=5.792e-4,
        max_grad_norm=1.,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=200,
        min_steps=3 * 23 * env_sim.max_steps,
        num_epoch=7,
        eps_clip=0.0744,
        batch_size=500,
        std_init=0.9074,
        lr=3.446e-04,
        max_grad_norm=1.,
        num_workers=1,
    )
    subrtn_policy = PPO(study_dir, env_sim, behav_policy, critic, **subrtn_policy_hparam)

    # Subroutine for system identification
    prior_std_denom = trial.suggest_uniform('prior_std_denom', 5, 20)
    prior = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0.095, std=0.095 / prior_std_denom),
        NormalDomainParam(name='Mp', mean=0.024, std=0.024 / prior_std_denom),
        NormalDomainParam(name='Lr', mean=0.085, std=0.085 / prior_std_denom),
        NormalDomainParam(name='Lp', mean=0.129, std=0.129 / prior_std_denom),
    )
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map,
        trafo_mask=trafo_mask,
        prior=prior,
        scale_params=trial.suggest_categorical('ddp_policy_scale_params', [True, False]),
    )
    subsubrtn_distr_hparam = dict(
        max_iter=trial.suggest_categorical('subsubrtn_distr_max_iter', [20]),
        pop_size=trial.suggest_int('pop_size', 50, 500),
        num_rollouts=1,
        num_is_samples=trial.suggest_int('num_is_samples', 5, 20),
        expl_std_init=trial.suggest_loguniform('expl_std_init', 1e-3, 1e-1),
        expl_std_min=trial.suggest_categorical('expl_std_min', [1e-4]),
        # Use a distinct Optuna parameter name here; reusing 'expl_std_init' would silently return the same value
        extra_expl_std_init=trial.suggest_loguniform('extra_expl_std_init', 1e-3, 1e-1),
        extra_expl_decay_iter=trial.suggest_int('extra_expl_decay_iter', 0, 10),
        num_workers=1,
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}'))
    subsubrtn_distr = CEM(study_dir, env_sim, ddp_policy, **subsubrtn_distr_hparam, logger=csv_logger)
    obs_vel_weight = trial.suggest_loguniform('obs_vel_weight', 1, 100)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, obs_vel_weight, obs_vel_weight],
        num_rollouts_per_distr=trial.suggest_int('num_rollouts_per_distr', 20, 100),
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behav_policy, **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=trial.suggest_categorical('algo_max_iter', [10]),
        num_eval_rollouts=trial.suggest_categorical('algo_num_eval_rollouts', [5]),
        warmstart=trial.suggest_categorical('algo_warmstart', [True]),
        thold_succ_subrtn=trial.suggest_categorical('algo_thold_succ_subrtn', [50]),
        subrtn_snapshot_mode='latest',
    )
    algo = SimOpt(study_dir, env_sim, env_real, subrtn_policy, subrtn_distr, **algo_hparam, logger=csv_logger)

    # Jeeeha
    algo.train(seed=args.seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env_real, algo.policy, num_workers=1, min_rollouts=min_rollouts
    )  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret

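# Hypothetical driver for the objective above, following the docstring's hint to bind
# the extra arguments with `functools.partial` (study name and trial count are made up):
study = optuna.create_study(study_name='simopt_hparam_search', direction='maximize')
study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=0), n_trials=100)
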
        R=np.diag([0, 0, 1e-1, 2e-1]),
        Q_dev=np.diag([0.0, 0.0, 5]),
        # R_dev=np.diag([0., 0., 1e-3, 1e-3])
    ),
)
env = WAMBallInCupSim(**env_hparams)

# Randomizer
randomizer = DomainRandomizer(
    UniformDomainParam(name="cup_scale", mean=1.0, halfspan=0.2),
    NormalDomainParam(name="rope_length", mean=0.3, std=0.005),
    NormalDomainParam(name="ball_mass", mean=0.021, std=0.001),
    UniformDomainParam(name="joint_2_damping", mean=0.05, halfspan=0.05),
    UniformDomainParam(name="joint_2_dryfriction", mean=0.1, halfspan=0.1),
)
env = DomainRandWrapperLive(env, randomizer)

# Policy
bounds = ([0.0, 0.25, 0.5], [1.0, 1.5, 2.5])
policy_hparam = dict(rbf_hparam=dict(num_feat_per_dim=9, bounds=bounds, scale=None), dim_mask=2)
policy = DualRBFLinearPolicy(env.spec, **policy_hparam)

# Algorithm
algo_hparam = dict(
    max_iter=15,
    pop_size=100,
    num_is_samples=10,
    num_init_states_per_domain=2,
    num_domains=10,
    expl_std_init=np.pi / 12,
    expl_std_min=0.02,

    std=1e6, clip_lo=1e-3),
    NormalDomainParam(name="mass_pend_pole", mean=0.0, std=1e6, clip_lo=1e-3),
    NormalDomainParam(name="length_rot_pole", mean=0.0, std=1e6, clip_lo=1e-3),
    NormalDomainParam(name="length_pend_pole", mean=0.0, std=1e6, clip_lo=1e-3),
)
env_sim = DomainRandWrapperLive(env_sim, randomizer)
dp_map = {
    0: ("mass_rot_pole", "mean"),
    1: ("mass_rot_pole", "std"),
    2: ("mass_pend_pole", "mean"),
    3: ("mass_pend_pole", "std"),
    4: ("length_rot_pole", "mean"),
    5: ("length_rot_pole", "std"),
    6: ("length_pend_pole", "mean"),
    7: ("length_pend_pole", "std"),
}
trafo_mask = [False, True, False, True, False, True, False, True]
env_sim = MetaDomainRandWrapper(env_sim, dp_map)

# Subroutine for policy improvement
behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)

        lr_scheduler=lr_scheduler.ExponentialLR,
        lr_scheduler_hparam=dict(gamma=0.999),
        max_iter=args.max_iter_teacher,
        critic=critic,
    )
    teacher_algo = PPO
else:
    teacher_policy = None
    teacher_algo = None
    teacher_algo_hparam = None

# Wrapper
randomizer = create_default_randomizer(env_real)
env_real = DomainRandWrapperLive(env_real, randomizer)
env_real = ActNormWrapper(env_real)

# Policy
policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.relu, output_nonlin=to.tanh)
policy = FNNPolicy(spec=env_real.spec, **policy_hparam)

# Subroutine
algo_hparam = dict(
    max_iter=args.max_iter,
    min_steps=args.max_steps,
    std_init=0.15,
    num_epochs=args.num_epochs,
    num_teachers=args.num_teachers,

def test_sysidasrl_reps(ex_dir, env: SimEnv, num_eval_rollouts: int):
    pyrado.set_seed(0)

    def eval_ddp_policy(rollouts_real):
        init_states_real = np.array([ro.states[0, :] for ro in rollouts_real])
        rollouts_sim = []
        for i in range(num_eval_rollouts):
            rollouts_sim.append(
                rollout(env_sim, behavior_policy, eval=True, reset_kwargs=dict(init_state=init_states_real[i, :]))
            )

        # Clip the rollouts, yielding two lists of pairwise equally long rollouts
        ros_real_tr, ros_sim_tr = algo.truncate_rollouts(rollouts_real, rollouts_sim, replicate=False)
        assert len(ros_real_tr) == len(ros_sim_tr)
        assert all([np.allclose(r.states[0, :], s.states[0, :]) for r, s in zip(ros_real_tr, ros_sim_tr)])

        # Return the average loss
        losses = [algo.loss_fcn(ro_r, ro_s) for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr)]
        return float(np.mean(np.asarray(losses)))

    # Environments
    env_real = deepcopy(env)
    env_real.domain_param = dict(ang_offset=-2 * np.pi / 180)

    env_sim = deepcopy(env)
    randomizer = DomainRandomizer(
        UniformDomainParam(name="ang_offset", mean=0, halfspan=1e-6),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {0: ("ang_offset", "mean"), 1: ("ang_offset", "halfspan")}
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    assert env_real is not env_sim

    # Policies (the behavioral policy needs to be deterministic)
    behavior_policy = LinearPolicy(env_sim.spec, feats=FeatureStack(identity_feat))
    prior = DomainRandomizer(
        UniformDomainParam(name="ang_offset", mean=1 * np.pi / 180, halfspan=1 * np.pi / 180),
    )
    ddp_policy = DomainDistrParamPolicy(mapping=dp_map, trafo_mask=[False, True], prior=prior)

    # Subroutine
    subrtn_hparam = dict(
        max_iter=2,
        eps=1.0,
        pop_size=100,
        num_init_states_per_domain=1,
        expl_std_init=5e-2,
        expl_std_min=1e-4,
        num_workers=1,
    )
    subrtn = REPS(ex_dir, env_sim, ddp_policy, **subrtn_hparam)

    algo_hparam = dict(
        metric=None, obs_dim_weight=np.ones(env_sim.obs_space.shape), num_rollouts_per_distr=5, num_workers=1
    )
    algo = SysIdViaEpisodicRL(subrtn, behavior_policy, **algo_hparam)

    rollouts_real_tst = []
    for _ in range(num_eval_rollouts):
        rollouts_real_tst.append(rollout(env_real, behavior_policy, eval=True))
    loss_pre = eval_ddp_policy(rollouts_real_tst)

    # Mimic training
    while algo.curr_iter < algo.max_iter and not algo.stopping_criterion_met():
        algo.logger.add_value(algo.iteration_key, algo.curr_iter)

        # Create fake real-world data
        rollouts_real = []
        for _ in range(num_eval_rollouts):
            rollouts_real.append(rollout(env_real, behavior_policy, eval=True))

        algo.step(snapshot_mode="latest", meta_info=dict(rollouts_real=rollouts_real))

        algo.logger.record_step()
        algo._curr_iter += 1

    loss_post = eval_ddp_policy(rollouts_real_tst)
    assert loss_post <= loss_pre  # does not have to improve on every step

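# A minimal sketch of the pairwise truncation the inner function relies on (illustrative,
# not the `SysIdViaEpisodicRL.truncate_rollouts` implementation): clip each real/sim pair
# to the shorter rollout so the loss compares sequences of equal length.
def truncate_pairwise(ros_real, ros_sim):
    ros_real_tr, ros_sim_tr = [], []
    for ro_r, ro_s in zip(ros_real, ros_sim):
        n = min(len(ro_r), len(ro_s))
        ros_real_tr.append(ro_r[:n])
        ros_sim_tr.append(ro_s[:n])
    return ros_real_tr, ros_sim_tr
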
# Experiment (set seed before creating the modules)
ex_dir = setup_experiment(
    QQubeSwingUpSim.name,
    f"{BayRn.name}-{PPO.name}_{FNNPolicy.name}",
    "rand-mass_pend_pole-mass_rot_pole-length_pend_pole-length_rot_pole_lower-std",
)

# Set seed if desired
pyrado.set_seed(args.seed, verbose=True)

# Environments
env_sim_hparams = dict(dt=1 / 100.0, max_steps=600)
env_sim = QQubeSwingUpSim(**env_sim_hparams)
env_sim = ActNormWrapper(env_sim)
env_sim = DomainRandWrapperLive(env_sim, create_zero_var_randomizer(env_sim))
dp_map = create_default_domain_param_map_qq()
env_sim = MetaDomainRandWrapper(env_sim, dp_map)

env_real_hparams = dict(dt=1 / 500.0, max_steps=3000)
env_real = QQubeSwingUpReal(**env_real_hparams)
env_real = wrap_like_other_env(env_real, env_sim)

# Policy
policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
policy = FNNPolicy(spec=env_sim.spec, **policy_hparam)

# Critic
vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.tanh)
vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace), **vfcn_hparam)

def test_simopt_cem_ppo(ex_dir, env: SimEnv):
    pyrado.set_seed(0)

    # Environments
    env_real = deepcopy(env)
    env_real = ActNormWrapper(env_real)
    env_sim = ActNormWrapper(env)
    randomizer = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.0, std=1e6, clip_lo=1e-3),
        NormalDomainParam(name="mass_pend_pole", mean=0.0, std=1e6, clip_lo=1e-3),
        NormalDomainParam(name="length_rot_pole", mean=0.0, std=1e6, clip_lo=1e-3),
        NormalDomainParam(name="length_pend_pole", mean=0.0, std=1e6, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ("mass_rot_pole", "mean"),
        1: ("mass_rot_pole", "std"),
        2: ("mass_pend_pole", "mean"),
        3: ("mass_pend_pole", "std"),
        4: ("length_rot_pole", "mean"),
        5: ("length_rot_pole", "std"),
        6: ("length_pend_pole", "mean"),
        7: ("length_pend_pole", "std"),
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.relu)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.99,
        lamda=0.98,
        num_epoch=2,
        batch_size=128,
        standardize_adv=True,
        lr=8e-4,
        max_grad_norm=5.0,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=2,
        eps_clip=0.13,
        min_steps=4 * env_sim.max_steps,
        num_epoch=3,
        batch_size=128,
        std_init=0.75,
        lr=3e-04,
        max_grad_norm=1.0,
        num_workers=1,
    )
    subrtn_policy = PPO(ex_dir, env_sim, behav_policy, critic, **subrtn_policy_hparam)

    prior = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.095, std=0.095 / 10),
        NormalDomainParam(name="mass_pend_pole", mean=0.024, std=0.024 / 10),
        NormalDomainParam(name="length_rot_pole", mean=0.085, std=0.085 / 10),
        NormalDomainParam(name="length_pend_pole", mean=0.129, std=0.129 / 10),
    )
    ddp_policy_hparam = dict(mapping=dp_map, trafo_mask=trafo_mask, scale_params=True)
    ddp_policy = DomainDistrParamPolicy(prior=prior, **ddp_policy_hparam)
    subsubrtn_distr_hparam = dict(
        max_iter=2,
        pop_size=10,
        num_init_states_per_domain=1,
        num_is_samples=8,
        expl_std_init=1e-2,
        expl_std_min=1e-5,
        extra_expl_std_init=1e-2,
        extra_expl_decay_iter=5,
        num_workers=1,
    )
    subsubrtn_distr = CEM(ex_dir, env_sim, ddp_policy, **subsubrtn_distr_hparam)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, 10, 10],
        num_rollouts_per_distr=3,
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behavior_policy=behav_policy, **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=1,
        num_eval_rollouts=5,
        warmstart=True,
    )
    algo = SimOpt(ex_dir, env_sim, env_real, subrtn_policy, subrtn_distr, **algo_hparam)
    algo.train()

    assert algo.curr_iter == algo.max_iter

from pyrado.policies.linear import LinearPolicy
from pyrado.utils.experiments import wrap_like_other_env


if __name__ == '__main__':
    # Experiment (set seed before creating the modules)
    # ex_dir = setup_experiment(QQubeSim.name, f'{BayRn.name}_{PoWER.name}-sim2sim', '100Hz_lin_dr-Mp+', seed=111)
    ex_dir = setup_experiment(
        QQubeSim.name,
        f'{BayRn.name}_{PoWER.name}-sim2sim',
        f'{QQubeSwingUpAndBalanceCtrl.name}_100Hz_dr-Mp+Mr+',
        seed=1111,
    )

    # Environments
    env_hparams = dict(dt=1 / 100., max_steps=600)
    env_sim = QQubeSim(**env_hparams)
    env_sim = DomainRandWrapperLive(env_sim, get_zero_var_randomizer(env_sim))
    dp_map = get_default_domain_param_map_qq()
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    env_real = QQubeSim(**env_hparams)
    env_real.domain_param = dict(Mp=0.026, Mr=0.097)
    # env_real = QQubeReal(**env_hparams)
    env_real = wrap_like_other_env(env_real, env_sim)

    # Policy
    # policy_hparam = dict(
    #     feats=FeatureStack([identity_feat, sign_feat, abs_feat, squared_feat, qubic_feat,
    #                         MultFeat([2, 5]), MultFeat([3, 5]), MultFeat([4, 5])])
    # )
    # policy = LinearPolicy(spec=env_sim.spec, **policy_hparam)
    policy_hparam = dict(energy_gain=0.587, ref_energy=0.827, acc_max=10.)

def test_basic_meta(ex_dir, policy, env: SimEnv, algo, algo_hparam: dict):
    pyrado.set_seed(0)

    # Environment and wrappers
    env = GaussianObsNoiseWrapper(
        env,
        noise_std=[
            1 / 180 * np.pi,
            1 / 180 * np.pi,
            0.0025,
            0.0025,
            2 / 180 * np.pi,
            2 / 180 * np.pi,
            0.05,
            0.05,
        ],
    )
    env = ActNormWrapper(env)
    env = ActDelayWrapper(env)
    randomizer = create_default_randomizer_qbb()
    randomizer.add_domain_params(UniformDomainParam(name="act_delay", mean=15, halfspan=15, clip_lo=0, roundint=True))
    env = DomainRandWrapperLive(env, randomizer)

    # Policy
    policy_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh)  # FNN
    policy = FNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    vfcn_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh)  # FNN
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9995,
        lamda=0.98,
        num_epoch=2,
        batch_size=64,
        lr=5e-4,
        standardize_adv=False,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Subroutine
    subrtn_hparam = dict(
        max_iter=3,
        min_rollouts=5,
        num_epoch=2,
        eps_clip=0.1,
        batch_size=64,
        std_init=0.8,
        lr=2e-4,
        num_workers=1,
    )
    subrtn = PPO(ex_dir, env, policy, critic, **subrtn_hparam)
    algo = algo(env, subrtn, **algo_hparam)
    algo.train()

    assert algo.curr_iter == algo.max_iter