def test_combination_downsampling_delay():
    """
    Stack a downsampling wrapper (factor 2) below an action delay wrapper (delay 3) and check, step by step,
    which action arrives at the innermost mock environment.
    """
    inner = MockEnv(act_space=BoxSpace(-1, 1, shape=(2,)), obs_space=BoxSpace(-1, 1, shape=(2,)))
    wenv_ds_dl = ActDelayWrapper(DownsamplingWrapper(inner, factor=2), delay=3)

    # Reset to initialize the buffer
    wenv_ds_dl.reset()

    # Pairs of (commanded action, action expected at the mock environment after that step)
    schedule = [
        # The first ones are 0 because the ActDelayWrapper's queue is initialized with 0
        ([0, 1], [0, 0]),
        ([0, 2], [0, 0]),
        ([0, 3], [0, 0]),
        # Intuitively one would expect [0, 1] here, but this is the effect of the wrappers' combination
        ([0, 4], [0, 0]),
        ([0, 5], [0, 2]),
        ([0, 6], [0, 2]),
        ([0, 7], [0, 4]),
        ([0, 8], [0, 4]),
        ([0, 9], [0, 6]),
        ([1, 0], [0, 6]),
    ]
    for commanded, expected in schedule:
        wenv_ds_dl.step(np.array(commanded))
        assert inner.last_act == expected
def test_no_delay():
    """With a delay of zero, every commanded action must arrive at the mock environment in the same step."""
    inner = MockEnv(act_space=BoxSpace(-1, 1, shape=(2,)))
    wenv = ActDelayWrapper(inner, delay=0)

    # Reset to initialize the buffer
    wenv.reset()

    # Each action is expected to be passed through unchanged
    for commanded in ([4, 1], [7, 5]):
        wenv.step(np.array(commanded))
        assert inner.last_act == commanded
def test_combination():
    """
    Chain several environment wrappers around the Quanser cart-pole swing-up simulation and check that each one
    has the expected effect on the domain parameters, the spaces, and the recorded rollouts.
    """
    env = QCartPoleSwingUpSim(dt=1/50., max_steps=20)

    def run(wrapped_env):
        # Every rollout in this test uses the same dummy policy type, seed, and render settings
        return rollout(wrapped_env, DummyPolicy(wrapped_env.spec), eval=True, seed=0, render_mode=RenderMode())

    # Domain randomization with a buffer of three domains
    env_r = DomainRandWrapperBuffer(env, create_default_randomizer(env))
    env_r.fill_buffer(num_domains=3)

    params_after = []
    for _ in range(4):
        params_before = env_r.domain_param
        run(env_r)
        params_now = env_r.domain_param
        # A rollout must leave the environment with different domain parameters
        assert params_now != params_before
        params_after.append(params_now)
    # The fourth rollout is expected to end up with the same parameters as the first one
    assert params_after[0] == params_after[3]

    # Action and observation normalization (with some explicitly given observation bounds)
    explicit_lower = {'x_dot': -213., 'theta_dot': -42.}
    explicit_upper = {'x_dot': 213., 'theta_dot': 42., 'x': 0.123}
    env_rn = ObsNormWrapper(ActNormWrapper(env), explicit_lb=explicit_lower, explicit_ub=explicit_upper)
    act_lo, act_up = env_rn.act_space.bounds
    assert all(act_lo == -1)
    assert all(act_up == 1)
    obs_lo, obs_up = env_rn.obs_space.bounds
    assert all(obs_lo == -1)
    assert all(obs_up == 1)

    # Normalizing the observations of the randomized rollout must reproduce the normalized rollout
    ro_r = run(env_r)
    ro_rn = run(env_rn)
    assert np.allclose(env_rn._process_obs(ro_r.observations), ro_rn.observations)

    # Partial observations serve as the reference for the two wrappers below
    env_rnp = ObsPartialWrapper(env_rn, idcs=['x_dot', 'cos_theta'])
    ro_rnp = run(env_rnp)

    # Action noise: the recorded actions stay the same, the observations do not
    act_shape = env_rnp.act_space.shape
    env_rnpa = GaussianActNoiseWrapper(
        env_rnp, noise_mean=0.5*np.ones(act_shape), noise_std=0.1*np.ones(act_shape)
    )
    ro_rnpa = run(env_rnpa)
    assert np.allclose(ro_rnp.actions, ro_rnpa.actions)
    assert not np.allclose(ro_rnp.observations, ro_rnpa.observations)

    # Action delay: the recorded actions stay the same, the observations do not
    env_rnpd = ActDelayWrapper(env_rnp, delay=3)
    ro_rnpd = run(env_rnpd)
    assert np.allclose(ro_rnp.actions, ro_rnpd.actions)
    assert not np.allclose(ro_rnp.observations, ro_rnpd.observations)

    # Inspect the wrapper chain
    assert isinstance(inner_env(env_rnpd), QCartPoleSwingUpSim)
    assert typed_env(env_rnpd, ObsPartialWrapper) is not None
    assert isinstance(env_rnpd, ActDelayWrapper)

    # After removing the delay wrapper, the outermost environment must no longer be one
    env_rnpdr = remove_env(env_rnpd, ActDelayWrapper)
    assert not isinstance(env_rnpdr, ActDelayWrapper)
from pyrado.policies.features import FeatureStack, identity_feat from pyrado.policies.linear import LinearPolicy from pyrado.sampling.sequences import * if __name__ == '__main__': # Experiment (set seed before creating the modules) ex_dir = setup_experiment(QBallBalancerSim.name, f'{SPOTA.name}-{HCNormal.name}', f'{LinearPolicy.name}_obsnoise-s_actedlay-10', seed=1001) # Environment and domain randomization env_hparams = dict(dt=1/100., max_steps=500) env = QBallBalancerSim(**env_hparams) env = GaussianObsNoiseWrapper(env, noise_std=[1/180*pi, 1/180*pi, 0.005, 0.005, # [rad, rad, m, m, ... 10/180*pi, 10/180*pi, 0.05, 0.05]) # ... rad/s, rad/s, m/s, m/s] # env = ObsPartialWrapper(env, mask=[0, 0, 0, 0, 1, 1, 0, 0]) env = ActDelayWrapper(env) randomizer = get_default_randomizer(env) randomizer.add_domain_params(UniformDomainParam(name='act_delay', mean=5, halfspan=5, clip_lo=0, roundint=True)) env = DomainRandWrapperBuffer(env, randomizer) # Policy policy_hparam = dict(feats=FeatureStack([identity_feat])) policy = LinearPolicy(spec=env.spec, **policy_hparam) # Initialize with Quanser's PD gains init_policy_param_values = to.tensor([[-14., 0, -14*3.45, 0, 0, 0, -14*2.11, 0], [0, -14., 0, -14*3.45, 0, 0, 0, -14*2.11]]) # Algorithm subrtn_hparam_cand = dict( max_iter=100,
def test_domain_param():
    """Change the action delay via the `act_delay` domain parameter and check the delay before and after."""
    inner = MockEnv(act_space=BoxSpace(-1, 1, shape=(2,)))
    wenv = ActDelayWrapper(inner, delay=1)

    def check(schedule):
        # Feed the commanded actions one by one and compare what arrived at the mock environment
        for commanded, expected in schedule:
            wenv.step(np.array(commanded))
            assert inner.last_act == expected

    # Reset to initialize the buffer
    wenv.reset()

    # With a delay of one step, the mock environment lags one action behind
    check([
        ([0, 1], [0, 0]),
        ([2, 4], [0, 1]),
    ])

    # Change the delay and reset
    wenv.domain_param = {'act_delay': 2}
    wenv.reset()

    # Now the mock environment lags two actions behind
    check([
        ([1, 2], [0, 0]),
        ([2, 3], [0, 0]),
        ([8, 9], [1, 2]),
    ])
def test_reset():
    """Check that resetting the delay wrapper discards the actions which are still waiting to be applied."""
    inner = MockEnv(act_space=BoxSpace(-1, 1, shape=(2,)))
    wenv = ActDelayWrapper(inner, delay=1)

    def check(schedule):
        # Feed the commanded actions one by one and compare what arrived at the mock environment
        for commanded, expected in schedule:
            wenv.step(np.array(commanded))
            assert inner.last_act == expected

    # Reset to initialize the buffer
    wenv.reset()

    # Perform some actions
    check([
        ([0, 4], [0, 0]),
        ([4, 4], [0, 4]),
    ])

    # The next action would be [4, 4], but now we reset again
    wenv.reset()

    check([
        ([1, 2], [0, 0]),
        ([2, 3], [1, 2]),
    ])
if __name__ == "__main__": # Parse command line arguments args = get_argparser().parse_args() # Experiment (set seed before creating the modules) ex_dir = setup_experiment( QQubeSwingUpSim.name, f"{NPDR.name}_{QQubeSwingUpAndBalanceCtrl.name}", "sim2sim") # Set seed if desired pyrado.set_seed(args.seed, verbose=True) # Environments env_sim_hparams = dict(dt=1 / 250.0, max_steps=1500) env_sim = QQubeSwingUpSim(**env_sim_hparams) env_sim = ActDelayWrapper(env_sim) # Create a fake ground truth target domain num_real_rollouts = 2 env_real = deepcopy(env_sim) dp_nom = env_sim.get_nominal_domain_param() env_real.domain_param = dict( damping_rot_pole=dp_nom["damping_rot_pole"] * 1.9, damping_pend_pole=dp_nom["damping_pend_pole"] * 0.4, motor_resistance=dp_nom["motor_resistance"] * 1.0, motor_back_emf=dp_nom["motor_back_emf"] * 1.0, mass_pend_pole=dp_nom["mass_pend_pole"] * 1.1, mass_rot_pole=dp_nom["mass_rot_pole"] * 1.2, length_pend_pole=dp_nom["length_pend_pole"] * 0.8, length_rot_pole=dp_nom["length_rot_pole"] * 0.9, gravity_const=dp_nom["gravity_const"] * 1.0,
def test_basic_meta(ex_dir, policy, env: SimEnv, algo, algo_hparam: dict):
    """
    Train a meta-algorithm, which uses PPO as subroutine, for a few iterations on a noisy, delayed, and randomized
    environment, and check that it ran until its maximum number of iterations.

    .. note::
        The `policy` argument is not used; a new FNN policy is created for the wrapped environment.
    """
    pyrado.set_seed(0)

    # Environment: observation noise, action normalization, and an action delay
    obs_noise_std = [
        1 / 180 * np.pi,
        1 / 180 * np.pi,
        0.0025,
        0.0025,
        2 / 180 * np.pi,
        2 / 180 * np.pi,
        0.05,
        0.05,
    ]
    env = ActDelayWrapper(ActNormWrapper(GaussianObsNoiseWrapper(env, noise_std=obs_noise_std)))

    # Domain randomization, additionally varying the action delay
    randomizer = create_default_randomizer_qbb()
    delay_param = UniformDomainParam(name="act_delay", mean=15, halfspan=15, clip_lo=0, roundint=True)
    randomizer.add_domain_params(delay_param)
    env = DomainRandWrapperLive(env, randomizer)

    # Policy (FNN)
    fnn_policy = FNNPolicy(spec=env.spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)

    # Critic with an FNN value function
    vfcn = FNNPolicy(
        spec=EnvSpec(env.obs_space, ValueFunctionSpace), hidden_sizes=[16, 16], hidden_nonlin=to.tanh
    )
    critic_hparam = {
        "gamma": 0.9995,
        "lamda": 0.98,
        "num_epoch": 2,
        "batch_size": 64,
        "lr": 5e-4,
        "standardize_adv": False,
    }
    critic = GAE(vfcn, **critic_hparam)

    # Subroutine
    subrtn_hparam = {
        "max_iter": 3,
        "min_rollouts": 5,
        "num_epoch": 2,
        "eps_clip": 0.1,
        "batch_size": 64,
        "std_init": 0.8,
        "lr": 2e-4,
        "num_workers": 1,
    }
    subrtn = PPO(ex_dir, env, fnn_policy, critic, **subrtn_hparam)

    # Meta-algorithm under test
    meta_algo = algo(env, subrtn, **algo_hparam)
    meta_algo.train()

    assert meta_algo.curr_iter == meta_algo.max_iter
def test_combination_delay_downsampling():
    """
    Stack an action delay wrapper (delay 3) below a downsampling wrapper (factor 2) and check, step by step,
    which action arrives at the innermost mock environment. After delay number of actions, the actions are
    downsampled by the factor.
    """
    inner = MockEnv(act_space=BoxSpace(-1, 1, shape=(2,)), obs_space=BoxSpace(-1, 1, shape=(2,)))
    wenv_dl_ds = DownsamplingWrapper(ActDelayWrapper(inner, delay=3), factor=2)

    # Reset to initialize the buffer
    wenv_dl_ds.reset()

    # Pairs of (commanded action, action expected at the mock environment after that step)
    schedule = [
        # The first ones are 0 because the ActDelayWrapper's queue is initialized with 0
        ([0, 1], [0, 0]),
        ([0, 2], [0, 0]),
        ([0, 3], [0, 0]),
        # One time step earlier than with the other order of wrappers
        ([0, 4], [0, 1]),
        ([0, 5], [0, 1]),
        ([0, 6], [0, 3]),
        ([0, 7], [0, 3]),
        ([0, 8], [0, 5]),
        ([0, 9], [0, 5]),
        ([1, 0], [0, 7]),
        ([1, 1], [0, 7]),
    ]
    for commanded, expected in schedule:
        wenv_dl_ds.step(np.array(commanded))
        assert inner.last_act == expected
def evaluate_policy(args, ex_dir):
    """
    Helper function to evaluate the policy from an experiment in the associated environment.

    The policy is rolled out on a grid of domain parameters, which depends on the type of the innermost
    environment. Summary metrics are printed and stored together with the per-rollout results in a time-stamped
    subfolder of `<ex_dir>/eval_domain_grid`.

    :param args: parsed command line arguments, forwarded to `load_experiment`; the attributes
                 `num_rollouts_per_config`, `num_workers`, and `seed` are read here
    :param ex_dir: experiment directory to load the environment and the policy from
    :raises NotImplementedError: if the innermost environment is none of the supported simulation environments
    """
    env, policy, _ = load_experiment(ex_dir, args)

    # Create multi-dim evaluation grid
    param_spec = dict()
    param_spec_dim = None  # only set if the grid dimension differs from the number of entries in param_spec

    if isinstance(inner_env(env), BallOnPlateSim):
        param_spec["ball_radius"] = np.linspace(0.02, 0.08, num=2, endpoint=True)
        param_spec["ball_rolling_friction_coefficient"] = np.linspace(0.0295, 0.9, num=2, endpoint=True)

    elif isinstance(inner_env(env), QQubeSwingUpSim):
        eval_num = 200
        # Use nominal values for all other parameters.
        for param, nominal_value in env.get_nominal_domain_param().items():
            param_spec[param] = nominal_value
        # param_spec["gravity_const"] = np.linspace(5.0, 15.0, num=eval_num, endpoint=True)
        param_spec["damping_pend_pole"] = np.linspace(0.0, 0.0001, num=eval_num, endpoint=True)
        param_spec["damping_rot_pole"] = np.linspace(0.0, 0.0006, num=eval_num, endpoint=True)
        # Only the two damping parameters are varied; the nominal entries do not count as grid dimensions
        param_spec_dim = 2

    elif isinstance(inner_env(env), QBallBalancerSim):
        # param_spec["gravity_const"] = np.linspace(7.91, 11.91, num=11, endpoint=True)
        # param_spec["ball_mass"] = np.linspace(0.003, 0.3, num=11, endpoint=True)
        # param_spec["ball_radius"] = np.linspace(0.01, 0.1, num=11, endpoint=True)
        param_spec["plate_length"] = np.linspace(0.275, 0.275, num=11, endpoint=True)
        param_spec["arm_radius"] = np.linspace(0.0254, 0.0254, num=11, endpoint=True)
        # param_spec["load_inertia"] = np.linspace(5.2822e-5*0.5, 5.2822e-5*1.5, num=11, endpoint=True)
        # param_spec["motor_inertia"] = np.linspace(4.6063e-7*0.5, 4.6063e-7*1.5, num=11, endpoint=True)
        # param_spec["gear_ratio"] = np.linspace(60, 80, num=11, endpoint=True)
        # param_spec["gear_efficiency"] = np.linspace(0.6, 1.0, num=11, endpoint=True)
        # param_spec["motor_efficiency"] = np.linspace(0.49, 0.89, num=11, endpoint=True)
        # param_spec["motor_back_emf"] = np.linspace(0.006, 0.066, num=11, endpoint=True)
        # param_spec["motor_resistance"] = np.linspace(2.6*0.5, 2.6*1.5, num=11, endpoint=True)
        # param_spec["combined_damping"] = np.linspace(0.0, 0.05, num=11, endpoint=True)
        # param_spec["friction_coeff"] = np.linspace(0, 0.015, num=11, endpoint=True)
        # param_spec["voltage_thold_x_pos"] = np.linspace(0.0, 1.0, num=11, endpoint=True)
        # param_spec["voltage_thold_x_neg"] = np.linspace(-1., 0.0, num=11, endpoint=True)
        # param_spec["voltage_thold_y_pos"] = np.linspace(0.0, 1.0, num=11, endpoint=True)
        # param_spec["voltage_thold_y_neg"] = np.linspace(-1.0, 0, num=11, endpoint=True)
        # param_spec["offset_th_x"] = np.linspace(-5/180*np.pi, 5/180*np.pi, num=11, endpoint=True)
        # param_spec["offset_th_y"] = np.linspace(-5/180*np.pi, 5/180*np.pi, num=11, endpoint=True)

    else:
        raise NotImplementedError

    # Always add an action delay wrapper (with 0 delay by default)
    if typed_env(env, ActDelayWrapper) is None:
        env = ActDelayWrapper(env)
    # param_spec['act_delay'] = np.linspace(0, 30, num=11, endpoint=True, dtype=int)

    # The names of all parameters in the grid become part of the name of the results folder
    add_info = "-".join(param_spec)

    # Create multidimensional results grid and ensure right number of rollouts
    param_list = param_grid(param_spec)
    param_list *= args.num_rollouts_per_config

    # Fix initial state (set to None if it should not be fixed)
    init_state = np.array([0.0, 0.0, 0.0, 0.0])

    # Create sampler
    pool = SamplerPool(args.num_workers)
    if args.seed is not None:
        pool.set_seed(args.seed)
        print_cbt(f"Set the random number generators' seed to {args.seed}.", "w")
    else:
        print_cbt("No seed was set", "y")

    # Sample rollouts
    ros = eval_domain_params(pool, env, policy, param_list, init_state)

    # Compute metrics
    lod = []
    for ro in ros:
        d = dict(**ro.rollout_info["domain_param"], ret=ro.undiscounted_return(), len=ro.length)
        # Simply remove the observation noise from the domain parameters. The keys are dropped independently of
        # each other, such that a missing mean does not prevent the removal of the standard deviation.
        d.pop("obs_noise_mean", None)
        d.pop("obs_noise_std", None)
        lod.append(d)

    df = pd.DataFrame(lod)
    metrics = dict(
        avg_len=df["len"].mean(),
        avg_ret=df["ret"].mean(),
        median_ret=df["ret"].median(),
        min_ret=df["ret"].min(),
        max_ret=df["ret"].max(),
        std_ret=df["ret"].std(),
    )
    pprint(metrics, indent=4)

    # Create subfolder and save
    timestamp = datetime.datetime.now()
    add_info = timestamp.strftime(pyrado.timestamp_format) + "--" + add_info
    save_dir = osp.join(ex_dir, "eval_domain_grid", add_info)
    os.makedirs(save_dir, exist_ok=True)

    save_dicts_to_yaml(
        {"ex_dir": str(ex_dir)},
        {"varied_params": list(param_spec)},
        {"num_rpp": args.num_rollouts_per_config, "seed": args.seed},
        {"metrics": dict_arraylike_to_float(metrics)},
        save_dir=save_dir,
        file_name="summary",
    )
    # The file name carries the grid dimension, i.e. the explicit override if given, else the number of parameters
    pyrado.save(df, f"df_sp_grid_{len(param_spec) if param_spec_dim is None else param_spec_dim}d.pkl", save_dir)
def test_combination(env: SimEnv):
    """
    Chain several environment wrappers around the given simulation environment and check that each one has the
    expected effect on the domain parameters, the spaces, and the recorded rollouts.
    """
    pyrado.set_seed(0)
    env.max_steps = 20

    def run(wrapped_env):
        # Every rollout in this test uses the same dummy policy type, seed, and render settings
        return rollout(wrapped_env, DummyPolicy(wrapped_env.spec), eval=True, seed=0, render_mode=RenderMode())

    # Domain randomization with a buffer of three domains
    env_r = DomainRandWrapperBuffer(env, create_default_randomizer(env))
    env_r.fill_buffer(num_domains=3)

    params_after = []
    for _ in range(4):
        params_before = env_r.domain_param
        run(env_r)
        params_now = env_r.domain_param
        # A rollout must leave the environment with different domain parameters
        assert params_now != params_before
        params_after.append(params_now)
    # The fourth rollout is expected to end up with the same parameters as the first one
    assert params_after[0] == params_after[3]

    # Action and observation normalization (with some explicitly given observation bounds)
    explicit_lower = {"x_dot": -213.0, "theta_dot": -42.0}
    explicit_upper = {"x_dot": 213.0, "theta_dot": 42.0, "x": 0.123}
    env_rn = ObsNormWrapper(ActNormWrapper(env), explicit_lb=explicit_lower, explicit_ub=explicit_upper)
    act_lo, act_up = env_rn.act_space.bounds
    assert all(act_lo == -1)
    assert all(act_up == 1)
    obs_lo, obs_up = env_rn.obs_space.bounds
    assert all(obs_lo == -1)
    assert all(obs_up == 1)

    # Normalizing the observations of the randomized rollout must reproduce the normalized rollout
    ro_r = run(env_r)
    ro_rn = run(env_rn)
    assert np.allclose(env_rn._process_obs(ro_r.observations), ro_rn.observations)

    # Partial observations, selected via the third and fourth observation label of the unwrapped environment
    env_rnp = ObsPartialWrapper(env_rn, idcs=[env.obs_space.labels[2], env.obs_space.labels[3]])
    ro_rnp = run(env_rnp)

    # Action noise: the noise on the actions must change the observations of the rollout
    act_shape = env_rnp.act_space.shape
    env_rnpa = GaussianActNoiseWrapper(
        env_rnp, noise_mean=0.5 * np.ones(act_shape), noise_std=0.1 * np.ones(act_shape)
    )
    ro_rnpa = run(env_rnpa)
    assert not np.allclose(ro_rnp.observations, ro_rnpa.observations)

    # Action delay: the recorded actions stay the same, the observations do not
    env_rnpd = ActDelayWrapper(env_rnp, delay=3)
    ro_rnpd = run(env_rnpd)
    assert np.allclose(ro_rnp.actions, ro_rnpd.actions)
    assert not np.allclose(ro_rnp.observations, ro_rnpd.observations)

    # Inspect the wrapper chain; the innermost environment must be of exactly the same type as the given one
    assert type(inner_env(env_rnpd)) == type(env)
    assert typed_env(env_rnpd, ObsPartialWrapper) is not None
    assert isinstance(env_rnpd, ActDelayWrapper)

    # After removing the delay wrapper, the outermost environment must no longer be one
    env_rnpdr = remove_env(env_rnpd, ActDelayWrapper)
    assert not isinstance(env_rnpdr, ActDelayWrapper)