def test_sprl(ex_dir, env: SimEnv, optimize_mean: bool):
    pyrado.set_seed(0)

    env = ActNormWrapper(env)
    env_sprl_params = [
        dict(
            name="gravity_const",
            target_mean=to.tensor([9.81]),
            target_cov_chol_flat=to.tensor([1.0]),
            init_mean=to.tensor([9.81]),
            init_cov_chol_flat=to.tensor([0.05]),
        )
    ]
    randomizer = DomainRandomizer(*[SelfPacedDomainParam(**p) for p in env_sprl_params])
    env = DomainRandWrapperLive(env, randomizer=randomizer)

    policy = FNNPolicy(env.spec, hidden_sizes=[64, 64], hidden_nonlin=to.tanh)

    vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.relu)
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9844534412010116,
        lamda=0.9710614403461155,
        num_epoch=10,
        batch_size=150,
        standardize_adv=False,
        lr=0.00016985313083236645,
    )
    critic = GAE(vfcn, **critic_hparam)

    subrtn_hparam = dict(
        max_iter=1,
        eps_clip=0.12648736789309026,
        min_steps=10 * env.max_steps,
        num_epoch=3,
        batch_size=150,
        std_init=0.7573286998997557,
        lr=6.999956625305722e-04,
        max_grad_norm=1.0,
        num_workers=1,
    )

    algo_hparam = dict(
        kl_constraints_ub=8000,
        performance_lower_bound=500,
        std_lower_bound=0.4,
        kl_threshold=200,
        max_iter=1,
        optimize_mean=optimize_mean,
    )

    algo = SPRL(env, PPO(ex_dir, env, policy, critic, **subrtn_hparam), **algo_hparam)
    algo.train(snapshot_mode="latest")

    assert algo.curr_iter == algo.max_iter
def test_spota_ppo(ex_dir, env: SimEnv, spota_hparam: dict):
    pyrado.set_seed(0)

    # Environment and domain randomization
    randomizer = create_default_randomizer(env)
    env = DomainRandWrapperBuffer(env, randomizer)

    # Policy and subroutines
    policy = FNNPolicy(env.spec, [16, 16], hidden_nonlin=to.tanh)
    vfcn = FNN(input_size=env.obs_space.flat_dim, output_size=1, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
    critic_hparam = dict(gamma=0.998, lamda=0.95, num_epoch=3, batch_size=64, lr=1e-3)
    critic_cand = GAE(vfcn, **critic_hparam)
    critic_refs = GAE(deepcopy(vfcn), **critic_hparam)

    subrtn_hparam_common = dict(
        # min_rollouts=0,  # will be overwritten by SPOTA
        min_steps=0,  # will be overwritten by SPOTA
        max_iter=2,
        num_epoch=3,
        eps_clip=0.1,
        batch_size=64,
        num_workers=1,
        std_init=0.5,
        lr=1e-2,
    )

    sr_cand = PPO(ex_dir, env, policy, critic_cand, **subrtn_hparam_common)
    sr_refs = PPO(ex_dir, env, deepcopy(policy), critic_refs, **subrtn_hparam_common)

    # Create algorithm and train
    algo = SPOTA(ex_dir, env, sr_cand, sr_refs, **spota_hparam)
    algo.train()

    assert algo.curr_iter == algo.max_iter or algo.stopping_criterion_met()
def test_actor_critic(ex_dir, env: SimEnv, policy: Policy, algo, algo_hparam, vfcn_type, use_cuda):
    pyrado.set_seed(0)

    if use_cuda:
        policy._device = "cuda"
        policy = policy.to(device="cuda")

    # Create value function
    if vfcn_type == "fnn-plain":
        vfcn = FNN(
            input_size=env.obs_space.flat_dim,
            output_size=1,
            hidden_sizes=[16, 16],
            hidden_nonlin=to.tanh,
            use_cuda=use_cuda,
        )
    elif vfcn_type == FNNPolicy.name:
        vf_spec = EnvSpec(env.obs_space, ValueFunctionSpace)
        vfcn = FNNPolicy(vf_spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh, use_cuda=use_cuda)
    elif vfcn_type == RNNPolicy.name:
        vf_spec = EnvSpec(env.obs_space, ValueFunctionSpace)
        vfcn = RNNPolicy(vf_spec, hidden_size=16, num_recurrent_layers=1, use_cuda=use_cuda)
    else:
        raise NotImplementedError

    # Create critic
    critic_hparam = dict(
        gamma=0.98,
        lamda=0.95,
        batch_size=32,
        lr=1e-3,
        standardize_adv=False,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Common hyper-parameters
    common_hparam = dict(max_iter=2, min_rollouts=3, num_workers=1)
    # Add specific hyper-parameters if any
    common_hparam.update(algo_hparam)

    # Create algorithm and train
    algo = algo(ex_dir, env, policy, critic, **common_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
def test_pddr(ex_dir, env: SimEnv, policy, algo_hparam):
    pyrado.set_seed(0)

    # Create algorithm and train
    teacher_policy = deepcopy(policy)
    critic = GAE(
        vfcn=FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
    )
    teacher_algo_hparam = dict(critic=critic, min_steps=1500, max_iter=2)
    teacher_algo = PPO

    # Wrapper
    randomizer = create_default_randomizer(env)
    env = DomainRandWrapperLive(env, randomizer)

    # Subroutine
    algo_hparam = dict(
        max_iter=2,
        min_steps=env.max_steps,
        std_init=0.15,
        num_epochs=10,
        num_teachers=2,
        teacher_policy=teacher_policy,
        teacher_algo=teacher_algo,
        teacher_algo_hparam=teacher_algo_hparam,
        num_workers=1,
    )

    algo = PDDR(ex_dir, env, policy, **algo_hparam)
    algo.train()

    assert algo.curr_iter == algo.max_iter

    # Save and load
    algo.save_snapshot(meta_info=None)
    algo_loaded = Algorithm.load_snapshot(load_dir=ex_dir)
    assert isinstance(algo_loaded, Algorithm)
    policy_loaded = algo_loaded.policy

    # Check
    assert all(algo.policy.param_values == policy_loaded.param_values)

    # Load the experiment. Since we did not save any hyper-parameters, we ignore the errors when loading.
    env, policy, extra = load_experiment(ex_dir)
    assert isinstance(env, Env)
    assert isinstance(policy, Policy)
    assert isinstance(extra, dict)
def test_arpl(ex_dir, env: SimEnv):
    pyrado.set_seed(0)

    env = ActNormWrapper(env)
    env = StateAugmentationWrapper(env, domain_param=None)

    policy = FNNPolicy(env.spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)

    vfcn_hparam = dict(hidden_sizes=[32, 32], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9844534412010116,
        lamda=0.9710614403461155,
        num_epoch=10,
        batch_size=150,
        standardize_adv=False,
        lr=0.00016985313083236645,
    )
    critic = GAE(vfcn, **critic_hparam)

    algo_hparam = dict(
        max_iter=2,
        min_steps=23 * env.max_steps,
        min_rollouts=None,
        num_epoch=5,
        eps_clip=0.085,
        batch_size=150,
        std_init=0.995,
        lr=2e-4,
        num_workers=1,
    )
    arpl_hparam = dict(
        max_iter=2,
        steps_num=23 * env.max_steps,
        halfspan=0.05,
        dyn_eps=0.07,
        dyn_phi=0.25,
        obs_phi=0.1,
        obs_eps=0.05,
        proc_phi=0.1,
        proc_eps=0.03,
        torch_observation=True,
    )
    ppo = PPO(ex_dir, env, policy, critic, **algo_hparam)
    algo = ARPL(ex_dir, env, ppo, policy, ppo.expl_strat, **arpl_hparam)

    algo.train(snapshot_mode="best")
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        policy: Policy,
        critic: GAE,
        max_iter: int,
        min_rollouts: int = None,
        min_steps: int = None,
        num_epoch: int = 3,
        eps_clip: float = 0.1,
        vfcn_coeff: float = 0.5,
        entropy_coeff: float = 1e-3,
        batch_size: int = 32,
        std_init: float = 1.0,
        num_workers: int = 4,
        max_grad_norm: Optional[float] = None,
        lr: float = 5e-4,
        lr_scheduler=None,
        lr_scheduler_hparam: [dict, None] = None,
        logger: StepLogger = None,
    ):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_epoch: number of iterations over all gathered samples during one policy update
        :param eps_clip: max/min probability ratio, see [1]
        :param vfcn_coeff: weighting factor of the value function term in the combined loss, specific to PPO2
        :param entropy_coeff: weighting factor of the entropy term in the combined loss, specific to PPO2
        :param batch_size: number of samples per policy update batch
        :param std_init: initial standard deviation on the actions for the exploration noise
        :param num_workers: number of environments for parallel sampling
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr: (initial) learning rate for the optimizer which can be modified by the scheduler.
                   By default, the learning rate is constant.
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created

        .. note::
            The Adam optimizer computes individual learning rates for all parameters. Thus, the learning rate
            scheduler schedules the maximum learning rate.
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)

        # Call ActorCritic's constructor
        super().__init__(env, policy, critic, save_dir, max_iter, logger)

        critic.standardize_adv = True  # enforce this for PPO2

        # Store the inputs
        self.num_epoch = num_epoch
        self.eps_clip = eps_clip
        self.vfcn_coeff = vfcn_coeff
        self.entropy_coeff = entropy_coeff
        self.batch_size = batch_size
        self.max_grad_norm = max_grad_norm

        # Initialize
        self.log_loss = True
        self._expl_strat = NormalActNoiseExplStrat(self._policy, std_init=std_init)
        self._sampler = ParallelRolloutSampler(
            env, self._expl_strat, num_workers=num_workers, min_steps=min_steps, min_rollouts=min_rollouts
        )
        self.optim = to.optim.Adam(
            [
                {"params": self._expl_strat.policy.parameters()},
                {"params": self._expl_strat.noise.parameters()},
                {"params": self._critic.vfcn.parameters()},
            ],
            lr=lr,
            eps=1e-5,
        )
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None:
            self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)
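# A minimal usage sketch (not from the original source) for the constructor above, i.e. the PPO variant with the
# combined clipped-ratio / value-function / entropy loss (PPO2). The environment, network sizes, save directory, and
# module paths are illustrative assumptions following pyrado's layout; only the constructor signature is taken from
# the code above.
import torch as to

import pyrado
from pyrado.algorithms.step_based.gae import GAE
from pyrado.algorithms.step_based.ppo import PPO2
from pyrado.environments.pysim.quanser_qube import QQubeSwingUpSim
from pyrado.policies.feed_back.fnn import FNNPolicy
from pyrado.spaces import ValueFunctionSpace
from pyrado.utils.data_types import EnvSpec

ex_dir = "/tmp/ppo2_demo"  # assumed save directory
env = QQubeSwingUpSim(dt=1 / 100.0, max_steps=600)
policy = FNNPolicy(env.spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
critic = GAE(vfcn, gamma=0.99, lamda=0.95, batch_size=64, lr=1e-3)  # advantage standardization is enforced by PPO2
algo = PPO2(ex_dir, env, policy, critic, max_iter=2, min_steps=env.max_steps, num_epoch=3, num_workers=1)
algo.train()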
def test_basic_meta(ex_dir, policy, env: SimEnv, algo, algo_hparam: dict):
    pyrado.set_seed(0)

    # Policy and subroutine
    env = GaussianObsNoiseWrapper(
        env,
        noise_std=[
            1 / 180 * np.pi,
            1 / 180 * np.pi,
            0.0025,
            0.0025,
            2 / 180 * np.pi,
            2 / 180 * np.pi,
            0.05,
            0.05,
        ],
    )
    env = ActNormWrapper(env)
    env = ActDelayWrapper(env)
    randomizer = create_default_randomizer_qbb()
    randomizer.add_domain_params(UniformDomainParam(name="act_delay", mean=15, halfspan=15, clip_lo=0, roundint=True))
    env = DomainRandWrapperLive(env, randomizer)

    # Policy
    policy_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh)  # FNN
    policy = FNNPolicy(spec=env.spec, **policy_hparam)

    # Critic
    vfcn_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh)  # FNN
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9995,
        lamda=0.98,
        num_epoch=2,
        batch_size=64,
        lr=5e-4,
        standardize_adv=False,
    )
    critic = GAE(vfcn, **critic_hparam)

    subrtn_hparam = dict(
        max_iter=3,
        min_rollouts=5,
        num_epoch=2,
        eps_clip=0.1,
        batch_size=64,
        std_init=0.8,
        lr=2e-4,
        num_workers=1,
    )
    subrtn = PPO(ex_dir, env, policy, critic, **subrtn_hparam)

    algo = algo(env, subrtn, **algo_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
def test_simopt_cem_ppo(ex_dir, env: SimEnv):
    pyrado.set_seed(0)

    # Environments
    env_real = deepcopy(env)
    env_real = ActNormWrapper(env_real)
    env_sim = ActNormWrapper(env)
    randomizer = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.0, std=1e6, clip_lo=1e-3),
        NormalDomainParam(name="mass_pend_pole", mean=0.0, std=1e6, clip_lo=1e-3),
        NormalDomainParam(name="length_rot_pole", mean=0.0, std=1e6, clip_lo=1e-3),
        NormalDomainParam(name="length_pend_pole", mean=0.0, std=1e6, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ("mass_rot_pole", "mean"),
        1: ("mass_rot_pole", "std"),
        2: ("mass_pend_pole", "mean"),
        3: ("mass_pend_pole", "std"),
        4: ("length_rot_pole", "mean"),
        5: ("length_rot_pole", "std"),
        6: ("length_pend_pole", "mean"),
        7: ("length_pend_pole", "std"),
    }
    trafo_mask = [True] * 8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[16, 16], hidden_nonlin=to.relu)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.99,
        lamda=0.98,
        num_epoch=2,
        batch_size=128,
        standardize_adv=True,
        lr=8e-4,
        max_grad_norm=5.0,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=2,
        eps_clip=0.13,
        min_steps=4 * env_sim.max_steps,
        num_epoch=3,
        batch_size=128,
        std_init=0.75,
        lr=3e-04,
        max_grad_norm=1.0,
        num_workers=1,
    )
    subrtn_policy = PPO(ex_dir, env_sim, behav_policy, critic, **subrtn_policy_hparam)

    prior = DomainRandomizer(
        NormalDomainParam(name="mass_rot_pole", mean=0.095, std=0.095 / 10),
        NormalDomainParam(name="mass_pend_pole", mean=0.024, std=0.024 / 10),
        NormalDomainParam(name="length_rot_pole", mean=0.085, std=0.085 / 10),
        NormalDomainParam(name="length_pend_pole", mean=0.129, std=0.129 / 10),
    )
    ddp_policy_hparam = dict(mapping=dp_map, trafo_mask=trafo_mask, scale_params=True)
    ddp_policy = DomainDistrParamPolicy(prior=prior, **ddp_policy_hparam)
    subsubrtn_distr_hparam = dict(
        max_iter=2,
        pop_size=10,
        num_init_states_per_domain=1,
        num_is_samples=8,
        expl_std_init=1e-2,
        expl_std_min=1e-5,
        extra_expl_std_init=1e-2,
        extra_expl_decay_iter=5,
        num_workers=1,
    )
    subsubrtn_distr = CEM(ex_dir, env_sim, ddp_policy, **subsubrtn_distr_hparam)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, 10, 10],
        num_rollouts_per_distr=3,
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behavior_policy=behav_policy, **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=1,
        num_eval_rollouts=5,
        warmstart=True,
    )
    algo = SimOpt(ex_dir, env_sim, env_real, subrtn_policy, subrtn_distr, **algo_hparam)
    algo.train()

    assert algo.curr_iter == algo.max_iter
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environments
    env_hparams = dict(dt=1/100., max_steps=600)
    env_real = QQubeSwingUpSim(**env_hparams)
    env_real.domain_param = dict(
        Mr=0.095*0.9,  # 0.095*0.9 = 0.0855
        Mp=0.024*1.1,  # 0.024*1.1 = 0.0264
        Lr=0.085*0.9,  # 0.085*0.9 = 0.0765
        Lp=0.129*1.1,  # 0.129*1.1 = 0.1419
    )

    env_sim = QQubeSwingUpSim(**env_hparams)
    randomizer = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Mp', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lr', mean=0., std=1e6, clip_lo=1e-3),
        NormalDomainParam(name='Lp', mean=0., std=1e6, clip_lo=1e-3),
    )
    env_sim = DomainRandWrapperLive(env_sim, randomizer)
    dp_map = {
        0: ('Mr', 'mean'),
        1: ('Mr', 'std'),
        2: ('Mp', 'mean'),
        3: ('Mp', 'std'),
        4: ('Lr', 'mean'),
        5: ('Lr', 'std'),
        6: ('Lp', 'mean'),
        7: ('Lp', 'std'),
    }
    trafo_mask = [True]*8
    env_sim = MetaDomainRandWrapper(env_sim, dp_map)

    # Subroutine for policy improvement
    behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam)
    vfcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        gamma=0.9885,
        lamda=0.9648,
        num_epoch=2,
        batch_size=500,
        standardize_adv=False,
        lr=5.792e-4,
        max_grad_norm=1.,
    )
    critic = GAE(vfcn, **critic_hparam)
    subrtn_policy_hparam = dict(
        max_iter=200,
        min_steps=3*23*env_sim.max_steps,
        num_epoch=7,
        eps_clip=0.0744,
        batch_size=500,
        std_init=0.9074,
        lr=3.446e-04,
        max_grad_norm=1.,
        num_workers=1,
    )
    subrtn_policy = PPO(study_dir, env_sim, behav_policy, critic, **subrtn_policy_hparam)

    # Subroutine for system identification
    prior_std_denom = trial.suggest_uniform('prior_std_denom', 5, 20)
    prior = DomainRandomizer(
        NormalDomainParam(name='Mr', mean=0.095, std=0.095/prior_std_denom),
        NormalDomainParam(name='Mp', mean=0.024, std=0.024/prior_std_denom),
        NormalDomainParam(name='Lr', mean=0.085, std=0.085/prior_std_denom),
        NormalDomainParam(name='Lp', mean=0.129, std=0.129/prior_std_denom),
    )
    ddp_policy = DomainDistrParamPolicy(
        mapping=dp_map,
        trafo_mask=trafo_mask,
        prior=prior,
        scale_params=trial.suggest_categorical('ddp_policy_scale_params', [True, False]),
    )
    subsubrtn_distr_hparam = dict(
        max_iter=trial.suggest_categorical('subsubrtn_distr_max_iter', [20]),
        pop_size=trial.suggest_int('pop_size', 50, 500),
        num_rollouts=1,
        num_is_samples=trial.suggest_int('num_is_samples', 5, 20),
        expl_std_init=trial.suggest_loguniform('expl_std_init', 1e-3, 1e-1),
        expl_std_min=trial.suggest_categorical('expl_std_min', [1e-4]),
        extra_expl_std_init=trial.suggest_loguniform('extra_expl_std_init', 1e-3, 1e-1),
        extra_expl_decay_iter=trial.suggest_int('extra_expl_decay_iter', 0, 10),
        num_workers=1,
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}'))
    subsubrtn_distr = CEM(study_dir, env_sim, ddp_policy, **subsubrtn_distr_hparam, logger=csv_logger)
    obs_vel_weight = trial.suggest_loguniform('obs_vel_weight', 1, 100)
    subrtn_distr_hparam = dict(
        metric=None,
        obs_dim_weight=[1, 1, 1, 1, obs_vel_weight, obs_vel_weight],
        num_rollouts_per_distr=trial.suggest_int('num_rollouts_per_distr', 20, 100),
        num_workers=1,
    )
    subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behav_policy, **subrtn_distr_hparam)

    # Algorithm
    algo_hparam = dict(
        max_iter=trial.suggest_categorical('algo_max_iter', [10]),
        num_eval_rollouts=trial.suggest_categorical('algo_num_eval_rollouts', [5]),
        warmstart=trial.suggest_categorical('algo_warmstart', [True]),
        thold_succ_subrtn=trial.suggest_categorical('algo_thold_succ_subrtn', [50]),
        subrtn_snapshot_mode='latest',
    )
    algo = SimOpt(study_dir, env_sim, env_real, subrtn_policy, subrtn_distr, **algo_hparam, logger=csv_logger)

    # Jeeeha
    algo.train(seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(
        env_real, algo.policy, num_workers=1, min_rollouts=min_rollouts
    )  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret
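# A minimal driver sketch (not part of the original excerpt) showing how such an objective is typically handed to
# Optuna via `functools.partial`, as the docstring above describes. The study name, sampler settings, trial count,
# and output directory are illustrative assumptions; the Optuna calls themselves are standard API.
import functools
import os.path as osp

import optuna

if __name__ == '__main__':
    study_dir = osp.join('experiments', 'hopt_simopt')  # assumed output directory
    study = optuna.create_study(
        study_name='simopt_hopt',
        direction='maximize',
        sampler=optuna.samplers.TPESampler(n_startup_trials=5, seed=0),
    )
    # Optuna only passes `trial`; the remaining arguments are bound here
    study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=0), n_trials=10, n_jobs=1)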
# # NormalDomainParam(name="mass_pend_pole", mean=0.0227, std=0.0009), NormalDomainParam(name="mass_rot_pole", mean=0.0899, std=0.0039), NormalDomainParam(name="length_pend_pole", mean=0.1474, std=0.0046), NormalDomainParam(name="length_rot_pole", mean=0.0777, std=0.003), ) env = DomainRandWrapperLive(env, randomizer) # Policy policy = to.load(osp.join(ref_ex_dir, "policy.pt")) policy.init_param() # Critic vfcn = to.load(osp.join(ref_ex_dir, "valuefcn.pt")) vfcn.init_param() critic = GAE(vfcn, **hparams["critic"]) # Algorithm algo_hparam = hparams["subrtn"] algo_hparam.update( {"num_workers": 1}) # should be equivalent to the number of cores per job # algo_hparam.update({'max_iter': 300}) # algo_hparam.update({'max_iter': 600}) # algo_hparam.update({'min_steps': 3*algo_hparam['min_steps']}) algo = PPO(ex_dir, env, policy, critic, **algo_hparam) # Save the hyper-parameters save_dicts_to_yaml( dict(env=env_hparams, seed=args.seed), dict(policy=hparams["policy"]),
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        particle_hparam: dict,
        max_iter: int,
        num_particles: int,
        temperature: float,
        lr: float,
        horizon: int,
        std_init: float = 1.0,
        min_rollouts: int = None,
        min_steps: int = 10000,
        num_workers: int = 4,
        serial: bool = True,
        logger: StepLogger = None,
    ):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param particle_hparam: hyper-parameters for particle template construction
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_particles: number of distinct particles
        :param temperature: the temperature of the SVGD determines how jointly the training takes place
        :param lr: the learning rate for the update of the particles
        :param horizon: horizon for each particle
        :param std_init: initial standard deviation for the exploration
        :param min_rollouts: minimum number of rollouts sampled per policy update batch
        :param min_steps: minimum number of state transitions sampled per policy update batch
        :param num_workers: number of environments for parallel sampling
        :param serial: serial mode can be switched off which can be used to partly control the flow of SVPG from
                       outside
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not isinstance(particle_hparam, dict):
            raise pyrado.TypeErr(given=particle_hparam, expected_type=dict)
        if not all([key in particle_hparam for key in ["actor", "vfcn", "critic"]]):
            raise AttributeError

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy=None, logger=logger)

        # Store the inputs
        self._env = env
        self.num_particles = num_particles
        self.horizon = horizon
        self.lr = lr
        self.temperature = temperature
        self.serial = serial

        # Prepare placeholders for particles
        self.particles = [None] * num_particles
        self.particleSteps = [None] * num_particles
        self.expl_strats = [None] * num_particles
        self.optimizers = [None] * num_particles
        self.fixed_particles = [None] * num_particles
        self.fixed_expl_strats = [None] * num_particles
        self.samplers = [None] * num_particles
        self.count = 0
        self.update_count = 0

        # Particle factory
        actor = FNNPolicy(spec=env.spec, **particle_hparam["actor"])
        vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **particle_hparam["vfcn"])
        critic = GAE(vfcn, **particle_hparam["critic"])
        self.register_as_logger_parent(critic)
        particle = SVPGParticle(env.spec, actor, critic)

        for i in range(self.num_particles):
            self.particles[i] = deepcopy(particle)
            self.particles[i].init_param()
            self.expl_strats[i] = NormalActNoiseExplStrat(self.particles[i].actor, std_init)
            self.optimizers[i] = to.optim.Adam(self.expl_strats[i].parameters(), lr=self.lr)
            self.fixed_particles[i] = deepcopy(self.particles[i])
            self.fixed_expl_strats[i] = deepcopy(self.expl_strats[i])
            self.particleSteps[i] = 0
            if self.serial:
                self.samplers[i] = ParallelRolloutSampler(
                    env, self.expl_strats[i], num_workers, min_rollouts=min_rollouts, min_steps=min_steps
                )
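# A minimal construction sketch (an illustration, not from the original source) for the constructor above: it
# requires `particle_hparam` to contain the keys "actor", "vfcn", and "critic", which are unpacked into FNNPolicy
# and GAE as shown in the particle factory. The environment, save directory, and import paths are assumptions.
import torch as to

from pyrado.algorithms.step_based.svpg import SVPG
from pyrado.environments.pysim.ball_on_beam import BallOnBeamSim

env = BallOnBeamSim(dt=1 / 100.0, max_steps=500)
particle_hparam = dict(
    actor=dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh),  # kwargs for the actor FNNPolicy
    vfcn=dict(hidden_sizes=[16, 16], hidden_nonlin=to.tanh),  # kwargs for the value-function FNNPolicy
    critic=dict(gamma=0.99, lamda=0.95, batch_size=64, lr=1e-3),  # kwargs for the GAE critic
)
algo = SVPG(
    save_dir="/tmp/svpg_demo",  # assumed save directory
    env=env,
    particle_hparam=particle_hparam,
    max_iter=2,
    num_particles=3,
    temperature=1.0,
    lr=1e-3,
    horizon=50,
    num_workers=1,
)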
def test_snapshots_notmeta(ex_dir, env: SimEnv, policy, algo_class, algo_hparam):
    # Collect hyper-parameters, create algorithm, and train
    common_hparam = dict(max_iter=1, num_workers=1)
    common_hparam.update(algo_hparam)

    if issubclass(algo_class, ActorCritic):
        common_hparam.update(
            min_rollouts=3,
            critic=GAE(
                vfcn=FNNPolicy(
                    spec=EnvSpec(env.obs_space, ValueFunctionSpace), hidden_sizes=[16, 16], hidden_nonlin=to.tanh
                )
            ),
        )
    elif issubclass(algo_class, ParameterExploring):
        common_hparam.update(num_init_states_per_domain=1)
    elif issubclass(algo_class, (DQL, SAC)):
        common_hparam.update(memory_size=1000, num_updates_per_step=2, gamma=0.99, min_rollouts=1)
        fnn_hparam = dict(hidden_sizes=[8, 8], hidden_nonlin=to.tanh)
        if issubclass(algo_class, DQL):
            # Override the setting
            env = BallOnBeamDiscSim(env.dt, env.max_steps)
            net = FNN(
                input_size=DiscreteActQValPolicy.get_qfcn_input_size(env.spec),
                output_size=DiscreteActQValPolicy.get_qfcn_output_size(),
                **fnn_hparam,
            )
            policy = DiscreteActQValPolicy(spec=env.spec, net=net)
        else:
            # Override the setting
            env = ActNormWrapper(env)
            policy = TwoHeadedGRUPolicy(env.spec, shared_hidden_size=8, shared_num_recurrent_layers=1)
            obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
            common_hparam.update(qfcn_1=FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **fnn_hparam))
            common_hparam.update(qfcn_2=FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **fnn_hparam))
    else:
        raise NotImplementedError

    # Simulate training
    algo = algo_class(ex_dir, env, policy, **common_hparam)
    algo.policy.param_values += to.tensor([42.0])
    if isinstance(algo, ActorCritic):
        algo.critic.vfcn.param_values += to.tensor([42.0])

    # Save and load
    algo.save_snapshot(meta_info=None)
    algo_loaded = Algorithm.load_snapshot(load_dir=ex_dir)
    assert isinstance(algo_loaded, Algorithm)
    policy_loaded = algo_loaded.policy
    if isinstance(algo, ActorCritic):
        critic_loaded = algo_loaded.critic

    # Check
    assert all(algo.policy.param_values == policy_loaded.param_values)
    if isinstance(algo, ActorCritic):
        assert all(algo.critic.vfcn.param_values == critic_loaded.vfcn.param_values)

    # Load the experiment. Since we did not save any hyper-parameters, we ignore the errors when loading.
    env, policy, extra = load_experiment(ex_dir)
    assert isinstance(env, Env)
    assert isinstance(policy, Policy)
    assert isinstance(extra, dict)
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1 / 250.0, max_steps=1500)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical("exp_lr_scheduler_gamma", [None, 0.99, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical("hidden_sizes_policy", [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical("hidden_nonlin_policy", ["to_tanh", "to_relu"])),
    )

    # Critic
    vfcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical("hidden_sizes_critic", [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical("hidden_nonlin_critic", ["to_tanh", "to_relu"])),
    )
    critic_hparam = dict(
        batch_size=250,
        gamma=trial.suggest_uniform("gamma_critic", 0.99, 1.0),
        lamda=trial.suggest_uniform("lamda_critic", 0.95, 1.0),
        num_epoch=trial.suggest_int("num_epoch_critic", 1, 10),
        lr=trial.suggest_loguniform("lr_critic", 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical("standardize_adv_critic", [True, False]),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_critic", [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=300,
        batch_size=250,
        min_steps=trial.suggest_int("num_rollouts_algo", 10, 30) * env.max_steps,
        num_epoch=trial.suggest_int("num_epoch_algo", 1, 10),
        eps_clip=trial.suggest_uniform("eps_clip_algo", 0.05, 0.2),
        std_init=trial.suggest_uniform("std_init_algo", 0.5, 1.0),
        lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_algo", [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    algo = PPO(osp.join(study_dir, f"trial_{trial.number}"), env, policy, critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
# vfcn_hparam = dict(hidden_size=16, num_recurrent_layers=1, hidden_nonlin='tanh')  # RNN
vfcn_hparam = dict(hidden_size=16, num_recurrent_layers=1)  # LSTM & GRU
# vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
# vfcn = RNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
# vfcn = LSTMPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
vfcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
critic_hparam = dict(
    gamma=0.995,
    lamda=0.98,
    num_epoch=1,
    batch_size=100,
    lr=1e-4,
    standardize_adv=False,
    max_grad_norm=1.0,
)
critic_cand = GAE(vfcn, **critic_hparam)
critic_refs = GAE(deepcopy(vfcn), **critic_hparam)

subrtn_hparam_cand = dict(
    max_iter=400,
    # min_rollouts=0,  # will be overwritten by SPOTA
    min_steps=0,  # will be overwritten by SPOTA
    num_epoch=1,
    eps_clip=0.1,
    batch_size=100,
    std_init=0.8,
    max_grad_norm=1.0,
    lr=1e-4,
)
subrtn_hparam_refs = deepcopy(subrtn_hparam_cand)
    # NormalDomainParam(name='Mp', mean=0.0227, std=0.0009),
    NormalDomainParam(name='Mr', mean=0.0899, std=0.0039),
    NormalDomainParam(name='Lp', mean=0.1474, std=0.0046),
    NormalDomainParam(name='Lr', mean=0.0777, std=0.003),
)
env = DomainRandWrapperLive(env, randomizer)

# Policy
policy = to.load(osp.join(ref_ex_dir, 'policy.pt'))
policy.init_param()

# Critic
vfcn = to.load(osp.join(ref_ex_dir, 'valuefcn.pt'))
vfcn.init_param()
critic = GAE(vfcn, **hparams['critic'])

# Algorithm
algo_hparam = hparams['subrtn']
algo_hparam.update({'num_workers': 1})  # should be equivalent to the number of cores per job
# algo_hparam.update({'max_iter': 300})
# algo_hparam.update({'max_iter': 600})
# algo_hparam.update({'min_steps': 3*algo_hparam['min_steps']})
algo = PPO(ex_dir, env, policy, critic, **algo_hparam)

# Save the hyper-parameters
save_list_of_dicts_to_yaml([
    dict(env=env_hparams, seed=args.seed),
    dict(policy=hparams['policy']),
    dict(critic=hparams['critic']),
    dict(algo=algo_hparam, algo_name=algo.name)],
def test_parallel_sampling_deterministic_smoke_test_w_min_steps(
    tmpdir_factory, env: SimEnv, policy: Policy, algo, min_rollouts: int, min_steps: int
):
    env.max_steps = 20

    seeds = (0, 1)
    nums_workers = (1, 2, 4)

    logging_results = []
    rollout_results: List[List[List[List[StepSequence]]]] = []
    for seed in seeds:
        logging_results.append((seed, []))
        rollout_results.append([])
        for num_workers in nums_workers:
            pyrado.set_seed(seed)
            policy.init_param(None)
            ex_dir = str(tmpdir_factory.mktemp(f"seed={seed}-num_workers={num_workers}"))
            set_log_prefix_dir(ex_dir)
            vfcn = FNN(input_size=env.obs_space.flat_dim, output_size=1, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
            critic = GAE(vfcn, gamma=0.98, lamda=0.95, batch_size=32, lr=1e-3, standardize_adv=False)
            alg = algo(
                ex_dir,
                env,
                policy,
                critic,
                max_iter=3,
                min_rollouts=min_rollouts,
                min_steps=min_steps * env.max_steps,
                num_workers=num_workers,
            )
            alg.sampler = RolloutSavingWrapper(alg.sampler)
            alg.train()
            with open(f"{ex_dir}/progress.csv") as f:
                logging_results[-1][1].append(str(f.read()))
            rollout_results[-1].append(alg.sampler.rollouts)

    # Test that the observations for all numbers of workers are equal.
    for rollouts in rollout_results:
        for ros_a, ros_b in [(a, b) for a in rollouts for b in rollouts]:
            assert len(ros_a) == len(ros_b)
            for ro_a, ro_b in zip(ros_a, ros_b):
                assert len(ro_a) == len(ro_b)
                for r_a, r_b in zip(ro_a, ro_b):
                    assert r_a.observations == pytest.approx(r_b.observations)

    # Test that different seeds actually produce different results.
    for results_a, results_b in [
        (a, b) for seed_a, a in logging_results for seed_b, b in logging_results if seed_a != seed_b
    ]:
        for result_a, result_b in [(a, b) for a in results_a for b in results_b if a is not b]:
            assert result_a != result_b

    # Test that same seeds produce same results.
    for _, results in logging_results:
        for result_a, result_b in [(a, b) for a in results for b in results]:
            assert result_a == result_b
# vfcn_hparam = dict(hidden_size=32, num_recurrent_layers=1)  # LSTM & GRU
vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
# vfcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
critic_hparam = dict(
    gamma=0.9844224855479998,
    lamda=0.9700148505302241,
    num_epoch=5,
    batch_size=500,
    standardize_adv=False,
    lr=7.058326426522811e-4,
    max_grad_norm=6.0,
    lr_scheduler=lr_scheduler.ExponentialLR,
    lr_scheduler_hparam=dict(gamma=0.999),
)
critic = GAE(vfcn, **critic_hparam)

# Subroutine
algo_hparam = dict(
    max_iter=args.ppo_iterations,
    eps_clip=0.12648736789309026,
    min_steps=30 * env.max_steps,
    num_epoch=7,
    batch_size=500,
    std_init=0.7573286998997557,
    lr=6.999956625305722e-04,
    max_grad_norm=1.0,
    num_workers=8,
    lr_scheduler=lr_scheduler.ExponentialLR,
    lr_scheduler_hparam=dict(gamma=0.999),
)
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1/100., max_steps=600)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical('exp_lr_scheduler_gamma', [None, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy_hparam = dict(
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )  # FNN
    # policy_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_policy', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_policy', [1, 2]),
    # )  # LSTM & GRU
    policy = FNNPolicy(spec=env.spec, **policy_hparam)
    # policy = GRUPolicy(spec=env.spec, **policy_hparam)

    # Critic
    vfcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    # vfcn_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_critic', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_critic', [1, 2]),
    # )  # LSTM & GRU
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    # vfcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        batch_size=500,
        gamma=trial.suggest_uniform('gamma_critic', 0.98, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [False]),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_critic', [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=250,
        batch_size=500,
        min_steps=trial.suggest_int('num_rollouts_algo', 10, 30)*env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        std_init=trial.suggest_uniform('std_init_algo', 0.5, 1.0),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_algo', [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}'))
    algo = PPO(osp.join(study_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret