def test_spota_ppo(env, spota_hparam, ex_dir):
    # Environment and domain randomization
    randomizer = get_default_randomizer(env)
    env = DomainRandWrapperBuffer(env, randomizer)

    # Policy and subroutines
    policy = FNNPolicy(env.spec, [16, 16], hidden_nonlin=to.tanh)
    value_fcn = FNN(input_size=env.obs_space.flat_dim,
                    output_size=1,
                    hidden_sizes=[16, 16],
                    hidden_nonlin=to.tanh)
    critic_hparam = dict(gamma=0.998, lamda=0.95, num_epoch=3, batch_size=64, lr=1e-3)
    critic_cand = GAE(value_fcn, **critic_hparam)
    critic_refs = GAE(deepcopy(value_fcn), **critic_hparam)

    subrtn_hparam_cand = dict(
        # min_rollouts=0,  # will be overwritten by SPOTA
        min_steps=0,  # will be overwritten by SPOTA
        max_iter=2,
        num_epoch=3,
        eps_clip=0.1,
        batch_size=64,
        num_sampler_envs=4,
        std_init=0.5,
        lr=1e-2,
    )

    sr_cand = PPO(ex_dir, env, policy, critic_cand, **subrtn_hparam_cand)
    sr_refs = PPO(ex_dir, env, deepcopy(policy), critic_refs, **subrtn_hparam_cand)

    # Create algorithm and train
    algo = SPOTA(ex_dir, env, sr_cand, sr_refs, **spota_hparam)
    algo.train()
def test_adr(env, ex_dir, subrtn_hparam, actor_hparam, value_fcn_hparam, critic_hparam, adr_hparam):
    # Create the subroutine for the meta-algorithm
    actor = FNNPolicy(spec=env.spec, **actor_hparam)
    value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **value_fcn_hparam)
    critic = GAE(value_fcn, **critic_hparam)
    subroutine = PPO(ex_dir, env, actor, critic, **subrtn_hparam)

    # Create algorithm and train
    particle_hparam = dict(actor=actor_hparam, value_fcn=value_fcn_hparam, critic=critic_hparam)
    algo = ADR(ex_dir, env, subroutine, svpg_particle_hparam=particle_hparam, **adr_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
def test_actor_critic(env, linear_policy, ex_dir, algo, algo_hparam, value_fcn_type, use_cuda):
    # Create value function
    if value_fcn_type == 'fnn-plain':
        value_fcn = FNN(input_size=env.obs_space.flat_dim,
                        output_size=1,
                        hidden_sizes=[16, 16],
                        hidden_nonlin=to.tanh,
                        use_cuda=use_cuda)
    else:
        vf_spec = EnvSpec(env.obs_space, ValueFunctionSpace)
        if value_fcn_type == 'fnn':
            value_fcn = FNNPolicy(vf_spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh, use_cuda=use_cuda)
        else:
            value_fcn = RNNPolicy(vf_spec, hidden_size=16, num_recurrent_layers=1, use_cuda=use_cuda)

    # Create critic
    critic_hparam = dict(
        gamma=0.98,
        lamda=0.95,
        batch_size=32,
        lr=1e-3,
        standardize_adv=False,
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Common hyper-parameters
    common_hparam = dict(max_iter=3, min_rollouts=3, num_sampler_envs=1)
    # Add specific hyper-parameters if any
    common_hparam.update(algo_hparam)

    # Create algorithm and train
    algo = algo(ex_dir, env, linear_policy, critic, **common_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
def test_snapshots_notmeta(ex_dir, env, policy, algo_class, algo_hparam):
    # Collect hyper-parameters, create algorithm, and train
    common_hparam = dict(max_iter=1, num_sampler_envs=1)
    common_hparam.update(algo_hparam)

    if issubclass(algo_class, ActorCritic):
        common_hparam.update(
            min_rollouts=3,
            critic=GAE(value_fcn=FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                                           hidden_sizes=[16, 16],
                                           hidden_nonlin=to.tanh))
        )
    elif issubclass(algo_class, ParameterExploring):
        common_hparam.update(num_rollouts=1)
    else:
        raise NotImplementedError

    # Train
    algo = algo_class(ex_dir, env, policy, **common_hparam)
    algo.train()

    if isinstance(algo, ActorCritic):
        policy_posttrn_param_values = algo.policy.param_values
        critic_posttrn_value_fcn_param_values = algo.critic.value_fcn.param_values
    elif isinstance(algo, ParameterExploring):
        policy_posttrn_param_values = algo.best_policy_param

    # Save and load
    algo.save_snapshot(meta_info=None)
    algo.load_snapshot(load_dir=ex_dir, meta_info=None)
    policy_loaded = deepcopy(algo.policy)

    # Check
    assert all(policy_posttrn_param_values == policy_loaded.param_values)
    if algo_class in [A2C, PPO, PPO2]:
        critic_loaded = deepcopy(algo.critic)
        assert all(critic_posttrn_value_fcn_param_values == critic_loaded.value_fcn.param_values)
    num_recurrent_layers=1)  # LSTM & GRU
# value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **value_fcn_hparam)
# value_fcn = RNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **value_fcn_hparam)
# value_fcn = LSTMPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **value_fcn_hparam)
value_fcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **value_fcn_hparam)
critic_hparam = dict(
    gamma=0.995,
    lamda=0.98,
    num_epoch=1,
    batch_size=100,
    lr=1e-4,
    standardize_adv=False,
    max_grad_norm=1.,
)
critic_cand = GAE(value_fcn, **critic_hparam)
critic_refs = GAE(deepcopy(value_fcn), **critic_hparam)

subrtn_hparam_cand = dict(
    max_iter=400,
    # min_rollouts=0,  # will be overwritten by SPOTA
    min_steps=0,  # will be overwritten by SPOTA
    num_epoch=1,
    eps_clip=0.1,
    batch_size=100,
    std_init=0.8,
    max_grad_norm=1.,
    lr=1e-4,
)
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1/250., max_steps=1500)
    env = ActNormWrapper(env)

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )

    # Critic
    value_fcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [[16, 16], [32, 32], [64, 64]]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    critic_hparam = dict(
        gamma=trial.suggest_uniform('gamma_critic', 0.99, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        batch_size=100,
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [True, False]),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    critic = GAE(value_fcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_sampler_envs=1,  # parallelize via optuna n_jobs
        max_iter=500,
        min_steps=25*env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        batch_size=100,
        std_init=0.9,
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        # max_grad_norm=5.,
        # lr_scheduler=scheduler.StepLR,
        # lr_scheduler_hparam=dict(step_size=10, gamma=0.9)
        # lr_scheduler=scheduler.ExponentialLR,
        # lr_scheduler_hparam=dict(gamma=0.99)
    )
    algo = PPO(osp.join(ex_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelSampler(env, policy, num_envs=20, min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret
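# Usage sketch (not taken from the original script): a minimal example of how `train_and_eval` could be
# handed to an Optuna study via `functools.partial`, as the docstring above suggests. The directory name,
# seed, and the trial/job counts below are illustrative assumptions, not values from the source.
if __name__ == '__main__':
    import functools
    import optuna

    ex_dir = 'hparam_opt'  # hypothetical parent directory for all trials of this study
    study = optuna.create_study(direction='maximize')
    study.optimize(functools.partial(train_and_eval, ex_dir=ex_dir, seed=1001), n_trials=100, n_jobs=4)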
def __init__(self,
             save_dir: str,
             env: Env,
             policy: Policy,
             critic: GAE,
             max_iter: int,
             min_rollouts: int = None,
             min_steps: int = None,
             num_epoch: int = 3,
             eps_clip: float = 0.1,
             value_fcn_coeff: float = 0.5,
             entropy_coeff: float = 1e-3,
             batch_size: int = 32,
             std_init: float = 1.0,
             num_sampler_envs: int = 4,
             max_grad_norm: float = None,
             lr: float = 5e-4,
             lr_scheduler=None,
             lr_scheduler_hparam: [dict, None] = None,
             logger: StepLogger = None):
    """
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param critic: advantage estimation function $A(s,a) = Q(s,a) - V(s)$
    :param max_iter: number of iterations (policy updates)
    :param min_rollouts: minimum number of rollouts sampled per policy update batch
    :param min_steps: minimum number of state transitions sampled per policy update batch
    :param num_epoch: number of iterations over all gathered samples during one policy update
    :param eps_clip: max/min probability ratio, see [1]
    :param value_fcn_coeff: weighting factor of the value function term in the combined loss, specific to PPO2
    :param entropy_coeff: weighting factor of the entropy term in the combined loss, specific to PPO2
    :param batch_size: number of samples per policy update batch
    :param std_init: initial standard deviation on the actions for the exploration noise
    :param num_sampler_envs: number of environments for parallel sampling
    :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
    :param lr: (initial) learning rate for the optimizer, which can be modified by the scheduler.
               By default, the learning rate is constant.
    :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
    :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created

    .. note::
        The Adam optimizer computes individual learning rates for all parameters. Thus, the learning rate
        scheduler schedules the maximum learning rate.
    """
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)

    # Call ActorCritic's constructor
    super().__init__(env, policy, critic, save_dir, max_iter, logger)

    critic.standardize_adv = True  # enforce this for PPO2

    # Store the inputs
    self.num_epoch = num_epoch
    self.eps_clip = eps_clip
    self.value_fcn_coeff = value_fcn_coeff
    self.entropy_coeff = entropy_coeff
    self.batch_size = batch_size
    self.max_grad_norm = max_grad_norm

    # Initialize
    self.log_loss = True
    self._expl_strat = NormalActNoiseExplStrat(self._policy, std_init=std_init)
    self.sampler = ParallelSampler(env, self._expl_strat,
                                   num_envs=num_sampler_envs,
                                   min_steps=min_steps,
                                   min_rollouts=min_rollouts)
    self.optim = to.optim.Adam(
        [{'params': self._expl_strat.policy.parameters()},
         {'params': self._expl_strat.noise.parameters()},
         {'params': self._critic.value_fcn.parameters()}],
        lr=lr, eps=1e-5
    )
    self._lr_scheduler = lr_scheduler
    self._lr_scheduler_hparam = lr_scheduler_hparam
    if lr_scheduler is not None:
        self._lr_scheduler = lr_scheduler(self.optim, **lr_scheduler_hparam)
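# Illustrative sketch (not this class's update method): how a PPO2-style combined loss, as described in the
# docstring above, is commonly assembled from the clipped surrogate objective, a value-function term weighted
# by `value_fcn_coeff`, and an entropy bonus weighted by `entropy_coeff`. The function name, tensor arguments,
# and default values are assumptions for illustration only.
import torch as to


def ppo2_combined_loss(log_probs: to.Tensor, old_log_probs: to.Tensor, adv: to.Tensor,
                       v_pred: to.Tensor, v_target: to.Tensor, entropy: to.Tensor,
                       eps_clip: float = 0.1, value_fcn_coeff: float = 0.5,
                       entropy_coeff: float = 1e-3) -> to.Tensor:
    # Probability ratio between the updated and the old policy
    ratio = to.exp(log_probs - old_log_probs)
    # Clipped surrogate objective (to be maximized, hence the minus sign in the return statement)
    surr = to.min(ratio*adv, to.clamp(ratio, 1 - eps_clip, 1 + eps_clip)*adv)
    # Value function regression loss
    value_loss = to.pow(v_pred - v_target, 2).mean()
    # Combined loss: policy term + weighted value term - weighted entropy bonus
    return -surr.mean() + value_fcn_coeff*value_loss - entropy_coeff*entropy.mean()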
def __init__(self,
             save_dir: str,
             env: Env,
             particle_hparam: dict,
             max_iter: int,
             num_particles: int,
             temperature: float,
             lr: float,
             horizon: int,
             std_init: float = 1.0,
             min_rollouts: int = None,
             min_steps: int = 10000,
             num_sampler_envs: int = 4,
             serial: bool = True,
             logger: StepLogger = None):
    """
    Constructor

    :param save_dir: directory to save the snapshots i.e. the results in
    :param env: the environment in which the policy operates
    :param particle_hparam: hyper-parameters for particle template construction
    :param max_iter: number of iterations
    :param num_particles: number of distinct particles
    :param temperature: temperature of the SVGD, determines how jointly the training takes place
    :param lr: the learning rate for the update of the particles
    :param horizon: horizon for each particle
    :param std_init: initial standard deviation for the exploration
    :param min_rollouts: minimum number of rollouts sampled per policy update batch
    :param min_steps: minimum number of state transitions sampled per policy update batch
    :param num_sampler_envs: number of environments for parallel sampling
    :param serial: serial mode can be switched off, which can be used to partly control the flow of SVPG from outside
    :param logger: logger for every step of the algorithm
    """
    if not isinstance(env, Env):
        raise pyrado.TypeErr(given=env, expected_type=Env)
    if not isinstance(particle_hparam, dict):
        raise pyrado.TypeErr(given=particle_hparam, expected_type=dict)
    if not all([key in particle_hparam for key in ['actor', 'value_fcn', 'critic']]):
        raise AttributeError

    # Call Algorithm's constructor
    super().__init__(save_dir, max_iter, policy=None, logger=logger)

    # Store the inputs
    self._env = env
    self.num_particles = num_particles
    self.horizon = horizon  # TODO @Robin: where is the horizon used?!
    self.lr = lr
    self.temperature = temperature
    self.serial = serial

    # Prepare placeholders for particles
    self.particles = [None]*num_particles
    self.expl_strats = [None]*num_particles
    self.optimizers = [None]*num_particles
    self.fixed_particles = [None]*num_particles
    self.fixed_expl_strats = [None]*num_particles
    self.samplers = [None]*num_particles
    self.count = 0
    self.updatecount = 0

    # Particle factory
    actor = FNNPolicy(spec=env.spec, **particle_hparam['actor'])
    value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **particle_hparam['value_fcn'])
    critic = GAE(value_fcn, **particle_hparam['critic'])
    particle = SVPGParticle(env.spec, actor, critic)

    for i in range(self.num_particles):
        self.particles[i] = deepcopy(particle)
        self.particles[i].init_param()
        self.expl_strats[i] = NormalActNoiseExplStrat(self.particles[i].actor, std_init)
        self.optimizers[i] = to.optim.Adam(self.expl_strats[i].parameters(), lr=self.lr)
        self.fixed_particles[i] = deepcopy(self.particles[i])
        self.fixed_expl_strats[i] = deepcopy(self.expl_strats[i])
        if self.serial:
            self.samplers[i] = ParallelSampler(env, self.expl_strats[i], num_sampler_envs,
                                               min_rollouts=min_rollouts, min_steps=min_steps)
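# Illustrative sketch (not this class's update step): the core SVGD direction that the `temperature`
# parameter above refers to. Each particle's update mixes the temperature-scaled policy gradients of all
# particles through an RBF kernel (attraction) and adds a kernel-gradient term that pushes particles apart
# (repulsion). Function and variable names are assumptions for illustration only.
import torch as to


def svgd_update_direction(params: to.Tensor, policy_grads: to.Tensor, temperature: float) -> to.Tensor:
    """
    :param params: flattened particle parameters, shape [num_particles, num_params]
    :param policy_grads: per-particle policy gradient estimates, same shape as `params`
    :param temperature: SVGD temperature; larger values emphasize the repulsive term
    :return: update direction for every particle, same shape as `params`
    """
    n = params.shape[0]
    # Pairwise squared distances and RBF kernel with the median heuristic bandwidth
    dists = to.cdist(params, params, p=2)**2
    bandwidth = to.median(dists)/(2.*to.log(to.tensor(n + 1.)))
    kernel = to.exp(-dists/(bandwidth + 1e-8))  # shape [n, n]
    # Attractive term: kernel-weighted average of the temperature-scaled policy gradients
    attraction = kernel @ (policy_grads/temperature)
    # Repulsive term: gradient of the RBF kernel w.r.t. the particles
    repulsion = (kernel.sum(dim=1, keepdim=True)*params - kernel @ params)*2./(bandwidth + 1e-8)
    return (attraction + repulsion)/n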
policy = FNNPolicy(spec=env.spec, **policy_hparam)

# Critic
value_fcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **value_fcn_hparam)
critic_hparam = dict(
    gamma=0.995,
    lamda=0.95,
    num_epoch=10,
    batch_size=512,
    standardize_adv=False,
    standardizer=None,
    max_grad_norm=1.,
    lr=5e-4,
)
critic = GAE(value_fcn, **critic_hparam)

# Algorithm
algo_hparam = dict(
    max_iter=500,
    min_steps=20*env.max_steps,
    num_epoch=10,
    eps_clip=0.15,
    batch_size=512,
    max_grad_norm=1.,
    lr=3e-4,
    num_sampler_envs=12,
)
algo = PPO(ex_dir, env, policy, critic, **algo_hparam)

# Save the hyper-parameters
if __name__ == '__main__':
    # Experiment (set seed before creating the modules)
    ex_dir = ask_for_experiment()

    # Environments
    hparams = load_dict_from_yaml(osp.join(ex_dir, 'hyperparams.yaml'))
    env_sim = joblib.load(osp.join(ex_dir, 'env_sim.pkl'))
    env_real = joblib.load(osp.join(ex_dir, 'env_real.pkl'))

    # Policy
    policy = to.load(osp.join(ex_dir, 'policy.pt'))

    # Critic
    valuefcn = to.load(osp.join(ex_dir, 'valuefcn.pt'))
    critic = GAE(valuefcn, **hparams['critic'])

    # Subroutine
    algo_hparam = hparams['subroutine']
    # algo_hparam.update({'num_sampler_envs': 1})
    ppo = PPO(ex_dir, env_sim, policy, critic, **algo_hparam)

    # Set the boundaries for the GP
    bounds = to.load(osp.join(ex_dir, 'bounds.pt'))

    # Algorithm
    algo = BayRn(ex_dir, env_sim, env_real, subroutine=ppo, bounds=bounds,
# Get the experiment's directory to load from
ex_dir = ask_for_experiment() if args.ex_dir is None else args.ex_dir

# Load the environment and the policy
env_sim, policy, kwout = load_experiment(ex_dir, args)

# Load the required data
cands = to.load(osp.join(ex_dir, 'candidates.pt'))
cands_values = to.load(osp.join(ex_dir, 'candidates_values.pt')).unsqueeze(1)
bounds = to.load(osp.join(ex_dir, 'bounds.pt'))
uc_normalizer = UnitCubeProjector(bounds[0, :], bounds[1, :])

# Decide on which algorithm to use via the mode argument
if args.mode == PPO.name:
    critic = GAE(kwout['value_fcn'], **kwout['hparams']['critic'])
    subroutine = PPO(ex_dir, env_sim, policy, critic, **kwout['hparams']['subroutine'])
elif args.mode == PPO2.name:
    critic = GAE(kwout['value_fcn'], **kwout['hparams']['critic'])
    subroutine = PPO2(ex_dir, env_sim, policy, critic, **kwout['hparams']['subroutine'])
elif args.mode == CEM.name:
    subroutine = CEM(ex_dir, env_sim, policy, **kwout['hparams']['subroutine'])
elif args.mode == NES.name:
    subroutine = NES(ex_dir, env_sim, policy, **kwout['hparams']['subroutine'])
elif args.mode == PoWER.name:
    subroutine = PoWER(ex_dir, env_sim, policy, **kwout['hparams']['subroutine'])
else:
    raise NotImplementedError('Only PPO, PPO2, CEM, NES, and PoWER are implemented so far.')

if args.warmstart:
    ppi = policy.param_values.data