def test_actor_critic(ex_dir, env: SimEnv, policy: Policy, algo, algo_hparam, vfcn_type, use_cuda):
    if use_cuda:
        policy._device = 'cuda'
        policy = policy.to(device='cuda')

    # Create value function
    if vfcn_type == 'fnn-plain':
        vfcn = FNN(input_size=env.obs_space.flat_dim,
                   output_size=1,
                   hidden_sizes=[16, 16],
                   hidden_nonlin=to.tanh,
                   use_cuda=use_cuda)
    else:
        vf_spec = EnvSpec(env.obs_space, ValueFunctionSpace)
        if vfcn_type == 'fnn':
            vfcn = FNNPolicy(vf_spec,
                             hidden_sizes=[16, 16],
                             hidden_nonlin=to.tanh,
                             use_cuda=use_cuda)
        else:
            vfcn = RNNPolicy(vf_spec,
                             hidden_size=16,
                             num_recurrent_layers=1,
                             use_cuda=use_cuda)

    # Create critic
    critic_hparam = dict(
        gamma=0.98,
        lamda=0.95,
        batch_size=32,
        lr=1e-3,
        standardize_adv=False,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Common hyper-parameters
    common_hparam = dict(max_iter=2, min_rollouts=3, num_workers=1)
    # Add algorithm-specific hyper-parameters, if any
    common_hparam.update(algo_hparam)

    # Create algorithm and train
    algo = algo(ex_dir, env, policy, critic, **common_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
def test_spota_ppo(ex_dir, env: SimEnv, spota_hparam):
    # Environment and domain randomization
    randomizer = create_default_randomizer(env)
    env = DomainRandWrapperBuffer(env, randomizer)

    # Policy and subroutines
    policy = FNNPolicy(env.spec, [16, 16], hidden_nonlin=to.tanh)
    vfcn = FNN(input_size=env.obs_space.flat_dim,
               output_size=1,
               hidden_sizes=[16, 16],
               hidden_nonlin=to.tanh)
    critic_hparam = dict(gamma=0.998, lamda=0.95, num_epoch=3, batch_size=64, lr=1e-3)
    critic_cand = GAE(vfcn, **critic_hparam)
    critic_refs = GAE(deepcopy(vfcn), **critic_hparam)

    subrtn_hparam_cand = dict(
        # min_rollouts=0,  # will be overwritten by SPOTA
        min_steps=0,  # will be overwritten by SPOTA
        max_iter=2,
        num_epoch=3,
        eps_clip=0.1,
        batch_size=64,
        num_workers=1,
        std_init=0.5,
        lr=1e-2,
    )
    # The candidate and reference subroutines share the same hyper-parameters
    sr_cand = PPO(ex_dir, env, policy, critic_cand, **subrtn_hparam_cand)
    sr_refs = PPO(ex_dir, env, deepcopy(policy), critic_refs, **subrtn_hparam_cand)

    # Create algorithm and train
    algo = SPOTA(ex_dir, env, sr_cand, sr_refs, **spota_hparam)
    algo.train()
# Set seed if desired
pyrado.set_seed(args.seed, verbose=True)

# Environment
env_hparams = dict(dt=1/100., max_steps=500)
env = BallOnBeamDiscSim(**env_hparams)

# Policy
policy_hparam = dict(
    hidden_sizes=[32, 32],
    hidden_nonlin=to.tanh,
)
net = FNN(
    input_size=DiscreteActQValPolicy.get_qfcn_input_size(env.spec),
    output_size=DiscreteActQValPolicy.get_qfcn_output_size(),
    **policy_hparam,
)
policy = DiscreteActQValPolicy(spec=env.spec, net=net)

# Algorithm
algo_hparam = dict(
    max_iter=5000,
    memory_size=10*env.max_steps,
    eps_init=0.1286,
    eps_schedule_gamma=0.9955,
    gamma=0.998,
    target_update_intvl=5,
    num_batch_updates=20,
    max_grad_norm=0.5,
    min_steps=10,
class TwoHeadedFNNPolicy(TwoHeadedPolicy):
    """ Policy architecture with a common body and two heads, each with its own last layer """

    name: str = 'thfnn'

    def __init__(self,
                 spec: EnvSpec,
                 shared_hidden_sizes: Sequence[int],
                 shared_hidden_nonlin: Union[Callable, Sequence[Callable]],
                 head_1_size: int = None,
                 head_2_size: int = None,
                 head_1_output_nonlin: Callable = None,
                 head_2_output_nonlin: Callable = None,
                 shared_dropout: float = 0.,
                 init_param_kwargs: dict = None,
                 use_cuda: bool = False):
        """
        Constructor

        :param spec: environment specification
        :param shared_hidden_sizes: sizes of the shared hidden layers' outputs; every entry creates one shared hidden layer
        :param shared_hidden_nonlin: nonlinearity for the shared hidden layers
        :param head_1_size: size of the fully connected layer for head 1; if `None`, this is set to the action space dim
        :param head_2_size: size of the fully connected layer for head 2; if `None`, this is set to the action space dim
        :param head_1_output_nonlin: nonlinearity for the output layer of the first head
        :param head_2_output_nonlin: nonlinearity for the output layer of the second head
        :param shared_dropout: dropout probability; the default of 0 deactivates dropout
        :param init_param_kwargs: additional keyword arguments for the policy parameter initialization
        :param use_cuda: `True` to move the policy to the GPU, `False` (default) to use the CPU
        """
        super().__init__(spec, use_cuda)

        # Create the feed-forward neural network for the shared body
        self.shared = FNN(input_size=spec.obs_space.flat_dim,
                          output_size=shared_hidden_sizes[-1],
                          hidden_sizes=shared_hidden_sizes,
                          hidden_nonlin=shared_hidden_nonlin,
                          dropout=shared_dropout,
                          output_nonlin=None)

        # Create the output layer of each head
        head_1_size = spec.act_space.flat_dim if head_1_size is None else head_1_size
        head_2_size = spec.act_space.flat_dim if head_2_size is None else head_2_size
        self.head_1 = nn.Linear(shared_hidden_sizes[-1], head_1_size)
        self.head_2 = nn.Linear(shared_hidden_sizes[-1], head_2_size)
        self.head_1_output_nonlin = head_1_output_nonlin
        self.head_2_output_nonlin = head_2_output_nonlin

        # Call custom initialization function after PyTorch network parameter initialization
        init_param_kwargs = init_param_kwargs if init_param_kwargs is not None else dict()
        self.init_param(None, **init_param_kwargs)
        self.to(self.device)

    def init_param(self, init_values: to.Tensor = None, **kwargs):
        if init_values is None:
            self.shared.init_param(None, **kwargs)
            init_param(self.head_1, **kwargs)
            init_param(self.head_2, **kwargs)
        else:
            self.param_values = init_values

    def forward(self, obs: to.Tensor) -> Tuple[to.Tensor, to.Tensor]:
        obs = obs.to(self.device)

        # Get the output of the last shared layer and pass it to the two heads separately
        x = self.shared(obs)
        output_1 = self.head_1(x)
        output_2 = self.head_2(x)
        if self.head_1_output_nonlin is not None:
            output_1 = self.head_1_output_nonlin(output_1)
        if self.head_2_output_nonlin is not None:
            output_2 = self.head_2_output_nonlin(output_2)
        return output_1, output_2
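# --- Hedged usage sketch (not from the source): construct a TwoHeadedFNNPolicy for a made-up
# 4-dim observation / 2-dim action environment and run one forward pass. The
# `EnvSpec(obs_space, act_space)` constructor and the `BoxSpace` import path are assumptions
# inferred from their use elsewhere in this file; `TwoHeadedFNNPolicy` is defined above.
import torch as to
from pyrado.spaces import BoxSpace
from pyrado.utils.data_types import EnvSpec

spec = EnvSpec(obs_space=BoxSpace(-1., 1., shape=4),  # assumption: scalar bounds broadcast over the shape
               act_space=BoxSpace(-1., 1., shape=2))
policy = TwoHeadedFNNPolicy(spec,
                            shared_hidden_sizes=[16, 16],
                            shared_hidden_nonlin=to.tanh,
                            head_1_output_nonlin=to.tanh,  # e.g. a bounded mean head
                            head_2_output_nonlin=to.exp)  # e.g. a strictly positive std head
mean, std = policy(to.rand(spec.obs_space.flat_dim))
assert mean.shape == (2,) and std.shape == (2,)  # both heads default to the action space dim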
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1 / 250., max_steps=1500)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical('exp_lr_scheduler_gamma', [None, 0.99, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )

    # Critic
    vfcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    critic_hparam = dict(
        batch_size=250,
        gamma=trial.suggest_uniform('gamma_critic', 0.99, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [True, False]),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_critic', [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=300,
        batch_size=250,
        min_steps=trial.suggest_int('num_rollouts_algo', 10, 30) * env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        std_init=trial.suggest_uniform('std_init_algo', 0.5, 1.0),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_algo', [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    algo = PPO(osp.join(study_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
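# --- Hedged sketch (not from the source) of how an objective like `train_and_eval` is
# typically handed to Optuna: `functools.partial` binds `study_dir` and `seed` so that
# Optuna only sees the `trial` argument, as the docstring's note describes. The study
# direction, trial count, and directory are illustrative placeholders.
import functools
import optuna

study = optuna.create_study(direction='maximize')
study.optimize(functools.partial(train_and_eval, study_dir='/tmp/qbb_ppo_study', seed=1001),
               n_trials=100,
               n_jobs=4)  # trials run in parallel; each trial samples with num_workers=1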
randomizer = create_empty_randomizer()
env = ActDelayWrapper(env)
randomizer.add_domain_params(
    UniformDomainParam(name='act_delay', mean=2, halfspan=2, clip_lo=0, roundint=True))
env = DomainRandWrapperLive(env, randomizer)

# Policy
policy_hparam = dict(
    obs_layer=FNN(input_size=env.obs_space.flat_dim,
                  output_size=env.act_space.flat_dim,
                  hidden_sizes=[32, 32],
                  hidden_nonlin=to.tanh,
                  dropout=0.),
    tau_init=10.,
    tau_learnable=True,
    kappa_init=0.02,
    kappa_learnable=True,
    activation_nonlin=to.sigmoid,
    potentials_dyn_fcn=pd_cubic,
)
policy = ADNPolicy(spec=env.spec, **policy_hparam)

# Algorithm
algo_hparam = dict(
    max_iter=5000,
    pop_size=None,
def test_snapshots_notmeta(ex_dir, env: SimEnv, policy, algo_class, algo_hparam):
    # Collect hyper-parameters, create algorithm, and train
    common_hparam = dict(max_iter=1, num_workers=1)
    common_hparam.update(algo_hparam)

    if issubclass(algo_class, ActorCritic):
        common_hparam.update(
            min_rollouts=3,
            critic=GAE(vfcn=FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace),
                                      hidden_sizes=[16, 16],
                                      hidden_nonlin=to.tanh)))
    elif issubclass(algo_class, ParameterExploring):
        common_hparam.update(num_rollouts=1)
    elif issubclass(algo_class, (DQL, SAC)):
        common_hparam.update(memory_size=1000, num_batch_updates=2, gamma=0.99, min_rollouts=1)
        fnn_hparam = dict(hidden_sizes=[8, 8], hidden_nonlin=to.tanh)
        if issubclass(algo_class, DQL):
            # Override the environment and policy with a discrete-action Q-learning setup
            env = BallOnBeamDiscSim(env.dt, env.max_steps)
            net = FNN(input_size=DiscreteActQValPolicy.get_qfcn_input_size(env.spec),
                      output_size=DiscreteActQValPolicy.get_qfcn_output_size(),
                      **fnn_hparam)
            policy = DiscreteActQValPolicy(spec=env.spec, net=net)
        else:
            # Override the environment and policy with a normalized-action SAC setup
            env = ActNormWrapper(env)
            policy = TwoHeadedGRUPolicy(env.spec, shared_hidden_size=8, shared_num_recurrent_layers=1)
            obsact_space = BoxSpace.cat([env.obs_space, env.act_space])
            common_hparam.update(qfcn_1=FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **fnn_hparam))
            common_hparam.update(qfcn_2=FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **fnn_hparam))
    else:
        raise NotImplementedError

    # Simulate training
    algo = algo_class(ex_dir, env, policy, **common_hparam)
    algo.policy.param_values += to.tensor([42.])
    if isinstance(algo, ActorCritic):
        algo.critic.vfcn.param_values += to.tensor([42.])

    # Save and load
    algo.save_snapshot(meta_info=None)
    algo_loaded = Algorithm.load_snapshot(load_dir=ex_dir)
    assert isinstance(algo_loaded, Algorithm)
    policy_loaded = algo_loaded.policy
    if isinstance(algo, ActorCritic):
        critic_loaded = algo_loaded.critic

    # Check
    assert all(algo.policy.param_values == policy_loaded.param_values)
    if isinstance(algo, ActorCritic):
        assert all(algo.critic.vfcn.param_values == critic_loaded.vfcn.param_values)

    # Load the experiment. Since we did not save any hyper-parameters, we ignore the errors when loading.
    env, policy, extra = load_experiment(ex_dir)
    assert isinstance(env, Env)
    assert isinstance(policy, Policy)
    assert isinstance(extra, dict)