critic = GAE(vfcn, **critic_hparam)

# Subroutine
algo_hparam = dict(
    max_iter=200 if policy.name == FNNPolicy.name else 75,
    eps_clip=0.12648736789309026,
    min_steps=30 * env.max_steps,
    num_epoch=7,
    batch_size=500,
    std_init=0.7573286998997557,
    lr=6.999956625305722e-04,
    max_grad_norm=1.0,
    num_workers=8,
    lr_scheduler=lr_scheduler.ExponentialLR,
    lr_scheduler_hparam=dict(gamma=0.999),
)
algo = PPO(ex_dir, env, policy, critic, **algo_hparam)

# Save the hyper-parameters
save_dicts_to_yaml(
    dict(env=env_hparams, seed=args.seed),
    dict(policy=policy_hparam),
    dict(critic=critic_hparam, vfcn=vfcn_hparam),
    dict(algo=algo_hparam, algo_name=algo.name),
    save_dir=ex_dir,
)

# Jeeeha
algo.train(snapshot_mode="latest", seed=args.seed)
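# Side note on the schedule above (not part of the original script): ExponentialLR with
# gamma=0.999 is a mild decay. Assuming the scheduler is stepped once per PPO iteration
# (how often pyrado steps it is not shown in this excerpt), a quick check for the 200
# iterations configured here:
lr0, gamma, n_iter = 6.999956625305722e-04, 0.999, 200
print(lr0 * gamma**n_iter)  # ~5.73e-04, i.e. the learning rate ends at ~82% of its initial value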
critic_hparam = dict(
    # ... (the leading entries of this dict are cut off in this excerpt)
    standardizer=None,
    max_grad_norm=1.0,
    lr=5e-4,
)
critic = GAE(vfcn, **critic_hparam)

# Algorithm
algo_hparam = dict(
    max_iter=500,
    min_steps=20 * env.max_steps,
    num_epoch=10,
    eps_clip=0.15,
    batch_size=512,
    max_grad_norm=1.0,
    lr=3e-4,
    num_workers=12,
)
algo = PPO(ex_dir, env, policy, critic, **algo_hparam)

# Save the hyper-parameters
save_dicts_to_yaml(
    dict(env=env_hparams, seed=args.seed),
    dict(policy=policy_hparam),
    dict(critic=critic_hparam, vfcn=vfcn_hparam),
    dict(algo=algo_hparam, algo_name=algo.name),
    save_dir=ex_dir,
)

# Jeeeha
algo.train(seed=args.seed, snapshot_mode="best")
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env = QBallBalancerSim(dt=1 / 250.0, max_steps=1500)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical("exp_lr_scheduler_gamma", [None, 0.99, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy = FNNPolicy(
        spec=env.spec,
        hidden_sizes=trial.suggest_categorical("hidden_sizes_policy", [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical("hidden_nonlin_policy", ["to_tanh", "to_relu"])),
    )

    # Critic
    vfcn = FNN(
        input_size=env.obs_space.flat_dim,
        output_size=1,
        hidden_sizes=trial.suggest_categorical("hidden_sizes_critic", [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical("hidden_nonlin_critic", ["to_tanh", "to_relu"])),
    )
    critic_hparam = dict(
        batch_size=250,
        gamma=trial.suggest_uniform("gamma_critic", 0.99, 1.0),
        lamda=trial.suggest_uniform("lamda_critic", 0.95, 1.0),
        num_epoch=trial.suggest_int("num_epoch_critic", 1, 10),
        lr=trial.suggest_loguniform("lr_critic", 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical("standardize_adv_critic", [True, False]),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_critic", [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=300,
        batch_size=250,
        min_steps=trial.suggest_int("num_rollouts_algo", 10, 30) * env.max_steps,
        num_epoch=trial.suggest_int("num_epoch_algo", 1, 10),
        eps_clip=trial.suggest_uniform("eps_clip_algo", 0.05, 0.2),
        std_init=trial.suggest_uniform("std_init_algo", 0.5, 1.0),
        lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical("max_grad_norm_algo", [None, 1.0, 5.0]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    algo = PPO(osp.join(study_dir, f"trial_{trial.number}"), env, policy, critic, **algo_hparam)

    # Train without saving the results
    algo.train(snapshot_mode="latest", seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts)
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts

    return mean_ret
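# The docstring above notes that Optuna passes only `trial`, so the remaining arguments
# are bound with functools.partial. A minimal driver sketch along those lines; the study
# directory, seed, and trial budget below are illustrative assumptions, not values from
# this repo.
import functools

import optuna

if __name__ == "__main__":
    study_dir = "experiments/qbb_ppo_study"  # hypothetical path
    study = optuna.create_study(direction="maximize")
    study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=1001), n_trials=50)
    print(study.best_params)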
env = DomainRandWrapperLive(env, randomizer)

# Policy
policy = to.load(osp.join(ref_ex_dir, 'policy.pt'))
policy.init_param()  # re-initialize the parameters, i.e. keep only the reference architecture

# Critic
vfcn = to.load(osp.join(ref_ex_dir, 'valuefcn.pt'))
vfcn.init_param()  # re-initialize the parameters, i.e. keep only the reference architecture
critic = GAE(vfcn, **hparams['critic'])

# Algorithm
algo_hparam = hparams['subrtn']
algo_hparam.update({'num_workers': 1})  # should be equivalent to the number of cores per job
# algo_hparam.update({'max_iter': 300})
# algo_hparam.update({'max_iter': 600})
# algo_hparam.update({'min_steps': 3*algo_hparam['min_steps']})
algo = PPO(ex_dir, env, policy, critic, **algo_hparam)

# Save the hyper-parameters
save_list_of_dicts_to_yaml(
    [
        dict(env=env_hparams, seed=args.seed),
        dict(policy=hparams['policy']),
        dict(critic=hparams['critic']),
        dict(algo=algo_hparam, algo_name=algo.name),
    ],
    ex_dir,
)

# Jeeeha
algo.train(seed=args.seed, snapshot_mode='latest')
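# How `hparams` was obtained is not shown in this excerpt; it evidently mirrors the
# reference experiment's saved hyper-parameters (keys 'policy', 'critic', 'subrtn').
# A sketch using plain PyYAML, assuming the reference run stored them as a list of
# single-key dicts in a file named 'hyperparams.yaml' (both the file name and the
# layout are assumptions, not confirmed by this repo):
import yaml

with open(osp.join(ref_ex_dir, 'hyperparams.yaml')) as f:
    hparams = {k: v for d in yaml.safe_load(f) for k, v in d.items()}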
critic_hparam = dict(
    # ... (the leading entries of this dict are cut off in this excerpt)
    lr=7.058326426522811e-4,
    max_grad_norm=6.,
    lr_scheduler=lr_scheduler.ExponentialLR,
    lr_scheduler_hparam=dict(gamma=0.999),
)
critic = GAE(vfcn, **critic_hparam)

# Subroutine
algo_hparam = dict(
    max_iter=200,
    eps_clip=0.12648736789309026,
    min_steps=30 * env.max_steps,
    num_epoch=7,
    batch_size=500,
    std_init=0.7573286998997557,
    lr=6.999956625305722e-04,
    max_grad_norm=1.,
    num_workers=8,
    lr_scheduler=lr_scheduler.ExponentialLR,
    lr_scheduler_hparam=dict(gamma=0.999),
)
algo = PPO(ex_dir, env, policy, critic, **algo_hparam)

# Save the hyper-parameters
save_list_of_dicts_to_yaml(
    [
        dict(env=env_hparams, seed=args.seed),
        dict(policy=policy_hparam),
        dict(critic=critic_hparam, vfcn=vfcn_hparam),
        dict(algo=algo_hparam, algo_name=algo.name),
    ],
    ex_dir,
)

# Jeeeha
algo.train(snapshot_mode='latest', seed=args.seed)
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int):
    """
    Objective function for the Optuna `Study` to maximize.

    .. note::
        Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments.

    :param trial: Optuna Trial object for hyper-parameter optimization
    :param study_dir: the parent directory for all trials in this study
    :param seed: seed value for the random number generators, pass `None` for no seeding
    :return: objective function value
    """
    # Synchronize seeds between Optuna trials
    pyrado.set_seed(seed)

    # Environment
    env_hparams = dict(dt=1/100., max_steps=600)
    env = QQubeSwingUpSim(**env_hparams)
    env = ActNormWrapper(env)

    # Learning rate scheduler
    lrs_gamma = trial.suggest_categorical('exp_lr_scheduler_gamma', [None, 0.995, 0.999])
    if lrs_gamma is not None:
        lr_sched = lr_scheduler.ExponentialLR
        lr_sched_hparam = dict(gamma=lrs_gamma)
    else:
        lr_sched, lr_sched_hparam = None, dict()

    # Policy
    policy_hparam = dict(
        hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])),
    )  # FNN
    # policy_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_policy', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_policy', [1, 2]),
    # )  # LSTM & GRU
    policy = FNNPolicy(spec=env.spec, **policy_hparam)
    # policy = GRUPolicy(spec=env.spec, **policy_hparam)

    # Critic
    vfcn_hparam = dict(
        hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [(16, 16), (32, 32), (64, 64)]),
        hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])),
    )
    # vfcn_hparam = dict(
    #     hidden_size=trial.suggest_categorical('hidden_size_critic', [16, 32, 64]),
    #     num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_critic', [1, 2]),
    # )  # LSTM & GRU
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    # vfcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam)
    critic_hparam = dict(
        batch_size=500,
        gamma=trial.suggest_uniform('gamma_critic', 0.98, 1.),
        lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.),
        num_epoch=trial.suggest_int('num_epoch_critic', 1, 10),
        lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3),
        standardize_adv=trial.suggest_categorical('standardize_adv_critic', [False]),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_critic', [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    critic = GAE(vfcn, **critic_hparam)

    # Algorithm
    algo_hparam = dict(
        num_workers=1,  # parallelize via optuna n_jobs
        max_iter=250,
        batch_size=500,
        min_steps=trial.suggest_int('num_rollouts_algo', 10, 30)*env.max_steps,
        num_epoch=trial.suggest_int('num_epoch_algo', 1, 10),
        eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2),
        std_init=trial.suggest_uniform('std_init_algo', 0.5, 1.0),
        lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3),
        max_grad_norm=trial.suggest_categorical('max_grad_norm_algo', [None, 1., 5.]),
        lr_scheduler=lr_sched,
        lr_scheduler_hparam=lr_sched_hparam,
    )
    csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}'))
    algo = PPO(osp.join(study_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam, logger=csv_logger)

    # Train without saving the results
    algo.train(snapshot_mode='latest', seed=seed)

    # Evaluate
    min_rollouts = 1000
    sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts)  # parallelize via optuna n_jobs
    ros = sampler.sample()
    mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts

    return mean_ret
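# As with the first objective, the `num_workers=1` settings above defer parallelism to
# Optuna, so a driver would typically pass `n_jobs` to run several trials concurrently.
# A sketch with a persistent SQLite backend so the study can be stopped and resumed;
# the study name, paths, seed, and budgets are illustrative assumptions.
import functools
import os

import optuna

if __name__ == '__main__':
    study_dir = 'experiments/qq-su_ppo_study'  # hypothetical path
    os.makedirs(study_dir, exist_ok=True)
    study = optuna.create_study(
        study_name='qq-su_ppo',
        storage=f'sqlite:///{study_dir}/study.db',
        direction='maximize',
        load_if_exists=True,
    )
    study.optimize(functools.partial(train_and_eval, study_dir=study_dir, seed=1001), n_trials=100, n_jobs=8)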