def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int): """ Objective function for the Optuna `Study` to maximize. .. note:: Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments. :param trial: Optuna Trial object for hyper-parameter optimization :param study_dir: the parent directory for all trials in this study :param seed: seed value for the random number generators, pass `None` for no seeding :return: objective function value """ # Synchronize seeds between Optuna trials pyrado.set_seed(seed) # Environment env_hparams = dict(dt=1 / 250., max_steps=1500) env = QQubeSwingUpSim(**env_hparams) env = ActNormWrapper(env) # Policy policy_hparam = dict(feats=FeatureStack([ identity_feat, sign_feat, abs_feat, squared_feat, cubic_feat, ATan2Feat(1, 2), MultFeat([4, 5]) ])) policy = LinearPolicy(spec=env.spec, **policy_hparam) # Algorithm algo_hparam = dict( num_workers=1, # parallelize via optuna n_jobs max_iter=50, pop_size=trial.suggest_int('pop_size', 50, 200), num_rollouts=trial.suggest_int('num_rollouts', 4, 10), num_is_samples=trial.suggest_int('num_is_samples', 5, 40), expl_std_init=trial.suggest_uniform('expl_std_init', 0.1, 0.5), symm_sampling=trial.suggest_categorical('symm_sampling', [True, False]), ) csv_logger = create_csv_step_logger( osp.join(study_dir, f'trial_{trial.number}')) algo = PoWER(osp.join(study_dir, f'trial_{trial.number}'), env, policy, **algo_hparam, logger=csv_logger) # Train without saving the results algo.train(snapshot_mode='latest', seed=seed) # Evaluate min_rollouts = 1000 sampler = ParallelRolloutSampler( env, policy, num_workers=1, min_rollouts=min_rollouts) # parallelize via optuna n_jobs ros = sampler.sample() mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts return mean_ret
def train_and_eval(trial: optuna.Trial, ex_dir: str, seed: [int, None]): """ Objective function for the Optuna `Study` to maximize. .. note:: Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments. :param trial: Optuna Trial object for hyper-parameter optimization :param ex_dir: experiment's directory, i.e. the parent directory for all trials in this study :param seed: seed value for the random number generators, pass `None` for no seeding :return: objective function value """ # Synchronize seeds between Optuna trials pyrado.set_seed(seed) # Environment env_hparams = dict(dt=1 / 100., max_steps=600) env = QQubeSim(**env_hparams) env = ActNormWrapper(env) # Policy policy_hparam = dict( shared_hidden_sizes=trial.suggest_categorical( 'shared_hidden_sizes_policy', [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]), shared_hidden_nonlin=fcn_from_str( trial.suggest_categorical('shared_hidden_nonlin_policy', ['to_tanh', 'to_relu'])), ) policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam) # Critic q_fcn_hparam = dict( hidden_sizes=trial.suggest_categorical( 'hidden_sizes_critic', [[16, 16], [32, 32], [64, 64], [16, 16, 16], [32, 32, 32]]), hidden_nonlin=fcn_from_str( trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])), ) obsact_space = BoxSpace.cat([env.obs_space, env.act_space]) q_fcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **q_fcn_hparam) q_fcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **q_fcn_hparam) # Algorithm algo_hparam = dict( num_sampler_envs=1, # parallelize via optuna n_jobs max_iter=100 * env.max_steps, min_steps=trial.suggest_categorical( 'min_steps_algo', [1]), # , 10, env.max_steps, 10*env.max_steps memory_size=trial.suggest_loguniform('memory_size_algo', 1e2 * env.max_steps, 1e4 * env.max_steps), tau=trial.suggest_uniform('tau_algo', 0.99, 1.), alpha_init=trial.suggest_uniform('alpha_init_algo', 0.1, 0.9), learn_alpha=trial.suggest_categorical('learn_alpha_algo', [True, False]), standardize_rew=trial.suggest_categorical('standardize_rew_algo', [False]), gamma=trial.suggest_uniform('gamma_algo', 0.99, 1.), target_update_intvl=trial.suggest_categorical( 'target_update_intvl_algo', [1, 5]), num_batch_updates=trial.suggest_categorical('num_batch_updates_algo', [1, 5]), batch_size=trial.suggest_categorical('batch_size_algo', [128, 256, 512]), lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3), ) csv_logger = create_csv_step_logger( osp.join(ex_dir, f'trial_{trial.number}')) algo = SAC(ex_dir, env, policy, q_fcn_1, q_fcn_2, **algo_hparam, logger=csv_logger) # Train without saving the results algo.train(snapshot_mode='latest', seed=seed) # Evaluate min_rollouts = 1000 sampler = ParallelSampler( env, policy, num_envs=1, min_rollouts=min_rollouts) # parallelize via optuna n_jobs ros = sampler.sample() mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts return mean_ret
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int): """ Objective function for the Optuna `Study` to maximize. .. note:: Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments. :param trial: Optuna Trial object for hyper-parameter optimization :param study_dir: the parent directory for all trials in this study :param seed: seed value for the random number generators, pass `None` for no seeding :return: objective function value """ # Synchronize seeds between Optuna trials pyrado.set_seed(seed) # Load the data data_set_name = "oscillation_50Hz_initpos-0.5" data = pd.read_csv(osp.join(pyrado.PERMA_DIR, "misc", f"{data_set_name}.csv")) if data_set_name == "daily_min_temperatures": data = to.tensor(data["Temp"].values, dtype=to.get_default_dtype()).view(-1, 1) elif data_set_name == "monthly_sunspots": data = to.tensor(data["Sunspots"].values, dtype=to.get_default_dtype()).view(-1, 1) elif "oscillation" in data_set_name: data = to.tensor(data["Positions"].values, dtype=to.get_default_dtype()).view(-1, 1) else: raise pyrado.ValueErr( given=data_set_name, eq_constraint="'daily_min_temperatures', 'monthly_sunspots', " "'oscillation_50Hz_initpos-0.5', or 'oscillation_100Hz_initpos-0.4", ) # Dataset data_set_hparam = dict( name=data_set_name, ratio_train=0.7, window_size=trial.suggest_int("dataset_window_size", 1, 100), standardize_data=False, scale_min_max_data=True, ) dataset = TimeSeriesDataSet(data, **data_set_hparam) # Policy policy_hparam = dict( dt=0.02 if "oscillation" in data_set_name else 1.0, obs_layer=None, activation_nonlin=to.tanh, potentials_dyn_fcn=fcn_from_str( trial.suggest_categorical("policy_potentials_dyn_fcn", ["pd_linear", "pd_cubic"]) ), tau_init=trial.suggest_loguniform("policy_tau_init", 1e-2, 1e3), tau_learnable=True, kappa_init=trial.suggest_categorical("policy_kappa_init", [0, 1e-4, 1e-2]), kappa_learnable=True, capacity_learnable=True, potential_init_learnable=trial.suggest_categorical("policy_potential_init_learnable", [True, False]), init_param_kwargs=trial.suggest_categorical("policy_init_param_kwargs", [None]), use_cuda=False, ) policy = ADNPolicy(spec=EnvSpec(act_space=InfBoxSpace(shape=1), obs_space=InfBoxSpace(shape=1)), **policy_hparam) # Algorithm algo_hparam = dict( windowed=trial.suggest_categorical("algo_windowed", [True, False]), max_iter=1000, optim_class=optim.Adam, optim_hparam=dict( lr=trial.suggest_uniform("optim_lr", 5e-4, 5e-2), eps=trial.suggest_uniform("optim_eps", 1e-8, 1e-5), weight_decay=trial.suggest_uniform("optim_weight_decay", 5e-5, 5e-3), ), loss_fcn=nn.MSELoss(), ) csv_logger = create_csv_step_logger(osp.join(study_dir, f"trial_{trial.number}")) algo = TSPred(study_dir, dataset, policy, **algo_hparam, logger=csv_logger) # Train without saving the results algo.train(snapshot_mode="latest", seed=seed) # Evaluate num_init_samples = dataset.window_size _, loss_trn = TSPred.evaluate( policy, dataset.data_trn_inp, dataset.data_trn_targ, windowed=algo.windowed, num_init_samples=num_init_samples, cascaded=False, ) _, loss_tst = TSPred.evaluate( policy, dataset.data_tst_inp, dataset.data_tst_targ, windowed=algo.windowed, num_init_samples=num_init_samples, cascaded=False, ) return loss_trn
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int): """ Objective function for the Optuna `Study` to maximize. .. note:: Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments. :param trial: Optuna Trial object for hyper-parameter optimization :param study_dir: the parent directory for all trials in this study :param seed: seed value for the random number generators, pass `None` for no seeding :return: objective function value """ # Synchronize seeds between Optuna trials pyrado.set_seed(seed) # Environment env_hparams = dict(physicsEngine="Bullet", dt=1 / 100.0, max_steps=500) env = BallOnPlate2DSim(**env_hparams) env = ActNormWrapper(env) # Policy policy_hparam = dict( shared_hidden_sizes=trial.suggest_categorical( "shared_hidden_sizes_policy", [(16, 16), (32, 32), (64, 64), (16, 16, 16), (32, 32, 32)]), shared_hidden_nonlin=fcn_from_str( trial.suggest_categorical("shared_hidden_nonlin_policy", ["to_tanh", "to_relu"])), ) policy = TwoHeadedFNNPolicy(spec=env.spec, **policy_hparam) # Critic qfcn_hparam = dict( hidden_sizes=trial.suggest_categorical("hidden_sizes_critic", [(16, 16), (32, 32), (64, 64), (16, 16, 16), (32, 32, 32)]), hidden_nonlin=fcn_from_str( trial.suggest_categorical("hidden_nonlin_critic", ["to_tanh", "to_relu"])), ) obsact_space = BoxSpace.cat([env.obs_space, env.act_space]) qfcn_1 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam) qfcn_2 = FNNPolicy(spec=EnvSpec(obsact_space, ValueFunctionSpace), **qfcn_hparam) # Algorithm algo_hparam = dict( num_workers=1, # parallelize via optuna n_jobs max_iter=100 * env.max_steps, min_steps=trial.suggest_categorical( "min_steps_algo", [1]), # 10, env.max_steps, 10*env.max_steps memory_size=trial.suggest_loguniform("memory_size_algo", 1e2 * env.max_steps, 1e4 * env.max_steps), tau=trial.suggest_uniform("tau_algo", 0.99, 1.0), ent_coeff_init=trial.suggest_uniform("ent_coeff_init_algo", 0.1, 0.9), learn_ent_coeff=trial.suggest_categorical("learn_ent_coeff_algo", [True, False]), standardize_rew=trial.suggest_categorical("standardize_rew_algo", [False]), gamma=trial.suggest_uniform("gamma_algo", 0.99, 1.0), target_update_intvl=trial.suggest_categorical( "target_update_intvl_algo", [1, 5]), num_updates_per_step=trial.suggest_categorical( "num_batch_updates_algo", [1, 5]), batch_size=trial.suggest_categorical("batch_size_algo", [128, 256, 512]), lr=trial.suggest_loguniform("lr_algo", 1e-5, 1e-3), ) csv_logger = create_csv_step_logger( osp.join(study_dir, f"trial_{trial.number}")) algo = SAC(study_dir, env, policy, qfcn_1, qfcn_2, **algo_hparam, logger=csv_logger) # Train without saving the results algo.train(snapshot_mode="latest", seed=seed) # Evaluate min_rollouts = 1000 sampler = ParallelRolloutSampler( env, policy, num_workers=1, min_rollouts=min_rollouts) # parallelize via optuna n_jobs ros = sampler.sample() mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts return mean_ret
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int): """ Objective function for the Optuna `Study` to maximize. .. note:: Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments. :param trial: Optuna Trial object for hyper-parameter optimization :param study_dir: the parent directory for all trials in this study :param seed: seed value for the random number generators, pass `None` for no seeding :return: objective function value """ # Synchronize seeds between Optuna trials pyrado.set_seed(seed) # Load the data data_set_name = 'oscillation_50Hz_initpos-0.5' data = pd.read_csv(osp.join(pyrado.PERMA_DIR, 'time_series', f'{data_set_name}.csv')) if data_set_name == 'daily_min_temperatures': data = to.tensor(data['Temp'].values, dtype=to.get_default_dtype()).view(-1, 1) elif data_set_name == 'monthly_sunspots': data = to.tensor(data['Sunspots'].values, dtype=to.get_default_dtype()).view(-1, 1) elif 'oscillation' in data_set_name: data = to.tensor(data['Positions'].values, dtype=to.get_default_dtype()).view(-1, 1) else: raise pyrado.ValueErr( given=data_set_name, eq_constraint="'daily_min_temperatures', 'monthly_sunspots', " "'oscillation_50Hz_initpos-0.5', or 'oscillation_100Hz_initpos-0.4") # Dataset data_set_hparam = dict( name=data_set_name, ratio_train=0.7, window_size=trial.suggest_int('dataset_window_size', 1, 100), standardize_data=False, scale_min_max_data=True ) dataset = TimeSeriesDataSet(data, **data_set_hparam) # Policy policy_hparam = dict( dt=0.02 if 'oscillation' in data_set_name else 1., hidden_size=trial.suggest_int('policy_hidden_size', 2, 51), obs_layer=None, activation_nonlin=fcn_from_str( trial.suggest_categorical('policy_activation_nonlin', ['to_tanh', 'to_sigmoid'])), mirrored_conv_weights=trial.suggest_categorical('policy_mirrored_conv_weights', [True, False]), conv_out_channels=1, conv_kernel_size=None, conv_padding_mode=trial.suggest_categorical('policy_conv_padding_mode', ['zeros', 'circular']), tau_init=trial.suggest_loguniform('policy_tau_init', 1e-2, 1e3), tau_learnable=True, kappa_init=trial.suggest_categorical('policy_kappa_init', [0, 1e-4, 1e-2]), kappa_learnable=True, potential_init_learnable=trial.suggest_categorical('policy_potential_init_learnable', [True, False]), init_param_kwargs=trial.suggest_categorical('policy_init_param_kwargs', [None, dict(bell=True)]), use_cuda=False ) policy = NFPolicy(spec=EnvSpec(act_space=InfBoxSpace(shape=1), obs_space=InfBoxSpace(shape=1)), **policy_hparam) # Algorithm algo_hparam = dict( windowed=trial.suggest_categorical('algo_windowed', [True, False]), max_iter=1000, optim_class=optim.Adam, optim_hparam=dict( lr=trial.suggest_uniform('optim_lr', 5e-4, 5e-2), eps=trial.suggest_uniform('optim_eps', 1e-8, 1e-5), weight_decay=trial.suggest_uniform('optim_weight_decay', 5e-5, 5e-3) ), loss_fcn=nn.MSELoss(), ) csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}')) algo = TSPred(study_dir, dataset, policy, **algo_hparam, logger=csv_logger) # Train without saving the results algo.train(snapshot_mode='latest', seed=seed) # Evaluate num_init_samples = dataset.window_size _, loss_trn = TSPred.evaluate(policy, dataset.data_trn_inp, dataset.data_trn_targ, windowed=algo.windowed, num_init_samples=num_init_samples, cascaded=False) _, loss_tst = TSPred.evaluate(policy, dataset.data_tst_inp, dataset.data_tst_targ, windowed=algo.windowed, num_init_samples=num_init_samples, cascaded=False) return loss_trn
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int): """ Objective function for the Optuna `Study` to maximize. .. note:: Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments. :param trial: Optuna Trial object for hyper-parameter optimization :param study_dir: the parent directory for all trials in this study :param seed: seed value for the random number generators, pass `None` for no seeding :return: objective function value """ # Synchronize seeds between Optuna trials pyrado.set_seed(seed) # Environments env_hparams = dict(dt=1 / 100., max_steps=600) env_real = QQubeSwingUpSim(**env_hparams) env_real.domain_param = dict( Mr=0.095 * 0.9, # 0.095*0.9 = 0.0855 Mp=0.024 * 1.1, # 0.024*1.1 = 0.0264 Lr=0.085 * 0.9, # 0.085*0.9 = 0.0765 Lp=0.129 * 1.1, # 0.129*1.1 = 0.1419 ) env_sim = QQubeSwingUpSim(**env_hparams) randomizer = DomainRandomizer( NormalDomainParam(name='Mr', mean=0., std=1e6, clip_lo=1e-3), NormalDomainParam(name='Mp', mean=0., std=1e6, clip_lo=1e-3), NormalDomainParam(name='Lr', mean=0., std=1e6, clip_lo=1e-3), NormalDomainParam(name='Lp', mean=0., std=1e6, clip_lo=1e-3), ) env_sim = DomainRandWrapperLive(env_sim, randomizer) dp_map = { 0: ('Mr', 'mean'), 1: ('Mr', 'std'), 2: ('Mp', 'mean'), 3: ('Mp', 'std'), 4: ('Lr', 'mean'), 5: ('Lr', 'std'), 6: ('Lp', 'mean'), 7: ('Lp', 'std') } trafo_mask = [True] * 8 env_sim = MetaDomainRandWrapper(env_sim, dp_map) # Subroutine for policy improvement behav_policy_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh) behav_policy = FNNPolicy(spec=env_sim.spec, **behav_policy_hparam) vfcn_hparam = dict(hidden_sizes=[64, 64], hidden_nonlin=to.tanh) vfcn = FNNPolicy(spec=EnvSpec(env_sim.obs_space, ValueFunctionSpace), **vfcn_hparam) critic_hparam = dict( gamma=0.9885, lamda=0.9648, num_epoch=2, batch_size=500, standardize_adv=False, lr=5.792e-4, max_grad_norm=1., ) critic = GAE(vfcn, **critic_hparam) subrtn_policy_hparam = dict( max_iter=200, min_steps=3 * 23 * env_sim.max_steps, num_epoch=7, eps_clip=0.0744, batch_size=500, std_init=0.9074, lr=3.446e-04, max_grad_norm=1., num_workers=1, ) subrtn_policy = PPO(study_dir, env_sim, behav_policy, critic, **subrtn_policy_hparam) # Subroutine for system identification prior_std_denom = trial.suggest_uniform('prior_std_denom', 5, 20) prior = DomainRandomizer( NormalDomainParam(name='Mr', mean=0.095, std=0.095 / prior_std_denom), NormalDomainParam(name='Mp', mean=0.024, std=0.024 / prior_std_denom), NormalDomainParam(name='Lr', mean=0.085, std=0.085 / prior_std_denom), NormalDomainParam(name='Lp', mean=0.129, std=0.129 / prior_std_denom), ) ddp_policy = DomainDistrParamPolicy( mapping=dp_map, trafo_mask=trafo_mask, prior=prior, scale_params=trial.suggest_categorical('ddp_policy_scale_params', [True, False]), ) subsubrtn_distr_hparam = dict( max_iter=trial.suggest_categorical('subsubrtn_distr_max_iter', [20]), pop_size=trial.suggest_int('pop_size', 50, 500), num_rollouts=1, num_is_samples=trial.suggest_int('num_is_samples', 5, 20), expl_std_init=trial.suggest_loguniform('expl_std_init', 1e-3, 1e-1), expl_std_min=trial.suggest_categorical('expl_std_min', [1e-4]), extra_expl_std_init=trial.suggest_loguniform('expl_std_init', 1e-3, 1e-1), extra_expl_decay_iter=trial.suggest_int('extra_expl_decay_iter', 0, 10), num_workers=1, ) csv_logger = create_csv_step_logger( osp.join(study_dir, f'trial_{trial.number}')) subsubrtn_distr = CEM(study_dir, env_sim, ddp_policy, **subsubrtn_distr_hparam, logger=csv_logger) obs_vel_weight = trial.suggest_loguniform('obs_vel_weight', 1, 100) subrtn_distr_hparam = dict( metric=None, obs_dim_weight=[1, 1, 1, 1, obs_vel_weight, obs_vel_weight], num_rollouts_per_distr=trial.suggest_int('num_rollouts_per_distr', 20, 100), num_workers=1, ) subrtn_distr = SysIdViaEpisodicRL(subsubrtn_distr, behav_policy, **subrtn_distr_hparam) # Algorithm algo_hparam = dict( max_iter=trial.suggest_categorical('algo_max_iter', [10]), num_eval_rollouts=trial.suggest_categorical('algo_num_eval_rollouts', [5]), warmstart=trial.suggest_categorical('algo_warmstart', [True]), thold_succ_subrtn=trial.suggest_categorical('algo_thold_succ_subrtn', [50]), subrtn_snapshot_mode='latest', ) algo = SimOpt(study_dir, env_sim, env_real, subrtn_policy, subrtn_distr, **algo_hparam, logger=csv_logger) # Jeeeha algo.train(seed=args.seed) # Evaluate min_rollouts = 1000 sampler = ParallelRolloutSampler( env_real, algo.policy, num_workers=1, min_rollouts=min_rollouts) # parallelize via optuna n_jobs ros = sampler.sample() mean_ret = sum([r.undiscounted_return() for r in ros]) / min_rollouts return mean_ret
def train_and_eval(trial: optuna.Trial, study_dir: str, seed: int): """ Objective function for the Optuna `Study` to maximize. .. note:: Optuna expects only the `trial` argument, thus we use `functools.partial` to sneak in custom arguments. :param trial: Optuna Trial object for hyper-parameter optimization :param study_dir: the parent directory for all trials in this study :param seed: seed value for the random number generators, pass `None` for no seeding :return: objective function value """ # Synchronize seeds between Optuna trials pyrado.set_seed(seed) # Environment env_hparams = dict(dt=1/100., max_steps=600) env = QQubeSwingUpSim(**env_hparams) env = ActNormWrapper(env) # Learning rate scheduler lrs_gamma = trial.suggest_categorical('exp_lr_scheduler_gamma', [None, 0.995, 0.999]) if lrs_gamma is not None: lr_sched = lr_scheduler.ExponentialLR lr_sched_hparam = dict(gamma=lrs_gamma) else: lr_sched, lr_sched_hparam = None, dict() # Policy policy_hparam = dict( hidden_sizes=trial.suggest_categorical('hidden_sizes_policy', [(16, 16), (32, 32), (64, 64)]), hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_policy', ['to_tanh', 'to_relu'])), ) # FNN # policy_hparam = dict( # hidden_size=trial.suggest_categorical('hidden_size_policy', [16, 32, 64]), # num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_policy', [1, 2]), # ) # LSTM & GRU policy = FNNPolicy(spec=env.spec, **policy_hparam) # policy = GRUPolicy(spec=env.spec, **policy_hparam) # Critic vfcn_hparam = dict( hidden_sizes=trial.suggest_categorical('hidden_sizes_critic', [(16, 16), (32, 32), (64, 64)]), hidden_nonlin=fcn_from_str(trial.suggest_categorical('hidden_nonlin_critic', ['to_tanh', 'to_relu'])), ) # vfcn_hparam = dict( # hidden_size=trial.suggest_categorical('hidden_size_critic', [16, 32, 64]), # num_recurrent_layers=trial.suggest_categorical('num_recurrent_layers_critic', [1, 2]), # ) # LSTM & GRU vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam) # vfcn = GRUPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **vfcn_hparam) critic_hparam = dict( batch_size=500, gamma=trial.suggest_uniform('gamma_critic', 0.98, 1.), lamda=trial.suggest_uniform('lamda_critic', 0.95, 1.), num_epoch=trial.suggest_int('num_epoch_critic', 1, 10), lr=trial.suggest_loguniform('lr_critic', 1e-5, 1e-3), standardize_adv=trial.suggest_categorical('standardize_adv_critic', [False]), max_grad_norm=trial.suggest_categorical('max_grad_norm_critic', [None, 1., 5.]), lr_scheduler=lr_sched, lr_scheduler_hparam=lr_sched_hparam ) critic = GAE(vfcn, **critic_hparam) # Algorithm algo_hparam = dict( num_workers=1, # parallelize via optuna n_jobs max_iter=250, batch_size=500, min_steps=trial.suggest_int('num_rollouts_algo', 10, 30)*env.max_steps, num_epoch=trial.suggest_int('num_epoch_algo', 1, 10), eps_clip=trial.suggest_uniform('eps_clip_algo', 0.05, 0.2), std_init=trial.suggest_uniform('std_init_algo', 0.5, 1.0), lr=trial.suggest_loguniform('lr_algo', 1e-5, 1e-3), max_grad_norm=trial.suggest_categorical('max_grad_norm_algo', [None, 1., 5.]), lr_scheduler=lr_sched, lr_scheduler_hparam=lr_sched_hparam ) csv_logger = create_csv_step_logger(osp.join(study_dir, f'trial_{trial.number}')) algo = PPO(osp.join(study_dir, f'trial_{trial.number}'), env, policy, critic, **algo_hparam, logger=csv_logger) # Train without saving the results algo.train(snapshot_mode='latest', seed=seed) # Evaluate min_rollouts = 1000 sampler = ParallelRolloutSampler(env, policy, num_workers=1, min_rollouts=min_rollouts) # parallelize via optuna n_jobs ros = sampler.sample() mean_ret = sum([r.undiscounted_return() for r in ros])/min_rollouts return mean_ret