def train_cartpole_expert():
    env = make_vec_env('CartPole-v1', n_envs=8)
    model = PPO('MlpPolicy', env, verbose=1, n_steps=32, batch_size=256,
                gae_lambda=0.8, gamma=0.98, n_epochs=20, ent_coef=0.0,
                learning_rate=linear_schedule(0.001),
                clip_range=linear_schedule(0.2),
                policy_kwargs=dict(net_arch=[64, 64]))
    # learn() expects an int number of timesteps
    model.learn(total_timesteps=int(1e5))
    model.save("experts/CartPole-v1/cartpole_expert")
    gen_expert_demos('CartPole-v1', gym.make('CartPole-v1'), model, 25)
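None of these snippets define linear_schedule themselves. For the Stable-Baselines3 uses throughout (learning_rate, clip_range), the conventional rl-baselines3-zoo helper maps remaining training progress (1.0 at the start, 0.0 at the end) to a value; a minimal sketch, assuming that convention:

from typing import Callable

def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """Linear decay from initial_value to 0 over the course of training."""
    def schedule(progress_remaining: float) -> float:
        # progress_remaining goes from 1.0 (start) to 0.0 (end of training).
        return progress_remaining * initial_value
    return schedule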
def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for A2C hyperparams.

    :param trial:
    :return:
    """
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    normalize_advantage = trial.suggest_categorical("normalize_advantage", [False, True])
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    use_rms_prop = trial.suggest_categorical("use_rms_prop", [False, True])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    lr_schedule = trial.suggest_categorical("lr_schedule", ["linear", "constant"])
    learning_rate = trial.suggest_loguniform("lr", 1e-5, 1)
    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
    log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    ortho_init = trial.suggest_categorical("ortho_init", [False, True])
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
    # sde_net_arch = trial.suggest_categorical("sde_net_arch", [None, "tiny", "small"])
    # full_std = trial.suggest_categorical("full_std", [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
    }[net_arch]

    # sde_net_arch = {
    #     None: None,
    #     "tiny": [64],
    #     "small": [64, 64],
    # }[sde_net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]

    return {
        "n_steps": n_steps,
        "gamma": gamma,
        "gae_lambda": gae_lambda,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "normalize_advantage": normalize_advantage,
        "max_grad_norm": max_grad_norm,
        "use_rms_prop": use_rms_prop,
        "vf_coef": vf_coef,
        "policy_kwargs": dict(
            log_std_init=log_std_init,
            net_arch=net_arch,
            # full_std=full_std,
            activation_fn=activation_fn,
            # sde_net_arch=sde_net_arch,
            ortho_init=ortho_init,
        ),
    }
def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for PPO hyperparams.

    :param trial:
    :return:
    """
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512])
    n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("lr", 1e-5, 1)
    lr_schedule = "constant"
    # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
    clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4])
    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
    log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
    ortho_init = False
    # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    # TODO: account when using multiple envs
    if batch_size > n_steps:
        batch_size = n_steps

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
    }[net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]

    return {
        "n_steps": n_steps,
        "batch_size": batch_size,
        "gamma": gamma,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "clip_range": clip_range,
        "n_epochs": n_epochs,
        "gae_lambda": gae_lambda,
        "max_grad_norm": max_grad_norm,
        "vf_coef": vf_coef,
        "sde_sample_freq": sde_sample_freq,
        "policy_kwargs": dict(
            log_std_init=log_std_init,
            net_arch=net_arch,
            activation_fn=activation_fn,
            ortho_init=ortho_init,
        ),
    }
def sample_ppo_params(trial):
    """
    Sampler for PPO hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32, 64, 128, 256, 512])
    n_steps = trial.suggest_categorical('n_steps', [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    lr_schedule = 'constant'
    # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    ent_coef = trial.suggest_loguniform('ent_coef', 0.00000001, 0.1)
    clip_range = trial.suggest_categorical('clip_range', [0.1, 0.2, 0.3, 0.4])
    n_epochs = trial.suggest_categorical('n_epochs', [1, 5, 10, 20])
    gae_lambda = trial.suggest_categorical('gae_lambda', [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    max_grad_norm = trial.suggest_categorical('max_grad_norm', [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    vf_coef = trial.suggest_uniform('vf_coef', 0, 1)
    net_arch = trial.suggest_categorical('net_arch', ['small', 'medium'])
    log_std_init = trial.suggest_uniform('log_std_init', -4, 1)
    sde_sample_freq = trial.suggest_categorical('sde_sample_freq', [-1, 8, 16, 32, 64, 128, 256])
    ortho_init = False
    # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu'])

    # TODO: account when using multiple envs
    if batch_size > n_steps:
        batch_size = n_steps

    if lr_schedule == 'linear':
        learning_rate = linear_schedule(learning_rate)

    net_arch = {
        'small': [dict(pi=[64, 64], vf=[64, 64])],
        'medium': [dict(pi=[256, 256], vf=[256, 256])],
    }[net_arch]

    activation_fn = {
        'tanh': nn.Tanh,
        'relu': nn.ReLU,
        'elu': nn.ELU,
        'leaky_relu': nn.LeakyReLU,
    }[activation_fn]

    return {
        'n_steps': n_steps,
        'batch_size': batch_size,
        'gamma': gamma,
        'learning_rate': learning_rate,
        'ent_coef': ent_coef,
        'clip_range': clip_range,
        'n_epochs': n_epochs,
        'gae_lambda': gae_lambda,
        'max_grad_norm': max_grad_norm,
        'vf_coef': vf_coef,
        'sde_sample_freq': sde_sample_freq,
        'policy_kwargs': dict(log_std_init=log_std_init,
                              net_arch=net_arch,
                              activation_fn=activation_fn),
    }
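Samplers like the two above are meant to be plugged into an Optuna objective. A minimal, hypothetical wiring sketch (the env id, timestep budget, and trial count are illustrative; assumes an Optuna version that still provides suggest_loguniform and an SB3 version that accepts the list-of-dict net_arch format used above):

import gym
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

def objective(trial: optuna.Trial) -> float:
    # Sample hyperparams, train briefly, and report mean evaluation reward.
    params = sample_ppo_params(trial)
    model = PPO("MlpPolicy", gym.make("CartPole-v1"), verbose=0, **params)
    model.learn(total_timesteps=20_000)
    mean_reward, _ = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    return mean_reward

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)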
def a2c(env, hyper, policy="MlpPolicy", tensorboard_log=None, verbose=1, seed=0,
        use_sde=True, sde_sample_freq=-1, rms_prop_eps=1e-05, device="auto"):
    lr_schedule = hyper["params_lr_schedule"]
    learning_rate = hyper["params_lr"]
    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)
    policy_kwargs = make_policy_kwargs(hyper, "a2c")
    model = A2C(policy,
                env,
                tensorboard_log=tensorboard_log,
                verbose=verbose,
                seed=seed,
                use_sde=use_sde,
                sde_sample_freq=sde_sample_freq,
                rms_prop_eps=rms_prop_eps,
                learning_rate=learning_rate,
                n_steps=int(hyper["params_n_steps"]),  # np.int was removed from NumPy; use plain int
                gamma=hyper["params_gamma"],
                gae_lambda=hyper["params_gae_lambda"],
                ent_coef=hyper["params_ent_coef"],
                vf_coef=hyper["params_vf_coef"],
                max_grad_norm=hyper["params_max_grad_norm"],
                use_rms_prop=hyper["params_use_rms_prop"],
                normalize_advantage=hyper["params_normalize_advantage"],
                policy_kwargs=policy_kwargs,
                device=device)
    return model
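make_policy_kwargs is not shown. Given the hyper dict's params_* keys (which look like columns of an exported Optuna study), a plausible sketch mirroring the A2C sampler above; the key names and the small/medium mapping are assumptions:

from torch import nn

def make_policy_kwargs(hyper, algo):
    # Hypothetical reconstruction: map exported Optuna params back to policy_kwargs.
    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
    }[hyper["params_net_arch"]]
    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU}[hyper["params_activation_fn"]]
    return dict(
        log_std_init=hyper["params_log_std_init"],
        net_arch=net_arch,
        activation_fn=activation_fn,
        ortho_init=hyper["params_ortho_init"],
    )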
def sample_a2c_params(trial):
    """
    Sampler for A2C hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    normalize_advantage = trial.suggest_categorical('normalize_advantage', [False, True])
    max_grad_norm = trial.suggest_categorical('max_grad_norm', [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    use_rms_prop = trial.suggest_categorical('use_rms_prop', [False, True])
    gae_lambda = trial.suggest_categorical('gae_lambda', [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    n_steps = trial.suggest_categorical('n_steps', [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    ent_coef = trial.suggest_loguniform('ent_coef', 0.00000001, 0.1)
    vf_coef = trial.suggest_uniform('vf_coef', 0, 1)
    log_std_init = trial.suggest_uniform('log_std_init', -4, 1)
    ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    net_arch = trial.suggest_categorical('net_arch', ['small', 'medium'])
    sde_net_arch = trial.suggest_categorical('sde_net_arch', [None, 'tiny', 'small'])
    full_std = trial.suggest_categorical('full_std', [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu'])

    if lr_schedule == 'linear':
        learning_rate = linear_schedule(learning_rate)

    net_arch = {
        'small': [dict(pi=[64, 64], vf=[64, 64])],
        'medium': [dict(pi=[256, 256], vf=[256, 256])],
    }[net_arch]

    sde_net_arch = {
        None: None,
        'tiny': [64],
        'small': [64, 64],
    }[sde_net_arch]

    activation_fn = {
        'tanh': nn.Tanh,
        'relu': nn.ReLU,
        'elu': nn.ELU,
        'leaky_relu': nn.LeakyReLU,
    }[activation_fn]

    return {
        'n_steps': n_steps,
        'gamma': gamma,
        'gae_lambda': gae_lambda,
        'learning_rate': learning_rate,
        'ent_coef': ent_coef,
        'normalize_advantage': normalize_advantage,
        'max_grad_norm': max_grad_norm,
        'use_rms_prop': use_rms_prop,
        'vf_coef': vf_coef,
        'policy_kwargs': dict(log_std_init=log_std_init,
                              net_arch=net_arch,
                              full_std=full_std,
                              activation_fn=activation_fn,
                              sde_net_arch=sde_net_arch,
                              ortho_init=ortho_init),
    }
def __init__(self, env, args):
    # define hyper parameters
    self.env = env
    self.args = args
    self.criterion = nn.MSELoss()
    # define network
    self.net = net(self.env.action_space.n, self.args.use_dueling)
    self.target_net = copy.deepcopy(self.net)
    self.target_net.load_state_dict(self.net.state_dict())
    if self.args.cuda:
        self.net.cuda()
        self.target_net.cuda()
    # define the optimizer
    self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.args.lr)
    self.buffer = ReplayBuffer(self.args.batch_size)
    # define the linear schedule of the exploration
    # TODO
    self.exploration_schedule = linear_schedule(int(self.args.total_timesteps * self.args.exploration_fraction),
                                                self.args.final_ratio, self.args.init_ratio)
    # create the folder to save the model
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    # set environment folder
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
def __init__(self, env, args):
    # define some important variables
    self.env = env
    self.args = args
    # define the network
    self.net = net(self.env.action_space.n, self.args.use_dueling)
    # copy self.net as the target network
    self.target_net = copy.deepcopy(self.net)
    # make sure the target net has the same weights as the network
    self.target_net.load_state_dict(self.net.state_dict())
    if self.args.cuda:
        self.net.cuda()
        self.target_net.cuda()
    # define the optimizer
    self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.args.lr)
    # define the replay memory
    self.buffer = replay_buffer(self.args.buffer_size)
    # define the linear schedule of the exploration
    self.exploration_schedule = linear_schedule(int(self.args.total_timesteps * self.args.exploration_fraction),
                                                self.args.final_ratio, self.args.init_ratio)
    # create the folder to save the models
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    # set the environment folder
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
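The two DQN constructors above call linear_schedule with (schedule_timesteps, final_ratio, init_ratio), i.e. a stateful exploration schedule in the OpenAI-baselines style rather than the SB3 callable. A minimal sketch of such a class (the get_value method name is an assumption; these snippets never show how the schedule is queried):

class linear_schedule:
    """Linearly anneal an exploration ratio from init_ratio to final_ratio
    over schedule_timesteps, then hold it constant."""
    def __init__(self, schedule_timesteps, final_ratio, init_ratio=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_ratio = final_ratio
        self.init_ratio = init_ratio

    def get_value(self, timestep):
        # Fraction of the schedule elapsed, capped at 1.0.
        frac = min(float(timestep) / self.schedule_timesteps, 1.0)
        return self.init_ratio + frac * (self.final_ratio - self.init_ratio)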
def train_hopper_expert():
    # No env normalization.
    env = make_vec_env('HopperBulletEnv-v0', n_envs=1)
    model = SAC('MlpPolicy', env, verbose=1, buffer_size=300000, batch_size=256,
                gamma=0.98, tau=0.02, train_freq=64, gradient_steps=64,
                ent_coef='auto', learning_rate=linear_schedule(7.3e-4),
                learning_starts=10000,
                policy_kwargs=dict(net_arch=[256, 256], log_std_init=-3),
                use_sde=True)
    # learn() expects an int number of timesteps
    model.learn(total_timesteps=int(1e6))
    model.save("experts/HopperBulletEnv-v0/hopper_expert")
    gen_expert_demos('HopperBulletEnv-v0', gym.make('HopperBulletEnv-v0'), model, 25)
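Both expert-training snippets end by calling gen_expert_demos, which is not shown. A plausible sketch that rolls out the trained expert and saves transitions to disk; the .npz layout and field names are assumptions, chosen to match what make_sa_dataset below appears to consume, and it uses the pre-0.26 gym step API these snippets are written against:

import os
import numpy as np

def gen_expert_demos(env_name, env, model, n_episodes):
    # Roll out the trained expert and record (obs, act, next_obs, done) transitions.
    data = {"obs": [], "acts": [], "next_obs": [], "dones": []}
    for _ in range(n_episodes):
        obs, done = env.reset(), False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            next_obs, reward, done, info = env.step(action)
            data["obs"].append(obs)
            data["acts"].append(action)
            data["next_obs"].append(next_obs)
            data["dones"].append(done)
            obs = next_obs
    os.makedirs(os.path.join("experts", env_name), exist_ok=True)
    np.savez(os.path.join("experts", env_name, "demos"),
             **{k: np.array(v) for k, v in data.items()})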
def train_sqil(env, n=0):
    venv = gym.make(env)
    expert_data = make_sa_dataset(env, max_trajs=5)

    for i in range(n):
        if isinstance(venv.action_space, Discrete):
            model = DQN(SQLPolicy, venv, verbose=1,
                        policy_kwargs=dict(net_arch=[64, 64]), learning_starts=1)
        else:
            model = SAC('MlpPolicy', venv, verbose=1,
                        policy_kwargs=dict(net_arch=[256, 256]), ent_coef='auto',
                        learning_rate=linear_schedule(7.3e-4), train_freq=64,
                        gradient_steps=64, gamma=0.98, tau=0.02)
        model.replay_buffer = SQILReplayBuffer(model.buffer_size, model.observation_space,
                                               model.action_space, model.device, 1,
                                               model.optimize_memory_usage,
                                               expert_data=expert_data)
        mean_rewards = []
        std_rewards = []
        for train_steps in range(20):
            if train_steps > 0:
                if 'Bullet' in env:
                    model.learn(total_timesteps=25000, log_interval=1)
                else:
                    model.learn(total_timesteps=16384, log_interval=1)
            mean_reward, std_reward = evaluate_policy(model, model.env, n_eval_episodes=10)
            mean_rewards.append(mean_reward)
            std_rewards.append(std_reward)
            print("{0} Steps: {1}".format(train_steps, mean_reward))
            np.savez(os.path.join("learners", env, "sqil_rewards_{0}".format(i)),
                     means=mean_rewards, stds=std_rewards)
def train_adril(env, n=0, balanced=False):
    num_trajs = 20
    expert_data = make_sa_dataset(env, max_trajs=num_trajs)
    n_expert = len(expert_data["obs"])
    expert_sa = np.concatenate((expert_data["obs"],
                                np.reshape(expert_data["acts"], (n_expert, -1))), axis=1)

    for i in range(0, n):
        venv = AdRILWrapper(gym.make(env))
        mean_rewards = []
        std_rewards = []
        # Create model
        if isinstance(venv.action_space, Discrete):
            model = DQN(SQLPolicy, venv, verbose=1,
                        policy_kwargs=dict(net_arch=[64, 64]), learning_starts=1)
        else:
            model = SAC('MlpPolicy', venv, verbose=1,
                        policy_kwargs=dict(net_arch=[256, 256]), ent_coef='auto',
                        learning_rate=linear_schedule(7.3e-4), train_freq=64,
                        gradient_steps=64, gamma=0.98, tau=0.02)
        model.replay_buffer = AdRILReplayBuffer(model.buffer_size, model.observation_space,
                                                model.action_space, model.device, 1,
                                                model.optimize_memory_usage,
                                                expert_data=expert_data,
                                                N_expert=num_trajs, balanced=balanced)
        if not balanced:
            for j in range(len(expert_sa)):
                obs = expert_data["obs"][j]
                act = expert_data["acts"][j]
                next_obs = expert_data["next_obs"][j]
                done = expert_data["dones"][j]
                model.replay_buffer.add(obs, next_obs, act, -1, done)

        for train_steps in range(400):
            # Train policy
            if train_steps > 0:
                if 'Bullet' in env:
                    model.learn(total_timesteps=1250, log_interval=1000)
                else:
                    model.learn(total_timesteps=25000, log_interval=1000)
                if train_steps % 1 == 0:  # written to support more complex update schemes
                    model.replay_buffer.set_iter(train_steps)
                    model.replay_buffer.set_n_learner(venv.num_trajs)

            # Evaluate policy
            if train_steps % 20 == 0:
                model.set_env(gym.make(env))
                mean_reward, std_reward = evaluate_policy(model, model.env, n_eval_episodes=10)
                mean_rewards.append(mean_reward)
                std_rewards.append(std_reward)
                print("{0} Steps: {1}".format(int(train_steps * 1250), mean_reward))
                np.savez(os.path.join("learners", env, "adril_rewards_{0}".format(i)),
                         means=mean_rewards, stds=std_rewards)

            # Update env
            if train_steps > 0:
                if train_steps % 1 == 0:
                    venv.set_iter(train_steps + 1)
            model.set_env(venv)
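make_sa_dataset, used by both imitation snippets above, is likewise not shown. A sketch consistent with the hypothetical demo format sketched earlier; the file path and episode-slicing logic are assumptions:

import os
import numpy as np

def make_sa_dataset(env_name, max_trajs=None):
    # Load saved expert transitions; optionally keep only the first max_trajs episodes.
    data = np.load(os.path.join("experts", env_name, "demos.npz"))
    obs, acts = data["obs"], data["acts"]
    next_obs, dones = data["next_obs"], data["dones"]
    if max_trajs is not None:
        ends = np.flatnonzero(dones)
        cutoff = ends[max_trajs - 1] + 1 if len(ends) >= max_trajs else len(dones)
        obs, acts = obs[:cutoff], acts[:cutoff]
        next_obs, dones = next_obs[:cutoff], dones[:cutoff]
    return {"obs": obs, "acts": acts, "next_obs": next_obs, "dones": dones}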
def hyperparam_anneal(args, global_step):
    """Anneal every beta_*/tau_* coefficient with its own linear schedule.

    For each name below, args.train carries <name>_anneal_{start_step,
    end_step, start_value, end_value}; an end_step of 0 means "no annealing",
    i.e. the coefficient is pinned to its start value.
    """
    names = [
        # auxiliary KL terms
        "beta_aux_pres", "beta_aux_where", "beta_aux_what",
        "beta_aux_depth", "beta_aux_global", "beta_aux_bg",
        # main KL / temperature terms
        "beta_pres", "beta_where", "beta_what", "beta_depth",
        "beta_global", "tau_pres", "beta_bg",
    ]
    for name in names:
        start_step = getattr(args.train, name + "_anneal_start_step")
        end_step = getattr(args.train, name + "_anneal_end_step")
        start_value = getattr(args.train, name + "_anneal_start_value")
        end_value = getattr(args.train, name + "_anneal_end_value")
        if end_step == 0:
            setattr(args.train, name, start_value)
        else:
            setattr(args.train, name, linear_schedule(
                global_step, start_step, end_step, start_value, end_value))
    return
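hyperparam_anneal assumes a third linear_schedule signature: a pure function of the current step that interpolates between two values over [start_step, end_step] and clamps outside that interval. A minimal sketch consistent with that call site:

def linear_schedule(step, start_step, end_step, start_value, end_value):
    """Linearly interpolate from start_value to end_value between
    start_step and end_step, clamping outside the interval."""
    if step <= start_step:
        return start_value
    if step >= end_step:
        return end_value
    frac = (step - start_step) / float(end_step - start_step)
    return start_value + frac * (end_value - start_value)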
def sample_ppo_params(trial: optuna.Trial, n_envs) -> Dict[str, Any]:
    """
    Sampler for PPO hyperparams.

    :param trial:
    :param n_envs:
    :return:
    """
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512])
    # batch_size = 32
    n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    # n_steps = 256
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    # gamma = 0.98
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
    # learning_rate = 0.000107739192714429
    lr_schedule = "constant"
    # Uncomment to enable learning rate schedule
    # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    # ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
    ent_coef = 0.0
    clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4])
    # clip_range = 0.2
    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
    # n_epochs = 5
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    # gae_lambda = 0.8
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    # max_grad_norm = 0.5
    # vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
    vf_coef = 0.5
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "large", "huge",
                                                      "small3", "medium3", "large3", "huge3"])
    # net_arch = "large"
    use_sde = False
    # Uncomment for gSDE (continuous action)
    # sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
    # Orthogonal initialization
    ortho_init = True
    # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = 'tanh'
    # activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])
    distribution_type = 'FixedVarSquashedDiagGaussian'
    # distribution_type = trial.suggest_categorical('distribution_type',
    #                                               ['FixedVarSquashedDiagGaussian', 'FixedVarDiagGaussian',
    #                                                'SquashedDiagGaussian', 'Beta'])
    log_std_init = None
    if distribution_type in ['FixedVarSquashedDiagGaussian', 'FixedVarDiagGaussian', 'SquashedDiagGaussian']:
        # only need this if we use some form of gaussian distribution
        log_std_init = trial.suggest_uniform("log_std_init", -4, 0)
        # log_std_init = -3
    if distribution_type == 'Beta':
        beta_init = trial.suggest_uniform("beta_init", 0, 20)
    else:
        beta_init = 16

    if batch_size > n_steps * n_envs:
        batch_size = n_steps * n_envs

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    # Independent networks usually work best
    # when not working with images
    net_arch = {
        'small': [dict(pi=[64, 64], vf=[64, 64])],
        'medium': [dict(pi=[256, 256], vf=[256, 256])],
        'large': [dict(pi=[512, 512], vf=[512, 512])],
        'huge': [dict(pi=[1024, 1024], vf=[1024, 1024])],
        'small3': [dict(pi=[64, 64, 64], vf=[64, 64, 64])],
        'medium3': [dict(pi=[256, 256, 256], vf=[256, 256, 256])],
        'large3': [dict(pi=[512, 512, 512], vf=[512, 512, 512])],
        'huge3': [dict(pi=[1024, 1024, 1024], vf=[1024, 1024, 1024])],
    }[net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]

    # distribution_type = {'FixedVarSquashedDiagGaussianDistribution': FixedVarSquashedDiagGaussianDistribution,
    #                      'FixedVarDiagGaussianDistribution': FixedVarDiagGaussianDistribution,
    #                      'SquashedDiagGaussianDistribution': SquashedDiagGaussianDistribution,
    #                      'Beta': Beta}[distribution_type]

    return {
        'set_action_bias_from_env': True,
        'n_steps': n_steps,
        'batch_size': batch_size,
        'gamma': gamma,
        'learning_rate': learning_rate,
        'ent_coef': ent_coef,
        'clip_range': clip_range,
        'n_epochs': n_epochs,
        'gae_lambda': gae_lambda,
        'max_grad_norm': max_grad_norm,
        'vf_coef': vf_coef,
        'use_sde': use_sde,
        'beta_init': beta_init,
        # 'sde_sample_freq': sde_sample_freq,
        'policy_kwargs': dict(log_std_init=log_std_init,
                              net_arch=net_arch,
                              activation_fn=activation_fn,
                              ortho_init=ortho_init,
                              distribution_type=distribution_type),
    }
def sample_trpo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for TRPO hyperparams.

    :param trial:
    :return:
    """
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512])
    n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    lr_schedule = "constant"
    # Uncomment to enable learning rate schedule
    # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    # line_search_shrinking_factor = trial.suggest_categorical("line_search_shrinking_factor", [0.6, 0.7, 0.8, 0.9])
    n_critic_updates = trial.suggest_categorical("n_critic_updates", [5, 10, 20, 25, 30])
    cg_max_steps = trial.suggest_categorical("cg_max_steps", [5, 10, 20, 25, 30])
    # cg_damping = trial.suggest_categorical("cg_damping", [0.5, 0.2, 0.1, 0.05, 0.01])
    target_kl = trial.suggest_categorical("target_kl", [0.1, 0.05, 0.03, 0.02, 0.01, 0.005, 0.001])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
    # Uncomment for gSDE (continuous actions)
    # log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    # sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
    # Orthogonal initialization
    ortho_init = False
    # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    # TODO: account when using multiple envs
    if batch_size > n_steps:
        batch_size = n_steps

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    # Independent networks usually work best
    # when not working with images
    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
    }[net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]

    return {
        "n_steps": n_steps,
        "batch_size": batch_size,
        "gamma": gamma,
        # "cg_damping": cg_damping,
        "cg_max_steps": cg_max_steps,
        # "line_search_shrinking_factor": line_search_shrinking_factor,
        "n_critic_updates": n_critic_updates,
        "target_kl": target_kl,
        "learning_rate": learning_rate,
        "gae_lambda": gae_lambda,
        # "sde_sample_freq": sde_sample_freq,
        "policy_kwargs": dict(
            # log_std_init=log_std_init,
            net_arch=net_arch,
            activation_fn=activation_fn,
            ortho_init=ortho_init,
        ),
    }
def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for PPO hyperparams.

    :param trial:
    :return:
    """
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 128, 256, 512])
    n_steps = trial.suggest_categorical("n_steps", [32, 64, 128, 256, 512])
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    lr_schedule = "constant"
    # Uncomment to enable learning rate schedule
    # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
    clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4])
    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
    # net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
    # Uncomment for gSDE (continuous actions)
    # log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    # sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
    # Orthogonal initialization
    # ortho_init = False
    # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    # activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    # TODO: account when using multiple envs
    if batch_size > n_steps:
        batch_size = n_steps

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    # Independent networks usually work best
    # when not working with images
    # net_arch = {
    #     "small": [dict(pi=[64, 64], vf=[64, 64])],
    #     "medium": [dict(pi=[256, 256], vf=[256, 256])],
    # }[net_arch]
    # activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]

    return {
        "n_steps": n_steps,
        "batch_size": batch_size,
        "gamma": gamma,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "clip_range": clip_range,
        "n_epochs": n_epochs,
        "gae_lambda": gae_lambda,
        "max_grad_norm": max_grad_norm,
        "vf_coef": vf_coef,
        # "sde_sample_freq": sde_sample_freq,
        "policy": "CnnPolicy",
    }
pprint(saved_hyperparams)

n_envs = hyperparams.get('n_envs', 1)

if args.verbose > 0:
    print("Using {} environments".format(n_envs))

# Create learning rate schedules for ppo2, sac and td3
if algo_ in ["ppo2", "sac", "td3"]:
    for key in ['learning_rate', 'cliprange', 'cliprange_vf']:
        if key not in hyperparams:
            continue
        if isinstance(hyperparams[key], str):
            # Entries like 'lin_0.001' become a linear schedule from 0.001
            schedule, initial_value = hyperparams[key].split('_')
            initial_value = float(initial_value)
            hyperparams[key] = linear_schedule(initial_value)
        elif isinstance(hyperparams[key], (float, int)):
            # Negative value: ignore (ex: for clipping)
            if hyperparams[key] < 0:
                continue
            hyperparams[key] = constfn(float(hyperparams[key]))
        else:
            raise ValueError('Invalid value for {}: {}'.format(key, hyperparams[key]))

# Should we overwrite the number of timesteps?
if args.n_timesteps > 0:
    if args.verbose:
        print("Overwriting n_timesteps with n={}".format(args.n_timesteps))
    n_timesteps = args.n_timesteps
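The constfn helper referenced above is not shown; in the zoo it simply wraps a constant in the same callable interface as linear_schedule. A sketch:

def constfn(val):
    """Wrap a constant value so it can be called like a schedule."""
    def schedule(_):
        return val
    return schedule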
if __name__ == '__main__':
    # DummyVecEnv expects a list of env-constructor callables.
    env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    model = PPORepresentation(
        'MlpPolicy',
        env,
        verbose=1,
        tensorboard_log="runs/Representation/CartPole-v1",
        n_steps=32,
        batch_size=256,
        gae_lambda=0.8,
        gamma=0.98,
        n_epochs=20,
        ent_coef=0,
        learning_rate=linear_schedule(0.001),
        clip_range=linear_schedule(0.2),
    )
    model.learn(total_timesteps=1000000)

    # Tuned zoo hyperparams, for reference:
    # CartPole-v1:
    #   n_envs: 8
    #   n_timesteps: !!float 1e5
    #   policy: 'MlpPolicy'
    #   n_steps: 32
    #   batch_size: 256
    #   gae_lambda: 0.8
    #   gamma: 0.98
    #   n_epochs: 20
    #   ent_coef: 0.0
    #   learning_rate: lin_0.001