Example #1
File: train.py  Project: gkswamy98/pillbox
def train_cartpole_expert():
    env = make_vec_env('CartPole-v1', n_envs=8)
    model = PPO('MlpPolicy', env, verbose=1,
                n_steps=32, batch_size=256, gae_lambda=0.8, gamma=0.98,
                n_epochs=20, ent_coef=0.0, learning_rate=linear_schedule(0.001),
                clip_range=linear_schedule(0.2), policy_kwargs=dict(net_arch=[64, 64]))
    model.learn(total_timesteps=int(1e5))
    model.save("experts/CartPole-v1/cartpole_expert")
    gen_expert_demos('CartPole-v1', gym.make('CartPole-v1'), model, 25)
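
Every example on this page passes linear_schedule(...) somewhere, but the helper itself is never shown. In Stable-Baselines3 / rl-baselines3-zoo style code it is commonly a function of the remaining training progress; the sketch below is a minimal version written for illustration, assuming the SB3 convention that progress_remaining decays from 1 at the start of training to 0 at the end.

from typing import Callable

def linear_schedule(initial_value: float) -> Callable[[float], float]:
    # Return a schedule that decays linearly from initial_value to 0.
    # SB3 calls the returned function with progress_remaining, which goes
    # from 1 (start of training) to 0 (end of training).
    def func(progress_remaining: float) -> float:
        return progress_remaining * initial_value
    return func
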
def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for A2C hyperparams.

    :param trial:
    :return:
    """
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    normalize_advantage = trial.suggest_categorical("normalize_advantage", [False, True])
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    use_rms_prop = trial.suggest_categorical("use_rms_prop", [False, True])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    lr_schedule = trial.suggest_categorical("lr_schedule", ["linear", "constant"])
    learning_rate = trial.suggest_loguniform("lr", 1e-5, 1)
    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
    log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    ortho_init = trial.suggest_categorical("ortho_init", [False, True])
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
    # sde_net_arch = trial.suggest_categorical("sde_net_arch", [None, "tiny", "small"])
    # full_std = trial.suggest_categorical("full_std", [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
    }[net_arch]

    # sde_net_arch = {
    #     None: None,
    #     "tiny": [64],
    #     "small": [64, 64],
    # }[sde_net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]

    return {
        "n_steps": n_steps,
        "gamma": gamma,
        "gae_lambda": gae_lambda,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "normalize_advantage": normalize_advantage,
        "max_grad_norm": max_grad_norm,
        "use_rms_prop": use_rms_prop,
        "vf_coef": vf_coef,
        "policy_kwargs": dict(
            log_std_init=log_std_init,
            net_arch=net_arch,
            # full_std=full_std,
            activation_fn=activation_fn,
            # sde_net_arch=sde_net_arch,
            ortho_init=ortho_init,
        ),
    }
def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for PPO2 hyperparams.

    :param trial:
    :return:
    """
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512])
    n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("lr", 1e-5, 1)
    lr_schedule = "constant"
    # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
    clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4])
    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
    log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
    ortho_init = False
    # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    # TODO: account when using multiple envs
    if batch_size > n_steps:
        batch_size = n_steps

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
    }[net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]

    return {
        "n_steps": n_steps,
        "batch_size": batch_size,
        "gamma": gamma,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "clip_range": clip_range,
        "n_epochs": n_epochs,
        "gae_lambda": gae_lambda,
        "max_grad_norm": max_grad_norm,
        "vf_coef": vf_coef,
        "sde_sample_freq": sde_sample_freq,
        "policy_kwargs": dict(
            log_std_init=log_std_init,
            net_arch=net_arch,
            activation_fn=activation_fn,
            ortho_init=ortho_init,
        ),
    }
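
For context, a sampler like sample_ppo_params is normally called from inside an Optuna objective. The snippet below is a hypothetical usage sketch, not part of the original project: the environment, training budget, and evaluation are illustrative, and it assumes stable_baselines3's PPO, make_vec_env, and evaluate_policy, plus an SB3 version that still accepts the list-of-dict net_arch format used above.

import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

def objective(trial: optuna.Trial) -> float:
    # Sample hyperparameters, train briefly, and score the trial by mean reward.
    kwargs = sample_ppo_params(trial)
    model = PPO("MlpPolicy", make_vec_env("CartPole-v1", n_envs=8), **kwargs)
    model.learn(total_timesteps=int(5e4))
    mean_reward, _ = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    return mean_reward

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
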
Example #4
def sample_ppo_params(trial):
    """
    Sampler for PPO2 hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32, 64, 128, 256, 512])
    n_steps = trial.suggest_categorical('n_steps', [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    lr_schedule = 'constant'
    # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    ent_coef = trial.suggest_loguniform('ent_coef', 0.00000001, 0.1)
    clip_range = trial.suggest_categorical('clip_range', [0.1, 0.2, 0.3, 0.4])
    n_epochs = trial.suggest_categorical('n_epochs', [1, 5, 10, 20])
    gae_lambda = trial.suggest_categorical('gae_lambda', [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    max_grad_norm = trial.suggest_categorical('max_grad_norm', [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    vf_coef = trial.suggest_uniform('vf_coef', 0, 1)
    net_arch = trial.suggest_categorical('net_arch', ['small', 'medium'])
    log_std_init = trial.suggest_uniform('log_std_init', -4, 1)
    sde_sample_freq = trial.suggest_categorical('sde_sample_freq', [-1, 8, 16, 32, 64, 128, 256])
    ortho_init = False
    # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu'])

    # TODO: account when using multiple envs
    if batch_size > n_steps:
        batch_size = n_steps

    if lr_schedule == 'linear':
        learning_rate = linear_schedule(learning_rate)

    net_arch = {
        'small': [dict(pi=[64, 64], vf=[64, 64])],
        'medium': [dict(pi=[256, 256], vf=[256, 256])],
    }[net_arch]

    activation_fn = {
        'tanh': nn.Tanh,
        'relu': nn.ReLU,
        'elu': nn.ELU,
        'leaky_relu': nn.LeakyReLU
    }[activation_fn]

    return {
        'n_steps': n_steps,
        'batch_size': batch_size,
        'gamma': gamma,
        'learning_rate': learning_rate,
        'ent_coef': ent_coef,
        'clip_range': clip_range,
        'n_epochs': n_epochs,
        'gae_lambda': gae_lambda,
        'max_grad_norm': max_grad_norm,
        'vf_coef': vf_coef,
        'sde_sample_freq': sde_sample_freq,
        'policy_kwargs': dict(log_std_init=log_std_init, net_arch=net_arch, activation_fn=activation_fn)
    }
def a2c(env, hyper, policy = "MlpPolicy", tensorboard_log = None, verbose = 1,
        seed = 0, use_sde = True, sde_sample_freq = -1, rms_prop_eps = 1e-05,
        device = "auto"):
   
  lr_schedule = hyper["params_lr_schedule"]
  learning_rate = hyper["params_lr"]
  if lr_schedule == "linear":
    learning_rate = linear_schedule(learning_rate)

  policy_kwargs = make_policy_kwargs(hyper, "a2c")
  model = A2C(policy, 
              env, 
              tensorboard_log=tensorboard_log, 
              verbose = verbose, 
              seed = seed,
              use_sde = use_sde,
              sde_sample_freq = sde_sample_freq,
              rms_prop_eps = rms_prop_eps,
              learning_rate = learning_rate,
              n_steps = int(hyper["params_n_steps"]),
              gamma = hyper["params_gamma"],
              gae_lambda = hyper["params_gae_lambda"],
              ent_coef = hyper["params_ent_coef"],
              vf_coef = hyper["params_vf_coef"],
              max_grad_norm = hyper["params_max_grad_norm"],
              use_rms_prop = hyper["params_use_rms_prop"],
              normalize_advantage = hyper["params_normalize_advantage"],
              policy_kwargs = policy_kwargs,
              device = device
          )
  return model
Example #6
def sample_a2c_params(trial):
    """
    Sampler for A2C hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    normalize_advantage = trial.suggest_categorical('normalize_advantage', [False, True])
    max_grad_norm = trial.suggest_categorical('max_grad_norm', [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    use_rms_prop = trial.suggest_categorical('use_rms_prop', [False, True])
    gae_lambda = trial.suggest_categorical('gae_lambda', [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    n_steps = trial.suggest_categorical('n_steps', [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    ent_coef = trial.suggest_loguniform('ent_coef', 0.00000001, 0.1)
    vf_coef = trial.suggest_uniform('vf_coef', 0, 1)
    log_std_init = trial.suggest_uniform('log_std_init', -4, 1)
    ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    net_arch = trial.suggest_categorical('net_arch', ['small', 'medium'])
    sde_net_arch = trial.suggest_categorical('sde_net_arch', [None, 'tiny', 'small'])
    full_std = trial.suggest_categorical('full_std', [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu'])

    if lr_schedule == 'linear':
        learning_rate = linear_schedule(learning_rate)

    net_arch = {
        'small': [dict(pi=[64, 64], vf=[64, 64])],
        'medium': [dict(pi=[256, 256], vf=[256, 256])],
    }[net_arch]

    sde_net_arch = {
        None: None,
        'tiny': [64],
        'small': [64, 64],
    }[sde_net_arch]

    activation_fn = {
        'tanh': nn.Tanh,
        'relu': nn.ReLU,
        'elu': nn.ELU,
        'leaky_relu': nn.LeakyReLU
    }[activation_fn]

    return {
        'n_steps': n_steps,
        'gamma': gamma,
        'gae_lambda': gae_lambda,
        'learning_rate': learning_rate,
        'ent_coef': ent_coef,
        'normalize_advantage': normalize_advantage,
        'max_grad_norm': max_grad_norm,
        'use_rms_prop': use_rms_prop,
        'vf_coef': vf_coef,
        'policy_kwargs': dict(log_std_init=log_std_init, net_arch=net_arch, full_std=full_std,
                              activation_fn=activation_fn, sde_net_arch=sde_net_arch,
                              ortho_init=ortho_init)
    }
Example #7
 def __init__(self, env, args):
     # define hyper parameters
     self.env = env
     self.args = args
     self.criterion = nn.MSELoss()
     # define network
     self.net = net(self.env.action_space.n, self.args.use_dueling)
     self.target_net = copy.deepcopy(self.net)
     self.target_net.load_state_dict(self.net.state_dict())
     if self.args.cuda:
         self.net.cuda()
         self.target_net.cuda()
     # define the optimizer
     self.optimizer = torch.optim.Adam(self.net.parameters(),
                                       lr=self.args.lr)
     self.buffer = ReplayBuffer(self.args.batch_size)
     # define the linear schedule of the exploration
     # TODO
     self.exploration_schedule = linear_schedule(int(self.args.total_timesteps * self.args.exploration_fraction), \
                                                 self.args.final_ratio, self.args.init_ratio)
     # create the folder to save the model
     if not os.path.exists(self.args.save_dir):
         os.mkdir(self.args.save_dir)
     # set environment folder
     self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
     if not os.path.exists(self.model_path):
         os.mkdir(self.model_path)
 def __init__(self, env, args):
     # define some important parameters
     self.env = env
     self.args = args
     # define the network
     self.net = net(self.env.action_space.n, self.args.use_dueling)
     # copy self.net as the target network
     self.target_net = copy.deepcopy(self.net)
     # make sure the target net has the same weights as the network
     self.target_net.load_state_dict(self.net.state_dict())
     if self.args.cuda:
         self.net.cuda()
         self.target_net.cuda()
     # define the optimizer
     self.optimizer = torch.optim.Adam(self.net.parameters(),
                                       lr=self.args.lr)
     # define the replay memory
     self.buffer = replay_buffer(self.args.buffer_size)
     # define the linear schedule of the exploration
     self.exploration_schedule = linear_schedule(int(self.args.total_timesteps * self.args.exploration_fraction), \
                                                 self.args.final_ratio, self.args.init_ratio)
     # create the folder to save the models
     if not os.path.exists(self.args.save_dir):
         os.mkdir(self.args.save_dir)
     # set the environment folder
     self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
     if not os.path.exists(self.model_path):
         os.mkdir(self.model_path)
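
Note that the linear_schedule these DQN agents construct is a different object from the SB3 learning-rate helper: it takes a step budget and initial/final exploration ratios and is queried per timestep for the epsilon value. The class below is a plausible minimal sketch; the name is kept from the call sites, while the get_value method and exact interpolation are assumptions based on typical implementations of this pattern.

class linear_schedule:
    # Linearly interpolate the exploration ratio from init_ratio to final_ratio
    # over schedule_timesteps environment steps (assumed semantics).
    def __init__(self, schedule_timesteps, final_ratio, init_ratio=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_ratio = final_ratio
        self.init_ratio = init_ratio

    def get_value(self, t):
        # Fraction of the schedule completed so far, capped at 1.
        frac = min(float(t) / self.schedule_timesteps, 1.0)
        return self.init_ratio + frac * (self.final_ratio - self.init_ratio)
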
Example #9
File: train.py  Project: gkswamy98/pillbox
def train_hopper_expert():
    # No env normalization.
    env = make_vec_env('HopperBulletEnv-v0', n_envs=1)
    model = SAC('MlpPolicy', env, verbose=1,
                buffer_size=300000, batch_size=256, gamma=0.98, tau=0.02,
                train_freq=64, gradient_steps=64, ent_coef='auto', learning_rate=linear_schedule(7.3e-4), 
                learning_starts=10000, policy_kwargs=dict(net_arch=[256, 256], log_std_init=-3),
                use_sde=True)
    model.learn(total_timesteps=int(1e6))
    model.save("experts/HopperBulletEnv-v0/hopper_expert")
    gen_expert_demos('HopperBulletEnv-v0', gym.make('HopperBulletEnv-v0'), model, 25)
Example #10
File: train.py  Project: gkswamy98/pillbox
def train_sqil(env, n=0):
    venv = gym.make(env)
    expert_data = make_sa_dataset(env, max_trajs=5)

    for i in range(n):
        if isinstance(venv.action_space, Discrete):
            model = DQN(SQLPolicy,
                        venv,
                        verbose=1,
                        policy_kwargs=dict(net_arch=[64, 64]),
                        learning_starts=1)
        else:
            model = SAC('MlpPolicy',
                        venv,
                        verbose=1,
                        policy_kwargs=dict(net_arch=[256, 256]),
                        ent_coef='auto',
                        learning_rate=linear_schedule(7.3e-4),
                        train_freq=64,
                        gradient_steps=64,
                        gamma=0.98,
                        tau=0.02)

        model.replay_buffer = SQILReplayBuffer(model.buffer_size,
                                               model.observation_space,
                                               model.action_space,
                                               model.device,
                                               1,
                                               model.optimize_memory_usage,
                                               expert_data=expert_data)
        mean_rewards = []
        std_rewards = []
        for train_steps in range(20):
            if train_steps > 0:
                if 'Bullet' in env:
                    model.learn(total_timesteps=25000, log_interval=1)
                else:
                    model.learn(total_timesteps=16384, log_interval=1)
            mean_reward, std_reward = evaluate_policy(model,
                                                      model.env,
                                                      n_eval_episodes=10)
            mean_rewards.append(mean_reward)
            std_rewards.append(std_reward)
            print("{0} Steps: {1}".format(train_steps, mean_reward))
            np.savez(os.path.join("learners", env,
                                  "sqil_rewards_{0}".format(i)),
                     means=mean_rewards,
                     stds=std_rewards)
Example #11
File: train.py  Project: gkswamy98/pillbox
def train_adril(env, n=0, balanced=False):
    num_trajs = 20
    expert_data = make_sa_dataset(env, max_trajs=num_trajs)
    n_expert = len(expert_data["obs"])
    expert_sa = np.concatenate(
        (expert_data["obs"], np.reshape(expert_data["acts"], (n_expert, -1))),
        axis=1)

    for i in range(0, n):
        venv = AdRILWrapper(gym.make(env))
        mean_rewards = []
        std_rewards = []
        # Create model
        if isinstance(venv.action_space, Discrete):
            model = DQN(SQLPolicy,
                        venv,
                        verbose=1,
                        policy_kwargs=dict(net_arch=[64, 64]),
                        learning_starts=1)
        else:
            model = SAC('MlpPolicy',
                        venv,
                        verbose=1,
                        policy_kwargs=dict(net_arch=[256, 256]),
                        ent_coef='auto',
                        learning_rate=linear_schedule(7.3e-4),
                        train_freq=64,
                        gradient_steps=64,
                        gamma=0.98,
                        tau=0.02)
        model.replay_buffer = AdRILReplayBuffer(model.buffer_size,
                                                model.observation_space,
                                                model.action_space,
                                                model.device,
                                                1,
                                                model.optimize_memory_usage,
                                                expert_data=expert_data,
                                                N_expert=num_trajs,
                                                balanced=balanced)
        if not balanced:
            for j in range(len(expert_sa)):
                obs = expert_data["obs"][j]
                act = expert_data["acts"][j]
                next_obs = expert_data["next_obs"][j]
                done = expert_data["dones"][j]
                model.replay_buffer.add(obs, next_obs, act, -1, done)
        for train_steps in range(400):
            # Train policy
            if train_steps > 0:
                if 'Bullet' in env:
                    model.learn(total_timesteps=1250, log_interval=1000)
                else:
                    model.learn(total_timesteps=25000, log_interval=1000)
                if train_steps % 1 == 0:  # written to support more complex update schemes
                    model.replay_buffer.set_iter(train_steps)
                    model.replay_buffer.set_n_learner(venv.num_trajs)

            # Evaluate policy
            if train_steps % 20 == 0:
                model.set_env(gym.make(env))
                mean_reward, std_reward = evaluate_policy(model,
                                                          model.env,
                                                          n_eval_episodes=10)
                mean_rewards.append(mean_reward)
                std_rewards.append(std_reward)
                print("{0} Steps: {1}".format(int(train_steps * 1250),
                                              mean_reward))
                np.savez(os.path.join("learners", env,
                                      "adril_rewards_{0}".format(i)),
                         means=mean_rewards,
                         stds=std_rewards)
            # Update env
            if train_steps > 0:
                if train_steps % 1 == 0:
                    venv.set_iter(train_steps + 1)
            model.set_env(venv)
Example #12
def hyperparam_anneal(args, global_step):
    if args.train.beta_aux_pres_anneal_end_step == 0:
        args.train.beta_aux_pres = args.train.beta_aux_pres_anneal_start_value
    else:
        args.train.beta_aux_pres = linear_schedule(
            global_step, args.train.beta_aux_pres_anneal_start_step,
            args.train.beta_aux_pres_anneal_end_step,
            args.train.beta_aux_pres_anneal_start_value,
            args.train.beta_aux_pres_anneal_end_value)

    if args.train.beta_aux_where_anneal_end_step == 0:
        args.train.beta_aux_where = args.train.beta_aux_where_anneal_start_value
    else:
        args.train.beta_aux_where = linear_schedule(
            global_step, args.train.beta_aux_where_anneal_start_step,
            args.train.beta_aux_where_anneal_end_step,
            args.train.beta_aux_where_anneal_start_value,
            args.train.beta_aux_where_anneal_end_value)

    if args.train.beta_aux_what_anneal_end_step == 0:
        args.train.beta_aux_what = args.train.beta_aux_what_anneal_start_value
    else:
        args.train.beta_aux_what = linear_schedule(
            global_step, args.train.beta_aux_what_anneal_start_step,
            args.train.beta_aux_what_anneal_end_step,
            args.train.beta_aux_what_anneal_start_value,
            args.train.beta_aux_what_anneal_end_value)

    if args.train.beta_aux_depth_anneal_end_step == 0:
        args.train.beta_aux_depth = args.train.beta_aux_depth_anneal_start_value
    else:
        args.train.beta_aux_depth = linear_schedule(
            global_step, args.train.beta_aux_depth_anneal_start_step,
            args.train.beta_aux_depth_anneal_end_step,
            args.train.beta_aux_depth_anneal_start_value,
            args.train.beta_aux_depth_anneal_end_value)

    if args.train.beta_aux_global_anneal_end_step == 0:
        args.train.beta_aux_global = args.train.beta_aux_global_anneal_start_value
    else:
        args.train.beta_aux_global = linear_schedule(
            global_step, args.train.beta_aux_global_anneal_start_step,
            args.train.beta_aux_global_anneal_end_step,
            args.train.beta_aux_global_anneal_start_value,
            args.train.beta_aux_global_anneal_end_value)

    if args.train.beta_aux_bg_anneal_end_step == 0:
        args.train.beta_aux_bg = args.train.beta_aux_bg_anneal_start_value
    else:
        args.train.beta_aux_bg = linear_schedule(
            global_step, args.train.beta_aux_bg_anneal_start_step,
            args.train.beta_aux_bg_anneal_end_step,
            args.train.beta_aux_bg_anneal_start_value,
            args.train.beta_aux_bg_anneal_end_value)

    ########################### split here ###########################
    if args.train.beta_pres_anneal_end_step == 0:
        args.train.beta_pres = args.train.beta_pres_anneal_start_value
    else:
        args.train.beta_pres = linear_schedule(
            global_step, args.train.beta_pres_anneal_start_step,
            args.train.beta_pres_anneal_end_step,
            args.train.beta_pres_anneal_start_value,
            args.train.beta_pres_anneal_end_value)

    if args.train.beta_where_anneal_end_step == 0:
        args.train.beta_where = args.train.beta_where_anneal_start_value
    else:
        args.train.beta_where = linear_schedule(
            global_step, args.train.beta_where_anneal_start_step,
            args.train.beta_where_anneal_end_step,
            args.train.beta_where_anneal_start_value,
            args.train.beta_where_anneal_end_value)

    if args.train.beta_what_anneal_end_step == 0:
        args.train.beta_what = args.train.beta_what_anneal_start_value
    else:
        args.train.beta_what = linear_schedule(
            global_step, args.train.beta_what_anneal_start_step,
            args.train.beta_what_anneal_end_step,
            args.train.beta_what_anneal_start_value,
            args.train.beta_what_anneal_end_value)

    if args.train.beta_depth_anneal_end_step == 0:
        args.train.beta_depth = args.train.beta_depth_anneal_start_value
    else:
        args.train.beta_depth = linear_schedule(
            global_step, args.train.beta_depth_anneal_start_step,
            args.train.beta_depth_anneal_end_step,
            args.train.beta_depth_anneal_start_value,
            args.train.beta_depth_anneal_end_value)

    if args.train.beta_global_anneal_end_step == 0:
        args.train.beta_global = args.train.beta_global_anneal_start_value
    else:
        args.train.beta_global = linear_schedule(
            global_step, args.train.beta_global_anneal_start_step,
            args.train.beta_global_anneal_end_step,
            args.train.beta_global_anneal_start_value,
            args.train.beta_global_anneal_end_value)

    if args.train.tau_pres_anneal_end_step == 0:
        args.train.tau_pres = args.train.tau_pres_anneal_start_value
    else:
        args.train.tau_pres = linear_schedule(
            global_step, args.train.tau_pres_anneal_start_step,
            args.train.tau_pres_anneal_end_step,
            args.train.tau_pres_anneal_start_value,
            args.train.tau_pres_anneal_end_value)

    if args.train.beta_bg_anneal_end_step == 0:
        args.train.beta_bg = args.train.beta_bg_anneal_start_value
    else:
        args.train.beta_bg = linear_schedule(
            global_step, args.train.beta_bg_anneal_start_step,
            args.train.beta_bg_anneal_end_step,
            args.train.beta_bg_anneal_start_value,
            args.train.beta_bg_anneal_end_value)

    return
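
The linear_schedule referenced by hyperparam_anneal has yet another signature, taking the current global step together with a start/end step and a start/end value. Below is a minimal sketch of the interpolation the call sites imply; the clamping behavior outside the annealing window is an assumption.

def linear_schedule(step, start_step, end_step, start_value, end_value):
    # Anneal linearly from start_value to end_value between start_step and end_step,
    # clamping outside that interval (assumed semantics, matching the call sites above).
    if step <= start_step:
        return start_value
    if step >= end_step:
        return end_value
    frac = (step - start_step) / float(end_step - start_step)
    return start_value + frac * (end_value - start_value)
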
Example #13
def sample_ppo_params(trial: optuna.Trial, n_envs) -> Dict[str, Any]:
    """
    Sampler for PPO hyperparams.

    :param trial:
    :return:
    """
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512])
    #batch_size = 32
    n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    #n_steps = 256
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    #gamma = 0.98
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
    #learning_rate = 0.000107739192714429
    lr_schedule = "constant"
    # Uncomment to enable learning rate schedule
    # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    # ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
    ent_coef = 0.0
    clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4])
    #clip_range = 0.2
    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
    #n_epochs = 5
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    #gae_lambda = 0.8
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    #max_grad_norm = 0.5
    #vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
    vf_coef = 0.5
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "large", "huge", "small3", "medium3", "large3", "huge3"])
    #net_arch = "large"
    use_sde = False
    # Uncomment for gSDE (continuous action)
    # sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
    # Orthogonal initialization
    ortho_init = True
    # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = 'tanh'
    # activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])


    distribution_type = 'FixedVarSquashedDiagGaussian'
    # distribution_type = trial.suggest_categorical('distribution_type',
    #                                              ['FixedVarSquashedDiagGaussian', 'FixedVarDiagGaussian',
    #                                               'SquashedDiagGaussian', 'Beta'])
    log_std_init = None
    if distribution_type in ['FixedVarSquashedDiagGaussian', 'FixedVarDiagGaussian', 'SquashedDiagGaussian']:
        # only need this if we use some form of gaussian distribution
        log_std_init = trial.suggest_uniform("log_std_init", -4, 0)
        # log_std_init = -3

    if distribution_type == 'Beta':
        beta_init = trial.suggest_uniform("beta_init", 0, 20)
    else:
        beta_init = 16

    if batch_size > n_steps * n_envs:
        batch_size = n_steps * n_envs

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    # Independent networks usually work best
    # when not working with images
    net_arch = {
        'small': [dict(pi=[64, 64], vf=[64, 64])],
        'medium': [dict(pi=[256, 256], vf=[256, 256])],
        'large': [dict(pi=[512, 512], vf=[512, 512])],
        'huge': [dict(pi=[1024, 1024], vf=[1024, 1024])],
        'small3': [dict(pi=[64, 64, 64], vf=[64, 64, 64])],
        'medium3': [dict(pi=[256, 256, 256], vf=[256, 256, 256])],
        'large3': [dict(pi=[512, 512, 512], vf=[512, 512, 512])],
        'huge3': [dict(pi=[1024, 1024, 1024], vf=[1024, 1024, 1024])]
    }[net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]
    #distribution_type = {'FixedVarSquashedDiagGaussianDistribution': FixedVarSquashedDiagGaussianDistribution,
    #                     'FixedVarDiagGaussianDistribution': FixedVarDiagGaussianDistribution,
    #                     'SquashedDiagGaussianDistribution': SquashedDiagGaussianDistribution,
    #                     'Beta': Beta}[distribution_type]

    return {
        'set_action_bias_from_env': True,
        'n_steps': n_steps,
        'batch_size': batch_size,
        'gamma': gamma,
        'learning_rate': learning_rate,
        'ent_coef': ent_coef,
        'clip_range': clip_range,
        'n_epochs': n_epochs,
        'gae_lambda': gae_lambda,
        'max_grad_norm': max_grad_norm,
        'vf_coef': vf_coef,
        'use_sde': use_sde,
        'beta_init': beta_init,
        # 'sde_sample_freq': sde_sample_freq,
        'policy_kwargs': dict(log_std_init=log_std_init, net_arch=net_arch, activation_fn=activation_fn,
                              ortho_init=ortho_init, distribution_type=distribution_type)
    }
Example #14
def sample_trpo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for TRPO hyperparams.

    :param trial:
    :return:
    """
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512])
    n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    lr_schedule = "constant"
    # Uncomment to enable learning rate schedule
    # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    # line_search_shrinking_factor = trial.suggest_categorical("line_search_shrinking_factor", [0.6, 0.7, 0.8, 0.9])
    n_critic_updates = trial.suggest_categorical("n_critic_updates", [5, 10, 20, 25, 30])
    cg_max_steps = trial.suggest_categorical("cg_max_steps", [5, 10, 20, 25, 30])
    # cg_damping = trial.suggest_categorical("cg_damping", [0.5, 0.2, 0.1, 0.05, 0.01])
    target_kl = trial.suggest_categorical("target_kl", [0.1, 0.05, 0.03, 0.02, 0.01, 0.005, 0.001])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
    # Uncomment for gSDE (continuous actions)
    # log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    # Uncomment for gSDE (continuous action)
    # sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
    # Orthogonal initialization
    ortho_init = False
    # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    # TODO: account when using multiple envs
    if batch_size > n_steps:
        batch_size = n_steps

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    # Independent networks usually work best
    # when not working with images
    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
    }[net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]

    return {
        "n_steps": n_steps,
        "batch_size": batch_size,
        "gamma": gamma,
        # "cg_damping": cg_damping,
        "cg_max_steps": cg_max_steps,
        # "line_search_shrinking_factor": line_search_shrinking_factor,
        "n_critic_updates": n_critic_updates,
        "target_kl": target_kl,
        "learning_rate": learning_rate,
        "gae_lambda": gae_lambda,
        # "sde_sample_freq": sde_sample_freq,
        "policy_kwargs": dict(
            # log_std_init=log_std_init,
            net_arch=net_arch,
            activation_fn=activation_fn,
            ortho_init=ortho_init,
        ),
    }
Example #15
def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for PPO hyperparams.

    :param trial:
    :return:
    """
    batch_size = trial.suggest_categorical("batch_size",
                                           [16, 32, 64, 128, 256, 512])
    n_steps = trial.suggest_categorical("n_steps", [32, 64, 128, 256, 512])
    gamma = trial.suggest_categorical(
        "gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    lr_schedule = "constant"
    # Uncomment to enable learning rate schedule
    # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
    clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4])
    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
    gae_lambda = trial.suggest_categorical(
        "gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    max_grad_norm = trial.suggest_categorical(
        "max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
    # net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
    # Uncomment for gSDE (continuous actions)
    # log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    # Uncomment for gSDE (continuous action)
    # sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
    # Orthogonal initialization
    # ortho_init = False
    # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    # activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    # TODO: account when using multiple envs
    if batch_size > n_steps:
        batch_size = n_steps

    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)

    # Independent networks usually work best
    # when not working with images
    """
    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
    }[net_arch]
    """

    # activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]

    return {
        "n_steps": n_steps,
        "batch_size": batch_size,
        "gamma": gamma,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "clip_range": clip_range,
        "n_epochs": n_epochs,
        "gae_lambda": gae_lambda,
        "max_grad_norm": max_grad_norm,
        "vf_coef": vf_coef,
        # "sde_sample_freq": sde_sample_freq,
        "policy": "CnnPolicy",
    }
Example #16
            pprint(saved_hyperparams)

        n_envs = hyperparams.get('n_envs', 1)

        if args.verbose > 0:
            print("Using {} environments".format(n_envs))

        # Create learning rate schedules for ppo2 and sac
        if algo_ in ["ppo2", "sac", "td3"]:
            for key in ['learning_rate', 'cliprange', 'cliprange_vf']:
                if key not in hyperparams:
                    continue
                if isinstance(hyperparams[key], str):
                    schedule, initial_value = hyperparams[key].split('_')
                    initial_value = float(initial_value)
                    hyperparams[key] = linear_schedule(initial_value)
                elif isinstance(hyperparams[key], (float, int)):
                    # Negative value: ignore (ex: for clipping)
                    if hyperparams[key] < 0:
                        continue
                    hyperparams[key] = constfn(float(hyperparams[key]))
                else:
                    raise ValueError('Invalid value for {}: {}'.format(
                        key, hyperparams[key]))

        # Should we overwrite the number of timesteps?
        if args.n_timesteps > 0:
            if args.verbose:
                print("Overwriting n_timesteps with n={}".format(
                    args.n_timesteps))
            n_timesteps = args.n_timesteps
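
For reference, a config value such as learning_rate: lin_0.001 (see the YAML comment in Example #17 below) goes through the string branch above, roughly as follows:

# hyperparams['learning_rate'] == 'lin_0.001'
schedule, initial_value = 'lin_0.001'.split('_')       # -> ('lin', '0.001')
hyperparams['learning_rate'] = linear_schedule(float(initial_value))
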
Example #17
if __name__ == '__main__':
    env = DummyVecEnv([lambda: gym.make("CartPole-v1")])

    model = PPORepresentation(
        'MlpPolicy',
        env,
        verbose=1,
        tensorboard_log="runs/Representation/CartPole-v1",
        n_steps=32,
        batch_size=256,
        gae_lambda=0.8,
        gamma=0.98,
        n_epochs=20,
        ent_coef=0,
        learning_rate=linear_schedule(0.001),
        clip_range=linear_schedule(0.2),
    )
    model.learn(total_timesteps=1000000)

# CartPole-v1:
#   n_envs: 8
#   n_timesteps: !!float 1e5
#   policy: 'MlpPolicy'
#   n_steps: 32
#   batch_size: 256
#   gae_lambda: 0.8
#   gamma: 0.98
#   n_epochs: 20
#   ent_coef: 0.0
#   learning_rate: lin_0.001