Example #1
def test_gail(expert_env):
    env_id, expert_path = expert_env
    env = gym.make(env_id)
    dataset = ExpertDataset(expert_path=expert_path,
                            traj_limitation=10,
                            sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy',
                 env,
                 adversary_entcoeff=0.0,
                 lam=0.92,
                 max_kl=0.001,
                 expert_dataset=dataset,
                 hidden_size_adversary=64,
                 verbose=0)

    model.learn(1000)
    model.save("GAIL-{}".format(env_id))
    model = model.load("GAIL-{}".format(env_id), env=env)
    model.learn(1000)

    obs = env.reset()

    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, _, done, _ = env.step(action)
        if done:
            obs = env.reset()
    del dataset, model
Example #2
def train(env, implemented_combos, model_logdir, arg_dict, pretrained_model=None):
    model_name = arg_dict["algo"] + '_' + str(arg_dict["steps"])
    conf_pth   = os.path.join(model_logdir, "train.json")
    model_path = os.path.join(model_logdir, "best_model.zip")
    arg_dict["model_path"] = model_path
    with open(conf_pth, "w") as f:
        json.dump(arg_dict, f, indent=4)

    model_args = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][1]
    model_kwargs = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][2]
    if pretrained_model:
        if not os.path.isabs(pretrained_model):
            pretrained_model = pkg_resources.resource_filename("myGym", pretrained_model)
        env = model_args[1]
        vec_env = DummyVecEnv([lambda: env])
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0].load(pretrained_model, vec_env)
    else:
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0](*model_args, **model_kwargs)

    if arg_dict["algo"] == "gail":
        # Multi processing: (using MPI)
        if arg_dict["train_framework"] == 'tensorflow':
            # Generate expert trajectories (train expert)
            generate_expert_traj(model, model_name, n_timesteps=3000, n_episodes=100)
            # Load the expert dataset
            dataset = ExpertDataset(expert_path=model_name+'.npz', traj_limitation=10, verbose=1)
            model = GAIL_T('MlpPolicy', env, dataset, verbose=1)
            # Note: in practice, you need to train for 1M steps to have a working policy

    start_time = time.time()
    callbacks_list = []
    if pretrained_model:
        # Save next to the pretrained model instead of the fresh log dir
        model_logdir = os.path.dirname(pretrained_model)
    auto_save_callback = SaveOnBestTrainingRewardCallback(check_freq=1024, logdir=model_logdir, env=env,
                                                          engine=arg_dict["engine"],
                                                          multiprocessing=arg_dict["multiprocessing"])
    callbacks_list.append(auto_save_callback)
    if arg_dict["eval_freq"]:
        eval_env = configure_env(arg_dict, model_logdir, for_train=False)
        eval_callback = CustomEvalCallback(eval_env, log_path=model_logdir,
                                           eval_freq=arg_dict["eval_freq"],
                                           n_eval_episodes=arg_dict["eval_episodes"],
                                           record=arg_dict["record"],
                                           camera_id=arg_dict["camera"])
        callbacks_list.append(eval_callback)
    #callbacks_list.append(PlottingCallback(model_logdir))
    with ProgressBarManager(total_timesteps=arg_dict["steps"]) as progress_callback:
        callbacks_list.append(progress_callback)
        model.learn(total_timesteps=arg_dict["steps"], callback=callbacks_list)
    model.save(os.path.join(model_logdir, model_name))
    print("Training time: {:.2f} s".format(time.time() - start_time))

    # The info_keywords passed to the Monitor class above are necessary for pybullet to save_results;
    # using info_keywords with mujoco raises an error
    if arg_dict["engine"] == "pybullet":
        save_results(arg_dict, model_name, env, model_logdir)
    return model
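The Monitor wrapper referred to in the comment above is not part of this snippet; a minimal sketch of how info_keywords is typically passed to stable-baselines' Monitor (the key name and filename are assumptions):

from stable_baselines.bench import Monitor

# Wrap the env so that the listed keys from the step() info dict are written to the
# monitor CSV, which save_results can later read (the key name here is hypothetical).
env = Monitor(env, filename=os.path.join(model_logdir, "train"), info_keywords=("success",))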
Example #3
def test_gail_callback(tmp_path):
    dataset = ExpertDataset(expert_path=EXPERT_PATH_PENDULUM, traj_limitation=10,
                            sequential_preprocessing=True, verbose=0)
    model = GAIL("MlpPolicy", "Pendulum-v0", dataset)
    checkpoint_callback = CheckpointCallback(save_freq=500, save_path=str(tmp_path / 'logs/gail/'), name_prefix='gail')
    model.learn(total_timesteps=1000, callback=checkpoint_callback)
    shutil.rmtree(str(tmp_path / 'logs/gail/'))
    del dataset, model
Example #4
def test_gail(tmp_path, expert_env):
    env_id, expert_path, load_from_memory = expert_env
    env = gym.make(env_id)

    traj_data = None
    if load_from_memory:
        traj_data = np.load(expert_path)
        expert_path = None
    dataset = ExpertDataset(traj_data=traj_data,
                            expert_path=expert_path,
                            traj_limitation=10,
                            sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy',
                 env,
                 adversary_entcoeff=0.0,
                 lam=0.92,
                 max_kl=0.001,
                 expert_dataset=dataset,
                 hidden_size_adversary=64,
                 verbose=0)

    model.learn(300)
    model.save(str(tmp_path / "GAIL-{}".format(env_id)))
    model = model.load(str(tmp_path / "GAIL-{}".format(env_id)), env=env)
    model.learn(300)

    evaluate_policy(model, env, n_eval_episodes=5)
    del dataset, model
Example #5
def bc_from_dataset_and_params(dataset, bc_params, model_save_dir, num_epochs, lr, adam_eps):
    # Setup env
    gym_env = init_gym_env(bc_params)

    # Train and save model
    create_dir_if_not_exists(BC_SAVE_DIR + model_save_dir)

    model = GAIL("MlpPolicy", gym_env, dataset, verbose=1)
    model.pretrain(dataset, n_epochs=num_epochs, learning_rate=lr, adam_epsilon=adam_eps, save_dir=BC_SAVE_DIR + model_save_dir)

    save_bc_model(model_save_dir, model, bc_params)
    return model
Example #6
def train():

    # Load Model

    env = gym.make('roundabout-v0')

    model = DQN(MlpPolicy, env, verbose=1)
    generate_expert_traj(model, 'expert_roundabout', n_timesteps=1000, n_episodes=10)

    #Data Augmentation
    expert_data = dict(np.load('expert_roundabout.npz'))
    print("my keys are: " + str(expert_data.keys()))
    obs = expert_data['obs']
    print("obs shape: " + str(obs.shape))
    expert_data['obs'] = obs.ravel()  # convert to 1D array
    print("my keys are: " + str(expert_data.keys()))
    np.savez('expert_roundabout.npz', **expert_data)  # unpack the dict so the original keys are preserved

    dataset = ExpertDataset(expert_path='expert_roundabout.npz', traj_limitation=10, verbose=1)
    model = GAIL('MlpPolicy', env, dataset, verbose=1)
    model.learn(total_timesteps=1000)
    model.save("gail_roundabout")

    env.close()
    del env
Example #7
def load_bc_model_from_path(model_name):
    # NOTE: The lowest loss and highest accuracy models
    # were also saved, can be found in the same dir with
    # special suffixes.
    bc_metadata = load_pickle(BC_SAVE_DIR + model_name + "/bc_metadata")
    bc_params = bc_metadata["bc_params"]
    model = GAIL.load(BC_SAVE_DIR + model_name + "/model")
    return model, bc_params
Example #8
def train(params):

    # create model
    env = FlattenObservation(gym.make(params.get("environment")))
    exp_name = params.get("model_name") + "_train_" + params.get("environment")
    log_dir = './logs/' + exp_name
    expert_name = 'expert_{0}'.format(exp_name)

    if params.get("model_name") == 'TRPO':
        print("Loading TRPO Model")
        model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("model_name") == 'PPO':
        print("Loading PPO Model")
        model = PPO1(MlpPolicy,
                     env,
                     verbose=1,
                     tensorboard_log=log_dir,
                     entcoeff=params.get("ent_coef"),
                     gamma=params.get("gamma"),
                     optim_batchsize=params.get("batch_size"),
                     clip_param=params.get("clip_range"),
                     lam=params.get("gae_lambda"))
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("expert_exists") is False:
        print("Training expert trajectories")
        # Train expert controller (if needed) and record expert trajectories.
        generate_expert_traj(model,
                             expert_name,
                             n_timesteps=params.get("expert_timesteps"),
                             n_episodes=params.get("n_episodes"))

    dataset = ExpertDataset(
        expert_path='{0}.npz'.format(expert_name),
        traj_limitation=-1,
        randomize=True,  # if the dataset should be shuffled
        verbose=1)

    model = GAIL('MlpPolicy', env, dataset, verbose=1,
                 tensorboard_log=log_dir)  # see the GAIL documentation for default hyperparameters

    if params.get("pre_train") is True:
        print("Pretraining Dataset with Behavioural Cloning")
        model.pretrain(dataset, n_epochs=10000)

    print("Executing GAIL Learning")
    model.learn(total_timesteps=params.get("train_steps"))
    model.save("BC" + exp_name)

    env.close()
    del env
Example #9
def stable_gail(
    venv,
    expert=None,
    expert_venv=None,
    state_only=False,
    total_timesteps=10000,
    gen_batch_size=200,
    disc_batch_size=100,
    policy_lr=1e-3,
    callback=None,
    **kwargs,
):
    dataset = get_expert_dataset(expert, expert_venv, total_timesteps)

    policy = GAIL("MlpPolicy", venv, dataset)
    policy.learn(total_timesteps=total_timesteps)

    results = {}
    results["policy"] = policy

    return results
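get_expert_dataset is not defined in this snippet; a minimal sketch that follows the generate_expert_traj / ExpertDataset pattern used in the other examples (the save path, episode count, and how total_timesteps is consumed are assumptions):

from stable_baselines.gail import ExpertDataset, generate_expert_traj

def get_expert_dataset(expert, expert_venv, total_timesteps):
    # Roll out (or first train) the expert in its env, save the trajectories to an
    # .npz file, then wrap that file in an ExpertDataset for GAIL.
    generate_expert_traj(expert, 'expert_traj', env=expert_venv,
                         n_timesteps=total_timesteps, n_episodes=10)
    return ExpertDataset(expert_path='expert_traj.npz', traj_limitation=-1, verbose=1)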
Example #10
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    env = gym.make(args.env)
    train_log_dir = os.path.join(
        args.train_log_dir,
        args.env + '_' + args.expert + '_' + args.policy_type)
    if args.expert == 'PPO':
        expert_model = PPO1(args.policy_type,
                            env,
                            verbose=1,
                            tensorboard_log=train_log_dir)
    else:
        raise NotImplementedError
    expert_model.learn(total_timesteps=args.expert_training_step)
    generate_expert_traj(expert_model,
                         os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=1000,
                         n_episodes=args.expert_episodes)

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir,
                                                     'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type,
                      env,
                      dataset,
                      verbose=1,
                      tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, env, num_steps=10000)
    gail_model.save(train_log_dir)
    env.close()
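The evaluate helper used above (and again in a later example) is not shown; a minimal sketch consistent with how it is called, returning the mean episode reward collected over num_steps environment steps:

import numpy as np

def evaluate(model, env, num_steps=10000):
    # Run the trained policy and average the rewards of the completed episodes.
    episode_rewards, ep_reward = [], 0.0
    obs = env.reset()
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, reward, done, _info = env.step(action)
        ep_reward += reward
        if done:
            episode_rewards.append(ep_reward)
            ep_reward = 0.0
            obs = env.reset()
    return np.mean(episode_rewards) if episode_rewards else ep_reward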
Example #11
def run_gail():
    parser = argparse.ArgumentParser()
    parser.add_argument('expert',
                        type=str,
                        default=None,
                        help='Expert path (*.npz)')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--note', type=str, default='test')
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--num-steps', type=int, default=1000000)
    parser.add_argument('--policy',
                        type=str,
                        default='CnnPolicy',
                        choices=[
                            'CnnPolicy', 'CnnLstmPolicy', 'CnnLnLstmPolicy',
                            'MlpPolicy', 'MlpLstmPolicy', 'MlpLnLstmPolicy'
                        ],
                        help='Policy architecture')
    args = parser.parse_args()

    logger.configure(os.path.join('logs', args.env, args.note))
    logger.info(args)

    if 'NoFrameskip' in args.env:
        env = VecFrameStack(make_atari_env(args.env, 1, args.seed), 4)
    else:
        import gym
        env = gym.make(args.env)

    dataset = ExpertDataset(expert_path=args.expert,
                            batch_size=128,
                            train_fraction=0.99,
                            verbose=1)
    model = GAIL(args.policy,
                 env,
                 dataset,
                 timesteps_per_batch=1280,
                 verbose=1)
    model.learn(len(dataset.train_loader) * 1280)
Example #12
def optimize_agent(trial):
    """ Train the model and optimise
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    model_params = optimize_GAIL(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = GAIL("MlpPolicy", env, verbose=0, **model_params)
    print("DOING LEARING a2c")
    original_env.force_progression = False
    model.learn(int(2e4 * 5), seed=seed)
    print("DONE LEARING a2c")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward)

    return last_reward
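optimize_GAIL is not shown in this example; a possible sketch of the hyperparameter sampler, assuming it tunes the GAIL arguments that appear in the other examples (the ranges are assumptions):

def optimize_GAIL(trial):
    # Sample GAIL hyperparameters for this Optuna trial (ranges are hypothetical).
    return {
        'lam': trial.suggest_uniform('lam', 0.9, 0.99),
        'max_kl': trial.suggest_loguniform('max_kl', 1e-4, 1e-2),
        'adversary_entcoeff': trial.suggest_loguniform('adversary_entcoeff', 1e-4, 1e-1),
    }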
Example #13
def load_model(path: str, algorithm: str):
    from stable_baselines import PPO2, DQN, A2C, ACER, GAIL, TRPO
    if algorithm == 'PPO2':
        return PPO2.load(path)
    if algorithm == 'DQN':
        return DQN.load(path)
    if algorithm == 'A2C':
        return A2C.load(path)
    if algorithm == 'ACER':
        return ACER.load(path)
    if algorithm == 'GAIL':
        return GAIL.load(path)
    if algorithm == 'TRPO':
        return TRPO.load(path)
    return None
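An equivalent, more compact form of the dispatcher above, using a lookup table instead of repeated if statements (same behaviour, just a design alternative):

def load_model(path: str, algorithm: str):
    from stable_baselines import PPO2, DQN, A2C, ACER, GAIL, TRPO
    algos = {'PPO2': PPO2, 'DQN': DQN, 'A2C': A2C,
             'ACER': ACER, 'GAIL': GAIL, 'TRPO': TRPO}
    algo_cls = algos.get(algorithm)
    return algo_cls.load(path) if algo_cls is not None else None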
Example #14
def train_GAIL(env_train, model_name, timesteps=1000):
    """GAIL Model"""
    # from stable_baselines.gail import ExpertDataset, generate_expert_traj
    start = time.time()
    # generate expert trajectories
    model = SAC('MlpPolicy', env_train, verbose=1)
    generate_expert_traj(model, 'expert_model_gail', n_timesteps=100, n_episodes=10)

    # Load dataset
    dataset = ExpertDataset(expert_path='expert_model_gail.npz', traj_limitation=10, verbose=1)
    model = GAIL('MlpPolicy', env_train, dataset, verbose=1)

    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (GAIL): ', (end - start) / 60, ' minutes')
    return model
Example #15
def train_gail_withppo2():
    env = gimbal(5, 500)
    env = DummyVecEnv([lambda: env])
    model = PPO2.load("./models/baseline_ppo2_t1")
    generate_expert_traj(model,
                         './models/baseline_expert_t1',
                         env,
                         n_timesteps=0,
                         n_episodes=100)
    dataset = ExpertDataset(expert_path='./models/baseline_expert_t1.npz',
                            traj_limitation=-1,
                            verbose=1)
    model = GAIL("MlpPolicy", env, dataset, verbose=1)
    model.learn(total_timesteps=500000)
    model.save("./models/baseline_gail_ppo2_t1")
Example #16
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    # train expert model for multiple times and save the best model
    best_reward = -np.inf
    train_env = make_vec_env(args.env, n_envs=args.n_env)
    eval_env = gym.make(args.env)

    for i in range(args.times_expert):
        train_env.reset()
        train_log_dir = os.path.join(args.train_log_dir,
                                     args.env + '_' + args.expert)
        if args.expert == 'PPO':
            expert_model = PPO2(args.policy_type, env=train_env, n_steps=args.n_steps,
                                nminibatches=args.nminibatches, noptepochs=args.noptepochs,
                                ent_coef=args.ent_coef, lam=args.lam, gamma=args.gamma,
                                cliprange=args.cliprange, learning_rate=args.learning_rate,
                                verbose=1, tensorboard_log=train_log_dir)
        else:
            raise NotImplementedError
        expert_model.learn(total_timesteps=args.expert_training_step)
        mean_reward = evaluate(expert_model, eval_env, num_steps=10000)
        if mean_reward > best_reward:
            best_reward = mean_reward
            expert_model.save(
                os.path.join(args.train_log_dir, args.env + '_expert'))
        del expert_model
    train_env.reset()
    expert_model = PPO2.load(os.path.join(args.train_log_dir,
                                          args.env + '_expert'),
                             env=train_env)
    generate_expert_traj(expert_model,
                         os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=-1,
                         n_episodes=args.expert_episodes)
    train_env.close()

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir,
                                                     'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type,
                      args.env,
                      dataset,
                      verbose=1,
                      tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)

    evaluate(gail_model, eval_env, num_steps=10000)
    gail_model.save(os.path.join(args.train_log_dir, args.env + '_GAIL'))
    eval_env.close()
Example #17
def eval_with_standard_baselines(n_games, model_name, display=False):
    """Method to evaluate agent performance with stable-baselines infrastructure,
    just to make sure everything is compatible and integrating correctly."""
    bc_metadata = load_pickle(BC_SAVE_DIR + model_name + "/bc_metadata")
    bc_params = bc_metadata["bc_params"]
    model = GAIL.load(BC_SAVE_DIR + model_name + "/model")

    gym_env = init_gym_env(bc_params)

    tot_rew = 0
    for i in tqdm.trange(n_games):
        obs, _ = gym_env.reset()
        done = False
        while not done:
            ob0, ob1 = obs
            a0 = stable_baselines_predict_fn(model, ob0)
            a1 = stable_baselines_predict_fn(model, ob1)
            joint_action = (a0, a1)
            (obs, _), rewards, done, info = gym_env.step(joint_action)
            tot_rew += rewards

    print("avg reward", tot_rew / n_games)
    return tot_rew / n_games
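stable_baselines_predict_fn is not defined in this snippet; a plausible sketch matching how it is called above (the deterministic flag is an assumption):

def stable_baselines_predict_fn(model, obs):
    # Query the stable-baselines policy for a single agent's observation.
    action, _states = model.predict(obs, deterministic=True)
    return action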
Example #18
def train_agent_with_gail(load):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines import GAIL

    env = gym.make("F16GCAS-v0")

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128])
    if not load:
        ExpData = ExpertDataset("./lqr_export.npz")
        model = GAIL(CustomPolicy, env, ExpData, verbose=1)
        model.learn(total_timesteps=1000000)
        model.save(ROOT+"/trained_models/TDRL/f16/gail/128_128")
    else:
        # with model.graph.as_default():
        #     for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi/'):
        #         print(i)
        model = GAIL.load(ROOT+"/trained_models/TDRL/f16/gail/128_128", env=env)
        with model.graph.as_default():
            print(tf.all_variables())

    return model
Example #19
# Generate expert trajectories (train expert)
env = PrticleEnv(alpha=1,
                 beta=10,
                 win_thre=1,
                 max_timestep=256,
                 for_circle_traj=True)

model = PPO1.load("model/part_circle_exp2_epoch05_sib.zip")
model.set_env(env)
generate_expert_traj(model,
                     'expert_part_circle_exp2_epoch05_sib',
                     n_episodes=10)

# Load the expert dataset
dataset = ExpertDataset(expert_path='expert_part_circle_exp2_epoch05_sib.npz',
                        traj_limitation=10,
                        verbose=1)

model = GAIL('MlpPolicy',
             DummyVecEnv([lambda: PrticleEnv(alpha=1, beta=10, win_thre=1,
                                             max_timestep=256, for_circle_traj=True)]),
             dataset, verbose=1, n_cpu_tf_sess=None)

# Note: in practice, you need to train for 1M steps to have a working policy
model.learn(total_timesteps=int(1e4))
model.save("_gail_sanity_test_exp1")

del model

# %%
Example #20
def build_model(algo, env_name, log_dir, expert_dataset=None):
    """
    Initialize model according to algorithm, architecture and hyperparameters
    :param algo: (str) Name of rl algorithm - 'sac', 'ppo2' etc.
    :param env_name:(str)
    :param log_dir:(str)
    :param expert_dataset:(ExpertDataset)
    :return:model: stable_baselines model
    """
    model = None
    if algo == 'sac':
        policy_kwargs = dict(layers=[64, 64, 64], layer_norm=False)

        model = SAC('MlpPolicy',
                    env_name,
                    gamma=0.99,
                    learning_rate=1e-4,
                    buffer_size=500000,
                    learning_starts=5000,
                    train_freq=500,
                    batch_size=64,
                    policy_kwargs=policy_kwargs,
                    tau=0.01,
                    ent_coef='auto_0.1',
                    target_update_interval=1,
                    gradient_steps=1,
                    target_entropy='auto',
                    action_noise=None,
                    random_exploration=0.0,
                    verbose=2,
                    tensorboard_log=log_dir,
                    _init_setup_model=True,
                    full_tensorboard_log=True,
                    seed=None,
                    n_cpu_tf_sess=None)
    elif algo == 'ppo1':
        model = PPO1('MlpPolicy',
                     env_name,
                     gamma=0.99,
                     timesteps_per_actorbatch=256,
                     clip_param=0.2,
                     entcoeff=0.01,
                     optim_epochs=4,
                     optim_stepsize=1e-3,
                     optim_batchsize=64,
                     lam=0.95,
                     adam_epsilon=1e-5,
                     schedule='linear',
                     verbose=0,
                     tensorboard_log=None,
                     _init_setup_model=True,
                     policy_kwargs=None,
                     full_tensorboard_log=False,
                     seed=None,
                     n_cpu_tf_sess=1)
    elif algo == 'trpo':
        model = TRPO('MlpPolicy',
                     env_name,
                     timesteps_per_batch=4096,
                     tensorboard_log=log_dir,
                     verbose=1)
    elif algo == 'gail':
        assert expert_dataset is not None
        model = GAIL('MlpPolicy',
                     env_name,
                     expert_dataset,
                     tensorboard_log=log_dir,
                     verbose=1)
    assert model is not None
    return model
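A minimal usage sketch for the 'gail' branch of build_model above (the expert file, environment id and log directory are assumptions):

from stable_baselines.gail import ExpertDataset

dataset = ExpertDataset(expert_path='expert_pendulum.npz', traj_limitation=-1, verbose=1)
model = build_model('gail', 'Pendulum-v0', './logs', expert_dataset=dataset)
model.learn(total_timesteps=100000)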
Example #21
                    type=int,
                    help='Number of games to test.')
parser.add_argument('-s', '--save', default=True, type=bool)

args = parser.parse_args()

sys.path.append('/Users/cusgadmin/Documents/UCB/Academics/SSastry/\
    Multi_agent_competition')
os.chdir(
    '/Users/cusgadmin/Documents/UCB/Academics/SSastry/Multi_agent_competition/'
)

print(colored('Testing learnt policy from model file {} for {} games!'.\
  format(args.model,args.num_test),'red'))
start_time = time.time()
model = GAIL.load(args.model)
env = gym.make('gym_pursuitevasion_small:pursuitevasion_small-v0')
g = 1
obs = env.reset(ep=g)
e_win_games = int(0)
env.render(mode='human', highlight=True, ep=g)
if args.save:
    metadata = dict(title='Game')
    writer = FFMpegWriter(fps=5, metadata=metadata)
    writer.setup(env.window.fig, "test_game.mp4", 300)
    writer.grab_frame()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, e_win = env.step(action)
    env.render(mode='human', highlight=True, ep=g)
    if args.save:
Example #22
import gym

from stable_baselines import GAIL, SAC
from stable_baselines.gail import ExpertDataset, generate_expert_traj

# Generate expert trajectories (train expert)
model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1)
generate_expert_traj(model, 'expert_pendulum', n_timesteps=100, n_episodes=10)

# Load the expert dataset
dataset = ExpertDataset(expert_path='expert_pendulum.npz', traj_limitation=10, verbose=1)

model = GAIL('MlpPolicy', 'Pendulum-v0', dataset, verbose=1)
# Note: in practice, you need to train for 1M steps to have a working policy
model.learn(total_timesteps=100000)
model.save("gail_pendulum")

del model # remove to demonstrate saving and loading

model = GAIL.load("gail_pendulum")

env = gym.make('Pendulum-v0')
obs = env.reset()
while True:
  action, _states = model.predict(obs)
  obs, rewards, dones, info = env.step(action)
  env.render()
Example #23
    #env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    env = gym.make('gym_docking:docking-v1')

    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env, n_envs=num_cpu, seed=0)

    checkpoint_callback = CheckpointCallback(
        save_freq=int(5e4),
        save_path='./logs/',
        name_prefix='rl_model_621_gail_10M')

    dataset = ExpertDataset(expert_path='./expert_PID/expert_PID_new.npz',
                            traj_limitation=-1,
                            batch_size=10)

    model = GAIL(policy='MlpPolicy',
                 env=env,
                 verbose=1,
                 tensorboard_log="./gail_docking_tensorboard/",
                 policy_kwargs=dict(
                     net_arch=[dict(pi=[128, 128], vf=[128, 128])],
                     act_fun=tf.nn.relu),
                 expert_dataset=dataset)

    # load trained model
    # model = PPO2.load("./ppo2_docking_621_random_pre.zip", env=env, tensorboard_log="./ppo2_docking_tensorboard/")

    model.learn(total_timesteps=int(10e6), callback=checkpoint_callback)
    model.save("gail_docking_621_10M")
Example #24
def train(env_id, algo, num_timesteps, seed, sgd_steps, t_pi, t_c, lam, log,
          expert_path, pretrain, pretrain_epochs, mdpo_update_steps,
          num_trajectories, expert_model, exploration_bonus, bonus_coef,
          random_action_len, is_action_features, dir_name, neural, lipschitz,
          args):
    """
    Train an expert (SAC) or an imitation-learning model (MDAL / GAIL variants) on the MuJoCo environment, for testing purposes
    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """

    with tf_util.single_threaded_session():
        # from mpi4py import MPI
        # rank = MPI.COMM_WORLD.Get_rank()
        rank = 0
        env_name = env_id[:-3].lower()
        log_dir = './experiments/' + env_name + '/' + str(algo).lower() + '/'\
                  + 'tpi' + str(t_pi) + '_tc' + str(t_c) + '_lam' + str(lam)
        log_dir += '_' + dir_name + '/'
        log_name = str(algo) + '_updateSteps' + str(mdpo_update_steps)
        # log_name += '_randLen' + str(random_action_len)
        if exploration_bonus:
            log_name += '_exploration' + str(bonus_coef)
        if pretrain:
            log_name += '_pretrain' + str(pretrain_epochs)
        if not is_action_features:
            log_name += "_states_only"
        log_name += '_s' + str(seed)

        log_path = log_dir + log_name
        expert_path = './experts/' + expert_path

        num_timesteps = int(num_timesteps)

        args = args.__dict__

        dir_path = os.getcwd() + log_dir[1:]
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
            with open(os.getcwd() + log_dir[1:] + 'args.txt', 'w') as file:
                file.write("Experiment Arguments:")
                for key, val in args.items():
                    print(key, ": ", val, file=file)

        if log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        # workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        # env = make_mujoco_env(env_id, workerseed)
        def make_env():
            # env_out = gym.make(env_id, reset_noise_scale=1.0)
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out,
                                    logger.get_dir(),
                                    allow_early_resets=True)
            env_out.seed(seed)
            env_out = wrap_mujoco(env_out, random_action_len=random_action_len)
            return env_out

        #

        env = DummyVecEnv([make_env])
        # env = VecNormalize(env)

        train = (algo == 'Train')
        eval = (algo == 'Evaluate')

        if train:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)

            if num_timesteps > 0:
                model = SAC('MlpPolicy',
                            env_id,
                            verbose=1,
                            buffer_size=1000000,
                            batch_size=256,
                            ent_coef='auto',
                            train_freq=1,
                            tau=0.01,
                            gradient_steps=1,
                            learning_starts=10000)
            else:
                model = SAC.load(expert_model, env)
            generate_expert_traj(model,
                                 expert_path,
                                 n_timesteps=num_timesteps,
                                 n_episodes=num_trajectories)
            if num_timesteps > 0:
                model.save('sac_' + env_name + '_' + str(num_timesteps))
        elif eval:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)
            model = SAC.load(expert_model, env)
            generate_expert_traj(model,
                                 expert_path,
                                 n_timesteps=num_timesteps,
                                 n_episodes=10,
                                 evaluate=True)
        else:
            expert_path = expert_path + '.npz'
            dataset = ExpertDataset(expert_path=expert_path,
                                    traj_limitation=10,
                                    verbose=1)

            if algo == 'MDAL':
                model = MDAL_MDPO_OFF('MlpPolicy',
                                      env,
                                      dataset,
                                      verbose=1,
                                      tensorboard_log="./experiments/" +
                                      env_name + "/mdal/",
                                      seed=seed,
                                      buffer_size=1000000,
                                      ent_coef=0.0,
                                      learning_starts=10000,
                                      batch_size=256,
                                      tau=0.01,
                                      gamma=0.99,
                                      gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps,
                                      lam=0.0,
                                      train_freq=1,
                                      d_step=10,
                                      tsallis_q=1,
                                      reparameterize=True,
                                      t_pi=t_pi,
                                      t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      neural=neural,
                                      lipschitz=lipschitz)
            elif algo == 'MDAL_ON_POLICY':
                model = MDAL_MDPO_ON('MlpPolicy',
                                     env,
                                     dataset,
                                     verbose=1,
                                     timesteps_per_batch=2048,
                                     tensorboard_log="./experiments/" +
                                     env_name + "/mdal_mdpo_on/",
                                     seed=seed,
                                     max_kl=0.01,
                                     cg_iters=10,
                                     cg_damping=0.1,
                                     entcoeff=0.0,
                                     adversary_entcoeff=0.001,
                                     gamma=0.99,
                                     lam=0.95,
                                     vf_iters=5,
                                     vf_stepsize=1e-3,
                                     sgd_steps=sgd_steps,
                                     klcoeff=1.0,
                                     method="multistep-SGD",
                                     tsallis_q=1.0,
                                     t_pi=t_pi,
                                     t_c=t_c,
                                     exploration_bonus=exploration_bonus,
                                     bonus_coef=bonus_coef,
                                     is_action_features=is_action_features,
                                     neural=neural)

            elif algo == 'MDAL_TRPO':
                model = MDAL_TRPO('MlpPolicy',
                                  env,
                                  dataset,
                                  verbose=1,
                                  tensorboard_log="./experiments/" + env_name +
                                  "/mdal_trpo/",
                                  seed=seed,
                                  gamma=0.99,
                                  g_step=3,
                                  d_step=5,
                                  sgd_steps=1,
                                  d_stepsize=9e-5,
                                  entcoeff=0.0,
                                  adversary_entcoeff=0.001,
                                  max_kl=t_pi,
                                  t_pi=t_pi,
                                  t_c=t_c,
                                  exploration_bonus=exploration_bonus,
                                  bonus_coef=bonus_coef,
                                  is_action_features=is_action_features,
                                  neural=neural,
                                  lam=0.98,
                                  timesteps_per_batch=2000,
                                  lipschitz=lipschitz)

            elif algo == 'GAIL':
                from mpi4py import MPI
                from stable_baselines import GAIL

                model = GAIL('MlpPolicy',
                             env,
                             dataset,
                             verbose=1,
                             tensorboard_log="./experiments/" + env_name +
                             "/gail/",
                             seed=seed,
                             entcoeff=0.0,
                             adversary_entcoeff=0.001,
                             lipschitz=lipschitz)

            elif algo == 'GAIL_MDPO_OFF':
                # from mpi4py import MPI
                from stable_baselines import GAIL_MDPO_OFF

                model = GAIL_MDPO_OFF('MlpPolicy',
                                      env,
                                      dataset,
                                      verbose=1,
                                      tensorboard_log="./experiments/" +
                                      env_name + "/gail_mdpo_off/",
                                      seed=seed,
                                      ent_coef=0.0,
                                      adversary_entcoeff=0.001,
                                      buffer_size=1000000,
                                      learning_starts=10000,
                                      batch_size=256,
                                      tau=0.01,
                                      gamma=0.99,
                                      gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps,
                                      lam=0.0,
                                      train_freq=1,
                                      tsallis_q=1,
                                      reparameterize=True,
                                      t_pi=t_pi,
                                      t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      lipschitz=lipschitz)
            else:
                raise ValueError("Not a valid algorithm.")

            if pretrain:
                model.pretrain(dataset, n_epochs=pretrain_epochs)

            model.learn(total_timesteps=num_timesteps, tb_log_name=log_name)

        env.close()
Example #25
def build_model(algo, policy, env_name, log_dir, expert_dataset=None):
    """
    Initialize model according to algorithm, architecture and hyperparameters
    :param algo: (str) Name of rl algorithm - 'sac', 'ppo2' etc.
    :param env_name:(str)
    :param log_dir:(str)
    :param expert_dataset:(ExpertDataset)
    :return:model: stable_baselines model
    """
    from stable_baselines.common.vec_env import DummyVecEnv
    model = None
    if algo == 'sac':
        # policy_kwargs = dict(layers=[64, 64, 64],layer_norm=False)

        # model = SAC(policy, env_name, gamma=0.99, learning_rate=1e-4, buffer_size=500000,
        #             learning_starts=5000, train_freq=500, batch_size=64, policy_kwargs=policy_kwargs,
        #             tau=0.01, ent_coef='auto_0.1', target_update_interval=1,
        #             gradient_steps=1, target_entropy='auto', action_noise=None,
        #             random_exploration=0.0, verbose=2, tensorboard_log=log_dir,
        #             _init_setup_model=True, full_tensorboard_log=True,
        #             seed=None, n_cpu_tf_sess=None)

        # SAC - start learning from scratch
        # policy_kwargs = dict(act_fun=tf.nn.relu, net_arch=[32, 32, 32])
        policy_kwargs = dict(layers=[32, 32, 32], layer_norm=False)

        env = DummyVecEnv([lambda: gym.make(env_name)])
        # model = A2C(CnnMlpPolicy, env, verbose=1,gamma=0.99, learning_rate=1e-4,  tensorboard_log=log_dir, _init_setup_model=True, full_tensorboard_log=True,seed=None, n_cpu_tf_sess=None)

        model = SAC(CustomSacCnnMlpPolicy,
                    env=env,
                    gamma=0.99,
                    learning_rate=1e-4,
                    buffer_size=50000,
                    learning_starts=1000,
                    train_freq=100,
                    batch_size=1,
                    tau=0.01,
                    ent_coef='auto',
                    target_update_interval=1,
                    gradient_steps=1,
                    target_entropy='auto',
                    action_noise=None,
                    random_exploration=0.0,
                    verbose=1,
                    tensorboard_log=log_dir,
                    _init_setup_model=True,
                    full_tensorboard_log=True,
                    seed=None,
                    n_cpu_tf_sess=None)

    elif algo == 'ppo1':
        model = PPO1('MlpPolicy',
                     env_name,
                     gamma=0.99,
                     timesteps_per_actorbatch=256,
                     clip_param=0.2,
                     entcoeff=0.01,
                     optim_epochs=4,
                     optim_stepsize=1e-3,
                     optim_batchsize=64,
                     lam=0.95,
                     adam_epsilon=1e-5,
                     schedule='linear',
                     verbose=0,
                     tensorboard_log=None,
                     _init_setup_model=True,
                     policy_kwargs=None,
                     full_tensorboard_log=False,
                     seed=None,
                     n_cpu_tf_sess=1)
    elif algo == 'trpo':
        model = TRPO('MlpPolicy',
                     env_name,
                     timesteps_per_batch=4096,
                     tensorboard_log=log_dir,
                     verbose=1)
    elif algo == 'gail':
        assert expert_dataset is not None
        model = GAIL('MlpPolicy',
                     env_name,
                     expert_dataset,
                     tensorboard_log=log_dir,
                     verbose=1)
    assert model is not None
    return model
Example #26
    'host': '172.21.217.140',
    'nget': 150
}

env = gym.make(**env_dict)

# Generate expert trajectories (train expert)
model = SAC('MlpPolicy', env, verbose=1)
generate_expert_traj(model, 'expert_prescan', n_timesteps=100, n_episodes=10)

# Load the expert dataset
dataset = ExpertDataset(expert_path='expert_prescan.npz',
                        traj_limitation=10,
                        verbose=1)

model = GAIL("MlpPolicy", env, dataset, verbose=1)
# Note: in practice, you need to train for 1M steps to have a working policy
try:
    model.learn(total_timesteps=1000)
except:
    pass
model.save(save_load)
'''
del model # remove to demonstrate saving and loading

model = GAIL.load(save_load)

env = gym.make(**env_dict)
obs = env.reset()
while True:
  action, _states = model.predict(obs)
Example #27
sys.path.append('/Users/cusgadmin/Documents/UCB/Academics/SSastry/\
	Multi_agent_competition')
os.chdir('/Users/cusgadmin/Documents/UCB/Academics/SSastry/Multi_agent_competition/')

if args.train:
	now = datetime.datetime.now()

	print(colored('Loading expert data from {}!'.format(args.exp_file),'red'))
	exp_data = np.load(args.exp_file)
	print(colored('Expert evader has won {} games!'\
		.format(len(exp_data['episode_returns'])),'red'))
	dataset = ExpertDataset(expert_path=args.exp_file, verbose=1)

	start_time = time.time()
	model = GAIL('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0', dataset, verbose=1)

	print(colored('Training a behaviour cloning agent for {} iterations!'.format(int(args.total_iters)),'red'))
	model.pretrain(dataset=dataset,n_epochs=int(args.total_iters))
	model.save('games{}_iters{}_{}_bc_pursuitevasion_small'.format(len(exp_data['episode_returns']),\
			int(args.total_iters),str(now.strftime('%Y%m%d'))))
	end_time = time.time()
	print(colored('Training time: {:.2f}s = {:.2f}min = {:.4f}hrs'.format(end_time-start_time,\
		(end_time-start_time)/60,(end_time-start_time)/3600),'red'))
	print(colored('Trained BC policy','red'))
	
else: #test
	print(colored('Trained on expert data from {}!'.format(args.exp_file),'red'))
	# exp_data = np.load(args.exp_file)s
	print(colored('Testing learnt policy from model file {} for {} games!'.\
		format(args.model,int(args.num_test)),'red'))
Example #28
                         n_episodes=1000)
    env.close()

    print("Ending expert training, training with GAIL")
    # Load the expert dataset
    worker_id += 1
    env = UnityEnv(env_name, worker_id=worker_id,
                   use_visual=False)  # , no_graphics=True
    env = DummyVecEnv([
        lambda: env
    ])  # The algorithms require a vectorized environment to run
    dataset = ExpertDataset(expert_path='expert_basic_env.npz',
                            traj_limitation=10,
                            verbose=1)

    model = GAIL("MlpPolicy", env, dataset, verbose=1)
    model.learn(total_timesteps=30000)
    model.save(log_dir + "model")
    print("evaluating agent")
    #evaluate agent
    episodes = 100
    ep_r = []
    ep_l = []
    for e in range(episodes):
        obs = env.reset()
        total_r = 0.
        total_l = 0.
        while total_l < 200:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            total_l += 1.