Example #1
def test_gail(expert_env):
    env_id, expert_path = expert_env
    env = gym.make(env_id)
    dataset = ExpertDataset(expert_path=expert_path,
                            traj_limitation=10,
                            sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy',
                 env,
                 adversary_entcoeff=0.0,
                 lam=0.92,
                 max_kl=0.001,
                 expert_dataset=dataset,
                 hidden_size_adversary=64,
                 verbose=0)

    model.learn(1000)
    model.save("GAIL-{}".format(env_id))
    model = model.load("GAIL-{}".format(env_id), env=env)
    model.learn(1000)

    obs = env.reset()

    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, _, done, _ = env.step(action)
        if done:
            obs = env.reset()
    del dataset, model
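
The `expert_env` fixture is not shown in this snippet; a minimal sketch of what it could look like (hypothetical, modeled on how stable-baselines tests generate expert data before running GAIL):

import pytest
from stable_baselines import DQN
from stable_baselines.gail import generate_expert_traj

@pytest.fixture
def expert_env(tmp_path):
    # Train a quick expert and dump its trajectories to an .npz file
    env_id = "CartPole-v1"
    expert_path = str(tmp_path / "expert_cartpole")
    expert = DQN("MlpPolicy", env_id, verbose=0)
    generate_expert_traj(expert, expert_path, n_timesteps=1000, n_episodes=10)
    return env_id, expert_path + ".npz"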
Example #2
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    env = gym.make(args.env)
    train_log_dir = os.path.join(
        args.train_log_dir,
        args.env + '_' + args.expert + '_' + args.policy_type)
    if args.expert == 'PPO':
        expert_model = PPO1(args.policy_type,
                            env,
                            verbose=1,
                            tensorboard_log=train_log_dir)
    else:
        raise NotImplementedError
    expert_model.learn(total_timesteps=args.expert_training_step)
    generate_expert_traj(expert_model,
                         os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=1000,
                         n_episodes=args.expert_episodes)

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir,
                                                     'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type,
                      env,
                      dataset,
                      verbose=1,
                      tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, env, num_steps=10000)
    gail_model.save(train_log_dir)
    env.close()
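
`evaluate` is defined elsewhere in this project; a minimal sketch of a compatible helper (an assumption inferred from the call site, which passes a model, an env, and num_steps and expects a mean reward back):

def evaluate(model, env, num_steps=10000):
    # Roll the policy out and return the mean reward per completed episode
    obs = env.reset()
    episode_rewards, total = [], 0.0
    for _ in range(num_steps):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        total += reward
        if done:
            episode_rewards.append(total)
            total = 0.0
            obs = env.reset()
    return sum(episode_rewards) / max(len(episode_rewards), 1)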
Example #3
def test_gail(tmp_path, expert_env):
    env_id, expert_path, load_from_memory = expert_env
    env = gym.make(env_id)

    traj_data = None
    if load_from_memory:
        traj_data = np.load(expert_path)
        expert_path = None
    dataset = ExpertDataset(traj_data=traj_data,
                            expert_path=expert_path,
                            traj_limitation=10,
                            sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy',
                 env,
                 adversary_entcoeff=0.0,
                 lam=0.92,
                 max_kl=0.001,
                 expert_dataset=dataset,
                 hidden_size_adversary=64,
                 verbose=0)

    model.learn(300)
    model.save(str(tmp_path / "GAIL-{}".format(env_id)))
    model = model.load(str(tmp_path / "GAIL-{}".format(env_id)), env=env)
    model.learn(300)

    evaluate_policy(model, env, n_eval_episodes=5)
    del dataset, model
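
`evaluate_policy` used here ships with stable-baselines' evaluation helpers and returns the mean and standard deviation of the episode reward:

from stable_baselines.common.evaluation import evaluate_policy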
Example #4
def train(env, implemented_combos, model_logdir, arg_dict, pretrained_model=None):
    model_name = arg_dict["algo"] + '_' + str(arg_dict["steps"])
    conf_pth   = os.path.join(model_logdir, "train.json")
    model_path = os.path.join(model_logdir, "best_model.zip")
    arg_dict["model_path"] = model_path
    with open(conf_pth, "w") as f:
        json.dump(arg_dict, f, indent=4)

    model_args = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][1]
    model_kwargs = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][2]
    if pretrained_model:
        if not os.path.isabs(pretrained_model):
            pretrained_model = pkg_resources.resource_filename("myGym", pretrained_model)
        env = model_args[1]
        vec_env = DummyVecEnv([lambda: env])
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0].load(pretrained_model, vec_env)
    else:
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0](*model_args, **model_kwargs)

    if arg_dict["algo"] == "gail":
        # Multi processing: (using MPI)
        if arg_dict["train_framework"] == 'tensorflow':
            # Generate expert trajectories (train expert)
            generate_expert_traj(model, model_name, n_timesteps=3000, n_episodes=100)
            # Load the expert dataset
            dataset = ExpertDataset(expert_path=model_name+'.npz', traj_limitation=10, verbose=1)
            model = GAIL_T('MlpPolicy', env, dataset, verbose=1)
            # Note: in practice, you need to train for 1M steps to have a working policy

    start_time = time.time()
    callbacks_list = []
    if pretrained_model:
        # when fine-tuning, save next to the pretrained model
        model_logdir = os.path.dirname(pretrained_model)
    auto_save_callback = SaveOnBestTrainingRewardCallback(check_freq=1024, logdir=model_logdir, env=env, engine=arg_dict["engine"], multiprocessing=arg_dict["multiprocessing"])
    callbacks_list.append(auto_save_callback)
    if arg_dict["eval_freq"]:
        eval_env = configure_env(arg_dict, model_logdir, for_train=False)
        eval_callback = CustomEvalCallback(eval_env, log_path=model_logdir,
                                           eval_freq=arg_dict["eval_freq"],
                                           n_eval_episodes=arg_dict["eval_episodes"],
                                           record=arg_dict["record"],
                                           camera_id=arg_dict["camera"])
        callbacks_list.append(eval_callback)
    #callbacks_list.append(PlottingCallback(model_logdir))
    with ProgressBarManager(total_timesteps=arg_dict["steps"]) as progress_callback:
        callbacks_list.append(progress_callback)
        model.learn(total_timesteps=arg_dict["steps"], callback=callbacks_list)
    model.save(os.path.join(model_logdir, model_name))
    print("Training time: {:.2f} s".format(time.time() - start_time))

    # info_keywords in the Monitor class above are necessary for pybullet's save_results;
    # passing info_keywords for mujoco raises an error
    if arg_dict["engine"] == "pybullet":
        save_results(arg_dict, model_name, env, model_logdir)
    return model
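
The function indexes implemented_combos[algo][framework] as a triple; an illustrative entry with the shape inferred from the usage above (not myGym's actual table):

# index 0: model class, index 1: positional args (policy, env), index 2: kwargs
implemented_combos = {
    "ppo2": {
        "tensorflow": (PPO2, ("MlpPolicy", env), {"verbose": 1}),
    },
}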
Example #5
def train_gail_withppo2():
    env = gimbal(5, 500)
    env = DummyVecEnv([lambda: env])
    model = PPO2.load("./models/baseline_ppo2_t1")
    generate_expert_traj(model,
                         './models/baseline_expert_t1',
                         env,
                         n_timesteps=0,
                         n_episodes=100)
    dataset = ExpertDataset(expert_path='./models/baseline_expert_t1.npz',
                            traj_limitation=-1,
                            verbose=1)
    model = GAIL("MlpPolicy", env, dataset, verbose=1)
    model.learn(total_timesteps=500000)
    model.save("./models/baseline_gail_ppo2_t1")
Example #6
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    # train expert model for multiple times and save the best model
    best_reward = -np.inf
    train_env = make_vec_env(args.env, n_envs=args.n_env)
    eval_env = gym.make(args.env)

    for i in range(args.times_expert):
        train_env.reset()
        train_log_dir = os.path.join(args.train_log_dir,
                                     args.env + '_' + args.expert)
        if args.expert == 'PPO':
            expert_model = PPO2(args.policy_type,
                                env=train_env,
                                n_steps=args.n_steps,
                                nminibatches=args.nminibatches,
                                noptepochs=args.noptepochs,
                                ent_coef=args.ent_coef,
                                lam=args.lam,
                                gamma=args.gamma,
                                cliprange=args.cliprange,
                                learning_rate=args.learning_rate,
                                verbose=1,
                                tensorboard_log=train_log_dir)
        else:
            raise NotImplementedError
        expert_model.learn(total_timesteps=args.expert_training_step)
        mean_reward = evaluate(expert_model, eval_env, num_steps=10000)
        if mean_reward > best_reward:
            best_reward = mean_reward
            expert_model.save(
                os.path.join(args.train_log_dir, args.env + '_expert'))
        del expert_model
    train_env.reset()
    expert_model = PPO2.load(os.path.join(args.train_log_dir,
                                          args.env + '_expert'),
                             env=train_env)
    generate_expert_traj(expert_model,
                         os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=-1,
                         n_episodes=args.expert_episodes)
    train_env.close()

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir,
                                                     'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type,
                      args.env,
                      dataset,
                      verbose=1,
                      tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)

    evaluate(gail_model, eval_env, num_steps=10000)
    gail_model.save(os.path.join(args.train_log_dir, args.env + '_GAIL'))
    eval_env.close()
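
A hypothetical argument parser covering the attributes this script reads (flag names are assumptions inferred from the code; the remaining PPO2 hyperparameter flags such as --n_steps, --lam, and --cliprange follow the same pattern):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env', default='Pendulum-v0')
parser.add_argument('--device', default='0')
parser.add_argument('--expert', default='PPO')
parser.add_argument('--policy_type', default='MlpPolicy')
parser.add_argument('--n_env', type=int, default=8)
parser.add_argument('--times_expert', type=int, default=3)
parser.add_argument('--expert_training_step', type=int, default=int(1e6))
parser.add_argument('--expert_episodes', type=int, default=100)
parser.add_argument('--student_training_step', type=int, default=int(1e6))
parser.add_argument('--train_log_dir', default='./logs')
args = parser.parse_args()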
Example #7
def train_agent_with_gail(load):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines import GAIL

    env = gym.make("F16GCAS-v0")

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128])
    if not load:
        ExpData = ExpertDataset("./lqr_export.npz")
        model = GAIL(CustomPolicy, env, ExpData, verbose=1)
        model.learn(total_timesteps=1000000)
        model.save(ROOT+"/trained_models/TDRL/f16/gail/128_128")
    else:
        # with model.graph.as_default():
        #     for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi/'):
        #         print(i)
        model = GAIL.load(ROOT+"/trained_models/TDRL/f16/gail/128_128", env=env)
        with model.graph.as_default():
            print(tf.global_variables())

    return model
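
Subclassing is one way to widen the network; stable-baselines also accepts the same setting through policy_kwargs, so an equivalent model (sketch) would be:

model = GAIL('MlpPolicy', env, ExpData, verbose=1,
             policy_kwargs=dict(layers=[128, 128]))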
Example #8
    #env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    env = gym.make('gym_docking:docking-v1')

    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env, n_envs=num_cpu, seed=0)

    checkpoint_callback = CheckpointCallback(
        save_freq=int(5e4),
        save_path='./logs/',
        name_prefix='rl_model_621_gail_10M')

    dataset = ExpertDataset(expert_path='./expert_PID/expert_PID_new.npz',
                            traj_limitation=-1,
                            batch_size=10)

    model = GAIL(policy='MlpPolicy',
                 env=env,
                 verbose=1,
                 tensorboard_log="./gail_docking_tensorboard/",
                 policy_kwargs=dict(
                     net_arch=[dict(pi=[128, 128], vf=[128, 128])],
                     act_fun=tf.nn.relu),
                 expert_dataset=dataset)

    # load trained model
    # model = PPO2.load("./ppo2_docking_621_random_pre.zip", env=env, tensorboard_log="./ppo2_docking_tensorboard/")

    model.learn(total_timesteps=int(10e6), callback=checkpoint_callback)
    model.save("gail_docking_621_10M")
Example #9
if args.train:
	now = datetime.datetime.now()

	print(colored('Loading expert data from {}!'.format(args.exp_file),'red'))
	exp_data = np.load(args.exp_file)
	print(colored('Expert evader has won {} games!'\
		.format(len(exp_data['episode_returns'])),'red'))
	dataset = ExpertDataset(expert_path=args.exp_file, verbose=1)

	start_time = time.time()
	model = GAIL('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0', dataset, verbose=1)

	print(colored('Training a behaviour cloning agent for {} iterations!'.format(int(args.total_iters)),'red'))
	model.pretrain(dataset=dataset,n_epochs=int(args.total_iters))
	model.save('games{}_iters{}_{}_bc_pursuitevasion_small'.format(len(exp_data['episode_returns']),\
			int(args.total_iters),str(now.strftime('%Y%m%d'))))
	end_time = time.time()
	print(colored('Training time: {:.2f}s = {:.2f}min = {:.4f}hrs'.format(end_time-start_time,\
		(end_time-start_time)/60,(end_time-start_time)/3600),'red'))
	print(colored('Trained BC policy','red'))
	
else: #test
	print(colored('Trained on expert data from {}!'.format(args.exp_file),'red'))
	# exp_data = np.load(args.exp_file)
	print(colored('Testing learnt policy from model file {} for {} games!'.\
		format(args.model,int(args.num_test)),'red'))
	start_time = time.time()
	model = GAIL.load(args.model)
	env = gym.make('gym_pursuitevasion_small:pursuitevasion_small-v0')
	g = 1
	obs = env.reset(ep=g)
Example #10
    # if args.pretrain:
    print(colored('Pretraining a behaviour cloning agent!', 'red'))
    model.pretrain(dataset=dataset, n_epochs=int(args.pretrain_iters))
    # model.save('games{}_pretrained_{}_bc{}_trpo_pursuitevasion_small'.format(int(args.generate_num),\
    # 		str(now.strftime('%Y%m%d')),int(args.pretrain_iters)))

    end_time1 = time.time()
    print(colored('Pretraining time: {:.2f}s = {:.2f}min = {:.4f}hrs'.format(end_time1-start_time1,\
     (end_time1-start_time1)/60,(end_time1-start_time1)/3600),'red'))

    start_time2 = time.time()
    print(colored('Training a GAIL agent!', 'red'))
    model.learn(total_timesteps=int(args.total_iters))

    if args.generate_expert:
        model.save('games{}_iters{}_{}_bc{}_gail_trpo_pursuitevasion_small'.format(int(args.generate_num),\
         int(args.total_iters),str(now.strftime('%Y%m%d')),int(args.pretrain_iters)))
    else:
        model.save('games{}_iters{}_{}_bc{}_gail_trpo_pursuitevasion_small'.format(len(exp_data['episode_returns']),\
         int(args.total_iters),str(now.strftime('%Y%m%d')),int(args.pretrain_iters)))
    end_time2 = time.time()
    print(colored('Total Training time: {:.2f}s = {:.2f}min = {:.4f}hrs'.format(end_time2-start_time1,\
     (end_time2-start_time1)/60,(end_time2-start_time1)/3600),'red'))
    print(colored('Trained TRPO+GAIL policy', 'red'))
else:  #test
    if not args.generate_expert:
        print(
            colored('Trained on expert data from {}!'.format(args.exp_file),
                    'red'))
        # exp_data = np.load(args.exp_file)
    print(colored('Testing learnt policy from model file {} for {} games!'.\
     format(args.model,args.num_test),'red'))
Example #11
# Generate expert trajectories (train expert)
env = PrticleEnv(alpha=1,
                 beta=10,
                 win_thre=1,
                 max_timestep=256,
                 for_circle_traj=True)

model = PPO1.load("model/part_circle_exp2_epoch05_sib.zip")
model.set_env(env)
generate_expert_traj(model,
                     'expert_part_circle_exp2_epoch05_sib',
                     n_episodes=10)

# Load the expert dataset
dataset = ExpertDataset(expert_path='expert_part_circle_exp2_epoch05_sib.npz',
                        traj_limitation=10,
                        verbose=1)

model = GAIL('MlpPolicy',
             DummyVecEnv([lambda: PrticleEnv(alpha=1, beta=10, win_thre=1,
                                             max_timestep=256,
                                             for_circle_traj=True)]),
             dataset, verbose=1, n_cpu_tf_sess=None)

# Note: in practice, you need to train for 1M steps to have a working policy
model.learn(total_timesteps=int(1e4))
model.save("_gail_sanity_test_exp1")

del model

Example #12
    print("Ending expert training, training with GAIL")
    # Load the expert dataset
    worker_id += 1
    env = UnityEnv(env_name, worker_id=worker_id,
                   use_visual=False)  # , no_graphics=True
    env = DummyVecEnv([
        lambda: env
    ])  # The algorithms require a vectorized environment to run
    dataset = ExpertDataset(expert_path='expert_basic_env.npz',
                            traj_limitation=10,
                            verbose=1)

    model = GAIL("MlpPolicy", env, dataset, verbose=1)
    model.learn(total_timesteps=30000)
    model.save(log_dir + "model")
    print("evaluating agent")
    #evaluate agent
    episodes = 100
    ep_r = []
    ep_l = []
    for e in range(episodes):
        obs = env.reset()
        total_r = 0.
        total_l = 0.
        while total_l < 200:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            total_l += 1.
            total_r += reward
            if done:
                break
        # the source snippet is truncated here; recording the episode totals
        # in the lists above is one plausible completion
        ep_r.append(total_r)
        ep_l.append(total_l)