def test_gail(expert_env):
    env_id, expert_path = expert_env
    env = gym.make(env_id)

    dataset = ExpertDataset(expert_path=expert_path, traj_limitation=10,
                            sequential_preprocessing=True)
    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy', env, adversary_entcoeff=0.0, lam=0.92,
                 max_kl=0.001, expert_dataset=dataset,
                 hidden_size_adversary=64, verbose=0)
    model.learn(1000)
    model.save("GAIL-{}".format(env_id))
    model = model.load("GAIL-{}".format(env_id), env=env)
    model.learn(1000)

    obs = env.reset()
    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, _, done, _ = env.step(action)
        if done:
            obs = env.reset()
    del dataset, model
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    env = gym.make(args.env)
    train_log_dir = os.path.join(
        args.train_log_dir, args.env + '_' + args.expert + '_' + args.policy_type)
    if args.expert == 'PPO':
        expert_model = PPO1(args.policy_type, env, verbose=1,
                            tensorboard_log=train_log_dir)
    else:
        raise NotImplementedError
    expert_model.learn(total_timesteps=args.expert_training_step)
    generate_expert_traj(expert_model, os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=1000, n_episodes=args.expert_episodes)

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir, 'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type, env, dataset, verbose=1,
                      tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, env, num_steps=10000)
    gail_model.save(train_log_dir)
    env.close()
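# main() above (and the vectorized variant further below) calls an `evaluate`
# helper that is not defined in this file. A minimal sketch of what such a
# helper could look like; the signature and the printed summary are
# assumptions inferred from the call sites, not the original implementation:
def evaluate(model, env, num_steps=10000):
    # Roll the policy out for num_steps and return the mean episodic reward.
    episode_rewards = [0.0]
    obs = env.reset()
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, reward, done, _info = env.step(action)
        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
    mean_reward = float(np.mean(episode_rewards))
    print("Mean reward: {:.1f}, num episodes: {}".format(mean_reward, len(episode_rewards)))
    return mean_reward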
def test_gail(tmp_path, expert_env):
    env_id, expert_path, load_from_memory = expert_env
    env = gym.make(env_id)

    traj_data = None
    if load_from_memory:
        traj_data = np.load(expert_path)
        expert_path = None
    dataset = ExpertDataset(traj_data=traj_data, expert_path=expert_path,
                            traj_limitation=10, sequential_preprocessing=True)
    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy', env, adversary_entcoeff=0.0, lam=0.92,
                 max_kl=0.001, expert_dataset=dataset,
                 hidden_size_adversary=64, verbose=0)
    model.learn(300)
    model.save(str(tmp_path / "GAIL-{}".format(env_id)))
    model = model.load(str(tmp_path / "GAIL-{}".format(env_id)), env=env)
    model.learn(300)
    evaluate_policy(model, env, n_eval_episodes=5)
    del dataset, model
def train(env, implemented_combos, model_logdir, arg_dict, pretrained_model=None):
    model_name = arg_dict["algo"] + '_' + str(arg_dict["steps"])
    conf_pth = os.path.join(model_logdir, "train.json")
    model_path = os.path.join(model_logdir, "best_model.zip")
    arg_dict["model_path"] = model_path
    with open(conf_pth, "w") as f:
        json.dump(arg_dict, f, indent=4)

    model_args = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][1]
    model_kwargs = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][2]
    if pretrained_model:
        if not os.path.isabs(pretrained_model):
            pretrained_model = pkg_resources.resource_filename("myGym", pretrained_model)
        env = model_args[1]
        vec_env = DummyVecEnv([lambda: env])
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0].load(pretrained_model, vec_env)
    else:
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0](*model_args, **model_kwargs)

    if arg_dict["algo"] == "gail":
        # Multi processing: (using MPI)
        if arg_dict["train_framework"] == 'tensorflow':
            # Generate expert trajectories (train expert)
            generate_expert_traj(model, model_name, n_timesteps=3000, n_episodes=100)
            # Load the expert dataset
            dataset = ExpertDataset(expert_path=model_name + '.npz', traj_limitation=10, verbose=1)
            # GAIL expects the environment (or an env id) as its second argument,
            # not the model name string
            model = GAIL_T('MlpPolicy', env, dataset, verbose=1)
            # Note: in practice, you need to train for 1M steps to have a working policy

    start_time = time.time()
    callbacks_list = []
    if pretrained_model:
        # Save next to the pretrained model rather than in a fresh logdir
        model_logdir = os.path.dirname(pretrained_model)
    auto_save_callback = SaveOnBestTrainingRewardCallback(check_freq=1024, logdir=model_logdir,
                                                          env=env, engine=arg_dict["engine"],
                                                          multiprocessing=arg_dict["multiprocessing"])
    callbacks_list.append(auto_save_callback)

    if arg_dict["eval_freq"]:
        eval_env = configure_env(arg_dict, model_logdir, for_train=False)
        eval_callback = CustomEvalCallback(eval_env, log_path=model_logdir,
                                           eval_freq=arg_dict["eval_freq"],
                                           n_eval_episodes=arg_dict["eval_episodes"],
                                           record=arg_dict["record"],
                                           camera_id=arg_dict["camera"])
        callbacks_list.append(eval_callback)
    # callbacks_list.append(PlottingCallback(model_logdir))

    with ProgressBarManager(total_timesteps=arg_dict["steps"]) as progress_callback:
        callbacks_list.append(progress_callback)
        model.learn(total_timesteps=arg_dict["steps"], callback=callbacks_list)
    model.save(os.path.join(model_logdir, model_name))
    print("Training time: {:.2f} s".format(time.time() - start_time))

    # The info_keywords in the Monitor class above are necessary for pybullet
    # to save_results; with mujoco the info_keywords raise an error
    if arg_dict["engine"] == "pybullet":
        save_results(arg_dict, model_name, env, model_logdir)
    return model
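# For reference, train() above indexes implemented_combos as
# combos[algo][framework] == (model_class, model_args, model_kwargs).
# A minimal illustration of that structure (hypothetical values; the real
# dict is built elsewhere in myGym):
# implemented_combos = {
#     "ppo2": {
#         "tensorflow": (PPO2, ("MlpPolicy", env), {"n_steps": 1024}),
#     },
# }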
def train_gail_withppo2():
    env = gimbal(5, 500)
    env = DummyVecEnv([lambda: env])

    model = PPO2.load("./models/baseline_ppo2_t1")
    generate_expert_traj(model, './models/baseline_expert_t1', env,
                         n_timesteps=0, n_episodes=100)
    dataset = ExpertDataset(expert_path='./models/baseline_expert_t1.npz',
                            traj_limitation=-1, verbose=1)

    model = GAIL("MlpPolicy", env, dataset, verbose=1)
    model.learn(total_timesteps=500000)
    model.save("./models/baseline_gail_ppo2_t1")
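# A quick sanity rollout of the saved student could look like the sketch
# below; it mirrors the gimbal(5, 500) construction above but is an
# assumption, not part of the original script:
def check_gail_student():
    env = DummyVecEnv([lambda: gimbal(5, 500)])
    model = GAIL.load("./models/baseline_gail_ppo2_t1", env=env)
    obs = env.reset()
    for _ in range(500):
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)  # VecEnv auto-resets on done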
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    # Train the expert model multiple times and keep the best one
    best_reward = -np.inf
    train_env = make_vec_env(args.env, n_envs=args.n_env)
    eval_env = gym.make(args.env)
    for i in range(args.times_expert):
        train_env.reset()
        train_log_dir = os.path.join(args.train_log_dir, args.env + '_' + args.expert)
        if args.expert == 'PPO':
            expert_model = PPO2(args.policy_type, env=train_env, n_steps=args.n_steps,
                                nminibatches=args.nminibatches, noptepochs=args.noptepochs,
                                ent_coef=args.ent_coef, lam=args.lam, gamma=args.gamma,
                                cliprange=args.cliprange, learning_rate=args.learning_rate,
                                verbose=1, tensorboard_log=train_log_dir)
        else:
            raise NotImplementedError
        expert_model.learn(total_timesteps=args.expert_training_step)
        mean_reward = evaluate(expert_model, eval_env, num_steps=10000)
        if mean_reward > best_reward:
            best_reward = mean_reward
            expert_model.save(os.path.join(args.train_log_dir, args.env + '_expert'))
        del expert_model

    train_env.reset()
    expert_model = PPO2.load(os.path.join(args.train_log_dir, args.env + '_expert'),
                             env=train_env)
    generate_expert_traj(expert_model, os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=-1, n_episodes=args.expert_episodes)
    train_env.close()

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir, 'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type, args.env, dataset, verbose=1,
                      tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, eval_env, num_steps=10000)
    gail_model.save(os.path.join(args.train_log_dir, args.env + '_GAIL'))
    eval_env.close()
def train_agent_with_gail(load):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines import GAIL

    env = gym.make("F16GCAS-v0")

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs, layers=[128, 128])

    if not load:
        ExpData = ExpertDataset("./lqr_export.npz")
        model = GAIL(CustomPolicy, env, ExpData, verbose=1)
        model.learn(total_timesteps=1000000)
        model.save(ROOT + "/trained_models/TDRL/f16/gail/128_128")
    else:
        # with model.graph.as_default():
        #     for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi/'):
        #         print(i)
        model = GAIL.load(ROOT + "/trained_models/TDRL/f16/gail/128_128", env=env)
        with model.graph.as_default():
            print(tf.global_variables())  # list all variables in the loaded graph
    return model
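# Example usage of the helper above (a sketch, not from the original file):
# model = train_agent_with_gail(load=False)  # train from the LQR expert data
# model = train_agent_with_gail(load=True)   # reload the saved policy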
# env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
env = gym.make('gym_docking:docking-v1')

# Stable Baselines provides the make_vec_env() helper,
# which does exactly the previous steps for you:
# env = make_vec_env(env, n_envs=num_cpu, seed=0)

checkpoint_callback = CheckpointCallback(save_freq=int(5e4), save_path='./logs/',
                                         name_prefix='rl_model_621_gail_10M')

dataset = ExpertDataset(expert_path='./expert_PID/expert_PID_new.npz',
                        traj_limitation=-1, batch_size=10)

model = GAIL(policy='MlpPolicy', env=env, verbose=1,
             tensorboard_log="./gail_docking_tensorboard/",
             policy_kwargs=dict(net_arch=[dict(pi=[128, 128], vf=[128, 128])],
                                act_fun=tf.nn.relu),
             expert_dataset=dataset)

# load trained model
# model = PPO2.load("./ppo2_docking_621_random_pre.zip", env=env,
#                   tensorboard_log="./ppo2_docking_tensorboard/")

model.learn(total_timesteps=int(10e6), callback=checkpoint_callback)
model.save("gail_docking_621_10M")
if args.train:
    now = datetime.datetime.now()
    print(colored('Loading expert data from {}!'.format(args.exp_file), 'red'))
    exp_data = np.load(args.exp_file)
    print(colored('Expert evader has won {} games!'.format(
        len(exp_data['episode_returns'])), 'red'))
    dataset = ExpertDataset(expert_path=args.exp_file, verbose=1)

    start_time = time.time()
    model = GAIL('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0',
                 dataset, verbose=1)
    print(colored('Training a behaviour cloning agent for {} iterations!'.format(
        int(args.total_iters)), 'red'))
    model.pretrain(dataset=dataset, n_epochs=int(args.total_iters))
    model.save('games{}_iters{}_{}_bc_pursuitevasion_small'.format(
        len(exp_data['episode_returns']), int(args.total_iters),
        str(now.strftime('%Y%m%d'))))
    end_time = time.time()
    print(colored('Training time: {:.2f}s = {:.2f}min = {:.4f}hrs'.format(
        end_time - start_time, (end_time - start_time) / 60,
        (end_time - start_time) / 3600), 'red'))
    print(colored('Trained BC policy', 'red'))
else:  # test
    print(colored('Trained on expert data from {}!'.format(args.exp_file), 'red'))
    # exp_data = np.load(args.exp_file)
    print(colored('Testing learnt policy from model file {} for {} games!'.format(
        args.model, int(args.num_test)), 'red'))
    start_time = time.time()
    model = GAIL.load(args.model)
    env = gym.make('gym_pursuitevasion_small:pursuitevasion_small-v0')
    g = 1
    obs = env.reset(ep=g)
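# The test branch above is truncated right after the first reset. A plausible
# continuation of its evaluation loop is sketched below, commented out because
# it is an assumption based on the env's custom reset(ep=...) signature, not
# the original code:
# while g <= int(args.num_test):
#     action, _ = model.predict(obs)
#     obs, reward, done, info = env.step(action)
#     if done:
#         g += 1
#         obs = env.reset(ep=g)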
    if args.pretrain:
        print(colored('Pretraining a behaviour cloning agent!', 'red'))
        model.pretrain(dataset=dataset, n_epochs=int(args.pretrain_iters))
        # model.save('games{}_pretrained_{}_bc{}_trpo_pursuitevasion_small'.format(
        #     int(args.generate_num), str(now.strftime('%Y%m%d')), int(args.pretrain_iters)))
        end_time1 = time.time()
        print(colored('Pretraining time: {:.2f}s = {:.2f}min = {:.4f}hrs'.format(
            end_time1 - start_time1, (end_time1 - start_time1) / 60,
            (end_time1 - start_time1) / 3600), 'red'))

    start_time2 = time.time()
    print(colored('Training a GAIL agent!', 'red'))
    model.learn(total_timesteps=int(args.total_iters))
    if args.generate_expert:
        model.save('games{}_iters{}_{}_bc{}_gail_trpo_pursuitevasion_small'.format(
            int(args.generate_num), int(args.total_iters),
            str(now.strftime('%Y%m%d')), int(args.pretrain_iters)))
    else:
        model.save('games{}_iters{}_{}_bc{}_gail_trpo_pursuitevasion_small'.format(
            len(exp_data['episode_returns']), int(args.total_iters),
            str(now.strftime('%Y%m%d')), int(args.pretrain_iters)))
    end_time2 = time.time()
    print(colored('Total Training time: {:.2f}s = {:.2f}min = {:.4f}hrs'.format(
        end_time2 - start_time1, (end_time2 - start_time1) / 60,
        (end_time2 - start_time1) / 3600), 'red'))
    print(colored('Trained TRPO+GAIL policy', 'red'))
else:  # test
    if not args.generate_expert:
        print(colored('Trained on expert data from {}!'.format(args.exp_file), 'red'))
        # exp_data = np.load(args.exp_file)
    print(colored('Testing learnt policy from model file {} for {} games!'.format(
        args.model, args.num_test), 'red'))
# Generate expert trajectories (train expert)
env = PrticleEnv(alpha=1, beta=10, win_thre=1, max_timestep=256,
                 for_circle_traj=True)

model = PPO1.load("model/part_circle_exp2_epoch05_sib.zip")
model.set_env(env)
generate_expert_traj(model, 'expert_part_circle_exp2_epoch05_sib', n_episodes=10)

# Load the expert dataset
dataset = ExpertDataset(expert_path='expert_part_circle_exp2_epoch05_sib.npz',
                        traj_limitation=10, verbose=1)

model = GAIL('MlpPolicy',
             DummyVecEnv([lambda: PrticleEnv(alpha=1, beta=10, win_thre=1,
                                             max_timestep=256, for_circle_traj=True)]),
             dataset, verbose=1, n_cpu_tf_sess=None)

# Note: in practice, you need to train for 1M steps to have a working policy
model.learn(total_timesteps=int(1e4))
model.save("_gail_sanity_test_exp1")

del model
# %%
print("Ending expert training, training with GAIL") # Load the expert dataset worker_id += 1 env = UnityEnv(env_name, worker_id=worker_id, use_visual=False) # , no_graphics=True env = DummyVecEnv([ lambda: env ]) # The algorithms require a vectorized environment to run dataset = ExpertDataset(expert_path='expert_basic_env.npz', traj_limitation=10, verbose=1) model = GAIL("MlpPolicy", env, dataset, verbose=1) model.learn(total_timesteps=30000) model.save(log_dir + "model") print("evaluating agent") #evaluate agent episodes = 100 ep_r = [] ep_l = [] for e in range(episodes): obs = env.reset() total_r = 0. total_l = 0. while total_l < 200: action, _states = model.predict(obs) obs, reward, done, info = env.step(action) total_l += 1. total_r += reward if done: