def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    env = gym.make(args.env)
    train_log_dir = os.path.join(
        args.train_log_dir, args.env + '_' + args.expert + '_' + args.policy_type)
    if args.expert == 'PPO':
        expert_model = PPO1(args.policy_type,
                            env,
                            verbose=1,
                            tensorboard_log=train_log_dir)
    else:
        raise NotImplementedError
    expert_model.learn(total_timesteps=args.expert_training_step)
    generate_expert_traj(expert_model,
                         os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=1000,
                         n_episodes=args.expert_episodes)
    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir, 'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type,
                      env,
                      dataset,
                      verbose=1,
                      tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, env, num_steps=10000)
    gail_model.save(train_log_dir)
    env.close()
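# The script above (and the variant further down) calls an evaluate() helper that is not
# shown. A minimal sketch of what such a helper might look like, assuming it rolls the
# policy out for a fixed number of steps and returns the mean episodic reward; only the
# name and call signature come from the snippet, the body is an assumption:
import numpy as np

def evaluate(model, env, num_steps=10000):
    """Hypothetical stand-in for the evaluate() used above: roll out the policy for
    num_steps environment steps and return the mean reward per episode."""
    episode_rewards = [0.0]
    obs = env.reset()
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, reward, done, _info = env.step(action)
        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
    mean_reward = float(np.mean(episode_rewards))
    print("Mean reward: {:.2f} over {} episodes".format(mean_reward, len(episode_rewards)))
    return mean_reward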
def test_gail(tmp_path, expert_env):
    env_id, expert_path, load_from_memory = expert_env
    env = gym.make(env_id)

    traj_data = None
    if load_from_memory:
        traj_data = np.load(expert_path)
        expert_path = None
    dataset = ExpertDataset(traj_data=traj_data, expert_path=expert_path,
                            traj_limitation=10, sequential_preprocessing=True)
    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy', env, adversary_entcoeff=0.0, lam=0.92, max_kl=0.001,
                 expert_dataset=dataset, hidden_size_adversary=64, verbose=0)

    model.learn(300)
    model.save(str(tmp_path / "GAIL-{}".format(env_id)))
    model = model.load(str(tmp_path / "GAIL-{}".format(env_id)), env=env)
    model.learn(300)

    evaluate_policy(model, env, n_eval_episodes=5)
    del dataset, model
def test_gail(expert_env):
    env_id, expert_path = expert_env
    env = gym.make(env_id)
    dataset = ExpertDataset(expert_path=expert_path,
                            traj_limitation=10, sequential_preprocessing=True)
    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy', env, adversary_entcoeff=0.0, lam=0.92, max_kl=0.001,
                 expert_dataset=dataset, hidden_size_adversary=64, verbose=0)
    model.learn(1000)
    model.save("GAIL-{}".format(env_id))
    model = model.load("GAIL-{}".format(env_id), env=env)
    model.learn(1000)

    obs = env.reset()
    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, _, done, _ = env.step(action)
        if done:
            obs = env.reset()
    del dataset, model
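# For comparison with the manual rollout loop above, stable-baselines also ships an
# evaluation helper. Called just before the final `del dataset, model`, an equivalent
# check would be (illustrative, not part of the original test):
from stable_baselines.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print("mean reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))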
def train(env, implemented_combos, model_logdir, arg_dict, pretrained_model=None):
    model_name = arg_dict["algo"] + '_' + str(arg_dict["steps"])
    conf_pth = os.path.join(model_logdir, "train.json")
    model_path = os.path.join(model_logdir, "best_model.zip")
    arg_dict["model_path"] = model_path
    with open(conf_pth, "w") as f:
        json.dump(arg_dict, f, indent=4)

    model_args = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][1]
    model_kwargs = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][2]
    if pretrained_model:
        if not os.path.isabs(pretrained_model):
            pretrained_model = pkg_resources.resource_filename("myGym", pretrained_model)
        env = model_args[1]
        vec_env = DummyVecEnv([lambda: env])
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0].load(pretrained_model, vec_env)
    else:
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0](*model_args, **model_kwargs)

    if arg_dict["algo"] == "gail":
        # Multiprocessing (using MPI)
        if arg_dict["train_framework"] == 'tensorflow':
            # Generate expert trajectories (train expert)
            generate_expert_traj(model, model_name, n_timesteps=3000, n_episodes=100)
            # Load the expert dataset
            dataset = ExpertDataset(expert_path=model_name + '.npz', traj_limitation=10, verbose=1)
            model = GAIL_T('MlpPolicy', model_name, dataset, verbose=1)
            # Note: in practice, you need to train for 1M steps to have a working policy

    start_time = time.time()
    callbacks_list = []
    if pretrained_model:
        # Keep saving best models next to the pretrained checkpoint
        model_logdir = "/".join(pretrained_model.split('/')[:-1])
    auto_save_callback = SaveOnBestTrainingRewardCallback(check_freq=1024, logdir=model_logdir,
                                                          env=env, engine=arg_dict["engine"],
                                                          multiprocessing=arg_dict["multiprocessing"])
    callbacks_list.append(auto_save_callback)
    if arg_dict["eval_freq"]:
        eval_env = configure_env(arg_dict, model_logdir, for_train=False)
        eval_callback = CustomEvalCallback(eval_env, log_path=model_logdir,
                                           eval_freq=arg_dict["eval_freq"],
                                           n_eval_episodes=arg_dict["eval_episodes"],
                                           record=arg_dict["record"],
                                           camera_id=arg_dict["camera"])
        callbacks_list.append(eval_callback)
    # callbacks_list.append(PlottingCallback(model_logdir))
    with ProgressBarManager(total_timesteps=arg_dict["steps"]) as progress_callback:
        callbacks_list.append(progress_callback)
        model.learn(total_timesteps=arg_dict["steps"], callback=callbacks_list)
    model.save(os.path.join(model_logdir, model_name))
    print("Training time: {:.2f} s".format(time.time() - start_time))

    # info_keywords in the Monitor class above is necessary for pybullet to save_results;
    # when using info_keywords with mujoco we get an error
    if arg_dict["engine"] == "pybullet":
        save_results(arg_dict, model_name, env, model_logdir)
    return model
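# SaveOnBestTrainingRewardCallback and CustomEvalCallback above are project-specific
# (myGym) classes that are not shown here. A minimal sketch of how a best-model-saving
# callback is typically built on stable-baselines' BaseCallback; this is an illustration,
# not the project's actual implementation, and it assumes the training env is wrapped in
# a Monitor writing to `logdir`:
import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestRewardSketch(BaseCallback):
    """Illustrative stand-in: save the model whenever the 100-episode mean reward improves."""

    def __init__(self, check_freq, logdir, verbose=0):
        super(SaveOnBestRewardSketch, self).__init__(verbose)
        self.check_freq = check_freq
        self.logdir = logdir
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            x, y = ts2xy(load_results(self.logdir), 'timesteps')
            if len(y) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(os.path.join(self.logdir, 'best_model'))
        return True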
def test_gail_callback(tmp_path):
    dataset = ExpertDataset(expert_path=EXPERT_PATH_PENDULUM, traj_limitation=10,
                            sequential_preprocessing=True, verbose=0)
    model = GAIL("MlpPolicy", "Pendulum-v0", dataset)
    checkpoint_callback = CheckpointCallback(save_freq=500,
                                             save_path=str(tmp_path / 'logs/gail/'),
                                             name_prefix='gail')
    model.learn(total_timesteps=1000, callback=checkpoint_callback)
    shutil.rmtree(str(tmp_path / 'logs/gail/'))
    del dataset, model
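# The test above assumes an expert trajectory file (EXPERT_PATH_PENDULUM) already exists.
# One common way such a file is produced (a sketch with an illustrative file name, not the
# test suite's actual data-generation code):
from stable_baselines import PPO2
from stable_baselines.gail import generate_expert_traj

expert = PPO2('MlpPolicy', 'Pendulum-v0', verbose=0).learn(25000)
generate_expert_traj(expert, 'expert_pendulum', n_episodes=10)  # writes expert_pendulum.npz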
def train_gail_withppo2():
    env = gimbal(5, 500)
    env = DummyVecEnv([lambda: env])
    model = PPO2.load("./models/baseline_ppo2_t1")
    generate_expert_traj(model, './models/baseline_expert_t1', env,
                         n_timesteps=0, n_episodes=100)
    dataset = ExpertDataset(expert_path='./models/baseline_expert_t1.npz',
                            traj_limitation=-1, verbose=1)
    model = GAIL("MlpPolicy", env, dataset, verbose=1)
    model.learn(total_timesteps=500000)
    model.save("./models/baseline_gail_ppo2_t1")
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    # train the expert model multiple times and keep the best one
    best_reward = -np.inf
    train_env = make_vec_env(args.env, n_envs=args.n_env)
    eval_env = gym.make(args.env)
    for i in range(args.times_expert):
        train_env.reset()
        train_log_dir = os.path.join(args.train_log_dir, args.env + '_' + args.expert)
        if args.expert == 'PPO':
            expert_model = PPO2(args.policy_type, env=train_env, n_steps=args.n_steps,
                                nminibatches=args.nminibatches, noptepochs=args.noptepochs,
                                ent_coef=args.ent_coef, lam=args.lam, gamma=args.gamma,
                                cliprange=args.cliprange, learning_rate=args.learning_rate,
                                verbose=1, tensorboard_log=train_log_dir)
        else:
            raise NotImplementedError
        expert_model.learn(total_timesteps=args.expert_training_step)
        mean_reward = evaluate(expert_model, eval_env, num_steps=10000)
        if mean_reward > best_reward:
            best_reward = mean_reward
            expert_model.save(os.path.join(args.train_log_dir, args.env + '_expert'))
        del expert_model
    train_env.reset()
    expert_model = PPO2.load(os.path.join(args.train_log_dir, args.env + '_expert'), env=train_env)
    generate_expert_traj(expert_model,
                         os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=-1,
                         n_episodes=args.expert_episodes)
    train_env.close()
    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir, 'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type, args.env, dataset, verbose=1,
                      tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, eval_env, num_steps=10000)
    gail_model.save(os.path.join(args.train_log_dir, args.env + '_GAIL'))
    eval_env.close()
def stable_gail(
    venv,
    expert=None,
    expert_venv=None,
    state_only=False,
    total_timesteps=10000,
    gen_batch_size=200,
    disc_batch_size=100,
    policy_lr=1e-3,
    callback=None,
    **kwargs,
):
    dataset = get_expert_dataset(expert, expert_venv, total_timesteps)

    policy = GAIL("MlpPolicy", venv, dataset)
    policy.learn(total_timesteps=total_timesteps)

    results = {}
    results["policy"] = policy
    return results
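# get_expert_dataset() is not defined in this snippet. A plausible sketch built from the
# stable-baselines GAIL utilities; the helper name and signature come from the call above,
# everything else (save path, episode heuristic) is an assumption:
from stable_baselines.gail import generate_expert_traj, ExpertDataset

def get_expert_dataset(expert, expert_venv, total_timesteps, save_path="expert_traj"):
    """Hypothetical helper: roll out the expert, dump trajectories, wrap them in an ExpertDataset."""
    n_episodes = max(1, total_timesteps // 1000)  # illustrative heuristic
    generate_expert_traj(expert, save_path, env=expert_venv,
                         n_timesteps=0, n_episodes=n_episodes)
    return ExpertDataset(expert_path=save_path + ".npz", traj_limitation=-1, verbose=0)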
def run_gail():
    parser = argparse.ArgumentParser()
    parser.add_argument('expert', type=str, default=None, help='Expert path (*.npz)')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--note', type=str, default='test')
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--num-steps', type=int, default=1000000)
    parser.add_argument('--policy', type=str, default='CnnPolicy',
                        choices=['CnnPolicy', 'CnnLstmPolicy', 'CnnLnLstmPolicy',
                                 'MlpPolicy', 'MlpLstmPolicy', 'MlpLnLstmPolicy'],
                        help='Policy architecture')
    args = parser.parse_args()

    logger.configure(os.path.join('logs', args.env, args.note))
    logger.info(args)

    if 'NoFrameskip' in args.env:
        env = VecFrameStack(make_atari_env(args.env, 1, args.seed), 4)
    else:
        import gym
        env = gym.make(args.env)

    dataset = ExpertDataset(expert_path=args.expert, batch_size=128,
                            train_fraction=0.99, verbose=1)
    model = GAIL(args.policy, env, dataset, timesteps_per_batch=1280, verbose=1)
    model.learn(len(dataset.train_loader) * 1280)
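    # Illustrative optional addition (not in the original run_gail): a behaviour-cloning
    # warm start on the same expert dataset, which would go just before the model.learn()
    # call above.
    model.pretrain(dataset, n_epochs=100)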
def optimize_agent(trial):
    """Train the model and optimise.

    Optuna maximises the negative log likelihood, so we need to negate the reward here.
    """
    model_params = optimize_GAIL(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = GAIL("MlpPolicy", env, verbose=0, **model_params)
    print("DOING LEARNING gail")
    original_env.force_progression = False
    model.learn(int(2e4 * 5), seed=seed)
    print("DONE LEARNING gail")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0
    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward)
    return last_reward
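# optimize_GAIL() is referenced above but not shown. A hypothetical sketch of such a
# hyperparameter-suggestion function; the keys are valid GAIL keyword arguments, but the
# search ranges are purely illustrative:
def optimize_GAIL(trial):
    return {
        'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
        'timesteps_per_batch': trial.suggest_categorical('timesteps_per_batch', [256, 512, 1024]),
        'vf_stepsize': trial.suggest_loguniform('vf_stepsize', 1e-5, 1e-2),
    }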
def trian_agent_with_gail(load):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines import GAIL

    env = gym.make("F16GCAS-v0")

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs, layers=[128, 128])

    if not load:
        ExpData = ExpertDataset("./lqr_export.npz")
        model = GAIL(CustomPolicy, env, ExpData, verbose=1)
        model.learn(total_timesteps=1000000)
        model.save(ROOT + "/trained_models/TDRL/f16/gail/128_128")
    else:
        # with model.graph.as_default():
        #     for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi/'):
        #         print(i)
        model = GAIL.load(ROOT + "/trained_models/TDRL/f16/gail/128_128", env=env)
        with model.graph.as_default():
            print(tf.all_variables())
    return model
# env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
env = gym.make('gym_docking:docking-v1')

# Stable Baselines provides you with make_vec_env() helper
# which does exactly the previous steps for you:
# env = make_vec_env(env, n_envs=num_cpu, seed=0)

checkpoint_callback = CheckpointCallback(save_freq=int(5e4), save_path='./logs/',
                                         name_prefix='rl_model_621_gail_10M')

dataset = ExpertDataset(expert_path='./expert_PID/expert_PID_new.npz',
                        traj_limitation=-1, batch_size=10)

model = GAIL(policy='MlpPolicy', env=env, verbose=1,
             tensorboard_log="./gail_docking_tensorboard/",
             policy_kwargs=dict(net_arch=[dict(pi=[128, 128], vf=[128, 128])],
                                act_fun=tf.nn.relu),
             expert_dataset=dataset)

# load trained model
# model = PPO2.load("./ppo2_docking_621_random_pre.zip", env=env, tensorboard_log="./ppo2_docking_tensorboard/")

model.learn(total_timesteps=int(10e6), callback=checkpoint_callback)
model.save("gail_docking_621_10M")
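# The commented-out PPO2.load line above hints at warm-starting from a checkpoint. Resuming
# the saved GAIL model itself would look roughly like this (illustrative, not in the script):
model = GAIL.load("gail_docking_621_10M", env=env,
                  tensorboard_log="./gail_docking_tensorboard/")
model.learn(total_timesteps=int(1e6), callback=checkpoint_callback)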
                     dataset, verbose=1)
        #
        if args.pretrain:
            print(colored('Pretraining a behaviour cloning agent!', 'red'))
            model.pretrain(dataset=dataset, n_epochs=int(args.pretrain_iters))
            # model.save('games{}_pretrained_{}_bc{}_trpo_pursuitevasion_small'.format(int(args.generate_num),
            #     str(now.strftime('%Y%m%d')), int(args.pretrain_iters)))
            end_time1 = time.time()
            print(colored('Pretraining time: {:.2f}s = {:.2f}min = {:.4f}hrs'.format(
                end_time1 - start_time1,
                (end_time1 - start_time1) / 60,
                (end_time1 - start_time1) / 3600), 'red'))
        start_time2 = time.time()
        print(colored('Training a GAIL agent!', 'red'))
        model.learn(total_timesteps=int(args.total_iters))
        if args.generate_expert:
            model.save('games{}_iters{}_{}_bc{}_gail_trpo_pursuitevasion_small'.format(
                int(args.generate_num), int(args.total_iters),
                str(now.strftime('%Y%m%d')), int(args.pretrain_iters)))
        else:
            model.save('games{}_iters{}_{}_bc{}_gail_trpo_pursuitevasion_small'.format(
                len(exp_data['episode_returns']), int(args.total_iters),
                str(now.strftime('%Y%m%d')), int(args.pretrain_iters)))
        end_time2 = time.time()
        print(colored('Total Training time: {:.2f}s = {:.2f}min = {:.4f}hrs'.format(
            end_time2 - start_time1,
            (end_time2 - start_time1) / 60,
            (end_time2 - start_time1) / 3600), 'red'))
        print(colored('Trained TRPO+GAIL policy', 'red'))
    else:  # test
        if not args.generate_expert:
            print(colored('Trained on expert data from {}!'.format(args.exp_file),
# Generate expert trajectories (train expert)
env = PrticleEnv(alpha=1, beta=10, win_thre=1, max_timestep=256, for_circle_traj=True)

model = PPO1.load("model/part_circle_exp2_epoch05_sib.zip")
model.set_env(env)

generate_expert_traj(model, 'expert_part_circle_exp2_epoch05_sib', n_episodes=10)

# Load the expert dataset
dataset = ExpertDataset(expert_path='expert_part_circle_exp2_epoch05_sib.npz',
                        traj_limitation=10, verbose=1)

model = GAIL('MlpPolicy',
             DummyVecEnv([lambda: PrticleEnv(alpha=1, beta=10, win_thre=1,
                                             max_timestep=256, for_circle_traj=True)]),
             dataset, verbose=1, n_cpu_tf_sess=None)

# Note: in practice, you need to train for 1M steps to have a working policy
model.learn(total_timesteps=int(1e4))
model.save("_gail_sanity_test_exp1")

del model
# %%
env.close()

print("Ending expert training, training with GAIL")

# Load the expert dataset
worker_id += 1
env = UnityEnv(env_name, worker_id=worker_id, use_visual=False)  # , no_graphics=True
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
dataset = ExpertDataset(expert_path='expert_basic_env.npz', traj_limitation=10, verbose=1)

model = GAIL("MlpPolicy", env, dataset, verbose=1)
model.learn(total_timesteps=30000)
model.save(log_dir + "model")

print("evaluating agent")
# evaluate agent
episodes = 100
ep_r = []
ep_l = []
for e in range(episodes):
    obs = env.reset()
    total_r = 0.
    total_l = 0.
    while total_l < 200:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        total_l += 1.
        total_r += reward