def symmetric_bc(model_savename, bc_params, num_epochs=1000, lr=1e-4, adam_eps=1e-8):
    """DEPRECATED: Trains two BC models from the same data.

    Splits the data 50-50 and uses each subset as training data for one model
    and as validation data for the other."""
    expert_trajs = get_trajs_from_data(bc_params["data_params"])
    save_npz_file(expert_trajs, "temp")
    train_dataset = ExpertDataset(expert_path="temp", verbose=1, train_fraction=0.5)
    train_indices = train_dataset.train_loader.original_indices
    val_indices = train_dataset.val_loader.original_indices

    # Train BC model
    train_model_save_dir = model_savename + "_train/"
    bc_from_dataset_and_params(train_dataset, bc_params, train_model_save_dir, num_epochs, lr, adam_eps)

    # Swap the training and validation splits (somewhat hacky)
    indices_split = (val_indices, train_indices)
    test_dataset = ExpertDataset(expert_path="temp", verbose=1, train_fraction=0.5,
                                 indices_split=indices_split)

    # Test BC model
    test_model_save_dir = model_savename + "_test/"
    bc_from_dataset_and_params(test_dataset, bc_params, test_model_save_dir, num_epochs, lr, adam_eps)
def test_dataset_param_validation():
    with pytest.raises(ValueError):
        ExpertDataset()

    traj_data = np.load(EXPERT_PATH_PENDULUM)
    with pytest.raises(ValueError):
        ExpertDataset(traj_data=traj_data, expert_path=EXPERT_PATH_PENDULUM)
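# For contrast with the validation test above, a minimal sketch of valid construction
# (illustrative only, not part of the original test suite): exactly one of `expert_path`
# or `traj_data` is supplied per dataset, mirroring how the tests below build datasets.
def example_valid_dataset_construction():
    dataset_from_path = ExpertDataset(expert_path=EXPERT_PATH_PENDULUM, verbose=0)
    dataset_from_memory = ExpertDataset(traj_data=np.load(EXPERT_PATH_PENDULUM), verbose=0)
    return dataset_from_path, dataset_from_memory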
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    env = gym.make(args.env)
    train_log_dir = os.path.join(args.train_log_dir,
                                 args.env + '_' + args.expert + '_' + args.policy_type)

    if args.expert == 'PPO':
        expert_model = PPO1(args.policy_type, env, verbose=1, tensorboard_log=train_log_dir)
    else:
        raise NotImplementedError
    expert_model.learn(total_timesteps=args.expert_training_step)

    generate_expert_traj(expert_model, os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=1000, n_episodes=args.expert_episodes)

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir, 'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type, env, dataset, verbose=1, tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, env, num_steps=10000)
    gail_model.save(train_log_dir)
    env.close()
def pre_train(self):
    # Using only one expert trajectory;
    # you can specify `traj_limitation=-1` for using the whole dataset
    dataset = ExpertDataset(expert_path='expert_cartpole.npz', traj_limitation=1, batch_size=128)

    model = PPO2('MlpPolicy', 'CartPole-v1', verbose=1)
    # Pretrain the PPO2 model
    model.pretrain(dataset, n_epochs=1000)

    # As an option, you can train the RL agent afterwards
    # model.learn(int(1e5))

    # Test the pre-trained model
    env = model.get_env()
    obs = env.reset()
    reward_sum = 0.0
    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        env.render()
        if done:
            print(reward_sum)
            reward_sum = 0.0
            obs = env.reset()
    env.close()
def main(env):
    n_actions = env.action_space.shape[0]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Using only one expert trajectory;
    # you can specify `traj_limitation=-1` for using the whole dataset
    file_dir = "/home/vignesh/Thesis_Suture_data/trial2/ambf_data/"
    dataset = ExpertDataset(expert_path=file_dir + 'expert_psm_data.npz',
                            traj_limitation=1, batch_size=32)

    model = DDPG(MlpPolicy, env, gamma=0.95, verbose=1, nb_train_steps=300,
                 nb_rollout_steps=150, param_noise=param_noise, batch_size=128,
                 action_noise=action_noise, random_exploration=0.05,
                 normalize_observations=True,
                 tensorboard_log="./ddpg_dvrk_tensorboard/",
                 observation_range=(-1.5, 1.5))

    model.pretrain(dataset, n_epochs=1000)
    model.save("./gail_robot_env")
def train_agent_with_a2c(load=False):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import SubprocVecEnv
    from stable_baselines import A2C

    # Multiprocess environment (note: the single gym.make() call below overrides it,
    # so only one environment is actually used for training)
    n_cpu = 4
    env = SubprocVecEnv([lambda: gym.make('F16GCAS-v0') for i in range(n_cpu)])
    env = gym.make("F16GCAS-v0")

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs, layers=[128, 128])

    if not load:
        model = A2C(env=env, verbose=1, policy=CustomPolicy)
        # model.learn(total_timesteps=1000000)
        ExpData = ExpertDataset("./lqr_export.npz")
        model.pretrain(ExpData, n_epochs=100)
    else:
        model = A2C.load(ROOT + "/trained_models/TDRL/f16/a2c/128_128", env=env)
        with model.graph.as_default():
            for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi/'):
                print(i)

    return model
def train_agent_with_ddpg(load):
    from stable_baselines.ddpg.policies import FeedForwardPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG

    # Create and wrap the environment
    env = gym.make('F16GCAS-v0')
    env = DummyVecEnv([lambda: env])

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.01) * np.ones(n_actions))

    # Custom MLP policy with two hidden layers of 128 units each
    class CustomPolicy(FeedForwardPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs, layers=[128, 128],
                                               layer_norm=False, feature_extraction="mlp")

    model = DDPG(CustomPolicy, env, verbose=1, action_noise=action_noise)

    if not load:
        ExpData = ExpertDataset("./lqr_export.npz")
        model.pretrain(ExpData, n_epochs=100)
        model.save(ROOT + "/trained_models/TDRL/f16/ddpg/128_128")
    else:
        model = DDPG.load(ROOT + "/trained_models/TDRL/f16/ddpg/128_128",
                          policy=CustomPolicy, env=env)

    return model
def main(exp_traj_fn, rep_as_str, from_scratch):
    env_name = f"zelda-{rep_as_str}-v0"
    log_dir = f'runs/{rep_as_str}'
    kwargs_dict = {'resume': False, 'render': True}

    if rep_as_str == 'wide':
        policy = FullyConvPolicyBigMap
    else:
        policy = CustomPolicyBigMap

    env = make_vec_envs(env_name, rep_as_str, log_dir, n_cpu=1, **kwargs_dict)
    model = PPO2(policy, env, verbose=1, tensorboard_log=f"./runs/{rep_as_str}")
    if not from_scratch:
        # load() is a classmethod that returns a new model instance, so reassign it
        model = PPO2.load(f'models/{rep_as_str}/zelda_{rep_as_str}', env=env)

    dataset = ExpertDataset(expert_path=f'expert_trajectories/{rep_as_str}/{exp_traj_fn}.npz',
                            traj_limitation=-1, batch_size=15)

    start_time = time.process_time()
    model.set_env(env)
    model.pretrain(dataset, n_epochs=15)
    end_time = time.process_time()
    print(f"training took {end_time - start_time} seconds")

    model.save(f'models/{rep_as_str}/zelda_{rep_as_str}')
def train():
    # Load model and record expert trajectories
    env = gym.make('roundabout-v0')
    model = DQN(MlpPolicy, env, verbose=1)
    generate_expert_traj(model, 'expert_roundabout', n_timesteps=1000, n_episodes=10)

    # Data reshaping
    expert_data = dict(np.load('expert_roundabout.npz'))
    print("my keys are:" + str(expert_data.keys()))
    obs = expert_data['obs']
    expert_data['obs'] = obs.reshape(obs.shape[0], -1)  # flatten each observation, keep the sample dimension
    print("my keys are:" + str(expert_data.keys()))
    np.savez('expert_roundabout.npz', **expert_data)  # unpack the dict so the original keys are preserved

    dataset = ExpertDataset(expert_path='expert_roundabout.npz', traj_limitation=10, verbose=1)
    model = GAIL('MlpPolicy', env, dataset, verbose=1)
    model.learn(total_timesteps=1000)
    model.save("gail_roundabout")
    env.close()
    del env
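# Hedged sketch, not part of the original script: a quick check that the re-saved archive
# still exposes the keys generate_expert_traj records and ExpertDataset expects. The key
# names below are the standard stable-baselines ones; adjust if your recording differs.
def check_expert_archive(path='expert_roundabout.npz'):
    data = np.load(path)
    expected = {'actions', 'obs', 'rewards', 'episode_returns', 'episode_starts'}
    missing = expected - set(data.files)
    assert not missing, "archive is missing keys: {}".format(missing)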
def test_behavior_cloning_discrete(tmp_path, model_class):
    dataset = ExpertDataset(expert_path=EXPERT_PATH_DISCRETE, traj_limitation=10,
                            sequential_preprocessing=True, verbose=0)
    model = model_class("MlpPolicy", "CartPole-v1")
    model.pretrain(dataset, n_epochs=10)
    model.save(str(tmp_path / "test-pretrain"))
    del dataset, model
def test_gail(expert_env):
    env_id, expert_path = expert_env
    env = gym.make(env_id)
    dataset = ExpertDataset(expert_path=expert_path, traj_limitation=10,
                            sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy', env, adversary_entcoeff=0.0, lam=0.92, max_kl=0.001,
                 expert_dataset=dataset, hidden_size_adversary=64, verbose=0)
    model.learn(1000)
    model.save("GAIL-{}".format(env_id))
    model = model.load("GAIL-{}".format(env_id), env=env)
    model.learn(1000)

    obs = env.reset()
    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, _, done, _ = env.step(action)
        if done:
            obs = env.reset()

    del dataset, model
def test_gail(tmp_path, expert_env):
    env_id, expert_path, load_from_memory = expert_env
    env = gym.make(env_id)

    traj_data = None
    if load_from_memory:
        traj_data = np.load(expert_path)
        expert_path = None
    dataset = ExpertDataset(traj_data=traj_data, expert_path=expert_path,
                            traj_limitation=10, sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy', env, adversary_entcoeff=0.0, lam=0.92, max_kl=0.001,
                 expert_dataset=dataset, hidden_size_adversary=64, verbose=0)
    model.learn(300)
    model.save(str(tmp_path / "GAIL-{}".format(env_id)))
    model = model.load(str(tmp_path / "GAIL-{}".format(env_id)), env=env)
    model.learn(300)

    evaluate_policy(model, env, n_eval_episodes=5)
    del dataset, model
def imitate(model, expert_path, model_path, learning_rate, n_epochs=1000):
    dataset = ExpertDataset(expert_path=expert_path + '.npz', batch_size=128)
    model.pretrain(dataset, n_epochs=n_epochs, learning_rate=learning_rate)
    model.save(model_path)
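# Hedged usage sketch for imitate() above; the PPO2 model, environment id, and file names
# are illustrative placeholders rather than values taken from the original project.
def example_imitate_call():
    model = PPO2('MlpPolicy', 'Pendulum-v0', verbose=0)
    imitate(model, expert_path='expert_pendulum', model_path='pretrained_pendulum',
            learning_rate=1e-4, n_epochs=500)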
def train(env, implemented_combos, model_logdir, arg_dict, pretrained_model=None):
    model_name = arg_dict["algo"] + '_' + str(arg_dict["steps"])
    conf_pth = os.path.join(model_logdir, "train.json")
    model_path = os.path.join(model_logdir, "best_model.zip")
    arg_dict["model_path"] = model_path
    with open(conf_pth, "w") as f:
        json.dump(arg_dict, f, indent=4)

    model_args = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][1]
    model_kwargs = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][2]
    if pretrained_model:
        if not os.path.isabs(pretrained_model):
            pretrained_model = pkg_resources.resource_filename("myGym", pretrained_model)
        env = model_args[1]
        vec_env = DummyVecEnv([lambda: env])
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0].load(pretrained_model, vec_env)
    else:
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0](*model_args, **model_kwargs)

    if arg_dict["algo"] == "gail":
        # Multi processing: (using MPI)
        if arg_dict["train_framework"] == 'tensorflow':
            # Generate expert trajectories (train expert)
            generate_expert_traj(model, model_name, n_timesteps=3000, n_episodes=100)
            # Load the expert dataset
            dataset = ExpertDataset(expert_path=model_name + '.npz', traj_limitation=10, verbose=1)
            model = GAIL_T('MlpPolicy', model_name, dataset, verbose=1)
            # Note: in practice, you need to train for 1M steps to have a working policy

    start_time = time.time()
    callbacks_list = []
    if pretrained_model:
        model_logdir = pretrained_model.split('/')
        model_logdir = model_logdir[:-1]
        model_logdir = "/".join(model_logdir)
        auto_save_callback = SaveOnBestTrainingRewardCallback(check_freq=1024, logdir=model_logdir,
                                                              env=env, engine=arg_dict["engine"],
                                                              multiprocessing=arg_dict["multiprocessing"])
    else:
        auto_save_callback = SaveOnBestTrainingRewardCallback(check_freq=1024, logdir=model_logdir,
                                                              env=env, engine=arg_dict["engine"],
                                                              multiprocessing=arg_dict["multiprocessing"])
    callbacks_list.append(auto_save_callback)

    if arg_dict["eval_freq"]:
        eval_env = configure_env(arg_dict, model_logdir, for_train=False)
        eval_callback = CustomEvalCallback(eval_env, log_path=model_logdir,
                                           eval_freq=arg_dict["eval_freq"],
                                           n_eval_episodes=arg_dict["eval_episodes"],
                                           record=arg_dict["record"],
                                           camera_id=arg_dict["camera"])
        callbacks_list.append(eval_callback)
    # callbacks_list.append(PlottingCallback(model_logdir))

    with ProgressBarManager(total_timesteps=arg_dict["steps"]) as progress_callback:
        callbacks_list.append(progress_callback)
        model.learn(total_timesteps=arg_dict["steps"], callback=callbacks_list)
    model.save(os.path.join(model_logdir, model_name))
    print("Training time: {:.2f} s".format(time.time() - start_time))

    # The info_keywords in the Monitor class above are necessary for pybullet to save_results;
    # when using info_keywords with mujoco we get an error
    if arg_dict["engine"] == "pybullet":
        save_results(arg_dict, model_name, env, model_logdir)
    return model
def test_gail_callback(tmp_path):
    dataset = ExpertDataset(expert_path=EXPERT_PATH_PENDULUM, traj_limitation=10,
                            sequential_preprocessing=True, verbose=0)
    model = GAIL("MlpPolicy", "Pendulum-v0", dataset)
    checkpoint_callback = CheckpointCallback(save_freq=500,
                                             save_path=str(tmp_path / 'logs/gail/'),
                                             name_prefix='gail')
    model.learn(total_timesteps=1000, callback=checkpoint_callback)
    shutil.rmtree(str(tmp_path / 'logs/gail/'))
    del dataset, model
def test_pretrain_twice(tmp_path):
    """Test pretraining twice in the same execution."""
    dataset = ExpertDataset(expert_path=EXPERT_PATH_PENDULUM, traj_limitation=10,
                            sequential_preprocessing=True, verbose=0)
    model = PPO2("MlpPolicy", "Pendulum-v0")
    model.pretrain(dataset, n_epochs=5)
    model.pretrain(dataset, n_epochs=5)
    del dataset, model
def test_behavior_cloning_box(tmp_path, model_class):
    """Behavior cloning with continuous actions."""
    dataset = ExpertDataset(expert_path=EXPERT_PATH_PENDULUM, traj_limitation=10,
                            sequential_preprocessing=True, verbose=0)
    model = model_class("MlpPolicy", "Pendulum-v0")
    model.pretrain(dataset, n_epochs=20)
    model.save(str(tmp_path / "test-pretrain"))
    del dataset, model
def train_bc_agent(model_save_dir, bc_params, num_epochs=1000, lr=1e-4, adam_eps=1e-8):
    # Extract necessary expert data and save in the right format
    expert_trajs = get_trajs_from_data(**bc_params["data_params"])
    save_npz_file(expert_trajs, "temp.npz")

    # Load the expert dataset
    dataset = ExpertDataset(expert_path="temp.npz", verbose=1, train_fraction=0.85)
    assert dataset is not None
    assert dataset.train_loader is not None

    return bc_from_dataset_and_params(dataset, bc_params, model_save_dir, num_epochs, lr, adam_eps)
def train(params):
    # Create model and environment
    env = FlattenObservation(gym.make(params.get("environment")))
    exp_name = params.get("model_name") + "_train_" + params.get("environment")
    log_dir = './logs/' + exp_name
    expert_name = 'expert_{0}'.format(exp_name)

    if params.get("model_name") == 'TRPO':
        print("Loading TRPO Model")
        model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("model_name") == 'PPO':
        print("Loading PPO Model")
        model = PPO1(MlpPolicy, env, verbose=1, tensorboard_log=log_dir,
                     entcoeff=params.get("ent_coef"), gamma=params.get("gamma"),
                     optim_batchsize=params.get("batch_size"),
                     clip_param=params.get("clip_range"), lam=params.get("gae_lambda"))
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("expert_exists") is False:
        print("Training expert trajectories")
        # Train expert controller (if needed) and record expert trajectories.
        generate_expert_traj(model, expert_name,
                             n_timesteps=params.get("expert_timesteps"),
                             n_episodes=params.get("n_episodes"))

    dataset = ExpertDataset(expert_path='{0}.npz'.format(expert_name),
                            traj_limitation=-1,
                            randomize=True,  # if the dataset should be shuffled
                            verbose=1)

    model = GAIL('MlpPolicy', env, dataset, verbose=1, tensorboard_log=log_dir)  # Check out for defaults

    if params.get("pre_train") is True:
        print("Pretraining Dataset with Behavioural Cloning")
        model.pretrain(dataset, n_epochs=10000)

    print("Executing GAIL Learning")
    model.learn(total_timesteps=params.get("train_steps"))
    model.save("BC" + exp_name)
    env.close()
    del env
def _pretrain(self):
    if self.config['meta'].get('pretrain', None):
        logging.info("Starting pretraining.")
        pretrain_config = copy.deepcopy(self.config['meta']['pretrain'])
        archive_location = pretrain_config.get('expert_path')
        n_epochs = pretrain_config.pop('n_epochs', 1000)
        assert os.path.exists(archive_location), \
            "Could not find archive with pretraining data at {}".format(archive_location)
        dataset = ExpertDataset(**pretrain_config)
        self.agent.pretrain(dataset, n_epochs=n_epochs)
def get_expert_dataset(expert, venv, total_timesteps):
    filename = f"/tmp/{uuid.uuid4()}"
    n_episodes = total_timesteps // get_horizon(venv)
    generate_expert_traj(expert, save_path=filename, env=venv, n_episodes=n_episodes)
    dataset = ExpertDataset(expert_path=f"{filename}.npz", verbose=0)
    return dataset
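# Hedged usage sketch for get_expert_dataset(); `expert_model`, `venv`, and `student_model`
# stand in for a trained policy, a vectorized environment, and the model being pretrained
# in the surrounding project, and the pretrain call mirrors how datasets are consumed in
# the other snippets here.
# dataset = get_expert_dataset(expert_model, venv, total_timesteps=10000)
# student_model.pretrain(dataset, n_epochs=100)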
def train_gail_withppo2():
    env = gimbal(5, 500)
    env = DummyVecEnv([lambda: env])
    model = PPO2.load("./models/baseline_ppo2_t1")
    generate_expert_traj(model, './models/baseline_expert_t1', env,
                         n_timesteps=0, n_episodes=100)
    dataset = ExpertDataset(expert_path='./models/baseline_expert_t1.npz',
                            traj_limitation=-1, verbose=1)
    model = GAIL("MlpPolicy", env, dataset, verbose=1)
    model.learn(total_timesteps=500000)
    model.save("./models/baseline_gail_ppo2_t1")
def test_pretrain_images(tmp_path):
    env = make_atari_env("PongNoFrameskip-v4", num_env=1, seed=0)
    env = VecFrameStack(env, n_stack=4)
    model = PPO2('CnnPolicy', env)
    generate_expert_traj(model, str(tmp_path / 'expert_pong'), n_timesteps=0, n_episodes=1,
                         image_folder=str(tmp_path / 'pretrain_recorded_images'))

    expert_path = str(tmp_path / 'expert_pong.npz')
    dataset = ExpertDataset(expert_path=expert_path, traj_limitation=1, batch_size=32,
                            sequential_preprocessing=True)
    model.pretrain(dataset, n_epochs=2)

    shutil.rmtree(str(tmp_path / 'pretrain_recorded_images'))
    env.close()
    del dataset, model, env
def pre_train(self, num_e=1, load="saves/m19"):
    env_id = 'default'
    num_e = 1
    log_dir = "saves"

    # Using only one expert trajectory;
    # you can specify `traj_limitation=-1` for using the whole dataset
    dataset = ExpertDataset(expert_path='default2.npz', traj_limitation=1, batch_size=128)

    self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
    # env = Template_Gym()
    # self.env = DummyVecEnv([lambda: env])
    # self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
    # model = GAIL("MlpPolicy", env=env, expert_dataset=dataset, verbose=1)
    # self.env.save_running_average("saves" + self.config.pair)

    self.model = PPO2(MlpPolicy, self.env, verbose=1, nminibatches=1,
                      learning_rate=1e-5, tensorboard_log="./m1ln4")
    # self.model = PPO2.load("saves/m19", self.env, policy=CustomPolicy, tensorboard_log="./default/")
    # self.env.save_running_average("saves" + self.config.pair)

    # Pretrain the PPO2 model
    self.model.pretrain(dataset, n_epochs=10000)

    # As an option, you can train the RL agent afterwards
    # self.model.learn(int(100000000))

    # Test the pre-trained model
    self.env = self.model.get_env()
    obs = self.env.reset()
    reward_sum = 0.0
    for _ in range(11):
        action, _ = self.model.predict(obs)
        obs, reward, done, _ = self.env.step(action)
        reward_sum += reward
        # self.env.render()
        if done:
            print(reward_sum)
            reward_sum = 0.0
            obs = self.env.reset()
    self.env.close()
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    # Train the expert model multiple times and keep the best one
    best_reward = -np.inf
    train_env = make_vec_env(args.env, n_envs=args.n_env)
    eval_env = gym.make(args.env)

    for i in range(args.times_expert):
        train_env.reset()
        train_log_dir = os.path.join(args.train_log_dir, args.env + '_' + args.expert)
        if args.expert == 'PPO':
            expert_model = PPO2(args.policy_type, env=train_env, n_steps=args.n_steps,
                                nminibatches=args.nminibatches, noptepochs=args.noptepochs,
                                ent_coef=args.ent_coef, lam=args.lam, gamma=args.gamma,
                                cliprange=args.cliprange, learning_rate=args.learning_rate,
                                verbose=1, tensorboard_log=train_log_dir)
        else:
            raise NotImplementedError
        expert_model.learn(total_timesteps=args.expert_training_step)
        mean_reward = evaluate(expert_model, eval_env, num_steps=10000)
        if mean_reward > best_reward:
            best_reward = mean_reward
            expert_model.save(os.path.join(args.train_log_dir, args.env + '_expert'))
        del expert_model

    train_env.reset()
    expert_model = PPO2.load(os.path.join(args.train_log_dir, args.env + '_expert'), env=train_env)
    generate_expert_traj(expert_model, os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=-1, n_episodes=args.expert_episodes)
    train_env.close()

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir, 'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type, args.env, dataset, verbose=1, tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, eval_env, num_steps=10000)
    gail_model.save(os.path.join(args.train_log_dir, args.env + '_GAIL'))
    eval_env.close()
def run_gail():
    parser = argparse.ArgumentParser()
    parser.add_argument('expert', type=str, default=None, help='Expert path (*.npz)')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--note', type=str, default='test')
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--num-steps', type=int, default=1000000)
    parser.add_argument('--policy', type=str, default='CnnPolicy',
                        choices=['CnnPolicy', 'CnnLstmPolicy', 'CnnLnLstmPolicy',
                                 'MlpPolicy', 'MlpLstmPolicy', 'MlpLnLstmPolicy'],
                        help='Policy architecture')
    args = parser.parse_args()

    logger.configure(os.path.join('logs', args.env, args.note))
    logger.info(args)

    if 'NoFrameskip' in args.env:
        env = VecFrameStack(make_atari_env(args.env, 1, args.seed), 4)
    else:
        import gym
        env = gym.make(args.env)

    dataset = ExpertDataset(expert_path=args.expert, batch_size=128,
                            train_fraction=0.99, verbose=1)
    model = GAIL(args.policy, env, dataset, timesteps_per_batch=1280, verbose=1)
    model.learn(len(dataset.train_loader) * 1280)
def main():
    env = gym.make("BowlingNoFrameskip-v0")
    env = MaxAndSkipEnv(env, skip=4)
    env = WarpFrame(env)
    env = DummyVecEnv([lambda: env])

    dataset = ExpertDataset(expert_path="bowling_demo.npz", verbose=1)
    model = PPO2("CnnPolicy", env, verbose=1)
    model.pretrain(dataset, n_epochs=1000)
    model.save("bowling_model")

    state = env.reset()
    total_reward = 0
    while True:
        env.render()
        time.sleep(1 / 60)
        action, _ = model.predict(state)
        state, reward, done, info = env.step(action)
        total_reward += reward[0]
        if done:
            print(total_reward)
            state = env.reset()
            total_reward = 0
def train_agent_with_gail(load):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines import GAIL

    env = gym.make("F16GCAS-v0")

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs, layers=[128, 128])

    if not load:
        ExpData = ExpertDataset("./lqr_export.npz")
        model = GAIL(CustomPolicy, env, ExpData, verbose=1)
        model.learn(total_timesteps=1000000)
        model.save(ROOT + "/trained_models/TDRL/f16/gail/128_128")
    else:
        # with model.graph.as_default():
        #     for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi/'):
        #         print(i)
        model = GAIL.load(ROOT + "/trained_models/TDRL/f16/gail/128_128", env=env)
        with model.graph.as_default():
            print(tf.all_variables())

    return model
def main():
    global save_path, log_dir, model, best_mean_reward

    mk_dir(args.checkpoint_dir + args.policy)
    save_path = args.checkpoint_dir + args.policy + "/" + args.policy
    log_dir = args.summary_dir + args.policy
    mk_dir(log_dir)

    env = gym.make("SegmentationEnv-v0",
                   objs_dir=args.objs_dir,
                   max_scenes=args.max_scenes,
                   sample_size=args.sample_size,
                   diff_punishment=args.diff_punishment,
                   max_steps_per_scene=args.max_steps_per_scene,
                   scene_mode=args.scene_mode,
                   point_mode=args.point_mode,
                   voxel_size=args.voxel_size,
                   voxel_mode=args.voxel_mode,
                   single_scenes=args.single_scenes,
                   early_diff=args.early_diff,
                   wall_weight=args.wall_weight)

    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])  # the algorithms require a vectorized environment to run
    env = VecCheckNan(env, raise_exception=True)

    net_module = importlib.import_module(args.policy)
    model = PPO2(net_module.Policy, env, verbose=args.verbose, tensorboard_log=log_dir,
                 learning_rate=args.learning_rate, ent_coef=args.ent_coef,
                 cliprange=args.cliprange, cliprange_vf=args.cliprange_vf,
                 lam=args.lam, gamma=args.gamma, seed=args.seed,
                 n_cpu_tf_sess=args.n_cpu_tf_sess, noptepochs=args.noptepochs,
                 nminibatches=args.nminibatches, n_steps=args.n_steps,
                 max_grad_norm=args.max_grad_norm)

    if os.path.isfile("expert_trajectories.npz") and args.pretrain == 1:
        print("------------start pretrain------------")
        # dataset = ExpertDataset(expert_path="expert_trajectories.npz", special_shape=True,
        #                         traj_limitation=100, batch_size=16)
        dataset = ExpertDataset(expert_path="expert_trajectories.npz", special_shape=True,
                                train_fraction=args.train_fraction,
                                batch_size=args.pretrain_batch_size)
        # model.pretrain(dataset, learning_rate=0.001, n_epochs=1000)
        model = model.pretrain(dataset, val_interval=1,
                               learning_rate=args.pretrain_learning_rate,
                               n_epochs=args.pretrain_n_epochs)
        print("pretrain finished -- save model")
        model.save(save_path)

    returns = []
    print("Calculate mean reward")
    n_episodes = 10
    for i in range(n_episodes):
        total_reward = 0
        obs = env.reset()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            if done:
                returns.append(total_reward)
                break
    returns = np.array(returns)
    best_mean_reward = np.mean(returns)
    print("Best mean reward: {:.2f}".format(best_mean_reward))

    model.learn(total_timesteps=args.total_timesteps, callback=callback)
    env.close()
# Generate expert trajectory
env_depth, env_width, nlayers = 3, 3, 2

def expert(obs):
    try:
        state = State(env_depth, env_width).load_obs(obs)
        return get_behav(state, weights={'fr': 0.3})
    except NoPathError:
        return np.zeros(env_depth * 2)

# generate_expert_traj(expert, 'expert', Env(env_depth, env_width, nlayers), n_episodes=100)

# Pretrain model
dataset = ExpertDataset(expert_path='expert.npz')
model = SAC('MlpPolicy', Env(env_depth, env_width, nlayers), verbose=1)
model.pretrain(dataset, n_epochs=5000)
model.save('pretrained_sac')

# Test the pre-trained model
env = model.get_env()
obs = env.reset()
reward_sum = 0
i = 0
for j in range(1000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    reward_sum += reward
    i += 1