def test_generate_vec_env_non_image_observation():
    env = DummyVecEnv([lambda: gym.make('CartPole-v1')] * 2)
    model = PPO2('MlpPolicy', env)
    model.learn(total_timesteps=300)
    generate_expert_traj(model, save_path='.', n_timesteps=0, n_episodes=5)
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    env = gym.make(args.env)
    train_log_dir = os.path.join(args.train_log_dir,
                                 args.env + '_' + args.expert + '_' + args.policy_type)
    if args.expert == 'PPO':
        expert_model = PPO1(args.policy_type, env, verbose=1,
                            tensorboard_log=train_log_dir)
    else:
        raise NotImplementedError
    expert_model.learn(total_timesteps=args.expert_training_step)
    generate_expert_traj(expert_model, os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=1000, n_episodes=args.expert_episodes)

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir, 'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type, env, dataset, verbose=1,
                      tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, env, num_steps=10000)
    gail_model.save(train_log_dir)
    env.close()
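# The `evaluate` helper called above (and again in the PPO2 script further down)
# is not shown in these snippets. A minimal sketch of what it could look like,
# assuming it averages per-episode reward over a fixed number of steps in a
# single non-vectorized env (and that numpy is imported as `np`):
def evaluate(model, env, num_steps=10000):
    """Return the mean per-episode reward collected over `num_steps` steps."""
    episode_rewards = [0.0]
    obs = env.reset()
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, reward, done, _info = env.step(action)
        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
    mean_reward = float(np.mean(episode_rewards))
    print("Mean reward over {} steps: {:.1f}".format(num_steps, mean_reward))
    return mean_reward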
def train():
    # Load model
    env = gym.make('roundabout-v0')
    model = DQN(MlpPolicy, env, verbose=1)
    generate_expert_traj(model, 'expert_roundabout', n_timesteps=1000, n_episodes=10)

    # Data augmentation
    expert_data = dict(np.load('expert_roundabout.npz'))
    print("my keys are: " + str(expert_data.keys()))
    obs = expert_data['obs']
    expert_data['obs'] = obs.ravel()  # convert observations to a 1D array
    print("my keys are: " + str(expert_data.keys()))
    # Unpack the dict so each key is stored as a separate array in the archive
    # (np.savez('expert_roundabout.npz', expert_data) would store it as `arr_0`)
    np.savez('expert_roundabout.npz', **expert_data)

    dataset = ExpertDataset(expert_path='expert_roundabout.npz',
                            traj_limitation=10, verbose=1)
    model = GAIL('MlpPolicy', env, dataset, verbose=1)
    model.learn(total_timesteps=1000)
    model.save("gail_roundabout")
    env.close()
    del env
def train(env, implemented_combos, model_logdir, arg_dict, pretrained_model=None):
    model_name = arg_dict["algo"] + '_' + str(arg_dict["steps"])
    conf_pth = os.path.join(model_logdir, "train.json")
    model_path = os.path.join(model_logdir, "best_model.zip")
    arg_dict["model_path"] = model_path
    with open(conf_pth, "w") as f:
        json.dump(arg_dict, f, indent=4)

    model_args = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][1]
    model_kwargs = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][2]
    if pretrained_model:
        if not os.path.isabs(pretrained_model):
            pretrained_model = pkg_resources.resource_filename("myGym", pretrained_model)
        env = model_args[1]
        vec_env = DummyVecEnv([lambda: env])
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0].load(pretrained_model, vec_env)
    else:
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0](*model_args, **model_kwargs)

    if arg_dict["algo"] == "gail":
        # Multiprocessing (using MPI)
        if arg_dict["train_framework"] == 'tensorflow':
            # Generate expert trajectories (train the expert)
            generate_expert_traj(model, model_name, n_timesteps=3000, n_episodes=100)
            # Load the expert dataset
            dataset = ExpertDataset(expert_path=model_name + '.npz', traj_limitation=10, verbose=1)
            model = GAIL_T('MlpPolicy', model_name, dataset, verbose=1)
            # Note: in practice, you need to train for 1M steps to have a working policy

    start_time = time.time()
    callbacks_list = []
    if pretrained_model:
        # Save next to the pretrained model rather than in the fresh logdir
        model_logdir = os.path.dirname(pretrained_model)
    auto_save_callback = SaveOnBestTrainingRewardCallback(check_freq=1024, logdir=model_logdir,
                                                          env=env, engine=arg_dict["engine"],
                                                          multiprocessing=arg_dict["multiprocessing"])
    callbacks_list.append(auto_save_callback)
    if arg_dict["eval_freq"]:
        eval_env = configure_env(arg_dict, model_logdir, for_train=False)
        eval_callback = CustomEvalCallback(eval_env, log_path=model_logdir,
                                           eval_freq=arg_dict["eval_freq"],
                                           n_eval_episodes=arg_dict["eval_episodes"],
                                           record=arg_dict["record"],
                                           camera_id=arg_dict["camera"])
        callbacks_list.append(eval_callback)
    # callbacks_list.append(PlottingCallback(model_logdir))
    with ProgressBarManager(total_timesteps=arg_dict["steps"]) as progress_callback:
        callbacks_list.append(progress_callback)
        model.learn(total_timesteps=arg_dict["steps"], callback=callbacks_list)
    model.save(os.path.join(model_logdir, model_name))
    print("Training time: {:.2f} s".format(time.time() - start_time))

    # The info_keywords in the Monitor class above are necessary for pybullet to
    # save_results; using the same info_keywords with mujoco raises an error.
    if arg_dict["engine"] == "pybullet":
        save_results(arg_dict, model_name, env, model_logdir)
    return model
def lecture(self):
    teacher = DummyExpert()
    # teacher = NormalizeActionWrapper(teacher)
    print("Let me show you how it's done.")
    # `teacher.teach` is a callable expert, so the env must be passed explicitly
    generate_expert_traj(teacher.teach, 'dummy_expert_rocket', self.env, n_episodes=10)
def test_generate_callable(tmp_path):
    """
    Test generating expert trajectories with a callable.
    """
    env = gym.make("CartPole-v1")

    # Here the expert is a random agent
    def dummy_expert(_obs):
        return env.action_space.sample()

    generate_expert_traj(dummy_expert, tmp_path / 'dummy_expert_cartpole', env,
                         n_timesteps=0, n_episodes=10)
def train(params):
    # Create the model
    env = FlattenObservation(gym.make(params.get("environment")))
    exp_name = params.get("model_name") + "_train_" + params.get("environment")
    log_dir = './logs/' + exp_name
    expert_name = 'expert_{0}'.format(exp_name)

    if params.get("model_name") == 'TRPO':
        print("Loading TRPO Model")
        model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("model_name") == 'PPO':
        print("Loading PPO Model")
        model = PPO1(MlpPolicy, env, verbose=1, tensorboard_log=log_dir,
                     entcoeff=params.get("ent_coef"), gamma=params.get("gamma"),
                     optim_batchsize=params.get("batch_size"),
                     clip_param=params.get("clip_range"),
                     lam=params.get("gae_lambda"))
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("expert_exists") is False:
        print("Training expert trajectories")
        # Train the expert controller (if needed) and record expert trajectories
        generate_expert_traj(model, expert_name,
                             n_timesteps=params.get("expert_timesteps"),
                             n_episodes=params.get("n_episodes"))

    dataset = ExpertDataset(expert_path='{0}.npz'.format(expert_name),
                            traj_limitation=-1,
                            randomize=True,  # whether the dataset should be shuffled
                            verbose=1)

    # See the GAIL defaults for the remaining hyperparameters
    model = GAIL('MlpPolicy', env, dataset, verbose=1, tensorboard_log=log_dir)
    if params.get("pre_train") is True:
        print("Pretraining Dataset with Behavioural Cloning")
        model.pretrain(dataset, n_epochs=10000)

    print("Executing GAIL Learning")
    model.learn(total_timesteps=params.get("train_steps"))
    model.save("BC" + exp_name)
    env.close()
    del env
def generate_obs(environment, record_path, n_episodes):
    global env, model
    env = environment
    print('Starting record...')
    # model = get_existing_model(os.path.join('models', 'Self6hr_human50_self114hr'))
    # generate_expert_traj(acer_expert, record_path, env, n_episodes=n_episodes)
    generate_expert_traj(human_expert, record_path, env, n_episodes=n_episodes)
    print(
        f'Recording of {n_episodes} episodes complete. Saved file to {record_path}.npz'
    )
def test_generate(generate_env):
    model, policy, env_name, n_env, n_episodes = generate_env

    if n_env > 1:
        env = make_atari_env(env_name, num_env=n_env, seed=0)
        model = model(policy, env, verbose=0)
    else:
        model = model(policy, env_name, verbose=0)

    generate_expert_traj(model, 'expert', n_timesteps=1000, n_episodes=n_episodes,
                         image_folder='test_recorded_images')
def test_pretrain_images(tmp_path):
    env = make_atari_env("PongNoFrameskip-v4", num_env=1, seed=0)
    env = VecFrameStack(env, n_stack=4)
    model = PPO2('CnnPolicy', env)
    generate_expert_traj(model, str(tmp_path / 'expert_pong'), n_timesteps=0,
                         n_episodes=1,
                         image_folder=str(tmp_path / 'pretrain_recorded_images'))

    expert_path = str(tmp_path / 'expert_pong.npz')
    dataset = ExpertDataset(expert_path=expert_path, traj_limitation=1,
                            batch_size=32, sequential_preprocessing=True)
    model.pretrain(dataset, n_epochs=2)

    shutil.rmtree(str(tmp_path / 'pretrain_recorded_images'))
    env.close()
    del dataset, model, env
def get_expert_dataset(
    expert,
    venv,
    total_timesteps,
):
    filename = f"/tmp/{uuid.uuid4()}"
    n_episodes = total_timesteps // get_horizon(venv)
    generate_expert_traj(expert, save_path=filename, env=venv, n_episodes=n_episodes)
    dataset = ExpertDataset(expert_path=f"{filename}.npz", verbose=0)
    return dataset
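# `get_horizon` is not defined in this snippet. A minimal sketch, assuming the
# vectorized env wraps fixed-horizon episodes whose length is exposed via the
# underlying gym spec (hypothetical helper, not part of stable-baselines):
def get_horizon(venv):
    """Episode length of a fixed-horizon, DummyVecEnv-style vectorized env."""
    return venv.envs[0].spec.max_episode_steps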
def train_gail_withppo2():
    env = gimbal(5, 500)
    env = DummyVecEnv([lambda: env])
    model = PPO2.load("./models/baseline_ppo2_t1")
    generate_expert_traj(model, './models/baseline_expert_t1', env,
                         n_timesteps=0, n_episodes=100)
    dataset = ExpertDataset(expert_path='./models/baseline_expert_t1.npz',
                            traj_limitation=-1, verbose=1)
    model = GAIL("MlpPolicy", env, dataset, verbose=1)
    model.learn(total_timesteps=500000)
    model.save("./models/baseline_gail_ppo2_t1")
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    # Train the expert model several times and keep only the best one
    best_reward = -np.inf
    train_env = make_vec_env(args.env, n_envs=args.n_env)
    eval_env = gym.make(args.env)
    train_log_dir = os.path.join(args.train_log_dir, args.env + '_' + args.expert)
    for i in range(args.times_expert):
        train_env.reset()
        if args.expert == 'PPO':
            expert_model = PPO2(args.policy_type, env=train_env, n_steps=args.n_steps,
                                nminibatches=args.nminibatches, noptepochs=args.noptepochs,
                                ent_coef=args.ent_coef, lam=args.lam, gamma=args.gamma,
                                cliprange=args.cliprange, learning_rate=args.learning_rate,
                                verbose=1, tensorboard_log=train_log_dir)
        else:
            raise NotImplementedError
        expert_model.learn(total_timesteps=args.expert_training_step)
        mean_reward = evaluate(expert_model, eval_env, num_steps=10000)
        if mean_reward > best_reward:
            best_reward = mean_reward
            expert_model.save(os.path.join(args.train_log_dir, args.env + '_expert'))
        del expert_model

    train_env.reset()
    expert_model = PPO2.load(os.path.join(args.train_log_dir, args.env + '_expert'),
                             env=train_env)
    generate_expert_traj(expert_model, os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=-1, n_episodes=args.expert_episodes)
    train_env.close()

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir, 'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type, args.env, dataset, verbose=1,
                      tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, eval_env, num_steps=10000)
    gail_model.save(os.path.join(args.train_log_dir, args.env + '_GAIL'))
    eval_env.close()
def test_generate(generate_env):
    model, policy, env_name, n_env, n_episodes = generate_env

    if n_env > 1:
        env = make_atari_env(env_name, num_env=n_env, seed=0)
        model = model(policy, env, verbose=0)
    else:
        model = model(policy, env_name, verbose=0)

    dataset = generate_expert_traj(model, 'expert', n_timesteps=1000,
                                   n_episodes=n_episodes,
                                   image_folder='test_recorded_images')

    assert set(dataset.keys()).issuperset(
        ['actions', 'obs', 'rewards', 'episode_returns', 'episode_starts'])
    assert sum(dataset['episode_starts']) == n_episodes
    assert len(dataset['episode_returns']) == n_episodes

    n_timesteps = len(dataset['episode_starts'])
    for key, val in dataset.items():
        if key != 'episode_returns':
            assert val.shape[0] == n_timesteps, \
                "inconsistent number of timesteps at '{}'".format(key)

    dataset_loaded = np.load('expert.npz')
    assert dataset.keys() == dataset_loaded.keys()
    for key in dataset.keys():
        assert (dataset[key] == dataset_loaded[key]).all(), \
            "different data at '{}'".format(key)
def test_generate(tmp_path, generate_env):
    model, policy, env_name, n_env, n_episodes = generate_env

    if n_env > 1:
        env = make_atari_env(env_name, num_env=n_env, seed=0)
        model = model(policy, env, verbose=0)
    else:
        model = model(policy, env_name, verbose=0)

    dataset = generate_expert_traj(model, str(tmp_path / 'expert'),
                                   n_timesteps=300, n_episodes=n_episodes,
                                   image_folder=str(tmp_path / 'test_recorded_images'))

    assert set(dataset.keys()).issuperset(
        ['actions', 'obs', 'rewards', 'episode_returns', 'episode_starts'])
    assert sum(dataset['episode_starts']) == n_episodes
    assert len(dataset['episode_returns']) == n_episodes

    n_timesteps = len(dataset['episode_starts'])
    for key, val in dataset.items():
        if key != 'episode_returns':
            assert val.shape[0] == n_timesteps, \
                "inconsistent number of timesteps at '{}'".format(key)

    dataset_loaded = np.load(str(tmp_path / 'expert.npz'), allow_pickle=True)
    assert dataset.keys() == dataset_loaded.keys()
    for key in dataset.keys():
        assert (dataset[key] == dataset_loaded[key]).all(), \
            "different data at '{}'".format(key)

    # Clean up the recorded-images folder
    if os.path.isdir(str(tmp_path / 'test_recorded_images')):
        shutil.rmtree(str(tmp_path / 'test_recorded_images'))
def gen_pre_train(self, num_e=1, save='default2', episodes=1000):
    env_id = 'default'
    num_e = 1
    self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
    # Automatically normalize the input features and reward
    self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
    self.env.load_running_average("saves")
    self.model = PPO2.load("saves/m19", self.env, policy=CustomPolicy,
                           tensorboard_log="./default/")
    self.env.load_running_average("saves")
    # self.expert_agent = generate_expert_traj(self.model, save, self.env,
    #                                          n_episodes=episodes)
def generate_pretrain_data(args):
    env = gym.make(args.env, n_particles=args.n_particles)
    env_copy = gym.make(args.env, n_particles=args.n_particles)
    env.seed(args.seed)
    env.reset()
    env_copy.seed(args.seed)
    env_copy = GymMazeWrapper(env_copy, render=args.render)

    pre_alg = MoveToRandomCornerAlgorithm(env_copy)
    alg = ALGORITHMS[args.algorithm](env_copy)
    target_alg = TargetPointMoverAlgorithm(env_copy, tuple(env_copy.get_goal()))
    env = ReplayWrapper(env, env_copy, [pre_alg], alg, target_alg,
                        downscale=True, frame_stack=True)

    generate_expert_traj(env.next, args.generate_pretrain_data, env=env, n_episodes=2)
def test_generate_cartpole():
    model = DQN('MlpPolicy', 'CartPole-v1', verbose=0)
    generate_expert_traj(model, 'expert_cartpole', n_timesteps=1000, n_episodes=10)
global_buy_counter = 0
global_sell_counter = 0
global_last_action = 0

# The algorithms require a vectorized environment to run.
# Data will be saved in a numpy archive named `expert_trader_ORG_<symbol>.npz`.
# When using something different than an RL expert,
# you must pass the environment object explicitly.
# Automatically normalize the input features and reward:
# VecEnv = DummyVecEnv([lambda: create_trade_env(data, symbol)])
# VecEnv = VecNormalize(VecEnv, norm_obs=True, norm_reward=True, clip_obs=10.)
generate_expert_traj(expert_trader, 'expert_trader_ORG_' + symbol, env, n_episodes=10)

# %% [markdown]
# # Read Recording Set

# %%
# Pre-train a model using behavior cloning.
# Using only a few expert trajectories here;
# you can specify `traj_limitation=-1` to use the whole dataset.
dataset = ExpertDataset(expert_path='expert_trader_ORG_' + symbol + '.npz',
                        traj_limitation=10, batch_size=64, randomize=False)
dataset.plot()
from stable_baselines.gail import generate_expert_traj, ExpertDataset
from stable_baselines import PPO2
import time
import numpy as np

# THIS SECTION IS FOR GENERATING EXPERT TRAJECTORIES
kwargs_dict = {'resume': False, 'render': False}
log_dir = 'runs/wide'
env_name = "zelda-wide-v0"
policy = FullyConvPolicyBigMap
env = make_vec_envs(env_name, "wide", log_dir, n_cpu=1, **kwargs_dict)
model = PPO2(policy, env, verbose=1, tensorboard_log="./runs/wide")

# n_timesteps=0: do not train, just record one episode from the untrained model
a_dict = generate_expert_traj(model, 'expert_wide', n_timesteps=int(0), n_episodes=1)
print(a_dict)

numpy_dict = np.load('expert_wide.npz')
print(type(numpy_dict))
print(list(numpy_dict.keys()))  # ['actions', 'obs', 'rewards', 'episode_returns', 'episode_starts']

print("ACTIONS")
print("=============================")
print(numpy_dict['actions'])
print(numpy_dict['actions'].shape)
print("=============================")
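# The flat arrays in the archive can be split back into episodes via
# `episode_starts` (True at every step that begins a new episode) -- a small
# sketch of how to recover per-episode action sequences from the arrays above:
starts = np.flatnonzero(numpy_dict['episode_starts'])
episodes = np.split(numpy_dict['actions'], starts[1:])
print(len(episodes), "episodes recorded")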
from stable_baselines import DQN
from stable_baselines.gail import generate_expert_traj
import gym
import highway_env

model = DQN('MlpPolicy', 'overtaking-v0', verbose=1)
# Train a DQN agent for 1e5 timesteps and generate 10 trajectories;
# data will be saved in a numpy archive named `expert_overtaking.npz`
generate_expert_traj(model, 'expert_overtaking', n_timesteps=int(1e5), n_episodes=10)
def test_generate_pendulum():
    model = SAC('MlpPolicy', 'Pendulum-v0', verbose=0)
    generate_expert_traj(model, 'expert_pendulum', n_timesteps=1000, n_episodes=10)
timestep_per_epoch = int(1e5)
expert_n_episodes = 100

############################################

if __name__ == "__main__":
    if not os.path.exists(save_name):
        os.makedirs(save_name)

    # Generate expert trajectories (train the expert)
    print("\n...Generate expert trajectories\n")
    env = PrticleEnv(alpha=1, beta=10, win_thre=1, max_timestep=256)
    model = PPO1.load("model/part_circle_exp2_epoch05_sib.zip")
    model.set_env(env)
    generate_expert_traj(model, 'expert_part_circle_exp2_epoch05_sib',
                         n_episodes=expert_n_episodes)
    print("...finish\n")

    # Load the expert dataset
    print("\n...Load the expert dataset\n")
    dataset = ExpertDataset(expert_path='expert_part_circle_exp2_epoch05_sib.npz',
                            traj_limitation=-1, verbose=1)
    print("...finish\n")

    model = GAIL('MlpPolicy',
                 DummyVecEnv([lambda: PrticleEnv(alpha=1, beta=10, win_thre=1,
                                                 max_timestep=256)]),
                 dataset, tensorboard_log=save_name, verbose=0, n_cpu_tf_sess=None)
def generate():
    model = LQRModel()
    generate_expert_traj(model, save_path="./lqr_export.npz", env=None,
                         n_timesteps=0, n_episodes=10)
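# Note: in stable-baselines' recorder, env=None only works when the model is a
# BaseRLModel that carries its own env; a plain callable expert needs an
# explicit env. A hypothetical callable wrapper around an LQR gain matrix
# (illustrative names, not from the snippet above):
import numpy as np

K = np.array([[1.0, 2.0]])  # hypothetical LQR gain, shape (act_dim, obs_dim)

def lqr_expert(obs):
    """Callable expert: linear state feedback u = -K x."""
    return -K @ obs

# generate_expert_traj(lqr_expert, save_path="./lqr_export", env=some_env, n_episodes=10)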
from mycart import MyCartPoleEnv
from mycartCont import MyCartContEnv
from stable_baselines.gail import generate_expert_traj
import numpy as np

env = MyCartContEnv()

# Here the expert is a state-feedback controller,
# but it can be any Python function that maps an observation to an action
def dummy_expert(_obs):
    x, x_dot, theta, theta_dot = _obs
    K1 = -50
    K2 = -5
    K3 = -4
    K4 = -2
    action = [-K1 * theta - K2 * theta_dot - K3 * (x - env.xref) - K4 * x_dot]
    return action

# Data will be saved in a numpy archive named `dummy_expert_cartpole.npz`.
# When using something different than an RL expert,
# you must pass the environment object explicitly.
generate_expert_traj(dummy_expert, 'dummy_expert_cartpole', env, n_episodes=100)
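# A natural follow-up (not part of the snippet above) is to pretrain a policy
# on the recorded controller data with behavior cloning. A minimal sketch using
# the stable-baselines ExpertDataset / pretrain API:
from stable_baselines import PPO2
from stable_baselines.gail import ExpertDataset

# traj_limitation=-1 uses all recorded trajectories
dataset = ExpertDataset(expert_path='dummy_expert_cartpole.npz',
                        traj_limitation=-1, batch_size=64)
model = PPO2('MlpPolicy', env, verbose=1)
# Behavior cloning: supervised learning on the recorded (obs, action) pairs
model.pretrain(dataset, n_epochs=1000)
model.save('bc_cartpole_from_controller')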
    degToRad(-90), degToRad(-90)
]

actions = [
    mov0, mov1, mov2, mov3, mov4, mov5,
    mov6, mov7, mov8, mov9, mov10, mov11
]
actions = [np.array(mov) for mov in actions]


def dummy_expert(_obs):
    global state
    global actions
    state += 1
    if state == 11:
        state = 0
    # TODO add noise to each angle independently (see the sketch below);
    # for now, +-0.1 rad of noise shared across all angles
    return actions[state] + round(0.2 * np.random.random_sample() - 0.1, 2)


env = gym.make('gym_quadruped:quadruped-v0', visualize=False)

# Data will be saved in a numpy archive named `dummy_quadruped.npz`.
# When using something different than an RL expert,
# you must pass the environment object explicitly.
generate_expert_traj(dummy_expert, './pretrain/dummy_quadruped', env, n_episodes=200)
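# The TODO above asks for independent per-angle noise. A sketch, assuming each
# action is a numpy array of joint angles in radians:
def add_joint_noise(action, scale=0.1):
    """Add independent uniform noise in [-scale, scale] rad to each joint angle."""
    return action + np.random.uniform(-scale, scale, size=action.shape)

# e.g. inside dummy_expert: return add_joint_noise(actions[state])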
    except:
        pass

    try:
        order_type, goal = actions_list[0]
        actions_list = actions_list[1:]
    except:
        order_type, goal = 0, 0

    if order_type == 0 and goal == 0:
        action = 0
    else:
        action = decode_action(order_type, goal)
    return action


# Make the output folder if it does not exist
try:
    os.mkdir(specified_path)
except:
    pass

# Generate data based on the heuristic for pre-training.
# Data will be saved in a numpy archive named `heuristic_expert.npz`.
env.reset()
generate_expert_traj(dummy_expert, join(specified_path, 'heuristic_expert'),
                     env, n_episodes=args.numepisodes)
import numpy as np

from stable_baselines.gail import generate_expert_traj
from tetris import TetrisEnv
from tetris.bot import Bot

if __name__ == '__main__':
    import os
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('episod', type=int)
    parser.add_argument('output_dir')
    args = parser.parse_args()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    filename = os.path.join(args.output_dir, 'dataset-{}.npz'.format(args.episod))
    if os.path.exists(filename):
        raise OSError(filename, 'already exists!')

    env = TetrisEnv()
    model = Bot(env)
    # save_path=None: keep the trajectories in memory and save them manually below
    trajs = generate_expert_traj(model.predict, env=env, n_episodes=args.episod,
                                 save_path=None, image_folder=None)
    np.savez(filename, **trajs)
from stable_baselines import DQN
from stable_baselines.gail import generate_expert_traj

model = DQN('MlpPolicy', 'CartPole-v1', verbose=1)
# Train a DQN agent for 1e5 timesteps and generate `num_episodes` trajectories;
# data will be saved in a numpy archive named `cartpole<num_episodes>.npz`
num_episodes = 10000
generate_expert_traj(model, '../data/expert/cartpole' + str(num_episodes),
                     n_timesteps=int(1e5), n_episodes=num_episodes)
    :param _obs: (np.ndarray) Current observation
    :return: (np.ndarray) action taken by the expert
    """
    while True:
        env.render()
        print_play_keys(env.action_str)
        time.sleep(0.2)
        key_pressed = keyboard.read_key()
        # return the index of the action if a valid key is pressed
        if key_pressed:
            if key_pressed in KEY_ACTION_DICT:
                return KEY_ACTION_DICT[key_pressed]
            elif key_pressed == "esc":
                print("You pressed esc, exiting!!")
                break
            else:
                print("You pressed the wrong key. Press Esc to exit, OR:")


# Data will be saved in a numpy archive named `expert_<env_id>_<episodes>demos.npz`.
# When using something different than an RL expert,
# you must pass the environment object explicitly.
env.render()
episodes = 50
generate_expert_traj(human_expert, 'expert_' + env_id + '_' + str(episodes) + 'demos',
                     env, n_episodes=episodes)