def train(env_name, num_time_steps, policy_kwargs, eval_ep, eval_freq, ckpt_freq, load_model=None):
    env = gym.make(env_name)
    env_ = gym.make(env_name)
    rank = MPI.COMM_WORLD.Get_rank()

    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_PPO1_' + today + current_time
    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    eval_callback = EvalCallback_wandb(env_, n_eval_episodes=eval_ep, eval_freq=eval_freq, log_path=path)
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq, save_path='./run/' + model_name + '/ckpt', name_prefix='')
    callbacklist.append(eval_callback)
    callbacklist.append(ckpt_callback)
    callback = CallbackList(callbacklist)

    if load_model:
        model = PPO1.load(env=env, load_path=load_model)
    else:
        model = PPO1(MlpPolicy, env, verbose=1, policy_kwargs=policy_kwargs)

    ############################
    #         Logging          #
    ############################
    if rank == 0:
        logger.configure(path)
        config = {}
        config['load'] = [{'load_model': load_model}]
        config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
        config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
        config['policy'] = [{'policy_network': policy_kwargs}]
        with open('./run/' + model_name + '/' + model_name + '.txt', 'w+') as outfile:
            json.dump(config, outfile, indent=4)
    else:
        logger.configure(path, format_strs=[])

    ############################
    #           run            #
    ############################
    model.learn(total_timesteps=int(num_time_steps), callback=callback)
    model.save(path + '/finish')
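# A minimal sketch of how train() above might be invoked; every argument value
# here is an illustrative assumption, not taken from the source.
if __name__ == '__main__':
    train(env_name='CartPole-v1',
          num_time_steps=1e5,
          policy_kwargs=dict(net_arch=[64, 64]),
          eval_ep=5,
          eval_freq=10000,
          ckpt_freq=50000)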
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for Atari environments, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = PPO1(CnnPolicy, env, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01,
                 optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95,
                 schedule='linear', verbose=2)
    model.learn(total_timesteps=num_timesteps)
    env.close()
    del env
def train():
    """
    Train PPO1 model for slime volleyball, in MPI multiprocessing. Tested for 96 CPUs.
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure(folder=LOGDIR)
    else:
        logger.configure(format_strs=[])

    workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    env = make_env(workerseed)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0,
                 optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95,
                 schedule='linear', verbose=1)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    env.close()
    del env

    if rank == 0:
        model.save(os.path.join(LOGDIR, "final_model"))  # probably never get to this point.
def getPpo1(env, arch):
    return PPO1(
        env=env,
        policy=MlpPolicy,
        policy_kwargs=dict(net_arch=arch),
        n_cpu_tf_sess=None
    )
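# Hypothetical usage of the getPpo1 factory above, assuming MlpPolicy is already
# imported in its module; the environment and architecture are placeholder choices.
import gym

env = gym.make('CartPole-v1')
model = getPpo1(env, arch=[64, 64])
model.learn(total_timesteps=100000)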
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    env = gym.make(args.env)
    train_log_dir = os.path.join(args.train_log_dir,
                                 args.env + '_' + args.expert + '_' + args.policy_type)
    if args.expert == 'PPO':
        expert_model = PPO1(args.policy_type, env, verbose=1, tensorboard_log=train_log_dir)
    else:
        raise NotImplementedError
    expert_model.learn(total_timesteps=args.expert_training_step)
    generate_expert_traj(expert_model, os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=1000, n_episodes=args.expert_episodes)

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir, 'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type, env, dataset, verbose=1, tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, env, num_steps=10000)
    gail_model.save(train_log_dir)
    env.close()
def build_model(self):
    # Note: PPO1 exposes no learning_rate argument, so this snippet reuses
    # self.lr as adam_epsilon and maps self.c1/self.c2 onto lam/entcoeff;
    # PPO2 gets the conventional vf_coef/ent_coef/learning_rate mapping.
    if self.is_stack:
        if self.game_type == "box":
            self.env = DummyVecEnv([lambda: self.env])
            self.model = PPO1(MlpPolicy, self.env, verbose=0, gamma=self.gamma,
                              lam=self.c1, entcoeff=self.c2,
                              clip_param=self.clip_epslion, adam_epsilon=self.lr)
        if self.game_type == "atari":
            self.model = PPO2(CnnPolicy, self.env, verbose=1, gamma=self.gamma,
                              vf_coef=self.c1, ent_coef=self.c2,
                              cliprange=self.clip_epslion, learning_rate=self.lr)
    else:
        if self.game_type == "box":
            self.env = DummyVecEnv([lambda: self.env])
            self.model = PPO1(MlpPolicy, self.env, verbose=0, gamma=self.gamma,
                              lam=self.c1, entcoeff=self.c2,
                              clip_param=self.clip_epslion, adam_epsilon=self.lr)
        if self.game_type == "atari":
            self.model = PPO2(CnnLstmPolicy, self.env, verbose=1, gamma=self.gamma,
                              vf_coef=self.c1, ent_coef=self.c2,
                              cliprange=self.clip_epslion, learning_rate=self.lr)
def train(params):
    # create model
    env = FlattenObservation(gym.make(params.get("environment")))
    exp_name = params.get("model_name") + "_train_" + params.get("environment")
    log_dir = './logs/' + exp_name
    expert_name = 'expert_{0}'.format(exp_name)

    if params.get("model_name") == 'TRPO':
        print("Loading TRPO Model")
        model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("model_name") == 'PPO':
        print("Loading PPO Model")
        model = PPO1(MlpPolicy, env, verbose=1, tensorboard_log=log_dir,
                     entcoeff=params.get("ent_coef"), gamma=params.get("gamma"),
                     optim_batchsize=params.get("batch_size"),
                     clip_param=params.get("clip_range"), lam=params.get("gae_lambda"))
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("expert_exists") is False:
        print("Training expert trajectories")
        # Train expert controller (if needed) and record expert trajectories.
        generate_expert_traj(model, expert_name,
                             n_timesteps=params.get("expert_timesteps"),
                             n_episodes=params.get("n_episodes"))

    dataset = ExpertDataset(expert_path='{0}.npz'.format(expert_name),
                            traj_limitation=-1,
                            randomize=True,  # if the dataset should be shuffled
                            verbose=1)

    model = GAIL('MlpPolicy', env, dataset, verbose=1, tensorboard_log=log_dir)  # Check out for defaults

    if params.get("pre_train") is True:
        print("Pretraining Dataset with Behavioural Cloning")
        model.pretrain(dataset, n_epochs=10000)

    print("Executing GAIL Learning")
    model.learn(total_timesteps=params.get("train_steps"))
    model.save("BC" + exp_name)
    env.close()
    del env
def ppo1(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = PPO1(MlpPolicy, env, verbose=0)

    # Train the agent
    print("Beginning training episodes with PPO1.")
    model.learn(total_timesteps=timesteps)
    env.close()
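# Example call for the ppo1() helper above, then reading back the episode
# rewards the Monitor wrapper logged to log_dir. The paths and step count are
# assumptions; load_results/ts2xy are the standard stable-baselines helpers.
from stable_baselines.results_plotter import load_results, ts2xy

ppo1(env_id='CartPole-v1', log_dir='./logs/ppo1_cartpole/', timesteps=100000)
x, y = ts2xy(load_results('./logs/ppo1_cartpole/'), 'timesteps')
print("mean reward over last 100 episodes:", y[-100:].mean())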
def test_action_mask_run_ppo1(vec_env, policy, env_class):
    # Note: the action_mask keyword on predict() comes from an action-masking
    # fork of stable-baselines; upstream PPO1.predict() does not accept it.
    env = vec_env([env_class])

    model = PPO1(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
def ppo1_train():
    # best params fxcm_11_H4_full_2015_2018_train_6300
    v_policy = MlpPolicy  # policies = [MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy]
    v_gamma = 0.99  # default 0.99
    v_learning_rate = 0.0003  # default 0.0003
    v_ent_coef = 'auto'  # default 'auto'

    v_env = PortfolioEnv(settings['data_file'], settings['output_file'], settings['strategy_name'],
                         settings['total_steps'], settings['window_length'], settings['capital_base'],
                         settings['lot_size'], settings['leverage'], settings['commission_percent'],
                         settings['commission_fixed'], settings['max_slippage_percent'],
                         settings['start_idx'], settings['compute_indicators'],
                         settings['compute_reward'], settings['compute_position'], settings['debug'])

    # Create the vectorized environment
    # v_env = DummyVecEnv([lambda: v_env])

    # Normalize environment
    # v_env = VecNormalize(v_env, norm_obs=settings['norm_obs'], norm_reward=settings['norm_reward'],
    #                      clip_obs=settings['clip_obs'], clip_reward=settings['clip_reward'],
    #                      gamma=p_gamma, epsilon=EPS)

    # n_actions = v_env.action_space.shape[-1]
    # v_action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    v_action_noise = None

    # for v_policy, v_gamma, v_lam in it.product(p_policy, p_gamma, p_lam):
    #     print(str(v_policy) + '_' + str(v_gamma) + '_' + str(v_lam))

    model_name = settings['model_name'] + '_' + str(settings['total_timestamp']) + '_' + \
        str(settings['window_length']) + '_' + str(settings['compute_indicators']) + '_' + \
        str(v_gamma) + '_' + (uuid.uuid4().hex)[:16]

    model = PPO1(env=v_env, policy=v_policy, gamma=v_gamma, verbose=0,
                 tensorboard_log='log_' + model_name)

    model.learn(total_timesteps=(settings['total_timestamp']))
    model.save(MODELS_DIR + model_name)
    # v_env.save_running_average(MODELS_DIR)
    del model
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--algorithm")
    parser.add_argument("--env")
    parser.add_argument("--steps")
    parser.add_argument("--alpha", type=float)  # parse as float: DQN expects a numeric learning rate
    parser.add_argument("--grid_search")
    args = parser.parse_args()
    algorithm = args.algorithm
    env = gym.make(args.env)
    grid_search = args.grid_search
    alpha = args.alpha

    if algorithm == "ppo1":
        from stable_baselines import PPO1
        from stable_baselines.common.policies import MlpPolicy
        model = PPO1(MlpPolicy, env, verbose=1)
    else:
        from stable_baselines import DQN
        from stable_baselines.deepq.policies import MlpPolicy
        model = DQN(MlpPolicy, env, learning_rate=alpha, verbose=1)

    model.learn(total_timesteps=int(args.steps), log_interval=10)
    model.save(f"{algorithm}_cartpole")
    del model  # remove to demonstrate saving and loading

    if algorithm == "ppo1":
        model = PPO1.load(f"{algorithm}_cartpole")
    else:
        model = DQN.load(f"{algorithm}_cartpole")

    mean_reward = evaluate(model, env, num_steps=10000)
    hparams_str = f" algorithm={algorithm} env={args.env} steps={args.steps} alpha={alpha}"
    if grid_search:
        with open("grid_search_results.txt", "a") as myfile:
            myfile.write(str(mean_reward) + hparams_str)
    else:
        print(str(mean_reward) + hparams_str)
def train(env_dict, save_folder, log_dir):
    """
    Run training on a Toribash Environment. Saves a model and the environment
    configurations used. Because the actions may need to be remembered, this
    method builds the action space here and saves it to the environment dictionary.

    Args:
        env_dict (dictionary): The dictionary from the yaml file.
        save_folder (filepath): path to save models
        log_dir (filepath): path to save logs. If file is run, then found inside of save_folder
    """
    # setting up reward and action space
    if env_dict['agent'] == 'single':
        env_dict = load_single_model(env_dict)
    elif env_dict['agent'] == 'multi':
        env_dict = load_multi_model(env_dict)
    elif env_dict['agent'] == 'limb':
        env_dict['env_name'] = 'Toribash-{}-v0'.format(env_dict['limb'])
    elif env_dict['agent'] == 'hierarchy':
        env_dict = load_hierarchy_model(env_dict)
    else:
        raise ValueError("Incorrect agent type given. Make sure agent: [single, multi, limb, hierarchy]"
                         "\nAnd, make sure other necessary components are loaded correctly.")

    with open(os.path.join(save_folder, 'configs_dict.pkl'), 'wb') as f:
        pickle.dump(env_dict, f)

    # setting up the model and environment
    env = make_env(env_dict, env_dict['env_name'])
    model = PPO1(MlpPolicy, env, verbose=1,
                 tensorboard_log="./tensorboard/{}/".format(env_dict['savename']),
                 optim_stepsize=0.01)

    try:
        model.learn(total_timesteps=env_dict['timesteps'], callback=callback)
    except KeyboardInterrupt:
        print("Incomplete Model Save")
        model.save(os.path.join(save_folder, 'incomplete'))
    finally:
        model.save(os.path.join(save_folder, 'final_model.pkl'))
def ppo1_nmileg_pool(sensory_value):
    RL_method = "PPO1"
    # total_MC_runs = 50
    experiment_ID = "handtest_rot_pool_with_MC_C_task0/"
    save_name_extension = RL_method
    total_timesteps = 500000
    sensory_info = "sensory_{}".format(sensory_value)
    current_mc_run_num = 22  # starts from 0

    for mc_cntr in range(current_mc_run_num, current_mc_run_num + 1):
        log_dir = "./logs/{}/MC_{}/{}/{}/".format(experiment_ID, mc_cntr, RL_method, sensory_info)

        # defining the environments
        env = gym.make('HandManipulate-v1{}'.format(sensory_value))
        # env = gym.wrappers.Monitor(env, "./tmp/gym-results", video_callable=False, force=True)

        # setting the Monitor
        env = gym.wrappers.Monitor(env, log_dir + "Monitor/", video_callable=False,
                                   force=True, uid="Monitor_info")

        # defining the initial model
        if RL_method == "PPO1":
            model = PPO1(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        elif RL_method == "PPO2":
            env = DummyVecEnv([lambda: env])
            model = PPO2(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        elif RL_method == "DDPG":
            env = DummyVecEnv([lambda: env])
            n_actions = env.action_space.shape[-1]
            param_noise = None
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=float(0.5) * 5 * np.ones(n_actions))
            model = DDPG(DDPG_MlpPolicy, env, verbose=1, param_noise=param_noise,
                         action_noise=action_noise, tensorboard_log=log_dir)
        else:
            raise ValueError("Invalid RL mode")

        # setting the environment on the model
        # model.set_env(env)

        # setting the random seed for some of the random instances
        random_seed = mc_cntr
        random.seed(random_seed)
        env.seed(random_seed)
        env.action_space.seed(random_seed)
        np.random.seed(random_seed)
        tf.random.set_random_seed(random_seed)

        # training the model
        model.learn(total_timesteps=total_timesteps)
        # saving the trained model
        model.save(log_dir + "/model")
    return None
def create_ppo1(self):
    return PPO1(MlpPolicy, self.env, gamma=0.99, timesteps_per_actorbatch=1500,
                clip_param=0.2, entcoeff=0.01, optim_epochs=4, optim_stepsize=0.001,
                optim_batchsize=256, lam=0.95, adam_epsilon=1e-05, schedule='linear',
                verbose=0, tensorboard_log=None, _init_setup_model=True,
                policy_kwargs=None, full_tensorboard_log=False, seed=None,
                n_cpu_tf_sess=1)
def ppo1_nmileg_pool(stiffness_value):
    RL_method = "PPO1"
    experiment_ID = "experiment_4_pool_A/mc_1/"
    save_name_extension = RL_method
    total_timesteps = 500000
    stiffness_value_str = "stiffness_{}".format(stiffness_value)
    log_dir = "./logs/{}/{}/{}/".format(experiment_ID, RL_method, stiffness_value_str)

    # defining the environments
    env = gym.make('TSNMILeg{}-v1'.format(stiffness_value))
    # env = gym.wrappers.Monitor(env, "./tmp/gym-results", video_callable=False, force=True)

    # defining the initial model
    if RL_method == "PPO1":
        model = PPO1(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    elif RL_method == "PPO2":
        env = DummyVecEnv([lambda: env])
        model = PPO2(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    elif RL_method == "DDPG":
        env = DummyVecEnv([lambda: env])
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) * 5 * np.ones(n_actions))
        model = DDPG(DDPG_MlpPolicy, env, verbose=1, param_noise=param_noise,
                     action_noise=action_noise, tensorboard_log=log_dir)
    else:
        raise ValueError("Invalid RL mode")

    # setting the environment on the model
    # model.set_env(env)

    # training the model
    model.learn(total_timesteps=total_timesteps)
    # saving the trained model
    model.save(log_dir + "/model")
    return None
def advlearn(env, model_name=None, dir_dict=None):
    _, _ = setup_logger(SAVE_DIR, EXP_NAME)

    if model_name == 'ppo1_oppomodel':
        # inline hyperparameters
        # param timesteps_per_actorbatch: timesteps per actor per update
        # the other inline hyperparameters are the defaults from 'PPO1_model_value'
        model = PPO1_model_value(MlpPolicy_hua, env,
                                 timesteps_per_actorbatch=1000, verbose=1,
                                 tensorboard_log=dir_dict['tb'],
                                 hyper_weights=dir_dict['_hyper_weights'],
                                 benigned_model_file=None,
                                 full_tensorboard_log=False,
                                 black_box_att=dir_dict['_black_box'],
                                 attention_weights=dir_dict['_attention'],
                                 model_saved_loc=dir_dict['model'],
                                 clipped_attention=dir_dict['_clipped_attention'],
                                 exp_method=dir_dict['_x_method'],
                                 mimic_model_path=dir_dict['_mimic_model_path'],
                                 save_victim_traj=dir_dict['_save_victim_traj'])
    else:
        model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=1000,
                     verbose=1, tensorboard_log=dir_dict['tb'])

    try:
        model.learn(TRAINING_ITER, callback=callback, seed=SEED)
    except ValueError:
        traceback.print_exc()
        print("Learn exit!")

    model_file_name = "{0}agent.pkl".format(dir_dict['model'])
    model.save(model_file_name)
from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.evaluation import evaluate_policy

# Hyperparameters for learning identity for each RL model
LEARN_FUNC_DICT = {
    'a2c': lambda e: A2C(policy="MlpPolicy", learning_rate=1e-3, n_steps=1,
                         gamma=0.7, env=e, seed=0).learn(total_timesteps=10000),
    'acer': lambda e: ACER(policy="MlpPolicy", env=e, seed=0,
                           n_steps=1, replay_ratio=1).learn(total_timesteps=15000),
    'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e, seed=0,
                             learning_rate=5e-4, n_steps=1).learn(total_timesteps=20000),
    'dqn': lambda e: DQN(policy="MlpPolicy", batch_size=16, gamma=0.1,
                         exploration_fraction=0.001, env=e, seed=0).learn(total_timesteps=40000),
    'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e, seed=0, lam=0.5,
                           optim_batchsize=16, optim_stepsize=1e-3).learn(total_timesteps=15000),
    'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e, seed=0,
                           learning_rate=1.5e-3, lam=0.8).learn(total_timesteps=20000),
    'trpo': lambda e: TRPO(policy="MlpPolicy", env=e, seed=0,
                           max_kl=0.05, lam=0.7).learn(total_timesteps=10000),
}


@pytest.mark.slow
@pytest.mark.parametrize("model_name", ['a2c', 'acer', 'acktr', 'dqn', 'ppo1', 'ppo2', 'trpo'])
def test_identity(model_name):
    """
    Test if the algorithm (with a given policy) can learn an identity
    transformation (i.e. return observation as an action)

    :param model_name: (str) Name of the RL model
    """
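# A minimal sketch of how an entry of LEARN_FUNC_DICT is exercised against an
# identity environment; the env dimension and episode count are assumptions.
def run_identity_check(model_name='ppo1'):
    env = DummyVecEnv([lambda: IdentityEnv(10)])
    model = LEARN_FUNC_DICT[model_name](env)
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=20)
    print(model_name, "mean reward:", mean_reward)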
import gym

from stable_baselines.common.policies import FeedForwardPolicy
from stable_baselines import PPO1

env = gym.make('CartPole-v1')


class MyMlpPolicy(FeedForwardPolicy):

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        arch = [32, 64]
        super(MyMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                          reuse, net_arch=[{"pi": arch, "vf": arch}],
                                          feature_extraction="mlp", **_kwargs)
        global training_sess
        training_sess = sess


model = PPO1(MyMlpPolicy, env, verbose=1, timesteps_per_actorbatch=250)
model.learn(total_timesteps=25000)
# model.save("ppo1_cartpole")
#
# del model  # remove to demonstrate saving and loading
#
# model = PPO1.load("ppo1_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
# Hyperparameters for learning identity for each RL model
LEARN_FUNC_DICT = {
    'a2c': lambda e: A2C(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acer': lambda e: ACER(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'deepq': lambda e: DeepQ(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ddpg': lambda e: DDPG(policy="MlpPolicy", env=e, param_noise=PARAM_NOISE_DDPG).learn(total_timesteps=1000),
    'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'trpo': lambda e: TRPO(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
}


@pytest.mark.slow
@pytest.mark.parametrize("model_name", ['a2c', 'acer', 'acktr', 'deepq', 'ppo1', 'ppo2', 'trpo'])
def test_identity(model_name):
    """
    Test if the algorithm (with a given policy) can learn an identity
    transformation (i.e. return observation as an action)
    """
def train(training_tag):
    env = gym.make(ENVIRONMENT_NAME)
    env = DummyVecEnv([lambda: env])
    data = pd.DataFrame()
    # env._max_episode_steps = 200

    if isinstance(training_tag, float):
        model = CLAC(clac_MlpPolicy, env, mut_inf_coef=training_tag,
                     verbose=VERBOSITY, policy_kwargs=POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            # print("length normal: ", env.unwrapped.envs[0].length)
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            # data = data.append(learning_results, ignore_index=True)
            data = data.append(test(model, "CLAC" + str(training_tag), training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC" + str(training_tag), training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC" + str(training_tag), training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))

            file_tag = str(training_tag).replace(".", "p")
            if SAVE_AGENTS:
                model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if SAVE_FINAL_AGENT:
            model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model
        step = 0

        model = SAC(sac_MlpPolicy, env, ent_coef=training_tag,
                    verbose=VERBOSITY, policy_kwargs=POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            # data = data.append(learning_results, ignore_index=True)
            data = data.append(test(model, "SAC" + str(training_tag), training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC" + str(training_tag), training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC" + str(training_tag), training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))

            file_tag = str(training_tag).replace(".", "p")
            if SAVE_AGENTS:
                model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if SAVE_FINAL_AGENT:
            model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model

    if training_tag == "CLAC":
        model = CLAC(clac_MlpPolicy, env, verbose=VERBOSITY, policy_kwargs=POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            # data = data.append(learning_results, ignore_index=True)
            data = data.append(test(model, "CLAC", "auto", False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC", "auto", 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC", "auto", 2, (step + 1) * TRAINING_TIMESTEPS))

            if SAVE_AGENTS:
                model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if SAVE_FINAL_AGENT:
            model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_t" + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model

    if training_tag == "SAC":
        model = SAC(sac_MlpPolicy, env, verbose=VERBOSITY, policy_kwargs=POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            # data = data.append(learning_results, ignore_index=True)
            data = data.append(test(model, "SAC", "auto", False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC", "auto", 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC", "auto", 2, (step + 1) * TRAINING_TIMESTEPS))

            if SAVE_AGENTS:
                model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if SAVE_FINAL_AGENT:
            model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_t" + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model

    if training_tag == "DDPG":
        # the noise objects for DDPG
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

        model = DDPG(DDPG_MlpPolicy, env, verbose=VERBOSITY, param_noise=param_noise,
                     action_noise=action_noise, policy_kwargs=POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            # data = data.append(learning_results, ignore_index=True)
            data = data.append(test(model, "DDPG", None, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "DDPG", None, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "DDPG", None, 2, (step + 1) * TRAINING_TIMESTEPS))

            if SAVE_AGENTS:
                model.save(SAVE_FOLDER + "/models/DDPG_" + ENVIRONMENT_NAME + "_s" + str(step) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if SAVE_FINAL_AGENT:
            model.save(SAVE_FOLDER + "/models/DDPG_" + ENVIRONMENT_NAME + "_t" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model

    if training_tag == "PPO1":
        model = PPO1(MlpPolicy, env, verbose=VERBOSITY, policy_kwargs=POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            data = data.append(test(model, "PPO1", training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "PPO1", training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "PPO1", training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))

            if SAVE_AGENTS:
                model.save(SAVE_FOLDER + "/models/PPO1_" + ENVIRONMENT_NAME + "_s" + str(step) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if SAVE_FINAL_AGENT:
            model.save(SAVE_FOLDER + "/models/PPO1_" + ENVIRONMENT_NAME + "_t" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model

    if training_tag == "A2C":
        model = A2C(MlpPolicy, env, verbose=VERBOSITY, policy_kwargs=POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            data = data.append(test(model, "A2C", training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "A2C", training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "A2C", training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))

            if SAVE_AGENTS:
                model.save(SAVE_FOLDER + "/models/A2C_" + ENVIRONMENT_NAME + "_s" + str(step) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if SAVE_FINAL_AGENT:
            model.save(SAVE_FOLDER + "/models/A2C_" + ENVIRONMENT_NAME + "_t" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model

    return data
import gym
import numpy as np

from gym import spaces

import robosumo.envs
from robosumo.policy_zoo import LSTMPolicy, MLPPolicy
from robosumo.policy_zoo.utils import load_params, set_from_flat

from wrapper import RoboSumoWrapper

from stable_baselines import A2C, PPO1
from stable_baselines.common.policies import MlpPolicy

# env = make_vec_env('RoboSumo-Ant-vs-Ant-v0', n_envs=4)
env = gym.make('RoboSumo-Ant-vs-Ant-v0')
print("original action space: ", env.action_space)
print("original observation space: ", env.observation_space)

env_player1 = RoboSumoWrapper(env, player_id=1)
policy1 = PPO1(MlpPolicy, env_player1, verbose=1)

env_player0 = RoboSumoWrapper(env)
policy0 = PPO1(MlpPolicy, env_player0, verbose=1)
env_player0.opponent_policy = policy1

print("action space of policy0 is: ", policy0.action_space)
print("observation space of policy0 is: ", policy0.observation_space)

policy0.learn(total_timesteps=5)
policy0.save("policy0")

del policy0  # remove to demonstrate saving and loading

model = PPO1.load("policy0")
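# A hypothetical next step, sketched as if run before the save/delete above:
# alternate short PPO1 updates so each player trains against the other's latest
# frozen policy. The round count and step budget are illustrative assumptions.
env_player1.opponent_policy = policy0
for round_idx in range(10):
    policy0.learn(total_timesteps=10000)  # player 0 trains vs. current player 1
    policy1.learn(total_timesteps=10000)  # player 1 trains vs. updated player 0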
def main():
    parser = custom_arg_parser()
    args = parser.parse_args()
    load_defaults(args)
    print("Arguments:{}".format(args))

    # Create the model name with all the parameters
    model_dir_name = serialize_args(args)
    print("Model name: {}".format(model_dir_name))

    if args.model is not None:
        model_save_path = os.path.dirname(args.model) + "/"
        tb_save_path = model_save_path.replace("learned_models", "tb_logs")
    else:
        model_save_path = "../../learned_models/" + model_dir_name + "/"
        tb_save_path = "../../tb_logs/" + model_dir_name + "/"
    print("Model save path:{}".format(model_save_path))
    print("TB logs save path:{}".format(tb_save_path))

    final_model_path = model_save_path + "final_" + model_dir_name
    model_load_path = args.model
    show_render = args.visualize

    # Save args to json for training from checkpoints
    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)
    with open(model_save_path + "args.json", 'w+') as f:
        json.dump(vars(args), f, indent=2, sort_keys=True)

    env = GymWrapper(
        suite.make(
            "JR2Door",
            has_renderer=show_render,
            use_camera_obs=False,
            ignore_done=False,
            control_freq=args.control_freq,
            horizon=args.horizon,
            door_type=args.door_type,
            bot_motion=args.bot_motion,
            robot_pos=args.robot_pos,
            robot_theta=args.robot_theta,
            dist_to_handle_coef=args.rcoef_dist_to_handle,
            door_angle_coef=args.rcoef_door_angle,
            handle_con_coef=args.rcoef_handle_con,
            body_door_con_coef=args.rcoef_body_door_con,
            self_con_coef=args.rcoef_self_con,
            arm_handle_con_coef=args.rcoef_arm_handle_con,
            arm_door_con_coef=args.rcoef_arm_door_con,
            force_coef=args.rcoef_force,
            gripper_touch_coef=args.rcoef_gripper_touch,
            dist_to_door_coef=args.rcoef_dist_to_door,
            wall_con_coef=args.rcoef_wall_con,
            reset_on_large_force=args.reset_on_large_force,
            debug_print=args.print_info,
            eef_type=args.eef_type,
            door_init_qpos=args.door_init_qpos,
            goal_offset=args.goal_offset,
        )
    )

    if args.slurm:
        env = SubprocVecEnv([lambda: env for i in range(args.n_cpu)])
    else:
        env = DummyVecEnv([lambda: env])

    # Load the specified model, if there is one
    if args.model is not None:
        # Training from checkpoint, so do not reset timesteps for tb
        reset_num_timesteps = False
        if args.rl_alg == "ppo2":
            model = PPO2.load(model_load_path, env=env)
            print("Successfully loaded PPO2 model")
        if args.rl_alg == "ppo1":
            model = PPO1.load(model_load_path, env=env)
            print("Successfully loaded PPO1 model")
    else:
        # New model, so reset timesteps for tb
        reset_num_timesteps = True
        if args.rl_alg == "ppo2":
            model = PPO2(
                args.policy,
                env,
                verbose=args.verbose,
                n_steps=args.n_steps,
                nminibatches=args.minibatches,
                noptepochs=args.opt_epochs,
                cliprange=args.clip_range,
                ent_coef=args.ent_coef,
                tensorboard_log=tb_save_path,
                # full_tensorboard_log=True
            )
        elif args.rl_alg == "ppo1":
            model = PPO1(
                args.policy,
                env,
                verbose=args.verbose,
                timesteps_per_actorbatch=args.n_steps,
                optim_epochs=args.opt_epochs,
                tensorboard_log=tb_save_path,
            )

    if args.replay:
        # Replay a policy
        obs = env.reset()
        count = 0
        with open('episode-reward.csv', mode='w') as fid:
            writer = csv.writer(fid, delimiter=',')
            writer.writerow("reward")
        while count < 1000:
            env.render()
            count += 1
            print(count)
        while True:
            if args.model is None:
                print("Error: No model has been specified")
            action, _states = model.predict(obs, deterministic=True)
            # print("action {}".format(action))
            obs, reward, done, info = env.step(action)
            env.render()
            # print(obs)
            # print(env.sim.data.qpos[env._ref_joint_vel_indexes])
            # time.sleep(0.1)
            with open('episode-reward.csv', mode='a') as fid:
                writer = csv.writer(fid, delimiter=',')
                writer.writerow(reward)
            # if done:
            #     quit()
    else:
        # Train
        model.learn(
            total_timesteps=args.total_timesteps,
            save_dir=model_save_path,
            render=show_render,
            reset_num_timesteps=reset_num_timesteps,
        )
        model.save(final_model_path)
        print("Done training")
        obs = env.reset()
def build_model(algo, policy, env_name, log_dir, expert_dataset=None):
    """
    Initialize model according to algorithm, architecture and hyperparameters

    :param algo: (str) Name of rl algorithm - 'sac', 'ppo2' etc.
    :param policy: (str or BasePolicy) Policy architecture
    :param env_name: (str)
    :param log_dir: (str)
    :param expert_dataset: (ExpertDataset)
    :return: model: stable_baselines model
    """
    from stable_baselines.common.vec_env import DummyVecEnv
    model = None
    if algo == 'sac':
        # policy_kwargs = dict(layers=[64, 64, 64], layer_norm=False)
        # model = SAC(policy, env_name, gamma=0.99, learning_rate=1e-4, buffer_size=500000,
        #             learning_starts=5000, train_freq=500, batch_size=64, policy_kwargs=policy_kwargs,
        #             tau=0.01, ent_coef='auto_0.1', target_update_interval=1,
        #             gradient_steps=1, target_entropy='auto', action_noise=None,
        #             random_exploration=0.0, verbose=2, tensorboard_log=log_dir,
        #             _init_setup_model=True, full_tensorboard_log=True,
        #             seed=None, n_cpu_tf_sess=None)

        # SAC - start learning from scratch
        # policy_kwargs = dict(act_fun=tf.nn.relu, net_arch=[32, 32, 32])
        policy_kwargs = dict(layers=[32, 32, 32], layer_norm=False)
        env = DummyVecEnv([lambda: gym.make(env_name)])
        # model = A2C(CnnMlpPolicy, env, verbose=1, gamma=0.99, learning_rate=1e-4,
        #             tensorboard_log=log_dir, _init_setup_model=True,
        #             full_tensorboard_log=True, seed=None, n_cpu_tf_sess=None)
        model = SAC(CustomSacCnnMlpPolicy, env=env, gamma=0.99, learning_rate=1e-4,
                    buffer_size=50000, learning_starts=1000, train_freq=100, batch_size=1,
                    tau=0.01, ent_coef='auto', target_update_interval=1, gradient_steps=1,
                    target_entropy='auto', action_noise=None, random_exploration=0.0,
                    verbose=1, tensorboard_log=log_dir, _init_setup_model=True,
                    full_tensorboard_log=True, seed=None, n_cpu_tf_sess=None)
    elif algo == 'ppo1':
        model = PPO1('MlpPolicy', env_name, gamma=0.99, timesteps_per_actorbatch=256,
                     clip_param=0.2, entcoeff=0.01, optim_epochs=4, optim_stepsize=1e-3,
                     optim_batchsize=64, lam=0.95, adam_epsilon=1e-5, schedule='linear',
                     verbose=0, tensorboard_log=None, _init_setup_model=True,
                     policy_kwargs=None, full_tensorboard_log=False, seed=None,
                     n_cpu_tf_sess=1)
    elif algo == 'trpo':
        model = TRPO('MlpPolicy', env_name, timesteps_per_batch=4096,
                     tensorboard_log=log_dir, verbose=1)
    elif algo == 'gail':
        assert expert_dataset is not None
        model = GAIL('MlpPolicy', env_name, expert_dataset,
                     tensorboard_log=log_dir, verbose=1)
    assert model is not None
    return model
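# Hypothetical call into build_model() for the 'ppo1' branch; the environment
# and log directory are placeholder assumptions. Note that this branch ignores
# the policy and log_dir arguments and uses its own hardcoded values.
model = build_model(algo='ppo1', policy='MlpPolicy',
                    env_name='CartPole-v1', log_dir='./logs')
model.learn(total_timesteps=100000)
model.save('./logs/ppo1_cartpole')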
    return mean_100ep_reward


parser = argparse.ArgumentParser()
parser.add_argument("--algorithm")
args = parser.parse_args()
algorithm = args.algorithm

env = gym.make('CartPole-v0')

if algorithm == "ppo1":
    from stable_baselines import PPO1
    from stable_baselines.common.policies import MlpPolicy
    model = PPO1(MlpPolicy, env, verbose=1)
else:
    from stable_baselines import DQN
    from stable_baselines.deepq.policies import MlpPolicy
    model = DQN(MlpPolicy, env, verbose=1)

model.learn(total_timesteps=int(2e4), log_interval=10)
model.save(f"{algorithm}_cartpole")

del model  # remove to demonstrate saving and loading

if algorithm == "ppo1":
    model = PPO1.load(f"{algorithm}_cartpole")
else:
    model = DQN.load(f"{algorithm}_cartpole")
def get_model(env):
    # Initialize agent
    return PPO1(CustomCnnPolicy, env, verbose=0)
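# Hypothetical usage of get_model() above; CustomCnnPolicy is defined elsewhere
# in the source, and the image-observation environment here is only an assumption.
import gym

env = gym.make('BreakoutNoFrameskip-v4')  # CNN policies expect image observations
model = get_model(env)
model.learn(total_timesteps=100000)
model.save("ppo1_custom_cnn")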
eval_callback = EvalCallback(eval_env, best_model_save_path='./tf_model_logs/best_model',
                             log_path='./tf_model_logs/best_model_results', eval_freq=10000)

# Create the callback list
callback = CallbackList([checkpoint_callback, eval_callback])

env = gym_env.PegInEnv(
    "PandaPegIn",
    has_offscreen_renderer=True,
    # has_renderer=True,
    use_camera_obs=False,
    control_freq=100,
)

model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0,
             optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=5, gamma=0.99, lam=0.95,
             schedule='linear', tensorboard_log='runs', verbose=1)

model.learn(total_timesteps=500000, callback=callback)
""" Simple test to check that PPO1 is running with no errors (see issue #50) """ from stable_baselines import PPO1 if __name__ == '__main__': model = PPO1('MlpPolicy', 'CartPole-v1', schedule='linear', verbose=0) model.learn(total_timesteps=1000)
    episodeLength=100, bullseye=8)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-100, verbose=1)
eval_callback = EvalCallback(env, best_model_save_path='./logs/best', log_path='./logs/',
                             eval_freq=500, deterministic=True, render=False,
                             callback_on_new_best=callback_on_best)

# Added checkpoint because I lost model data after a crash when the webcam
# shut down because the screen went to sleep :(
checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/',
                                         name_prefix='ppo1_model')
cb = CallbackList([checkpoint_callback, eval_callback])

policy_kwargs = {'layers': [128, 128]}
model = PPO1(MlpPolicy, env, tensorboard_log="./logs/", policy_kwargs=policy_kwargs)
model.learn(total_timesteps=20000, callback=cb)
model.save("ppo1_rpi_led_nn128")
print('model saved')
def main():
    # Argument parser to select model type
    parser = argparse.ArgumentParser(description="Train a reinforcement learning flight controller.")
    parser.add_argument('-m', '--model', help="RL Agent to train on.")
    args = vars(parser.parse_args())

    # Create a Comet experiment with an API key
    experiment = Experiment(api_key="Bq3mQixNCv2jVzq2YBhLdxq9A",
                            project_name="rl-flight-controller",
                            workspace="alexbarnett12",
                            log_env_gpu=False, log_env_cpu=False, log_env_host=False,
                            log_git_metadata=False, log_git_patch=False)

    # Load training parameters
    cfg = configparser.ConfigParser()
    cfg.read(TRAINING_CONFIG)
    params = cfg["PARAMETERS"]

    # Set training parameters
    learning_rate_max = float(params["learning_rate_max"])
    learning_rate_min = float(params["learning_rate_min"])
    n_steps = int(params["N_steps"])
    noptepochs = int(params["Noptepochs"])
    nminibatches = int(params["Nminibatches"])
    gamma = float(params["Gamma"])
    lam = float(params["Lam"])
    clip = float(params["Clip"])
    ent_coeff = float(params["Ent_coeff"])
    total_timesteps = int(params["Total_timesteps"])

    # Linearly decreasing learning rate (only for PPO2)
    lr_callback = create_lr_callback(learning_rate_max, learning_rate_min)

    # Report hyperparameters to Comet
    hyper_params = {"learning_rate": learning_rate_max, "steps": n_steps,
                    "epochs": noptepochs, "minibatches": nminibatches,
                    "gamma": gamma, "lambda": lam, "clip_range": clip,
                    "ent_coeff": ent_coeff, "total_timesteps": total_timesteps}
    experiment.log_parameters(hyper_params)

    # You can set the level to logger.DEBUG or logger.WARN if you
    # want to change the amount of output.
    logger.set_level(logger.DEBUG)

    # Create save directory and various save paths
    model_log_dir = create_model_log_dir()
    save_path = "./logs/" + model_log_dir + "/ckpts/"
    best_model_save_path = "./logs/" + model_log_dir + "/best_model/"
    log_path = "./logs/" + model_log_dir + "/results/"
    tensorboard_dir = "./logs/" + model_log_dir + "/tensorboard/"
    model_save_path = "./logs/saved_models/" + model_log_dir

    # Save training and reward params to model directory
    shutil.copy("./gymfc/reward_params.config",
                "./logs/" + model_log_dir + "/reward_params.config")
    shutil.copy("./gymfc/training_params.config",
                "./logs/" + model_log_dir + "/training_params.config")

    # Create a callback to save model checkpoints
    checkpoint_callback = CheckpointCallback(save_freq=100000, save_path=save_path,
                                             name_prefix='rl_model')

    # Create a separate evaluation environment
    # eval_env = gym.make('attitude-fc-v0')

    # Callback to evaluate the model during training
    # eval_callback = EvalCallback(eval_env, best_model_save_path=best_model_save_path,
    #                              log_path=log_path, eval_freq=100000)

    # Create training environment
    env = gym.make('attitude-fc-v0')

    # Callback to add max penalty watchers to Tensorboard
    tb_callback = TensorboardCallback(env)

    # Create the callback list
    # callback = CallbackList([checkpoint_callback, eval_callback, tb_callback])
    callback = CallbackList([checkpoint_callback, tb_callback])

    # RL Agent; current options are PPO1 or PPO2
    # Note: PPO2 does not work w/o vectorized environments (gymfc is not vectorized)
    if args["model"] == "PPO2":
        print("PPO2!")
        model = PPO2(MlpPolicy, env, n_steps=n_steps, learning_rate=lr_callback,
                     noptepochs=noptepochs, nminibatches=nminibatches, gamma=gamma,
                     lam=lam, cliprange=clip, ent_coef=ent_coeff,
                     tensorboard_log=tensorboard_dir,
                     policy_kwargs={"layers": [32, 32]})
        experiment.add_tag("PPO2")
    else:
        model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=n_steps,
                     optim_stepsize=learning_rate_max, schedule="linear",
                     optim_epochs=noptepochs, optim_batchsize=nminibatches,
                     gamma=gamma, lam=lam, clip_param=clip, entcoeff=ent_coeff,
                     tensorboard_log=tensorboard_dir)
        experiment.add_tag("PPO1")

    # Train the model. Clean up environment on user cancellation
    # (eval_env is commented out above, so only env needs closing)
    try:
        model.learn(total_timesteps=total_timesteps, callback=callback)
    except KeyboardInterrupt:
        print("INFO: Ctrl-C caught. Cleaning up...")
        env.close()

    model.save(model_save_path)
    env.close()
# env = gym.make('CartPole-v0')
gamma = arg_or_default("--gamma", default=0.99)
print("gamma = %f" % gamma)


class MyMlpPolicy(FeedForwardPolicy):

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        super(MyMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                          reuse, net_arch=[{"pi": arch, "vf": arch}],
                                          feature_extraction="mlp", **_kwargs)
        global training_sess
        training_sess = sess


model = PPO1(MyMlpPolicy, env, verbose=1, schedule='constant',
             timesteps_per_actorbatch=8192, optim_batchsize=2048, gamma=gamma)

# for i in range(0, 6):
#     with model.graph.as_default():
#         saver = tf.compat.v1.train.Saver()
#         saver.save(training_sess, "./pcc_model_%d.ckpt" % i)

model.learn(total_timesteps=(100 * 8192))

env.testing(True)
obs = env.reset()
for _ in range(10 * 8192):
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)
env.reset()
env.testing(False)