def main(output_folder_path: Path):
    # Set gym-carla environment
    agent_config = AgentConfig.parse_file(Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(Path("configurations/carla_configuration.json"))
    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLPIDAgent,
        "max_collision": 5
    }

    env = gym.make('roar-pid-v0', params=params)
    env.reset()

    model_params: dict = {
        "verbose": 1,
        "render": True,
        "tensorboard_log": (output_folder_path / "tensorboard").as_posix()
    }
    latest_model_path = find_latest_model(output_folder_path)
    if latest_model_path is None:
        model = DDPG(LnMlpPolicy, env=env, **model_params)  # full tensorboard log can take up space quickly
    else:
        model = DDPG.load(latest_model_path, env=env, **model_params)
        model.render = True
        model.tensorboard_log = (output_folder_path / "tensorboard").as_posix()

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000, verbose=2,
                                             save_path=(output_folder_path / "checkpoints").as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    callbacks = CallbackList([checkpoint_callback, event_callback, logging_callback])

    model = model.learn(total_timesteps=int(1e10),
                        callback=callbacks,
                        reset_num_timesteps=False)
    model.save(f"pid_ddpg_{datetime.now()}")
def test_callbacks(model_class):
    env_id = 'Pendulum-v0'
    if model_class in [ACER, DQN]:
        env_id = 'CartPole-v1'

    allowed_failures = []
    # Number of training timesteps is too short
    # otherwise, the training would take too long, or would require
    # custom parameters per algorithm
    if model_class in [PPO1, DQN, TRPO]:
        allowed_failures = ['rollout_end']

    # Create RL model
    model = model_class('MlpPolicy', env_id)

    checkpoint_callback = CheckpointCallback(save_freq=500, save_path=LOG_FOLDER)

    # For testing: use the same training env
    eval_env = model.get_env()
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)
    eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best,
                                 best_model_save_path=LOG_FOLDER,
                                 log_path=LOG_FOLDER, eval_freq=100)

    # Equivalent to the `checkpoint_callback`, but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=LOG_FOLDER, name_prefix='event')
    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    callback = CallbackList([checkpoint_callback, eval_callback, event_callback])
    model.learn(500, callback=callback)
    model.learn(200, callback=None)

    custom_callback = CustomCallback()
    model.learn(200, callback=custom_callback)
    # Check that every callback method was executed
    custom_callback.validate(allowed_failures=allowed_failures)

    # A list of callbacks is automatically transformed into a CallbackList
    custom_callback = CustomCallback()
    model.learn(500, callback=[checkpoint_callback, eval_callback, custom_callback])
    # Check that every callback method was executed
    custom_callback.validate(allowed_failures=allowed_failures)

    # Automatic wrapping, old way of doing callbacks
    model.learn(200, callback=lambda _locals, _globals: True)

    # Cleanup
    if os.path.exists(LOG_FOLDER):
        shutil.rmtree(LOG_FOLDER)
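# --- Added sketch (not from the original test file) ---------------------------
# CustomCallback above is assumed to be defined elsewhere in the test suite.
# A minimal, hedged sketch of what a custom stable-baselines callback can look
# like; the class name and the step budget are illustrative only.
from stable_baselines.common.callbacks import BaseCallback

class MinimalCustomCallback(BaseCallback):
    """Stops training once an assumed step budget is exceeded."""

    def __init__(self, max_steps=1000, verbose=0):
        super(MinimalCustomCallback, self).__init__(verbose)
        self.max_steps = max_steps

    def _on_step(self) -> bool:
        # Returning False asks the algorithm to stop training early.
        return self.num_timesteps < self.max_steps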
def train(self, tensorboard_log: str) -> None:
    try:
        self.load_model(tensorboard_log=tensorboard_log)
    except Exception:
        self.create_model(tensorboard_log=tensorboard_log)

    # Stop training if reward gets close to zero
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-0.1, verbose=1)
    eval_callback = EvalCallback(self.env, callback_on_new_best=callback_on_best, verbose=1)

    # Save model at regular time intervals
    checkpoint_callback = CheckpointCallback(
        save_freq=1000, save_path='./model_checkpoints/')

    # Chain callbacks together
    callback = CallbackList([eval_callback, checkpoint_callback])

    # Train model
    self.model.learn(total_timesteps=int(1e10), callback=callback, tb_log_name="run")

    # Save trained model
    print("Training is finished!")
def train(env_name, num_time_steps, policy_kwargs, eval_ep, eval_freq, ckpt_freq, load_model=None):
    env = gym.make(env_name)
    env_ = gym.make(env_name)

    rank = MPI.COMM_WORLD.Get_rank()

    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_PPO1_' + today + current_time
    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    eval_callback = EvalCallback_wandb(env_, n_eval_episodes=eval_ep, eval_freq=eval_freq, log_path=path)
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq, save_path='./run/' + model_name + '/ckpt', name_prefix='')
    callbacklist.append(eval_callback)
    callbacklist.append(ckpt_callback)
    callback = CallbackList(callbacklist)

    if load_model:
        model = PPO1.load(env=env, load_path=load_model)
    else:
        model = PPO1(MlpPolicy, env, verbose=1, policy_kwargs=policy_kwargs)

    ############################
    #          Logging         #
    ############################
    if rank == 0:
        logger.configure(path)
        config = {}
        config['load'] = [{'load_model': load_model}]
        config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
        config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
        config['policy'] = [{'policy_network': policy_kwargs}]
        with open('./run/' + model_name + '/' + model_name + '.txt', 'w+') as outfile:
            json.dump(config, outfile, indent=4)
    else:
        logger.configure(path, format_strs=[])

    ############################
    #            run           #
    ############################
    model.learn(total_timesteps=int(num_time_steps), callback=callback)
    model.save(path + '/finish')
def main(
    training_env: PSMCartesianHERDDPGEnv,
    eval_env: PSMCartesianHERDDPGEnv = None,
    log_dir='./.logs/results'
):
    os.makedirs(log_dir, exist_ok=True)
    # training_env = Monitor(training_env, log_dir)

    n_actions = training_env.action_space.shape[0]
    noise_std = 0.2
    # Currently using OU noise
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=noise_std * np.ones(n_actions)
    )

    model_class = DDPG  # works also with SAC and TD3
    rl_model_kwargs = {
        'actor_lr': 1e-3,
        'critic_lr': 1e-3,
        'action_noise': action_noise,
        'nb_train_steps': 300,
        'nb_rollout_steps': 100,
        'gamma': 0.95,
        'observation_range': (-1.5, 1.5),
        'random_exploration': 0.05,
        'normalize_observations': True,
        'critic_l2_reg': 0.01
    }

    # Available strategies (cf paper): future, final, episode, random
    model = HER(
        'MlpPolicy',
        training_env,
        model_class,
        verbose=1,
        n_sampled_goal=4,
        goal_selection_strategy='future',
        buffer_size=int(1e5),
        batch_size=128,
        tensorboard_log="./ddpg_dvrk_tensorboard/",
        **rl_model_kwargs
    )

    # Reset the environment
    training_env.reset()

    # Create callbacks
    checkpoint_callback = CheckpointCallback(
        save_freq=100000,
        save_path="./ddpg_dvrk_tensorboard/"
    )  # save_path="./.model/model_checkpoint/")  # save_freq=100000
    # eval_callback = EvalCallback(training_env, best_model_save_path='./ddpg_dvrk_tensorboard/best_model',
    #                              log_path=log_dir, eval_freq=500)
    callback = CallbackList([checkpoint_callback])  # , eval_callback])

    # Train the model
    model.learn(4000000, log_interval=100, callback=callback)
    model.save("./her_robot_env")
def train(env_name, num_time_steps, policy_kwargs, eval_ep, eval_freq, ckpt_freq, load_model=None):
    env = gym.make(env_name)
    # env.render()
    env_ = gym.make(env_name)

    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_SAC_' + today + current_time
    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)
    env = Monitor(env, filename=path)

    ############################
    #          Logging         #
    ############################
    logger.configure(path)
    config = {}
    config['load'] = [{'load_model': load_model}]
    config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
    config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
    config['policy'] = [{'policy_network': policy_kwargs}]
    with open('./run/' + model_name + '/' + model_name + '.txt', 'w+') as outfile:
        json.dump(config, outfile, indent=4)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq, save_path='./run/' + model_name + '/ckpt', name_prefix='')
    eval_callback = EvalCallback_wandb_SAC(env_, n_eval_episodes=eval_ep, eval_freq=eval_freq, log_path=path)
    callbacklist.append(ckpt_callback)
    callbacklist.append(eval_callback)
    callback = CallbackList(callbacklist)

    ############################
    #            run           #
    ############################
    # policy_kwargs = dict(net_arch=[128, dict(vf=[256], pi=[16])])
    model = SAC(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=int(num_time_steps), log_interval=20, callback=callback)
    model.save(path + "SAC_Walker2d")
def train(self, timesteps: int, callbacks: Sequence[BaseCallback] = None, num_checkpoints=4) -> None:
    callbacks = [] if callbacks is None else callbacks
    cb = CheckpointCallback(save_freq=timesteps // num_checkpoints,
                            save_path=self._dirs.models,
                            name_prefix=self._dirs.prefix)
    self._model.learn(total_timesteps=timesteps, callback=CallbackList([cb, *callbacks]))
def train(self, timesteps: int, num_checkpoints=4, callbacks: Sequence[BaseCallback] = None):
    ppo_offset = 128
    callbacks = [] if callbacks is None else callbacks
    cb = CheckpointCallback(save_freq=timesteps // num_checkpoints,
                            save_path=self._dirs.models,
                            name_prefix=self._dirs.prefix)
    self._model.learn(total_timesteps=timesteps + ppo_offset,
                      callback=CallbackList([cb, *callbacks]),
                      log_interval=100)
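# --- Added sketch (not part of the wrappers above) ----------------------------
# A self-contained illustration of the "evenly spaced checkpoints" idea used by
# the two train() wrappers above; A2C and CartPole-v1 are arbitrary choices here.
import gym
from stable_baselines import A2C
from stable_baselines.common.callbacks import CallbackList, CheckpointCallback

timesteps = 20000
num_checkpoints = 4
# One save roughly every timesteps // num_checkpoints = 5000 steps
cb = CheckpointCallback(save_freq=timesteps // num_checkpoints,
                        save_path='./models/', name_prefix='run')
model = A2C('MlpPolicy', gym.make('CartPole-v1'), verbose=0)
model.learn(total_timesteps=timesteps, callback=CallbackList([cb]))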
def main(output_folder_path: Path):
    # Set gym-carla environment
    agent_config = AgentConfig.parse_file(
        Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(
        Path("configurations/carla_configuration.json"))

    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLLocalPlannerAgent,
        "max_collision": 5,
    }

    env = gym.make('roar-local-planner-v0', params=params)
    env.reset()

    model_params: dict = {
        "verbose": 1,
        "render": True,
        "env": env,
        "n_cpu_tf_sess": None,
        "buffer_size": 1000,
        "nb_train_steps": 50,
        "nb_rollout_steps": 100,
        # "nb_eval_steps": 50,
        "batch_size": 32,
    }
    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is None:
        model = DDPG(CnnPolicy, **model_params)
    else:
        model = DDPG.load(latest_model_path, **model_params)

    tensorboard_dir = (output_folder_path / "tensorboard")
    ckpt_dir = (output_folder_path / "checkpoints")
    tensorboard_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    model.tensorboard_log = tensorboard_dir.as_posix()
    model.render = True

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000, verbose=2,
                                             save_path=ckpt_dir.as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)

    callbacks = CallbackList(
        [checkpoint_callback, event_callback, logging_callback])

    model = model.learn(total_timesteps=int(1e10),
                        callback=callbacks,
                        reset_num_timesteps=False)
    model.save(f"local_planner_ddpg_{datetime.now()}")
def main(logdir):
    # params
    SLEEP_RATE = 100  # 100 Hz
    N_EPISODE = 1000
    EPISODE_TIME = 30
    EPISODE_LENGTH = SLEEP_RATE * EPISODE_TIME
    TOTAL_TIMESTEPS = EPISODE_LENGTH * N_EPISODE

    # logdir
    logdir = '/home/yliu2/rl_log/sac_mpc/ALT/3act/2'
    checkpoint_path = os.path.join(logdir, 'checkpoint')
    callback_path = logdir
    final_model_path = logdir + '/final_model'

    # env
    env = BlimpEnv(SLEEP_RATE)
    env = Monitor(env, logdir)
    # env = make_vec_env(lambda: env, n_envs=1, monitor_dir=logdir)
    print("Observation space:", env.observation_space)
    print("Shape:", env.observation_space.shape)
    print("Action space:", env.action_space)

    # callback
    SAVE_FREQ = EPISODE_LENGTH * 20  # every 20 episodes
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=checkpoint_path,
                                             name_prefix='sac_callback_model')
    save_on_best_training_reward_callback = SaveOnBestTrainingRewardCallback(
        check_freq=SAVE_FREQ, log_dir=callback_path)
    callback = CallbackList(
        [checkpoint_callback, save_on_best_training_reward_callback])

    # training got killed for some reason, so continue from the checkpoint
    model_path = '/home/yliu2/rl_log/sac_mpc/ALT/3act/2/best_model.zip'
    model = SAC.load(model_path)
    model.set_env(env)

    print("---------- Start Learning -----------")
    model.learn(total_timesteps=TOTAL_TIMESTEPS, log_interval=SAVE_FREQ, callback=callback)
    print("---------- Finish Learning ----------")

    model.save(final_model_path)
    del model  # remove to demonstrate saving and loading
    model = SAC.load(final_model_path)

    results_plotter.plot_results([logdir], TOTAL_TIMESTEPS, results_plotter.X_TIMESTEPS, "SAC BLIMP")
    plt.show()
def main(output_folder_path: Path):
    # Set gym-carla environment
    agent_config = AgentConfig.parse_file(
        Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(
        Path("configurations/carla_configuration.json"))

    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLLocalPlannerAgent,
        "max_collision": 5,
    }

    env = gym.make('roar-local-planner-v1', params=params)
    env.reset()

    tensorboard_dir, ckpt_dir = prep_dir(output_folder_path)

    model_params: dict = {
        "verbose": 1,
        "render": True,
        "env": env,
        "n_cpu_tf_sess": 2,
        "buffer_size": 10,
        "random_exploration": 0.1,
        "tensorboard_log": tensorboard_dir.as_posix(),
    }
    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is None:
        model = DDPG(LnMlpPolicy, **model_params)  # full tensorboard log can take up space quickly
    else:
        model = DDPG.load(latest_model_path, **model_params)

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000, verbose=2,
                                             save_path=ckpt_dir.as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)

    callbacks = CallbackList(
        [checkpoint_callback, event_callback, logging_callback])

    model = model.learn(total_timesteps=int(1e10),
                        callback=callbacks,
                        reset_num_timesteps=False)
    model.save(f"local_planner_v1_ddpg_{datetime.now()}")
def train(self):
    # Load the latest model if available
    try:
        path = os.getcwd()
        os.chdir(os.getcwd() + '/model_checkpoints')
        files = [x for x in os.listdir() if x.endswith(".zip")]
        num = []
        for file in files:
            num.append([int(x) for x in file.split('_') if x.isdigit()][0])
        filename = "rl_model_" + str(max(num)) + "_steps.zip"
        print("Tentative: " + filename)
        self.model = PPO2.load(load_path=filename,
                               env=DummyVecEnv([lambda: self.env]),
                               tensorboard_log='./a2c_rasp_tensorboard/')
        print("Successfully loaded the previous model: " + filename)
        os.chdir(path)
    except Exception:
        # Vector-encode our new environment
        env = DummyVecEnv([lambda: self.env])
        # Create new model
        self.model = PPO2('MlpPolicy', env, verbose=1, tensorboard_log='./a2c_rasp_tensorboard/')
        print("Successfully created new model")

    # Stop training if reward gets close to zero
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1e-2, verbose=1)
    eval_callback = EvalCallback(self.env, callback_on_new_best=callback_on_best, verbose=1)
    # Save model at regular time intervals
    checkpoint_callback = CheckpointCallback(save_freq=2000, save_path='./model_checkpoints/')
    # Chain callbacks together
    callback = CallbackList([eval_callback, checkpoint_callback])

    # Train model
    episode = 1
    while episode < 10:
        # Update location of red dot
        _ = self.env.square
        if self.env.trainable:
            print("Beginning episode number {}".format(episode))
            self.model.learn(total_timesteps=int(1e10), callback=callback, tb_log_name="run")
            episode += 1

    # Save trained model
    self.model.save("raspberry_agent")
def setup(model_params, output_folder_path):
    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is None:
        print("Creating model...")
        model = DDPG(CnnPolicy, **model_params)
    else:
        print("Loading model...")
        model = DDPG.load(latest_model_path, **model_params)

    tensorboard_dir = (output_folder_path / "tensorboard")
    ckpt_dir = (output_folder_path / "checkpoints")
    tensorboard_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    checkpoint_callback = CheckpointCallback(save_freq=200, verbose=2,
                                             save_path=ckpt_dir.as_posix())
    # event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    logging_callback = LoggingCallback(model=model, verbose=1)
    callbacks = CallbackList([checkpoint_callback, logging_callback])
    return model, callbacks
def main():
    agent_data = pd.read_csv('../output_EURUSD_M1_/agentData.csv')
    agent_data = agent_data.drop(agent_data.columns[0], axis=1)
    agent_data = agent_data.astype('float32')

    env = SubprocVecEnv([lambda: ForexTradingEnv(agent_data)] * 10, )
    # env = DummyVecEnv([lambda: ForexTradingEnv(agent_data)], )
    # model = DQN(CustomDQNPolicy, env, gamma=0.95, verbose=1, tensorboard_log="./tensorboard",
    #             entcoeff=0.005, adam_epsilon=1e-6)

    import tensorflow as tf
    from TenorboardCallbacks import TensorboardCallback

    checkpoint_callback = CheckpointCallback(save_freq=1000000, save_path='./models/', name_prefix='ppo2')

    for curr in [1]:
        model = PPO2(PPO2Policy_Basic, env, verbose=1, tensorboard_log="./tensorboard",
                     vf_coef=1e-7, ent_coef=1e-4, n_steps=512, gamma=0.99)
        # model = PPO2.load("5_days_model/ppo2_999000000_steps.zip", policy=PPO2Policy_Basic,
        #                   env=env, verbose=1, tensorboard_log="./tensorboard")
        model.learn(total_timesteps=10000000000, log_interval=10000000,
                    callback=CallbackList([TensorboardCallback(env), checkpoint_callback]))
        model.save(model_fileName)

    obs = env.reset()
    for i in range(2000000):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        if i % 1 == 0:
            env.render()
        if done:
            break
def learn(env_name, seed, load_path, save_path, tensorboard_log, total_timesteps, n_cpu):
    save_path = env_name if save_path is None else save_path

    checkpoint_callback = CheckpointCallback(save_freq=2000, save_path=save_path)
    eval_env = make_env(env_name, n_cpu, seed)()
    eval_callback = EvalCallback(eval_env, best_model_save_path=save_path + '/best',
                                 log_path=tensorboard_log, eval_freq=1000)
    callback = CallbackList([checkpoint_callback, eval_callback])

    policy = CnnPolicy
    # policy = CnnLstmPolicy
    # policy = CnnLnLstmPolicy
    print(env_name, policy)

    # Run this to enable SubprocVecEnv on Mac OS X:
    # export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
    # see https://github.com/rtomayko/shotgun/issues/69#issuecomment-338401331
    env = SubprocVecEnv([make_env(env_name, i, seed) for i in range(n_cpu)])

    if load_path is not None:
        model = PPO2.load(load_path, env, verbose=1, tensorboard_log=tensorboard_log)
    else:
        model = PPO2(policy, env, verbose=1, tensorboard_log=tensorboard_log)

    model.learn(total_timesteps=total_timesteps, log_interval=5, callback=callback)

    print('saving model:', save_path + '/latest_model')
    model.save(save_path + '/latest_model')
    env.close()
def get_train_callback(eval_env, seed, log_dir, save_f=10000, eval_f=50000, eval_ep=1000):
    checkpoint_callback = CheckpointCallback(save_freq=save_f, save_path=log_dir)
    # Separate evaluation env
    eval_callback = EvalTensorboardCallback(
        eval_env,
        best_model_save_path=os.path.join(log_dir, 'best_model'),
        log_path=os.path.join(log_dir, 'evaluation_results'),
        eval_freq=eval_f,
        n_eval_episodes=eval_ep,
        deterministic=True,
        render=False,
        seed=seed)
    # Create the callback list
    callback = CallbackList([checkpoint_callback, eval_callback])
    return callback
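# --- Added sketch (not from the original module) -------------------------------
# EvalTensorboardCallback above is a project-specific class. For reference, the
# same checkpoint + evaluation pattern built only from stock stable-baselines
# pieces; PPO2 and CartPole-v1 are illustrative choices, not the original setup.
import gym
from stable_baselines import PPO2
from stable_baselines.common.callbacks import (CallbackList, CheckpointCallback,
                                               EvalCallback)

train_env = gym.make('CartPole-v1')
eval_env = gym.make('CartPole-v1')
checkpoint_cb = CheckpointCallback(save_freq=1000, save_path='./logs/ckpts/')
eval_cb = EvalCallback(eval_env, best_model_save_path='./logs/best_model/',
                       log_path='./logs/eval/', eval_freq=500, n_eval_episodes=5)
model = PPO2('MlpPolicy', train_env, verbose=0)
model.learn(total_timesteps=10000, callback=CallbackList([checkpoint_cb, eval_cb]))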
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = environment(x, y, z, gamma)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init


# points_values = list([[0, LR1], [1000000, LR2]])
# Sched = PiecewiseSchedule(points_values, outside_value=LR2)

if __name__ == '__main__':
    num_cpu = 1  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x, y, z, i) for i in range(num_cpu)])
    eval_env = environment(x, y, z, gamma)

    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    scenario = str(f'RG_t{test}_lr{LR}_gamma{gamma}_batch{batch_size}')
    callbacklist = CallbackList([TimeLimit(episodetimesteps),
                                 EvalCallback(eval_env, log_path=scenario, deterministic=False)])

    model = A2C(CnnPolicy, env, gamma=gamma, verbose=1)  # , tensorboard_log=scenario)
    model.learn(total_timesteps=episodetimesteps**99, callback=callbacklist)
def train_initial_policy(model_name, algo=ALGO, env_name=ENV_NAME, time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print("Model saved as : ", "data/models/" + algo.__name__ + "_initial_policy_" + env_name + "_.pkl")

    # define the environment here
    env = gym.make(env_name)
    env.seed(SEED)
    if NOISE_VALUE > 0:
        env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low, env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low, env.action_space.high)

    if algo.__name__ == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda: env])

    if NORMALIZE:
        env = VecNormalize(env,
                           training=True,
                           norm_obs=True,
                           norm_reward=False,
                           clip_reward=1e6,
                           )

    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp", layers=[256, 256])

        model = SAC(CustomPolicy, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=float(args['noise_std']) * np.ones(n_actions))

        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args, **kwargs,
                                                    feature_extraction="mlp", layers=[400, 300])

        model = TD3(CustomPolicy2, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    gamma=args['gamma'],
                    gradient_steps=args['gradient_steps'],
                    learning_rate=args['learning_rate'],
                    learning_starts=args['learning_starts'],
                    action_noise=action_noise,
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard, env,
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     timesteps_per_batch=args['timesteps_per_batch'],
                     lam=args['lam'],
                     max_kl=args['max_kl'],
                     gamma=args['gamma'],
                     vf_iters=args['vf_iters'],
                     vf_stepsize=args['vf_stepsize'],
                     entcoeff=args['entcoeff'],
                     cg_damping=args['cg_damping'],
                     cg_iters=args['cg_iters'],
                     seed=SEED,
                     )
    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard,
                      env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946,
                      seed=SEED)
    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(mlp_standard, env,
                     n_steps=int(args['n_steps'] / env.num_envs),
                     nminibatches=args['nminibatches'],
                     lam=args['lam'],
                     gamma=args['gamma'],
                     ent_coef=args['ent_coef'],
                     noptepochs=args['noptepochs'],
                     learning_rate=args['learning_rate'],
                     cliprange=args['cliprange'],
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     seed=SEED,
                     )
    else:
        print('No algorithm matched. Using SAC .. ')
        model = SAC(CustomPolicy, env,
                    verbose=1,
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')
    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)
    elif SAVE_INTERMEDIATE:
        check_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                            save_path=model_name[:-4],
                                            name_prefix=ENV_NAME + '_' + str(SEED),
                                            verbose=1,
                                            )
        eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
        eval_env.seed(SEED)
        eval_callback = EvalCallback(eval_env,
                                     n_eval_episodes=10,
                                     eval_freq=SAVE_FREQ,
                                     log_path=model_name[:-4],
                                     deterministic=False,
                                     render=False,
                                     verbose=1)
        callbacks = CallbackList([check_callback, eval_callback])
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,
                    callback=callbacks)
        model.save(model_name)
        npzfile = np.load(model_name[:-4] + '/evaluations.npz')
        average_rewards = np.mean(npzfile['results'], axis=1)[:, 0]
        with open(model_name[:-4] + "/eval_results.txt", "a") as f:
            for i in range(np.shape(average_rewards)[0]):
                f.write("{}, {}\n".format(npzfile['timesteps'][i], average_rewards[i]))
        evaluate_policy_on_env(env, model, render=False, iters=50)
    else:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10)
        model.save(model_name)
        evaluate_policy_on_env(env, model, render=False, iters=50)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/' + env_name + '.pkl')

    print('done :: ', model_name)
    exit()
    return _init


if __name__ == '__main__':
    num_cpu = 15  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x, y, z, i) for i in range(num_cpu)])
    eval_env = environment(x, y, z, gamma, cutoffpenaltyscalar, rg_prob, turnspc, savepath)

    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    # create callbacks to record data and initiate events during training
    callbacklist = CallbackList([TimeLimit(episodetimesteps),
                                 EvalCallback(eval_env, log_path=savepath, n_eval_episodes=5,
                                              deterministic=False, best_model_save_path=savepath)])

    # create model with the Stable Baselines package
    model = A2C(CnnPolicy, env, gamma=gamma, n_steps=updatesteps,
                learning_rate=LR, verbose=1)  # , tensorboard_log=scenario)

    # total timesteps set to a very large number so the program terminates based on the runtime parameter
    model.learn(total_timesteps=episodetimesteps**50, callback=callbacklist)

    # create learning curve plot
    evaluations = './%s/%s/evaluations.npz' % (storagefolder, scenario)
    data = np.load(evaluations)
    results = data['results']
    y = np.average(results, axis=1)
    timesteps = data['timesteps']
    plt.plot(timesteps, y)
    parser.add_argument(
        "--seeds",
        nargs="+",
        type=int,
        default=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        help="Seeds for evaluation",
    )
    parser.add_argument("--fd_port", type=int, default=55555)
    args = parser.parse_args()

    for b in args.benchmarks:
        for s in args.seeds:
            logger = Logger(experiment_name=f"PPO_{b}_s{s}", output_path=Path(args.outdir))
            perf_logger = logger.add_module(PerformanceTrackingWrapper)
            config = {"seed": s, "logger": perf_logger, "benchmark": b}
            if b == "FastDownwardBenchmark":
                config["port"] = args.fd_port
            env = make_benchmark(config)
            model = PPO2("MlpPolicy", env)
            logging = LoggerCallback(logger)
            checkpoint = CheckpointCallback(
                save_freq=1000,
                save_path=f"{args.outdir}/PPO_{b}_s{s}/models",
                name_prefix="model",
            )
            callback = CallbackList([logging, checkpoint])
            model.learn(total_timesteps=args.timesteps, callback=callback)
            logger.close()
# td3_env = env

checkpoint_on_event = CheckpointCallback(save_freq=1000,
                                         save_path="./logs/model_checkpoints",
                                         name_prefix='rl_model')
event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

eval_callback = EvalCallback(td3_env,
                             best_model_save_path='./logs/',
                             log_path='./logs/',
                             eval_freq=100,
                             deterministic=True,
                             render=False)

# td3_model.learning_starts = 100
custom_callback = customCallback(verbose=0)
callback = CallbackList([custom_callback, checkpoint_on_event])

td3_model = TD3(Td3MlpPolicy,
                td3_env,
                gamma=GAMMA,
                learning_rate=LEARNING_RATE,
                buffer_size=BUFFER_SIZE,
                learning_starts=LEARNING_STARTS,
                train_freq=TRAIN_FREQ,
                gradient_steps=GRADIENT_STEPS,
                batch_size=BATCH_SIZE,
                tau=TAU,
                policy_delay=POLICY_DELAY,
                action_noise=td3_noise,
                target_policy_noise=TARGET_POLICY_NOISE,
                target_noise_clip=TARGET_NOISE_CLIP,
                random_exploration=RANDOM_EXPLORATION,
          rPiIP='192.168.0.183',
          rPiPort=50000,
          episodeLength=100,
          bullseye=10)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-20, verbose=1)
eval_callback = EvalCallback(env,
                             best_model_save_path='./logs/best',
                             log_path='./logs/',
                             eval_freq=500,
                             deterministic=True,
                             render=False,
                             callback_on_new_best=callback_on_best)

# Added a checkpoint because I lost model data after a crash when the webcam
# shut down because the screen went to sleep :(
checkpoint_callback = CheckpointCallback(save_freq=1000,
                                         save_path='./logs/',
                                         name_prefix='ppo1_model')

cb = CallbackList([eval_callback, checkpoint_callback])

model = DQN(MlpPolicy, env, verbose=1, double_q=True, tensorboard_log='./logs/')
model.learn(total_timesteps=2000, callback=cb)
model.save("dqn_rpi_led")
def main():
    # Argument parser to select model type
    parser = argparse.ArgumentParser(description="Train a reinforcement learning flight controller.")
    parser.add_argument('-m', '--model', help="RL Agent to train on.")
    args = vars(parser.parse_args())

    # Create a Comet experiment with an API key
    experiment = Experiment(api_key="Bq3mQixNCv2jVzq2YBhLdxq9A",
                            project_name="rl-flight-controller",
                            workspace="alexbarnett12",
                            log_env_gpu=False,
                            log_env_cpu=False,
                            log_env_host=False,
                            log_git_metadata=False,
                            log_git_patch=False)

    # Load training parameters
    cfg = configparser.ConfigParser()
    cfg.read(TRAINING_CONFIG)
    params = cfg["PARAMETERS"]

    # Set training parameters
    learning_rate_max = float(params["learning_rate_max"])
    learning_rate_min = float(params["learning_rate_min"])
    n_steps = int(params["N_steps"])
    noptepochs = int(params["Noptepochs"])
    nminibatches = int(params["Nminibatches"])
    gamma = float(params["Gamma"])
    lam = float(params["Lam"])
    clip = float(params["Clip"])
    ent_coeff = float(params["Ent_coeff"])
    total_timesteps = int(params["Total_timesteps"])

    # Linearly decreasing learning rate (only for PPO2)
    lr_callback = create_lr_callback(learning_rate_max, learning_rate_min)

    # Report hyperparameters to Comet
    hyper_params = {"learning_rate": learning_rate_max,
                    "steps": n_steps,
                    "epochs": noptepochs,
                    "minibatches": nminibatches,
                    "gamma": gamma,
                    "lambda": lam,
                    "clip_range": clip,
                    "ent_coeff": ent_coeff,
                    "total_timesteps": total_timesteps}
    experiment.log_parameters(hyper_params)

    # You can set the level to logger.DEBUG or logger.WARN if you
    # want to change the amount of output.
    logger.set_level(logger.DEBUG)

    # Create save directory and various save paths
    model_log_dir = create_model_log_dir()
    save_path = "./logs/" + model_log_dir + "/ckpts/"
    best_model_save_path = "./logs/" + model_log_dir + "/best_model/"
    log_path = "./logs/" + model_log_dir + "/results/"
    tensorboard_dir = "./logs/" + model_log_dir + "/tensorboard/"
    model_save_path = "./logs/saved_models/" + model_log_dir

    # Save training and reward params to model directory
    shutil.copy("./gymfc/reward_params.config", "./logs/" + model_log_dir + "/reward_params.config")
    shutil.copy("./gymfc/training_params.config", "./logs/" + model_log_dir + "/training_params.config")

    # Create a callback to save model checkpoints
    checkpoint_callback = CheckpointCallback(save_freq=100000, save_path=save_path, name_prefix='rl_model')

    # Create a separate evaluation environment
    # eval_env = gym.make('attitude-fc-v0')

    # Callback to evaluate the model during training
    # eval_callback = EvalCallback(eval_env, best_model_save_path=best_model_save_path,
    #                              log_path=log_path, eval_freq=100000)

    # Create training environment
    env = gym.make('attitude-fc-v0')

    # Callback to add max penalty watchers to Tensorboard
    tb_callback = TensorboardCallback(env)

    # Create the callback list
    # callback = CallbackList([checkpoint_callback, eval_callback, tb_callback])
    callback = CallbackList([checkpoint_callback, tb_callback])

    # RL Agent; current options are PPO1 or PPO2
    # Note: PPO2 does not work without vectorized environments (gymfc is not vectorized)
    if args["model"] == "PPO2":
        print("PPO2!")
        model = PPO2(MlpPolicy, env,
                     n_steps=n_steps,
                     learning_rate=lr_callback,
                     noptepochs=noptepochs,
                     nminibatches=nminibatches,
                     gamma=gamma,
                     lam=lam,
                     cliprange=clip,
                     ent_coef=ent_coeff,
                     tensorboard_log=tensorboard_dir,
                     policy_kwargs={"layers": [32, 32]})
        experiment.add_tag("PPO2")
    else:
        model = PPO1(MlpPolicy, env,
                     timesteps_per_actorbatch=n_steps,
                     optim_stepsize=learning_rate_max,
                     schedule="linear",
                     optim_epochs=noptepochs,
                     optim_batchsize=nminibatches,
                     gamma=gamma,
                     lam=lam,
                     clip_param=clip,
                     entcoeff=ent_coeff,
                     tensorboard_log=tensorboard_dir)
        experiment.add_tag("PPO1")

    # Train the model. Clean up environments on user cancellation.
    # NOTE: eval_env is only defined if the commented-out evaluation env above is re-enabled.
    try:
        model.learn(total_timesteps=total_timesteps, callback=callback)
    except KeyboardInterrupt:
        print("INFO: Ctrl-C caught. Cleaning up...")
        env.close()
        eval_env.close()

    model.save(model_save_path)
    env.close()
    eval_env.close()
def __init__(self, env: Env, params: dict, model_path: str, log_path: str):
    """Initialize.

    :param env: gym environment. Assuming observation space is a tuple,
        where the first component is from the original env, and the second
        is the temporal goal state.
    :param params: dict of parameters, like `default_parameters`.
    :param model_path: directory where to save models.
    :param log_path: directory where to save tensorboard logs.
    """
    # Check
    if params["initialize_file"]:
        raise ValueError("Initialization not supported; use resuming option")
    if params["action_bias"]:
        raise ValueError("Action bias is not maintained here")

    # Alias
    original_env = env

    # Load a saved agent for the action bias
    self.biased_agent: Optional[DQN] = None
    if params["action_bias"]:
        loading_params = dict(params)
        loading_params["resume_file"] = params["action_bias"]
        loading_params["action_bias"] = None
        self.biased_agent = TrainStableBaselines(
            env=env,
            params=loading_params,
            model_path=model_path,
            log_path=log_path,
        ).model

    # Collect statistics
    # (assuming future wrappers do not modify episodes)
    env = MyStatsRecorder(env=env, gamma=params["gamma"])

    # Callbacks
    checkpoint_callback = CustomCheckpointCallback(
        save_path=model_path,
        save_freq=params["save_freq"],
        extra=None,
    )
    stats_logger_callback = StatsLoggerCallback(stats_recorder=env, scope="env0")
    callbacks_list = [checkpoint_callback, stats_logger_callback]

    if params["render"]:
        renderer_callback = RendererCallback()
        callbacks_list.append(renderer_callback)

    # If training a passive agent, log this too
    if params["active_passive_agents"]:
        # Find the reward shaping env
        reward_shaping_env = find_wrapper(env, RewardShapingWrapper)

        passive_stats_env = MyStatsRecorder(
            env=UnshapedEnv(reward_shaping_env),
            gamma=params["gamma"],
        )

        passive_stats_callback = StatsLoggerCallback(
            stats_recorder=passive_stats_env,
            scope="env1",
        )
        callbacks_list.append(passive_stats_callback)

        # Make it move with the original env
        env = UnshapedEnvWrapper(
            shaped_env=env,
            unshaped_env=passive_stats_env,
        )
        original_reward_getter = env.get_reward  # alias
    else:
        original_reward_getter = None

    # Combine callbacks
    all_callbacks = CallbackList(callbacks_list)

    # Define or load
    resuming = bool(params["resume_file"])
    if not resuming:
        # Normalizer
        normalized_env = NormalizeEnvWrapper(
            env=env,
            training=True,
            entry=0,  # Only env features, not temporal goal state
        )
        flat_env = BoxAutomataStates(normalized_env)
        # Saving normalizer too
        checkpoint_callback.saver.extra_model = normalized_env

        # Agent
        model = DQN(
            env=flat_env,
            policy=ModularPolicy,
            policy_kwargs={
                "layer_norm": params["layer_norm"],
                "layers": params["layers"],
                "shared_layers": params["shared_layers"],
                "dueling": params["dueling"],
            },
            gamma=params["gamma"],
            learning_rate=params["learning_rate"],
            train_freq=params["train_freq"],
            double_q=True,
            batch_size=params["batch_size"],
            buffer_size=params["buffer_size"],
            learning_starts=params["learning_starts"],
            prioritized_replay=True,
            target_network_update_freq=params["target_network_update_freq"],
            exploration_fraction=params["exploration_fraction"],
            exploration_final_eps=params["exploration_final_eps"],
            exploration_initial_eps=params["exploration_initial_eps"],
            active_passive_agents=params["active_passive_agents"],
            passive_reward_getter=original_reward_getter,
            tensorboard_log=log_path,
            full_tensorboard_log=False,
            verbose=1,
        )
    else:
        # Reload model
        model, extra_model, counters = checkpoint_callback.load(
            path=params["resume_file"],
        )

        # Restore normalizer and env
        normalized_env = extra_model
        normalized_env.set_env(env)
        flat_env = BoxAutomataStates(normalized_env)
        # Restore properties
        model.tensorboard_log = log_path
        model.num_timesteps = counters["step"]
        model.learning_starts = params["learning_starts"] + counters["step"]
        model.set_env(flat_env)
        model.passive_reward_getter = original_reward_getter

    # Store
    self.params = params
    self.resuming = resuming
    self.saver = checkpoint_callback
    self.logger = stats_logger_callback
    self.callbacks = all_callbacks
    self.model: DQN = model
    self.normalized_env = normalized_env
    self.testing_agent = model if not params["test_passive"] else model.passive_agent
policy_kwargs = dict(act_fun=tf.nn.relu,
                     net_arch=net_arch,
                     obs_norm_init=init_obs_norm,
                     act_norm_init=init_act_norm)

n_time_step = 10 * 10**6

save_gif_callback = SaveGifCallback(save_freq=int(0.5 * 10**6),
                                    save_path=os.path.join(log_path, run_id, 'training_videos'),
                                    fps=int(1. / env._policy_step))
rwd_term_callback = DiscRwdTerminate(th_perc=0.9, n_skip=500)
rwd_rec_callback = SaveRewardPortionsCallback(
    fullfilename=os.path.join(log_path, run_id, 'reward_portions.txt'))
callback = CallbackList([save_gif_callback, rwd_term_callback, rwd_rec_callback])

model = PPO2(NormalMlpPolicy,
             env,
             gamma=0.95,
             n_steps=8192,
             ent_coef=0.0001,
             nminibatches=4,
             noptepochs=4,
             learning_rate=5.0 * 10**(-4),
             policy_kwargs=policy_kwargs,
             verbose=False,
             tensorboard_log=os.path.join(log_path, run_id))

model.learn(total_timesteps=n_time_step,
            log_interval=1000,
            reset_num_timesteps=False,
    return _init


if __name__ == '__main__':
    num_cpu = 12  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(inputfile, i) for i in range(num_cpu)])
    eval_env = environment(inputfile, gamma)

    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    scenario = str(
        f'{inputfile_s}_t{test}_lr{LR_s}_gamma{gamma_s}_batch{batch_size}')
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(eval_env, log_path=scenario, n_eval_episodes=5)
    ])

    model = PPO2(MlpPolicy,
                 env,
                 gamma=gamma,
                 n_steps=batch_size,
                 learning_rate=LR,
                 verbose=1)  # , tensorboard_log=scenario)
    model.learn(total_timesteps=episodetimesteps**99, callback=callbacklist)

    filename = './%s/evaluations.npz' % scenario
    data = np.load(filename)
    results = data['results']
    y = np.average(results, axis=1)
                                         save_path='./tf_model_logs/')

# Separate evaluation env
eval_env = gym_env.PegInEnv(
    "PandaPegIn",
    has_offscreen_renderer=True,
    # has_renderer=True,
    use_camera_obs=False,
    control_freq=100,
)
eval_callback = EvalCallback(eval_env,
                             best_model_save_path='./tf_model_logs/best_model',
                             log_path='./tf_model_logs/best_model_results',
                             eval_freq=10000)

# Create the callback list
callback = CallbackList([checkpoint_callback, eval_callback])

env = gym_env.PegInEnv(
    "PandaPegIn",
    has_offscreen_renderer=True,
    # has_renderer=True,
    use_camera_obs=False,
    control_freq=100,
)

model = PPO1(MlpPolicy,
             env,
             timesteps_per_actorbatch=2048,
             clip_param=0.2,
             entcoeff=0.0,
             optim_epochs=5,
    # Create the vectorized environment
    # env = environment(x, y, z, 0.95, 0.05, savepath, 'MlpPolicy', rg_prob='loadenv')
    env = environment(
        x, y, z, gamma, turnspc, policyname, rg_prob='loadenv'
    )  # SubprocVecEnv([make_env(x,y,z, i) for i in range(num_cpu)])
    # eval_env = environment(x, y, z, gamma, turnspc, savepath, policyname, rg_prob='loadenv')

    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    # create callbacks to record data and initiate events during training
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(env,
                     log_path=savepath,
                     n_eval_episodes=1,
                     eval_freq=10000,
                     deterministic=det,
                     best_model_save_path=savepath)
    ])

    if os.path.exists("%s/best_model.zip" % savepath):
        # Instantiate the agent
        model = ACER(policy,
                     env,
                     gamma=gamma,
                     n_steps=episodetimesteps,
                     learning_rate=LR,
                     buffer_size=10000,
                     verbose=1)
        # Load the trained agent
    env = SubprocVecEnv([make_env(x, y, z, i) for i in range(num_cpu)])
    eval_env = evalenv(x, y, z, turnspc, policyname)
    env1 = environment(x, y, z, turnspc, scalar, policyname)

    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    # create callbacks to record data and initiate events during training
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(eval_env,
                     log_path=evpath,
                     n_eval_episodes=100,
                     eval_freq=50000,
                     deterministic=True,
                     best_model_save_path=evpath),
        EvalCallback(env1,
                     log_path=savepath,
                     n_eval_episodes=20,
                     eval_freq=10000,
                     deterministic=False,
                     best_model_save_path=savepath)
    ])

    if os.path.exists("%s/final_model.zip" % savepath):
        # Instantiate the agent
        model = ACER(policy,
                     env,
                     gamma=gamma,
                     n_steps=episodetimesteps,
                     learning_rate=LR,
                     buffer_size=5000,
def main(logdir):
    # params
    SLEEP_RATE = 100  # 1, 2, 10, 50, 100 Hz
    EPISODE_TIME = 30  # 30 or 120 sec
    USE_MPC = False
    N_EPISODE = 1000000
    Action_Choice = np.array([1, 1, 1, 1, 0, 0, 0, 0])

    EPISODE_LENGTH = SLEEP_RATE * EPISODE_TIME
    TOTAL_TIMESTEPS = EPISODE_LENGTH * N_EPISODE

    # logdir
    logdir = os.path.join(logdir, strftime("%Y-%m-%d--%H:%M:%S", localtime()))
    os.makedirs(logdir)
    checkpoint_path = os.path.join(logdir, 'checkpoint')
    callback_path = logdir
    final_model_path = logdir + '/final_model'

    # env
    env = BlimpEnv(SLEEP_RATE, EPISODE_TIME, USE_MPC, Action_Choice)
    env = Monitor(env, logdir)
    # env = make_vec_env(lambda: env, n_envs=1, monitor_dir=logdir)
    print("Observation space:", env.observation_space)
    print("Shape:", env.observation_space.shape)
    print("Action space:", env.action_space)

    # callback
    SAVE_FREQ = EPISODE_LENGTH * 100  # save the model every 100 episodes
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=checkpoint_path,
                                             name_prefix='sac_callback_model')
    save_on_best_training_reward_callback = SaveOnBestTrainingRewardCallback(
        check_freq=SAVE_FREQ, log_dir=callback_path)
    callback = CallbackList(
        [checkpoint_callback, save_on_best_training_reward_callback])

    # agent
    model = SAC(MlpPolicy, env,
                gamma=0.98,
                learning_rate=0.0003,
                buffer_size=1000000,
                learning_starts=EPISODE_LENGTH * 20,
                train_freq=1,
                batch_size=256,
                tau=0.01,
                ent_coef='auto',
                target_update_interval=1,
                gradient_steps=1,
                target_entropy='auto',
                action_noise=None,
                verbose=1,
                tensorboard_log=logdir,
                full_tensorboard_log=True,
                _init_setup_model=True)

    print("---------- Start Learning -----------")
    model.learn(total_timesteps=TOTAL_TIMESTEPS, log_interval=SAVE_FREQ, callback=callback)
    print("---------- Finish Learning ----------")

    model.save(final_model_path)
    del model  # remove to demonstrate saving and loading
    model = SAC.load(final_model_path)

    results_plotter.plot_results([logdir], TOTAL_TIMESTEPS, results_plotter.X_TIMESTEPS, "SAC BLIMP")
    plt.show()