def create_env(env_name, normalized, training=False):
    env = gym.make(env_name)
    if normalized:
        from stable_baselines.common.vec_env import VecNormalize, DummyVecEnv
        vec_env = DummyVecEnv([lambda: env])
        # Load the running obs/reward statistics saved during training
        vec_env = VecNormalize.load('data/models/env_stats/' + env_name + '.pkl',
                                    venv=vec_env)
        vec_env.training = training
        vec_env.reward_range = env.reward_range
        # Return the normalized wrapper, not the raw env
        return vec_env
    return env
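# Example usage (hypothetical): load a normalized evaluation copy of an env.
# 'Pendulum-v0' and the stats file under data/models/env_stats/ are assumptions;
# any env whose VecNormalize statistics were saved there would work.
def _example_create_normalized_eval_env():
    eval_env = create_env('Pendulum-v0', normalized=True, training=False)
    obs = eval_env.reset()
    return eval_env, obs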
def main():
    args = get_args()
    choose_device(args.device)
    set_global_seeds(args.seed)

    env_id = args.env
    exp_id = args.exp_id
    algo = args.algo
    env_name = env_id[:-3]
    env_index = env_list.index(env_id)

    # Pass CustomEnv arguments: follow this for your CustomEnv if the reward is
    # not known prior to training
    env_kwargs = {} if args.env_kwargs is None else args.env_kwargs
    if (args.env_kwargs is not None) and (env_id in ['AirSim-v0']):
        if 'rew_land' in env_kwargs:
            if int(env_kwargs['rew_land']) in [500, 1000, 10000]:
                env_success[-1] = int(env_kwargs['rew_land'])
            else:
                raise ValueError('Given env reward not acceptable. Please try again')

    params = [exp_id, env_name.lower()]
    folder = [exp_id, env_name.lower(), args.algo.lower()]
    tensorboard_path, monitor_path, callback_path = None, None, None
    if args.tensorboard:
        tensorboard_path = "tensorboard/{}_{}".format(*params)
        make_dir(tensorboard_path)

    # if args.train_RL:  # Begin training here (location of this condition also
    # decides experiment performance)

    # Load hyperparameters from yaml file
    with open('hyperparams/{}.yml'.format(args.algo), 'r') as f:
        hyperparams_dict = yaml.safe_load(f)
        if env_id in list(hyperparams_dict.keys()):
            hyperparams = hyperparams_dict[env_id]
        else:
            raise ValueError("Hyperparameters not found for {}-{}".format(
                args.algo, env_id))

    if args.hyperparams is not None:
        # Overwrite hyperparams if needed
        hyperparams.update(args.hyperparams)

    # OPTIONAL: print saved hyperparams
    saved_hyperparams = OrderedDict(
        [(key, hyperparams[key]) for key in sorted(hyperparams.keys())])
    if args.verbose > 0:
        pprint(saved_hyperparams)

    if args.n_envs > 1:
        # if args.verbose:
        print("Overwriting n_envs with n={}".format(args.n_envs))
        n_envs = args.n_envs
    else:
        n_envs = hyperparams.get('n_envs', 1)

    # Choose Monitor log path according to the multiprocessing setting
    if args.monitor:
        if n_envs == 1:
            monitor_path = 'logs/single/{}_{}_{}'.format(*folder)
        elif algo not in ['dqn', 'her', 'sac', 'td3']:
            monitor_path = 'logs/multi/{}_{}_{}'.format(*folder)
        if monitor_path is not None:
            make_dir(monitor_path)

    if int(float(args.timesteps_RL)) > 0:
        # if args.verbose:
        print("Overwriting n_timesteps with n={}".format(
            int(float(args.timesteps_RL))))
        n_timesteps = int(float(args.timesteps_RL))
    else:
        n_timesteps = int(hyperparams['n_timesteps'])

    # Convert policy_kwargs to a python object if it was given as a string
    if 'policy_kwargs' in hyperparams.keys() and isinstance(
            hyperparams['policy_kwargs'], str):
        hyperparams['policy_kwargs'] = eval(hyperparams['policy_kwargs'])

    # Remove keys the model constructor does not accept
    if 'n_envs' in hyperparams.keys():
        del hyperparams['n_envs']
    del hyperparams['n_timesteps']

    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    # if algo == 'ppo2' and ('learning_rate' in hyperparams.keys()):
    #     hyperparams['learning_rate'] = linear_schedule(hyperparams['learning_rate'])

    def create_env(n_envs, eval_env=False):
        if algo in ['a2c', 'acer', 'acktr', 'ppo2']:
            if n_envs > 1:
                env = SubprocVecEnv([
                    make_env(env_id, i, args.seed, log_dir=monitor_path,
                             wrapper_class=env_wrapper, env_kwargs=env_kwargs)
                    for i in range(n_envs)
                ])
            else:
                env = DummyVecEnv([
                    make_env(env_id, 0, args.seed, log_dir=monitor_path,
                             wrapper_class=env_wrapper, env_kwargs=env_kwargs)
                ])
            # Alternative: a bare env without Monitor logging
            # env = DummyVecEnv([lambda: gym.make(env_id, **env_kwargs)])
            # if env_wrapper is not None:
            #     env = env_wrapper(env)
        elif (algo in ['dqn', 'her', 'sac', 'td3']) and n_envs > 1:
            raise ValueError(
                "Error: {} does not support multiprocessing!".format(algo))
        elif (algo in ['ddpg', 'ppo1', 'trpo', 'gail']) and n_envs > 1:
            raise ValueError(
                "Error: {} uses MPI for multiprocessing!".format(algo))
        else:
            env = make_vec_env(env_id, n_envs=n_envs, seed=args.seed,
                               monitor_dir=monitor_path,
                               wrapper_class=env_wrapper, env_kwargs=env_kwargs)
        if args.normalize:
            # Choose from multiple options:
            # env = VecNormalize(env, clip_obs=np.inf)
            env = VecNormalize(env, norm_reward=False, clip_obs=np.inf)
            # env = VecNormalize(env, norm_reward=False, clip_obs=np.inf, **normalize_kwargs)
        return env
        # Zoo: env = SubprocVecEnv([make_env(env_id, i, seed, log_dir,
        #          wrapper_class=env_wrapper, env_kwargs=env_kwargs) for i in range(n_envs)])
        # Zoo: env = DummyVecEnv([make_env(env_id, 0, seed, log_dir,
        #          wrapper_class=env_wrapper, env_kwargs=env_kwargs)])

    env = create_env(n_envs)

    # if args.train_RL:  # checking impact of the if-condition position on
    # experiment reproducibility
    callback, callback_path = [], "callbacks/{}_{}_{}".format(*folder)
    save_freq = eval_freq = 100 * episode_len[env_index]
    save_freq, eval_freq = max(save_freq // n_envs, 1), max(eval_freq // n_envs, 1)
    make_dir(callback_path)
    if args.check_callback:
        callback.append(CheckpointCallback(save_freq=save_freq,
                                           save_path=callback_path,
                                           name_prefix='rl_model', verbose=1))
    if args.eval_callback:
        callback.append(EvalCallback(create_env(1, eval_env=True),
                                     best_model_save_path=callback_path,
                                     log_path=callback_path,
                                     eval_freq=eval_freq, verbose=1))

    model = (algo_list[args.algo])(env=env, seed=args.seed,
                                   tensorboard_log=tensorboard_path,
                                   n_cpu_tf_sess=1, verbose=args.verbose,
                                   **hyperparams)

    print('\nTraining {} on {} now... \n'.format(algo, env_id))
    start_time = time.time()
    model.learn(total_timesteps=n_timesteps, callback=callback)
    total_time = time.time() - start_time

    if args.normalize:
        # Save running statistics so evaluation can reuse them
        env.save(os.path.join(callback_path, "vec_normalize.pkl"))

    if n_envs > 1 or (algo in ['ddpg', 'trpo', 'gail']):
        print("Took {:.2f}s for multiprocessed version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))
    else:
        print("Took {:.2f}s for single process version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))

    # Build a fresh single-process env for evaluation
    env = DummyVecEnv([make_env(env_id, 0, args.seed, env_kwargs=env_kwargs)])
    if args.normalize:
        env = VecNormalize.load(
            os.path.join(callback_path, "vec_normalize.pkl"), env)
        env.training = False
        env.norm_reward = False
    env.seed(args.seed)

    # Evaluate the RL model - choose either the best model or the last available model
    model = (algo_list[algo]).load(os.path.join(callback_path, 'best_model'))
    # model = (algo_list[algo]).load("models/{}_{}_{}".format(*folder))
    model.set_env(env)
    evaluate('policy', model, env_id, env, algo, 100)

    if args.monitor:
        results_plotter.plot_results([monitor_path], n_timesteps,
                                     results_plotter.X_TIMESTEPS,
                                     "{} {}".format(algo, env_id))
        plot_results(monitor_path)

    if args.test:
        print('\nTesting policy...\n')
        # Track per-episode and aggregate returns
        episode_reward, total_reward = 0.0, 0.0
        done_count, success_count = 0, 0
        obs = env.reset()
        for _ in range(n_timesteps):
            action, _states = model.predict(obs, deterministic=True)
            if isinstance(env.action_space, gym.spaces.Box):
                action = np.clip(action, env.action_space.low,
                                 env.action_space.high)
            obs, rewards, dones, info = env.step(action)
            episode_reward += rewards
            env.render()
            if dones:
                done_count += 1
                success_count = check_success(env_index, env_success,
                                              success_count)
                total_reward += episode_reward
                episode_reward = 0
                env.reset()
        print('\n{}/{} successful episodes'.format(success_count, done_count))
        average_reward = total_reward / done_count
        print('\nAverage reward: {}'.format(average_reward))

    env.close()
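# Sketch of the make_env factory that main() assumes (the actual helper lives
# elsewhere in the repo and is not shown in this section). Modeled on
# rl-baselines-zoo's utility of the same name; the Monitor path layout and
# seeding scheme below are assumptions.
from stable_baselines.bench import Monitor
from stable_baselines.common import set_global_seeds


def make_env(env_id, rank=0, seed=0, log_dir=None, wrapper_class=None,
             env_kwargs=None):
    """Return a thunk that builds one seeded, optionally wrapped and monitored env."""
    def _init():
        set_global_seeds(seed + rank)
        env = gym.make(env_id, **(env_kwargs or {}))
        env.seed(seed + rank)
        if wrapper_class is not None:
            env = wrapper_class(env)
        if log_dir is not None:
            env = Monitor(env, os.path.join(log_dir, str(rank)),
                          allow_early_resets=True)
        return env
    return _init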
def normalize_env(
    env,
    orig_log_dir,
    sb_version,
    vectorize=True,
    continue_learning=False,
    evaluate=False,
    evaluate_during_learning=False,
    normalize_kwargs=None,
):
    normalize_kwargs = normalize_kwargs if normalize_kwargs is not None else {}
    if vectorize:
        env = DummyVecEnv([lambda: env])

    logger.debug("Normalize: {}".format(normalize_kwargs))

    if evaluate:
        # FIXME: in continue learning, training should be True so that we update
        # the running average of obs and rewards with new samples; if I do that,
        # the algo performs very poorly even with no changes in the env
        if sb_version == "sb3":
            env = VecNormalize3(env, training=False, **normalize_kwargs)
        else:
            env = VecNormalize(env, training=False, **normalize_kwargs)

        if not evaluate_during_learning or continue_learning:
            if not os.path.exists(os.path.join(orig_log_dir, "vecnormalize.pkl")):
                env_name = get_env_name(env=env.unwrapped, sb_version=sb_version)
                index_last_separator = orig_log_dir.rindex("/")
                new_orig_log_dir = os.path.join(
                    orig_log_dir[0:index_last_separator], "logs_" + env_name)
                logger.debug(
                    "{} does not exist. Trying to search it in the original model directory {}"
                    .format(os.path.join(orig_log_dir, "vecnormalize.pkl"),
                            new_orig_log_dir))
                assert os.path.exists(new_orig_log_dir), \
                    "{} does not exist".format(new_orig_log_dir)
                assert os.path.exists(
                    os.path.join(new_orig_log_dir, "vecnormalize.pkl")), (
                        os.path.join(new_orig_log_dir, "vecnormalize.pkl")
                        + " does not exist")
                logger.debug("[evaluate] Loading {}".format(
                    os.path.join(new_orig_log_dir, "vecnormalize.pkl")))
                if sb_version == "sb3":
                    env = VecNormalize3.load(
                        os.path.join(new_orig_log_dir, "vecnormalize.pkl"), env)
                else:
                    env = VecNormalize.load(
                        os.path.join(new_orig_log_dir, "vecnormalize.pkl"), env)
            else:
                logger.debug("[evaluate] Loading {}".format(
                    os.path.join(orig_log_dir, "vecnormalize.pkl")))
                if sb_version == "sb3":
                    env = VecNormalize3.load(
                        os.path.join(orig_log_dir, "vecnormalize.pkl"), env)
                else:
                    env = VecNormalize.load(
                        os.path.join(orig_log_dir, "vecnormalize.pkl"), env)

            # Deactivate training and reward normalization
            env.training = False
            env.norm_reward = False

    elif continue_learning:
        # FIXME: don't know why, but during continue learning I have to disable
        # training, otherwise performance is not the same as in the model trained
        # from scratch, even without changing the params of the environment.
        # In rl-baselines-zoo this is not done during continue learning:
        # https://github.com/araffin/rl-baselines-zoo/blob/master/train.py#L365
        if sb_version == "sb3":
            env = VecNormalize3(env, training=False, **normalize_kwargs)
        else:
            env = VecNormalize(env, training=False, **normalize_kwargs)

        assert os.path.exists(os.path.join(orig_log_dir, "vecnormalize.pkl")), (
            os.path.join(orig_log_dir, "vecnormalize.pkl") + " does not exist")
        logger.debug("[continue_learning] Loading {}".format(
            os.path.join(orig_log_dir, "vecnormalize.pkl")))
        if sb_version == "sb3":
            env = VecNormalize3.load(
                os.path.join(orig_log_dir, "vecnormalize.pkl"), env)
        else:
            env = VecNormalize.load(
                os.path.join(orig_log_dir, "vecnormalize.pkl"), env)
    else:
        if sb_version == "sb3":
            env = VecNormalize3(env, **normalize_kwargs)
        else:
            env = VecNormalize(env, **normalize_kwargs)

    return env
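# Hypothetical call site for normalize_env: wrap a raw env for evaluation,
# reusing statistics saved during training. The env id, log directory, and
# normalize_kwargs values below are assumptions, not taken from this repo.
def _example_normalize_for_eval():
    env = gym.make('Pendulum-v0')
    return normalize_env(
        env,
        orig_log_dir='logs/pendulum',  # must contain vecnormalize.pkl
        sb_version='sb2',              # use 'sb3' for stable-baselines3 models
        evaluate=True,
        normalize_kwargs={'norm_obs': True, 'norm_reward': False},
    )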
if __name__ == '__main__':
    log_dir = 'models/hover/empty_world_small/finalVec'
    stats_path = os.path.join(log_dir, "vec_normalize.pkl")
    env_id = 'CrazyflieObstacleEval-v0'

    # Load the agent
    model = PPO2.load(log_dir + '/ppo2_final')

    # Load the saved statistics
    env = DummyVecEnv([
        lambda: gym.make(env_id, n_obstacles=1, avoidance_method='Heuristic')
    ])
    env = VecNormalize.load(stats_path, env)
    # Do not update the running statistics at test time
    env.training = False
    # Reward normalization is not needed at test time
    env.norm_reward = False

    eval_episodes = 50
    total_goals_reached = 0
    total_collisions = 0
    total_flips = 0
    total_steps_exceeded = 0
    total_potential_collisions = 0
    total_collisions_avoided = 0
    total_timesteps = 0

    # Observe trained agent
    for i_episode in range(eval_episodes):
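        # Illustrative sketch only: the original loop body is truncated here.
        # How the counters above (goals reached, collisions, flips, ...) are
        # updated depends on the info dict of CrazyflieObstacleEval-v0, which
        # this snippet does not show.
        obs = env.reset()
        done = False
        while not done:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            total_timesteps += 1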