# NOTE(review): whitespace-mangled fragment — many statements collapsed onto
# one line, and the enclosing function's `def` is not visible here. It appears
# to be hyperparameter preprocessing (policy_kwargs eval, key deletion,
# wrapper/callback extraction) plus save-path setup, duplicating logic that
# also appears inside main() below. The trailing `if args.save_freq > 0:` is
# truncated. Left byte-identical; needs re-indenting against the original
# source before it can run — TODO confirm against upstream file.
normalize = True del hyperparams['normalize'] if 'policy_kwargs' in hyperparams.keys(): # Convert to python object if needed if isinstance(hyperparams['policy_kwargs'], str): hyperparams['policy_kwargs'] = eval(hyperparams['policy_kwargs']) # Delete keys so the dict can be pass to the model constructor if 'n_envs' in hyperparams.keys(): del hyperparams['n_envs'] del hyperparams['n_timesteps'] # obtain a class object from a wrapper name string in hyperparams # and delete the entry env_wrapper = get_wrapper_class(hyperparams) if 'env_wrapper' in hyperparams.keys(): del hyperparams['env_wrapper'] log_path = f"{args.log_folder}/{args.algo}/" save_path = os.path.join( log_path, f"{env_id}_{get_latest_run_id(log_path, env_id) + 1}{uuid_str}") params_path = f"{save_path}/{env_id}" os.makedirs(params_path, exist_ok=True) callbacks = get_callback_class(hyperparams) if 'callback' in hyperparams.keys(): del hyperparams['callback'] if args.save_freq > 0:
# NOTE(review): second whitespace-mangled fragment, overlapping the previous
# one (same policy_kwargs/n_envs/n_timesteps preprocessing) but with a
# Fetch-specific DoneOnSuccessWrapper branch and the start of a nested
# `create_env(n_envs)` helper. It is cut off mid-body at
# `if is_atari: if args.verbose > 0:`, so its structure cannot be
# reconstructed from this view. Also `normalize_kwargs = eval(normalize)`
# evals a value set to True on the same line — presumably the original
# read `normalize` from hyperparams first; verify against upstream source.
# Left byte-identical.
normalize_kwargs = eval(normalize) normalize = True del hyperparams['normalize'] if 'policy_kwargs' in hyperparams.keys(): hyperparams['policy_kwargs'] = eval(hyperparams['policy_kwargs']) # Delete keys so the dict can be pass to the model constructor if 'n_envs' in hyperparams.keys(): del hyperparams['n_envs'] del hyperparams['n_timesteps'] # obtain a class object from a wrapper name string in hyperparams # and delete the entry if 'Fetch' in args.env[0]: env_wrapper = get_wrapper_class( {'env_wrapper': 'utils.wrappers.DoneOnSuccessWrapper'}) else: env_wrapper = None if 'env_wrapper' in hyperparams.keys(): del hyperparams['env_wrapper'] def create_env(n_envs): """ Create the environment and wrap it if necessary :param n_envs: (int) :return: (gym.Env) """ global hyperparams if is_atari: if args.verbose > 0:
def main():
    """Train an RL agent on ``args.env`` with ``args.algo``, then evaluate it.

    Workflow: parse CLI args, load per-env hyperparameters from
    ``hyperparams/<algo>.yml``, build a (possibly vectorized / normalized)
    environment, train with optional checkpoint/eval callbacks, save
    VecNormalize statistics, reload the best model for evaluation, and
    optionally run a rendered test rollout.
    """
    args = get_args()
    choose_device(args.device)
    set_global_seeds(args.seed)

    env_id = args.env
    exp_id = args.exp_id
    algo = args.algo
    env_name = env_id[:-3]  # strip the trailing "-vN" version suffix
    env_index = env_list.index(env_id)

    # Pass CustomEnv arguments: follow this for your CustomEnv if reward not
    # known prior to training
    env_kwargs = {} if args.env_kwargs is None else args.env_kwargs
    if (args.env_kwargs is not None) and (env_id in ['AirSim-v0']):
        if 'rew_land' in env_kwargs:
            if (int(env_kwargs['rew_land']) in [500, 1000, 10000]):
                env_success[-1] = int(env_kwargs['rew_land'])
            else:
                raise ValueError(
                    'Given env reward not acceptable. Please try again')

    params = [exp_id, env_name.lower()]
    folder = [exp_id, env_name.lower(), args.algo.lower()]
    tensorboard_path, monitor_path, callback_path = None, None, None
    if args.tensorboard:
        tensorboard_path = "tensorboard/{}_{}".format(*params)
        make_dir(tensorboard_path)

    # Load hyperparameters from yaml file
    with open('hyperparams/{}.yml'.format(args.algo), 'r') as f:
        hyperparams_dict = yaml.safe_load(f)
        if env_id in list(hyperparams_dict.keys()):
            hyperparams = hyperparams_dict[env_id]
        else:
            raise ValueError("Hyperparameters not found for {}-{}".format(
                args.algo, env_id))

    if args.hyperparams is not None:
        # Overwrite hyperparams if needed
        hyperparams.update(args.hyperparams)

    # OPTIONAL: Print saved hyperparams
    saved_hyperparams = OrderedDict([(key, hyperparams[key])
                                     for key in sorted(hyperparams.keys())])
    if args.verbose > 0:
        pprint(saved_hyperparams)

    if args.n_envs > 1:
        print("Overwriting n_envs with n={}".format(args.n_envs))
        n_envs = args.n_envs
    else:
        n_envs = hyperparams.get('n_envs', 1)

    # choose Monitor log path according to multiprocessing setting
    if args.monitor:
        if n_envs == 1:
            monitor_path = 'logs/single/{}_{}_{}'.format(*folder)
        else:
            if algo not in ['dqn', 'her', 'sac', 'td3']:
                monitor_path = 'logs/multi/{}_{}_{}'.format(*folder)
        # FIX: monitor_path can remain None (n_envs > 1 with a
        # single-process-only algo); guard make_dir against that.
        if monitor_path is not None:
            make_dir(monitor_path)

    if int(float(args.timesteps_RL)) > 0:
        print("Overwriting n_timesteps with n={}".format(
            int(float(args.timesteps_RL))))
        n_timesteps = int(float(args.timesteps_RL))
    else:
        n_timesteps = int(hyperparams['n_timesteps'])

    # Convert to python object if needed.
    # NOTE(review): eval() executes arbitrary code from the YAML file;
    # acceptable only because hyperparams files are trusted local inputs.
    if 'policy_kwargs' in hyperparams.keys() and isinstance(
            hyperparams['policy_kwargs'], str):
        hyperparams['policy_kwargs'] = eval(hyperparams['policy_kwargs'])

    # Delete keys so the dict can be passed to the model constructor
    if 'n_envs' in hyperparams.keys():
        del hyperparams['n_envs']
    del hyperparams['n_timesteps']  # To avoid error

    # obtain a class object from a wrapper name string in hyperparams
    # and delete the entry
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    def create_env(n_envs, eval_env=False):
        """Create the (vectorized, optionally normalized) environment.

        :param n_envs: (int) number of parallel environments
        :param eval_env: (bool) kept for call-site compatibility; the eval
            env is currently built the same way as the training env
        :return: (VecEnv)
        """
        if algo in ['a2c', 'acer', 'acktr', 'ppo2']:
            if n_envs > 1:
                env = SubprocVecEnv([
                    make_env(env_id,
                             i,
                             args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs) for i in range(n_envs)
                ])
            else:
                env = DummyVecEnv([
                    make_env(env_id,
                             0,
                             args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs)
                ])
            # FIX: removed an unconditional
            #   env = DummyVecEnv([lambda: gym.make(env_id, **env_kwargs)])
            # (followed by re-applying env_wrapper to the VecEnv) that
            # clobbered the monitored, vectorized env built just above;
            # make_env already applies wrapper_class per worker.
        elif ((algo in ['dqn', 'her', 'sac', 'td3']) and n_envs > 1):
            raise ValueError(
                "Error: {} does not support multiprocessing!".format(algo))
        elif ((algo in ['ddpg', 'ppo1', 'trpo', 'gail']) and n_envs > 1):
            raise ValueError(
                "Error: {} uses MPI for multiprocessing!".format(algo))
        else:
            env = make_vec_env(env_id,
                               n_envs=n_envs,
                               seed=args.seed,
                               monitor_dir=monitor_path,
                               wrapper_class=env_wrapper,
                               env_kwargs=env_kwargs)
        if args.normalize:
            # normalize observations only; rewards stay raw so logged
            # returns remain comparable across runs
            env = VecNormalize(env, norm_reward=False, clip_obs=np.inf)
        return env

    env = create_env(n_envs)

    callback, callback_path = [], "callbacks/{}_{}_{}".format(*folder)
    # checkpoint/eval roughly every 100 episodes; callbacks count steps per
    # env, hence the division by n_envs (floored at 1)
    save_freq, eval_freq = 100 * episode_len[env_index], 100 * episode_len[
        env_index]
    save_freq, eval_freq = max(save_freq // n_envs,
                               1), max(eval_freq // n_envs, 1)
    make_dir(callback_path)
    if args.check_callback:
        callback.append(
            CheckpointCallback(save_freq=save_freq,
                               save_path=callback_path,
                               name_prefix='rl_model',
                               verbose=1))
    if args.eval_callback:
        callback.append(
            EvalCallback(create_env(1, eval_env=True),
                         best_model_save_path=callback_path,
                         log_path=callback_path,
                         eval_freq=eval_freq,
                         verbose=1))

    model = (algo_list[args.algo])(env=env,
                                   seed=args.seed,
                                   tensorboard_log=tensorboard_path,
                                   n_cpu_tf_sess=1,
                                   verbose=args.verbose,
                                   **hyperparams)

    print('\nTraining {} on {} now... \n'.format(algo, env_id))
    start_time = time.time()
    model.learn(total_timesteps=n_timesteps, callback=callback)
    total_time = time.time() - start_time

    if args.normalize:
        # persist normalization statistics so evaluation reuses the same
        # observation scaling
        env.save(os.path.join(callback_path, "vec_normalize.pkl"))

    if n_envs > 1 or (algo in ['ddpg', 'trpo', 'gail']):
        print("Took {:.2f}s for multiprocessed version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))
    else:
        print("Took {:.2f}s for single process version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))

    # Fresh single-process env for evaluation
    env = DummyVecEnv([make_env(env_id, 0, args.seed, env_kwargs=env_kwargs)])
    if args.normalize:
        env = VecNormalize.load(
            os.path.join(callback_path, "vec_normalize.pkl"), env)
        env.training = False
        env.norm_reward = False
    env.seed(args.seed)

    # Evaluate RL model - choose either best model or last available model
    model = (algo_list[algo]).load(os.path.join(callback_path, 'best_model'))
    # model = (algo_list[algo]).load("models/{}_{}_{}".format(*folder))
    model.set_env(env)
    evaluate('policy', model, env_id, env, algo, 100)

    if args.monitor:
        results_plotter.plot_results([monitor_path], n_timesteps,
                                     results_plotter.X_TIMESTEPS,
                                     "{} {}".format(algo, env_id))
        plot_results(monitor_path)

    if args.test:
        print('\nTesting policy...\n')
        # FIX: initialize rollout accumulators (previously referenced before
        # assignment, raising NameError on the first step/episode end)
        episode_reward, total_reward = 0, 0
        done_count, success_count = 0, 0
        obs = env.reset()
        for _ in range(n_timesteps):
            action, _states = model.predict(obs, deterministic=True)
            if isinstance(env.action_space, gym.spaces.Box):
                action = np.clip(action, env.action_space.low,
                                 env.action_space.high)
            obs, rewards, dones, info = env.step(action)
            episode_reward += rewards
            env.render()
            if dones:
                done_count += 1
                success_count = check_success(env_index, env_success,
                                              success_count)
                total_reward += episode_reward
                episode_reward = 0
                env.reset()
        print('\n{}/{} successful episodes'.format(success_count, done_count))
        # FIX: avoid ZeroDivisionError when no episode finished
        average_reward = total_reward / max(done_count, 1)
        print('\nAverage reward: {}'.format(average_reward))

    env.close()