Example #1
            normalize = True
        del hyperparams['normalize']

    if 'policy_kwargs' in hyperparams.keys():
        # Convert to python object if needed
        if isinstance(hyperparams['policy_kwargs'], str):
            hyperparams['policy_kwargs'] = eval(hyperparams['policy_kwargs'])
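        # Note: eval() on YAML-provided strings is convenient but unsafe for
        # untrusted configs; ast.literal_eval is a stricter alternative when
        # the value uses literal syntax such as "{'net_arch': [64, 64]}"
        # (it rejects call forms like "dict(net_arch=[64, 64])").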

    # Delete keys so the dict can be passed to the model constructor
    if 'n_envs' in hyperparams.keys():
        del hyperparams['n_envs']
    del hyperparams['n_timesteps']

    # Obtain a class object from a wrapper name string in hyperparams
    # and delete the entry
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']
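    # A typical entry this consumes from the YAML file (hypothetical example):
    #   env_wrapper: utils.wrappers.TimeFeatureWrapper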

    log_path = f"{args.log_folder}/{args.algo}/"
    save_path = os.path.join(
        log_path,
        f"{env_id}_{get_latest_run_id(log_path, env_id) + 1}{uuid_str}")
    params_path = f"{save_path}/{env_id}"
    os.makedirs(params_path, exist_ok=True)

    callbacks = get_callback_class(hyperparams)
    if 'callback' in hyperparams.keys():
        del hyperparams['callback']

    if args.save_freq > 0:
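
The snippet is cut off here. In rl-baselines-zoo-style scripts this branch typically appends a checkpoint callback; a minimal sketch of the usual continuation, assuming stable_baselines' CheckpointCallback (imported in the full script) and the n_envs value read from the hyperparams before deletion:

        # Sketch only, not part of the original snippet
        save_freq = max(args.save_freq // n_envs, 1)  # account for parallel envs
        callbacks.append(CheckpointCallback(save_freq=save_freq,
                                            save_path=save_path,
                                            name_prefix='rl_model',
                                            verbose=1))
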
Example #2
                normalize_kwargs = eval(normalize)
                normalize = True
            del hyperparams['normalize']

        if 'policy_kwargs' in hyperparams.keys():
            # Convert to python object if needed
            if isinstance(hyperparams['policy_kwargs'], str):
                hyperparams['policy_kwargs'] = eval(
                    hyperparams['policy_kwargs'])

        # Delete keys so the dict can be passed to the model constructor
        if 'n_envs' in hyperparams.keys():
            del hyperparams['n_envs']
        del hyperparams['n_timesteps']

        # Obtain a class object from a wrapper name string in hyperparams
        # and delete the entry
        if 'Fetch' in args.env[0]:
            env_wrapper = get_wrapper_class(
                {'env_wrapper': 'utils.wrappers.DoneOnSuccessWrapper'})
        else:
            env_wrapper = None
        if 'env_wrapper' in hyperparams.keys():
            del hyperparams['env_wrapper']

        def create_env(n_envs):
            """
            Create the environment and wrap it if necessary
            :param n_envs: (int)
            :return: (gym.Env)
            """
            global hyperparams

            if is_atari:
                if args.verbose > 0:
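
The snippet is truncated inside the Atari branch. In stable-baselines code this branch usually builds the vectorized env with make_atari_env and stacks frames; a minimal sketch, assuming stable_baselines.common.cmd_util.make_atari_env and stable_baselines.common.vec_env.VecFrameStack:

                    print("Using Atari wrapper")
                env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
                # Standard Atari preprocessing: stack the last 4 frames
                env = VecFrameStack(env, n_stack=4)
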
Example #3
def main():

    args = get_args()
    choose_device(args.device)
    set_global_seeds(args.seed)

    env_id = args.env
    exp_id = args.exp_id
    algo = args.algo
    env_name = env_id[:-3]
    env_index = env_list.index(env_id)

    # Pass CustomEnv arguments: follow this pattern for your CustomEnv if the reward is not known prior to training
    env_kwargs = {} if args.env_kwargs is None else args.env_kwargs
    if (args.env_kwargs is not None) and (env_id in ['AirSim-v0']):
        if 'rew_land' in env_kwargs:
            if (int(env_kwargs['rew_land']) in [500, 1000, 10000]):
                env_success[-1] = int(env_kwargs['rew_land'])
            else:
                raise ValueError(
                    'Given env reward not acceptable. Please try again')

    params = [exp_id, env_name.lower()]
    folder = [exp_id, env_name.lower(), args.algo.lower()]
    tensorboard_path, monitor_path, callback_path = None, None, None

    if args.tensorboard:
        tensorboard_path = "tensorboard/{}_{}".format(*params)
        make_dir(tensorboard_path)

    # if args.train_RL: # Begin training here (location of this condition also decides experiment performance)

    # Load hyperparameters from yaml file
    with open('hyperparams/{}.yml'.format(args.algo), 'r') as f:
        hyperparams_dict = yaml.safe_load(f)
        if env_id in list(hyperparams_dict.keys()):
            hyperparams = hyperparams_dict[env_id]
        else:
            raise ValueError("Hyperparameters not found for {}-{}".format(
                args.algo, env_id))
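    # The YAML file is expected to map env ids to hyperparameter dicts,
    # e.g. (hypothetical layout):
    #   AirSim-v0:
    #     n_envs: 4
    #     n_timesteps: !!float 1e5
    #     policy: 'MlpPolicy'
    #     policy_kwargs: "dict(net_arch=[64, 64])"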

    if args.hyperparams is not None:
        # Overwrite hyperparams if needed
        hyperparams.update(args.hyperparams)

    # OPTIONAL: Print saved hyperparams
    saved_hyperparams = OrderedDict([(key, hyperparams[key])
                                     for key in sorted(hyperparams.keys())])
    if args.verbose > 0:
        pprint(saved_hyperparams)

    if args.n_envs > 1:
        # if args.verbose:
        print("Overwriting n_envs with n={}".format(args.n_envs))
        n_envs = args.n_envs
    else:
        n_envs = hyperparams.get('n_envs', 1)

    # choose Monitor log path according to multiprocessing setting
    if args.monitor:
        if n_envs == 1:
            monitor_path = 'logs/single/{}_{}_{}'.format(*folder)
        elif algo not in ['dqn', 'her', 'sac', 'td3']:
            monitor_path = 'logs/multi/{}_{}_{}'.format(*folder)
        if monitor_path is not None:  # stays None for single-process-only algos
            make_dir(monitor_path)

    if int(float(args.timesteps_RL)) > 0:
        # if args.verbose:
        print("Overwriting n_timesteps with n={}".format(
            int(float(args.timesteps_RL))))
        n_timesteps = int(float(args.timesteps_RL))
    else:
        n_timesteps = int(hyperparams['n_timesteps'])

    # Convert to python object if needed
    if 'policy_kwargs' in hyperparams.keys() and isinstance(
            hyperparams['policy_kwargs'], str):
        hyperparams['policy_kwargs'] = eval(hyperparams['policy_kwargs'])

    if 'n_envs' in hyperparams.keys():
        del hyperparams['n_envs']
    del hyperparams['n_timesteps']  # avoid passing an unexpected kwarg to the model constructor

    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    # if (algo=='ppo2' and ('learning_rate' in hyperparams.keys())):
    #     hyperparams['learning_rate'] = linear_schedule(hyperparams['learning_rate'])

    def create_env(n_envs, eval_env=False):
        if algo in ['a2c', 'acer', 'acktr', 'ppo2']:
            if n_envs > 1:
                env = SubprocVecEnv([
                    make_env(env_id,
                             i,
                             args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs) for i in range(n_envs)
                ])
            else:
                env = DummyVecEnv([
                    make_env(env_id,
                             0,
                             args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs)
                ])
        elif ((algo in ['dqn', 'her', 'sac', 'td3']) and n_envs > 1):
            raise ValueError(
                "Error: {} does not support multiprocessing!".format(algo))
        elif ((algo in ['ddpg', 'ppo1', 'trpo', 'gail']) and n_envs > 1):
            raise ValueError(
                "Error: {} uses MPI for multiprocessing!".format(algo))
        else:
            env = make_vec_env(env_id,
                               n_envs=n_envs,
                               seed=args.seed,
                               monitor_dir=monitor_path,
                               wrapper_class=env_wrapper,
                               env_kwargs=env_kwargs)

        if args.normalize:  # choose from multiple options
            # env = VecNormalize(env, clip_obs=np.inf)
            env = VecNormalize(env, norm_reward=False, clip_obs=np.inf)
            # env = VecNormalize(env, norm_reward=False, clip_obs=np.inf, **normalize_kwargs)
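            # norm_reward=False normalizes observations only, keeping raw
            # rewards for logging; the running statistics are saved after
            # training (env.save) and reloaded for evaluation below.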
        return env

    # Zoo: env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs) for i in range(n_envs)])
    # Zoo: env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs)])
    env = create_env(n_envs)

    # if args.train_RL: # checking impact of the if-condition position on experiment reproducibility

    callback, callback_path = [], "callbacks/{}_{}_{}".format(*folder)
    save_freq = eval_freq = 100 * episode_len[env_index]
    save_freq = max(save_freq // n_envs, 1)  # account for parallel environments
    eval_freq = max(eval_freq // n_envs, 1)
    make_dir(callback_path)
    if args.check_callback:
        callback.append(
            CheckpointCallback(save_freq=save_freq,
                               save_path=callback_path,
                               name_prefix='rl_model',
                               verbose=1))
    if args.eval_callback:
        callback.append(
            EvalCallback(create_env(1, eval_env=True),
                         best_model_save_path=callback_path,
                         log_path=callback_path,
                         eval_freq=eval_freq,
                         verbose=1))

    model = (algo_list[args.algo])(env=env,
                                   seed=args.seed,
                                   tensorboard_log=tensorboard_path,
                                   n_cpu_tf_sess=1,
                                   verbose=args.verbose,
                                   **hyperparams)
    print('\nTraining {} on {} now... \n'.format(algo, env_id))

    start_time = time.time()
    model.learn(total_timesteps=n_timesteps, callback=callback)
    total_time = time.time() - start_time

    if args.normalize:
        env.save(os.path.join(callback_path, "vec_normalize.pkl"))

    if n_envs > 1 or (algo in ['ddpg', 'trpo', 'gail']):
        print("Took {:.2f}s for multiprocessed version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))
    else:
        print("Took {:.2f}s for single process version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))

    env = DummyVecEnv([make_env(env_id, 0, args.seed, env_kwargs=env_kwargs)])

    if args.normalize:
        env = VecNormalize.load(
            os.path.join(callback_path, "vec_normalize.pkl"), env)
        env.training = False
        env.norm_reward = False
        env.seed(args.seed)

    # Evaluate RL model - choose either best model or last available model
    model = (algo_list[algo]).load(os.path.join(callback_path, 'best_model'))
    # model = (algo_list[algo]).load("models/{}_{}_{}".format(*folder))
    model.set_env(env)
    evaluate('policy', model, env_id, env, algo, 100)

    if args.monitor:
        results_plotter.plot_results([monitor_path], n_timesteps,
                                     results_plotter.X_TIMESTEPS,
                                     "{} {}".format(algo, env_id))
        plot_results(monitor_path)

    if args.test:
        print('\nTesting policy...\n')
        # Initialize the counters used by the loop below
        episode_reward, total_reward = 0.0, 0.0
        done_count, success_count = 0, 0
        obs = env.reset()
        for _ in range(n_timesteps):
            action, _states = model.predict(obs, deterministic=True)
            if isinstance(env.action_space, gym.spaces.Box):
                action = np.clip(action, env.action_space.low,
                                 env.action_space.high)
            obs, rewards, dones, info = env.step(action)
            episode_reward += rewards
            env.render()
            if dones:
                done_count += 1
                success_count = check_success(env_index, env_success,
                                              success_count)
                total_reward += episode_reward
                episode_reward = 0
                env.reset()
        print('\n{}/{} successful episodes'.format(success_count, done_count))
        average_reward = total_reward / max(done_count, 1)  # guard against zero finished episodes
        print('\nAverage reward: {}'.format(average_reward))
        env.close()
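
All three examples call a make_env helper imported from the zoo-style utils module. A minimal sketch of such a factory, assuming stable_baselines.bench.Monitor and stable_baselines.common.set_global_seeds, with the signature inferred from the call sites above:

import os

import gym
from stable_baselines.bench import Monitor
from stable_baselines.common import set_global_seeds


def make_env(env_id, rank=0, seed=0, log_dir=None,
             wrapper_class=None, env_kwargs=None):
    # Return a thunk that builds one env, as DummyVecEnv/SubprocVecEnv expect
    env_kwargs = {} if env_kwargs is None else env_kwargs

    def _init():
        set_global_seeds(seed + rank)
        env = gym.make(env_id, **env_kwargs)
        env.seed(seed + rank)
        if wrapper_class is not None:
            env = wrapper_class(env)  # gym wrapper resolved from the YAML entry
        if log_dir is not None:
            env = Monitor(env, os.path.join(log_dir, str(rank)))
        return env

    return _init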