Example #1
def test_train_disc_small_expert_data_warning(
    tmpdir, _algorithm_cls, _parallel
):
    logger.configure(tmpdir, ["tensorboard", "stdout"])
    venv = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
    )

    gen_algo = util.init_rl(venv, verbose=1)
    small_data = rollout.generate_transitions(gen_algo, venv, n_timesteps=20)

    with pytest.raises(ValueError, match="Transitions.*expert_batch_size"):
        _algorithm_cls(
            venv=venv,
            expert_data=small_data,
            expert_batch_size=21,
            gen_algo=gen_algo,
            log_dir=tmpdir,
        )

    with pytest.raises(ValueError, match="expert_batch_size.*positive"):
        _algorithm_cls(
            venv=venv,
            expert_data=small_data,
            expert_batch_size=-1,
            gen_algo=gen_algo,
            log_dir=tmpdir,
        )
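For contrast, here is a minimal sketch (not part of the original test) of a construction that should satisfy both checks, written as if appended to the test body so it reuses the venv, gen_algo, small_data, and tmpdir names set up above: the batch size must be positive and no larger than the number of sampled expert transitions.

    # Sketch only: at least 20 timesteps were generated above, so a positive
    # batch size of at most 20 is expected to construct without raising.
    trainer = _algorithm_cls(
        venv=venv,
        expert_data=small_data,
        expert_batch_size=20,
        gen_algo=gen_algo,
        log_dir=tmpdir,
    )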
Example #2
def trainer(_algorithm_cls, _parallel: bool, tmpdir: str,
            _convert_dataset: bool):
    logger.configure(tmpdir, ["tensorboard", "stdout"])
    trajs = types.load(
        "tests/data/expert_models/cartpole_0/rollouts/final.pkl")
    if _convert_dataset:
        trans = rollout.flatten_trajectories(trajs)
        expert_data = datasets.TransitionsDictDatasetAdaptor(trans)
    else:
        expert_data = rollout.flatten_trajectories(trajs)

    venv = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
        log_dir=tmpdir,
    )

    gen_policy = util.init_rl(venv, verbose=1)

    return _algorithm_cls(
        venv=venv,
        expert_data=expert_data,
        gen_policy=gen_policy,
        log_dir=tmpdir,
    )
Example #3
def test_density_trainer_smoke():
    # tests whether density trainer runs, not whether it's good
    # (it's actually really poor)
    env_name = "Pendulum-v0"
    rollout_path = "tests/data/expert_models/pendulum_0/rollouts/final.pkl"
    rollouts = types.load(rollout_path)[:2]
    env = util.make_vec_env(env_name, 2)
    imitation_trainer = util.init_rl(env)
    density_trainer = DensityTrainer(
        env,
        rollouts=rollouts,
        imitation_trainer=imitation_trainer,
        density_type=STATE_ACTION_DENSITY,
        is_stationary=False,
        kernel="gaussian",
    )
    density_trainer.train_policy(n_timesteps=2)
    density_trainer.test_policy(n_trajectories=2)
Example #4
def test_train_disc_small_expert_data_warning(
    tmpdir, _algorithm_cls, _parallel
):
    logger.configure(tmpdir, ["tensorboard", "stdout"])
    venv = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
        log_dir=tmpdir,
    )

    gen_algo = util.init_rl(venv, verbose=1)
    small_data = rollout.generate_transitions(gen_algo, venv, n_timesteps=20)

    with pytest.warns(RuntimeWarning, match="discriminator batch size"):
        _algorithm_cls(
            venv=venv,
            expert_data=small_data,
            gen_algo=gen_algo,
            log_dir=tmpdir,
        )
Example #5
def trainer(
    _algorithm_cls,
    _parallel: bool,
    tmpdir: str,
    _convert_dataset: bool,
    expert_batch_size: int,
    expert_transitions: types.Transitions,
):
    logger.configure(tmpdir, ["tensorboard", "stdout"])
    if _convert_dataset:
        expert_data = th_data.DataLoader(
            expert_transitions,
            batch_size=expert_batch_size,
            collate_fn=types.transitions_collate_fn,
            shuffle=True,
            drop_last=True,
        )
    else:
        expert_data = expert_transitions

    venv = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
        log_dir=tmpdir,
    )

    gen_algo = util.init_rl(venv, verbose=1)

    trainer = _algorithm_cls(
        venv=venv,
        expert_data=expert_data,
        expert_batch_size=expert_batch_size,
        gen_algo=gen_algo,
        log_dir=tmpdir,
    )

    try:
        yield trainer
    finally:
        venv.close()
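When `_convert_dataset` is true, the expert data above is a plain `torch.utils.data.DataLoader`. A rough sketch, not taken from the test suite, of peeking at one collated batch; treating the batch as a dict keyed by the `Transitions` fields (obs, acts, next_obs, dones) is an assumption about `types.transitions_collate_fn`:

# Illustration only, reusing the expert_data DataLoader built in the fixture.
batch = next(iter(expert_data))
for key, value in batch.items():  # assumes dict-style batches
    # Each entry should have expert_batch_size as its leading dimension.
    print(key, getattr(value, "shape", type(value)))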
Example #6
def test_density_trainer(density_type, is_stationary):
    env_name = "Pendulum-v0"
    rollout_path = "tests/data/expert_models/pendulum_0/rollouts/final.pkl"
    rollouts = types.load(rollout_path)
    env = util.make_vec_env(env_name, 2)
    imitation_trainer = util.init_rl(env)
    density_trainer = DensityTrainer(
        env,
        rollouts=rollouts,
        imitation_trainer=imitation_trainer,
        density_type=density_type,
        is_stationary=is_stationary,
        kernel="gaussian",
    )
    novice_stats = density_trainer.test_policy()
    density_trainer.train_policy(2000)
    good_stats = density_trainer.test_policy()
    # Novice is bad
    assert novice_stats["return_mean"] < -500
    # Density is also pretty bad, but shouldn't make things more than 50% worse.
    # Pendulum returns are negative, so e.g. a novice mean return of -800 requires
    # the trained policy to average above 1.5 * -800 = -1200.
    # It would be nice to have a less flaky/more meaningful test here.
    assert good_stats["return_mean"] > 1.5 * novice_stats["return_mean"]
Example #7
def rand_policy(venv):
    return util.init_rl(venv)
Example #8
def init_trainer(
    env_name: str,
    expert_trajectories: Sequence[types.Trajectory],
    *,
    log_dir: str,
    seed: int = 0,
    use_gail: bool = False,
    num_vec: int = 8,
    parallel: bool = False,
    max_episode_steps: Optional[int] = None,
    scale: bool = True,
    airl_entropy_weight: float = 1.0,
    discrim_kwargs: dict = {},
    reward_kwargs: dict = {},
    trainer_kwargs: dict = {},
    init_rl_kwargs: dict = {},
):
    """Builds an AdversarialTrainer, ready to be trained on expert demonstrations.

    Args:
      env_name: The string id of a gym environment.
      expert_trajectories: Demonstrations from expert.
      seed: Random seed.
      log_dir: Directory for logging output. Will generate a unique sub-directory
          within this directory for all output.
      use_gail: If True, then train using GAIL. If False, then train
          using AIRL.
      num_vec: The number of vectorized environments.
      parallel: If True, then use SubprocVecEnv; otherwise, DummyVecEnv.
      max_episode_steps: If specified, wraps VecEnv in TimeLimit wrapper with
          this episode length before returning.
      scale: If True, then scale input Tensors to the interval [0, 1].
      airl_entropy_weight: Only applicable for AIRL. The `entropy_weight`
          argument of `DiscrimNetAIRL.__init__`.
      trainer_kwargs: Arguments for the Trainer constructor.
      reward_kwargs: Arguments for the `*RewardNet` constructor.
      discrim_kwargs: Arguments for the `DiscrimNet*` constructor.
      init_rl_kwargs: Keyword arguments passed to `init_rl`,
          used to initialize the RL algorithm.
    """
    logger.configure(folder=log_dir, format_strs=["tensorboard", "stdout"])
    env = util.make_vec_env(
        env_name,
        num_vec,
        seed=seed,
        parallel=parallel,
        log_dir=log_dir,
        max_episode_steps=max_episode_steps,
    )
    gen_policy = util.init_rl(env, verbose=1, **init_rl_kwargs)

    if use_gail:
        discrim = discrim_net.DiscrimNetGAIL(env.observation_space,
                                             env.action_space,
                                             scale=scale,
                                             **discrim_kwargs)
    else:
        rn = BasicShapedRewardNet(env.observation_space,
                                  env.action_space,
                                  scale=scale,
                                  **reward_kwargs)
        discrim = discrim_net.DiscrimNetAIRL(
            rn, entropy_weight=airl_entropy_weight, **discrim_kwargs)

    expert_demos = rollout.flatten_trajectories(expert_trajectories)
    trainer = AdversarialTrainer(env,
                                 gen_policy,
                                 discrim,
                                 expert_demos,
                                 log_dir=log_dir,
                                 **trainer_kwargs)
    return trainer
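A rough usage sketch for `init_trainer` (not from the source): the rollout path copies the CartPole fixture used elsewhere in these examples, the log_dir value is hypothetical, and the final `train` call is an assumption about the `AdversarialTrainer` API.

# Hypothetical example: build and train a GAIL trainer from saved expert data.
expert_trajs = types.load(
    "tests/data/expert_models/cartpole_0/rollouts/final.pkl")
gail_trainer = init_trainer(
    "CartPole-v1",
    expert_trajs,
    log_dir="output/gail_example",  # hypothetical output directory
    use_gail=True,
    num_vec=2,
)
gail_trainer.train(total_timesteps=2048)  # assumed training entry point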
Example #9
def rollouts_and_policy(
    _run,
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: str,
    num_vec: int,
    parallel: bool,
    max_episode_steps: Optional[int],
    normalize: bool,
    normalize_kwargs: dict,
    init_rl_kwargs: dict,
    n_episodes_eval: int,
    reward_type: Optional[str],
    reward_path: Optional[str],
    rollout_save_interval: int,
    rollout_save_final: bool,
    rollout_save_n_timesteps: Optional[int],
    rollout_save_n_episodes: Optional[int],
    policy_save_interval: int,
    policy_save_final: bool,
    init_tensorboard: bool,
) -> dict:
    """Trains an expert policy from scratch and saves the rollouts and policy.

    Checkpoints:
      At applicable training steps `step` (where step is either an integer or
      "final"):

        - Policies are saved to `{log_dir}/policies/{step}/`.
        - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

    Args:
        env_name: The gym.Env name. Loaded as VecEnv.
        total_timesteps: Number of training timesteps in `model.learn()`.
        log_dir: The root directory to save metrics and checkpoints to.
        num_vec: Number of environments in VecEnv.
        parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
        max_episode_steps: If not None, then environments are wrapped by
            TimeLimit so that they have at most `max_episode_steps` steps per
            episode.
        normalize: If True, then rescale observations and reward.
        normalize_kwargs: kwargs for `VecNormalize`.
        init_rl_kwargs: kwargs for `init_rl`.

        n_episodes_eval: The number of episodes to average over when calculating
            the average ground truth reward return of the final policy.

        reward_type: If provided, then load the serialized reward of this type,
            wrapping the environment in this reward. This is useful to test
            whether a reward model transfers. For more information, see
            `imitation.rewards.serialize.load_reward`.
        reward_path: A specifier, such as a path to a file on disk, used by
            reward_type to load the reward model. For more information, see
            `imitation.rewards.serialize.load_reward`.

        rollout_save_interval: The number of training updates in between
            intermediate rollout saves. If the argument is nonpositive, then
            don't save intermediate updates.
        rollout_save_final: If True, then save rollouts right after training is
            finished.
        rollout_save_n_timesteps: The minimum number of timesteps saved in every
            file. Could be more than `rollout_save_n_timesteps` because
            trajectories are saved by episode rather than by transition.
            Must set exactly one of `rollout_save_n_timesteps`
            and `rollout_save_n_episodes`.
        rollout_save_n_episodes: The number of episodes saved in every
            file. Must set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.

        policy_save_interval: The number of training updates between saves. Has
            the same semantics as `rollout_save_interval`.
        policy_save_final: If True, then save the policy right after training is
            finished.

        init_tensorboard: If True, then write tensorboard logs to {log_dir}/sb_tb
            and "output/summary/...".

    Returns:
      The return value of `rollout_stats()` using the final policy.
    """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    sample_until = rollout.make_sample_until(rollout_save_n_timesteps,
                                             rollout_save_n_episodes)
    eval_sample_until = rollout.min_episodes(n_episodes_eval)

    with networks.make_session():
        tf.logging.set_verbosity(tf.logging.INFO)
        logger.configure(folder=osp.join(log_dir, "rl"),
                         format_strs=["tensorboard", "stdout"])

        rollout_dir = osp.join(log_dir, "rollouts")
        policy_dir = osp.join(log_dir, "policies")
        os.makedirs(rollout_dir, exist_ok=True)
        os.makedirs(policy_dir, exist_ok=True)

        if init_tensorboard:
            sb_tensorboard_dir = osp.join(log_dir, "sb_tb")
            # Convert sacred's ReadOnlyDict to dict so we can modify on next line.
            init_rl_kwargs = dict(init_rl_kwargs)
            init_rl_kwargs["tensorboard_log"] = sb_tensorboard_dir

        venv = util.make_vec_env(
            env_name,
            num_vec,
            seed=_seed,
            parallel=parallel,
            log_dir=log_dir,
            max_episode_steps=max_episode_steps,
        )

        log_callbacks = []
        with contextlib.ExitStack() as stack:
            if reward_type is not None:
                reward_fn_ctx = load_reward(reward_type, reward_path, venv)
                reward_fn = stack.enter_context(reward_fn_ctx)
                venv = RewardVecEnvWrapper(venv, reward_fn)
                log_callbacks.append(venv.log_callback)
                tf.logging.info(
                    f"Wrapped env in reward {reward_type} from {reward_path}.")

            vec_normalize = None
            if normalize:
                venv = vec_normalize = VecNormalize(venv, **normalize_kwargs)

            policy = util.init_rl(venv, verbose=1, **init_rl_kwargs)

            # Make callback to save intermediate artifacts during training.
            step = 0

            def callback(locals_: dict, _) -> bool:
                nonlocal step
                step += 1
                policy = locals_["self"]

                # TODO(adam): make logging frequency configurable
                for callback in log_callbacks:
                    callback(sb_logger)

                if rollout_save_interval > 0 and step % rollout_save_interval == 0:
                    save_path = osp.join(rollout_dir, f"{step}.pkl")
                    rollout.rollout_and_save(save_path, policy, venv,
                                             sample_until)
                if policy_save_interval > 0 and step % policy_save_interval == 0:
                    output_dir = os.path.join(policy_dir, f"{step:05d}")
                    serialize.save_stable_model(output_dir, policy,
                                                vec_normalize)

            policy.learn(total_timesteps, callback=callback)

            # Save final artifacts after training is complete.
            if rollout_save_final:
                save_path = osp.join(rollout_dir, "final.pkl")
                rollout.rollout_and_save(save_path, policy, venv, sample_until)
            if policy_save_final:
                output_dir = os.path.join(policy_dir, "final")
                serialize.save_stable_model(output_dir, policy, vec_normalize)

            # Final evaluation of expert policy.
            trajs = rollout.generate_trajectories(policy, venv,
                                                  eval_sample_until)
            stats = rollout.rollout_stats(trajs)

    return stats
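Per the checkpoint layout in the docstring, the final rollouts land in `{log_dir}/rollouts/final.pkl`. A minimal follow-up sketch (the log_dir value is hypothetical) of reading them back with the same helpers used throughout these examples:

# Sketch: reload the saved expert rollouts and summarize them.
final_trajs = types.load(osp.join("output/expert", "rollouts", "final.pkl"))
print(rollout.rollout_stats(final_trajs))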
Example #10
def train(
    _run,
    _seed: int,
    algorithm: str,
    env_name: str,
    num_vec: int,
    parallel: bool,
    max_episode_steps: Optional[int],
    rollout_path: str,
    n_expert_demos: Optional[int],
    log_dir: str,
    total_timesteps: int,
    n_episodes_eval: int,
    init_tensorboard: bool,
    checkpoint_interval: int,
    gen_batch_size: int,
    init_rl_kwargs: Mapping,
    algorithm_kwargs: Mapping[str, Mapping],
    discrim_net_kwargs: Mapping[str, Mapping],
) -> dict:
    """Train an adversarial-network-based imitation learning algorithm.

    Checkpoints:
        - DiscrimNets are saved to `f"{log_dir}/checkpoints/{step}/discrim/"`,
            where step is either the training round or "final".
        - Generator policies are saved to `f"{log_dir}/checkpoints/{step}/gen_policy/"`.

    Args:
        _seed: Random seed.
        algorithm: A case-insensitive string determining which adversarial imitation
            learning algorithm is executed. Either "airl" or "gail".
        env_name: The environment to train in.
        num_vec: Number of `gym.Env` to vectorize.
        parallel: Whether to use "true" parallelism. If True, then use `SubprocVecEnv`.
            Otherwise, use `DummyVecEnv` which steps through environments serially.
        max_episode_steps: If not None, then a TimeLimit wrapper is applied to each
            environment to artificially limit the maximum number of timesteps in an
            episode.
        rollout_path: Path to pickle containing list of Trajectories. Used as
            expert demonstrations.
        n_expert_demos: The number of expert trajectories to actually use
            after loading them from `rollout_path`.
            If None, then use all available trajectories.
            If `n_expert_demos` is an `int`, then use exactly `n_expert_demos`
            trajectories, erroring if there aren't enough trajectories. If there are
            surplus trajectories, then use the first `n_expert_demos` trajectories and
            drop the rest.
        log_dir: Directory to save models and other logging to.
        total_timesteps: The number of transitions to sample from the environment
            during training.
        n_episodes_eval: The number of episodes to average over when calculating
            the average episode reward of the imitation policy for return.
        init_tensorboard: If True, then write tensorboard logs to `{log_dir}/sb_tb`.
        checkpoint_interval: Save the discriminator and generator models every
            `checkpoint_interval` rounds and after training is complete. If 0,
            then only save weights after training is complete. If <0, then don't
            save weights at all.
        gen_batch_size: Batch size for generator updates. Sacred automatically uses
            this to calculate `n_steps` in `init_rl_kwargs`. In the script body, this
            is only used in sanity checks.
        init_rl_kwargs: Keyword arguments for `init_rl`, the RL algorithm initialization
            utility function.
        algorithm_kwargs: Keyword arguments for the `GAIL` or `AIRL` constructor
            that can apply to either constructor. Unlike a regular kwargs argument, this
            argument can only have the following keys: "shared", "airl", and "gail".

            `algorithm_kwargs["airl"]`, if it is provided, is a kwargs `Mapping` passed
            to the `AIRL` constructor when `algorithm == "airl"`. Likewise
            `algorithm_kwargs["gail"]` is passed to the `GAIL` constructor when
            `algorithm == "gail"`. `algorithm_kwargs["shared"]`, if provided, is passed
            to both the `AIRL` and `GAIL` constructors. Duplicate keyword argument keys
            between `algorithm_kwargs["shared"]` and `algorithm_kwargs["airl"]` (or
            "gail") leads to an error.
        discrim_net_kwargs: Keyword arguments for the `DiscrimNet` constructor. Unlike a
            regular kwargs argument, this argument can only have the following keys:
            "shared", "airl", "gail". These keys have the same meaning as they do in
            `algorithm_kwargs`.

    Returns:
        A dictionary with two keys. "imit_stats" gives the return value of
        `rollout_stats()` on rollouts from the test-reward-wrapped environment, using
        the final policy (remember that the ground-truth reward can be recovered from the
        "monitor_return" key). "expert_stats" gives the return value of
        `rollout_stats()` on the expert demonstrations loaded from `rollout_path`.
    """
    if gen_batch_size % num_vec != 0:
        raise ValueError(
            f"num_vec={num_vec} must evenly divide gen_batch_size={gen_batch_size}."
        )

    allowed_keys = {"shared", "gail", "airl"}
    if not discrim_net_kwargs.keys() <= allowed_keys:
        raise ValueError(
            f"Invalid discrim_net_kwargs.keys()={discrim_net_kwargs.keys()}. "
            f"Allowed keys: {allowed_keys}"
        )
    if not algorithm_kwargs.keys() <= allowed_keys:
        raise ValueError(
            f"Invalid discrim_net_kwargs.keys()={algorithm_kwargs.keys()}. "
            f"Allowed keys: {allowed_keys}"
        )

    if not os.path.exists(rollout_path):
        raise ValueError(f"File at rollout_path={rollout_path} does not exist.")

    expert_trajs = types.load(rollout_path)
    if n_expert_demos is not None:
        if not len(expert_trajs) >= n_expert_demos:
            raise ValueError(
                f"Want to use n_expert_demos={n_expert_demos} trajectories, but only "
                f"{len(expert_trajs)} are available via {rollout_path}."
            )
        expert_trajs = expert_trajs[:n_expert_demos]
    expert_transitions = rollout.flatten_trajectories(expert_trajs)

    total_timesteps = int(total_timesteps)

    logging.info("Logging to %s", log_dir)
    logger.configure(log_dir, ["tensorboard", "stdout"])
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    venv = util.make_vec_env(
        env_name,
        num_vec,
        seed=_seed,
        parallel=parallel,
        log_dir=log_dir,
        max_episode_steps=max_episode_steps,
    )

    # if init_tensorboard:
    #     tensorboard_log = osp.join(log_dir, "sb_tb")
    # else:
    #     tensorboard_log = None

    gen_algo = util.init_rl(
        # FIXME(sam): ignoring tensorboard_log is a hack to prevent SB3 from
        # re-configuring the logger (SB3 issue #109). See init_rl() for details.
        # TODO(shwang): Let's get rid of init_rl after SB3 issue #109 is fixed?
        # Besides sidestepping #109, init_rl is just a stub function.
        venv,
        **init_rl_kwargs,
    )

    discrim_kwargs_shared = discrim_net_kwargs.get("shared", {})
    discrim_kwargs_algo = discrim_net_kwargs.get(algorithm, {})
    final_discrim_kwargs = dict(**discrim_kwargs_shared, **discrim_kwargs_algo)

    algorithm_kwargs_shared = algorithm_kwargs.get("shared", {})
    algorithm_kwargs_algo = algorithm_kwargs.get(algorithm, {})
    final_algorithm_kwargs = dict(
        **algorithm_kwargs_shared,
        **algorithm_kwargs_algo,
    )

    if algorithm.lower() == "gail":
        algo_cls = adversarial.GAIL
    elif algorithm.lower() == "airl":
        algo_cls = adversarial.AIRL
    else:
        raise ValueError(f"Invalid value algorithm={algorithm}.")

    trainer = algo_cls(
        venv=venv,
        expert_data=expert_transitions,
        gen_algo=gen_algo,
        log_dir=log_dir,
        discrim_kwargs=final_discrim_kwargs,
        **final_algorithm_kwargs,
    )

    def callback(round_num):
        if checkpoint_interval > 0 and round_num % checkpoint_interval == 0:
            save(trainer, os.path.join(log_dir, "checkpoints", f"{round_num:05d}"))

    trainer.train(total_timesteps, callback)

    # Save final artifacts.
    if checkpoint_interval >= 0:
        save(trainer, os.path.join(log_dir, "checkpoints", "final"))

    # Final evaluation of imitation policy.
    results = {}
    sample_until_eval = rollout.min_episodes(n_episodes_eval)
    trajs = rollout.generate_trajectories(
        trainer.gen_algo, trainer.venv_train_norm, sample_until=sample_until_eval
    )
    results["expert_stats"] = rollout.rollout_stats(expert_trajs)
    results["imit_stats"] = rollout.rollout_stats(trajs)
    return results
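To make the layered kwargs layout concrete, here is a hypothetical configuration sketch (the key under "shared" is illustrative) together with the same merge the script body performs; duplicate keys between "shared" and the selected algorithm's mapping raise a TypeError, matching the documented error.

# Hypothetical config: "shared" entries apply to both GAIL and AIRL,
# per-algorithm entries only when that algorithm is selected.
algorithm_kwargs = {
    "shared": {"expert_batch_size": 32},
    "gail": {},
    "airl": {},
}
algorithm = "gail"
final_algorithm_kwargs = dict(
    **algorithm_kwargs.get("shared", {}),
    **algorithm_kwargs.get(algorithm, {}),
)
# final_algorithm_kwargs == {"expert_batch_size": 32}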