Example #1
def trainer(_algorithm_cls, _parallel: bool, tmpdir: str,
            _convert_dataset: bool):
    logger.configure(tmpdir, ["tensorboard", "stdout"])
    trajs = types.load(
        "tests/data/expert_models/cartpole_0/rollouts/final.pkl")
    if _convert_dataset:
        trans = rollout.flatten_trajectories(trajs)
        expert_data = datasets.TransitionsDictDatasetAdaptor(trans)
    else:
        expert_data = rollout.flatten_trajectories(trajs)

    venv = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
        log_dir=tmpdir,
    )

    gen_policy = util.init_rl(venv, verbose=1)

    return _algorithm_cls(
        venv=venv,
        expert_data=expert_data,
        gen_policy=gen_policy,
        log_dir=tmpdir,
    )
Example #2
def trainer(request, session, venv):
    convert_dataset = request.param
    rollouts = types.load(ROLLOUT_PATH)
    data = rollout.flatten_trajectories(rollouts)
    if convert_dataset:
        data = datasets.TransitionsDictDatasetAdaptor(
            data, datasets.EpochOrderDictDataset)
    return bc.BC(venv.observation_space, venv.action_space, expert_data=data)
Example #3
def _load_all_demos(self):
    num_demos_by_round = []
    for round_num in range(self._last_loaded_round + 1, self.round_num + 1):
        round_dir = self._demo_dir_path_for_round(round_num)
        demo_paths = self._get_demo_paths(round_dir)
        self._all_demos.extend(_load_trajectory(p) for p in demo_paths)
        num_demos_by_round.append(len(demo_paths))
    tf.logging.info(f"Loaded {len(self._all_demos)} total")
    demo_transitions = rollout.flatten_trajectories(self._all_demos)
    return demo_transitions, num_demos_by_round
Example #4
def test_train_from_random_dict_dataset(venv):
    # make sure that we can construct BC instance & train from a RandomDictDataset
    rollouts = types.load(ROLLOUT_PATH)
    data = rollout.flatten_trajectories(rollouts)
    data = datasets.TransitionsDictDatasetAdaptor(data,
                                                  datasets.RandomDictDataset)
    trainer = bc.BC(venv.observation_space,
                    venv.action_space,
                    expert_data=data)
    trainer.train(n_epochs=1)
Example #5
def f(total_timesteps: int) -> types.Transitions:
    trajs = trajectory_callable(
        sample_until=rollout.min_timesteps(total_timesteps))
    trans = rollout.flatten_trajectories(trajs)
    assert len(trans) >= total_timesteps
    as_dict = dataclasses.asdict(trans)
    truncated = {
        k: arr[:total_timesteps]
        for k, arr in as_dict.items()
    }
    return dataclasses.replace(trans, **truncated)
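
# The asdict/replace pattern above truncates every field of the Transitions dataclass
# to the same length. A minimal, self-contained illustration of the same pattern on a
# toy dataclass (hypothetical; independent of imitation's Transitions):
import dataclasses

import numpy as np


@dataclasses.dataclass(frozen=True)
class Batch:
    obs: np.ndarray
    acts: np.ndarray


batch = Batch(obs=np.arange(10), acts=np.arange(10) * 2)
# Truncate every field to the first 4 entries, then rebuild the dataclass.
shortened = dataclasses.replace(
    batch, **{k: arr[:4] for k, arr in dataclasses.asdict(batch).items()})
assert len(shortened.obs) == len(shortened.acts) == 4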
Example #6
def test_potential_shaping_cycle(graph,
                                 session,
                                 venv,
                                 potential_cls,
                                 discount: float,
                                 num_episodes: int = 10) -> None:
    """Test that potential shaping is constant on any fixed-length cycle.

    Specifically, performs rollouts of a random policy in the environment.
    Fixes the starting state for each trajectory at the all-zero state.
    Then computes the episode returns and checks they're all equal.

    Requires the environment to be fixed-length; otherwise the episode returns will
    vary (except in the undiscounted case).
    """
    policy = base.RandomPolicy(venv.observation_space, venv.action_space)
    trajectories = rollout.generate_trajectories(
        policy, venv, sample_until=rollout.min_episodes(num_episodes))
    transitions = rollout.flatten_trajectories(trajectories)

    # Make initial state fixed as all-zero.
    # Note we don't need to change the final state, since `dones` being `True`
    # should force the potential to be zero at those states.
    obs = np.array(transitions.obs)
    idxs = np.where(transitions.dones)[0] + 1
    idxs = np.pad(idxs[:-1], (1, 0), "constant")
    obs[idxs, :] = 0
    transitions = dataclasses.replace(transitions, obs=obs)

    with graph.as_default(), session.as_default():
        reward_model = potential_cls(venv.observation_space,
                                     venv.action_space,
                                     discount=discount)
        session.run(tf.global_variables_initializer())
        rews = rewards.evaluate_models({"m": reward_model}, transitions)

    rets = rewards.compute_return_from_rews(rews,
                                            transitions.dones,
                                            discount=discount)["m"]
    if discount == 1.0:
        assert np.allclose(rets, 0.0, atol=1e-5)
    assert np.allclose(rets, np.mean(rets), atol=1e-5)
Example #7
def train_bc(env, experiment_path):
    """
    Train GAIL on rollouts in the experiment path, save checkpoints
    and evaluate those checkpoints

    Based on code here
    https://github.com/HumanCompatibleAI/imitation/blob/master/src/imitation/scripts/train_adversarial.py
    """
    rollout_file = os.path.join(experiment_path, ROLLOUTS_FILE)
    bc_model_directory = os.path.join(experiment_path, BC_MODEL_DIRECTORY)
    bc_log_directory = os.path.join(experiment_path, BC_LOG_DIRECTORY)
    if os.path.isdir(bc_log_directory):
        print("Skipping BC training (log directory exists)")
        return
    os.makedirs(bc_model_directory, exist_ok=True)
    os.makedirs(bc_log_directory, exist_ok=True)
    logger.configure(bc_log_directory)

    expert_trajs = types.load(rollout_file)
    expert_transitions = rollout.flatten_trajectories(expert_trajs)

    env = gym.make(env)

    trainer = BC(env.observation_space,
                 env.action_space,
                 expert_data=expert_transitions,
                 policy_class=MlpPolicy,
                 device="cpu",
                 ent_weight=0.0)

    env.close()

    def callback(locals):
        path = os.path.join(bc_model_directory,
                            "epoch_{}".format(locals["epoch_num"]))
        trainer.save_policy(path)

    trainer.save_policy(os.path.join(experiment_path, "start_bc"))
    trainer.train(BC_TRAIN_EPOCHS, on_epoch_end=callback)

    # Save trained policy
    trainer.save_policy(os.path.join(experiment_path, "final_bc"))
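
# Sketch only (not part of the original script): one way the saved checkpoints could
# be evaluated afterwards, reusing rollout helpers that also appear in Example #15
# below. Assumes the installed imitation version provides `bc.reconstruct_policy`;
# substitute your own policy loader otherwise.
from imitation.algorithms import bc
from imitation.data import rollout
from imitation.util import util


def evaluate_bc_checkpoint(checkpoint_path, env_name, n_episodes=15):
    eval_venv = util.make_vec_env(env_name, n_envs=2)
    policy = bc.reconstruct_policy(checkpoint_path)
    sample_until = rollout.min_episodes(n_episodes)
    return rollout.mean_return(policy, eval_venv, sample_until)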
Example #8
def trainer(batch_size, venv, expert_data_type):
    rollouts = types.load(ROLLOUT_PATH)
    trans = rollout.flatten_trajectories(rollouts)
    if expert_data_type == "data_loader":
        expert_data = th_data.DataLoader(
            trans,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=types.transitions_collate_fn,
        )
    elif expert_data_type == "ducktyped_data_loader":
        expert_data = DucktypedDataset(trans, batch_size)
    elif expert_data_type == "transitions":
        expert_data = trans
    else:  # pragma: no cover
        raise ValueError(expert_data_type)

    return bc.BC(
        venv.observation_space,
        venv.action_space,
        expert_data=expert_data,
    )
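
# For reference, the "ducktyped_data_loader" branch only assumes an iterable whose
# batches are dicts holding "obs" and "acts" arrays (see the BC comment in Example
# #12 below). A minimal sketch of such a loader (hypothetical; not the
# DucktypedDataset used by the fixture above):
import numpy as np


class MinimalTransitionsLoader:
    """Yields fixed-size {"obs", "acts"} batches from a Transitions-like object."""

    def __init__(self, transitions, batch_size):
        self.transitions = transitions
        self.batch_size = batch_size

    def __iter__(self):
        for start in range(0, len(self.transitions), self.batch_size):
            end = start + self.batch_size
            yield {
                "obs": np.asarray(self.transitions.obs[start:end]),
                "acts": np.asarray(self.transitions.acts[start:end]),
            }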
Example #9
def compute_return_of_models(
    models: Mapping[K, RewardModel],
    trajectories: Sequence[types.Trajectory],
    discount: float = 1.0,
) -> Mapping[K, np.ndarray]:
    """Computes the returns of each trajectory under each model.

    Args:
        models: A collection of reward models.
        trajectories: A sequence of trajectories.
        discount: The discount rate; defaults to undiscounted.

    Returns:
        A collection of NumPy arrays containing the returns from each model.
    """
    # Reward models are Markovian, so they operate on one timestep at a time,
    # expecting inputs of shape (batch_size, ) + {obs,act}_shape. Flatten the
    # trajectories to accommodate this.
    transitions = rollout.flatten_trajectories(trajectories)
    preds = evaluate_models(models, transitions)

    return compute_return_from_rews(preds, transitions.dones, discount)
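
# Usage sketch (the model names, trajectories, and discount below are placeholders):
# compare per-trajectory returns under two reward models.
import numpy as np

returns = compute_return_of_models(
    {"learned": learned_model, "ground_truth": gt_model},
    trajectories,
    discount=0.99,
)
# Each value is a NumPy array with one return per input trajectory.
correlation = np.corrcoef(returns["learned"], returns["ground_truth"])[0, 1]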
Example #10
def init_trainer(
    env_name: str,
    expert_trajectories: Sequence[types.Trajectory],
    *,
    log_dir: str,
    seed: int = 0,
    use_gail: bool = False,
    num_vec: int = 8,
    parallel: bool = False,
    max_episode_steps: Optional[int] = None,
    scale: bool = True,
    airl_entropy_weight: float = 1.0,
    discrim_kwargs: dict = {},
    reward_kwargs: dict = {},
    trainer_kwargs: dict = {},
    init_rl_kwargs: dict = {},
):
    """Builds an AdversarialTrainer, ready to be trained on expert demonstrations.

    Args:
      env_name: The string id of a gym environment.
      expert_trajectories: Demonstrations from expert.
      seed: Random seed.
      log_dir: Directory for logging output. Will generate a unique sub-directory
          within this directory for all output.
      use_gail: If True, then train using GAIL. If False, then train
          using AIRL.
      num_vec: The number of vectorized environments.
      parallel: If True, then use SubprocVecEnv; otherwise, DummyVecEnv.
      max_episode_steps: If specified, wraps VecEnv in TimeLimit wrapper with
          this episode length before returning.
      scale: If True, then scale input Tensors to the interval [0, 1].
      airl_entropy_weight: Only applicable for AIRL. The `entropy_weight`
          argument of `DiscrimNetAIRL.__init__`.
      trainer_kwargs: Arguments for the Trainer constructor.
      reward_kwargs: Arguments for the `*RewardNet` constructor.
      discrim_kwargs: Arguments for the `DiscrimNet*` constructor.
      init_rl_kwargs: Keyword arguments passed to `init_rl`,
          used to initialize the RL algorithm.
    """
    logger.configure(folder=log_dir, format_strs=["tensorboard", "stdout"])
    env = util.make_vec_env(
        env_name,
        num_vec,
        seed=seed,
        parallel=parallel,
        log_dir=log_dir,
        max_episode_steps=max_episode_steps,
    )
    gen_policy = util.init_rl(env, verbose=1, **init_rl_kwargs)

    if use_gail:
        discrim = discrim_net.DiscrimNetGAIL(env.observation_space,
                                             env.action_space,
                                             scale=scale,
                                             **discrim_kwargs)
    else:
        rn = BasicShapedRewardNet(env.observation_space,
                                  env.action_space,
                                  scale=scale,
                                  **reward_kwargs)
        discrim = discrim_net.DiscrimNetAIRL(
            rn, entropy_weight=airl_entropy_weight, **discrim_kwargs)

    expert_demos = rollout.flatten_trajectories(expert_trajectories)
    trainer = AdversarialTrainer(env,
                                 gen_policy,
                                 discrim,
                                 expert_demos,
                                 log_dir=log_dir,
                                 **trainer_kwargs)
    return trainer
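
# Hypothetical call (the rollout path and hyperparameters are placeholders):
expert_trajs = types.load(
    "tests/data/expert_models/cartpole_0/rollouts/final.pkl")
trainer = init_trainer(
    "CartPole-v1",
    expert_trajs,
    log_dir="output/airl_cartpole",
    use_gail=False,  # False selects AIRL, True selects GAIL.
    num_vec=8,
)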
Example #11
def convert_traj_to_coords_filtered(trajs: Sequence[types.Trajectory]):
    trans = rollout.flatten_trajectories(trajs)
    obs = trans.obs
    if filter_trans_by_act:
        obs = obs[trans.acts == act]
    return obs[:, 0], obs[:, 1]
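
# Possible use of the helper above (matplotlib is an assumption, not shown in the
# original source; `trajs` is a Sequence[types.Trajectory] loaded as in the other
# examples): scatter-plot the visited (x, y) coordinates.
import matplotlib.pyplot as plt

xs, ys = convert_traj_to_coords_filtered(trajs)
plt.scatter(xs, ys, s=2)
plt.xlabel("obs[0]")
plt.ylabel("obs[1]")
plt.show()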
Example #12
import pathlib
import pickle
import tempfile

from imitation.algorithms import adversarial, bc
from imitation.data import rollout
from imitation.util import logger, util

# Load pickled test demonstrations.
with open("tests/data/expert_models/cartpole_0/rollouts/final.pkl", "rb") as f:
    # This is a list of `imitation.data.types.Trajectory`, where
    # every instance contains observations and actions for a single expert
    # demonstration.
    trajectories = pickle.load(f)

# Convert List[types.Trajectory] to an instance of `imitation.data.types.Transitions`.
# This is a more general dataclass containing unordered
# (observation, actions, next_observation) transitions.
transitions = rollout.flatten_trajectories(trajectories)

venv = util.make_vec_env("CartPole-v1", n_envs=2)

tempdir = tempfile.TemporaryDirectory(prefix="quickstart")
tempdir_path = pathlib.Path(tempdir.name)
print(
    f"All Tensorboards and logging are being written inside {tempdir_path}/.")

# Train BC on expert data.
# BC also accepts as `expert_data` any PyTorch-style DataLoader that iterates over
# dictionaries containing observations and actions.
logger.configure(tempdir_path / "BC/")
bc_trainer = bc.BC(venv.observation_space,
                   venv.action_space,
                   expert_data=transitions)
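
# Continuation sketch (not part of the original snippet): train for one epoch and
# save the resulting policy, using the BC methods shown in Examples #4 and #7.
bc_trainer.train(n_epochs=1)
bc_trainer.save_policy(str(tempdir_path / "bc_policy.pt"))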
Example #13
def expert_transitions():
    trajs = types.load(
        "tests/data/expert_models/cartpole_0/rollouts/final.pkl")
    trans = rollout.flatten_trajectories(trajs)
    return trans
Example #14
def make_trainer():
    env_name = "CartPole-v1"
    env = util.make_vec_env(env_name, 2)
    rollouts = types.load(ROLLOUT_PATH)
    rollouts = rollout.flatten_trajectories(rollouts)
    return bc.BCTrainer(env, expert_demos=rollouts)
Example #15
def imitation_learning(expert_traj_path, imitation_algo_name, rl_algo_name,
                       env_name):
    # Load pickled expert demonstrations.
    with open(expert_traj_path, "rb") as f:
        # This is a list of `imitation.data.types.Trajectory`, where
        # every instance contains observations and actions for a single expert
        # demonstration.
        trajectories = pickle.load(f)
    # Convert List[types.Trajectory] to an instance of `imitation.data.types.Transitions`.
    # This is a more general dataclass containing unordered
    # (observation, actions, next_observation) transitions.
    transitions = rollout.flatten_trajectories(trajectories)

    venv = util.make_vec_env(env_name, n_envs=2)

    # tempdir = tempfile.TemporaryDirectory(prefix="il_results/{}_{}".format(rl_algo_name, env_name))
    # tempdir_path = pathlib.Path(tempdir.name)
    # print(f"All Tensorboards and logging are being written inside {tempdir_path}/.")
    log_path = "il_results/{}_{}/{}/".format(rl_algo_name, env_name,
                                             imitation_algo_name)

    if imitation_algo_name == 'BC':
        # Train BC on expert data.
        # BC also accepts as `expert_data` any PyTorch-style DataLoader that iterates over
        # dictionaries containing observations and actions.
        logger.configure(log_path, format_strs=["stdout", "tensorboard"])
        trainer = bc.BC(venv.observation_space,
                        venv.action_space,
                        expert_data=transitions)
        trainer.train(n_epochs=100, log_interval=1)

    elif imitation_algo_name == 'GAIL':
        logger.configure(log_path, format_strs=["stdout", "tensorboard"])
        gail_trainer = adversarial.GAIL(
            venv,
            expert_data=transitions,
            expert_batch_size=32,
            gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
            discrim_kwargs={
                'discrim_net':
                ActObsMLP(
                    action_space=venv.action_space,
                    observation_space=venv.observation_space,
                    hid_sizes=(32, 32),
                )
            })
        gail_trainer.train(total_timesteps=2048)
        trainer = gail_trainer.gen_algo
    elif imitation_algo_name == 'AIRL':
        # Train AIRL on expert data.
        logger.configure(log_path)
        airl_trainer = adversarial.AIRL(
            venv,
            expert_data=transitions,
            expert_batch_size=32,
            gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
        )
        airl_trainer.train(total_timesteps=2048)
        trainer = airl_trainer.gen_algo

    sample_until = rollout.min_episodes(15)
    trained_ret_mean = rollout.mean_return(trainer.policy, venv, sample_until)
    # trainer.save_policy("{}/bc_policy.pth.tar".format(log_path))
    th.save(trainer.policy,
            "{}/{}_policy.pth.tar".format(log_path, imitation_algo_name))

    return trained_ret_mean
Example #16
def train(
    _run,
    _seed: int,
    algorithm: str,
    env_name: str,
    num_vec: int,
    parallel: bool,
    max_episode_steps: Optional[int],
    rollout_path: str,
    n_expert_demos: Optional[int],
    log_dir: str,
    total_timesteps: int,
    n_episodes_eval: int,
    init_tensorboard: bool,
    checkpoint_interval: int,
    gen_batch_size: int,
    init_rl_kwargs: Mapping,
    algorithm_kwargs: Mapping[str, Mapping],
    discrim_net_kwargs: Mapping[str, Mapping],
) -> dict:
    """Train an adversarial-network-based imitation learning algorithm.

    Checkpoints:
        - DiscrimNets are saved to `f"{log_dir}/checkpoints/{step}/discrim/"`,
            where step is either the training round or "final".
        - Generator policies are saved to `f"{log_dir}/checkpoints/{step}/gen_policy/"`.

    Args:
        _seed: Random seed.
        algorithm: A case-insensitive string determining which adversarial imitation
            learning algorithm is executed. Either "airl" or "gail".
        env_name: The environment to train in.
        num_vec: Number of `gym.Env` to vectorize.
        parallel: Whether to use "true" parallelism. If True, then use `SubprocVecEnv`.
            Otherwise, use `DummyVecEnv` which steps through environments serially.
        max_episode_steps: If not None, then a TimeLimit wrapper is applied to each
            environment to artificially limit the maximum number of timesteps in an
            episode.
        rollout_path: Path to pickle containing list of Trajectories. Used as
            expert demonstrations.
        n_expert_demos: The number of expert trajectories to actually use
            after loading them from `rollout_path`.
            If None, then use all available trajectories.
            If `n_expert_demos` is an `int`, then use exactly `n_expert_demos`
            trajectories, erroring if there aren't enough trajectories. If there are
            surplus trajectories, then use the first `n_expert_demos` trajectories and
            drop the rest.
        log_dir: Directory to save models and other logging to.
        total_timesteps: The number of transitions to sample from the environment
            during training.
        n_episodes_eval: The number of episodes to average over when calculating
            the average episode reward of the imitation policy for return.
        init_tensorboard: If True, then write tensorboard logs to `{log_dir}/sb_tb`.
        checkpoint_interval: Save the discriminator and generator models every
            `checkpoint_interval` rounds and after training is complete. If 0,
            then only save weights after training is complete. If <0, then don't
            save weights at all.
        gen_batch_size: Batch size for generator updates. Sacred automatically uses
            this to calculate `n_steps` in `init_rl_kwargs`. In the script body, this
            is only used in sanity checks.
        init_rl_kwargs: Keyword arguments for `init_rl`, the RL algorithm initialization
            utility function.
        algorithm_kwargs: Keyword arguments for the `GAIL` or `AIRL` constructor
            that can apply to either constructor. Unlike a regular kwargs argument, this
            argument can only have the following keys: "shared", "airl", and "gail".

            `algorithm_kwargs["airl"]`, if it is provided, is a kwargs `Mapping` passed
            to the `AIRL` constructor when `algorithm == "airl"`. Likewise
            `algorithm_kwargs["gail"]` is passed to the `GAIL` constructor when
            `algorithm == "gail"`. `algorithm_kwargs["shared"]`, if provided, is passed
            to both the `AIRL` and `GAIL` constructors. Duplicate keyword argument keys
            between `algorithm_kwargs["shared"]` and `algorithm_kwargs["airl"]` (or
            "gail") leads to an error.
        discrim_net_kwargs: Keyword arguments for the `DiscrimNet` constructor. Unlike a
            regular kwargs argument, this argument can only have the following keys:
            "shared", "airl", "gail". These keys have the same meaning as they do in
            `algorithm_kwargs`.

    Returns:
        A dictionary with two keys. "imit_stats" gives the return value of
        `rollout_stats()` on rollouts in the test-reward-wrapped environment, using
        the final policy (remember that the ground-truth reward can be recovered from
        the "monitor_return" key). "expert_stats" gives the return value of
        `rollout_stats()` on the expert demonstrations loaded from `rollout_path`.
    """
    if gen_batch_size % num_vec != 0:
        raise ValueError(
            f"num_vec={num_vec} must evenly divide gen_batch_size={gen_batch_size}."
        )

    allowed_keys = {"shared", "gail", "airl"}
    if not discrim_net_kwargs.keys() <= allowed_keys:
        raise ValueError(
            f"Invalid discrim_net_kwargs.keys()={discrim_net_kwargs.keys()}. "
            f"Allowed keys: {allowed_keys}"
        )
    if not algorithm_kwargs.keys() <= allowed_keys:
        raise ValueError(
            f"Invalid discrim_net_kwargs.keys()={algorithm_kwargs.keys()}. "
            f"Allowed keys: {allowed_keys}"
        )

    if not os.path.exists(rollout_path):
        raise ValueError(f"File at rollout_path={rollout_path} does not exist.")

    expert_trajs = types.load(rollout_path)
    if n_expert_demos is not None:
        if not len(expert_trajs) >= n_expert_demos:
            raise ValueError(
                f"Want to use n_expert_demos={n_expert_demos} trajectories, but only "
                f"{len(expert_trajs)} are available via {rollout_path}."
            )
        expert_trajs = expert_trajs[:n_expert_demos]
    expert_transitions = rollout.flatten_trajectories(expert_trajs)

    total_timesteps = int(total_timesteps)

    logging.info("Logging to %s", log_dir)
    logger.configure(log_dir, ["tensorboard", "stdout"])
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    venv = util.make_vec_env(
        env_name,
        num_vec,
        seed=_seed,
        parallel=parallel,
        log_dir=log_dir,
        max_episode_steps=max_episode_steps,
    )

    # if init_tensorboard:
    #     tensorboard_log = osp.join(log_dir, "sb_tb")
    # else:
    #     tensorboard_log = None

    gen_algo = util.init_rl(
        # FIXME(sam): ignoring tensorboard_log is a hack to prevent SB3 from
        # re-configuring the logger (SB3 issue #109). See init_rl() for details.
        # TODO(shwang): Let's get rid of init_rl after SB3 issue #109 is fixed?
        # Besides sidestepping #109, init_rl is just a stub function.
        venv,
        **init_rl_kwargs,
    )

    discrim_kwargs_shared = discrim_net_kwargs.get("shared", {})
    discrim_kwargs_algo = discrim_net_kwargs.get(algorithm, {})
    final_discrim_kwargs = dict(**discrim_kwargs_shared, **discrim_kwargs_algo)

    algorithm_kwargs_shared = algorithm_kwargs.get("shared", {})
    algorithm_kwargs_algo = algorithm_kwargs.get(algorithm, {})
    final_algorithm_kwargs = dict(
        **algorithm_kwargs_shared,
        **algorithm_kwargs_algo,
    )

    if algorithm.lower() == "gail":
        algo_cls = adversarial.GAIL
    elif algorithm.lower() == "airl":
        algo_cls = adversarial.AIRL
    else:
        raise ValueError(f"Invalid value algorithm={algorithm}.")

    trainer = algo_cls(
        venv=venv,
        expert_data=expert_transitions,
        gen_algo=gen_algo,
        log_dir=log_dir,
        discrim_kwargs=final_discrim_kwargs,
        **final_algorithm_kwargs,
    )

    def callback(round_num):
        if checkpoint_interval > 0 and round_num % checkpoint_interval == 0:
            save(trainer, os.path.join(log_dir, "checkpoints", f"{round_num:05d}"))

    trainer.train(total_timesteps, callback)

    # Save final artifacts.
    if checkpoint_interval >= 0:
        save(trainer, os.path.join(log_dir, "checkpoints", "final"))

    # Final evaluation of imitation policy.
    results = {}
    sample_until_eval = rollout.min_episodes(n_episodes_eval)
    trajs = rollout.generate_trajectories(
        trainer.gen_algo, trainer.venv_train_norm, sample_until=sample_until_eval
    )
    results["expert_stats"] = rollout.rollout_stats(expert_trajs)
    results["imit_stats"] = rollout.rollout_stats(trajs)
    return results
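
# Illustrative config shapes consistent with the docstring above: "shared" entries
# are passed to both the GAIL and AIRL constructors, while "gail"/"airl" entries
# apply only when that algorithm is selected. The specific values are placeholders.
algorithm_kwargs = {
    "shared": {"expert_batch_size": 32},  # passed to whichever constructor is used
    "gail": {},
    "airl": {},
}
discrim_net_kwargs = {
    "shared": {},
    "airl": {"entropy_weight": 1.0},  # forwarded to DiscrimNetAIRL (cf. Example #10)
}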