Example #1
        def reward_fn_loader(
                path: str, venv: vec_env.VecEnv) -> Iterator[common.RewardFn]:
            """Load a TensorFlow reward model, then convert it into a Callable."""
            reward_model_loader = self.get(key)
            with networks.make_session() as (_, sess):
                reward_model = reward_model_loader(path, venv)

                def reward_fn(obs: np.ndarray, actions: np.ndarray,
                              next_obs: np.ndarray,
                              steps: np.ndarray) -> np.ndarray:
                    """Helper method computing reward for registered model."""
                    del steps
                    # TODO(adam): RewardFn should probably include dones?
                    dones = np.zeros(len(obs), dtype=bool)
                    transitions = types.Transitions(
                        obs=obs,
                        acts=actions,
                        next_obs=next_obs,
                        dones=dones,
                        infos=None,
                    )
                    fd = rewards.make_feed_dict([reward_model], transitions)
                    return sess.run(reward_model.reward, feed_dict=fd)

                yield reward_fn
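
Because the loader yields rather than returns, callers are expected to enter it as a context manager so the TensorFlow session stays open while `reward_fn` is in use (Example #6 below enters such a loader via `contextlib.ExitStack`). A minimal consumption sketch, with a hypothetical checkpoint path and a pre-built `venv`:

import contextlib

import numpy as np

# Hypothetical usage: `venv` is a VecEnv built as in the later examples, and the
# checkpoint path is a placeholder. Wrapping the generator keeps the TF session
# open for the lifetime of the with-block.
with contextlib.contextmanager(reward_fn_loader)("/path/to/model", venv) as reward_fn:
    batch = 4
    obs = np.zeros((batch,) + venv.observation_space.shape, dtype=np.float32)
    acts = np.zeros((batch,) + venv.action_space.shape, dtype=np.float32)
    steps = np.arange(batch)
    rew = reward_fn(obs, acts, obs, steps)
    assert rew.shape == (batch,)
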
Example #2
def plot_pm_reward(
    styles: Iterable[str],
    env_name: str,
    discount: float,
    models: Sequence[Tuple[str, str, str]],
    data_root: str,
    # Mesh parameters
    pos_lim: float,
    pos_density: int,
    vel_lim: float,
    act_lim: float,
    density: int,
    # Figure parameters
    ncols: int,
    cbar_kwargs: Mapping[str, Any],
    log_dir: str,
    fmt: str,
) -> xr.DataArray:
    """Entry-point into script to visualize a reward model for point mass."""
    with stylesheets.setup_styles(styles):
        env = gym.make(env_name)
        venv = vec_env.DummyVecEnv([lambda: env])
        goal = np.array([0.0])

        rewards = {}
        with networks.make_session():
            for model_name, reward_type, reward_path in models:
                reward_path = os.path.join(data_root, reward_path)
                model = serialize.load_reward(reward_type, reward_path, venv,
                                              discount)
                reward = point_mass_analysis.evaluate_reward_model(
                    env,
                    model,
                    goal=goal,
                    pos_lim=pos_lim,
                    pos_density=pos_density,
                    vel_lim=vel_lim,
                    act_lim=act_lim,
                    density=density,
                )
                rewards[model_name] = reward

        if len(rewards) == 1:
            reward = next(iter(rewards.values()))
            kwargs = {"col_wrap": ncols}
        else:
            reward = xr.Dataset(rewards).to_array("model")
            kwargs = {"row": "Model"}

        fig = point_mass_analysis.plot_reward(reward,
                                              cbar_kwargs=cbar_kwargs,
                                              **kwargs)
        save_path = os.path.join(log_dir, "reward")
        visualize.save_fig(save_path, fig, fmt=fmt)

        return reward
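
The single- versus multi-model branch above relies on xarray stacking named `DataArray`s along a new dimension. A small self-contained sketch of that step (toy grids; the dimension names here are assumptions, not the script's actual mesh coordinates):

import numpy as np
import xarray as xr

# Two toy per-model reward grids, standing in for the `rewards` dict built above.
grid = xr.DataArray(np.zeros((3, 3)), dims=("position", "velocity"))
rewards = {"model_a": grid, "model_b": grid + 1.0}

stacked = xr.Dataset(rewards).to_array("model")  # adds a "model" dimension
assert stacked.dims == ("model", "position", "velocity")
assert list(stacked["model"].values) == ["model_a", "model_b"]
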
Example #3
def regress(
    seed: int,
    # Dataset
    env_name: str,
    discount: float,
    # Target specification
    target_reward_type: str,
    target_reward_path: str,
    # Model parameters
    make_source: MakeModelFn,
    source_init: bool,
    make_trainer: MakeTrainerFn,
    do_training: DoTrainingFn,
    # Logging
    log_dir: str,
    checkpoint_interval: int,
) -> V:
    """Train a model on target and save the results, reporting training stats."""
    # This venv is needed by serialize.load_reward, but is never stepped.
    venv = vec_env.DummyVecEnv([lambda: gym.make(env_name)])

    with networks.make_session() as (_, sess):
        tf.random.set_random_seed(seed)

        with tf.variable_scope("source") as model_scope:
            model = make_source(venv)

        with tf.variable_scope("target"):
            target = serialize.load_reward(target_reward_type,
                                           target_reward_path, venv, discount)

        with tf.variable_scope("train") as train_scope:
            trainer = make_trainer(model, model_scope, target)

        # Do not initialize variables in the target scope; they were already
        # set when the target model was loaded by serialize.load_reward.
        init_vars = train_scope.global_variables()
        if source_init:
            init_vars += model_scope.global_variables()
        sess.run(tf.initializers.variables(init_vars))

        def callback(epoch: int) -> None:
            if checkpoint_interval > 0 and epoch % checkpoint_interval == 0:
                trainer.model.save(
                    os.path.join(log_dir, "checkpoints", f"{epoch:05d}"))

        stats = do_training(target, trainer, callback)

        # Trainer may wrap source, so save `trainer.model` not source directly
        # (see e.g. RegressWrappedModel).
        trainer.model.save(os.path.join(log_dir, "checkpoints", "final"))

        with open(os.path.join(log_dir, "stats.pkl"), "wb") as f:
            pickle.dump(stats, f)

    return stats
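
The scope-based initialization above is the key trick: only the `train` (and optionally `source`) variables are initialized, while `target` variables keep the values restored by `serialize.load_reward`. A self-contained sketch of the same idea, assuming TensorFlow 1.x (or `tf.compat.v1`) semantics; the variable names are illustrative:

import tensorflow as tf

graph = tf.Graph()
with graph.as_default():
    with tf.variable_scope("source") as model_scope:
        tf.get_variable("w", shape=(), initializer=tf.zeros_initializer())
    with tf.variable_scope("target"):
        tf.get_variable("w", shape=(), initializer=tf.zeros_initializer())
    with tf.variable_scope("train") as train_scope:
        tf.get_variable("step", shape=(), initializer=tf.zeros_initializer())

    # Initialize only the train and source scopes; "target/w" is assumed to be
    # restored from a checkpoint elsewhere and is deliberately left alone.
    init_vars = train_scope.global_variables() + model_scope.global_variables()
    init_op = tf.initializers.variables(init_vars)

with tf.Session(graph=graph) as sess:
    sess.run(init_op)
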
Example #4
def get_affine_from_models(env_name: str, paths: Iterable[str]):
    """Extract affine parameters from reward model."""
    venv = vec_env.DummyVecEnv([lambda: gym.make(env_name)])
    res = {}
    with networks.make_session():
        for path in paths:
            model = serialize.load_reward(
                "evaluating_rewards/RewardModel-v0",
                os.path.join(path, "model"),
                venv,
            )
            res[path] = model.models["wrapped"][0].get_weights()
    return res
Example #5
    def loader(path: str, venv: VecEnv) -> Iterator[common.RewardFn]:
        """Load train (shaped) or test (not shaped) reward from path."""
        del venv  # Unused.
        with networks.make_session() as (graph, sess):
            net = reward_net.RewardNet.load(path)
            reward = net.reward_output_train if shaped else net.reward_output_test

            def rew_fn(
                obs: np.ndarray,
                act: np.ndarray,
                next_obs: np.ndarray,
                dones: np.ndarray,
            ) -> np.ndarray:
                fd = {
                    net.obs_ph: obs,
                    net.act_ph: act,
                    net.next_obs_ph: next_obs,
                    net.done_ph: dones,
                }
                rew = sess.run(reward, feed_dict=fd)
                assert rew.shape == (len(obs), )
                return rew

            yield rew_fn
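
As in Example #1, the `yield` means this loader is consumed as a context manager so `sess` outlives `rew_fn`. A minimal call sketch; the path and the observation/action shapes are placeholders, and `None` is passed for `venv` since the loader discards it:

import contextlib

import numpy as np

with contextlib.contextmanager(loader)("/path/to/reward_net", None) as rew_fn:
    batch = 8
    obs = np.zeros((batch, 4), dtype=np.float32)   # assumed observation shape
    acts = np.zeros((batch, 2), dtype=np.float32)  # assumed action shape
    dones = np.zeros(batch, dtype=bool)
    rew = rew_fn(obs, acts, obs, dones)
    assert rew.shape == (batch,)
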
Example #6
def rollouts_and_policy(
    _run,
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: str,
    num_vec: int,
    parallel: bool,
    max_episode_steps: Optional[int],
    normalize: bool,
    normalize_kwargs: dict,
    init_rl_kwargs: dict,
    n_episodes_eval: int,
    reward_type: Optional[str],
    reward_path: Optional[str],
    rollout_save_interval: int,
    rollout_save_final: bool,
    rollout_save_n_timesteps: Optional[int],
    rollout_save_n_episodes: Optional[int],
    policy_save_interval: int,
    policy_save_final: bool,
    init_tensorboard: bool,
) -> dict:
    """Trains an expert policy from scratch and saves the rollouts and policy.

    Checkpoints:
      At applicable training steps `step` (where step is either an integer or
      "final"):

        - Policies are saved to `{log_dir}/policies/{step}/`.
        - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

    Args:
        env_name: The gym.Env name. Loaded as VecEnv.
        total_timesteps: Number of training timesteps in `model.learn()`.
        log_dir: The root directory to save metrics and checkpoints to.
        num_vec: Number of environments in VecEnv.
        parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
        max_episode_steps: If not None, then environments are wrapped by
            TimeLimit so that they have at most `max_episode_steps` steps per
            episode.
        normalize: If True, then rescale observations and reward.
        normalize_kwargs: kwargs for `VecNormalize`.
        init_rl_kwargs: kwargs for `init_rl`.

        n_episodes_eval: The number of episodes to average over when calculating
            the average ground truth reward return of the final policy.

        reward_type: If provided, then load the serialized reward of this type,
            wrapping the environment in this reward. This is useful to test
            whether a reward model transfers. For more information, see
            `imitation.rewards.serialize.load_reward`.
        reward_path: A specifier, such as a path to a file on disk, used by
            reward_type to load the reward model. For more information, see
            `imitation.rewards.serialize.load_reward`.

        rollout_save_interval: The number of training updates in between
            intermediate rollout saves. If the argument is nonpositive, then
            don't save intermediate updates.
        rollout_save_final: If True, then save rollouts right after training is
            finished.
        rollout_save_n_timesteps: The minimum number of timesteps saved in every
            file. The actual count can be higher because trajectories are saved
            by episode rather than by transition. Exactly one of
            `rollout_save_n_timesteps` and `rollout_save_n_episodes` must be set.
        rollout_save_n_episodes: The number of episodes saved in every
            file. Must set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.

        policy_save_interval: The number of training updates between saves. Has
            the same semantics as `rollout_save_interval`.
        policy_save_final: If True, then save the policy right after training is
            finished.

        init_tensorboard: If True, then write tensorboard logs to {log_dir}/sb_tb
            and "output/summary/...".

    Returns:
      The return value of `rollout_stats()` using the final policy.
    """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    sample_until = rollout.make_sample_until(rollout_save_n_timesteps,
                                             rollout_save_n_episodes)
    eval_sample_until = rollout.min_episodes(n_episodes_eval)

    with networks.make_session():
        tf.logging.set_verbosity(tf.logging.INFO)
        logger.configure(folder=osp.join(log_dir, "rl"),
                         format_strs=["tensorboard", "stdout"])

        rollout_dir = osp.join(log_dir, "rollouts")
        policy_dir = osp.join(log_dir, "policies")
        os.makedirs(rollout_dir, exist_ok=True)
        os.makedirs(policy_dir, exist_ok=True)

        if init_tensorboard:
            sb_tensorboard_dir = osp.join(log_dir, "sb_tb")
            # Convert sacred's ReadOnlyDict to dict so we can modify it below.
            init_rl_kwargs = dict(init_rl_kwargs)
            init_rl_kwargs["tensorboard_log"] = sb_tensorboard_dir

        venv = util.make_vec_env(
            env_name,
            num_vec,
            seed=_seed,
            parallel=parallel,
            log_dir=log_dir,
            max_episode_steps=max_episode_steps,
        )

        log_callbacks = []
        with contextlib.ExitStack() as stack:
            if reward_type is not None:
                reward_fn_ctx = load_reward(reward_type, reward_path, venv)
                reward_fn = stack.enter_context(reward_fn_ctx)
                venv = RewardVecEnvWrapper(venv, reward_fn)
                log_callbacks.append(venv.log_callback)
                tf.logging.info(
                    f"Wrapped env in reward {reward_type} from {reward_path}.")

            vec_normalize = None
            if normalize:
                venv = vec_normalize = VecNormalize(venv, **normalize_kwargs)

            policy = util.init_rl(venv, verbose=1, **init_rl_kwargs)

            # Make callback to save intermediate artifacts during training.
            step = 0

            def callback(locals_: dict, _) -> bool:
                nonlocal step
                step += 1
                policy = locals_["self"]

                # TODO(adam): make logging frequency configurable
                for log_callback in log_callbacks:
                    log_callback(sb_logger)

                if rollout_save_interval > 0 and step % rollout_save_interval == 0:
                    save_path = osp.join(rollout_dir, f"{step}.pkl")
                    rollout.rollout_and_save(save_path, policy, venv,
                                             sample_until)
                if policy_save_interval > 0 and step % policy_save_interval == 0:
                    output_dir = os.path.join(policy_dir, f"{step:05d}")
                    serialize.save_stable_model(output_dir, policy,
                                                vec_normalize)
                # Annotated to return bool: True tells stable-baselines to
                # continue training.
                return True

            policy.learn(total_timesteps, callback=callback)

            # Save final artifacts after training is complete.
            if rollout_save_final:
                save_path = osp.join(rollout_dir, "final.pkl")
                rollout.rollout_and_save(save_path, policy, venv, sample_until)
            if policy_save_final:
                output_dir = os.path.join(policy_dir, "final")
                serialize.save_stable_model(output_dir, policy, vec_normalize)

            # Final evaluation of expert policy.
            trajs = rollout.generate_trajectories(policy, venv,
                                                  eval_sample_until)
            stats = rollout.rollout_stats(trajs)

    return stats
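
The `ExitStack` block above is what makes the reward wrapper optional: a context is only entered when `reward_type` is set, and whatever was entered is closed when the stack exits. A self-contained sketch of that pattern with toy resources (all names here are illustrative):

import contextlib

@contextlib.contextmanager
def open_resource(tag):
    print("open", tag)
    try:
        yield tag
    finally:
        print("close", tag)

def build_env(wrap_reward: bool):
    with contextlib.ExitStack() as stack:
        env = "base-env"
        if wrap_reward:
            reward_fn = stack.enter_context(open_resource("reward-model"))
            env = (env, reward_fn)  # stands in for RewardVecEnvWrapper
        print("training with", env)
    # On exit, only resources that were actually entered get closed.

build_env(wrap_reward=True)   # opens and closes "reward-model"
build_env(wrap_reward=False)  # nothing to close
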
Example #7
def wrapper(*args, **kwargs) -> Iterator[T]:
    with networks.make_session():
        yield fn(*args, **kwargs)
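
This fragment is evidently the inner function of a decorator that closes over `fn` and `T`. One plausible enclosing definition, under the assumption that `fn` should run inside a fresh session and be consumed as a context manager; the decorator name is a guess, and `networks` is assumed to be the same module used in the other examples:

import contextlib
import functools
from typing import Callable, Iterator, TypeVar

T = TypeVar("T")

def in_session(fn: Callable[..., T]):
    """Hypothetical decorator: run `fn` inside networks.make_session()."""

    @contextlib.contextmanager
    @functools.wraps(fn)
    def wrapper(*args, **kwargs) -> Iterator[T]:
        with networks.make_session():  # assumes the same `networks` module as above
            yield fn(*args, **kwargs)

    return wrapper
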
Example #8
def train(
    _run,
    _seed: int,
    env_name: str,
    rollout_path: str,
    n_expert_demos: Optional[int],
    log_dir: str,
    init_trainer_kwargs: dict,
    total_timesteps: int,
    n_episodes_eval: int,
    init_tensorboard: bool,
    checkpoint_interval: int,
) -> dict:
    """Train an adversarial-network-based imitation learning algorithm.

    Checkpoints:
      - DiscrimNets are saved to f"{log_dir}/checkpoints/{step}/discrim/",
        where step is either the training epoch or "final".
      - Generator policies are saved to
        f"{log_dir}/checkpoints/{step}/gen_policy/".

    Args:
      _seed: Random seed.
      env_name: The environment to train in.
      rollout_path: Path to pickle containing list of Trajectories. Used as
        expert demonstrations.
      n_expert_demos: The number of expert trajectories to actually use
        after loading them from `rollout_path`.
        If None, then use all available trajectories.
        If `n_expert_demos` is an `int`, then use exactly `n_expert_demos`
        trajectories, erroring if there aren't enough trajectories. If there are
        surplus trajectories, then use the
        first `n_expert_demos` trajectories and drop the rest.
      log_dir: Directory to save models and other logging to.

      init_trainer_kwargs: Keyword arguments passed to `init_trainer`,
        used to initialize the trainer.
      total_timesteps: The number of transitions to sample from the environment
        during training.
      n_episodes_eval: The number of episodes to average over when calculating
        the average episode reward of the imitation policy for return.

      init_tensorboard: If True, then write tensorboard logs to `{log_dir}/sb_tb`.

      checkpoint_interval: Save the discriminator and generator models every
        `checkpoint_interval` epochs and after training is complete. If 0,
        then only save weights after training is complete. If <0, then don't
        save weights at all.

    Returns:
      A dictionary with two keys. "imit_stats" gives the return value of
        `rollout_stats()` on rollouts from the test-reward-wrapped environment,
        using the final policy (the ground-truth reward can be recovered from the
        "monitor_return" key). "expert_stats" gives the return value of
        `rollout_stats()` on the expert demonstrations loaded from `rollout_path`.
    """
    total_timesteps = int(total_timesteps)

    tf.logging.info("Logging to %s", log_dir)
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    # Calculate stats for expert rollouts. Used for plot and return value.
    expert_trajs = types.load(rollout_path)

    if n_expert_demos is not None:
        assert len(expert_trajs) >= n_expert_demos
        expert_trajs = expert_trajs[:n_expert_demos]

    expert_stats = rollout.rollout_stats(expert_trajs)

    with networks.make_session():
        if init_tensorboard:
            sb_tensorboard_dir = osp.join(log_dir, "sb_tb")
            # Copy so we can modify Sacred's read-only config dicts.
            kwargs = dict(init_trainer_kwargs)
            kwargs["init_rl_kwargs"] = dict(kwargs.get("init_rl_kwargs", {}))
            kwargs["init_rl_kwargs"]["tensorboard_log"] = sb_tensorboard_dir
            init_trainer_kwargs = kwargs

        trainer = init_trainer(env_name,
                               expert_trajs,
                               seed=_seed,
                               log_dir=log_dir,
                               **init_trainer_kwargs)

        def callback(epoch):
            if checkpoint_interval > 0 and epoch % checkpoint_interval == 0:
                save(trainer,
                     os.path.join(log_dir, "checkpoints", f"{epoch:05d}"))

        trainer.train(total_timesteps, callback)

        # Save final artifacts.
        if checkpoint_interval >= 0:
            save(trainer, os.path.join(log_dir, "checkpoints", "final"))

        # Final evaluation of imitation policy.
        results = {}
        sample_until_eval = rollout.min_episodes(n_episodes_eval)
        trajs = rollout.generate_trajectories(trainer.gen_policy,
                                              trainer.venv_test,
                                              sample_until=sample_until_eval)
        results["imit_stats"] = rollout.rollout_stats(trajs)
        results["expert_stats"] = expert_stats
        return results
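
The `checkpoint_interval` contract described in the docstring (periodic saves only for positive values, a final save unless negative) reduces to two small predicates. A self-contained restatement; the helper names are hypothetical:

def should_checkpoint(epoch: int, checkpoint_interval: int) -> bool:
    """Periodic checkpoint at this epoch?  Mirrors the callback above."""
    return checkpoint_interval > 0 and epoch % checkpoint_interval == 0

def should_save_final(checkpoint_interval: int) -> bool:
    """Save after training completes?  Mirrors the `>= 0` check above."""
    return checkpoint_interval >= 0

assert should_checkpoint(10, 5) and not should_checkpoint(10, 0)
assert should_save_final(0) and not should_save_final(-1)
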
Example #9
def train(
    _run,
    _seed: int,
    algorithm: str,
    env_name: str,
    num_vec: int,
    parallel: bool,
    max_episode_steps: Optional[int],
    rollout_path: str,
    n_expert_demos: Optional[int],
    log_dir: str,
    total_timesteps: int,
    n_episodes_eval: int,
    init_tensorboard: bool,
    checkpoint_interval: int,
    init_rl_kwargs: Mapping,
    algorithm_kwargs: Mapping[str, Mapping],
    discrim_net_kwargs: Mapping[str, Mapping],
) -> dict:
    """Train an adversarial-network-based imitation learning algorithm.

    Checkpoints:
        - DiscrimNets are saved to `f"{log_dir}/checkpoints/{step}/discrim/"`,
            where step is either the training epoch or "final".
        - Generator policies are saved to `f"{log_dir}/checkpoints/{step}/gen_policy/"`.

    Args:
        _seed: Random seed.
        algorithm: A case-insensitive string determining which adversarial imitation
            learning algorithm is executed. Either "airl" or "gail".
        env_name: The environment to train in.
        num_vec: Number of `gym.Env` to vectorize.
        parallel: Whether to use "true" parallelism. If True, then use `SubprocVecEnv`.
            Otherwise, use `DummyVecEnv` which steps through environments serially.
        max_episode_steps: If not None, then a TimeLimit wrapper is applied to each
            environment to artificially limit the maximum number of timesteps in an
            episode.
        rollout_path: Path to pickle containing list of Trajectories. Used as
            expert demonstrations.
        n_expert_demos: The number of expert trajectories to actually use
            after loading them from `rollout_path`.
            If None, then use all available trajectories.
            If `n_expert_demos` is an `int`, then use exactly `n_expert_demos`
            trajectories, erroring if there aren't enough trajectories. If there are
            surplus trajectories, then use the first `n_expert_demos` trajectories and
            drop the rest.
        log_dir: Directory to save models and other logging to.
        total_timesteps: The number of transitions to sample from the environment
            during training.
        n_episodes_eval: The number of episodes to average over when calculating
            the average episode reward of the imitation policy for return.
        init_tensorboard: If True, then write tensorboard logs to `{log_dir}/sb_tb`.
        checkpoint_interval: Save the discriminator and generator models every
            `checkpoint_interval` epochs and after training is complete. If 0,
            then only save weights after training is complete. If <0, then don't
            save weights at all.
        init_rl_kwargs: Keyword arguments for `init_rl`, the RL algorithm initialization
            utility function.
        algorithm_kwargs: Keyword arguments for the `GAIL` or `AIRL` constructor
            that can apply to either constructor. Unlike a regular kwargs argument, this
            argument can only have the following keys: "shared", "airl", and "gail".

            `algorithm_kwargs["airl"]`, if it is provided, is a kwargs `Mapping` passed
            to the `AIRL` constructor when `algorithm == "airl"`. Likewise
            `algorithm_kwargs["gail"]` is passed to the `GAIL` constructor when
            `algorithm == "gail"`. `algorithm_kwargs["shared"]`, if provided, is passed
            to both the `AIRL` and `GAIL` constructors. Duplicate keyword argument keys
            between `algorithm_kwargs["shared"]` and `algorithm_kwargs["airl"]` (or
            "gail") leads to an error.
        discrim_net_kwargs: Keyword arguments for the `DiscrimNet` constructor. Unlike a
            regular kwargs argument, this argument can only have the following keys:
            "shared", "airl", "gail". These keys have the same meaning as they do in
            `algorithm_kwargs`.

    Returns:
        A dictionary with two keys. "imit_stats" gives the return value of
        `rollout_stats()` on rollouts from the test-reward-wrapped environment, using
        the final policy (the ground-truth reward can be recovered from the
        "monitor_return" key). "expert_stats" gives the return value of
        `rollout_stats()` on the expert demonstrations loaded from `rollout_path`.
    """
    assert os.path.exists(rollout_path)
    total_timesteps = int(total_timesteps)

    tf.logging.info("Logging to %s", log_dir)
    logger.configure(log_dir, ["tensorboard", "stdout"])
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    expert_trajs = types.load(rollout_path)
    if n_expert_demos is not None:
        assert len(expert_trajs) >= n_expert_demos
        expert_trajs = expert_trajs[:n_expert_demos]
    expert_transitions = rollout.flatten_trajectories(expert_trajs)

    with networks.make_session():
        if init_tensorboard:
            tensorboard_log = osp.join(log_dir, "sb_tb")
        else:
            tensorboard_log = None

        venv = util.make_vec_env(
            env_name,
            num_vec,
            seed=_seed,
            parallel=parallel,
            log_dir=log_dir,
            max_episode_steps=max_episode_steps,
        )

        # TODO(shwang): Let's get rid of init_rl later on?
        # It's really just a stub function now.
        gen_policy = util.init_rl(venv,
                                  verbose=1,
                                  tensorboard_log=tensorboard_log,
                                  **init_rl_kwargs)

        # Convert Sacred's ReadOnlyDict to dict so we can modify it.
        allowed_keys = {"shared", "gail", "airl"}
        assert discrim_net_kwargs.keys() <= allowed_keys
        assert algorithm_kwargs.keys() <= allowed_keys

        discrim_kwargs_shared = discrim_net_kwargs.get("shared", {})
        discrim_kwargs_algo = discrim_net_kwargs.get(algorithm, {})
        final_discrim_kwargs = dict(**discrim_kwargs_shared,
                                    **discrim_kwargs_algo)

        algorithm_kwargs_shared = algorithm_kwargs.get("shared", {})
        algorithm_kwargs_algo = algorithm_kwargs.get(algorithm, {})
        final_algorithm_kwargs = dict(
            **algorithm_kwargs_shared,
            **algorithm_kwargs_algo,
        )

        if algorithm.lower() == "gail":
            algo_cls = adversarial.GAIL
        elif algorithm.lower() == "airl":
            algo_cls = adversarial.AIRL
        else:
            raise ValueError(f"Invalid value algorithm={algorithm}.")

        trainer = algo_cls(
            venv=venv,
            expert_data=expert_transitions,
            gen_policy=gen_policy,
            log_dir=log_dir,
            discrim_kwargs=final_discrim_kwargs,
            **final_algorithm_kwargs,
        )

        def callback(epoch):
            if checkpoint_interval > 0 and epoch % checkpoint_interval == 0:
                save(trainer,
                     os.path.join(log_dir, "checkpoints", f"{epoch:05d}"))

        trainer.train(total_timesteps, callback)

        # Save final artifacts.
        if checkpoint_interval >= 0:
            save(trainer, os.path.join(log_dir, "checkpoints", "final"))

        # Final evaluation of imitation policy.
        results = {}
        sample_until_eval = rollout.min_episodes(n_episodes_eval)
        trajs = rollout.generate_trajectories(trainer.gen_policy,
                                              trainer.venv_test,
                                              sample_until=sample_until_eval)
        results["expert_stats"] = rollout.rollout_stats(expert_trajs)
        results["imit_stats"] = rollout.rollout_stats(trajs)
        return results
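
The "shared"/"airl"/"gail" layout documented for `algorithm_kwargs` (and merged into `final_algorithm_kwargs` above) can be illustrated as follows. The kwarg names and values are hypothetical; only the merge behaviour is the point:

algorithm_kwargs = {
    "shared": {"n_disc_updates_per_round": 4},  # hypothetical kwarg
    "airl": {"entropy_weight": 1.0},            # hypothetical kwarg
    "gail": {},
}

shared = algorithm_kwargs.get("shared", {})
algo = algorithm_kwargs.get("airl", {})
merged = dict(**shared, **algo)  # fine: the key sets are disjoint

# A key appearing in both "shared" and the per-algorithm mapping is the
# "duplicate keyword argument" error mentioned in the docstring:
try:
    dict(**{"entropy_weight": 1.0}, **{"entropy_weight": 2.0})
except TypeError as e:
    print(e)  # got multiple values for keyword argument 'entropy_weight'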