Example #1
def make_PPO2(env_name, num_vec):
    env = util.make_vec_env(env_name, num_vec)
    # TODO(adam): add support for wrapping env with VecNormalize
    # (This is non-trivial since we'd need to make sure it's also applied
    # when the policy is re-loaded to generate rollouts.)
    policy = util.make_blank_policy(env, verbose=1, init_tensorboard=True)
    return policy
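A minimal usage sketch for the factory above, assuming it is in scope with its `util` import; the environment name, vector count, and timestep budget are illustrative placeholders, and `learn()` is the stable-baselines-style training call used by the other scripts in this file.
# Hypothetical usage of make_PPO2() defined above; all values are placeholders.
policy = make_PPO2("CartPole-v1", num_vec=8)
policy.learn(total_timesteps=10000)  # stable-baselines style training loop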
Example #2
def make_trainer():
    env_name = 'CartPole-v1'
    env = util.make_vec_env(env_name, 2)
    with open(ROLLOUT_PATH, "rb") as f:
        rollouts = pickle.load(f)
    rollouts = util.rollout.flatten_trajectories(rollouts)
    return bc.BCTrainer(env, expert_demos=rollouts)
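A brief usage sketch for `make_trainer()`, assuming `ROLLOUT_PATH` points at a pickled list of trajectories; the `train`/`test_policy` calls mirror the behavioral-cloning examples later in this file and are not taken from this snippet.
# Hypothetical usage of make_trainer() defined above.
bc_trainer = make_trainer()
novice_stats = bc_trainer.test_policy()   # evaluate the untrained policy
bc_trainer.train(n_epochs=40)             # behavioral cloning, as in the BC tests below
trained_stats = bc_trainer.test_policy()  # evaluate again after training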
Example #3
def test_reward_overwrite():
    """Test that reward wrapper actually overwrites base rewards."""
    env_id = 'Pendulum-v0'
    num_envs = 3
    env = util.make_vec_env(env_id, num_envs)
    reward_fn = FunkyReward()
    wrapped_env = util.reward_wrapper.RewardVecEnvWrapper(env, reward_fn)
    policy = RandomPolicy(env.observation_space, env.action_space)
    default_stats = util.rollout.rollout_stats(policy, env, n_episodes=10)
    wrapped_stats = util.rollout.rollout_stats(policy,
                                               wrapped_env,
                                               n_episodes=10)
    # Pendulum-v0 always has negative rewards
    assert default_stats['return_max'] < 0
    # Our wrapped reward gives returns between 1 * traj_len and
    # num_envs * traj_len (trajectories all have a constant length of 200
    # in Pendulum).
    steps = wrapped_stats['len_mean']
    assert wrapped_stats['return_min'] == 1 * steps
    assert wrapped_stats['return_max'] == num_envs * steps

    # Check that the base environment reward reported in `infos` is negative
    # (all Pendulum rewards are negative) while the overwritten rewards are
    # non-negative.
    rand_act, _, _, _ = policy.step(wrapped_env.reset())
    _, rew, _, infos = wrapped_env.step(rand_act)
    assert np.all(rew >= 0)
    assert np.all([info_dict['wrapped_env_rew'] < 0 for info_dict in infos])
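`FunkyReward` and `RandomPolicy` are fixtures defined elsewhere. Below is a sketch of what `FunkyReward` could look like, reconstructed purely from the assertions above (the i-th sub-environment receives a constant reward of i + 1 per step); the callable signature is an assumption, not the library's actual interface.
import numpy as np

class FunkyReward:
    """Hypothetical reward fn: the i-th env in the VecEnv always gets reward i + 1."""

    def __call__(self, obs, acts, next_obs, *args):
        # One reward entry per vectorized environment: 1, 2, ..., num_envs.
        return np.arange(1, len(obs) + 1, dtype="float32")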
Example #4
def test_density_reward(density_type, is_stationary):
  # Test on Pendulum rather than CartPole because episodes that terminate
  # early are not handled yet (see issue #40).
  env_name = 'Pendulum-v0'
  env = util.make_vec_env(env_name, 2)

  # construct density-based reward from expert rollouts
  with open("tests/data/expert_models/pendulum_0/rollouts/final.pkl",
            "rb") as f:
    expert_trajectories_all = pickle.load(f)
  n_experts = len(expert_trajectories_all)
  expert_trajectories_train = expert_trajectories_all[:n_experts // 2]
  reward_fn = DensityReward(trajectories=expert_trajectories_train,
                            density_type=density_type,
                            kernel='gaussian',
                            obs_space=env.observation_space,
                            act_space=env.action_space,
                            is_stationary=is_stationary,
                            kernel_bandwidth=0.2,
                            standardise_inputs=True)

  # check that expert policy does better than a random policy under our reward
  # function
  random_policy = RandomPolicy(env.observation_space, env.action_space)
  sample_until = rollout.min_episodes(n_experts // 2)
  random_trajectories = rollout.generate_trajectories(random_policy,
                                                      env,
                                                      sample_until=sample_until)
  expert_trajectories_test = expert_trajectories_all[n_experts // 2:]
  random_score = score_trajectories(random_trajectories, reward_fn)
  expert_score = score_trajectories(expert_trajectories_test, reward_fn)
  assert expert_score > random_score
Example #5
def test_bc():
    env_id = 'CartPole-v1'
    env = util.make_vec_env(env_id, 2)
    rollouts = util.rollout.load_trajectories(
        "tests/data/rollouts/CartPole-v1*.pkl")
    rollouts = util.rollout.flatten_trajectories(rollouts)
    bc_trainer = bc.BCTrainer(env, expert_rollouts=rollouts)
    novice_stats = bc_trainer.test_policy()
    bc_trainer.train(n_epochs=40)
    good_stats = bc_trainer.test_policy()
    # novice is bad
    assert novice_stats["return_mean"] < 100.0
    # bc is okay but isn't perfect (for the purpose of this test)
    assert good_stats["return_mean"] > 200.0
Example #6
def test_bc():
  env_name = 'CartPole-v1'
  env = util.make_vec_env(env_name, 2)
  with open("tests/data/expert_models/cartpole_0/rollouts/final.pkl",
            "rb") as f:
    rollouts = pickle.load(f)
  rollouts = util.rollout.flatten_trajectories(rollouts)
  bc_trainer = bc.BCTrainer(env, expert_demos=rollouts)
  novice_stats = bc_trainer.test_policy()
  bc_trainer.train(n_epochs=40)
  good_stats = bc_trainer.test_policy(min_episodes=25)
  # novice is bad
  assert novice_stats["return_mean"] < 80.0
  # bc is okay but isn't perfect (for the purpose of this test)
  assert good_stats["return_mean"] > 350.0
Example #7
def rollouts_from_policy(
    _seed: int,
    *,
    num_vec: int,
    rollout_save_n_timesteps: int,
    rollout_save_n_episodes: int,
    log_dir: str,
    policy_path: str,
    policy_type: str = "ppo2",
    env_name: str = "CartPole-v1",
    parallel: bool = True,
    rollout_save_dir: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
) -> None:
    """Loads a saved policy and generates rollouts.

  The default save path is f"{log_dir}/rollouts/{env_name}.pkl". Set the
  `rollout_save_dir` param to save to f"{rollout_save_dir}/{env_name}.pkl" instead.
  Unlisted arguments are the same as in `rollouts_and_policy()`.

  Args:
      policy_type: Argument to `imitation.policies.serialize.load_policy`.
      policy_path: Argument to `imitation.policies.serialize.load_policy`. If
          not provided, then defaults to f"expert_models/{env_name}".
      rollout_save_dir: Rollout pickle is saved in this directory as
          f"{env_name}.pkl".
  """
    if rollout_save_dir is None:
        rollout_save_dir = osp.join(log_dir, "rollouts")

    venv = util.make_vec_env(env_name,
                             num_vec,
                             seed=_seed,
                             parallel=parallel,
                             log_dir=log_dir,
                             max_episode_steps=max_episode_steps)

    with serialize.load_policy(policy_type, policy_path, venv) as policy:
        os.makedirs(rollout_save_dir, exist_ok=True)
        util.rollout.save(
            rollout_save_dir,
            policy,
            venv,
            basename=env_name,
            n_timesteps=rollout_save_n_timesteps,
            n_episodes=rollout_save_n_episodes,
        )
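An illustrative direct call to the entry point above; such functions are normally driven by their experiment framework, so every value here is a placeholder. Passing None for one of the two rollout counts follows the "set exactly one" convention documented in the related scripts below, despite the plain `int` annotation in this snippet.
# Hypothetical direct invocation of rollouts_from_policy() defined above.
rollouts_from_policy(
    _seed=0,
    num_vec=8,
    rollout_save_n_timesteps=2000,
    rollout_save_n_episodes=None,  # assumption: only one rollout count is used
    log_dir="output/expert_rollouts",
    policy_path="expert_models/CartPole-v1",
)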
Example #8
def test_bc():
    env_id = 'CartPole-v1'
    policy_dir = gin.query_parameter('init_trainer.policy_dir')
    env = util.make_vec_env(env_id, 2)
    expert_algos = util.load_policy(env, basedir=policy_dir)
    if not expert_algos:
        raise ValueError(env)
    bc_trainer = bc.BCTrainer(env,
                              expert_trainers=expert_algos,
                              n_expert_timesteps=2000)
    novice_stats = bc_trainer.test_policy()
    bc_trainer.train(n_epochs=40)
    good_stats = bc_trainer.test_policy()
    # novice is bad
    assert novice_stats["return_mean"] < 100.0
    # bc is okay but isn't perfect (for the purpose of this test)
    assert good_stats["return_mean"] > 200.0
Example #9
def test_density_trainer(density_type, is_stationary):
    env_id = 'Pendulum-v0'
    rollouts = rollout.load_trajectories(f"tests/data/rollouts/{env_id}_*.pkl")
    env = util.make_vec_env(env_id, 2)
    imitation_trainer = util.init_rl(env)
    density_trainer = DensityTrainer(env,
                                     rollouts=rollouts,
                                     imitation_trainer=imitation_trainer,
                                     density_type=density_type,
                                     is_stationary=is_stationary,
                                     kernel='gaussian')
    novice_stats = density_trainer.test_policy()
    density_trainer.train_policy(2000)
    good_stats = density_trainer.test_policy()
    # Novice is bad
    assert novice_stats["return_mean"] < -500
    # Density is also pretty bad, but shouldn't make things more than 50% worse.
    # It would be nice to have a less flaky/more meaningful test here.
    assert good_stats["return_mean"] > 1.5 * novice_stats["return_mean"]
Example #10
def init_trainer(env_id, policy_dir, use_gail, use_random_expert=True,
                 num_vec=8, discrim_scale=False,
                 discrim_kwargs={}, reward_kwargs={}, trainer_kwargs={}):
  """Builds a Trainer, ready to be trained on a vectorized environment
  and either expert rollout data or random rollout data.

  Args:
    env_id (str): The string id of a gym environment.
    use_gail (bool): If True, then train using GAIL. If False, then train
        using AIRL.
    policy_dir (str): The directory containing the pickled experts for
        generating rollouts. Only applicable if `use_random_expert` is False.
    use_random_expert (bool):
        If True, then use a blank (random) policy to generate rollouts.
        If False, then load an expert policy. Will crash if there is no expert
        policy in `policy_dir`.
    trainer_kwargs (dict): Arguments for the Trainer constructor.
    reward_kwargs (dict): Arguments for the `*RewardNet` constructor.
    discrim_kwargs (dict): Arguments for the `DiscrimNet*` constructor.
  """
  env = util.make_vec_env(env_id, num_vec)
  gen_policy = util.make_blank_policy(env, verbose=1)

  if use_random_expert:
    expert_policies = [gen_policy]
  else:
    expert_policies = util.load_policy(env, basedir=policy_dir)
    if expert_policies is None:
      raise ValueError(env)

  if use_gail:
    discrim = discrim_net.DiscrimNetGAIL(env.observation_space,
                                         env.action_space,
                                         scale=discrim_scale,
                                         **discrim_kwargs)
  else:
    rn = BasicShapedRewardNet(env.observation_space, env.action_space,
                              scale=discrim_scale, **reward_kwargs)
    discrim = discrim_net.DiscrimNetAIRL(rn, **discrim_kwargs)

  trainer = Trainer(env, gen_policy, discrim,
                    expert_policies=expert_policies, **trainer_kwargs)
  return trainer
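A short usage sketch for the builder above; the expert directory is a placeholder, and `trainer.train(n_epochs=...)` follows the call made in the GAIL/AIRL smoke test later in this file rather than anything shown in this snippet.
# Hypothetical usage of init_trainer() defined above.
trainer = init_trainer("CartPole-v1", policy_dir="expert_models",
                       use_gail=True, use_random_expert=True)
trainer.train(n_epochs=200)  # assumed Trainer API, as in the later smoke test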
Example #11
def rollouts_from_policy(
    _run,
    _seed: int,
    *,
    num_vec: int,
    rollout_save_n_timesteps: int,
    rollout_save_n_episodes: int,
    log_dir: str,
    policy_path: str,
    policy_type: str,
    env_name: str,
    parallel: bool,
    rollout_save_path: str,
    max_episode_steps: Optional[int],
    dac: bool,
) -> None:
    """Loads a saved policy and generates rollouts.

  Unlisted arguments are the same as in `rollouts_and_policy()`.

  Args:
      policy_type: Argument to `imitation.policies.serialize.load_policy`.
      policy_path: Argument to `imitation.policies.serialize.load_policy`.
      rollout_save_path: Rollout pickle is saved to this path.
  """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    sample_until = util.rollout.make_sample_until(rollout_save_n_timesteps,
                                                  rollout_save_n_episodes)

    venv = util.make_vec_env(env_name,
                             num_vec,
                             seed=_seed,
                             parallel=parallel,
                             log_dir=log_dir,
                             max_episode_steps=max_episode_steps,
                             dac=dac)

    with serialize.load_policy(policy_type, policy_path, venv) as policy:
        print(policy)
        util.rollout.save(rollout_save_path, policy, venv, sample_until)
Example #12
def test_density_trainer(density_type, is_stationary):
  env_name = 'Pendulum-v0'
  with open("tests/data/expert_models/pendulum_0/rollouts/final.pkl",
            "rb") as f:
    rollouts = pickle.load(f)
  env = util.make_vec_env(env_name, 2)
  imitation_trainer = util.init_rl(env)
  density_trainer = DensityTrainer(env,
                                   rollouts=rollouts,
                                   imitation_trainer=imitation_trainer,
                                   density_type=density_type,
                                   is_stationary=is_stationary,
                                   kernel='gaussian')
  novice_stats = density_trainer.test_policy()
  density_trainer.train_policy(2000)
  good_stats = density_trainer.test_policy()
  # Novice is bad
  assert novice_stats["return_mean"] < -500
  # Density is also pretty bad, but shouldn't make things more than 50% worse.
  # It would be nice to have a less flaky/more meaningful test here.
  assert good_stats["return_mean"] > 1.5 * novice_stats["return_mean"]
Example #13
def test_serialize_identity(session, env_name, discrim_net_cls, tmpdir):
    """Does output of deserialized discriminator match that of original?"""
    venv = util.make_vec_env(env_name, parallel=False)
    original = DISCRIM_NET_SETUPS[discrim_net_cls](venv)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    original.save(tmpdir)
    with tf.variable_scope("loaded"):
        loaded = discrim_net.DiscrimNet.load(tmpdir)

    transitions = util.rollout.generate_transitions(random,
                                                    venv,
                                                    n_timesteps=100)
    length = len(transitions.obs)  # n_timesteps is only a lower bound
    labels = np.random.randint(2, size=length).astype(np.float32)
    log_prob = np.random.randn(length)

    feed_dict = {}
    outputs = {'train': [], 'test': []}
    for net in [original, loaded]:
        feed_dict.update({
            net.obs_ph: transitions.obs,
            net.act_ph: transitions.acts,
            net.next_obs_ph: transitions.next_obs,
            net.labels_gen_is_one_ph: labels,
            net.log_policy_act_prob_ph: log_prob,
        })
        outputs['train'].append(net.policy_train_reward)
        outputs['test'].append(net.policy_test_reward)

    rewards = session.run(outputs, feed_dict=feed_dict)

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])
Example #14
def test_trained_policy_better_than_random(use_gail,
                                           env='CartPole-v1',
                                           n_episodes=50):
    """
  Make sure that a generator policy trained to mimic expert policy
  demonstrations achieves higher reward than a random policy.

  In other words, perform a basic check on the imitation learning
  capabilities of AIRL and GAIL.
  """
    env = util.make_vec_env(env, 32)
    trainer = init_trainer(env, use_expert_rollouts=True, use_gail=use_gail)
    expert_policy = util.load_policy(env, basedir="expert_models")
    random_policy = util.make_blank_policy(env)
    if expert_policy is None:
        pytest.fail("Couldn't load expert_policy!")

    trainer.train(n_epochs=200)

    # Idea: Plot n_epochs vs generator reward.
    for _ in range(4):
        expert_ret = rollout.mean_return(expert_policy,
                                         env,
                                         n_episodes=n_episodes)
        gen_ret = rollout.mean_return(trainer.gen_policy,
                                      env,
                                      n_episodes=n_episodes)
        random_ret = rollout.mean_return(random_policy,
                                         env,
                                         n_episodes=n_episodes)

        print("expert return:", expert_ret)
        print("generator return:", gen_ret)
        print("random return:", random_ret)
        assert expert_ret > random_ret
        assert gen_ret > random_ret
Example #15
def rollouts_and_policy(
  _seed: int,
  env_name: str,
  total_timesteps: int,
  *,
  log_dir: str = None,
  num_vec: int = 8,
  parallel: bool = False,
  normalize: bool = True,
  make_blank_policy_kwargs: dict = {},

  rollout_save_interval: int = 0,
  rollout_save_final: bool = False,
  rollout_save_n_timesteps: Optional[int] = None,
  rollout_save_n_episodes: Optional[int] = None,

  policy_save_interval: int = -1,
  policy_save_final: bool = True,
) -> None:
  """Trains an expert policy from scratch and saves the rollouts and policy.

  At applicable training steps `step` (where step is either an integer or
  "final"):

      - Policies are saved to `{log_dir}/policies/{step}.pkl`.
      - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

  Args:
      env_name: The gym.Env name. Loaded as VecEnv.
      total_timesteps: Number of training timesteps in `model.learn()`.
      log_dir: The root directory to save metrics and checkpoints to.
      num_vec: Number of environments in VecEnv.
      parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
      normalize: If True, then rescale observations and reward.
      make_blank_policy_kwargs: Kwargs for `make_blank_policy`.
      rollout_save_interval: The number of training updates in between
          intermediate rollout saves. If the argument is nonpositive, then
          don't save intermediate updates.
      rollout_save_final: If True, then save rollouts right after training is
          finished.
      rollout_save_n_timesteps: The minimum number of timesteps saved in every
          file. The actual number saved may exceed this because trajectories
          are saved by episode rather than by transition. Exactly one of
          `rollout_save_n_timesteps` and `rollout_save_n_episodes` must be set.
      rollout_save_n_episodes: The number of episodes saved in every
          file. Must set exactly one of `rollout_save_n_timesteps` and
          `rollout_save_n_episodes`.
      policy_save_interval: The number of training updates between saves. Has
          the same semantics as `rollout_save_interval`.
      policy_save_final: If True, then save the policy right after training is
          finished.
  """
  _validate_traj_generate_params(rollout_save_n_timesteps,
                                 rollout_save_n_episodes)

  with util.make_session():
    tf.logging.set_verbosity(tf.logging.INFO)
    sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                        format_strs=['tensorboard', 'stdout'])

    rollout_dir = osp.join(log_dir, "rollouts")
    policy_dir = osp.join(log_dir, "policies")
    os.makedirs(rollout_dir, exist_ok=True)
    os.makedirs(policy_dir, exist_ok=True)

    venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                             parallel=parallel, log_dir=log_dir)
    vec_normalize = None
    if normalize:
      venv = vec_normalize = VecNormalize(venv)

    policy = util.init_rl(venv, verbose=1,
                          **make_blank_policy_kwargs)

    # Make callback to save intermediate artifacts during training.
    step = 0
    rollout_ok = rollout_save_interval > 0
    policy_ok = policy_save_interval > 0

    def callback(locals_: dict, _) -> bool:
      nonlocal step
      step += 1
      policy = locals_['self']

      if rollout_ok and step % rollout_save_interval == 0:
        util.rollout.save(
          rollout_dir, policy, venv, step,
          n_timesteps=rollout_save_n_timesteps,
          n_episodes=rollout_save_n_episodes)
      if policy_ok and step % policy_save_interval == 0:
        output_dir = os.path.join(policy_dir, f'{step:05d}')
        serialize.save_stable_model(output_dir, policy, vec_normalize)
      return True

    policy.learn(total_timesteps, callback=callback)

    # Save final artifacts after training is complete.
    if rollout_save_final:
      util.rollout.save(
        rollout_dir, policy, venv, "final",
        n_timesteps=rollout_save_n_timesteps,
        n_episodes=rollout_save_n_episodes)
    if policy_save_final:
      output_dir = os.path.join(policy_dir, "final")
      serialize.save_stable_model(output_dir, policy, vec_normalize)
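An illustrative call to the training script above (in practice it would be launched through its experiment config); all values are placeholders, and exactly one of the two rollout counts is set, as the docstring requires.
# Hypothetical invocation of rollouts_and_policy() defined above.
rollouts_and_policy(
    _seed=0,
    env_name="CartPole-v1",
    total_timesteps=100000,
    log_dir="output/expert_training",
    rollout_save_final=True,
    rollout_save_n_timesteps=2000,
    policy_save_final=True,
)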
Example #16
def train(_run, _seed: int, env_name: str, rollout_path: str, normalize: bool,
          normalize_kwargs: dict, n_expert_demos: Optional[int], log_dir: str,
          init_trainer_kwargs: dict, total_timesteps: int,
          n_episodes_eval: int, init_tensorboard: bool,
          checkpoint_interval: int, dac: bool, rollout_save_n_timesteps: int,
          rollout_save_n_episodes: int, num_vec: int, parallel: bool,
          max_episode_steps: Optional[int]) -> dict:
    """Train an adversarial-network-based imitation learning algorithm.

  Plots (turn on using `plot_interval > 0`):
    - Plot discriminator loss during discriminator training steps in blue and
      discriminator loss during generator training steps in red.
    - Plot the performance of the generator policy versus the performance of
      a random policy. Also plot the performance of an expert policy if that is
      provided in the arguments.

  Checkpoints:
    - DiscrimNets are saved to f"{log_dir}/checkpoints/{step}/discrim/",
      where step is either the training epoch or "final".
    - Generator policies are saved to
      f"{log_dir}/checkpoints/{step}/gen_policy/".

  Args:
    _seed: Random seed.
    env_name: The environment to train in.
    rollout_path: Path to pickle containing list of Trajectories. Used as
      expert demonstrations.
    n_expert_demos: The number of expert trajectories to actually use
      after loading them from `rollout_path`.
      If None, then use all available trajectories.
      If `n_expert_demos` is an `int`, then use exactly `n_expert_demos`
      trajectories, erroring if there aren't enough trajectories. If there are
      surplus trajectories, then use the
      first `n_expert_demos` trajectories and drop the rest.
    log_dir: Directory to save models and other logging to.

    init_trainer_kwargs: Keyword arguments passed to `init_trainer`,
      used to initialize the trainer.
    total_timesteps: The number of transitions to sample from the environment
      during training.
    n_episodes_eval: The number of episodes to average over when calculating
      the average episode reward of the imitation policy for return.

    plot_interval: The number of epochs between each plot. If negative,
      then plots are disabled. If zero, then only plot at the end of training.
    n_plot_episodes: The number of episodes averaged over when
      calculating the average episode reward of a policy for the performance
      plots.
    extra_episode_data_interval: Usually mean episode rewards are calculated
      immediately before every plot. Set this parameter to a nonnegative number
      to also add episode reward data points every
      `extra_episodes_data_interval` epochs.
    show_plots: Figures are always saved to `f"{log_dir}/plots/*.png"`. If
      `show_plots` is True, then also show plots as they are created.
    init_tensorboard: If True, then write tensorboard logs to `{log_dir}/sb_tb`.

    checkpoint_interval: Save the discriminator and generator models every
      `checkpoint_interval` epochs and after training is complete. If 0,
      then only save weights after training is complete. If <0, then don't
      save weights at all.

  Returns:
    A dictionary with two keys. "imit_stats" gives the return value of
      `rollout_stats()` on rollouts from the test-reward-wrapped
      environment, using the final policy (remember that the ground-truth reward
      can be recovered from the "monitor_return" key). "expert_stats" gives the
      return value of `rollout_stats()` on the expert demonstrations loaded from
      `rollout_path`.
  """
    total_timesteps = int(total_timesteps)

    tf.logging.info("Logging to %s", log_dir)
    os.makedirs(log_dir, exist_ok=True)
    # try:
    #     sacred_util.build_sacred_symlink(log_dir, _run)
    # except Exception as e:
    #     print("didnt build symlink")
    # # Calculate stats for expert rollouts. Used for plot and return value.
    # with open(rollout_path, "rb") as f:
    #     expert_trajs = pickle.load(f)

    # if n_expert_demos is not None:
    #     assert len(expert_trajs) >= n_expert_demos
    #     expert_trajs = expert_trajs[:n_expert_demos]
    #
    # # expert_stats = util.rollout.rollout_stats(expert_trajs)
    sample_until = util.rollout.make_sample_until(rollout_save_n_timesteps,
                                                  rollout_save_n_episodes)

    with util.make_session():
        venv = util.make_vec_env(env_name,
                                 num_vec,
                                 seed=_seed,
                                 parallel=parallel,
                                 log_dir=log_dir,
                                 max_episode_steps=max_episode_steps,
                                 dac=dac)

        print("type of venv is: ", type(venv))
        vec_normalize = None
        venv = vec_normalize = VecNormalize(venv)
        print("type of venv is: ", type(venv))
        # time.sleep(10)
        gen_policy_path = os.path.join(log_dir, "checkpoints", "final",
                                       "gen_policy")
        print("gen policy path is: ", gen_policy_path)
        time.sleep(10)
        with serialize.load_policy('ppo2', gen_policy_path, venv) as policy:
            print(policy)
            print('right before: ', type(venv))
            time.sleep(10)
            util.rollout.save(gen_policy_path, policy, venv, sample_until)
Example #17
def init_trainer(
    env_name: str,
    expert_trajectories: Sequence[rollout.Trajectory],
    *,
    log_dir: str,
    seed: int = 0,
    use_gail: bool = False,
    num_vec: int = 8,
    parallel: bool = False,
    max_episode_steps: Optional[int] = None,
    scale: bool = True,
    airl_entropy_weight: float = 1.0,
    discrim_kwargs: dict = {},
    reward_kwargs: dict = {},
    trainer_kwargs: dict = {},
    init_rl_kwargs: dict = {},
):
    """Builds an AdversarialTrainer, ready to be trained on a vectorized
    environment and expert demonstrations.

  Args:
    env_name: The string id of a gym environment.
    expert_trajectories: Demonstrations from expert.
    seed: Random seed.
    log_dir: Directory for logging output. Will generate a unique sub-directory
        within this directory for all output.
    use_gail: If True, then train using GAIL. If False, then train
        using AIRL.
    num_vec: The number of vectorized environments.
    parallel: If True, then use SubprocVecEnv; otherwise, DummyVecEnv.
    max_episode_steps: If specified, wraps VecEnv in TimeLimit wrapper with
        this episode length before returning.
    scale: If True, then scale input Tensors to the interval [0, 1].
    airl_entropy_weight: Only applicable for AIRL. The `entropy_weight`
        argument of `DiscrimNetAIRL.__init__`.
    trainer_kwargs: Arguments for the Trainer constructor.
    reward_kwargs: Arguments for the `*RewardNet` constructor.
    discrim_kwargs: Arguments for the `DiscrimNet*` constructor.
    init_rl_kwargs: Keyword arguments passed to `init_rl`,
        used to initialize the RL algorithm.
  """
    util.logger.configure(folder=log_dir,
                          format_strs=['tensorboard', 'stdout'])
    env = util.make_vec_env(env_name,
                            num_vec,
                            seed=seed,
                            parallel=parallel,
                            log_dir=log_dir,
                            max_episode_steps=max_episode_steps)
    gen_policy = util.init_rl(env, verbose=1, **init_rl_kwargs)

    if use_gail:
        discrim = discrim_net.DiscrimNetGAIL(env.observation_space,
                                             env.action_space,
                                             scale=scale,
                                             **discrim_kwargs)
    else:
        rn = BasicShapedRewardNet(env.observation_space,
                                  env.action_space,
                                  scale=scale,
                                  **reward_kwargs)
        discrim = discrim_net.DiscrimNetAIRL(
            rn, entropy_weight=airl_entropy_weight, **discrim_kwargs)

    expert_demos = util.rollout.flatten_trajectories(expert_trajectories)
    trainer = AdversarialTrainer(env,
                                 gen_policy,
                                 discrim,
                                 expert_demos,
                                 log_dir=log_dir,
                                 **trainer_kwargs)
    return trainer
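A usage sketch for this `init_trainer` variant, reusing the pickle-loading pattern from the tests above; the log directory is a placeholder and `trainer.train(n_epochs=...)` is an assumed AdversarialTrainer call mirroring the earlier smoke test.
# Hypothetical usage of the init_trainer() variant defined above.
import pickle

with open("tests/data/expert_models/cartpole_0/rollouts/final.pkl", "rb") as f:
    expert_trajectories = pickle.load(f)

trainer = init_trainer("CartPole-v1", expert_trajectories,
                       log_dir="output/airl_cartpole", use_gail=False)
trainer.train(n_epochs=200)  # assumed API, as in the earlier smoke test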
Example #18
def init_trainer(
    env_id: str,
    rollout_glob: str,
    *,
    n_expert_demos: Optional[int] = None,
    seed: int = 0,
    log_dir: str = None,
    use_gail: bool = False,
    num_vec: int = 8,
    parallel: bool = False,
    max_n_files: int = 1,
    scale: bool = True,
    airl_entropy_weight: float = 1.0,
    discrim_kwargs: dict = {},
    reward_kwargs: dict = {},
    trainer_kwargs: dict = {},
    make_blank_policy_kwargs: dict = {},
):
    """Builds a Trainer, ready to be trained on a vectorized environment
  and expert demonstrations.

  Args:
    env_id: The string id of a gym environment.
    rollout_glob: Argument for `imitation.util.rollout.load_trajectories`.
    n_expert_demos: The number of expert trajectories to actually use
        after loading them via `load_trajectories`.
        If None, then use all available trajectories.
        If `n_expert_demos` is an `int`, then use
        exactly `n_expert_demos` trajectories, erroring if there aren't
        enough trajectories. If there are surplus trajectories, then use the
        first `n_expert_demos` trajectories and drop the rest.
    seed: Random seed.
    log_dir: Directory for logging output.
    use_gail: If True, then train using GAIL. If False, then train
        using AIRL.
    num_vec: The number of vectorized environments.
    parallel: If True, then use SubprocVecEnv; otherwise, DummyVecEnv.
    max_n_files: If provided, then only load the most recent `max_n_files`
        files, as sorted by modification times.
    scale: If True, then scale input Tensors to the interval [0, 1].
    airl_entropy_weight: Only applicable for AIRL. The `entropy_weight`
        argument of `DiscrimNetAIRL.__init__`.
    trainer_kwargs: Arguments for the Trainer constructor.
    reward_kwargs: Arguments for the `*RewardNet` constructor.
    discrim_kwargs: Arguments for the `DiscrimNet*` constructor.
    make_blank_policy_kwargs: Keyword arguments passed to `make_blank_policy`,
        used to initialize the generator policy.
  """
    env = util.make_vec_env(env_id,
                            num_vec,
                            seed=seed,
                            parallel=parallel,
                            log_dir=log_dir)
    gen_policy = util.init_rl(env, verbose=1, **make_blank_policy_kwargs)

    if use_gail:
        discrim = discrim_net.DiscrimNetGAIL(env.observation_space,
                                             env.action_space,
                                             scale=scale,
                                             **discrim_kwargs)
    else:
        rn = BasicShapedRewardNet(env.observation_space,
                                  env.action_space,
                                  scale=scale,
                                  **reward_kwargs)
        discrim = discrim_net.DiscrimNetAIRL(
            rn, entropy_weight=airl_entropy_weight, **discrim_kwargs)

    expert_demos = util.rollout.load_trajectories(rollout_glob,
                                                  max_n_files=max_n_files)
    if n_expert_demos is not None:
        assert len(expert_demos) >= n_expert_demos
        expert_demos = expert_demos[:n_expert_demos]

    expert_rollouts = util.rollout.flatten_trajectories(expert_demos)[:3]
    trainer = Trainer(env, gen_policy, discrim, expert_rollouts,
                      **trainer_kwargs)
    return trainer
Example #19
def rollouts_and_policy(
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: str = None,
    num_vec: int = 8,
    parallel: bool = False,
    max_episode_steps: Optional[int] = None,
    normalize: bool = True,
    make_blank_policy_kwargs: dict = {},
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    rollout_save_interval: int = 0,
    rollout_save_final: bool = False,
    rollout_save_n_timesteps: Optional[int] = None,
    rollout_save_n_episodes: Optional[int] = None,
    policy_save_interval: int = -1,
    policy_save_final: bool = True,
) -> None:
    """Trains an expert policy from scratch and saves the rollouts and policy.

  At applicable training steps `step` (where step is either an integer or
  "final"):

      - Policies are saved to `{log_dir}/policies/{step}.pkl`.
      - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

  Args:
      env_name: The gym.Env name. Loaded as VecEnv.
      total_timesteps: Number of training timesteps in `model.learn()`.
      log_dir: The root directory to save metrics and checkpoints to.
      num_vec: Number of environments in VecEnv.
      parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
      max_episode_steps: If not None, then environments are wrapped by
          TimeLimit so that they have at most `max_episode_steps` steps per
          episode.
      normalize: If True, then rescale observations and reward.
      make_blank_policy_kwargs: Kwargs for `make_blank_policy`.

      reward_type: If provided, then load the serialized reward of this type,
          wrapping the environment in this reward. This is useful to test
          whether a reward model transfers. For more information, see
          `imitation.rewards.serialize.load_reward`.
      reward_path: A specifier, such as a path to a file on disk, used by
          reward_type to load the reward model. For more information, see
          `imitation.rewards.serialize.load_reward`.

      rollout_save_interval: The number of training updates in between
          intermediate rollout saves. If the argument is nonpositive, then
          don't save intermediate updates.
      rollout_save_final: If True, then save rollouts right after training is
          finished.
      rollout_save_n_timesteps: The minimum number of timesteps saved in every
          file. The actual number saved may exceed this because trajectories
          are saved by episode rather than by transition. Exactly one of
          `rollout_save_n_timesteps` and `rollout_save_n_episodes` must be set.
      rollout_save_n_episodes: The number of episodes saved in every
          file. Must set exactly one of `rollout_save_n_timesteps` and
          `rollout_save_n_episodes`.

      policy_save_interval: The number of training updates between saves. Has
          the same semantics as `rollout_save_interval`.
      policy_save_final: If True, then save the policy right after training is
          finished.
  """
    _validate_traj_generate_params(rollout_save_n_timesteps,
                                   rollout_save_n_episodes)

    with util.make_session():
        tf.logging.set_verbosity(tf.logging.INFO)
        sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                            format_strs=['tensorboard', 'stdout'])

        rollout_dir = osp.join(log_dir, "rollouts")
        policy_dir = osp.join(log_dir, "policies")
        os.makedirs(rollout_dir, exist_ok=True)
        os.makedirs(policy_dir, exist_ok=True)

        venv = util.make_vec_env(env_name,
                                 num_vec,
                                 seed=_seed,
                                 parallel=parallel,
                                 log_dir=log_dir,
                                 max_episode_steps=max_episode_steps)

        log_callbacks = []
        with contextlib.ExitStack() as stack:
            if reward_type is not None:
                reward_fn_ctx = load_reward(reward_type, reward_path, venv)
                reward_fn = stack.enter_context(reward_fn_ctx)
                venv = RewardVecEnvWrapper(venv, reward_fn)
                log_callbacks.append(venv.log_callback)
                tf.logging.info(
                    f"Wrapped env in reward {reward_type} from {reward_path}.")

            vec_normalize = None
            if normalize:
                venv = vec_normalize = VecNormalize(venv)

            policy = util.init_rl(venv, verbose=1, **make_blank_policy_kwargs)

            # Make callback to save intermediate artifacts during training.
            step = 0

            def callback(locals_: dict, _) -> bool:
                nonlocal step
                step += 1
                policy = locals_['self']

                # TODO(adam): make logging frequency configurable
                for callback in log_callbacks:
                    callback(sb_logger)

                if rollout_save_interval > 0 and step % rollout_save_interval == 0:
                    util.rollout.save(rollout_dir,
                                      policy,
                                      venv,
                                      step,
                                      n_timesteps=rollout_save_n_timesteps,
                                      n_episodes=rollout_save_n_episodes)
                if policy_save_interval > 0 and step % policy_save_interval == 0:
                    output_dir = os.path.join(policy_dir, f'{step:05d}')
                    serialize.save_stable_model(output_dir, policy,
                                                vec_normalize)
                return True  # Continue training.

            policy.learn(total_timesteps, callback=callback)

            # Save final artifacts after training is complete.
            if rollout_save_final:
                util.rollout.save(rollout_dir,
                                  policy,
                                  venv,
                                  "final",
                                  n_timesteps=rollout_save_n_timesteps,
                                  n_episodes=rollout_save_n_episodes)
            if policy_save_final:
                output_dir = os.path.join(policy_dir, "final")
                serialize.save_stable_model(output_dir, policy, vec_normalize)
Example #20
def rollouts_and_policy(
  _run,
  _seed: int,
  env_name: str,
  total_timesteps: int,
  *,
  log_dir: str,
  num_vec: int,
  parallel: bool,
  max_episode_steps: Optional[int],
  normalize: bool,
  normalize_kwargs: dict,
  init_rl_kwargs: dict,

  n_episodes_eval: int,

  reward_type: Optional[str],
  reward_path: Optional[str],

  rollout_save_interval: int,
  rollout_save_final: bool,
  rollout_save_n_timesteps: Optional[int],
  rollout_save_n_episodes: Optional[int],

  policy_save_interval: int,
  policy_save_final: bool,

  init_tensorboard: bool,
) -> dict:
  """Trains an expert policy from scratch and saves the rollouts and policy.

  Checkpoints:
    At applicable training steps `step` (where step is either an integer or
    "final"):

      - Policies are saved to `{log_dir}/policies/{step}/`.
      - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

  Args:
      env_name: The gym.Env name. Loaded as VecEnv.
      total_timesteps: Number of training timesteps in `model.learn()`.
      log_dir: The root directory to save metrics and checkpoints to.
      num_vec: Number of environments in VecEnv.
      parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
      max_episode_steps: If not None, then environments are wrapped by
          TimeLimit so that they have at most `max_episode_steps` steps per
          episode.
      normalize: If True, then rescale observations and reward.
      normalize_kwargs: kwargs for `VecNormalize`.
      init_rl_kwargs: kwargs for `init_rl`.

      n_episodes_eval: The number of episodes to average over when calculating
          the average ground truth reward return of the final policy.

      reward_type: If provided, then load the serialized reward of this type,
          wrapping the environment in this reward. This is useful to test
          whether a reward model transfers. For more information, see
          `imitation.rewards.serialize.load_reward`.
      reward_path: A specifier, such as a path to a file on disk, used by
          reward_type to load the reward model. For more information, see
          `imitation.rewards.serialize.load_reward`.

      rollout_save_interval: The number of training updates in between
          intermediate rollout saves. If the argument is nonpositive, then
          don't save intermediate updates.
      rollout_save_final: If True, then save rollouts right after training is
          finished.
      rollout_save_n_timesteps: The minimum number of timesteps saved in every
          file. The actual number saved may exceed this because trajectories
          are saved by episode rather than by transition. Exactly one of
          `rollout_save_n_timesteps` and `rollout_save_n_episodes` must be set.
      rollout_save_n_episodes: The number of episodes saved in every
          file. Must set exactly one of `rollout_save_n_timesteps` and
          `rollout_save_n_episodes`.

      policy_save_interval: The number of training updates between saves. Has
          the same semantics as `rollout_save_interval`.
      policy_save_final: If True, then save the policy right after training is
          finished.

      init_tensorboard: If True, then write tensorboard logs to {log_dir}/sb_tb
          and "output/summary/...".

  Returns:
    The return value of `rollout_stats()` using the final policy.
  """
  os.makedirs(log_dir, exist_ok=True)
  sacred_util.build_sacred_symlink(log_dir, _run)

  sample_until = util.rollout.make_sample_until(rollout_save_n_timesteps,
                                                rollout_save_n_episodes)
  eval_sample_until = util.rollout.min_episodes(n_episodes_eval)

  with util.make_session():
    tf.logging.set_verbosity(tf.logging.INFO)
    sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                        format_strs=['tensorboard', 'stdout'])

    rollout_dir = osp.join(log_dir, "rollouts")
    policy_dir = osp.join(log_dir, "policies")
    os.makedirs(rollout_dir, exist_ok=True)
    os.makedirs(policy_dir, exist_ok=True)

    if init_tensorboard:
      sb_tensorboard_dir = osp.join(log_dir, "sb_tb")
      init_rl_kwargs["tensorboard_log"] = sb_tensorboard_dir

    venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                             parallel=parallel, log_dir=log_dir,
                             max_episode_steps=max_episode_steps)

    log_callbacks = []
    with contextlib.ExitStack() as stack:
      if reward_type is not None:
        reward_fn_ctx = load_reward(reward_type, reward_path, venv)
        reward_fn = stack.enter_context(reward_fn_ctx)
        venv = RewardVecEnvWrapper(venv, reward_fn)
        log_callbacks.append(venv.log_callback)
        tf.logging.info(
            f"Wrapped env in reward {reward_type} from {reward_path}.")

      vec_normalize = None
      if normalize:
        venv = vec_normalize = VecNormalize(venv, **normalize_kwargs)

      policy = util.init_rl(venv, verbose=1, **init_rl_kwargs)

      # Make callback to save intermediate artifacts during training.
      step = 0

      def callback(locals_: dict, _) -> bool:
        nonlocal step
        step += 1
        policy = locals_['self']

        # TODO(adam): make logging frequency configurable
        for callback in log_callbacks:
          callback(sb_logger)

        if rollout_save_interval > 0 and step % rollout_save_interval == 0:
          save_path = osp.join(rollout_dir, f"{step}.pkl")
          util.rollout.save(save_path, policy, venv, sample_until)
        if policy_save_interval > 0 and step % policy_save_interval == 0:
          output_dir = os.path.join(policy_dir, f'{step:05d}')
          serialize.save_stable_model(output_dir, policy, vec_normalize)
        return True  # Continue training, matching the callback's bool return type.

      policy.learn(total_timesteps, callback=callback)

      # Save final artifacts after training is complete.
      if rollout_save_final:
        save_path = osp.join(rollout_dir, "final.pkl")
        util.rollout.save(save_path, policy, venv, sample_until)
      if policy_save_final:
        output_dir = os.path.join(policy_dir, "final")
        serialize.save_stable_model(output_dir, policy, vec_normalize)

      # Final evaluation of expert policy.
      trajs = util.rollout.generate_trajectories(
          policy, venv, eval_sample_until)
      stats = util.rollout.rollout_stats(trajs)

  return stats
Example #21
def test_discrim_net_no_crash(session, env_name, discrim_net_cls):
    # If parallel=True, codecov sometimes acts up.
    venv = util.make_vec_env(env_name, parallel=False)
    DISCRIM_NET_SETUPS[discrim_net_cls](venv)