예제 #1
0
def test_train_disc_improve_D(use_gail,
                              env='CartPole-v1',
                              n_timesteps=200,
                              n_steps=1000):
    """Check that discriminator training lowers the discriminator loss.

    Generator samples are rolled out once, and that same batch is used to
    evaluate the loss before training, to train, and to evaluate the loss
    afterwards.
    """
    trainer = init_trainer(env, use_gail=use_gail)

    trajectories = rollout.generate(
        trainer.gen_policy, env, n_timesteps=n_timesteps)
    old_obs, actions, new_obs, _ = rollout.flatten_trajectories(trajectories)
    gen_batch = {
        "gen_old_obs": old_obs,
        "gen_act": actions,
        "gen_new_obs": new_obs,
    }

    loss_before = trainer.eval_disc_loss(**gen_batch)
    trainer.train_disc(n_steps=n_steps, **gen_batch)
    loss_after = trainer.eval_disc_loss(**gen_batch)

    assert loss_after < loss_before
예제 #2
0
def test_wrap_learned_reward_no_crash(use_gail, env="CartPole-v1"):
    """Smoke-test training a fresh policy on the learned reward.

    Trains the trainer for a single epoch, wraps `env` with the trainer's
    test reward, and then runs a short `learn` call of a blank policy on the
    wrapped environment. The test passes as long as nothing raises.
    """
    reward_trainer = init_trainer(env, use_gail=use_gail)
    reward_trainer.train(n_epochs=1)

    # Wrap first, then build the policy (same order as before).
    wrapped_env = reward_trainer.wrap_env_test_reward(env)
    blank_policy = util.make_blank_policy(env, init_tensorboard=False)

    blank_policy.set_env(wrapped_env)
    blank_policy.learn(10)
예제 #3
0
def test_trained_policy_better_than_random(use_gail,
                                           env='CartPole-v1',
                                           n_episodes=50):
    """Check that the trained generator beats a random policy.

    The generator policy is trained to mimic expert demonstrations and must
    then reach a higher mean return than a blank (untrained) policy. The
    expert policy itself must also beat the blank policy. This is a basic
    check of the imitation learning capabilities of AIRL and GAIL.
    """
    env = util.make_vec_env(env, 32)
    trainer = init_trainer(env, use_expert_rollouts=True, use_gail=use_gail)
    expert_policy = util.load_policy(env, basedir="expert_models")
    random_policy = util.make_blank_policy(env)
    if expert_policy is None:
        pytest.fail("Couldn't load expert_policy!")

    trainer.train(n_epochs=200)

    def avg_return(policy):
        """Mean return of `policy` on `env` over `n_episodes` episodes."""
        return rollout.mean_return(policy, env, n_episodes=n_episodes)

    # Idea: Plot n_epochs vs generator reward.
    # Repeat the comparison four times on fresh rollouts.
    for _ in range(4):
        expert_ret = avg_return(expert_policy)
        gen_ret = avg_return(trainer.gen_policy)
        random_ret = avg_return(random_policy)

        print("expert return:", expert_ret)
        print("generator return:", gen_ret)
        print("random return:", random_ret)

        assert expert_ret > random_ret
        assert gen_ret > random_ret
예제 #4
0
def train_and_plot(_seed: int,
                   env_name: str,
                   log_dir: str,
                   *,
                   n_epochs: int = 100,
                   n_epochs_per_plot: Optional[float] = None,
                   n_disc_steps_per_epoch: int = 10,
                   n_gen_steps_per_epoch: int = 10000,
                   n_episodes_per_reward_data: int = 5,
                   n_episodes_eval: int = 50,
                   checkpoint_interval: int = 5,
                   interactive: bool = True,
                   expert_policy=None,
                   init_trainer_kwargs: Optional[dict] = None,
                   ) -> Dict[str, float]:
  """Alternate between training the generator and discriminator.

  Every epoch, the discriminator is trained for `n_disc_steps_per_epoch`
  steps and then the generator for `n_gen_steps_per_epoch` steps. The
  discriminator loss is evaluated and recorded after each of the two phases.

  Whenever a plot is due (see `n_epochs_per_plot`):
    - Plot discriminator loss recorded after discriminator training in green
      and discriminator loss recorded after generator training in red.
    - Plot the performance of the generator policy versus the performance of
      a random policy. Also plot the performance of an expert policy if that
      is provided in the arguments.

  Args:
      _seed: Random seed, forwarded to `init_trainer`.
      env_name: The environment to train in.
      log_dir: Directory to save models and other logging to.
      n_epochs: The number of epochs to train. Each epoch consists of
          `n_disc_steps_per_epoch` discriminator steps followed by
          `n_gen_steps_per_epoch` generator steps.
      n_epochs_per_plot: An optional number, greater than or equal to 1. The
          (possibly fractional) number of epochs between each plot. Initial
          data points are recorded before the first training epoch.
          If `n_epochs_per_plot is None`, then don't make any plots.
      n_disc_steps_per_epoch: The number of discriminator update steps during
          every training epoch.
      n_gen_steps_per_epoch: The number of generator update steps during
          every training epoch.
      n_episodes_per_reward_data: The number of episodes averaged over when
          calculating the average episode reward of a policy for the
          performance plots.
      n_episodes_eval: The minimum number of episodes to average over when
          calculating the average ground truth return of the final policy.
      checkpoint_interval: Save the trainer every `checkpoint_interval`
          epochs and after training is complete. If <=0, then only save after
          training is complete.
      interactive: Forwarded to `_savefig_timestamp` for every figure. If
          True, plots are also shown as they are created.
      expert_policy (BasePolicy or BaseRLModel, optional): If provided, then
          also plot the performance of this expert policy.
      init_trainer_kwargs: Keyword arguments passed to `init_trainer`,
          used to initialize the trainer. `None` means no extra arguments.

  Returns:
      results: A dictionary with two keys, "mean" and "std_err". The
          corresponding values are the mean and standard error of
          ground truth episode return for the imitation learning algorithm.
  """
  assert n_epochs_per_plot is None or n_epochs_per_plot >= 1
  # Avoid a shared mutable default; `None` stands for "no extra kwargs".
  trainer_kwargs = {} if init_trainer_kwargs is None else init_trainer_kwargs

  with util.make_session():
    trainer = init_trainer(env_name, seed=_seed, log_dir=log_dir,
                           **trainer_kwargs)

    tf.logging.info("Logging to %s", log_dir)
    os.makedirs(log_dir, exist_ok=True)
    sb_logger.configure(folder=osp.join(log_dir, 'generator'),
                        format_strs=['tensorboard', 'stdout'])

    plot_idx = 0
    # Each is an (x values, y values) pair of parallel lists.
    gen_data = ([], [])
    disc_data = ([], [])

    def disc_plot_add_data(gen_mode: bool = False):
      """Evaluates and records the discriminator loss for plotting later.

      Args:
          gen_mode: Whether the generator or the discriminator is active.
              We use this to color the data points.
      """
      nonlocal plot_idx
      mode = "gen" if gen_mode else "dis"
      X, Y = gen_data if gen_mode else disc_data
      # Divide by two since we get two data points (gen and disc) per epoch.
      X.append(plot_idx / 2)
      Y.append(trainer.eval_disc_loss())
      tf.logging.info(
          "plot idx ({}): {} disc loss: {}"
          .format(mode, plot_idx, Y[-1]))
      plot_idx += 1

    def disc_plot_show():
      """Render a plot of discriminator loss vs. training epoch number."""
      plt.scatter(disc_data[0], disc_data[1], c='g', alpha=0.7, s=4,
                  label="discriminator loss (dis step)")
      plt.scatter(gen_data[0], gen_data[1], c='r', alpha=0.7, s=4,
                  label="discriminator loss (gen step)")
      plt.title("Discriminator loss")
      plt.legend()
      _savefig_timestamp("plot_fight_loss_disc", interactive)

    # Map from plot name to the list of mean returns recorded so far.
    gen_ep_reward = defaultdict(list)
    rand_ep_reward = defaultdict(list)
    exp_ep_reward = defaultdict(list)

    def ep_reward_plot_add_data(env, name):
      """Calculate and record average episode returns."""
      gen_policy = trainer.gen_policy
      gen_ret = util.rollout.mean_return(
          gen_policy, env, n_episodes=n_episodes_per_reward_data)
      gen_ep_reward[name].append(gen_ret)
      tf.logging.info("generator return: {}".format(gen_ret))

      rand_policy = util.init_rl(trainer.env)
      rand_ret = util.rollout.mean_return(
          rand_policy, env, n_episodes=n_episodes_per_reward_data)
      rand_ep_reward[name].append(rand_ret)
      tf.logging.info("random return: {}".format(rand_ret))

      if expert_policy is not None:
        exp_ret = util.rollout.mean_return(
            expert_policy, env, n_episodes=n_episodes_per_reward_data)
        exp_ep_reward[name].append(exp_ret)
        tf.logging.info("exp return: {}".format(exp_ret))

    def ep_reward_plot_show():
      """Render and show average episode reward plots."""
      for name in gen_ep_reward:
        plt.title(name + " Performance")
        plt.xlabel("epochs")
        plt.ylabel("Average reward per episode (n={})"
                   .format(n_episodes_per_reward_data))
        plt.plot(gen_ep_reward[name], label="avg gen ep reward", c="red")
        plt.plot(rand_ep_reward[name],
                 label="avg random ep reward", c="black")
        # Without an expert policy there is no expert data; skip the series
        # instead of drawing an empty line with a misleading legend entry.
        # `.get` avoids inserting an empty list into the defaultdict.
        if exp_ep_reward.get(name):
          plt.plot(exp_ep_reward[name], label="avg exp ep reward", c="blue")
        plt.legend()
        _savefig_timestamp("plot_fight_epreward_gen", interactive)

    if n_epochs_per_plot is not None:
      n_plots_per_epoch = 1 / n_epochs_per_plot
    else:
      n_plots_per_epoch = None

    def should_plot_now(epoch) -> bool:
      """For positive epochs, returns True if a plot should be rendered now.

      This also controls the frequency at which `ep_reward_plot_add_data` is
      called, because generating those rollouts is too expensive to perform
      every epoch.
      """
      assert epoch >= 1
      if n_plots_per_epoch is None:
        return False
      # A plot is due whenever the running plot count crosses an integer.
      plot_num = math.floor(n_plots_per_epoch * epoch)
      prev_plot_num = math.floor(n_plots_per_epoch * (epoch - 1))
      assert abs(plot_num - prev_plot_num) <= 1
      return plot_num != prev_plot_num

    # Collect data for epoch 0.
    if n_epochs_per_plot is not None:
      disc_plot_add_data(False)
      ep_reward_plot_add_data(trainer.env, "Ground Truth Reward")
      ep_reward_plot_add_data(trainer.env_train, "Train Reward")
      ep_reward_plot_add_data(trainer.env_test, "Test Reward")

    # Main training loop.
    for epoch in tqdm.tqdm(range(1, n_epochs+1), desc="epoch"):
      trainer.train_disc(n_disc_steps_per_epoch)
      disc_plot_add_data(False)
      trainer.train_gen(n_gen_steps_per_epoch)
      disc_plot_add_data(True)

      if should_plot_now(epoch):
        disc_plot_show()
        ep_reward_plot_add_data(trainer.env, "Ground Truth Reward")
        ep_reward_plot_add_data(trainer.env_train, "Train Reward")
        ep_reward_plot_add_data(trainer.env_test, "Test Reward")
        ep_reward_plot_show()

      if checkpoint_interval > 0 and epoch % checkpoint_interval == 0:
        save(trainer, os.path.join(log_dir, "checkpoints", f"{epoch:05d}"))

    save(trainer, os.path.join(log_dir, "final"))

    # Final evaluation of imitation policy.
    stats = util.rollout.rollout_stats(trainer.gen_policy,
                                       trainer.env,
                                       n_episodes=n_episodes_eval)
    n_traj = stats["n_traj"]
    assert n_traj >= n_episodes_eval
    mean = stats["return_mean"]
    # The rollout may return more than `n_episodes_eval` trajectories, so the
    # standard error uses the actual sample size, which is also the `n`
    # reported below.
    std_err = stats["return_std"] / math.sqrt(n_traj)
    print(f"[result] Mean Episode Return: {mean:.4g} ± {std_err:.3g} "
          f"(n={stats['n_traj']})")

    return dict(mean=mean, std_err=std_err)
예제 #5
0
def train_and_plot(
    policy_dir,
    env='CartPole-v1',
    n_epochs=100,
    n_plots_each_per_epoch=0,
    n_disc_steps_per_epoch=10,
    n_gen_steps_per_epoch=10000,
    n_gen_plot_episodes=0,
    trainer_hook_fn=None,
    trainer=None,
    interactive=True,
):
    """Alternate between training the generator and discriminator.

    Every epoch:
      - Plot discriminator loss recorded during discriminator training steps
        in green and discriminator loss recorded during generator training
        steps in red (only if `n_plots_each_per_epoch > 0`).
      - Plot the performance of the generator policy versus the performance
        of a random policy and of the trainer's last expert policy (only if
        `n_gen_plot_episodes > 0`).

    Args:
        policy_dir: Passed to `init_trainer` as `policy_dir` when no
            `trainer` is supplied.
        env: Environment passed to `init_trainer`, used for the performance
            rollouts, and wrapped with the trainer's test reward.
        n_epochs: Number of epochs. Each epoch runs the discriminator steps
            followed by the generator steps.
        n_plots_each_per_epoch: How many discriminator-loss data points to
            record per training phase per epoch. If <= 0, no loss data is
            recorded or plotted. If larger than the number of steps in a
            phase, one data point is recorded per step.
        n_disc_steps_per_epoch: Discriminator steps per epoch.
        n_gen_steps_per_epoch: Generator steps per epoch.
        n_gen_plot_episodes: Episodes averaged over for each performance data
            point. If <= 0, no performance data is recorded or plotted.
        trainer_hook_fn: Optional callable invoked with the trainer once
            before training and again at the end of every epoch.
        trainer: Optional existing trainer. If None, one is created with
            `init_trainer`.
        interactive: Forwarded to `_savefig_timestamp` for every figure.

    Returns:
        A tuple `(trainer, gen_data, disc_data, gen_ep_reward)`. `gen_data`
        and `disc_data` are `(x, y)` pairs of lists with the recorded
        discriminator losses; `gen_ep_reward` maps plot name to the list of
        generator mean returns.
    """
    if trainer is None:
        trainer = init_trainer(env, policy_dir=policy_dir)
    if trainer_hook_fn:
        trainer_hook_fn(trainer)

    os.makedirs("output/", exist_ok=True)

    plot_idx = 0

    # Each is an (x values, y values) pair of parallel lists.
    gen_data = ([], [])
    disc_data = ([], [])

    def add_plot_disc(gen_mode=False):
        """Record the current discriminator loss.

        Arguments:
            gen_mode (bool): Whether the generator or the discriminator is
                active. We use this to color the data points.
        """
        if n_plots_each_per_epoch <= 0:
            return

        mode = "gen" if gen_mode else "dis"
        X, Y = gen_data if gen_mode else disc_data
        X.append(plot_idx)
        Y.append(trainer.eval_disc_loss())
        tf.logging.info("plot idx ({}): {} disc loss: {}".format(
            mode, plot_idx, Y[-1]))

    def show_plot_disc(epoch):
        """Render the discriminator-loss scatter plot for `epoch`."""
        if n_plots_each_per_epoch <= 0:
            return

        plt.scatter(disc_data[0],
                    disc_data[1],
                    c='g',
                    alpha=0.7,
                    s=4,
                    label="discriminator loss (dis step)")
        plt.scatter(gen_data[0],
                    gen_data[1],
                    c='r',
                    alpha=0.7,
                    s=4,
                    label="discriminator loss (gen step)")
        plt.title("epoch={}".format(epoch))
        plt.legend()
        _savefig_timestamp("plot_fight_loss_disc", interactive)

    # Map from plot name to the list of mean returns recorded so far.
    gen_ep_reward = defaultdict(list)
    rand_ep_reward = defaultdict(list)
    exp_ep_reward = defaultdict(list)

    def add_plot_gen(env, name):
        """Record mean returns of generator, random and expert policies."""
        if n_gen_plot_episodes <= 0:
            return

        gen_policy = trainer.gen_policy
        rand_policy = util.make_blank_policy(env)
        exp_policy = trainer.expert_policies[-1]

        gen_ret = util.rollout.mean_return(gen_policy,
                                           env,
                                           n_episodes=n_gen_plot_episodes)
        rand_ret = util.rollout.mean_return(rand_policy,
                                            env,
                                            n_episodes=n_gen_plot_episodes)
        exp_ret = util.rollout.mean_return(exp_policy,
                                           env,
                                           n_episodes=n_gen_plot_episodes)
        gen_ep_reward[name].append(gen_ret)
        rand_ep_reward[name].append(rand_ret)
        exp_ep_reward[name].append(exp_ret)
        tf.logging.info("generator return: {}".format(gen_ret))
        tf.logging.info("random return: {}".format(rand_ret))
        tf.logging.info("exp return: {}".format(exp_ret))

    def show_plot_gen():
        """Render one performance plot per recorded reward name."""
        if n_gen_plot_episodes <= 0:
            return

        for name in gen_ep_reward:
            plt.title(name + " Performance")
            plt.xlabel("epochs")
            plt.ylabel("Average reward per episode (n={})".format(
                n_gen_plot_episodes))
            plt.plot(gen_ep_reward[name], label="avg gen ep reward", c="red")
            plt.plot(rand_ep_reward[name],
                     label="avg random ep reward",
                     c="black")
            plt.plot(exp_ep_reward[name], label="avg exp ep reward", c="blue")
            plt.legend()
            _savefig_timestamp("plot_fight_epreward_gen", interactive)

    # Data points before any training has happened.
    add_plot_disc(False)
    add_plot_gen(env, "True Reward")
    add_plot_gen(trainer.wrap_env_test_reward(env), "Learned Reward")

    if n_plots_each_per_epoch <= 0:
        # No intermediate data points: train each phase in a single chunk.
        n_gen_steps_per_plot = float('Inf')
        n_disc_steps_per_plot = float('Inf')
    else:
        # Clamp to at least 1. When more plots than steps are requested the
        # rounded chunk size is 0, and `train_plot_itr` would then train 0
        # steps per iteration and never terminate.
        n_gen_steps_per_plot = max(1, int(
            round(n_gen_steps_per_epoch / n_plots_each_per_epoch)))
        n_disc_steps_per_plot = max(1, int(
            round(n_disc_steps_per_epoch / n_plots_each_per_epoch)))

    def train_plot_itr(steps, gen_mode, steps_per_plot):
        """Train `steps` steps in chunks, recording the loss after each."""
        nonlocal plot_idx
        while steps > 0:
            steps_to_train = min(steps, steps_per_plot)
            if gen_mode:
                trainer.train_gen(n_steps=steps_to_train)
            else:
                trainer.train_disc(n_steps=steps_to_train)
            steps -= steps_to_train
            add_plot_disc(gen_mode)
            plot_idx += 1

    for epoch_num in tqdm.trange(n_epochs, desc="epoch"):
        train_plot_itr(n_disc_steps_per_epoch, False, n_disc_steps_per_plot)
        train_plot_itr(n_gen_steps_per_epoch, True, n_gen_steps_per_plot)

        add_plot_gen(env, "True Reward")
        add_plot_gen(trainer.wrap_env_test_reward(env), "Learned Reward")

        show_plot_disc(epoch_num)
        show_plot_gen()
        if trainer_hook_fn:
            trainer_hook_fn(trainer)

    return trainer, gen_data, disc_data, gen_ep_reward
예제 #6
0
def test_train_gen_no_crash(use_gail, env='CartPole-v1', n_steps=10):
    """Smoke test: a short generator training run must not raise."""
    init_trainer(env, use_gail=use_gail).train_gen(n_steps)
예제 #7
0
def test_train_disc_no_crash(use_gail, env='CartPole-v1', n_timesteps=200):
    """Smoke test: discriminator training must not raise.

    `train_disc` is exercised twice: once with no arguments, and once with
    generator samples rolled out from the trainer's current generator policy.
    """
    disc_trainer = init_trainer(env, use_gail=use_gail)

    # First call: no explicit generator samples.
    disc_trainer.train_disc()

    # Second call: explicit generator samples from a fresh rollout.
    trajectories = rollout.generate(
        disc_trainer.gen_policy, env, n_timesteps=n_timesteps)
    old_obs, actions, new_obs, _ = rollout.flatten_trajectories(trajectories)
    disc_trainer.train_disc(
        gen_old_obs=old_obs, gen_act=actions, gen_new_obs=new_obs)
예제 #8
0
def test_init_no_crash(use_gail, env='CartPole-v1'):
    """Smoke test: constructing a trainer via `init_trainer` must not raise."""
    init_trainer(env, use_gail=use_gail)
예제 #9
0
def test_train_no_crash(use_gail, env='CartPole-v1'):
    """Smoke test: a single training epoch must not raise."""
    init_trainer(env, use_gail=use_gail).train(n_epochs=1)
예제 #10
0
def init_test_trainer(env_id: str, use_gail: bool, parallel: bool = False):
  """Build a trainer for tests via `init_trainer`.

  The rollout glob points at the pickles under `tests/data/rollouts/` whose
  file names start with `env_id`; the remaining arguments are forwarded
  unchanged.
  """
  rollout_glob = f"tests/data/rollouts/{env_id}*.pkl"
  return init_trainer(env_id=env_id,
                      rollout_glob=rollout_glob,
                      use_gail=use_gail,
                      parallel=parallel)