def test_train_disc_improve_D(use_gail, env='CartPole-v1', n_timesteps=200,
                              n_steps=1000):
  """Train the discriminator on a fixed batch and check its loss decreases."""
  trainer = init_trainer(env, use_gail=use_gail)
  obs_old, act, obs_new, _ = rollout.flatten_trajectories(
      rollout.generate(trainer.gen_policy, env, n_timesteps=n_timesteps))
  kwargs = dict(gen_old_obs=obs_old, gen_act=act, gen_new_obs=obs_new)
  loss1 = trainer.eval_disc_loss(**kwargs)
  trainer.train_disc(n_steps=n_steps, **kwargs)
  loss2 = trainer.eval_disc_loss(**kwargs)
  assert loss2 < loss1
def test_wrap_learned_reward_no_crash(use_gail, env="CartPole-v1"):
  """Briefly train with AIRL, then use the learned reward to wrap a duplicate
  environment. Finally, use that learned reward to train a policy.
  """
  trainer = init_trainer(env, use_gail=use_gail)
  trainer.train(n_epochs=1)

  learned_reward_env = trainer.wrap_env_test_reward(env)
  policy = util.make_blank_policy(env, init_tensorboard=False)
  policy.set_env(learned_reward_env)
  policy.learn(10)
def test_trained_policy_better_than_random(use_gail, env='CartPole-v1',
                                           n_episodes=50):
  """Make sure that a generator policy trained to mimic expert policy
  demonstrations achieves higher reward than a random policy.

  In other words, perform a basic check on the imitation learning
  capabilities of AIRL and GAIL.
  """
  env = util.make_vec_env(env, 32)
  trainer = init_trainer(env, use_expert_rollouts=True, use_gail=use_gail)
  expert_policy = util.load_policy(env, basedir="expert_models")
  random_policy = util.make_blank_policy(env)
  if expert_policy is None:
    pytest.fail("Couldn't load expert_policy!")

  trainer.train(n_epochs=200)

  # Idea: Plot n_epochs vs generator reward.
  for _ in range(4):
    expert_ret = rollout.mean_return(expert_policy, env,
                                     n_episodes=n_episodes)
    gen_ret = rollout.mean_return(trainer.gen_policy, env,
                                  n_episodes=n_episodes)
    random_ret = rollout.mean_return(random_policy, env,
                                     n_episodes=n_episodes)

    print("expert return:", expert_ret)
    print("generator return:", gen_ret)
    print("random return:", random_ret)
    assert expert_ret > random_ret
    assert gen_ret > random_ret
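# The tests in this module all take a `use_gail` argument that switches
# between GAIL and AIRL. A minimal sketch of the pytest fixture they
# presumably rely on is below; this is an assumption for illustration, and
# the real suite may instead apply `pytest.mark.parametrize` to each test.
@pytest.fixture(params=[True, False], ids=["gail", "airl"])
def use_gail(request):
  """Sketch: run each test once with GAIL (True) and once with AIRL (False)."""
  return request.param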
def train_and_plot(_seed: int,
                   env_name: str,
                   log_dir: str,
                   *,
                   n_epochs: int = 100,
                   n_epochs_per_plot: Optional[float] = None,
                   n_disc_steps_per_epoch: int = 10,
                   n_gen_steps_per_epoch: int = 10000,
                   n_episodes_per_reward_data: int = 5,
                   n_episodes_eval: int = 50,
                   checkpoint_interval: int = 5,
                   interactive: bool = True,
                   expert_policy=None,
                   init_trainer_kwargs: dict = {},
                   ) -> Dict[str, float]:
  """Alternate between training the generator and discriminator.

  Every epoch:
    - Plot discriminator loss during discriminator training steps in green
      and discriminator loss during generator training steps in red.
    - Plot the performance of the generator policy versus the performance of
      a random policy. Also plot the performance of an expert policy if that
      is provided in the arguments.

  Args:
      _seed: Random seed.
      env_name: The environment to train in.
      log_dir: Directory to save models and other logging to.
      n_epochs: The number of epochs to train. Each epoch consists of
          `n_disc_steps_per_epoch` discriminator steps followed by
          `n_gen_steps_per_epoch` generator steps.
      n_epochs_per_plot: An optional number, greater than or equal to 1. The
          (possibly fractional) number of epochs between each plot. The first
          plot is at epoch 0, after the first discriminator and generator
          steps. If `n_epochs_per_plot is None`, then don't make any plots.
      n_disc_steps_per_epoch: The number of discriminator update steps during
          every training epoch.
      n_gen_steps_per_epoch: The number of generator update steps during
          every training epoch.
      n_episodes_per_reward_data: The number of episodes averaged over when
          calculating the average episode reward of a policy for the
          performance plots.
      n_episodes_eval: The number of episodes to average over when
          calculating the average ground truth reward return of the final
          policy.
      checkpoint_interval: Save the discriminator and generator models every
          `checkpoint_interval` epochs and after training is complete. If
          <= 0, then only save weights after training is complete.
      interactive: Figures are always saved to `output/*.png`. If
          `interactive` is True, then also show plots as they are created.
      expert_policy (BasePolicy or BaseRLModel, optional): If provided, then
          also plot the performance of this expert policy.
      init_trainer_kwargs: Keyword arguments passed to `init_trainer`, used
          to initialize the trainer.

  Returns:
      results: A dictionary with two keys, "mean" and "std_err". The
          corresponding values are the mean and standard error of the ground
          truth episode return for the imitation learning algorithm.
  """
  assert n_epochs_per_plot is None or n_epochs_per_plot >= 1

  with util.make_session():
    trainer = init_trainer(env_name, seed=_seed, log_dir=log_dir,
                           **init_trainer_kwargs)

    tf.logging.info("Logging to %s", log_dir)
    os.makedirs(log_dir, exist_ok=True)
    sb_logger.configure(folder=osp.join(log_dir, 'generator'),
                        format_strs=['tensorboard', 'stdout'])

    plot_idx = 0
    gen_data = ([], [])
    disc_data = ([], [])

    def disc_plot_add_data(gen_mode: bool = False):
      """Evaluates and records the discriminator loss for plotting later.

      Args:
          gen_mode: Whether the generator or the discriminator is active.
              We use this to color the data points.
      """
      nonlocal plot_idx
      mode = "gen" if gen_mode else "dis"
      X, Y = gen_data if gen_mode else disc_data
      # Divide by two since we get two data points (gen and disc) per epoch.
      X.append(plot_idx / 2)
      Y.append(trainer.eval_disc_loss())
      tf.logging.info(
          "plot idx ({}): {} disc loss: {}".format(mode, plot_idx, Y[-1]))
      plot_idx += 1

    def disc_plot_show():
      """Render a plot of discriminator loss vs. training epoch number."""
      plt.scatter(disc_data[0], disc_data[1], c='g', alpha=0.7, s=4,
                  label="discriminator loss (dis step)")
      plt.scatter(gen_data[0], gen_data[1], c='r', alpha=0.7, s=4,
                  label="discriminator loss (gen step)")
      plt.title("Discriminator loss")
      plt.legend()
      _savefig_timestamp("plot_fight_loss_disc", interactive)

    gen_ep_reward = defaultdict(list)
    rand_ep_reward = defaultdict(list)
    exp_ep_reward = defaultdict(list)

    def ep_reward_plot_add_data(env, name):
      """Calculate and record average episode returns."""
      gen_policy = trainer.gen_policy
      gen_ret = util.rollout.mean_return(
          gen_policy, env, n_episodes=n_episodes_per_reward_data)
      gen_ep_reward[name].append(gen_ret)
      tf.logging.info("generator return: {}".format(gen_ret))

      rand_policy = util.init_rl(trainer.env)
      rand_ret = util.rollout.mean_return(
          rand_policy, env, n_episodes=n_episodes_per_reward_data)
      rand_ep_reward[name].append(rand_ret)
      tf.logging.info("random return: {}".format(rand_ret))

      if expert_policy is not None:
        exp_ret = util.rollout.mean_return(
            expert_policy, env, n_episodes=n_episodes_per_reward_data)
        exp_ep_reward[name].append(exp_ret)
        tf.logging.info("exp return: {}".format(exp_ret))

    def ep_reward_plot_show():
      """Render and show average episode reward plots."""
      for name in gen_ep_reward:
        plt.title(name + " Performance")
        plt.xlabel("epochs")
        plt.ylabel("Average reward per episode (n={})"
                   .format(n_episodes_per_reward_data))
        plt.plot(gen_ep_reward[name], label="avg gen ep reward", c="red")
        plt.plot(rand_ep_reward[name], label="avg random ep reward",
                 c="black")
        plt.plot(exp_ep_reward[name], label="avg exp ep reward", c="blue")
        plt.legend()
        _savefig_timestamp("plot_fight_epreward_gen", interactive)

    if n_epochs_per_plot is not None:
      n_plots_per_epoch = 1 / n_epochs_per_plot
    else:
      n_plots_per_epoch = None

    def should_plot_now(epoch) -> bool:
      """For positive epochs, returns True if a plot should be rendered now.

      This also controls the frequency at which `ep_reward_plot_add_data`
      is called, because generating those rollouts is too expensive to
      perform every timestep.
      """
      assert epoch >= 1
      if n_plots_per_epoch is None:
        return False
      plot_num = math.floor(n_plots_per_epoch * epoch)
      prev_plot_num = math.floor(n_plots_per_epoch * (epoch - 1))
      assert abs(plot_num - prev_plot_num) <= 1
      return plot_num != prev_plot_num

    # Collect data for epoch 0.
    if n_epochs_per_plot is not None:
      disc_plot_add_data(False)
      ep_reward_plot_add_data(trainer.env, "Ground Truth Reward")
      ep_reward_plot_add_data(trainer.env_train, "Train Reward")
      ep_reward_plot_add_data(trainer.env_test, "Test Reward")

    # Main training loop.
    for epoch in tqdm.tqdm(range(1, n_epochs + 1), desc="epoch"):
      trainer.train_disc(n_disc_steps_per_epoch)
      disc_plot_add_data(False)
      trainer.train_gen(n_gen_steps_per_epoch)
      disc_plot_add_data(True)

      if should_plot_now(epoch):
        disc_plot_show()
        ep_reward_plot_add_data(trainer.env, "Ground Truth Reward")
        ep_reward_plot_add_data(trainer.env_train, "Train Reward")
        ep_reward_plot_add_data(trainer.env_test, "Test Reward")
        ep_reward_plot_show()

      if checkpoint_interval > 0 and epoch % checkpoint_interval == 0:
        save(trainer, os.path.join(log_dir, "checkpoints", f"{epoch:05d}"))

    save(trainer, os.path.join(log_dir, "final"))

    # Final evaluation of imitation policy.
    stats = util.rollout.rollout_stats(trainer.gen_policy, trainer.env,
                                       n_episodes=n_episodes_eval)
    assert stats["n_traj"] >= n_episodes_eval
    mean = stats["return_mean"]
    std_err = stats["return_std"] / math.sqrt(n_episodes_eval)
    print(f"[result] Mean Episode Return: {mean:.4g} ± {std_err:.3g} "
          f"(n={stats['n_traj']})")

    return dict(mean=mean, std_err=std_err)
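# Usage sketch (hypothetical helper, not part of the original script):
# `train_and_plot` is normally driven by an experiment wrapper that supplies
# `_seed` and the configuration, but it can also be called directly. The
# `rollout_glob` kwarg mirrors the one used by `init_test_trainer` below;
# the log directory and rollout path are placeholders.
def _example_airl_smoke_run():
  """Sketch: a quick direct invocation of `train_and_plot` with AIRL."""
  return train_and_plot(
      _seed=0,
      env_name="CartPole-v1",
      log_dir="output/example/CartPole-v1",  # Placeholder path.
      n_epochs=5,
      n_epochs_per_plot=None,  # Disable plotting for a quick run.
      init_trainer_kwargs=dict(
          use_gail=False,  # False selects AIRL.
          rollout_glob="expert_models/CartPole-v1*.pkl",  # Placeholder path.
      ),
  )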
def train_and_plot(policy_dir,
                   env='CartPole-v1',
                   n_epochs=100,
                   n_plots_each_per_epoch=0,
                   n_disc_steps_per_epoch=10,
                   n_gen_steps_per_epoch=10000,
                   n_gen_plot_episodes=0,
                   trainer_hook_fn=None,
                   trainer=None,
                   interactive=True,
                   ):
  """Alternate between training the generator and discriminator.

  Every epoch:
    - Plot discriminator loss during discriminator training steps in green
      and discriminator loss during generator training steps in red.
    - Plot the performance of the generator policy versus the performance of
      a random policy.

  Params:
    - ...
  """
  if trainer is None:
    trainer = init_trainer(env, policy_dir=policy_dir)
  if trainer_hook_fn:
    trainer_hook_fn(trainer)

  os.makedirs("output/", exist_ok=True)

  plot_idx = 0
  gen_data = ([], [])
  disc_data = ([], [])

  def add_plot_disc(gen_mode=False):
    """Record the discriminator loss for plotting later.

    Arguments:
        gen_mode (bool): Whether the generator or the discriminator is
            active. We use this to color the data points.
    """
    if n_plots_each_per_epoch <= 0:
      return
    mode = "gen" if gen_mode else "dis"
    X, Y = gen_data if gen_mode else disc_data
    X.append(plot_idx)
    Y.append(trainer.eval_disc_loss())
    tf.logging.info("plot idx ({}): {} disc loss: {}".format(
        mode, plot_idx, Y[-1]))

  def show_plot_disc():
    """Render a plot of discriminator loss vs. training step."""
    if n_plots_each_per_epoch <= 0:
      return
    plt.scatter(disc_data[0], disc_data[1], c='g', alpha=0.7, s=4,
                label="discriminator loss (dis step)")
    plt.scatter(gen_data[0], gen_data[1], c='r', alpha=0.7, s=4,
                label="discriminator loss (gen step)")
    plt.title("epoch={}".format(epoch_num))
    plt.legend()
    _savefig_timestamp("plot_fight_loss_disc", interactive)

  gen_ep_reward = defaultdict(list)
  rand_ep_reward = defaultdict(list)
  exp_ep_reward = defaultdict(list)

  def add_plot_gen(env, name):
    """Record average episode returns of the generator, random, and expert
    policies."""
    if n_gen_plot_episodes <= 0:
      return
    gen_policy = trainer.gen_policy
    rand_policy = util.make_blank_policy(env)
    exp_policy = trainer.expert_policies[-1]

    gen_ret = util.rollout.mean_return(gen_policy, env,
                                       n_episodes=n_gen_plot_episodes)
    rand_ret = util.rollout.mean_return(rand_policy, env,
                                        n_episodes=n_gen_plot_episodes)
    exp_ret = util.rollout.mean_return(exp_policy, env,
                                       n_episodes=n_gen_plot_episodes)
    gen_ep_reward[name].append(gen_ret)
    rand_ep_reward[name].append(rand_ret)
    exp_ep_reward[name].append(exp_ret)
    tf.logging.info("generator return: {}".format(gen_ret))
    tf.logging.info("random return: {}".format(rand_ret))
    tf.logging.info("exp return: {}".format(exp_ret))

  def show_plot_gen():
    """Render and show the average episode reward plots."""
    if n_gen_plot_episodes <= 0:
      return
    for name in gen_ep_reward:
      plt.title(name + " Performance")
      plt.xlabel("epochs")
      plt.ylabel("Average reward per episode (n={})".format(
          n_gen_plot_episodes))
      plt.plot(gen_ep_reward[name], label="avg gen ep reward", c="red")
      plt.plot(rand_ep_reward[name], label="avg random ep reward", c="black")
      plt.plot(exp_ep_reward[name], label="avg exp ep reward", c="blue")
      plt.legend()
      _savefig_timestamp("plot_fight_epreward_gen", interactive)

  add_plot_disc(False)
  add_plot_gen(env, "True Reward")
  add_plot_gen(trainer.wrap_env_test_reward(env), "Learned Reward")

  if n_plots_each_per_epoch <= 0:
    n_gen_steps_per_plot = float('Inf')
    n_disc_steps_per_plot = float('Inf')
  else:
    n_gen_steps_per_plot = int(
        round(n_gen_steps_per_epoch / n_plots_each_per_epoch))
    n_disc_steps_per_plot = int(
        round(n_disc_steps_per_epoch / n_plots_each_per_epoch))

  def train_plot_itr(steps, gen_mode, steps_per_plot):
    """Train in chunks of `steps_per_plot`, recording the discriminator loss
    after each chunk."""
    nonlocal plot_idx
    while steps > 0:
      steps_to_train = min(steps, steps_per_plot)
      if gen_mode:
        trainer.train_gen(n_steps=steps_to_train)
      else:
        trainer.train_disc(n_steps=steps_to_train)
      steps -= steps_to_train
      add_plot_disc(gen_mode)
      plot_idx += 1

  for epoch_num in tqdm.trange(n_epochs, desc="epoch"):
    train_plot_itr(n_disc_steps_per_epoch, False, n_disc_steps_per_plot)
    train_plot_itr(n_gen_steps_per_epoch, True, n_gen_steps_per_plot)
    add_plot_gen(env, "True Reward")
    add_plot_gen(trainer.wrap_env_test_reward(env), "Learned Reward")

    show_plot_disc()
    show_plot_gen()
    if trainer_hook_fn:
      trainer_hook_fn(trainer)

  return trainer, gen_data, disc_data, gen_ep_reward
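# Both versions of `train_and_plot` above call `_savefig_timestamp`, which is
# not shown in this section. The sketch below is an assumption about its
# behavior: save the current matplotlib figure under `output/` with a
# timestamped filename, and optionally display it when running interactively.
def _savefig_timestamp(prefix="", also_show=True):
  """Sketch (assumed behavior): save the current figure with a timestamp."""
  import datetime  # Local import so this sketch stays self-contained.
  path = "output/{}_{}.png".format(prefix, datetime.datetime.now().isoformat())
  plt.savefig(path)
  tf.logging.info("plot saved to {}".format(path))
  if also_show:
    plt.show()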
def test_train_gen_no_crash(use_gail, env='CartPole-v1', n_steps=10):
  trainer = init_trainer(env, use_gail=use_gail)
  trainer.train_gen(n_steps)
def test_train_disc_no_crash(use_gail, env='CartPole-v1', n_timesteps=200):
  trainer = init_trainer(env, use_gail=use_gail)
  trainer.train_disc()
  obs_old, act, obs_new, _ = rollout.flatten_trajectories(
      rollout.generate(trainer.gen_policy, env, n_timesteps=n_timesteps))
  trainer.train_disc(gen_old_obs=obs_old, gen_act=act, gen_new_obs=obs_new)
def test_init_no_crash(use_gail, env='CartPole-v1'):
  init_trainer(env, use_gail=use_gail)
def test_train_no_crash(use_gail, env='CartPole-v1'):
  trainer = init_trainer(env, use_gail=use_gail)
  trainer.train(n_epochs=1)
def init_test_trainer(env_id: str, use_gail: bool, parallel: bool = False):
  """Build a trainer for tests, loading expert rollouts from the test data
  directory."""
  return init_trainer(env_id=env_id,
                      rollout_glob=f"tests/data/rollouts/{env_id}*.pkl",
                      use_gail=use_gail,
                      parallel=parallel)
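# Hypothetical usage sketch (not from the original suite): how a test might
# exercise `init_test_trainer` with the `use_gail` fixture sketched earlier,
# additionally parametrized over `parallel`. The assumption is that
# `parallel=True` makes `init_trainer` build subprocess-based vectorized
# environments rather than in-process ones.
@pytest.mark.parametrize("parallel", [True, False])
def test_init_test_trainer_no_crash(use_gail, parallel,
                                    env_id="CartPole-v1"):
  """Sketch: smoke-test trainer construction with and without parallelism."""
  init_test_trainer(env_id=env_id, use_gail=use_gail, parallel=parallel)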