def test_bc(trainer: bc.BC, venv):
    sample_until = rollout.min_episodes(15)
    novice_ret_mean = rollout.mean_return(trainer.policy, venv, sample_until)
    trainer.train(n_epochs=1, on_epoch_end=lambda _: print("epoch end"))
    trained_ret_mean = rollout.mean_return(trainer.policy, venv, sample_until)
    # Typically <80 score is bad, >350 is okay. We want an improvement of at
    # least 50 points, which seems like it's not noise.
    assert trained_ret_mean - novice_ret_mean > 50
def test_bc(trainer: bc.BC, venv):
    sample_until = rollout.min_episodes(25)
    novice_ret_mean = rollout.mean_return(trainer.policy, venv, sample_until)
    trainer.train(n_epochs=40)
    trained_ret_mean = rollout.mean_return(trainer.policy, venv, sample_until)
    # novice is bad
    assert novice_ret_mean < 80.0
    # bc is okay but isn't perfect (for the purpose of this test)
    assert trained_ret_mean > 350.0
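# A minimal sketch of the fixtures the two test_bc variants above assume. The
# fixture names match the test signatures, but the environment, rollout path,
# and exact BC constructor arguments are assumptions (mirroring the
# imitation_learning() example further below), not copied from the library's
# test suite.
import pickle

import pytest

from imitation.algorithms import bc
from imitation.data import rollout
from imitation.util import util

ROLLOUT_PATH = "tests/data/expert_models/cartpole_0/rollouts/final.pkl"  # hypothetical path


@pytest.fixture
def venv():
    # Vectorised CartPole environment used both for training and evaluation.
    return util.make_vec_env("CartPole-v1", n_envs=2)


@pytest.fixture
def trainer(venv):
    # Load pickled expert trajectories and flatten them into transitions.
    with open(ROLLOUT_PATH, "rb") as f:
        trajectories = pickle.load(f)
    transitions = rollout.flatten_trajectories(trajectories)
    return bc.BC(venv.observation_space, venv.action_space, expert_data=transitions)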
def test_trainer_makes_progress(tmpdir, session):
    venv = util.make_vec_env(ENV_NAME, 10)
    trainer = make_trainer(tmpdir)
    with pytest.raises(dagger.NeedsDemosException):
        trainer.extend_and_update()
    assert trainer.round_num == 0
    pre_train_rew_mean = rollout.mean_return(
        trainer.bc_trainer.policy,
        venv,
        sample_until=rollout.min_episodes(20),
        deterministic_policy=True,
    )
    # checking that the initial policy is poor can be flaky; sometimes the
    # randomly initialised policy performs very well, and it's not clear why
    # assert pre_train_rew_mean < 100
    with serialize.load_policy("ppo2", EXPERT_POLICY_PATH, venv) as expert_policy:
        for i in range(5):
            # roll out a few trajectories for dataset, then train for a few steps
            collector = trainer.get_trajectory_collector()
            for _ in range(10):
                obs = collector.reset()
                done = False
                while not done:
                    (expert_action,), _, _, _ = expert_policy.step(
                        obs[None], deterministic=True
                    )
                    obs, _, done, _ = collector.step(expert_action)
            trainer.extend_and_update(n_epochs=10)
    # make sure we're doing better than a random policy would
    post_train_rew_mean = rollout.mean_return(
        trainer.bc_trainer.policy,
        venv,
        sample_until=rollout.min_episodes(20),
        deterministic_policy=True,
    )
    assert post_train_rew_mean > 150, (
        f"pre-train mean {pre_train_rew_mean}, post-train mean "
        f"{post_train_rew_mean}"
    )
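# A minimal sketch of the module-level helpers the DAgger test above relies
# on. ENV_NAME, EXPERT_POLICY_PATH, and make_trainer are assumptions: the real
# test module defines its own versions, and the DAggerTrainer constructor
# arguments may differ between imitation versions.
import gym
import pytest

from imitation.algorithms import dagger
from imitation.data import rollout
from imitation.policies import serialize
from imitation.util import util

ENV_NAME = "CartPole-v1"  # assumed environment
EXPERT_POLICY_PATH = "tests/data/expert_models/cartpole_0/policies/final/"  # hypothetical path


def make_trainer(scratch_dir):
    # DAggerTrainer stores collected demonstrations and BC checkpoints under
    # scratch_dir; the beta schedule controls how often the expert action
    # (rather than the learner's) is actually executed during collection.
    env = gym.make(ENV_NAME)
    return dagger.DAggerTrainer(
        env,
        scratch_dir,
        beta_schedule=dagger.LinearBetaSchedule(15),
    )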
import pickle

import stable_baselines3 as sb3
import torch as th

from imitation.algorithms import adversarial, bc
from imitation.data import rollout
from imitation.util import logger, util

# ActObsMLP is a custom discriminator network; it is assumed to be defined or
# imported elsewhere in this module.


def imitation_learning(expert_traj_path, imitation_algo_name, rl_algo_name, env_name):
    # Load pickled expert demonstrations.
    with open(expert_traj_path, "rb") as f:
        # This is a list of `imitation.data.types.Trajectory`, where every
        # instance contains observations and actions for a single expert
        # demonstration.
        trajectories = pickle.load(f)

    # Convert List[types.Trajectory] to an instance of
    # `imitation.data.types.Transitions`. This is a more general dataclass
    # containing unordered (observation, action, next_observation) transitions.
    transitions = rollout.flatten_trajectories(trajectories)

    venv = util.make_vec_env(env_name, n_envs=2)

    # tempdir = tempfile.TemporaryDirectory(prefix="il_results/{}_{}".format(rl_algo_name, env_name))
    # tempdir_path = pathlib.Path(tempdir.name)
    # print(f"All Tensorboards and logging are being written inside {tempdir_path}/.")
    log_path = "il_results/{}_{}/{}/".format(rl_algo_name, env_name, imitation_algo_name)

    if imitation_algo_name == 'BC':
        # Train BC on expert data.
        # BC also accepts as `expert_data` any PyTorch-style DataLoader that
        # iterates over dictionaries containing observations and actions.
        logger.configure(log_path, format_strs=["stdout", "tensorboard"])
        trainer = bc.BC(venv.observation_space, venv.action_space, expert_data=transitions)
        trainer.train(n_epochs=100, log_interval=1)
    elif imitation_algo_name == 'GAIL':
        # Train GAIL on expert data.
        logger.configure(log_path, format_strs=["stdout", "tensorboard"])
        gail_trainer = adversarial.GAIL(
            venv,
            expert_data=transitions,
            expert_batch_size=32,
            gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
            discrim_kwargs={
                'discrim_net': ActObsMLP(
                    action_space=venv.action_space,
                    observation_space=venv.observation_space,
                    hid_sizes=(32, 32),
                )
            },
        )
        gail_trainer.train(total_timesteps=2048)
        trainer = gail_trainer.gen_algo
    elif imitation_algo_name == 'AIRL':
        # Train AIRL on expert data.
        logger.configure(log_path)
        airl_trainer = adversarial.AIRL(
            venv,
            expert_data=transitions,
            expert_batch_size=32,
            gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
        )
        airl_trainer.train(total_timesteps=2048)
        # Evaluate the generator policy, mirroring the GAIL branch above.
        trainer = airl_trainer.gen_algo

    # Evaluate the trained policy, then save it.
    sample_until = rollout.min_episodes(15)
    trained_ret_mean = rollout.mean_return(trainer.policy, venv, sample_until)
    # trainer.save_policy("{}/bc_policy.pth.tar".format(log_path))
    th.save(trainer.policy, "{}/{}_policy.pth.tar".format(log_path, imitation_algo_name))
    return trained_ret_mean
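# Example invocation (a sketch: the rollout path and environment name are
# placeholders for whatever pickled expert demonstrations you have on disk).
if __name__ == "__main__":
    mean_ret = imitation_learning(
        expert_traj_path="expert_models/cartpole_0/rollouts/final.pkl",
        imitation_algo_name="BC",
        rl_algo_name="PPO",
        env_name="CartPole-v1",
    )
    print(f"Mean return of the trained BC policy: {mean_ret}")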