Example #1
def test_bc(trainer: bc.BC, venv):
    sample_until = rollout.min_episodes(15)
    novice_ret_mean = rollout.mean_return(trainer.policy, venv, sample_until)
    trainer.train(n_epochs=1, on_epoch_end=lambda _: print("epoch end"))
    trained_ret_mean = rollout.mean_return(trainer.policy, venv, sample_until)
    # Typically a score below 80 is bad and above 350 is okay. We want an
    # improvement of at least 50 points, which is large enough that it is
    # unlikely to be noise.
    assert trained_ret_mean - novice_ret_mean > 50
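
The `trainer` and `venv` arguments here are pytest fixtures supplied by the test module. Below is a minimal sketch of what they might look like, reusing the `util.make_vec_env` and `bc.BC(observation_space, action_space, expert_data=...)` calls that appear in Example #4; the environment name and the demonstrations path are assumptions.

import pickle

import pytest

from imitation.algorithms import bc
from imitation.data import rollout
from imitation.util import util


@pytest.fixture
def venv():
    # Two parallel copies of the environment; "CartPole-v1" is an assumption.
    return util.make_vec_env("CartPole-v1", n_envs=2)


@pytest.fixture
def trainer(venv):
    # Load pickled expert trajectories and flatten them into Transitions,
    # exactly as in Example #4. The path is a placeholder.
    with open("expert_demos/cartpole.pkl", "rb") as f:
        trajectories = pickle.load(f)
    transitions = rollout.flatten_trajectories(trajectories)
    return bc.BC(venv.observation_space, venv.action_space, expert_data=transitions)
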
Example #2
def test_bc(trainer: bc.BC, venv):
    sample_until = rollout.min_episodes(25)
    novice_ret_mean = rollout.mean_return(trainer.policy, venv, sample_until)
    trainer.train(n_epochs=40)
    trained_ret_mean = rollout.mean_return(trainer.policy, venv, sample_until)
    # novice is bad
    assert novice_ret_mean < 80.0
    # bc is okay but isn't perfect (for the purpose of this test)
    assert trained_ret_mean > 350.0
Example #3
def test_trainer_makes_progress(tmpdir, session):
    venv = util.make_vec_env(ENV_NAME, 10)
    trainer = make_trainer(tmpdir)
    with pytest.raises(dagger.NeedsDemosException):
        trainer.extend_and_update()
    assert trainer.round_num == 0
    pre_train_rew_mean = rollout.mean_return(
        trainer.bc_trainer.policy,
        venv,
        sample_until=rollout.min_episodes(20),
        deterministic_policy=True,
    )
    # checking that the initial policy is poor can be flaky; sometimes the
    # randomly initialised policy performs very well, and it's not clear why
    # assert pre_train_rew_mean < 100
    with serialize.load_policy("ppo2", EXPERT_POLICY_PATH, venv) as expert_policy:
        for i in range(5):
            # roll out a few trajectories for dataset, then train for a few steps
            collector = trainer.get_trajectory_collector()
            for _ in range(10):
                obs = collector.reset()
                done = False
                while not done:
                    (expert_action,), _, _, _ = expert_policy.step(
                        obs[None], deterministic=True
                    )
                    obs, _, done, _ = collector.step(expert_action)
            trainer.extend_and_update(n_epochs=10)
    # make sure we're doing better than a random policy would
    post_train_rew_mean = rollout.mean_return(
        trainer.bc_trainer.policy,
        venv,
        sample_until=rollout.min_episodes(20),
        deterministic_policy=True,
    )
    assert post_train_rew_mean > 150, (
        f"pre-train mean {pre_train_rew_mean}, post-train mean "
        f"{post_train_rew_mean}"
    )
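
A minimal sketch of the `make_trainer` helper and module-level constants this test assumes. The `DAggerTrainer` constructor arguments differ across imitation releases, so the call below (positional venv and scratch directory, with the library-default beta schedule and an internally constructed BC trainer) is an assumption; `ENV_NAME` and `EXPERT_POLICY_PATH` are placeholders.

from imitation.algorithms import dagger
from imitation.util import util

ENV_NAME = "CartPole-v1"  # assumption: must match the saved expert policy
EXPERT_POLICY_PATH = "expert_models/cartpole_ppo2/"  # placeholder


def make_trainer(tmpdir):
    # Assumption: DAggerTrainer accepts (venv, scratch_dir) and builds its own
    # BC trainer with a default beta schedule; newer releases require the BC
    # trainer (and an rng) to be passed in explicitly.
    venv = util.make_vec_env(ENV_NAME, 10)
    return dagger.DAggerTrainer(venv, tmpdir)
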
Example #4
def imitation_learning(expert_traj_path, imitation_algo_name, rl_algo_name,
                       env_name):
    # Load pickled expert demonstrations.
    with open(expert_traj_path, "rb") as f:
        # This is a list of `imitation.data.types.Trajectory`, where
        # every instance contains observations and actions for a single expert
        # demonstration.
        trajectories = pickle.load(f)
    # Convert List[types.Trajectory] to an instance of `imitation.data.types.Transitions`.
    # This is a more general dataclass containing unordered
    # (observation, actions, next_observation) transitions.
    transitions = rollout.flatten_trajectories(trajectories)

    venv = util.make_vec_env(env_name, n_envs=2)

    # tempdir = tempfile.TemporaryDirectory(prefix="il_results/{}_{}".format(rl_algo_name, env_name))
    # tempdir_path = pathlib.Path(tempdir.name)
    # print(f"All Tensorboards and logging are being written inside {tempdir_path}/.")
    log_path = "il_results/{}_{}/{}/".format(rl_algo_name, env_name,
                                             imitation_algo_name)

    if imitation_algo_name == 'BC':
        # Train BC on expert data.
        # BC also accepts as `expert_data` any PyTorch-style DataLoader that iterates over
        # dictionaries containing observations and actions.
        logger.configure(log_path, format_strs=["stdout", "tensorboard"])
        trainer = bc.BC(venv.observation_space,
                        venv.action_space,
                        expert_data=transitions)
        trainer.train(n_epochs=100, log_interval=1)

    elif imitation_algo_name == 'GAIL':
        logger.configure(log_path, format_strs=["stdout", "tensorboard"])
        gail_trainer = adversarial.GAIL(
            venv,
            expert_data=transitions,
            expert_batch_size=32,
            gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
            discrim_kwargs={
                'discrim_net':
                ActObsMLP(
                    action_space=venv.action_space,
                    observation_space=venv.observation_space,
                    hid_sizes=(32, 32),
                )
            })
        gail_trainer.train(total_timesteps=2048)
        trainer = gail_trainer.gen_algo
    elif imitation_algo_name == 'AIRL':
        # Train AIRL on expert data.
        logger.configure(log_path)
        airl_trainer = adversarial.AIRL(
            venv,
            expert_data=transitions,
            expert_batch_size=32,
            gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
        )
        airl_trainer.train(total_timesteps=2048)
        # Evaluate and save the generator policy, as in the GAIL branch.
        trainer = airl_trainer.gen_algo

    sample_until = rollout.min_episodes(15)
    trained_ret_mean = rollout.mean_return(trainer.policy, venv, sample_until)
    # trainer.save_policy("{}/bc_policy.pth.tar".format(log_path))
    th.save(trainer.policy,
            "{}/{}_policy.pth.tar".format(log_path, imitation_algo_name))

    return trained_ret_mean
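
For reference, a hedged usage sketch: the imports below follow the imitation ~0.2 module layout this snippet appears to target (the `ActObsMLP` import path and the demonstrations path are assumptions), and the call trains BC on the pickled demonstrations and prints the resulting mean return.

import pickle

import stable_baselines3 as sb3
import torch as th

from imitation.algorithms import adversarial, bc
from imitation.data import rollout
from imitation.rewards.discrim_nets import ActObsMLP  # assumption: path varies by version
from imitation.util import logger, util

mean_ret = imitation_learning(
    expert_traj_path="expert_demos/cartpole.pkl",  # hypothetical path
    imitation_algo_name="BC",
    rl_algo_name="PPO",
    env_name="CartPole-v1",
)
print(f"Mean return of the imitation policy: {mean_ret:.1f}")
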