Example #1
def test_serialize_identity(discrim_net, venv, tmpdir):
    """Does output of deserialized discriminator match that of original?"""
    original = discrim_net
    random = base.RandomPolicy(venv.observation_space, venv.action_space)

    tmppath = os.path.join(tmpdir, "discrim_net.pt")
    th.save(original, tmppath)
    loaded = th.load(tmppath)

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)

    rewards = {"train": [], "test": []}
    for net in [original, loaded]:
        rewards["train"].append(
            net.predict_reward_train(
                transitions.obs,
                transitions.acts,
                transitions.next_obs,
                transitions.dones,
            ))
        rewards["test"].append(
            net.predict_reward_test(
                transitions.obs,
                transitions.acts,
                transitions.next_obs,
                transitions.dones,
            ))

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])
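
The round-trip in Example #1 is a generic PyTorch pattern: `th.save` pickles the whole module, `th.load` restores it, and the two modules are compared on identical inputs. A minimal self-contained sketch of that pattern, using a plain `nn.Linear` stand-in rather than the `discrim_net` fixture:

import os
import tempfile

import numpy as np
import torch as th
from torch import nn

with tempfile.TemporaryDirectory() as tmpdir:
    original = nn.Linear(4, 1)
    path = os.path.join(tmpdir, "net.pt")
    th.save(original, path)  # pickles the full module, not just a state dict
    loaded = th.load(path)  # recent PyTorch versions may need weights_only=False

    x = th.randn(8, 4)
    with th.no_grad():
        # Identical parameters imply identical outputs on the same inputs.
        assert np.allclose(original(x).numpy(), loaded(x).numpy())
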
Example #2
def test_train_disc_small_expert_data_warning(tmpdir, _algorithm_cls):
    logger.configure(tmpdir, ["tensorboard", "stdout"])
    venv = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
    )

    gen_algo = util.init_rl(venv, verbose=1)
    small_data = rollout.generate_transitions(gen_algo, venv, n_timesteps=20)

    with pytest.raises(ValueError, match="Transitions.*expert_batch_size"):
        _algorithm_cls(
            venv=venv,
            expert_data=small_data,
            expert_batch_size=21,
            gen_algo=gen_algo,
            log_dir=tmpdir,
        )

    with pytest.raises(ValueError, match="expert_batch_size.*positive"):
        _algorithm_cls(
            venv=venv,
            expert_data=small_data,
            expert_batch_size=-1,
            gen_algo=gen_algo,
            log_dir=tmpdir,
        )
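
In both `pytest.raises` blocks above, `match` is a regular expression applied with `re.search` to the string form of the raised exception, so a pattern such as "expert_batch_size.*positive" only requires the two fragments to appear in order. A tiny self-contained illustration (the validator below is invented for the example, not the constructor under test):

import pytest


def check_expert_batch_size(n: int) -> None:
    # Hypothetical stand-in for the validation the algorithm constructor performs.
    if n <= 0:
        raise ValueError(f"expert_batch_size={n} must be positive.")


def test_rejects_nonpositive_batch_size():
    with pytest.raises(ValueError, match="expert_batch_size.*positive"):
        check_expert_batch_size(-1)
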
Example #3
def test_serialize_identity(env_name, model_cfg, normalize, tmpdir):
    """Test output actions of deserialized policy are same as original."""
    orig_venv = venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    vec_normalize = None
    if normalize:
        venv = vec_normalize = VecNormalize(venv)

    model_name, model_cls_name = model_cfg
    try:
        model_cls = registry.load_attr(model_cls_name)
    except (AttributeError, ImportError):  # pragma: no cover
        pytest.skip(
            "Couldn't load stable baselines class. "
            "(Probably because mpi4py not installed.)"
        )

    model = model_cls("MlpPolicy", venv)
    model.learn(1000)

    venv.env_method("seed", 0)
    venv.reset()
    if normalize:
        # don't want statistics to change as we collect rollouts
        vec_normalize.training = False
    orig_rollout = rollout.generate_transitions(
        model,
        venv,
        n_timesteps=1000,
        deterministic_policy=True,
        rng=np.random.RandomState(0),
    )

    serialize.save_stable_model(tmpdir, model, vec_normalize)
    # We use `orig_venv` since `load_policy` automatically wraps `loaded`
    # with a VecNormalize, when appropriate.
    with serialize.load_policy(model_name, tmpdir, orig_venv) as loaded:
        orig_venv.env_method("seed", 0)
        orig_venv.reset()
        new_rollout = rollout.generate_transitions(
            loaded,
            orig_venv,
            n_timesteps=1000,
            deterministic_policy=True,
            rng=np.random.RandomState(0),
        )

    assert np.allclose(orig_rollout.acts, new_rollout.acts)
Example #4
def test_potential_shaping_invariants(graph,
                                      session,
                                      venv,
                                      potential_cls,
                                      discount: float,
                                      num_timesteps: int = 100):
    """Test that potential shaping obeys several invariants.

    Specifically:
        1. new_potential must be constant when dones is true, and zero when `discount == 1.0`.
        2. new_potential depends only on next observation.
        3. old_potential depends only on current observation.
        4. Shaping is discount * new_potential - old_potential.
    """
    # Invariants checked below:
    # When done, new_potential should be constant across transitions (and zero
    # when discount == 1.0).
    # discount * new_potential - old_potential should equal the reward output.
    # Same old_obs should give the same old_potential; same new_obs should give
    # the same new_potential.
    policy = base.RandomPolicy(venv.observation_space, venv.action_space)
    transitions = rollout.generate_transitions(policy,
                                               venv,
                                               n_timesteps=num_timesteps)

    with graph.as_default(), session.as_default():
        potential = potential_cls(venv.observation_space,
                                  venv.action_space,
                                  discount=discount)
        session.run(tf.global_variables_initializer())
        (old_pot, ), (new_pot, ) = rewards.evaluate_potentials([potential],
                                                               transitions)

    # Check invariant 1: new_potential must be constant when done
    # (and zero when discount == 1.0).
    transitions_all_done = dataclasses.replace(
        transitions,
        dones=np.ones_like(transitions.dones, dtype=bool),
    )
    with session.as_default():
        _, new_pot_done = rewards.evaluate_potentials([potential],
                                                      transitions_all_done)
    expected_new_pot_done = 0.0 if discount == 1.0 else np.mean(new_pot_done)
    assert np.allclose(new_pot_done, expected_new_pot_done)

    # Check invariants 2 and 3: {new,old}_potential depend only on {next,current} observation
    def _shuffle(fld: str):
        arr = np.array(getattr(transitions, fld))
        np.random.shuffle(arr)
        trans = dataclasses.replace(transitions, **{fld: arr})
        with session.as_default():
            return rewards.evaluate_potentials([potential], trans)

    (old_pot_shuffled, ), _ = _shuffle("next_obs")
    _, (new_pot_shuffled, ) = _shuffle("obs")
    assert np.all(old_pot == old_pot_shuffled)
    assert np.all(new_pot == new_pot_shuffled)

    # Check invariant 4: that reward output is as expected given potentials
    with session.as_default():
        rew = rewards.evaluate_models({"m": potential}, transitions)["m"]
    assert np.allclose(rew, discount * new_pot - old_pot)
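
Invariant 4 in the docstring is the usual potential-based shaping term, shaping = discount * potential(next_obs) - potential(obs). A toy NumPy sketch of the quantities involved (the potential below is made up for illustration and does not follow the `potential_cls` interface used above):

import numpy as np

rng = np.random.RandomState(0)
discount = 0.99
obs, next_obs = rng.randn(100, 4), rng.randn(100, 4)
dones = rng.rand(100) < 0.1


def toy_potential(ob: np.ndarray) -> np.ndarray:
    # Depends only on the observation, mirroring invariants 2 and 3.
    return ob.sum(axis=1)


old_pot = toy_potential(obs)
# Zero the terminal potential so shaping does not leak across episode boundaries;
# the test above only requires this exactly when discount == 1.0.
new_pot = np.where(dones, 0.0, toy_potential(next_obs))
shaping_reward = discount * new_pot - old_pot  # invariant 4
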
Example #5
def test_train_disc_step_no_crash(trainer, expert_batch_size):
    transitions = rollout.generate_transitions(
        trainer.gen_algo,
        trainer.venv,
        n_timesteps=expert_batch_size,
        truncate=True,
    )
    trainer.train_disc(gen_samples=types.dataclass_quick_asdict(transitions))
Example #6
def test_actions_valid(env_name, policy_type):
    """Test output actions of our custom policies always lie in action space."""
    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    policy = serialize.load_policy(policy_type, "foobar", venv)
    transitions = rollout.generate_transitions(policy, venv, n_timesteps=100)

    for a in transitions.acts:
        assert venv.action_space.contains(a)
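
`action_space.contains` is the standard Gym membership check: it returns False for values outside the space's bounds or with the wrong shape. A minimal illustration on a plain `gym.spaces.Box`, independent of the vectorized env above:

import numpy as np
from gym import spaces

space = spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)
assert space.contains(np.array([0.5, -0.5], dtype=np.float32))
assert not space.contains(np.array([2.0, 0.0], dtype=np.float32))  # 2.0 is out of bounds
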
Example #7
def test_train_disc_improve_D(tmpdir, trainer, n_timesteps=200, n_steps=1000):
    gen_samples = rollout.generate_transitions(trainer.gen_policy,
                                               trainer.venv_train_norm,
                                               n_timesteps=n_timesteps)
    loss1 = trainer.eval_disc_loss(gen_samples=gen_samples)
    for _ in range(n_steps):
        trainer.train_disc_step(gen_samples=gen_samples)
    loss2 = trainer.eval_disc_loss(gen_samples=gen_samples)
    assert loss2 < loss1
Example #8
def test_serialize_identity(env_name, model_cfg, normalize, tmpdir):
    """Test output actions of deserialized policy are same as original."""
    orig_venv = venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    vec_normalize = None
    if normalize:
        venv = vec_normalize = VecNormalize(venv)

    model_name, model_cls_name = model_cfg
    model_cls = registry.load_attr(model_cls_name)

    # FIXME(sam): verbose=1 is a hack to stop it from setting up SB logger
    model = model_cls("MlpPolicy", venv, verbose=1)
    model.learn(1000)

    venv.env_method("seed", 0)
    venv.reset()
    if normalize:
        # don't want statistics to change as we collect rollouts
        vec_normalize.training = False
    orig_rollout = rollout.generate_transitions(
        model,
        venv,
        n_timesteps=1000,
        deterministic_policy=True,
        rng=np.random.RandomState(0),
    )

    serialize.save_stable_model(tmpdir, model, vec_normalize)
    # We use `orig_venv` since `load_policy` automatically wraps `loaded`
    # with a VecNormalize, when appropriate.
    loaded = serialize.load_policy(model_name, tmpdir, orig_venv)
    orig_venv.env_method("seed", 0)
    orig_venv.reset()
    new_rollout = rollout.generate_transitions(
        loaded,
        orig_venv,
        n_timesteps=1000,
        deterministic_policy=True,
        rng=np.random.RandomState(0),
    )

    assert np.allclose(orig_rollout.acts, new_rollout.acts)
Example #9
def test_train_disc_improve_D(tmpdir, trainer, n_timesteps=200, n_steps=100):
    gen_samples = rollout.generate_transitions(
        trainer.gen_algo, trainer.venv_train_norm, n_timesteps=n_timesteps
    )
    init_stats = None
    final_stats = None
    for _ in range(n_steps):
        final_stats = trainer.train_disc_step(gen_samples=gen_samples)
        if init_stats is None:
            init_stats = final_stats
    assert final_stats["disc_loss"] < init_stats["disc_loss"]
Example #10
def test_ground_truth_similar_to_gym(graph, session, venv, reward_id):
    """Checks that reward models predictions match those of Gym reward."""
    # Generate rollouts, recording Gym reward
    policy = base_policies.RandomPolicy(venv.observation_space, venv.action_space)
    transitions = rollout.generate_transitions(policy, venv, n_timesteps=1024)
    gym_reward = transitions.rews

    # Make predictions using reward model
    with graph.as_default(), session.as_default():
        reward_model = serialize.load_reward(reward_id, "dummy", venv, 1.0)
        pred_reward = base.evaluate_models({"m": reward_model}, transitions)["m"]

    # Are the predictions close to true Gym reward?
    np.testing.assert_allclose(gym_reward, pred_reward, rtol=0, atol=5e-5)
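
Because `rtol=0`, the comparison above is purely absolute: every element of `pred_reward` must lie within 5e-5 of the Gym reward, regardless of magnitude. A small illustration of the difference (the numbers below are made up):

import numpy as np

gym_rew = np.array([100.0, 1e-6])
pred_rew = np.array([100.00004, 4e-5])

# Passes: both elements differ by less than the absolute tolerance 5e-5.
np.testing.assert_allclose(gym_rew, pred_rew, rtol=0, atol=5e-5)

# A purely relative check rejects the second element, whose relative error is large.
assert not np.allclose(gym_rew, pred_rew, rtol=1e-4, atol=0)
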
Example #11
def test_serialize_identity(env_name, net_cls, tmpdir):
    """Does output of deserialized reward network match that of original?"""
    logging.info(f"Testing {net_cls}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)

    tmppath = os.path.join(tmpdir, "reward.pt")
    th.save(original, tmppath)
    loaded = th.load(tmppath)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)

    unshaped_fn = serialize.load_reward("RewardNet_unshaped", tmppath, venv)
    shaped_fn = serialize.load_reward("RewardNet_shaped", tmppath, venv)
    rewards = {
        "train": [],
        "test": [],
    }
    for net in [original, loaded]:
        trans_args = (
            transitions.obs,
            transitions.acts,
            transitions.next_obs,
            transitions.dones,
        )
        rewards["train"].append(net.predict_reward_train(*trans_args))
        rewards["test"].append(net.predict_reward_test(*trans_args))

    args = (
        transitions.obs,
        transitions.acts,
        transitions.next_obs,
        transitions.dones,
    )
    rewards["train"].append(shaped_fn(*args))
    rewards["test"].append(unshaped_fn(*args))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
Example #12
def test_train_disc_small_expert_data_warning(tmpdir, _algorithm_cls):
    logger.configure(tmpdir, ["tensorboard", "stdout"])
    venv = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
        log_dir=tmpdir,
    )

    gen_algo = util.init_rl(venv, verbose=1)
    small_data = rollout.generate_transitions(gen_algo, venv, n_timesteps=20)

    with pytest.warns(RuntimeWarning, match="discriminator batch size"):
        _algorithm_cls(
            venv=venv,
            expert_data=small_data,
            gen_algo=gen_algo,
            log_dir=tmpdir,
        )
Example #13
def test_serialize_identity(session, env_name, net_cls, tmpdir):
    """Does output of deserialized reward network match that of original?"""
    logging.info(f"Testing {net_cls}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with tf.variable_scope("original"):
        original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    original.save(tmpdir)
    with tf.variable_scope("loaded"):
        loaded = reward_net.RewardNet.load(tmpdir)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)
    feed_dict = {}
    outputs = {"train": [], "test": []}
    for net in [original, loaded]:
        feed_dict.update(_make_feed_dict(net, transitions))
        outputs["train"].append(net.reward_output_train)
        outputs["test"].append(net.reward_output_test)

    with serialize.load_reward("RewardNet_unshaped", tmpdir, venv) as unshaped_fn:
        with serialize.load_reward("RewardNet_shaped", tmpdir, venv) as shaped_fn:
            rewards = session.run(outputs, feed_dict=feed_dict)

            args = (
                transitions.obs,
                transitions.acts,
                transitions.next_obs,
                transitions.dones,
            )
            rewards["train"].append(shaped_fn(*args))
            rewards["test"].append(unshaped_fn(*args))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
Example #14
def test_train_disc_improve_D(tmpdir,
                              trainer,
                              expert_transitions,
                              expert_batch_size,
                              n_steps=3):
    expert_samples = expert_transitions[:expert_batch_size]
    expert_samples = types.dataclass_quick_asdict(expert_samples)
    gen_samples = rollout.generate_transitions(
        trainer.gen_algo,
        trainer.venv_train,
        n_timesteps=expert_batch_size,
        truncate=True,
    )
    gen_samples = types.dataclass_quick_asdict(gen_samples)
    init_stats = final_stats = None
    for _ in range(n_steps):
        final_stats = trainer.train_disc(gen_samples=gen_samples,
                                         expert_samples=expert_samples)
        if init_stats is None:
            init_stats = final_stats
    assert final_stats["disc_loss"] < init_stats["disc_loss"]
Example #15
def test_serialize_identity(session, env_name, discrim_net_cls, tmpdir):
    """Does output of deserialized discriminator match that of original?"""
    venv = util.make_vec_env(env_name, parallel=False)
    original = DISCRIM_NET_SETUPS[discrim_net_cls](venv)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    original.save(tmpdir)
    with tf.variable_scope("loaded"):
        loaded = discrim_net.DiscrimNet.load(tmpdir)

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)
    length = len(transitions.obs)  # n_timesteps is only a lower bound
    labels = np.random.randint(2, size=length).astype(np.float32)
    log_prob = np.random.randn(length)

    feed_dict = {}
    outputs = {"train": [], "test": []}
    for net in [original, loaded]:
        feed_dict.update(
            {
                net.obs_ph: transitions.obs,
                net.act_ph: transitions.acts,
                net.next_obs_ph: transitions.next_obs,
                net.labels_gen_is_one_ph: labels,
                net.log_policy_act_prob_ph: log_prob,
            }
        )
        outputs["train"].append(net.policy_train_reward)
        outputs["test"].append(net.policy_test_reward)

    rewards = session.run(outputs, feed_dict=feed_dict)

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])
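
Examples #13 and #15 accumulate placeholders from both networks into a single `feed_dict` and evaluate every fetch with one `session.run` call. A minimal sketch of that fetch/feed pattern with the TF 1.x graph API (the placeholder and ops below are arbitrary stand-ins):

import numpy as np
import tensorflow as tf  # TF 1.x graph mode, as implied by tf.variable_scope above

x_ph = tf.placeholder(tf.float32, shape=(None, 3), name="x")
outputs = {"train": tf.reduce_sum(x_ph, axis=1), "test": tf.reduce_mean(x_ph, axis=1)}

with tf.Session() as sess:
    # One run() call evaluates every fetch in `outputs` against the shared feed_dict.
    results = sess.run(outputs, feed_dict={x_ph: np.ones((4, 3), np.float32)})
    assert set(results) == {"train", "test"}
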
Example #16
def f(total_timesteps: int) -> types.Transitions:
    # TODO(adam): inefficient -- discards partial trajectories and resets environment
    return rollout.generate_transitions(policy,
                                        venv,
                                        n_timesteps=total_timesteps)
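
Example #16 is a fragment: `policy` and `venv` are free variables from an enclosing scope. A sketch of the kind of wrapper it presumably sits inside (the `make_sample_fn` factory is an assumption introduced here, not taken from the original source; only `rollout.generate_transitions` and `types.Transitions` come from the imitation API):

from imitation.data import rollout, types


def make_sample_fn(policy, venv):
    """Hypothetical factory returning a transition sampler bound to `policy` and `venv`."""

    def f(total_timesteps: int) -> types.Transitions:
        # Inefficient: discards partial trajectories and resets the environment each call.
        return rollout.generate_transitions(policy,
                                            venv,
                                            n_timesteps=total_timesteps)

    return f
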
Example #17
def test_train_disc_step_no_crash(trainer, n_timesteps=200):
    transitions = rollout.generate_transitions(
        trainer.gen_algo, trainer.venv, n_timesteps=n_timesteps
    )
    trainer.train_disc_step(gen_samples=transitions)
Example #18
def test_train_disc_step_no_crash(tmpdir, use_gail, parallel, n_timesteps=200):
    trainer = init_test_trainer(tmpdir, use_gail=use_gail, parallel=parallel)
    transitions = rollout.generate_transitions(
        trainer.gen_policy, trainer.venv, n_timesteps=n_timesteps
    )
    trainer.train_disc_step(gen_samples=transitions)