Example #1
def test_serialize_identity(session, env_id, reward_net_cls):
    """Does output of deserialized reward network match that of original?"""
    env = gym.make(env_id)
    with tf.variable_scope("original"):
        original = reward_net_cls(env.observation_space, env.action_space)
    random = base.RandomPolicy(env.observation_space, env.action_space)
    session.run(tf.global_variables_initializer())

    with tempfile.TemporaryDirectory(
            prefix='imitation-serialize-rew') as tmpdir:
        original.save(tmpdir)
        with tf.variable_scope("loaded"):
            loaded = reward_net_cls.load(tmpdir)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    rollouts = rollout.generate_transitions(random, env, n_timesteps=100)
    feed_dict = {}
    outputs = {'train': [], 'test': []}
    for net in [original, loaded]:
        feed_dict.update(_make_feed_dict(net, rollouts))
        outputs['train'].append(net.reward_output_train)
        outputs['test'].append(net.reward_output_test)

    rewards = session.run(outputs, feed_dict=feed_dict)

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])
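
The `_make_feed_dict` helper used in this and later examples is not shown. A minimal sketch of what it might look like is below, assuming the `Transitions` fields used elsewhere in these examples and placeholder names like those in the discriminator test further down (`obs_ph`, `act_ph`, `next_obs_ph`); the actual helper in the test module may differ.

def _make_feed_dict(net, transitions):
    """Map the reward network's input placeholders to the transition arrays."""
    feed_dict = {
        net.obs_ph: transitions.obs,
        net.act_ph: transitions.acts,
        net.next_obs_ph: transitions.next_obs,
    }
    # Some reward network versions also expose a done placeholder; if so,
    # add e.g. `net.done_ph: transitions.dones`.
    return feed_dict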
Example #2
def test_serialize_identity(discrim_net, venv, tmpdir):
    """Does output of deserialized discriminator match that of original?"""
    original = discrim_net
    random = base.RandomPolicy(venv.observation_space, venv.action_space)

    tmppath = os.path.join(tmpdir, "discrim_net.pt")
    th.save(original, tmppath)
    loaded = th.load(tmppath)

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)

    rewards = {"train": [], "test": []}
    for net in [original, loaded]:
        rewards["train"].append(
            net.predict_reward_train(
                transitions.obs,
                transitions.acts,
                transitions.next_obs,
                transitions.dones,
            ))
        rewards["test"].append(
            net.predict_reward_test(
                transitions.obs,
                transitions.acts,
                transitions.next_obs,
                transitions.dones,
            ))

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])
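
Note that `th.save(original, tmppath)` pickles the whole discriminator module, architecture and parameters together, so `th.load(tmppath)` returns a ready-to-use copy without re-instantiating the class (provided the class is importable at load time). That is what makes this simple round-trip sufficient for the identity check.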
Example #3
def test_serialize_identity(session, env_id, discrim_net_cls):
  """Does output of deserialized discriminator match that of original?"""
  env = gym.make(env_id)
  original = DISCRIM_NET_SETUPS[discrim_net_cls](env)
  random = base.RandomPolicy(env.observation_space, env.action_space)
  session.run(tf.global_variables_initializer())

  with tempfile.TemporaryDirectory(prefix='imitation-serialize') as tmpdir:
    original.save(tmpdir)
    with tf.variable_scope("loaded"):
      loaded = discrim_net_cls.load(tmpdir)

  old_obs, act, new_obs, _rew = rollout.generate_transitions(random, env,
                                                             n_timesteps=100)
  labels = np.random.randint(2, size=len(old_obs)).astype(np.float32)
  log_prob = np.random.randn(len(old_obs))

  feed_dict = {}
  outputs = {'train': [], 'test': []}
  for net in [original, loaded]:
    feed_dict.update({
        net.old_obs_ph: old_obs,
        net.act_ph: act,
        net.new_obs_ph: new_obs,
        net.labels_ph: labels,
        net.log_policy_act_prob_ph: log_prob,
    })
    outputs['train'].append(net.policy_train_reward)
    outputs['test'].append(net.policy_test_reward)

  rewards = session.run(outputs, feed_dict=feed_dict)

  for key, predictions in rewards.items():
    assert len(predictions) == 2
    assert np.allclose(predictions[0], predictions[1])
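
`DISCRIM_NET_SETUPS` maps each discriminator class to a factory that builds an instance for a given environment. A plausible sketch is below; the concrete constructor signatures (an AIRL discriminator wrapping a reward network, a GAIL discriminator taking the spaces directly) are assumptions and may not match the library version used in this example.

DISCRIM_NET_SETUPS = {
    # Assumed constructors, for illustration only.
    discrim_net.DiscrimNetAIRL: lambda env: discrim_net.DiscrimNetAIRL(
        reward_net.BasicShapedRewardNet(env.observation_space, env.action_space)
    ),
    discrim_net.DiscrimNetGAIL: lambda env: discrim_net.DiscrimNetGAIL(
        env.observation_space, env.action_space
    ),
}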
Example #4
    def f(make_model):
        policy = base_policies.RandomPolicy(venv.observation_space, venv.action_space)
        with datasets.transitions_factory_from_policy(venv, policy) as dataset_callable:
            batch = dataset_callable(1024)

            with graph.as_default(), session.as_default():
                original = make_model(venv)
                session.run(tf.global_variables_initializer())

                with tempfile.TemporaryDirectory(prefix="eval-rew-serialize") as tmpdir:
                    original.save(tmpdir)

                    with tf.variable_scope("loaded_direct"):
                        loaded_direct = util_serialize.Serializable.load(tmpdir)

                    model_name = "evaluating_rewards/RewardModel-v0"
                    loaded_indirect = serialize.load_reward(model_name, tmpdir, venv)

                models = {"o": original, "ld": loaded_direct, "li": loaded_indirect}
                preds = base.evaluate_models(models, batch)

            for model in models.values():
                assert original.observation_space == model.observation_space
                assert original.action_space == model.action_space

            assert len(preds) == len(models)
            for pred in preds.values():
                assert np.allclose(preds["o"], pred)
Example #5
def test_potential_shaping_invariants(graph,
                                      session,
                                      venv,
                                      potential_cls,
                                      discount: float,
                                      num_timesteps: int = 100):
    """Test that potential shaping obeys several invariants.

    Specifically:
        1. new_potential must be constant when dones is true, and zero when `discount == 1.0`.
        2. new_potential depends only on next observation.
        3. old_potential depends only on current observation.
        4. Shaping is discount * new_potential - old_potential.
    """
    # Invariants:
    # When done, new_potential should be constant (and zero if undiscounted).
    # discount * new_potential - old_potential should equal the reward output.
    # Same obs should give the same old_potential; same next_obs the same new_potential.
    policy = base.RandomPolicy(venv.observation_space, venv.action_space)
    transitions = rollout.generate_transitions(policy,
                                               venv,
                                               n_timesteps=num_timesteps)

    with graph.as_default(), session.as_default():
        potential = potential_cls(venv.observation_space,
                                  venv.action_space,
                                  discount=discount)
        session.run(tf.global_variables_initializer())
        (old_pot, ), (new_pot, ) = rewards.evaluate_potentials([potential],
                                                               transitions)

    # Check invariant 1: new_potential is constant when dones is true,
    # and zero when discount == 1.0.
    transitions_all_done = dataclasses.replace(transitions,
                                               dones=np.ones_like(
                                                   transitions.dones,
                                                   dtype=bool))
    with session.as_default():
        _, new_pot_done = rewards.evaluate_potentials([potential],
                                                      transitions_all_done)
    expected_new_pot_done = 0.0 if discount == 1.0 else np.mean(new_pot_done)
    assert np.allclose(new_pot_done, expected_new_pot_done)

    # Check invariants 2 and 3: {new,old}_potential depend only on {next,current} observation
    def _shuffle(fld: str):
        arr = np.array(getattr(transitions, fld))
        np.random.shuffle(arr)
        trans = dataclasses.replace(transitions, **{fld: arr})
        with session.as_default():
            return rewards.evaluate_potentials([potential], trans)

    (old_pot_shuffled, ), _ = _shuffle("next_obs")
    _, (new_pot_shuffled, ) = _shuffle("obs")
    assert np.all(old_pot == old_pot_shuffled)
    assert np.all(new_pot == new_pot_shuffled)

    # Check invariant 4: that reward output is as expected given potentials
    with session.as_default():
        rew = rewards.evaluate_models({"m": potential}, transitions)["m"]
    assert np.allclose(rew, discount * new_pot - old_pot)
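
Invariant 4 is the standard potential-based shaping identity: the shaping reward for a transition is `discount * new_potential - old_potential`. A tiny self-contained numeric check of the formula, using illustrative values only:

import numpy as np

discount = 0.99
old_pot = np.array([0.5, -1.0, 2.0])   # potential of the current observations
new_pot = np.array([1.0, 0.0, -0.5])   # potential of the next observations
shaping = discount * new_pot - old_pot
assert np.allclose(shaping, [0.49, 1.0, -2.495])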
Example #6
def test_ground_truth_similar_to_gym(graph, session, venv, reward_id):
    """Checks that reward models predictions match those of Gym reward."""
    # Generate rollouts, recording Gym reward
    policy = base_policies.RandomPolicy(venv.observation_space, venv.action_space)
    transitions = rollout.generate_transitions(policy, venv, n_timesteps=1024)
    gym_reward = transitions.rews

    # Make predictions using reward model
    with graph.as_default(), session.as_default():
        reward_model = serialize.load_reward(reward_id, "dummy", venv, 1.0)
        pred_reward = base.evaluate_models({"m": reward_model}, transitions)["m"]

    # Are the predictions close to true Gym reward?
    np.testing.assert_allclose(gym_reward, pred_reward, rtol=0, atol=5e-5)
Example #7
def test_serialize_identity(env_name, net_cls, tmpdir):
    """Does output of deserialized reward network match that of original?"""
    logging.info(f"Testing {net_cls}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)

    tmppath = os.path.join(tmpdir, "reward.pt")
    th.save(original, tmppath)
    loaded = th.load(tmppath)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)

    unshaped_fn = serialize.load_reward("RewardNet_unshaped", tmppath, venv)
    shaped_fn = serialize.load_reward("RewardNet_shaped", tmppath, venv)
    rewards = {
        "train": [],
        "test": [],
    }
    args = (
        transitions.obs,
        transitions.acts,
        transitions.next_obs,
        transitions.dones,
    )
    for net in [original, loaded]:
        rewards["train"].append(net.predict_reward_train(*args))
        rewards["test"].append(net.predict_reward_test(*args))

    rewards["train"].append(shaped_fn(*args))
    rewards["test"].append(unshaped_fn(*args))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
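
As in the other reward-network tests here, the train-time predictions are compared against the "RewardNet_shaped" serialization entry and the test-time predictions against "RewardNet_unshaped", reflecting the convention in these examples that potential shaping is applied only to the training reward.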
Example #8
def test_serialize_identity(session, env_name, reward_net):
    """Does output of deserialized reward network match that of original?"""
    net_name, net_cls = reward_net
    print(f"Testing {net_name}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with tf.variable_scope("original"):
        original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    with tempfile.TemporaryDirectory(
            prefix='imitation-serialize-rew') as tmpdir:
        original.save(tmpdir)
        with tf.variable_scope("loaded"):
            loaded = net_cls.load(tmpdir)

        assert original.observation_space == loaded.observation_space
        assert original.action_space == loaded.action_space

        rollouts = rollout.generate_transitions(random, venv, n_timesteps=100)
        feed_dict = {}
        outputs = {'train': [], 'test': []}
        for net in [original, loaded]:
            feed_dict.update(_make_feed_dict(net, rollouts))
            outputs['train'].append(net.reward_output_train)
            outputs['test'].append(net.reward_output_test)

        unshaped_name = f"{net_name}_unshaped"
        shaped_name = f"{net_name}_shaped"
        with serialize.load_reward(unshaped_name, tmpdir, venv) as unshaped_fn:
            with serialize.load_reward(shaped_name, tmpdir, venv) as shaped_fn:
                rewards = session.run(outputs, feed_dict=feed_dict)

                old_obs, actions, new_obs, _ = rollouts
                steps = np.zeros((old_obs.shape[0], ))
                rewards['train'].append(
                    shaped_fn(old_obs, actions, new_obs, steps))
                rewards['test'].append(
                    unshaped_fn(old_obs, actions, new_obs, steps))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
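
The TF1-based examples rely on `graph` and `session` pytest fixtures that are not shown. A minimal sketch of what such fixtures could look like is below (TF1 API; under TensorFlow 2 these calls live under `tf.compat.v1`); the real fixtures in the test suite may configure the session differently.

import pytest
import tensorflow as tf

@pytest.fixture
def graph():
    return tf.Graph()

@pytest.fixture
def session(graph):
    # Enter the session context so `session.as_default()` and `session.run(...)`
    # in the tests operate on this graph.
    with tf.Session(graph=graph) as sess:
        yield sess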
Example #9
def test_potential_shaping_cycle(graph,
                                 session,
                                 venv,
                                 potential_cls,
                                 discount: float,
                                 num_episodes: int = 10) -> None:
    """Test that potential shaping is constant on any fixed-length cycle.

    Specifically, performs rollouts of a random policy in the environment.
    Fixes the starting state for each trajectory at the all-zero state.
    Then computes episode return, and checks they're all equal.

    Requires the environment to be fixed-length; otherwise the episode return
    will vary (except in the undiscounted case).
    """
    policy = base.RandomPolicy(venv.observation_space, venv.action_space)
    trajectories = rollout.generate_trajectories(
        policy, venv, sample_until=rollout.min_episodes(num_episodes))
    transitions = rollout.flatten_trajectories(trajectories)

    # Fix the initial state of each episode to all-zero.
    # Note we don't need to change the final state, since `dones` being `True`
    # should force the potential to zero at those states.
    obs = np.array(transitions.obs)
    idxs = np.where(transitions.dones)[0] + 1
    idxs = np.pad(idxs[:-1], (1, 0), "constant")
    obs[idxs, :] = 0
    transitions = dataclasses.replace(transitions, obs=obs)

    with graph.as_default(), session.as_default():
        reward_model = potential_cls(venv.observation_space,
                                     venv.action_space,
                                     discount=discount)
        session.run(tf.global_variables_initializer())
        rews = rewards.evaluate_models({"m": reward_model}, transitions)

    rets = rewards.compute_return_from_rews(rews,
                                            transitions.dones,
                                            discount=discount)["m"]
    if discount == 1.0:
        assert np.allclose(rets, 0.0, atol=1e-5)
    assert np.allclose(rets, np.mean(rets), atol=1e-5)
Example #10
def test_serialize_identity(session, env_name, net_cls, tmpdir):
    """Does output of deserialized reward network match that of original?"""
    logging.info(f"Testing {net_cls}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with tf.variable_scope("original"):
        original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    original.save(tmpdir)
    with tf.variable_scope("loaded"):
        loaded = reward_net.RewardNet.load(tmpdir)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)
    feed_dict = {}
    outputs = {"train": [], "test": []}
    for net in [original, loaded]:
        feed_dict.update(_make_feed_dict(net, transitions))
        outputs["train"].append(net.reward_output_train)
        outputs["test"].append(net.reward_output_test)

    with serialize.load_reward("RewardNet_unshaped", tmpdir, venv) as unshaped_fn:
        with serialize.load_reward("RewardNet_shaped", tmpdir, venv) as shaped_fn:
            rewards = session.run(outputs, feed_dict=feed_dict)

            args = (
                transitions.obs,
                transitions.acts,
                transitions.next_obs,
                transitions.dones,
            )
            rewards["train"].append(shaped_fn(*args))
            rewards["test"].append(unshaped_fn(*args))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
Example #11
def test_serialize_identity(session, env_name, discrim_net_cls, tmpdir):
    """Does output of deserialized discriminator match that of original?"""
    venv = util.make_vec_env(env_name, parallel=False)
    original = DISCRIM_NET_SETUPS[discrim_net_cls](venv)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    original.save(tmpdir)
    with tf.variable_scope("loaded"):
        loaded = discrim_net.DiscrimNet.load(tmpdir)

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)
    length = len(transitions.obs)  # n_timesteps is only a lower bound
    labels = np.random.randint(2, size=length).astype(np.float32)
    log_prob = np.random.randn(length)

    feed_dict = {}
    outputs = {"train": [], "test": []}
    for net in [original, loaded]:
        feed_dict.update(
            {
                net.obs_ph: transitions.obs,
                net.act_ph: transitions.acts,
                net.next_obs_ph: transitions.next_obs,
                net.labels_gen_is_one_ph: labels,
                net.log_policy_act_prob_ph: log_prob,
            }
        )
        outputs["train"].append(net.policy_train_reward)
        outputs["test"].append(net.policy_test_reward)

    rewards = session.run(outputs, feed_dict=feed_dict)

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])