def test_serialize_identity(session, env_id, reward_net_cls):
    """Does output of deserialized reward network match that of original?"""
    env = gym.make(env_id)
    with tf.variable_scope("original"):
        original = reward_net_cls(env.observation_space, env.action_space)
    random = base.RandomPolicy(env.observation_space, env.action_space)
    session.run(tf.global_variables_initializer())

    with tempfile.TemporaryDirectory(prefix='imitation-serialize-rew') as tmpdir:
        original.save(tmpdir)
        with tf.variable_scope("loaded"):
            loaded = reward_net_cls.load(tmpdir)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    rollouts = rollout.generate_transitions(random, env, n_timesteps=100)
    feed_dict = {}
    outputs = {'train': [], 'test': []}
    for net in [original, loaded]:
        feed_dict.update(_make_feed_dict(net, rollouts))
        outputs['train'].append(net.reward_output_train)
        outputs['test'].append(net.reward_output_test)
    rewards = session.run(outputs, feed_dict=feed_dict)

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])
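# `_make_feed_dict` is used by several TF tests in this section but never
# defined here. Below is a minimal sketch of what it plausibly does, assuming
# the tuple-based transitions API of the test above and the `old_obs_ph` /
# `act_ph` / `new_obs_ph` placeholder names that appear in the discriminator
# test below; both are assumptions, not confirmed definitions. (Later tests
# pass a Transitions dataclass instead, so the real helper differs by version.)
def _make_feed_dict(net, rollouts):
    """Map a batch of transitions onto `net`'s input placeholders."""
    old_obs, act, new_obs, _rew = rollouts
    return {
        net.old_obs_ph: old_obs,
        net.act_ph: act,
        net.new_obs_ph: new_obs,
    }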
def test_serialize_identity(discrim_net, venv, tmpdir):
    """Does output of deserialized discriminator match that of original?"""
    original = discrim_net
    random = base.RandomPolicy(venv.observation_space, venv.action_space)

    tmppath = os.path.join(tmpdir, "discrim_net.pt")
    th.save(original, tmppath)
    loaded = th.load(tmppath)

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)

    rewards = {"train": [], "test": []}
    for net in [original, loaded]:
        rewards["train"].append(
            net.predict_reward_train(
                transitions.obs,
                transitions.acts,
                transitions.next_obs,
                transitions.dones,
            ))
        rewards["test"].append(
            net.predict_reward_test(
                transitions.obs,
                transitions.acts,
                transitions.next_obs,
                transitions.dones,
            ))

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])
def test_serialize_identity(session, env_id, discrim_net_cls):
    """Does output of deserialized discriminator match that of original?"""
    env = gym.make(env_id)
    original = DISCRIM_NET_SETUPS[discrim_net_cls](env)
    random = base.RandomPolicy(env.observation_space, env.action_space)
    session.run(tf.global_variables_initializer())

    with tempfile.TemporaryDirectory(prefix='imitation-serialize') as tmpdir:
        original.save(tmpdir)
        with tf.variable_scope("loaded"):
            loaded = discrim_net_cls.load(tmpdir)

    old_obs, act, new_obs, _rew = rollout.generate_transitions(
        random, env, n_timesteps=100)
    labels = np.random.randint(2, size=len(old_obs)).astype(np.float32)
    log_prob = np.random.randn(len(old_obs))

    feed_dict = {}
    outputs = {'train': [], 'test': []}
    for net in [original, loaded]:
        feed_dict.update({
            net.old_obs_ph: old_obs,
            net.act_ph: act,
            net.new_obs_ph: new_obs,
            net.labels_ph: labels,
            net.log_policy_act_prob_ph: log_prob,
        })
        outputs['train'].append(net.policy_train_reward)
        outputs['test'].append(net.policy_test_reward)
    rewards = session.run(outputs, feed_dict=feed_dict)

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])
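# `DISCRIM_NET_SETUPS` is referenced above (and in the vectorized variant at
# the end of this section) but not defined here. It is presumably a mapping
# from each discriminator class to a factory building a test instance. A
# hypothetical sketch; the AIRL/GAIL class names and constructor signatures
# are assumptions:
DISCRIM_NET_SETUPS = {
    discrim_net.DiscrimNetAIRL: lambda env: discrim_net.DiscrimNetAIRL(
        reward_net.BasicShapedRewardNet(env.observation_space,
                                        env.action_space)),
    discrim_net.DiscrimNetGAIL: lambda env: discrim_net.DiscrimNetGAIL(
        env.observation_space, env.action_space),
}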
def f(make_model):
    policy = base_policies.RandomPolicy(venv.observation_space, venv.action_space)
    with datasets.transitions_factory_from_policy(venv, policy) as dataset_callable:
        batch = dataset_callable(1024)

    with graph.as_default(), session.as_default():
        original = make_model(venv)
        session.run(tf.global_variables_initializer())

        with tempfile.TemporaryDirectory(prefix="eval-rew-serialize") as tmpdir:
            original.save(tmpdir)

            with tf.variable_scope("loaded_direct"):
                loaded_direct = util_serialize.Serializable.load(tmpdir)

            model_name = "evaluating_rewards/RewardModel-v0"
            loaded_indirect = serialize.load_reward(model_name, tmpdir, venv)

        models = {"o": original, "ld": loaded_direct, "li": loaded_indirect}
        preds = base.evaluate_models(models, batch)

    for model in models.values():
        assert original.observation_space == model.observation_space
        assert original.action_space == model.action_space

    assert len(preds) == len(models)
    for pred in preds.values():
        assert np.allclose(preds["o"], pred)
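# The `graph`, `session`, and `venv` names threaded through the TF tests are
# pytest fixtures defined outside this section. A minimal sketch of plausible
# definitions, assuming a TF1-style graph/session pair and a CartPole
# vectorized environment (all assumptions, not the repository's actual
# fixtures):
import pytest

@pytest.fixture
def graph():
    with tf.Graph().as_default() as g:
        yield g

@pytest.fixture
def session(graph):
    with tf.Session(graph=graph) as sess:
        yield sess

@pytest.fixture
def venv():
    return util.make_vec_env("CartPole-v1", n_envs=1, parallel=False)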
def test_potential_shaping_invariants(graph, session, venv, potential_cls,
                                      discount: float,
                                      num_timesteps: int = 100):
    """Test that potential shaping obeys several invariants.

    Specifically:
        1. new_potential must be constant when dones is true, and zero when
           `discount == 1.0`.
        2. new_potential depends only on next observation.
        3. old_potential depends only on current observation.
        4. Shaping is discount * new_potential - old_potential.
    """
    policy = base.RandomPolicy(venv.observation_space, venv.action_space)
    transitions = rollout.generate_transitions(policy, venv,
                                               n_timesteps=num_timesteps)

    with graph.as_default(), session.as_default():
        potential = potential_cls(venv.observation_space, venv.action_space,
                                  discount=discount)
        session.run(tf.global_variables_initializer())
        (old_pot, ), (new_pot, ) = rewards.evaluate_potentials([potential],
                                                               transitions)

    # Invariant 1: new_potential is constant when done (zero if undiscounted).
    transitions_all_done = dataclasses.replace(
        transitions, dones=np.ones_like(transitions.dones, dtype=bool))
    with session.as_default():
        _, new_pot_done = rewards.evaluate_potentials([potential],
                                                      transitions_all_done)
    expected_new_pot_done = 0.0 if discount == 1.0 else np.mean(new_pot_done)
    assert np.allclose(new_pot_done, expected_new_pot_done)

    # Invariants 2 and 3: {new,old}_potential depend only on the
    # {next,current} observation, so shuffling the other field is a no-op.
    def _shuffle(fld: str):
        arr = np.array(getattr(transitions, fld))
        np.random.shuffle(arr)
        trans = dataclasses.replace(transitions, **{fld: arr})
        with session.as_default():
            return rewards.evaluate_potentials([potential], trans)

    (old_pot_shuffled, ), _ = _shuffle("next_obs")
    _, (new_pot_shuffled, ) = _shuffle("obs")
    assert np.all(old_pot == old_pot_shuffled)
    assert np.all(new_pot == new_pot_shuffled)

    # Invariant 4: reward output is as expected given the potentials.
    with session.as_default():
        rew = rewards.evaluate_models({"m": potential}, transitions)["m"]
    assert np.allclose(rew, discount * new_pot - old_pot)
def test_ground_truth_similar_to_gym(graph, session, venv, reward_id):
    """Checks that reward model predictions match those of the Gym reward."""
    # Generate rollouts, recording the Gym reward
    policy = base_policies.RandomPolicy(venv.observation_space, venv.action_space)
    transitions = rollout.generate_transitions(policy, venv, n_timesteps=1024)
    gym_reward = transitions.rews

    # Make predictions using the reward model
    with graph.as_default(), session.as_default():
        reward_model = serialize.load_reward(reward_id, "dummy", venv, 1.0)
        pred_reward = base.evaluate_models({"m": reward_model}, transitions)["m"]

    # Are the predictions close to the true Gym reward?
    np.testing.assert_allclose(gym_reward, pred_reward, rtol=0, atol=5e-5)
def test_serialize_identity(env_name, net_cls, tmpdir):
    """Does output of deserialized reward network match that of original?"""
    logging.info(f"Testing {net_cls}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)

    tmppath = os.path.join(tmpdir, "reward.pt")
    th.save(original, tmppath)
    loaded = th.load(tmppath)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)
    unshaped_fn = serialize.load_reward("RewardNet_unshaped", tmppath, venv)
    shaped_fn = serialize.load_reward("RewardNet_shaped", tmppath, venv)

    args = (
        transitions.obs,
        transitions.acts,
        transitions.next_obs,
        transitions.dones,
    )
    rewards = {"train": [], "test": []}
    for net in [original, loaded]:
        rewards["train"].append(net.predict_reward_train(*args))
        rewards["test"].append(net.predict_reward_test(*args))
    rewards["train"].append(shaped_fn(*args))
    rewards["test"].append(unshaped_fn(*args))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
def test_serialize_identity(session, env_name, reward_net):
    """Does output of deserialized reward network match that of original?"""
    net_name, net_cls = reward_net
    print(f"Testing {net_name}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with tf.variable_scope("original"):
        original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    with tempfile.TemporaryDirectory(prefix='imitation-serialize-rew') as tmpdir:
        original.save(tmpdir)
        with tf.variable_scope("loaded"):
            loaded = net_cls.load(tmpdir)

        assert original.observation_space == loaded.observation_space
        assert original.action_space == loaded.action_space

        rollouts = rollout.generate_transitions(random, venv, n_timesteps=100)
        feed_dict = {}
        outputs = {'train': [], 'test': []}
        for net in [original, loaded]:
            feed_dict.update(_make_feed_dict(net, rollouts))
            outputs['train'].append(net.reward_output_train)
            outputs['test'].append(net.reward_output_test)

        unshaped_name = f"{net_name}_unshaped"
        shaped_name = f"{net_name}_shaped"
        with serialize.load_reward(unshaped_name, tmpdir, venv) as unshaped_fn:
            with serialize.load_reward(shaped_name, tmpdir, venv) as shaped_fn:
                rewards = session.run(outputs, feed_dict=feed_dict)
                old_obs, actions, new_obs, _ = rollouts
                steps = np.zeros((old_obs.shape[0], ))
                rewards['train'].append(
                    shaped_fn(old_obs, actions, new_obs, steps))
                rewards['test'].append(
                    unshaped_fn(old_obs, actions, new_obs, steps))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
def test_potential_shaping_cycle(graph, session, venv, potential_cls,
                                 discount: float,
                                 num_episodes: int = 10) -> None:
    """Test that potential shaping is constant on any fixed-length cycle.

    Specifically, performs rollouts of a random policy in the environment,
    fixing the starting state of each trajectory at the all-zero state.
    Then computes episode returns, and checks they're all equal.

    Requires the environment to be fixed-length; otherwise, the episode
    return will vary (except in the undiscounted case).
    """
    policy = base.RandomPolicy(venv.observation_space, venv.action_space)
    trajectories = rollout.generate_trajectories(
        policy, venv, sample_until=rollout.min_episodes(num_episodes))
    transitions = rollout.flatten_trajectories(trajectories)

    # Make the initial state of each episode fixed as all-zero.
    # Note we don't need to change the final state, since `dones` being
    # `True` should force the new potential to be constant (zero in the
    # undiscounted case) at those states.
    obs = np.array(transitions.obs)
    idxs = np.where(transitions.dones)[0] + 1
    idxs = np.pad(idxs[:-1], (1, 0), "constant")
    obs[idxs, :] = 0
    transitions = dataclasses.replace(transitions, obs=obs)

    with graph.as_default(), session.as_default():
        reward_model = potential_cls(venv.observation_space, venv.action_space,
                                     discount=discount)
        session.run(tf.global_variables_initializer())
        rews = rewards.evaluate_models({"m": reward_model}, transitions)

    rets = rewards.compute_return_from_rews(rews, transitions.dones,
                                            discount=discount)["m"]
    if discount == 1.0:
        assert np.allclose(rets, 0.0, atol=1e-5)
    assert np.allclose(rets, np.mean(rets), atol=1e-5)
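# Why the cycle test works: potential shaping telescopes over an episode,
#   sum_t gamma^t * (gamma * phi(s_{t+1}) - phi(s_t))
#     = gamma^T * phi(s_T) - phi(s_0),
# so the return depends only on the first and last states. A toy numeric
# check of the identity; `phi` and the state sequence below are made up
# purely for illustration:
def phi(s):
    return 3.0 * s + 1.0

states = np.array([0.0, 0.7, -0.2, 0.5, 0.0])  # fixed all-zero start and end
gamma = 1.0
shaped = [gamma * phi(states[t + 1]) - phi(states[t]) for t in range(4)]
ret = sum(gamma ** t + 0 and 0 or gamma ** t * r for t, r in enumerate(shaped)) if False else sum(
    gamma ** t * r for t, r in enumerate(shaped))
assert np.allclose(ret, gamma ** 4 * phi(states[-1]) - phi(states[0]))  # zero here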
def test_serialize_identity(session, env_name, net_cls, tmpdir):
    """Does output of deserialized reward network match that of original?"""
    logging.info(f"Testing {net_cls}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with tf.variable_scope("original"):
        original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    original.save(tmpdir)
    with tf.variable_scope("loaded"):
        loaded = reward_net.RewardNet.load(tmpdir)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)
    feed_dict = {}
    outputs = {"train": [], "test": []}
    for net in [original, loaded]:
        feed_dict.update(_make_feed_dict(net, transitions))
        outputs["train"].append(net.reward_output_train)
        outputs["test"].append(net.reward_output_test)

    with serialize.load_reward("RewardNet_unshaped", tmpdir, venv) as unshaped_fn:
        with serialize.load_reward("RewardNet_shaped", tmpdir, venv) as shaped_fn:
            rewards = session.run(outputs, feed_dict=feed_dict)
            args = (
                transitions.obs,
                transitions.acts,
                transitions.next_obs,
                transitions.dones,
            )
            rewards["train"].append(shaped_fn(*args))
            rewards["test"].append(unshaped_fn(*args))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
def test_serialize_identity(session, env_name, discrim_net_cls, tmpdir):
    """Does output of deserialized discriminator match that of original?"""
    venv = util.make_vec_env(env_name, parallel=False)
    original = DISCRIM_NET_SETUPS[discrim_net_cls](venv)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    original.save(tmpdir)
    with tf.variable_scope("loaded"):
        loaded = discrim_net.DiscrimNet.load(tmpdir)

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)
    length = len(transitions.obs)  # n_timesteps is only a lower bound
    labels = np.random.randint(2, size=length).astype(np.float32)
    log_prob = np.random.randn(length)

    feed_dict = {}
    outputs = {"train": [], "test": []}
    for net in [original, loaded]:
        feed_dict.update({
            net.obs_ph: transitions.obs,
            net.act_ph: transitions.acts,
            net.next_obs_ph: transitions.next_obs,
            net.labels_gen_is_one_ph: labels,
            net.log_policy_act_prob_ph: log_prob,
        })
        outputs["train"].append(net.policy_train_reward)
        outputs["test"].append(net.policy_test_reward)
    rewards = session.run(outputs, feed_dict=feed_dict)

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])