def test_eval_job(self):
    """Runs a short eval job against mocks and checks its inputs and outputs.

    Uses a mocked variable container and a patched `tf.summary.scalar` so no
    real training infrastructure is needed; `train_utils.wait_for_predicate`
    is patched out so the job does not block waiting on a train step.
    """
    # Create test context.
    summary_dir = self.create_tempdir().full_path
    environment = test_envs.CountingEnv(steps_per_episode=4)
    action_tensor_spec = tensor_spec.from_spec(environment.action_spec())
    time_step_tensor_spec = tensor_spec.from_spec(
        environment.time_step_spec())
    policy = py_tf_eager_policy.PyTFEagerPolicy(
        random_tf_policy.RandomTFPolicy(time_step_tensor_spec,
                                        action_tensor_spec))
    mock_variable_container = mock.create_autospec(
        reverb_variable_container.ReverbVariableContainer)

    with mock.patch.object(
        tf.summary, 'scalar',
        autospec=True) as mock_scalar_summary, mock.patch.object(
            train_utils, 'wait_for_predicate', autospec=True):
        # Run the function tested. `is_running` returns True twice, so the
        # eval loop body executes exactly two iterations before exiting.
        eval_job.evaluate(summary_dir=summary_dir,
                          policy=policy,
                          environment_name=None,
                          suite_load_fn=lambda _: environment,
                          variable_container=mock_variable_container,
                          is_running=_NTimesReturnTrue(n=2))

        # Check if the expected calls happened.
        # As an input, an eval job is expected to fetch data from the variable
        # container.
        mock_variable_container.assert_has_calls(
            [mock.call.update(mock.ANY)])

        # As an output, an eval job is expected to write at least the average
        # return corresponding to the first step.
        mock_scalar_summary.assert_any_call(
            name='eval_actor/AverageReturn', data=mock.ANY, step=mock.ANY)
def test_observation_stacked(self):
    """HistoryWrapper(3) zero-pads at reset, then slides a 3-frame window."""
    history_env = wrappers.HistoryWrapper(test_envs.CountingEnv(), 3)

    time_step = history_env.reset()
    self.assertEqual([0, 0, 0], time_step.observation.tolist())

    # Each step shifts the window left and appends the newest observation.
    for expected_history in ([0, 0, 1], [0, 1, 2], [1, 2, 3]):
        time_step = history_env.step(0)
        self.assertEqual(expected_history, time_step.observation.tolist())
def test_sequential(self):
    """Observations track `episode * 10 + step` across several episodes."""
    steps_per_episode = 4
    env = test_envs.CountingEnv(steps_per_episode)

    for episode in range(3):
        # Reset starts the episode counter at step 0.
        time_step = env.reset()
        step_index = 0
        self.assertEqual(episode * 10, time_step.observation)

        # Walk to the terminal step, checking the observation each time.
        while not time_step.is_last():
            time_step = env.step(0)
            step_index += 1
            self.assertEqual(episode * 10 + step_index, time_step.observation)

        # The final observation of the episode equals episode*10 + length.
        self.assertEqual(episode * 10 + steps_per_episode,
                         time_step.observation)
def test_eval_job_constant_eval(self):
    """Tests eval every step for 2 steps.

    This test's `variable_container` passes the same train step twice to test
    that `is_train_step_the_same_or_behind` is working as expected. If it were
    not working, the number of train steps processed would be incorrect
    (2x higher).
    """
    summary_dir = self.create_tempdir().full_path
    environment = test_envs.CountingEnv(steps_per_episode=4)
    action_tensor_spec = tensor_spec.from_spec(environment.action_spec())
    time_step_tensor_spec = tensor_spec.from_spec(environment.time_step_spec())
    policy = py_tf_eager_policy.PyTFEagerPolicy(
        random_tf_policy.RandomTFPolicy(time_step_tensor_spec,
                                        action_tensor_spec))
    mock_variable_container = mock.create_autospec(
        reverb_variable_container.ReverbVariableContainer)

    class VCUpdateIncrementEveryOtherTrainStep(object):
        """Side effect that updates train_step on every other call."""

        def __init__(self):
            # Starts at -1 so the first increment lands on train step 0.
            self.fake_train_step = -1
            self.call_count = 0

        def __call__(self, variables):
            # Only odd-numbered calls advance the step, so consecutive
            # `update` calls can observe the same train step.
            if self.call_count % 2:
                self.fake_train_step += 1
                variables[reverb_variable_container.TRAIN_STEP_KEY].assign(
                    self.fake_train_step)
            self.call_count += 1

    fake_update = VCUpdateIncrementEveryOtherTrainStep()
    mock_variable_container.update.side_effect = fake_update

    with mock.patch.object(
        tf.summary, 'scalar', autospec=True) as mock_scalar_summary:
        # eval_interval=1 evaluates on every new train step; `is_running`
        # allows exactly two loop iterations.
        eval_job.evaluate(
            summary_dir=summary_dir,
            policy=policy,
            environment_name=None,
            suite_load_fn=lambda _: environment,
            variable_container=mock_variable_container,
            eval_interval=1,
            is_running=_NTimesReturnTrue(n=2))

    # Exactly 2 evals expected: duplicate train steps must not double-count.
    summary_count = self.count_summary_scalar_tags_in_call_list(
        mock_scalar_summary, 'Metrics/eval_actor/AverageReturn')
    self.assertEqual(summary_count, 2)
def test_observation_tiled(self):
    """With tile_first_step_obs, the reset frame fills the whole history."""
    env = test_envs.CountingEnv()
    # Force observations to be non zero for the test.
    env._episodes = 2
    history_env = wrappers.HistoryWrapper(env, 3, tile_first_step_obs=True)

    # Extra reset to make observations in base env not 0; the first
    # observation (20) is tiled across all three history slots.
    time_step = history_env.reset()
    self.assertEqual([20, 20, 20], time_step.observation.tolist())

    # Subsequent steps slide the window as usual.
    for expected_history in ([20, 20, 21], [20, 21, 22], [21, 22, 23]):
        time_step = history_env.step(0)
        self.assertEqual(expected_history, time_step.observation.tolist())
def test_observation_and_action_stacked(self):
    """With include_actions, history holds both observations and actions."""
    env = test_envs.CountingEnv()
    history_env = wrappers.HistoryWrapper(env, 3, include_actions=True)

    # At reset both histories are zero-padded.
    time_step = history_env.reset()
    self.assertEqual([0, 0, 0],
                     time_step.observation['observation'].tolist())
    self.assertEqual([0, 0, 0], time_step.observation['action'].tolist())

    # (action to take, expected observation history, expected action history)
    expectations = (
        (5, [0, 0, 1], [0, 0, 5]),
        (6, [0, 1, 2], [0, 5, 6]),
        (7, [1, 2, 3], [5, 6, 7]),
    )
    for action, expected_obs, expected_actions in expectations:
        time_step = history_env.step(action)
        self.assertEqual(expected_obs,
                         time_step.observation['observation'].tolist())
        self.assertEqual(expected_actions,
                         time_step.observation['action'].tolist())
def test_eval_job(self):
    """Tests the eval job doing an eval every 5 steps for 10 train steps."""
    summary_dir = self.create_tempdir().full_path
    environment = test_envs.CountingEnv(steps_per_episode=4)
    action_tensor_spec = tensor_spec.from_spec(environment.action_spec())
    time_step_tensor_spec = tensor_spec.from_spec(environment.time_step_spec())
    policy = py_tf_eager_policy.PyTFEagerPolicy(
        random_tf_policy.RandomTFPolicy(time_step_tensor_spec,
                                        action_tensor_spec))

    class VCUpdateIncrementTrainStep(object):
        """Side effect that updates train_step."""

        def __init__(self):
            # Starts at -1 so the first `update` call yields train step 0.
            self.fake_train_step = -1

        def __call__(self, variables):
            # Each call advances the fake train step by one.
            self.fake_train_step += 1
            variables[reverb_variable_container.TRAIN_STEP_KEY].assign(
                self.fake_train_step)

    mock_variable_container = mock.create_autospec(
        reverb_variable_container.ReverbVariableContainer)
    fake_update = VCUpdateIncrementTrainStep()
    mock_variable_container.update.side_effect = fake_update

    with mock.patch.object(
        tf.summary, 'scalar', autospec=True) as mock_scalar_summary:
        # Run the function tested.
        # 11 loops to do 10 steps because the eval occurs on the loop after
        # the train_step is found.
        eval_job.evaluate(
            summary_dir=summary_dir,
            policy=policy,
            environment_name=None,
            suite_load_fn=lambda _: environment,
            variable_container=mock_variable_container,
            eval_interval=5,
            is_running=_NTimesReturnTrue(n=11))

    # Evals at train steps 0, 5, and 10 -> 3 AverageReturn summaries.
    summary_count = self.count_summary_scalar_tags_in_call_list(
        mock_scalar_summary, 'Metrics/eval_actor/AverageReturn')
    self.assertEqual(summary_count, 3)
def test_steps_sequencial(self):
    """CountingEnv walks FIRST -> MID -> LAST, then restarts on episode 1."""
    # NOTE(review): method name typo ("sequencial") kept — it is the public
    # test identifier.
    env = test_envs.CountingEnv(steps_per_episode=4)

    time_step = env.reset()
    self.assertTrue(time_step.is_first())
    self.assertEqual(0, time_step.observation)

    # Intermediate steps are MID with incrementing observations.
    for expected_obs in (1, 2, 3):
        time_step = env.step(0)
        self.assertTrue(time_step.is_mid())
        self.assertEqual(expected_obs, time_step.observation)

    # Step 4 terminates the episode.
    time_step = env.step(0)
    self.assertTrue(time_step.is_last())
    self.assertEqual(4, time_step.observation)

    # Stepping past LAST starts episode 1 (observation jumps to 10).
    time_step = env.step(0)
    self.assertTrue(time_step.is_first())
    self.assertEqual(10, time_step.observation)
def test_validate_specs(self):
    """CountingEnv conforms to its declared specs over multiple episodes."""
    counting_env = test_envs.CountingEnv(steps_per_episode=15)
    env_utils.validate_py_environment(counting_env, episodes=10)