def testLoss(self, agent_class):
  with tf.compat.v2.summary.record_if(False):
    q_net = test_utils.DummyNet(self._observation_spec, self._action_spec)
    agent = agent_class(
        self._time_step_spec,
        self._action_spec,
        q_network=q_net,
        optimizer=None)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)

    actions = [tf.constant([[0], [1]], dtype=tf.int32)]

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
    next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
    next_time_steps = ts.transition(next_observations, rewards, discounts)

    expected_loss = 26.0
    loss_info = agent._loss(time_steps, actions, next_time_steps)

    self.evaluate(tf.compat.v1.initialize_all_variables())
    total_loss, _ = self.evaluate(loss_info)

    self.assertAllClose(total_loss, expected_loss)
def testCategoricalActions(self, action_probs):
  action_spec = [
      tensor_spec.BoundedTensorSpec((1,), tf.int32, 0, len(action_probs) - 1),
      tensor_spec.BoundedTensorSpec((), tf.int32, 0, len(action_probs) - 1)
  ]
  wrapped_policy = DistributionPolicy([
      tfp.distributions.Categorical(probs=[action_probs]),
      tfp.distributions.Categorical(probs=action_probs)
  ], self._time_step_spec, action_spec)
  policy = greedy_policy.GreedyPolicy(wrapped_policy)

  self.assertEqual(policy.time_step_spec(), self._time_step_spec)
  self.assertEqual(policy.action_spec(), action_spec)

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step)
  nest.assert_same_structure(action_spec, action_step.action)

  action_ = self.evaluate(action_step.action)
  self.assertEqual(action_[0][0], np.argmax(action_probs))
  self.assertEqual(action_[1], np.argmax(action_probs))
  self.assertAllEqual(action_[0].shape, [1] + action_spec[0].shape.as_list())
  self.assertAllEqual(action_[1].shape, [1] + action_spec[1].shape.as_list())
def testCriticLoss(self):
  agent = ddpg_agent.DdpgAgent(
      self._time_step_spec,
      self._action_spec,
      actor_network=self._unbounded_actor_net,
      critic_network=self._critic_net,
      actor_optimizer=None,
      critic_optimizer=None,
  )
  observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
  time_steps = ts.restart(observations, batch_size=2)
  actions = [tf.constant([[5], [6]], dtype=tf.float32)]

  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  next_observations = [tf.constant([[5, 6], [7, 8]], dtype=tf.float32)]
  next_time_steps = ts.transition(next_observations, rewards, discounts)

  expected_loss = 59.6
  loss = agent.critic_loss(time_steps, actions, next_time_steps)

  self.evaluate(tf.global_variables_initializer())
  loss_ = self.evaluate(loss)
  self.assertAllClose(loss_, expected_loss)
def testPolicyGradientLoss(self):
  actor_net = DummyActorNet(self._action_spec)
  agent = ppo_agent.PPOAgent(
      self._time_step_spec,
      self._action_spec,
      tf.train.AdamOptimizer(),
      normalize_observations=False,
      normalize_rewards=False,
      actor_net=actor_net,
      importance_ratio_clipping=10.0,
  )
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([[0], [1]], dtype=tf.float32)
  sample_action_log_probs = tf.constant([0.9, 0.3], dtype=tf.float32)
  advantages = tf.constant([1.9, 1.0], dtype=tf.float32)
  weights = tf.ones_like(advantages)

  current_policy_distribution, unused_network_state = actor_net(
      time_steps.observation, time_steps.step_type, ())

  expected_loss = -0.0164646133
  loss = agent.policy_gradient_loss(
      time_steps, actions, sample_action_log_probs, advantages,
      current_policy_distribution, weights)

  self.evaluate(tf.global_variables_initializer())
  loss_ = self.evaluate(loss)
  self.assertAllClose(loss_, expected_loss)
def testPolicyGradientLoss(self):
  if tf.executing_eagerly():
    self.skipTest('b/123777433')

  agent = reinforce_agent.ReinforceAgent(
      self._time_step_spec,
      self._action_spec,
      actor_network=DummyActorNet(
          self._obs_spec, self._action_spec, unbounded_actions=True),
      optimizer=None,
  )

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([[0], [1]], dtype=tf.float32)
  actions_distribution = agent.collect_policy.distribution(time_steps).action
  returns = tf.constant([1.9, 1.0], dtype=tf.float32)

  expected_loss = 10.983667373657227
  loss = agent.policy_gradient_loss(
      actions_distribution, actions, time_steps.is_last(), returns)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_ = self.evaluate(loss)
  self.assertAllClose(loss_, expected_loss)
def testDeferredBatchingAction(self):
  if tf.executing_eagerly():
    self.skipTest('b/123770140')

  # Construct policy without providing batch_size.
  tf_policy = q_policy.QPolicy(
      self._time_step_spec,
      self._action_spec,
      q_network=DummyNet(stateful=False))
  policy = py_tf_policy.PyTFPolicy(tf_policy)

  # But time_steps have batch_size of 5.
  batch_size = 5
  single_observation = np.array([1, 2], dtype=np.float32)
  time_steps = [ts.restart(single_observation)] * batch_size
  time_steps = fast_map_structure(lambda *arrays: np.stack(arrays),
                                  *time_steps)

  with self.cached_session():
    self.evaluate(tf.compat.v1.global_variables_initializer())
    action_steps = policy.action(time_steps)
    self.assertEqual(action_steps.action.shape, (batch_size,))
    for a in action_steps.action:
      self.assertIn(a, (0, 1))
    self.assertAllEqual(action_steps.state, ())
def testDistributionRaisesNotImplementedError(self):
  mock_tf_py_policy = tf_py_policy.TFPyPolicy(self._get_mock_py_policy())
  observation = tf.ones([5], tf.float32)
  time_step = ts.restart(observation)
  with self.assertRaises(NotImplementedError):
    mock_tf_py_policy.distribution(time_step=time_step)
def testMismatchedDtypesListAction(self):
  with self.assertRaisesRegexp(TypeError, ".*dtype that doesn't match.*"):
    policy = TFPolicyMismatchedDtypesListAction()
    observation = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_step = ts.restart(observation)
    policy.action(time_step)
def testAction(self, batch_size):
  if tf.executing_eagerly():
    self.skipTest('b/123770140')

  single_observation = np.array([1, 2], dtype=np.float32)
  time_steps = ts.restart(single_observation)
  if batch_size is not None:
    time_steps = [time_steps] * batch_size
    time_steps = fast_map_structure(lambda *arrays: np.stack(arrays),
                                    *time_steps)
  policy = py_tf_policy.PyTFPolicy(self._tf_policy)

  with self.cached_session():
    policy_state = policy.get_initial_state(batch_size)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    action_steps = policy.action(time_steps, policy_state)
    self.assertEqual(action_steps.action.dtype, np.int32)
    if batch_size is None:
      self.assertEqual(action_steps.action.shape, ())
      self.assertIn(action_steps.action, (0, 1))
      self.assertEqual(action_steps.state, np.zeros([1, 1]))
    else:
      self.assertEqual(action_steps.action.shape, (batch_size,))
      for a in action_steps.action:
        self.assertIn(a, (0, 1))
      self.assertAllEqual(action_steps.state, np.zeros([batch_size, 1]))
def testInitializeRestoreAgent(self, agent_class, run_mode):
  if tf.executing_eagerly() and run_mode == context.graph_mode:
    self.skipTest('b/123778560')
  with run_mode():
    q_net = DummyNet(self._observation_spec, self._action_spec)
    agent = agent_class(
        self._time_step_spec,
        self._action_spec,
        q_network=q_net,
        optimizer=None)
    observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
    time_steps = ts.restart(observations, batch_size=2)
    policy = agent.policy
    action_step = policy.action(time_steps)
    self.evaluate(tf.compat.v1.initialize_all_variables())

    checkpoint = tf.train.Checkpoint(agent=agent)

    latest_checkpoint = tf.train.latest_checkpoint(self.get_temp_dir())
    checkpoint_load_status = checkpoint.restore(latest_checkpoint)

    if tf.executing_eagerly():
      self.evaluate(checkpoint_load_status.initialize_or_restore())
      self.assertAllEqual(self.evaluate(action_step.action), [[[0], [0]]])
    else:
      with self.cached_session() as sess:
        checkpoint_load_status.initialize_or_restore(sess)
        self.assertAllEqual(sess.run(action_step.action), [[[0], [0]]])
def testCriticLoss(self):
  if tf.executing_eagerly():
    self.skipTest('b/123772477')

  agent = td3_agent.Td3Agent(
      self._time_step_spec,
      self._action_spec,
      critic_network=self._critic_net,
      actor_network=self._unbounded_actor_net,
      actor_optimizer=None,
      critic_optimizer=None)
  observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
  time_steps = ts.restart(observations, batch_size=2)
  actions = [tf.constant([[5], [6]], dtype=tf.float32)]

  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  next_observations = [tf.constant([[5, 6], [7, 8]], dtype=tf.float32)]
  next_time_steps = ts.transition(next_observations, rewards, discounts)

  expected_loss = 120.0912
  loss = agent.critic_loss(time_steps, actions, next_time_steps, seed=1234)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_ = self.evaluate(loss)
  self.assertAllClose(loss_, expected_loss)
def testLoss(self, agent_class, run_mode):
  if tf.executing_eagerly() and run_mode == context.graph_mode:
    self.skipTest('b/123778560')
  with run_mode(), tf.compat.v2.summary.record_if(False):
    q_net = DummyNet(self._observation_spec, self._action_spec)
    agent = agent_class(
        self._time_step_spec,
        self._action_spec,
        q_network=q_net,
        optimizer=None)

    observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
    time_steps = ts.restart(observations, batch_size=2)

    actions = [tf.constant([[0], [1]], dtype=tf.int32)]

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
    next_observations = [tf.constant([[5, 6], [7, 8]], dtype=tf.float32)]
    next_time_steps = ts.transition(next_observations, rewards, discounts)

    expected_loss = 26.0
    loss, _ = agent.loss(time_steps, actions, next_time_steps)

    self.evaluate(tf.compat.v1.initialize_all_variables())
    self.assertAllClose(self.evaluate(loss), expected_loss)
def testUpdate(self):
  tf.compat.v1.set_random_seed(1)
  policy = q_policy.QPolicy(
      self._time_step_spec, self._action_spec, q_network=DummyNet())
  new_policy = q_policy.QPolicy(
      self._time_step_spec, self._action_spec, q_network=DummyNet())

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)

  self.assertEqual(policy.variables(), [])
  self.assertEqual(new_policy.variables(), [])

  action_step = policy.action(time_step, seed=1)
  new_action_step = new_policy.action(time_step, seed=1)

  self.assertEqual(len(policy.variables()), 2)
  self.assertEqual(len(new_policy.variables()), 2)
  self.assertEqual(action_step.action.shape, new_action_step.action.shape)
  self.assertEqual(action_step.action.dtype, new_action_step.action.dtype)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertEqual(self.evaluate(new_policy.update(policy)), None)
  self.assertAllEqual(self.evaluate(action_step.action), [[1], [1]])
  self.assertAllEqual(self.evaluate(new_action_step.action), [[1], [1]])
def testCriticLoss(self):
  agent = td3_agent.Td3Agent(
      self._time_step_spec,
      self._action_spec,
      critic_network=self._critic_net,
      actor_network=self._unbounded_actor_net,
      actor_optimizer=None,
      critic_optimizer=None)
  observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
  time_steps = ts.restart(observations, batch_size=2)
  actions = [tf.constant([[5], [6]], dtype=tf.float32)]

  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  next_observations = [tf.constant([[5, 6], [7, 8]], dtype=tf.float32)]
  next_time_steps = ts.transition(next_observations, rewards, discounts)

  # TODO(kbanoop): The loss changed from 119.054 to 118.910903931. Is this
  # worth investigating?
  expected_loss = 118.9109
  loss = agent.critic_loss(time_steps, actions, next_time_steps)

  self.evaluate(tf.global_variables_initializer())
  loss_ = self.evaluate(loss)
  self.assertAllClose(loss_, expected_loss)
def testRestart(self):
  observation = tf.constant(-1)
  time_step = ts.restart(observation)
  time_step_ = self.evaluate(time_step)
  self.assertEqual(ts.StepType.FIRST, time_step_.step_type)
  self.assertEqual(-1, time_step_.observation)
  self.assertEqual(0.0, time_step_.reward)
  self.assertEqual(1.0, time_step_.discount)
def testRestart(self):
  observation = -1
  time_step = ts.restart(observation)
  self.assertEqual(ts.StepType.FIRST, time_step.step_type)
  self.assertEqual(-1, time_step.observation)
  self.assertEqual(0.0, time_step.reward)
  self.assertEqual(1.0, time_step.discount)
def testBatchRestart(self):
  obs_spec = [tensor_spec.TensorSpec([2], tf.float32)]
  time_step_spec = ts.time_step_spec(obs_spec)
  observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
  time_steps = ts.restart(observations, 2)
  time_step_batched = nest_utils.is_batched_nested_tensors(
      time_steps, time_step_spec)
  self.assertTrue(time_step_batched)
def testRestartBatched(self):
  observation = np.array([[-1], [-1]])
  time_step = ts.restart(observation, batch_size=2)
  self.assertItemsEqual([ts.StepType.FIRST] * 2, time_step.step_type)
  self.assertItemsEqual(observation, time_step.observation)
  self.assertItemsEqual([0.0, 0.0], time_step.reward)
  self.assertItemsEqual([1.0, 1.0], time_step.discount)
def _collect_step(self, time_step, metric_observers, train=False):
  """Run a single step (or 2 steps on life loss) in the environment."""
  if train:
    policy = self._collect_policy
  else:
    policy = self._eval_policy

  with self._action_timer:
    action_step = policy.action(time_step)
  with self._step_timer:
    next_time_step = self._env.step(action_step.action)
  traj = trajectory.from_transition(time_step, action_step, next_time_step)

  if next_time_step.is_last() and not self.game_over():
    traj = traj._replace(discount=np.array([1.0], dtype=np.float32))

  if train:
    self._store_to_rb(traj)

  # When AtariPreprocessing.terminal_on_life_loss is True, we receive LAST
  # time_steps when lives are lost but the game is not over. In this mode,
  # the replay buffer and agent's policy must see the life loss as a LAST
  # step and the subsequent step as a FIRST step. However, we do not want to
  # actually terminate the episode, and metrics should be computed as if all
  # steps were MID steps, since life loss is not actually a terminal event
  # (it is mostly a trick to make it easier to propagate rewards backwards by
  # shortening episode durations from the agent's perspective).
  if next_time_step.is_last() and not self.game_over():
    # Update metrics as if this is a mid-episode step.
    next_time_step = ts.transition(
        next_time_step.observation, next_time_step.reward)
    self._observe(metric_observers, trajectory.from_transition(
        time_step, action_step, next_time_step))

    # Produce the next step as if this is the first step of an episode and
    # store to RB as such. The next_time_step will be a MID time step.
    reward = time_step.reward
    time_step = ts.restart(next_time_step.observation)
    with self._action_timer:
      action_step = policy.action(time_step)
    with self._step_timer:
      next_time_step = self._env.step(action_step.action)
    if train:
      self._store_to_rb(trajectory.from_transition(
          time_step, action_step, next_time_step))

    # Update metrics as if this is a mid-episode step.
    time_step = ts.transition(time_step.observation, reward)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    self._observe(metric_observers, traj)

  return next_time_step
def create_time_step(self):
  observation = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observation)

  observation_spec = tensor_spec.TensorSpec(
      observation.shape.as_list(), tf.float32)
  time_step_spec = ts.time_step_spec(observation_spec)

  return time_step_spec, time_step
def testActionList(self):
  action_spec = [tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)]
  policy = q_policy.QPolicy(
      self._time_step_spec, action_spec, q_network=DummyNet())
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertTrue(isinstance(action_step.action, list))
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllEqual(self.evaluate(action_step.action), [[[1], [1]]])
def reset(self):
  # TODO(oars): Upcoming update on gym adds **kwargs on reset. Update this
  # to support that.
  observation = self._gym_env.reset()
  self._info = None
  self._done = False

  if self._match_obs_space_dtype:
    observation = self._to_obs_space_dtype(observation)
  return ts.restart(observation)
def testMatchedDtypes(self):
  policy = TFPolicyMismatchedDtypes()

  # Overwrite the action_spec to match the dtype of _action.
  policy._action_spec = tensor_spec.BoundedTensorSpec([1], tf.int64, 0, 1)

  observation = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observation)
  policy.action(time_step)
def testFixedPolicySingle(self):
  observations = tf.constant([1, 2], dtype=tf.float32)
  time_step = ts.restart(observations)
  action_step = self._policy.action(time_step)
  distribution_step = self._policy.distribution(time_step)
  mode = distribution_step.action.mode()
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllEqual(self.evaluate(action_step.action), [self._fixed_action])
  self.assertAllEqual(self.evaluate(mode), [self._fixed_action])
def _step(self, action):
  self._state = (self._state + 1) % 3
  self.steps += 1
  self.actions_taken.append(action)

  observation = [self._state]
  if self._state == 0:
    return ts.restart(observation)
  elif self._state == 2:
    self.episodes += 1
    return ts.termination(observation, reward=1.0)
  return ts.transition(observation, reward=0.0)
def testDistribution(self):
  tf.set_random_seed(1)
  policy = q_policy.QPolicy(
      self._time_step_spec, self._action_spec, q_network=DummyNet())
  observations = tf.constant([[1, 2]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=1)
  distribution_step = policy.distribution(time_step)
  mode = distribution_step.action.mode()
  self.evaluate(tf.global_variables_initializer())
  # All the biases and weights of the fake network are positive, so the
  # action corresponding to observation with index 1 will have a higher
  # Q value.
  self.assertAllEqual(self.evaluate(mode), [1])
def setUp(self):
  super(EpsilonGreedyPolicyTest, self).setUp()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._num_actions = 3
  self._greedy_action = 1
  self._action_spec = tensor_spec.BoundedTensorSpec(
      (1,), tf.int32, 0, self._num_actions - 1)
  self._policy = fixed_policy.FixedPolicy(
      np.asarray([self._greedy_action], dtype=np.int32),
      self._time_step_spec, self._action_spec)
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  self._time_step = ts.restart(observations, batch_size=2)
def testAction(self):
  tf.compat.v1.set_random_seed(1)
  policy = q_policy.QPolicy(
      self._time_step_spec, self._action_spec, q_network=DummyNet())
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2, 1])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables.
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllEqual(self.evaluate(action_step.action), [[1], [1]])
def testDistribution(self):
  tf.compat.v1.set_random_seed(1)
  policy = q_policy.QPolicy(
      self._time_step_spec, self._action_spec, q_network=DummyNet())
  observations = tf.constant([[1, 2]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=1)
  distribution_step = policy.distribution(time_step)
  mode = distribution_step.action.mode()
  self.evaluate(tf.compat.v1.global_variables_initializer())
  # The weights of index 0 are all 1 and the weights of index 1 are all 1.5,
  # so the Q values of index 1 will be higher.
  self.assertAllEqual([[1]], self.evaluate(mode))
def testFixedPolicyBatched(self):
  batch_size = 2
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=batch_size)
  action_step = self._policy.action(time_step)
  distribution_step = self._policy.distribution(time_step)
  mode = distribution_step.action.mode()
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllEqual(self.evaluate(action_step.action),
                      [[self._fixed_action]] * batch_size)
  self.assertAllEqual(self.evaluate(mode),
                      [[self._fixed_action]] * batch_size)