def testLoss(self):
  """Checks the BC agent's training loss against a hand-computed value."""
  cloning_net = get_dummy_net(self._action_spec)
  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      self._time_step_spec,
      self._action_spec,
      cloning_network=cloning_net,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  actions = tf.constant([0, 1], dtype=tf.int32)
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  experience = trajectory.first(
      observation=observations,
      action=actions,
      policy_info=(),
      reward=rewards,
      discount=discounts)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  # Behavioral cloning over a discrete action spec reduces to sparse softmax
  # cross entropy between the demonstrated actions and the network's logits.
  expected_loss = tf.reduce_mean(
      input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=actions, logits=cloning_net(observations)[0]))
  loss_info = agent.train(experience)
  total_loss = self.evaluate(loss_info.loss)
  self.assertAllClose(total_loss, expected_loss)
  test_util.test_loss_and_train_output(
      test=self,
      expect_equal_loss_values=True,
      agent=agent,
      experience=experience)
def testLossNotMatching(self):
  """Verifies that a structure mismatch between train() and loss() raises."""

  class MyAgentWithLossNotMatching(MyAgent):

    def _loss(self, experience, weights=None, extra=None):
      # Returns a LossInfo whose structure differs from the one produced by
      # _train(), which test_loss_and_train_output should flag.
      return tf_agent.LossInfo(loss=(), extra=(experience, ()))

  train_argspec = {'extra': tf.TensorSpec(dtype=tf.float32, shape=[3, 4])}
  agent = MyAgentWithLossNotMatching(train_argspec=train_argspec)
  extra = tf.ones(shape=[3, 4], dtype=tf.float32)
  # Add an outer batch dimension to the single-episode trajectory.
  experience = tf.nest.map_structure(
      lambda x: x[tf.newaxis, ...],
      trajectory.from_episode(
          observation={'obs': tf.constant([1.0])},
          action=(),
          policy_info=(),
          reward=tf.constant([1.0])))
  with self.assertRaisesRegex(
      ValueError,
      r'.*`LossInfo` from train\(\) and `LossInfo` from loss\(\) do not have '
      'matching structures.*'):
    test_util.test_loss_and_train_output(
        test=self,
        expect_equal_loss_values=True,
        agent=agent,
        experience=experience,
        extra=extra)
def testLoss(self):
  """Checks that train() and loss() produce matching values for MyAgent."""
  agent = MyAgent()
  extra = tf.ones(shape=[3, 4], dtype=tf.float32)
  # Add an outer batch dimension to the single-episode trajectory.
  experience = tf.nest.map_structure(
      lambda x: x[tf.newaxis, ...],
      trajectory.from_episode(
          observation={'obs': tf.constant([1.0])},
          action=(),
          policy_info=(),
          reward=tf.constant([1.0])))
  test_util.test_loss_and_train_output(
      test=self,
      expect_equal_loss_values=True,
      agent=agent,
      experience=experience,
      extra=extra)
def testLoss(self, mock_actions_and_log_probs, mock_apply_gradients):
  """Checks that the SAC agent's train() and loss() values match."""
  # Mock _actions_and_log_probs so that _train() and _loss() run on the same
  # sampled values.
  actions = tf.constant([[0.2], [0.5], [-0.8]])
  log_pi = tf.constant([-1.1, -0.8, -0.5])
  mock_actions_and_log_probs.return_value = (actions, log_pi)

  # Skip applying gradients since mocking _actions_and_log_probs.
  del mock_apply_gradients

  actor_net = actor_distribution_network.ActorDistributionNetwork(
      self._obs_spec,
      self._action_spec,
      fc_layer_params=(10,),
      continuous_projection_net=tanh_normal_projection_network
      .TanhNormalProjectionNetwork)
  agent = sac_agent.SacAgent(
      self._time_step_spec,
      self._action_spec,
      critic_network=DummyCriticNet(),
      actor_network=actor_net,
      actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      alpha_optimizer=tf.compat.v1.train.AdamOptimizer(0.001))

  observations = tf.constant(
      [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]],
      dtype=tf.float32)
  actions = tf.constant([[[0], [1]], [[2], [3]], [[4], [5]]],
                        dtype=tf.float32)
  time_steps = ts.TimeStep(
      step_type=tf.constant([[1, 1]] * 3, dtype=tf.int32),
      reward=tf.constant([[1, 1]] * 3, dtype=tf.float32),
      discount=tf.constant([[1, 1]] * 3, dtype=tf.float32),
      observation=observations)
  experience = trajectory.Trajectory(
      time_steps.step_type, observations, actions, (),
      time_steps.step_type, time_steps.reward, time_steps.discount)

  test_util.test_loss_and_train_output(
      test=self,
      expect_equal_loss_values=True,
      agent=agent,
      experience=experience)