    def testLoss(self):
        cloning_net = get_dummy_net(self._action_spec)
        agent = behavioral_cloning_agent.BehavioralCloningAgent(
            self._time_step_spec,
            self._action_spec,
            cloning_network=cloning_net,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))

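        # A batch of two single-step demonstrations: each observation is paired
        # with the action the agent should learn to imitate; reward and
        # discount are only needed to build a valid Trajectory.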
        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        actions = tf.constant([0, 1], dtype=tf.int32)
        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

        experience = trajectory.first(observation=observations,
                                      action=actions,
                                      policy_info=(),
                                      reward=rewards,
                                      discount=discounts)

        self.evaluate(tf.compat.v1.global_variables_initializer())

        expected_loss = tf.reduce_mean(
            input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=actions, logits=cloning_net(observations)[0]))

        loss_info = agent.train(experience)
        total_loss = self.evaluate(loss_info.loss)

        self.assertAllClose(total_loss, expected_loss)

        test_util.test_loss_and_train_output(test=self,
                                             expect_equal_loss_values=True,
                                             agent=agent,
                                             experience=experience)
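
The get_dummy_net helper is not shown in this snippet. Below is a minimal sketch of the kind of cloning network it is assumed to build: a TF-Agents network that maps each observation to one logit per discrete action, so that cloning_net(observations)[0] yields the per-action logits used in the expected-loss check above. The name, hidden size, and use of sequential.Sequential are illustrative assumptions, not the original helper.

from tf_agents.networks import sequential


def get_dummy_net(action_spec, num_hidden=10):
    # One logit per discrete action in the scalar bounded action spec.
    num_actions = action_spec.maximum - action_spec.minimum + 1
    return sequential.Sequential([
        tf.keras.layers.Dense(num_hidden, activation=tf.nn.relu),
        tf.keras.layers.Dense(num_actions),
    ])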
Example 2
    def testLossNotMatching(self):
        class MyAgentWithLossNotMatching(MyAgent):
            def _loss(self, experience, weights=None, extra=None):
                return tf_agent.LossInfo(loss=(), extra=(experience, ()))

        train_argspec = {
            'extra': tf.TensorSpec(dtype=tf.float32, shape=[3, 4])
        }
        agent = MyAgentWithLossNotMatching(train_argspec=train_argspec)
        extra = tf.ones(shape=[3, 4], dtype=tf.float32)
        experience = tf.nest.map_structure(
            lambda x: x[tf.newaxis, ...],
            trajectory.from_episode(observation={'obs': tf.constant([1.0])},
                                    action=(),
                                    policy_info=(),
                                    reward=tf.constant([1.0])))

        with self.assertRaisesRegex(
                ValueError,
                r'.*`LossInfo` from train\(\) and `LossInfo` from loss\(\) do not have '
                'matching structures.*'):
            test_util.test_loss_and_train_output(test=self,
                                                 expect_equal_loss_values=True,
                                                 agent=agent,
                                                 experience=experience,
                                                 extra=extra)
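
        # Why the structures mismatch (assumed): MyAgent's _train presumably
        # reports the `extra` tensor inside LossInfo.extra, e.g.
        # extra=(experience, extra), while the _loss override above returns
        # extra=(experience, ()). tf.nest treats a Tensor and an empty tuple
        # as different structures, which is what raises the ValueError
        # asserted above.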
Example 3
    def testLoss(self):
        agent = MyAgent()
        extra = tf.ones(shape=[3, 4], dtype=tf.float32)
        experience = tf.nest.map_structure(
            lambda x: x[tf.newaxis, ...],
            trajectory.from_episode(
                observation={'obs': tf.constant([1.0])},
                action=(),
                policy_info=(),
                reward=tf.constant([1.0])))
        test_util.test_loss_and_train_output(
            test=self,
            expect_equal_loss_values=True,
            agent=agent,
            experience=experience,
            extra=extra)
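
test_util.test_loss_and_train_output is the helper exercised throughout these examples. Judging from its keyword arguments and the error message asserted in the previous example, it appears to compare the LossInfo returned by agent.train() against the one returned by agent.loss(). A rough, hypothetical sketch of that check follows; it is an illustration, not the helper's actual implementation.

def check_loss_and_train_output(
        test, expect_equal_loss_values, agent, experience, **kwargs):
    train_info = agent.train(experience, **kwargs)
    loss_info = agent.loss(experience, **kwargs)
    # A structure mismatch between the two LossInfo nests raises ValueError.
    tf.nest.assert_same_structure(train_info, loss_info)
    if expect_equal_loss_values:
        test.assertAllClose(
            test.evaluate(train_info.loss), test.evaluate(loss_info.loss))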
Example 4
  def testLoss(self, mock_actions_and_log_probs, mock_apply_gradients):
    # Mock _actions_and_log_probs so that _train() and _loss() run on the same
    # sampled values.
    actions = tf.constant([[0.2], [0.5], [-0.8]])
    log_pi = tf.constant([-1.1, -0.8, -0.5])
    mock_actions_and_log_probs.return_value = (actions, log_pi)

    # Skip applying gradients since mocking _actions_and_log_probs.
    del mock_apply_gradients

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        self._obs_spec,
        self._action_spec,
        fc_layer_params=(10,),
        continuous_projection_net=tanh_normal_projection_network
        .TanhNormalProjectionNetwork)

    agent = sac_agent.SacAgent(
        self._time_step_spec,
        self._action_spec,
        critic_network=DummyCriticNet(),
        actor_network=actor_net,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        alpha_optimizer=tf.compat.v1.train.AdamOptimizer(0.001))

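    # A batch of 3 trajectories, each 2 steps long: 2-d observations and
    # 1-d continuous actions.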
    observations = tf.constant(
        [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]],
        dtype=tf.float32)
    actions = tf.constant([[[0], [1]], [[2], [3]], [[4], [5]]],
                          dtype=tf.float32)
    time_steps = ts.TimeStep(
        step_type=tf.constant([[1, 1]] * 3, dtype=tf.int32),
        reward=tf.constant([[1, 1]] * 3, dtype=tf.float32),
        discount=tf.constant([[1, 1]] * 3, dtype=tf.float32),
        observation=observations)

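    # Trajectory fields, positionally: step_type, observation, action,
    # policy_info, next_step_type, reward, discount.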
    experience = trajectory.Trajectory(
        time_steps.step_type, observations, actions, (),
        time_steps.step_type, time_steps.reward, time_steps.discount)

    test_util.test_loss_and_train_output(
        test=self,
        expect_equal_loss_values=True,
        agent=agent,
        experience=experience)
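
The two mock arguments in the test signature above are injected by mock.patch decorators stacked on the method, which this snippet omits. A hypothetical sketch of that wiring is shown below; only _actions_and_log_probs is named by the test's own comment, while the class name and the apply_gradients patch target are assumptions.

from unittest import mock

class SacAgentLossTest(tf.test.TestCase):

  @mock.patch.object(tf.compat.v1.train.Optimizer, 'apply_gradients')
  @mock.patch.object(sac_agent.SacAgent, '_actions_and_log_probs')
  def testLoss(self, mock_actions_and_log_probs, mock_apply_gradients):
    # Decorators apply bottom-up, so the bottom patch supplies the first mock
    # argument; the method body is the snippet above.
    ...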