Example No. 1
  def testTrain(self):
    agent = categorical_dqn_agent.CategoricalDqnAgent(
        self._time_step_spec,
        self._action_spec,
        self._dummy_categorical_net,
        self._optimizer)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)

    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
    next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
    next_time_steps = ts.transition(next_observations, rewards, discounts)

    experience = test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    train_step = agent.train(experience, weights=None)

    # Due to the constant initialization of the DummyCategoricalNet, we can
    # expect the same loss every time.
    expected_loss = 2.19525
    self.evaluate(tf.compat.v1.global_variables_initializer())
    evaluated_loss, _ = self.evaluate(train_step)
    self.assertAllClose(evaluated_loss, expected_loss, atol=1e-4)
Example No. 2
    def testLossWithL2Regularization(self, agent_class):
        q_net = DummyNet(self._observation_spec,
                         self._action_spec,
                         l2_regularization_weight=1.0)
        agent = agent_class(self._time_step_spec,
                            self._action_spec,
                            q_network=q_net,
                            optimizer=None)

        observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
        time_steps = ts.restart(observations, batch_size=2)

        actions = tf.constant([[0], [1]], dtype=tf.int32)
        action_steps = policy_step.PolicyStep(actions)

        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
        next_observations = [tf.constant([[5, 6], [7, 8]], dtype=tf.float32)]
        next_time_steps = ts.transition(next_observations, rewards, discounts)

        experience = trajectories_test_utils.stacked_trajectory_from_transition(
            time_steps, action_steps, next_time_steps)

        # See the loss explanation in testLoss above.
        # L2_regularization_loss: 2^2 + 1^2 + 1^2 + 1^2 = 7.0
        # Overall loss: 26.0 (from testLoss) + 7.0 = 33.0
        expected_loss = 33.0
        loss, _ = agent._loss(experience)

        self.evaluate(tf.compat.v1.initialize_all_variables())
        self.assertAllClose(self.evaluate(loss), expected_loss)
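
A quick standalone check of the comment above (plain Python, not part of the test suite): it assumes the DummyNet kernel initializer is [[2, 1], [1, 1]], as stated in the other tests in this file, and that the L2 term is the unscaled sum of squared kernel weights.

# Hypothetical standalone check (not part of the test suite).
kernel = [[2.0, 1.0], [1.0, 1.0]]                      # assumed DummyNet kernel initializer
l2_loss = sum(w * w for row in kernel for w in row)    # 2^2 + 1^2 + 1^2 + 1^2 = 7.0
td_loss = 26.0                                         # overall loss from testLoss
print(l2_loss, td_loss + l2_loss)                      # 7.0 33.0 == expected_loss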
Example No. 3
  def testInitialize(self):
    agent = categorical_dqn_agent.CategoricalDqnAgent(
        self._time_step_spec,
        self._action_spec,
        self._categorical_net,
        self._optimizer)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)
    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
    next_time_steps = ts.transition(observations, rewards, discounts)

    experience = test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    loss_info = agent._loss(experience)
    initialize = agent.initialize()

    self.evaluate(tf.compat.v1.global_variables_initializer())
    losses = self.evaluate(loss_info).loss
    self.assertGreater(losses, 0.0)

    critic_variables = agent._q_network.variables
    target_critic_variables = agent._target_q_network.variables
    self.assertTrue(critic_variables)
    self.assertTrue(target_critic_variables)
    self.evaluate(initialize)
    for s, t in zip(critic_variables, target_critic_variables):
      self.assertAllClose(self.evaluate(s), self.evaluate(t))
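
A minimal sketch of the property this test asserts, assuming (as the assertions above imply) that agent.initialize() performs a hard copy of each online-network variable into the matching target-network variable. It uses plain tf.Variable objects, not TF-Agents internals:

# Illustrative only: a hard copy of "online" variables into "target" variables,
# which is what the equality assertions above rely on after agent.initialize().
import tensorflow as tf

source_vars = [tf.Variable([1.0, 2.0]), tf.Variable([[3.0]])]
target_vars = [tf.Variable([0.0, 0.0]), tf.Variable([[0.0]])]
for s, t in zip(source_vars, target_vars):
    t.assign(s)                                   # hard update: target <- source
assert all(bool(tf.reduce_all(s == t)) for s, t in zip(source_vars, target_vars))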
Example No. 4
    def testLossWithMaskedActions(self, agent_class):
        # Observations are now a tuple of the usual observation and an action mask.
        observation_spec_with_mask = (self._observation_spec,
                                      tensor_spec.BoundedTensorSpec([2],
                                                                    tf.int32,
                                                                    0, 1))
        time_step_spec = ts.time_step_spec(observation_spec_with_mask)
        q_net = DummyNet(observation_spec_with_mask,
                         self._action_spec,
                         mask_split_fn=lambda x: (x[0], x[1]))
        agent = agent_class(time_step_spec,
                            self._action_spec,
                            q_network=q_net,
                            optimizer=None)

        # For observations, the masks are set up so that all actions are valid.
        observations = ([tf.constant([[1, 2], [3, 4]], dtype=tf.float32)],
                        tf.constant([[1, 1], [1, 1]], dtype=tf.int32))
        time_steps = ts.restart(observations, batch_size=2)

        actions = tf.constant([[0], [1]], dtype=tf.int32)
        action_steps = policy_step.PolicyStep(actions)

        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

        # For next_observations, the masks are set up so that only one action is
        # valid for each element in the batch.
        next_observations = ([tf.constant([[5, 6], [7, 8]], dtype=tf.float32)],
                             tf.constant([[0, 1], [1, 0]], dtype=tf.int32))
        next_time_steps = ts.transition(next_observations, rewards, discounts)

        experience = trajectories_test_utils.stacked_trajectory_from_transition(
            time_steps, action_steps, next_time_steps)

        # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
        # [[1], [1]] from DummyNet above, we can calculate the following values:
        # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
        # Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
        # (Here we use the second row of the kernel initializer above, since the
        # chosen action is now 1 instead of 0.)
        #
        # For target Q-values, because of the masks we only have one valid choice of
        # action for each next_observation:
        # Target Q-value for first next_observation (only action 1 is valid):
        # 1 * 5 + 1 * 6 + 1 = 12
        # Target Q-value for second next_observation (only action 0 is valid):
        # 2 * 7 + 1 * 8 + 1 = 23
        # TD targets: 10 + 0.9 * 12 = 20.8 and 20 + 0.9 * 23 = 40.7
        # TD errors: 20.8 - 5 = 15.8 and 40.7 - 8 = 32.7
        # TD loss: 15.3 and 32.2 (Huber loss subtracts 0.5)
        # Overall loss: (15.3 + 32.2) / 2 = 23.75
        expected_loss = 23.75
        loss, _ = agent._loss(experience)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllClose(self.evaluate(loss), expected_loss)
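
A plain-Python sketch reproducing the arithmetic in the comment above, assuming a Huber loss with delta = 1 (0.5 * e**2 for |e| <= 1, otherwise |e| - 0.5); the Q-values are taken directly from the comment:

def huber(error, delta=1.0):
    e = abs(error)
    return 0.5 * e * e if e <= delta else delta * (e - 0.5 * delta)

q_chosen = [5.0, 8.0]          # Q(s, a) for the chosen actions (from the comment)
q_next_masked = [12.0, 23.0]   # only one valid next action per batch element
rewards, gamma = [10.0, 20.0], 0.9

td_errors = [r + gamma * qn - qc
             for r, qn, qc in zip(rewards, q_next_masked, q_chosen)]  # [15.8, 32.7]
print(sum(huber(e) for e in td_errors) / len(td_errors))              # 23.75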
Example No. 5
    def testLoss(self):
        q_net = DummyNet((self._observation_spec, self._action_spec))
        agent = qtopt_agent.QtOptAgent(self._time_step_spec,
                                       self._action_spec,
                                       q_network=q_net,
                                       optimizer=None,
                                       init_mean_cem=self._mean,
                                       init_var_cem=self._var,
                                       num_samples_cem=self._num_samples,
                                       actions_sampler=self._sampler)

        agent._target_q_network_delayed = DummyNet(
            (self._observation_spec, self._action_spec), bias=1)

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)

        actions = tf.constant([[0.0], [0.0]], dtype=tf.float32)
        action_steps = policy_step.PolicyStep(actions, info=())

        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
        next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
        next_time_steps = ts.transition(next_observations, rewards, discounts)

        experience = trajectories_test_utils.stacked_trajectory_from_transition(
            time_steps, action_steps, next_time_steps)

        # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
        # ([[2], [2]] for q_network/target_network, [[1], [1]] for delayed
        # target_network)
        # from DummyNet above, we can calculate the following values:
        # Q Network:
        # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 2 = 6
        # Q-value for second observation/action pair: 2 * 3 + 1 * 4 + 2 = 12
        # Target Network:
        # Q-value for first next_observation: 2 * 5 + 1 * 6 + 2 = 18
        # Q-value for second next_observation: 2 * 7 + 1 * 8 + 2 = 24
        # Delayed Target Network:
        # Q-value for first next_observation: 2 * 5 + 1 * 6 + 1 = 17
        # Q-value for second next_observation: 2 * 7 + 1 * 8 + 1 = 23
        # TD targets: 10 + 0.9 * min(17, 18) = 25.3; 20 + 0.9 * min(23, 24) = 40.7
        # TD errors: 25.3 - 6 = 19.3; 40.7 - 12 = 28.7
        # TD loss: 18.8 and 28.2 (Huber loss subtracts 0.5)
        # Overall loss: (18.8 + 28.2) / 2 = 23.5
        expected_td_loss = 23.5
        loss, loss_info = agent._loss(experience)

        self.evaluate(tf.compat.v1.initialize_all_variables())
        self.assertAllClose(self.evaluate(loss), expected_td_loss)
        self.assertAllClose(self.evaluate(tf.reduce_mean(loss_info.td_loss)),
                            expected_td_loss)
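
A plain-Python sketch of the arithmetic above. It assumes the TD target takes the element-wise minimum of the target network and the delayed target network (the clipped double-Q idea described in the comment) and a Huber loss with delta = 1; it is not the agent's actual implementation:

def huber(error, delta=1.0):
    e = abs(error)
    return 0.5 * e * e if e <= delta else delta * (e - 0.5 * delta)

q_values = [6.0, 12.0]           # Q network on (observation, action)
q_target = [18.0, 24.0]          # target network on next_observation
q_target_delayed = [17.0, 23.0]  # delayed target network (bias = 1)
rewards, gamma = [10.0, 20.0], 0.9

td_errors = [r + gamma * min(a, b) - q
             for r, a, b, q in zip(rewards, q_target, q_target_delayed, q_values)]  # [19.3, 28.7]
print(sum(huber(e) for e in td_errors) / len(td_errors))                            # 23.5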
Example No. 6
    def testCriticLossWithMaskedActions(self):
        # Observations are now a tuple of the usual observation and an action mask.
        observation_spec_with_mask = (self._obs_spec,
                                      tensor_spec.BoundedTensorSpec([2],
                                                                    tf.int32,
                                                                    0, 1))
        time_step_spec = ts.time_step_spec(observation_spec_with_mask)
        dummy_categorical_net = DummyCategoricalNet(self._obs_spec)
        agent = categorical_dqn_agent.CategoricalDqnAgent(
            time_step_spec,
            self._action_spec,
            dummy_categorical_net,
            self._optimizer,
            observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))

        # For `observations`, the masks are set up so that only one action is valid
        # for each element in the batch.
        observations = (tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
                        tf.constant([[1, 0], [0, 1]], dtype=tf.int32))
        time_steps = ts.restart(observations, batch_size=2)

        actions = tf.constant([0, 1], dtype=tf.int32)
        action_steps = policy_step.PolicyStep(actions)

        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

        # For `next_observations`, the masks are set up so the opposite actions as
        # before are valid.
        next_observations = (tf.constant([[5, 6], [7, 8]], dtype=tf.float32),
                             tf.constant([[0, 1], [1, 0]], dtype=tf.int32))
        next_time_steps = ts.transition(next_observations, rewards, discounts)

        experience = test_utils.stacked_trajectory_from_transition(
            time_steps, action_steps, next_time_steps)

        # Due to the constant initialization of the DummyCategoricalNet, we can
        # expect the same loss every time. Note this is different from the loss in
        # testCriticLoss above due to previously optimal actions being masked out.
        expected_loss = 5.062895
        loss_info = agent._loss(experience)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        evaluated_loss = self.evaluate(loss_info).loss
        self.assertAllClose(evaluated_loss, expected_loss, atol=1e-4)
Example No. 7
  def testLossWithChangedOptimalActions(self, agent_class):
    q_net = DummyNet(self._observation_spec, self._action_spec)
    agent = agent_class(
        self._time_step_spec,
        self._action_spec,
        q_network=q_net,
        optimizer=None)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)

    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

    # Note that instead of [[5, 6], [7, 8]] as before, we now have -5 and -7.
    next_observations = tf.constant([[-5, 6], [-7, 8]], dtype=tf.float32)
    next_time_steps = ts.transition(next_observations, rewards, discounts)

    experience = trajectories_test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
    # [[1], [1]] from DummyNet above, we can calculate the following values:
    # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
    # Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
    # (Here we use the second row of the kernel initializer above, since the
    # chosen action is now 1 instead of 0.)
    #
    # For the target Q-values here, note that since we've replaced 5 and 7 with
    # -5 and -7, it is better to use action 1 with a kernel of [1, 1] instead of
    # action 0 with a kernel of [2, 1].
    # Target Q-value for first next_observation: 1 * -5 + 1 * 6 + 1 = 2
    # Target Q-value for second next_observation: 1 * -7 + 1 * 8 + 1 = 2
    # TD targets: 10 + 0.9 * 2 = 11.8 and 20 + 0.9 * 2 = 21.8
    # TD errors: 11.8 - 5 = 6.8 and 21.8 - 8 = 13.8
    # TD loss: 6.3 and 13.3 (Huber loss subtracts 0.5)
    # Overall loss: (6.3 + 13.3) / 2 = 9.8
    expected_loss = 9.8
    loss, _ = agent._loss(experience)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllClose(self.evaluate(loss), expected_loss)
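
A plain-Python check of the comment above: with the negated observations, the greedy target action flips to action 1 (kernel row [1, 1]). It assumes the stated kernel [[2, 1], [1, 1]], bias 1, and a delta = 1 Huber loss:

def huber(error, delta=1.0):
    e = abs(error)
    return 0.5 * e * e if e <= delta else delta * (e - 0.5 * delta)

kernel, bias = [[2.0, 1.0], [1.0, 1.0]], 1.0     # assumed DummyNet initializers
q_chosen = [5.0, 8.0]                            # Q(s, a) for the chosen actions
next_obs = [[-5.0, 6.0], [-7.0, 8.0]]
# Greedy target Q-value: max over actions of (kernel row . next_obs) + bias.
q_next = [max(sum(w * o for w, o in zip(row, obs)) + bias for row in kernel)
          for obs in next_obs]                   # [2.0, 2.0], both via action 1
rewards, gamma = [10.0, 20.0], 0.9
td_errors = [r + gamma * qn - qc
             for r, qn, qc in zip(rewards, q_next, q_chosen)]   # [6.8, 13.8]
print(sum(huber(e) for e in td_errors) / len(td_errors))        # 9.8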
Example No. 8
    def testLoss(self, agent_class, run_mode):
        if tf.executing_eagerly() and run_mode == context.graph_mode:
            self.skipTest('b/123778560')
        with run_mode(), tf.compat.v2.summary.record_if(False):
            q_net = DummyNet(self._observation_spec, self._action_spec)
            agent = agent_class(self._time_step_spec,
                                self._action_spec,
                                q_network=q_net,
                                optimizer=None)

            observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
            time_steps = ts.restart(observations, batch_size=2)

            actions = [tf.constant([[0], [1]], dtype=tf.int32)]
            action_steps = policy_step.PolicyStep(actions)

            rewards = tf.constant([10, 20], dtype=tf.float32)
            discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
            next_observations = [
                tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
            ]
            next_time_steps = ts.transition(next_observations, rewards,
                                            discounts)

            experience = test_utils.stacked_trajectory_from_transition(
                time_steps, action_steps, next_time_steps)

            # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
            # [[1], [1]] from DummyNet above, we can calculate the following values:
            # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
            # Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
            # (Here we use the second row of the kernel initializer above, since the
            # chosen action is now 1 instead of 0.)
            # Q-value for first next_observation: 2 * 5 + 1 * 6 + 1 = 17
            # Q-value for second next_observation: 2 * 7 + 1 * 8 + 1 = 23
            # TD targets: 10 + 0.9 * 17 = 25.3 and 20 + 0.9 * 23 = 40.7
            # TD errors: 25.3 - 5 = 20.3 and 40.7 - 8 = 32.7
            # TD loss: 19.8 and 32.2 (Huber loss subtracts 0.5)
            # Overall loss: (19.8 + 32.2) / 2 = 26
            expected_loss = 26.0
            loss, _ = agent._loss(experience)

            self.evaluate(tf.compat.v1.initialize_all_variables())
            self.assertAllClose(self.evaluate(loss), expected_loss)
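
A plain-Python sketch reproducing the 26.0 in the comment from the stated kernel [[2, 1], [1, 1]] and bias 1, assuming a Huber loss with delta = 1 (so each TD error here just loses 0.5):

def huber(error, delta=1.0):
    e = abs(error)
    return 0.5 * e * e if e <= delta else delta * (e - 0.5 * delta)

kernel, bias = [[2.0, 1.0], [1.0, 1.0]], 1.0   # DummyNet initializers (from the comment)

def q(obs, action):
    # Linear Q-value: row `action` of the kernel dotted with the observation, plus bias.
    return sum(w * o for w, o in zip(kernel[action], obs)) + bias

q_chosen = [q([1.0, 2.0], 0), q([3.0, 4.0], 1)]                # [5.0, 8.0]
q_next = [max(q(obs, a) for a in (0, 1))
          for obs in ([5.0, 6.0], [7.0, 8.0])]                 # [17.0, 23.0]
rewards, gamma = [10.0, 20.0], 0.9
td_errors = [r + gamma * qn - qc
             for r, qn, qc in zip(rewards, q_next, q_chosen)]  # [20.3, 32.7]
print(sum(huber(e) for e in td_errors) / len(td_errors))       # 26.0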
Example No. 9
    def testUpdateTarget(self):
        agent = categorical_dqn_agent.CategoricalDqnAgent(
            self._time_step_spec, self._action_spec, self._categorical_net,
            self._optimizer)

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        actions = tf.constant([0, 1], dtype=tf.int32)
        action_steps = policy_step.PolicyStep(actions)
        experience = test_utils.stacked_trajectory_from_transition(
            time_steps, action_steps, time_steps)

        loss_info = agent._loss(experience)
        update_targets = agent._update_target()

        self.evaluate(tf.compat.v1.global_variables_initializer())
        losses = self.evaluate(loss_info).loss
        self.assertGreater(losses, 0.0)
        self.evaluate(update_targets)
Example No. 10
  def testLoss(self, agent_class):
    q_net = DummyNet(self._observation_spec, self._action_spec)
    agent = agent_class(
        self._time_step_spec,
        self._action_spec,
        q_network=q_net,
        optimizer=None)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)

    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
    next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
    next_time_steps = ts.transition(next_observations, rewards, discounts)

    experience = trajectories_test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
    # [[1], [1]] from DummyNet above, we can calculate the following values:
    # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
    # Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
    # (Here we use the second row of the kernel initializer above, since the
    # chosen action is now 1 instead of 0.)
    #
    # For target Q-values, action 0 produces a greater Q-value with a kernel of
    # [2, 1] instead of [1, 1] for action 1.
    # Target Q-value for first next_observation: 2 * 5 + 1 * 6 + 1 = 17
    # Target Q-value for second next_observation: 2 * 7 + 1 * 8 + 1 = 23
    # TD targets: 10 + 0.9 * 17 = 25.3 and 20 + 0.9 * 23 = 40.7
    # TD errors: 25.3 - 5 = 20.3 and 40.7 - 8 = 32.7
    # TD loss: 19.8 and 32.2 (Huber loss subtracts 0.5)
    # Overall loss: (19.8 + 32.2) / 2 = 26
    expected_loss = 26.0
    loss, _ = agent._loss(experience)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllClose(self.evaluate(loss), expected_loss)
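
The recurring remark "Huber loss subtracts 0.5" in these comments follows from the delta = 1 Huber loss, which is quadratic for |e| <= 1 and reduces to |e| - 0.5 beyond that; every TD error in these tests is well above 1. A minimal illustration, assuming delta = 1:

def huber(error, delta=1.0):
    # 0.5 * e**2 inside the quadratic region, delta * (|e| - 0.5 * delta) outside;
    # with delta = 1 this is simply |e| - 0.5 for |e| > 1.
    e = abs(error)
    return 0.5 * e * e if e <= delta else delta * (e - 0.5 * delta)

for e in (20.3, 32.7):
    print(e, '->', huber(e))                 # 20.3 -> 19.8, 32.7 -> 32.2
print((huber(20.3) + huber(32.7)) / 2)       # 26.0 == expected_loss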