Example #1
    def testLoss(self, agent_class):
        with tf.compat.v2.summary.record_if(False):
            q_net = test_utils.DummyNet(self._observation_spec,
                                        self._action_spec)
            agent = agent_class(self._time_step_spec,
                                self._action_spec,
                                q_network=q_net,
                                optimizer=None)

            observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
            time_steps = ts.restart(observations, batch_size=2)

            actions = [tf.constant([[0], [1]], dtype=tf.int32)]

            rewards = tf.constant([10, 20], dtype=tf.float32)
            discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
            next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
            next_time_steps = ts.transition(next_observations, rewards,
                                            discounts)

            expected_loss = 26.0
            loss_info = agent._loss(time_steps, actions, next_time_steps)
            self.evaluate(tf.compat.v1.initialize_all_variables())
            total_loss, _ = self.evaluate(loss_info)

            self.assertAllClose(total_loss, expected_loss)
Example #2
  def testCategoricalActions(self, action_probs):
    action_spec = [
        tensor_spec.BoundedTensorSpec((1,), tf.int32, 0, len(action_probs)-1),
        tensor_spec.BoundedTensorSpec((), tf.int32, 0, len(action_probs)-1)]
    wrapped_policy = DistributionPolicy([
        tfp.distributions.Categorical(probs=[action_probs]),
        tfp.distributions.Categorical(probs=action_probs)
    ], self._time_step_spec, action_spec)
    policy = greedy_policy.GreedyPolicy(wrapped_policy)

    self.assertEqual(policy.time_step_spec(), self._time_step_spec)
    self.assertEqual(policy.action_spec(), action_spec)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_step = ts.restart(observations, batch_size=2)
    action_step = policy.action(time_step)
    nest.assert_same_structure(action_spec, action_step.action)

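    # GreedyPolicy returns the mode of each wrapped distribution, which for a
    # Categorical is the argmax of action_probs.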
    action_ = self.evaluate(action_step.action)
    self.assertEqual(action_[0][0], np.argmax(action_probs))
    self.assertEqual(action_[1], np.argmax(action_probs))
    self.assertAllEqual(action_[0].shape, [
        1,
    ] + action_spec[0].shape.as_list())
    self.assertAllEqual(action_[1].shape, [
        1,
    ] + action_spec[1].shape.as_list())
Example #3
    def testCriticLoss(self):
        agent = ddpg_agent.DdpgAgent(
            self._time_step_spec,
            self._action_spec,
            actor_network=self._unbounded_actor_net,
            critic_network=self._critic_net,
            actor_optimizer=None,
            critic_optimizer=None,
        )

        observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
        time_steps = ts.restart(observations, batch_size=2)

        actions = [tf.constant([[5], [6]], dtype=tf.float32)]

        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
        next_observations = [tf.constant([[5, 6], [7, 8]], dtype=tf.float32)]
        next_time_steps = ts.transition(next_observations, rewards, discounts)

        expected_loss = 59.6
        loss = agent.critic_loss(time_steps, actions, next_time_steps)

        self.evaluate(tf.global_variables_initializer())
        loss_ = self.evaluate(loss)
        self.assertAllClose(loss_, expected_loss)
Example #4
    def testPolicyGradientLoss(self):
        actor_net = DummyActorNet(self._action_spec)
        agent = ppo_agent.PPOAgent(
            self._time_step_spec,
            self._action_spec,
            tf.train.AdamOptimizer(),
            normalize_observations=False,
            normalize_rewards=False,
            actor_net=actor_net,
            importance_ratio_clipping=10.0,
        )

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        actions = tf.constant([[0], [1]], dtype=tf.float32)
        sample_action_log_probs = tf.constant([0.9, 0.3], dtype=tf.float32)
        advantages = tf.constant([1.9, 1.0], dtype=tf.float32)
        weights = tf.ones_like(advantages)

        current_policy_distribution, unused_network_state = actor_net(
            time_steps.observation, time_steps.step_type, ())

        expected_loss = -0.0164646133
        loss = agent.policy_gradient_loss(time_steps, actions,
                                          sample_action_log_probs, advantages,
                                          current_policy_distribution, weights)

        self.evaluate(tf.global_variables_initializer())
        loss_ = self.evaluate(loss)
        self.assertAllClose(loss_, expected_loss)
Example #5
    def testPolicyGradientLoss(self):
        if tf.executing_eagerly():
            self.skipTest('b/123777433')

        agent = reinforce_agent.ReinforceAgent(
            self._time_step_spec,
            self._action_spec,
            actor_network=DummyActorNet(self._obs_spec,
                                        self._action_spec,
                                        unbounded_actions=True),
            optimizer=None,
        )

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        actions = tf.constant([[0], [1]], dtype=tf.float32)
        actions_distribution = agent.collect_policy.distribution(
            time_steps).action
        returns = tf.constant([1.9, 1.0], dtype=tf.float32)

        expected_loss = 10.983667373657227
        loss = agent.policy_gradient_loss(actions_distribution, actions,
                                          time_steps.is_last(), returns)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        loss_ = self.evaluate(loss)
        self.assertAllClose(loss_, expected_loss)
Example #6
  def testDeferredBatchingAction(self):
    if tf.executing_eagerly():
      self.skipTest('b/123770140')

    # Construct policy without providing batch_size.
    tf_policy = q_policy.QPolicy(
        self._time_step_spec,
        self._action_spec,
        q_network=DummyNet(stateful=False))
    policy = py_tf_policy.PyTFPolicy(tf_policy)

    # But time_steps have batch_size of 5
    batch_size = 5
    single_observation = np.array([1, 2], dtype=np.float32)
    time_steps = [ts.restart(single_observation)] * batch_size
    time_steps = fast_map_structure(lambda *arrays: np.stack(arrays),
                                    *time_steps)

    with self.cached_session():
      self.evaluate(tf.compat.v1.global_variables_initializer())
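      # Even though no batch_size was supplied at construction time, action()
      # handles the stacked batch of 5 time_steps.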
      action_steps = policy.action(time_steps)
      self.assertEqual(action_steps.action.shape, (batch_size,))
      for a in action_steps.action:
        self.assertIn(a, (0, 1))
      self.assertAllEqual(action_steps.state, ())
Example #7
  def testDistributionRaisesNotImplementedError(self):
    mock_tf_py_policy = tf_py_policy.TFPyPolicy(
        self._get_mock_py_policy())
    observation = tf.ones([5], tf.float32)
    time_step = ts.restart(observation)
    with self.assertRaises(NotImplementedError):
      mock_tf_py_policy.distribution(time_step=time_step)
Example #8
    def testMismatchedDtypesListAction(self):
        with self.assertRaisesRegexp(TypeError,
                                     ".*dtype that doesn't match.*"):
            policy = TFPolicyMismatchedDtypesListAction()
            observation = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
            time_step = ts.restart(observation)
            policy.action(time_step)
Example #9
    def testAction(self, batch_size):
        if tf.executing_eagerly():
            self.skipTest('b/123770140')

        single_observation = np.array([1, 2], dtype=np.float32)
        time_steps = ts.restart(single_observation)
        if batch_size is not None:
            time_steps = [time_steps] * batch_size
            time_steps = fast_map_structure(lambda *arrays: np.stack(arrays),
                                            *time_steps)
        policy = py_tf_policy.PyTFPolicy(self._tf_policy)

        with self.cached_session():
            policy_state = policy.get_initial_state(batch_size)
            self.evaluate(tf.compat.v1.global_variables_initializer())
            action_steps = policy.action(time_steps, policy_state)
            self.assertEqual(action_steps.action.dtype, np.int32)
            if batch_size is None:
                self.assertEqual(action_steps.action.shape, ())
                self.assertIn(action_steps.action, (0, 1))
                self.assertEqual(action_steps.state, np.zeros([1, 1]))
            else:
                self.assertEqual(action_steps.action.shape, (batch_size, ))
                for a in action_steps.action:
                    self.assertIn(a, (0, 1))
                self.assertAllEqual(action_steps.state,
                                    np.zeros([batch_size, 1]))
Example #10
    def testInitializeRestoreAgent(self, agent_class, run_mode):
        if tf.executing_eagerly() and run_mode == context.graph_mode:
            self.skipTest('b/123778560')
        with run_mode():
            q_net = DummyNet(self._observation_spec, self._action_spec)
            agent = agent_class(self._time_step_spec,
                                self._action_spec,
                                q_network=q_net,
                                optimizer=None)
            observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
            time_steps = ts.restart(observations, batch_size=2)
            policy = agent.policy
            action_step = policy.action(time_steps)
            self.evaluate(tf.compat.v1.initialize_all_variables())

            checkpoint = tf.train.Checkpoint(agent=agent)

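            # Nothing has been saved under the temp dir, so latest_checkpoint
            # is None and initialize_or_restore() below simply initializes the
            # variables.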
            latest_checkpoint = tf.train.latest_checkpoint(self.get_temp_dir())
            checkpoint_load_status = checkpoint.restore(latest_checkpoint)

            if tf.executing_eagerly():
                self.evaluate(checkpoint_load_status.initialize_or_restore())
                self.assertAllEqual(self.evaluate(action_step.action),
                                    [[[0], [0]]])
            else:
                with self.cached_session() as sess:
                    checkpoint_load_status.initialize_or_restore(sess)
                    self.assertAllEqual(sess.run(action_step.action),
                                        [[[0], [0]]])
Example #11
    def testCriticLoss(self):
        if tf.executing_eagerly():
            self.skipTest('b/123772477')

        agent = td3_agent.Td3Agent(self._time_step_spec,
                                   self._action_spec,
                                   critic_network=self._critic_net,
                                   actor_network=self._unbounded_actor_net,
                                   actor_optimizer=None,
                                   critic_optimizer=None)

        observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
        time_steps = ts.restart(observations, batch_size=2)
        actions = [tf.constant([[5], [6]], dtype=tf.float32)]

        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
        next_observations = [tf.constant([[5, 6], [7, 8]], dtype=tf.float32)]
        next_time_steps = ts.transition(next_observations, rewards, discounts)

        expected_loss = 120.0912
        loss = agent.critic_loss(time_steps,
                                 actions,
                                 next_time_steps,
                                 seed=1234)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        loss_ = self.evaluate(loss)
        self.assertAllClose(loss_, expected_loss)
Example #12
    def testLoss(self, agent_class, run_mode):
        if tf.executing_eagerly() and run_mode == context.graph_mode:
            self.skipTest('b/123778560')
        with run_mode(), tf.compat.v2.summary.record_if(False):
            q_net = DummyNet(self._observation_spec, self._action_spec)
            agent = agent_class(self._time_step_spec,
                                self._action_spec,
                                q_network=q_net,
                                optimizer=None)

            observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
            time_steps = ts.restart(observations, batch_size=2)

            actions = [tf.constant([[0], [1]], dtype=tf.int32)]

            rewards = tf.constant([10, 20], dtype=tf.float32)
            discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
            next_observations = [
                tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
            ]
            next_time_steps = ts.transition(next_observations, rewards,
                                            discounts)

            expected_loss = 26.0
            loss, _ = agent.loss(time_steps, actions, next_time_steps)

            self.evaluate(tf.compat.v1.initialize_all_variables())
            self.assertAllClose(self.evaluate(loss), expected_loss)
Example #13
  def testUpdate(self):
    tf.compat.v1.set_random_seed(1)
    policy = q_policy.QPolicy(
        self._time_step_spec, self._action_spec, q_network=DummyNet())
    new_policy = q_policy.QPolicy(
        self._time_step_spec, self._action_spec, q_network=DummyNet())

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_step = ts.restart(observations, batch_size=2)

    self.assertEqual(policy.variables(), [])
    self.assertEqual(new_policy.variables(), [])

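    # Calling action() builds each policy's Q-network, which is what creates
    # the variables checked below.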
    action_step = policy.action(time_step, seed=1)
    new_action_step = new_policy.action(time_step, seed=1)

    self.assertEqual(len(policy.variables()), 2)
    self.assertEqual(len(new_policy.variables()), 2)
    self.assertEqual(action_step.action.shape, new_action_step.action.shape)
    self.assertEqual(action_step.action.dtype, new_action_step.action.dtype)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertEqual(self.evaluate(new_policy.update(policy)), None)

    self.assertAllEqual(self.evaluate(action_step.action), [[1], [1]])
    self.assertAllEqual(self.evaluate(new_action_step.action), [[1], [1]])
Example #14
    def testCriticLoss(self):
        agent = td3_agent.Td3Agent(self._time_step_spec,
                                   self._action_spec,
                                   critic_network=self._critic_net,
                                   actor_network=self._unbounded_actor_net,
                                   actor_optimizer=None,
                                   critic_optimizer=None)

        observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
        time_steps = ts.restart(observations, batch_size=2)
        actions = [tf.constant([[5], [6]], dtype=tf.float32)]

        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
        next_observations = [tf.constant([[5, 6], [7, 8]], dtype=tf.float32)]
        next_time_steps = ts.transition(next_observations, rewards, discounts)

        # TODO(kbanoop): The loss changed from 119.054 to 118.910903931. Is this
        # worth investigating?
        expected_loss = 118.9109
        loss = agent.critic_loss(time_steps, actions, next_time_steps)

        self.evaluate(tf.global_variables_initializer())
        loss_ = self.evaluate(loss)
        self.assertAllClose(loss_, expected_loss)
Example #15
    def testRestart(self):
        observation = tf.constant(-1)
        time_step = ts.restart(observation)
        time_step_ = self.evaluate(time_step)
        self.assertEqual(ts.StepType.FIRST, time_step_.step_type)
        self.assertEqual(-1, time_step_.observation)
        self.assertEqual(0.0, time_step_.reward)
        self.assertEqual(1.0, time_step_.discount)
Example #16
    def testRestart(self):
        observation = -1
        time_step = ts.restart(observation)

        self.assertEqual(ts.StepType.FIRST, time_step.step_type)
        self.assertEqual(-1, time_step.observation)
        self.assertEqual(0.0, time_step.reward)
        self.assertEqual(1.0, time_step.discount)
Example #17
    def testBatchRestart(self):
        obs_spec = [tensor_spec.TensorSpec([2], tf.float32)]
        time_step_spec = ts.time_step_spec(obs_spec)
        observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
        time_steps = ts.restart(observations, 2)
        time_step_batched = nest_utils.is_batched_nested_tensors(
            time_steps, time_step_spec)
        self.assertTrue(time_step_batched)
Example #18
    def testRestartBatched(self):
        observation = np.array([[-1], [-1]])
        time_step = ts.restart(observation, batch_size=2)

        self.assertItemsEqual([ts.StepType.FIRST] * 2, time_step.step_type)
        self.assertItemsEqual(observation, time_step.observation)
        self.assertItemsEqual([0.0, 0.0], time_step.reward)
        self.assertItemsEqual([1.0, 1.0], time_step.discount)
Example #19
    def _collect_step(self, time_step, metric_observers, train=False):
        """Run a single step (or 2 steps on life loss) in the environment."""
        if train:
            policy = self._collect_policy
        else:
            policy = self._eval_policy

        with self._action_timer:
            action_step = policy.action(time_step)
        with self._step_timer:
            next_time_step = self._env.step(action_step.action)
            traj = trajectory.from_transition(time_step, action_step,
                                              next_time_step)

        if next_time_step.is_last() and not self.game_over():
            traj = traj._replace(discount=np.array([1.0], dtype=np.float32))

        if train:
            self._store_to_rb(traj)

        # When AtariPreprocessing.terminal_on_life_loss is True, we receive LAST
        # time_steps when lives are lost but the game is not over. In this mode, the
        # replay buffer and agent's policy must see the life loss as a LAST step
        # and the subsequent step as a FIRST step. However, we do not want to
        # actually terminate the episode and metrics should be computed as if all
        # steps were MID steps, since life loss is not actually a terminal event
        # (it is mostly a trick to make it easier to propagate rewards backwards by
        # shortening episode durations from the agent's perspective).
        if next_time_step.is_last() and not self.game_over():
            # Update metrics as if this is a mid-episode step.
            next_time_step = ts.transition(next_time_step.observation,
                                           next_time_step.reward)
            self._observe(
                metric_observers,
                trajectory.from_transition(time_step, action_step,
                                           next_time_step))

            # Produce the next step as if this is the first step of an episode and
            # store to RB as such. The next_time_step will be a MID time step.
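            # Save the incoming reward before time_step is overwritten by
            # ts.restart() (which zeroes the reward); it is reused for the
            # metrics-facing MID step below.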
            reward = time_step.reward
            time_step = ts.restart(next_time_step.observation)
            with self._action_timer:
                action_step = policy.action(time_step)
            with self._step_timer:
                next_time_step = self._env.step(action_step.action)
            if train:
                self._store_to_rb(
                    trajectory.from_transition(time_step, action_step,
                                               next_time_step))

            # Update metrics as if this is a mid-episode step.
            time_step = ts.transition(time_step.observation, reward)
            traj = trajectory.from_transition(time_step, action_step,
                                              next_time_step)

        self._observe(metric_observers, traj)

        return next_time_step
Example #20
  def create_time_step(self):
    observation = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_step = ts.restart(observation)

    observation_spec = tensor_spec.TensorSpec(
        observation.shape.as_list(), tf.float32)
    time_step_spec = ts.time_step_spec(observation_spec)

    return time_step_spec, time_step
Example #21
  def testActionList(self):
    action_spec = [tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)]
    policy = q_policy.QPolicy(
        self._time_step_spec, action_spec, q_network=DummyNet())
    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_step = ts.restart(observations, batch_size=2)
    action_step = policy.action(time_step, seed=1)
    self.assertTrue(isinstance(action_step.action, list))
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllEqual(self.evaluate(action_step.action), [[[1], [1]]])
Example #22
    def reset(self):
        # TODO(oars): Upcoming update on gym adds **kwargs on reset. Update this to
        # support that.
        observation = self._gym_env.reset()
        self._info = None
        self._done = False

        if self._match_obs_space_dtype:
            observation = self._to_obs_space_dtype(observation)
        return ts.restart(observation)
Example #23
    def testMatchedDtypes(self):
        policy = TFPolicyMismatchedDtypes()

        # Overwrite the action_spec to match the dtype of _action.
        policy._action_spec = tensor_spec.BoundedTensorSpec([1], tf.int64, 0,
                                                            1)

        observation = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_step = ts.restart(observation)
        policy.action(time_step)
Example #24
    def testFixedPolicySingle(self):
        observations = tf.constant([1, 2], dtype=tf.float32)
        time_step = ts.restart(observations)
        action_step = self._policy.action(time_step)
        distribution_step = self._policy.distribution(time_step)
        mode = distribution_step.action.mode()

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllEqual(self.evaluate(action_step.action),
                            [self._fixed_action])
        self.assertAllEqual(self.evaluate(mode), [self._fixed_action])
Example #25
    def _step(self, action):
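        # Dummy environment step: the state cycles 0 -> 1 -> 2. Landing on 0
        # restarts the episode, landing on 2 terminates it with reward 1.0, and
        # anything else is a regular transition.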
        self._state = (self._state + 1) % 3
        self.steps += 1
        self.actions_taken.append(action)

        observation = [self._state]
        if self._state == 0:
            return ts.restart(observation)
        elif self._state == 2:
            self.episodes += 1
            return ts.termination(observation, reward=1.0)
        return ts.transition(observation, reward=0.0)
Example #26
  def testDistribution(self):
    tf.set_random_seed(1)
    policy = q_policy.QPolicy(
        self._time_step_spec, self._action_spec, q_network=DummyNet())

    observations = tf.constant([[1, 2]], dtype=tf.float32)
    time_step = ts.restart(observations, batch_size=1)
    distribution_step = policy.distribution(time_step)
    mode = distribution_step.action.mode()
    self.evaluate(tf.global_variables_initializer())
    # All the biases and weights of the fake network are positive, so the action
    # corresponding to the observation at index 1 will have the higher Q value.
    self.assertAllEqual(self.evaluate(mode), [1])
Example #27
    def setUp(self):
        super(EpsilonGreedyPolicyTest, self).setUp()
        self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
        self._time_step_spec = ts.time_step_spec(self._obs_spec)
        self._num_actions = 3
        self._greedy_action = 1
        self._action_spec = tensor_spec.BoundedTensorSpec(
            (1, ), tf.int32, 0, self._num_actions - 1)
        self._policy = fixed_policy.FixedPolicy(
            np.asarray([self._greedy_action], dtype=np.int32),
            self._time_step_spec, self._action_spec)
        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        self._time_step = ts.restart(observations, batch_size=2)
Example #28
  def testAction(self):
    tf.compat.v1.set_random_seed(1)
    policy = q_policy.QPolicy(
        self._time_step_spec, self._action_spec, q_network=DummyNet())

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_step = ts.restart(observations, batch_size=2)
    action_step = policy.action(time_step, seed=1)
    self.assertEqual(action_step.action.shape.as_list(), [2, 1])
    self.assertEqual(action_step.action.dtype, tf.int32)
    # Initialize all variables
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllEqual(self.evaluate(action_step.action), [[1], [1]])
Example #29
  def testDistribution(self):
    tf.compat.v1.set_random_seed(1)
    policy = q_policy.QPolicy(
        self._time_step_spec, self._action_spec, q_network=DummyNet())

    observations = tf.constant([[1, 2]], dtype=tf.float32)
    time_step = ts.restart(observations, batch_size=1)
    distribution_step = policy.distribution(time_step)
    mode = distribution_step.action.mode()
    self.evaluate(tf.compat.v1.global_variables_initializer())
    # The weights of index 0 are all 1 and the weights of index 1 are all 1.5,
    # so the Q values of index 1 will be higher.
    self.assertAllEqual([[1]], self.evaluate(mode))
Example #30
    def testFixedPolicyBatched(self):
        batch_size = 2
        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_step = ts.restart(observations, batch_size=batch_size)
        action_step = self._policy.action(time_step)
        distribution_step = self._policy.distribution(time_step)
        mode = distribution_step.action.mode()

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllEqual(self.evaluate(action_step.action),
                            [[self._fixed_action]] * batch_size)
        self.assertAllEqual(self.evaluate(mode),
                            [[self._fixed_action]] * batch_size)