Example #1
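A unit test for train_argspec: it builds a single-episode trajectory with trajectory.from_episode, adds a batch dimension via tf.nest.map_structure, and checks that agent.train accepts the declared extra tensor (including one with an additional leading dimension) while rejecting mismatched shapes or dtypes.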
 def testTrainArgspec(self):
     train_argspec = {
         'extra': tf.TensorSpec(dtype=tf.float32, shape=[3, 4])
     }
     agent = MyAgent(train_argspec=train_argspec)
     extra = tf.ones(shape=[3, 4], dtype=tf.float32)
     experience = tf.nest.map_structure(
         lambda x: x[tf.newaxis, ...],
         trajectory.from_episode(observation={'obs': tf.constant([1.0])},
                                 action=(),
                                 policy_info=(),
                                 reward=tf.constant([1.0])))
     loss_info = agent.train(experience, extra=extra)
     tf.nest.map_structure(self.assertAllEqual, (experience, extra),
                           loss_info.extra)
     extra_newdim = tf.ones(shape=[2, 3, 4], dtype=tf.float32)
     loss_info_newdim = agent.train(experience, extra=extra_newdim)
     self.assertAllEqual(loss_info_newdim.extra[1], extra_newdim)
     with self.assertRaisesRegex(ValueError,
                                 'Inconsistent dtypes or shapes between'):
         agent.train(experience,
                     extra=tf.ones(shape=[3, 5], dtype=tf.float32))
     with self.assertRaisesRegex(ValueError,
                                 'Inconsistent dtypes or shapes between'):
         agent.train(experience,
                     extra=tf.ones(shape=[3, 4], dtype=tf.int32))
Example #2
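Verifies that from_episode handles a composite tensor observation (a tf.SparseTensor) and still produces the expected step types, reward, and all-ones discount.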
 def testFromEpisodeWithCompositeTensorOfTensors(self):
     observation = tf.SparseTensor(
         indices=tf.random.uniform((7, 2), maxval=9, dtype=tf.int64),
         values=tf.random.uniform((7, )),
         dense_shape=[4, 10])  # The 4 is important: it must match the reward length.
     action = ()
     policy_info = ()
     reward = tf.random.uniform((4, ))
     traj = trajectory.from_episode(observation,
                                    action,
                                    policy_info,
                                    reward,
                                    discount=None)
     self.assertTrue(tf.is_tensor(traj.step_type))
     traj_val, obs_val, reward_val = self.evaluate(
         (traj, observation, reward))
     first = ts.StepType.FIRST
     mid = ts.StepType.MID
     last = ts.StepType.LAST
     self.assertAllEqual(traj_val.step_type, [first, mid, mid, mid])
     self.assertAllEqual(traj_val.next_step_type, [mid, mid, mid, last])
     self.assertAllClose(traj_val.observation, obs_val)
     self.assertAllEqual(traj_val.reward, reward_val)
     self.assertAllEqual(traj_val.discount, [1.0, 1.0, 1.0, 1.0])
Example #3
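Checks that a ValueError is raised when the LossInfo structures returned by train() and loss() do not match.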
    def testLossNotMatching(self):
        class MyAgentWithLossNotMatching(MyAgent):
            def _loss(self, experience, weights=None, extra=None):
                return tf_agent.LossInfo(loss=(), extra=(experience, ()))

        train_argspec = {
            'extra': tf.TensorSpec(dtype=tf.float32, shape=[3, 4])
        }
        agent = MyAgentWithLossNotMatching(train_argspec=train_argspec)
        extra = tf.ones(shape=[3, 4], dtype=tf.float32)
        experience = tf.nest.map_structure(
            lambda x: x[tf.newaxis, ...],
            trajectory.from_episode(observation={'obs': tf.constant([1.0])},
                                    action=(),
                                    policy_info=(),
                                    reward=tf.constant([1.0])))

        with self.assertRaisesRegex(
                ValueError,
                r'.*`LossInfo` from train\(\) and `LossInfo` from loss\(\) do not have '
                'matching structures.*'):
            test_util.test_loss_and_train_output(test=self,
                                                 expect_equal_loss_values=True,
                                                 agent=agent,
                                                 experience=experience,
                                                 extra=extra)
Example #4
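A driver-style run method that collects observations, actions, policy info, and rewards over each episode, stacks them, builds a trajectory with trajectory.from_episode, and passes it to the registered observers.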
    def run(self, time_step, policy_state=()):
        """Run policy in environment given initial time_step and policy_state.
        Args:
            time_step: The initial time_step.
            policy_state: The initial policy_state.
        Returns:
            A tuple (final time_step, final policy_state).
        """
        for _ in range(self._max_episodes):
            time_step = self.env.reset()
            policy_state = self.policy.get_initial_state()

            observation = []
            action = []
            policy_info = []
            reward = []

            while not self.env.done:
                action_step = self.policy.action(time_step, policy_state)
                if self.env.debug:
                    self.env.visualize(action_step.action, action_step.info)
                next_time_step = self.env.step(action_step.action)
                next_policy_state = action_step.state

                if len(self.observers) > 0:
                    observation.append(time_step.observation)
                    action.append(action_step.action)
                    policy_info.append(action_step.info)
                    reward.append(next_time_step.reward)

                time_step = next_time_step
                policy_state = next_policy_state

            if len(self.observers) > 0:
                # TODO: Find a better way than repeating the last action.
                observation.append(time_step.observation)
                action.append(action_step.action)
                policy_info.append(action_step.info)
                reward.append(next_time_step.reward)

                observation = stack_nested_arrays(observation)
                action = stack_nested_arrays(action)
                policy_info = stack_nested_arrays(policy_info)
                reward = stack_nested_arrays(reward)

                traj = trajectory.from_episode(observation, action,
                                               policy_info, reward)

                for observer in self.observers:
                    observer(traj)

        return time_step, policy_state
Example #5
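A minimal test that builds a batched single-episode trajectory and compares the loss values returned by loss() and train() through test_util.test_loss_and_train_output.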
 def testLoss(self):
     agent = MyAgent()
     extra = tf.ones(shape=[3, 4], dtype=tf.float32)
     experience = tf.nest.map_structure(
         lambda x: x[tf.newaxis, ...],
         trajectory.from_episode(observation={'obs': tf.constant([1.0])},
                                 action=(),
                                 policy_info=(),
                                 reward=tf.constant([1.0])))
     test_util.test_loss_and_train_output(test=self,
                                          expect_equal_loss_values=True,
                                          agent=agent,
                                          experience=experience,
                                          extra=extra)
Example #6
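A helper that converts an array of recorded experience rows into a trajectory; the observation, action, reward, discount, and policy info are all derived from columns of the input array.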
import numpy as np
from tf_agents.trajectories.trajectory import from_episode


def experience_to_traj(rlt):
    """Converts an array of recorded experience rows into a single-episode trajectory."""
    rlt = np.array(rlt)
    d = rlt[:, 2]  # Observation: values in column 2.
    v = rlt[:, 3] / 0.001
    v = v.astype(np.int32)  # Action: column 3 rescaled by 1/0.001 and cast to int32.

    discount = np.ones_like(v) * 0.99  # Constant per-step discount of 0.99.
    policy_info = rlt[:, 4]
    reward = -np.abs(d - 1.22)  # Negative absolute deviation of column 2 from 1.22.
    traj = from_episode(observation=d,
                        action=v,
                        reward=reward,
                        discount=discount,
                        policy_info=policy_info)
    return traj
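A quick usage sketch with hypothetical input rows (the row layout and values below are assumed for illustration; only columns 2-4 are read by experience_to_traj):

rows = [
    (0.0, 0.0, 1.20, 0.002, 0.5),
    (0.0, 0.0, 1.25, 0.003, 0.5),
    (0.0, 0.0, 1.22, 0.001, 0.5),
]
traj = experience_to_traj(rows)
# traj.reward is the negative absolute deviation of column 2 from 1.22
# (approximately [-0.02, -0.03, 0.0]) and traj.discount is 0.99 for every step.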
Example #7
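Confirms that observation entries not covered by the agent's specs are ignored by train(): the 'ignored' key is dropped before comparing against loss_info.extra.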
 def testTrainIgnoresExtraFields(self):
   agent = MyAgent()
   extra = tf.ones(shape=[3, 4], dtype=tf.float32)
   experience = tf.nest.map_structure(
       lambda x: x[tf.newaxis, ...],
       trajectory.from_episode(
           observation={
               'obs': tf.constant([1.0]), 'ignored': tf.constant([2.0])},
           action=(),
           policy_info=(),
           reward=tf.constant([1.0])))
   loss_info = agent.train(experience, extra=extra)
   reduced_experience = experience._replace(
       observation=copy.copy(experience.observation))
   del reduced_experience.observation['ignored']
   tf.nest.map_structure(
       self.assertAllEqual, (reduced_experience, extra), loss_info.extra)
Example #8
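Builds a trajectory from NumPy arrays; with discount=None, from_episode returns NumPy outputs with FIRST/MID/MID/MID step types and an all-ones discount.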
 def testFromEpisodeArray(self):
   observation = np.random.rand(4, 5)
   action = ()
   policy_info = ()
   reward = np.random.rand(4)
   traj = trajectory.from_episode(
       observation, action, policy_info, reward, discount=None)
   self.assertFalse(tf.is_tensor(traj.step_type))
   first = ts.StepType.FIRST
   mid = ts.StepType.MID
   last = ts.StepType.LAST
   self.assertAllEqual(
       traj.step_type, [first, mid, mid, mid])
   self.assertAllEqual(
       traj.next_step_type, [mid, mid, mid, last])
   self.assertAllEqual(traj.observation, observation)
   self.assertAllEqual(traj.reward, reward)
   self.assertAllEqual(traj.discount, [1.0, 1.0, 1.0, 1.0])
Example #9
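The same check as the previous example, but with tf.Tensor inputs, so the trajectory fields are tensors and must be evaluated before comparison.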
 def testFromEpisodeTensor(self):
   observation = tf.random.uniform((4, 5))
   action = ()
   policy_info = ()
   reward = tf.random.uniform((4,))
   traj = trajectory.from_episode(
       observation, action, policy_info, reward, discount=None)
   self.assertTrue(tf.is_tensor(traj.step_type))
   traj_val, obs_val, reward_val = self.evaluate((traj, observation, reward))
   first = ts.StepType.FIRST
   mid = ts.StepType.MID
   last = ts.StepType.LAST
   self.assertAllEqual(
       traj_val.step_type, [first, mid, mid, mid])
   self.assertAllEqual(
       traj_val.next_step_type, [mid, mid, mid, last])
   self.assertAllEqual(traj_val.observation, obs_val)
   self.assertAllEqual(traj_val.reward, reward_val)
   self.assertAllEqual(traj_val.discount, [1.0, 1.0, 1.0, 1.0])
Example #10
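A parser that decodes a serialized tf.train.SequenceExample into observation, action, reward, and policy-info tensors and reassembles them into a full trajectory with trajectory.from_episode.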
    def _parser_fn(serialized_proto):
        """Helper function that is returned by create_`parser_fn`."""
        # Even though context features don't change from frame to frame, we copy
        # them through at each frame, so they are treated as sequence features
        # and stored in the feature list.
        context_features = {}
        # pylint: disable=g-complex-comprehension
        sequence_features = dict(
            (tensor_spec.name,
             tf.io.FixedLenSequenceFeature(shape=tensor_spec.shape,
                                           dtype=tensor_spec.dtype))
            for tensor_spec in time_step_spec.observation.values())
        sequence_features[action_spec.name] = tf.io.FixedLenSequenceFeature(
            shape=action_spec.shape, dtype=action_spec.dtype)
        sequence_features[
            time_step_spec.reward.name] = tf.io.FixedLenSequenceFeature(
                shape=time_step_spec.reward.shape,
                dtype=time_step_spec.reward.dtype)
        sequence_features.update(
            _get_policy_info_parsing_dict(agent_name, action_spec))

        # pylint: enable=g-complex-comprehension
        with tf.name_scope('parse'):
            _, parsed_sequence = tf.io.parse_single_sequence_example(
                serialized_proto,
                context_features=context_features,
                sequence_features=sequence_features)
            # TODO(yundi): make the transformed reward configurable.
            action = parsed_sequence[action_spec.name]
            reward = tf.cast(parsed_sequence[time_step_spec.reward.name],
                             tf.float32)

            policy_info = _process_parsed_sequence_and_get_policy_info(
                parsed_sequence, agent_name, action_spec)

            del parsed_sequence[time_step_spec.reward.name]
            del parsed_sequence[action_spec.name]
            full_trajectory = trajectory.from_episode(
                observation=parsed_sequence,
                action=action,
                policy_info=policy_info,
                reward=reward)
            return full_trajectory