Example No. 1
    def testBatch(self, metric_class, expected_result):
        metric = metric_class()

        metric(
            nest_utils.stack_nested_arrays([
                trajectory.boundary((), (), (), 0., 1.),
                trajectory.boundary((), (), (), 0., 1.)
            ]))
        metric(
            nest_utils.stack_nested_arrays([
                trajectory.first((), (), (), 1., 1.),
                trajectory.first((), (), (), 1., 1.)
            ]))
        metric(
            nest_utils.stack_nested_arrays([
                trajectory.mid((), (), (), 2., 1.),
                trajectory.last((), (), (), 3., 0.)
            ]))
        metric(
            nest_utils.stack_nested_arrays([
                trajectory.last((), (), (), 3., 0.),
                trajectory.boundary((), (), (), 0., 1.)
            ]))
        metric(
            nest_utils.stack_nested_arrays([
                trajectory.boundary((), (), (), 0., 1.),
                trajectory.first((), (), (), 1., 1.)
            ]))
        self.assertEqual(expected_result, metric.result())
Example No. 2
    def testBatchSizeProvided(self, metric_class, expected_result):
        metric = metric_class(batch_size=2)

        metric(
            nest_utils.stack_nested_arrays([
                trajectory.boundary((), (), (), 0., 1.),
                trajectory.boundary((), (), (), 0., 1.)
            ]))
        metric(
            nest_utils.stack_nested_arrays([
                trajectory.first((), (), (), 1., 1.),
                trajectory.first((), (), (), 1., 1.)
            ]))
        metric(
            nest_utils.stack_nested_arrays([
                trajectory.mid((), (), (), 2., 1.),
                trajectory.last((), (), (), 3., 0.)
            ]))
        metric(
            nest_utils.stack_nested_arrays([
                trajectory.last((), (), (), 3., 0.),
                trajectory.boundary((), (), (), 0., 1.)
            ]))
        metric(
            nest_utils.stack_nested_arrays([
                trajectory.boundary((), (), (), 0., 1.),
                trajectory.first((), (), (), 1., 1.)
            ]))
        self.assertEqual(expected_result, metric.result())
Example No. 3
    def _create_trajectories(self):
        # Order of args for trajectory methods:
        # observation, action, policy_info, reward, discount
        ts0 = nest_utils.stack_nested_tensors([
            trajectory.boundary((), (), (), 0., 1.),
            trajectory.boundary((), (), (), 0., 1.)
        ])
        ts1 = nest_utils.stack_nested_tensors([
            trajectory.first((), (), (), 1., 1.),
            trajectory.first((), (), (), 2., 1.)
        ])
        ts2 = nest_utils.stack_nested_tensors([
            trajectory.last((), (), (), 3., 1.),
            trajectory.last((), (), (), 4., 1.)
        ])
        ts3 = nest_utils.stack_nested_tensors([
            trajectory.boundary((), (), (), 0., 1.),
            trajectory.boundary((), (), (), 0., 1.)
        ])
        ts4 = nest_utils.stack_nested_tensors([
            trajectory.first((), (), (), 5., 1.),
            trajectory.first((), (), (), 6., 1.)
        ])
        ts5 = nest_utils.stack_nested_tensors([
            trajectory.last((), (), (), 7., 1.),
            trajectory.last((), (), (), 8., 1.)
        ])

        return [ts0, ts1, ts2, ts3, ts4, ts5]
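
The stacked trajectories built above are normally consumed one batched step at a time by a batched metric. The standalone sketch below shows that pattern under eager execution; the choice of tf_agents.metrics.tf_metrics.AverageReturnMetric with batch_size=2 is an illustrative assumption, not part of the original test.

# Sketch (assumed usage): feed batched trajectory steps to a TF metric.
# AverageReturnMetric and batch_size=2 are illustrative choices.
from tf_agents.metrics import tf_metrics
from tf_agents.trajectories import trajectory
from tf_agents.utils import nest_utils

metric = tf_metrics.AverageReturnMetric(batch_size=2)
steps = [
    nest_utils.stack_nested_tensors([
        trajectory.first((), (), (), 1., 1.),
        trajectory.first((), (), (), 2., 1.)
    ]),
    nest_utils.stack_nested_tensors([
        trajectory.last((), (), (), 3., 1.),
        trajectory.last((), (), (), 4., 1.)
    ]),
]
for batched_step in steps:
    metric(batched_step)  # accumulates reward for each batch element
print(metric.result())  # average return over both episodes: (4.0 + 6.0) / 2 = 5.0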
Example No. 4
    def setUp(self):
        super(BatchedPyMetricTest, self).setUp()
        # Order of args for trajectory methods:
        # observation, action, policy_info, reward, discount
        ts0 = nest_utils.stack_nested_tensors([
            trajectory.boundary((), (), (), 0., 1.),
            trajectory.boundary((), (), (), 0., 1.)
        ])
        ts1 = nest_utils.stack_nested_tensors([
            trajectory.first((), (), (), 1., 1.),
            trajectory.first((), (), (), 2., 1.)
        ])
        ts2 = nest_utils.stack_nested_tensors([
            trajectory.last((), (), (), 3., 1.),
            trajectory.last((), (), (), 4., 1.)
        ])
        ts3 = nest_utils.stack_nested_tensors([
            trajectory.boundary((), (), (), 0., 1.),
            trajectory.boundary((), (), (), 0., 1.)
        ])
        ts4 = nest_utils.stack_nested_tensors([
            trajectory.first((), (), (), 5., 1.),
            trajectory.first((), (), (), 6., 1.)
        ])
        ts5 = nest_utils.stack_nested_tensors([
            trajectory.last((), (), (), 7., 1.),
            trajectory.last((), (), (), 8., 1.)
        ])

        self._ts = [ts0, ts1, ts2, ts3, ts4, ts5]
Example No. 5
 def setUp(self):
   super(PyDriverTest, self).setUp()
   f0 = np.array(0., dtype=np.float32)
   f1 = np.array(1., dtype=np.float32)
   # Order of args for trajectory methods:
   # (observation, action, policy_info, reward, discount)
   self._trajectories = [
       trajectory.first(0, 1, 2, f1, f1),
       trajectory.last(1, 2, 4, f1, f0),
       trajectory.boundary(3, 1, 2, f0, f1),
       trajectory.first(0, 1, 2, f1, f1),
       trajectory.last(1, 2, 4, f1, f0),
       trajectory.boundary(3, 1, 2, f0, f1),
       trajectory.first(0, 1, 2, f1, f1),
   ]
Example No. 6
    def testLoss(self):
        cloning_net = DummyNet(self._observation_spec, self._action_spec)
        agent = behavioral_cloning_agent.BehavioralCloningAgent(
            self._time_step_spec,
            self._action_spec,
            cloning_network=cloning_net,
            optimizer=None)

        observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
        actions = [tf.constant([0, 1], dtype=tf.int32)]
        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

        experience = trajectory.first(observation=observations,
                                      action=actions,
                                      policy_info=(),
                                      reward=rewards,
                                      discount=discounts)
        loss_info = agent._loss(experience)
        total_loss = loss_info.loss

        expected_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=cloning_net(observations)[0], labels=actions[0]))

        self.evaluate(tf.global_variables_initializer())
        self.assertAllClose(total_loss, expected_loss)
Example No. 7
 def testZeroEpisodes(self, metric_class, expected_result):
     metric = metric_class()
     # Order of args for trajectory methods:
     # observation, action, policy_info, reward, discount
     metric(trajectory.boundary((), (), (), 0., 1.))
     metric(trajectory.first((), (), (), 1., 1.))
     self.assertEqual(expected_result, metric.result())
Example No. 8
    def testAverageOneEpisodeWithReset(self, metric_class, expected_result):
        metric = metric_class()

        metric(trajectory.first((), (), (), 0., 1.))
        metric(trajectory.mid((), (), (), 1., 1.))
        metric(trajectory.mid((), (), (), 2., 1.))
        # The episode is reset.
        #
        # This could happen when using the dynamic_episode_driver with
        # parallel_py_environment. When the parallel episodes are of different
        # lengths and num_episodes is reached, some episodes would be left in "MID".
        # When the driver runs again, all environments are reset at the beginning
        # of the tf.while_loop and the unfinished episodes would get "FIRST" without
        # seeing "LAST".
        metric(trajectory.first((), (), (), 3., 1.))
        metric(trajectory.last((), (), (), 4., 1.))
        self.assertEqual(expected_result, metric.result())
Example No. 9
 def testFirstArrays(self):
   observation = ()
   action = ()
   policy_info = ()
   reward = np.array([1.0, 1.0, 2.0])
   discount = np.array([1.0, 1.0, 1.0])
   traj = trajectory.first(observation, action, policy_info, reward, discount)
   self.assertFalse(tf.contrib.framework.is_tensor(traj.step_type))
   self.assertAllEqual(traj.step_type, [ts.StepType.FIRST] * 3)
   self.assertAllEqual(traj.next_step_type, [ts.StepType.MID] * 3)
Example No. 10
 def testFirstTensors(self):
   observation = ()
   action = ()
   policy_info = ()
   reward = tf.constant([1.0, 1.0, 2.0])
   discount = tf.constant([1.0, 1.0, 1.0])
   traj = trajectory.first(observation, action, policy_info, reward, discount)
   self.assertTrue(tf.contrib.framework.is_tensor(traj.step_type))
   traj_val = self.evaluate(traj)
   self.assertAllEqual(traj_val.step_type, [ts.StepType.FIRST] * 3)
   self.assertAllEqual(traj_val.next_step_type, [ts.StepType.MID] * 3)
Example No. 11
    def testAverageTwoEpisode(self, metric_class, expected_result):
        metric = metric_class()

        metric(trajectory.boundary((), (), (), 0., 1.))
        metric(trajectory.first((), (), (), 1., 1.))
        metric(trajectory.mid((), (), (), 2., 1.))
        metric(trajectory.last((), (), (), 3., 0.))
        metric(trajectory.boundary((), (), (), 0., 1.))

        # TODO(kbanoop): Add optional next_step_type arg to trajectory.first. Or
        # implement trajectory.first_last().
        metric(
            trajectory.Trajectory(ts.StepType.FIRST, (), (), (),
                                  ts.StepType.LAST, -6., 1.))

        self.assertEqual(expected_result, metric.result())
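
The TODO above describes a one-step episode whose step_type is FIRST and whose next_step_type is LAST. A helper along those lines, built directly on trajectory.Trajectory, might look like the sketch below; the name first_last is hypothetical rather than an API used in this test, and the import paths are assumed for a recent TF-Agents.

# Hypothetical helper sketching the trajectory.first_last() idea from the TODO:
# a single-step episode that starts and ends in one transition.
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory


def first_last(observation, action, policy_info, reward, discount):
    # Field order of trajectory.Trajectory:
    # (step_type, observation, action, policy_info, next_step_type, reward, discount)
    return trajectory.Trajectory(ts.StepType.FIRST, observation, action,
                                 policy_info, ts.StepType.LAST, reward, discount)


# Equivalent to the explicit Trajectory constructed in the test above.
single_step = first_last((), (), (), -6., 1.)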
Example No. 12
def trajectory_first(observation):
    return trajectory.first(observation=observation,
                            action=1,
                            policy_info=(),
                            reward=np.array(1, dtype=np.float32),
                            discount=1.0)
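
A brief usage sketch of the helper above; the numpy observation value and the import paths are illustrative assumptions shown here for completeness.

# Illustrative usage of the trajectory_first helper above (assumed imports).
import numpy as np

from tf_agents.trajectories import trajectory

traj = trajectory_first(observation=np.array([0.5], dtype=np.float32))
# traj.step_type is StepType.FIRST and traj.next_step_type is StepType.MID.
print(traj.reward)  # 1.0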