Example #1
  def testBatchedEnvironment(self, max_steps, max_episodes, expected_length):

    expected_trajectories = [
        trajectory.Trajectory(
            step_type=np.array([0, 0]),
            observation=np.array([0, 0]),
            action=np.array([2, 1]),
            policy_info=np.array([4, 2]),
            next_step_type=np.array([1, 1]),
            reward=np.array([1., 1.]),
            discount=np.array([1., 1.])),
        trajectory.Trajectory(
            step_type=np.array([1, 1]),
            observation=np.array([2, 1]),
            action=np.array([1, 2]),
            policy_info=np.array([2, 4]),
            next_step_type=np.array([2, 1]),
            reward=np.array([1., 1.]),
            discount=np.array([0., 1.])),
        trajectory.Trajectory(
            step_type=np.array([2, 1]),
            observation=np.array([3, 3]),
            action=np.array([2, 1]),
            policy_info=np.array([4, 2]),
            next_step_type=np.array([0, 2]),
            reward=np.array([0., 1.]),
            discount=np.array([1., 0.]))
    ]

    env1 = driver_test_utils.PyEnvironmentMock(final_state=3)
    env2 = driver_test_utils.PyEnvironmentMock(final_state=4)
    env = batched_py_environment.BatchedPyEnvironment([env1, env2])

    policy = driver_test_utils.PyPolicyMock(
        env.time_step_spec(),
        env.action_spec(),
        initial_policy_state=np.array([1, 2]))
    replay_buffer_observer = MockReplayBufferObserver()

    driver = py_driver.PyDriver(
        env,
        policy,
        observers=[replay_buffer_observer],
        max_steps=max_steps,
        max_episodes=max_episodes,
    )
    initial_time_step = env.reset()
    initial_policy_state = policy.get_initial_state()
    driver.run(initial_time_step, initial_policy_state)
    trajectories = replay_buffer_observer.gather_all()

    self.assertEqual(
        len(trajectories), len(expected_trajectories[:expected_length]))

    for t1, t2 in zip(trajectories, expected_trajectories[:expected_length]):
      for t1_field, t2_field in zip(t1, t2):
        self.assertAllEqual(t1_field, t2_field)
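
Outside a test, the same collection pattern can be driven end to end. The sketch below is illustrative only: it assumes TF-Agents' `suite_gym` loader and `RandomPyPolicy` in place of the mocks above, and uses a plain Python list as the observer.

from tf_agents.drivers import py_driver
from tf_agents.environments import suite_gym
from tf_agents.policies import random_py_policy

# Hypothetical setup: any PyEnvironment / PyPolicy pair works here.
env = suite_gym.load('CartPole-v0')
policy = random_py_policy.RandomPyPolicy(env.time_step_spec(),
                                         env.action_spec())

replay = []  # Each observer is a callable that receives a Trajectory.
driver = py_driver.PyDriver(
    env, policy, observers=[replay.append], max_steps=10)
driver.run(env.reset())  # replay now holds the collected single-step trajectories.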
Example #2
    def testToTransition(self):
        first = ts.StepType.FIRST
        mid = ts.StepType.MID
        last = ts.StepType.LAST

        # Define a batch size 1, 3-step trajectory.
        traj = trajectory.Trajectory(
            step_type=np.array([[first, mid, last]]),
            next_step_type=np.array([[mid, last, first]]),
            observation=np.array([[10.0, 20.0, 30.0]]),
            action=np.array([[11.0, 22.0, 33.0]]),
            # reward at step 0 is an invalid dummy reward.
            reward=np.array([[0.0, 1.0, 2.0]]),
            discount=np.array([[1.0, 1.0, 0.0]]),
            policy_info=np.array([[1.0, 2.0, 3.0]]))

        time_steps, policy_steps, next_time_steps = trajectory.to_transition(
            traj)

        self.assertAllEqual(time_steps.step_type, np.array([[first, mid]]))
        self.assertAllEqual(time_steps.observation, np.array([[10.0, 20.0]]))

        self.assertAllEqual(next_time_steps.step_type, np.array([[mid, last]]))
        self.assertAllEqual(next_time_steps.observation,
                            np.array([[20.0, 30.0]]))
        self.assertAllEqual(next_time_steps.reward, np.array([[0.0, 1.0]]))
        self.assertAllEqual(next_time_steps.discount, np.array([[1.0, 1.0]]))

        self.assertAllEqual(policy_steps.action, np.array([[11.0, 22.0]]))
        self.assertAllEqual(policy_steps.info, np.array([[1.0, 2.0]]))
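
To summarize the alignment these assertions check: for a length-`T` trajectory, `to_transition` yields `T - 1` transitions, pairing each observation/action with the observation that follows it. A condensed sketch of the slicing implied by the test above:

time_steps, policy_steps, next_time_steps = trajectory.to_transition(traj)
# time_steps.observation       -> traj.observation[:, :-1]  ([[10., 20.]])
# policy_steps.action          -> traj.action[:, :-1]       ([[11., 22.]])
# next_time_steps.observation  -> traj.observation[:, 1:]   ([[20., 30.]])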
Example #3
def make_random_trajectory():
    time_step_spec = ts.time_step_spec(
        tensor_spec.TensorSpec([], tf.int64, name='observation'))
    action_spec = tensor_spec.BoundedTensorSpec([],
                                                tf.int32,
                                                minimum=1,
                                                maximum=2,
                                                name='action')
    # info and policy state specs match that of TFPolicyMock.
    outer_dims = [1, 6]  # (batch_size, time)
    traj = trajectory.Trajectory(
        observation=tensor_spec.sample_spec_nest(time_step_spec.observation,
                                                 outer_dims=outer_dims),
        action=tensor_spec.sample_bounded_spec(action_spec,
                                               outer_dims=outer_dims),
        policy_info=tensor_spec.sample_bounded_spec(action_spec,
                                                    outer_dims=outer_dims),
        reward=tf.fill(outer_dims, 0.0),
        # step_type is F M L F M L.
        step_type=tf.reshape(tf.range(0, 6) % 3, outer_dims),
        # next_step_type is M L F M L F.
        next_step_type=tf.reshape(tf.range(1, 7) % 3, outer_dims),
        discount=tf.fill(outer_dims, 1.0),
    )
    return traj, time_step_spec, action_spec
Example #4
    def testTrain(self, num_epochs, use_td_lambda_return):
        if tf.executing_eagerly():
            self.skipTest('b/123777119')  # Secondary bug: ('b/123770140')

        with tf.compat.v2.summary.record_if(False):
            agent = ppo_agent.PPOAgent(
                self._time_step_spec,
                self._action_spec,
                tf.compat.v1.train.AdamOptimizer(),
                actor_net=DummyActorNet(self._action_spec),
                value_net=DummyValueNet(outer_rank=2),
                normalize_observations=False,
                num_epochs=num_epochs,
                use_gae=use_td_lambda_return,
                use_td_lambda_return=use_td_lambda_return)
            observations = tf.constant(
                [[[1, 2], [3, 4], [5, 6]],
                 [[1, 2], [3, 4], [5, 6]]],
                dtype=tf.float32)

            time_steps = ts.TimeStep(
                step_type=tf.constant([[1] * 3] * 2, dtype=tf.int32),
                reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
                discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
                observation=observations)
            actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                                  dtype=tf.float32)

            action_distribution_parameters = {
                'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
                'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
            }

            policy_info = action_distribution_parameters

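            # Trajectory fields, in positional order: step_type, observation,
            # action, policy_info, next_step_type, reward, discount. The test
            # reuses step_type as next_step_type for this synthetic batch.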
            experience = trajectory.Trajectory(
                time_steps.step_type, observations, actions, policy_info,
                time_steps.step_type, time_steps.reward, time_steps.discount)

            # Mock the build_train_op to return an op for incrementing this counter.
            counter = tf.compat.v1.train.get_or_create_global_step()
            zero = tf.constant(0, dtype=tf.float32)
            agent.build_train_op = (
                lambda *_, **__: tf_agent.LossInfo(  # pylint: disable=g-long-lambda
                    counter.assign_add(1), ppo_agent.PPOLossInfo(*[zero] * 5)))

            train_op = agent.train(experience)

            self.evaluate(tf.compat.v1.global_variables_initializer())

            # Assert that counter starts out at zero.
            self.assertEqual(0, self.evaluate(counter))

            self.evaluate(train_op)

            # Assert that train_op ran increment_counter num_epochs times.
            self.assertEqual(num_epochs, self.evaluate(counter))
Example #5
    def testTrain(self, num_epochs, use_td_lambda_return):
        with tf.compat.v2.summary.record_if(False):
            # Mock the build_train_op to return an op for incrementing this counter.
            counter = common.create_variable('test_train_counter')
            agent = ppo_agent.PPOAgent(
                self._time_step_spec,
                self._action_spec,
                tf.compat.v1.train.AdamOptimizer(),
                actor_net=DummyActorNet(
                    self._obs_spec,
                    self._action_spec,
                ),
                value_net=DummyValueNet(self._obs_spec),
                normalize_observations=False,
                num_epochs=num_epochs,
                use_gae=use_td_lambda_return,
                use_td_lambda_return=use_td_lambda_return,
                train_step_counter=counter)
            observations = tf.constant(
                [[[1, 2], [3, 4], [5, 6]],
                 [[1, 2], [3, 4], [5, 6]]],
                dtype=tf.float32)

            time_steps = ts.TimeStep(
                step_type=tf.constant([[1] * 3] * 2, dtype=tf.int32),
                reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
                discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
                observation=observations)
            actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                                  dtype=tf.float32)

            action_distribution_parameters = {
                'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
                'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
            }

            policy_info = action_distribution_parameters

            experience = trajectory.Trajectory(
                time_steps.step_type, observations, actions, policy_info,
                time_steps.step_type, time_steps.reward, time_steps.discount)

            # Force variable creation.
            agent.policy.variables()

            if not tf.executing_eagerly():
                loss = agent.train(experience)
            else:
                loss = lambda: agent.train(experience)

            # Assert that counter starts out at zero.
            self.evaluate(tf.compat.v1.global_variables_initializer())
            self.assertEqual(0, self.evaluate(counter))
            self.evaluate(loss)
            # Assert that train_op ran increment_counter num_epochs times.
            self.assertEqual(num_epochs, self.evaluate(counter))
Example #6
  def testTrain(self, num_epochs):
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.train.AdamOptimizer(),
        actor_net=DummyActorNet(self._action_spec,),
        value_net=DummyValueNet(outer_rank=2),
        normalize_observations=False,
        num_epochs=num_epochs,
    )
    observations = tf.constant(
        [[[1, 2], [3, 4], [5, 6]],
         [[1, 2], [3, 4], [5, 6]]],
        dtype=tf.float32)
    time_steps = ts.TimeStep(
        step_type=tf.constant([[1] * 3] * 2, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        observation=observations)
    actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32)
    action_distribution_parameters = {
        'loc': tf.constant([[0.0, 0.0], [0.0, 0.0]], dtype=tf.float32),
        'scale': tf.constant([[1.0, 1.0], [1.0, 1.0]], dtype=tf.float32),
    }
    policy_info = action_distribution_parameters

    experience = trajectory.Trajectory(
        time_steps.step_type, observations, actions, policy_info,
        time_steps.step_type, time_steps.reward, time_steps.discount)

    # Mock the build_train_op to return an op for incrementing this counter.
    counter = tf.train.get_or_create_global_step()
    zero = tf.constant(0, dtype=tf.float32)
    agent.build_train_op = (
        lambda *_, **__: (counter.assign_add(1), [zero] * 5))

    train_op = agent.train(experience)

    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())

      # Assert that counter starts out at zero.
      counter_ = sess.run(counter)
      self.assertEqual(0, counter_)

      sess.run(train_op)

      # Assert that train_op ran increment_counter num_epochs times.
      counter_ = sess.run(counter)
      self.assertEqual(num_epochs, counter_)
Example #7
    def testAverageTwoEpisode(self, metric_class, expected_result):
        metric = metric_class()

        metric(trajectory.boundary((), (), (), 0., 1.))
        metric(trajectory.first((), (), (), 1., 1.))
        metric(trajectory.mid((), (), (), 2., 1.))
        metric(trajectory.last((), (), (), 3., 0.))
        metric(trajectory.boundary((), (), (), 0., 1.))

        # TODO(kbanoop): Add optional next_step_type arg to trajectory.first. Or
        # implement trajectory.first_last().
        metric(
            trajectory.Trajectory(ts.StepType.FIRST, (), (), (),
                                  ts.StepType.LAST, -6., 1.))

        self.assertEqual(expected_result, metric.result())
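
For readers skimming the helpers above: each of `trajectory.first`, `mid`, `last`, and `boundary` builds a single-step `Trajectory` from `(observation, action, policy_info, reward, discount)`. The step-type pairs noted below are inferred from how the test sequences them, not from API documentation; a minimal two-step episode fed to the same metric would look like:

metric(trajectory.first((), (), (), 1., 1.))     # FIRST -> MID
metric(trajectory.last((), (), (), 2., 0.))      # MID   -> LAST
metric(trajectory.boundary((), (), (), 0., 1.))  # LAST  -> FIRST (episode boundary)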
Example #8
    def testTrainWithRnn(self):
        with tf.compat.v2.summary.record_if(False):
            actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
                self._obs_spec,
                self._action_spec,
                input_fc_layer_params=None,
                output_fc_layer_params=None,
                conv_layer_params=None,
                lstm_size=(40, ))

            counter = common.create_variable('test_train_counter')
            agent = reinforce_agent.ReinforceAgent(
                self._time_step_spec,
                self._action_spec,
                actor_network=actor_net,
                optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
                train_step_counter=counter)

            batch_size = 5
            observations = tf.constant([[[1, 2], [3, 4], [5, 6]]] * batch_size,
                                       dtype=tf.float32)
            time_steps = ts.TimeStep(
                step_type=tf.constant([[1] * 3] * batch_size, dtype=tf.int32),
                reward=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
                discount=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
                observation=observations)
            actions = tf.constant([[[0], [1], [1]]] * batch_size,
                                  dtype=tf.float32)

            experience = trajectory.Trajectory(time_steps.step_type,
                                               observations, actions, (),
                                               time_steps.step_type,
                                               time_steps.reward,
                                               time_steps.discount)

            # Force variable creation.
            agent.policy.variables()

            if tf.executing_eagerly():
                loss = lambda: agent.train(experience)
            else:
                loss = agent.train(experience)

            self.evaluate(tf.compat.v1.global_variables_initializer())
            self.assertEqual(self.evaluate(counter), 0)
            self.evaluate(loss)
            self.assertEqual(self.evaluate(counter), 1)
Example #9
def make_random_trajectory():
    """Creates a random trajectory.

  This trajectory contains Tensors shaped `[1, 6, ...]` where `1` is the batch
  and `6` is the number of time steps.

  Observations are unbounded but actions are bounded to take values within
  `[1, 2]`.

  Policy info is also provided, and is equal to the actions.  It can be removed
  via:

  ```python
  traj = make_random_trajectory().clone(policy_info=())
  ```

  Returns:
    A `Trajectory`.
  """
    time_step_spec = ts.time_step_spec(
        tensor_spec.TensorSpec([], tf.int64, name='observation'))
    action_spec = tensor_spec.BoundedTensorSpec([],
                                                tf.int32,
                                                minimum=1,
                                                maximum=2,
                                                name='action')
    # info and policy state specs match that of TFPolicyMock.
    outer_dims = [1, 6]  # (batch_size, time)
    traj = trajectory.Trajectory(
        observation=tensor_spec.sample_spec_nest(time_step_spec.observation,
                                                 outer_dims=outer_dims),
        action=tensor_spec.sample_bounded_spec(action_spec,
                                               outer_dims=outer_dims),
        policy_info=tensor_spec.sample_bounded_spec(action_spec,
                                                    outer_dims=outer_dims),
        reward=tf.fill(outer_dims, 0.0),
        # step_type is F M L F M L.
        step_type=tf.reshape(tf.range(0, 6) % 3, outer_dims),
        # next_step_type is M L F M L F.
        next_step_type=tf.reshape(tf.range(1, 7) % 3, outer_dims),
        discount=tf.fill(outer_dims, 1.0),
    )
    return traj, time_step_spec, action_spec
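
A possible follow-up (illustrative only) is to split the sampled `[1, 6]` batch into overlapping transitions with the same `to_transition` helper exercised in Example #2:

traj, time_step_spec, action_spec = make_random_trajectory()
# Produces [1, 5]-shaped time_steps / policy_steps / next_time_steps.
time_steps, policy_steps, next_time_steps = trajectory.to_transition(traj)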
Example #10
    def testAgentDoesNotFailWhenNestedObservationActionAndDebugSummaries(self):
        summary_writer = tf.compat.v2.summary.create_file_writer(
            FLAGS.test_tmpdir, flush_millis=10000)
        summary_writer.set_as_default()

        nested_obs_spec = (self._obs_spec, self._obs_spec, {
            'a': self._obs_spec,
            'b': self._obs_spec,
        })
        nested_time_spec = ts.time_step_spec(nested_obs_spec)

        nested_act_spec = (self._action_spec, {
            'c': self._action_spec,
            'd': self._action_spec
        })

        class NestedActorNet(network.DistributionNetwork):
            def __init__(self, dummy_model):
                output_spec = (dummy_model.output_spec, {
                    'c': dummy_model.output_spec,
                    'd': dummy_model.output_spec,
                })
                super(NestedActorNet,
                      self).__init__(dummy_model.input_tensor_spec, (),
                                     output_spec=output_spec,
                                     name='NestedActorNet')
                self.dummy_model = dummy_model

            def call(self, *args, **kwargs):
                dummy_ans, _ = self.dummy_model(*args, **kwargs)
                return (dummy_ans, {'c': dummy_ans, 'd': dummy_ans}), ()

        dummy_model = DummyActorNet(nested_obs_spec, self._action_spec)
        agent = ppo_agent.PPOAgent(nested_time_spec,
                                   nested_act_spec,
                                   tf.compat.v1.train.AdamOptimizer(),
                                   actor_net=NestedActorNet(dummy_model),
                                   value_net=DummyValueNet(nested_obs_spec),
                                   debug_summaries=True)

        observations = tf.constant(
            [[[1, 2], [3, 4], [5, 6]],
             [[1, 2], [3, 4], [5, 6]]],
            dtype=tf.float32)

        observations = (observations, observations, {
            'a': observations,
            'b': observations,
        })

        time_steps = ts.TimeStep(
            step_type=tf.constant([[1] * 3] * 2, dtype=tf.int32),
            reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
            discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
            observation=observations)
        actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                              dtype=tf.float32)

        actions = (actions, {
            'c': actions,
            'd': actions,
        })

        action_distribution_parameters = {
            'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
            'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
        }
        action_distribution_parameters = (action_distribution_parameters, {
            'c': action_distribution_parameters,
            'd': action_distribution_parameters,
        })

        policy_info = action_distribution_parameters

        experience = trajectory.Trajectory(time_steps.step_type, observations,
                                           actions, policy_info,
                                           time_steps.step_type,
                                           time_steps.reward,
                                           time_steps.discount)

        agent.train(experience)