Example #1
    def _construct(self, batch_size, graph):
        """Construct the agent graph through placeholders."""

        self._batch_size = batch_size
        self._batched = batch_size is not None

        outer_dims = [self._batch_size] if self._batched else [1]
        with graph.as_default():
            self._time_step = tensor_spec.to_nest_placeholder(
                self._tf_policy.time_step_spec, outer_dims=outer_dims)
            self._tf_initial_state = self._tf_policy.get_initial_state(
                batch_size=self._batch_size or 1)

            self._policy_state = tf.nest.map_structure(
                lambda ps: tf.compat.v1.placeholder(  # pylint: disable=g-long-lambda
                    ps.dtype,
                    ps.shape,
                    name='policy_state'),
                self._tf_initial_state)
            self._action_step = self._tf_policy.action(self._time_step,
                                                       self._policy_state,
                                                       seed=self._seed)

            self._actions = tensor_spec.to_nest_placeholder(
                self._tf_policy.action_spec, outer_dims=outer_dims)
            self._action_distribution = self._tf_policy.distribution(
                self._time_step, policy_state=self._policy_state).action
            self._action_mean = self._action_distribution.mean()
            self._log_prob = common.log_probability(
                self._action_distribution, self._actions,
                self._tf_policy.action_spec)
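
The _log_prob fetch above comes from common.log_probability in tf_agents.utils, which sums per-action log-probabilities across the action nest. A minimal sketch of that call in isolation; the spec, distribution, and action values here are illustrative, not taken from the original agent:

import tensorflow as tf
import tensorflow_probability as tfp
from tf_agents.specs import tensor_spec
from tf_agents.utils import common

# Illustrative 1-D continuous action spec and a batch-of-one Normal distribution.
action_spec = tensor_spec.BoundedTensorSpec(
    [1], tf.float32, minimum=-1.0, maximum=1.0)
dist = tfp.distributions.Normal(loc=[[0.0]], scale=[[1.0]])
actions = tf.constant([[0.5]])

# One scalar log-probability per batch entry, summed over the action nest.
log_prob = common.log_probability(dist, actions, action_spec)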
Example #2
    def testOneStepUpdatesObservers(self):
        if tf.executing_eagerly():
            self.skipTest('b/123880556')

        env = tf_py_environment.TFPyEnvironment(
            driver_test_utils.PyEnvironmentMock())
        policy = driver_test_utils.TFPolicyMock(env.time_step_spec(),
                                                env.action_spec())
        policy_state_ph = tensor_spec.to_nest_placeholder(
            policy.policy_state_spec,
            default=0,
            name_scope='policy_state_ph',
            outer_dims=(1, ))
        num_episodes_observer = driver_test_utils.NumEpisodesObserver()

        driver = dynamic_step_driver.DynamicStepDriver(
            env, policy, observers=[num_episodes_observer])
        run_driver = driver.run(policy_state=policy_state_ph)

        with self.session() as session:
            session.run(tf.compat.v1.global_variables_initializer())
            _, policy_state = session.run(run_driver)
            for _ in range(4):
                _, policy_state = session.run(
                    run_driver, feed_dict={policy_state_ph: policy_state})
            self.assertEqual(self.evaluate(num_episodes_observer.num_episodes),
                             2)
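
Note that the first session.run(run_driver) above passes no feed_dict: because the placeholder was created with default=0, it behaves like tf.compat.v1.placeholder_with_default and only needs to be fed on subsequent runs, when the previous policy_state is threaded back in. A standalone sketch of that behavior, with an illustrative scalar spec:

import tensorflow as tf
from tf_agents.specs import tensor_spec

tf.compat.v1.disable_eager_execution()  # placeholders require graph mode

state_ph = tensor_spec.to_nest_placeholder(
    tensor_spec.TensorSpec([], tf.int32, 'state'), default=0)

with tf.compat.v1.Session() as session:
    print(session.run(state_ph))                           # 0, from the default
    print(session.run(state_ph, feed_dict={state_ph: 7}))  # 7, fed explicitly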
Example #3
 def testCreatePlaceholderWithNameScope(self):
     self.skipIfExecutingEagerly()
     obs_spec = tensor_spec.TensorSpec([2], tf.float32, "obs")
     time_step_spec = ts.time_step_spec(obs_spec)
     ph = tensor_spec.to_nest_placeholder(time_step_spec,
                                          name_scope="action")
     self.assertEqual(ph.observation.name, "action/obs:0")
Example #4
 def testCreatePlaceholderFromTimeStepSpec(self):
     self.skipIfExecutingEagerly()
     obs_spec = tensor_spec.TensorSpec([2], tf.float32, "obs")
     time_step_spec = ts.time_step_spec(obs_spec)
     ph = tensor_spec.to_nest_placeholder(time_step_spec)
     self.assertIsInstance(ph, ts.TimeStep)
     self.assertEqual(ph.observation.name, "obs:0")
     self.assertEqual(ph.observation.dtype, tf.float32)
     self.assertEqual(ph.observation.shape, tf.TensorShape([2]))
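
As these tests show, to_nest_placeholder mirrors whatever nest structure the specs form (a TimeStep here, a flat tuple in Example #6). A minimal end-to-end sketch that builds a dict nest, wires an op onto it, and feeds it; the spec names and values are illustrative:

import tensorflow as tf
from tf_agents.specs import tensor_spec

tf.compat.v1.disable_eager_execution()  # placeholders require graph mode

specs = {
    'obs': tensor_spec.TensorSpec([2], tf.float32, 'obs'),
    'mask': tensor_spec.TensorSpec([2], tf.float32, 'mask'),
}
ph = tensor_spec.to_nest_placeholder(specs, outer_dims=(1,))  # shapes become [1, 2]
masked = ph['obs'] * ph['mask']

with tf.compat.v1.Session() as session:
    print(session.run(masked, feed_dict={
        ph['obs']: [[1.0, 2.0]],
        ph['mask']: [[1.0, 0.0]],
    }))  # [[1. 0.]]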
Example #5
  def _set_up_feeds_and_fetches(self):
    outer_dims = [self._batch_size] if self._batched else [1]
    self._time_step = tensor_spec.to_nest_placeholder(
        self._tf_policy.time_step_spec(), outer_dims=outer_dims)
    self._tf_initial_state = self._tf_policy.get_initial_state(
        batch_size=self._batch_size or 1)

    self._policy_state = tf.nest.map_structure(
        lambda ps: tf.compat.v1.placeholder(  # pylint: disable=g-long-lambda
            ps.dtype, ps.shape, name='policy_state'),
        self._tf_initial_state)
    self._action_step = self._tf_policy.action(
        self._time_step, self._policy_state, seed=self._seed)
Example #6
 def testCreatePlaceholderFromTuple(self):
   self.skipIfExecutingEagerly()
   specs = (
       tensor_spec.TensorSpec(shape=(), dtype=tf.float32, name="act_prob"),
       tensor_spec.TensorSpec(shape=(), dtype=tf.float32, name="value_pred"),
   )
   ph = tensor_spec.to_nest_placeholder(specs)
   self.assertEqual(2, len(ph))
   self.assertEqual(ph[0].name, "act_prob:0")
   self.assertEqual(ph[0].dtype, tf.float32)
   self.assertEqual(ph[0].shape, tf.TensorShape([]))
   self.assertEqual(ph[1].name, "value_pred:0")
   self.assertEqual(ph[1].dtype, tf.float32)
   self.assertEqual(ph[1].shape, tf.TensorShape([]))
Example #7
    def testOneStepReplayBufferObservers(self):
        if tf.executing_eagerly():
            self.skipTest('b/123880556')

        env = tf_py_environment.TFPyEnvironment(
            driver_test_utils.PyEnvironmentMock())
        policy = driver_test_utils.TFPolicyMock(env.time_step_spec(),
                                                env.action_spec())
        policy_state_ph = tensor_spec.to_nest_placeholder(
            policy.policy_state_spec,
            default=0,
            name_scope='policy_state_ph',
            outer_dims=(1, ))
        replay_buffer = driver_test_utils.make_replay_buffer(policy)

        driver = dynamic_step_driver.DynamicStepDriver(
            env, policy, num_steps=1, observers=[replay_buffer.add_batch])

        run_driver = driver.run(policy_state=policy_state_ph)
        rb_gather_all = replay_buffer.gather_all()

        with self.session() as session:
            session.run(tf.compat.v1.global_variables_initializer())
            _, policy_state = session.run(run_driver)
            for _ in range(5):
                _, policy_state = session.run(
                    run_driver, feed_dict={policy_state_ph: policy_state})

            trajectories = self.evaluate(rb_gather_all)

        self.assertAllEqual(trajectories.step_type, [[0, 1, 2, 0, 1, 2, 0, 1]])
        self.assertAllEqual(trajectories.observation,
                            [[0, 1, 3, 0, 1, 3, 0, 1]])
        self.assertAllEqual(trajectories.action, [[1, 2, 1, 1, 2, 1, 1, 2]])
        self.assertAllEqual(trajectories.policy_info,
                            [[2, 4, 2, 2, 4, 2, 2, 4]])
        self.assertAllEqual(trajectories.next_step_type,
                            [[1, 2, 0, 1, 2, 0, 1, 2]])
        self.assertAllEqual(trajectories.reward,
                            [[1., 1., 0., 1., 1., 0., 1., 1.]])
        self.assertAllEqual(trajectories.discount,
                            [[1., 0., 1., 1., 0., 1., 1., 0.]])
Example #8
    def _construct(self, batch_size, graph):
        """Construct the agent graph through placeholders."""

        self._batch_size = batch_size
        self._batched = batch_size is not None

        outer_dims = [self._batch_size] if self._batched else [1]
        with graph.as_default():
            self._time_step = tensor_spec.to_nest_placeholder(
                self._tf_policy.time_step_spec(), outer_dims=outer_dims)
            self._tf_initial_state = self._tf_policy.get_initial_state(
                batch_size=self._batch_size or 1)

            self._policy_state = tf.nest.map_structure(
                lambda ps: tf.compat.v1.placeholder(  # pylint: disable=g-long-lambda
                    ps.dtype,
                    ps.shape,
                    name='policy_state'),
                self._tf_initial_state)
            self._action_step = self._tf_policy.action(self._time_step,
                                                       self._policy_state,
                                                       seed=self._seed)
Example #9
  def initialize(self, batch_size):
    if self._built:
      raise RuntimeError('initialize() called twice.')

    self._batch_size = batch_size
    self._batched = batch_size is not None

    outer_dims = [self._batch_size] if self._batched else [1]
    self._time_step = tensor_spec.to_nest_placeholder(
        self._tf_policy.time_step_spec(), outer_dims=outer_dims)
    self._tf_initial_state = self._tf_policy.get_initial_state(
        batch_size=self._batch_size or 1)

    self._policy_state = tf.nest.map_structure(
        lambda ps: tf.compat.v1.placeholder(  # pylint: disable=g-long-lambda
            ps.dtype, ps.shape, name='policy_state'),
        self._tf_initial_state)
    self._action_step = self._tf_policy.action(
        self._time_step, self._policy_state, seed=self._seed)

    self.session.run(
        tf.compat.v1.initializers.variables(self._tf_policy.variables()))

    self._built = True
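
The recurring pattern across these examples is the policy_state placeholder: it lets a plain Python loop carry recurrent policy state between session.run calls, exactly as the driver tests above do with feed_dict={policy_state_ph: policy_state}. A stripped-down sketch of that feed-back loop using ordinary TF1 ops, with no policy involved:

import tensorflow as tf

tf.compat.v1.disable_eager_execution()  # placeholders require graph mode

state_ph = tf.compat.v1.placeholder(tf.int32, [], name='policy_state')
next_state = state_ph + 1  # stand-in for the updated state a policy step returns

with tf.compat.v1.Session() as session:
    state = 0
    for _ in range(3):
        state = session.run(next_state, feed_dict={state_ph: state})
    print(state)  # 3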