def _construct(self, batch_size, graph):
    """Wire up placeholders and fetches for evaluating the policy in `graph`.

    Builds, in creation order: a time-step placeholder nest, policy-state
    placeholders shaped like the policy's initial state, the policy's action
    op, an action placeholder nest, and the action distribution together with
    its mean and the log-probability of the fed actions.
    """
    self._batch_size = batch_size
    self._batched = batch_size is not None
    # When no batch size was requested, still feed a size-1 batch dimension.
    batch_dims = [self._batch_size] if self._batched else [1]

    def _state_placeholder(state_spec):
        # One placeholder per policy-state component, mirroring its spec.
        return tf.compat.v1.placeholder(
            state_spec.dtype, state_spec.shape, name='policy_state')

    with graph.as_default():
        self._time_step = tensor_spec.to_nest_placeholder(
            self._tf_policy.time_step_spec, outer_dims=batch_dims)
        self._tf_initial_state = self._tf_policy.get_initial_state(
            batch_size=self._batch_size or 1)
        self._policy_state = tf.nest.map_structure(
            _state_placeholder, self._tf_initial_state)
        self._action_step = self._tf_policy.action(
            self._time_step, self._policy_state, seed=self._seed)
        self._actions = tensor_spec.to_nest_placeholder(
            self._tf_policy.action_spec, outer_dims=batch_dims)
        self._action_distribution = self._tf_policy.distribution(
            self._time_step, policy_state=self._policy_state).action
        self._action_mean = self._action_distribution.mean()
        self._log_prob = common.log_probability(
            self._action_distribution, self._actions,
            self._tf_policy.action_spec)
def testOneStepUpdatesObservers(self):
    """Chaining driver runs through the state placeholder counts 2 episodes."""
    if tf.executing_eagerly():
        self.skipTest('b/123880556')
    env = tf_py_environment.TFPyEnvironment(
        driver_test_utils.PyEnvironmentMock())
    policy = driver_test_utils.TFPolicyMock(
        env.time_step_spec(), env.action_spec())
    policy_state_ph = tensor_spec.to_nest_placeholder(
        policy.policy_state_spec,
        default=0,
        name_scope='policy_state_ph',
        outer_dims=(1,))
    observer = driver_test_utils.NumEpisodesObserver()
    driver = dynamic_step_driver.DynamicStepDriver(
        env, policy, observers=[observer])
    run_driver = driver.run(policy_state=policy_state_ph)
    with self.session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        # Prime once without feeding, then thread the state four more times.
        _, state = sess.run(run_driver)
        for _ in range(4):
            _, state = sess.run(
                run_driver, feed_dict={policy_state_ph: state})
        self.assertEqual(self.evaluate(observer.num_episodes), 2)
def testCreatePlaceholderWithNameScope(self):
    """A name_scope prefixes the generated placeholder's name."""
    self.skipIfExecutingEagerly()
    observation_spec = tensor_spec.TensorSpec([2], tf.float32, "obs")
    step_spec = ts.time_step_spec(observation_spec)
    placeholder = tensor_spec.to_nest_placeholder(
        step_spec, name_scope="action")
    self.assertEqual(placeholder.observation.name, "action/obs:0")
def testCreatePlaceholderFromTimeStepSpec(self):
    """A TimeStep spec yields a TimeStep nest of matching placeholders."""
    # Placeholders do not exist in eager mode; guard the same way the
    # sibling placeholder tests in this file do.
    self.skipIfExecutingEagerly()
    obs_spec = tensor_spec.TensorSpec([2], tf.float32, "obs")
    time_step_spec = ts.time_step_spec(obs_spec)
    ph = tensor_spec.to_nest_placeholder(time_step_spec)
    self.assertIsInstance(ph, ts.TimeStep)
    self.assertEqual(ph.observation.name, "obs:0")
    self.assertEqual(ph.observation.dtype, tf.float32)
    self.assertEqual(ph.observation.shape, tf.TensorShape([2]))
def _set_up_feeds_and_fetches(self):
    """Build the placeholders and fetches used to drive the policy.

    Creates a time-step placeholder nest matching the policy's
    time_step_spec, a policy-state placeholder nest shaped like the
    policy's initial state, and the action op produced by stepping the
    policy on those placeholders.
    """
    # An unbatched setup still feeds a size-1 batch dimension.
    outer_dims = [self._batch_size] if self._batched else [1]
    self._time_step = tensor_spec.to_nest_placeholder(
        self._tf_policy.time_step_spec(), outer_dims=outer_dims)
    self._tf_initial_state = self._tf_policy.get_initial_state(
        batch_size=self._batch_size or 1)
    # TF2-compatible endpoints (tf.nest / tf.compat.v1), consistent with
    # the other graph-construction code in this file.
    self._policy_state = tf.nest.map_structure(
        lambda ps: tf.compat.v1.placeholder(  # pylint: disable=g-long-lambda
            ps.dtype, ps.shape, name='policy_state'),
        self._tf_initial_state)
    self._action_step = self._tf_policy.action(
        self._time_step, self._policy_state, seed=self._seed)
def testCreatePlaceholderFromTuple(self):
    """Each spec in a tuple yields a matching scalar placeholder."""
    self.skipIfExecutingEagerly()
    specs = (
        tensor_spec.TensorSpec(shape=(), dtype=tf.float32, name="act_prob"),
        tensor_spec.TensorSpec(shape=(), dtype=tf.float32, name="value_pred"),
    )
    placeholders = tensor_spec.to_nest_placeholder(specs)
    self.assertEqual(2, len(placeholders))
    # Placeholders line up one-to-one with the input specs.
    for placeholder, spec in zip(placeholders, specs):
        self.assertEqual(placeholder.name, spec.name + ":0")
        self.assertEqual(placeholder.dtype, tf.float32)
        self.assertEqual(placeholder.shape, tf.TensorShape([]))
def testOneStepReplayBufferObservers(self):
    """Six chained one-step runs fill the replay buffer with 8 transitions."""
    if tf.executing_eagerly():
        self.skipTest('b/123880556')
    env = tf_py_environment.TFPyEnvironment(
        driver_test_utils.PyEnvironmentMock())
    policy = driver_test_utils.TFPolicyMock(
        env.time_step_spec(), env.action_spec())
    policy_state_ph = tensor_spec.to_nest_placeholder(
        policy.policy_state_spec,
        default=0,
        name_scope='policy_state_ph',
        outer_dims=(1,))
    replay_buffer = driver_test_utils.make_replay_buffer(policy)
    driver = dynamic_step_driver.DynamicStepDriver(
        env, policy, num_steps=1, observers=[replay_buffer.add_batch])
    run_driver = driver.run(policy_state=policy_state_ph)
    rb_gather_all = replay_buffer.gather_all()
    with self.session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        # Prime once without feeding, then thread the state five more times.
        _, state = sess.run(run_driver)
        for _ in range(5):
            _, state = sess.run(
                run_driver, feed_dict={policy_state_ph: state})
        trajectories = self.evaluate(rb_gather_all)
        self.assertAllEqual(
            trajectories.step_type, [[0, 1, 2, 0, 1, 2, 0, 1]])
        self.assertAllEqual(
            trajectories.observation, [[0, 1, 3, 0, 1, 3, 0, 1]])
        self.assertAllEqual(
            trajectories.action, [[1, 2, 1, 1, 2, 1, 1, 2]])
        self.assertAllEqual(
            trajectories.policy_info, [[2, 4, 2, 2, 4, 2, 2, 4]])
        self.assertAllEqual(
            trajectories.next_step_type, [[1, 2, 0, 1, 2, 0, 1, 2]])
        self.assertAllEqual(
            trajectories.reward, [[1., 1., 0., 1., 1., 0., 1., 1.]])
        self.assertAllEqual(
            trajectories.discount, [[1., 0., 1, 1, 0, 1., 1., 0.]])
def _construct(self, batch_size, graph):
    """Construct the agent graph through placeholders.

    Builds, inside `graph`: a time-step placeholder nest matching the
    policy's time_step_spec, a policy-state placeholder nest shaped like
    the policy's initial state, and the action op from stepping the
    policy on those placeholders.

    Args:
      batch_size: Batch size to build placeholders for, or None for an
        unbatched (size-1) setup.
      graph: The tf.Graph to construct the ops in.
    """
    self._batch_size = batch_size
    self._batched = batch_size is not None
    # An unbatched setup still feeds a size-1 batch dimension.
    outer_dims = [self._batch_size] if self._batched else [1]
    with graph.as_default():
        self._time_step = tensor_spec.to_nest_placeholder(
            self._tf_policy.time_step_spec(), outer_dims=outer_dims)
        self._tf_initial_state = self._tf_policy.get_initial_state(
            batch_size=self._batch_size or 1)
        # TF2-compatible endpoints (tf.nest / tf.compat.v1), consistent
        # with the other graph-construction code in this file.
        self._policy_state = tf.nest.map_structure(
            lambda ps: tf.compat.v1.placeholder(  # pylint: disable=g-long-lambda
                ps.dtype, ps.shape, name='policy_state'),
            self._tf_initial_state)
        self._action_step = self._tf_policy.action(
            self._time_step, self._policy_state, seed=self._seed)
def initialize(self, batch_size):
    """Build placeholders/fetches and initialize the policy's variables.

    Args:
      batch_size: Batch size to build placeholders for, or None for an
        unbatched (size-1) setup.

    Raises:
      RuntimeError: If called more than once.
    """
    if self._built:
        raise RuntimeError('initialize() called twice.')
    self._batch_size = batch_size
    self._batched = batch_size is not None
    # An unbatched setup still feeds a size-1 batch dimension.
    outer_dims = [self._batch_size] if self._batched else [1]
    self._time_step = tensor_spec.to_nest_placeholder(
        self._tf_policy.time_step_spec(), outer_dims=outer_dims)
    self._tf_initial_state = self._tf_policy.get_initial_state(
        batch_size=self._batch_size or 1)
    # TF2-compatible endpoints (tf.nest / tf.compat.v1), consistent with
    # the other graph-construction code in this file.
    self._policy_state = tf.nest.map_structure(
        lambda ps: tf.compat.v1.placeholder(  # pylint: disable=g-long-lambda
            ps.dtype, ps.shape, name='policy_state'),
        self._tf_initial_state)
    self._action_step = self._tf_policy.action(
        self._time_step, self._policy_state, seed=self._seed)
    self.session.run(
        tf.compat.v1.initializers.variables(self._tf_policy.variables()))
    self._built = True