def _get_mock_py_policy(self):
  mock_py_policy = mock.create_autospec(py_policy.Base)
  observation_spec = tensor_spec.TensorSpec([5], dtype=tf.float32)
  mock_py_policy.time_step_spec.return_value = ts.time_step_spec(
      observation_spec)
  mock_py_policy.action_spec.return_value = tensor_spec.BoundedTensorSpec(
      [3], tf.float32, -1.0, 1.0)
  mock_py_policy.policy_state_spec.return_value = ()
  mock_py_policy.info_spec.return_value = ()
  return mock_py_policy

def testBuilds(self):
  observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32, 0, 1)
  time_step_spec = ts.time_step_spec(observation_spec)
  time_step = tensor_spec.sample_spec_nest(time_step_spec, outer_dims=(1,))

  action_spec = [
      tensor_spec.BoundedTensorSpec((2,), tf.float32, 2, 3),
      tensor_spec.BoundedTensorSpec((3,), tf.float32, 0, 3)
  ]

  net = actor_rnn_network.ActorRnnNetwork(
      observation_spec,
      action_spec,
      conv_layer_params=[(4, 2, 2)],
      input_fc_layer_params=(5,),
      lstm_size=(3,),
      output_fc_layer_params=(5,))

  actions, network_state = net(time_step.observation, time_step.step_type)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertEqual([1, 2], actions[0].shape.as_list())
  self.assertEqual([1, 3], actions[1].shape.as_list())

  self.assertEqual(13, len(net.variables))
  # Conv Net Kernel
  self.assertEqual((2, 2, 3, 4), net.variables[0].shape)
  # Conv Net bias
  self.assertEqual((4,), net.variables[1].shape)
  # Fc Kernel
  self.assertEqual((64, 5), net.variables[2].shape)
  # Fc Bias
  self.assertEqual((5,), net.variables[3].shape)
  # LSTM Cell Kernel
  self.assertEqual((5, 12), net.variables[4].shape)
  # LSTM Cell Recurrent Kernel
  self.assertEqual((3, 12), net.variables[5].shape)
  # LSTM Cell Bias
  self.assertEqual((12,), net.variables[6].shape)
  # Fc Kernel
  self.assertEqual((3, 5), net.variables[7].shape)
  # Fc Bias
  self.assertEqual((5,), net.variables[8].shape)
  # Action 1 Kernel
  self.assertEqual((5, 2), net.variables[9].shape)
  # Action 1 Bias
  self.assertEqual((2,), net.variables[10].shape)
  # Action 2 Kernel
  self.assertEqual((5, 3), net.variables[11].shape)
  # Action 2 Bias
  self.assertEqual((3,), net.variables[12].shape)

  # Assert LSTM cell is created.
  self.assertEqual((1, 3), network_state[0].shape)
  self.assertEqual((1, 3), network_state[1].shape)

def setUp(self):
  super(PyTFPolicyTest, self).setUp()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32, 'obs')
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, 0, 1,
                                                    'action')
  self._float_action_spec = tensor_spec.BoundedTensorSpec(
      [], tf.float32, 0, 1, 'action')
  self._tf_policy = q_policy.QPolicy(
      self._time_step_spec, self._action_spec, q_network=DummyNet())

def setUp(self):
  super(OuNoisePolicyTest, self).setUp()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, 2, 3)
  actor_network = DummyActionNet(self._obs_spec, self._action_spec)
  self._wrapped_policy = actor_policy.ActorPolicy(
      time_step_spec=self._time_step_spec,
      action_spec=self._action_spec,
      actor_network=actor_network,
      clip=False)

def _make_parallel_py_environment(self, constructor=None, num_envs=2):
  self.observation_spec = array_spec.ArraySpec((3, 3), np.float32)
  self.time_step_spec = ts.time_step_spec(self.observation_spec)
  self.action_spec = array_spec.BoundedArraySpec(
      [7], dtype=np.float32, minimum=-1.0, maximum=1.0)
  constructor = constructor or functools.partial(
      random_py_environment.RandomPyEnvironment, self.observation_spec,
      self.action_spec)
  return parallel_py_environment.ParallelPyEnvironment(
      env_constructors=[constructor] * num_envs, blocking=True)

def setUp(self):
  super(DdpgAgentTest, self).setUp()
  self._obs_spec = [tensor_spec.TensorSpec([2], tf.float32)]
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = [tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)]
  network_input_spec = (self._obs_spec, self._action_spec)
  self._critic_net = DummyCriticNetwork(network_input_spec)
  self._bounded_actor_net = DummyActorNetwork(
      self._obs_spec, self._action_spec, unbounded_actions=False)
  self._unbounded_actor_net = DummyActorNetwork(
      self._obs_spec, self._action_spec, unbounded_actions=True)

def time_step_spec(self):
  """Describes the `TimeStep` fields returned by `step()`.

  Override this method to define an environment that uses non-standard values
  for any of the items returned by `step`. For example, an environment with
  array-valued rewards.

  Returns:
    A `TimeStep` namedtuple containing (possibly nested) `ArraySpec`s defining
    the step_type, reward, discount, and observation structure.
  """
  return ts.time_step_spec(self.observation_spec())

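# Illustrative sketch (not part of the library): a hypothetical environment
# that overrides `time_step_spec` to describe an array-valued reward, as the
# docstring above suggests. `VectorRewardEnv` and its 3-element reward spec
# are assumptions made for this example only.
class VectorRewardEnv(py_environment.PyEnvironment):

  def time_step_spec(self):
    default = ts.time_step_spec(self.observation_spec())
    # `TimeStep` is a namedtuple, so `_replace` swaps in the non-standard
    # reward spec while keeping step_type, discount, and observation.
    return default._replace(
        reward=array_spec.ArraySpec([3], np.float32, name='reward'))
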
def setUp(self):
  super(EpsilonGreedyPolicyTest, self).setUp()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._num_actions = 3
  self._greedy_action = 1
  self._action_spec = tensor_spec.BoundedTensorSpec((1,), tf.int32, 0,
                                                    self._num_actions - 1)
  self._policy = fixed_policy.FixedPolicy(
      np.asarray([self._greedy_action], dtype=np.int32),
      self._time_step_spec, self._action_spec)
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  self._time_step = ts.restart(observations, batch_size=2)

def testAction(self):
  py_observation_spec = array_spec.BoundedArraySpec((3,), np.int32, 1, 1)
  py_time_step_spec = ts.time_step_spec(py_observation_spec)
  py_action_spec = array_spec.BoundedArraySpec((7,), np.int32, 1, 1)
  py_policy_state_spec = array_spec.BoundedArraySpec((5,), np.int32, 0, 1)
  py_policy_info_spec = array_spec.BoundedArraySpec((3,), np.int32, 0, 1)

  mock_py_policy = mock.create_autospec(py_policy.Base)
  mock_py_policy.time_step_spec = py_time_step_spec
  mock_py_policy.action_spec = py_action_spec
  mock_py_policy.policy_state_spec = py_policy_state_spec
  mock_py_policy.info_spec = py_policy_info_spec

  expected_py_policy_state = np.ones(py_policy_state_spec.shape,
                                     py_policy_state_spec.dtype)
  expected_py_time_step = tf.nest.map_structure(
      lambda arr_spec: np.ones(arr_spec.shape, arr_spec.dtype),
      py_time_step_spec)
  expected_py_action = np.ones(py_action_spec.shape, py_action_spec.dtype)
  expected_new_py_policy_state = np.zeros(py_policy_state_spec.shape,
                                          py_policy_state_spec.dtype)
  expected_py_info = np.zeros(py_policy_info_spec.shape,
                              py_policy_info_spec.dtype)

  mock_py_policy.action.return_value = policy_step.PolicyStep(
      expected_py_action, expected_new_py_policy_state, expected_py_info)

  tf_mock_py_policy = tf_py_policy.TFPyPolicy(mock_py_policy)
  time_step = tf.nest.map_structure(
      lambda arr_spec: tf.ones(arr_spec.shape, arr_spec.dtype),
      py_time_step_spec)
  action_step = tf_mock_py_policy.action(
      time_step, tf.ones(py_policy_state_spec.shape, tf.int32))
  py_action_step = self.evaluate(action_step)

  self.assertEqual(1, mock_py_policy.action.call_count)
  np.testing.assert_equal(mock_py_policy.action.call_args[1]['time_step'],
                          expected_py_time_step)
  np.testing.assert_equal(mock_py_policy.action.call_args[1]['policy_state'],
                          expected_py_policy_state)
  np.testing.assert_equal(py_action_step.action, expected_py_action)
  np.testing.assert_equal(py_action_step.state, expected_new_py_policy_state)
  np.testing.assert_equal(py_action_step.info, expected_py_info)

def test_get_distribution_class_spec(self):
  ones = tf.ones(shape=[2], dtype=tf.float32)
  obs_spec = tensor_spec.TensorSpec(shape=[5], dtype=tf.float32)
  time_step_spec = ts.time_step_spec(obs_spec)
  mock_policy = mock.create_autospec(actor_policy.ActorPolicy)
  mock_policy.distribution.return_value = policy_step.PolicyStep(
      (tfp.distributions.Categorical(logits=ones),
       tfp.distributions.Normal(ones, ones)), None)

  class_spec = ppo_utils.get_distribution_class_spec(mock_policy,
                                                     time_step_spec)
  self.assertAllEqual(
      (tfp.distributions.Categorical, tfp.distributions.Normal), class_spec)

def __init__(self, time_step_spec, action_spec, seed=None, outer_dims=None):
  self._seed = seed
  self._outer_dims = outer_dims
  self._rng = np.random.RandomState(seed)
  if time_step_spec is None:
    time_step_spec = ts.time_step_spec()
  super(RandomPyPolicy, self).__init__(
      time_step_spec=time_step_spec, action_spec=action_spec)

def test_get_distribution_params_spec(self):
  ones = tf.ones(shape=[1, 2], dtype=tf.float32)
  obs_spec = tensor_spec.TensorSpec(shape=[5], dtype=tf.float32)
  time_step_spec = ts.time_step_spec(obs_spec)
  mock_policy = mock.create_autospec(actor_policy.ActorPolicy)
  mock_policy._distribution.return_value = policy_step.PolicyStep(
      (tfp.distributions.Categorical(logits=ones),
       tfp.distributions.Normal(ones, ones)))

  params_spec = ppo_utils.get_distribution_params_spec(mock_policy,
                                                       time_step_spec)
  self.assertAllEqual([set(['logits']), set(['loc', 'scale'])],
                      [set(d.keys()) for d in params_spec])
  self.assertAllEqual([[[2]], [[2], [2]]],
                      [[d[k].shape for k in d] for d in params_spec])

def _initial_collect(self):
  """Collect initial experience before training begins."""
  logging.info('Collecting initial experience...')
  time_step_spec = ts.time_step_spec(self._env.observation_spec())
  random_policy = random_py_policy.RandomPyPolicy(time_step_spec,
                                                  self._env.action_spec())
  time_step = self._env.reset()
  while self._replay_buffer.size < self._initial_collect_steps:
    if self.game_over():
      time_step = self._env.reset()
    action_step = random_policy.action(time_step)
    next_time_step = self._env.step(action_step.action)
    self._replay_buffer.add_batch(
        trajectory.from_transition(time_step, action_step, next_time_step))
    time_step = next_time_step
  logging.info('Done.')

def setUp(self):
  tf.compat.v1.enable_resource_variables()
  super(TD3AgentTest, self).setUp()
  self._obs_spec = [tensor_spec.TensorSpec([2], tf.float32)]
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = [tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)]
  input_spec = (self._obs_spec, self._action_spec)
  self._critic_net = DummyCriticNetwork(input_spec)
  self._bounded_actor_net = DummyActorNetwork(
      self._obs_spec, self._action_spec, unbounded_actions=False)
  self._unbounded_actor_net = DummyActorNetwork(
      self._obs_spec, self._action_spec, unbounded_actions=True)

def __init__(self, initial_state=0, dtype=tf.int64, scope='TFEnvironment'):
  self._dtype = dtype
  self._scope = scope
  self._initial_state = tf.cast(initial_state, dtype=self._dtype)
  observation_spec = specs.TensorSpec([1], self._dtype, 'observation')
  action_spec = specs.BoundedTensorSpec([], tf.int32, minimum=0, maximum=10)
  time_step_spec = ts.time_step_spec(observation_spec)
  super(TFEnvironmentMock, self).__init__(time_step_spec, action_spec)
  self._state = common.create_variable(
      'state', initial_state, dtype=self._dtype)
  self.steps = common.create_variable('steps', 0)
  self.episodes = common.create_variable('episodes', 0)
  self.resets = common.create_variable('resets', 0)

def testBuilds(self):
  observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.int32, 0, 1)
  time_step_spec = ts.time_step_spec(observation_spec)
  time_step = tensor_spec.sample_spec_nest(time_step_spec, outer_dims=(1, 3))

  net = value_rnn_network.ValueRnnNetwork(
      observation_spec,
      conv_layer_params=[(4, 2, 2)],
      input_fc_layer_params=(5,),
      lstm_size=(7,),
      output_fc_layer_params=(3,))

  value, state = net(time_step.observation, time_step.step_type)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertEqual((1, 3), value.shape)

  self.assertEqual(11, len(net.variables))
  # Conv Net Kernel
  self.assertEqual((2, 2, 3, 4), net.variables[0].shape)
  # Conv Net bias
  self.assertEqual((4,), net.variables[1].shape)
  # Fc Kernel
  self.assertEqual((64, 5), net.variables[2].shape)
  # Fc Bias
  self.assertEqual((5,), net.variables[3].shape)
  # LSTM Cell Kernel
  self.assertEqual((5, 28), net.variables[4].shape)
  # LSTM Cell Recurrent Kernel
  self.assertEqual((7, 28), net.variables[5].shape)
  # LSTM Cell Bias
  self.assertEqual((28,), net.variables[6].shape)
  # Fc Kernel
  self.assertEqual((7, 3), net.variables[7].shape)
  # Fc Bias
  self.assertEqual((3,), net.variables[8].shape)
  # Value Shrink Kernel
  self.assertEqual((3, 1), net.variables[9].shape)
  # Value Shrink bias
  self.assertEqual((1,), net.variables[10].shape)

  # Assert LSTM cell is created.
  self.assertEqual((1, 7), state[0].shape)
  self.assertEqual((1, 7), state[1].shape)

def setUp(self):
  super(FixedPolicyTest, self).setUp()
  # Creates an MDP with:
  # - dim(observation) = 2
  # - number of actions = 4
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._num_actions = 4
  self._action_spec = tensor_spec.BoundedTensorSpec(
      shape=(1,), dtype=tf.int32, minimum=0, maximum=self._num_actions - 1)
  # The policy always outputs the same action.
  self._fixed_action = 1
  self._policy = fixed_policy.FixedPolicy(
      np.asarray([self._fixed_action], dtype=np.int32),
      self._time_step_spec, self._action_spec)

def _create_replay_buffer(self, rb_cls):
  self._stack_count = 4
  self._single_shape = (15, 15, 1)
  shape = (15, 15, self._stack_count)
  observation_spec = array_spec.ArraySpec(shape, np.int32, 'obs')
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = policy_step.PolicyStep(
      array_spec.BoundedArraySpec(
          shape=(), dtype=np.int32, minimum=0, maximum=1, name='action'))
  self._trajectory_spec = trajectory.from_transition(
      time_step_spec, action_spec, time_step_spec)
  self._capacity = 32
  self._replay_buffer = rb_cls(
      data_spec=self._trajectory_spec, capacity=self._capacity)

def __init__(self, initial_state=0, dtype=tf.int64, scope='TFEnvironment'):
  self._dtype = dtype
  self._scope = scope
  self._initial_state = tf.cast(initial_state, dtype=self._dtype)
  observation_spec = specs.TensorSpec([1], self._dtype, 'observation')
  action_spec = specs.BoundedTensorSpec([], tf.int32, minimum=0, maximum=10)
  time_step_spec = ts.time_step_spec(observation_spec)
  super(TFEnvironmentMock, self).__init__(time_step_spec, action_spec)
  with tf.compat.v1.variable_scope(self._scope):
    self._state = tf.Variable(initial_state, name='state', dtype=self._dtype)
    self.steps = tf.Variable(0, name='steps')
    self.episodes = tf.Variable(0, name='episodes')
    self.resets = tf.Variable(0, name='resets')

def testObservationSpec(self):
  observation_spec = [
      array_spec.ArraySpec((1, 2, 3), np.int32, 'obs1'),
      array_spec.ArraySpec((1, 2, 3), np.float32, 'obs2')
  ]
  time_step_spec = ts.time_step_spec(observation_spec)
  self.assertEqual(time_step_spec.observation, observation_spec)
  self.assertEqual(time_step_spec.step_type,
                   array_spec.ArraySpec([], np.int32, name='step_type'))
  self.assertEqual(time_step_spec.reward,
                   array_spec.ArraySpec([], np.float32, name='reward'))
  self.assertEqual(
      time_step_spec.discount,
      array_spec.BoundedArraySpec(
          [], np.float32, minimum=0.0, maximum=1.0, name='discount'))

def make_random_trajectory():
  """Creates a random trajectory.

  This trajectory contains Tensors shaped `[1, 6, ...]` where `1` is the batch
  and `6` is the number of time steps. Observations are unbounded but actions
  are bounded to take values within `[1, 2]`.

  Policy info is also provided, and is equal to the actions. It can be removed
  via:

  ```python
  traj = make_random_trajectory().clone(policy_info=())
  ```

  Returns:
    A `(Trajectory, time_step_spec, action_spec)` tuple.
  """
  time_step_spec = ts.time_step_spec(
      tensor_spec.TensorSpec([], tf.int64, name='observation'))
  action_spec = tensor_spec.BoundedTensorSpec(
      [], tf.int32, minimum=1, maximum=2, name='action')
  # info and policy state specs match that of TFPolicyMock.
  outer_dims = [1, 6]  # (batch_size, time)
  traj = trajectory.Trajectory(
      observation=tensor_spec.sample_spec_nest(
          time_step_spec.observation, outer_dims=outer_dims),
      action=tensor_spec.sample_bounded_spec(
          action_spec, outer_dims=outer_dims),
      policy_info=tensor_spec.sample_bounded_spec(
          action_spec, outer_dims=outer_dims),
      reward=tf.fill(outer_dims, 0.0),
      # step_type is F M L F M L.
      step_type=tf.reshape(tf.range(0, 6) % 3, outer_dims),
      # next_step_type is M L F M L F.
      next_step_type=tf.reshape(tf.range(1, 7) % 3, outer_dims),
      discount=tf.fill(outer_dims, 1.0),
  )
  return traj, time_step_spec, action_spec

def testObservationSpec(self):
  observation_spec = [
      tensor_spec.TensorSpec((1, 2, 3), tf.int32, 'obs1'),
      tensor_spec.TensorSpec((1, 2, 3), tf.float32, 'obs2')
  ]
  time_step_spec = ts.time_step_spec(observation_spec)
  self.assertEqual(time_step_spec.observation, observation_spec)
  self.assertEqual(time_step_spec.step_type,
                   tensor_spec.TensorSpec([], tf.int32, name='step_type'))
  self.assertEqual(time_step_spec.reward,
                   tensor_spec.TensorSpec([], tf.float32, name='reward'))
  self.assertEqual(
      time_step_spec.discount,
      tensor_spec.BoundedTensorSpec(
          [], tf.float32, minimum=0.0, maximum=1.0, name='discount'))

def testGeneratesBatchedActionsWithoutSpecifyingOuterDims(self):
  action_spec = [
      array_spec.BoundedArraySpec((2, 3), np.int32, -10, 10),
      array_spec.BoundedArraySpec((1, 2), np.int32, -10, 10)
  ]
  time_step_spec = time_step.time_step_spec(
      observation_spec=array_spec.ArraySpec((1,), np.int32))
  policy = random_py_policy.RandomPyPolicy(
      time_step_spec=time_step_spec, action_spec=action_spec)

  action_step = policy.action(
      time_step.restart(np.array([[1], [2], [3]], dtype=np.int32)))
  tf.nest.assert_same_structure(action_spec, action_step.action)
  self.assertEqual((3, 2, 3), action_step.action[0].shape)
  self.assertEqual((3, 1, 2), action_step.action[1].shape)

  self.assertTrue(np.all(action_step.action[0] >= -10))
  self.assertTrue(np.all(action_step.action[0] <= 10))
  self.assertTrue(np.all(action_step.action[1] >= -10))
  self.assertTrue(np.all(action_step.action[1] <= 10))

def testClipping(self):
  action_spec = (tensor_spec.BoundedTensorSpec([1], tf.float32, 2, 3),
                 tensor_spec.TensorSpec([1], tf.float32),
                 tensor_spec.BoundedTensorSpec([1], tf.int32, 2, 3),
                 tensor_spec.TensorSpec([1], tf.int32))
  time_step_spec = ts.time_step_spec(action_spec)

  policy = TfPassThroughPolicy(time_step_spec, action_spec, clip=True)

  observation = (tf.constant(1, shape=(1,), dtype=tf.float32),
                 tf.constant(1, shape=(1,), dtype=tf.float32),
                 tf.constant(1, shape=(1,), dtype=tf.int32),
                 tf.constant(1, shape=(1,), dtype=tf.int32))
  time_step = ts.restart(observation)

  clipped_action = self.evaluate(policy.action(time_step).action)
  self.assertEqual(2, clipped_action[0])
  self.assertEqual(1, clipped_action[1])
  self.assertEqual(2, clipped_action[2])
  self.assertEqual(1, clipped_action[3])

def _generate_replay_buffer(self, rb_cls):
  stack_count = 4
  shape = (15, 15, stack_count)
  single_shape = (15, 15, 1)
  observation_spec = array_spec.ArraySpec(shape, np.int32, 'obs')
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = policy_step.PolicyStep(
      array_spec.BoundedArraySpec(
          shape=(), dtype=np.int32, minimum=0, maximum=1, name='action'))
  self._trajectory_spec = trajectory.from_transition(
      time_step_spec, action_spec, time_step_spec)
  self._capacity = 32
  self._replay_buffer = rb_cls(
      data_spec=self._trajectory_spec, capacity=self._capacity)

  # Generate N frames: the value of pixels is the frame index.
  # The observations will be generated by stacking K frames out of those N,
  # generating some redundancies between the observations.
  single_frames = []
  frame_count = 100
  for k in range(frame_count):
    single_frames.append(np.full(single_shape, k, dtype=np.int32))

  # Add stack of frames to the replay buffer.
  time_steps = []
  for k in range(len(single_frames) - stack_count + 1):
    observation = np.concatenate(single_frames[k:k + stack_count], axis=-1)
    time_steps.append(ts.transition(observation, reward=0.0))

  self._transition_count = len(time_steps) - 1
  dummy_action = policy_step.PolicyStep(np.int32(0))
  for k in range(self._transition_count):
    self._replay_buffer.add_batch(
        nest_utils.batch_nested_array(
            trajectory.from_transition(time_steps[k], dummy_action,
                                       time_steps[k + 1])))

def testRandomPyPolicyGeneratesActionTensors(self):
  py_action_spec = array_spec.BoundedArraySpec((7,), np.int32, -10, 10)
  observation = tf.ones([3], tf.float32)
  time_step = ts.restart(observation)
  observation_spec = tensor_spec.TensorSpec.from_tensor(observation)
  time_step_spec = ts.time_step_spec(observation_spec)

  tf_py_random_policy = tf_py_policy.TFPyPolicy(
      random_py_policy.RandomPyPolicy(
          time_step_spec=time_step_spec, action_spec=py_action_spec))

  action_step = tf_py_random_policy.action(time_step=time_step)
  py_action, py_new_policy_state = self.evaluate(
      [action_step.action, action_step.state])

  self.assertEqual(py_action.shape, py_action_spec.shape)
  self.assertTrue(np.all(py_action >= py_action_spec.minimum))
  self.assertTrue(np.all(py_action <= py_action_spec.maximum))
  self.assertEqual(py_new_policy_state, ())

def testBuilds(self):
  observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32, 0, 1)
  time_step_spec = ts.time_step_spec(observation_spec)
  time_step = tensor_spec.sample_spec_nest(time_step_spec, outer_dims=(1,))

  action_spec = [
      tensor_spec.BoundedTensorSpec((2,), tf.float32, 2, 3),
      tensor_spec.BoundedTensorSpec((3,), tf.int32, 0, 3)
  ]

  net = actor_distribution_network.ActorDistributionNetwork(
      observation_spec,
      action_spec,
      conv_layer_params=[(4, 2, 2)],
      fc_layer_params=(5,))

  action_distributions, _ = net(time_step.observation, time_step.step_type,
                                ())
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertEqual([1, 2], action_distributions[0].mode().shape.as_list())
  self.assertEqual([1, 3], action_distributions[1].mode().shape.as_list())

def testBuildsStackedLstm(self):
  observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.int32, 0, 1)
  time_step_spec = ts.time_step_spec(observation_spec)
  time_step = tensor_spec.sample_spec_nest(time_step_spec, outer_dims=(1, 3))

  net = value_rnn_network.ValueRnnNetwork(
      observation_spec,
      conv_layer_params=[(4, 2, 2)],
      input_fc_layer_params=(5,),
      lstm_size=(7, 5),
      output_fc_layer_params=(3,))

  _, state = net(time_step.observation, time_step.step_type)
  self.evaluate(tf.compat.v1.global_variables_initializer())

  # Assert LSTM cell is created.
  self.assertEqual((1, 7), state[0][0].shape)
  self.assertEqual((1, 7), state[0][1].shape)
  # Assert LSTM cell is created.
  self.assertEqual((1, 5), state[1][0].shape)
  self.assertEqual((1, 5), state[1][1].shape)

def setUp(self):
  super(GreedyPolicyTest, self).setUp()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)

def setUp(self):
  super(ReinforceAgentTest, self).setUp()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)