def test_tfrecord_observer(self):
    """Write a sample through a TFRecordObserver, flush it, then delete it."""
    observer = example_encoding_dataset.TFRecordObserver(
        self.dataset_path, self.simple_data_spec)

    # A single fixed-seed sample of the simple spec is reused for every write.
    rng = np.random.RandomState(0)
    sample = tensor_spec.sample_spec_nest(
        self.simple_data_spec, rng, outer_dims=(1, ))

    # The observer is written to by calling it directly.
    num_writes = 3
    for _ in range(num_writes):
        observer(sample)

    # Explicit flush before the observer goes away.
    observer.flush()

    # Deleting the observer is expected to trigger its close() logic.
    del observer
def test_build(self, outer_dims):
    """Build an ActorRnnNetwork and check action and variable shapes.

    Args:
      outer_dims: Outer dimensions used when sampling the time step; each
        action is expected to carry the same leading dimensions.
    """
    observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32,
                                                     0, 1)
    time_step_spec = ts.time_step_spec(observation_spec)
    time_step = tensor_spec.sample_spec_nest(time_step_spec,
                                             outer_dims=outer_dims)
    action_spec = [
        tensor_spec.BoundedTensorSpec((2, ), tf.float32, 2, 3),
        tensor_spec.BoundedTensorSpec((3, ), tf.float32, 0, 3),
    ]
    net = actor_rnn_network.ActorRnnNetwork(
        observation_spec,
        action_spec,
        conv_layer_params=[(4, 2, 2)],
        input_fc_layer_params=(5, ),
        lstm_size=(3, ),
        output_fc_layer_params=(5, ))

    actions, unused_network_state = net(time_step.observation,
                                        time_step.step_type)

    # Every action keeps the outer dims followed by its own action size.
    leading_dims = list(outer_dims)
    for index, action_size in enumerate((2, 3)):
        self.assertEqual(leading_dims + [action_size],
                         actions[index].shape.as_list())

    # Expected variable shapes, in the order the network exposes them.
    expected_variable_shapes = (
        (2, 2, 3, 4),  # Conv Net Kernel
        (4, ),  # Conv Net bias
        (64, 5),  # Fc Kernel
        (5, ),  # Fc Bias
        (5, 12),  # LSTM Cell Kernel
        (3, 12),  # LSTM Cell Recurrent Kernel
        (12, ),  # LSTM Cell Bias
        (3, 5),  # Fc Kernel
        (5, ),  # Fc Bias
        (5, 2),  # Action 1 Kernel
        (2, ),  # Action 1 Bias
        (5, 3),  # Action 2 Kernel
        (3, ),  # Action 2 Bias
    )
    self.assertEqual(len(expected_variable_shapes), len(net.variables))
    for index, expected_shape in enumerate(expected_variable_shapes):
        self.assertEqual(expected_shape, net.variables[index].shape)
def get_distribution_class_spec(policy, time_step_spec):
    """Return the nest of classes of a policy's action distributions.

    The policy is queried once with a time step sampled from
    `time_step_spec` (outer dimension of 1) and an initial policy state for
    a batch of 1; each distribution in the resulting action nest is then
    replaced by its class.

    Args:
      policy: Policy for constructing action distribution.
      time_step_spec: Spec for time_step for creating action distribution.

    Returns:
      The nest of distribution class references.
    """
    batch_size = 1
    sample_time_step = tensor_spec.sample_spec_nest(
        time_step_spec, outer_dims=[batch_size])
    initial_state = policy.get_initial_state(batch_size)
    distribution_step = policy.distribution(
        sample_time_step, policy_state=initial_state)

    def _class_of(dist):
        return dist.__class__

    return nest.map_structure(_class_of, distribution_step.action)
def _action(self, time_step, policy_state, seed):
    """Sample a random action, using the action mask when a splitter is set.

    Args:
      time_step: Current time step. Its `observation` is passed to the
        constraint splitter when one is configured, and (when not None) all
        of its tensors become control dependencies of the action.
      policy_state: Passed through unchanged into the returned step.
      seed: Forwarded to `tensor_spec.sample_spec_nest` in the unmasked
        branch; the masked branch does not use it.

    Returns:
      A `PolicyStep` holding the action and `policy_state`; when
      `self.emit_log_probability` is set, its info carries the action's log
      probability.
    """
    splitter = self.observation_and_action_constraint_splitter
    has_mask = splitter is not None

    if has_mask:
        _, mask = splitter(time_step.observation)

        # Zero logits with the same shape as the mask.
        zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
        masked_categorical = masked.MaskedCategorical(zero_logits, mask)

        # Shift the sample by the spec minimum and cast to the spec dtype.
        shifted_sample = tf.cast(
            masked_categorical.sample() + self.action_spec.minimum,
            self.action_spec.dtype)

        # Reshaped to accommodate scalar action spaces.
        # NOTE(review): the target shape [1] only fits a single sampled
        # element -- confirm this path is never used with larger batches.
        action_ = tf.reshape(shifted_sample, [1])

        # If the action spec says each action should be shaped (1,), add
        # another dimension so the final shape is (B, 1) rather than (B,).
        if self.action_spec.shape.rank == 1:
            action_ = tf.expand_dims(action_, axis=-1)
    else:
        outer_dims = nest_utils.get_outer_shape(time_step,
                                                self._time_step_spec)
        action_ = tensor_spec.sample_spec_nest(
            self._action_spec, seed=seed, outer_dims=outer_dims)

    # TODO(b/78181147): Investigate why this control dependency is required.
    if time_step is not None:
        with tf.control_dependencies(tf.nest.flatten(time_step)):
            action_ = tf.nest.map_structure(tf.identity, action_)

    step = policy_step.PolicyStep(action_, policy_state)
    if not self.emit_log_probability:
        return step

    if has_mask:
        # Undo the minimum shift before asking the distribution.
        log_probability = masked_categorical.log_prob(
            action_ - self.action_spec.minimum)
    else:
        action_probability = tf.nest.map_structure(_uniform_probability,
                                                   self._action_spec)
        log_probability = tf.nest.map_structure(tf.math.log,
                                                action_probability)

    info = policy_step.PolicyInfo(log_probability=log_probability)
    return step._replace(info=info)
def create_variables(self, input_tensor_spec=None, **kwargs):
    """Force creation of the network's variables. Return output specs.

    The output specs are cached on the network: once they have been
    computed, later calls return the cached value immediately and ignore
    all arguments.

    Args:
      input_tensor_spec: (Optional). Input tensor spec to use when the
        network does not already hold one. If the network already has an
        input tensor spec, that stored spec is used and this argument is
        ignored (it does not override the stored spec).
      **kwargs: Other arguments to `network.call()`, e.g. `training=True`.

    Returns:
      Output specs - a nested spec calculated from the outputs (excluding
      any batch dimensions). If any of the output elements is a tfp
      `Distribution`, the associated spec entry returned is `None`.

    Raises:
      ValueError: If no `input_tensor_spec` is provided, and the network did
        not provide one during construction.
    """
    # Output specs computed by an earlier call are reused as-is.
    if self._network_output_spec is not None:
        return self._network_output_spec

    # A spec already stored on the network wins over the argument; the
    # argument only fills in a missing spec (and is then remembered).
    if self._input_tensor_spec is None:
        self._input_tensor_spec = input_tensor_spec
    input_tensor_spec = self._input_tensor_spec
    if input_tensor_spec is None:
        raise ValueError(
            "Unable to create_variables: no input_tensor_spec provided, and "
            "Network did not define one.")

    # Call the network once on a sampled batch of one, starting from the
    # initial state with a FIRST step type.
    random_input = tensor_spec.sample_spec_nest(
        input_tensor_spec, outer_dims=(1,))
    initial_state = self.get_initial_state(batch_size=1)
    step_type = tf.fill((1,), time_step.StepType.FIRST)
    outputs = self.__call__(
        random_input,
        step_type=step_type,
        network_state=initial_state,
        **kwargs)

    def _calc_unbatched_spec(x):
        """Return the spec of `x` without the batch dim (None for distributions)."""
        if isinstance(x, tfp.distributions.Distribution):
            return None
        return nest_utils.remove_singleton_batch_spec_dim(
            tf.type_spec_from_value(x), outer_ndim=1)

    # Only the first element of the call result is turned into specs.
    self._network_output_spec = tf.nest.map_structure(
        _calc_unbatched_spec, outputs[0])
    return self._network_output_spec
def testL2RegularizationLossWithSharedVariables(self, not_zero):
    """Check the PPO L2 regularization loss with joint actor/value networks.

    Args:
      not_zero: Factor multiplied into every L2 regularization coefficient.
    """
    policy_l2_reg = 4e-4 * not_zero
    value_function_l2_reg = 2e-4 * not_zero
    shared_vars_l2_reg = 1e-4 * not_zero

    actor_net, value_net = _create_joint_actor_value_networks(
        self._obs_spec, self._action_spec)
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=actor_net,
        value_net=value_net,
        normalize_observations=False,
        policy_l2_reg=policy_l2_reg,
        value_function_l2_reg=value_function_l2_reg,
        shared_vars_l2_reg=shared_vars_l2_reg,
    )

    # Call other loss functions to make sure trainable variables are
    # constructed.
    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)
    actions = tf.constant([[0], [1]], dtype=tf.float32)
    returns = tf.constant([1.9, 1.0], dtype=tf.float32)
    sample_action_log_probs = tf.constant([[0.9], [0.3]], dtype=tf.float32)
    advantages = tf.constant([1.9, 1.0], dtype=tf.float32)

    dummy_actor = DummyActorNet(self._obs_spec, self._action_spec)
    current_policy_distribution, unused_network_state = dummy_actor(
        time_steps.observation, time_steps.step_type, ())

    weights = tf.ones_like(advantages)
    agent.policy_gradient_loss(time_steps, actions, sample_action_log_probs,
                               advantages, current_policy_distribution,
                               weights)
    agent.value_estimation_loss(time_steps, returns, weights)

    # Now request L2 regularization loss.
    # Value function weights are [2, 1], actor net weights are [2, 1, 1, 1],
    # shared weights are [3, 1, 1, 1].
    value_sum_of_squares = 2**2 + 1
    policy_sum_of_squares = 2**2 + 1 + 1 + 1
    shared_sum_of_squares = 3**2 + 1 + 1 + 1
    expected_loss = (value_function_l2_reg * value_sum_of_squares +
                     policy_l2_reg * policy_sum_of_squares +
                     shared_vars_l2_reg * shared_sum_of_squares)

    # Make sure the network is built before we try to get variables.
    sampled_time_step = tensor_spec.sample_spec_nest(
        self._time_step_spec, outer_dims=(2, ))
    agent.policy.action(sampled_time_step)

    loss = agent.l2_regularization_loss()
    self.evaluate(tf.compat.v1.global_variables_initializer())
    loss_ = self.evaluate(loss)
    self.assertAllClose(loss_, expected_loss)
def test_auto_reset(self):
    """Check that stepping after a LAST time step starts a new episode.

    The environment is stepped with one fixed random action until it reports
    a LAST time step. The number of steps is capped so that an environment
    which never terminates fails the test instead of hanging the test run.
    """
    time_step = self.evaluate(self.random_env.reset())
    random_action = self.evaluate(
        tensor_spec.sample_spec_nest(self.action_spec, outer_dims=(1,)))

    # NOTE(review): the cap is a generous guess -- confirm it comfortably
    # exceeds the episode length expected from `self.random_env`.
    max_attempts = 500
    attempts = 0
    while not time_step.is_last() and attempts < max_attempts:
        time_step = self.evaluate(self.random_env.step(random_action))
        attempts += 1

    self.assertTrue(
        time_step.is_last(),
        'No LAST time step within %d steps.' % max_attempts)

    # The environment keeps reporting the LAST step until stepped again.
    current_time_step = self.evaluate(self.random_env.current_time_step())
    self.assertTrue(current_time_step.is_last())

    # Stepping once more is expected to yield a FIRST step.
    first_time_step = self.evaluate(self.random_env.step(random_action))
    self.assertTrue(first_time_step.is_first())
def testHandleBatchOnlyObservation(self):
    """Call a ValueRnnNetwork on a time step sampled with one outer dim."""
    batch_size = 3
    observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32,
                                                     0, 1)
    time_step_spec = ts.time_step_spec(observation_spec)
    time_step = tensor_spec.sample_spec_nest(time_step_spec,
                                             outer_dims=(batch_size, ))

    net = value_rnn_network.ValueRnnNetwork(
        observation_spec,
        conv_layer_params=[(4, 2, 2)],
        input_fc_layer_params=(5, ),
        lstm_size=(7, 5),
        output_fc_layer_params=(3, ))

    value, _ = net(time_step.observation, time_step.step_type)

    # One value per batch entry.
    self.assertEqual([batch_size], value.shape.as_list())
def testPrunes(self):
    """Check that AsNStepTransition drops an action entry named 'action2'.

    The input is sampled from a spec whose action dict holds 'action1' and
    'action2'; the converted result is compared with a copy of the input
    from which 'action2' has been deleted.
    """
    converter = data_converter.AsNStepTransition(self._data_context,
                                                 gamma=0.5)

    action_with_extra = {
        'action1': tf.TensorSpec((), tf.float32),
        'action2': tf.TensorSpec([4], tf.int32)
    }
    my_spec = self._data_context.transition_spec.replace(
        action_step=self._data_context.transition_spec.action_step.replace(
            action=action_with_extra))

    transition = tensor_spec.sample_spec_nest(my_spec, outer_dims=[2])
    converted = converter(transition)

    def _same(x):
        return x

    # Rebuild the nest so the deletion below does not touch `transition`.
    expected = tf.nest.map_structure(_same, transition)
    del expected.action_step.action['action2']

    (expected, converted) = self.evaluate((expected, converted))
    tf.nest.map_structure(self.assertAllEqual, converted, expected)
def testCreateFeedForwardCommonTowerNetworkWithEmptyArmLayers(
        self, batch_size, feature_dim, num_actions):
    """Build a common-tower network with no arm layers and check output shape.

    Args:
      batch_size: Outer dimension of the sampled input.
      feature_dim: Second argument to `create_per_arm_observation_spec`.
      num_actions: Third argument to `create_per_arm_observation_spec`; also
        the expected size of the output's last dimension.
    """
    obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
        7, feature_dim, num_actions)
    net = gafn.create_feed_forward_common_tower_network(
        obs_spec,
        global_layers=(4, 3, 2),
        arm_layers=(),
        common_layers=(7, 6, 5))

    input_nest = tensor_spec.sample_spec_nest(obs_spec,
                                              outer_dims=(batch_size, ))
    output_op, _ = net(input_nest)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    output = self.evaluate(output_op)

    expected_shape = (batch_size, num_actions)
    self.assertAllEqual(output.shape, expected_shape)
def testAgentFollowsActionSpec(self, agent_class):
    """Check that the agent's policy emits actions shaped like the spec.

    Args:
      agent_class: Agent constructor under test; called with the time step
        spec, action spec, a QNetwork and no optimizer.
    """
    agent = agent_class(
        self._time_step_spec,
        self._action_spec,
        q_network=q_network.QNetwork(self._observation_spec,
                                     self._action_spec),
        optimizer=None)
    self.assertIsNotNone(agent.policy())
    policy = agent.policy()

    observation = tensor_spec.sample_spec_nest(
        self._time_step_spec, seed=42, outer_dims=(1,))
    action_op = policy.action(observation).action

    # `tf.initialize_all_variables` is deprecated in favour of
    # `tf.global_variables_initializer`.
    self.evaluate(tf.global_variables_initializer())
    action = self.evaluate(action_op)

    # A batch dimension of 1 followed by the shape from the action spec.
    self.assertEqual([1] + self._action_spec[0].shape.as_list(),
                     list(action[0].shape))
def test_state_saved_after_step(self):
    """Check that current_time_step() matches the time step step() returned."""
    self.evaluate(self.random_env.reset())
    random_action = self.evaluate(
        tensor_spec.sample_spec_nest(self.action_spec, outer_dims=(1, )))

    stepped = self.evaluate(self.random_env.step(random_action))
    current = self.evaluate(self.random_env.current_time_step())

    # Compare the two time steps field by field.
    for field in ('step_type', 'observation', 'discount', 'reward'):
        np.testing.assert_almost_equal(getattr(stepped, field),
                                       getattr(current, field))
def testBuildsScalarContinuousActionSpace(self):
    """Build an ActorDistributionNetwork for a scalar bounded float action."""
    observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32,
                                                     0, 1)
    time_step_spec = ts.time_step_spec(observation_spec)
    time_step = tensor_spec.sample_spec_nest(time_step_spec,
                                             outer_dims=(1, ))
    # Scalar action: the spec shape is empty.
    action_spec = tensor_spec.BoundedTensorSpec((), tf.float32, 2, 3)

    net = actor_distribution_network.ActorDistributionNetwork(
        observation_spec, action_spec)
    action_distributions, _ = net(time_step.observation,
                                  time_step.step_type, ())
    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Only the batch dimension remains in the mode's shape.
    mode_shape = action_distributions.mode().shape.as_list()
    self.assertEqual([1], mode_shape)
def testBuilds(self):
    """Build a ValueRnnNetwork and check value, variable and state shapes."""
    observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32,
                                                     0, 1)
    time_step_spec = ts.time_step_spec(observation_spec)
    time_step = tensor_spec.sample_spec_nest(time_step_spec,
                                             outer_dims=(1, 3))

    net = value_rnn_network.ValueRnnNetwork(
        observation_spec,
        conv_layer_params=[(4, 2, 2)],
        input_fc_layer_params=(5, ),
        lstm_size=(7, ),
        output_fc_layer_params=(3, ))

    value, state = net(
        time_step.observation,
        step_type=time_step.step_type,
        network_state=net.get_initial_state(batch_size=1))
    self.evaluate(tf.compat.v1.global_variables_initializer())

    self.assertEqual((1, 3), value.shape)

    # Expected variable shapes, in the order the network exposes them.
    expected_variable_shapes = (
        (2, 2, 3, 4),  # Conv Net Kernel
        (4, ),  # Conv Net bias
        (64, 5),  # Fc Kernel
        (5, ),  # Fc Bias
        (5, 28),  # LSTM Cell Kernel
        (7, 28),  # LSTM Cell Recurrent Kernel
        (28, ),  # LSTM Cell Bias
        (7, 3),  # Fc Kernel
        (3, ),  # Fc Bias
        (3, 1),  # Value Shrink Kernel
        (1, ),  # Value Shrink bias
    )
    self.assertEqual(len(expected_variable_shapes), len(net.variables))
    for index, expected_shape in enumerate(expected_variable_shapes):
        self.assertEqual(expected_shape, net.variables[index].shape)

    # Assert LSTM cell is created: both state entries are (batch, 7).
    for index in (0, 1):
        self.assertEqual((1, 7), state[index].shape)
def testAgentFollowsActionSpecWithScalarAction(self, agent_class):
    """Check action shapes when the action spec is a scalar int action.

    Args:
      agent_class: Agent constructor under test; called with the time step
        spec, the scalar action spec, a QNetwork and no optimizer.
    """
    action_spec = [tensor_spec.BoundedTensorSpec((), tf.int32, 0, 1)]
    agent = agent_class(
        self._time_step_spec,
        action_spec,
        q_network=q_network.QNetwork(self._observation_spec, action_spec),
        optimizer=None)
    self.assertIsNotNone(agent.policy)
    policy = agent.policy

    observation = tensor_spec.sample_spec_nest(
        self._time_step_spec, seed=42, outer_dims=(1, ))
    action_op = policy.action(observation).action

    # `initialize_all_variables` is deprecated in favour of
    # `global_variables_initializer`.
    self.evaluate(tf.compat.v1.global_variables_initializer())
    action = self.evaluate(action_op)

    # A batch dimension of 1 followed by the (empty) shape from the spec.
    self.assertEqual([1] + action_spec[0].shape.as_list(),
                     list(action[0].shape))
def _action(self, time_step, policy_state, seed):
    """Sample a random action from the action spec.

    Args:
      time_step: Current time step. Its outer shape (relative to
        `self._time_step_spec`) sets the outer dims of the sample, and (when
        not None) its tensors become control dependencies of the action.
      policy_state: Passed through unchanged into the returned step.
      seed: Forwarded to `tensor_spec.sample_spec_nest`.

    Returns:
      A `PolicyStep` holding the action and `policy_state`; when
      `self.emit_log_probability` is set, its info carries the log of the
      per-spec `_uniform_probability` values.
    """
    batch_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)
    sampled_action = tensor_spec.sample_spec_nest(
        self._action_spec, seed=seed, outer_dims=batch_dims)

    # TODO(b/78181147): Investigate why this control dependency is required.
    if time_step is not None:
        with tf.control_dependencies(tf.nest.flatten(time_step)):
            sampled_action = tf.nest.map_structure(tf.identity,
                                                   sampled_action)

    step = policy_step.PolicyStep(sampled_action, policy_state)
    if not self.emit_log_probability:
        return step

    probabilities = tf.nest.map_structure(_uniform_probability,
                                          self._action_spec)
    log_probability = tf.nest.map_structure(tf.math.log, probabilities)
    return step._replace(
        info=policy_step.PolicyInfo(log_probability=log_probability))
def make_random_trajectory():
    """Creates a random trajectory together with the specs used to build it.

    The trajectory contains Tensors shaped `[1, 6, ...]` where `1` is the
    batch and `6` is the number of time steps.

    Observations are unbounded but actions are bounded to take values within
    `[1, 2]`.

    Policy info is also provided. It is sampled from the same bounded action
    spec as the actions, but in a separate sampling call. It can be removed
    via:

    ```python
    traj, time_step_spec, action_spec = make_random_trajectory()
    traj = traj.clone(policy_info=())
    ```

    Returns:
      A tuple `(traj, time_step_spec, action_spec)`: the `Trajectory`, the
      time step spec its observations were sampled from, and the bounded
      action spec its actions and policy info were sampled from.
    """
    time_step_spec = ts.time_step_spec(
        tensor_spec.TensorSpec([], tf.int32, name='observation'))
    action_spec = tensor_spec.BoundedTensorSpec([],
                                                tf.int32,
                                                minimum=1,
                                                maximum=2,
                                                name='action')
    # info and policy state specs match that of TFPolicyMock.
    outer_dims = [1, 6]  # (batch_size, time)

    traj = trajectory.Trajectory(
        observation=tensor_spec.sample_spec_nest(time_step_spec.observation,
                                                 outer_dims=outer_dims),
        action=tensor_spec.sample_bounded_spec(action_spec,
                                               outer_dims=outer_dims),
        policy_info=tensor_spec.sample_bounded_spec(action_spec,
                                                    outer_dims=outer_dims),
        # Rewards are all zero.
        reward=tf.fill(outer_dims, tf.constant(0, dtype=tf.float32)),
        # step_type is F M L F M L.
        step_type=tf.reshape(tf.range(0, 6) % 3, outer_dims),
        # next_step_type is M L F M L F.
        next_step_type=tf.reshape(tf.range(1, 7) % 3, outer_dims),
        # Discounts are all one.
        discount=tf.fill(outer_dims, tf.constant(1, dtype=tf.float32)),
    )
    return traj, time_step_spec, action_spec
def testPolicySaverCompatibility(self):
    """Save an ActorPolicy built on ActorNetwork through PolicySaver."""
    observation_spec = tensor_spec.TensorSpec(shape=(100,), dtype=tf.float32)
    action_spec = tensor_spec.TensorSpec(shape=(5,), dtype=tf.float32)
    time_step_tensor_spec = ts.time_step_spec(observation_spec)

    net = ActorNetwork(observation_spec, action_spec)
    net.create_variables()
    policy = actor_policy.ActorPolicy(time_step_tensor_spec, action_spec, net)

    # Run the policy once on a sampled batch before saving.
    sample = tensor_spec.sample_spec_nest(
        time_step_tensor_spec, outer_dims=(5,))
    policy.action(sample)

    train_step = common.create_variable('train_step')
    saver = policy_saver.PolicySaver(policy, train_step=train_step)
    self.initialize_v1_variables()

    save_dir = os.path.join(FLAGS.test_tmpdir, 'sequential_layer_model')
    with self.cached_session():
        saver.save(save_dir)
def testProcessExperiencePerArmFeaturesWithMask(self):
    """Run process_experience_for_neural_agents on masked per-arm data.

    The observation spec is a `(features, mask)` tuple; the experience is
    sampled with outer dims `(7, 2)`.
    """
    mask_spec = tensor_spec.BoundedTensorSpec(
        shape=(5, ), minimum=0, maximum=1, dtype=tf.int32)
    feature_spec = {
        'global': tf.TensorSpec(shape=(4, ), dtype=tf.float32),
        'per_arm': {
            'f1': tf.TensorSpec(shape=(5, ), dtype=tf.string),
            'f2': tf.TensorSpec(shape=(5, 2), dtype=tf.int32)
        }
    }
    observation_spec = (feature_spec, mask_spec)
    time_step_spec = time_step.time_step_spec(observation_spec)

    policy_info_spec = policy_utilities.PerArmPolicyInfo(
        chosen_arm_features={
            'f1': tf.TensorSpec(shape=(), dtype=tf.string),
            'f2': tf.TensorSpec(shape=(2, ), dtype=tf.int32)
        })

    action_spec = tensor_spec.BoundedTensorSpec(
        shape=(), minimum=0, maximum=4, dtype=tf.int32)
    reward_spec = tensor_spec.BoundedTensorSpec(
        shape=(), minimum=0, maximum=2, dtype=tf.float32)
    training_data_spec = trajectory.Trajectory(
        step_type=time_step_spec.step_type,
        observation=time_step_spec.observation,
        action=action_spec,
        policy_info=policy_info_spec,
        next_step_type=time_step_spec.step_type,
        reward=reward_spec,
        discount=time_step_spec.discount)

    experience = tensor_spec.sample_spec_nest(training_data_spec,
                                              outer_dims=(7, 2))

    def _split_observation_and_mask(x):
        return (x[0], x[1])

    observation, action, reward = utils.process_experience_for_neural_agents(
        experience, _split_observation_and_mask, True, training_data_spec)

    # The first processed 'f1' feature equals the first chosen-arm feature.
    self.assertEqual(
        observation['per_arm']['f1'][0],
        experience.policy_info.chosen_arm_features['f1'][0, 0])
    # Processed actions are expected to be 14 zeros.
    self.assertAllEqual(action, tf.zeros(14, dtype=tf.int32))
    self.assertEqual(reward[0], experience.reward[0, 0])
def testNestSample(self, dtype):
    """Sample a nested spec and check every leaf against spec bounds.

    Args:
      dtype: Dtype passed to `example_nested_tensor_spec`; the test is
        skipped for `tf.string`.
    """
    if dtype == tf.string:
        self.skipTest("Not compatible with string type.")

    nested_spec = example_nested_tensor_spec(dtype)
    sample = tensor_spec.sample_spec_nest(nested_spec)
    spec_1 = tensor_spec.BoundedTensorSpec.from_spec(nested_spec["spec_1"])
    bounded_spec_1 = nested_spec["bounded_spec_1"]
    sample_ = self.evaluate(sample)

    def _assert_within(values, bounds):
        """Assert all of `values` lie in [bounds.minimum, bounds.maximum]."""
        self.assertTrue(np.all(values >= bounds.minimum))
        self.assertTrue(np.all(values <= bounds.maximum))

    # Top-level entries.
    _assert_within(sample_["spec_1"], spec_1)
    _assert_within(sample_["bounded_spec_1"], bounded_spec_1)

    # Entries nested in the dict; both are checked against spec_1's bounds.
    self.assertIn("spec_2", sample_["dict_spec"])
    _assert_within(sample_["dict_spec"]["spec_2"], spec_1)
    self.assertIn("bounded_spec_2", sample_["dict_spec"])
    _assert_within(sample_["dict_spec"]["bounded_spec_2"], spec_1)

    # Entries nested in the tuple.
    self.assertIn("tuple_spec", sample_)
    _assert_within(sample_["tuple_spec"][0], spec_1)
    _assert_within(sample_["tuple_spec"][1], bounded_spec_1)

    # Entries nested in the list, including its inner sequence.
    self.assertIn("list_spec", sample_)
    _assert_within(sample_["list_spec"][0], spec_1)
    _assert_within(sample_["list_spec"][1][0], spec_1)
    _assert_within(sample_["list_spec"][1][1], bounded_spec_1)
def test_dict_spec_and_pre_processing(self):
    """Run an EncodingNetwork with per-key Flatten layers and a Concatenate."""
    image_shape = (32, 32, 3)
    input_spec = {
        'a': tensor_spec.TensorSpec(image_shape, tf.float32),
        'b': tensor_spec.TensorSpec(image_shape, tf.float32)
    }
    preprocessing_layers = {
        'a': tf.keras.layers.Flatten(),
        'b': tf.keras.layers.Flatten()
    }
    network = encoding_network.EncodingNetwork(
        input_spec,
        preprocessing_layers=preprocessing_layers,
        fc_layer_params=(),
        preprocessing_combiner=tf.keras.layers.Concatenate(axis=-1),
        activation_fn=tf.keras.activations.tanh,
    )

    sample_input = tensor_spec.sample_spec_nest(input_spec)
    output, _ = network(sample_input)

    # 6144 is the shape from a concat of flat (32, 32, 3) x2.
    self.assertEqual((6144, ), output.shape)
def testHandlePreprocessingLayers(self):
    """Run a ValueNetwork with a preprocessing layer per observation entry."""
    batch_size = 3
    observation_spec = (tensor_spec.TensorSpec([1], tf.float32),
                        tensor_spec.TensorSpec([], tf.float32))
    observation = tensor_spec.sample_spec_nest(
        observation_spec, outer_dims=(batch_size,))

    # One layer per observation entry; the second entry is reshaped to (1,)
    # before its Dense layer.
    vector_layer = tf.keras.layers.Dense(4)
    scalar_layer = tf.keras.Sequential([
        tf.keras.layers.Reshape((1,)),
        tf.keras.layers.Dense(4)
    ])
    preprocessing_layers = (vector_layer, scalar_layer)

    net = value_network.ValueNetwork(
        observation_spec,
        preprocessing_layers=preprocessing_layers,
        preprocessing_combiner=tf.keras.layers.Add())

    value, _ = net(observation)
    self.assertEqual([batch_size], value.shape.as_list())
    self.assertGreater(len(net.trainable_variables), 4)
def create_variables(self):
    """Build the network if needed and record its variables and output spec.

    When the network is not yet built, it is called once on an input sampled
    from `self.input_tensor_spec` with an outer dimension of 1, a FIRST step
    type and `None` as the third argument. The variables found under the
    scope named `self._name` are then stored on the network, and
    `self._output_tensor_spec` is derived from the call's outputs if it is
    not already set.
    """
    # NOTE(review): the block nesting below was reconstructed from flattened
    # source; `output_tensors` only exists inside this branch, so everything
    # is kept under it -- confirm against the original layout.
    if not self.built:
        random_input = tensor_spec.sample_spec_nest(self.input_tensor_spec,
                                                    outer_dims=(1, ))
        step_type = tf.expand_dims(time_step.StepType.FIRST, 0)
        output_tensors = self.__call__(random_input, step_type, None)
        with tf.variable_scope(self._name):
            scope = tf.get_variable_scope()
            self._weights = framework.get_variables(scope=scope)
            self._trainable_weights = framework.get_trainable_variables(
                scope=scope)
            # Whatever is in the scope but not trainable.
            self._non_trainable_weights = [
                var for var in self._weights
                if var not in self._trainable_weights
            ]
        if self._output_tensor_spec is None:
            # Axis 0 (the sampled outer dimension of 1) is squeezed away
            # before each output tensor is turned into a spec.
            self._output_tensor_spec = nest.map_structure(
                lambda t: tensor_spec.TensorSpec.from_tensor(
                    tf.squeeze(t, axis=0), name=t.name), output_tensors)
def testBuilds(self):
    """Build an ActorDistributionNetwork with a float and an int action."""
    observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32,
                                                     0, 1)
    time_step_spec = ts.time_step_spec(observation_spec)
    time_step = tensor_spec.sample_spec_nest(time_step_spec, outer_dims=(1,))
    action_spec = [
        tensor_spec.BoundedTensorSpec((2,), tf.float32, 2, 3),
        tensor_spec.BoundedTensorSpec((3,), tf.int32, 0, 3),
    ]

    net = actor_distribution_network.ActorDistributionNetwork(
        observation_spec,
        action_spec,
        conv_layer_params=[(4, 2, 2)],
        fc_layer_params=(5,))
    action_distributions, _ = net(time_step.observation,
                                  time_step.step_type, ())
    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Each mode has the batch dimension followed by the action size.
    for index, action_size in enumerate((2, 3)):
        self.assertEqual(
            [1, action_size],
            action_distributions[index].mode().shape.as_list())
def testHandlePreprocessingLayers(self):
    """Run a ValueRnnNetwork with a preprocessing layer per observation entry."""
    observation_spec = (tensor_spec.TensorSpec([1], tf.float32),
                        tensor_spec.TensorSpec([], tf.float32))
    time_step_spec = ts.time_step_spec(observation_spec)
    time_step = tensor_spec.sample_spec_nest(time_step_spec,
                                             outer_dims=(2, 3))

    # One layer per observation entry; the second entry is reshaped to (1,)
    # before its Dense layer.
    vector_layer = tf.keras.layers.Dense(4)
    scalar_layer = tf.keras.Sequential([
        tf.keras.layers.Reshape((1, )),
        tf.keras.layers.Dense(4)
    ])
    preprocessing_layers = (vector_layer, scalar_layer)

    net = value_rnn_network.ValueRnnNetwork(
        observation_spec,
        preprocessing_layers=preprocessing_layers,
        preprocessing_combiner=tf.keras.layers.Add())

    initial_state = net.get_initial_state(batch_size=2)
    value, _ = net(time_step.observation, time_step.step_type, initial_state)

    # The value keeps both outer dimensions of the sampled time step.
    self.assertEqual([2, 3], value.shape.as_list())
    self.assertGreater(len(net.trainable_variables), 4)
def create_variables(self):
    """Build the network if needed and record its variables and output spec.

    When the network is not yet built, it is called once on an input sampled
    from `self.input_tensor_spec` with an outer dimension of 1, a FIRST step
    type and `None` as the third argument. Global and trainable variables
    are then looked up by the scope name `self._name` and stored on the
    network, and `self._output_tensor_spec` is derived from the call's
    outputs if it is not already set.
    """
    # NOTE(review): the block nesting below was reconstructed from flattened
    # source; `output_tensors` only exists inside this branch, so everything
    # is kept under it -- confirm against the original layout.
    if not self.built:
        random_input = tensor_spec.sample_spec_nest(self.input_tensor_spec,
                                                    outer_dims=(1, ))
        step_type = tf.expand_dims(time_step.StepType.FIRST, 0)
        output_tensors = self.__call__(random_input, step_type, None)
        with tf.compat.v1.variable_scope(self._name):
            self._weights = tf.compat.v1.get_collection(
                key=tf.compat.v1.GraphKeys.GLOBAL_VARIABLES,
                scope=self._name)
            self._trainable_weights = tf.compat.v1.trainable_variables(
                scope=self._name)
            # Whatever is in the collection but not trainable.
            self._non_trainable_weights = [
                var for var in self._weights
                if var not in self._trainable_weights
            ]
        if self._output_tensor_spec is None:
            # Axis 0 (the sampled outer dimension of 1) is squeezed away
            # before each output tensor is turned into a spec.
            self._output_tensor_spec = tf.nest.map_structure(
                lambda t: tensor_spec.TensorSpec.from_tensor(
                    tf.squeeze(t, axis=0)), output_tensors)
def test_auto_reset(self):
    """Check that stepping after a LAST time step starts a new episode."""
    time_step = self.evaluate(self.random_env.reset())
    random_action = self.evaluate(
        tensor_spec.sample_spec_nest(self.action_spec, outer_dims=(1, )))

    # With a 1/10 chance of resetting on each step, the probability of
    # failure after 500 attempts should be 0.9^500, roughly 1e-23. If we miss
    # more than 500 attempts, we can safely assume the test is broken.
    max_attempts = 500
    steps_taken = 0
    while not time_step.is_last() and steps_taken < max_attempts:
        time_step = self.evaluate(self.random_env.step(random_action))
        steps_taken += 1
    self.assertLess(steps_taken, max_attempts)
    self.assertTrue(time_step.is_last())

    # The environment keeps reporting the LAST step until stepped again.
    current_time_step = self.evaluate(self.random_env.current_time_step())
    self.assertTrue(current_time_step.is_last())

    # Stepping once more is expected to yield a FIRST step.
    first_time_step = self.evaluate(self.random_env.step(random_action))
    self.assertTrue(first_time_step.is_first())
def testHandlePreprocessingLayers(self, lstm_size, rnn_construction_fn):
    """Run an ActorDistributionRnnNetwork with preprocessing layers.

    Args:
      lstm_size: Forwarded to the network as `lstm_size`.
      rnn_construction_fn: Forwarded to the network as
        `rnn_construction_fn`; it is paired with the construction kwargs
        `{'lstm_size': 3}`.
    """
    observation_spec = (tensor_spec.TensorSpec([1], tf.float32),
                        tensor_spec.TensorSpec([], tf.float32))
    time_step_spec = ts.time_step_spec(observation_spec)
    time_step = tensor_spec.sample_spec_nest(time_step_spec,
                                             outer_dims=(3, 4))
    action_spec = [
        tensor_spec.BoundedTensorSpec((2, ), tf.float32, 2, 3),
        tensor_spec.BoundedTensorSpec((3, ), tf.int32, 0, 3),
    ]

    # One layer per observation entry; the second entry is reshaped to (1,)
    # before its Dense layer.
    vector_layer = tf.keras.layers.Dense(4)
    scalar_layer = sequential_layer.SequentialLayer([
        tf.keras.layers.Reshape((1, )),
        tf.keras.layers.Dense(4)
    ])
    preprocessing_layers = (vector_layer, scalar_layer)

    net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
        observation_spec,
        action_spec,
        preprocessing_layers=preprocessing_layers,
        lstm_size=lstm_size,
        preprocessing_combiner=tf.keras.layers.Add(),
        rnn_construction_fn=rnn_construction_fn,
        rnn_construction_kwargs={'lstm_size': 3})

    # The initial state is obtained through a policy wrapping the network.
    policy = actor_policy.ActorPolicy(time_step_spec, action_spec, net)
    initial_state = policy.get_initial_state(3)

    action_distributions, _ = net(time_step.observation,
                                  time_step.step_type, initial_state)
    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Each mode keeps both outer dims followed by the action size.
    for index, action_size in enumerate((2, 3)):
        self.assertEqual(
            [3, 4, action_size],
            action_distributions[index].mode().shape.as_list())
    self.assertGreater(len(net.trainable_variables), 4)
def testAgentTrajectoryTrain(self):
    """Train a Td3Agent on experience sampled from a trajectory spec."""
    agent = td3_agent.Td3Agent(
        self._time_step_spec,
        self._action_spec,
        critic_network=self._critic_net,
        actor_network=self._bounded_actor_net,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
    )

    step_spec = self._time_step_spec
    reward_spec = tensor_spec.BoundedTensorSpec(
        [], tf.float32, minimum=0.0, maximum=1.0, name='reward')
    trajectory_spec = trajectory.Trajectory(
        step_type=step_spec.step_type,
        observation=step_spec.observation,
        action=self._action_spec,
        policy_info=(),
        next_step_type=step_spec.step_type,
        reward=reward_spec,
        discount=step_spec.discount)

    # Experience sampled with outer dims (3, 2) is fed straight to train().
    experience = tensor_spec.sample_spec_nest(trajectory_spec,
                                              outer_dims=(3, 2))
    agent.train(experience)
def testBuildsStackedLstm(self):
    """Build a ValueRnnNetwork with two LSTM sizes and check state shapes."""
    observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.int32,
                                                     0, 1)
    time_step_spec = ts.time_step_spec(observation_spec)
    time_step = tensor_spec.sample_spec_nest(time_step_spec,
                                             outer_dims=(1, 3))

    lstm_sizes = (7, 5)
    net = value_rnn_network.ValueRnnNetwork(
        observation_spec,
        conv_layer_params=[(4, 2, 2)],
        input_fc_layer_params=(5, ),
        lstm_size=lstm_sizes,
        output_fc_layer_params=(3, ))

    _, state = net(time_step.observation, time_step.step_type)
    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Assert each LSTM cell is created: layer `i` holds two state tensors
    # shaped (batch, lstm_sizes[i]).
    for layer_index, units in enumerate(lstm_sizes):
        for part_index in (0, 1):
            self.assertEqual((1, units),
                             state[layer_index][part_index].shape)