def test_defaults(self):
  """Checks default attributes of columns from `shared_embedding_columns`.

  The categorical columns are passed in (b, a) order; the test still expects
  the shared collection / variable scope name 'aaa_bbb_shared_embedding'.
  """
  dim = 2
  cat_a = fc_lib.categorical_column_with_identity(key='aaa', num_buckets=3)
  cat_b = fc_lib.categorical_column_with_identity(key='bbb', num_buckets=3)
  emb_b, emb_a = tpu_fc.shared_embedding_columns(
      [cat_b, cat_a], dimension=dim)

  shared_name = 'aaa_bbb_shared_embedding'
  # Run the same set of checks against each (key, source column, result).
  for key, cat, emb in (('aaa', cat_a, emb_a), ('bbb', cat_b, emb_b)):
    self.assertIs(cat, emb.categorical_column)
    self.assertEqual(dim, emb.dimension)
    self.assertEqual('mean', emb.combiner)
    self.assertIsNotNone(emb.initializer)
    self.assertEqual(shared_name, emb.shared_embedding_collection_name)
    self.assertEqual(key + '_shared_embedding', emb.name)
    self.assertEqual(shared_name, emb._var_scope_name)
    self.assertEqual((dim,), emb._variable_shape)
    self.assertEqual({key: parsing_ops.VarLenFeature(dtypes.int64)},
                     emb._parse_example_spec)
def test_defaults(self):
  """Default-argument behaviour of `tpu_fc.shared_embedding_columns`."""
  source_a = fc_lib.categorical_column_with_identity(key='aaa', num_buckets=3)
  source_b = fc_lib.categorical_column_with_identity(key='bbb', num_buckets=3)
  expected_dim = 2
  # Inputs are given, and outputs unpacked, in (b, a) order.
  shared_b, shared_a = tpu_fc.shared_embedding_columns(
      [source_b, source_a], dimension=expected_dim)

  self.assertIs(source_a, shared_a.categorical_column)
  self.assertIs(source_b, shared_b.categorical_column)
  self.assertIsNotNone(shared_a.initializer)
  self.assertIsNotNone(shared_b.initializer)

  scope = 'aaa_bbb_shared_embedding'
  sparse_int64 = parsing_ops.VarLenFeature(dtypes.int64)
  # (attribute name, expected value on column a, expected value on column b)
  expectations = (
      ('dimension', expected_dim, expected_dim),
      ('combiner', 'mean', 'mean'),
      ('shared_embedding_collection_name', scope, scope),
      ('name', 'aaa_shared_embedding', 'bbb_shared_embedding'),
      ('_var_scope_name', scope, scope),
      ('_variable_shape', (expected_dim,), (expected_dim,)),
      ('_parse_example_spec', {'aaa': sparse_int64}, {'bbb': sparse_int64}),
  )
  for attr, want_a, want_b in expectations:
    self.assertEqual(want_a, getattr(shared_a, attr))
    self.assertEqual(want_b, getattr(shared_b, attr))
def test_defaults(self):
  """Checks default attributes of columns from `shared_embedding_columns_v2`.

  The categorical columns are passed in (b, a) order; the test still expects
  the embedding variable name 'aaa_bbb_shared_embedding'.
  """
  vocabulary_size = 3
  categorical_column_a = fc_lib.categorical_column_with_identity(
      key='aaa', num_buckets=vocabulary_size)
  categorical_column_b = fc_lib.categorical_column_with_identity(
      key='bbb', num_buckets=vocabulary_size)
  embedding_dimension = 2
  embedding_column_b, embedding_column_a = tpu_fc.shared_embedding_columns_v2(
      [categorical_column_b, categorical_column_a],
      dimension=embedding_dimension)
  self.assertIs(categorical_column_a, embedding_column_a.categorical_column)
  self.assertIs(categorical_column_b, embedding_column_b.categorical_column)
  # Both columns must report the table size (column b was not checked before).
  self.assertEqual((vocabulary_size, embedding_dimension),
                   embedding_column_a.get_embedding_table_size())
  self.assertEqual((vocabulary_size, embedding_dimension),
                   embedding_column_b.get_embedding_table_size())
  self.assertEqual('mean', embedding_column_a.combiner)
  self.assertEqual('mean', embedding_column_b.combiner)
  self.assertIsNotNone(embedding_column_a.get_initializer())
  self.assertIsNotNone(embedding_column_b.get_initializer())
  self.assertEqual('aaa_bbb_shared_embedding',
                   embedding_column_a.get_embedding_var_name())
  self.assertEqual('aaa_bbb_shared_embedding',
                   embedding_column_b.get_embedding_var_name())
  self.assertEqual('aaa_shared_embedding', embedding_column_a.name)
  self.assertEqual('bbb_shared_embedding', embedding_column_b.name)
  self.assertEqual((embedding_dimension,), embedding_column_a.variable_shape)
  self.assertEqual((embedding_dimension,), embedding_column_b.variable_shape)
def test_all_constructor_args(self):
  """Checks that explicit arguments to `shared_embedding_columns_v2` stick.

  Passes a custom combiner, initializer and shared collection name, then
  verifies each is reported back by both returned columns.
  """
  vocabulary_size = 3
  categorical_column_a = fc_lib.categorical_column_with_identity(
      key='aaa', num_buckets=vocabulary_size)
  categorical_column_b = fc_lib.categorical_column_with_identity(
      key='bbb', num_buckets=vocabulary_size)
  embedding_dimension = 2
  embedding_column_a, embedding_column_b = tpu_fc.shared_embedding_columns_v2(
      [categorical_column_a, categorical_column_b],
      dimension=embedding_dimension,
      combiner='my_combiner',
      initializer=lambda: 'my_initializer',
      shared_embedding_collection_name='var_scope_name')
  self.assertIs(categorical_column_a, embedding_column_a.categorical_column)
  self.assertIs(categorical_column_b, embedding_column_b.categorical_column)
  # Both columns must report the table size (column b was not checked before).
  self.assertEqual((vocabulary_size, embedding_dimension),
                   embedding_column_a.get_embedding_table_size())
  self.assertEqual((vocabulary_size, embedding_dimension),
                   embedding_column_b.get_embedding_table_size())
  self.assertEqual('my_combiner', embedding_column_a.combiner)
  self.assertEqual('my_combiner', embedding_column_b.combiner)
  # The initializer is returned uncalled; invoke it to compare its result.
  self.assertEqual('my_initializer', embedding_column_a.get_initializer()())
  self.assertEqual('my_initializer', embedding_column_b.get_initializer()())
  self.assertEqual('var_scope_name',
                   embedding_column_a.get_embedding_var_name())
  self.assertEqual('var_scope_name',
                   embedding_column_b.get_embedding_var_name())
  self.assertEqual('aaa_shared_embedding', embedding_column_a.name)
  self.assertEqual('bbb_shared_embedding', embedding_column_b.name)
  self.assertEqual((embedding_dimension,), embedding_column_a.variable_shape)
  self.assertEqual((embedding_dimension,), embedding_column_b.variable_shape)
def test_deepcopy(self):
  """Deep-copied shared columns keep `_shared_embedding_collection_name`."""
  num_ids = 3
  cat_a = fc_lib.categorical_column_with_identity(
      key='aaa', num_buckets=num_ids)
  cat_b = fc_lib.categorical_column_with_identity(
      key='bbb', num_buckets=num_ids)
  dim = 2
  originals = tpu_fc.shared_embedding_columns_v2(
      [cat_b, cat_a], dimension=dim)
  copies = copy.deepcopy(originals)

  def _collection_names(cols):
    # Collects the private collection name of every column, in order.
    return [col._shared_embedding_collection_name for col in cols]

  self.assertEqual(_collection_names(originals), _collection_names(copies))
def test_with_scope_validation(self):
  """With scope validation enabled, reusing a column in a new scope raises."""
  id_column = fc_lib.categorical_column_with_identity(
      key='aaa', num_buckets=3)
  dim = 2
  column = tpu_fc._TPUEmbeddingColumnV2(
      categorical_column=id_column,
      dimension=dim,
      combiner='mean',
      initializer=init_ops.truncated_normal_initializer(mean=0.0, stddev=.5),
      max_sequence_length=0,
      learning_rate_fn=None,
      use_safe_embedding_lookup=True,
      bypass_scope_validation=False)
  self.assertIs(id_column, column.categorical_column)
  self.assertEqual(dim, column.dimension)

  manager = _TestStateManager()
  with tpu_function.tpu_shard_context(1):
    # The first create_state call, under 'tower1/scope1', is expected to
    # succeed.
    with variable_scope.variable_scope('tower1/scope1'):
      column.create_state(manager)
    # With default scope validation, the same column cannot be used in a new
    # variable scope.
    with variable_scope.variable_scope('tower2/scope2'), \
        self.assertRaisesRegex(
            ValueError, 'the variable scope name is different'):
      column.create_state(manager)
def test_get_dense_tensor(self):
  """Looks up embeddings for a sparse input through `_get_dense_tensor`.

  Builds a TPU embedding column with a fixed-value initializer, feeds a
  4-example sparse input, and checks both the created variable and the
  'mean'-combined lookups.
  """
  # Inputs.
  vocabulary_size = 3
  sparse_input = sparse_tensor.SparseTensorValue(
      # example 0, ids [2]
      # example 1, ids [0, 1]
      # example 2, ids []
      # example 3, ids [1]
      indices=((0, 0), (1, 0), (1, 4), (3, 0)),
      values=(2, 0, 1, 1),
      dense_shape=(4, 5))

  # Embedding variable.
  embedding_dimension = 2
  embedding_values = (
      (1., 2.),  # id 0
      (3., 5.),  # id 1
      (7., 11.)  # id 2
  )

  def _initializer(shape, dtype, partition_info):
    # Validates the arguments the column passes in, then returns the table.
    self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
    self.assertEqual(dtypes.float32, dtype)
    self.assertIsNone(partition_info)
    return embedding_values

  # Expected lookup result, using combiner='mean'.
  expected_lookups = (
      # example 0, ids [2], embedding = [7, 11]
      (7., 11.),
      # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
      (2., 3.5),
      # example 2, ids [], embedding = [0, 0]
      (0., 0.),
      # example 3, ids [1], embedding = [3, 5]
      (3., 5.),
  )

  # Build columns.
  categorical_column = fc_lib.categorical_column_with_identity(
      key='aaa', num_buckets=vocabulary_size)
  embedding_column = tpu_fc.embedding_column(
      categorical_column,
      dimension=embedding_dimension,
      initializer=_initializer)

  # Provide sparse input and get dense result.
  embedding_lookup = embedding_column._get_dense_tensor(
      fc._LazyBuilder({'aaa': sparse_input}))

  # Assert expected embedding variable and lookups.
  global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
  # assertCountEqual is the Python 3 name of the order-insensitive check.
  self.assertCountEqual(('embedding_weights:0',),
                        tuple(v.name for v in global_vars))
  with _initialized_session():
    # Evaluate explicitly so concrete values, not graph objects, are compared.
    self.assertAllEqual(embedding_values, global_vars[0].eval())
    self.assertAllEqual(expected_lookups, embedding_lookup.eval())
def test_get_dense_tensor(self):
  """Looks up embeddings for a sparse input through `_get_dense_tensor`.

  Builds a TPU embedding column with a fixed-value initializer, feeds a
  4-example sparse input, and checks both the created variable and the
  'mean'-combined lookups.
  """
  # Inputs.
  vocabulary_size = 3
  sparse_input = sparse_tensor.SparseTensorValue(
      # example 0, ids [2]
      # example 1, ids [0, 1]
      # example 2, ids []
      # example 3, ids [1]
      indices=((0, 0), (1, 0), (1, 4), (3, 0)),
      values=(2, 0, 1, 1),
      dense_shape=(4, 5))

  # Embedding variable.
  embedding_dimension = 2
  embedding_values = (
      (1., 2.),  # id 0
      (3., 5.),  # id 1
      (7., 11.)  # id 2
  )

  def _initializer(shape, dtype, partition_info):
    # Validates the arguments the column passes in, then returns the table.
    self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
    self.assertEqual(dtypes.float32, dtype)
    self.assertIsNone(partition_info)
    return embedding_values

  # Expected lookup result, using combiner='mean'.
  expected_lookups = (
      # example 0, ids [2], embedding = [7, 11]
      (7., 11.),
      # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
      (2., 3.5),
      # example 2, ids [], embedding = [0, 0]
      (0., 0.),
      # example 3, ids [1], embedding = [3, 5]
      (3., 5.),
  )

  # Build columns.
  categorical_column = fc_lib.categorical_column_with_identity(
      key='aaa', num_buckets=vocabulary_size)
  embedding_column = tpu_fc.embedding_column(
      categorical_column,
      dimension=embedding_dimension,
      initializer=_initializer)

  # Provide sparse input and get dense result.
  embedding_lookup = embedding_column._get_dense_tensor(
      fc._LazyBuilder({'aaa': sparse_input}))

  # Assert expected embedding variable and lookups.
  global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
  # assertCountEqual is the Python 3 name of the order-insensitive check.
  self.assertCountEqual(('embedding_weights:0',),
                        tuple(v.name for v in global_vars))
  with _initialized_session():
    self.assertAllEqual(embedding_values, global_vars[0].eval())
    self.assertAllEqual(expected_lookups, embedding_lookup.eval())
def test_error_dense_shape_invalid(self):
  """A three-element `tensor_core_shape` is rejected with a ValueError."""
  categorical_column_input = fc_lib.categorical_column_with_identity(
      key='inp', num_buckets=5)
  # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias.
  with self.assertRaisesRegex(ValueError, 'tensor_core_shape must be size 2'):
    tpu_fc.shared_embedding_columns_v2([categorical_column_input],
                                       dimension=20,
                                       tensor_core_shape=[None, 20, 15])
def test_deepcopy(self):
  """A deep copy keeps `dimension` and `_max_sequence_length`."""
  id_column = fc_lib.categorical_column_with_identity(
      key='aaa', num_buckets=3)
  original = tpu_fc.embedding_column_v2(id_column, dimension=2)
  clone = copy.deepcopy(original)
  # Compare each attribute of interest between the original and its copy.
  for attr in ('dimension', '_max_sequence_length'):
    self.assertEqual(getattr(original, attr), getattr(clone, attr))
def test_invalid_cases(self, shared):
  """Checks that unsupported lookup-device / mode combinations raise.

  Args:
    shared: If truthy, columns are built with `shared_embedding_columns_v2`;
      otherwise with `embedding_column_v2`.
  """
  # Inputs.
  input_sparse_tensor = sparse_tensor.SparseTensorValue(
      indices=((0, 0), (1, 0), (1, 1), (1, 4)),
      values=(2, 0, 1, 3),
      dense_shape=(2, 5))
  input_features = {'inp': input_sparse_tensor}

  # Build columns.
  categorical_column_input = fc_lib.categorical_column_with_identity(
      key='inp', num_buckets=3)

  def _build_column(lookup_device):
    # Builds the shared or non-shared column for the given lookup device.
    if shared:
      return tpu_fc.shared_embedding_columns_v2(
          [categorical_column_input],
          dimension=2,
          embedding_lookup_device=lookup_device,
          tensor_core_shape=[None, 3])
    return tpu_fc.embedding_column_v2(
        categorical_column_input,
        dimension=2,
        embedding_lookup_device=lookup_device,
        tensor_core_shape=[None, 3])

  # Training on TPU with cpu embedding lookups is not supported.
  embedding_column = _build_column('cpu')
  dense_features = fc_lib.DenseFeatures(embedding_column)
  with self.assertRaisesRegex(
      ValueError,
      r'.*embedding_lookup_device=\"cpu\" during training is not'):
    dense_features(input_features)

  # Inference with TPU Embedding Hardware is not supported.
  embedding_column = _build_column('tpu_embedding_core')
  context = tpu._TPUInferenceContext('tpu_inference')
  context.Enter()
  try:
    dense_features = fc_lib.DenseFeatures(embedding_column)
    with self.assertRaisesRegex(
        ValueError,
        r'Using embedding_lookup_device=tpu_embedding_core during inference is '
    ):
      dense_features(input_features)
  finally:
    # Always leave the inference context, even when the assertion fails.
    context.Exit()
def get_feature_columns():
  """Returns a one-element list with a TPU embedding column for `KEY_NAME`.

  The column wraps an identity categorical column with `BUCKET_SIZE` buckets
  and uses dimension `EMBEDDING_DIM`, the 'mean' combiner and a zeros
  initializer.
  """
  zeros = tf.zeros_initializer()
  id_column = fc_lib.categorical_column_with_identity(
      key=KEY_NAME, num_buckets=BUCKET_SIZE)
  return [
      tpu_fc.embedding_column_v2(
          id_column,
          dimension=EMBEDDING_DIM,
          combiner='mean',
          initializer=zeros),
  ]
def test_defaults(self):
  """Default attributes of a column built by `embedding_column_v2`."""
  dim = 2
  id_column = fc_lib.categorical_column_with_identity(
      key='aaa', num_buckets=3)
  column = tpu_fc.embedding_column_v2(id_column, dimension=dim)
  # Can't test default initializer as it's a random function.
  self.assertIs(id_column, column.categorical_column)
  self.assertEqual(dim, column.dimension)
  self.assertEqual('mean', column.combiner)
  self.assertEqual('aaa_embedding', column.name)
  self.assertEqual((dim,), column.variable_shape)
def test_all_constructor_args(self):
  """Explicit arguments to `shared_embedding_columns` are reported back."""
  dim = 2
  cat_a = fc_lib.categorical_column_with_identity(key='aaa', num_buckets=3)
  cat_b = fc_lib.categorical_column_with_identity(key='bbb', num_buckets=3)
  emb_a, emb_b = tpu_fc.shared_embedding_columns(
      [cat_a, cat_b],
      dimension=dim,
      combiner='my_combiner',
      initializer=lambda: 'my_initializer',
      shared_embedding_collection_name='var_scope_name')

  # Run the same set of checks against each (key, source column, result).
  for key, cat, emb in (('aaa', cat_a, emb_a), ('bbb', cat_b, emb_b)):
    self.assertIs(cat, emb.categorical_column)
    self.assertEqual(dim, emb.dimension)
    self.assertEqual('my_combiner', emb.combiner)
    self.assertEqual('my_initializer', emb.initializer())
    self.assertEqual('var_scope_name', emb.shared_embedding_collection_name)
    self.assertEqual(key + '_shared_embedding', emb.name)
    self.assertEqual('var_scope_name', emb._var_scope_name)
    self.assertEqual((dim,), emb._variable_shape)
    self.assertEqual({key: parsing_ops.VarLenFeature(dtypes.int64)},
                     emb._parse_example_spec)
def test_defaults(self):
  """Default attributes of a column built by `tpu_fc.embedding_column`."""
  dim = 2
  source = fc_lib.categorical_column_with_identity(key='aaa', num_buckets=3)
  column = tpu_fc.embedding_column(source, dimension=dim)

  self.assertIs(source, column.categorical_column)
  self.assertEqual(dim, column.dimension)
  self.assertEqual('mean', column.combiner)
  # Both the public name and the private variable scope name are checked.
  self.assertEqual('aaa_embedding', column.name)
  self.assertEqual('aaa_embedding', column._var_scope_name)
  self.assertEqual((dim,), column._variable_shape)
  self.assertEqual({'aaa': parsing_ops.VarLenFeature(dtypes.int64)},
                   column._parse_example_spec)
def test_defaults(self):
  """Verifies what `tpu_fc.embedding_column` produces with default options."""
  id_column = fc_lib.categorical_column_with_identity(
      key='aaa', num_buckets=3)
  expected_dim = 2
  embedded = tpu_fc.embedding_column(id_column, dimension=expected_dim)
  self.assertIs(id_column, embedded.categorical_column)

  # (attribute name, expected value), checked in this order.
  expectations = (
      ('dimension', expected_dim),
      ('combiner', 'mean'),
      ('name', 'aaa_embedding'),
      ('_var_scope_name', 'aaa_embedding'),
      ('_variable_shape', (expected_dim,)),
      ('_parse_example_spec',
       {'aaa': parsing_ops.VarLenFeature(dtypes.int64)}),
  )
  for attr, want in expectations:
    self.assertEqual(want, getattr(embedded, attr))
def test_all_constructor_args(self):
  """Checks `shared_embedding_columns` with every argument given explicitly."""
  source_a = fc_lib.categorical_column_with_identity(key='aaa', num_buckets=3)
  source_b = fc_lib.categorical_column_with_identity(key='bbb', num_buckets=3)
  expected_dim = 2
  shared_a, shared_b = tpu_fc.shared_embedding_columns(
      [source_a, source_b],
      dimension=expected_dim,
      combiner='my_combiner',
      initializer=lambda: 'my_initializer',
      shared_embedding_collection_name='var_scope_name')

  self.assertIs(source_a, shared_a.categorical_column)
  self.assertIs(source_b, shared_b.categorical_column)
  # The initializer is stored uncalled; invoke it to compare its result.
  self.assertEqual('my_initializer', shared_a.initializer())
  self.assertEqual('my_initializer', shared_b.initializer())

  scope = 'var_scope_name'
  sparse_int64 = parsing_ops.VarLenFeature(dtypes.int64)
  # (attribute name, expected value on column a, expected value on column b)
  expectations = (
      ('dimension', expected_dim, expected_dim),
      ('combiner', 'my_combiner', 'my_combiner'),
      ('shared_embedding_collection_name', scope, scope),
      ('name', 'aaa_shared_embedding', 'bbb_shared_embedding'),
      ('_var_scope_name', scope, scope),
      ('_variable_shape', (expected_dim,), (expected_dim,)),
      ('_parse_example_spec', {'aaa': sparse_int64}, {'bbb': sparse_int64}),
  )
  for attr, want_a, want_b in expectations:
    self.assertEqual(want_a, getattr(shared_a, attr))
    self.assertEqual(want_b, getattr(shared_b, attr))
def test_custom_column(self):
  """`tpu_fc.embedding_column` accepts a 10-bucket identity column."""
  # This column is not in any allowlist but should succeed because
  # it inherits from V2 CategoricalColumn.
  source = fc_lib.categorical_column_with_identity(key='aaa', num_buckets=10)
  dim = 2
  column = tpu_fc.embedding_column(source, dimension=dim)

  self.assertIs(source, column.categorical_column)
  self.assertEqual(dim, column.dimension)
  self.assertEqual('mean', column.combiner)
  for actual_name in (column.name, column._var_scope_name):
    self.assertEqual('aaa_embedding', actual_name)
  self.assertEqual((dim,), column._variable_shape)
  self.assertEqual({'aaa': parsing_ops.VarLenFeature(dtypes.int64)},
                   column._parse_example_spec)
def sequence_categorical_column_with_identity(
    key, num_buckets, default_value=None):
  """Builds a sequence feature column over integer ids.

  The result wraps `fc.categorical_column_with_identity` in a
  `fc.SequenceCategoricalColumn`. Pass it to `embedding_column` or
  `indicator_column` to turn sequence categorical data into a dense
  representation for a sequence model such as an RNN.

  Example:

  ```python
  watches = sequence_categorical_column_with_identity(
      'watches', num_buckets=1000)
  watches_embedding = embedding_column(watches, dimension=10)
  columns = [watches_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    num_buckets: Range of inputs; values are expected to lie in
      `[0, num_buckets)`.
    default_value: If `None`, this column's graph operations will fail for
      out-of-range inputs. Otherwise it must be in `[0, num_buckets)` and
      replaces out-of-range inputs.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: if `num_buckets` is less than one.
    ValueError: if `default_value` is not in range `[0, num_buckets)`.
  """
  identity_column = fc.categorical_column_with_identity(
      key=key, num_buckets=num_buckets, default_value=default_value)
  return fc.SequenceCategoricalColumn(identity_column)
def sequence_categorical_column_with_identity(
    key,
    num_buckets,
    default_value=None):
  """Returns a sequence categorical column whose ids are the inputs themselves.

  An identity categorical column is created from the arguments and wrapped in
  `fc.SequenceCategoricalColumn`. Feed the result to `embedding_column` or
  `indicator_column` to obtain a dense representation suitable as input to a
  sequence NN, such as an RNN.

  Example:

  ```python
  watches = sequence_categorical_column_with_identity(
      'watches', num_buckets=1000)
  watches_embedding = embedding_column(watches, dimension=10)
  columns = [watches_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    num_buckets: Range of inputs. Namely, inputs are expected to be in the
      range `[0, num_buckets)`.
    default_value: If `None`, this column's graph operations will fail for
      out-of-range inputs. Otherwise, this value must be in the range
      `[0, num_buckets)`, and will replace out-of-range inputs.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: if `num_buckets` is less than one.
    ValueError: if `default_value` is not in range `[0, num_buckets)`.
  """
  wrapped = fc.categorical_column_with_identity(
      key=key,
      num_buckets=num_buckets,
      default_value=default_value)
  sequence_column = fc.SequenceCategoricalColumn(wrapped)
  return sequence_column
def test_all_constructor_args(self):
  """Explicit combiner/initializer given to `embedding_column_v2` are kept."""
  dim = 2
  source = fc_lib.categorical_column_with_identity(key='aaa', num_buckets=3)
  column = tpu_fc.embedding_column_v2(
      source,
      dimension=dim,
      combiner='my_combiner',
      initializer=lambda: 'my_initializer')

  self.assertIs(source, column.categorical_column)
  self.assertEqual(dim, column.dimension)
  self.assertEqual('my_combiner', column.combiner)
  # The initializer is stored uncalled; invoke it to compare its result.
  self.assertEqual('my_initializer', column.initializer())
  self.assertEqual('aaa_embedding', column.name)
  self.assertEqual((dim,), column.variable_shape)
  self.assertEqual({'aaa': parsing_ops.VarLenFeature(dtypes.int64)},
                   column._parse_example_spec)
def test_all_constructor_args(self):
  """Checks `tpu_fc.embedding_column` with combiner and initializer given."""
  id_column = fc_lib.categorical_column_with_identity(
      key='aaa', num_buckets=3)
  expected_dim = 2
  embedded = tpu_fc.embedding_column(
      id_column,
      dimension=expected_dim,
      combiner='my_combiner',
      initializer=lambda: 'my_initializer')
  self.assertIs(id_column, embedded.categorical_column)

  # (attribute name, expected value), checked in this order.
  expectations = (
      ('dimension', expected_dim),
      ('combiner', 'my_combiner'),
      ('name', 'aaa_embedding'),
      ('_var_scope_name', 'aaa_embedding'),
      ('_variable_shape', (expected_dim,)),
      ('_parse_example_spec',
       {'aaa': parsing_ops.VarLenFeature(dtypes.int64)}),
  )
  for attr, want in expectations:
    self.assertEqual(want, getattr(embedded, attr))
def test_deepcopy_with_bypass_scope_validation(self):
  """A deep copy keeps the bypass-validation and safe-lookup settings."""
  id_column = fc_lib.categorical_column_with_identity(
      key='aaa', num_buckets=3)
  dim = 2
  original = tpu_fc._TPUEmbeddingColumnV2(
      categorical_column=id_column,
      dimension=dim,
      combiner='mean',
      initializer=init_ops.truncated_normal_initializer(mean=0.0, stddev=.5),
      max_sequence_length=0,
      use_safe_embedding_lookup=False,
      bypass_scope_validation=True)
  clone = copy.deepcopy(original)

  self.assertEqual(dim, clone.dimension)
  self.assertEqual(original._max_sequence_length, clone._max_sequence_length)
  # The two flags passed to the constructor must carry over to the copy.
  self.assertTrue(clone._bypass_scope_validation)
  self.assertFalse(clone.use_safe_embedding_lookup)
def test_dense_embedding_lookup(self, shared, combiner):
  """Checks dense (tensor-core) embedding lookups inside an inference context.

  Args:
    shared: If truthy, the column is built with `shared_embedding_columns_v2`;
      otherwise with `embedding_column_v2`.
    combiner: 'mean' or 'sum' for a successful lookup; 'sqrtn' is expected to
      raise a ValueError.
  """
  # Inputs.
  vocabulary_size = 3
  input_sparse_tensor = sparse_tensor.SparseTensorValue(
      # example 0, ids [2]
      # example 1, ids [0, 1, 3]
      indices=((0, 0), (1, 0), (1, 1), (1, 4)),
      values=(2, 0, 1, 3),
      dense_shape=(2, 5))
  input_features = {'inp': input_sparse_tensor}

  # Embedding variable.
  embedding_dimension = 2
  embedding_values = (
      (1., 2.),  # id 0
      (3., 5.),  # id 1
      (7., 11.),  # id 2
      (13., 17.)  # id 3
  )

  def _initializer(shape, dtype, partition_info=None):
    # Validates the arguments the column passes in, then returns the table.
    self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
    self.assertEqual(dtypes.float32, dtype)
    self.assertIsNone(partition_info)
    return embedding_values

  # Build columns.
  categorical_column_input = fc_lib.categorical_column_with_identity(
      key='inp', num_buckets=vocabulary_size)

  # tensor_core_shape is [None, 3]: the batch dimension is left dynamic.
  # NOTE(review): presumably the second dimension (3) drops the id stored at
  # column 4 of example 1, which is why the expected lookups below only cover
  # ids [0, 1] -- confirm.
  if shared:
    embedding_column = tpu_fc.shared_embedding_columns_v2(
        [categorical_column_input],
        dimension=embedding_dimension,
        initializer=_initializer,
        combiner=combiner,
        embedding_lookup_device='tpu_tensor_core',
        tensor_core_shape=[None, 3])
  else:
    embedding_column = tpu_fc.embedding_column_v2(
        categorical_column_input,
        dimension=embedding_dimension,
        initializer=_initializer,
        combiner=combiner,
        embedding_lookup_device='tpu_tensor_core',
        tensor_core_shape=[None, 3])

  # Run in TPUInferenceContext so that we hit the intended densification case.
  context = tpu._TPUInferenceContext('tpu_inference')
  context.Enter()
  try:
    dense_features = fc_lib.DenseFeatures(embedding_column)
    # Sqrtn combiner not supported for now.
    if combiner == 'sqrtn':
      with self.assertRaisesRegex(
          ValueError, 'Dense TPU Embedding does not support combiner'):
        embedding_lookup = dense_features(input_features)
      return
    if combiner == 'mean':
      expected_lookups = (
          # example 0:
          (7., 11.),  # ids [2], embedding = [7, 11]
          # example 1:
          (2., 3.5),  # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
      )
    elif combiner == 'sum':
      expected_lookups = (
          # example 0:
          (7., 11.),  # ids [2], embedding = [7, 11]
          # example 1:
          (4., 7),  # ids [0, 1], embedding = sum([1, 2] + [3, 5]) = [4, 7]
      )
    else:
      # Fail clearly instead of hitting an unbound `expected_lookups` later.
      self.fail('Unexpected combiner: %r' % (combiner,))

    embedding_lookup = dense_features(input_features)

    # Assert expected embedding variable and lookups.
    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    if shared:
      self.assertCountEqual(('inp_shared_embedding:0',),
                            tuple(v.name for v in global_vars))
    else:
      self.assertCountEqual(
          ('dense_features/inp_embedding/embedding_weights:0',),
          tuple(v.name for v in global_vars))

    embedding_var = global_vars[0]
    with _initialized_session():
      self.assertAllEqual(embedding_values, embedding_var.eval())
      eval_res = embedding_lookup.eval()
      self.assertAllEqual(expected_lookups, eval_res)
  finally:
    # Leave the inference context on every path, including the early return
    # for 'sqrtn' and assertion failures.
    context.Exit()
def test_feature_layer_cpu(self):
  """Runs shared v2 columns through `DenseFeatures` and `SequenceFeatures`.

  Column 'aaa' is a plain categorical column and column 'bbb' a sequence
  categorical column; both come from one `shared_embedding_columns_v2` call
  and the test expects a single global variable 'aaa_bbb_shared_embedding:0'.
  """
  # Inputs.
  vocabulary_size = 3
  input_a = sparse_tensor.SparseTensorValue(
      # example 0, ids [2]
      # example 1, ids [0, 1]
      indices=((0, 0), (1, 0), (1, 1)),
      values=(2, 0, 1),
      dense_shape=(2, 2))
  input_b = sparse_tensor.SparseTensorValue(
      # example 0, ids [2]
      # example 1, ids [0, 1]
      # example 2, ids []
      indices=((0, 0), (1, 0), (1, 1)),
      values=(2, 0, 1),
      dense_shape=(3, 2))
  input_features = {'aaa': input_a, 'bbb': input_b}

  # Embedding variable.
  embedding_dimension = 2
  embedding_values = (
      (1., 2.),  # id 0
      (3., 5.),  # id 1
      (7., 11.)  # id 2
  )

  def _initializer(shape, dtype, partition_info=None):
    # Validates the arguments the column passes in, then returns the table.
    self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
    self.assertEqual(dtypes.float32, dtype)
    self.assertIsNone(partition_info)
    return embedding_values

  # Expected lookup result, using combiner='mean'.
  expected_lookups_a = (
      # example 0:
      (7., 11.),  # ids [2], embedding = [7, 11]
      # example 1:
      (2., 3.5),  # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
  )
  expected_lookups_b = (
      # example 0:
      ((7., 11.), (0., 0.),),  # ids [2], embedding = [[7, 11], [0, 0]]
      # example 1:
      ((1., 2.), (3., 5.),),  # ids [0, 1], embedding = [[1, 2], [3, 5]]
      # example 2:
      ((0., 0.), (0., 0.),),  # ids [], embedding = [[0, 0], [0, 0]]
  )

  # Build columns.
  categorical_column_a = fc_lib.categorical_column_with_identity(
      key='aaa', num_buckets=vocabulary_size)
  categorical_column_b = fc_lib.sequence_categorical_column_with_identity(
      key='bbb', num_buckets=vocabulary_size)
  embedding_column_a, embedding_column_b = tpu_fc.shared_embedding_columns_v2(
      [categorical_column_a, categorical_column_b],
      dimension=embedding_dimension,
      initializer=_initializer,
      max_sequence_lengths=[0, 2])

  # Provide sparse input and get dense result.
  dense_features = fc_lib.DenseFeatures([embedding_column_a])
  sequence_features = fc_lib.SequenceFeatures([embedding_column_b])
  embedding_lookup_a = dense_features(input_features)
  embedding_lookup_b = sequence_features(input_features)

  # Assert expected embedding variable and lookups.
  global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
  # assertCountEqual is the Python 3 name of the order-insensitive check.
  self.assertCountEqual(('aaa_bbb_shared_embedding:0',),
                        tuple(v.name for v in global_vars))
  embedding_var = global_vars[0]
  with _initialized_session():
    self.assertAllEqual(embedding_values, embedding_var.eval())
    self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
    # Only the first element of the SequenceFeatures output is compared.
    self.assertAllEqual(expected_lookups_b, embedding_lookup_b[0].eval())
def test_encode_features(self):
    """Encodes two identity columns that share one embedding table.

    Builds shared embedding columns for features "aaa" and "bbb", runs them
    through `feature_lib.encode_features` in EVAL mode, and compares the
    first global variable and both encoded tensors with fixed expectations.
    """
    with tf.Graph().as_default():
        vocabulary_size = 4
        embedding_dimension = 2

        # Rows of the shared embedding table, indexed by id.
        embedding_values = (
            (1., 2.),  # id 0
            (3., 5.),  # id 1
            (7., 11.),  # id 2
            (9., 13.)  # id 3
        )

        # Dense id inputs; -1 values are ignored.
        ids_a = np.array([
            [3, -1, -1],  # example 0, ids [3]
            [0, 1, -1],  # example 1, ids [0, 1]
        ])
        ids_b = np.array([
            [0, -1, -1],  # example 0, ids [0]
            [-1, -1, -1],  # example 1, ids []
        ])
        features = {"aaa": ids_a, "bbb": ids_b}

        # Expected lookup result, using combiner='mean'.
        want_a = (
            # example 0: ids [3], embedding = [9, 13]
            (9., 13.),
            # example 1: ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
            (2., 3.5),
        )
        want_b = (
            # example 0: ids [0], embedding = [1, 2]
            (1., 2.),
            # example 1: ids [], embedding = [0, 0]
            (0., 0.),
        )

        def _table_initializer(shape, dtype, partition_info):
            # Always yields the fixed table, whatever is requested.
            return embedding_values

        # Build columns.
        identity_a = feature_column.categorical_column_with_identity(
            key="aaa", num_buckets=vocabulary_size)
        identity_b = feature_column.categorical_column_with_identity(
            key="bbb", num_buckets=vocabulary_size)
        column_a, column_b = feature_column.shared_embedding_columns(
            [identity_a, identity_b],
            dimension=embedding_dimension,
            initializer=_table_initializer,
            shared_embedding_collection_name="custom_collection_name")

        columns_by_key = {"aaa": column_a, "bbb": column_b}
        encoded = feature_lib.encode_features(
            features,
            columns_by_key.values(),
            mode=tf.estimator.ModeKeys.EVAL)
        lookup_a = encoded[column_a]
        lookup_b = encoded[column_b]

        # Assert expected embedding variable and lookups.
        all_variables = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)
        shared_table = all_variables[0]
        with tf.compat.v1.Session() as session:
            session.run(tf.compat.v1.global_variables_initializer())
            session.run(tf.compat.v1.tables_initializer())
            self.assertAllEqual(embedding_values, shared_table.eval())
            self.assertAllEqual(want_a, lookup_a.eval())
            self.assertAllEqual(want_b, lookup_b.eval())
def test_empty_row(self):
    """Checks a tensor-core embedding lookup when one example has no ids.

    Example 0 of the sparse input is empty and is expected to produce an
    all-zero embedding; example 1 is expected to produce the mean of the
    embeddings for ids 0 and 1. The lookup is built inside a TPU inference
    context and a single-shard TPU shard context.
    """
    # Inputs.
    vocabulary_size = 3
    input_sparse_tensor = sparse_tensor.SparseTensorValue(
        # example 0, ids []
        # example 1, ids [0, 1, 3]
        indices=((1, 0), (1, 1), (1, 4)),
        values=(0, 1, 3),
        dense_shape=(2, 5))
    input_features = {'inp': input_sparse_tensor}

    # Embedding variable.
    # NOTE(review): the table has four rows although vocabulary_size is 3 -
    # confirm this is intentional.
    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 5.),  # id 1
        (7., 11.),  # id 2
        (13., 17.)  # id 3
    )

    def _initializer(shape, dtype, partition_info=None):
        # Validates the arguments the column passes to its initializer
        # before handing back the fixed table.
        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
        self.assertEqual(dtypes.float32, dtype)
        self.assertIsNone(partition_info)
        return embedding_values

    # Build columns.
    categorical_column_input = fc_lib.categorical_column_with_identity(
        key='inp', num_buckets=vocabulary_size)

    # tensor_core_shape is [None, 3], leaving the batch dimension dynamic.
    # NOTE(review): the input's dense shape is (2, 5), so this presumably
    # truncates each row to 3 columns - the expected lookups below do not
    # include the id stored at column 4. Confirm.
    embedding_column = tpu_fc.embedding_column_v2(
        categorical_column_input,
        dimension=embedding_dimension,
        initializer=_initializer,
        combiner='mean',
        embedding_lookup_device='tpu_tensor_core',
        tensor_core_shape=[None, 3])

    # Run in TPUContexts so that we hit the intended densification case.
    context = tpu._TPUInferenceContext('tpu_inference')
    context.Enter()
    try:
        with tpu_function.tpu_shard_context(1):
            dense_features = fc_lib.DenseFeatures(embedding_column)
            expected_lookups = (
                # example 0: ids [], embedding = [0, 0]
                (0., 0.),
                # example 1: ids [0, 1],
                # embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
                (2., 3.5),
            )

            embedding_lookup = dense_features(input_features)

            # Assert expected embedding variable and lookups.
            global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
            self.assertCountEqual(
                ('dense_features/inp_embedding/embedding_weights:0',),
                tuple(v.name for v in global_vars))

            embedding_var = global_vars[0]
            with _initialized_session():
                self.assertAllEqual(embedding_values, embedding_var)
                eval_res = embedding_lookup.eval()
                self.assertAllEqual(expected_lookups, eval_res)
    finally:
        # Always leave the inference context, even when an assertion above
        # fails, so the entered context does not outlive this test.
        context.Exit()
def test_get_dense_tensor(self):
    """Checks `_get_dense_tensor` on two shared embedding columns.

    Columns 'aaa' and 'bbb' are built with `tpu_fc.shared_embedding_columns`
    and fed dense id arrays. The test asserts that exactly one global
    variable named 'embedding_weights:0' exists and that both lookups match
    the hand-computed expectations below.
    """
    # Inputs.
    vocabulary_size = 3
    # -1 values are ignored.
    input_a = np.array([
        [2, -1, -1],  # example 0, ids [2]
        [0, 1, -1]  # example 1, ids [0, 1]
    ])
    input_b = np.array([
        [0, -1, -1],  # example 0, ids [0]
        [-1, -1, -1]  # example 1, ids []
    ])
    input_features = {'aaa': input_a, 'bbb': input_b}

    # Embedding variable.
    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 5.),  # id 1
        (7., 11.)  # id 2
    )

    def _initializer(shape, dtype, partition_info):
        # Validates the arguments the column passes to its initializer
        # before handing back the fixed table.
        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
        self.assertEqual(dtypes.float32, dtype)
        self.assertIsNone(partition_info)
        return embedding_values

    # Expected lookup result, using combiner='mean'.
    expected_lookups_a = (
        # example 0: ids [2], embedding = [7, 11]
        (7., 11.),
        # example 1: ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
        (2., 3.5),
    )
    expected_lookups_b = (
        # example 0: ids [0], embedding = [1, 2]
        (1., 2.),
        # example 1: ids [], embedding = [0, 0]
        (0., 0.),
    )

    # Build columns.
    categorical_column_a = fc_lib.categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    categorical_column_b = fc_lib.categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    embedding_column_a, embedding_column_b = tpu_fc.shared_embedding_columns(
        [categorical_column_a, categorical_column_b],
        dimension=embedding_dimension,
        initializer=_initializer)

    # Provide sparse input and get dense result.
    embedding_lookup_a = embedding_column_a._get_dense_tensor(
        fc._LazyBuilder(input_features))
    embedding_lookup_b = embedding_column_b._get_dense_tensor(
        fc._LazyBuilder(input_features))

    # Assert expected embedding variable and lookups.
    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    # assertCountEqual is the Python 3 name of the order-insensitive
    # comparison (assertItemsEqual is not provided by Python 3's unittest).
    self.assertCountEqual(('embedding_weights:0',),
                          tuple(v.name for v in global_vars))
    embedding_var = global_vars[0]
    with _initialized_session():
        self.assertAllEqual(embedding_values, embedding_var)
        self.assertAllEqual(expected_lookups_a, embedding_lookup_a)
        self.assertAllEqual(expected_lookups_b, embedding_lookup_b)
def test_feature_layer_cpu(self):
    """Checks v2 embedding columns through DenseFeatures and SequenceFeatures.

    The same sparse input is fed under keys 'aaa' (plain categorical column)
    and 'bbb' (sequence categorical column, max_sequence_length=2). The test
    asserts the names of the two embedding variables that get created and
    compares both lookups with the hand-computed expectations below.
    """
    # Inputs.
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        # example 2, ids []
        # example 3, ids [1]
        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(4, 2))

    # Embedding variable.
    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 5.),  # id 1
        (7., 11.)  # id 2
    )

    def _initializer(shape, dtype, partition_info=None):
        # Validates the arguments the column passes to its initializer
        # before handing back the fixed table.
        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
        self.assertEqual(dtypes.float32, dtype)
        self.assertIsNone(partition_info)
        return embedding_values

    # Expected lookup result, using combiner='mean'.
    expected_lookups = (
        # example 0, ids [2], embedding = [7, 11]
        (7., 11.),
        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
        (2., 3.5),
        # example 2, ids [], embedding = [0, 0]
        (0., 0.),
        # example 3, ids [1], embedding = [3, 5]
        (3., 5.),
    )
    # Sequence expectations hold two steps per example; steps without an id
    # are all-zero vectors.
    expected_lookups_sequence = (
        # example 0, ids [2], embedding = [[7, 11], [0, 0]]
        (
            (7., 11.),
            (0., 0.),
        ),
        # example 1, ids [0, 1], embedding = [[1, 2], [3, 5]]
        (
            (1., 2.),
            (3., 5.),
        ),
        # example 2, ids [], embedding = [[0, 0], [0, 0]]
        (
            (0., 0.),
            (0., 0.),
        ),
        # example 3, ids [1], embedding = [[3, 5], [0, 0]]
        (
            (3., 5.),
            (0., 0.),
        ),
    )

    # Build columns.
    categorical_column = fc_lib.categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    sequence_categorical_column = (
        fc_lib.sequence_categorical_column_with_identity(
            key='bbb', num_buckets=vocabulary_size))
    embedding_column = tpu_fc.embedding_column_v2(
        categorical_column,
        dimension=embedding_dimension,
        initializer=_initializer)
    sequence_embedding_column = tpu_fc.embedding_column_v2(
        sequence_categorical_column,
        dimension=embedding_dimension,
        initializer=_initializer,
        max_sequence_length=2)

    # Provide sparse input and get dense result.
    features = {'aaa': sparse_input, 'bbb': sparse_input}
    dense_features = fc_lib.DenseFeatures([embedding_column])
    sequence_features = fc_lib.SequenceFeatures([sequence_embedding_column])
    embedding_lookup = dense_features(features)
    sequence_embedding_lookup = sequence_features(features)

    # Assert expected embedding variable and lookups.
    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    # assertCountEqual is the Python 3 name of the order-insensitive
    # comparison (assertItemsEqual is not provided by Python 3's unittest).
    self.assertCountEqual(
        (
            'dense_features/aaa_embedding/embedding_weights:0',
            'sequence_features/bbb_embedding/embedding_weights:0',
        ),
        tuple(v.name for v in global_vars))
    with _initialized_session():
        # Both variables use the same initializer, so whichever one is first
        # in the collection is expected to hold embedding_values.
        self.assertAllEqual(embedding_values, global_vars[0].eval())
        self.assertAllEqual(expected_lookups, embedding_lookup.eval())
        # NOTE(review): element 0 of the SequenceFeatures result is presumably
        # the dense embedding tensor (element 1 the sequence lengths) - confirm.
        self.assertAllEqual(expected_lookups_sequence,
                            sequence_embedding_lookup[0].eval())
def test_get_dense_tensor(self):
    """Checks `_get_dense_tensor` on two shared embedding columns.

    Columns 'aaa' and 'bbb' are built with `tpu_fc.shared_embedding_columns`
    and fed dense id arrays. The test asserts that exactly one global
    variable named 'embedding_weights:0' exists and that the evaluated
    variable and lookups match the hand-computed expectations below.
    """
    # Inputs.
    vocabulary_size = 3
    # -1 values are ignored.
    input_a = np.array([
        [2, -1, -1],  # example 0, ids [2]
        [0, 1, -1]  # example 1, ids [0, 1]
    ])
    input_b = np.array([
        [0, -1, -1],  # example 0, ids [0]
        [-1, -1, -1]  # example 1, ids []
    ])
    input_features = {'aaa': input_a, 'bbb': input_b}

    # Embedding variable.
    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 5.),  # id 1
        (7., 11.)  # id 2
    )

    def _initializer(shape, dtype, partition_info):
        # Validates the arguments the column passes to its initializer
        # before handing back the fixed table.
        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
        self.assertEqual(dtypes.float32, dtype)
        self.assertIsNone(partition_info)
        return embedding_values

    # Expected lookup result, using combiner='mean'.
    expected_lookups_a = (
        # example 0: ids [2], embedding = [7, 11]
        (7., 11.),
        # example 1: ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
        (2., 3.5),
    )
    expected_lookups_b = (
        # example 0: ids [0], embedding = [1, 2]
        (1., 2.),
        # example 1: ids [], embedding = [0, 0]
        (0., 0.),
    )

    # Build columns.
    categorical_column_a = fc_lib.categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    categorical_column_b = fc_lib.categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    embedding_column_a, embedding_column_b = tpu_fc.shared_embedding_columns(
        [categorical_column_a, categorical_column_b],
        dimension=embedding_dimension,
        initializer=_initializer)

    # Provide sparse input and get dense result.
    embedding_lookup_a = embedding_column_a._get_dense_tensor(
        fc._LazyBuilder(input_features))
    embedding_lookup_b = embedding_column_b._get_dense_tensor(
        fc._LazyBuilder(input_features))

    # Assert expected embedding variable and lookups.
    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    # assertCountEqual is the Python 3 name of the order-insensitive
    # comparison (assertItemsEqual is not provided by Python 3's unittest).
    self.assertCountEqual(('embedding_weights:0',),
                          tuple(v.name for v in global_vars))
    embedding_var = global_vars[0]
    with _initialized_session():
        self.assertAllEqual(embedding_values, embedding_var.eval())
        self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
        self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())