def test_with_1d_sparse_tensor(): tf.compat.v1.reset_default_graph() #混合散列 body_style = tf.feature_column.categorical_column_with_vocabulary_list( 'name', vocabulary_list=['anna', 'gary', 'bob'],num_oov_buckets=2) #稀疏矩阵 #稠密矩阵 builder = _LazyBuilder({ 'name': ['anna', 'gary','alsa'], }) #稀疏矩阵 builder2 = _LazyBuilder({ 'name': tf.SparseTensor( indices=((0,), (1,), (2,)), values=('anna', 'gary', 'alsa'), dense_shape=(3,)), }) id_weight_pair = body_style._get_sparse_tensors(builder) # id_weight_pair2 = body_style._get_sparse_tensors(builder2) # with tf.compat.v1.Session() as sess: sess.run(lookup_ops.tables_initializer()) id_tensor_eval = id_weight_pair.id_tensor.eval() print("稀疏矩阵:\n",id_tensor_eval) id_tensor_eval2 = id_weight_pair2.id_tensor.eval() print("稀疏矩阵2:\n",id_tensor_eval2) dense_decoded = tf.sparse.to_dense( id_tensor_eval, default_value=-1).eval(session=sess) print("稠密矩阵:\n",dense_decoded)
def test_sequence_length_with_empty_rows(self):
    """Tests _sequence_length when some examples do not have ids."""
    vocabulary_size = 3
    sparse_input_a = sparse_tensor.SparseTensorValue(
        # example 0, ids []
        # example 1, ids [2]
        # example 2, ids [0, 1]
        # example 3, ids []
        # example 4, ids [1]
        # example 5, ids []
        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(6, 2))
    expected_sequence_length_a = [0, 1, 2, 0, 1, 0]
    categorical_column_a = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    sparse_input_b = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids []
        # example 2, ids []
        # example 3, ids []
        # example 4, ids [1]
        # example 5, ids [0, 1]
        indices=((0, 0), (4, 0), (5, 0), (5, 1)),
        values=(2, 1, 0, 1),
        dense_shape=(6, 2))
    expected_sequence_length_b = [1, 0, 0, 0, 1, 2]
    categorical_column_b = sfc.sequence_categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    # Two sequence columns sharing one embedding table.
    shared_embedding_columns = fc.shared_embedding_columns(
        [categorical_column_a, categorical_column_b], dimension=2)
    # _get_sequence_dense_tensor returns (dense_tensor, sequence_length);
    # only the sequence length is under test here.
    sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
        _LazyBuilder({
            'aaa': sparse_input_a
        }))[1]
    sequence_length_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
        _LazyBuilder({
            'bbb': sparse_input_b
        }))[1]
    with monitored_session.MonitoredSession() as sess:
        self.assertAllEqual(
            expected_sequence_length_a, sequence_length_a.eval(session=sess))
        self.assertAllEqual(
            expected_sequence_length_b, sequence_length_b.eval(session=sess))
def test_sequence_length_with_empty_rows(self):
    """Tests _sequence_length when some examples do not have ids."""
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids []
        # example 1, ids [2]
        # example 2, ids [0, 1]
        # example 3, ids []
        # example 4, ids [1]
        # example 5, ids []
        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(6, 2))
    expected_sequence_length = [0, 1, 2, 0, 1, 0]
    categorical_column = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    indicator_column = sfc._sequence_indicator_column(categorical_column)
    # Second element of the returned pair is the per-example sequence length.
    _, sequence_length = indicator_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))
    with monitored_session.MonitoredSession() as sess:
        self.assertAllEqual(expected_sequence_length,
                            sequence_length.eval(session=sess))
def test_transormations_called_once(self):
    """_LazyBuilder must cache: _transform_feature runs once per column."""

    class TransformCounter(fc._FeatureColumn):
        # Minimal _FeatureColumn stub that counts _transform_feature calls.

        def __init__(self):
            self.num_transform = 0

        @property
        def name(self):
            return 'TransformCounter'

        def _transform_feature(self, cache):
            self.num_transform += 1  # Count transform calls.
            return cache.get('a')

        @property
        def _parse_example_config(self):
            pass

    builder = fc._LazyBuilder(
        features={'a': constant_op.constant([[2], [3.]])})
    column = TransformCounter()
    self.assertEqual(0, column.num_transform)
    builder.get(column)
    self.assertEqual(1, column.num_transform)
    # Second get() must hit the cache — no additional transform call.
    builder.get(column)
    self.assertEqual(1, column.num_transform)
def test_categorical_column_with_vocabulary_list():
    """Demo: vocabulary-list column — sparse ids, then dense multi-hot."""
    # Four examples, two tokens each; 'A' is outside the vocabulary (-> -1).
    samples = {'color': [['R', 'R'], ['G', 'R'], ['B', 'G'], ['A', 'A']]}
    cache = _LazyBuilder(samples)
    vocab_col = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    sparse_ids = vocab_col._get_sparse_tensors(cache)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([sparse_ids.id_tensor]))
    # Indicator column converts the sparse ids to a dense multi-hot tensor.
    one_hot_col = feature_column.indicator_column(vocab_col)
    dense_out = feature_column.input_layer(samples, [one_hot_col])
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([dense_out]))
def test_shared_embedding_column_with_hash_bucket():
    """Demo: two hash-bucket columns sharing one embedding table."""
    samples = {
        'color': [[2, 2], [5, 5], [0, -1], [0, 0]],
        'color2': [[2], [5], [-1], [0]],
    }  # four examples
    cache = _LazyBuilder(samples)
    hash_col = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    hash_ids = hash_col._get_sparse_tensors(cache)
    hash_col2 = feature_column.categorical_column_with_hash_bucket(
        'color2', 7, dtype=tf.int32)
    hash_ids2 = hash_col2._get_sparse_tensors(cache)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('not use input_layer' + '_' * 40)
        print(session.run([hash_ids.id_tensor]))
        print(session.run([hash_ids2.id_tensor]))
    # Both columns look up into the same shared embedding variable.
    shared_embed = feature_column.shared_embedding_columns(
        [hash_col2, hash_col], 3, combiner='sum')
    print(type(shared_embed))
    dense_out = feature_column.input_layer(samples, shared_embed)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run(dense_out))
def test_get_sequence_dense_tensor_with_normalizer_fn(self):
    """normalizer_fn must be applied to the sparse input before densifying."""

    def _increment_two(input_sparse_tensor):
        # Adds 2 at positions (0, 0) and (1, 1) of the sparse input.
        return sparse_ops.sparse_add(
            input_sparse_tensor,
            sparse_tensor.SparseTensor(((0, 0), (1, 1)), (2.0, 2.0), (2, 2))
        )

    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, values [[0.], [1]]
        # example 1, [[10.]]
        indices=((0, 0), (0, 1), (1, 0)),
        values=(0., 1., 10.),
        dense_shape=(2, 2))
    # Before _increment_two:
    #   [[0.], [1.]],
    #   [[10.], [0.]],
    # After _increment_two:
    #   [[2.], [1.]],
    #   [[10.], [2.]],
    expected_dense_tensor = [
        [[2.], [1.]],
        [[10.], [2.]],
    ]
    numeric_column = sfc.sequence_numeric_column(
        'aaa', normalizer_fn=_increment_two)
    dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))
    with monitored_session.MonitoredSession() as sess:
        self.assertAllEqual(
            expected_dense_tensor, dense_tensor.eval(session=sess))
def test_categorical_column_with_hash_bucket():
    """Demo: int feature hashed into 7 buckets; sparse ids, then multi-hot."""
    # 1. Input features
    samples = {'color': [[2], [5], [-1], [0]]}
    cache = _LazyBuilder(samples)
    # 2. Feature columns (Sparse)
    hash_col = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    hash_ids = hash_col._get_sparse_tensors(cache)
    with tf.Session() as session:
        session.run(tf.tables_initializer())
        print(session.run([hash_ids.id_tensor]))
    # 2. Feature columns (Dense): indicator column gives a multi-hot tensor.
    one_hot_col = feature_column.indicator_column(hash_col)
    # 3. Feature tensor
    dense_out = feature_column.input_layer(samples, [one_hot_col])
    with tf.Session() as session:
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([dense_out]))
def test_weighted_categorical_column():
    """Demo: weighted categorical column — sparse ids/weights, then dense."""
    # 1. Input features
    color_data = {
        'color': [['R'], ['G'], ['B'], ['A']],
        'weight': [[1.0], [2.0], [4.0], [8.0]]
    }
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    # 2. Feature columns (Sparse): pair each id with the 'weight' feature.
    color_weight_categorical_column \
        = feature_column.weighted_categorical_column(color_column, 'weight')
    builder = _LazyBuilder(color_data)
    id_tensor, weight = color_weight_categorical_column._get_sparse_tensors(
        builder)
    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('weighted categorical' + '-' * 40)
        print(session.run([id_tensor]))
        print('-' * 40)
        print(session.run([weight]))
    # 2. Feature columns (Dense): weights are summed into the multi-hot slots.
    weighted_column = feature_column.indicator_column(
        color_weight_categorical_column)
    # 3. Feature tensor
    weighted_column_dense_tensor = feature_column.input_layer(
        color_data, [weighted_column])
    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([weighted_column_dense_tensor]))
def test_get_sequence_dense_tensor(self):
    """Sequence indicator column must one-hot each timestep, zero-padding."""
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        # example 2, ids []
        # example 3, ids [1]
        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(4, 2))
    expected_lookups = [
        # example 0, ids [2]
        [[0., 0., 1.], [0., 0., 0.]],
        # example 1, ids [0, 1]
        [[1., 0., 0.], [0., 1., 0.]],
        # example 2, ids []
        [[0., 0., 0.], [0., 0., 0.]],
        # example 3, ids [1]
        [[0., 1., 0.], [0., 0., 0.]],
    ]
    categorical_column = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    indicator_column = sfc._sequence_indicator_column(categorical_column)
    # First element of the pair is the dense [batch, time, vocab] tensor.
    indicator_tensor, _ = indicator_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))
    with monitored_session.MonitoredSession() as sess:
        self.assertAllEqual(expected_lookups,
                            indicator_tensor.eval(session=sess))
def test_embedding():
    """Demo: embed a multi-valued vocabulary column with combiner='sum'."""
    tf.set_random_seed(1)
    # 1. Input features ('A' is out-of-vocabulary -> id -1).
    samples = {'color': [['R', 'G'], ['G', 'A'], ['B', 'B'], ['A', 'A']]}
    cache = _LazyBuilder(samples)
    # 2. Feature columns (Sparse)
    vocab_col = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    vocab_ids = vocab_col._get_sparse_tensors(cache)
    with tf.Session() as session:
        session.run(tf.tables_initializer())
        print(session.run([vocab_ids.id_tensor]))
    # 2. Feature columns (Dense)
    embed_col = feature_column.embedding_column(vocab_col, 4, combiner='sum')
    # 3. Feature tensor
    dense_out = feature_column.input_layer(samples, [embed_col])
    with tf.Session() as session:
        # The embedding table is a variable and must be initialized.
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embedding' + '_' * 40)
        print(session.run([dense_out]))
def test_shared_embedding_column_with_hash_bucket():
    """Demo: 'range' and 'id' hash-bucket columns sharing an embedding."""
    # 1. Input features
    samples = {
        'range': [[2, 2], [5, 5], [0, -1], [0, 0]],
        'id': [[2], [5], [-1], [0]],
    }
    cache = _LazyBuilder(samples)
    # 2. Feature columns (Sparse)
    range_col = feature_column.categorical_column_with_hash_bucket(
        'range', 7, dtype=tf.int32)
    range_ids = range_col._get_sparse_tensors(cache)
    id_col = feature_column.categorical_column_with_hash_bucket(
        'id', 7, dtype=tf.int32)
    id_ids = id_col._get_sparse_tensors(cache)
    with tf.Session() as session:
        session.run(tf.tables_initializer())
        print('not use input_layer' + '_' * 40)
        print(session.run([range_ids.id_tensor]))
        print(session.run([id_ids.id_tensor]))
    # 2. Feature columns (Dense): one table shared by both columns.
    shared_embed = feature_column.shared_embedding_columns(
        [id_col, range_col], 3, combiner='sum')
    print(type(shared_embed))
    # 3. Feature tensor
    dense_out = feature_column.input_layer(samples, shared_embed)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run(dense_out))
def test_tensor_dtype_should_be_string_or_integer(self):
    """String and int tensors are accepted; a float tensor must raise."""
    string_fc = fc.categorical_column_with_hash_bucket(
        'a_string', 10, dtype=dtypes.string)
    int_fc = fc.categorical_column_with_hash_bucket(
        'a_int', 10, dtype=dtypes.int32)
    # NOTE(review): declared with dtype=string — presumably because a float
    # dtype would be rejected at column construction; the error under test
    # comes from feeding a float tensor at lookup time. Confirm against fc.
    float_fc = fc.categorical_column_with_hash_bucket(
        'a_float', 10, dtype=dtypes.string)
    int_tensor = sparse_tensor.SparseTensor(
        values=constant_op.constant([101]),
        indices=[[0, 0]],
        dense_shape=[1, 1])
    string_tensor = sparse_tensor.SparseTensor(
        values=constant_op.constant(['101']),
        indices=[[0, 0]],
        dense_shape=[1, 1])
    float_tensor = sparse_tensor.SparseTensor(
        values=constant_op.constant([101.]),
        indices=[[0, 0]],
        dense_shape=[1, 1])
    builder = fc._LazyBuilder({
        'a_int': int_tensor,
        'a_string': string_tensor,
        'a_float': float_tensor
    })
    builder.get(string_fc)
    builder.get(int_fc)
    with self.assertRaisesRegexp(ValueError,
                                 'dtype must be string or integer'):
        builder.get(float_fc)
def test_get_sequence_dense_tensor(self):
    """fc.indicator_column over a sequence column one-hots each timestep."""
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        # example 2, ids []
        # example 3, ids [1]
        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(4, 2))
    expected_lookups = [
        # example 0, ids [2]
        [[0., 0., 1.], [0., 0., 0.]],
        # example 1, ids [0, 1]
        [[1., 0., 0.], [0., 1., 0.]],
        # example 2, ids []
        [[0., 0., 0.], [0., 0., 0.]],
        # example 3, ids [1]
        [[0., 1., 0.], [0., 0., 0.]],
    ]
    categorical_column = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    indicator_column = fc.indicator_column(categorical_column)
    # First element of the pair is the dense [batch, time, vocab] tensor.
    indicator_tensor, _ = indicator_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))
    with monitored_session.MonitoredSession() as sess:
        self.assertAllEqual(expected_lookups,
                            indicator_tensor.eval(session=sess))
def test_weighted_cate_column():
    """Demo: weighted categorical column — sparse id/weight pair, then dense MHE."""
    # !!! id='' marks a missing token; its weight must be 0, otherwise id and
    # weight lengths disagree and an error is raised.
    # !!! Weights must be float; passing ints raises an error.
    x_values = {
        'id': [[b'a', b'z', b'a', b'c'], [b'b', b'', b'd', b'b']],
        'weight': [[1.0, 2.0, -3.0, 4.0], [5.0, 0.0, 7.0, -8.0]]
    }
    builder = _LazyBuilder(x_values)  # lazy representation of input
    # ================== define ops
    sparse_id_featcol = feature_column.categorical_column_with_vocabulary_list(
        'id', ['a', 'b', 'c'], dtype=tf.string, default_value=-1)
    sparse_featcol = feature_column.weighted_categorical_column(
        categorical_column=sparse_id_featcol, weight_feature_key='weight')
    x_sparse_tensor = sparse_featcol._get_sparse_tensors(builder)
    # indicator_column converts the sparse tensor to dense multi-hot format,
    # shape=[batch_size, #tokens]; each slot holds the sum of all weights for
    # that token.
    dense_featcol = feature_column.indicator_column(sparse_featcol)
    x_dense_tensor = feature_column.input_layer(x_values, [dense_featcol])
    # ================== run
    with tf.Session() as sess:
        # Tables must be initialized, otherwise an error is raised.
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        id_sparse_value, weight_sparse_value = sess.run(
            [x_sparse_tensor.id_tensor, x_sparse_tensor.weight_tensor])
        print("************************* sparse id tensor")
        # The id_tensor keeps the input's shape,
        # [batch_size, max_tokens_per_example] = [2, 4]:
        # SparseTensorValue(indices=array(
        #     [[0, 0],
        #      [0, 1],
        #      [0, 2],
        #      [0, 3],
        #      [1, 0],
        #      [1, 2],
        #      [1, 3]]), values=array([ 0, -1,  0,  2,  1, -1,  1]), dense_shape=array([2, 4]))
        print(id_sparse_value)
        print("************************* sparse weight tensor")
        # The weight_tensor keeps the input's shape,
        # [batch_size, max_tokens_per_example] = [2, 4]:
        # SparseTensorValue(indices=array(
        #     [[0, 0],
        #      [0, 1],
        #      [0, 2],
        #      [0, 3],
        #      [1, 0],
        #      [1, 2],
        #      [1, 3]]), values=array([ 1., 2., -3., 4., 5., 7., -8.], dtype=float32), dense_shape=array([2, 4]))
        print(weight_sparse_value)
        print("************************* dense MHE tensor")
        # indicator_column converts the sparse tensor to a dense MHE tensor,
        # shape=[batch_size, total_tokens_in_vocab]; each value is the sum of
        # the weights of that token:
        # [[-2.  0.  4.]
        #  [ 0. -3.  0.]]
        print(sess.run(x_dense_tensor))
def test_multi_value_embedding():
    """Demo: embedding lookup over a multi-valued vocabulary column."""
    samples = {
        'color': [['G', 'G'], ['G', 'B'], ['B', 'B'],
                  ['G', 'R'], ['R', 'R'], ['B', 'R']]
    }
    vocab_col = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    embed_col = feature_column.embedding_column(vocab_col, 7)
    embed_dense = feature_column.input_layer(samples, [embed_col])
    sparse_ids = vocab_col._get_sparse_tensors(_LazyBuilder(samples))
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([sparse_ids.id_tensor]))
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embeding' + '-' * 40)
        print(session.run([embed_dense]))
def test_weighted_categorical_column():
    """Demo: attach per-id weights to a vocabulary categorical column."""
    samples = {
        'color': [['R'], ['G'], ['B'], ['A']],   # four examples
        'weight': [[1.0], [2.0], [4.0], [8.0]],
    }
    vocab_col = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    weighted_col = feature_column.weighted_categorical_column(
        vocab_col, 'weight')
    cache = _LazyBuilder(samples)
    with tf.Session() as session:
        ids, weights = weighted_col._get_sparse_tensors(cache)
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('weighted categorical' + '-' * 40)
        print(session.run([ids]))
        print('-' * 40)
        print(session.run([weights]))
def test_sequence_length_with_empty_rows(self):
    """Tests _sequence_length when some examples do not have ids."""
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids []
        # example 1, ids [2]
        # example 2, ids [0, 1]
        # example 3, ids []
        # example 4, ids [1]
        # example 5, ids []
        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(6, 2))
    expected_sequence_length = [0, 1, 2, 0, 1, 0]
    categorical_column = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    indicator_column = fc.indicator_column(categorical_column)
    # Second element of the returned pair is the per-example sequence length.
    _, sequence_length = indicator_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))
    with monitored_session.MonitoredSession() as sess:
        self.assertAllEqual(
            expected_sequence_length, sequence_length.eval(session=sess))
def test_categorical_column_with_hash_bucket():
    """Demo: int feature hashed into 7 buckets; sparse ids, then multi-hot."""
    # Source data: four examples, shape [4, 1].
    samples = {'color': [[2], [5], [-1], [0]]}
    cache = _LazyBuilder(samples)
    # Declare the categorical column.
    hash_col = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    # Sparse id representation.
    hash_ids = hash_col._get_sparse_tensors(cache)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([hash_ids.id_tensor]))
    # indicator_column turns the sparse ids into a dense multi-hot tensor.
    one_hot_col = feature_column.indicator_column(hash_col)
    # input_layer wires the raw features to the declared columns.
    dense_out = feature_column.input_layer(samples, [one_hot_col])
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([dense_out]))
def test_embedding(): tf.set_random_seed(1) #源数据 color_data = { 'color': [['R', 'G'], ['G', 'A'], ['B', 'B'], ['A', 'A']] } # 4行样本 builder = _LazyBuilder(color_data) # categorical_column 要想转为 embedding 先将源数据的clomn表达为categorical_column 这里只是声明没有源数据 color_column = feature_column.categorical_column_with_vocabulary_list( 'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1) # tensor 数据源 将数据源表达成tensor color_column_tensor = color_column._get_sparse_tensors(builder) #获取embedding_column; 第一个参数是:categorical_column; 第二个参数是维度 color_embedding_column = feature_column.embedding_column(color_column, 4, combiner='sum') # 转化为tensor input_layer(数据源,column) 连接起数据源和embedding_column color_embeding_dense_tensor = feature_column.input_layer( color_data, [color_embedding_column]) with tf.Session() as session: session.run(tf.global_variables_initializer()) session.run(tf.tables_initializer()) print(session.run([color_column_tensor.id_tensor])) print('embeding' + '_' * 40) print(session.run([color_embeding_dense_tensor]))
def test_get_dense_tensor(self):
    """TPU embedding_column lookup with a deterministic initializer."""
    # Inputs.
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        # example 2, ids []
        # example 3, ids [1]
        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(4, 5))
    # Embedding variable.
    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 5.),  # id 1
        (7., 11.)  # id 2
    )

    def _initializer(shape, dtype, partition_info):
        # Sanity-check the requested variable, then return fixed values.
        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
        self.assertEqual(dtypes.float32, dtype)
        self.assertIsNone(partition_info)
        return embedding_values

    # Expected lookup result, using combiner='mean'.
    expected_lookups = (
        # example 0, ids [2], embedding = [7, 11]
        (7., 11.),
        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
        (2., 3.5),
        # example 2, ids [], embedding = [0, 0]
        (0., 0.),
        # example 3, ids [1], embedding = [3, 5]
        (3., 5.),
    )
    # Build columns.
    categorical_column = fc_lib.categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    embedding_column = tpu_fc.embedding_column(
        categorical_column,
        dimension=embedding_dimension,
        initializer=_initializer)
    # Provide sparse input and get dense result.
    embedding_lookup = embedding_column._get_dense_tensor(
        fc._LazyBuilder({
            'aaa': sparse_input
        }))
    # Assert expected embedding variable and lookups.
    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    self.assertItemsEqual(('embedding_weights:0',),
                          tuple([v.name for v in global_vars]))
    with _initialized_session():
        # NOTE(review): tensors are passed to assertAllEqual without .eval();
        # presumably the test base evaluates them in the initialized session —
        # confirm against the test framework version in use.
        self.assertAllEqual(embedding_values, global_vars[0])
        self.assertAllEqual(expected_lookups, embedding_lookup)
def test_get_dense_tensor(self):
    """TPU embedding_column lookup with a deterministic initializer."""
    # Inputs.
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        # example 2, ids []
        # example 3, ids [1]
        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(4, 5))
    # Embedding variable.
    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 5.),  # id 1
        (7., 11.)  # id 2
    )

    def _initializer(shape, dtype, partition_info):
        # Sanity-check the requested variable, then return fixed values.
        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
        self.assertEqual(dtypes.float32, dtype)
        self.assertIsNone(partition_info)
        return embedding_values

    # Expected lookup result, using combiner='mean'.
    expected_lookups = (
        # example 0, ids [2], embedding = [7, 11]
        (7., 11.),
        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
        (2., 3.5),
        # example 2, ids [], embedding = [0, 0]
        (0., 0.),
        # example 3, ids [1], embedding = [3, 5]
        (3., 5.),
    )
    # Build columns.
    categorical_column = fc_lib.categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    embedding_column = tpu_fc.embedding_column(
        categorical_column,
        dimension=embedding_dimension,
        initializer=_initializer)
    # Provide sparse input and get dense result.
    embedding_lookup = embedding_column._get_dense_tensor(
        fc._LazyBuilder({
            'aaa': sparse_input
        }))
    # Assert expected embedding variable and lookups.
    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    self.assertItemsEqual(('embedding_weights:0',),
                          tuple([v.name for v in global_vars]))
    with _initialized_session():
        self.assertAllEqual(embedding_values, global_vars[0].eval())
        self.assertAllEqual(expected_lookups, embedding_lookup.eval())
def show_column1(data: dict, feature_column):
    """Return the column's sparse id values, plus weight values if weighted.

    NOTE: the return arity differs — a single values tensor for unweighted
    columns, a (id_values, weight_values) pair for weighted ones.
    """
    cache = _LazyBuilder(data)
    id_tensor, weight = feature_column._get_sparse_tensors(cache)
    if weight is not None:
        return id_tensor.values, weight.values
    return id_tensor.values
def test_dtype_should_match_with_tensor(self):
    """A string-valued tensor must be rejected by an int64 hash column."""
    int_column = fc.categorical_column_with_hash_bucket(
        'wire', 10, dtype=dtypes.int64)
    string_values = sparse_tensor.SparseTensor(
        values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
    cache = fc._LazyBuilder({'wire': string_values})
    with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
        cache.get(int_column)
def test_get_sparse_tensors(self):
    """builder.get() must equal the column's own _get_sparse_tensors result."""
    column = fc.categorical_column_with_hash_bucket('wire', 10)
    wire_values = sparse_tensor.SparseTensor(
        values=['omar', 'stringer', 'marlo'],
        indices=[[0, 0], [1, 0], [1, 1]],
        dense_shape=[2, 2])
    cache = fc._LazyBuilder({'wire': wire_values})
    self.assertEqual(
        cache.get(column),
        column._get_sparse_tensors(cache).id_tensor)
def test_sparse_tensor_not_supported(self):
    """numeric_column must reject sparse input with a clear error."""
    numeric = fc.numeric_column('price')
    sparse_price = sparse_tensor.SparseTensor(
        indices=[[0, 0]], values=[0.3], dense_shape=[1, 1])
    cache = fc._LazyBuilder({'price': sparse_price})
    with self.assertRaisesRegexp(ValueError, 'must be a Tensor'):
        numeric._transform_feature(cache)
def test_get_dense_tensor(self):
    """builder.get() and _get_dense_tensor() agree for a normalized column."""

    def _increment_two(values):
        # Normalizer applied by the column before returning the tensor.
        return values + 2.

    column = fc.numeric_column(
        'price', shape=[2], normalizer_fn=_increment_two)
    cache = fc._LazyBuilder(
        {'price': constant_op.constant([[1., 2.], [5., 6.]])})
    self.assertEqual(cache.get(column), column._get_dense_tensor(cache))
def practise():
    """Demo: string feature hashed into 5 buckets — sparse ids and multi-hot.

    Fix: the original bound the column to a local named `fc`, shadowing the
    `fc` module alias used elsewhere in this file (e.g. `fc._LazyBuilder`);
    renamed to `hash_col` to remove the shadowing hazard. Behavior unchanged.
    """
    features = {'x': [['a', 'a'], ['b', 'c'], ['c', 'e'], ['d', ''], ['e', 'f']]}
    hash_col = feature_column.categorical_column_with_hash_bucket('x', 5)
    indicator_col = feature_column.indicator_column(hash_col)
    id_tensor = hash_col._get_sparse_tensors(_LazyBuilder(features)).id_tensor
    dense_tensor = feature_column.input_layer(features, indicator_col)
    with tf.Session() as sess:
        print(sess.run(id_tensor))
        print(sess.run(dense_tensor))
def calc_weight(self):
    """Return per-example weights, or the scalar 1. when unweighted.

    Weights are only materialized in TRAIN mode with a configured weight
    column; PREDICT and EVAL always get the constant 1.
    """
    with ops.name_scope(None, 'weights', values=self.features.values()):
        # No weight column, or not training: every example weighs 1.
        if self.weight_column is None or self.mode == model_fn.ModeKeys.PREDICT or self.mode == model_fn.ModeKeys.EVAL:
            return 1.
        # Read the weight feature as a dense numeric column.
        weight_column = feature_column_lib.numeric_column(
            key=self.weight_column)
        weights = weight_column._get_dense_tensor(
            feature_column_lib._LazyBuilder(self.features))
        weights = math_ops.to_float(weights, name='weights')
        return weights
def test_key_should_be_string_or_feature_colum(self):
    """builder.get() rejects keys that are neither str nor _FeatureColumn."""

    class NotAFeatureColumn(object):
        pass

    cache = fc._LazyBuilder(features={'a': constant_op.constant([[2], [3.]])})
    with self.assertRaisesRegexp(
        TypeError, '"key" must be either a "str" or "_FeatureColumn".'):
        cache.get(NotAFeatureColumn())
def dnn_logit_fn(features, mode):
    """Builds DNN hidden layers plus an FM-style pairwise branch; returns logits.

    NOTE(review): relies on enclosing-scope names (hidden_units, activation_fn,
    dropout, feature_columns, fm_feature_columns, input_layer_partitioner,
    matmul, concat, tf) — presumably a closure inside a model-fn builder;
    confirm against the surrounding file.
    """
    # Dense input tensor from the configured feature columns.
    with variable_scope.variable_scope(
        'input_from_feature_columns',
        values=tuple(six.itervalues(features)),
        partitioner=input_layer_partitioner):
        inputs = feature_column_lib.input_layer(
            features=features, feature_columns=feature_columns)
    dense = inputs
    # Stack of fully-connected hidden layers with optional dropout (TRAIN only).
    for layer_id, num_hidden_units in enumerate(hidden_units):
        with variable_scope.variable_scope(
            'dense_layer_%d' % layer_id,
            values=(dense, )) as hidden_layer_scope:
            dense = core_layers.dense(
                dense,
                units=num_hidden_units,
                activation=activation_fn,
                kernel_initializer=init_ops.glorot_uniform_initializer(),
                name=hidden_layer_scope)
            if dropout is not None and mode == model_fn.ModeKeys.TRAIN:
                dense = core_layers.dropout(dense, rate=dropout, training=True)
        _add_hidden_layer_summary(dense, hidden_layer_scope.name)
    # Pairwise feature-interaction ("FM") branch over fm_feature_columns.
    with variable_scope.variable_scope(
        'fm_layer', values=(inputs, )) as cross_layer_scope:
        builder = feature_column_lib._LazyBuilder(features)
        fm_outputs = []
        for col_pair in fm_feature_columns:
            column1, column2 = col_pair
            tensor1 = column1._get_dense_tensor(builder, trainable=True)
            num_elements = column1._variable_shape.num_elements()
            batch_size = array_ops.shape(tensor1)[0]
            tensor2 = column2._get_dense_tensor(builder, trainable=True)
            # Flatten both embeddings to (batch_size, num_elements).
            # NOTE(review): num_elements comes from column1 only — assumes both
            # columns in a pair have the same variable shape; confirm.
            tensor1 = array_ops.reshape(tensor1,
                                        shape=(batch_size, num_elements))
            tensor2 = array_ops.reshape(tensor2,
                                        shape=(batch_size, num_elements))
            # NOTE(review): bare matmul of two (batch, d) tensors — verify the
            # intended interaction (e.g. whether a transpose_b=True or a
            # row-wise dot product was meant).
            fm_outputs.append(matmul(tensor1, tensor2))
        fm_outputs = tf.convert_to_tensor(fm_outputs)
        _add_hidden_layer_summary(fm_outputs, cross_layer_scope.name)
    # Final logits from the concatenated DNN and FM outputs.
    with variable_scope.variable_scope(
        'logits', values=(dense, fm_outputs)) as logits_scope:
        # NOTE(review): concat on axis=1 assumes fm_outputs is rank-2 with the
        # batch dimension first — confirm the convert_to_tensor layout above.
        dense_cross = concat([dense, fm_outputs], axis=1)
        logits = core_layers.dense(
            dense_cross,
            units=1,
            activation=None,
            kernel_initializer=init_ops.glorot_uniform_initializer(),
            name=logits_scope)
    _add_hidden_layer_summary(logits, logits_scope.name)
    return logits
def test_sequence_length(self):
    """Shared-embedding sequence columns report int64 per-example lengths."""
    vocabulary_size = 3
    sparse_input_a = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        indices=((0, 0), (1, 0), (1, 1)),
        values=(2, 0, 1),
        dense_shape=(2, 2))
    expected_sequence_length_a = [1, 2]
    categorical_column_a = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    sparse_input_b = sparse_tensor.SparseTensorValue(
        # example 0, ids [0, 2]
        # example 1, ids [1]
        indices=((0, 0), (0, 1), (1, 0)),
        values=(0, 2, 1),
        dense_shape=(2, 2))
    expected_sequence_length_b = [2, 1]
    categorical_column_b = sfc.sequence_categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    # Two sequence columns sharing one embedding table.
    shared_embedding_columns = fc.shared_embedding_columns(
        [categorical_column_a, categorical_column_b], dimension=2)
    # _get_sequence_dense_tensor returns (dense_tensor, sequence_length).
    sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
        _LazyBuilder({
            'aaa': sparse_input_a
        }))[1]
    sequence_length_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
        _LazyBuilder({
            'bbb': sparse_input_b
        }))[1]
    with monitored_session.MonitoredSession() as sess:
        sequence_length_a = sess.run(sequence_length_a)
        self.assertAllEqual(expected_sequence_length_a, sequence_length_a)
        self.assertEqual(np.int64, sequence_length_a.dtype)
        sequence_length_b = sess.run(sequence_length_b)
        self.assertAllEqual(expected_sequence_length_b, sequence_length_b)
        self.assertEqual(np.int64, sequence_length_b.dtype)
def test_get_sequence_dense_tensor(self):
    """Sequence embedding column looks up each timestep, zero-padding gaps."""
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        # example 2, ids []
        # example 3, ids [1]
        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(4, 2))
    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 5.),  # id 1
        (7., 11.)  # id 2
    )

    def _initializer(shape, dtype, partition_info):
        # Sanity-check the requested variable, then return fixed values.
        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
        self.assertEqual(dtypes.float32, dtype)
        self.assertIsNone(partition_info)
        return embedding_values

    expected_lookups = [
        # example 0, ids [2]
        [[7., 11.], [0., 0.]],
        # example 1, ids [0, 1]
        [[1., 2.], [3., 5.]],
        # example 2, ids []
        [[0., 0.], [0., 0.]],
        # example 3, ids [1]
        [[3., 5.], [0., 0.]],
    ]
    categorical_column = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    embedding_column = sfc._sequence_embedding_column(
        categorical_column,
        dimension=embedding_dimension,
        initializer=_initializer)
    # First element of the pair is the dense [batch, time, dim] tensor.
    embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))
    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    self.assertItemsEqual(('embedding_weights:0', ),
                          tuple([v.name for v in global_vars]))
    with monitored_session.MonitoredSession() as sess:
        self.assertAllEqual(embedding_values,
                            global_vars[0].eval(session=sess))
        self.assertAllEqual(expected_lookups,
                            embedding_lookup.eval(session=sess))
def test_sequence_length(self):
    """_sequence_length counts ids per example for a hash-bucket column."""
    column = sfc.sequence_categorical_column_with_hash_bucket(
        'aaa', hash_bucket_size=10)
    sparse_in = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=('omar', 'stringer', 'marlo'),
        dense_shape=(2, 2))
    expected = [1, 2]
    seq_len = column._sequence_length(_LazyBuilder({'aaa': sparse_in}))
    with monitored_session.MonitoredSession() as sess:
        self.assertAllEqual(expected, seq_len.eval(session=sess))
def test_sequence_length_with_zeros(self):
    """Rows with no ids must report a sequence length of zero."""
    column = sfc.sequence_categorical_column_with_identity(
        'aaa', num_buckets=3)
    sparse_in = sparse_tensor.SparseTensorValue(
        indices=((1, 0), (3, 0), (3, 1)),
        values=(1, 2, 0),
        dense_shape=(5, 2))
    expected = [0, 1, 0, 2, 0]
    seq_len = column._sequence_length(_LazyBuilder({'aaa': sparse_in}))
    with monitored_session.MonitoredSession() as sess:
        self.assertAllEqual(expected, seq_len.eval(session=sess))
def test_get_sequence_dense_tensor(self):
  """Tests sequence embedding lookup via fc.embedding_column wrapping."""
  vocabulary_size = 3
  sparse_input = sparse_tensor.SparseTensorValue(
      # example 0, ids [2]
      # example 1, ids [0, 1]
      # example 2, ids []
      # example 3, ids [1]
      indices=((0, 0), (1, 0), (1, 1), (3, 0)),
      values=(2, 0, 1, 1),
      dense_shape=(4, 2))

  embedding_dimension = 2
  embedding_values = (
      (1., 2.),  # id 0
      (3., 5.),  # id 1
      (7., 11.)  # id 2
  )

  def _initializer(shape, dtype, partition_info):
    # Also verifies the embedding variable is created with the expected
    # shape and dtype, and without partitioning.
    self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
    self.assertEqual(dtypes.float32, dtype)
    self.assertIsNone(partition_info)
    return embedding_values

  expected_lookups = [
      # example 0, ids [2]
      [[7., 11.], [0., 0.]],
      # example 1, ids [0, 1]
      [[1., 2.], [3., 5.]],
      # example 2, ids []
      [[0., 0.], [0., 0.]],
      # example 3, ids [1]
      [[3., 5.], [0., 0.]],
  ]

  categorical_column = sfc.sequence_categorical_column_with_identity(
      key='aaa', num_buckets=vocabulary_size)
  embedding_column = fc.embedding_column(
      categorical_column, dimension=embedding_dimension,
      initializer=_initializer)

  embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
      _LazyBuilder({'aaa': sparse_input}))

  # Exactly one embedding variable should have been created.
  global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
  self.assertItemsEqual(
      ('embedding_weights:0',), tuple([v.name for v in global_vars]))
  with monitored_session.MonitoredSession() as sess:
    self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
    self.assertAllEqual(expected_lookups, embedding_lookup.eval(session=sess))
def test_sequence_length(self):
  """Checks value and dtype of _sequence_length for an identity column."""
  identity_column = sfc.sequence_categorical_column_with_identity(
      'aaa', num_buckets=3)
  sparse_ids = sparse_tensor.SparseTensorValue(
      indices=((0, 0), (1, 0), (1, 1)),
      values=(1, 2, 0),
      dense_shape=(2, 2))
  length_tensor = identity_column._sequence_length(
      _LazyBuilder({'aaa': sparse_ids}))
  with monitored_session.MonitoredSession() as sess:
    length_value = sess.run(length_tensor)
    self.assertAllEqual([1, 2], length_value)
    # Sequence lengths are produced as int64.
    self.assertEqual(np.int64, length_value.dtype)
def test_sequence_length(self):
  """Verifies _sequence_length for a vocabulary-file sequence column."""
  vocab_column = sfc.sequence_categorical_column_with_vocabulary_file(
      key='aaa',
      vocabulary_file=self._wire_vocabulary_file_name,
      vocabulary_size=self._wire_vocabulary_size)
  wire_input = sparse_tensor.SparseTensorValue(
      indices=((0, 0), (1, 0), (1, 1)),
      values=('marlo', 'skywalker', 'omar'),
      dense_shape=(2, 2))
  lengths = vocab_column._sequence_length(
      _LazyBuilder({'aaa': wire_input}))
  with monitored_session.MonitoredSession() as sess:
    # One value in example 0, two in example 1.
    self.assertAllEqual([1, 2], lengths.eval(session=sess))
def _weights(features, weight_column):
  """Fetches weights from features.

  Args:
    features: Dict of tensors keyed by feature name.
    weight_column: A string key into `features`, a `_NumericColumn`, or
      `None`. A string is converted to a `_NumericColumn` of the same key.

  Returns:
    The Python float `1.` when `weight_column` is `None`; otherwise a float
    `Tensor` of weights, cast to float and expanded via `_maybe_expand_dim`.

  Raises:
    TypeError: If `weight_column` is neither a string nor a `_NumericColumn`.
    ValueError: If the weights dtype is not floating or integer (i.e. not
      castable to float).
  """
  if weight_column is None:
    return 1.
  if isinstance(weight_column, six.string_types):
    weight_column = feature_column_lib.numeric_column(key=weight_column)
  if not isinstance(weight_column, feature_column_lib._NumericColumn):  # pylint: disable=protected-access
    raise TypeError('Weight column must be either a string or _NumericColumn. '
                    'Given type: {}.'.format(type(weight_column)))
  weights = weight_column._get_dense_tensor(  # pylint: disable=protected-access
      feature_column_lib._LazyBuilder(features))  # pylint: disable=protected-access
  if not (weights.dtype.is_floating or weights.dtype.is_integer):
    raise ValueError('Weight column should be castable to float. '
                     'Given dtype: {}'.format(weights.dtype))
  weights = _maybe_expand_dim(math_ops.to_float(weights, name='weights'))
  return weights
def test_sequence_length_with_shape(self):
  """Tests _sequence_length with shape !=(1,)."""
  values_input = sparse_tensor.SparseTensorValue(
      # example 0, values [[0.], [1]]
      # example 1, [[10.]]
      indices=((0, 0), (0, 1), (1, 0)),
      values=(0., 1., 10.),
      dense_shape=(2, 2))
  seq_column = sfc.sequence_numeric_column('aaa')
  _, lengths = seq_column._get_sequence_dense_tensor(
      _LazyBuilder({'aaa': values_input}))
  with monitored_session.MonitoredSession() as sess:
    self.assertAllEqual([2, 1], lengths.eval(session=sess))
def test_get_sparse_tensors_inputs3d(self):
  """Tests _get_sparse_tensors when the input is already 3D Tensor."""
  column = sfc.sequence_categorical_column_with_identity(
      'aaa', num_buckets=3)
  # Rank-3 sparse input: sequence columns expect rank-2 ids, so the column
  # must reject this with InvalidArgumentError.
  inputs = sparse_tensor.SparseTensorValue(
      indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
      values=(1, 2, 0),
      dense_shape=(2, 2, 1))

  with self.assertRaisesRegexp(
      errors.InvalidArgumentError,
      r'Column aaa expected ID tensor of rank 2\.\s*'
      r'id_tensor shape:\s*\[2 2 1\]'):
    id_weight_pair = column._get_sparse_tensors(
        _LazyBuilder({'aaa': inputs}))
    # The error surfaces when the tensor is evaluated, not at graph build.
    with monitored_session.MonitoredSession() as sess:
      id_weight_pair.id_tensor.eval(session=sess)
def test_sequence_length(self):
  """Sequence length counts groups of `shape` elements, not raw values."""
  seq_input = sparse_tensor.SparseTensorValue(
      # example 0, values [[0., 1., 2.], [3., 4., 5.]]
      # example 1, [[10., 11., 12.]]
      indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5),
               (1, 0), (1, 1), (1, 2)),
      values=(0., 1., 2., 3., 4., 5., 10., 11., 12.),
      dense_shape=(2, 6))
  column = sfc.sequence_numeric_column('aaa', shape=(3,))
  _, lengths = column._get_sequence_dense_tensor(
      _LazyBuilder({'aaa': seq_input}))
  with monitored_session.MonitoredSession() as sess:
    # 6 values / 3 per step = 2 steps; 3 values / 3 per step = 1 step.
    self.assertAllEqual([2, 1], lengths.eval(session=sess))
def test_get_sequence_dense_tensor(self):
  """Sparse numeric input is densified and zero-padded per example."""
  seq_input = sparse_tensor.SparseTensorValue(
      # example 0, values [[0.], [1]]
      # example 1, [[10.]]
      indices=((0, 0), (0, 1), (1, 0)),
      values=(0., 1., 10.),
      dense_shape=(2, 2))
  column = sfc.sequence_numeric_column('aaa')
  dense, _ = column._get_sequence_dense_tensor(
      _LazyBuilder({'aaa': seq_input}))
  with monitored_session.MonitoredSession() as sess:
    # Missing second step of example 1 is zero-padded.
    self.assertAllEqual([[[0.], [1.]], [[10.], [0.]]],
                        dense.eval(session=sess))
def test_get_sparse_tensors(self):
  """_get_sparse_tensors expands 2D id input to rank-3 sparse ids."""
  identity_column = sfc.sequence_categorical_column_with_identity(
      'aaa', num_buckets=3)
  sparse_ids = sparse_tensor.SparseTensorValue(
      indices=((0, 0), (1, 0), (1, 1)),
      values=(1, 2, 0),
      dense_shape=(2, 2))
  expected = sparse_tensor.SparseTensorValue(
      indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
      values=np.array((1, 2, 0), dtype=np.int64),
      dense_shape=(2, 2, 1))
  pair = identity_column._get_sparse_tensors(
      _LazyBuilder({'aaa': sparse_ids}))
  # Identity columns carry no per-id weights.
  self.assertIsNone(pair.weight_tensor)
  with monitored_session.MonitoredSession() as sess:
    _assert_sparse_tensor_value(
        self, expected, pair.id_tensor.eval(session=sess))
def test_get_dense_tensor_multi_dim(self):
  """Tests get_sequence_dense_tensor for multi-dim numeric_column."""
  seq_input = sparse_tensor.SparseTensorValue(
      # example 0, values [[[0., 1.], [2., 3.]], [[4., 5.], [6., 7.]]]
      # example 1, [[[10., 11.], [12., 13.]]]
      indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
               (1, 0), (1, 1), (1, 2), (1, 3)),
      values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
      dense_shape=(2, 8))
  expected = [
      [[[0., 1.], [2., 3.]], [[4., 5.], [6., 7.]]],
      # Second step of example 1 is zero-padded.
      [[[10., 11.], [12., 13.]], [[0., 0.], [0., 0.]]],
  ]
  column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
  dense, _ = column._get_sequence_dense_tensor(
      _LazyBuilder({'aaa': seq_input}))
  with monitored_session.MonitoredSession() as sess:
    self.assertAllEqual(expected, dense.eval(session=sess))
def test_get_sparse_tensors(self):
  """Vocabulary lookup maps strings to ids; OOV values map to -1."""
  vocab_column = sfc.sequence_categorical_column_with_vocabulary_list(
      key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
  string_input = sparse_tensor.SparseTensorValue(
      indices=((0, 0), (1, 0), (1, 1)),
      values=('marlo', 'skywalker', 'omar'),
      dense_shape=(2, 2))
  # 'marlo' -> 2, 'skywalker' (out of vocabulary) -> -1, 'omar' -> 0.
  expected = sparse_tensor.SparseTensorValue(
      indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
      values=np.array((2, -1, 0), dtype=np.int64),
      dense_shape=(2, 2, 1))
  pair = vocab_column._get_sparse_tensors(
      _LazyBuilder({'aaa': string_input}))
  self.assertIsNone(pair.weight_tensor)
  with monitored_session.MonitoredSession() as sess:
    _assert_sparse_tensor_value(
        self, expected, pair.id_tensor.eval(session=sess))
def test_sequence_length(self):
  """Indicator column reports the underlying column's sequence lengths."""
  bucket_count = 3
  id_input = sparse_tensor.SparseTensorValue(
      # example 0, ids [2]
      # example 1, ids [0, 1]
      indices=((0, 0), (1, 0), (1, 1)),
      values=(2, 0, 1),
      dense_shape=(2, 2))
  base_column = sfc.sequence_categorical_column_with_identity(
      key='aaa', num_buckets=bucket_count)
  wrapped_column = fc.indicator_column(base_column)
  _, length_tensor = wrapped_column._get_sequence_dense_tensor(
      _LazyBuilder({'aaa': id_input}))
  with monitored_session.MonitoredSession() as sess:
    length_value = sess.run(length_tensor)
    self.assertAllEqual([1, 2], length_value)
    # Sequence lengths are produced as int64.
    self.assertEqual(np.int64, length_value.dtype)
def test_sequence_length_with_empty_rows(self):
  """Tests _sequence_length when some examples do not have ids."""
  seq_input = sparse_tensor.SparseTensorValue(
      # example 0, values []
      # example 1, values [[0.], [1.]]
      # example 2, [[2.]]
      # example 3, values []
      # example 4, [[3.]]
      # example 5, values []
      indices=((1, 0), (1, 1), (2, 0), (4, 0)),
      values=(0., 1., 2., 3.),
      dense_shape=(6, 2))
  column = sfc.sequence_numeric_column('aaa')
  _, lengths = column._get_sequence_dense_tensor(
      _LazyBuilder({'aaa': seq_input}))
  with monitored_session.MonitoredSession() as sess:
    # Empty rows contribute a length of 0.
    self.assertAllEqual([0, 2, 1, 0, 1, 0], lengths.eval(session=sess))
def test_get_sequence_dense_tensor_with_shape(self):
  """Tests get_sequence_dense_tensor with shape !=(1,)."""
  seq_input = sparse_tensor.SparseTensorValue(
      # example 0, values [[0., 1., 2.], [3., 4., 5.]]
      # example 1, [[10., 11., 12.]]
      indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5),
               (1, 0), (1, 1), (1, 2)),
      values=(0., 1., 2., 3., 4., 5., 10., 11., 12.),
      dense_shape=(2, 6))
  expected = [
      [[0., 1., 2.], [3., 4., 5.]],
      # Missing second step of example 1 is zero-padded.
      [[10., 11., 12.], [0., 0., 0.]],
  ]
  column = sfc.sequence_numeric_column('aaa', shape=(3,))
  dense, _ = column._get_sequence_dense_tensor(
      _LazyBuilder({'aaa': seq_input}))
  with monitored_session.MonitoredSession() as sess:
    self.assertAllEqual(expected, dense.eval(session=sess))
def test_get_sparse_tensors(self):
  """Hash-bucket lookup yields rank-3 int64 ids with no weights."""
  hashed_column = sfc.sequence_categorical_column_with_hash_bucket(
      'aaa', hash_bucket_size=10)
  string_input = sparse_tensor.SparseTensorValue(
      indices=((0, 0), (1, 0), (1, 1)),
      values=('omar', 'stringer', 'marlo'),
      dense_shape=(2, 2))
  expected = sparse_tensor.SparseTensorValue(
      indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
      # Values are ignored by the assertion below to avoid a dependence on
      # the hash function; only indices and shape are compared.
      values=np.array((0, 0, 0), dtype=np.int64),
      dense_shape=(2, 2, 1))
  pair = hashed_column._get_sparse_tensors(
      _LazyBuilder({'aaa': string_input}))
  self.assertIsNone(pair.weight_tensor)
  with monitored_session.MonitoredSession() as sess:
    _assert_sparse_tensor_indices_shape(
        self, expected, pair.id_tensor.eval(session=sess))
def sequence_input_layer(
    features,
    feature_columns,
    weight_collections=None,
    trainable=True):
  """Builds input layer for sequence input.

  All `feature_columns` must be sequence dense columns with the same
  `sequence_length`. The output of this method can be fed into sequence
  networks, such as RNN.

  The output of this method is a 3D `Tensor` of shape `[batch_size, T, D]`.
  `T` is the maximum sequence length for this batch, which could differ from
  batch to batch.

  If multiple `feature_columns` are given with `Di` `num_elements` each, their
  outputs are concatenated. So, the final `Tensor` has shape
  `[batch_size, T, D0 + D1 + ... + Dn]`.

  Example:

  ```python
  rating = sequence_numeric_column('rating')
  watches = sequence_categorical_column_with_identity(
      'watches', num_buckets=1000)
  watches_embedding = embedding_column(watches, dimension=10)
  columns = [rating, watches]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    features: A dict mapping keys to tensors.
    feature_columns: An iterable of dense sequence columns. Valid columns are
      - `embedding_column` that wraps a `sequence_categorical_column_with_*`
      - `sequence_numeric_column`.
    weight_collections: A list of collection names to which the Variable will
      be added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES`.

  Returns:
    An `(input_layer, sequence_length)` tuple where:
    - input_layer: A float `Tensor` of shape `[batch_size, T, D]`.
        `T` is the maximum sequence length for this batch, which could differ
        from batch to batch. `D` is the sum of `num_elements` for all
        `feature_columns`.
    - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence
        length for each example.

  Raises:
    ValueError: If any of the `feature_columns` is the wrong type.
  """
  feature_columns = fc._normalize_feature_columns(feature_columns)
  for c in feature_columns:
    if not isinstance(c, fc._SequenceDenseColumn):
      raise ValueError(
          'All feature_columns must be of type _SequenceDenseColumn. '
          'You can wrap a sequence_categorical_column with an embedding_column '
          'or indicator_column. '
          'Given (type {}): {}'.format(type(c), c))

  with variable_scope.variable_scope(
      None, default_name='sequence_input_layer', values=features.values()):
    builder = fc._LazyBuilder(features)
    output_tensors = []
    sequence_lengths = []
    ordered_columns = []
    # Iterate in name order so variable creation and concat order are
    # deterministic across runs.
    for column in sorted(feature_columns, key=lambda x: x.name):
      ordered_columns.append(column)
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name):
        dense_tensor, sequence_length = column._get_sequence_dense_tensor(
            builder,
            weight_collections=weight_collections,
            trainable=trainable)
        # Flattens the final dimension to produce a 3D Tensor.
        num_elements = column._variable_shape.num_elements()
        shape = array_ops.shape(dense_tensor)
        target_shape = [shape[0], shape[1], num_elements]
        output_tensors.append(
            array_ops.reshape(dense_tensor, shape=target_shape))
        sequence_lengths.append(sequence_length)

    fc._verify_static_batch_size_equality(output_tensors, ordered_columns)
    fc._verify_static_batch_size_equality(sequence_lengths, ordered_columns)
    # All columns must agree on sequence length; keep the verified value.
    sequence_length = _assert_all_equal_and_return(sequence_lengths)

    return array_ops.concat(output_tensors, -1), sequence_length
def test_get_dense_tensor(self):
  """Tests shared embedding lookup for two identity columns on dense ids."""
  # Inputs.
  vocabulary_size = 3
  # -1 values are ignored.
  input_a = np.array([
      [2, -1, -1],  # example 0, ids [2]
      [0, 1, -1]
  ])  # example 1, ids [0, 1]
  input_b = np.array([
      [0, -1, -1],  # example 0, ids [0]
      [-1, -1, -1]
  ])  # example 1, ids []
  input_features = {'aaa': input_a, 'bbb': input_b}

  # Embedding variable.
  embedding_dimension = 2
  embedding_values = (
      (1., 2.),  # id 0
      (3., 5.),  # id 1
      (7., 11.)  # id 2
  )

  def _initializer(shape, dtype, partition_info):
    # Also verifies the shared embedding variable's shape/dtype and that it
    # is not partitioned.
    self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
    self.assertEqual(dtypes.float32, dtype)
    self.assertIsNone(partition_info)
    return embedding_values

  # Expected lookup result, using combiner='mean'.
  expected_lookups_a = (
      # example 0:
      (7., 11.),  # ids [2], embedding = [7, 11]
      # example 1:
      (2., 3.5),  # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
  )
  expected_lookups_b = (
      # example 0:
      (1., 2.),  # ids [0], embedding = [1, 2]
      # example 1:
      (0., 0.),  # ids [], embedding = [0, 0]
  )

  # Build columns.
  categorical_column_a = fc_lib.categorical_column_with_identity(
      key='aaa', num_buckets=vocabulary_size)
  categorical_column_b = fc_lib.categorical_column_with_identity(
      key='bbb', num_buckets=vocabulary_size)
  embedding_column_a, embedding_column_b = tpu_fc.shared_embedding_columns(
      [categorical_column_a, categorical_column_b],
      dimension=embedding_dimension,
      initializer=_initializer)

  # Provide sparse input and get dense result.
  embedding_lookup_a = embedding_column_a._get_dense_tensor(
      fc._LazyBuilder(input_features))
  embedding_lookup_b = embedding_column_b._get_dense_tensor(
      fc._LazyBuilder(input_features))

  # Assert expected embedding variable and lookups.
  global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
  # Both columns share one 'embedding_weights' variable.
  self.assertItemsEqual(('embedding_weights:0',),
                        tuple([v.name for v in global_vars]))
  embedding_var = global_vars[0]
  with _initialized_session():
    self.assertAllEqual(embedding_values, embedding_var.eval())
    self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
    self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
def sequence_input_layer(
    features,
    feature_columns,
    weight_collections=None,
    trainable=True,
    scope=None):
  """Builds input layer for sequence input.

  All `feature_columns` must be sequence dense columns with the same
  `sequence_length`. The output of this method can be fed into sequence
  networks, such as RNN.

  The output of this method is a 3D `Tensor` of shape `[batch_size, T, D]`.
  `T` is the maximum sequence length for this batch, which could differ from
  batch to batch.

  If multiple `feature_columns` are given with `Di` `num_elements` each, their
  outputs are concatenated. So, the final `Tensor` has shape
  `[batch_size, T, D0 + D1 + ... + Dn]`.

  Example:

  ```python
  rating = sequence_numeric_column('rating')
  watches = sequence_categorical_column_with_identity(
      'watches', num_buckets=1000)
  watches_embedding = embedding_column(watches, dimension=10)
  columns = [rating, watches]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    features: A dict mapping keys to tensors.
    feature_columns: An iterable of dense sequence columns.
    weight_collections: A list of collection names to which the Variable will
      be added.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES`.
    scope: Optional variable scope (or its name) for the layer; defaults to
      'sequence_input_layer'.

  Returns:
    An `(input_layer, sequence_length)` tuple where:
    - input_layer: A float `Tensor` of shape `[batch_size, T, D]`.
        `T` is the maximum sequence length for this batch, which could differ
        from batch to batch. `D` is the sum of `num_elements` for all
        `feature_columns`.
    - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence
        length for each example.

  Raises:
    ValueError: If any of the `feature_columns` is the wrong type.
  """
  feature_columns = fc._clean_feature_columns(feature_columns)
  for c in feature_columns:
    if not isinstance(c, _SequenceDenseColumn):
      raise ValueError(
          'All feature_columns must be of type _SequenceDenseColumn. '
          'Given (type {}): {}'.format(type(c), c))

  with variable_scope.variable_scope(
      scope, default_name='sequence_input_layer', values=features.values()):
    builder = fc._LazyBuilder(features)
    output_tensors = []
    sequence_lengths = []
    ordered_columns = []
    # Iterate in name order so variable creation and concat order are
    # deterministic across runs.
    for column in sorted(feature_columns, key=lambda x: x.name):
      ordered_columns.append(column)
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name):
        dense_tensor, sequence_length = column._get_sequence_dense_tensor(
            builder,
            weight_collections=weight_collections,
            trainable=trainable)
        # Flattens the final dimension to produce a 3D Tensor.
        num_elements = column._variable_shape.num_elements()
        shape = array_ops.shape(dense_tensor)
        output_tensors.append(
            array_ops.reshape(
                dense_tensor,
                shape=array_ops.concat([shape[:2], [num_elements]], axis=0)))
        sequence_lengths.append(sequence_length)
    fc._verify_static_batch_size_equality(output_tensors, ordered_columns)
    # TODO(b/73160931): Verify sequence_length equality.
    return array_ops.concat(output_tensors, -1), sequence_lengths[0]
def _get_weights_and_check_match_logits(
    features, weight_column, logits, allow_per_logit_weights=False):
  """Fetches weights from features and checks that the shape matches logits.

  Consider logits of shape [D0, D1, ... DN, logits_dimension]. Weights shape
  can be either:
  * [D0, D1, ... DN, logits_dimension] if `allow_per_logit_weights=True`.
  * [D0, D1, ... DN, 1]
  * [D0, D1, ... DN]: In this case, weights is reshaped into
    [D0, D1, ... DN, 1] to work with weight broadcasting rules.

  Args:
    features: The features dict that contains weights.
    weight_column: The weight column. If not given, this method returns 1.
    logits: logits Tensor.
    allow_per_logit_weights: Boolean. Whether we allow weights along the
      logits dimension, namely shape `[D0, D1, ... DN, logits_dimension]`.

  Returns:
    Validated and reshaped weights Tensor.

  Raises:
    ValueError: If the weights `Tensor` cannot be cast into float.
  """
  if allow_per_logit_weights:
    err_msg = (
        'weights shape must be [D0, D1, ... DN], [D0, D1, ... DN, 1] or '
        '[D0, D1, ... DN, logits_dimension]')
  else:
    err_msg = (
        'weights shape must be [D0, D1, ... DN] or [D0, D1, ... DN, 1]')
  with ops.name_scope(
      None, 'weights',
      values=tuple(six.itervalues(features)) + (logits,)) as scope:
    # Fetch the weights.
    if weight_column is None:
      return 1.
    if isinstance(weight_column, six.string_types):
      weight_column = feature_column_lib.numeric_column(
          key=weight_column, shape=(1,))
    if not isinstance(weight_column, feature_column_lib._NumericColumn):  # pylint: disable=protected-access
      raise TypeError('Weight column must be either a string or _NumericColumn.'
                      ' Given type: {}.'.format(type(weight_column)))
    weights = weight_column._get_dense_tensor(  # pylint: disable=protected-access
        feature_column_lib._LazyBuilder(features))  # pylint: disable=protected-access
    if not (weights.dtype.is_floating or weights.dtype.is_integer):
      raise ValueError('Weight column should be castable to float. '
                       'Given dtype: {}'.format(weights.dtype))
    weights = math_ops.to_float(weights, name='weights')

    # Validate the weights shape.
    weights_shape = array_ops.shape(weights, name='weights_shape')
    logits_shape = array_ops.shape(logits, name='logits_shape')
    if (weights.shape.ndims is not None and logits.shape.ndims is not None and
        weights.shape.ndims == logits.shape.ndims - 1):
      # Statically known to be the [D0, D1, ... DN] case: check the leading
      # dims match logits, then append a trailing 1 for broadcasting.
      assert_dimension = check_ops.assert_equal(
          logits_shape[:-1], weights_shape, message=err_msg,
          data=['logits_shape: ', logits_shape,
                'weights_shape: ', weights_shape])
      with ops.control_dependencies([assert_dimension]):
        return array_ops.expand_dims(weights, -1, name=scope)
    supported_weights_shape = array_ops.concat([logits_shape[:-1], [1]], axis=0)
    if allow_per_logit_weights:
      # Accept either full per-logit shape or the trailing-1 shape.
      condition = math_ops.reduce_any(
          [math_ops.reduce_all(math_ops.equal(logits_shape, weights_shape)),
           math_ops.reduce_all(math_ops.equal(
               supported_weights_shape, weights_shape))])
      assert_dimension = control_flow_ops.Assert(
          condition=condition,
          data=[err_msg, 'logits_shape: ', logits_shape,
                'weights_shape: ', weights_shape])
    else:
      assert_dimension = check_ops.assert_equal(
          supported_weights_shape, weights_shape, message=err_msg,
          data=['logits_shape: ', logits_shape,
                'weights_shape: ', weights_shape])
    with ops.control_dependencies([assert_dimension]):
      return array_ops.identity(weights, name=scope)