def test_shared_embedding_column_with_non_sequence_categorical(self):
  """Checks that a non-sequence shared embedding column raises an error."""
  vocabulary_size = 3

  def _make_sparse_ids():
    # Both features use the same ids:
    #   example 0, ids [2]
    #   example 1, ids [0, 1]
    return sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=(2, 0, 1),
        dense_shape=(2, 2))

  features = {'aaa': _make_sparse_ids(), 'bbb': _make_sparse_ids()}

  # Plain (non-sequence) identity columns, one per feature key.
  identity_columns = [
      fc.categorical_column_with_identity(
          key=feature_key, num_buckets=vocabulary_size)
      for feature_key in ('aaa', 'bbb')
  ]
  shared_embedding_columns = fc.shared_embedding_columns(
      identity_columns, dimension=2)

  expected_error = (
      r'In embedding_column: aaa_shared_embedding\. categorical_column must '
      r'be of type _SequenceCategoricalColumn to use sequence_input_layer\.')
  with self.assertRaisesRegexp(ValueError, expected_error):
    _, _ = sfc.sequence_input_layer(
        features=features, feature_columns=shared_embedding_columns)
def sequence_categorical_column_with_identity(key, num_buckets,
                                              default_value=None):
  """Wraps an identity categorical column in a `_SequenceCategoricalColumn`.

  Args:
    key: Passed as `key` to `fc.categorical_column_with_identity`.
    num_buckets: Passed as `num_buckets` to
      `fc.categorical_column_with_identity`.
    default_value: Passed as `default_value` to
      `fc.categorical_column_with_identity`. Defaults to `None`.

  Returns:
    The result of `_SequenceCategoricalColumn` applied to the identity column.
  """
  identity_column = fc.categorical_column_with_identity(
      key=key, num_buckets=num_buckets, default_value=default_value)
  return _SequenceCategoricalColumn(identity_column)
def testWarmStart_SparseColumnIntegerized(self):
  """Warm-starts the linear weights of an identity categorical column."""
  # Feature column under test.
  sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)

  # Write the checkpoint that the warm-start will read from.
  _, prev_int_val = self._create_prev_run_var(
      "linear_model/sc_int/weights", shape=[10, 1], initializer=ones())
  # Sanity check: the saved weights are all ones.
  self.assertAllEqual(np.ones([10, 1]), prev_int_val)

  def partitioner(shape, dtype):
    del dtype  # Unused; a single partition along every axis.
    return [1] * len(shape)

  # Fresh graph and session, no warm-starting.
  with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
    cols_to_vars = self._create_linear_model([sc_int], partitioner)
    sess.run(variables.global_variables_initializer())
    # With no warm-start the weights come from the default initializer
    # (which is init_ops.zeros_initializer).
    expected_cold = {sc_int: [np.zeros([10, 1])]}
    self._assert_cols_to_vars(cols_to_vars, expected_cold, sess)

  # Fresh graph and session, this time with warm-starting.
  with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
    cols_to_vars = self._create_linear_model([sc_int], partitioner)
    settings = ws_util.WarmStartSettings(
        self.get_temp_dir(), vars_to_warm_start=".*sc_int.*")
    ws_util._warm_start(settings)
    sess.run(variables.global_variables_initializer())
    # The weights must now match what was stored in the checkpoint.
    expected_warm = {sc_int: [prev_int_val]}
    self._assert_cols_to_vars(cols_to_vars, expected_warm, sess)
def testWarmStart_SparseColumnIntegerized(self):
  """Warmstarts the linear weights of an identity categorical column."""
  # Feature column under test.
  sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)

  # Write the checkpoint that the warmstart will read from.
  _, prev_int_val = self._create_prev_run_var(
      "linear_model/sc_int/weights", shape=[10, 1], initializer=ones())
  # Sanity check: the saved weights are all ones.
  self.assertAllEqual(np.ones([10, 1]), prev_int_val)

  def partitioner(shape, dtype):
    del dtype  # Unused; a single partition along every axis.
    return [1] * len(shape)

  # Fresh graph and session, no warmstarting.
  with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
    cols_to_vars = self._create_linear_model([sc_int], partitioner)
    sess.run(variables.global_variables_initializer())
    # With no warmstart the weights come from the default initializer
    # (which is init_ops.zeros_initializer).
    expected_cold = {sc_int: [np.zeros([10, 1])]}
    self._assert_cols_to_vars(cols_to_vars, expected_cold, sess)

  # Fresh graph and session, this time with warmstarting.
  with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
    cols_to_vars = self._create_linear_model([sc_int], partitioner)
    settings = ws_util._WarmStartSettings(
        self.get_temp_dir(), vars_to_warmstart=".*sc_int.*")
    ws_util._warmstart(settings)
    sess.run(variables.global_variables_initializer())
    # The weights must now match what was stored in the checkpoint.
    expected_warm = {sc_int: [prev_int_val]}
    self._assert_cols_to_vars(cols_to_vars, expected_warm, sess)
def _test_parsed_sequence_example(
    self, col_name, col_fn, col_arg, shape, values):
  """Asserts that a sequence FeatureColumn parses a SequenceExample correctly.

  Args:
    col_name: string, name to give to the feature column. Should match the
      name that the column will parse out of the features dict.
    col_fn: function used to create the feature column, for example
      sequence_numeric_column. Called as `col_fn(col_name, col_arg)`.
    col_arg: second arg that the target feature column is expecting.
    shape: the expected dense_shape of the feature after parsing into a
      SparseTensor.
    values: the expected values at index [0, 2, 6] of the feature after
      parsing into a SparseTensor.
  """
  example = _make_sequence_example()

  # Two fixed context columns plus the sequence column under test.
  context_columns = [
      fc.categorical_column_with_identity('int_ctx', num_buckets=100),
      fc.numeric_column('float_ctx'),
  ]
  sequence_columns = [col_fn(col_name, col_arg)]

  serialized = example.SerializeToString()
  context_spec = fc.make_parse_example_spec(context_columns)
  sequence_spec = fc.make_parse_example_spec(sequence_columns)
  context, seq_features = parsing_ops.parse_single_sequence_example(
      serialized,
      context_features=context_spec,
      sequence_features=sequence_spec)

  with self.cached_session() as sess:
    ctx_result, seq_result = sess.run([context, seq_features])

    # Sequence feature: check the dense shape and three sampled values.
    parsed_sequence = seq_result[col_name]
    self.assertEqual(list(parsed_sequence.dense_shape), shape)
    self.assertEqual(list(parsed_sequence.values[[0, 2, 6]]), values)

    # Context features: one integer id and one float.
    parsed_int_ctx = ctx_result['int_ctx']
    self.assertEqual(list(parsed_int_ctx.dense_shape), [1])
    self.assertEqual(parsed_int_ctx.values[0], 5)
    parsed_float_ctx = ctx_result['float_ctx']
    self.assertEqual(list(parsed_float_ctx.shape), [1])
    self.assertAlmostEqual(parsed_float_ctx[0], 123.6, places=1)
def sequence_categorical_column_with_identity(
    key, num_buckets, default_value=None):
  """Builds an identity categorical column and wraps it as a sequence column.

  Args:
    key: Forwarded to `fc.categorical_column_with_identity` as `key`.
    num_buckets: Forwarded to `fc.categorical_column_with_identity` as
      `num_buckets`.
    default_value: Forwarded to `fc.categorical_column_with_identity` as
      `default_value`. Defaults to `None`.

  Returns:
    The `_SequenceCategoricalColumn` built around the identity column.
  """
  wrapped_column = fc.categorical_column_with_identity(
      key=key,
      num_buckets=num_buckets,
      default_value=default_value)
  return _SequenceCategoricalColumn(wrapped_column)
def _build_feature_columns(self):
  """Creates the context and sequence feature columns.

  Returns:
    A `(ctx_cols, seq_cols)` tuple. `ctx_cols` holds a 10-dimensional
    embedding of the 'int_ctx' identity column and the numeric 'float_ctx'
    column. `seq_cols` holds a 10-dimensional embedding of the 'int_list'
    sequence identity column and a 20-dimensional embedding of the
    'bytes_list' sequence hash-bucket column.
  """
  # Context (per-example) columns.
  int_ctx_col = fc.categorical_column_with_identity(
      'int_ctx', num_buckets=100)
  int_ctx_embedding = fc.embedding_column(int_ctx_col, dimension=10)
  float_ctx_col = fc.numeric_column('float_ctx')
  ctx_cols = [int_ctx_embedding, float_ctx_col]

  # Sequence (per-step) columns.
  int_list_col = sfc.sequence_categorical_column_with_identity(
      'int_list', num_buckets=10)
  bytes_list_col = sfc.sequence_categorical_column_with_hash_bucket(
      'bytes_list', hash_bucket_size=100)
  int_list_embedding = fc.embedding_column(int_list_col, dimension=10)
  bytes_list_embedding = fc.embedding_column(bytes_list_col, dimension=20)
  seq_cols = [int_list_embedding, bytes_list_embedding]

  return ctx_cols, seq_cols
def sequence_categorical_column_with_identity(
    key, num_buckets, default_value=None):
  """Creates a feature column representing sequences of integer ids.

  Feed the result to `embedding_column` or `indicator_column` to turn
  sequence categorical data into a dense representation suitable as input
  to a sequence NN, such as an RNN.

  Example:

  ```python
  watches = sequence_categorical_column_with_identity(
      'watches', num_buckets=1000)
  watches_embedding = embedding_column(watches, dimension=10)
  columns = [watches_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    key: A unique string identifying the input feature.
    num_buckets: Range of inputs; inputs are expected to lie in
      `[0, num_buckets)`.
    default_value: If `None`, this column's graph operations will fail for
      out-of-range inputs. Otherwise, this value must be in the range
      `[0, num_buckets)`, and will replace out-of-range inputs.

  Returns:
    A `_SequenceCategoricalColumn`.

  Raises:
    ValueError: if `num_buckets` is less than one.
    ValueError: if `default_value` is not in range `[0, num_buckets)`.
  """
  # Build the plain identity column, then wrap it as a sequence column.
  identity_column = fc_old.categorical_column_with_identity(
      key=key, num_buckets=num_buckets, default_value=default_value)
  return fc_old._SequenceCategoricalColumn(identity_column)
def sequence_categorical_column_with_identity(
    key, num_buckets, default_value=None):
  """Creates a feature column representing sequences of integer ids.

  Feed the result to `embedding_column` or `indicator_column` to turn
  sequence categorical data into a dense representation suitable as input
  to a sequence NN, such as an RNN.

  Example:

  ```python
  watches = sequence_categorical_column_with_identity(
      'watches', num_buckets=1000)
  watches_embedding = embedding_column(watches, dimension=10)
  columns = [watches_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    key: A unique string identifying the input feature.
    num_buckets: Range of inputs; inputs are expected to lie in
      `[0, num_buckets)`.
    default_value: If `None`, this column's graph operations will fail for
      out-of-range inputs. Otherwise, this value must be in the range
      `[0, num_buckets)`, and will replace out-of-range inputs.

  Returns:
    A `_SequenceCategoricalColumn`.

  Raises:
    ValueError: if `num_buckets` is less than one.
    ValueError: if `default_value` is not in range `[0, num_buckets)`.
  """
  # Build the plain identity column, then wrap it as a sequence column.
  identity_column = fc.categorical_column_with_identity(
      key=key, num_buckets=num_buckets, default_value=default_value)
  return fc._SequenceCategoricalColumn(identity_column)
def test_indicator_column_with_non_sequence_categorical(self):
  """Checks that a non-sequence categorical column raises an error."""
  vocabulary_size = 3
  # Sparse ids fed for feature 'aaa':
  #   example 0, ids [2]
  #   example 1, ids [0, 1]
  sparse_input = sparse_tensor.SparseTensorValue(
      indices=((0, 0), (1, 0), (1, 1)),
      values=(2, 0, 1),
      dense_shape=(2, 2))

  # Indicator column built on a plain (non-sequence) identity column.
  categorical_column_a = fc.categorical_column_with_identity(
      key='aaa', num_buckets=vocabulary_size)
  indicator_column_a = fc.indicator_column(categorical_column_a)

  expected_error = (
      r'In indicator_column: aaa_indicator\. categorical_column must be of '
      r'type _SequenceCategoricalColumn to use sequence_input_layer\.')
  features = {'aaa': sparse_input}
  with self.assertRaisesRegexp(ValueError, expected_error):
    _, _ = sfc.sequence_input_layer(
        features=features, feature_columns=[indicator_column_a])
def testWarmStart_MultipleCols(self):
  """Warm-starts all linear-model column weights and the bias."""
  # Vocabulary file for the sparse column "sc_vocab".
  vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                 "vocab")

  # Feature columns fed to the linear model.
  sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=vocab_path, vocabulary_size=4)
  real = fc.numeric_column("real")
  real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
  cross = fc.crossed_column([sc_keys, sc_vocab], hash_bucket_size=20)
  all_linear_cols = [sc_int, sc_hash, sc_keys, sc_vocab, real_bucket, cross]

  # (variable name, shape, initializer factory) for every checkpointed
  # variable. The bias is included so its warm-start can be checked too.
  # Factories are invoked right before each variable is created.
  checkpoint_specs = (
      ("linear_model/sc_int/weights", [10, 1], ones),
      ("linear_model/sc_hash/weights", [15, 1], norms),
      ("linear_model/sc_keys/weights", [4, 1], rand),
      ("linear_model/sc_vocab/weights", [4, 1], ones),
      ("linear_model/real_bucketized/weights", [5, 1], norms),
      ("linear_model/sc_keys_X_sc_vocab/weights", [20, 1], rand),
      ("linear_model/bias_weights", [1], rand),
  )

  # Save the checkpoint from which to warm-start.
  with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
    prev_vars = [
        variable_scope.get_variable(
            var_name, shape=var_shape, initializer=make_initializer())
        for var_name, var_shape, make_initializer in checkpoint_specs
    ]
    self._write_checkpoint(sess)
    (prev_int_val, prev_hash_val, prev_keys_val, prev_vocab_val,
     prev_bucket_val, prev_cross_val, prev_bias_val) = sess.run(prev_vars)

  def partitioner(shape, dtype):
    del dtype  # Unused; a single partition along every axis.
    return [1] * len(shape)

  # Fresh graph and session, no warm-starting.
  with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
    cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
    sess.run(variables.global_variables_initializer())
    # With no warm-start every weight comes from the default initializer
    # (which is init_ops.zeros_initializer).
    expected_cold = {
        sc_int: [np.zeros([10, 1])],
        sc_hash: [np.zeros([15, 1])],
        sc_keys: [np.zeros([4, 1])],
        sc_vocab: [np.zeros([4, 1])],
        real_bucket: [np.zeros([5, 1])],
        cross: [np.zeros([20, 1])],
    }
    self._assert_cols_to_vars(cols_to_vars, expected_cold, sess)

  # Fresh graph and session, this time with warm-starting.
  with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
    cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
    vocab_info = ws_util.VocabInfo(
        new_vocab=sc_vocab.vocabulary_file,
        new_vocab_size=sc_vocab.vocabulary_size,
        num_oov_buckets=sc_vocab.num_oov_buckets,
        old_vocab=vocab_path)
    settings = ws_util.WarmStartSettings(
        self.get_temp_dir(),
        var_name_to_vocab_info={
            "linear_model/sc_vocab/weights": vocab_info
        })
    ws_util._warm_start(settings)
    sess.run(variables.global_variables_initializer())
    # Every weight, and the bias, must match the checkpointed value.
    expected_warm = {
        sc_int: [prev_int_val],
        sc_hash: [prev_hash_val],
        sc_keys: [prev_keys_val],
        sc_vocab: [prev_vocab_val],
        real_bucket: [prev_bucket_val],
        cross: [prev_cross_val],
        "bias": [prev_bias_val],
    }
    self._assert_cols_to_vars(cols_to_vars, expected_warm, sess)
def testWarmStart_MultipleCols(self):
  """Warmstarts all linear-model column weights and the bias."""
  # Vocabulary file for the sparse column "sc_vocab".
  vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                 "vocab")

  # Feature columns fed to the linear model.
  sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=vocab_path, vocabulary_size=4)
  real = fc.numeric_column("real")
  real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
  cross = fc.crossed_column([sc_keys, sc_vocab], hash_bucket_size=20)
  all_linear_cols = [sc_int, sc_hash, sc_keys, sc_vocab, real_bucket, cross]

  # (variable name, shape, initializer factory) for every checkpointed
  # variable. The bias is included so its warmstart can be checked too.
  # Factories are invoked right before each variable is created.
  checkpoint_specs = (
      ("linear_model/sc_int/weights", [10, 1], ones),
      ("linear_model/sc_hash/weights", [15, 1], norms),
      ("linear_model/sc_keys/weights", [4, 1], rand),
      ("linear_model/sc_vocab/weights", [4, 1], ones),
      ("linear_model/real_bucketized/weights", [5, 1], norms),
      ("linear_model/sc_keys_X_sc_vocab/weights", [20, 1], rand),
      ("linear_model/bias_weights", [1], rand),
  )

  # Save the checkpoint from which to warmstart.
  with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
    prev_vars = [
        variable_scope.get_variable(
            var_name, shape=var_shape, initializer=make_initializer())
        for var_name, var_shape, make_initializer in checkpoint_specs
    ]
    self._write_checkpoint(sess)
    (prev_int_val, prev_hash_val, prev_keys_val, prev_vocab_val,
     prev_bucket_val, prev_cross_val, prev_bias_val) = sess.run(prev_vars)

  def partitioner(shape, dtype):
    del dtype  # Unused; a single partition along every axis.
    return [1] * len(shape)

  # Fresh graph and session, no warmstarting.
  with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
    cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
    sess.run(variables.global_variables_initializer())
    # With no warmstart every weight comes from the default initializer
    # (which is init_ops.zeros_initializer).
    expected_cold = {
        sc_int: [np.zeros([10, 1])],
        sc_hash: [np.zeros([15, 1])],
        sc_keys: [np.zeros([4, 1])],
        sc_vocab: [np.zeros([4, 1])],
        real_bucket: [np.zeros([5, 1])],
        cross: [np.zeros([20, 1])],
    }
    self._assert_cols_to_vars(cols_to_vars, expected_cold, sess)

  # Fresh graph and session, this time with warmstarting.
  with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
    cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
    vocab_info = ws_util._VocabInfo(
        new_vocab=sc_vocab.vocabulary_file,
        new_vocab_size=sc_vocab.vocabulary_size,
        num_oov_buckets=sc_vocab.num_oov_buckets,
        old_vocab=vocab_path)
    settings = ws_util._WarmStartSettings(
        self.get_temp_dir(),
        var_name_to_vocab_info={
            "linear_model/sc_vocab/weights": vocab_info
        })
    ws_util._warmstart(settings)
    sess.run(variables.global_variables_initializer())
    # Every weight, and the bias, must match the checkpointed value.
    expected_warm = {
        sc_int: [prev_int_val],
        sc_hash: [prev_hash_val],
        sc_keys: [prev_keys_val],
        sc_vocab: [prev_vocab_val],
        real_bucket: [prev_bucket_val],
        cross: [prev_cross_val],
        "bias": [prev_bias_val],
    }
    self._assert_cols_to_vars(cols_to_vars, expected_warm, sess)