def _testExampleWeight(self, n_classes):
    """Train a weighted ``RNNClassifier`` briefly and check its checkpoint.

    Args:
      n_classes: Number of label classes handed to the classifier.
    """
    input_units = 2
    cell_units = [4, 2]
    num_steps = 10

    def train_input_fn():
        # Two examples (three and two tokens) weighted through the 'w' feature.
        tokens = sparse_tensor.SparseTensor(
            values=['the', 'cat', 'sat', 'dog', 'barked'],
            indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
            dense_shape=[2, 3])
        features = {'tokens': tokens, 'w': [[1], [2]]}
        labels = [[1], [0]]
        return features, labels

    hashed_tokens = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    token_embedding = fc.embedding_column(hashed_tokens, dimension=2)

    classifier = rnn.RNNClassifier(
        num_units=cell_units,
        sequence_feature_columns=[token_embedding],
        n_classes=n_classes,
        weight_column='w',
        model_dir=self._model_dir)

    # Train for a few steps, and validate final checkpoint.
    classifier.train(input_fn=train_input_fn, steps=num_steps)
    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
def _testExampleWeight(self, n_classes):
    """Run a short weighted training job and validate the saved checkpoint.

    Args:
      n_classes: Number of label classes the classifier is built for.
    """
    def train_input_fn():
        sparse_tokens = sparse_tensor.SparseTensor(
            values=['the', 'cat', 'sat', 'dog', 'barked'],
            indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
            dense_shape=[2, 3])
        # 'w' supplies the example weights read via weight_column below.
        return {'tokens': sparse_tokens, 'w': [[1], [2]]}, [[1], [0]]

    token_column = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    sequence_columns = [fc.embedding_column(token_column, dimension=2)]

    input_units = 2
    cell_units = [4, 2]
    estimator = rnn.RNNClassifier(
        num_units=cell_units,
        sequence_feature_columns=sequence_columns,
        n_classes=n_classes,
        weight_column='w',
        model_dir=self._model_dir)

    # Train for a few steps, and validate final checkpoint.
    num_steps = 10
    estimator.train(input_fn=train_input_fn, steps=num_steps)
    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
def testFromScratchWithCustomRNNCellFn(self):
    """Train with a user-supplied ``rnn_cell_fn`` and verify the checkpoint."""
    n_classes = 2
    input_units = 2
    cell_units = [4, 2]
    num_steps = 10

    def train_input_fn():
        # A single three-token example labelled 1.
        tokens = sparse_tensor.SparseTensor(
            values=['the', 'cat', 'sat'],
            indices=[[0, 0], [0, 1], [0, 2]],
            dense_shape=[1, 3])
        return {'tokens': tokens}, [[1]]

    def rnn_cell_fn(mode):
        del mode  # unused
        # One BasicRNNCell per entry of cell_units, stacked in order.
        return rnn_cell.MultiRNNCell(
            [rnn_cell.BasicRNNCell(num_units=size) for size in cell_units])

    hashed_tokens = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    token_embedding = fc.embedding_column(hashed_tokens, dimension=2)

    classifier = rnn.RNNClassifier(
        sequence_feature_columns=[token_embedding],
        rnn_cell_fn=rnn_cell_fn,
        n_classes=n_classes,
        model_dir=self._model_dir)

    # Train for a few steps, and validate final checkpoint.
    classifier.train(input_fn=train_input_fn, steps=num_steps)
    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
def testFromScratchWithCustomRNNCellFn(self):
    """Check that a classifier built from a custom cell function trains."""
    def train_input_fn():
        features = {
            'tokens': sparse_tensor.SparseTensor(
                values=['the', 'cat', 'sat'],
                indices=[[0, 0], [0, 1], [0, 2]],
                dense_shape=[1, 3]),
        }
        labels = [[1]]
        return features, labels

    token_column = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    sequence_columns = [fc.embedding_column(token_column, dimension=2)]

    input_units = 2
    cell_units = [4, 2]
    n_classes = 2

    def rnn_cell_fn(mode):
        del mode  # unused
        stacked = []
        for layer_size in cell_units:
            stacked.append(rnn_cell.BasicRNNCell(num_units=layer_size))
        return rnn_cell.MultiRNNCell(stacked)

    estimator = rnn.RNNClassifier(
        sequence_feature_columns=sequence_columns,
        rnn_cell_fn=rnn_cell_fn,
        n_classes=n_classes,
        model_dir=self._model_dir)

    # Train for a few steps, and validate final checkpoint.
    num_steps = 10
    estimator.train(input_fn=train_input_fn, steps=num_steps)
    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
def testParseExampleInputFn(self):
    """Tests complete flow with input_fn constructed from parse_example.

    Writes `batch_size` randomly generated examples (a 'tokens' bytes list
    and an int64 'label') to a temporary TFRecord file, then drives
    train/eval/predict input functions that parse that file.
    """
    import os  # Local import: only needed to manage the temporary file.

    n_classes = 3
    batch_size = 10
    words = [b'dog', b'cat', b'bird', b'the', b'a', b'sat', b'flew', b'slept']

    # mkstemp() hands back an already-open OS-level descriptor. The records
    # are written by path through TFRecordWriter, so close the descriptor
    # immediately instead of leaking it, and remove the file after the test.
    fd, examples_file = tempfile.mkstemp()
    os.close(fd)
    self.addCleanup(os.remove, examples_file)

    writer = python_io.TFRecordWriter(examples_file)
    try:
        for _ in range(batch_size):
            sequence_length = random.randint(1, len(words))
            sentence = random.sample(words, sequence_length)
            label = random.randint(0, n_classes - 1)
            example = example_pb2.Example(features=feature_pb2.Features(
                feature={
                    'tokens': feature_pb2.Feature(
                        bytes_list=feature_pb2.BytesList(value=sentence)),
                    'label': feature_pb2.Feature(
                        int64_list=feature_pb2.Int64List(value=[label])),
                }))
            writer.write(example.SerializeToString())
    finally:
        # Close the writer even if building or writing an example fails.
        writer.close()

    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    feature_columns = [embed]
    feature_spec = parsing_utils.classifier_parse_example_spec(
        feature_columns, label_key='label', label_dtype=dtypes.int64)

    def _train_input_fn():
        dataset = readers.make_batched_features_dataset(
            examples_file, batch_size, feature_spec)
        return dataset.map(lambda features: (features, features.pop('label')))

    def _eval_input_fn():
        dataset = readers.make_batched_features_dataset(
            examples_file, batch_size, feature_spec, num_epochs=1)
        return dataset.map(lambda features: (features, features.pop('label')))

    def _predict_input_fn():
        dataset = readers.make_batched_features_dataset(
            examples_file, batch_size, feature_spec, num_epochs=1)

        def features_fn(features):
            # Prediction takes features only, so drop the parsed label.
            features.pop('label')
            return features

        return dataset.map(features_fn)

    self._test_complete_flow(
        feature_columns=feature_columns,
        train_input_fn=_train_input_fn,
        eval_input_fn=_eval_input_fn,
        predict_input_fn=_predict_input_fn,
        n_classes=n_classes,
        batch_size=batch_size)
def testParseExampleInputFn(self):
    """Tests complete flow with input_fn constructed from parse_example.

    Serializes `batch_size` random examples ('tokens' bytes list plus an
    int64 'label') into a temporary TFRecord file and builds the train, eval
    and predict input functions on top of it.
    """
    import os  # Local import: only needed to manage the temporary file.

    n_classes = 3
    batch_size = 10
    words = [b'dog', b'cat', b'bird', b'the', b'a', b'sat', b'flew', b'slept']

    # The descriptor returned by mkstemp() is open; it was previously
    # discarded and therefore leaked. Close it here (TFRecordWriter opens the
    # file by path) and delete the file once the test finishes.
    fd, examples_file = tempfile.mkstemp()
    os.close(fd)
    self.addCleanup(os.remove, examples_file)

    writer = python_io.TFRecordWriter(examples_file)
    try:
        for _ in range(batch_size):
            sequence_length = random.randint(1, len(words))
            sentence = random.sample(words, sequence_length)
            label = random.randint(0, n_classes - 1)
            tokens_feature = feature_pb2.Feature(
                bytes_list=feature_pb2.BytesList(value=sentence))
            label_feature = feature_pb2.Feature(
                int64_list=feature_pb2.Int64List(value=[label]))
            example = example_pb2.Example(features=feature_pb2.Features(
                feature={'tokens': tokens_feature, 'label': label_feature}))
            writer.write(example.SerializeToString())
    finally:
        # Guarantee the writer is closed even when a write raises.
        writer.close()

    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    feature_columns = [embed]
    feature_spec = parsing_utils.classifier_parse_example_spec(
        feature_columns, label_key='label', label_dtype=dtypes.int64)

    def _train_input_fn():
        dataset = readers.make_batched_features_dataset(
            examples_file, batch_size, feature_spec)
        return dataset.map(lambda features: (features, features.pop('label')))

    def _eval_input_fn():
        dataset = readers.make_batched_features_dataset(
            examples_file, batch_size, feature_spec, num_epochs=1)
        return dataset.map(lambda features: (features, features.pop('label')))

    def _predict_input_fn():
        dataset = readers.make_batched_features_dataset(
            examples_file, batch_size, feature_spec, num_epochs=1)

        def features_fn(features):
            # The label is parsed with the features; strip it for predict.
            features.pop('label')
            return features

        return dataset.map(features_fn)

    self._test_complete_flow(
        feature_columns=feature_columns,
        train_input_fn=_train_input_fn,
        eval_input_fn=_eval_input_fn,
        predict_input_fn=_predict_input_fn,
        n_classes=n_classes,
        batch_size=batch_size)
def test_sequence_length(self):
    """Check ``_sequence_length`` on a two-row sparse string input."""
    hashed_column = sfc.sequence_categorical_column_with_hash_bucket(
        'aaa', hash_bucket_size=10)
    sparse_input = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=('omar', 'stringer', 'marlo'),
        dense_shape=(2, 2))
    # Row 0 holds one value and row 1 holds two.
    expected_lengths = [1, 2]

    builder = _LazyBuilder({'aaa': sparse_input})
    actual_lengths = hashed_column._sequence_length(builder)

    with monitored_session.MonitoredSession() as sess:
        evaluated = actual_lengths.eval(session=sess)
        self.assertAllEqual(expected_lengths, evaluated)
def test_sequence_length(self):
    """Verify the per-row lengths reported for a sparse 'aaa' feature."""
    seq_column = sfc.sequence_categorical_column_with_hash_bucket(
        'aaa', hash_bucket_size=10)
    feature_value = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=('omar', 'stringer', 'marlo'),
        dense_shape=(2, 2))

    lengths_tensor = seq_column._sequence_length(
        _LazyBuilder({'aaa': feature_value}))

    with monitored_session.MonitoredSession() as sess:
        # One entry in the first row, two in the second.
        self.assertAllEqual([1, 2], lengths_tensor.eval(session=sess))
def _build_feature_columns(self):
    """Build the context and sequence feature columns.

    Returns:
      A `(ctx_cols, seq_cols)` tuple. `ctx_cols` holds an embedding of the
      'int_ctx' identity column and the numeric 'float_ctx' column;
      `seq_cols` holds embeddings of the 'int_list' identity column and the
      'bytes_list' hash-bucket column.
    """
    int_ctx = fc.categorical_column_with_identity('int_ctx', num_buckets=100)
    int_ctx_embedding = fc.embedding_column(int_ctx, dimension=10)
    float_ctx = fc.numeric_column('float_ctx')

    int_list = sfc.sequence_categorical_column_with_identity(
        'int_list', num_buckets=10)
    bytes_list = sfc.sequence_categorical_column_with_hash_bucket(
        'bytes_list', hash_bucket_size=100)
    int_list_embedding = fc.embedding_column(int_list, dimension=10)
    bytes_list_embedding = fc.embedding_column(bytes_list, dimension=20)

    return (
        [int_ctx_embedding, float_ctx],
        [int_list_embedding, bytes_list_embedding],
    )
def testNumpyInputFn(self):
    """Tests complete flow with numpy_input_fn."""
    n_classes = 3
    batch_size = 10
    words = ['dog', 'cat', 'bird', 'the', 'a', 'sat', 'flew', 'slept']
    # Numpy only supports dense input, so all examples will have same length.
    # TODO(b/73160931): Update test when support for prepadded data exists.
    sequence_length = 3

    # One random fixed-length sentence per example in the batch.
    sentences = [
        random.sample(words, sequence_length) for _ in range(batch_size)
    ]
    x_data = np.array(sentences)
    y_data = np.random.randint(n_classes, size=batch_size)

    train_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data},
        y=y_data,
        batch_size=batch_size,
        num_epochs=None,
        shuffle=True)
    eval_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data},
        y=y_data,
        batch_size=batch_size,
        shuffle=False)
    predict_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data},
        batch_size=batch_size,
        shuffle=False)

    hashed_tokens = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    token_embedding = fc.embedding_column(hashed_tokens, dimension=2)

    self._test_complete_flow(
        feature_columns=[token_embedding],
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        predict_input_fn=predict_input_fn,
        n_classes=n_classes,
        batch_size=batch_size)
def testNumpyInputFn(self):
    """Tests complete flow with numpy_input_fn."""
    n_classes = 3
    batch_size = 10
    vocabulary = ['dog', 'cat', 'bird', 'the', 'a', 'sat', 'flew', 'slept']
    # Numpy only supports dense input, so all examples will have same length.
    # TODO(b/73160931): Update test when support for prepadded data exists.
    sequence_length = 3

    rows = []
    for _ in range(batch_size):
        rows.append(random.sample(vocabulary, sequence_length))
    x_data = np.array(rows)
    y_data = np.random.randint(n_classes, size=batch_size)

    # Training repeats indefinitely and shuffles; eval and predict do neither.
    train_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data}, y=y_data, batch_size=batch_size,
        num_epochs=None, shuffle=True)
    eval_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data}, y=y_data, batch_size=batch_size, shuffle=False)
    predict_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data}, batch_size=batch_size, shuffle=False)

    token_column = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    feature_columns = [fc.embedding_column(token_column, dimension=2)]

    self._test_complete_flow(
        feature_columns=feature_columns,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        predict_input_fn=predict_input_fn,
        n_classes=n_classes,
        batch_size=batch_size)
def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
                        n_classes, batch_size, feature_columns=None):
    """Run train, evaluate, predict and export on an ``RNNClassifier``.

    Args:
      train_input_fn: Input function passed to `est.train`.
      eval_input_fn: Input function passed to `est.evaluate`.
      predict_input_fn: Input function passed to `est.predict`.
      n_classes: Number of label classes; also the expected width of the
        predicted probabilities.
      batch_size: Expected number of prediction rows.
      feature_columns: Optional sequence feature columns for the classifier.
        Callers in this file pass this keyword, which the previous signature
        rejected with a TypeError. When None, a 2-dimensional embedding of a
        hash-bucketed 'tokens' column is used, as before.
    """
    if feature_columns is None:
        col = seq_fc.sequence_categorical_column_with_hash_bucket(
            'tokens', hash_bucket_size=10)
        feature_columns = [fc.embedding_column(col, dimension=2)]

    cell_units = [4, 2]
    est = rnn.RNNClassifier(
        num_units=cell_units,
        sequence_feature_columns=feature_columns,
        n_classes=n_classes,
        model_dir=self._model_dir)

    # TRAIN
    num_steps = 10
    est.train(train_input_fn, steps=num_steps)

    # EVALUATE
    scores = est.evaluate(eval_input_fn)
    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
    self.assertIn('loss', scores)

    # PREDICT
    predicted_proba = np.array([
        x[prediction_keys.PredictionKeys.PROBABILITIES]
        for x in est.predict(predict_input_fn)
    ])
    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)

    # EXPORT
    feature_spec = {
        'tokens': parsing_ops.VarLenFeature(dtypes.string),
        'label': parsing_ops.FixedLenFeature([1], dtypes.int64),
    }
    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
        feature_spec)
    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
                                       serving_input_receiver_fn)
    self.assertTrue(gfile.Exists(export_dir))
def testConflictingRNNCellFn(self):
    """Check that ``rnn_cell_fn`` cannot be combined with other cell options."""
    hashed_tokens = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    token_embedding = fc.embedding_column(hashed_tokens, dimension=2)
    cell_units = [4, 2]

    expected_message = (
        'num_units and cell_type must not be specified when using rnn_cell_fn')
    # Each option below conflicts with rnn_cell_fn and must be rejected.
    conflicting_options = (
        {'num_units': cell_units},
        {'cell_type': 'lstm'},
    )
    for extra_kwargs in conflicting_options:
        with self.assertRaisesRegexp(ValueError, expected_message):
            rnn.RNNClassifier(
                sequence_feature_columns=[token_embedding],
                rnn_cell_fn=lambda x: x,
                **extra_kwargs)
def testConflictingRNNCellFn(self):
    """Verify a ValueError when ``rnn_cell_fn`` meets num_units or cell_type."""
    token_column = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    sequence_columns = [fc.embedding_column(token_column, dimension=2)]
    cell_units = [4, 2]
    error_pattern = (
        'num_units and cell_type must not be specified when using rnn_cell_fn')

    def build_with(**conflict):
        # Construct a classifier that sets rnn_cell_fn plus one more option.
        return rnn.RNNClassifier(
            sequence_feature_columns=sequence_columns,
            rnn_cell_fn=lambda x: x,
            **conflict)

    with self.assertRaisesRegexp(ValueError, error_pattern):
        build_with(num_units=cell_units)
    with self.assertRaisesRegexp(ValueError, error_pattern):
        build_with(cell_type='lstm')
def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
                        n_classes, batch_size, feature_columns=None):
    """Exercise the full estimator lifecycle for an ``RNNClassifier``.

    Trains for ten steps, evaluates, predicts and finally exports a
    SavedModel, asserting on the result of each stage.

    Args:
      train_input_fn: Input function used for training.
      eval_input_fn: Input function used for evaluation.
      predict_input_fn: Input function used for prediction.
      n_classes: Number of label classes; the predicted probabilities are
        expected to have this many columns.
      batch_size: Number of prediction rows expected.
      feature_columns: Optional sequence feature columns. Test methods in
        this file call this helper with `feature_columns=...`, which used to
        fail with a TypeError because the parameter did not exist. Defaults
        to a 2-dimensional embedding over a hash-bucketed 'tokens' column.
    """
    if feature_columns is None:
        col = seq_fc.sequence_categorical_column_with_hash_bucket(
            'tokens', hash_bucket_size=10)
        embed = fc.embedding_column(col, dimension=2)
        feature_columns = [embed]

    cell_units = [4, 2]
    est = rnn.RNNClassifier(
        num_units=cell_units,
        sequence_feature_columns=feature_columns,
        n_classes=n_classes,
        model_dir=self._model_dir)

    # TRAIN
    num_steps = 10
    est.train(train_input_fn, steps=num_steps)

    # EVALUATE
    scores = est.evaluate(eval_input_fn)
    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
    self.assertIn('loss', scores)

    # PREDICT
    predicted_proba = np.array([
        x[prediction_keys.PredictionKeys.PROBABILITIES]
        for x in est.predict(predict_input_fn)
    ])
    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)

    # EXPORT
    feature_spec = {
        'tokens': parsing_ops.VarLenFeature(dtypes.string),
        'label': parsing_ops.FixedLenFeature([1], dtypes.int64),
    }
    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
        feature_spec)
    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
                                       serving_input_receiver_fn)
    self.assertTrue(gfile.Exists(export_dir))
def test_get_sparse_tensors(self):
    """Check indices and shape of the ids from ``_get_sparse_tensors``."""
    hashed_column = sfc.sequence_categorical_column_with_hash_bucket(
        'aaa', hash_bucket_size=10)
    sparse_input = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=('omar', 'stringer', 'marlo'),
        dense_shape=(2, 2))

    # Ignored to avoid hash dependence in test.
    placeholder_ids = np.array((0, 0, 0), dtype=np.int64)
    expected_sparse_ids = sparse_tensor.SparseTensorValue(
        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
        values=placeholder_ids,
        dense_shape=(2, 2, 1))

    builder = _LazyBuilder({'aaa': sparse_input})
    id_weight_pair = hashed_column._get_sparse_tensors(builder)

    self.assertIsNone(id_weight_pair.weight_tensor)
    with monitored_session.MonitoredSession() as sess:
        actual_ids = id_weight_pair.id_tensor.eval(session=sess)
        _assert_sparse_tensor_indices_shape(
            self, expected_sparse_ids, actual_ids)