Example #1
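Trains an RNNClassifier on a sparse token feature with a per-example weight column 'w', then validates the final checkpoint.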
    def _testExampleWeight(self, n_classes):
        def train_input_fn():
            return {
                'tokens':
                sparse_tensor.SparseTensor(
                    values=['the', 'cat', 'sat', 'dog', 'barked'],
                    indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
                    dense_shape=[2, 3]),
                'w': [[1], [2]],
            }, [[1], [0]]

        col = seq_fc.sequence_categorical_column_with_hash_bucket(
            'tokens', hash_bucket_size=10)
        embed = fc.embedding_column(col, dimension=2)
        input_units = 2

        cell_units = [4, 2]
        est = rnn.RNNClassifier(num_units=cell_units,
                                sequence_feature_columns=[embed],
                                n_classes=n_classes,
                                weight_column='w',
                                model_dir=self._model_dir)

        # Train for a few steps, and validate final checkpoint.
        num_steps = 10
        est.train(input_fn=train_input_fn, steps=num_steps)
        self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
Example #2
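The same weighted-training flow as Example #1: an embedded hash-bucket sequence column feeds an RNNClassifier with weight_column='w'.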
  def _testExampleWeight(self, n_classes):
    def train_input_fn():
      return {
          'tokens':
              sparse_tensor.SparseTensor(
                  values=['the', 'cat', 'sat', 'dog', 'barked'],
                  indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
                  dense_shape=[2, 3]),
          'w': [[1], [2]],
      }, [[1], [0]]

    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    input_units = 2

    cell_units = [4, 2]
    est = rnn.RNNClassifier(
        num_units=cell_units,
        sequence_feature_columns=[embed],
        n_classes=n_classes,
        weight_column='w',
        model_dir=self._model_dir)

    # Train for a few steps, and validate final checkpoint.
    num_steps = 10
    est.train(input_fn=train_input_fn, steps=num_steps)
    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
Example #3
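Trains an RNNClassifier from scratch using a custom rnn_cell_fn that builds a two-layer MultiRNNCell from BasicRNNCells.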
    def testFromScratchWithCustomRNNCellFn(self):
        def train_input_fn():
            return {
                'tokens':
                sparse_tensor.SparseTensor(values=['the', 'cat', 'sat'],
                                           indices=[[0, 0], [0, 1], [0, 2]],
                                           dense_shape=[1, 3]),
            }, [[1]]

        col = seq_fc.sequence_categorical_column_with_hash_bucket(
            'tokens', hash_bucket_size=10)
        embed = fc.embedding_column(col, dimension=2)
        input_units = 2
        cell_units = [4, 2]
        n_classes = 2

        def rnn_cell_fn(mode):
            del mode  # unused
            cells = [rnn_cell.BasicRNNCell(num_units=n) for n in cell_units]
            return rnn_cell.MultiRNNCell(cells)

        est = rnn.RNNClassifier(sequence_feature_columns=[embed],
                                rnn_cell_fn=rnn_cell_fn,
                                n_classes=n_classes,
                                model_dir=self._model_dir)

        # Train for a few steps, and validate final checkpoint.
        num_steps = 10
        est.train(input_fn=train_input_fn, steps=num_steps)
        self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
Example #4
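As in Example #3: a custom rnn_cell_fn supplies the MultiRNNCell instead of passing num_units to RNNClassifier.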
  def testFromScratchWithCustomRNNCellFn(self):
    def train_input_fn():
      return {
          'tokens':
              sparse_tensor.SparseTensor(
                  values=['the', 'cat', 'sat'],
                  indices=[[0, 0], [0, 1], [0, 2]],
                  dense_shape=[1, 3]),
      }, [[1]]

    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    input_units = 2
    cell_units = [4, 2]
    n_classes = 2

    def rnn_cell_fn(mode):
      del mode  # unused
      cells = [rnn_cell.BasicRNNCell(num_units=n) for n in cell_units]
      return rnn_cell.MultiRNNCell(cells)

    est = rnn.RNNClassifier(
        sequence_feature_columns=[embed],
        rnn_cell_fn=rnn_cell_fn,
        n_classes=n_classes,
        model_dir=self._model_dir)

    # Train for a few steps, and validate final checkpoint.
    num_steps = 10
    est.train(input_fn=train_input_fn, steps=num_steps)
    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
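Example #5
Writes random tf.Example records to a TFRecord file, derives a parsing spec with classifier_parse_example_spec, and drives the full train/evaluate/predict flow from batched-features datasets.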
  def testParseExampleInputFn(self):
    """Tests complete flow with input_fn constructed from parse_example."""
    n_classes = 3
    batch_size = 10
    words = [b'dog', b'cat', b'bird', b'the', b'a', b'sat', b'flew', b'slept']

    _, examples_file = tempfile.mkstemp()
    writer = python_io.TFRecordWriter(examples_file)
    for _ in range(batch_size):
      sequence_length = random.randint(1, len(words))
      sentence = random.sample(words, sequence_length)
      label = random.randint(0, n_classes - 1)
      example = example_pb2.Example(features=feature_pb2.Features(
          feature={
              'tokens':
                  feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
                      value=sentence)),
              'label':
                  feature_pb2.Feature(int64_list=feature_pb2.Int64List(
                      value=[label])),
          }))
      writer.write(example.SerializeToString())
    writer.close()

    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    feature_columns = [embed]
    feature_spec = parsing_utils.classifier_parse_example_spec(
        feature_columns,
        label_key='label',
        label_dtype=dtypes.int64)

    def _train_input_fn():
      dataset = readers.make_batched_features_dataset(
          examples_file, batch_size, feature_spec)
      return dataset.map(lambda features: (features, features.pop('label')))
    def _eval_input_fn():
      dataset = readers.make_batched_features_dataset(
          examples_file, batch_size, feature_spec, num_epochs=1)
      return dataset.map(lambda features: (features, features.pop('label')))
    def _predict_input_fn():
      dataset = readers.make_batched_features_dataset(
          examples_file, batch_size, feature_spec, num_epochs=1)
      def features_fn(features):
        features.pop('label')
        return features
      return dataset.map(features_fn)

    self._test_complete_flow(
        feature_columns=feature_columns,
        train_input_fn=_train_input_fn,
        eval_input_fn=_eval_input_fn,
        predict_input_fn=_predict_input_fn,
        n_classes=n_classes,
        batch_size=batch_size)
Example #6
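The same parse_example-based flow as Example #5, built on make_batched_features_dataset input functions.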
  def testParseExampleInputFn(self):
    """Tests complete flow with input_fn constructed from parse_example."""
    n_classes = 3
    batch_size = 10
    words = [b'dog', b'cat', b'bird', b'the', b'a', b'sat', b'flew', b'slept']

    _, examples_file = tempfile.mkstemp()
    writer = python_io.TFRecordWriter(examples_file)
    for _ in range(batch_size):
      sequence_length = random.randint(1, len(words))
      sentence = random.sample(words, sequence_length)
      label = random.randint(0, n_classes - 1)
      example = example_pb2.Example(features=feature_pb2.Features(
          feature={
              'tokens':
                  feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
                      value=sentence)),
              'label':
                  feature_pb2.Feature(int64_list=feature_pb2.Int64List(
                      value=[label])),
          }))
      writer.write(example.SerializeToString())
    writer.close()

    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    feature_columns = [embed]
    feature_spec = parsing_utils.classifier_parse_example_spec(
        feature_columns,
        label_key='label',
        label_dtype=dtypes.int64)

    def _train_input_fn():
      dataset = readers.make_batched_features_dataset(
          examples_file, batch_size, feature_spec)
      return dataset.map(lambda features: (features, features.pop('label')))
    def _eval_input_fn():
      dataset = readers.make_batched_features_dataset(
          examples_file, batch_size, feature_spec, num_epochs=1)
      return dataset.map(lambda features: (features, features.pop('label')))
    def _predict_input_fn():
      dataset = readers.make_batched_features_dataset(
          examples_file, batch_size, feature_spec, num_epochs=1)
      def features_fn(features):
        features.pop('label')
        return features
      return dataset.map(features_fn)

    self._test_complete_flow(
        feature_columns=feature_columns,
        train_input_fn=_train_input_fn,
        eval_input_fn=_eval_input_fn,
        predict_input_fn=_predict_input_fn,
        n_classes=n_classes,
        batch_size=batch_size)
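Example #7
Checks that the column's _sequence_length returns the number of steps per example ([1, 2]) for a ragged sparse batch.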
  def test_sequence_length(self):
    column = sfc.sequence_categorical_column_with_hash_bucket(
        'aaa', hash_bucket_size=10)
    inputs = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=('omar', 'stringer', 'marlo'),
        dense_shape=(2, 2))
    expected_sequence_length = [1, 2]

    sequence_length = column._sequence_length(_LazyBuilder({'aaa': inputs}))

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(
          expected_sequence_length, sequence_length.eval(session=sess))
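Example #8
The same _sequence_length check as Example #7.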
    def test_sequence_length(self):
        column = sfc.sequence_categorical_column_with_hash_bucket(
            'aaa', hash_bucket_size=10)
        inputs = sparse_tensor.SparseTensorValue(
            indices=((0, 0), (1, 0), (1, 1)),
            values=('omar', 'stringer', 'marlo'),
            dense_shape=(2, 2))
        expected_sequence_length = [1, 2]

        sequence_length = column._sequence_length(
            _LazyBuilder({'aaa': inputs}))

        with monitored_session.MonitoredSession() as sess:
            self.assertAllEqual(expected_sequence_length,
                                sequence_length.eval(session=sess))
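Example #9
Builds context feature columns (an embedded identity column and a numeric column) alongside sequence columns (embedded identity and hash-bucket columns) and returns both lists.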
  def _build_feature_columns(self):
    col = fc.categorical_column_with_identity(
        'int_ctx', num_buckets=100)
    ctx_cols = [
        fc.embedding_column(col, dimension=10),
        fc.numeric_column('float_ctx')]

    identity_col = sfc.sequence_categorical_column_with_identity(
        'int_list', num_buckets=10)
    bucket_col = sfc.sequence_categorical_column_with_hash_bucket(
        'bytes_list', hash_bucket_size=100)
    seq_cols = [
        fc.embedding_column(identity_col, dimension=10),
        fc.embedding_column(bucket_col, dimension=20)]

    return ctx_cols, seq_cols
Example #10
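Runs the complete train/evaluate/predict flow with dense numpy input via numpy_input_fn; every sentence shares one length because numpy input cannot be ragged.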
  def testNumpyInputFn(self):
    """Tests complete flow with numpy_input_fn."""
    n_classes = 3
    batch_size = 10
    words = ['dog', 'cat', 'bird', 'the', 'a', 'sat', 'flew', 'slept']
    # Numpy only supports dense input, so all examples will have the same length.
    # TODO(b/73160931): Update test when support for prepadded data exists.
    sequence_length = 3

    features = []
    for _ in range(batch_size):
      sentence = random.sample(words, sequence_length)
      features.append(sentence)

    x_data = np.array(features)
    y_data = np.random.randint(n_classes, size=batch_size)

    train_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data},
        y=y_data,
        batch_size=batch_size,
        num_epochs=None,
        shuffle=True)
    eval_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data},
        y=y_data,
        batch_size=batch_size,
        shuffle=False)
    predict_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data},
        batch_size=batch_size,
        shuffle=False)

    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    feature_columns = [embed]

    self._test_complete_flow(
        feature_columns=feature_columns,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        predict_input_fn=predict_input_fn,
        n_classes=n_classes,
        batch_size=batch_size)
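Example #11
The same numpy_input_fn flow as Example #10.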
  def testNumpyInputFn(self):
    """Tests complete flow with numpy_input_fn."""
    n_classes = 3
    batch_size = 10
    words = ['dog', 'cat', 'bird', 'the', 'a', 'sat', 'flew', 'slept']
    # Numpy only supports dense input, so all examples will have the same length.
    # TODO(b/73160931): Update test when support for prepadded data exists.
    sequence_length = 3

    features = []
    for _ in range(batch_size):
      sentence = random.sample(words, sequence_length)
      features.append(sentence)

    x_data = np.array(features)
    y_data = np.random.randint(n_classes, size=batch_size)

    train_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data},
        y=y_data,
        batch_size=batch_size,
        num_epochs=None,
        shuffle=True)
    eval_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data},
        y=y_data,
        batch_size=batch_size,
        shuffle=False)
    predict_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data},
        batch_size=batch_size,
        shuffle=False)

    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    feature_columns = [embed]

    self._test_complete_flow(
        feature_columns=feature_columns,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        predict_input_fn=predict_input_fn,
        n_classes=n_classes,
        batch_size=batch_size)
Example #12
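End-to-end helper: trains an RNNClassifier, evaluates loss and global step, checks the shape of predicted probabilities, and exports a SavedModel through a parsing serving input receiver.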
  def _test_complete_flow(
      self, train_input_fn, eval_input_fn, predict_input_fn, n_classes,
      batch_size):
    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    feature_columns = [embed]

    cell_units = [4, 2]
    est = rnn.RNNClassifier(
        num_units=cell_units,
        sequence_feature_columns=feature_columns,
        n_classes=n_classes,
        model_dir=self._model_dir)

    # TRAIN
    num_steps = 10
    est.train(train_input_fn, steps=num_steps)

    # EVALUATE
    scores = est.evaluate(eval_input_fn)
    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
    self.assertIn('loss', six.iterkeys(scores))

    # PREDICT
    predicted_proba = np.array([
        x[prediction_keys.PredictionKeys.PROBABILITIES]
        for x in est.predict(predict_input_fn)
    ])
    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)

    # EXPORT
    feature_spec = {
        'tokens': parsing_ops.VarLenFeature(dtypes.string),
        'label': parsing_ops.FixedLenFeature([1], dtypes.int64),
    }
    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
        feature_spec)
    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
                                       serving_input_receiver_fn)
    self.assertTrue(gfile.Exists(export_dir))
Example #13
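Verifies that RNNClassifier raises a ValueError when rnn_cell_fn is combined with either num_units or cell_type.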
    def testConflictingRNNCellFn(self):
        col = seq_fc.sequence_categorical_column_with_hash_bucket(
            'tokens', hash_bucket_size=10)
        embed = fc.embedding_column(col, dimension=2)
        cell_units = [4, 2]

        with self.assertRaisesRegexp(
                ValueError,
                'num_units and cell_type must not be specified when using rnn_cell_fn'
        ):
            rnn.RNNClassifier(sequence_feature_columns=[embed],
                              rnn_cell_fn=lambda x: x,
                              num_units=cell_units)

        with self.assertRaisesRegexp(
                ValueError,
                'num_units and cell_type must not be specified when using rnn_cell_fn'
        ):
            rnn.RNNClassifier(sequence_feature_columns=[embed],
                              rnn_cell_fn=lambda x: x,
                              cell_type='lstm')
Example #14
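As in Example #13: conflicting num_units/cell_type arguments alongside rnn_cell_fn must raise a ValueError.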
  def testConflictingRNNCellFn(self):
    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    cell_units = [4, 2]

    with self.assertRaisesRegexp(
        ValueError,
        'num_units and cell_type must not be specified when using rnn_cell_fn'):
      rnn.RNNClassifier(
          sequence_feature_columns=[embed],
          rnn_cell_fn=lambda x: x,
          num_units=cell_units)

    with self.assertRaisesRegexp(
        ValueError,
        'num_units and cell_type must not be specified when using rnn_cell_fn'):
      rnn.RNNClassifier(
          sequence_feature_columns=[embed],
          rnn_cell_fn=lambda x: x,
          cell_type='lstm')
Example #15
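The same end-to-end train/evaluate/predict/export helper as Example #12.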
    def _test_complete_flow(self, train_input_fn, eval_input_fn,
                            predict_input_fn, n_classes, batch_size):
        col = seq_fc.sequence_categorical_column_with_hash_bucket(
            'tokens', hash_bucket_size=10)
        embed = fc.embedding_column(col, dimension=2)
        feature_columns = [embed]

        cell_units = [4, 2]
        est = rnn.RNNClassifier(num_units=cell_units,
                                sequence_feature_columns=feature_columns,
                                n_classes=n_classes,
                                model_dir=self._model_dir)

        # TRAIN
        num_steps = 10
        est.train(train_input_fn, steps=num_steps)

        # EVALUATE
        scores = est.evaluate(eval_input_fn)
        self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
        self.assertIn('loss', six.iterkeys(scores))

        # PREDICT
        predicted_proba = np.array([
            x[prediction_keys.PredictionKeys.PROBABILITIES]
            for x in est.predict(predict_input_fn)
        ])
        self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)

        # EXPORT
        feature_spec = {
            'tokens': parsing_ops.VarLenFeature(dtypes.string),
            'label': parsing_ops.FixedLenFeature([1], dtypes.int64),
        }
        serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
            feature_spec)
        export_dir = est.export_savedmodel(tempfile.mkdtemp(),
                                           serving_input_receiver_fn)
        self.assertTrue(gfile.Exists(export_dir))
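Example #16
Checks that _get_sparse_tensors hashes string inputs into int64 ids with a trailing sequence dimension and returns no weight tensor.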
  def test_get_sparse_tensors(self):
    column = sfc.sequence_categorical_column_with_hash_bucket(
        'aaa', hash_bucket_size=10)
    inputs = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=('omar', 'stringer', 'marlo'),
        dense_shape=(2, 2))

    expected_sparse_ids = sparse_tensor.SparseTensorValue(
        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
        # Ignored to avoid hash dependence in test.
        values=np.array((0, 0, 0), dtype=np.int64),
        dense_shape=(2, 2, 1))

    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))

    self.assertIsNone(id_weight_pair.weight_tensor)
    with monitored_session.MonitoredSession() as sess:
      _assert_sparse_tensor_indices_shape(
          self,
          expected_sparse_ids,
          id_weight_pair.id_tensor.eval(session=sess))