Example #1
  def _testExampleWeight(self, n_classes):
    def train_input_fn():
      return {
          'tokens':
              sparse_tensor.SparseTensor(
                  values=['the', 'cat', 'sat', 'dog', 'barked'],
                  indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
                  dense_shape=[2, 3]),
          'w': [[1], [2]],
      }, [[1], [0]]

    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    input_units = 2

    cell_units = [4, 2]
    est = rnn.RNNClassifier(
        num_units=cell_units,
        sequence_feature_columns=[embed],
        n_classes=n_classes,
        weight_column='w',
        model_dir=self._model_dir)

    # Train for a few steps, and validate final checkpoint.
    num_steps = 10
    est.train(input_fn=train_input_fn, steps=num_steps)
    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
Example #2
  def testFromScratchWithCustomRNNCellFn(self):
    def train_input_fn():
      return {
          'tokens':
              sparse_tensor.SparseTensor(
                  values=['the', 'cat', 'sat'],
                  indices=[[0, 0], [0, 1], [0, 2]],
                  dense_shape=[1, 3]),
      }, [[1]]

    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    input_units = 2
    cell_units = [4, 2]
    n_classes = 2

    def rnn_cell_fn(mode):
      del mode  # unused
      cells = [rnn_cell.BasicRNNCell(num_units=n) for n in cell_units]
      return rnn_cell.MultiRNNCell(cells)

    est = rnn.RNNClassifier(
        sequence_feature_columns=[embed],
        rnn_cell_fn=rnn_cell_fn,
        n_classes=n_classes,
        model_dir=self._model_dir)

    # Train for a few steps, and validate final checkpoint.
    num_steps = 10
    est.train(input_fn=train_input_fn, steps=num_steps)
    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
Example #3
    def test_dnn_classifier(self):
        embedding = feature_column_lib.embedding_column(
            feature_column_lib.categorical_column_with_vocabulary_list(
                'wire_cast', ['kima', 'omar', 'stringer']), 8)
        dnn = estimator_lib.DNNClassifier(feature_columns=[embedding],
                                          hidden_units=[3, 1])

        def train_input_fn():
            return dataset_ops.Dataset.from_tensors(({
                'wire_cast': [['omar'], ['kima']]
            }, [[0], [1]])).repeat(3)

        def eval_input_fn():
            return dataset_ops.Dataset.from_tensors(({
                'wire_cast': [['stringer'], ['kima']]
            }, [[0], [1]])).repeat(2)

        evaluator = hooks_lib.InMemoryEvaluatorHook(dnn,
                                                    eval_input_fn,
                                                    name='in-memory')
        dnn.train(train_input_fn, hooks=[evaluator])
        self.assertTrue(os.path.isdir(dnn.eval_dir('in-memory')))
        step_keyword_to_value = summary_step_keyword_to_value_mapping(
            dnn.eval_dir('in-memory'))

        final_metrics = dnn.evaluate(eval_input_fn)
        step = final_metrics[ops.GraphKeys.GLOBAL_STEP]
        for summary_tag in final_metrics:
            if summary_tag == ops.GraphKeys.GLOBAL_STEP:
                continue
            self.assertEqual(final_metrics[summary_tag],
                             step_keyword_to_value[step][summary_tag])
Example #4
    def testFromScratchWithCustomRNNCellFn(self):
        def train_input_fn():
            return {
                'tokens':
                sparse_tensor.SparseTensor(values=['the', 'cat', 'sat'],
                                           indices=[[0, 0], [0, 1], [0, 2]],
                                           dense_shape=[1, 3]),
            }, [[1]]

        col = seq_fc.sequence_categorical_column_with_hash_bucket(
            'tokens', hash_bucket_size=10)
        embed = fc.embedding_column(col, dimension=2)
        input_units = 2
        cell_units = [4, 2]
        n_classes = 2

        def rnn_cell_fn(mode):
            del mode  # unused
            cells = [rnn_cell.BasicRNNCell(num_units=n) for n in cell_units]
            return rnn_cell.MultiRNNCell(cells)

        est = rnn.RNNClassifier(sequence_feature_columns=[embed],
                                rnn_cell_fn=rnn_cell_fn,
                                n_classes=n_classes,
                                model_dir=self._model_dir)

        # Train for a few steps, and validate final checkpoint.
        num_steps = 10
        est.train(input_fn=train_input_fn, steps=num_steps)
        self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
Example #5
def _sequence_embedding_column(categorical_column,
                               dimension,
                               initializer=None,
                               ckpt_to_load_from=None,
                               tensor_name_in_ckpt=None,
                               max_norm=None,
                               trainable=True):
    """Returns a feature column that represents sequences of embeddings.

  Use this to convert sequence categorical data into a dense representation
  for input to a sequence NN, such as an RNN.

  Example:

  ```python
  watches = sequence_categorical_column_with_identity(
      'watches', num_buckets=1000)
  watches_embedding = _sequence_embedding_column(watches, dimension=10)
  columns = [watches_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    categorical_column: A `_SequenceCategoricalColumn` created with a
      `sequence_categorical_column_with_*` function.
    dimension: Integer dimension of the embedding.
    initializer: Initializer function used to initialize the embeddings.
    ckpt_to_load_from: String representing checkpoint name/pattern from which to
      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
      which to restore the column weights. Required if `ckpt_to_load_from` is
      not `None`.
    max_norm: If not `None`, embedding values are l2-normalized to this value.
    trainable: Whether or not the embedding is trainable. Default is True.

  Returns:
    A `_SequenceEmbeddingColumn`.

  Raises:
    ValueError: If `categorical_column` is not the right type.
  """
    if not isinstance(categorical_column, _SequenceCategoricalColumn):
        raise ValueError(
            'categorical_column must be of type _SequenceCategoricalColumn. '
            'Given (type {}): {}'.format(type(categorical_column),
                                         categorical_column))
    return _SequenceEmbeddingColumn(
        fc.embedding_column(categorical_column,
                            dimension=dimension,
                            initializer=initializer,
                            ckpt_to_load_from=ckpt_to_load_from,
                            tensor_name_in_ckpt=tensor_name_in_ckpt,
                            max_norm=max_norm,
                            trainable=trainable))
Example #6
    def _testExampleWeight(self, n_classes):
        def train_input_fn():
            return {
                'tokens':
                sparse_tensor.SparseTensor(
                    values=['the', 'cat', 'sat', 'dog', 'barked'],
                    indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
                    dense_shape=[2, 3]),
                'w': [[1], [2]],
            }, [[1], [0]]

        col = seq_fc.sequence_categorical_column_with_hash_bucket(
            'tokens', hash_bucket_size=10)
        embed = fc.embedding_column(col, dimension=2)
        input_units = 2

        cell_units = [4, 2]
        est = rnn.RNNClassifier(num_units=cell_units,
                                sequence_feature_columns=[embed],
                                n_classes=n_classes,
                                weight_column='w',
                                model_dir=self._model_dir)

        # Train for a few steps, and validate final checkpoint.
        num_steps = 10
        est.train(input_fn=train_input_fn, steps=num_steps)
        self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
Example #7
  def _build_feature_columns(self):
    col = fc.categorical_column_with_identity(
        'int_ctx', num_buckets=100)
    ctx_cols = [
        fc.embedding_column(col, dimension=10),
        fc.numeric_column('float_ctx')]

    identity_col = sfc.sequence_categorical_column_with_identity(
        'int_list', num_buckets=10)
    bucket_col = sfc.sequence_categorical_column_with_hash_bucket(
        'bytes_list', hash_bucket_size=100)
    seq_cols = [
        fc.embedding_column(identity_col, dimension=10),
        fc.embedding_column(bucket_col, dimension=20)]

    return ctx_cols, seq_cols
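
The helper above returns context columns and sequence columns separately because the two groups are consumed by different input layers: per-example context features go through the ordinary `input_layer`, while per-timestep sequence features go through `sequence_input_layer`, which also reports each example's true length (the same pairing shown in the `_sequence_embedding_column` docstring above). A minimal sketch of that wiring, assuming the TF 1.x contrib module aliases used by these tests (`fc`, `sfc`) and a hypothetical `features` dict holding the four tensors named in the helper:

# Sketch under the assumptions stated above; not part of the original test.
ctx_cols, seq_cols = self._build_feature_columns()

# Context (per-example) columns produce a dense [batch, total_dim] tensor.
context_input = fc.input_layer(features, ctx_cols)

# Sequence (per-timestep) columns produce [batch, max_length, total_dim]
# plus the true length of every sequence, ready for tf.nn.dynamic_rnn.
sequence_input, sequence_length = sfc.sequence_input_layer(features, seq_cols)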
Example #8
  def test_dnn_classifier(self):
    embedding = feature_column_lib.embedding_column(
        feature_column_lib.categorical_column_with_vocabulary_list(
            'wire_cast', ['kima', 'omar', 'stringer']), 8)
    dnn = estimator_lib.DNNClassifier(
        feature_columns=[embedding], hidden_units=[3, 1])

    def train_input_fn():
      return dataset_ops.Dataset.from_tensors(({
          'wire_cast': [['omar'], ['kima']]
      }, [[0], [1]])).repeat(3)

    def eval_input_fn():
      return dataset_ops.Dataset.from_tensors(({
          'wire_cast': [['stringer'], ['kima']]
      }, [[0], [1]])).repeat(2)

    evaluator = hooks_lib.InMemoryEvaluatorHook(
        dnn, eval_input_fn, name='in-memory')
    dnn.train(train_input_fn, hooks=[evaluator])
    self.assertTrue(os.path.isdir(dnn.eval_dir('in-memory')))
    step_keyword_to_value = summary_step_keyword_to_value_mapping(
        dnn.eval_dir('in-memory'))

    final_metrics = dnn.evaluate(eval_input_fn)
    step = final_metrics[ops.GraphKeys.GLOBAL_STEP]
    for summary_tag in final_metrics:
      if summary_tag == ops.GraphKeys.GLOBAL_STEP:
        continue
      self.assertEqual(final_metrics[summary_tag],
                       step_keyword_to_value[step][summary_tag])
Example #9
  def test_sequence_length_with_empty_rows(self):
    """Tests _sequence_length when some examples do not have ids."""
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids []
        # example 1, ids [2]
        # example 2, ids [0, 1]
        # example 3, ids []
        # example 4, ids [1]
        # example 5, ids []
        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(6, 2))
    expected_sequence_length = [0, 1, 2, 0, 1, 0]

    categorical_column = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    embedding_column = fc.embedding_column(
        categorical_column, dimension=2)

    _, sequence_length = embedding_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(
          expected_sequence_length, sequence_length.eval(session=sess))
Example #10
  def testWarmStartInputLayerEmbeddingColumn(self):
    # Create old and new vocabs for embedding column "sc_vocab".
    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                        "old_vocab")
    new_vocab_path = self._write_vocab(
        ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
        "new_vocab")

    # Save checkpoint from which to warm-start.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        _ = variable_scope.get_variable(
            "input_layer/sc_vocab_embedding/embedding_weights",
            initializer=[[0.5, 0.4], [1., 1.1], [2., 2.2], [3., 3.3]])
        self._write_checkpoint(sess)

    def _partitioner(shape, dtype):  # pylint:disable=unused-argument
      # Partition each var into 2 equal slices.
      partitions = [1] * len(shape)
      partitions[0] = min(2, shape[0].value)
      return partitions

    # Create feature columns.
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
    emb_vocab = fc.embedding_column(
        categorical_column=sc_vocab,
        dimension=2,
        # Can't use constant_initializer with load_and_remap.  In practice,
        # use a truncated normal initializer.
        initializer=init_ops.random_uniform_initializer(
            minval=0.42, maxval=0.42))
    all_deep_cols = [emb_vocab]
    # New graph, new session with warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = {}
        with variable_scope.variable_scope("", partitioner=_partitioner):
          # Create the variables.
          fc.input_layer(
              features=self._create_dummy_inputs(),
              feature_columns=all_deep_cols,
              cols_to_vars=cols_to_vars)
        ws_settings = ws_util._WarmStartSettings(
            self.get_temp_dir(), col_to_prev_vocab={
                emb_vocab: prev_vocab_path
            })
        ws_util._warmstart_input_layer(cols_to_vars, ws_settings)
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warmstarted. Var corresponding to
        # emb_vocab should be correctly warmstarted after vocab remapping.
        # Missing values are filled in with the EmbeddingColumn's initializer.
        self._assert_cols_to_vars(
            cols_to_vars, {
                emb_vocab: [
                    np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
                    np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]])
                ]
            }, sess)
Example #11
    def testWarmStartInputLayerEmbeddingColumn(self):
        # Create old and new vocabs for embedding column "sc_vocab".
        prev_vocab_path = self._write_vocab(
            ["apple", "banana", "guava", "orange"], "old_vocab")
        new_vocab_path = self._write_vocab(
            ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
            "new_vocab")

        # Save checkpoint from which to warm-start.
        with ops.Graph().as_default() as g:
            with self.test_session(graph=g) as sess:
                _ = variable_scope.get_variable(
                    "input_layer/sc_vocab_embedding/embedding_weights",
                    initializer=[[0.5, 0.4], [1., 1.1], [2., 2.2], [3., 3.3]])
                self._write_checkpoint(sess)

        def _partitioner(shape, dtype):  # pylint:disable=unused-argument
            # Partition each var into 2 equal slices.
            partitions = [1] * len(shape)
            partitions[0] = min(2, shape[0].value)
            return partitions

        # Create feature columns.
        sc_vocab = fc.categorical_column_with_vocabulary_file(
            "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
        emb_vocab = fc.embedding_column(
            categorical_column=sc_vocab,
            dimension=2,
            # Can't use constant_initializer with load_and_remap.  In practice,
            # use a truncated normal initializer.
            initializer=init_ops.random_uniform_initializer(minval=0.42,
                                                            maxval=0.42))
        all_deep_cols = [emb_vocab]
        # New graph, new session with warmstarting.
        with ops.Graph().as_default() as g:
            with self.test_session(graph=g) as sess:
                cols_to_vars = {}
                with variable_scope.variable_scope("",
                                                   partitioner=_partitioner):
                    # Create the variables.
                    fc.input_layer(features=self._create_dummy_inputs(),
                                   feature_columns=all_deep_cols,
                                   cols_to_vars=cols_to_vars)
                ws_settings = ws_util._WarmStartSettings(
                    self.get_temp_dir(),
                    col_to_prev_vocab={emb_vocab: prev_vocab_path})
                ws_util._warmstart_input_layer(cols_to_vars, ws_settings)
                sess.run(variables.global_variables_initializer())
                # Verify weights were correctly warmstarted. Var corresponding to
                # emb_vocab should be correctly warmstarted after vocab remapping.
                # Missing values are filled in with the EmbeddingColumn's initializer.
                self._assert_cols_to_vars(
                    cols_to_vars, {
                        emb_vocab: [
                            np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
                            np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]])
                        ]
                    }, sess)
Example #12
def _sequence_embedding_column(
    categorical_column, dimension, initializer=None, ckpt_to_load_from=None,
    tensor_name_in_ckpt=None, max_norm=None, trainable=True):
  """Returns a feature column that represents sequences of embeddings.

  Use this to convert sequence categorical data into a dense representation
  for input to a sequence NN, such as an RNN.

  Example:

  ```python
  watches = sequence_categorical_column_with_identity(
      'watches', num_buckets=1000)
  watches_embedding = _sequence_embedding_column(watches, dimension=10)
  columns = [watches_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    categorical_column: A `_SequenceCategoricalColumn` created with a
      `sequence_categorical_column_with_*` function.
    dimension: Integer dimension of the embedding.
    initializer: Initializer function used to initialize the embeddings.
    ckpt_to_load_from: String representing checkpoint name/pattern from which to
      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
      which to restore the column weights. Required if `ckpt_to_load_from` is
      not `None`.
    max_norm: If not `None`, embedding values are l2-normalized to this value.
    trainable: Whether or not the embedding is trainable. Default is True.

  Returns:
    A `_SequenceCategoricalToDenseColumn`.

  Raises:
    ValueError: If `categorical_column` is not the right type.
  """
  if not isinstance(categorical_column, _SequenceCategoricalColumn):
    raise ValueError(
        'categorical_column must be of type _SequenceCategoricalColumn. '
        'Given (type {}): {}'.format(
            type(categorical_column), categorical_column))
  return _SequenceCategoricalToDenseColumn(
      fc.embedding_column(
          categorical_column,
          dimension=dimension,
          initializer=initializer,
          ckpt_to_load_from=ckpt_to_load_from,
          tensor_name_in_ckpt=tensor_name_in_ckpt,
          max_norm=max_norm,
          trainable=trainable))
Example #13
  def testParseExampleInputFn(self):
    """Tests complete flow with input_fn constructed from parse_example."""
    n_classes = 3
    batch_size = 10
    words = [b'dog', b'cat', b'bird', b'the', b'a', b'sat', b'flew', b'slept']

    _, examples_file = tempfile.mkstemp()
    writer = python_io.TFRecordWriter(examples_file)
    for _ in range(batch_size):
      sequence_length = random.randint(1, len(words))
      sentence = random.sample(words, sequence_length)
      label = random.randint(0, n_classes - 1)
      example = example_pb2.Example(features=feature_pb2.Features(
          feature={
              'tokens':
                  feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
                      value=sentence)),
              'label':
                  feature_pb2.Feature(int64_list=feature_pb2.Int64List(
                      value=[label])),
          }))
      writer.write(example.SerializeToString())
    writer.close()

    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    feature_columns = [embed]
    feature_spec = parsing_utils.classifier_parse_example_spec(
        feature_columns,
        label_key='label',
        label_dtype=dtypes.int64)

    def _train_input_fn():
      dataset = readers.make_batched_features_dataset(
          examples_file, batch_size, feature_spec)
      return dataset.map(lambda features: (features, features.pop('label')))
    def _eval_input_fn():
      dataset = readers.make_batched_features_dataset(
          examples_file, batch_size, feature_spec, num_epochs=1)
      return dataset.map(lambda features: (features, features.pop('label')))
    def _predict_input_fn():
      dataset = readers.make_batched_features_dataset(
          examples_file, batch_size, feature_spec, num_epochs=1)
      def features_fn(features):
        features.pop('label')
        return features
      return dataset.map(features_fn)

    self._test_complete_flow(
        feature_columns=feature_columns,
        train_input_fn=_train_input_fn,
        eval_input_fn=_eval_input_fn,
        predict_input_fn=_predict_input_fn,
        n_classes=n_classes,
        batch_size=batch_size)
Example #14
  def testParseExampleInputFn(self):
    """Tests complete flow with input_fn constructed from parse_example."""
    n_classes = 3
    batch_size = 10
    words = [b'dog', b'cat', b'bird', b'the', b'a', b'sat', b'flew', b'slept']

    _, examples_file = tempfile.mkstemp()
    writer = python_io.TFRecordWriter(examples_file)
    for _ in range(batch_size):
      sequence_length = random.randint(1, len(words))
      sentence = random.sample(words, sequence_length)
      label = random.randint(0, n_classes - 1)
      example = example_pb2.Example(features=feature_pb2.Features(
          feature={
              'tokens':
                  feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
                      value=sentence)),
              'label':
                  feature_pb2.Feature(int64_list=feature_pb2.Int64List(
                      value=[label])),
          }))
      writer.write(example.SerializeToString())
    writer.close()

    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    feature_columns = [embed]
    feature_spec = parsing_utils.classifier_parse_example_spec(
        feature_columns,
        label_key='label',
        label_dtype=dtypes.int64)

    def _train_input_fn():
      dataset = readers.make_batched_features_dataset(
          examples_file, batch_size, feature_spec)
      return dataset.map(lambda features: (features, features.pop('label')))
    def _eval_input_fn():
      dataset = readers.make_batched_features_dataset(
          examples_file, batch_size, feature_spec, num_epochs=1)
      return dataset.map(lambda features: (features, features.pop('label')))
    def _predict_input_fn():
      dataset = readers.make_batched_features_dataset(
          examples_file, batch_size, feature_spec, num_epochs=1)
      def features_fn(features):
        features.pop('label')
        return features
      return dataset.map(features_fn)

    self._test_complete_flow(
        feature_columns=feature_columns,
        train_input_fn=_train_input_fn,
        eval_input_fn=_eval_input_fn,
        predict_input_fn=_predict_input_fn,
        n_classes=n_classes,
        batch_size=batch_size)
Example #15
    def test_warm_starting_selective_variables(self):
        """Tests selecting variables to warm-start."""
        age = feature_column.numeric_column('age')
        city = feature_column.embedding_column(
            feature_column.categorical_column_with_vocabulary_list(
                'city', vocabulary_list=['Mountain View', 'Palo Alto']),
            dimension=5)

        # Create a DNNLinearCombinedClassifier and train to save a checkpoint.
        dnn_lc_classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
            linear_feature_columns=[age],
            dnn_feature_columns=[city],
            dnn_hidden_units=[256, 128],
            model_dir=self._ckpt_and_vocab_dir,
            n_classes=4,
            linear_optimizer='SGD',
            dnn_optimizer='SGD')
        dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)

        # Create a second DNNLinearCombinedClassifier, warm-started from the first.
        # Use a learning_rate = 0.0 optimizer to check values (use SGD so we don't
        # have accumulator values that change).
        warm_started_dnn_lc_classifier = (
            dnn_linear_combined.DNNLinearCombinedClassifier(
                linear_feature_columns=[age],
                dnn_feature_columns=[city],
                dnn_hidden_units=[256, 128],
                n_classes=4,
                linear_optimizer=gradient_descent.GradientDescentOptimizer(
                    learning_rate=0.0),
                dnn_optimizer=gradient_descent.GradientDescentOptimizer(
                    learning_rate=0.0),
                # The provided regular expression will only warm-start the deep
                # portion of the model.
                warm_start_from=estimator.WarmStartSettings(
                    ckpt_to_initialize_from=dnn_lc_classifier.model_dir,
                    vars_to_warm_start='.*(dnn).*')))

        warm_started_dnn_lc_classifier.train(input_fn=self._input_fn,
                                             max_steps=1)
        for variable_name in warm_started_dnn_lc_classifier.get_variable_names(
        ):
            if 'dnn' in variable_name:
                self.assertAllClose(
                    dnn_lc_classifier.get_variable_value(variable_name),
                    warm_started_dnn_lc_classifier.get_variable_value(
                        variable_name))
            elif 'linear' in variable_name:
                linear_values = warm_started_dnn_lc_classifier.get_variable_value(
                    variable_name)
                # Since they're not warm-started, the linear weights will be
                # zero-initialized.
                self.assertAllClose(np.zeros_like(linear_values),
                                    linear_values)
Example #16
def build_model_columns():
    week_list = fc.categorical_column_with_vocabulary_list(
        "week_list",
        vocabulary_list=['mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun'])
    week = fc.weighted_categorical_column(week_list, 'week_weight')

    week = fc.embedding_column(week, 3)

    wide = []
    deep = [week]
    return wide, deep
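
build_model_columns returns an empty wide list and a deep list in the shape expected by the canned wide-and-deep estimator. A minimal sketch of one possible consumer, assuming TF 1.x's tf.estimator.DNNLinearCombinedClassifier (this estimator and its hyperparameters are an illustration, not part of the original snippet):

import tensorflow as tf

wide, deep = build_model_columns()

# Wide columns feed the linear part of the model, deep columns feed the DNN.
model = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=wide,
    dnn_feature_columns=deep,
    dnn_hidden_units=[32, 16])

The input_fn for such a model would need to supply both the 'week_list' strings and the 'week_weight' values that the weighted categorical column expects.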
Example #17
def test_reuse():
    data = {
        'gender': [['M'], ['G'], ['M'], ['M']],
        'user': [['A'], ['B'], ['C'], ['C']],
        'pos': [['a'], ['d'], ['f'], ['c']],
        'neg': [['c'], ['e'], ['d'], ['a']]
    }
    user_v_list = ['A', 'B', 'C', 'D']
    item_v_list = ['a', 'b', 'c', 'd', 'e', 'f']

    gender_col = feature_column.categorical_column_with_vocabulary_list(
        'gender', ['M', "G"], dtype=tf.string)
    user_col = feature_column.categorical_column_with_vocabulary_list(
        'user', user_v_list, dtype=tf.string)
    pos_item_col = feature_column.categorical_column_with_vocabulary_list(
        'pos', item_v_list, dtype=tf.string)
    neg_item_col = feature_column.categorical_column_with_vocabulary_list(
        'neg', item_v_list, dtype=tf.string)

    gender_embedding = feature_column.embedding_column(gender_col, 2)
    user_embedding = feature_column.embedding_column(user_col, 2)
    pos_embedding, neg_embedding = feature_column.shared_embedding_columns(
        [pos_item_col, neg_item_col], 3)
    columns = [gender_embedding, user_embedding, pos_embedding, neg_embedding]

    with tf.variable_scope("a") as scope:
        aa = scope.name
        ret = tf.feature_column.input_layer(data, columns)
    print(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=aa))

    with tf.variable_scope("b") as scope:
        bb = scope.name
        ret1 = tf.feature_column.input_layer(data, columns)
    print(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=bb))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print(sess.run(ret))
        print('------------------')
        print(sess.run(ret1))
Example #18
    def test_get_sequence_dense_tensor(self):
        vocabulary_size = 3
        sparse_input = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            # example 2, ids []
            # example 3, ids [1]
            indices=((0, 0), (1, 0), (1, 1), (3, 0)),
            values=(2, 0, 1, 1),
            dense_shape=(4, 2))

        embedding_dimension = 2
        embedding_values = (
            (1., 2.),  # id 0
            (3., 5.),  # id 1
            (7., 11.)  # id 2
        )

        def _initializer(shape, dtype, partition_info):
            self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
            self.assertEqual(dtypes.float32, dtype)
            self.assertIsNone(partition_info)
            return embedding_values

        expected_lookups = [
            # example 0, ids [2]
            [[7., 11.], [0., 0.]],
            # example 1, ids [0, 1]
            [[1., 2.], [3., 5.]],
            # example 2, ids []
            [[0., 0.], [0., 0.]],
            # example 3, ids [1]
            [[3., 5.], [0., 0.]],
        ]

        categorical_column = sfc.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        embedding_column = fc.embedding_column(categorical_column,
                                               dimension=embedding_dimension,
                                               initializer=_initializer)

        embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
            _LazyBuilder({'aaa': sparse_input}))

        global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
        self.assertItemsEqual(('embedding_weights:0', ),
                              tuple([v.name for v in global_vars]))
        with monitored_session.MonitoredSession() as sess:
            self.assertAllEqual(embedding_values,
                                global_vars[0].eval(session=sess))
            self.assertAllEqual(expected_lookups,
                                embedding_lookup.eval(session=sess))
Example #19
  def test_warm_starting_selective_variables(self):
    """Tests selecting variables to warm-start."""
    age = feature_column.numeric_column('age')
    city = feature_column.embedding_column(
        feature_column.categorical_column_with_vocabulary_list(
            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
        dimension=5)

    # Create a DNNLinearCombinedClassifier and train to save a checkpoint.
    dnn_lc_classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
        linear_feature_columns=[age],
        dnn_feature_columns=[city],
        dnn_hidden_units=[256, 128],
        model_dir=self._ckpt_and_vocab_dir,
        n_classes=4,
        linear_optimizer='SGD',
        dnn_optimizer='SGD')
    dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)

    # Create a second DNNLinearCombinedClassifier, warm-started from the first.
    # Use a learning_rate = 0.0 optimizer to check values (use SGD so we don't
    # have accumulator values that change).
    warm_started_dnn_lc_classifier = (
        dnn_linear_combined.DNNLinearCombinedClassifier(
            linear_feature_columns=[age],
            dnn_feature_columns=[city],
            dnn_hidden_units=[256, 128],
            n_classes=4,
            linear_optimizer=gradient_descent.GradientDescentOptimizer(
                learning_rate=0.0),
            dnn_optimizer=gradient_descent.GradientDescentOptimizer(
                learning_rate=0.0),
            # The provided regular expression will only warm-start the deep
            # portion of the model.
            warm_start_from=estimator.WarmStartSettings(
                ckpt_to_initialize_from=dnn_lc_classifier.model_dir,
                vars_to_warm_start='.*(dnn).*')))

    warm_started_dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)
    for variable_name in warm_started_dnn_lc_classifier.get_variable_names():
      if 'dnn' in variable_name:
        self.assertAllClose(
            dnn_lc_classifier.get_variable_value(variable_name),
            warm_started_dnn_lc_classifier.get_variable_value(variable_name))
      elif 'linear' in variable_name:
        linear_values = warm_started_dnn_lc_classifier.get_variable_value(
            variable_name)
        # Since they're not warm-started, the linear weights will be
        # zero-initialized.
        self.assertAllClose(np.zeros_like(linear_values), linear_values)
Example #20
  def test_get_sequence_dense_tensor(self):
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        # example 2, ids []
        # example 3, ids [1]
        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(4, 2))

    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 5.),  # id 1
        (7., 11.)  # id 2
    )
    def _initializer(shape, dtype, partition_info):
      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
      self.assertEqual(dtypes.float32, dtype)
      self.assertIsNone(partition_info)
      return embedding_values

    expected_lookups = [
        # example 0, ids [2]
        [[7., 11.], [0., 0.]],
        # example 1, ids [0, 1]
        [[1., 2.], [3., 5.]],
        # example 2, ids []
        [[0., 0.], [0., 0.]],
        # example 3, ids [1]
        [[3., 5.], [0., 0.]],
    ]

    categorical_column = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    embedding_column = fc.embedding_column(
        categorical_column, dimension=embedding_dimension,
        initializer=_initializer)

    embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))

    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    self.assertItemsEqual(
        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
      self.assertAllEqual(expected_lookups, embedding_lookup.eval(session=sess))
Example #21
def _sequence_embedding_column(
    categorical_column, dimension, initializer=None, ckpt_to_load_from=None,
    tensor_name_in_ckpt=None, max_norm=None, trainable=True):
  if not isinstance(categorical_column, _SequenceCategoricalColumn):
    raise ValueError(
        'categorical_column must be of type _SequenceCategoricalColumn. '
        'Given (type {}): {}'.format(
            type(categorical_column), categorical_column))
  return _SequenceEmbeddingColumn(
      fc.embedding_column(
          categorical_column,
          dimension=dimension,
          initializer=initializer,
          ckpt_to_load_from=ckpt_to_load_from,
          tensor_name_in_ckpt=tensor_name_in_ckpt,
          max_norm=max_norm,
          trainable=trainable))
Example #22
  def testNumpyInputFn(self):
    """Tests complete flow with numpy_input_fn."""
    n_classes = 3
    batch_size = 10
    words = ['dog', 'cat', 'bird', 'the', 'a', 'sat', 'flew', 'slept']
    # Numpy only supports dense input, so all examples will have same length.
    # TODO(b/73160931): Update test when support for prepadded data exists.
    sequence_length = 3

    features = []
    for _ in range(batch_size):
      sentence = random.sample(words, sequence_length)
      features.append(sentence)

    x_data = np.array(features)
    y_data = np.random.randint(n_classes, size=batch_size)

    train_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data},
        y=y_data,
        batch_size=batch_size,
        num_epochs=None,
        shuffle=True)
    eval_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data},
        y=y_data,
        batch_size=batch_size,
        shuffle=False)
    predict_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data},
        batch_size=batch_size,
        shuffle=False)

    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    feature_columns = [embed]

    self._test_complete_flow(
        feature_columns=feature_columns,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        predict_input_fn=predict_input_fn,
        n_classes=n_classes,
        batch_size=batch_size)
Example #23
  def testNumpyInputFn(self):
    """Tests complete flow with numpy_input_fn."""
    n_classes = 3
    batch_size = 10
    words = ['dog', 'cat', 'bird', 'the', 'a', 'sat', 'flew', 'slept']
    # Numpy only supports dense input, so all examples will have same length.
    # TODO(b/73160931): Update test when support for prepadded data exists.
    sequence_length = 3

    features = []
    for _ in range(batch_size):
      sentence = random.sample(words, sequence_length)
      features.append(sentence)

    x_data = np.array(features)
    y_data = np.random.randint(n_classes, size=batch_size)

    train_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data},
        y=y_data,
        batch_size=batch_size,
        num_epochs=None,
        shuffle=True)
    eval_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data},
        y=y_data,
        batch_size=batch_size,
        shuffle=False)
    predict_input_fn = numpy_io.numpy_input_fn(
        x={'tokens': x_data},
        batch_size=batch_size,
        shuffle=False)

    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    feature_columns = [embed]

    self._test_complete_flow(
        feature_columns=feature_columns,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        predict_input_fn=predict_input_fn,
        n_classes=n_classes,
        batch_size=batch_size)
Example #24
    def test_classifier_basic_warm_starting(self):
        """Tests correctness of DNNLinearCombinedClassifier default warm-start."""
        age = feature_column.numeric_column('age')
        city = feature_column.embedding_column(
            feature_column.categorical_column_with_vocabulary_list(
                'city', vocabulary_list=['Mountain View', 'Palo Alto']),
            dimension=5)

        # Create a DNNLinearCombinedClassifier and train to save a checkpoint.
        dnn_lc_classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
            linear_feature_columns=[age],
            dnn_feature_columns=[city],
            dnn_hidden_units=[256, 128],
            model_dir=self._ckpt_and_vocab_dir,
            n_classes=4,
            linear_optimizer='SGD',
            dnn_optimizer='SGD')
        dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)

        # Create a second DNNLinearCombinedClassifier, warm-started from the first.
        # Use a learning_rate = 0.0 optimizer to check values (use SGD so we don't
        # have accumulator values that change).
        warm_started_dnn_lc_classifier = (
            dnn_linear_combined.DNNLinearCombinedClassifier(
                linear_feature_columns=[age],
                dnn_feature_columns=[city],
                dnn_hidden_units=[256, 128],
                n_classes=4,
                linear_optimizer=gradient_descent.GradientDescentOptimizer(
                    learning_rate=0.0),
                dnn_optimizer=gradient_descent.GradientDescentOptimizer(
                    learning_rate=0.0),
                warm_start_from=dnn_lc_classifier.model_dir))

        warm_started_dnn_lc_classifier.train(input_fn=self._input_fn,
                                             max_steps=1)
        for variable_name in warm_started_dnn_lc_classifier.get_variable_names(
        ):
            self.assertAllClose(
                dnn_lc_classifier.get_variable_value(variable_name),
                warm_started_dnn_lc_classifier.get_variable_value(
                    variable_name))
Example #25
def _sequence_embedding_column(categorical_column,
                               dimension,
                               initializer=None,
                               ckpt_to_load_from=None,
                               tensor_name_in_ckpt=None,
                               max_norm=None,
                               trainable=True):
    if not isinstance(categorical_column, _SequenceCategoricalColumn):
        raise ValueError(
            'categorical_column must be of type _SequenceCategoricalColumn. '
            'Given (type {}): {}'.format(type(categorical_column),
                                         categorical_column))
    return _SequenceEmbeddingColumn(
        fc.embedding_column(categorical_column,
                            dimension=dimension,
                            initializer=initializer,
                            ckpt_to_load_from=ckpt_to_load_from,
                            tensor_name_in_ckpt=tensor_name_in_ckpt,
                            max_norm=max_norm,
                            trainable=trainable))
Example #26
  def _test_complete_flow(
      self, train_input_fn, eval_input_fn, predict_input_fn, n_classes,
      batch_size):
    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    feature_columns = [embed]

    cell_units = [4, 2]
    est = rnn.RNNClassifier(
        num_units=cell_units,
        sequence_feature_columns=feature_columns,
        n_classes=n_classes,
        model_dir=self._model_dir)

    # TRAIN
    num_steps = 10
    est.train(train_input_fn, steps=num_steps)

    # EVALUATE
    scores = est.evaluate(eval_input_fn)
    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
    self.assertIn('loss', six.iterkeys(scores))

    # PREDICT
    predicted_proba = np.array([
        x[prediction_keys.PredictionKeys.PROBABILITIES]
        for x in est.predict(predict_input_fn)
    ])
    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)

    # EXPORT
    feature_spec = {
        'tokens': parsing_ops.VarLenFeature(dtypes.string),
        'label': parsing_ops.FixedLenFeature([1], dtypes.int64),
    }
    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
        feature_spec)
    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
                                       serving_input_receiver_fn)
    self.assertTrue(gfile.Exists(export_dir))
Example #27
    def test_sequence_length(self):
        vocabulary_size = 3
        sparse_input = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            indices=((0, 0), (1, 0), (1, 1)),
            values=(2, 0, 1),
            dense_shape=(2, 2))
        expected_sequence_length = [1, 2]

        categorical_column = sfc.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        embedding_column = fc.embedding_column(categorical_column, dimension=2)

        _, sequence_length = embedding_column._get_sequence_dense_tensor(
            _LazyBuilder({'aaa': sparse_input}))

        with monitored_session.MonitoredSession() as sess:
            sequence_length = sess.run(sequence_length)
            self.assertAllEqual(expected_sequence_length, sequence_length)
            self.assertEqual(np.int64, sequence_length.dtype)
Example #28
    def test_embedding_column(self):
        """Tests that error is raised for sequence embedding column."""
        vocabulary_size = 3
        sparse_input = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            indices=((0, 0), (1, 0), (1, 1)),
            values=(2, 0, 1),
            dense_shape=(2, 2))

        categorical_column_a = sfc.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        embedding_column_a = fc.embedding_column(categorical_column_a,
                                                 dimension=2)

        with self.assertRaisesRegexp(
                ValueError,
                r'In embedding_column: aaa_embedding\. categorical_column must not be '
                r'of type _SequenceCategoricalColumn\.'):
            _ = fc.input_layer(features={'aaa': sparse_input},
                               feature_columns=[embedding_column_a])
Example #29
    def testConflictingRNNCellFn(self):
        col = seq_fc.sequence_categorical_column_with_hash_bucket(
            'tokens', hash_bucket_size=10)
        embed = fc.embedding_column(col, dimension=2)
        cell_units = [4, 2]

        with self.assertRaisesRegexp(
                ValueError,
                'num_units and cell_type must not be specified when using rnn_cell_fn'
        ):
            rnn.RNNClassifier(sequence_feature_columns=[embed],
                              rnn_cell_fn=lambda x: x,
                              num_units=cell_units)

        with self.assertRaisesRegexp(
                ValueError,
                'num_units and cell_type must not be specified when using rnn_cell_fn'
        ):
            rnn.RNNClassifier(sequence_feature_columns=[embed],
                              rnn_cell_fn=lambda x: x,
                              cell_type='lstm')
Example #30
  def testConflictingRNNCellFn(self):
    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    cell_units = [4, 2]

    with self.assertRaisesRegexp(
        ValueError,
        'num_units and cell_type must not be specified when using rnn_cell_fn'):
      rnn.RNNClassifier(
          sequence_feature_columns=[embed],
          rnn_cell_fn=lambda x: x,
          num_units=cell_units)

    with self.assertRaisesRegexp(
        ValueError,
        'num_units and cell_type must not be specified when using rnn_cell_fn'):
      rnn.RNNClassifier(
          sequence_feature_columns=[embed],
          rnn_cell_fn=lambda x: x,
          cell_type='lstm')
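
The two assertions above pin down the constructor contract: `rnn_cell_fn` is mutually exclusive with `num_units` and `cell_type`. For contrast, a short sketch of the two valid ways to configure the same stacked cell, using the same aliases as the tests and the `embed` column defined above (the unit sizes are just an illustration):

# Option 1: let RNNClassifier build a stack of default cells from num_units
# (cell_type could be set here too, but not together with rnn_cell_fn).
est_a = rnn.RNNClassifier(
    sequence_feature_columns=[embed],
    num_units=[4, 2],
    n_classes=2)

# Option 2: supply rnn_cell_fn and leave num_units / cell_type unset.
def rnn_cell_fn(mode):
  del mode  # unused
  return rnn_cell.MultiRNNCell(
      [rnn_cell.BasicRNNCell(num_units=n) for n in [4, 2]])

est_b = rnn.RNNClassifier(
    sequence_feature_columns=[embed],
    rnn_cell_fn=rnn_cell_fn,
    n_classes=2)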
Example #31
    def _test_complete_flow(self, train_input_fn, eval_input_fn,
                            predict_input_fn, n_classes, batch_size):
        col = seq_fc.sequence_categorical_column_with_hash_bucket(
            'tokens', hash_bucket_size=10)
        embed = fc.embedding_column(col, dimension=2)
        feature_columns = [embed]

        cell_units = [4, 2]
        est = rnn.RNNClassifier(num_units=cell_units,
                                sequence_feature_columns=feature_columns,
                                n_classes=n_classes,
                                model_dir=self._model_dir)

        # TRAIN
        num_steps = 10
        est.train(train_input_fn, steps=num_steps)

        # EVALUATE
        scores = est.evaluate(eval_input_fn)
        self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
        self.assertIn('loss', six.iterkeys(scores))

        # PREDICT
        predicted_proba = np.array([
            x[prediction_keys.PredictionKeys.PROBABILITIES]
            for x in est.predict(predict_input_fn)
        ])
        self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)

        # EXPORT
        feature_spec = {
            'tokens': parsing_ops.VarLenFeature(dtypes.string),
            'label': parsing_ops.FixedLenFeature([1], dtypes.int64),
        }
        serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
            feature_spec)
        export_dir = est.export_savedmodel(tempfile.mkdtemp(),
                                           serving_input_receiver_fn)
        self.assertTrue(gfile.Exists(export_dir))
Example #32
  def test_classifier_basic_warm_starting(self):
    """Tests correctness of DNNLinearCombinedClassifier default warm-start."""
    age = feature_column.numeric_column('age')
    city = feature_column.embedding_column(
        feature_column.categorical_column_with_vocabulary_list(
            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
        dimension=5)

    # Create a DNNLinearCombinedClassifier and train to save a checkpoint.
    dnn_lc_classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
        linear_feature_columns=[age],
        dnn_feature_columns=[city],
        dnn_hidden_units=[256, 128],
        model_dir=self._ckpt_and_vocab_dir,
        n_classes=4,
        linear_optimizer='SGD',
        dnn_optimizer='SGD')
    dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)

    # Create a second DNNLinearCombinedClassifier, warm-started from the first.
    # Use a learning_rate = 0.0 optimizer to check values (use SGD so we don't
    # have accumulator values that change).
    warm_started_dnn_lc_classifier = (
        dnn_linear_combined.DNNLinearCombinedClassifier(
            linear_feature_columns=[age],
            dnn_feature_columns=[city],
            dnn_hidden_units=[256, 128],
            n_classes=4,
            linear_optimizer=gradient_descent.GradientDescentOptimizer(
                learning_rate=0.0),
            dnn_optimizer=gradient_descent.GradientDescentOptimizer(
                learning_rate=0.0),
            warm_start_from=dnn_lc_classifier.model_dir))

    warm_started_dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)
    for variable_name in warm_started_dnn_lc_classifier.get_variable_names():
      self.assertAllClose(
          dnn_lc_classifier.get_variable_value(variable_name),
          warm_started_dnn_lc_classifier.get_variable_value(variable_name))
Example #33
  def test_embedding_column(self):
    """Tests that error is raised for sequence embedding column."""
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        indices=((0, 0), (1, 0), (1, 1)),
        values=(2, 0, 1),
        dense_shape=(2, 2))

    categorical_column_a = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    embedding_column_a = fc.embedding_column(
        categorical_column_a, dimension=2)

    with self.assertRaisesRegexp(
        ValueError,
        r'In embedding_column: aaa_embedding\. categorical_column must not be '
        r'of type _SequenceCategoricalColumn\.'):
      _ = fc.input_layer(
          features={'aaa': sparse_input},
          feature_columns=[embedding_column_a])
Example #34
  def test_sequence_length(self):
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        indices=((0, 0), (1, 0), (1, 1)),
        values=(2, 0, 1),
        dense_shape=(2, 2))
    expected_sequence_length = [1, 2]

    categorical_column = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    embedding_column = fc.embedding_column(
        categorical_column, dimension=2)

    _, sequence_length = embedding_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))

    with monitored_session.MonitoredSession() as sess:
      sequence_length = sess.run(sequence_length)
      self.assertAllEqual(expected_sequence_length, sequence_length)
      self.assertEqual(np.int64, sequence_length.dtype)
Example #35
  def _testAnnotationsPresentForEstimator(self, estimator_class):
    feature_columns = [
        feature_column.numeric_column('x', shape=(1,)),
        feature_column.embedding_column(
            feature_column.categorical_column_with_vocabulary_list(
                'y', vocabulary_list=['a', 'b', 'c']),
            dimension=3)
    ]
    estimator = estimator_class(
        hidden_units=(2, 2),
        feature_columns=feature_columns,
        model_dir=self._model_dir)
    model_fn = estimator.model_fn

    graph = ops.Graph()
    with graph.as_default():
      model_fn({
          'x': array_ops.constant([1.0]),
          'y': array_ops.constant(['a'])
      }, {},
               model_fn_lib.ModeKeys.PREDICT,
               config=None)

      unprocessed_features = self._getLayerAnnotationCollection(
          graph, dnn_with_layer_annotations.LayerAnnotationsCollectionNames
          .UNPROCESSED_FEATURES)
      processed_features = self._getLayerAnnotationCollection(
          graph, dnn_with_layer_annotations.LayerAnnotationsCollectionNames
          .PROCESSED_FEATURES)
      feature_columns = graph.get_collection(
          dnn_with_layer_annotations.LayerAnnotationsCollectionNames
          .FEATURE_COLUMNS)

      self.assertItemsEqual(unprocessed_features.keys(), ['x', 'y'])
      self.assertEqual(2, len(processed_features.keys()))
      self.assertEqual(2, len(feature_columns))
Example #36
    def _testAnnotationsPresentForEstimator(self, estimator_class):
        feature_columns = [
            feature_column.numeric_column('x', shape=(1, )),
            feature_column.embedding_column(
                feature_column.categorical_column_with_vocabulary_list(
                    'y', vocabulary_list=['a', 'b', 'c']),
                dimension=3)
        ]
        estimator = estimator_class(hidden_units=(2, 2),
                                    feature_columns=feature_columns,
                                    model_dir=self._model_dir)
        model_fn = estimator.model_fn

        graph = ops.Graph()
        with graph.as_default():
            model_fn(
                {
                    'x': array_ops.constant([1.0]),
                    'y': array_ops.constant(['a'])
                }, {},
                model_fn_lib.ModeKeys.PREDICT,
                config=None)

            unprocessed_features = self._getLayerAnnotationCollection(
                graph, dnn_with_layer_annotations.
                LayerAnnotationsCollectionNames.UNPROCESSED_FEATURES)
            processed_features = self._getLayerAnnotationCollection(
                graph, dnn_with_layer_annotations.
                LayerAnnotationsCollectionNames.PROCESSED_FEATURES)
            feature_columns = graph.get_collection(
                dnn_with_layer_annotations.LayerAnnotationsCollectionNames.
                FEATURE_COLUMNS)

            self.assertItemsEqual(unprocessed_features.keys(), ['x', 'y'])
            self.assertEqual(2, len(processed_features.keys()))
            self.assertEqual(2, len(feature_columns))
Example #37
from tensorflow.contrib.learn import LinearRegressor, pandas_input_fn, DNNRegressor, Experiment
from tensorflow.python.feature_column.feature_column import categorical_column_with_hash_bucket, numeric_column, \
    categorical_column_with_vocabulary_list, embedding_column, indicator_column

make = categorical_column_with_hash_bucket('make', 100)
horsepower = numeric_column('horsepower', shape=[])
cylinders = categorical_column_with_vocabulary_list(
    'num-of-cylinders', ['two', 'three', 'four', 'six', 'eight'])

###############
regressor = DNNRegressor(
    feature_columns=[
        embedding_column(make, 10),
        horsepower,
        indicator_column(cylinders),
    ],
    hidden_units=[50, 30, 10])
################
regressor = LinearRegressor(feature_columns=[make, horsepower, cylinders])

# any python generator
train_input_fn = pandas_input_fn(x=input_data,
                                 y=input_label,
                                 batch_size=64,
                                 shuffle=True,
                                 num_epochs=None)

regressor.train(train_input_fn, steps=10000)


def experiment_fn(run_config, hparams):
    regressor = DNNRegressor(...,
                             config=run_config,
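
# A separate, hedged sketch (not the original's continuation): an
# experiment_fn for tf.contrib.learn's Experiment usually builds the
# estimator with the given RunConfig and wraps it together with its input
# functions. The step count and the reuse of train_input_fn for evaluation
# below are illustrative assumptions only.
def _example_experiment_fn(run_config, hparams):
    del hparams  # unused in this sketch
    estimator = DNNRegressor(
        feature_columns=[embedding_column(make, 10), horsepower,
                         indicator_column(cylinders)],
        hidden_units=[50, 30, 10],
        config=run_config)
    return Experiment(
        estimator=estimator,
        train_input_fn=train_input_fn,
        eval_input_fn=train_input_fn,  # placeholder; use a real eval set
        train_steps=10000)
# Such a function is typically handed to tf.contrib.learn.learn_runner.run().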
  def _complete_flow_with_mode(self, mode):
    n_classes = 3
    input_dimension = 2
    batch_size = 12

    data = np.linspace(
        0., n_classes - 1., batch_size * input_dimension, dtype=np.float32)
    x_data = data.reshape(batch_size, input_dimension)
    categorical_data = np.random.random_integers(
        0, len(x_data), size=len(x_data))
    y_data = np.reshape(self._as_label(data[:batch_size]), (batch_size, 1))
    train_input_fn = numpy_io.numpy_input_fn(
        x={'x': x_data,
           'categories': categorical_data},
        y=y_data,
        batch_size=batch_size,
        num_epochs=None,
        shuffle=True)
    eval_input_fn = numpy_io.numpy_input_fn(
        x={'x': x_data,
           'categories': categorical_data},
        y=y_data,
        batch_size=batch_size,
        shuffle=False)
    predict_input_fn = numpy_io.numpy_input_fn(
        x={'x': x_data,
           'categories': categorical_data},
        batch_size=batch_size,
        shuffle=False)

    feature_columns = [
        feature_column.numeric_column('x', shape=(input_dimension,)),
        feature_column.embedding_column(
            feature_column.categorical_column_with_vocabulary_list(
                'categories',
                vocabulary_list=np.linspace(
                    0., len(x_data), len(x_data), dtype=np.int64)), 1)
    ]

    estimator = dnn.DNNClassifier(
        hidden_units=(2, 2),
        feature_columns=feature_columns,
        n_classes=n_classes,
        model_dir=self._model_dir)

    def optimizer_fn():
      return optimizers.get_optimizer_instance('Adagrad', learning_rate=0.05)

    if not mode:  # Use the public `replicate_model_fn`.
      model_fn = replicate_model_fn.replicate_model_fn(
          estimator.model_fn,
          optimizer_fn,
          devices=['/gpu:0', '/gpu:1', '/gpu:2'])
    else:
      model_fn = replicate_model_fn._replicate_model_fn_with_mode(
          estimator.model_fn,
          optimizer_fn,
          devices=['/gpu:0', '/gpu:1', '/gpu:2'],
          mode=mode)

    estimator = estimator_lib.Estimator(
        model_fn=model_fn,
        model_dir=estimator.model_dir,
        config=estimator.config,
        params=estimator.params)

    num_steps = 10
    estimator.train(train_input_fn, steps=num_steps)

    scores = estimator.evaluate(eval_input_fn)
    self.assertEqual(num_steps, scores[ops_lib.GraphKeys.GLOBAL_STEP])
    self.assertIn('loss', six.iterkeys(scores))

    predicted_proba = np.array([
        x[prediction_keys.PredictionKeys.PROBABILITIES]
        for x in estimator.predict(predict_input_fn)
    ])
    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)

    feature_spec = feature_column.make_parse_example_spec(feature_columns)
    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
        feature_spec)
    export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
                                             serving_input_receiver_fn)
    self.assertTrue(gfile.Exists(export_dir))
Exemplo n.º 39
0
 def test_one_shot_prediction_head_export(self, estimator_factory):
   def _new_temp_dir():
     return os.path.join(test.get_temp_dir(), str(ops.uid()))
   model_dir = _new_temp_dir()
   categorical_column = feature_column.categorical_column_with_hash_bucket(
       key="categorical_exogenous_feature", hash_bucket_size=16)
   exogenous_feature_columns = [
       feature_column.numeric_column(
           "2d_exogenous_feature", shape=(2,)),
       feature_column.embedding_column(
           categorical_column=categorical_column, dimension=10)]
   estimator = estimator_factory(
       model_dir=model_dir,
       exogenous_feature_columns=exogenous_feature_columns,
       head_type=ts_head_lib.OneShotPredictionHead)
   train_features = {
       feature_keys.TrainEvalFeatures.TIMES: numpy.arange(
           20, dtype=numpy.int64),
       feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
           20, dtype=numpy.float32)[:, None], [1, 5]),
       "2d_exogenous_feature": numpy.ones([20, 2]),
       "categorical_exogenous_feature": numpy.array(
           ["strkey"] * 20)[:, None]
   }
   train_input_fn = input_pipeline.RandomWindowInputFn(
       input_pipeline.NumpyReader(train_features), shuffle_seed=2,
       num_threads=1, batch_size=16, window_size=16)
   estimator.train(input_fn=train_input_fn, steps=5)
   result = estimator.evaluate(input_fn=train_input_fn, steps=1)
   self.assertIn("average_loss", result)
   self.assertNotIn(feature_keys.State.STATE_TUPLE, result)
   input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
   export_location = estimator.export_saved_model(_new_temp_dir(),
                                                  input_receiver_fn)
   graph = ops.Graph()
   with graph.as_default():
     with session_lib.Session() as session:
       signatures = loader.load(
           session, [tag_constants.SERVING], export_location)
       self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
                        list(signatures.signature_def.keys()))
       predict_signature = signatures.signature_def[
           feature_keys.SavedModelLabels.PREDICT]
       six.assertCountEqual(
           self,
           [feature_keys.FilteringFeatures.TIMES,
            feature_keys.FilteringFeatures.VALUES,
            "2d_exogenous_feature",
            "categorical_exogenous_feature"],
           predict_signature.inputs.keys())
       features = {
           feature_keys.TrainEvalFeatures.TIMES: numpy.tile(
               numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1]),
           feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
               20, dtype=numpy.float32)[None, :, None], [2, 1, 5]),
           "2d_exogenous_feature": numpy.ones([2, 35, 2]),
           "categorical_exogenous_feature": numpy.tile(numpy.array(
               ["strkey"] * 35)[None, :, None], [2, 1, 1])
       }
       feeds = {
           graph.as_graph_element(input_value.name): features[input_key]
           for input_key, input_value in predict_signature.inputs.items()}
       fetches = {output_key: graph.as_graph_element(output_value.name)
                  for output_key, output_value
                  in predict_signature.outputs.items()}
       output = session.run(fetches, feed_dict=feeds)
       self.assertEqual((2, 15, 5), output["mean"].shape)
   # Build a parsing input function, then make a tf.Example for it to parse.
   export_location = estimator.export_saved_model(
       _new_temp_dir(),
       estimator.build_one_shot_parsing_serving_input_receiver_fn(
           filtering_length=20, prediction_length=15))
   graph = ops.Graph()
   with graph.as_default():
     with session_lib.Session() as session:
       example = example_pb2.Example()
       times = example.features.feature[feature_keys.TrainEvalFeatures.TIMES]
       values = example.features.feature[feature_keys.TrainEvalFeatures.VALUES]
       times.int64_list.value.extend(range(35))
       for i in range(20):
         values.float_list.value.extend(
             [float(i) * 2. + feature_number
              for feature_number in range(5)])
       real_feature = example.features.feature["2d_exogenous_feature"]
        categorical_feature = example.features.feature[
            "categorical_exogenous_feature"]
        for i in range(35):
          real_feature.float_list.value.extend([1, 1])
          categorical_feature.bytes_list.value.append(b"strkey")
       # Serialize the tf.Example for feeding to the Session
       examples = [example.SerializeToString()] * 2
       signatures = loader.load(
           session, [tag_constants.SERVING], export_location)
       predict_signature = signatures.signature_def[
           feature_keys.SavedModelLabels.PREDICT]
       ((_, input_value),) = predict_signature.inputs.items()
       feeds = {graph.as_graph_element(input_value.name): examples}
       fetches = {output_key: graph.as_graph_element(output_value.name)
                  for output_key, output_value
                  in predict_signature.outputs.items()}
       output = session.run(fetches, feed_dict=feeds)
       self.assertEqual((2, 15, 5), output["mean"].shape)
Exemplo n.º 40
0
    def _build_feature_columns(self):
        multi_hot_feature_columns = {}
        multi_hot_feature_columns_deep = {}
        multi_category_feature_columns = {}
        continuous_feature_columns = {}
        crossed_feature_columns = []
        bucketized_feature_columns = []
        embedding_feature_columns = []

        if self._data_conf.multi_hot_columns is not None:
            for column in self._data_conf.multi_hot_columns:
                multi_hot_feature_columns[
                    column] = categorical_column_with_vocabulary_list(
                        column,
                        self._data_conf.multi_hot_columns[column],
                        dtype=tf.string)
                multi_hot_feature_columns_deep[column] = indicator_column(
                    multi_hot_feature_columns[column])

        if self._data_conf.multi_category_columns is not None:
            multi_category_feature_columns = {
                column:
                categorical_column_with_hash_bucket(column,
                                                    hash_bucket_size=1000)
                for column in self._data_conf.multi_category_columns
            }

        if self._data_conf.continuous_columns is not None:
            continuous_feature_columns = {
                column: numeric_column(column)
                for column in self._data_conf.continuous_columns
            }

        if self._data_conf.crossed_columns is not None:
            crossed_feature_columns = [
                crossed_column(_, hash_bucket_size=100000)
                for _ in self._data_conf.crossed_columns
            ]

        if self._data_conf.bucketized_columns is not None:
            for column, boundary in self._data_conf.bucketized_columns.items():
                bucketized_feature_columns.append(
                    bucketized_column(continuous_feature_columns[column],
                                      boundaries=boundary))

        if len(multi_category_feature_columns) > 0:
            embedding_feature_columns = [
                embedding_column(
                    _, dimension=self._model_conf.embedding_dimension)
                for _ in multi_category_feature_columns.values()
            ]

        self._feature_mapping = {
            0: list(multi_hot_feature_columns.values()),
            1: list(multi_category_feature_columns.values()),
            2: list(continuous_feature_columns.values()),
            3: crossed_feature_columns,
            4: bucketized_feature_columns,
            5: embedding_feature_columns,
            6: list(multi_hot_feature_columns_deep.values())
        }

        self._build_feature_columns_for_model()
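
# A hedged, illustrative sketch (separate from the class above): grouped
# column lists like the ones in self._feature_mapping are commonly split into
# "wide" (linear) and "deep" (DNN) inputs of a combined estimator. The split
# below is an assumption; _build_feature_columns_for_model() is not shown.
import tensorflow as tf

def build_wide_deep_estimator(feature_mapping, model_dir=None):
    # Wide side: multi-hot categorical, crossed and bucketized columns.
    wide_columns = feature_mapping[0] + feature_mapping[3] + feature_mapping[4]
    # Deep side: numeric, embedding and indicator columns.
    deep_columns = feature_mapping[2] + feature_mapping[5] + feature_mapping[6]
    return tf.estimator.DNNLinearCombinedClassifier(
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[128, 64],
        model_dir=model_dir)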
  def test_embedding_column(self):
    vocabulary_size = 3
    sparse_input_a = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        indices=((0, 0), (1, 0), (1, 1)),
        values=(2, 0, 1),
        dense_shape=(2, 2))
    sparse_input_b = sparse_tensor.SparseTensorValue(
        # example 0, ids [1]
        # example 1, ids [2, 0]
        indices=((0, 0), (1, 0), (1, 1)),
        values=(1, 2, 0),
        dense_shape=(2, 2))

    embedding_dimension_a = 2
    embedding_values_a = (
        (1., 2.),  # id 0
        (3., 4.),  # id 1
        (5., 6.)  # id 2
    )
    embedding_dimension_b = 3
    embedding_values_b = (
        (11., 12., 13.),  # id 0
        (14., 15., 16.),  # id 1
        (17., 18., 19.)  # id 2
    )
    def _get_initializer(embedding_dimension, embedding_values):
      def _initializer(shape, dtype, partition_info):
        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
        self.assertEqual(dtypes.float32, dtype)
        self.assertIsNone(partition_info)
        return embedding_values
      return _initializer

    expected_input_layer = [
        # example 0, ids_a [2], ids_b [1]
        [[5., 6., 14., 15., 16.], [0., 0., 0., 0., 0.]],
        # example 1, ids_a [0, 1], ids_b [2, 0]
        [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]],
    ]
    expected_sequence_length = [1, 2]

    categorical_column_a = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    embedding_column_a = fc.embedding_column(
        categorical_column_a, dimension=embedding_dimension_a,
        initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
    categorical_column_b = sfc.sequence_categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    embedding_column_b = fc.embedding_column(
        categorical_column_b, dimension=embedding_dimension_b,
        initializer=_get_initializer(embedding_dimension_b, embedding_values_b))

    input_layer, sequence_length = sfc.sequence_input_layer(
        features={
            'aaa': sparse_input_a,
            'bbb': sparse_input_b,
        },
        # Test that columns are reordered alphabetically.
        feature_columns=[embedding_column_b, embedding_column_a])

    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    self.assertItemsEqual(
        ('sequence_input_layer/aaa_embedding/embedding_weights:0',
         'sequence_input_layer/bbb_embedding/embedding_weights:0'),
        tuple([v.name for v in global_vars]))
    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(embedding_values_a, global_vars[0].eval(session=sess))
      self.assertAllEqual(embedding_values_b, global_vars[1].eval(session=sess))
      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
      self.assertAllEqual(
          expected_sequence_length, sequence_length.eval(session=sess))
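
# A minimal, assumed sketch (separate from the test above) of how the
# (input_layer, sequence_length) pair returned by sequence_input_layer is
# typically consumed: a dynamic RNN receives sequence_length so that padded
# timesteps beyond each example's true length do not affect its final state.
import tensorflow as tf

def run_rnn_over_sequence_input(input_layer, sequence_length, num_units=4):
  cell = tf.nn.rnn_cell.BasicRNNCell(num_units=num_units)
  # outputs: [batch, max_time, num_units]; final_state: last valid state.
  outputs, final_state = tf.nn.dynamic_rnn(
      cell, inputs=input_layer, sequence_length=sequence_length,
      dtype=tf.float32)
  return outputs, final_state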
Exemplo n.º 42
0
 def test_one_shot_prediction_head_export(self):
     model_dir = self.get_temp_dir()
     categorical_column = feature_column.categorical_column_with_hash_bucket(
         key="categorical_exogenous_feature", hash_bucket_size=16)
     exogenous_feature_columns = [
         feature_column.numeric_column("2d_exogenous_feature", shape=(2, )),
         feature_column.embedding_column(
             categorical_column=categorical_column, dimension=10)
     ]
     estimator = ts_estimators.TimeSeriesRegressor(
         model=lstm_example._LSTMModel(
             num_features=5,
             num_units=128,
             exogenous_feature_columns=exogenous_feature_columns),
         optimizer=adam.AdamOptimizer(0.001),
         config=estimator_lib.RunConfig(tf_random_seed=4),
         state_manager=state_management.ChainingStateManager(),
         head_type=ts_head_lib.OneShotPredictionHead,
         model_dir=model_dir)
     train_features = {
         feature_keys.TrainEvalFeatures.TIMES:
         numpy.arange(20, dtype=numpy.int64),
         feature_keys.TrainEvalFeatures.VALUES:
         numpy.tile(numpy.arange(20, dtype=numpy.float32)[:, None], [1, 5]),
         "2d_exogenous_feature":
         numpy.ones([20, 2]),
         "categorical_exogenous_feature":
         numpy.array(["strkey"] * 20)[:, None]
     }
     train_input_fn = input_pipeline.RandomWindowInputFn(
         input_pipeline.NumpyReader(train_features),
         shuffle_seed=2,
         num_threads=1,
         batch_size=16,
         window_size=16)
     estimator.train(input_fn=train_input_fn, steps=5)
     input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
     export_location = estimator.export_savedmodel(self.get_temp_dir(),
                                                   input_receiver_fn)
     graph = ops.Graph()
     with graph.as_default():
         with session_lib.Session() as session:
             signatures = loader.load(session, [tag_constants.SERVING],
                                      export_location)
             self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
                              list(signatures.signature_def.keys()))
             predict_signature = signatures.signature_def[
                 feature_keys.SavedModelLabels.PREDICT]
             six.assertCountEqual(self, [
                 feature_keys.FilteringFeatures.TIMES,
                 feature_keys.FilteringFeatures.VALUES,
                 "2d_exogenous_feature", "categorical_exogenous_feature"
             ], predict_signature.inputs.keys())
             features = {
                 feature_keys.TrainEvalFeatures.TIMES:
                 numpy.tile(
                     numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1]),
                 feature_keys.TrainEvalFeatures.VALUES:
                 numpy.tile(
                     numpy.arange(20, dtype=numpy.float32)[None, :, None],
                     [2, 1, 5]),
                 "2d_exogenous_feature":
                 numpy.ones([2, 35, 2]),
                 "categorical_exogenous_feature":
                 numpy.tile(
                     numpy.array(["strkey"] * 35)[None, :, None], [2, 1, 1])
             }
             feeds = {
                 graph.as_graph_element(input_value.name):
                 features[input_key]
                 for input_key, input_value in
                 predict_signature.inputs.items()
             }
             fetches = {
                 output_key: graph.as_graph_element(output_value.name)
                 for output_key, output_value in
                 predict_signature.outputs.items()
             }
             output = session.run(fetches, feed_dict=feeds)
             self.assertAllEqual((2, 15, 5), output["mean"].shape)
Exemplo n.º 43
0
    def test_one_shot_prediction_head_export(self, estimator_factory):
        def _new_temp_dir():
            return os.path.join(test.get_temp_dir(), str(ops.uid()))

        model_dir = _new_temp_dir()
        categorical_column = feature_column.categorical_column_with_hash_bucket(
            key="categorical_exogenous_feature", hash_bucket_size=16)
        exogenous_feature_columns = [
            feature_column.numeric_column("2d_exogenous_feature", shape=(2, )),
            feature_column.embedding_column(
                categorical_column=categorical_column, dimension=10)
        ]
        estimator = estimator_factory(
            model_dir=model_dir,
            exogenous_feature_columns=exogenous_feature_columns,
            head_type=ts_head_lib.OneShotPredictionHead)
        train_features = {
            feature_keys.TrainEvalFeatures.TIMES:
            numpy.arange(20, dtype=numpy.int64),
            feature_keys.TrainEvalFeatures.VALUES:
            numpy.tile(numpy.arange(20, dtype=numpy.float32)[:, None], [1, 5]),
            "2d_exogenous_feature":
            numpy.ones([20, 2]),
            "categorical_exogenous_feature":
            numpy.array(["strkey"] * 20)[:, None]
        }
        train_input_fn = input_pipeline.RandomWindowInputFn(
            input_pipeline.NumpyReader(train_features),
            shuffle_seed=2,
            num_threads=1,
            batch_size=16,
            window_size=16)
        estimator.train(input_fn=train_input_fn, steps=5)
        result = estimator.evaluate(input_fn=train_input_fn, steps=1)
        self.assertIn("average_loss", result)
        self.assertNotIn(feature_keys.State.STATE_TUPLE, result)
        input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
        export_location = estimator.export_savedmodel(_new_temp_dir(),
                                                      input_receiver_fn)
        graph = ops.Graph()
        with graph.as_default():
            with session_lib.Session() as session:
                signatures = loader.load(session, [tag_constants.SERVING],
                                         export_location)
                self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
                                 list(signatures.signature_def.keys()))
                predict_signature = signatures.signature_def[
                    feature_keys.SavedModelLabels.PREDICT]
                six.assertCountEqual(self, [
                    feature_keys.FilteringFeatures.TIMES,
                    feature_keys.FilteringFeatures.VALUES,
                    "2d_exogenous_feature", "categorical_exogenous_feature"
                ], predict_signature.inputs.keys())
                features = {
                    feature_keys.TrainEvalFeatures.TIMES:
                    numpy.tile(
                        numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1]),
                    feature_keys.TrainEvalFeatures.VALUES:
                    numpy.tile(
                        numpy.arange(20, dtype=numpy.float32)[None, :, None],
                        [2, 1, 5]),
                    "2d_exogenous_feature":
                    numpy.ones([2, 35, 2]),
                    "categorical_exogenous_feature":
                    numpy.tile(
                        numpy.array(["strkey"] * 35)[None, :, None], [2, 1, 1])
                }
                feeds = {
                    graph.as_graph_element(input_value.name):
                    features[input_key]
                    for input_key, input_value in
                    predict_signature.inputs.items()
                }
                fetches = {
                    output_key: graph.as_graph_element(output_value.name)
                    for output_key, output_value in
                    predict_signature.outputs.items()
                }
                output = session.run(fetches, feed_dict=feeds)
                self.assertEqual((2, 15, 5), output["mean"].shape)
        # Build a parsing input function, then make a tf.Example for it to parse.
        export_location = estimator.export_savedmodel(
            _new_temp_dir(),
            estimator.build_one_shot_parsing_serving_input_receiver_fn(
                filtering_length=20, prediction_length=15))
        graph = ops.Graph()
        with graph.as_default():
            with session_lib.Session() as session:
                example = example_pb2.Example()
                times = example.features.feature[
                    feature_keys.TrainEvalFeatures.TIMES]
                values = example.features.feature[
                    feature_keys.TrainEvalFeatures.VALUES]
                times.int64_list.value.extend(range(35))
                for i in range(20):
                    values.float_list.value.extend([
                        float(i) * 2. + feature_number
                        for feature_number in range(5)
                    ])
                real_feature = example.features.feature["2d_exogenous_feature"]
                categorical_feature = example.features.feature[
                    "categorical_exogenous_feature"]
                for i in range(35):
                    real_feature.float_list.value.extend([1, 1])
                    categorical_feature.bytes_list.value.append(b"strkey")
                # Serialize the tf.Example for feeding to the Session
                examples = [example.SerializeToString()] * 2
                signatures = loader.load(session, [tag_constants.SERVING],
                                         export_location)
                predict_signature = signatures.signature_def[
                    feature_keys.SavedModelLabels.PREDICT]
                ((_, input_value), ) = predict_signature.inputs.items()
                feeds = {graph.as_graph_element(input_value.name): examples}
                fetches = {
                    output_key: graph.as_graph_element(output_value.name)
                    for output_key, output_value in
                    predict_signature.outputs.items()
                }
                output = session.run(fetches, feed_dict=feeds)
                self.assertEqual((2, 15, 5), output["mean"].shape)
Exemplo n.º 44
0
  def testWarmStartEmbeddingColumnLinearModel(self):
    # Create old and new vocabs for embedding column "sc_vocab".
    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                        "old_vocab")
    new_vocab_path = self._write_vocab(
        ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
        "new_vocab")

    # Save checkpoint from which to warm-start.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        variable_scope.get_variable(
            "linear_model/sc_vocab_embedding/embedding_weights",
            initializer=[[0.5, 0.4], [1., 1.1], [2., 2.2], [3., 3.3]])
        variable_scope.get_variable(
            "linear_model/sc_vocab_embedding/weights",
            initializer=[[0.69], [0.71]])
        self._write_checkpoint(sess)

    def _partitioner(shape, dtype):  # pylint:disable=unused-argument
      # Partition each var into 2 equal slices.
      partitions = [1] * len(shape)
      partitions[0] = min(2, shape[0].value)
      return partitions

    # Create feature columns.
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
    emb_vocab = fc.embedding_column(
        categorical_column=sc_vocab,
        dimension=2)
    all_deep_cols = [emb_vocab]
    # New graph, new session with warm-starting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = {}
        with variable_scope.variable_scope("", partitioner=_partitioner):
          # Create the variables.
          fc.linear_model(
              features=self._create_dummy_inputs(),
              feature_columns=all_deep_cols,
              cols_to_vars=cols_to_vars)

        # Construct the vocab_info for the embedding weight.
        vocab_info = ws_util.VocabInfo(
            new_vocab=sc_vocab.vocabulary_file,
            new_vocab_size=sc_vocab.vocabulary_size,
            num_oov_buckets=sc_vocab.num_oov_buckets,
            old_vocab=prev_vocab_path,
            # Can't use constant_initializer with load_and_remap.  In practice,
            # use a truncated normal initializer.
            backup_initializer=init_ops.random_uniform_initializer(
                minval=0.42, maxval=0.42))
        ws_util.warm_start(
            self.get_temp_dir(),
            vars_to_warm_start=".*sc_vocab.*",
            var_name_to_vocab_info={
                "linear_model/sc_vocab_embedding/embedding_weights": vocab_info
            })
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warm-started. Var corresponding to
        # emb_vocab should be correctly warm-started after vocab remapping.
        # Missing values are filled in with the EmbeddingColumn's initializer.
        self._assert_cols_to_vars(
            cols_to_vars,
            {
                emb_vocab: [
                    # linear weights part 0.
                    np.array([[0.69]]),
                    # linear weights part 1.
                    np.array([[0.71]]),
                    # embedding_weights part 0.
                    np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
                    # embedding_weights part 1.
                    np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]])
                ]
            },
            sess)
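
# A hedged sketch (separate from the test above): outside of tests, the same
# vocabulary-remapped warm start is usually requested declaratively by passing
# tf.estimator.WarmStartSettings to an estimator rather than calling
# warm_start() by hand. The checkpoint path, regex and estimator choice below
# are illustrative assumptions.
import tensorflow as tf

def make_warm_started_linear_model(feature_columns, ckpt_dir, vocab_info,
                                   model_dir=None):
  warm_start_settings = tf.estimator.WarmStartSettings(
      ckpt_to_initialize_from=ckpt_dir,
      vars_to_warm_start=".*sc_vocab.*",
      var_name_to_vocab_info={
          "linear_model/sc_vocab_embedding/embedding_weights": vocab_info
      })
  return tf.estimator.LinearClassifier(
      feature_columns=feature_columns,
      model_dir=model_dir,
      warm_start_from=warm_start_settings)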
    def test_complete_flow(self):
        n_classes = 3
        input_dimension = 2
        batch_size = 12

        data = np.linspace(0.,
                           n_classes - 1.,
                           batch_size * input_dimension,
                           dtype=np.float32)
        x_data = data.reshape(batch_size, input_dimension)
        categorical_data = np.random.random_integers(0,
                                                     len(x_data),
                                                     size=len(x_data))
        y_data = np.reshape(self._as_label(data[:batch_size]), (batch_size, 1))
        train_input_fn = numpy_io.numpy_input_fn(x={
            'x': x_data,
            'categories': categorical_data
        },
                                                 y=y_data,
                                                 batch_size=batch_size,
                                                 num_epochs=None,
                                                 shuffle=True)
        eval_input_fn = numpy_io.numpy_input_fn(x={
            'x': x_data,
            'categories': categorical_data
        },
                                                y=y_data,
                                                batch_size=batch_size,
                                                shuffle=False)
        predict_input_fn = numpy_io.numpy_input_fn(x={
            'x': x_data,
            'categories': categorical_data
        },
                                                   batch_size=batch_size,
                                                   shuffle=False)

        feature_columns = [
            feature_column.numeric_column('x', shape=(input_dimension, )),
            feature_column.embedding_column(
                feature_column.categorical_column_with_vocabulary_list(
                    'categories',
                    vocabulary_list=np.linspace(0.,
                                                len(x_data),
                                                len(x_data),
                                                dtype=np.int64)), 1)
        ]

        estimator = dnn.DNNClassifier(hidden_units=(2, 2),
                                      feature_columns=feature_columns,
                                      n_classes=n_classes,
                                      model_dir=self._model_dir)

        def optimizer_fn():
            return optimizers.get_optimizer_instance('Adagrad',
                                                     learning_rate=0.05)

        estimator = estimator_lib.Estimator(
            model_fn=replicate_model_fn.replicate_model_fn(
                estimator.model_fn,
                optimizer_fn,
                devices=['/gpu:0', '/gpu:1', '/gpu:2']),
            model_dir=estimator.model_dir,
            config=estimator.config,
            params=estimator.params)

        num_steps = 10
        estimator.train(train_input_fn, steps=num_steps)

        scores = estimator.evaluate(eval_input_fn)
        self.assertEqual(num_steps, scores[ops_lib.GraphKeys.GLOBAL_STEP])
        self.assertIn('loss', six.iterkeys(scores))

        predicted_proba = np.array([
            x[prediction_keys.PredictionKeys.PROBABILITIES]
            for x in estimator.predict(predict_input_fn)
        ])
        self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)

        feature_spec = feature_column.make_parse_example_spec(feature_columns)
        serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
            feature_spec)
        export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
                                                 serving_input_receiver_fn)
        self.assertTrue(gfile.Exists(export_dir))
Exemplo n.º 46
0
 def test_one_shot_prediction_head_export(self, estimator_factory):
   model_dir = os.path.join(test.get_temp_dir(), str(ops.uid()))
   categorical_column = feature_column.categorical_column_with_hash_bucket(
       key="categorical_exogenous_feature", hash_bucket_size=16)
   exogenous_feature_columns = [
       feature_column.numeric_column(
           "2d_exogenous_feature", shape=(2,)),
       feature_column.embedding_column(
           categorical_column=categorical_column, dimension=10)]
   estimator = estimator_factory(
       model_dir=model_dir,
       exogenous_feature_columns=exogenous_feature_columns,
       head_type=ts_head_lib.OneShotPredictionHead)
   train_features = {
       feature_keys.TrainEvalFeatures.TIMES: numpy.arange(
           20, dtype=numpy.int64),
       feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
           20, dtype=numpy.float32)[:, None], [1, 5]),
       "2d_exogenous_feature": numpy.ones([20, 2]),
       "categorical_exogenous_feature": numpy.array(
           ["strkey"] * 20)[:, None]
   }
   train_input_fn = input_pipeline.RandomWindowInputFn(
       input_pipeline.NumpyReader(train_features), shuffle_seed=2,
       num_threads=1, batch_size=16, window_size=16)
   estimator.train(input_fn=train_input_fn, steps=5)
   input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
   export_location = estimator.export_savedmodel(test.get_temp_dir(),
                                                 input_receiver_fn)
   graph = ops.Graph()
   with graph.as_default():
     with session_lib.Session() as session:
       signatures = loader.load(
           session, [tag_constants.SERVING], export_location)
       self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
                        list(signatures.signature_def.keys()))
       predict_signature = signatures.signature_def[
           feature_keys.SavedModelLabels.PREDICT]
       six.assertCountEqual(
           self,
           [feature_keys.FilteringFeatures.TIMES,
            feature_keys.FilteringFeatures.VALUES,
            "2d_exogenous_feature",
            "categorical_exogenous_feature"],
           predict_signature.inputs.keys())
       features = {
           feature_keys.TrainEvalFeatures.TIMES: numpy.tile(
               numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1]),
           feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
               20, dtype=numpy.float32)[None, :, None], [2, 1, 5]),
           "2d_exogenous_feature": numpy.ones([2, 35, 2]),
           "categorical_exogenous_feature": numpy.tile(numpy.array(
               ["strkey"] * 35)[None, :, None], [2, 1, 1])
       }
       feeds = {
           graph.as_graph_element(input_value.name): features[input_key]
           for input_key, input_value in predict_signature.inputs.items()}
       fetches = {output_key: graph.as_graph_element(output_value.name)
                  for output_key, output_value
                  in predict_signature.outputs.items()}
       output = session.run(fetches, feed_dict=feeds)
       self.assertEqual((2, 15, 5), output["mean"].shape)