def define_kpls_for_training(self, use_adapt):
        # Define KPLs under the strategy's scope. Right now, if they have lookup
        # tables, they will be created on the client. Their variables will be
        # created on the PS. Ideally they should be cached on each worker since
        # they will not be changed during a training step.
        if use_adapt:
            feature_lookup_layer = string_lookup.StringLookup(
                num_oov_indices=1)
            feature_lookup_layer.adapt(FEATURE_VOCAB)
            label_lookup_layer = string_lookup.StringLookup(num_oov_indices=0,
                                                            mask_token=None)
            label_lookup_layer.adapt(LABEL_VOCAB)
        else:
            feature_lookup_layer = string_lookup.StringLookup(
                vocabulary=FEATURE_VOCAB, num_oov_indices=1)
            label_lookup_layer = string_lookup.StringLookup(
                vocabulary=LABEL_VOCAB, num_oov_indices=0, mask_token=None)

        raw_feature_input = keras.layers.Input(shape=(3, ),
                                               dtype=dtypes.string,
                                               name="feature",
                                               ragged=True)
        feature_id_input = feature_lookup_layer(raw_feature_input)

        # Model creates variables as well.
        feature_ps = keras.Model({"features": raw_feature_input},
                                 feature_id_input)

        raw_label_input = keras.layers.Input(shape=(),
                                             dtype=dtypes.string,
                                             name="label")
        label_id_input = label_lookup_layer(raw_label_input)
        label_ps = keras.Model({"label": raw_label_input}, label_id_input)

        return feature_ps, label_ps
Example #2
  def test_get_vocab_returns_str(self):
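    # get_vocabulary() should return Python str entries (including the "[UNK]"
    # token), and the same vocabulary should round-trip through an inverted layer.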
    vocab_data = ["earth", "wind", "and", "fire"]
    expected_vocab = ["[UNK]", "earth", "wind", "and", "fire"]
    layer = string_lookup.StringLookup(vocabulary=vocab_data)
    layer_vocab = layer.get_vocabulary()
    self.assertAllEqual(expected_vocab, layer_vocab)
    self.assertIsInstance(layer_vocab[0], str)

    inverse_layer = string_lookup.StringLookup(
        vocabulary=layer.get_vocabulary(), invert=True)
    layer_vocab = inverse_layer.get_vocabulary()
    self.assertAllEqual(expected_vocab, layer_vocab)
    self.assertIsInstance(layer_vocab[0], str)
Example #3
 def test_non_unique_vocab_from_file_fails(self):
   vocab_list = ["earth", "wind", "and", "fire", "earth"]
   vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
   with self.assertRaisesRegex(
       errors_impl.FailedPreconditionError,
       "HashTable has different value for same key.*earth"):
     _ = string_lookup.StringLookup(vocabulary=vocab_path)
Example #4
  def test_forward_backward_explicit_vocab(self):
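    # A forward lookup followed by an inverted lookup should reproduce the
    # input strings, with the out-of-vocabulary "michigan" coming back as "[UNK]".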
    vocab_data = ["earth", "wind", "and", "fire"]
    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    expected_output = np.array([["earth", "wind", "and", "fire"],
                                ["fire", "and", "earth", "[UNK]"]])

    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = string_lookup.StringLookup(vocabulary=vocab_data)
    invert_layer = string_lookup.StringLookup(
        vocabulary=vocab_data, invert=True)
    int_data = layer(input_data)
    out_data = invert_layer(int_data)
    model = keras.Model(inputs=input_data, outputs=out_data)
    output_data = model.predict(input_array)
    self.assertAllEqual(expected_output, output_data)
Example #5
  def embedding_varlen(self, batch_size, max_length):
    """Benchmark a variable-length embedding."""
    # Data and constants.
    vocab = fc_bm.create_vocabulary(32768)
    path = self._write_to_temp_file("tmp", vocab)

    data = fc_bm.create_string_data(
        max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15)

    # Keras implementation
    model = keras.Sequential()
    model.add(
        keras.Input(
            shape=(max_length,), name="data", ragged=True, dtype=dt.string))
    model.add(string_lookup.StringLookup(vocabulary=path, mask_token=None))

    # FC implementation
    fc = sfc.sequence_categorical_column_with_vocabulary_list(
        key="data", vocabulary_list=vocab, num_oov_buckets=1)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
      fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    keras_data = {"data": data}
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {"data": data.to_sparse()}
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
Example #6
  def test_sparse_output(self):
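    # With output_mode="multi_hot" and sparse=True, the symbolic output of the
    # layer is expected to be a SparseKerasTensor.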
    vocab_data = ["earth", "wind", "and", "fire"]

    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = string_lookup.StringLookup(
        vocabulary=vocab_data, output_mode="multi_hot", sparse=True)
    res = layer(input_data)
    self.assertEqual(res.__class__.__name__, "SparseKerasTensor")
Example #7
 def define_reverse_lookup_layer(self):
     # Only needed for serving.
     label_inverse_lookup_layer = string_lookup.StringLookup(
         num_oov_indices=1,
         mask_token=None,
         vocabulary=LABEL_VOCAB,
         invert=True)
     return label_inverse_lookup_layer
Example #8
    def define_reverse_lookup_layer(self):
        """Create string reverse lookup layer for serving."""

        label_inverse_lookup_layer = string_lookup.StringLookup(
            num_oov_indices=1,
            mask_token=None,
            vocabulary=self.LABEL_VOCAB,
            invert=True)
        return label_inverse_lookup_layer
Example #9
 def dataset_fn(input_context):
     del input_context
     lookup_layer = string_lookup.StringLookup(num_oov_indices=1,
                                               vocabulary=filepath)
     x = np.array([["earth", "wind", "and", "fire"],
                   ["fire", "and", "earth", "michigan"]])
     y = np.array([0, 1])
     map_fn = lambda x, y: (lookup_layer(x), y)
     return dataset_ops.DatasetV2.from_tensor_slices(
         (x, y)).shuffle(10).repeat().batch(2).map(map_fn)
Example #10
    def define_kpls_for_training(self, use_adapt):
        """Function that defines KPL used for unit tests of tf.distribute.

    Args:
      use_adapt: if adapt will be called. False means there will be precomputed
        statistics.

    Returns:
      feature_mapper: a simple keras model with one keras StringLookup layer
      which maps feature to index.
      label_mapper: similar to feature_mapper, but maps label to index.

    """
        if use_adapt:
            feature_lookup_layer = (string_lookup.StringLookup(
                num_oov_indices=1))
            feature_lookup_layer.adapt(self.FEATURE_VOCAB)
            label_lookup_layer = (string_lookup.StringLookup(num_oov_indices=0,
                                                             mask_token=None))
            label_lookup_layer.adapt(self.LABEL_VOCAB)
        else:
            feature_lookup_layer = (string_lookup.StringLookup(
                vocabulary=self.FEATURE_VOCAB, num_oov_indices=1))
            label_lookup_layer = (string_lookup.StringLookup(
                vocabulary=self.LABEL_VOCAB,
                num_oov_indices=0,
                mask_token=None))

        raw_feature_input = keras.layers.Input(shape=(3, ),
                                               dtype=dtypes.string,
                                               name="feature",
                                               ragged=True)
        feature_id_input = feature_lookup_layer(raw_feature_input)
        feature_mapper = keras.Model({"features": raw_feature_input},
                                     feature_id_input)

        raw_label_input = keras.layers.Input(shape=(1, ),
                                             dtype=dtypes.string,
                                             name="label")
        label_id_input = label_lookup_layer(raw_label_input)
        label_mapper = keras.Model({"label": raw_label_input}, label_id_input)

        return feature_mapper, label_mapper
Example #11
 def test_tensor_vocab(self):
   vocab_data = ["[UNK]", "wind", "and", "fire"]
   vocab_tensor = constant_op.constant(vocab_data)
   layer = string_lookup.StringLookup(vocabulary=vocab_tensor)
   returned_vocab = layer.get_vocabulary()
   self.assertAllEqual(vocab_data, returned_vocab)
   self.assertAllEqual(layer.vocabulary_size(), 4)
   fn = def_function.function(lambda: layer.set_vocabulary(vocab_tensor))
   with self.assertRaisesRegex(RuntimeError, "Cannot set a tensor vocabulary"):
     fn()
Example #12
  def test_inverse_layer(self):
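    # invert=True maps integer indices back to their vocabulary strings.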
    vocab_data = ["earth", "wind", "and", "fire"]
    input_array = np.array([[2, 3, 4, 5], [5, 4, 2, 0]])
    expected_output = np.array([["earth", "wind", "and", "fire"],
                                ["fire", "and", "earth", ""]])

    input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
    layer = string_lookup.StringLookup(vocabulary=vocab_data, invert=True)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)
    output_data = model.predict(input_array)
    self.assertAllEqual(expected_output, output_data)
Example #13
  def test_int_output_explicit_vocab_with_special_tokens(self):
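    # The supplied vocabulary already contains the mask token "" and the OOV
    # token "[UNK]", so they are not added again and real tokens start at 2.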
    vocab_data = ["", "[UNK]", "earth", "wind", "and", "fire"]
    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = string_lookup.StringLookup(vocabulary=vocab_data, mask_token="")
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)
    output_data = model.predict(input_array)
    self.assertAllEqual(expected_output, output_data)
Example #14
  def test_int_output_explicit_vocab(self):
    vocab_data = ["earth", "wind", "and", "fire"]
    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]

    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = string_lookup.StringLookup(vocabulary=vocab_data)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)
    output_data = model.predict(input_array)
    self.assertAllEqual(expected_output, output_data)
Example #15
  def test_count_output(self):
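    # output_mode="count" yields one count vector per sample: index 0 holds the
    # OOV count, followed by one slot per vocabulary term.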
    vocab_data = ["earth", "wind", "and", "fire"]
    input_array = np.array([["earth", "earth", "fire", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    expected_output = [[0, 2, 0, 0, 2], [1, 1, 0, 1, 1]]

    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = string_lookup.StringLookup(
        vocabulary=vocab_data, output_mode="count")
    res = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=res)
    output_data = model.predict(input_array)
    self.assertAllEqual(expected_output, output_data)
Example #16
  def test_inverse_layer_from_file(self):
    vocab_data = ["earth", "wind", "and", "fire"]
    input_array = np.array([[1, 2, 3, 4], [4, 3, 1, 0]])
    expected_output = np.array([["earth", "wind", "and", "fire"],
                                ["fire", "and", "earth", "[UNK]"]])
    vocab_path = self._write_to_temp_file("vocab_file", vocab_data)

    input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
    layer = string_lookup.StringLookup(vocabulary=vocab_path, invert=True)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)
    output_data = model.predict(input_array)
    self.assertAllEqual(expected_output, output_data)
Example #17
  def test_int_output_explicit_vocab_from_file(self):
    vocab_list = ["earth", "wind", "and", "fire"]
    vocab_path = self._write_to_temp_file("vocab_file", vocab_list)

    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = string_lookup.StringLookup(vocabulary=vocab_path)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)
    output_data = model.predict(input_array)
    self.assertAllEqual(expected_output, output_data)
Example #18
  def test_ragged_string_input_multi_bucket(self):
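    # With num_oov_indices=2, indices 0 and 1 are reserved as OOV buckets, so
    # in-vocab tokens start at index 2 and "ohio" hashes into one of the buckets.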
    vocab_data = ["earth", "wind", "and", "fire"]
    input_array = ragged_factory_ops.constant([["earth", "wind", "fire"],
                                               ["fire", "and", "earth",
                                                "ohio"]])
    expected_output = [[2, 3, 5], [5, 4, 2, 1]]

    input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True)
    layer = string_lookup.StringLookup(num_oov_indices=2)
    layer.set_vocabulary(vocab_data)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)
    output_data = model.predict(input_array)
    self.assertAllEqual(expected_output, output_data)
Example #19
  def test_int_output_no_oov(self):
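    # With num_oov_indices=0 there is no OOV bucket: the mask token "" maps to
    # 0, in-vocab tokens follow, and unseen tokens raise an error at lookup time.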
    vocab_data = ["earth", "wind", "and", "fire"]
    valid_input = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", ""]])
    invalid_input = np.array([["earth", "wind", "and", "michigan"],
                              ["fire", "and", "earth", "michigan"]])
    expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]

    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = string_lookup.StringLookup(
        vocabulary=vocab_data, mask_token="", num_oov_indices=0)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)
    output_data = model.predict(valid_input)
    self.assertAllEqual(expected_output, output_data)
    with self.assertRaisesRegex(errors.InvalidArgumentError,
                                "found OOV values.*michigan"):
      _ = model.predict(invalid_input)
Example #20
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding."""
    # Data and constants.
    vocab_size = 32768
    vocab = fc_bm.create_vocabulary(vocab_size)
    data = fc_bm.create_string_data(max_length,
                                    batch_size * NUM_REPEATS,
                                    vocab,
                                    pct_oov=0.15)

    # Keras implementation
    model = keras.Sequential()
    model.add(keras.Input(shape=(max_length, ), name="data", dtype=dt.string))
    model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None))
    model.add(
        category_encoding.CategoryEncoding(num_tokens=vocab_size + 1,
                                           output_mode="count"))

    # FC implementation
    fc = fcv2.indicator_column(
        fcv2.categorical_column_with_vocabulary_list(key="data",
                                                     vocabulary_list=vocab,
                                                     num_oov_buckets=1))

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    keras_data = {
        "data": data.to_tensor(default_value="",
                               shape=(batch_size, max_length))
    }
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {
        "data": data.to_tensor(default_value="",
                               shape=(batch_size, max_length))
    }
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
Example #21
    def testTrainAndServe(self):
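        # End-to-end test: build the lookup KPLs under the strategy scope, train
        # with per-worker datasets scheduled through self.client, then export and
        # reload a serving signature that decodes predictions to string labels.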
        # These vocabularies usually come from TFT or a Beam pipeline.
        feature_vocab = [
            "avenger", "ironman", "batman", "hulk", "spiderman", "kingkong",
            "wonder_woman"
        ]
        label_vocab = ["yes", "no"]

        with self.client.strategy.scope():

            # Define KPLs under the strategy's scope. Right now, if they have
            # lookup tables, they will be created on the client. Their variables
            # will be created on the PS. Ideally they should be cached on each
            # worker since they will not be changed during a training step.
            feature_lookup_layer = string_lookup.StringLookup()
            raw_feature_input = keras.layers.Input(shape=(3, ),
                                                   dtype=dtypes.string,
                                                   name="feature",
                                                   ragged=True)
            feature_id_input = feature_lookup_layer(raw_feature_input)

            # Model creates variables as well.
            feature_ps = keras.Model({"features": raw_feature_input},
                                     feature_id_input)

            # TODO(yuefengz): adapt may be expensive for large vocab?
            feature_lookup_layer.adapt(feature_vocab)

            label_lookup_layer = string_lookup.StringLookup(num_oov_indices=0,
                                                            mask_token=None)
            raw_label_input = keras.layers.Input(shape=(),
                                                 dtype=dtypes.string,
                                                 name="label")
            label_id_input = label_lookup_layer(raw_label_input)
            label_ps = keras.Model({"label": raw_label_input}, label_id_input)

            label_lookup_layer.adapt(label_vocab)

            # Only needed for serving.
            label_inverse_lookup_layer = string_lookup.StringLookup(
                num_oov_indices=1,
                mask_token=None,
                vocabulary=label_lookup_layer.get_vocabulary(),
                invert=True)

            def dataset_fn():
                def feature_and_label_gen():
                    while True:
                        features = random.sample(feature_vocab, 3)
                        label = "yes" if "avenger" in features else "no"
                        yield {"features": features, "label": label}

                # The dataset will be created on the client?
                raw_dataset = dataset_ops.Dataset.from_generator(
                    feature_and_label_gen,
                    output_types={
                        "features": dtypes.string,
                        "label": dtypes.string
                    }).shuffle(200).batch(32)
                preproc_dataset = raw_dataset.map(
                    lambda x: {  # pylint: disable=g-long-lambda
                        "features": feature_ps(x["features"]),
                        "label": label_ps(x["label"])
                    })
                train_dataset = preproc_dataset.map(lambda x: (  # pylint: disable=g-long-lambda
                    {
                        "features": x["features"]
                    }, [x["label"]]))
                return train_dataset

            distributed_dataset = self.client.create_per_worker_dataset(
                dataset_fn)

            model_input = keras.layers.Input(shape=(3, ),
                                             dtype=dtypes.int64,
                                             name="model_input")
            emb_output = keras.layers.Embedding(input_dim=len(
                feature_lookup_layer.get_vocabulary()),
                                                output_dim=20)(model_input)
            emb_output = math_ops.reduce_mean(emb_output, axis=1)
            dense_output = keras.layers.Dense(units=1,
                                              activation="sigmoid")(emb_output)
            model = keras.Model({"features": model_input}, dense_output)
            optimizer = rmsprop.RMSprop(learning_rate=0.01)
            accuracy = keras.metrics.Accuracy()

            @def_function.function
            def worker_fn(iterator):
                def train_step(iterator):
                    batch_data, labels = next(iterator)
                    with backprop.GradientTape() as tape:
                        pred = model(batch_data, training=True)
                        loss = nn.compute_average_loss(
                            keras.losses.BinaryCrossentropy(
                                reduction=loss_reduction.ReductionV2.NONE)(
                                    labels, pred))
                        gradients = tape.gradient(loss,
                                                  model.trainable_variables)

                    optimizer.apply_gradients(
                        zip(gradients, model.trainable_variables))

                    actual_pred = math_ops.cast(math_ops.greater(pred, 0.5),
                                                dtypes.int64)
                    accuracy.update_state(labels, actual_pred)

                self.client._strategy.run(train_step, args=(iterator, ))

        distributed_iterator = iter(distributed_dataset)
        for _ in range(10):
            self.client.schedule(worker_fn, args=(distributed_iterator, ))
        self.client.join()
        self.assertGreater(accuracy.result().numpy(), 0.0)

        # Create a saved model.
        model.feature_ps = feature_ps
        model.label_ps = label_ps
        model.label_inverse_lookup_layer = label_inverse_lookup_layer

        def create_serving_signature(model):
            @def_function.function
            def serve_fn(raw_features):
                raw_features = array_ops.expand_dims(raw_features, axis=0)
                transformed_features = model.feature_ps(raw_features)
                outputs = model(transformed_features)
                outputs = array_ops.squeeze(outputs, axis=0)
                outputs = math_ops.cast(math_ops.greater(outputs, 0.5),
                                        dtypes.int64)
                decoded_outputs = model.label_inverse_lookup_layer(outputs)
                return array_ops.squeeze(decoded_outputs, axis=0)

            # The serving input does NOT have a batch dimension.
            return serve_fn.get_concrete_function(
                tensor_spec.TensorSpec(shape=(3,),
                                       dtype=dtypes.string,
                                       name="example"))

        serving_fn = create_serving_signature(model)

        saved_model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
        model.save(saved_model_dir, signatures={"serving_default": serving_fn})

        # Test the saved_model.
        loaded_serving_fn = keras.saving.save.load_model(
            saved_model_dir).signatures["serving_default"]

        # Check the result with and without "avenger".
        prediction0 = loaded_serving_fn(
            constant_op.constant(["avenger", "ironman",
                                  "avenger"]))["output_0"]
        self.assertIn(prediction0, ("yes", "no"))

        prediction1 = loaded_serving_fn(
            constant_op.constant(["ironman", "ironman",
                                  "unkonwn"]))["output_0"]
        self.assertIn(prediction1, ("yes", "no"))
Example #22
 def test_non_unique_vocab_from_file_fails(self):
   vocab_list = ["earth", "wind", "and", "fire", "earth"]
   vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
   with self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"):
     _ = string_lookup.StringLookup(vocabulary=vocab_path)
Example #23
 def test_no_vocab(self):
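   # Calling the layer before any vocabulary has been set (via the constructor,
   # adapt(), or set_vocabulary()) should raise a ValueError.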
   with self.assertRaisesRegex(
       ValueError, "You must set the layer's vocabulary"):
     layer = string_lookup.StringLookup()
     layer([["a"]])
Example #24
    def __init__(self,
                 max_tokens=None,
                 standardize=LOWER_AND_STRIP_PUNCTUATION,
                 split=SPLIT_ON_WHITESPACE,
                 ngrams=None,
                 output_mode=INT,
                 output_sequence_length=None,
                 pad_to_max_tokens=False,
                 vocabulary=None,
                 **kwargs):
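        # The arguments are validated below; vocabulary handling itself is
        # delegated to an internal StringLookup layer at the end of __init__.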

        # This layer only applies to string processing, and so should only have
        # a dtype of 'string'.
        if "dtype" in kwargs and kwargs["dtype"] != dtypes.string:
            raise ValueError(
                "TextVectorization may only have a dtype of string.")
        elif "dtype" not in kwargs:
            kwargs["dtype"] = dtypes.string

        # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION, callable)
        layer_utils.validate_string_arg(
            standardize,
            allowable_strings=(LOWER_AND_STRIP_PUNCTUATION),
            layer_name="TextVectorization",
            arg_name="standardize",
            allow_none=True,
            allow_callables=True)

        # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
        layer_utils.validate_string_arg(
            split,
            allowable_strings=(SPLIT_ON_WHITESPACE),
            layer_name="TextVectorization",
            arg_name="split",
            allow_none=True,
            allow_callables=True)

        # Support deprecated names for output_modes.
        if output_mode == "binary":
            output_mode = MULTI_HOT
        if output_mode == "tf-idf":
            output_mode = TF_IDF
        # 'output_mode' must be one of (None, INT, COUNT, MULTI_HOT, TF_IDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, COUNT,
                                                           MULTI_HOT, TF_IDF),
                                        layer_name="TextVectorization",
                                        arg_name="output_mode",
                                        allow_none=True)

        # 'ngrams' must be one of (None, int, tuple(int))
        if not (ngrams is None or isinstance(ngrams, int)
                or isinstance(ngrams, tuple)
                and all(isinstance(item, int) for item in ngrams)):
            raise ValueError(
                ("`ngrams` must be None, an integer, or a tuple of "
                 "integers. Got %s") % (ngrams, ))

        # 'output_sequence_length' must be one of (None, int) and is only
        # set if output_mode is INT.
        if (output_mode == INT
                and not (isinstance(output_sequence_length, int) or
                         (output_sequence_length is None))):
            raise ValueError(
                "`output_sequence_length` must be either None or an "
                "integer when `output_mode` is 'int'. "
                "Got %s" % output_sequence_length)

        if output_mode != INT and output_sequence_length is not None:
            raise ValueError("`output_sequence_length` must not be set if "
                             "`output_mode` is not 'int'.")

        self._max_tokens = max_tokens
        self._standardize = standardize
        self._split = split
        self._ngrams_arg = ngrams
        if isinstance(ngrams, int):
            self._ngrams = tuple(range(1, ngrams + 1))
        else:
            self._ngrams = ngrams

        self._output_mode = output_mode
        self._output_sequence_length = output_sequence_length
        vocabulary_size = 0
        # IndexLookup needs to keep track of the current vocab size outside of
        # its layer weights. We persist it as a hidden part of the config
        # during serialization.
        if "vocabulary_size" in kwargs:
            vocabulary_size = kwargs["vocabulary_size"]
            del kwargs["vocabulary_size"]

        super(TextVectorization, self).__init__(combiner=None, **kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell(
            "TextVectorization").set(True)

        self._index_lookup_layer = string_lookup.StringLookup(
            max_tokens=max_tokens,
            vocabulary=vocabulary,
            pad_to_max_tokens=pad_to_max_tokens,
            output_mode=output_mode if output_mode is not None else INT,
            vocabulary_size=vocabulary_size)
Example #25
 def test_non_unique_vocab_fails(self):
   vocab_data = ["earth", "wind", "and", "fire", "fire"]
   with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):
     _ = string_lookup.StringLookup(vocabulary=vocab_data)