def _preprocess(self, inputs): if self._standardize is LOWER_AND_STRIP_PUNCTUATION: lowercase_inputs = gen_string_ops.string_lower(inputs) inputs = string_ops.regex_replace(lowercase_inputs, self._strip_regex, "") elif self._standardize is not None: # TODO(momernick): Support callables here. raise RuntimeError("Not a supported standardization.") if self._split is SPLIT_ON_WHITESPACE: # If split isn't None, we validate that the 1st axis is of dimension 1 and # so can be squeezed out. We do this here instead of after splitting for # performance reasons - it's more expensive to squeeze a ragged tensor. inputs = array_ops.squeeze(inputs, axis=1) # This treats multiple whitespaces as one whitespace, and strips leading # and trailing whitespace. inputs = ragged_string_ops.string_split_v2(inputs) elif self._split is not None: # TODO(momernick): Support callables here. raise RuntimeError("Not a supported splitting.") # Note that 'inputs' here can be either ragged or dense depending on the # configuration choices for this Layer. The strings.ngrams op, however, does # support both ragged and dense inputs. if self._ngrams is not None: inputs = ragged_string_ops.ngrams(inputs, ngram_width=self._ngrams, separator=" ") return inputs
def _preprocess(self, inputs): if self._standardize == LOWER_AND_STRIP_PUNCTUATION: if ragged_tensor.is_ragged(inputs): lowercase_inputs = ragged_functional_ops.map_flat_values( gen_string_ops.string_lower, inputs) # Depending on configuration, we may never touch the non-data tensor # in the ragged inputs tensor. If that is the case, and this is the # only layer in the keras model, running it will throw an error. # To get around this, we wrap the result in an identity. lowercase_inputs = array_ops.identity(lowercase_inputs) else: lowercase_inputs = gen_string_ops.string_lower(inputs) inputs = string_ops.regex_replace(lowercase_inputs, DEFAULT_STRIP_REGEX, "") elif callable(self._standardize): inputs = self._standardize(inputs) elif self._standardize is not None: raise ValueError( ("%s is not a supported standardization. " "TextVectorization supports the following options " "for `standardize`: None, " "'lower_and_strip_punctuation', or a " "Callable.") % self._standardize) if self._split is not None: # If we are splitting, we validate that the 1st axis is of dimension 1 and # so can be squeezed out. We do this here instead of after splitting for # performance reasons - it's more expensive to squeeze a ragged tensor. if inputs.shape.ndims > 1: inputs = array_ops.squeeze(inputs, axis=-1) if self._split == SPLIT_ON_WHITESPACE: # This treats multiple whitespaces as one whitespace, and strips leading # and trailing whitespace. inputs = ragged_string_ops.string_split_v2(inputs) elif callable(self._split): inputs = self._split(inputs) else: raise ValueError( ("%s is not a supported splitting." "TextVectorization supports the following options " "for `split`: None, 'whitespace', or a Callable.") % self._split) # Note that 'inputs' here can be either ragged or dense depending on the # configuration choices for this Layer. The strings.ngrams op, however, does # support both ragged and dense inputs. if self._ngrams is not None: inputs = ragged_string_ops.ngrams(inputs, ngram_width=self._ngrams, separator=" ") return inputs
def _preprocess(self, inputs): if self._standardize is LOWER_AND_STRIP_PUNCTUATION: lowercase_inputs = gen_string_ops.string_lower(inputs) inputs = string_ops.regex_replace(lowercase_inputs, DEFAULT_STRIP_REGEX, "") elif callable(self._standardize): inputs = self._standardize(inputs) elif self._standardize is not None: raise ValueError( ("%s is not a supported standardization. " "TextVectorization supports the following options " "for `standardize`: None, " "'lower_and_strip_punctuation', or a " "Callable.") % self._standardize) if self._split is not None: # If we are splitting, we validate that the 1st axis is of dimension 1 and # so can be squeezed out. We do this here instead of after splitting for # performance reasons - it's more expensive to squeeze a ragged tensor. inputs = array_ops.squeeze(inputs, axis=1) if self._split is SPLIT_ON_WHITESPACE: # This treats multiple whitespaces as one whitespace, and strips leading # and trailing whitespace. inputs = ragged_string_ops.string_split_v2(inputs) elif callable(self._split): inputs = self._split(inputs) else: raise ValueError( ("%s is not a supported splitting." "TextVectorization supports the following options " "for `split`: None, 'whitespace', or a Callable.") % self._split) # Note that 'inputs' here can be either ragged or dense depending on the # configuration choices for this Layer. The strings.ngrams op, however, does # support both ragged and dense inputs. if self._ngrams is not None: inputs = ragged_string_ops.ngrams(inputs, ngram_width=self._ngrams, separator=" ") return inputs
def custom_standardize_fn(x): return gen_string_ops.string_lower(x)