def __init__(self, num_bins, output_mode="int", sparse=False, **kwargs):
    # By default, output int64 when output_mode="int" and floats otherwise.
    if "dtype" not in kwargs or kwargs["dtype"] is None:
        kwargs["dtype"] = tf.int64 if output_mode == INT else backend.floatx()

    super().__init__(**kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell("HashedCrossing").set(True)

    # Check dtype only after base layer parses it; dtype parsing is complex.
    if output_mode == INT and not tf.as_dtype(self.compute_dtype).is_integer:
        input_dtype = kwargs["dtype"]
        raise ValueError(
            "When `output_mode='int'`, `dtype` should be an integer "
            f"type. Received: dtype={input_dtype}")

    # "output_mode" must be one of (INT, ONE_HOT)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, ONE_HOT),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    self.num_bins = num_bins
    self.output_mode = output_mode
    self.sparse = sparse
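# A minimal usage sketch for the constructor above (not part of the original
# source). It assumes the layer is exported as tf.keras.layers.HashedCrossing,
# as in recent TF releases. The crossed output is an int64 bin index by
# default, or a one-hot float vector when output_mode="one_hot".
import tensorflow as tf

feat_a = tf.constant(["a", "b", "c"])
feat_b = tf.constant([1, 2, 3])
crosser = tf.keras.layers.HashedCrossing(num_bins=10)
print(crosser((feat_a, feat_b)))  # shape (3,), dtype int64, values in [0, 10)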
def __init__(self, num_tokens=None, output_mode=BINARY, sparse=False, **kwargs):
    # max_tokens is an old name for the num_tokens arg we continue to support
    # because of usage.
    if "max_tokens" in kwargs:
        logging.warning("max_tokens is deprecated, please use num_tokens instead.")
        num_tokens = kwargs["max_tokens"]
        del kwargs["max_tokens"]

    super(CategoryEncoding, self).__init__(**kwargs)

    # 'output_mode' must be one of (COUNT, BINARY)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(COUNT, BINARY),
        layer_name="CategoryEncoding",
        arg_name="output_mode")

    if num_tokens is None:
        raise ValueError(
            "num_tokens must be set to use this layer. If the "
            "number of tokens is not known beforehand, use the "
            "IntegerLookup layer instead.")
    if num_tokens < 1:
        raise ValueError("num_tokens must be >= 1.")

    self.num_tokens = num_tokens
    self.output_mode = output_mode
    self.sparse = sparse
def __init__(self, num_bins, mask_value=None, salt=None, output_mode='int',
             sparse=False, **kwargs):
    if num_bins is None or num_bins <= 0:
        raise ValueError(
            f'The `num_bins` for `Hashing` cannot be `None` or non-positive '
            f'values. Received: num_bins={num_bins}.')

    # By default, output int64 when output_mode='int' and floats otherwise.
    if 'dtype' not in kwargs or kwargs['dtype'] is None:
        kwargs['dtype'] = tf.int64 if output_mode == INT else backend.floatx()
    elif output_mode == 'int' and not tf.as_dtype(kwargs['dtype']).is_integer:
        # Compat for when dtype was always floating and ignored by the layer.
        kwargs['dtype'] = tf.int64

    super().__init__(**kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell('Hashing').set(True)

    # Check dtype only after base layer parses it; dtype parsing is complex.
    if output_mode == INT and not tf.as_dtype(self.compute_dtype).is_integer:
        input_dtype = kwargs['dtype']
        raise ValueError('When `output_mode="int"`, `dtype` should be an integer '
                         f'type. Received: dtype={input_dtype}')

    # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT),
        layer_name=self.__class__.__name__,
        arg_name='output_mode')

    if sparse and output_mode == INT:
        raise ValueError(f'`sparse` may only be true if `output_mode` is '
                         f'`"one_hot"`, `"multi_hot"`, or `"count"`. '
                         f'Received: sparse={sparse} and '
                         f'output_mode={output_mode}')

    self.num_bins = num_bins
    self.mask_value = mask_value
    self.strong_hash = True if salt is not None else False
    self.output_mode = output_mode
    self.sparse = sparse

    self.salt = None
    if salt is not None:
        if isinstance(salt, (tuple, list)) and len(salt) == 2:
            self.salt = salt
        elif isinstance(salt, int):
            self.salt = [salt, salt]
        else:
            raise ValueError(
                f'The `salt` argument for `Hashing` can only be a tuple of size 2 '
                f'integers, or a single integer. Received: salt={salt}.')
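# A minimal usage sketch (assumption: the layer is exported as
# tf.keras.layers.Hashing, as in recent TF releases). Passing `salt` switches
# the layer to the salted SipHash64 path (`strong_hash=True` above); without
# a salt the layer hashes with FarmHash64, so bin assignments are stable
# across runs and machines.
import tensorflow as tf

hasher = tf.keras.layers.Hashing(num_bins=3)
print(hasher([["A"], ["B"], ["C"], ["D"], ["E"]]))  # int64 bins in [0, 3)

salted = tf.keras.layers.Hashing(num_bins=3, salt=[133, 137])
print(salted([["A"], ["B"], ["C"]]))  # different but reproducible bins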
def __init__(self, max_tokens=None, output_mode=BINARY, sparse=False, **kwargs):
    # 'output_mode' must be one of (COUNT, BINARY, TFIDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(COUNT, BINARY, TFIDF),
        layer_name="CategoryEncoding",
        arg_name="output_mode")

    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens < 1:
        raise ValueError("max_tokens must be > 1.")

    # We need to call super() before we call _add_state_variable().
    combiner = _CategoryEncodingCombiner(
        max_tokens=max_tokens, compute_idf=output_mode == TFIDF)
    super(CategoryEncoding, self).__init__(combiner=combiner, **kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell("CategoryEncoding").set(True)

    self.max_tokens = max_tokens
    self.output_mode = output_mode
    self.sparse = sparse
    self._called = False

    if self.output_mode == TFIDF:
        # The TF-IDF weight may have a (None,) tensorshape. This creates
        # a 1D variable with arbitrary shape, which we can assign any weight to
        # so long as it has 1 dimension. In order to properly initialize this
        # weight in Keras, we need to provide a custom callable initializer which
        # does not depend on the shape of the weight (as all other initializers
        # do) since the weight is not known. Hence the lambda shape, dtype: [0].
        if max_tokens is None:
            initializer = lambda shape, dtype: [0]
        else:
            initializer = tf.compat.v1.zeros_initializer

        # We are adding these here instead of in build() since they do not depend
        # on the input shape at all.
        self.tf_idf_weights = self._add_state_variable(
            name=_IDF_NAME,
            shape=tf.TensorShape((max_tokens,)),
            dtype=K.floatx(),
            initializer=initializer)

    self.input_spec = InputSpec(ndim=2)
def __init__(self, num_tokens=None, output_mode="multi_hot", sparse=False, **kwargs):
    # max_tokens is an old name for the num_tokens arg we continue to
    # support because of usage.
    if "max_tokens" in kwargs:
        logging.warning("max_tokens is deprecated, please use num_tokens instead.")
        num_tokens = kwargs["max_tokens"]
        del kwargs["max_tokens"]

    # By default, output floats. This is already default for TF2, but in TF1
    # dtype is inferred from inputs, and would default to int.
    if "dtype" not in kwargs:
        kwargs["dtype"] = backend.floatx()

    super().__init__(**kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell("CategoryEncoding").set(True)

    # Support deprecated names for output_modes.
    if output_mode == "binary":
        output_mode = MULTI_HOT

    # 'output_mode' must be one of (COUNT, ONE_HOT, MULTI_HOT)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(COUNT, ONE_HOT, MULTI_HOT),
        layer_name="CategoryEncoding",
        arg_name="output_mode")

    if num_tokens is None:
        raise ValueError(
            "num_tokens must be set to use this layer. If the "
            "number of tokens is not known beforehand, use the "
            "IntegerLookup layer instead.")
    if num_tokens < 1:
        raise ValueError(
            f"`num_tokens` must be >= 1. Received: num_tokens={num_tokens}.")

    self.num_tokens = num_tokens
    self.output_mode = output_mode
    self.sparse = sparse
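# A minimal usage sketch (assumption: exported as
# tf.keras.layers.CategoryEncoding in recent TF releases). `num_tokens` must
# be known up front; inputs are integer indices in [0, num_tokens).
import tensorflow as tf

encoder = tf.keras.layers.CategoryEncoding(num_tokens=4, output_mode="multi_hot")
print(encoder([[0, 1], [0, 0], [1, 2], [3, 1]]))
# -> 4-wide multi-hot float rows, e.g. [1., 1., 0., 0.] for [0, 1]

counter = tf.keras.layers.CategoryEncoding(num_tokens=4, output_mode="count")
print(counter([[0, 0], [1, 2]]))  # per-index counts, e.g. [2., 0., 0., 0.]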
def __init__(self, num_tokens=None, output_mode=MULTI_HOT, sparse=False, **kwargs):
    # max_tokens is an old name for the num_tokens arg we continue to support
    # because of usage.
    if "max_tokens" in kwargs:
        logging.warning("max_tokens is deprecated, please use num_tokens instead.")
        num_tokens = kwargs["max_tokens"]
        del kwargs["max_tokens"]

    super(CategoryEncoding, self).__init__(**kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell("CategoryEncoding").set(True)

    # Support deprecated names for output_modes.
    if output_mode == "binary":
        output_mode = MULTI_HOT

    # 'output_mode' must be one of (COUNT, ONE_HOT, MULTI_HOT)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(COUNT, ONE_HOT, MULTI_HOT),
        layer_name="CategoryEncoding",
        arg_name="output_mode")

    if num_tokens is None:
        raise ValueError(
            "num_tokens must be set to use this layer. If the "
            "number of tokens is not known beforehand, use the "
            "IntegerLookup layer instead.")
    if num_tokens < 1:
        raise ValueError("num_tokens must be >= 1.")

    self.num_tokens = num_tokens
    self.output_mode = output_mode
    self.sparse = sparse
def __init__(self,
             bin_boundaries=None,
             num_bins=None,
             epsilon=0.01,
             output_mode="int",
             sparse=False,
             **kwargs):
    # bins is a deprecated arg for setting bin_boundaries or num_bins that still
    # has some usage.
    if "bins" in kwargs:
        logging.warning(
            "bins is deprecated, please use bin_boundaries or num_bins instead.")
        if isinstance(kwargs["bins"], int) and num_bins is None:
            num_bins = kwargs["bins"]
        elif bin_boundaries is None:
            bin_boundaries = kwargs["bins"]
        del kwargs["bins"]

    # By default, output int64 when output_mode='int' and floats otherwise.
    if "dtype" not in kwargs or kwargs["dtype"] is None:
        kwargs["dtype"] = tf.int64 if output_mode == INT else backend.floatx()
    elif output_mode == "int" and not tf.as_dtype(kwargs["dtype"]).is_integer:
        # Compat for when dtype was always floating and ignored by the layer.
        kwargs["dtype"] = tf.int64

    super().__init__(**kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell("Discretization").set(True)

    # Check dtype only after base layer parses it; dtype parsing is complex.
    if output_mode == INT and not tf.as_dtype(self.compute_dtype).is_integer:
        input_dtype = kwargs["dtype"]
        raise ValueError(
            "When `output_mode='int'`, `dtype` should be an integer "
            f"type. Received: dtype={input_dtype}")

    # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    if sparse and output_mode == INT:
        raise ValueError(
            f"`sparse` may only be true if `output_mode` is "
            f"`'one_hot'`, `'multi_hot'`, or `'count'`. "
            f"Received: sparse={sparse} and "
            f"output_mode={output_mode}")

    if num_bins is not None and num_bins < 0:
        raise ValueError(
            "`num_bins` must be greater than or equal to 0. "
            "You passed `num_bins={}`".format(num_bins))
    if num_bins is not None and bin_boundaries is not None:
        raise ValueError(
            "Both `num_bins` and `bin_boundaries` should not be "
            "set. You passed `num_bins={}` and "
            "`bin_boundaries={}`".format(num_bins, bin_boundaries))

    bin_boundaries = utils.listify_tensors(bin_boundaries)
    self.input_bin_boundaries = bin_boundaries
    self.bin_boundaries = bin_boundaries if bin_boundaries is not None else []
    self.num_bins = num_bins
    self.epsilon = epsilon
    self.output_mode = output_mode
    self.sparse = sparse
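# A minimal usage sketch (assumption: exported as
# tf.keras.layers.Discretization). With explicit `bin_boundaries` no adapt()
# step is needed; with `num_bins` the boundaries are estimated from data via
# adapt(), using quantile summaries whose precision is controlled by `epsilon`.
import tensorflow as tf

inputs = tf.constant([[-1.5, 1.0, 3.4, 0.5], [0.0, 3.0, 1.3, 0.0]])

fixed = tf.keras.layers.Discretization(bin_boundaries=[0.0, 1.0, 2.0])
print(fixed(inputs))  # int64 bin indices in [0, 3]

adapted = tf.keras.layers.Discretization(num_bins=4, epsilon=0.01)
adapted.adapt(inputs)
print(adapted(inputs))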
def __init__(self,
             max_tokens,
             num_oov_indices,
             mask_token,
             oov_token,
             vocabulary=None,
             invert=False,
             output_mode=INT,
             sparse=False,
             pad_to_max_tokens=False,
             **kwargs):
    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
        raise ValueError("If set, `max_tokens` must be greater than 1. "
                         "You passed {}".format(max_tokens))

    if num_oov_indices < 0:
        raise ValueError(
            "`num_oov_indices` must be greater than or equal to 0. "
            "You passed {}".format(num_oov_indices))

    # 'output_mode' must be one of (INT, BINARY, COUNT, TFIDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, BINARY, COUNT, TFIDF),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    self.invert = invert
    self.max_tokens = max_tokens
    self.num_oov_indices = num_oov_indices
    self.oov_token = oov_token
    self.mask_token = mask_token
    self.output_mode = output_mode
    self.sparse = sparse
    self.pad_to_max_tokens = pad_to_max_tokens
    self._called = False
    self._num_special_tokens = self.num_oov_indices
    if self.mask_token is not None:
        self._num_special_tokens += 1
    self._vocab_size = 0
    # We need to keep track of our current vocab size outside of our layer
    # weights to support a static output shape when `output_mode != INT`. The
    # bincount ops do not set shape on their outputs, which means we have to
    # set it ourselves. We persist the current vocab size as a hidden part of
    # the config when serializing our model.
    if "vocab_size" in kwargs:
        self._vocab_size = kwargs["vocab_size"]
        del kwargs["vocab_size"]

    # If there is only one OOV bucket, we can determine the OOV value (either 0
    # or 1 depending on whether 0 is reserved) and set that as the default
    # value of the index_lookup table. If we have multiple OOV values, we need
    # to do a further hashing step; to make this easier, we set the OOV value
    # to -1. (This lets us do a vectorized add and cast to boolean to determine
    # locations where we need to do extra hashing.)
    if self.num_oov_indices == 1:
        self._oov_value = 0 if mask_token is None else 1
    else:
        self._oov_value = -1

    if max_tokens is not None:
        available_vocab_size = max_tokens - self._num_special_tokens
    else:
        available_vocab_size = None

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(
            vocab_size=available_vocab_size,
            mask_value=mask_token,
            oov_value=oov_token,
            compute_idf=(output_mode == TFIDF)),
        **kwargs)

    # We need to save the key dtype so that we know if we're expecting int64
    # keys. If we are, we will cast int32 inputs to int64 as well.
    if invert:
        self._key_dtype = tf.int64
        self._value_dtype = self.dtype
        oov_value = self.oov_token
        oov_indices = None
    else:
        self._key_dtype = self.dtype
        self._value_dtype = tf.int64
        oov_value = self._oov_value
        if self.num_oov_indices <= 1:
            oov_indices = None
        else:
            oov_start = 1 if mask_token is not None else 0
            oov_end = oov_start + num_oov_indices
            oov_indices = list(range(oov_start, oov_end))

    if vocabulary is not None and isinstance(vocabulary,
                                             tf.lookup.TextFileInitializer):
        self._table = self._static_table_class()(
            vocabulary, default_value=oov_value)
        self._table_handler = table_utils.TableHandler(
            table=self._table,
            mask_token=mask_token,
            oov_tokens=oov_indices,
            use_v1_apis=self._use_v1_apis())
        self.max_tokens = (
            self._table_handler.table_size() + self.num_oov_indices +
            (0 if mask_token is None else 1))
    else:
        self._table = lookup_ops.MutableHashTable(
            key_dtype=self._key_dtype,
            value_dtype=self._value_dtype,
            default_value=oov_value,
            name=(self._name + "_index_table"))
        self._table_handler = table_utils.TableHandler(
            table=self._table,
            oov_tokens=oov_indices,
            use_v1_apis=self._use_v1_apis())
        if vocabulary is not None:
            self.set_vocabulary(vocabulary)

    if self.output_mode == TFIDF:
        # The TF-IDF weight may have a (None,) tensorshape. This creates
        # a 1D variable with arbitrary shape, which we can assign any weight to
        # so long as it has 1 dimension. In order to properly initialize this
        # weight in Keras, we need to provide a custom callable initializer
        # which does not depend on the shape of the weight (as all other
        # initializers do) since the weight is not known. Hence the
        # lambda shape, dtype: [0].
        if not self.pad_to_max_tokens or max_tokens is None:
            initializer = lambda shape, dtype: [0]
        else:
            initializer = tf.compat.v1.zeros_initializer

        # We are adding these here instead of in build() since they do not
        # depend on the input shape at all.
        idf_shape = (max_tokens,) if self.pad_to_max_tokens else (None,)
        self.tf_idf_weights = self._add_state_variable(
            name="idf",
            shape=tf.TensorShape(idf_shape),
            dtype=K.floatx(),
            initializer=initializer)

    tracked_table = self._add_trackable(self._table, trainable=False)
    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tf.TensorShape((0,))
def __init__(self,
             max_tokens=None,
             standardize=LOWER_AND_STRIP_PUNCTUATION,
             split=SPLIT_ON_WHITESPACE,
             ngrams=None,
             output_mode=INT,
             output_sequence_length=None,
             pad_to_max_tokens=True,
             vocabulary=None,
             **kwargs):
    # This layer only applies to string processing, and so should only have
    # a dtype of 'string'.
    if "dtype" in kwargs and kwargs["dtype"] != tf.string:
        raise ValueError("TextVectorization may only have a dtype of string.")
    elif "dtype" not in kwargs:
        kwargs["dtype"] = tf.string

    # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION, callable)
    layer_utils.validate_string_arg(
        standardize,
        allowable_strings=(LOWER_AND_STRIP_PUNCTUATION,),
        layer_name="TextVectorization",
        arg_name="standardize",
        allow_none=True,
        allow_callables=True)

    # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
    layer_utils.validate_string_arg(
        split,
        allowable_strings=(SPLIT_ON_WHITESPACE,),
        layer_name="TextVectorization",
        arg_name="split",
        allow_none=True,
        allow_callables=True)

    # 'output_mode' must be one of (None, INT, COUNT, BINARY, TFIDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, COUNT, BINARY, TFIDF),
        layer_name="TextVectorization",
        arg_name="output_mode",
        allow_none=True)

    # 'ngrams' must be one of (None, int, tuple(int))
    if not (ngrams is None or
            isinstance(ngrams, int) or
            isinstance(ngrams, tuple) and
            all(isinstance(item, int) for item in ngrams)):
        raise ValueError(("`ngrams` must be None, an integer, or a tuple of "
                          "integers. Got %s") % (ngrams,))

    # 'output_sequence_length' must be one of (None, int) and is only
    # set if output_mode is INT.
    if (output_mode == INT and
            not (isinstance(output_sequence_length, int) or
                 (output_sequence_length is None))):
        raise ValueError("`output_sequence_length` must be either None or an "
                         "integer when `output_mode` is 'int'. "
                         "Got %s" % output_sequence_length)

    if output_mode != INT and output_sequence_length is not None:
        raise ValueError("`output_sequence_length` must not be set if "
                         "`output_mode` is not 'int'.")

    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens < 1:
        raise ValueError("max_tokens must be > 1.")

    self._max_tokens = max_tokens

    # In INT mode, the zero value is reserved for padding (per Keras standard
    # padding approaches). In non-INT modes, there is no padding so we can set
    # the OOV value to zero instead of one.
    self._oov_value = 1 if output_mode == INT else 0

    self._standardize = standardize
    self._split = split
    self._ngrams_arg = ngrams
    if isinstance(ngrams, int):
        self._ngrams = tuple(range(1, ngrams + 1))
    else:
        self._ngrams = ngrams

    self._output_mode = output_mode
    self._output_sequence_length = output_sequence_length
    self._pad_to_max = pad_to_max_tokens
    self._vocab_size = 0

    super(TextVectorization, self).__init__(combiner=None, **kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell("TextVectorization").set(True)

    mask_token = "" if output_mode in [None, INT] else None
    self._index_lookup_layer = self._get_index_lookup_class()(
        max_tokens=max_tokens,
        mask_token=mask_token,
        vocabulary=vocabulary,
        pad_to_max_tokens=pad_to_max_tokens,
        output_mode=output_mode if output_mode is not None else INT)
def __init__(self,
             max_tokens,
             num_oov_indices,
             mask_token,
             oov_token,
             vocabulary_dtype,
             vocabulary=None,
             idf_weights=None,
             invert=False,
             output_mode="int",
             sparse=False,
             pad_to_max_tokens=False,
             **kwargs):
    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
        raise ValueError(f"If set, `max_tokens` must be greater than 1. "
                         f"Received: max_tokens={max_tokens}")
    if pad_to_max_tokens and max_tokens is None:
        raise ValueError(
            f"If pad_to_max_tokens is True, must set `max_tokens`. "
            f"Received: max_tokens={max_tokens}")
    if num_oov_indices < 0:
        raise ValueError(
            f"`num_oov_indices` must be greater than or equal to 0. "
            f"Received: num_oov_indices={num_oov_indices}")

    # Support deprecated names for output_modes.
    if output_mode == "binary":
        output_mode = MULTI_HOT
    if output_mode == "tf-idf":
        output_mode = TF_IDF

    # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    if invert and output_mode != INT:
        raise ValueError(
            f"`output_mode` must be `'int'` when `invert` is true. "
            f"Received: output_mode={output_mode}")

    if sparse and output_mode == INT:
        raise ValueError(
            f"`sparse` may only be true if `output_mode` is "
            f"`'one_hot'`, `'multi_hot'`, `'count'` or `'tf_idf'`. "
            f"Received: sparse={sparse} and "
            f"output_mode={output_mode}")

    if idf_weights is not None and output_mode != TF_IDF:
        raise ValueError(
            f"`idf_weights` should only be set if `output_mode` is "
            f"`'tf_idf'`. Received: idf_weights={idf_weights} and "
            f"output_mode={output_mode}")

    self.invert = invert
    self.max_tokens = max_tokens
    self.num_oov_indices = num_oov_indices
    self.mask_token = mask_token
    self.oov_token = oov_token
    self.output_mode = output_mode
    self.sparse = sparse
    self.pad_to_max_tokens = pad_to_max_tokens
    self.vocabulary_dtype = vocabulary_dtype
    self._frozen_vocab_size = None

    self.input_vocabulary = vocabulary
    self.input_idf_weights = idf_weights
    # VocabularySavedModelSaver will clear the config vocabulary to restore the
    # lookup table ops directly. We persist this hidden option to persist the
    # fact that we have a non-adaptable layer with a manually set vocabulary.
    self._has_input_vocabulary = kwargs.pop("has_input_vocabulary",
                                            (vocabulary is not None))

    # Drop deprecated config options.
    kwargs.pop("vocabulary_size", None)
    kwargs.pop("has_static_table", None)

    # By default, output int64 when output_mode='int' and floats otherwise.
    if "dtype" not in kwargs:
        kwargs["dtype"] = tf.int64 if output_mode == INT else backend.floatx()

    super().__init__(**kwargs)

    # Check dtype only after base layer parses it; dtype parsing is complex.
    if output_mode == INT and not tf.as_dtype(self.compute_dtype).is_integer:
        input_dtype = kwargs["dtype"]
        raise ValueError(
            "When `output_mode='int'`, `dtype` should be an integer "
            f"type. Received: dtype={input_dtype}")

    if invert:
        self._key_dtype = self.dtype if output_mode == INT else tf.int64
        self._value_dtype = tf.as_dtype(self.vocabulary_dtype)
        mask_key = 0
        mask_value = mask_token
        self._default_value = self.oov_token
    else:
        self._key_dtype = tf.as_dtype(self.vocabulary_dtype)
        self._value_dtype = self.dtype if output_mode == INT else tf.int64
        mask_key = mask_token
        # Masks should map to 0 for int output and be dropped otherwise. Max
        # ints will be dropped from the bincount op.
        mask_value = 0 if self.output_mode == INT else self._value_dtype.max
        if self.num_oov_indices == 0:
            # If there are no OOV indices, we map OOV tokens to -1 and error
            # out during call if we find a negative index.
            self._default_value = -1
        elif self.num_oov_indices == 1:
            # If there is only one OOV index, we can set that index as the
            # default value of the index_lookup table.
            self._default_value = self._oov_start_index()
        else:
            # If we have multiple OOV values, we need to do a further hashing
            # step; to make this easier, we set the OOV value to -1. (This lets
            # us do a vectorized add and cast to boolean to determine locations
            # where we need to do extra hashing.)
            self._default_value = -1

    if self.mask_token is not None:
        self._mask_key = tf.convert_to_tensor(mask_key, self._key_dtype)
        self._mask_value = tf.convert_to_tensor(mask_value, self._value_dtype)

    if self.output_mode == TF_IDF:
        self.idf_weights = tf.Variable(
            [0] * self._token_start_index(),
            shape=(None,),
            dtype=self.compute_dtype,
            trainable=False)
        self.idf_weights_const = self.idf_weights.value()

    if vocabulary is not None:
        self.set_vocabulary(vocabulary, idf_weights)
    else:
        # When restoring from a keras SavedModel, the loading code will expect
        # to find and restore a lookup_table attribute on the layer. This table
        # needs to be uninitialized as a StaticHashTable cannot be initialized
        # twice.
        self.lookup_table = self._uninitialized_lookup_table()

    # Only set up adapt state if we did not receive a vocab on construction.
    if not self._has_input_vocabulary:
        # Add a custom weight handler to return the layer's vocab as its
        # weight.
        self._add_trackable(VocabWeightHandler(self), False)
        # Set adapt state.
        self.token_counts = tf.lookup.experimental.MutableHashTable(
            key_dtype=vocabulary_dtype, value_dtype=tf.int64, default_value=0)
        if self.output_mode == TF_IDF:
            self.token_document_counts = tf.lookup.experimental.MutableHashTable(
                key_dtype=vocabulary_dtype,
                value_dtype=tf.int64,
                default_value=0)
            self.num_documents = tf.Variable(0, dtype=tf.int64, trainable=False)
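# IndexLookup itself is an internal base class; its public subclasses supply
# `vocabulary_dtype` (StringLookup uses tf.string, IntegerLookup uses
# tf.int64). A minimal sketch using StringLookup, assuming the standard
# tf.keras.layers export:
import tensorflow as tf

lookup = tf.keras.layers.StringLookup(vocabulary=["a", "b", "c"])
print(lookup([["a", "c", "d"]]))  # -> [[1, 3, 0]]; index 0 is the OOV bucket

inverse = tf.keras.layers.StringLookup(vocabulary=["a", "b", "c"], invert=True)
print(inverse([[1, 3, 0]]))  # -> [["a", "c", "[UNK]"]]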
def __init__(self,
             max_tokens=None,
             standardize=LOWER_AND_STRIP_PUNCTUATION,
             split=SPLIT_ON_WHITESPACE,
             ngrams=None,
             output_mode=INT,
             output_sequence_length=None,
             pad_to_max_tokens=False,
             vocabulary=None,
             **kwargs):
    # This layer only applies to string processing, and so should only have
    # a dtype of 'string'.
    if "dtype" in kwargs and kwargs["dtype"] != tf.string:
        raise ValueError("TextVectorization may only have a dtype of string.")
    elif "dtype" not in kwargs:
        kwargs["dtype"] = tf.string

    # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION, callable)
    layer_utils.validate_string_arg(
        standardize,
        allowable_strings=(LOWER_AND_STRIP_PUNCTUATION,),
        layer_name="TextVectorization",
        arg_name="standardize",
        allow_none=True,
        allow_callables=True)

    # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
    layer_utils.validate_string_arg(
        split,
        allowable_strings=(SPLIT_ON_WHITESPACE,),
        layer_name="TextVectorization",
        arg_name="split",
        allow_none=True,
        allow_callables=True)

    # 'output_mode' must be one of (None, INT, COUNT, BINARY, TFIDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, COUNT, BINARY, TFIDF),
        layer_name="TextVectorization",
        arg_name="output_mode",
        allow_none=True)

    # 'ngrams' must be one of (None, int, tuple(int))
    if not (ngrams is None or
            isinstance(ngrams, int) or
            isinstance(ngrams, tuple) and
            all(isinstance(item, int) for item in ngrams)):
        raise ValueError(("`ngrams` must be None, an integer, or a tuple of "
                          "integers. Got %s") % (ngrams,))

    # 'output_sequence_length' must be one of (None, int) and is only
    # set if output_mode is INT.
    if (output_mode == INT and
            not (isinstance(output_sequence_length, int) or
                 (output_sequence_length is None))):
        raise ValueError("`output_sequence_length` must be either None or an "
                         "integer when `output_mode` is 'int'. "
                         "Got %s" % output_sequence_length)

    if output_mode != INT and output_sequence_length is not None:
        raise ValueError("`output_sequence_length` must not be set if "
                         "`output_mode` is not 'int'.")

    self._max_tokens = max_tokens
    self._standardize = standardize
    self._split = split
    self._ngrams_arg = ngrams
    if isinstance(ngrams, int):
        self._ngrams = tuple(range(1, ngrams + 1))
    else:
        self._ngrams = ngrams

    self._output_mode = output_mode
    self._output_sequence_length = output_sequence_length

    vocabulary_size = 0
    # IndexLookup needs to keep track of the current vocab size outside of its
    # layer weights. We persist it as a hidden part of the config during
    # serialization.
    if "vocabulary_size" in kwargs:
        vocabulary_size = kwargs["vocabulary_size"]
        del kwargs["vocabulary_size"]

    super(TextVectorization, self).__init__(combiner=None, **kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell("TextVectorization").set(True)

    self._index_lookup_layer = self._get_index_lookup_class()(
        max_tokens=max_tokens,
        vocabulary=vocabulary,
        pad_to_max_tokens=pad_to_max_tokens,
        output_mode=output_mode if output_mode is not None else INT,
        vocabulary_size=vocabulary_size)
def __init__(self,
             max_tokens=None,
             standardize="lower_and_strip_punctuation",
             split="whitespace",
             ngrams=None,
             output_mode="int",
             output_sequence_length=None,
             pad_to_max_tokens=False,
             vocabulary=None,
             idf_weights=None,
             sparse=False,
             ragged=False,
             **kwargs):
    # This layer only applies to string processing, and so should only have
    # a dtype of 'string'.
    if "dtype" in kwargs and kwargs["dtype"] != tf.string:
        raise ValueError(f"`TextVectorization` may only have a dtype of string. "
                         f"Received dtype: {kwargs['dtype']}.")
    elif "dtype" not in kwargs:
        kwargs["dtype"] = tf.string

    # 'standardize' must be one of
    # (None, LOWER_AND_STRIP_PUNCTUATION, LOWER, STRIP_PUNCTUATION, callable)
    layer_utils.validate_string_arg(
        standardize,
        allowable_strings=(LOWER_AND_STRIP_PUNCTUATION, LOWER, STRIP_PUNCTUATION),
        layer_name="TextVectorization",
        arg_name="standardize",
        allow_none=True,
        allow_callables=True)

    # 'split' must be one of (None, WHITESPACE, CHARACTER, callable)
    layer_utils.validate_string_arg(
        split,
        allowable_strings=(WHITESPACE, CHARACTER),
        layer_name="TextVectorization",
        arg_name="split",
        allow_none=True,
        allow_callables=True)

    # Support deprecated names for output_modes.
    if output_mode == "binary":
        output_mode = MULTI_HOT
    if output_mode == "tf-idf":
        output_mode = TF_IDF

    # 'output_mode' must be one of (None, INT, COUNT, MULTI_HOT, TF_IDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, COUNT, MULTI_HOT, TF_IDF),
        layer_name="TextVectorization",
        arg_name="output_mode",
        allow_none=True)

    # 'ngrams' must be one of (None, int, tuple(int))
    if not (ngrams is None or
            isinstance(ngrams, int) or
            isinstance(ngrams, tuple) and
            all(isinstance(item, int) for item in ngrams)):
        raise ValueError(f"`ngrams` must be None, an integer, or a tuple of "
                         f"integers. Received: ngrams={ngrams}")

    # 'output_sequence_length' must be one of (None, int) and is only
    # set if output_mode is INT.
    if (output_mode == INT and
            not (isinstance(output_sequence_length, int) or
                 (output_sequence_length is None))):
        raise ValueError(
            f"`output_sequence_length` must be either None or an "
            f"integer when `output_mode` is 'int'. Received: "
            f"output_sequence_length={output_sequence_length}")

    if output_mode != INT and output_sequence_length is not None:
        raise ValueError(
            f"`output_sequence_length` must not be set if `output_mode` is not "
            f"'int'. Received output_sequence_length={output_sequence_length}.")

    if ragged and output_mode != INT:
        raise ValueError(f"`ragged` may only be true if `output_mode` is "
                         f"`'int'`. Received: ragged={ragged} and "
                         f"output_mode={output_mode}")

    if ragged and output_sequence_length is not None:
        raise ValueError(
            f"`output_sequence_length` must not be set if ragged "
            f"is True. Received: ragged={ragged} and "
            f"output_sequence_length={output_sequence_length}")

    self._max_tokens = max_tokens
    self._standardize = standardize
    self._split = split
    self._ngrams_arg = ngrams
    if isinstance(ngrams, int):
        self._ngrams = tuple(range(1, ngrams + 1))
    else:
        self._ngrams = ngrams
    self._ragged = ragged

    self._output_mode = output_mode
    self._output_sequence_length = output_sequence_length

    # VocabularySavedModelSaver will clear the config vocabulary to restore the
    # lookup table ops directly. We persist this hidden option to persist the
    # fact that we have a non-adaptable layer with a manually set vocabulary.
    self._has_input_vocabulary = kwargs.pop("has_input_vocabulary",
                                            (vocabulary is not None))
    # Drop deprecated config options.
    kwargs.pop("vocabulary_size", None)

    super().__init__(**kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell("TextVectorization").set(True)

    self._lookup_layer = string_lookup.StringLookup(
        max_tokens=max_tokens,
        vocabulary=vocabulary,
        idf_weights=idf_weights,
        pad_to_max_tokens=pad_to_max_tokens,
        mask_token="",
        output_mode=output_mode if output_mode is not None else INT,
        sparse=sparse,
        has_input_vocabulary=self._has_input_vocabulary)
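# A minimal usage sketch (assumption: exported as
# tf.keras.layers.TextVectorization). With output_mode="int" the layer pads
# or truncates to `output_sequence_length`; index 0 is padding and index 1 is
# the OOV token, matching the `mask_token=""` passed to StringLookup above.
import tensorflow as tf

vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=1000, output_mode="int", output_sequence_length=4)
vectorizer.adapt(["the cat sat", "the dog ran away"])
print(vectorizer(["the cat ran", "a dog"]))  # shape (2, 4) int64, 0-padded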
def __init__(self,
             max_tokens,
             num_oov_indices,
             mask_token,
             oov_token,
             vocabulary=None,
             invert=False,
             output_mode=INT,
             sparse=False,
             pad_to_max_tokens=False,
             **kwargs):
    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
        raise ValueError("If set, `max_tokens` must be greater than 1. "
                         "You passed {}".format(max_tokens))

    if num_oov_indices < 0:
        raise ValueError(
            "`num_oov_indices` must be greater than or equal to 0. "
            "You passed {}".format(num_oov_indices))

    # 'output_mode' must be one of (INT, BINARY, COUNT, TFIDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, BINARY, COUNT, TFIDF),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    if invert and output_mode != INT:
        raise ValueError("`output_mode` must be {} when `invert` is true. You "
                         "passed {}".format(INT, output_mode))

    self.invert = invert
    self.max_tokens = max_tokens
    self.num_oov_indices = num_oov_indices
    self.oov_token = oov_token
    self.output_mode = output_mode
    self.sparse = sparse
    self.pad_to_max_tokens = pad_to_max_tokens
    self._called = False

    # A note on vocab_size: we need to always keep a non-Tensor representation
    # of vocab_size around to use in graph building. Because we might be
    # in a tf.function, we can't rely on evaluating the actual tables to
    # find the value either.
    self._vocab_size = None
    # We need to keep track of our current vocab size outside of our layer
    # weights to support a static output shape when `output_mode != INT`. The
    # bincount ops do not set shape on their outputs, which means we have to
    # set it ourselves. We persist the current vocab size as a hidden part of
    # the config when serializing our model.
    if "vocabulary_size" in kwargs:
        self._vocab_size = kwargs["vocabulary_size"]
        del kwargs["vocabulary_size"]

    restore_from_static_table = kwargs.pop("has_static_table", False)

    # Make sure the mask token is truly of the dtype we want. We can ignore
    # strings here, because they have only one dtype.
    if mask_token is not None:
        dtype = kwargs["dtype"]
        if dtype == tf.int32:
            mask_token = np.int32(mask_token)
        elif dtype == tf.int64:
            mask_token = np.int64(mask_token)
    self.mask_token = mask_token

    if max_tokens is not None:
        available_vocab_size = max_tokens - self._token_start_index()
    else:
        available_vocab_size = None

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(
            vocab_size=available_vocab_size,
            mask_value=mask_token,
            oov_value=oov_token,
            compute_idf=(output_mode == TFIDF)),
        **kwargs)

    # We need to save the key dtype so that we know if we're expecting int64
    # keys. If we are, we will cast int32 inputs to int64 as well.
    if invert:
        self._key_dtype = tf.int64
        self._value_dtype = self.dtype
        self._mask_key = 0
        self._mask_value = mask_token
        key_index = tf.lookup.TextFileIndex.LINE_NUMBER
        value_index = tf.lookup.TextFileIndex.WHOLE_LINE
        default_value = self.oov_token
        oov_indices = None
    else:
        self._key_dtype = self.dtype
        self._value_dtype = tf.int64
        self._mask_key = mask_token
        key_index = tf.lookup.TextFileIndex.WHOLE_LINE
        value_index = tf.lookup.TextFileIndex.LINE_NUMBER
        # Masks should map to 0 for int output and be dropped otherwise. Max
        # ints will be dropped from the bincount op.
        self._mask_value = 0 if self.output_mode == INT else tf.int64.max
        oov_start = self._oov_start_index()
        token_start = self._token_start_index()
        if self.num_oov_indices == 0:
            # If there are no OOV indices, we map OOV tokens to -1 for int
            # output and drop them from bagged output. Max ints will be dropped
            # from the bincount op.
            default_value = -1 if self.output_mode == INT else tf.int64.max
            oov_indices = None
        elif self.num_oov_indices == 1:
            # If there is only one OOV index, we can set that index as the
            # default value of the index_lookup table.
            default_value = oov_start
            oov_indices = None
        else:
            # If we have multiple OOV values, we need to do a further hashing
            # step; to make this easier, we set the OOV value to -1. (This lets
            # us do a vectorized add and cast to boolean to determine locations
            # where we need to do extra hashing.)
            default_value = -1
            oov_indices = list(range(oov_start, token_start))

    self._static_vocabulary_path = None
    has_vocab_path = (vocabulary is not None and isinstance(vocabulary, str))
    if has_vocab_path or restore_from_static_table:
        self._has_static_table = True
        if vocabulary is None:
            # If we're restoring a layer that was saved with a static table
            # initializer, we create a fake initializer object to let the code
            # progress. The savedmodel restoration code will handle restoring
            # the actual data.
            initializer = _NullInitializer(self._key_dtype, self._value_dtype)
        else:
            if not os.path.exists(vocabulary):
                raise ValueError("Vocabulary file %s does not exist." %
                                 (vocabulary,))
            self._static_vocabulary_path = vocabulary
            num_tokens = table_utils.num_tokens_in_file(vocabulary)
            self._vocab_size = self._token_start_index() + num_tokens
            initializer = tf.lookup.TextFileInitializer(
                filename=vocabulary,
                key_dtype=self._key_dtype,
                key_index=key_index,
                value_dtype=self._value_dtype,
                value_index=value_index,
                value_index_offset=self._token_start_index())

        self._table = self._static_table_class()(
            initializer, default_value=default_value)
        self._table_handler = table_utils.TableHandler(
            table=self._table,
            mask_token=self._mask_key,
            mask_value=self._mask_value,
            oov_tokens=oov_indices,
            use_v1_apis=self._use_v1_apis())

        tracked_table = self._add_trackable(self._table, trainable=False)
    else:
        self._has_static_table = False
        self._table = lookup_ops.MutableHashTable(
            key_dtype=self._key_dtype,
            value_dtype=self._value_dtype,
            default_value=default_value,
            name=(self._name + "_index_table"))
        self._table_handler = table_utils.TableHandler(
            table=self._table,
            oov_tokens=oov_indices,
            use_v1_apis=self._use_v1_apis())
        if vocabulary is not None:
            self.set_vocabulary(vocabulary)
        tracked_table = self._add_trackable(self._table, trainable=False)

    if self.output_mode == TFIDF:
        # The TF-IDF weight may have a (None,) tensorshape. This creates
        # a 1D variable with arbitrary shape, which we can assign any weight to
        # so long as it has 1 dimension. In order to properly initialize this
        # weight in Keras, we need to provide a custom callable initializer
        # which does not depend on the shape of the weight (as all other
        # initializers do) since the weight is not known. Hence the
        # lambda shape, dtype: [0].
        if not self.pad_to_max_tokens or max_tokens is None:
            initializer = lambda shape, dtype: [0]
        else:
            initializer = tf.compat.v1.zeros_initializer

        # We are adding these here instead of in build() since they do not
        # depend on the input shape at all.
        idf_shape = (max_tokens,) if self.pad_to_max_tokens else (None,)
        self.tf_idf_weights = self._add_state_variable(
            name="idf",
            shape=tf.TensorShape(idf_shape),
            dtype=backend.floatx(),
            initializer=initializer)

    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tf.TensorShape((0,))
def __init__(self,
             max_tokens,
             num_oov_indices,
             mask_token,
             oov_token,
             vocabulary=None,
             invert=False,
             output_mode="int",
             sparse=False,
             pad_to_max_tokens=False,
             **kwargs):
    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
        raise ValueError("If set, `max_tokens` must be greater than 1. "
                         "You passed `max_tokens={}`".format(max_tokens))
    if pad_to_max_tokens and max_tokens is None:
        raise ValueError(
            "If pad_to_max_tokens is True, must set `max_tokens`. "
            "You passed `max_tokens={}`".format(max_tokens))
    if num_oov_indices < 0:
        raise ValueError(
            "`num_oov_indices` must be greater than or equal to 0. "
            "You passed {}".format(num_oov_indices))

    # Support deprecated names for output_modes.
    if output_mode == "binary":
        output_mode = MULTI_HOT
    if output_mode == "tf-idf":
        output_mode = TF_IDF

    # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    if invert and output_mode != INT:
        raise ValueError("`output_mode` must be {} when `invert` is true. You "
                         "passed {}".format(INT, output_mode))

    self.invert = invert
    self.max_tokens = max_tokens
    self.num_oov_indices = num_oov_indices
    self.mask_token = mask_token
    self.oov_token = oov_token
    self.output_mode = output_mode
    self.sparse = sparse
    self.pad_to_max_tokens = pad_to_max_tokens

    self.input_vocabulary = None
    # IndexLookupLayerSavedModelSaver will clear the config vocabulary to
    # restore the lookup table ops directly. We persist this hidden option to
    # persist the fact that we have a non-adaptable layer with a manually set
    # vocabulary.
    self._has_input_vocabulary = kwargs.pop("has_input_vocabulary", False)
    self._frozen_vocab_size = None

    # Drop deprecated config options.
    kwargs.pop("vocabulary_size", None)
    kwargs.pop("has_static_table", None)

    super().__init__(**kwargs)

    if invert:
        self._key_dtype = tf.int64
        self._value_dtype = tf.as_dtype(self.dtype)
        mask_key = 0
        mask_value = mask_token
        self._default_value = self.oov_token
    else:
        self._key_dtype = tf.as_dtype(self.dtype)
        self._value_dtype = tf.int64
        mask_key = mask_token
        # Masks should map to 0 for int output and be dropped otherwise. Max
        # ints will be dropped from the bincount op.
        mask_value = 0 if self.output_mode == INT else tf.int64.max
        if self.num_oov_indices == 0:
            # If there are no OOV indices, we map OOV tokens to -1 and error
            # out during call if we find a negative index.
            self._default_value = -1
        elif self.num_oov_indices == 1:
            # If there is only one OOV index, we can set that index as the
            # default value of the index_lookup table.
            self._default_value = self._oov_start_index()
        else:
            # If we have multiple OOV values, we need to do a further hashing
            # step; to make this easier, we set the OOV value to -1. (This lets
            # us do a vectorized add and cast to boolean to determine locations
            # where we need to do extra hashing.)
            self._default_value = -1

    if self.mask_token is not None:
        self._mask_key = tf.convert_to_tensor(mask_key, self._key_dtype)
        self._mask_value = tf.convert_to_tensor(mask_value, self._value_dtype)

    if self.output_mode == TF_IDF:
        self.idf_weights = tf.Variable(
            [0] * self._token_start_index(),
            shape=(None,),
            dtype=backend.floatx(),
            trainable=False)
        self.idf_weights_const = self.idf_weights.value()

    if vocabulary is not None:
        self.set_vocabulary(vocabulary)
    else:
        # When restoring from a keras SavedModel, the loading code will expect
        # to find and restore a lookup_table attribute on the layer. This table
        # needs to be uninitialized as a StaticHashTable cannot be initialized
        # twice.
        self.lookup_table = self._uninitialized_lookup_table()

    if not self._has_input_vocabulary:
        # Add a custom weight handler to return the layer's vocab as its
        # weight.
        self._add_trackable(VocabWeightHandler(self), False)
        # Set adapt state.
        self.token_counts = tf.lookup.experimental.MutableHashTable(
            key_dtype=self.dtype, value_dtype=tf.int64, default_value=0)
        if self.output_mode == TF_IDF:
            self.token_document_counts = tf.lookup.experimental.MutableHashTable(
                key_dtype=self.dtype, value_dtype=tf.int64, default_value=0)
            self.num_documents = tf.Variable(0, dtype=tf.int64, trainable=False)
def __init__(self,
             max_tokens,
             num_oov_indices,
             mask_token,
             oov_token,
             vocabulary=None,
             invert=False,
             output_mode=INT,
             sparse=False,
             **kwargs):
    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
        raise ValueError("If set, max_tokens must be greater than 1. "
                         "You passed %s" % (max_tokens,))

    if num_oov_indices < 0:
        raise ValueError("`num_oov_indices` must be greater than or equal to 0. "
                         "You passed %s" % (num_oov_indices,))

    if invert and num_oov_indices != 1:
        raise ValueError("`num_oov_indices` must be 1 when `invert` is True.")

    # 'output_mode' must be one of (INT, BINARY, COUNT)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, BINARY, COUNT),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    self.invert = invert
    self.max_tokens = max_tokens
    self.num_oov_indices = num_oov_indices
    self.oov_token = oov_token
    self.mask_token = mask_token
    self.output_mode = output_mode
    self.sparse = sparse

    # If there is only one OOV bucket, we can determine the OOV value (either 0
    # or 1 depending on whether 0 is reserved) and set that as the default
    # value of the index_lookup table. If we have multiple OOV values, we need
    # to do a further hashing step; to make this easier, we set the OOV value
    # to -1. (This lets us do a vectorized add and cast to boolean to determine
    # locations where we need to do extra hashing.)
    if self.num_oov_indices == 1:
        self._oov_value = 0 if mask_token is None else 1
    else:
        self._oov_value = -1

    if max_tokens is not None:
        num_mask_tokens = (0 if mask_token is None else 1)
        vocab_size = max_tokens - (num_oov_indices + num_mask_tokens)
    else:
        vocab_size = None

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(vocab_size, self.mask_token), **kwargs)

    self._output_dtype = tf.int64

    # We need to save the key dtype so that we know if we're expecting int64
    # keys. If we are, we will cast int32 inputs to int64 as well.
    if invert:
        self._key_dtype = self._output_dtype
        value_dtype = self.dtype
        oov_value = self.oov_token
    else:
        self._key_dtype = self.dtype
        value_dtype = self._output_dtype
        oov_value = self._oov_value

    self._table = lookup_ops.MutableHashTable(
        key_dtype=self._key_dtype,
        value_dtype=value_dtype,
        default_value=oov_value,
        name=(self._name + "_index_table"))
    tracked_table = self._add_trackable(self._table, trainable=False)
    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tf.TensorShape((0,))

    if self.num_oov_indices <= 1:
        oov_indices = None
    else:
        oov_start = 1 if mask_token is not None else 0
        oov_end = oov_start + num_oov_indices
        oov_indices = list(range(oov_start, oov_end))

    self._table_handler = table_utils.TableHandler(
        table=self._table,
        oov_tokens=oov_indices,
        use_v1_apis=self._use_v1_apis())

    if vocabulary is not None:
        self.set_vocabulary(vocabulary)
def __init__(self,
             max_tokens=None,
             standardize="lower_and_strip_punctuation",
             split="whitespace",
             ngrams=None,
             output_mode="int",
             output_sequence_length=None,
             pad_to_max_tokens=False,
             vocabulary=None,
             **kwargs):
    # This layer only applies to string processing, and so should only have
    # a dtype of 'string'.
    if "dtype" in kwargs and kwargs["dtype"] != tf.string:
        raise ValueError("TextVectorization may only have a dtype of string.")
    elif "dtype" not in kwargs:
        kwargs["dtype"] = tf.string

    # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION, callable)
    layer_utils.validate_string_arg(
        standardize,
        allowable_strings=(LOWER_AND_STRIP_PUNCTUATION,),
        layer_name="TextVectorization",
        arg_name="standardize",
        allow_none=True,
        allow_callables=True)

    # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
    layer_utils.validate_string_arg(
        split,
        allowable_strings=(SPLIT_ON_WHITESPACE,),
        layer_name="TextVectorization",
        arg_name="split",
        allow_none=True,
        allow_callables=True)

    # Support deprecated names for output_modes.
    if output_mode == "binary":
        output_mode = MULTI_HOT
    if output_mode == "tf-idf":
        output_mode = TF_IDF

    # 'output_mode' must be one of (None, INT, COUNT, MULTI_HOT, TF_IDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, COUNT, MULTI_HOT, TF_IDF),
        layer_name="TextVectorization",
        arg_name="output_mode",
        allow_none=True)

    # 'ngrams' must be one of (None, int, tuple(int))
    if not (ngrams is None or
            isinstance(ngrams, int) or
            isinstance(ngrams, tuple) and
            all(isinstance(item, int) for item in ngrams)):
        raise ValueError(("`ngrams` must be None, an integer, or a tuple of "
                          "integers. Got %s") % (ngrams,))

    # 'output_sequence_length' must be one of (None, int) and is only
    # set if output_mode is INT.
    if (output_mode == INT and
            not (isinstance(output_sequence_length, int) or
                 (output_sequence_length is None))):
        raise ValueError("`output_sequence_length` must be either None or an "
                         "integer when `output_mode` is 'int'. "
                         "Got %s" % output_sequence_length)

    if output_mode != INT and output_sequence_length is not None:
        raise ValueError("`output_sequence_length` must not be set if "
                         "`output_mode` is not 'int'.")

    self._max_tokens = max_tokens
    self._standardize = standardize
    self._split = split
    self._ngrams_arg = ngrams
    if isinstance(ngrams, int):
        self._ngrams = tuple(range(1, ngrams + 1))
    else:
        self._ngrams = ngrams

    self._output_mode = output_mode
    self._output_sequence_length = output_sequence_length

    # Drop deprecated config options.
    kwargs.pop("vocabulary_size", None)

    super().__init__(**kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell("TextVectorization").set(True)

    self._index_lookup_layer = string_lookup.StringLookup(
        max_tokens=max_tokens,
        vocabulary=vocabulary,
        pad_to_max_tokens=pad_to_max_tokens,
        mask_token="",
        output_mode=output_mode if output_mode is not None else INT)