def get_table(dtype=dtypes.string, oov_tokens=None):
    """Build a MutableHashTable and wrap it in a TableHandler.

    Args:
      dtype: Key dtype of the table. Values are always int64.
      oov_tokens: Passed through unchanged to `table_utils.TableHandler`.

    Returns:
      A `table_utils.TableHandler` around the new table. V1 APIs are
      requested whenever eager execution is not active.
    """
    index_table = lookup_ops.MutableHashTable(
        key_dtype=dtype,
        value_dtype=dtypes.int64,
        default_value=-7,
        name="index_table")
    graph_mode = not context.executing_eagerly()
    return table_utils.TableHandler(
        index_table, oov_tokens, use_v1_apis=graph_mode)
def get_table_handler(self):
    """Return a TrackableWeightHandler over a fresh string->int32 table."""
    # Note: the tests repeat this setup on purpose. TensorFlow does not
    # cooperate with a separate setUp() call (it causes graph-building
    # errors), so each test invokes this helper explicitly instead.
    hash_table = lookup_ops.MutableHashTable(
        key_dtype=dtypes.string,
        value_dtype=dtypes.int32,
        default_value=0)
    handler = base_layer_utils.TrackableWeightHandler(hash_table)
    return handler
def model_fn(features, labels, mode):
    """Estimator model function backed by an int32->int32 hash table.

    Predictions are the table lookup of `features['x']`. In TRAIN mode the
    train op inserts `features['x'] -> labels` into the table and increments
    the global step; in every other mode there is no train op.
    """
    table = lookup_ops.MutableHashTable(
        key_dtype=tf.dtypes.int32,
        value_dtype=tf.dtypes.int32,
        default_value=-1)
    predictions = table.lookup(features['x'])

    if mode == ModeKeys.TRAIN:
        insert_op = table.insert(features['x'], labels)
        step_op = tf.compat.v1.assign_add(
            tf.compat.v1.train.get_global_step(), 1)
        train_op = tf.group(insert_op, step_op)
    else:
        train_op = None

    return model_fn_lib.EstimatorSpec(
        mode,
        loss=tf.constant(0),
        predictions=predictions,
        train_op=train_op)
def testDistributeMutableHashTable(self, value_rank):
    """Check that a MutableHashTable lookup works in a distributed dataset.

    Values are nested lists whose depth is controlled by `value_rank`; keys
    0 and 1 are inserted and key 2 is expected to yield the default value.
    """

    def nest(leaf):
        # Wrap the leaf in a two-element list once per unit of value_rank.
        nested = leaf
        for _ in range(value_rank):
            nested = [nested, nested]
        return nested

    first_value = nest(10)
    second_value = nest(11)
    missing_value = nest(-1)

    cluster = data_service_test_base.TestCluster(num_workers=1)
    table = lookup_ops.MutableHashTable(
        dtypes.int64, dtypes.int64, missing_value)
    insert_op = table.insert([0, 1], [first_value, second_value])
    self.evaluate(insert_op)

    lookups = dataset_ops.Dataset.range(3).map(table.lookup)
    distributed = self.make_distributed_dataset(lookups, cluster)
    expected = [first_value, second_value, missing_value]
    self.assertDatasetProduces(
        distributed, expected, requires_initialization=True)
def __init__(self,
             max_tokens=None,
             num_oov_tokens=1,
             vocabulary=None,
             reserve_zero=True,
             mask_zero=False,
             **kwargs):
    """Validate the configuration and build the index lookup table.

    Args:
      max_tokens: Optional upper bound; must be greater than 1 when set.
        `max_tokens - num_oov_tokens` is stored as the vocabulary capacity
        and `max_tokens` itself is handed to the combiner.
      num_oov_tokens: Number of out-of-vocabulary values; must be >= 0.
      vocabulary: Optional initial vocabulary. A `str` is resolved through
        `self._get_vocabulary_from_file`; anything else is used as-is. It
        must not contain repeated terms.
      reserve_zero: Whether one extra output value is reserved; this also
        moves the single-OOV value from 0 to 1.
      mask_zero: Stored on the instance; not otherwise used here.
      **kwargs: Forwarded to the base class. `dtype` must be string or int64
        and defaults to string.

    Raises:
      ValueError: On an unsupported dtype, `max_tokens <= 1`, a negative
        `num_oov_tokens`, or a vocabulary containing repeated terms.
    """
    allowed_dtypes = [dtypes.string, dtypes.int64]
    if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
        raise ValueError(
            "IndexLookup may only have a dtype of string or int64.")
    elif "dtype" not in kwargs:
        kwargs["dtype"] = dtypes.string

    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
        raise ValueError("max_tokens must be greater than 1.")

    # Zero OOV tokens is accepted; only negative counts are rejected.
    if num_oov_tokens < 0:
        raise ValueError(
            "num_oov_tokens must be greater than or equal to 0. "
            "You passed %s" % num_oov_tokens)

    self.max_tokens = max_tokens
    self.num_oov_tokens = num_oov_tokens
    self.reserve_zero = reserve_zero
    self.mask_zero = mask_zero

    # We need to reserve at least num_oov_tokens tokens, plus one additional
    # value if we are reserving the zero value in our output.
    if reserve_zero:
        self._reserved_values = (num_oov_tokens + 1)
    else:
        self._reserved_values = num_oov_tokens

    # We need to account for the OOV buckets in our vocabulary size.
    if max_tokens is not None:
        self._max_elements = max_tokens - num_oov_tokens
    else:
        self._max_elements = None

    # If there is only one OOV bucket, we can determine the OOV value (either
    # 0 or 1 depending on whether 0 is reserved) and set that as the default
    # value of the index_lookup table. If we have multiple OOV values, we need
    # to do a further hashing step; to make this easier, we set the OOV value
    # to -1. (This lets us do a vectorized add and cast to boolean to
    # determine locations where we need to do extra hashing.)
    if self.num_oov_tokens == 1:
        self._oov_value = 1 if reserve_zero else 0
    else:
        self._oov_value = -1

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(self.max_tokens), **kwargs)

    # This layer supports RaggedTensor inputs.
    self._supports_ragged_inputs = True

    # If the layer's input type is int32, we can only output int32 values -
    # MutableHashTable doesn't allow us to map int32->int64.
    if self.dtype == dtypes.int32:
        self._output_dtype = dtypes.int32
    else:
        self._output_dtype = dtypes.int64

    self._table = lookup_ops.MutableHashTable(
        key_dtype=self.dtype,
        value_dtype=self._output_dtype,
        default_value=self._oov_value,
        name=(self._name + "_index_table"))
    tracked_table = self._add_trackable(self._table, trainable=False)
    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tensor_shape.TensorShape((0, ))

    self._inverse_table = None

    if vocabulary is not None:
        if isinstance(vocabulary, str):
            vocabulary = self._get_vocabulary_from_file(vocabulary)

        vocabulary_set = set(vocabulary)
        if len(vocabulary) != len(vocabulary_set):
            # Report every term that occurs more than once.
            repeated_items = [
                item
                for item, count in collections.Counter(vocabulary).items()
                if count > 1
            ]
            raise ValueError(
                "The passed vocabulary has at least one repeated "
                "term. Please uniquify your dataset before passing "
                "it to IndexLookup(). The repeated terms are %s" %
                repeated_items)
        self.set_vocabulary(vocabulary)
def __init__(self,
             max_tokens=None,
             num_oov_tokens=1,
             vocabulary=None,
             reserve_zero=True,
             mask_zero=False,
             **kwargs):
    """Validate the configuration and build the index lookup table.

    Args:
      max_tokens: Optional upper bound; must be greater than 1 when set.
        `max_tokens - num_oov_tokens` is stored as the vocabulary capacity
        and `max_tokens` itself is handed to the combiner.
      num_oov_tokens: Number of out-of-vocabulary values. Only 1 is
        accepted.
      vocabulary: Optional initial vocabulary, passed to
        `self.set_vocabulary`. Its presence also sets `self._export_vocab`.
      reserve_zero: Whether one extra output value is reserved; this also
        moves the OOV value from 0 to 1.
      mask_zero: Stored on the instance; not otherwise used here.
      **kwargs: Forwarded to the base class. `dtype` must be string or int64
        and defaults to string.

    Raises:
      ValueError: On an unsupported dtype, `max_tokens <= 1`, or a
        `num_oov_tokens` other than 1.
    """
    allowed_dtypes = [dtypes.string, dtypes.int64]
    if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
        raise ValueError(
            "IndexLookup may only have a dtype of string or int64.")
    elif "dtype" not in kwargs:
        kwargs["dtype"] = dtypes.string

    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
        raise ValueError("max_tokens must be greater than 1.")

    # For now, limit the num_oov_tokens to one. The value is wrapped in a
    # tuple so that a tuple argument is reported instead of breaking the
    # %-formatting with a TypeError.
    if num_oov_tokens != 1:
        raise ValueError(
            "num_oov_tokens must be 1 for the time being. Other "
            "values will be supported in the near future. "
            "You passed %s" % (num_oov_tokens,))

    self.max_tokens = max_tokens
    self.num_oov_tokens = num_oov_tokens
    self.reserve_zero = reserve_zero
    self.mask_zero = mask_zero

    # We need to reserve at least num_oov_tokens tokens, plus one additional
    # value if we are reserving the zero value in our output.
    if reserve_zero:
        self._reserved_values = (num_oov_tokens + 1)
    else:
        self._reserved_values = num_oov_tokens

    # We need to account for the OOV buckets in our vocabulary size.
    if max_tokens is not None:
        self._max_elements = max_tokens - num_oov_tokens
    else:
        self._max_elements = None

    # If there is only one OOV bucket, we can determine the OOV value (either
    # 0 or 1 depending on whether 0 is reserved) and set that as the default
    # value of the index_lookup table. If we have multiple OOV values, we need
    # to do a further hashing step; to make this easier, we set the OOV value
    # to -1. (This lets us do a vectorized add and cast to boolean to
    # determine locations where we need to do extra hashing.)
    if self.num_oov_tokens == 1:
        self._oov_value = 1 if reserve_zero else 0
    else:
        self._oov_value = -1

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(self.max_tokens), **kwargs)

    # This layer supports RaggedTensor inputs.
    self._supports_ragged_inputs = True

    # If the layer's input type is int32, we can only output int32 values -
    # MutableHashTable doesn't allow us to map int32->int64.
    if self.dtype == dtypes.int32:
        self._output_dtype = dtypes.int32
    else:
        self._output_dtype = dtypes.int64

    self._table = lookup_ops.MutableHashTable(
        key_dtype=self.dtype,
        value_dtype=self._output_dtype,
        default_value=self._oov_value,
        name=(self._name + "_index_table"))
    tracked_table = self._add_trackable(self._table, trainable=False)
    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tensor_shape.TensorShape((0, ))

    # This is a workaround for saving not working yet for MutableHashTables.
    # By replacing the existing function call by an explicit failure, we
    # can provide a more user-friendly error message.
    def fail(_):
        raise NotImplementedError(
            "Saving is not yet supported for IndexLookup layers.")

    self._table._list_extra_dependencies_for_serialization = fail  # pylint: disable=protected-access

    self._inverse_table = None

    if vocabulary is not None:
        self._export_vocab = True
        self.set_vocabulary(vocabulary)
    else:
        self._export_vocab = False
def __init__(self,
             max_tokens=None,
             num_oov_tokens=1,
             vocabulary=None,
             reserve_zero=True,
             mask_zero=False,
             **kwargs):
    """Validate the configuration and build the table and its handler.

    Args:
      max_tokens: Optional upper bound; must be greater than 1 when set.
        `max_tokens - num_oov_tokens` is stored as the vocabulary capacity
        and `max_tokens` itself is handed to the combiner.
      num_oov_tokens: Number of out-of-vocabulary values; must be >= 0.
      vocabulary: Optional initial vocabulary. A `str` is resolved through
        `table_utils.get_vocabulary_from_file`; the result is checked with
        `table_utils.validate_vocabulary_is_unique` before being set.
      reserve_zero: Whether one extra output value is reserved; this also
        moves the first OOV value from 0 to 1.
      mask_zero: Stored on the instance; not otherwise used here.
      **kwargs: Forwarded to the base class. `dtype` must be string, int32
        or int64 and defaults to string.

    Raises:
      ValueError: On an unsupported dtype, `max_tokens <= 1`, or a negative
        `num_oov_tokens`.
    """
    # Inversion is hard-wired off in this constructor; the branches below
    # keep the dtype rules for both directions in one place.
    invert = False
    if invert:
        allowed_dtypes = [dtypes.int32, dtypes.int64]
    else:
        allowed_dtypes = [dtypes.string, dtypes.int32, dtypes.int64]

    if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
        raise ValueError("IndexLookup may only have a dtype in %s." %
                         allowed_dtypes)

    if "dtype" not in kwargs:
        kwargs["dtype"] = dtypes.int64 if invert else dtypes.string

    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
        raise ValueError("If set, max_tokens must be greater than 1.")

    # Zero OOV tokens is accepted; only negative counts are rejected.
    if num_oov_tokens < 0:
        raise ValueError(
            "num_oov_tokens must be greater than or equal to 0. "
            "You passed %s" % num_oov_tokens)

    self.invert = invert
    self.max_tokens = max_tokens
    self.num_oov_tokens = num_oov_tokens
    self.reserve_zero = reserve_zero
    self.mask_zero = mask_zero

    # We need to reserve at least num_oov_tokens tokens, plus one additional
    # value if we are reserving the zero value in our output.
    if reserve_zero:
        self._reserved_values = (num_oov_tokens + 1)
    else:
        self._reserved_values = num_oov_tokens

    # We need to account for the OOV buckets in our vocabulary size.
    if max_tokens is not None:
        self._max_elements = max_tokens - num_oov_tokens
    else:
        self._max_elements = None

    # If there is only one OOV bucket, we can determine the OOV value (either
    # 0 or 1 depending on whether 0 is reserved) and set that as the default
    # value of the index_lookup table. If we have multiple OOV values, we need
    # to do a further hashing step; to make this easier, we set the OOV value
    # to -1. (This lets us do a vectorized add and cast to boolean to
    # determine locations where we need to do extra hashing.)
    if self.num_oov_tokens == 1:
        self._oov_value = 1 if reserve_zero else 0
    else:
        self._oov_value = -1

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(self.max_tokens), **kwargs)

    # If the layer's input type is int32, we can only output int32 values -
    # MutableHashTable doesn't allow us to map int32->int64.
    if self.dtype == dtypes.int32:
        self._output_dtype = dtypes.int32
    else:
        self._output_dtype = dtypes.int64

    self._table = lookup_ops.MutableHashTable(
        key_dtype=self.dtype,
        value_dtype=self._output_dtype,
        default_value=self._oov_value,
        name=(self._name + "_index_table"))
    tracked_table = self._add_trackable(self._table, trainable=False)
    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tensor_shape.TensorShape((0, ))

    # Explicit OOV indices are only handed to the handler when there is more
    # than one of them; otherwise the table default value covers OOV.
    if self.num_oov_tokens <= 1:
        oov_tokens = None
    else:
        oov_start = 1 if reserve_zero else 0
        oov_tokens = list(range(oov_start, self._reserved_values))

    self._table_handler = table_utils.TableHandler(
        table=self._table,
        oov_tokens=oov_tokens,
        use_v1_apis=self._use_v1_apis())

    if vocabulary is not None:
        if isinstance(vocabulary, str):
            vocabulary = table_utils.get_vocabulary_from_file(vocabulary)
        table_utils.validate_vocabulary_is_unique(vocabulary)

        self.set_vocabulary(vocabulary)
def __init__(self):
    """Create two four-element variables and an int32->int32 hash table."""
    zeros = [0] * 4
    ones = [1] * 4
    self.v1 = tf.Variable(zeros)
    self.v2 = tf.Variable(ones)
    # Keys missing from the table look up as -1.
    self.table = lookup_ops.MutableHashTable(
        key_dtype=tf.int32,
        value_dtype=tf.int32,
        default_value=-1)
def __init__(self,
             max_tokens,
             num_oov_indices,
             mask_token,
             oov_token,
             vocabulary=None,
             invert=False,
             output_mode=INT,
             sparse=False,
             pad_to_max_tokens=False,
             **kwargs):
    """Validate the configuration and build the lookup table and handler.

    Args:
      max_tokens: Upper bound or None; must be greater than 1 when set.
      num_oov_indices: Number of out-of-vocabulary indices; must be >= 0.
      mask_token: Mask token or None. When set it is cast to a NumPy int32
        or int64 scalar if `kwargs["dtype"]` is the matching integer dtype.
      oov_token: Passed to the combiner as `oov_value`; when `invert` is
        true it is also the table's default value.
      vocabulary: Optional vocabulary. A `str` is treated as the path of a
        vocabulary file and yields a static table; any other non-None value
        is passed to `self.set_vocabulary` on a mutable table.
      invert: If true, keys are int64 and values have the layer dtype;
        requires `output_mode == INT`.
      output_mode: One of INT, BINARY, COUNT, TFIDF.
      sparse: Stored on the instance; not otherwise used here.
      pad_to_max_tokens: In TFIDF mode, selects a `(max_tokens,)` shape for
        the idf weights instead of `(None,)`.
      **kwargs: Forwarded to the base class after `vocabulary_size` and
        `has_static_table` are removed. `dtype` is read directly when
        `mask_token` is not None.

    Raises:
      ValueError: On `max_tokens <= 1`, a negative `num_oov_indices`, a
        non-INT `output_mode` together with `invert`, or a vocabulary file
        that does not exist.
    """
    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
        raise ValueError("If set, `max_tokens` must be greater than 1. "
                         "You passed {}".format(max_tokens))

    if num_oov_indices < 0:
        raise ValueError(
            "`num_oov_indices` must be greater than or equal to 0. "
            "You passed {}".format(num_oov_indices))

    # 'output_mode' must be one of (INT, BINARY, COUNT, TFIDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, BINARY, COUNT, TFIDF),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    if invert and output_mode != INT:
        raise ValueError(
            "`output_mode` must be {} when `invert` is true. You "
            "passed {}".format(INT, output_mode))

    self.invert = invert
    self.max_tokens = max_tokens
    self.num_oov_indices = num_oov_indices
    self.oov_token = oov_token
    self.output_mode = output_mode
    self.sparse = sparse
    self.pad_to_max_tokens = pad_to_max_tokens
    self._called = False

    # A note on vocab_size: we need to always keep a non-Tensor representation
    # of vocab_size around to use in graph building. Because we might be
    # in a tf.function, we can't rely on evaluating the actual tables to
    # find the value either.
    #
    # We need to keep track our current vocab size outside of our layer
    # weights to support a static output shape when `output_mode != INT`. The
    # bincount ops do not set shape on their outputs, which means we have to
    # set it ourselves. We persist the current vocab size as a hidden part of
    # the config when serializing our model.
    self._vocab_size = kwargs.pop("vocabulary_size", None)

    restore_from_static_table = kwargs.pop("has_static_table", False)

    # Make sure the mask token is truly of the dtype we want. We can ignore
    # strings here, because they have only one dtype.
    if mask_token is not None:
        dtype = kwargs["dtype"]
        if dtype == tf.int32:
            mask_token = np.int32(mask_token)
        elif dtype == tf.int64:
            mask_token = np.int64(mask_token)
    self.mask_token = mask_token

    if max_tokens is not None:
        available_vocab_size = max_tokens - self._token_start_index()
    else:
        available_vocab_size = None

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(
            vocab_size=available_vocab_size,
            mask_value=mask_token,
            oov_value=oov_token,
            compute_idf=(output_mode == TFIDF)),
        **kwargs)

    # We need to save the key dtype so that we know if we're expecting int64
    # keys. If we are, we will cast int32 inputs to int64 as well.
    if invert:
        self._key_dtype = tf.int64
        self._value_dtype = self.dtype
        self._mask_key = 0
        self._mask_value = mask_token
        key_index = tf.lookup.TextFileIndex.LINE_NUMBER
        value_index = tf.lookup.TextFileIndex.WHOLE_LINE
        default_value = self.oov_token
        oov_indices = None
    else:
        self._key_dtype = self.dtype
        self._value_dtype = tf.int64
        self._mask_key = mask_token
        key_index = tf.lookup.TextFileIndex.WHOLE_LINE
        value_index = tf.lookup.TextFileIndex.LINE_NUMBER
        # Masks should map to 0 for int output and be dropped otherwise. Max
        # ints will be dropped from the bincount op.
        self._mask_value = 0 if self.output_mode == INT else tf.int64.max
        oov_start = self._oov_start_index()
        token_start = self._token_start_index()
        if self.num_oov_indices == 0:
            # If there are no OOV indices, we map OOV tokens to -1 for int
            # output and drop them from bagged output. Max ints will be
            # dropped from the bincount op.
            default_value = -1 if self.output_mode == INT else tf.int64.max
            oov_indices = None
        elif self.num_oov_indices == 1:
            # If there is only one OOV index, we can set that index as the
            # default value of the index_lookup table.
            default_value = oov_start
            oov_indices = None
        else:
            # If we have multiple OOV values, we need to do a further hashing
            # step; to make this easier, we set the OOV value to -1. (This
            # lets us do a vectorized add and cast to boolean to determine
            # locations where we need to do extra hashing.)
            default_value = -1
            oov_indices = list(range(oov_start, token_start))

    self._static_vocabulary_path = None
    has_vocab_path = (vocabulary is not None and isinstance(vocabulary, str))
    if has_vocab_path or restore_from_static_table:
        self._has_static_table = True
        if vocabulary is None:
            # If we're restoring a layer that was saved with a static table
            # initializer, we create a fake initializer object to let the code
            # progress. The savedmodel restoration code will handle restoring
            # the actual data.
            initializer = _NullInitializer(self._key_dtype, self._value_dtype)
        else:
            # Use TensorFlow's filesystem layer rather than os.path so that
            # vocabulary files on non-local filesystems are not rejected.
            if not tf.io.gfile.exists(vocabulary):
                raise ValueError("Vocabulary file %s does not exist." %
                                 (vocabulary, ))
            self._static_vocabulary_path = vocabulary
            num_tokens = table_utils.num_tokens_in_file(vocabulary)
            self._vocab_size = self._token_start_index() + num_tokens

            initializer = tf.lookup.TextFileInitializer(
                filename=vocabulary,
                key_dtype=self._key_dtype,
                key_index=key_index,
                value_dtype=self._value_dtype,
                value_index=value_index,
                value_index_offset=self._token_start_index())

        self._table = self._static_table_class()(
            initializer, default_value=default_value)
        self._table_handler = table_utils.TableHandler(
            table=self._table,
            mask_token=self._mask_key,
            mask_value=self._mask_value,
            oov_tokens=oov_indices,
            use_v1_apis=self._use_v1_apis())

        tracked_table = self._add_trackable(self._table, trainable=False)

    else:
        self._has_static_table = False
        self._table = lookup_ops.MutableHashTable(
            key_dtype=self._key_dtype,
            value_dtype=self._value_dtype,
            default_value=default_value,
            name=(self._name + "_index_table"))
        self._table_handler = table_utils.TableHandler(
            table=self._table,
            oov_tokens=oov_indices,
            use_v1_apis=self._use_v1_apis())
        if vocabulary is not None:
            self.set_vocabulary(vocabulary)
        tracked_table = self._add_trackable(self._table, trainable=False)

    if self.output_mode == TFIDF:
        # The TF-IDF weight may have a (None,) tensorshape. This creates
        # a 1D variable with arbitrary shape, which we can assign any weight
        # to so long as it has 1 dimension. In order to properly initialize
        # this weight in Keras, we need to provide a custom callable
        # initializer which does not depend on the shape of the weight (as all
        # other initializers do) since the weight is not known. Hence the
        # lambda shape, dtype: [0].
        if not self.pad_to_max_tokens or max_tokens is None:
            initializer = lambda shape, dtype: [0]
        else:
            initializer = tf.compat.v1.zeros_initializer

        # We are adding these here instead of in build() since they do not
        # depend on the input shape at all.
        idf_shape = (max_tokens, ) if self.pad_to_max_tokens else (None, )
        self.tf_idf_weights = self._add_state_variable(
            name="idf",
            shape=tf.TensorShape(idf_shape),
            dtype=backend.floatx(),
            initializer=initializer)

    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tf.TensorShape((0, ))
def __init__(self,
             max_tokens=None,
             standardize=LOWER_AND_STRIP_PUNCTUATION,
             split=SPLIT_ON_WHITESPACE,
             ngrams=None,
             output_mode=INT,
             output_sequence_length=None,
             pad_to_max_tokens=True,
             **kwargs):
    """Validate the configuration and build the vocabulary table.

    Args:
      max_tokens: Optional upper bound; must be greater than 1 when set. One
        slot is subtracted for the OOV token to get the vocabulary capacity.
      standardize: None, LOWER_AND_STRIP_PUNCTUATION, or a callable.
      split: None, SPLIT_ON_WHITESPACE, or a callable.
      ngrams: None, an int, or a tuple of ints. An int `n` is expanded to
        the tuple `(1, ..., n)`.
      output_mode: None, INT, COUNT, BINARY, or TFIDF.
      output_sequence_length: None or an int; may only be set when
        `output_mode` is INT.
      pad_to_max_tokens: When true and `max_tokens` is set, `max_tokens` is
        passed on to the vectorization layer created for non-INT modes.
      **kwargs: Forwarded to the base class. `dtype`, if given, must be
        string; it defaults to string.

    Raises:
      ValueError: On an invalid dtype, `ngrams`, `output_sequence_length`,
        or `max_tokens <= 1`. The string-argument checks are delegated to
        `layer_utils.validate_string_arg`.
    """
    # This layer only applies to string processing, and so should only have
    # a dtype of 'string'.
    if "dtype" in kwargs and kwargs["dtype"] != dtypes.string:
        raise ValueError(
            "TextVectorization may only have a dtype of string.")
    elif "dtype" not in kwargs:
        kwargs["dtype"] = dtypes.string

    # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION,
    # callable)
    layer_utils.validate_string_arg(
        standardize,
        allowable_strings=[LOWER_AND_STRIP_PUNCTUATION],
        layer_name="TextVectorization",
        arg_name="standardize",
        allow_none=True,
        allow_callables=True)

    # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
    layer_utils.validate_string_arg(
        split,
        allowable_strings=[SPLIT_ON_WHITESPACE],
        layer_name="TextVectorization",
        arg_name="split",
        allow_none=True,
        allow_callables=True)

    # 'output_mode' must be one of (None, INT, COUNT, BINARY, TFIDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=[INT, COUNT, BINARY, TFIDF],
        layer_name="TextVectorization",
        arg_name="output_mode",
        allow_none=True)

    # 'ngrams' must be one of (None, int, tuple(int))
    if not (ngrams is None or
            isinstance(ngrams, int) or
            isinstance(ngrams, tuple) and
            all(isinstance(item, int) for item in ngrams)):
        raise ValueError(
            ("`ngrams` must be None, an integer, or a tuple of "
             "integers. Got %s") % (ngrams, ))

    # 'output_sequence_length' must be one of (None, int) and is only
    # set if output_mode is INT. The offending value is wrapped in a tuple so
    # that a tuple argument is reported instead of breaking the %-formatting.
    if (output_mode == INT and not (isinstance(output_sequence_length, int) or
                                    (output_sequence_length is None))):
        raise ValueError(
            "`output_sequence_length` must be either None or an "
            "integer when `output_mode` is 'int'. "
            "Got %s" % (output_sequence_length, ))

    if output_mode != INT and output_sequence_length is not None:
        raise ValueError("`output_sequence_length` must not be set if "
                         "`output_mode` is not 'int'.")

    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
        raise ValueError("max_tokens must be > 1.")

    self._max_tokens = max_tokens

    # In INT mode, we have two reserved values (PAD and OOV). However, non-INT
    # modes don't have a PAD value, so we only need to reserve one value.
    self._reserved_values = 2 if output_mode == INT else 1

    # In INT mode, the zero value is reserved for padding (per Keras standard
    # padding approaches). In non-INT modes, there is no padding so we can set
    # the OOV value to zero instead of one.
    self._oov_value = 1 if output_mode == INT else 0

    # We always reduce the max token number by 1 to account for the OOV token
    # if it is set. Keras' use of the reserved number 0 for padding tokens,
    # if the output is in INT mode, does not really count as a 'token' for
    # vocabulary purposes, so we only reduce vocab size by 1 here.
    self._max_vocab_size = max_tokens - 1 if max_tokens is not None else None

    self._standardize = standardize
    self._split = split
    self._ngrams_arg = ngrams
    if isinstance(ngrams, int):
        self._ngrams = tuple(range(1, ngrams + 1))
    else:
        self._ngrams = ngrams

    self._output_mode = output_mode
    self._output_sequence_length = output_sequence_length
    self._pad_to_max = pad_to_max_tokens
    self._vocab_size = 0
    self._called = False

    super(TextVectorization, self).__init__(
        combiner=_TextVectorizationCombiner(
            self._max_vocab_size, compute_idf=output_mode == TFIDF),
        **kwargs)

    self._table = lookup_ops.MutableHashTable(
        key_dtype=dtypes.string,
        value_dtype=dtypes.int64,
        default_value=self._oov_value,
        name=(self._name + "_index_table"))

    # Replace the table's serialization hook with an explicit failure.
    def fail(_):
        raise NotImplementedError(
            "Saving is not yet supported for TextVectorization layers.")

    self._table._list_extra_dependencies_for_serialization = fail  # pylint: disable=protected-access

    tracked_table = self._add_trackable(self._table, trainable=False)

    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tensor_shape.TensorShape((0, ))

    # If this layer is configured for string or integer output, we do not
    # create a vectorization layer (as the output is not vectorized).
    if self._output_mode in [None, INT]:
        return

    if max_tokens is not None and self._pad_to_max:
        vectorize_max_tokens = max_tokens
    else:
        vectorize_max_tokens = None
    self._vectorize_layer = self._get_vectorization_class()(
        max_tokens=vectorize_max_tokens, output_mode=self._output_mode)
def __init__(self,
             max_tokens=None,
             standardize=LOWER_AND_STRIP_PUNCTUATION,
             split=SPLIT_ON_WHITESPACE,
             ngrams=None,
             output_mode=INT,
             output_sequence_length=None,
             pad_to_max_tokens=True,
             **kwargs):
    """Validate the configuration and build the vocabulary table.

    Args:
      max_tokens: Optional upper bound. One slot is subtracted for the OOV
        token to get the vocabulary capacity.
      standardize: Checked by `_validate_string_arg` against
        LOWER_AND_STRIP_PUNCTUATION.
      split: Checked by `_validate_string_arg` against SPLIT_ON_WHITESPACE.
      ngrams: None, an int, or a tuple of ints. An int `n` is expanded to
        the tuple `(1, ..., n)`.
      output_mode: Checked by `_validate_string_arg` against INT, COUNT,
        BINARY and TFIDF, with callables disallowed.
      output_sequence_length: None or an int; may only be set when
        `output_mode` is INT.
      pad_to_max_tokens: Stored on the instance as `_pad_to_max`.
      **kwargs: Forwarded to the base class. `dtype`, if given, must be
        string; it defaults to string.

    Raises:
      ValueError: On an invalid dtype, `ngrams`, or
        `output_sequence_length`.
    """
    # This layer only applies to string processing, and so should only have
    # a dtype of 'string'.
    if "dtype" in kwargs and kwargs["dtype"] != dtypes.string:
        raise ValueError("TextVectorization may only have a dtype of string.")
    elif "dtype" not in kwargs:
        kwargs["dtype"] = dtypes.string

    # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION,
    # callable)
    _validate_string_arg(
        standardize,
        allowable_strings=[LOWER_AND_STRIP_PUNCTUATION],
        arg_name="standardize")

    # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
    _validate_string_arg(
        split, allowable_strings=[SPLIT_ON_WHITESPACE], arg_name="split")

    # 'output_mode' must be one of (None, INT, COUNT, BINARY, TFIDF)
    _validate_string_arg(
        output_mode,
        allowable_strings=[INT, COUNT, BINARY, TFIDF],
        arg_name="output_mode",
        allow_callables=False)

    # 'ngrams' must be one of (None, int, tuple(int))
    if not (ngrams is None or
            isinstance(ngrams, int) or
            isinstance(ngrams, tuple) and
            all(isinstance(item, int) for item in ngrams)):
        raise ValueError(("`ngrams` must be None, an integer, or a tuple of "
                          "integers. Got %s") % (ngrams,))

    # 'output_sequence_length' must be one of (None, int) and is only
    # set if output_mode is INT. The offending value is wrapped in a tuple so
    # that a tuple argument is reported instead of breaking the %-formatting.
    if (output_mode == INT and not (isinstance(output_sequence_length, int) or
                                    (output_sequence_length is None))):
        raise ValueError("`output_sequence_length` must be either None or an "
                         "integer when `output_mode` is 'int'. "
                         "Got %s" % (output_sequence_length,))

    if output_mode != INT and output_sequence_length is not None:
        raise ValueError("`output_sequence_length` must not be set if "
                         "`output_mode` is not 'int'.")

    self._max_tokens = max_tokens

    # In INT mode, we have two reserved values (PAD and OOV). However, non-INT
    # modes don't have a PAD value, so we only need to reserve one value.
    self._reserved_values = 2 if output_mode == INT else 1

    # In INT mode, the zero value is reserved for padding (per Keras standard
    # padding approaches). In non-INT modes, there is no padding so we can set
    # the OOV value to zero instead of one.
    self._oov_value = 1 if output_mode == INT else 0

    # We always reduce the max token number by 1 to account for the OOV token
    # if it is set. The PAD marker isn't really a token (it's the absence of a
    # token) so we don't account for it here.
    self._max_vocab_size = max_tokens - 1 if max_tokens is not None else None

    self._standardize = standardize
    self._split = split
    self._ngrams_arg = ngrams
    if isinstance(ngrams, int):
        self._ngrams = tuple(range(1, ngrams + 1))
    else:
        self._ngrams = ngrams

    self._output_mode = output_mode
    self._output_sequence_length = output_sequence_length
    self._pad_to_max = pad_to_max_tokens
    self._has_vocab = False

    super(TextVectorization, self).__init__(
        combiner=_TextVectorizationCombiner(
            self._max_vocab_size, compute_idf=output_mode == TFIDF),
        **kwargs)

    self._table = lookup_ops.MutableHashTable(
        key_dtype=dtypes.string,
        value_dtype=dtypes.int64,
        default_value=self._oov_value,
        name=(self._name + "_index_table"))

    # Replace the table's serialization hook with an explicit failure.
    def fail(_):
        raise NotImplementedError(
            "Saving is not yet supported for TextVectorization layers.")

    self._table._list_extra_dependencies_for_serialization = fail  # pylint: disable=protected-access

    self._add_trackable(self._table, trainable=False)

    # We are adding this here instead of in build() since it does not depend
    # on the input shape at all.
    if self._output_mode == TFIDF:
        # Create the TFIDF weight, but use a (None,) tensorshape. This creates
        # a 1D variable with arbitrary shape, which we can assign any weight
        # to so long as it has 1 dimension. In order to properly initialize
        # this weight in Keras, we need to provide a custom callable
        # initializer which does not depend on the shape of the weight (as all
        # other initializers do) since the weight is not known. Hence the
        # lambda shape, dtype: [0].
        self._tf_idf_weights = self.add_weight(
            name="tfidf_data",
            shape=tensor_shape.TensorShape((None,)),
            dtype=K.floatx(),
            trainable=False,
            initializer=lambda shape, dtype: [0])
def __init__(self,
             max_tokens,
             num_oov_indices,
             mask_token,
             oov_token,
             vocabulary=None,
             **kwargs):
    """Validate the configuration and build the table and its handler.

    Args:
      max_tokens: Upper bound or None; must be greater than 1 when set. It
        is passed to the combiner.
      num_oov_indices: Number of out-of-vocabulary indices; must be >= 0.
      mask_token: Mask token or None. When it is set, OOV indices start at 1
        instead of 0. It is also passed to the combiner.
      oov_token: Stored on the instance; not otherwise used here.
      vocabulary: Optional initial vocabulary, passed to
        `self.set_vocabulary`.
      **kwargs: Forwarded to the base class.

    Raises:
      ValueError: On `max_tokens <= 1` or a negative `num_oov_indices`.
    """
    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
        raise ValueError("If set, max_tokens must be greater than 1.")

    # Zero OOV indices is accepted; only negative counts are rejected.
    if num_oov_indices < 0:
        raise ValueError(
            "num_oov_indices must be greater than or equal to 0. "
            "You passed %s" % num_oov_indices)

    self.max_tokens = max_tokens
    self.num_oov_indices = num_oov_indices
    self.oov_token = oov_token
    self.mask_token = mask_token

    # If there is only one OOV bucket, we can determine the OOV value (either
    # 0 or 1 depending on whether a mask token occupies index 0) and set that
    # as the default value of the index_lookup table. If we have multiple OOV
    # values, we need to do a further hashing step; to make this easier, we
    # set the OOV value to -1. (This lets us do a vectorized add and cast to
    # boolean to determine locations where we need to do extra hashing.)
    if self.num_oov_indices == 1:
        self._oov_value = 0 if mask_token is None else 1
    else:
        self._oov_value = -1

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(self.max_tokens, self.mask_token),
        **kwargs)

    self._output_dtype = dtypes.int64

    self._table = lookup_ops.MutableHashTable(
        key_dtype=self.dtype,
        value_dtype=self._output_dtype,
        default_value=self._oov_value,
        name=(self._name + "_index_table"))
    tracked_table = self._add_trackable(self._table, trainable=False)
    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tensor_shape.TensorShape((0, ))

    # Explicit OOV indices are only handed to the handler when there is more
    # than one of them; otherwise the table default value covers OOV.
    if self.num_oov_indices <= 1:
        oov_indices = None
    else:
        oov_start = 1 if mask_token is not None else 0
        oov_end = oov_start + num_oov_indices
        oov_indices = list(range(oov_start, oov_end))

    self._table_handler = table_utils.TableHandler(
        table=self._table,
        oov_tokens=oov_indices,
        use_v1_apis=self._use_v1_apis())

    if vocabulary is not None:
        self.set_vocabulary(vocabulary)
def __init__(self,
             max_tokens=None,
             standardize=LOWER_AND_STRIP_PUNCTUATION,
             split=SPLIT_ON_WHITESPACE,
             ngrams=None,
             output_mode=INT,
             output_sequence_length=None,
             pad_to_max_tokens=True,
             **kwargs):
    """Store the text-vectorization settings and create the index table.

    Args:
      max_tokens: Optional vocabulary cap; one slot is subtracted for the OOV
        token before it is passed to the combiner.
      standardize: Standardization setting, stored as given.
      split: Splitting setting, stored as given.
      ngrams: None, an int `n` (expanded to the tuple `(1, ..., n)`), or any
        other value, which is stored unchanged.
      output_mode: Output mode; INT reserves a padding value, TFIDF
        additionally creates the TF-IDF weight.
      output_sequence_length: Stored as given.
      pad_to_max_tokens: Stored as given.
      **kwargs: Forwarded to the base-class constructor; `dtype`, if
        present, must be string.

    Raises:
      ValueError: If a `dtype` other than string is supplied.
    """
    # This layer only applies to string processing, so the dtype is either
    # defaulted to string or required to already be string.
    if "dtype" not in kwargs:
        kwargs["dtype"] = dtypes.string
    elif kwargs["dtype"] != dtypes.string:
        raise ValueError(
            "TextVectorization may only have a dtype of string.")

    # TODO(momernick): Validate the inputs. The following must apply:
    #  'standardize' must be one of (None, LOWER_AND_STRIP, callable)
    #  'split' must be one of (None, WHITESPACE, callable)
    #  'ngrams' must be one of (None, int, tuple(int))
    #  'output_mode' must be one of (None, INT, COUNT, BINARY, TFIDF)
    #  'output_sequence_length' must be one of (None, int) and is only
    #      set if output_mode is INT.

    self._max_tokens = max_tokens

    if output_mode == INT:
        # INT mode reserves two values (PAD and OOV); zero is kept for
        # padding per the usual Keras convention, so OOV maps to one.
        self._reserved_values = 2
        self._oov_value = 1
    else:
        # Non-INT modes have no padding, so only OOV is reserved and it can
        # take the zero slot.
        self._reserved_values = 1
        self._oov_value = 0

    # We always reduce the max token number by 1 to account for the OOV token
    # if it is set. The PAD marker isn't really a token (it's the absence of
    # a token) so we don't account for it here.
    if max_tokens is None:
        self._max_vocab_size = None
    else:
        self._max_vocab_size = max_tokens - 1

    # This is an explicit regex of all the tokens that will be stripped if
    # LOWER_AND_STRIP_PUNCTUATION is set. If an application requires other
    # stripping, a Callable should be passed into the 'standardize' arg.
    self._strip_regex = r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\t\n\']'
    self._standardize = standardize
    self._split = split

    # Keep the raw argument alongside the expanded form.
    self._ngrams_arg = ngrams
    expanded_ngrams = ngrams
    if isinstance(ngrams, int):
        expanded_ngrams = tuple(builtin_range(1, ngrams + 1))
    self._ngrams = expanded_ngrams

    self._output_mode = output_mode
    self._output_sequence_length = output_sequence_length
    self._pad_to_max = pad_to_max_tokens
    self._has_vocab = False

    combiner = _TextVectorizationCombiner(
        self._max_vocab_size, compute_idf=output_mode == TFIDF)
    super(TextVectorization, self).__init__(combiner=combiner, **kwargs)

    table_name = self._name + "_index_table"
    self._table = lookup_ops.MutableHashTable(
        key_dtype=dtypes.string,
        value_dtype=dtypes.int64,
        default_value=self._oov_value,
        name=table_name)
    self._add_trackable(self._table, trainable=False)

    # We are adding this here instead of in build() since it does not depend
    # on the input shape at all.
    if self._output_mode == TFIDF:
        # Create the TFIDF weight, but use a (None,) tensorshape. This creates
        # a 1D variable with arbitrary shape, which we can assign any weight
        # to so long as it has 1 dimension. In order to properly initialize
        # this weight in Keras, we need to provide a custom callable
        # initializer which does not depend on the shape of the weight (as
        # all other initializers do) since the weight is not known. Hence the
        # lambda shape, dtype: [0].
        self._tf_idf_weights = self.add_weight(
            name="tfidf_data",
            shape=tensor_shape.TensorShape((None,)),
            dtype=K.floatx(),
            trainable=False,
            initializer=lambda shape, dtype: [0])
def __init__(self,
             key_dtype=dtypes.int64,
             value_dtype=dtypes.float32,
             dim=1,
             devices=None,
             partitioner=default_partition_fn,
             shared_name=None,
             name="DynamicEmbedding_Variable",
             initializer=None,
             trainable=True,
             checkpoint=True):
    """Creates an empty `Variable` object.

    Creates a group of tables placed on devices; the types of their keys and
    values are specified by key_dtype and value_dtype, respectively.
    The environment variable 'TF_HASHTABLE_INIT_SIZE' can be used to set the
    initial size of each table, which can help reduce rehash times.
    The default initial table size: 1,048,576 for CPU, 16,777,216 for GPU.

    Args:
      key_dtype: the type of the key tensors.
      value_dtype: the type of the value tensors.
      dim: the length of the value array for each key.
      devices: the list of devices holding the tables. One table will be
        created on each device. A single non-list value is wrapped in a
        list. When empty or None, the first local GPU is used if there is
        one, otherwise "/CPU:0".
      partitioner: partition function of keys, returning the partition index
        for each key.

        Example partition func:
        ```python
        def default_partition_fn(keys, shard_num):
          return tf.cast(keys % shard_num, dtype=tf.int32)
        ```
      shared_name: stored on the instance; when empty it defaults to
        "shared_name.<name>". Not otherwise used by this constructor.
      name: A name for the operation (optional).
      initializer: The value to use if a key is missing in the hash table,
        which can be a python number, numpy array or `tf.initializer`
        instance. If initializer is `None` (the default), a zeros
        initializer of `value_dtype` is used.
      trainable: if True, the variable will be treated as trainable and
        added to the list of variables collected in the graph under the key
        `GraphKeys.TRAINABLE_VARIABLES`.
      checkpoint: if True, the contents of the SparseVariable are
        saved to and restored from checkpoints.
        If `shared_name` is empty for a checkpointed table,
        it is shared using the table node name.

    Raises:
      TypeError: If `key_dtype` or `value_dtype` is not supported on the
        selected device type.
    """
    self.key_dtype = key_dtype
    self.value_dtype = value_dtype
    self.dim = dim

    def _get_default_devices():
        # Prefer the first local GPU; fall back to the CPU.
        gpu_list = [
            x.name
            for x in device_lib.list_local_devices()
            if x.device_type == 'GPU'
        ]
        return gpu_list[0:1] or ["/CPU:0"]

    devices_ = devices or _get_default_devices()
    self.devices = devices_ if isinstance(devices_, list) else [devices_]
    self.partition_fn = partitioner
    self.name = name
    self.shared_name = shared_name or "shared_name.{}".format(name)

    self.initializer = None

    self.trainable = trainable
    self.checkpoint = checkpoint

    self._tables = []
    self.size_ops = []
    self.shard_num = len(self.devices)

    key_dtype_list = [dtypes.int32, dtypes.int64]
    value_dtype_list = [
        dtypes.int32, dtypes.int64, dtypes.bool, dtypes.float32,
        dtypes.float64, dtypes.half, dtypes.int8
    ]
    # Only the first device is inspected to decide whether the narrower
    # GPU-supported dtype sets apply.
    if 'GPU' in self.devices[0].upper():
        key_dtype_list = [dtypes.int64]
        value_dtype_list = [
            dtypes.int32, dtypes.float32, dtypes.half, dtypes.int8
        ]
    if key_dtype not in key_dtype_list:
        raise TypeError("key_dtype should be one of {}, but got {}".format(
            key_dtype_list, key_dtype))
    if value_dtype not in value_dtype_list:
        raise TypeError("value_dtype should be one of {}, but got {}".format(
            value_dtype_list, value_dtype))

    _initializer = initializer
    if _initializer is None:
        _initializer = init_ops.zeros_initializer(dtype=self.value_dtype)
    static_default_value = self._convert_anything_to_init(_initializer, dim)

    scope_name = self.name.split("/")[-1]
    with ops.name_scope(scope_name, "DynamicEmbedding_Variable"):
        with ops.colocate_with(None, ignore_existing=True):
            # One table shard per device, created under that device scope.
            for idx, device in enumerate(self.devices):
                with ops.device(device):
                    mht = lookup_ops.MutableHashTable(
                        key_dtype=self.key_dtype,
                        value_dtype=self.value_dtype,
                        default_value=static_default_value,
                        name=self._make_name(idx),
                        checkpoint=self.checkpoint)

                    self._tables.append(mht)
    super(Variable, self).__init__()

    self.trainable_wrappers = []
def __init__(self,
             max_tokens,
             num_oov_indices,
             mask_token,
             oov_token,
             vocabulary=None,
             invert=False,
             output_mode=INT,
             sparse=False,
             **kwargs):
    """Configure the lookup layer and create its backing hash table.

    Args:
      max_tokens: Optional cap on the total number of indices. When set, it
        must be greater than 1; the OOV and mask slots are subtracted from
        it to get the vocabulary size given to the combiner.
      num_oov_indices: Number of out-of-vocabulary indices. Must be greater
        than or equal to 0, and exactly 1 when `invert` is True.
      mask_token: Mask token, or None. When a mask token is set, the OOV
        indices start at 1 instead of 0.
      oov_token: OOV token; used as the table's default value when `invert`
        is True.
      vocabulary: Optional vocabulary; when not None it is passed to
        `set_vocabulary` at the end of construction.
      invert: If True, the table is keyed by the int64 output dtype and
        stores values of the layer dtype; otherwise the reverse.
      output_mode: One of INT, BINARY or COUNT.
      sparse: Stored on the layer as `sparse`.
      **kwargs: Forwarded unchanged to the base-class constructor.

    Raises:
      ValueError: If `max_tokens` is set but not greater than 1, if
        `num_oov_indices` is negative, or if `invert` is True while
        `num_oov_indices` is not 1. `validate_string_arg` is also called on
        `output_mode`.
    """
    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
        raise ValueError("If set, max_tokens must be greater than 1. "
                         "You passed %s" % (max_tokens,))

    # Zero OOV indices is accepted by this check; only negative counts are
    # rejected, and the message says so.
    if num_oov_indices < 0:
        raise ValueError(
            "`num_oov_indices` must be greater than or equal to 0. "
            "You passed %s" % (num_oov_indices,))

    if invert and num_oov_indices != 1:
        raise ValueError(
            "`num_oov_indices` must be 1 when `invert` is True.")

    # 'output_mode' must be one of (INT, BINARY, COUNT)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, BINARY, COUNT),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    self.invert = invert
    self.max_tokens = max_tokens
    self.num_oov_indices = num_oov_indices
    self.oov_token = oov_token
    self.mask_token = mask_token
    self.output_mode = output_mode
    self.sparse = sparse

    # If there is only one OOV bucket, we can determine the OOV value (either
    # 0 or 1 depending on whether 0 is reserved) and set that as the default
    # value of the index_lookup table. If we have multiple OOV values, we need
    # to do a further hashing step; to make this easier, we set the OOV value
    # to -1. (This lets us do a vectorized add and cast to boolean to
    # determine locations where we need to do extra hashing.)
    if self.num_oov_indices == 1:
        self._oov_value = 0 if mask_token is None else 1
    else:
        self._oov_value = -1

    # The combiner only sees the slots left over after the OOV and mask
    # indices have been taken out of max_tokens.
    if max_tokens is not None:
        num_mask_tokens = (0 if mask_token is None else 1)
        vocab_size = max_tokens - (num_oov_indices + num_mask_tokens)
    else:
        vocab_size = None

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(vocab_size, self.mask_token),
        **kwargs)

    self._output_dtype = dtypes.int64

    # We need to save the key dtype so that we know if we're expecting int64
    # keys. If we are, we will cast int32 inputs to int64 as well.
    if invert:
        self._key_dtype = self._output_dtype
        value_dtype = self.dtype
        oov_value = self.oov_token
    else:
        self._key_dtype = self.dtype
        value_dtype = self._output_dtype
        oov_value = self._oov_value

    self._table = lookup_ops.MutableHashTable(
        key_dtype=self._key_dtype,
        value_dtype=value_dtype,
        default_value=oov_value,
        name=(self._name + "_index_table"))
    tracked_table = self._add_trackable(self._table, trainable=False)
    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tensor_shape.TensorShape((0,))

    # Explicit OOV indices are only handed to the table handler when there is
    # more than one of them; otherwise the table's default value is used.
    if self.num_oov_indices <= 1:
        oov_indices = None
    else:
        oov_start = 1 if mask_token is not None else 0
        oov_end = oov_start + num_oov_indices
        oov_indices = list(range(oov_start, oov_end))

    self._table_handler = table_utils.TableHandler(
        table=self._table,
        oov_tokens=oov_indices,
        use_v1_apis=self._use_v1_apis())

    if vocabulary is not None:
        self.set_vocabulary(vocabulary)
def __init__(self,
             max_tokens,
             num_oov_indices,
             mask_token,
             oov_token,
             vocabulary=None,
             invert=False,
             output_mode=INT,
             sparse=False,
             pad_to_max_tokens=False,
             **kwargs):
    """Configure the lookup layer, its table, and optional TF-IDF weights.

    Args:
      max_tokens: Optional cap on the total number of indices. When set, it
        must be greater than 1. Overwritten from the table size when
        `vocabulary` is a `tf.lookup.TextFileInitializer`.
      num_oov_indices: Number of out-of-vocabulary indices; must be >= 0.
      mask_token: Mask token, or None. When set, it counts as one special
        token and OOV indices start at 1 instead of 0.
      oov_token: OOV token; given to the combiner and used as the table's
        default value when `invert` is True.
      vocabulary: None, a `tf.lookup.TextFileInitializer` (builds a static
        table), or any other value, which is passed to `set_vocabulary`.
      invert: If True, the table is keyed by int64 and stores values of the
        layer dtype; otherwise the reverse.
      output_mode: One of INT, BINARY, COUNT or TFIDF.
      sparse: Stored on the layer as `sparse`.
      pad_to_max_tokens: Stored on the layer; also selects the shape and
        initializer of the TF-IDF weights.
      **kwargs: Forwarded to the base-class constructor after the hidden
        `vocab_size` entry, if any, has been removed.

    Raises:
      ValueError: If `max_tokens` is set but not greater than 1, or if
        `num_oov_indices` is negative. `validate_string_arg` is also called
        on `output_mode`.
    """
    # A cap of 1 or less would leave a 0-element vocabulary, which doesn't
    # make sense.
    if max_tokens is not None and max_tokens <= 1:
        raise ValueError("If set, `max_tokens` must be greater than 1. "
                         "You passed {}".format(max_tokens))

    if num_oov_indices < 0:
        raise ValueError(
            "`num_oov_indices` must be greater than or equal to 0. "
            "You passed {}".format(num_oov_indices))

    # 'output_mode' must be one of (INT, BINARY, COUNT, TFIDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, BINARY, COUNT, TFIDF),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    self.invert = invert
    self.max_tokens = max_tokens
    self.num_oov_indices = num_oov_indices
    self.oov_token = oov_token
    self.mask_token = mask_token
    self.output_mode = output_mode
    self.sparse = sparse
    self.pad_to_max_tokens = pad_to_max_tokens
    self._called = False

    # Special tokens are the OOV indices plus, optionally, the mask token.
    mask_slots = 0 if self.mask_token is None else 1
    self._num_special_tokens = self.num_oov_indices + mask_slots

    # We need to keep track our current vocab size outside of our layer
    # weights to support a static output shape when `output_mode != INT`.
    # The bincount ops do not set shape on their outputs, which means we have
    # to set it ourselves. We persist the current vocab size as a hidden part
    # of the config when serializing our model.
    self._vocab_size = kwargs.pop("vocab_size", 0)

    # With exactly one OOV bucket the OOV value is known up front (0, or 1
    # when 0 is reserved for the mask) and becomes the table default. With
    # any other count a further hashing step is needed, so -1 is used as a
    # marker. (This lets us do a vectorized add and cast to boolean to
    # determine locations where we need to do extra hashing.)
    if self.num_oov_indices != 1:
        self._oov_value = -1
    elif mask_token is None:
        self._oov_value = 0
    else:
        self._oov_value = 1

    available_vocab_size = (
        None if max_tokens is None
        else max_tokens - self._num_special_tokens)

    combiner = _IndexLookupCombiner(
        vocab_size=available_vocab_size,
        mask_value=mask_token,
        oov_value=oov_token,
        compute_idf=(output_mode == TFIDF))
    super(IndexLookup, self).__init__(combiner=combiner, **kwargs)

    # We need to save the key dtype so that we know if we're expecting int64
    # keys. If we are, we will cast int32 inputs to int64 as well.
    oov_indices = None
    if invert:
        self._key_dtype = tf.int64
        self._value_dtype = self.dtype
        oov_value = self.oov_token
    else:
        self._key_dtype = self.dtype
        self._value_dtype = tf.int64
        oov_value = self._oov_value
        # Explicit OOV indices are only needed with more than one bucket.
        if self.num_oov_indices > 1:
            first_oov = 0 if mask_token is None else 1
            oov_indices = list(range(first_oov, first_oov + num_oov_indices))

    from_text_file = vocabulary is not None and isinstance(
        vocabulary, tf.lookup.TextFileInitializer)
    if from_text_file:
        # A file-backed vocabulary gets a static table, and max_tokens is
        # recomputed from that table's size.
        self._table = self._static_table_class()(
            vocabulary, default_value=oov_value)
        self._table_handler = table_utils.TableHandler(
            table=self._table,
            mask_token=mask_token,
            oov_tokens=oov_indices,
            use_v1_apis=self._use_v1_apis())
        self.max_tokens = (
            self._table_handler.table_size() + self.num_oov_indices +
            (0 if mask_token is None else 1))
    else:
        self._table = lookup_ops.MutableHashTable(
            key_dtype=self._key_dtype,
            value_dtype=self._value_dtype,
            default_value=oov_value,
            name=(self._name + "_index_table"))
        self._table_handler = table_utils.TableHandler(
            table=self._table,
            oov_tokens=oov_indices,
            use_v1_apis=self._use_v1_apis())
        if vocabulary is not None:
            self.set_vocabulary(vocabulary)

    if self.output_mode == TFIDF:
        # The TF-IDF weight may have a (None,) tensorshape. This creates
        # a 1D variable with arbitrary shape, which we can assign any weight
        # to so long as it has 1 dimension. In order to properly initialize
        # this weight in Keras, we need to provide a custom callable
        # initializer which does not depend on the shape of the weight (as
        # all other initializers do) since the weight is not known. Hence the
        # lambda shape, dtype: [0].
        if self.pad_to_max_tokens and max_tokens is not None:
            initializer = tf.compat.v1.zeros_initializer
        else:
            initializer = lambda shape, dtype: [0]

        # We are adding these here instead of in build() since they do not
        # depend on the input shape at all.
        if self.pad_to_max_tokens:
            idf_shape = (max_tokens,)
        else:
            idf_shape = (None,)
        self.tf_idf_weights = self._add_state_variable(
            name="idf",
            shape=tf.TensorShape(idf_shape),
            dtype=K.floatx(),
            initializer=initializer)

    tracked_table = self._add_trackable(self._table, trainable=False)
    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tf.TensorShape((0,))
def get_table(dtype=tf.string, oov_tokens=None):
    """Create a `TableHandler` wrapping a fresh `MutableHashTable`.

    Args:
      dtype: Key dtype of the table; values are always int64.
      oov_tokens: Passed through to the `TableHandler` as its second
        argument.

    Returns:
      A `table_utils.TableHandler` around a table named "index_table" whose
      default (missing-key) value is -7.
    """
    index_table = lookup_ops.MutableHashTable(
        name="index_table",
        default_value=-7,
        value_dtype=tf.int64,
        key_dtype=dtype)
    handler = table_utils.TableHandler(index_table, oov_tokens)
    return handler