def _set_inverse_vocabulary(self, vocab):
    """Fills the lookup table with an index -> token mapping.

    This is the inverse-mode counterpart of the forward vocabulary setter.
    When the layer has a mask token and `vocab` does not already start with
    it, the mask token is written at index 0 and every entry of `vocab` is
    shifted up accordingly; otherwise `vocab` is written starting at index 0.

    Arguments:
      vocab: The tokens to store. It is first checked by
        `table_utils.validate_vocabulary_is_unique`.

    Returns:
      The total vocabulary size, counting a mask token inserted by this
      method.

    Raises:
      ValueError: If the mask token appears among the non-reserved entries
        or the total vocabulary size exceeds `max_tokens`.
    """
    table_utils.validate_vocabulary_is_unique(vocab)

    mask_expected = self.mask_token is not None
    mask_present = vocab[0] == self.mask_token
    prepend_mask = mask_expected and not mask_present

    reserved = [self.mask_token] if mask_expected else []
    reserved_count = len(reserved)

    # Entries that are not allowed to contain the reserved mask token: the
    # whole vocabulary when the mask still has to be inserted, otherwise
    # everything after the reserved prefix.
    if prepend_mask:
        regular_tokens = vocab
    else:
        regular_tokens = vocab[reserved_count:]

    if self.mask_token in regular_tokens:
        raise ValueError(
            "Reserved mask token %s was found in the passed "
            "vocabulary at index %s. Please either remove the "
            "reserved token from the vocabulary or change the "
            "mask token for this layer." %
            (self.mask_token, regular_tokens.index(self.mask_token)))

    # The offset of the first passed token is also the number of entries
    # this method adds on top of `vocab`.
    first_index = reserved_count if prepend_mask else 0
    total_vocab_size = len(vocab) + first_index

    if self.max_tokens is not None and total_vocab_size > self.max_tokens:
        raise ValueError(
            "Attempted to set a vocabulary larger than the maximum vocab size. "
            "Passed vocab size is %s, max vocab size is %s." %
            (total_vocab_size, self.max_tokens))

    indices = np.arange(first_index, total_vocab_size, dtype=np.int64)
    self._table_handler.clear()
    self._table_handler.insert(indices, vocab)

    if prepend_mask and reserved_count > 0:
        self._table_handler.insert(
            np.arange(reserved_count, dtype=np.int64), reserved)

    return total_vocab_size
def __init__(self,
             max_tokens=None,
             num_oov_tokens=1,
             vocabulary=None,
             reserve_zero=True,
             mask_zero=False,
             **kwargs):
    """Configures the layer and creates its backing hash table.

    Arguments:
      max_tokens: Optional maximum vocabulary size, OOV buckets included.
        Must be greater than 1 when set.
      num_oov_tokens: Number of out-of-vocabulary buckets. Must be greater
        than or equal to 0.
      vocabulary: Optional initial vocabulary. A `str` is passed to
        `table_utils.get_vocabulary_from_file`; anything else is used as
        the vocabulary itself.
      reserve_zero: Whether output value 0 is reserved. When True, OOV
        values start at 1 instead of 0.
      mask_zero: Stored on the layer as `self.mask_zero`.
      **kwargs: Forwarded to the base class constructor. `dtype`, if
        present, must be string, int32 or int64; it defaults to string.

    Raises:
      ValueError: If `dtype` is not an allowed type, `max_tokens` is set
        but not greater than 1, or `num_oov_tokens` is negative.
    """
    # Inversion is fixed to False in this constructor, so the layer always
    # accepts string or integer keys and defaults to string input.
    allowed_dtypes = [dtypes.string, dtypes.int32, dtypes.int64]

    if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
        raise ValueError("IndexLookup may only have a dtype in %s." %
                         allowed_dtypes)

    if "dtype" not in kwargs:
        kwargs["dtype"] = dtypes.string

    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
        raise ValueError("If set, max_tokens must be greater than 1.")

    if num_oov_tokens < 0:
        raise ValueError(
            "num_oov_tokens must be greater than or equal to 0. "
            "You passed %s" % num_oov_tokens)

    self.invert = False
    self.max_tokens = max_tokens
    self.num_oov_tokens = num_oov_tokens
    self.reserve_zero = reserve_zero
    self.mask_zero = mask_zero

    # We need to reserve at least num_oov_tokens tokens, plus one additional
    # value if we are reserving the zero value in our output.
    if reserve_zero:
        self._reserved_values = num_oov_tokens + 1
    else:
        self._reserved_values = num_oov_tokens

    # We need to account for the OOV buckets in our vocabulary size.
    if max_tokens is not None:
        self._max_elements = max_tokens - num_oov_tokens
    else:
        self._max_elements = None

    # If there is only one OOV bucket, we can determine the OOV value (either
    # 0 or 1 depending on whether 0 is reserved) and set that as the default
    # value of the index_lookup table. If we have multiple OOV values, we need
    # to do a further hashing step; to make this easier, we set the OOV value
    # to -1. (This lets us do a vectorized add and cast to boolean to
    # determine locations where we need to do extra hashing.)
    if self.num_oov_tokens == 1:
        self._oov_value = 1 if reserve_zero else 0
    else:
        self._oov_value = -1

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(self.max_tokens), **kwargs)

    # If the layer's input type is int32, we can only output int32 values -
    # MutableHashTable doesn't allow us to map int32->int64.
    if self.dtype == dtypes.int32:
        self._output_dtype = dtypes.int32
    else:
        self._output_dtype = dtypes.int64

    self._table = lookup_ops.MutableHashTable(
        key_dtype=self.dtype,
        value_dtype=self._output_dtype,
        default_value=self._oov_value,
        name=(self._name + "_index_table"))
    tracked_table = self._add_trackable(self._table, trainable=False)
    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tensor_shape.TensorShape((0,))

    # Explicit OOV bucket values are only handed to the table handler when
    # there is more than one bucket.
    if self.num_oov_tokens <= 1:
        oov_tokens = None
    else:
        oov_start = 1 if reserve_zero else 0
        oov_tokens = list(range(oov_start, self._reserved_values))

    self._table_handler = table_utils.TableHandler(
        table=self._table,
        oov_tokens=oov_tokens,
        use_v1_apis=self._use_v1_apis())

    if vocabulary is not None:
        if isinstance(vocabulary, str):
            vocabulary = table_utils.get_vocabulary_from_file(vocabulary)
        table_utils.validate_vocabulary_is_unique(vocabulary)
        self.set_vocabulary(vocabulary)
def _set_forward_vocabulary(self, vocab):
    """Sets vocabulary data for this layer when inverse is False.

    Writes a token -> index mapping into the lookup table. The reserved
    tokens (the mask token, if the layer has one, followed by
    `num_oov_indices` copies of the OOV token) occupy the lowest indices.
    If `vocab` already starts with them it is written as-is from index 0;
    if it contains none of them they are inserted in front and `vocab` is
    shifted up.

    Arguments:
      vocab: The tokens to store. It is first checked by
        `table_utils.validate_vocabulary_is_unique`.

    Raises:
      ValueError: If `vocab` contains only one of the expected mask/OOV
        prefixes, a reserved token appears among the regular tokens, or
        the total vocabulary size exceeds `max_tokens`.
    """
    table_utils.validate_vocabulary_is_unique(vocab)

    should_have_mask = self.mask_token is not None
    has_mask = vocab[0] == self.mask_token
    oov_start = 1 if should_have_mask else 0

    should_have_oov = (self.num_oov_indices > 0) and not self.invert
    if should_have_oov:
        oov_end = oov_start + self.num_oov_indices
        expected_oov = [self.oov_token] * self.num_oov_indices
        # Compare as lists so the result is always a plain bool that is True
        # only when the whole OOV block matches, even if `vocab` is a numpy
        # array (whose `==` would otherwise be elementwise).
        has_oov = list(vocab[oov_start:oov_end]) == expected_oov
    else:
        has_oov = False

    if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
        raise ValueError(
            "Invalid vocabulary format. The layer was created with "
            "`mask_token=%s` and `oov_token=%s`. These tokens should"
            " be included in the provided vocabulary. "
            "The passed vocabulary has the correct mask token `%s` "
            "at index 0, but does not have the OOV token `%s` in "
            "indices [%s:%s]. Instead, we found `%s`. Was this "
            "vocabulary generated by a layer with incompatible "
            "settings?" %
            (self.mask_token, self.oov_token, self.mask_token,
             self.oov_token, oov_start, oov_end, vocab[oov_start:oov_end]))

    if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
        raise ValueError(
            "Invalid vocabulary format. The layer was created with "
            "`mask_token=%s` and `oov_token=%s`. These tokens should "
            "be included in the provided vocabulary. "
            "The passed vocabulary has the correct OOV token `%s` at "
            "indices [%s:%s], but does not have the mask token `%s` in "
            "index 0. Instead, we found `%s`. Was this vocabulary "
            "generated by a layer with incompatible settings?" %
            (self.mask_token, self.oov_token, self.oov_token, oov_start,
             oov_end, self.mask_token, vocab[0]))

    insert_special_tokens = not has_oov and not has_mask

    special_tokens = [] if self.mask_token is None else [self.mask_token]
    special_tokens.extend([self.oov_token] * self.num_oov_indices)

    num_special_tokens = len(special_tokens)
    tokens = vocab if insert_special_tokens else vocab[num_special_tokens:]
    if self.mask_token in tokens:
        # list() keeps the index lookup working for numpy input, which has
        # no .index() method.
        raise ValueError(
            "Reserved mask token %s was found in the passed "
            "vocabulary at index %s. Please either remove the "
            "reserved token from the vocabulary or change the "
            "mask token for this layer." %
            (self.mask_token, list(tokens).index(self.mask_token)))
    if self.oov_token in tokens:
        raise ValueError(
            "Reserved OOV token %s was found in the passed "
            "vocabulary at index %s. Please either remove the "
            "reserved token from the vocabulary or change the "
            "OOV token for this layer." %
            (self.oov_token, list(tokens).index(self.oov_token)))

    if insert_special_tokens:
        total_vocab_size = len(vocab) + num_special_tokens
    else:
        total_vocab_size = len(vocab)
    if self.max_tokens is not None and total_vocab_size > self.max_tokens:
        raise ValueError(
            "Attempted to set a vocabulary larger than the maximum vocab size. "
            "Passed vocab size is %s, max vocab size is %s." %
            (total_vocab_size, self.max_tokens))

    # Only shift the passed tokens past the reserved range when the reserved
    # tokens are being inserted here. A vocabulary that already carries them
    # must start at index 0 so the mask/OOV tokens land on their own indices.
    start_index = num_special_tokens if insert_special_tokens else 0
    values = np.arange(start_index, len(vocab) + start_index, dtype=np.int64)

    self._table_handler.clear()
    self._table_handler.insert(vocab, values)

    if insert_special_tokens and num_special_tokens > 0:
        special_token_values = np.arange(num_special_tokens, dtype=np.int64)
        self._table_handler.insert(special_tokens, special_token_values)
def _set_forward_vocabulary(self, vocab, idf_weights=None):
    """Sets vocabulary data for this layer when inverse is False.

    Writes a token -> index mapping into the lookup table and, when the
    layer's `output_mode` is TFIDF, stores the matching idf weights. The
    reserved tokens (the mask token, if any, followed by `num_oov_indices`
    copies of the OOV token) occupy the lowest indices: they are inserted
    in front of `vocab` unless `vocab` already starts with them.

    All validation, including that of `idf_weights`, happens before the
    table is cleared, so a rejected call does not modify the table.

    Arguments:
      vocab: The tokens to store. It is first checked by
        `table_utils.validate_vocabulary_is_unique`.
      idf_weights: One weight per entry of `vocab`. Required when
        `output_mode` is TFIDF and ignored otherwise.

    Returns:
      The total vocabulary size, counting reserved tokens inserted by this
      method.

    Raises:
      ValueError: If `vocab` contains only one of the expected mask/OOV
        prefixes, a reserved token appears among the regular tokens, the
        total vocabulary size exceeds `max_tokens`, or `idf_weights` is
        missing, of the wrong length, or not one-dimensional in TFIDF mode.
    """
    table_utils.validate_vocabulary_is_unique(vocab)

    should_have_mask = self.mask_token is not None
    has_mask = vocab[0] == self.mask_token
    oov_start = 1 if should_have_mask else 0

    should_have_oov = (self.num_oov_indices > 0) and not self.invert
    if should_have_oov:
        oov_end = oov_start + self.num_oov_indices
        expected_oov = [self.oov_token] * self.num_oov_indices
        # Compare as lists so the result is always a plain bool that is True
        # only when the whole OOV block matches, even if `vocab` is a numpy
        # array (whose `==` would otherwise be elementwise).
        has_oov = list(vocab[oov_start:oov_end]) == expected_oov
    else:
        has_oov = False

    if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
        raise ValueError(
            "Invalid vocabulary format. The layer was created with "
            "`mask_token=%s` and `oov_token=%s`. These tokens should"
            " be included in the provided vocabulary. "
            "The passed vocabulary has the correct mask token `%s` "
            "at index 0, but does not have the OOV token `%s` in "
            "indices [%s:%s]. Instead, we found `%s`. Was this "
            "vocabulary generated by a layer with incompatible "
            "settings?" %
            (self.mask_token, self.oov_token, self.mask_token,
             self.oov_token, oov_start, oov_end, vocab[oov_start:oov_end]))

    if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
        raise ValueError(
            "Invalid vocabulary format. The layer was created with "
            "`mask_token=%s` and `oov_token=%s`. These tokens should "
            "be included in the provided vocabulary. "
            "The passed vocabulary has the correct OOV token `%s` at "
            "indices [%s:%s], but does not have the mask token `%s` in "
            "index 0. Instead, we found `%s`. Was this vocabulary "
            "generated by a layer with incompatible settings?" %
            (self.mask_token, self.oov_token, self.oov_token, oov_start,
             oov_end, self.mask_token, vocab[0]))

    special_tokens = [] if self.mask_token is None else [self.mask_token]
    special_tokens.extend([self.oov_token] * self.num_oov_indices)

    # Nothing is inserted when the layer has no reserved tokens at all.
    insert_special_tokens = (
        bool(special_tokens) and not has_oov and not has_mask)

    num_special_tokens = len(special_tokens)
    tokens = vocab if insert_special_tokens else vocab[num_special_tokens:]
    if self.mask_token in tokens:
        # list() keeps the index lookup working for numpy input, which has
        # no .index() method.
        raise ValueError(
            "Reserved mask token %s was found in the passed "
            "vocabulary at index %s. Please either remove the "
            "reserved token from the vocabulary or change the "
            "mask token for this layer." %
            (self.mask_token, list(tokens).index(self.mask_token)))
    if self.oov_token in tokens:
        raise ValueError(
            "Reserved OOV token %s was found in the passed "
            "vocabulary at index %s. Please either remove the "
            "reserved token from the vocabulary or change the "
            "OOV token for this layer." %
            (self.oov_token, list(tokens).index(self.oov_token)))

    total_vocab_size = len(tokens) + num_special_tokens
    if self.max_tokens is not None and total_vocab_size > self.max_tokens:
        raise ValueError(
            "Attempted to set a vocabulary larger than the maximum vocab size. "
            "Passed vocab size is %s, max vocab size is %s." %
            (total_vocab_size, self.max_tokens))

    # Validate the idf weights before the table is touched so a bad call
    # cannot leave a new vocabulary paired with stale weights.
    uses_tfidf = self.output_mode == TFIDF
    if uses_tfidf:
        if idf_weights is None:
            raise ValueError("idf_weights must be set if output_mode is TFIDF")
        if len(vocab) != len(idf_weights):
            raise ValueError(
                "idf_weights must be the same length as vocab. "
                "len(idf_weights) is %s, len(vocab) is %s" %
                (len(idf_weights), len(vocab)))
        idf_weights = self._convert_to_ndarray(idf_weights)
        if idf_weights.ndim != 1:
            raise ValueError(
                "TF-IDF data must be a 1-index array, but received {}".format(
                    type(idf_weights)))

    self._table_handler.clear()
    if insert_special_tokens:
        start_index = num_special_tokens
        values = np.arange(
            start_index, len(tokens) + start_index, dtype=np.int64)
        self._table_handler.insert(tokens, values)

        special_token_values = np.arange(num_special_tokens, dtype=np.int64)
        self._table_handler.insert(special_tokens, special_token_values)
    else:
        # `vocab` already carries the reserved tokens, so it maps straight
        # onto indices starting at 0.
        values = np.arange(len(vocab), dtype=np.int64)
        self._table_handler.insert(vocab, values)

    if uses_tfidf:
        # If we inserted special tokens into the vocab, we need to pad the
        # front of idf_weights. We don't have real document frequencies for
        # these tokens so we will use an average of all idf_weights passed in
        # as a reasonable default.
        if insert_special_tokens:
            front_padding = num_special_tokens
            front_padding_value = np.average(idf_weights)
        else:
            front_padding = 0
            front_padding_value = 0

        # If pad_to_max_tokens is true, and max_tokens is greater than our
        # total vocab size, we need to pad the back of idf_weights with zeros
        # as well.
        back_padding_value = 0
        if self.pad_to_max_tokens and self.max_tokens is not None:
            back_padding = self.max_tokens - total_vocab_size
        else:
            back_padding = 0

        idf_weights = np.pad(
            idf_weights, (front_padding, back_padding),
            "constant",
            constant_values=(front_padding_value, back_padding_value))
        K.set_value(self.tf_idf_weights, idf_weights)

    return total_vocab_size
def set_vocabulary(self, vocab):
    """Sets vocabulary data for this layer.

    This method sets the vocabulary for this layer directly, instead of
    analyzing a dataset through 'adapt'. It should be used whenever the vocab
    information is already known. If vocabulary data is already present in
    the layer, this method will replace it.

    The reserved tokens (the mask token, if the layer has one, followed by
    `num_oov_indices` copies of the OOV token) occupy the lowest indices.
    If `vocab` already starts with them it is written as-is from index 0;
    if it contains none of them they are inserted in front and `vocab` is
    shifted up.

    Arguments:
      vocab: An array of string tokens.

    Raises:
      ValueError: If `vocab` contains only one of the expected mask/OOV
        prefixes, a reserved token appears among the regular tokens, or
        the total vocabulary size exceeds `max_tokens`.
    """
    table_utils.validate_vocabulary_is_unique(vocab)

    should_have_mask = self.mask_token is not None
    if should_have_mask:
        has_mask = vocab[0] == self.mask_token
        oov_start = 1
    else:
        has_mask = False
        oov_start = 0

    should_have_oov = self.num_oov_indices > 0
    if should_have_oov:
        oov_end = oov_start + self.num_oov_indices
        expected_oov = [self.oov_token] * self.num_oov_indices
        # Compare as lists so the result is always a plain bool that is True
        # only when the whole OOV block matches, even if `vocab` is a numpy
        # array (whose `==` would otherwise be elementwise).
        has_oov = list(vocab[oov_start:oov_end]) == expected_oov
    else:
        has_oov = False

    if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
        raise ValueError(
            "The passed vocabulary has the correct mask token `%s` "
            "at index 0, but does not have the OOV token `%s` in "
            "indices [%s:%s]. Instead, we found `%s`. Was this "
            "vocabulary generated by a layer with incompatible "
            "settings?" %
            (self.mask_token, self.oov_token, oov_start, oov_end,
             vocab[oov_start:oov_end]))

    if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
        raise ValueError(
            "The passed vocabulary has the correct OOV token `%s` at "
            "indices [%s:%s], but does not have the mask token `%s` in "
            "index 0. Instead, we found `%s`. Was this vocabulary "
            "generated by a layer with incompatible settings?" %
            (self.oov_token, oov_start, oov_end, self.mask_token, vocab[0]))

    insert_special_tokens = not has_oov and not has_mask

    special_tokens = [] if self.mask_token is None else [self.mask_token]
    special_tokens.extend([self.oov_token] * self.num_oov_indices)

    num_special_tokens = len(special_tokens)
    tokens = vocab if insert_special_tokens else vocab[num_special_tokens:]
    if self.mask_token in tokens:
        # list() keeps the index lookup working for numpy input, which has
        # no .index() method.
        raise ValueError(
            "Reserved mask token %s was found in the passed "
            "vocabulary at index %s. Please either remove the "
            "reserved token from the vocabulary or change the "
            "mask token for this layer." %
            (self.mask_token, list(tokens).index(self.mask_token)))
    if self.oov_token in tokens:
        raise ValueError(
            "Reserved OOV token %s was found in the passed "
            "vocabulary at index %s. Please either remove the "
            "reserved token from the vocabulary or change the "
            "OOV token for this layer." %
            (self.oov_token, list(tokens).index(self.oov_token)))

    if insert_special_tokens:
        total_vocab_size = len(vocab) + num_special_tokens
    else:
        total_vocab_size = len(vocab)
    if self.max_tokens is not None and total_vocab_size > self.max_tokens:
        raise ValueError(
            "Attempted to set a vocabulary larger than the maximum vocab size. "
            "Passed vocab size is %s, max vocab size is %s." %
            (total_vocab_size, self.max_tokens))

    # Only shift the passed tokens past the reserved range when the reserved
    # tokens are being inserted here. A vocabulary that already carries them
    # must start at index 0 so the mask/OOV tokens land on their own indices.
    start_index = num_special_tokens if insert_special_tokens else 0
    values = np.arange(start_index, len(vocab) + start_index, dtype=np.int64)

    self._table_handler.clear()
    self._table_handler.insert(vocab, values)

    if insert_special_tokens and num_special_tokens > 0:
        special_token_values = np.arange(num_special_tokens, dtype=np.int64)
        self._table_handler.insert(special_tokens, special_token_values)