def call(self, inputs):
    """Look up vocabulary indices for `inputs` and encode them per `output_mode`.

    Dense, sparse and ragged inputs are all routed through
    `self._lookup_dense`. With `INT` output the looked-up indices are returned
    directly (a scalar input yields a scalar output); every other mode is
    handed to `utils.encode_categorical_inputs`.
    """
    self._maybe_freeze_vocab_size()
    inputs = self._standardize_inputs(inputs, self._key_dtype)
    input_shape = inputs.shape
    scalar_input = input_shape.rank == 0

    # Some ops reject rank-0 tensors, so a scalar is temporarily upranked.
    if scalar_input:
        inputs = self._expand_dims(inputs, -1)

    if tf_utils.is_sparse(inputs):
        # Only the values are looked up; indices and dense shape are reused.
        indices = tf.SparseTensor(
            inputs.indices,
            self._lookup_dense(inputs.values),
            inputs.dense_shape,
        )
    elif tf_utils.is_ragged(inputs):
        indices = tf.ragged.map_flat_values(self._lookup_dense, inputs)
    else:
        indices = self._lookup_dense(inputs)

    if self.output_mode == INT:
        # Undo the temporary uprank so a scalar in gives a scalar out.
        return tf.squeeze(indices, -1) if scalar_input else indices

    if self.pad_to_max_tokens:
        depth = self.max_tokens
    else:
        depth = self._frozen_vocab_size

    # IDF weights are only passed along for TF-IDF output.
    if self.output_mode == TF_IDF:
        idf_weights = self.idf_weights_const
    else:
        idf_weights = None

    return utils.encode_categorical_inputs(
        indices,
        output_mode=self.output_mode,
        depth=depth,
        dtype=self.compute_dtype,
        sparse=self.sparse,
        idf_weights=idf_weights,
    )
def _num_tokens(self, data):
    """Return the unique tokens in `data` together with their counts.

    `data` may be sparse, ragged or dense; it is flattened to its values
    before `tf.unique_with_counts` is applied (counts requested as int64).

    Returns:
      A `(tokens, counts)` pair taken from the first and third outputs of
      `tf.unique_with_counts`.
    """
    if tf_utils.is_sparse(data):
        values = data.values
    elif tf_utils.is_ragged(data):
        values = data.flat_values
    else:
        # Dense tensors are flattened to rank 1.
        values = tf.reshape(data, [-1])

    unique_tokens, _unused_idx, occurrences = tf.unique_with_counts(
        values, out_idx=tf.int64)
    return unique_tokens, occurrences
def call(self, inputs):
    """Look up vocabulary indices for `inputs` and bincount them if requested.

    With `INT` output the looked-up indices are returned directly (a scalar
    input yields a scalar output). For the other modes the indices are
    bincounted to a depth of `max_tokens` (when padding) or the frozen
    vocabulary size, and multiplied by the IDF weights for `TF_IDF`.

    Raises:
      ValueError: if the lookups have rank greater than 2 in a non-`INT`
        output mode.
    """
    self._maybe_freeze_vocab_size()
    inputs = self._standardize_inputs(inputs, self._key_dtype)
    original_shape = inputs.shape
    scalar_input = original_shape.rank == 0

    # Some ops reject rank-0 tensors, so a scalar is temporarily upranked.
    if scalar_input:
        inputs = self._expand_dims(inputs, -1)

    if tf_utils.is_sparse(inputs):
        # Only the values are looked up; indices and dense shape are reused.
        lookups = tf.SparseTensor(
            inputs.indices,
            self._lookup_dense(inputs.values),
            inputs.dense_shape,
        )
    elif tf_utils.is_ragged(inputs):
        lookups = tf.ragged.map_flat_values(self._lookup_dense, inputs)
    else:
        lookups = self._lookup_dense(inputs)

    if self.output_mode == INT:
        # Undo the temporary uprank so a scalar in gives a scalar out.
        return tf.squeeze(lookups, -1) if scalar_input else lookups

    # One-hot upranks only when the final dimension is not already 1.
    if self.output_mode == ONE_HOT and lookups.shape[-1] != 1:
        lookups = self._expand_dims(lookups, -1)

    # TODO(b/190445202): remove output rank restriction.
    if lookups.shape.rank > 2:
        raise ValueError(
            "Received input shape {}, which would result in output rank {}. "
            "Currently only outputs up to rank 2 are supported for "
            "`output_mode={}`.".format(
                original_shape, lookups.shape.rank, self.output_mode))

    binary_output = self.output_mode in (MULTI_HOT, ONE_HOT)
    out_depth = (
        self.max_tokens if self.pad_to_max_tokens else self._frozen_vocab_size
    )

    if self.sparse:
        bincount_fn = category_encoding.sparse_bincount
    else:
        bincount_fn = category_encoding.dense_bincount
    counts = bincount_fn(lookups, out_depth, binary_output)

    if self.output_mode == TF_IDF:
        return tf.multiply(counts, self.idf_weights_const)
    return counts
def call(self, inputs):
    """Look up `inputs` in the table and return indices or bincounts.

    When `num_oov_indices` is 0 and the layer is not inverted, an assertion is
    attached that fails if any lookup result equals -1. The output ops are
    built under that assertion as a control dependency.

    Raises:
      ValueError: if neither `max_tokens` nor a vocabulary size is set.
    """
    if not (self.max_tokens or self._vocab_size is not None):
        raise ValueError(
            "You must set the layer's vocabulary before calling it. "
            "Either pass a `vocabulary` argument to the layer, or "
            "call `layer.adapt(dataset)` with some sample data.")
    self._called = True

    # int32 inputs are widened when the table is keyed on int64.
    if self._key_dtype == tf.int64 and inputs.dtype == tf.int32:
        inputs = tf.cast(inputs, tf.int64)
    lookup_result = self._table_handler.lookup(inputs)

    checks = []
    if self.num_oov_indices == 0 and not self.invert:
        # Compare on flat values so sparse, ragged and dense share one path.
        if tf_utils.is_sparse(inputs):
            found, queried = lookup_result.values, inputs.values
        elif tf_utils.is_ragged(inputs):
            found, queried = lookup_result.flat_values, inputs.flat_values
        else:
            found, queried = lookup_result, inputs

        # A lookup result of -1 is treated as out-of-vocabulary here.
        oov_positions = tf.where(tf.equal(found, -1))
        oov_values = tf.compat.v1.gather_nd(queried, oov_positions)
        message = tf.strings.format(
            "When `num_oov_indices=0` all inputs should be in vocabulary, "
            "found OOV values {}, consider setting `num_oov_indices=1`.",
            (oov_values, ))
        no_oov_found = tf.equal(tf.compat.v1.size(oov_positions), 0)
        checks.append(tf.Assert(no_oov_found, [message]))

    with tf.control_dependencies(checks):
        if self.output_mode == INT:
            return tf.identity(lookup_result)

        binary_output = self.output_mode == MULTI_HOT
        # Use the vocabulary size unless it is unset or padding is requested.
        out_depth = (
            self._vocab_size
            if self._vocab_size and not self.pad_to_max_tokens
            else self.max_tokens
        )

        if self.sparse:
            bincount_fn = category_encoding.sparse_bincount
        else:
            bincount_fn = category_encoding.dense_bincount
        counts = bincount_fn(lookup_result, out_depth, binary_output)

        if self.output_mode == TF_IDF:
            return tf.multiply(counts, self.tf_idf_weights)
        return counts
def call(self, inputs):
    """Replace each input value with its bucket index from `self.bin_boundaries`.

    Ragged and sparse inputs keep their structure; only their values are
    bucketized.
    """
    def to_bucket_ids(values):
        return tf.raw_ops.Bucketize(
            input=values, boundaries=self.bin_boundaries)

    if tf_utils.is_ragged(inputs):
        ragged_buckets = tf.ragged.map_flat_values(to_bucket_ids, inputs)
        # map_flat_values leaves the non-values tensors of the ragged
        # composite untouched. If this is the only op in a Keras model that
        # can cause errors in Graph mode, so the result is wrapped in an
        # identity.
        return tf.identity(ragged_buckets)

    if tf_utils.is_sparse(inputs):
        return tf.SparseTensor(
            indices=tf.identity(inputs.indices),
            values=to_bucket_ids(inputs.values),
            dense_shape=tf.identity(inputs.dense_shape),
        )

    return to_bucket_ids(inputs)
def call(self, inputs):
    """Look up `inputs` in the table and return indices or an encoded output.

    Lists, tuples and NumPy arrays are first converted to tensors. When
    `num_oov_indices` is 0 and the layer is not inverted, an assertion is
    attached that fails if any lookup result equals -1. The output is built
    under that assertion as a control dependency.

    Raises:
      ValueError: if neither `max_tokens` nor a vocabulary size is set.
    """
    if isinstance(inputs, (list, tuple, np.ndarray)):
        inputs = tf.convert_to_tensor(inputs)

    if not (self.max_tokens or self._vocab_size is not None):
        raise ValueError(
            "You must set the layer's vocabulary before calling it. "
            "Either pass a `vocabulary` argument to the layer, or "
            "call `layer.adapt(dataset)` with some sample data.")
    self._called = True

    # int32 inputs are widened when the table is keyed on int64.
    if self._key_dtype == tf.int64 and inputs.dtype == tf.int32:
        inputs = tf.cast(inputs, tf.int64)
    lookup_result = self._table_handler.lookup(inputs)

    checks = []
    if self.num_oov_indices == 0 and not self.invert:
        # Compare on flat values so sparse, ragged and dense share one path.
        if tf_utils.is_sparse(inputs):
            found, queried = lookup_result.values, inputs.values
        elif tf_utils.is_ragged(inputs):
            found, queried = lookup_result.flat_values, inputs.flat_values
        else:
            found, queried = lookup_result, inputs

        # tf.where needs rank > 0, so scalars are upranked for the check only.
        if queried.shape.rank == 0:
            queried = self._expand_dims(queried, -1)
            found = self._expand_dims(found, -1)

        # A lookup result of -1 is treated as out-of-vocabulary here.
        oov_positions = tf.where(tf.equal(found, -1))
        oov_values = tf.compat.v1.gather_nd(queried, oov_positions)
        message = tf.strings.format(
            "When `num_oov_indices=0` all inputs should be in vocabulary, "
            "found OOV values {}, consider setting `num_oov_indices=1`.",
            (oov_values, ))
        no_oov_found = tf.equal(tf.compat.v1.size(oov_positions), 0)
        checks.append(tf.Assert(no_oov_found, [message]))

    with tf.control_dependencies(checks):
        if self.output_mode != INT:
            return self._encode_output(lookup_result)
        return tf.identity(lookup_result)
def call(self, inputs):
    """Replace each input value with its int64 bucket index.

    Bucket edges come from `self.bin_boundaries`. Ragged and sparse inputs
    keep their structure; only their values are bucketized.
    """
    def to_bucket_ids(values):
        bucket_ids = tf.raw_ops.Bucketize(
            input=values, boundaries=self.bin_boundaries)
        # The underlying op only supports int32, while all other preprocessing
        # layers use int64 for int output, so the result is cast to conform.
        return tf.cast(bucket_ids, tf.int64)

    if tf_utils.is_ragged(inputs):
        ragged_buckets = tf.ragged.map_flat_values(to_bucket_ids, inputs)
        # map_flat_values leaves the non-values tensors of the ragged
        # composite untouched. If this is the only op in a Keras model that
        # can cause errors in Graph mode, so the result is wrapped in an
        # identity.
        return tf.identity(ragged_buckets)

    if tf_utils.is_sparse(inputs):
        return tf.SparseTensor(
            indices=tf.identity(inputs.indices),
            values=to_bucket_ids(inputs.values),
            dense_shape=tf.identity(inputs.dense_shape),
        )

    return to_bucket_ids(inputs)
def call(self, inputs):
    """Bucketize `inputs` and encode the bucket indices per `output_mode`.

    Values are bucketized against `self.bin_boundaries`, keeping ragged or
    sparse structure. The indices are then passed to
    `utils.encode_categorical_inputs` with a depth of
    `len(self.bin_boundaries) + 1`.
    """
    def to_bucket_ids(values):
        return tf.raw_ops.Bucketize(
            input=values, boundaries=self.bin_boundaries)

    if tf_utils.is_ragged(inputs):
        bucket_ids = tf.ragged.map_flat_values(to_bucket_ids, inputs)
    elif tf_utils.is_sparse(inputs):
        bucket_ids = tf.SparseTensor(
            indices=tf.identity(inputs.indices),
            values=to_bucket_ids(inputs.values),
            dense_shape=tf.identity(inputs.dense_shape),
        )
    else:
        bucket_ids = to_bucket_ids(inputs)

    mode = self.output_mode
    # One more bucket than there are boundaries.
    num_buckets = len(self.bin_boundaries) + 1
    return utils.encode_categorical_inputs(
        bucket_ids,
        output_mode=mode,
        depth=num_buckets,
        sparse=self.sparse,
        dtype=self.compute_dtype,
    )
def expand_dims(inputs, axis):
    """Insert a new dimension at `axis`, using the sparse op for sparse inputs.

    Sparse inputs go through `tf.sparse.expand_dims`; everything else goes
    through `tf.compat.v1.expand_dims`.
    """
    if not tf_utils.is_sparse(inputs):
        return tf.compat.v1.expand_dims(inputs, axis)
    return tf.sparse.expand_dims(inputs, axis)
def test_is_sparse_return_false_for_list(self):
    """A plain Python list of floats must not be reported as sparse."""
    plain_list = [1., 2., 3.]
    reported_sparse = tf_utils.is_sparse(plain_list)
    self.assertFalse(reported_sparse)
def test_is_sparse_return_true_for_sparse_tensor_value(self):
    """A `SparseTensorValue` must be reported as sparse."""
    sparse_value = tf.compat.v1.SparseTensorValue(
        indices=[[0, 0], [1, 2]],
        values=[1, 2],
        dense_shape=[3, 4],
    )
    reported_sparse = tf_utils.is_sparse(sparse_value)
    self.assertTrue(reported_sparse)
def _expand_dims(self, inputs, axis):
    """Insert a new dimension at `axis`, using the sparse op for sparse inputs."""
    # tf.sparse.expand_dims handles sparse inputs; tf.expand_dims the rest.
    if tf_utils.is_sparse(inputs):
        expand_fn = tf.sparse.expand_dims
    else:
        expand_fn = tf.expand_dims
    return expand_fn(inputs, axis)
def expand_dims(inputs, axis):
    """Insert a new dimension at `axis` of a sparse or non-sparse tensor.

    Sparse inputs go through `tf.sparse.expand_dims`; every other input is
    passed to `tf.expand_dims`.
    """
    if not tf_utils.is_sparse(inputs):
        return tf.expand_dims(inputs, axis)
    return tf.sparse.expand_dims(inputs, axis)