import math
from typing import Dict, Optional

import numpy as np
import tensorflow as tf


def unrag_tensor(x: tf.RaggedTensor, max_size: int, axis: int) -> tf.Tensor:
    """Converts a ragged tensor to a full tensor by padding to a maximum size.

    This function is useful for converting ragged tensors to a fixed size when
    one or more of the dimensions are of variable length.

    Args:
        x: Ragged tensor to convert.
        max_size: Maximum size of the axis to pad.
        axis: Axis of `x` to pad to `max_size`. This must specify ragged
            dimensions. If more than one axis is specified, `max_size` must be
            of the same length as `axis`.

    Returns:
        A padded version of `x`. Padding will use the equivalent of NaNs in the
        tensor's native dtype. This will replace the shape of the specified
        `axis` with `max_size`, leaving the remaining dimensions set to the
        bounding shape of the ragged tensor.
    """
    bounding_shape = x.bounding_shape()
    axis = tf.cast(axis, tf.int64)
    axis = axis % len(x.shape)  # Handle negative indices.
    axis = tf.reshape(axis, [-1, 1])  # Ensure (n, 1) shape for indexing.
    max_size = tf.cast(max_size, bounding_shape.dtype)
    max_size = tf.reshape(max_size, [-1])  # Ensure (n,) shape for indexing.
    shape = tf.tensor_scatter_nd_update(bounding_shape, axis, max_size)
    return x.to_tensor(default_value=tf.cast(np.nan, x.dtype), shape=shape)
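
# A minimal usage sketch for `unrag_tensor` (illustrative values, assuming
# eager execution; not from the original source).
def _example_unrag_tensor() -> tf.Tensor:
    rt = tf.ragged.constant([[1.0, 2.0, 3.0], [4.0]])
    # Bounding shape is [2, 3]; padding axis 1 to 4 yields a dense [2, 4]
    # tensor with NaN in the padded positions.
    return unrag_tensor(rt, max_size=4, axis=1)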

# `prensor` and `calculate_options` below come from the struct2tensor package.
def _ragged_as_leaf_node(
        ragged_tensor: tf.RaggedTensor, is_repeated: bool,
        reference_ragged_tensor: tf.RaggedTensor,
        options: calculate_options.Options) -> prensor.LeafNodeTensor:
    """Creates a ragged tensor as a leaf node."""
    assertions = []
    size_dim = tf.compat.dimension_at_index(ragged_tensor.shape, 0).value
    reference_size_dim = tf.compat.dimension_at_index(
        reference_ragged_tensor.shape, 0).value
    if size_dim is not None and reference_size_dim is not None:
        # Both outer dimensions are statically known: compare them directly.
        if size_dim != reference_size_dim:
            raise ValueError("Returned ragged tensor is not the right size.")
    elif options.ragged_checks:
        # Otherwise fall back to a runtime check on the number of rows.
        assertions.append(
            tf.assert_equal(ragged_tensor.nrows(),
                            reference_ragged_tensor.nrows()))

    if not is_repeated:
        rowids = ragged_tensor.value_rowids()
        if options.ragged_checks:
            # A non-repeated field has at most one value per row, so the row
            # ids must be strictly increasing.
            assertions.append(
                tf.compat.v1.assert_positive(rowids[1:] - rowids[:-1]))
    if assertions:
        # Compute the parent index inside the control-dependency scope so the
        # runtime checks execute before the values are used.
        with tf.control_dependencies(assertions):
            parent_index = ragged_tensor.value_rowids()
            return prensor.LeafNodeTensor(parent_index, ragged_tensor.values,
                                          is_repeated)
    else:
        parent_index = ragged_tensor.value_rowids()
        return prensor.LeafNodeTensor(parent_index, ragged_tensor.values,
                                      is_repeated)
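
# Standalone sketch of the decomposition `_ragged_as_leaf_node` relies on
# (illustrative values; no struct2tensor objects involved).
def _example_rowids_values():
    rt = tf.ragged.constant([[7, 8], [], [9]])
    # value_rowids() maps each flat value to its row: [0, 0, 2];
    # values is the flat value tensor: [7, 8, 9].
    return rt.value_rowids(), rt.values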

def call(self, texts: tf.Tensor,
         substrings: tf.RaggedTensor) -> tf.RaggedTensor:
    # Strip special characters and collapse runs of whitespace in both the
    # texts and the substrings so the two are normalized identically.
    texts = tf.strings.regex_replace(
        texts, pattern=self.special_chars, rewrite='')
    texts = tf.strings.strip(
        tf.strings.regex_replace(texts, pattern=r'\s{2,}', rewrite=' '))
    substrings = tf.strings.regex_replace(
        substrings, pattern=self.special_chars, rewrite='')
    substrings = tf.strings.strip(
        tf.strings.regex_replace(substrings, pattern=r'\s{2,}', rewrite=' '))
    # Anchors requiring the substring to match on whitespace boundaries.
    pre = r'.*(\s|^)'
    post = r'(\s|$).*'
    # Repeat each text once per substring in its row so that
    # `map_flat_values` sees aligned (text, pattern) pairs.
    ragged_texts = tf.RaggedTensor.from_row_lengths(
        values=tf.repeat(texts, repeats=substrings.row_lengths()),
        row_lengths=substrings.row_lengths())
    return tf.ragged.map_flat_values(
        self.find_match, ragged_texts,
        tf.strings.join([pre, substrings, post]))
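
# Sketch of the text/substring alignment used in `call` above (illustrative
# values; `special_chars` and `find_match` belong to the enclosing layer).
def _example_repeat_alignment() -> tf.RaggedTensor:
    texts = tf.constant(["a b c", "x y"])
    substrings = tf.ragged.constant([["a", "c"], ["y"]])
    # Each text is repeated once per substring in its row:
    # [["a b c", "a b c"], ["x y"]].
    return tf.RaggedTensor.from_row_lengths(
        values=tf.repeat(texts, repeats=substrings.row_lengths()),
        row_lengths=substrings.row_lengths())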

def update_state(self, y_true: tf.RaggedTensor, y_pred: tf.RaggedTensor,
                 sample_weight: Optional[tf.Tensor] = None) -> None:
    """Computes the normalized edit distance between two RaggedTensors."""
    assert isinstance(y_true, tf.RaggedTensor) and isinstance(
        y_pred, tf.RaggedTensor)
    # `tf.edit_distance` operates on sparse tensors; `normalize=True` divides
    # each distance by the length of the true sequence.
    edit_distances = tf.edit_distance(
        y_pred.to_sparse(), y_true.to_sparse(), normalize=True)
    return super().update_state(edit_distances, sample_weight)
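
# Standalone sketch of the distance computation (illustrative values).
def _example_edit_distance() -> tf.Tensor:
    y_true = tf.ragged.constant([[1, 2, 3], [4, 5]])
    y_pred = tf.ragged.constant([[1, 2], [4, 5]])
    # One deletion out of three true tokens, then an exact match: [1/3, 0].
    return tf.edit_distance(
        y_pred.to_sparse(), y_true.to_sparse(), normalize=True)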

# `settings` is an application-specific module; `settings.settings.sequence_length`
# is the fixed sequence length used for padding and truncation.
def pad_sequence_left(sequences_batch: tf.RaggedTensor,
                      mask: bool) -> tf.Tensor:
    """Pads sequences with zeros on the left side."""
    # Truncate rows to keep at most the last `sequence_length` items.
    sequences_batch = sequences_batch[:, -settings.settings.sequence_length:]
    if mask:
        # Add one to indices, to reserve the 0 index for padding.
        sequences_batch += 1
    pad_row_lengths = (settings.settings.sequence_length -
                       sequences_batch.row_lengths())
    pad_values = tf.zeros(
        [(settings.settings.sequence_length * sequences_batch.nrows()) -
         tf.size(sequences_batch, tf.int64)], sequences_batch.dtype)
    padding = tf.RaggedTensor.from_row_lengths(pad_values, pad_row_lengths)
    return tf.concat([padding, sequences_batch], axis=1).to_tensor()
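
# Usage sketch (assumption: settings.settings.sequence_length == 4; values
# are illustrative).
def _example_pad_sequence_left() -> tf.Tensor:
    batch = tf.ragged.constant([[5, 6], [7]], dtype=tf.int64)
    # With mask=True the indices shift to [[6, 7], [8]] and zeros fill the
    # left: [[0, 0, 6, 7], [0, 0, 0, 8]].
    return pad_sequence_left(batch, mask=True)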

def raged_lists_batch_to_multihot(ragged_lists_batch: tf.RaggedTensor,
                                  multihot_dim: int) -> tf.Tensor:
    """Maps a batch of label indices to a batch of multi-hot vectors."""
    # TODO: Seems tf.one_hot supports ragged tensors, so try to remove the
    # to_tensor call.
    # Default value -1 -> one_hot produces an all-zeros row for the padding.
    t = ragged_lists_batch.to_tensor(-1)
    t = tf.one_hot(t, multihot_dim)
    t = tf.reduce_max(t, axis=1)
    return t
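
# Usage sketch (illustrative values).
def _example_multihot() -> tf.Tensor:
    labels = tf.ragged.constant([[0, 2], [1]])
    # -> [[1., 0., 1.], [0., 1., 0.]]
    return raged_lists_batch_to_multihot(labels, multihot_dim=3)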

def __init__(self,
             hierarchical_histogram: tf.RaggedTensor,
             use_efficient: bool = False):
    """Initializer for `HierarchicalHistogramDecoder`.

    `use_efficient` decides whether to use the accuracy optimization trick
    from the paper ["Efficient Use of Differentially Private Binary Trees.
    James Honaker".](https://privacytools.seas.harvard.edu/files/privacytools/files/honaker.pdf)
    for differentially private hierarchical histograms. The optimization
    trick leverages redundant information in the hierarchical histogram to
    optimize the accuracy of node queries.

    Args:
        hierarchical_histogram: A `tf.RaggedTensor` for the hierarchical
            histogram.
        use_efficient: A boolean indicating the usage of the efficient tree
            aggregation algorithm.
    """
    self._hierarchical_histogram = hierarchical_histogram.to_list()
    _check_hierarchical_histogram_shape(self._hierarchical_histogram)
    if len(self._hierarchical_histogram) == 1:
        self._arity = 2
    else:
        # The branching factor is the ratio of sizes of adjacent layers.
        self._arity = int(
            len(self._hierarchical_histogram[1]) /
            len(self._hierarchical_histogram[0]))
    self._size = len(self._hierarchical_histogram[-1])
    self._num_layers = math.ceil(math.log(self._size, self._arity)) + 1
    self._use_efficient = use_efficient
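
# Sketch of the layer layout this initializer expects (illustrative values):
# a binary tree where each layer sums to the same total, so the inferred
# arity is len(layer 1) / len(layer 0) == 2 and the leaf layer has size 4.
def _example_hierarchical_histogram() -> tf.RaggedTensor:
    return tf.ragged.constant([[10.0], [6.0, 4.0], [3.0, 3.0, 2.0, 2.0]])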

def embedding_pooling(tensor: tf.RaggedTensor,
                      combiner: str = "sqrtn") -> tf.Tensor:
    # Sum the embeddings over the ragged (per-example) axis.
    tensor_sum = tf.math.reduce_sum(tensor, axis=1)
    if combiner == "sum":
        return tensor_sum
    row_lengths = tf.expand_dims(tensor.row_lengths(axis=1), axis=1)
    # Guard against division by zero for empty rows.
    row_lengths = tf.math.maximum(tf.ones_like(row_lengths), row_lengths)
    row_lengths = tf.cast(row_lengths, dtype=tf.float32)
    if combiner == "mean":
        return tensor_sum / row_lengths
    if combiner == "sqrtn":
        return tensor_sum / tf.math.sqrt(row_lengths)
    raise ValueError(f"Unknown combiner: {combiner}")
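
# Usage sketch (illustrative values): two embeddings in row 0, one in row 1.
def _example_embedding_pooling() -> tf.Tensor:
    embeddings = tf.ragged.constant(
        [[[1.0, 1.0], [3.0, 3.0]], [[2.0, 2.0]]], ragged_rank=1)
    # "sum" -> [[4, 4], [2, 2]]; "mean" -> [[2, 2], [2, 2]];
    # "sqrtn" -> [[4 / sqrt(2), 4 / sqrt(2)], [2, 2]].
    return embedding_pooling(embeddings, combiner="sqrtn")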

def pad_sequence_right(sequences_batch: tf.RaggedTensor,
                       mask: bool) -> tf.Tensor:
    """Pads sequences with zeros on the right side."""
    # Keep at most the last `sequence_length` items of each sequence.
    sequences_batch = sequences_batch[:, -settings.settings.sequence_length:]
    if mask:
        # Add one to indices, to reserve the 0 index for padding.
        sequences_batch += 1
    # Convert to dense, padding zeros on the right.
    return sequences_batch.to_tensor(
        0, shape=[None, settings.settings.sequence_length])
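
# Usage sketch, mirroring `_example_pad_sequence_left` (assumption:
# settings.settings.sequence_length == 4).
def _example_pad_sequence_right() -> tf.Tensor:
    batch = tf.ragged.constant([[5, 6], [7]], dtype=tf.int64)
    # With mask=True: [[6, 7, 0, 0], [8, 0, 0, 0]].
    return pad_sequence_right(batch, mask=True)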

def _run_model_filter_empty_sequences(self,
                                      batch_item_indices: tf.RaggedTensor,
                                      batch_customer_indices, n_results):
    # Check if there are empty sequences.
    sequences_lengths = batch_item_indices.row_lengths()
    non_empty_seq_count = tf.math.count_nonzero(sequences_lengths)
    n_sequences = tf.shape(sequences_lengths, tf.int64)[0]

    if non_empty_seq_count == 0:
        # All sequences are empty: return empty predictions.
        label_predictions = tf.zeros([n_sequences, n_results], dtype=tf.string)
        probs_predictions = tf.zeros([n_sequences, n_results],
                                     dtype=tf.float32)
        return (label_predictions, probs_predictions)
    elif non_empty_seq_count >= n_sequences:
        # There are no empty sequences: run the model with the full batch.
        return self._run_model_and_postprocess(batch_item_indices,
                                               batch_customer_indices,
                                               n_results)
    else:
        # There are some empty sequences. The model fails on empty sequences,
        # and that seems to be the expected behaviour: do not feed them.
        # Get the mask of non-empty sequences.
        non_empty_mask = tf.math.greater(sequences_lengths, 0)
        # Keep only the non-empty sequences and their customers.
        non_empty_sequences: tf.RaggedTensor = tf.ragged.boolean_mask(
            batch_item_indices, non_empty_mask)
        non_empty_customers = tf.boolean_mask(batch_customer_indices,
                                              non_empty_mask)
        # Run the model on the filtered batch.
        label_predictions, probs_predictions = self._run_model_and_postprocess(
            non_empty_sequences, non_empty_customers, n_results)
        # Scatter the real predictions back to their original batch positions;
        # rows for empty sequences keep the default (zero) values.
        indices = tf.where(non_empty_mask)
        final_shape = [n_sequences, n_results]
        label_predictions = tf.scatter_nd(indices, label_predictions,
                                          final_shape)
        probs_predictions = tf.scatter_nd(indices, probs_predictions,
                                          final_shape)
        return (label_predictions, probs_predictions)
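
# Standalone sketch of the scatter-back step above (illustrative values):
# predictions computed for non-empty rows only are placed back at their
# original batch positions, leaving zeros elsewhere.
def _example_scatter_back() -> tf.Tensor:
    non_empty_mask = tf.constant([True, False, True])
    predictions = tf.constant([[0.9], [0.7]])  # One row per non-empty row.
    indices = tf.where(non_empty_mask)
    # -> [[0.9], [0.0], [0.7]]
    return tf.scatter_nd(indices, predictions, shape=[3, 1])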

def __init__(
    self,
    num_states: int,
    data: tf.Tensor,
    indices: tf.Tensor,
    indptr: tf.Tensor,
    states: tf.RaggedTensor,
    ref_ids: tf.RaggedTensor,
    ref_ids_lookup: Dict[str, int],
) -> None:
    self._num_states = tf.Variable([num_states], trainable=False)
    self._data = tf.Variable(data, trainable=False)
    self._indices = tf.Variable(indices, trainable=False)
    self._indptr = tf.Variable(indptr, trainable=False)
    # Store dense copies of the ragged inputs so they can live in variables.
    self._states = tf.Variable(states.to_tensor(), trainable=False)
    self._ref_ids = tf.Variable(ref_ids.to_tensor(), trainable=False)
    # Build a static string-to-int lookup table from the Python dict.
    keys = tf.convert_to_tensor(list(ref_ids_lookup.keys()), dtype=tf.string)
    values = tf.convert_to_tensor(
        list(ref_ids_lookup.values()), dtype=tf.int32)
    initializer = tf.lookup.KeyValueTensorInitializer(keys, values)
    self._ref_ids_lookup = tf.lookup.StaticHashTable(initializer,
                                                     default_value=0)
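
# Standalone sketch of the lookup-table construction above (key names are
# hypothetical, purely for illustration).
def _example_ref_ids_lookup() -> tf.Tensor:
    table = tf.lookup.StaticHashTable(
        tf.lookup.KeyValueTensorInitializer(
            tf.constant(["ref_a", "ref_b"]), tf.constant([1, 2], tf.int32)),
        default_value=0)
    # Unknown keys map to the default value 0: [1, 2, 0].
    return table.lookup(tf.constant(["ref_a", "ref_b", "ref_z"]))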

def multi_behavior_embedding_pooling(tensor: tf.RaggedTensor,
                                     combiner: str = "sqrtn") -> tf.Tensor:
    if len(tensor.shape) == 3:
        tensor = tf.expand_dims(
            tensor, axis=2)  # batch_size * behavior_count * 1 * embedding_size
    tensor_sum = tf.math.reduce_sum(
        tensor, axis=2)  # batch_size * behavior_count * embedding_size
    if combiner == "sum":
        return tensor_sum
    row_lengths = tf.expand_dims(tensor.row_lengths(axis=2), axis=2)
    # Guard against division by zero for empty rows.
    row_lengths = tf.math.maximum(tf.ones_like(row_lengths), row_lengths)
    row_lengths = tf.cast(row_lengths, dtype=tf.float32)
    if combiner == "mean":
        return tensor_sum / row_lengths
    if combiner == "sqrtn":
        return tensor_sum / tf.math.sqrt(row_lengths)
    raise ValueError(f"Unknown combiner: {combiner}")
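
# See `_example_embedding_pooling` above; the same combiners apply here, one
# level deeper (pooling over axis 2 instead of axis 1, per behavior).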

def _truncate_row_lengths(ragged_tensor: tf.RaggedTensor,
                          new_lengths: tf.Tensor) -> tf.RaggedTensor:
    """Truncates the rows of `ragged_tensor` to the given row lengths."""
    new_lengths = tf.broadcast_to(new_lengths,
                                  ragged_tensor.bounding_shape()[0:1])

    def fn(x):
        row, new_length = x
        return row[0:new_length]

    fn_dtype = tf.RaggedTensorSpec(dtype=ragged_tensor.dtype,
                                   ragged_rank=ragged_tensor.ragged_rank - 1)
    result = tf.map_fn(fn, (ragged_tensor, new_lengths), dtype=fn_dtype)
    # Work around broken shape propagation: without this, result has unknown
    # rank.
    flat_values_shape = [None] * ragged_tensor.flat_values.shape.rank
    result = result.with_flat_values(
        tf.ensure_shape(result.flat_values, flat_values_shape))
    return result
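
# Usage sketch (illustrative values).
def _example_truncate_row_lengths() -> tf.RaggedTensor:
    rt = tf.ragged.constant([[1, 2, 3], [4, 5], [6]])
    # -> [[1, 2], [4], []]
    return _truncate_row_lengths(rt, tf.constant([2, 1, 0], dtype=tf.int64))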