Exemplo n.º 1
0
def unrag_tensor(x: tf.RaggedTensor, max_size: int, axis: int) -> tf.Tensor:
    """Converts a ragged tensor to a full tensor by padding to a maximum size.

    This function is useful for converting ragged tensors to a fixed size when one or
    more of the dimensions are of variable length.

    Args:
        x: Ragged tensor to convert.
        max_size: Maximum size of the axis to pad.
        axis: Axis of `x` to pad to `max_size`. This must specify ragged dimensions.
            If more than one axis is specified, `max_size` must be of the same length as
            `axis`. Negative indices are wrapped using the rank of `x`.

    Returns:
        A padded version of `x`. Padding will use the equivalent of NaNs in the tensor's
        native dtype.

        This will replace the shape of the specified `axis` with `max_size`, leaving the
        remaining dimensions set to the bounding shape of the ragged tensor.

        NOTE(review): the padding value is `NaN` cast to `x.dtype`; for non-floating
        dtypes that cast is presumably not a meaningful sentinel -- confirm callers
        only pass floating-point tensors.
    """
    bounding_shape = x.bounding_shape()
    axis = tf.cast(axis, tf.int64)
    axis = axis % len(x.shape)  # Handle negative indices.
    axis = tf.reshape(axis, [-1, 1])  # Ensure (n, 1) shape for indexing.
    max_size = tf.cast(max_size, bounding_shape.dtype)
    max_size = tf.reshape(max_size, [-1])  # Ensure (n,) shape for indexing.
    # Overwrite the bounding-shape entries of the requested axes with `max_size`.
    shape = tf.tensor_scatter_nd_update(bounding_shape, axis, max_size)
    # Use `np.nan`: the `np.NaN` alias was removed in NumPy 2.0.
    return x.to_tensor(default_value=tf.cast(np.nan, x.dtype), shape=shape)
Exemplo n.º 2
0
def _ragged_as_leaf_node(
        ragged_tensor: tf.RaggedTensor, is_repeated: bool,
        reference_ragged_tensor: tf.RaggedTensor,
        options: calculate_options.Options) -> prensor.LeafNodeTensor:
    """Wraps a ragged tensor as a prensor leaf node.

    The leaf's parent index is `ragged_tensor.value_rowids()` and its values are
    `ragged_tensor.values`.

    Args:
        ragged_tensor: Ragged tensor to wrap.
        is_repeated: Forwarded to `prensor.LeafNodeTensor`. When False and
            `options.ragged_checks` is set, a runtime assertion requires the
            row ids to be strictly increasing.
        reference_ragged_tensor: Ragged tensor whose number of rows
            `ragged_tensor` must match.
        options: Calculation options; `ragged_checks` enables the runtime
            assertions.

    Returns:
        The leaf node, built under the runtime assertions if any were created.

    Raises:
        ValueError: If both first dimensions are statically known and differ.
    """

    def _make_leaf():
        # Built inside the caller's context so control dependencies apply.
        return prensor.LeafNodeTensor(ragged_tensor.value_rowids(),
                                      ragged_tensor.values, is_repeated)

    checks = []
    num_rows = tf.compat.dimension_at_index(ragged_tensor.shape, 0).value
    expected_rows = tf.compat.dimension_at_index(
        reference_ragged_tensor.shape, 0).value
    sizes_are_static = num_rows is not None and expected_rows is not None
    if sizes_are_static:
        # Both sizes known at graph-construction time: fail fast.
        if num_rows != expected_rows:
            raise ValueError("Returned ragged tensor is not the right size.")
    elif options.ragged_checks:
        # Otherwise defer the comparison to run time.
        checks.append(
            tf.assert_equal(ragged_tensor.nrows(),
                            reference_ragged_tensor.nrows()))

    if not is_repeated:
        row_ids = ragged_tensor.value_rowids()
        if options.ragged_checks:
            # Consecutive row ids must strictly increase.
            checks.append(
                tf.compat.v1.assert_positive(row_ids[1:] - row_ids[:-1]))

    if not checks:
        return _make_leaf()
    with tf.control_dependencies(checks):
        return _make_leaf()
Exemplo n.º 3
0
    def call(self, texts: tf.Tensor,
             substrings: tf.RaggedTensor) -> tf.RaggedTensor:
        """Applies `self.find_match` to every (text, substring pattern) pair.

        Both inputs are normalized the same way: matches of
        `self.special_chars` are removed, runs of two or more whitespace
        characters are collapsed to a single space, and the result is stripped.
        Each text is then repeated once per substring in its row, and each
        substring is wrapped into a regex that requires whitespace or a string
        boundary on both sides.

        Args:
            texts: String tensor, one text per row of `substrings`.
            substrings: Ragged string tensor of candidate substrings.

        Returns:
            The result of mapping `self.find_match` over the flat values of the
            repeated texts and the substring patterns.
        """

        def _normalize(strings):
            without_special = tf.strings.regex_replace(
                strings, pattern=self.special_chars, rewrite='')
            single_spaced = tf.strings.regex_replace(
                without_special, pattern=r'\s{2,}', rewrite=' ')
            return tf.strings.strip(single_spaced)

        texts = _normalize(texts)
        substrings = _normalize(substrings)

        # Regex fragments placed around each substring.
        pre = r'.*(\s|^)'
        post = r'(\s|$).*'
        patterns = tf.strings.join([pre, substrings, post])

        # Give `texts` the same ragged layout as `substrings`.
        lengths = substrings.row_lengths()
        repeated_texts = tf.repeat(texts, repeats=lengths)
        ragged_texts = tf.RaggedTensor.from_row_lengths(
            values=repeated_texts, row_lengths=lengths)

        return tf.ragged.map_flat_values(self.find_match, ragged_texts,
                                         patterns)
Exemplo n.º 4
0
        def update_state(self,
                         y_true: tf.RaggedTensor,
                         y_pred: tf.RaggedTensor,
                         sample_weight: Optional[tf.Tensor] = None) -> None:
            """Accumulates the normalized edit distance between two RaggedTensors.

            Args:
                y_true: Ragged tensor of reference sequences.
                y_pred: Ragged tensor of predicted sequences.
                sample_weight: Optional weights forwarded to the parent
                    metric's `update_state`.

            Returns:
                Whatever the parent class's `update_state` returns.
            """
            inputs_are_ragged = (isinstance(y_true, tf.RaggedTensor)
                                 and isinstance(y_pred, tf.RaggedTensor))
            assert inputs_are_ragged

            # tf.edit_distance takes sparse tensors: hypothesis first, then truth.
            hypothesis = y_pred.to_sparse()
            truth = y_true.to_sparse()
            distances = tf.edit_distance(hypothesis, truth, normalize=True)
            return super().update_state(distances, sample_weight)
Exemplo n.º 5
0
def pad_sequence_left(sequences_batch: tf.RaggedTensor, mask: bool):
    """Left-pads every sequence in the batch with zeros and returns a dense tensor.

    Rows are first truncated to their last `settings.settings.sequence_length`
    items; shorter rows are then prefixed with zeros up to that length.

    Args:
        sequences_batch: Ragged batch of sequences.
        mask: If True, every element is incremented by one so that 0 is
            reserved for padding.

    Returns:
        The dense tensor obtained from the padded ragged batch.
    """
    target_length = settings.settings.sequence_length

    # Keep at most the trailing `target_length` items of each row.
    sequences_batch = sequences_batch[:, -target_length:]

    if mask:
        # Shift indices so that 0 stays free for padding.
        sequences_batch = sequences_batch + 1

    # Number of zeros each row is missing, and the total across the batch.
    missing_per_row = target_length - sequences_batch.row_lengths()
    total_missing = (target_length * sequences_batch.nrows()) - tf.size(
        sequences_batch, tf.int64)
    zeros = tf.zeros([total_missing], sequences_batch.dtype)
    left_padding = tf.RaggedTensor.from_row_lengths(zeros, missing_per_row)

    padded = tf.concat([left_padding, sequences_batch], axis=1)
    return padded.to_tensor()
Exemplo n.º 6
0
def raged_lists_batch_to_multihot(ragged_lists_batch: tf.RaggedTensor, multihot_dim: int) -> tf.Tensor:
    """Maps a batch of label indices to a batch of multi-hot vectors.

    Args:
        ragged_lists_batch: Ragged batch where each row lists label indices.
        multihot_dim: Depth of the one-hot encoding, i.e. the size of each
            multi-hot vector.

    Returns:
        A tensor with one multi-hot row per input row: 1 at every index listed
        in the row, 0 elsewhere. Rows without labels are all zeros.
    """
    # TODO: Seems tf.one_hot supports ragged tensors, so try to remove to_tensor call
    # Pad with -1: tf.one_hot emits an all-zero vector for out-of-range indices.
    dense_indices = ragged_lists_batch.to_tensor(-1)
    one_hots = tf.one_hot(dense_indices, multihot_dim)
    multihot = tf.reduce_max(one_hots, axis=1)
    # If every row is empty the padded axis has length 0, and a max over an
    # empty axis yields the dtype's lowest value (e.g. -inf) rather than 0.
    # Clamping at 0 is a no-op otherwise, since one-hot values are 0 or 1.
    return tf.maximum(multihot, tf.zeros_like(multihot))
Exemplo n.º 7
0
    def __init__(self,
                 hierarchical_histogram: tf.RaggedTensor,
                 use_efficient: bool = False):
        """Initializer for `HierarchicalHistogramDecoder`.

        `use_efficient` decides whether to use the accuracy optimization trick
        from the paper ["Efficient Use of Differentially Private Binary Trees.
        James Honaker".](https://privacytools.seas.harvard.edu/files/privacytools/files/honaker.pdf)
        for differentially private hierarchical histogram. The optimization
        trick leverages redundant information in the hierarchical histogram to
        optimize the accuracy of node queries.

        Args:
          hierarchical_histogram: A `tf.RaggedTensor` for the hierarchical
            histogram.
          use_efficient: A boolean indicating the usage of the efficient tree
            aggregation algorithm.

        Raises:
          ValueError: If the arity inferred from the first two layers is
            smaller than 2, or the last layer is empty.
        """
        self._hierarchical_histogram = hierarchical_histogram.to_list()
        _check_hierarchical_histogram_shape(self._hierarchical_histogram)
        layers = self._hierarchical_histogram

        if len(layers) == 1:
            # A single layer carries no branching information; default to binary.
            self._arity = 2
        else:
            self._arity = len(layers[1]) // len(layers[0])
        self._size = len(layers[-1])

        if self._arity < 2 or self._size < 1:
            raise ValueError(
                'Cannot infer the number of layers from a hierarchical '
                f'histogram with arity {self._arity} and {self._size} leaves.')

        # Compute ceil(log_arity(size)) + 1 with integers only. The float form
        # `math.ceil(math.log(size, arity))` is inexact for exact powers (e.g.
        # math.log(125, 5) == 3.0000000000000004) and would add a layer.
        num_layers = 1
        capacity = 1
        while capacity < self._size:
            capacity *= self._arity
            num_layers += 1
        self._num_layers = num_layers
        self._use_efficient = use_efficient
Exemplo n.º 8
0
def embedding_pooling(tensor: tf.RaggedTensor, combiner: str = "sqrtn"):
    """Pools a ragged tensor along axis 1.

    Args:
        tensor: Ragged tensor that is ragged along axis 1 (presumably
            `(batch, ragged, embedding)` -- confirm against callers).
        combiner: One of `"sum"`, `"mean"` or `"sqrtn"`. `"sum"` returns the
            sum over axis 1, `"mean"` divides that sum by the row length and
            `"sqrtn"` divides it by the square root of the row length.

    Returns:
        The pooled tensor with axis 1 reduced.

    Raises:
        ValueError: If `combiner` is not one of the supported values.
    """
    # Reject unknown combiners up front instead of silently returning None.
    if combiner not in ("sum", "mean", "sqrtn"):
        raise ValueError(
            f"Unsupported combiner {combiner!r}; "
            "expected one of 'sum', 'mean', 'sqrtn'.")

    tensor_sum = tf.math.reduce_sum(tensor, axis=1)
    if combiner == "sum":
        return tensor_sum

    row_lengths = tf.expand_dims(tensor.row_lengths(axis=1), axis=1)
    # Clamp to at least 1 so empty rows do not divide by zero.
    row_lengths = tf.math.maximum(tf.ones_like(row_lengths), row_lengths)
    row_lengths = tf.cast(row_lengths, dtype=tf.float32)
    if combiner == "mean":
        return tensor_sum / row_lengths
    return tensor_sum / tf.math.sqrt(row_lengths)
Exemplo n.º 9
0
def pad_sequence_right(sequences_batch: tf.RaggedTensor, mask: bool) -> tf.Tensor:
    """Right-pads every sequence in the batch with zeros and returns a dense tensor.

    Args:
        sequences_batch: Ragged batch of sequences.
        mask: If True, every element is incremented by one so that 0 is
            reserved for padding.

    Returns:
        A dense tensor whose second dimension is
        `settings.settings.sequence_length`.
    """
    target_length = settings.settings.sequence_length

    # Rows longer than the target keep only their trailing items.
    trimmed = sequences_batch[:, -target_length:]

    if mask:
        # Shift indices so that 0 stays free for padding.
        trimmed = trimmed + 1

    # Densify; shorter rows are filled with zeros on the right.
    return trimmed.to_tensor(0, shape=[None, target_length])
Exemplo n.º 10
0
    def _run_model_filter_empty_sequences(self,
                                          batch_item_indices: tf.RaggedTensor,
                                          batch_customer_indices, n_results):
        """Runs the model on the non-empty sequences of a batch only.

        Empty sequences are not fed to the model; their output rows are filled
        with zero-initialised values (`tf.zeros` / `tf.scatter_nd` defaults).

        Args:
            batch_item_indices: Ragged batch of item-index sequences.
            batch_customer_indices: Per-sequence customer indices, filtered
                with the same mask as the sequences.
            n_results: Number of predictions per sequence.

        Returns:
            A `(label_predictions, probs_predictions)` tuple. When the batch
            contains empty sequences, both have shape
            `[n_sequences, n_results]`.
        """
        row_sizes = batch_item_indices.row_lengths()
        batch_size = tf.shape(row_sizes, out_type=tf.int64)[0]
        n_non_empty = tf.math.count_nonzero(row_sizes)

        if n_non_empty == 0:
            # Every sequence is empty: nothing to run, return blank predictions.
            output_shape = [batch_size, n_results]
            blank_labels = tf.zeros(output_shape, dtype=tf.string)
            blank_probs = tf.zeros(output_shape, dtype=tf.float32)
            return (blank_labels, blank_probs)

        elif n_non_empty >= batch_size:
            # No empty sequence: run the model on the whole batch.
            return self._run_model_and_postprocess(batch_item_indices,
                                                   batch_customer_indices,
                                                   n_results)
        else:
            # Mixed batch. The model fails on empty sequences, so feed it only
            # the non-empty ones.
            has_items = tf.math.greater(row_sizes, 0)
            kept_sequences = tf.ragged.boolean_mask(batch_item_indices,
                                                    has_items)
            kept_customers = tf.boolean_mask(batch_customer_indices,
                                             has_items)

            labels, probs = self._run_model_and_postprocess(
                kept_sequences, kept_customers, n_results)

            # Scatter the predictions back to their original row positions;
            # rows of empty sequences keep the scatter default.
            target_rows = tf.where(has_items)
            output_shape = [batch_size, n_results]
            full_labels = tf.scatter_nd(target_rows, labels, output_shape)
            full_probs = tf.scatter_nd(target_rows, probs, output_shape)
            return (full_labels, full_probs)
Exemplo n.º 11
0
    def __init__(
        self,
        num_states: int,
        data: tf.Tensor,
        indices: tf.Tensor,
        indptr: tf.Tensor,
        states: tf.RaggedTensor,
        ref_ids: tf.RaggedTensor,
        ref_ids_lookup: Dict[str, int],
    ) -> None:
        """Stores the inputs as non-trainable variables and builds the id lookup.

        Args:
            num_states: Number of states; stored as a one-element variable.
            data: Stored as a non-trainable variable.
            indices: Stored as a non-trainable variable.
            indptr: Stored as a non-trainable variable.
            states: Ragged tensor, densified with `to_tensor()` before storing.
            ref_ids: Ragged tensor, densified with `to_tensor()` before storing.
            ref_ids_lookup: Mapping from string key to integer value, turned
                into a static hash table whose default value is 0.
        """

        def _frozen(value):
            return tf.Variable(value, trainable=False)

        self._num_states = _frozen([num_states])
        self._data = _frozen(data)
        self._indices = _frozen(indices)
        self._indptr = _frozen(indptr)
        self._states = _frozen(states.to_tensor())
        self._ref_ids = _frozen(ref_ids.to_tensor())

        lookup_keys = tf.convert_to_tensor(list(ref_ids_lookup),
                                           dtype=tf.string)
        lookup_values = tf.convert_to_tensor(list(ref_ids_lookup.values()),
                                             dtype=tf.int32)
        table_init = tf.lookup.KeyValueTensorInitializer(
            lookup_keys, lookup_values)
        self._ref_ids_lookup = tf.lookup.StaticHashTable(table_init,
                                                         default_value=0)
Exemplo n.º 12
0
def multi_behavior_embedding_pooling(tensor: tf.RaggedTensor,
                                     combiner: str = "sqrtn"):
    """Pools a ragged tensor along axis 2.

    A rank-3 input first gets a size-1 axis inserted at position 2, so pooling
    always reduces axis 2.

    Args:
        tensor: Ragged tensor; per the inline shape comments, presumably
            `batch_size * behavior_count * ragged * embedding_size` (or the
            rank-3 form without the ragged axis) -- confirm against callers.
        combiner: One of `"sum"`, `"mean"` or `"sqrtn"`. `"sum"` returns the
            sum over axis 2, `"mean"` divides that sum by the row length and
            `"sqrtn"` divides it by the square root of the row length.

    Returns:
        The pooled tensor with axis 2 reduced.

    Raises:
        ValueError: If `combiner` is not one of the supported values.
    """
    # Reject unknown combiners up front instead of silently returning None.
    if combiner not in ("sum", "mean", "sqrtn"):
        raise ValueError(
            f"Unsupported combiner {combiner!r}; "
            "expected one of 'sum', 'mean', 'sqrtn'.")

    if len(tensor.shape) == 3:
        tensor = tf.expand_dims(
            tensor, axis=2)  # batch_size * behavior_count * 1 * embedding_size
    tensor_sum = tf.math.reduce_sum(
        tensor, axis=2)  # batch_size * behavior_count * embedding_size
    if combiner == "sum":
        return tensor_sum

    row_lengths = tf.expand_dims(tensor.row_lengths(axis=2), axis=2)
    # Clamp to at least 1 so empty rows do not divide by zero.
    row_lengths = tf.math.maximum(tf.ones_like(row_lengths), row_lengths)
    row_lengths = tf.cast(row_lengths, dtype=tf.float32)
    if combiner == "mean":
        return tensor_sum / row_lengths
    return tensor_sum / tf.math.sqrt(row_lengths)
Exemplo n.º 13
0
def _truncate_row_lengths(ragged_tensor: tf.RaggedTensor,
                          new_lengths: tf.Tensor) -> tf.RaggedTensor:
  """Cuts each row of `ragged_tensor` down to the corresponding new length.

  Args:
    ragged_tensor: Ragged tensor whose rows are truncated.
    new_lengths: Row lengths to truncate to; broadcast to one entry per row.

  Returns:
    A ragged tensor in which row `i` is the first `new_lengths[i]` items of the
    original row `i`.
  """
  num_rows = ragged_tensor.bounding_shape()[0:1]
  per_row_lengths = tf.broadcast_to(new_lengths, num_rows)

  # Each mapped row has one ragged dimension fewer than the input.
  row_spec = tf.RaggedTensorSpec(dtype=ragged_tensor.dtype,
                                 ragged_rank=ragged_tensor.ragged_rank - 1)
  truncated = tf.map_fn(lambda pair: pair[0][0:pair[1]],
                        (ragged_tensor, per_row_lengths),
                        dtype=row_spec)

  # Work around broken shape propagation: without this, the result has
  # unknown rank.
  unknown_dims = [None] * ragged_tensor.flat_values.shape.rank
  checked_flat_values = tf.ensure_shape(truncated.flat_values, unknown_dims)
  return truncated.with_flat_values(checked_flat_values)