def call(self, texts: tf.Tensor, substrings: tf.RaggedTensor) -> tf.RaggedTensor:
    """Run ``self.find_match`` for every substring against its source text.

    Both inputs go through the same clean-up: everything matching
    ``self.special_chars`` is removed, runs of two or more whitespace
    characters are collapsed into a single space, and leading/trailing
    whitespace is stripped.

    Each cleaned substring is then wrapped into a regular expression that
    only matches when the substring is delimited by whitespace or by the
    start/end of the string, and ``self.find_match`` is applied to the flat
    (text, pattern) pairs.

    Args:
        texts: String tensor. Entry ``i`` is repeated once per substring in
            row ``i`` of ``substrings`` (assumes a 1-D tensor with one entry
            per row of ``substrings`` -- TODO confirm against callers).
        substrings: Ragged string tensor of candidate substrings.

    Returns:
        A ragged tensor with the row partitioning of ``substrings`` whose
        flat values are whatever ``self.find_match`` returns.
    """

    def _normalise(strings):
        # Drop special characters first, then squeeze whitespace and trim.
        without_special = tf.strings.regex_replace(
            strings, pattern=self.special_chars, rewrite='')
        single_spaced = tf.strings.regex_replace(
            without_special, pattern=r'\s{2,}', rewrite=' ')
        return tf.strings.strip(single_spaced)

    texts = _normalise(texts)
    substrings = _normalise(substrings)

    # Line every substring up with a copy of the text it belongs to.
    substrings_per_text = substrings.row_lengths()
    ragged_texts = tf.RaggedTensor.from_row_lengths(
        values=tf.repeat(texts, repeats=substrings_per_text),
        row_lengths=substrings_per_text)

    # The substring must be bounded by whitespace or a string boundary.
    patterns = tf.strings.join([r'.*(\s|^)', substrings, r'(\s|$).*'])

    return tf.ragged.map_flat_values(self.find_match, ragged_texts, patterns)
def embedding_pooling(tensor: tf.RaggedTensor, combiner: str = "sqrtn"):
    """Pool a ragged tensor over axis 1 with the requested combiner.

    Args:
        tensor: Ragged tensor that is summed over axis 1 (presumably
            ``batch_size * (sequence) * embedding_size`` -- confirm against
            callers). "mean" and "sqrtn" expect a floating-point dtype.
        combiner: One of ``"sum"``, ``"mean"`` or ``"sqrtn"``.
            ``"sum"`` returns the plain sum over axis 1, ``"mean"`` divides
            the sum by the row length, ``"sqrtn"`` divides it by the square
            root of the row length.

    Returns:
        The pooled tensor, i.e. ``tensor`` with axis 1 reduced away.

    Raises:
        ValueError: If ``combiner`` is not one of the supported names
            (previously the function silently returned ``None``).
    """
    if combiner not in ("sum", "mean", "sqrtn"):
        raise ValueError(
            f"Unknown combiner {combiner!r}; "
            "expected one of 'sum', 'mean', 'sqrtn'.")

    pooled = tf.math.reduce_sum(tensor, axis=1)
    if combiner == "sum":
        return pooled

    counts = tf.expand_dims(tensor.row_lengths(axis=1), axis=1)
    # Clamp to at least 1 so that empty rows are divided by one, not zero.
    counts = tf.math.maximum(tf.ones_like(counts), counts)
    # Match the dtype of the sum so the division works for any float dtype.
    counts = tf.cast(counts, dtype=pooled.dtype)

    if combiner == "mean":
        return pooled / counts
    return pooled / tf.math.sqrt(counts)
def _run_model_filter_empty_sequences(self, batch_item_indices: tf.RaggedTensor,
                                      batch_customer_indices, n_results):
    """Run the model on a batch, skipping sequences that have no items.

    Empty sequences are never passed to ``_run_model_and_postprocess``
    (the model is expected to fail on them). Three cases are handled:

    * every sequence is empty: the model is not run and all-default
      predictions are returned;
    * no sequence is empty: the whole batch is forwarded unchanged;
    * otherwise: only the non-empty sequences (and their customers) are
      forwarded, and the predictions are scattered back to their original
      batch positions, leaving the rows of empty sequences at the default
      value produced by ``tf.scatter_nd``.

    Args:
        batch_item_indices: Ragged tensor with one row per sequence.
        batch_customer_indices: Per-sequence customer values, filtered with
            the same mask as the sequences.
        n_results: Number of predictions per sequence; used as the second
            dimension of the outputs.

    Returns:
        A ``(label_predictions, probs_predictions)`` tuple. In the branches
        built here both have shape ``[n_sequences, n_results]``, the labels
        being strings and the probabilities ``float32``.
    """
    items_per_sequence = batch_item_indices.row_lengths()
    total_sequences = tf.shape(items_per_sequence, out_type=tf.int64)[0]
    filled_sequences = tf.math.count_nonzero(items_per_sequence)

    if filled_sequences == 0:
        # Nothing can be fed to the model: answer with defaults only.
        output_shape = [total_sequences, n_results]
        return (tf.zeros(output_shape, dtype=tf.string),
                tf.zeros(output_shape, dtype=tf.float32))
    elif filled_sequences >= total_sequences:
        # Every sequence has items, so the full batch can go through as is.
        return self._run_model_and_postprocess(
            batch_item_indices, batch_customer_indices, n_results)
    else:
        # Mixed batch: keep only the rows that actually contain items.
        has_items = tf.greater(items_per_sequence, 0)
        kept_sequences: tf.RaggedTensor = tf.ragged.boolean_mask(
            batch_item_indices, has_items)
        kept_customers = tf.boolean_mask(batch_customer_indices, has_items)

        kept_labels, kept_probs = self._run_model_and_postprocess(
            kept_sequences, kept_customers, n_results)

        # Put each prediction row back at the batch position it came from;
        # positions of empty sequences are left untouched by the scatter.
        target_rows = tf.where(has_items)
        output_shape = [total_sequences, n_results]
        label_predictions = tf.scatter_nd(target_rows, kept_labels, output_shape)
        probs_predictions = tf.scatter_nd(target_rows, kept_probs, output_shape)
        return (label_predictions, probs_predictions)
def pad_sequence_left(sequences_batch: tf.RaggedTensor, mask: bool):
    """Truncate and left-pad ragged sequences with zeros to a fixed length.

    Each row keeps at most its last ``settings.settings.sequence_length``
    items and is then prefixed with as many zeros as needed to reach exactly
    that length.

    Args:
        sequences_batch: Ragged tensor with one row per sequence.
        mask: When true, every value is incremented by one before padding
            so that 0 is reserved for the padding positions.

    Returns:
        The dense tensor obtained from the padded rows (one row per input
        sequence, each holding ``settings.settings.sequence_length`` values).
    """
    max_length = settings.settings.sequence_length

    # Keep only the last `max_length` items of every row.
    sequences_batch = sequences_batch[:, -max_length:]
    if mask:
        # Shift indices by one to reserve index 0 for padding.
        sequences_batch += 1

    pad_row_lengths = max_length - sequences_batch.row_lengths()
    # The total number of pad values is the sum of the per-row pad counts.
    # Deriving it from `pad_row_lengths` keeps a single integer dtype (that
    # of the row partition) instead of mixing it with an int64 size.
    pad_values = tf.zeros([tf.math.reduce_sum(pad_row_lengths)],
                          sequences_batch.dtype)
    padding = tf.RaggedTensor.from_row_lengths(pad_values, pad_row_lengths)
    return tf.concat([padding, sequences_batch], axis=1).to_tensor()
def multi_behavior_embedding_pooling(tensor: tf.RaggedTensor, combiner: str = "sqrtn"):
    """Pool a ragged tensor over axis 2 with the requested combiner.

    A rank-3 input first gets a unit axis inserted at position 2, so that
    the pooling below always runs over axis 2 of a rank-4 layout.

    Args:
        tensor: Ragged tensor to pool (presumably
            ``batch_size * behavior_count * (sequence) * embedding_size``,
            or ``batch_size * behavior_count * embedding_size`` for the
            rank-3 case -- confirm against callers). "mean" and "sqrtn"
            expect a floating-point dtype.
        combiner: One of ``"sum"``, ``"mean"`` or ``"sqrtn"``.
            ``"sum"`` returns the plain sum over axis 2, ``"mean"`` divides
            the sum by the row length, ``"sqrtn"`` divides it by the square
            root of the row length.

    Returns:
        The pooled tensor, i.e. ``tensor`` with axis 2 reduced away
        (``batch_size * behavior_count * embedding_size`` in the layout
        above).

    Raises:
        ValueError: If ``combiner`` is not one of the supported names
            (previously the function silently returned ``None``).
    """
    if combiner not in ("sum", "mean", "sqrtn"):
        raise ValueError(
            f"Unknown combiner {combiner!r}; "
            "expected one of 'sum', 'mean', 'sqrtn'.")

    if len(tensor.shape) == 3:
        # batch_size * behavior_count * 1 * embedding_size
        tensor = tf.expand_dims(tensor, axis=2)

    # batch_size * behavior_count * embedding_size
    pooled = tf.math.reduce_sum(tensor, axis=2)
    if combiner == "sum":
        return pooled

    counts = tf.expand_dims(tensor.row_lengths(axis=2), axis=2)
    # Clamp to at least 1 so that empty rows are divided by one, not zero.
    counts = tf.math.maximum(tf.ones_like(counts), counts)
    # Match the dtype of the sum so the division works for any float dtype.
    counts = tf.cast(counts, dtype=pooled.dtype)

    if combiner == "mean":
        return pooled / counts
    return pooled / tf.math.sqrt(counts)