def _reduce_batch_weighted_counts(x):
  """Reduces a batch without weights and returns only the unique values.

  Asserts that every output other than the unique values is absent, since
  no weights (and no labels) were supplied to the reduction.

  NOTE(review): `self` is not a parameter here, so this helper presumably
  is a closure defined inside a test method of a `tf.test.TestCase` --
  confirm against the enclosing scope.
  """
  reduced = tf_utils.reduce_batch_weighted_counts(x)
  unique_x, weights_per_x, positive_per_x_and_y, counts_per_x = reduced
  # All auxiliary outputs must be None in the unweighted, unlabeled case.
  for absent_output in (weights_per_x, positive_per_x_and_y, counts_per_x):
    self.assertIsNone(absent_output)
  return unique_x
def _get_approximate_vocabulary_analyzer_inputs(
    x: common_types.TensorType,
    file_format: common_types.VocabularyFileFormatType,
    weights: Optional[common_types.TensorType] = None,
) -> Tuple[common_types.TensorType, common_types.TensorType]:
  """Helper for constructing approximate vocabulary inputs from tensors.

  Args:
    x: `Tensor` or `CompositeTensor` to compute vocabulary over.
    file_format: The format of the resulting vocabulary file. 'tfrecord_gzip'
      requires tensorflow>=2.4.
    weights: Optional `Tensor` of weights.

  Returns:
    A tuple of batch-reduced `Tensor`s to feed to vocabulary analysis:
    (unique values, per-value counts) when `weights` is None, otherwise
    (unique values, per-value summed weights).
  """
  filter_regex = analyzers.get_vocab_newline_characters_regex(
      x.dtype, file_format)
  reduced_batch = tf_utils.reduce_batch_weighted_counts(
      x, weights=weights, force=True, filter_regex=filter_regex)
  # No labels are passed to the reduction, so the per-(x, y) positive
  # counts must never be produced here.
  assert reduced_batch.summed_positive_per_x_and_y is None
  if weights is None:
    # Without weights there is nothing to sum per value; fall back to counts.
    assert reduced_batch.summed_weights_per_x is None
    return (reduced_batch.unique_x, reduced_batch.counts_per_x)
  return (reduced_batch.unique_x, reduced_batch.summed_weights_per_x)
def test_reduce_batch_weighted_counts(self, x, weights, expected_results):
  """Checks reduce_batch_weighted_counts against expected per-value results.

  Only the non-None outputs of the reduction are evaluated and compared,
  in order, against `expected_results`.
  """
  x = tf.constant(x)
  if weights is not None:
    weights = tf.constant(weights)
  outputs = tf_utils.reduce_batch_weighted_counts(x, weights)
  present_outputs = [tensor for tensor in outputs if tensor is not None]
  with tf.compat.v1.Session() as sess:
    evaluated = sess.run(present_outputs)
  for actual, expected in zip(evaluated, expected_results):
    self.assertAllEqual(actual, np.array(expected))