def testPackSequencesErrors(self):
  """Checks that malformed arguments to `pack_sequences` raise errors.

  Each expectation lists the exception type, the regex its message must
  match, and the test case whose arguments should trigger it.
  """
  invalid_argument = tf.errors.InvalidArgumentError
  expectations = [
      (invalid_argument, 'actual_seq_len must be the same shape',
       PackSequenceTestCase([1, 1, 1], [1, 1], 2, 2, 2)),
      (invalid_argument, 'actual_seq_len must be a vector',
       PackSequenceTestCase([[1], [1]], [[1], [1]], 2, 2, 2)),
      # seq_len must be a scalar.
      (TypeError, 'Expected int',
       PackSequenceTestCase([1, 1], [1, 1], 2, [2, 2], 2)),
  ]
  for error_type, pattern, case in expectations:
    # The op is built and run inside both contexts, so an error from either
    # step is caught by the assertion.
    with self.assertRaisesRegex(error_type, pattern), self.session() as sess:
      packed = ops.pack_sequences(case.src_actual_seq_len,
                                  case.tgt_actual_seq_len,
                                  case.packed_batch_size,
                                  case.packed_src_seq_len,
                                  case.packed_tgt_seq_len)
      sess.run(packed)
def _Pack(self, batch_in):
  """Returns a packed copy of `batch_in`; the batch size may differ.

  Row lengths are the per-row sums of `batch_in.segment_ids`. The same lengths
  and the same `max_sequence_length` are used for both the source and target
  arguments of `ops.pack_sequences`, and only the source-side outputs are
  used.

  Args:
    batch_in: the batch to pack. It must provide `segment_ids`, `paddings`,
      `DeepCopy()` and `Transform()`.

  Returns:
    A transformed deep copy of `batch_in` with every field packed, and with
    `paddings`, `segment_ids` and `segment_pos` overwritten.
  """
  row_lengths = tf.math.reduce_sum(
      tf.cast(batch_in.segment_ids, tf.int32), axis=1)
  packed_ids, packed_pos, source_rows, _, _, _ = ops.pack_sequences(
      row_lengths,
      row_lengths,
      packed_batch_size=0,
      packed_src_seq_len=self.params.max_sequence_length,
      packed_tgt_seq_len=self.params.max_sequence_length)

  def _PackField(field, pad_value=0):
    return ops.apply_packing(field, pad_value, packed_ids, source_rows)

  batch_out = batch_in.DeepCopy().Transform(_PackField)
  # Paddings are re-packed from the input with 1 as the fill value, replacing
  # the 0-filled version produced by the Transform above.
  batch_out.paddings = _PackField(batch_in.paddings, 1)
  batch_out.segment_ids = tf.cast(packed_ids, tf.float32)
  batch_out.segment_pos = packed_pos
  return batch_out
def testPackSequencesShapeUnknown(self):
  """Runs `pack_sequences` on a placeholder whose static shape is unknown.

  The same placeholder supplies both the source and target lengths, so the
  three target outputs are expected to equal the three source outputs.
  """
  lengths = tf.compat.v1.placeholder(tf.int32, shape=None)
  expected_src = [
      [[1, 2, 0, 0, 0], [1, 1, 0, 0, 0]],
      [[0, 0, 0, 0, 0], [0, 1, 0, 0, 0]],
      [[0, 2, 0, 0, 0], [1, 1, 0, 0, 0]],
  ]
  with self.session() as sess:
    packed = ops.pack_sequences(lengths, lengths, 2, 5, 5)
    results = sess.run(packed, feed_dict={lengths: np.array([1, 2, 1])})
    self.assertEqual(6, len(results))
    for got, want in zip(results, expected_src):
      self.assertAllEqual(got, want)
    # Outputs 3..5 (target side) must mirror outputs 0..2 (source side).
    for src_out, tgt_out in zip(results[:3], results[3:]):
      self.assertAllEqual(tgt_out, src_out)
def testDroppingInputsNonDeterministic(self):
  """Checks that every possible outcome of dropping rows actually occurs.

  The op is built and run 100 times without a seed. Every result must match
  one of the listed outcomes, and each outcome must be seen more than 10
  times.
  """
  # Packing 3 rows into 1, where we need to drop two rows.
  shared_args = [[2, 1, 2], [1, 2, 1], 1, 2, 2]
  outcomes = [
      # keeping only the first row
      PackSequenceTestCase(*shared_args, [[1, 1]], [[0, 1]], [[0, 0]],
                           [[1, 0]], [[0, 0]], [[0, 0]]),
      # keeping only the second row
      PackSequenceTestCase(*shared_args, [[1, 0]], [[0, 0]], [[1, 0]],
                           [[1, 1]], [[0, 1]], [[1, 1]]),
      # keeping only the last row
      PackSequenceTestCase(*shared_args, [[1, 1]], [[0, 1]], [[2, 2]],
                           [[1, 0]], [[0, 0]], [[2, 0]]),
  ]
  hits = [0] * len(outcomes)

  def _Int32(value):
    return tf.constant(value, tf.int32)

  # All outcomes share the same inputs, so the first one supplies them.
  case = outcomes[0]
  with self.session() as sess:
    for _ in range(100):
      packed = ops.pack_sequences(
          _Int32(case.src_actual_seq_len),
          _Int32(case.tgt_actual_seq_len),
          _Int32(case.packed_batch_size),
          _Int32(case.packed_src_seq_len),
          _Int32(case.packed_tgt_seq_len))
      result = sess.run(packed)
      which = FindResultFromList(result, outcomes)
      self.assertIsNotNone(which, '{} is not a valid result'.format(result))
      hits[which] += 1

  # We test that all possible outcomes occur sufficiently often to ensure that
  # dropping is not biased.
  # The probability of this test failing due to chance is less than 1 in a
  # million runs, as scipy.stats.binom.cdf(10, 100, 0.3333) ~= 5e-8
  for idx, seen in enumerate(hits):
    failure_msg = 'test case {} does not occur sufficiently often: {}'.format(
        idx, hits)
    self.assertGreater(seen, 10, failure_msg)
def testDroppingInputsFixedSeed(self):
  """Checks that a fixed seed selects a specific row to drop.

  Each entry pairs a seed with the exact packed outputs expected when
  `pack_sequences` is run with that seed.
  """
  # Packing 3 rows into 2, where we need to drop one row.
  shared_args = [[2, 1, 2], [1, 2, 1], 2, 2, 2]
  seeded_cases = [
      # (seed, test_case)
      (
          45,
          # dropping the last row
          PackSequenceTestCase(*shared_args,
                               [[1, 1], [1, 0]],
                               [[0, 1], [0, 0]],
                               [[0, 0], [1, 0]],
                               [[1, 0], [1, 1]],
                               [[0, 0], [0, 1]],
                               [[0, 0], [1, 1]]),
      ),
      (
          (1 << 37) + 1,
          # dropping the second row
          PackSequenceTestCase(*shared_args,
                               [[1, 1], [1, 1]],
                               [[0, 1], [0, 1]],
                               [[0, 0], [2, 2]],
                               [[1, 0], [1, 0]],
                               [[0, 0], [0, 0]],
                               [[0, 0], [2, 0]]),
      ),
  ]

  def _Int32(value):
    return tf.constant(value, tf.int32)

  for seed, case in seeded_cases:
    with self.session() as sess:
      packed = ops.pack_sequences(
          _Int32(case.src_actual_seq_len),
          _Int32(case.tgt_actual_seq_len),
          _Int32(case.packed_batch_size),
          _Int32(case.packed_src_seq_len),
          _Int32(case.packed_tgt_seq_len),
          seed=seed)
      results = sess.run(packed)
      label = 'test case with seed {}'.format(seed)
      wanted = (
          case.src_segment_ids,
          case.src_segment_pos,
          case.src_indices_in_input,
          case.tgt_segment_ids,
          case.tgt_segment_pos,
          case.tgt_indices_in_input,
      )
      self.assertEqual(6, len(results), label)
      for got, want in zip(results, wanted):
        self.assertAllEqual(got, want, label)
def testPackSequences(self):
  """Compares `pack_sequences` outputs against hand-written expectations.

  Each named case gives the five op arguments followed by the six expected
  outputs, in the order the op returns them.
  """
  scenarios = {
      'Basic':
          PackSequenceTestCase(
              [1, 2, 1], [1, 2, 1], 2, 5, 5,
              [[1, 2, 0, 0, 0], [1, 1, 0, 0, 0]],
              [[0, 0, 0, 0, 0], [0, 1, 0, 0, 0]],
              [[0, 2, 0, 0, 0], [1, 1, 0, 0, 0]],
              [[1, 2, 0, 0, 0], [1, 1, 0, 0, 0]],
              [[0, 0, 0, 0, 0], [0, 1, 0, 0, 0]],
              [[0, 2, 0, 0, 0], [1, 1, 0, 0, 0]]),
      'SpreadFirstN':
          PackSequenceTestCase(
              [3, 1, 2], [4, 2, 1], 2, 5, 5,
              [[1, 1, 1, 2, 2], [1, 0, 0, 0, 0]],
              [[0, 1, 2, 0, 1], [0, 0, 0, 0, 0]],
              [[0, 0, 0, 2, 2], [1, 0, 0, 0, 0]],
              [[1, 1, 1, 1, 2], [1, 1, 0, 0, 0]],
              [[0, 1, 2, 3, 0], [0, 1, 0, 0, 0]],
              [[0, 0, 0, 0, 2], [1, 1, 0, 0, 0]]),
      'DifferentSrcTgtLengths':
          PackSequenceTestCase(
              [3, 2, 1], [4, 1, 5], 2, 4, 6,
              [[1, 1, 1, 0], [1, 1, 2, 0]],
              [[0, 1, 2, 0], [0, 1, 0, 0]],
              [[0, 0, 0, 0], [1, 1, 2, 0]],
              [[1, 1, 1, 1, 0, 0], [1, 2, 2, 2, 2, 2]],
              [[0, 1, 2, 3, 0, 0], [0, 0, 1, 2, 3, 4]],
              [[0, 0, 0, 0, 0, 0], [1, 2, 2, 2, 2, 2]]),
      'Padding':
          PackSequenceTestCase(
              [1], [2], 3, 3, 3,
              [[1, 0, 0], [0, 0, 0], [0, 0, 0]],
              [[0, 0, 0], [0, 0, 0], [0, 0, 0]],
              [[0, 0, 0], [0, 0, 0], [0, 0, 0]],
              [[1, 1, 0], [0, 0, 0], [0, 0, 0]],
              [[0, 1, 0], [0, 0, 0], [0, 0, 0]],
              [[0, 0, 0], [0, 0, 0], [0, 0, 0]]),
      'DroppingInputsTooLong':
          PackSequenceTestCase(
              [1, 3, 1], [4, 1, 2], 2, 2, 3,
              [[1, 0], [0, 0]],
              [[0, 0], [0, 0]],
              [[2, 0], [0, 0]],
              [[1, 1, 0], [0, 0, 0]],
              [[0, 1, 0], [0, 0, 0]],
              [[2, 2, 0], [0, 0, 0]]),
      'DroppingNonPositiveLengths':
          PackSequenceTestCase(
              [1, 0, 1], [0, 1, 3], 2, 2, 3,
              [[1, 0], [0, 0]],
              [[0, 0], [0, 0]],
              [[2, 0], [0, 0]],
              [[1, 1, 1], [0, 0, 0]],
              [[0, 1, 2], [0, 0, 0]],
              [[2, 2, 2], [0, 0, 0]]),
      'PackedBatchSize0':
          PackSequenceTestCase(
              [3, 1, 2, 0, 1, 6, 2, 3, 4, 1, 1],
              [4, 2, 1, 1, 0, 2, 6, 1, 1, 4, 3],
              0, 5, 5,
              [[1, 1, 1, 2, 2], [1, 2, 2, 2, 0],
               [1, 1, 1, 1, 2], [1, 0, 0, 0, 0]],
              [[0, 1, 2, 0, 1], [0, 0, 1, 2, 0],
               [0, 1, 2, 3, 0], [0, 0, 0, 0, 0]],
              [[0, 0, 0, 2, 2], [1, 7, 7, 7, 0],
               [8, 8, 8, 8, 9], [10, 0, 0, 0, 0]],
              [[1, 1, 1, 1, 2], [1, 1, 2, 0, 0],
               [1, 2, 2, 2, 2], [1, 1, 1, 0, 0]],
              [[0, 1, 2, 3, 0], [0, 1, 0, 0, 0],
               [0, 0, 1, 2, 3], [0, 1, 2, 0, 0]],
              [[0, 0, 0, 0, 2], [1, 1, 7, 0, 0],
               [8, 9, 9, 9, 9], [10, 10, 10, 0, 0]]),
  }
  for label, case in scenarios.items():
    with self.session() as sess:
      # Only the two length vectors are wrapped in constants; the size
      # arguments are passed as plain Python values.
      packed = ops.pack_sequences(
          tf.constant(case.src_actual_seq_len, tf.int32),
          tf.constant(case.tgt_actual_seq_len, tf.int32),
          case.packed_batch_size,
          case.packed_src_seq_len,
          case.packed_tgt_seq_len)
      results = sess.run(packed)
      wanted = (
          case.src_segment_ids,
          case.src_segment_pos,
          case.src_indices_in_input,
          case.tgt_segment_ids,
          case.tgt_segment_pos,
          case.tgt_indices_in_input,
      )
      self.assertEqual(6, len(results), label)
      for got, want in zip(results, wanted):
        self.assertAllEqual(got, want, label)
def _ApplyPacking(self, batch):
  """Packs `batch.src` and `batch.tgt` in place and emits packing summaries.

  Note that packing may change the batch size. After this call both
  `batch.src` and `batch.tgt` carry `.segment_ids` and `.segment_pos`.

  When `self.params.packing_factor` is falsy nothing is packed:
  `.segment_ids` is set to `.ids_indicator`, and `.segment_pos` is derived
  from it with `_GetSegmentPos`.

  Args:
    batch: a `.NestedMap` of input tensors to be packed. It is modified in
      place; `batch.src` and `batch.tgt` must provide `ids_indicator` and
      `paddings`.
  """

  def _RowLengths(indicator):
    return tf.math.reduce_sum(tf.cast(indicator, tf.int32), axis=1)

  src_lengths = _RowLengths(batch.src.ids_indicator)
  tgt_lengths = _RowLengths(batch.tgt.ids_indicator)
  summary_utils.histogram('source_seq_lengths', src_lengths)
  summary_utils.histogram('target_seq_lengths', tgt_lengths)

  params = self.params
  if not params.packing_factor:
    # Supply segment_ids and segment_pos with no packing.
    for side in (batch.src, batch.tgt):
      side.segment_ids = side.ids_indicator
      side.segment_pos = _GetSegmentPos(side.ids_indicator)
    return

  packed = ops.pack_sequences(src_lengths, tgt_lengths,
                              self._ScaledBatchSize(),
                              params.source_max_length,
                              params.target_max_length)
  (src_seg_ids, src_seg_pos, src_rows,
   tgt_seg_ids, tgt_seg_pos, tgt_rows) = packed

  def _DistinctRows(rows):
    return tf.unique(tf.reshape(rows, [-1])).y

  kept_src_rows = _DistinctRows(src_rows)
  kept_tgt_rows = _DistinctRows(tgt_rows)
  summary_utils.histogram('packed_source_seq_lengths',
                          tf.gather(src_lengths, kept_src_rows, axis=0))
  summary_utils.histogram('packed_target_seq_lengths',
                          tf.gather(tgt_lengths, kept_tgt_rows, axis=0))

  def _ReportTokenRatio(tag, lengths, seg_ids):
    tokens_before = tf.cast(tf.reduce_sum(lengths), tf.float32)
    tokens_after = tf.reduce_sum(tf.cast(seg_ids > 0, tf.float32))
    summary_utils.scalar(tag, tokens_after / tokens_before)

  # Ratio of number of non-padded tokens. If < 1.0, we are dropping
  # input data due to p.packing_factor too high.
  _ReportTokenRatio('examples/src_packed_token_ratio', src_lengths,
                    src_seg_ids)
  _ReportTokenRatio('examples/tgt_packed_token_ratio', tgt_lengths,
                    tgt_seg_ids)

  def _PackSide(side, seg_ids, seg_pos, rows):
    """Returns `side` with all fields packed and segment info attached."""

    def _PackField(x):
      # String fields are filled with a tab, everything else with 0.
      fill = '\t' if x.dtype == tf.string else 0
      return ops.apply_packing(x, fill, seg_ids, rows)

    # Paddings are packed separately, from the unpacked tensor and with 1 as
    # the fill value; the result replaces the 0-filled version from Transform.
    paddings = ops.apply_packing(side.paddings, 1, seg_ids, rows)
    packed_side = side.Transform(_PackField)
    packed_side.paddings = paddings
    packed_side.segment_ids = tf.cast(seg_ids, tf.float32)
    packed_side.segment_pos = seg_pos
    return packed_side

  batch.src = _PackSide(batch.src, src_seg_ids, src_seg_pos, src_rows)
  batch.tgt = _PackSide(batch.tgt, tgt_seg_ids, tgt_seg_pos, tgt_rows)

  # The number of examples is indicated by the segment_ids of the target.
  segments_per_row = tf.math.reduce_max(batch.tgt.segment_ids, axis=1)
  # Note that this is per infeed value when p.use_per_host_infeed = True.
  summary_utils.scalar('examples/num_packed_examples',
                       tf.reduce_sum(segments_per_row))
def _Pack(self, batch):
  """Packs `batch.src` and `batch.tgt` in place.

  Note that packing may change the batch size. After this call both
  `batch.src` and `batch.tgt` carry `.segment_ids` and `.segment_pos`.

  When `self.params.packing_factor` is falsy nothing is packed:
  `.segment_ids` is set to `.ids_indicator`, and `.segment_pos` is derived
  from it with `_GetSegmentPos`.

  Args:
    batch: a `.NestedMap` of input tensors to be packed. It is modified in
      place; `batch.src` and `batch.tgt` must provide `ids_indicator`.
  """
  lengths = {}
  for side_name in ('src', 'tgt'):
    indicator = batch[side_name].ids_indicator
    lengths[side_name] = tf.math.reduce_sum(
        tf.cast(indicator, tf.int32), axis=1)
  summary_utils.histogram('source_seq_lengths', lengths['src'])
  summary_utils.histogram('target_seq_lengths', lengths['tgt'])

  params = self.params
  if not params.packing_factor:
    # Supply segment_ids and segment_pos with no packing.
    for side_name in ('src', 'tgt'):
      side = batch[side_name]
      side.segment_ids = side.ids_indicator
      side.segment_pos = _GetSegmentPos(side.ids_indicator)
    return

  packed = ops.pack_sequences(lengths['src'], lengths['tgt'],
                              self._ScaledBatchSize(),
                              params.source_max_length,
                              params.target_max_length)
  # Outputs arrive as (segment_ids, segment_pos, indices_in_input) for the
  # source followed by the same triple for the target.
  per_side = {'src': packed[0:3], 'tgt': packed[3:6]}
  assert len(packed) == 6

  distinct_rows = {}
  for side_name in ('src', 'tgt'):
    rows = per_side[side_name][2]
    distinct_rows[side_name] = tf.unique(tf.reshape(rows, [-1])).y
  summary_utils.histogram(
      'packed_source_seq_lengths',
      tf.gather(lengths['src'], distinct_rows['src'], axis=0))
  summary_utils.histogram(
      'packed_target_seq_lengths',
      tf.gather(lengths['tgt'], distinct_rows['tgt'], axis=0))

  # We deferred adding .paddings and use its complement .ids_indicator
  # exclusively so that we can apply the packing with padding set to 0 for all
  # fields.
  def _MakeFieldPacker(seg_ids, rows):

    def _PackField(x):
      # String fields are filled with a tab, everything else with 0.
      fill = '\t' if x.dtype == tf.string else 0
      return ops.apply_packing(x, fill, seg_ids, rows)

    return _PackField

  for side_name in ('src', 'tgt'):
    seg_ids, seg_pos, rows = per_side[side_name]
    packed_side = batch[side_name].Transform(_MakeFieldPacker(seg_ids, rows))
    batch[side_name] = packed_side
    packed_side.segment_ids = tf.cast(seg_ids, tf.float32)
    packed_side.segment_pos = seg_pos