Пример #1
0
    def testPackSequencesErrors(self):
        """Verifies that invalid inputs to `ops.pack_sequences` raise errors."""

        def _BuildAndRun(case):
            # The op is created inside the session scope, so an error raised
            # while building the op and one raised while running it both
            # propagate to the caller.
            with self.session() as sess:
                packed = ops.pack_sequences(case.src_actual_seq_len,
                                            case.tgt_actual_seq_len,
                                            case.packed_batch_size,
                                            case.packed_src_seq_len,
                                            case.packed_tgt_seq_len)
                sess.run(packed)

        # Each entry pairs the expected error-message regex with its inputs.
        invalid_argument_cases = [
            ('actual_seq_len must be the same shape',
             PackSequenceTestCase([1, 1, 1], [1, 1], 2, 2, 2)),
            ('actual_seq_len must be a vector',
             PackSequenceTestCase([[1], [1]], [[1], [1]], 2, 2, 2)),
        ]
        for expected_regex, case in invalid_argument_cases:
            with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
                                        expected_regex):
                _BuildAndRun(case)

        # seq_len must be a scalar: here packed_src_seq_len is a list.
        non_scalar_len = PackSequenceTestCase([1, 1], [1, 1], 2, [2, 2], 2)
        with self.assertRaisesRegex(TypeError, 'Expected int'):
            _BuildAndRun(non_scalar_len)
Пример #2
0
    def _Pack(self, batch_in):
        """Packs a given batch, which changes the batch size.

        Args:
          batch_in: the batch to pack. It must expose `.segment_ids` and
            `.paddings` and support `DeepCopy()` and `Transform()`.

        Returns:
          A transformed copy of `batch_in` whose `.paddings`, `.segment_ids`
          and `.segment_pos` are set from the packing result.
        """
        # Per-row sum of segment_ids, used as both the source and the target
        # length (assumes segment_ids is a 0/1 indicator here -- TODO confirm).
        row_lengths = tf.math.reduce_sum(
            tf.cast(batch_in.segment_ids, tf.int32), axis=1)
        max_len = self.params.max_sequence_length
        # Only the first three of the six outputs are needed.
        (packed_segment_ids, packed_segment_pos, packed_indices,
         _unused_3, _unused_4, _unused_5) = ops.pack_sequences(
             row_lengths,
             row_lengths,
             packed_batch_size=0,
             packed_src_seq_len=max_len,
             packed_tgt_seq_len=max_len)

        # Every field is packed with 0 as the fill value ...
        packed_batch = batch_in.DeepCopy().Transform(
            lambda x: ops.apply_packing(x, 0, packed_segment_ids,
                                        packed_indices))
        # ... except paddings, which are re-packed from the input with 1.
        packed_batch.paddings = ops.apply_packing(
            batch_in.paddings, 1, packed_segment_ids, packed_indices)
        packed_batch.segment_ids = tf.cast(packed_segment_ids, tf.float32)
        packed_batch.segment_pos = packed_segment_pos
        return packed_batch
Пример #3
0
 def testPackSequencesShapeUnknown(self):
   """Packs lengths fed through a placeholder created with shape=None."""
   seq_len_ph = tf.compat.v1.placeholder(tf.int32, shape=None)
   with self.session() as sess:
     packed = ops.pack_sequences(seq_len_ph, seq_len_ph, 2, 5, 5)
     results = sess.run(packed, feed_dict={seq_len_ph: np.array([1, 2, 1])})
   self.assertEqual(6, len(results))
   # Expected values for the first three outputs.
   expected_first_three = (
       [[1, 2, 0, 0, 0], [1, 1, 0, 0, 0]],
       [[0, 0, 0, 0, 0], [0, 1, 0, 0, 0]],
       [[0, 2, 0, 0, 0], [1, 1, 0, 0, 0]],
   )
   for actual, expected in zip(results[:3], expected_first_three):
     self.assertAllEqual(actual, expected)
   # The same lengths are passed twice, so outputs 3..5 must equal 0..2.
   for second_half, first_half in zip(results[3:], results[:3]):
     self.assertAllEqual(second_half, first_half)
Пример #4
0
 def testDroppingInputsNonDeterministic(self):
     """Checks that each possible surviving row occurs when no seed is given."""
     # Packing 3 rows into 1, where we need to drop two rows.
     shared_args = [[2, 1, 2], [1, 2, 1], 1, 2, 2]
     outcomes = [
         # keeping only the first row
         PackSequenceTestCase(*shared_args, [[1, 1]], [[0, 1]], [[0, 0]],
                              [[1, 0]], [[0, 0]], [[0, 0]]),
         # keeping only the second row
         PackSequenceTestCase(*shared_args, [[1, 0]], [[0, 0]], [[1, 0]],
                              [[1, 1]], [[0, 1]], [[1, 1]]),
         # keeping only the last row
         PackSequenceTestCase(*shared_args, [[1, 1]], [[0, 1]], [[2, 2]],
                              [[1, 0]], [[0, 0]], [[2, 0]]),
     ]
     hit_counts = [0 for _ in range(3)]
     with self.session() as sess:
         # All outcomes share the same inputs; read them from the first one.
         case = outcomes[0]
         for _ in range(100):
             # A fresh op is built for every run.
             op_inputs = [
                 tf.constant(value, tf.int32)
                 for value in (case.src_actual_seq_len,
                               case.tgt_actual_seq_len,
                               case.packed_batch_size,
                               case.packed_src_seq_len,
                               case.packed_tgt_seq_len)
             ]
             result = sess.run(ops.pack_sequences(*op_inputs))
             matched = FindResultFromList(result, outcomes)
             failure_msg = '{} is not a valid result'.format(result)
             self.assertIsNotNone(matched, failure_msg)
             hit_counts[matched] += 1
     # We test that all possible outcomes occur sufficiently often to ensure
     # that dropping is not biased.
     # The probability of this test failing due to chance is less than 1 in a
     # million runs, as scipy.stats.binom.cdf(10, 100, 0.3333) ~= 5e-8
     for idx, count in enumerate(hit_counts):
         too_rare_msg = (
             'test case {} does not occur sufficiently often: {}'.format(
                 idx, hit_counts))
         self.assertGreater(count, 10, too_rare_msg)
Пример #5
0
 def testDroppingInputsFixedSeed(self):
     """Checks the packing result for two explicitly seeded runs."""
     # Packing 3 rows into 2, where we need to drop one row.
     shared_args = [[2, 1, 2], [1, 2, 1], 2, 2, 2]
     # dropping the last row
     drop_last = PackSequenceTestCase(
         *shared_args,
         [[1, 1], [1, 0]], [[0, 1], [0, 0]], [[0, 0], [1, 0]],
         [[1, 0], [1, 1]], [[0, 0], [0, 1]], [[0, 0], [1, 1]])
     # dropping the second row
     drop_second = PackSequenceTestCase(
         *shared_args,
         [[1, 1], [1, 1]], [[0, 1], [0, 1]], [[0, 0], [2, 2]],
         [[1, 0], [1, 0]], [[0, 0], [0, 0]], [[0, 0], [2, 0]])

     # (seed, expected test case)
     seeded_cases = ((45, drop_last), ((1 << 37) + 1, drop_second))
     for seed, case in seeded_cases:
         with self.session() as sess:
             op_inputs = [
                 tf.constant(value, tf.int32)
                 for value in (case.src_actual_seq_len,
                               case.tgt_actual_seq_len,
                               case.packed_batch_size,
                               case.packed_src_seq_len,
                               case.packed_tgt_seq_len)
             ]
             result = sess.run(ops.pack_sequences(*op_inputs, seed=seed))
             label = 'test case with seed {}'.format(seed)
             self.assertEqual(6, len(result), label)
             # Expected values, in the order the op returns its outputs.
             expected_outputs = (
                 case.src_segment_ids,
                 case.src_segment_pos,
                 case.src_indices_in_input,
                 case.tgt_segment_ids,
                 case.tgt_segment_pos,
                 case.tgt_indices_in_input,
             )
             for actual, expected in zip(result, expected_outputs):
                 self.assertAllEqual(actual, expected, label)
Пример #6
0
 def testPackSequences(self):
     """Runs `ops.pack_sequences` on a table of inputs and checks all outputs."""
     # Each case lists the two length vectors, the three packed sizes, and
     # then the six expected outputs, one per line.
     test_cases = {
         'Basic':
             PackSequenceTestCase(
                 [1, 2, 1], [1, 2, 1], 2, 5, 5,
                 [[1, 2, 0, 0, 0], [1, 1, 0, 0, 0]],
                 [[0, 0, 0, 0, 0], [0, 1, 0, 0, 0]],
                 [[0, 2, 0, 0, 0], [1, 1, 0, 0, 0]],
                 [[1, 2, 0, 0, 0], [1, 1, 0, 0, 0]],
                 [[0, 0, 0, 0, 0], [0, 1, 0, 0, 0]],
                 [[0, 2, 0, 0, 0], [1, 1, 0, 0, 0]]),
         'SpreadFirstN':
             PackSequenceTestCase(
                 [3, 1, 2], [4, 2, 1], 2, 5, 5,
                 [[1, 1, 1, 2, 2], [1, 0, 0, 0, 0]],
                 [[0, 1, 2, 0, 1], [0, 0, 0, 0, 0]],
                 [[0, 0, 0, 2, 2], [1, 0, 0, 0, 0]],
                 [[1, 1, 1, 1, 2], [1, 1, 0, 0, 0]],
                 [[0, 1, 2, 3, 0], [0, 1, 0, 0, 0]],
                 [[0, 0, 0, 0, 2], [1, 1, 0, 0, 0]]),
         'DifferentSrcTgtLengths':
             PackSequenceTestCase(
                 [3, 2, 1], [4, 1, 5], 2, 4, 6,
                 [[1, 1, 1, 0], [1, 1, 2, 0]],
                 [[0, 1, 2, 0], [0, 1, 0, 0]],
                 [[0, 0, 0, 0], [1, 1, 2, 0]],
                 [[1, 1, 1, 1, 0, 0], [1, 2, 2, 2, 2, 2]],
                 [[0, 1, 2, 3, 0, 0], [0, 0, 1, 2, 3, 4]],
                 [[0, 0, 0, 0, 0, 0], [1, 2, 2, 2, 2, 2]]),
         'Padding':
             PackSequenceTestCase(
                 [1], [2], 3, 3, 3,
                 [[1, 0, 0], [0, 0, 0], [0, 0, 0]],
                 [[0, 0, 0], [0, 0, 0], [0, 0, 0]],
                 [[0, 0, 0], [0, 0, 0], [0, 0, 0]],
                 [[1, 1, 0], [0, 0, 0], [0, 0, 0]],
                 [[0, 1, 0], [0, 0, 0], [0, 0, 0]],
                 [[0, 0, 0], [0, 0, 0], [0, 0, 0]]),
         'DroppingInputsTooLong':
             PackSequenceTestCase(
                 [1, 3, 1], [4, 1, 2], 2, 2, 3,
                 [[1, 0], [0, 0]],
                 [[0, 0], [0, 0]],
                 [[2, 0], [0, 0]],
                 [[1, 1, 0], [0, 0, 0]],
                 [[0, 1, 0], [0, 0, 0]],
                 [[2, 2, 0], [0, 0, 0]]),
         'DroppingNonPositiveLengths':
             PackSequenceTestCase(
                 [1, 0, 1], [0, 1, 3], 2, 2, 3,
                 [[1, 0], [0, 0]],
                 [[0, 0], [0, 0]],
                 [[2, 0], [0, 0]],
                 [[1, 1, 1], [0, 0, 0]],
                 [[0, 1, 2], [0, 0, 0]],
                 [[2, 2, 2], [0, 0, 0]]),
         'PackedBatchSize0':
             PackSequenceTestCase(
                 [3, 1, 2, 0, 1, 6, 2, 3, 4, 1, 1],
                 [4, 2, 1, 1, 0, 2, 6, 1, 1, 4, 3], 0, 5, 5,
                 [[1, 1, 1, 2, 2], [1, 2, 2, 2, 0], [1, 1, 1, 1, 2],
                  [1, 0, 0, 0, 0]],
                 [[0, 1, 2, 0, 1], [0, 0, 1, 2, 0], [0, 1, 2, 3, 0],
                  [0, 0, 0, 0, 0]],
                 [[0, 0, 0, 2, 2], [1, 7, 7, 7, 0], [8, 8, 8, 8, 9],
                  [10, 0, 0, 0, 0]],
                 [[1, 1, 1, 1, 2], [1, 1, 2, 0, 0], [1, 2, 2, 2, 2],
                  [1, 1, 1, 0, 0]],
                 [[0, 1, 2, 3, 0], [0, 1, 0, 0, 0], [0, 0, 1, 2, 3],
                  [0, 1, 2, 0, 0]],
                 [[0, 0, 0, 0, 2], [1, 1, 7, 0, 0], [8, 9, 9, 9, 9],
                  [10, 10, 10, 0, 0]]),
     }
     for name, case in test_cases.items():
         with self.session() as sess:
             src_lengths = tf.constant(case.src_actual_seq_len, tf.int32)
             tgt_lengths = tf.constant(case.tgt_actual_seq_len, tf.int32)
             # The packed sizes are passed as plain Python values here.
             packed = ops.pack_sequences(src_lengths, tgt_lengths,
                                         case.packed_batch_size,
                                         case.packed_src_seq_len,
                                         case.packed_tgt_seq_len)
             result = sess.run(packed)
             self.assertEqual(6, len(result), name)
             # Expected values, in the order the op returns its outputs.
             expected_outputs = (
                 case.src_segment_ids,
                 case.src_segment_pos,
                 case.src_indices_in_input,
                 case.tgt_segment_ids,
                 case.tgt_segment_pos,
                 case.tgt_indices_in_input,
             )
             for actual, expected in zip(result, expected_outputs):
                 self.assertAllEqual(actual, expected, name)
Пример #7
0
    def _ApplyPacking(self, batch):
        """Packs `batch` in place and attaches segment information.

        Note that this may change the batch size.

        On return, `batch.src` and `batch.tgt` carry `.segment_ids` and
        `.segment_pos`. Sequence-length histograms are always emitted; the
        packing summaries are emitted only when `params.packing_factor` is
        set.

        Args:
          batch: a `.NestedMap` of input tensors to be packed. It is modified
            in place.
        """
        src_lengths = tf.math.reduce_sum(
            tf.cast(batch.src.ids_indicator, tf.int32), axis=1)
        tgt_lengths = tf.math.reduce_sum(
            tf.cast(batch.tgt.ids_indicator, tf.int32), axis=1)
        summary_utils.histogram('source_seq_lengths', src_lengths)
        summary_utils.histogram('target_seq_lengths', tgt_lengths)

        if not self.params.packing_factor:
            # Supply segment_ids and segment_pos with no packing.
            for side in (batch.src, batch.tgt):
                side.segment_ids = side.ids_indicator
                side.segment_pos = _GetSegmentPos(side.ids_indicator)
            return

        packed = ops.pack_sequences(
            src_lengths, tgt_lengths, self._ScaledBatchSize(),
            self.params.source_max_length, self.params.target_max_length)
        (src_seg_ids, src_seg_pos, src_input_idx,
         tgt_seg_ids, tgt_seg_pos, tgt_input_idx) = packed

        # Histogram the original lengths at the distinct input indices that
        # appear in the packed output.
        src_distinct_idx = tf.unique(tf.reshape(src_input_idx, [-1])).y
        tgt_distinct_idx = tf.unique(tf.reshape(tgt_input_idx, [-1])).y
        summary_utils.histogram(
            'packed_source_seq_lengths',
            tf.gather(src_lengths, src_distinct_idx, axis=0))
        summary_utils.histogram(
            'packed_target_seq_lengths',
            tf.gather(tgt_lengths, tgt_distinct_idx, axis=0))

        def _ReportPackedTokenRatio(tag, lengths, segment_ids):
            # Ratio of number of non-padded tokens. If < 1.0, we are dropping
            # input data due to p.packing_factor too high.
            original_tokens = tf.cast(tf.reduce_sum(lengths), tf.float32)
            packed_tokens = tf.reduce_sum(
                tf.cast(segment_ids > 0, tf.float32))
            summary_utils.scalar(tag, packed_tokens / original_tokens)

        _ReportPackedTokenRatio('examples/src_packed_token_ratio',
                                src_lengths, src_seg_ids)
        _ReportPackedTokenRatio('examples/tgt_packed_token_ratio',
                                tgt_lengths, tgt_seg_ids)

        def _MakePacker(segment_ids, indices_in_input):
            """Returns a per-tensor packing function for one side."""

            def _PackTensor(x):
                # String tensors are filled with a tab, everything else
                # with 0.
                fill = '\t' if x.dtype == tf.string else 0
                return ops.apply_packing(x, fill, segment_ids,
                                         indices_in_input)

            return _PackTensor

        # Paddings are packed separately with 1 as the fill value, before the
        # generic transform below runs, and then written back over the
        # transformed field.
        packed_src_paddings = ops.apply_packing(
            batch.src.paddings, 1, src_seg_ids, src_input_idx)
        batch.src = batch.src.Transform(
            _MakePacker(src_seg_ids, src_input_idx))
        batch.src.paddings = packed_src_paddings
        batch.src.segment_ids = tf.cast(src_seg_ids, tf.float32)
        batch.src.segment_pos = src_seg_pos

        packed_tgt_paddings = ops.apply_packing(
            batch.tgt.paddings, 1, tgt_seg_ids, tgt_input_idx)
        batch.tgt = batch.tgt.Transform(
            _MakePacker(tgt_seg_ids, tgt_input_idx))
        batch.tgt.paddings = packed_tgt_paddings
        batch.tgt.segment_ids = tf.cast(tgt_seg_ids, tf.float32)
        batch.tgt.segment_pos = tgt_seg_pos

        # The number of examples is indicated by the segment_ids of the
        # target: the per-row maximum, summed over rows.
        segments_per_row = tf.math.reduce_max(batch.tgt.segment_ids, axis=1)
        # Note that this is per infeed value when p.use_per_host_infeed = True.
        summary_utils.scalar('examples/num_packed_examples',
                             tf.reduce_sum(segments_per_row))
Пример #8
0
  def _Pack(self, batch):
    """Packs `batch` in place and attaches segment information.

    Note that this may change the batch size.

    On return, `batch.src` and `batch.tgt` carry `.segment_ids` and
    `.segment_pos`. Sequence-length histograms are always emitted; the packed
    length histograms are emitted only when `params.packing_factor` is set.

    Args:
      batch: a `.NestedMap` of input tensors to be packed. It is modified in
        place.
    """
    src_lengths = tf.math.reduce_sum(
        tf.cast(batch.src.ids_indicator, tf.int32), axis=1)
    tgt_lengths = tf.math.reduce_sum(
        tf.cast(batch.tgt.ids_indicator, tf.int32), axis=1)
    summary_utils.histogram('source_seq_lengths', src_lengths)
    summary_utils.histogram('target_seq_lengths', tgt_lengths)

    if not self.params.packing_factor:
      # Supply segment_ids and segment_pos with no packing.
      for side in (batch.src, batch.tgt):
        side.segment_ids = side.ids_indicator
        side.segment_pos = _GetSegmentPos(side.ids_indicator)
      return

    packed = ops.pack_sequences(
        src_lengths, tgt_lengths, self._ScaledBatchSize(),
        self.params.source_max_length, self.params.target_max_length)
    (src_seg_ids, src_seg_pos, src_input_idx,
     tgt_seg_ids, tgt_seg_pos, tgt_input_idx) = packed

    # Histogram the original lengths at the distinct input indices that appear
    # in the packed output.
    src_distinct_idx = tf.unique(tf.reshape(src_input_idx, [-1])).y
    tgt_distinct_idx = tf.unique(tf.reshape(tgt_input_idx, [-1])).y
    summary_utils.histogram(
        'packed_source_seq_lengths',
        tf.gather(src_lengths, src_distinct_idx, axis=0))
    summary_utils.histogram(
        'packed_target_seq_lengths',
        tf.gather(tgt_lengths, tgt_distinct_idx, axis=0))

    def _MakePacker(segment_ids, indices_in_input):
      """Returns a per-tensor packing function for one side."""

      def _PackTensor(x):
        # We deferred adding .paddings and use its complement .ids_indicator
        # exclusively so that we can apply the packing with padding set to 0
        # for all fields. String tensors are filled with a tab instead.
        fill = '\t' if x.dtype == tf.string else 0
        return ops.apply_packing(x, fill, segment_ids, indices_in_input)

      return _PackTensor

    batch.src = batch.src.Transform(_MakePacker(src_seg_ids, src_input_idx))
    batch.src.segment_ids = tf.cast(src_seg_ids, tf.float32)
    batch.src.segment_pos = src_seg_pos

    batch.tgt = batch.tgt.Transform(_MakePacker(tgt_seg_ids, tgt_input_idx))
    batch.tgt.segment_ids = tf.cast(tgt_seg_ids, tf.float32)
    batch.tgt.segment_pos = tgt_seg_pos