Example #1
    def pipeline(root):
      example_packer = beam_utils.PriorityExamplePacker(
          priority_feature='token_ids',
          max_lengths=dict(token_ids=10, sentence_ids=10, global_token_ids=5),
          breakpoint_features=dict(
              token_ids='long_breakpoints',
              global_token_ids='global_breakpoints'),
          cumulative_features=['sentence_ids'],
          min_packing_fraction=0.75,
          max_cache_len=5)

      result = (
          root | beam.Create(input_examples)
          | beam_utils.PackExamples(example_packer)
          | beam.Map(str))

      beam_testing.assert_that(
          result,
          beam_testing.equal_to([str(x) for x in expected_packed_examples]))
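
For context, a minimal sketch of how the `input_examples` fixture above might be built (names and values here are hypothetical; the real fixtures are defined elsewhere in the test):

    import tensorflow as tf

    def make_input_example(token_ids, sentence_ids, global_token_ids):
      # Builds a tf.train.Example with the three features the packer expects.
      example = tf.train.Example()
      features = example.features.feature
      features['token_ids'].int64_list.value.extend(token_ids)
      features['sentence_ids'].int64_list.value.extend(sentence_ids)
      features['global_token_ids'].int64_list.value.extend(global_token_ids)
      return example

    # Hypothetical inputs, small enough to fit the max_lengths above.
    input_examples = [
        make_input_example([1, 2, 3], [1, 1, 1], [1]),
        make_input_example([4, 5], [2, 2], [1]),
    ]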

Example #2

    def pipeline(root):

        question_answers = read_question_answer_csv(input_qaps,
                                                    input_documents,
                                                    data_split)
        question_answers = (
            root | 'CreateQuestionAnswers' >> beam.Create(question_answers)
            | 'ShuffleAfterCreatingQA' >> beam.Reshuffle())

        read_outputs = (
            question_answers
            | 'ReadEvidence' >> beam.ParDo(
                ReadEvidence(stories_dir=stories_dir,
                             summaries_path=summaries_path)).with_outputs())

        _ = (read_outputs[ReadEvidenceOutput.NO_STAR_END_CONTENT]
             | 'ShuffleNoStarEndContent' >> beam.Reshuffle()
             | 'WriteNoStarEndContent' >> write_to_file_fn(
                 output_prefix, 'no_star_end_content.txt'))

        _ = (read_outputs[ReadEvidenceOutput.TOO_SHORT_CONTENT]
             | 'ShuffleTooShortContent' >> beam.Reshuffle()
             | 'WriteTooShortContent' >> write_to_file_fn(
                 output_prefix, 'too_short_content.txt'))

        outputs = (
            read_outputs[ReadEvidenceOutput.SUCCESS]
            | 'ShuffleBeforeMakeExamples' >> beam.Reshuffle()
            | 'MakeExamples' >> beam.ParDo(
                MakeExamples(
                    spm_model_path=spm_model_path,
                    num_blocks_per_example=num_blocks_per_example,
                    block_overlap_length=block_overlap_length,
                    block_length=block_length,
                    max_num_annotations_per_block=max_num_annotations_per_block,
                    padding_token_id=padding_token_id,
                    cls_token_id=cls_token_id,
                    sep_token_id=sep_token_id,
                    generate_answers=generate_answers,
                    generate_summaries=generate_summaries,
                    min_rouge_l_oracle_score=min_rouge_l_oracle_score,
                    nltk_data_path=nltk_data_path)).with_outputs())

        # if generate_answers:
        #     # Write failure cases where no answer was found
        #     _ = (
        #         outputs[MakeExampleOutput.NO_ANSWER]
        #         | 'ShuffleNoAnswer' >> beam.Reshuffle()
        #         | 'SampleNoAnswer' >>
        #         beam.combiners.Sample.FixedSizeGlobally(SAMPLE_NO_ANSWER_QUESTIONS)
        #         | 'WriteNoAnswer' >> write_to_file_fn('no_answer.jsonl'))

        #     _ = (
        #         outputs[MakeExampleOutput.NO_ANSWER_TOKENIZED]
        #         | 'ShuffleNoAnswerTokenized' >> beam.Reshuffle()
        #         | 'SampleNoAnswerTokenized' >>
        #         beam.combiners.Sample.FixedSizeGlobally(SAMPLE_NO_ANSWER_QUESTIONS)
        #         | 'WriteNoAnswerTokenized' >>
        #         write_to_file_fn('no_answer_tokenized.jsonl'))

        #     # Write annotations that have been filtered out after tokenization
        #     _ = (
        #         outputs[MakeExampleOutput.SUCCESS_FILTERED_ANNOTATIONS]
        #         | 'ShuffleSuccessFilteredAnnotations' >> beam.Reshuffle()
        #         | 'FlattenSuccessFilteredAnnotations' >> beam.FlatMap(lambda x: x)
        #         | 'WriteSuccessFilteredAnnotations' >>
        #         write_to_file_fn('success.filtered_annotations.txt'))

        #     _ = (
        #         outputs[
        #             MakeExampleOutput.NO_ANSWER_TOKENIZED_FILTERED_ANNOTATIONS]
        #         | 'ShuffleNoAnswerTokenizedFilteredAnnotations' >>
        #         beam.Reshuffle()
        #         | 'FlattenNoAnswerTokenizedFilteredAnnotations' >>
        #         beam.FlatMap(lambda x: x)
        #         | 'WriteNoAnswerTokenizedFilteredAnnotations' >>
        #         write_to_file_fn('no_answer_tokenized.filtered_annotations.txt'))

        #     # Write cases where too many answer spans were found
        #     _ = (
        #         outputs[MakeExampleOutput.TOO_MANY_ANSWERS]
        #         | 'ShuffleTooManyAnswers' >> beam.Reshuffle()
        #         | 'WriteTooManyAnswers' >>
        #         write_to_file_fn('too_many_answers.jsonl'))

        max_tokens = num_blocks_per_example * block_length
        max_num_annotations = num_blocks_per_example * max_num_annotations_per_block
        max_lengths = dict(token_ids=max_tokens,
                           is_continuation=max_tokens,
                           block_ids=num_blocks_per_example,
                           answer_annotation_begins=max_num_annotations,
                           answer_annotation_ends=max_num_annotations,
                           answer_annotation_labels=max_num_annotations,
                           entity_annotation_begins=max_num_annotations,
                           entity_annotation_ends=max_num_annotations,
                           entity_annotation_labels=max_num_annotations,
                           prefix_length=num_blocks_per_example)

        if generate_summaries:
            max_lengths['summary_token_ids'] = max_tokens

        example_packer = beam_utils.PriorityExamplePacker(
            priority_feature='token_ids',
            max_lengths=max_lengths,
            breakpoint_features=dict(),
            cumulative_features=[],
            min_packing_fraction=1.0,
            max_cache_len=num_blocks_per_example)
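        # min_packing_fraction=1.0 means the packer emits an example from its
        # cache only once it is completely full; partially filled examples are
        # held until eviction or the final flush.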
        _ = (outputs[MakeExampleOutput.SUCCESS]
             | 'ShuffleBeforePacking' >> beam.Reshuffle()
             | 'PackExamples' >> beam_utils.PackExamples(example_packer)
             | 'ShuffleAfterPacking' >> beam.Reshuffle()
             | 'WriteTfExamples' >> beam.io.WriteToTFRecord(
                 output_prefix + '.tfrecord',
                 coder=beam.coders.ProtoCoder(tf.train.Example),
                 num_shards=output_num_shards))
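
A minimal sketch of how such a `pipeline(root)` function is typically driven (assuming the standard Beam Python API; runner options are omitted):

    import apache_beam as beam

    # Building the pipeline inside the `with` block runs it on exit.
    with beam.Pipeline() as root:
      pipeline(root)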

Example #3
  def test_priority_example_packer_for_read_it_twice_model(self):
    """Test for PriorityExamplePacker with read-it-twice model's data format.

    For brevity reasons, we omit `is_continuation` feature.
    Note that in all examples below the `block_length` is 5
    """

    # Test case 1: Simple combination of blocks
    packer = beam_utils.PriorityExamplePacker(
        priority_feature='token_ids',
        max_lengths=dict(token_ids=20, block_ids=4),
        breakpoint_features={},
        cumulative_features=[],
        min_packing_fraction=1.0,
        max_cache_len=2)

    self.assertEqual([],
                     packer.add_example(
                         text_format.Parse(
                             """
      features {
        feature {
          key: "token_ids"
          value {
            int64_list {
              value: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
            }
          }
        }
        feature {
          key: "block_ids"
          value {
            int64_list {
              value: [1, 1]
            }
          }
        }
      }
      """, tf.train.Example())))

    self.assertEqual([],
                     packer.add_example(
                         text_format.Parse(
                             """
      features {
        feature {
          key: "token_ids"
          value {
            int64_list {
              value: [1, 0, 0, 0, 0]
            }
          }
        }
        feature {
          key: "block_ids"
          value {
            int64_list {
              value: [2]
            }
          }
        }
      }
      """, tf.train.Example())))

    self.assertEqual([
        text_format.Parse(
            """
            features {
              feature {
                key: "token_ids"
                value {
                  int64_list {
                    value: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                            1, 0, 0, 0, 0, 1, 0, 0, 0, 0]
                  }
                }
              }
              feature {
                key: "block_ids"
                value {
                  int64_list {
                    value: [1, 1, 2, 5]
                  }
                }
              }
              feature {
                key: "packing_status"
                value {
                  bytes_list {
                    value: ['packed']
                  }
                }
              }
            }
            """, tf.train.Example())
    ],
                     packer.add_example(
                         text_format.Parse(
                             """
      features {
        feature {
          key: "token_ids"
          value {
            int64_list {
              value: [1, 0, 0, 0, 0]
            }
          }
        }
        feature {
          key: "block_ids"
          value {
            int64_list {
              value: [5]
            }
          }
        }
      }
      """, tf.train.Example())))

    # Test case 2: the cached example is padded and emitted by `flush_examples`
    packer = beam_utils.PriorityExamplePacker(
        priority_feature='token_ids',
        max_lengths=dict(token_ids=20, block_ids=4),
        breakpoint_features={},
        cumulative_features=[],
        min_packing_fraction=1.0,
        max_cache_len=2,
        padding_token_ids=dict(token_ids=-1, block_ids=0))
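    # Unlike test case 1, explicit padding values are supplied here: on flush,
    # `token_ids` are padded with -1 and `block_ids` with 0 (see the expected
    # output below).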

    self.assertEqual([],
                     packer.add_example(
                         text_format.Parse(
                             """
      features {
        feature {
          key: "token_ids"
          value {
            int64_list {
              value: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
            }
          }
        }
        feature {
          key: "block_ids"
          value {
            int64_list {
              value: [1, 1]
            }
          }
        }
      }
      """, tf.train.Example())))

    self.assertEqual([],
                     packer.add_example(
                         text_format.Parse(
                             """
      features {
        feature {
          key: "token_ids"
          value {
            int64_list {
              value: [1, 0, 0, 0, 0]
            }
          }
        }
        feature {
          key: "block_ids"
          value {
            int64_list {
              value: [2]
            }
          }
        }
      }
      """, tf.train.Example())))

    self.assertEqual([
        text_format.Parse(
            """
            features {
              feature {
                key: "token_ids"
                value {
                  int64_list {
                    value: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                            1, 0, 0, 0, 0, -1, -1, -1, -1, -1]
                  }
                }
              }
              feature {
                key: "block_ids"
                value {
                  int64_list {
                    value: [1, 1, 2, 0]
                  }
                }
              }
              feature {
                key: "packing_status"
                value {
                  bytes_list {
                    value: ['flushed']
                  }
                }
              }
            }
            """, tf.train.Example())
    ], packer.flush_examples())

Example #4
  def test_priority_example_packer(self):

    def make_test_example(num_values,
                          token_pad_num=None,
                          global_token_pad_num=None,
                          packing_status=None):
      """Makes a tf.Example for testing packing logic.

      The result will have the following features:
      1. An int64 `token_ids` feature containing `num_values` values counting
        from 1 upwards. If `token_pad_num` is supplied, additional `0` values
        will be appended to the right until the number of values reaches
        `token_pad_num`.
      2. An int64 `global_token_ids` feature containing a single `1` value.
        If `global_token_pad_num` is supplied, additional `0` values
        will be appended to the right until the number of values reaches
        `global_token_pad_num`.
      3. If `packing_status` is supplied, a feature by that name will be added
        with the corresponding bytes value.

      Args:
        num_values: Positive integer number of `token_ids` in the example.
        token_pad_num: Optional length to pad `token_ids` to. Must not be less
          than `num_values`.
        global_token_pad_num: Optional length to pad `global_token_ids` to. Must
          be positive.
        packing_status: Optional bytes value to record in a `packing_status`
          feature.

      Returns:
        The example.
      """
      values = [x + 1 for x in range(num_values)]
      if token_pad_num is not None:
        while len(values) < token_pad_num:
          values.append(0)
      result = tf.train.Example()
      result.features.feature['token_ids'].int64_list.value.extend(values)

      global_values = [1]
      if global_token_pad_num is not None:
        remaining_len = global_token_pad_num - len(global_values)
        global_values.extend([0] * remaining_len)
      result.features.feature['global_token_ids'].int64_list.value.extend(
          global_values)

      if packing_status is not None:
        result.features.feature['packing_status'].bytes_list.value.append(
            packing_status)

      return result

    # Some example calls to `make_test_example()` for illustration:
    self.assertEqual(
        text_format.Parse(
            """
        features {
          feature {
            key: "token_ids"
            value {
              int64_list {
                value: [1, 2, 3, 4, 5]
              }
            }
          }
          feature {
            key: "global_token_ids"
            value {
              int64_list {
                value: [1]
              }
            }
          }
        }
        """, tf.train.Example()), make_test_example(5))

    self.assertEqual(
        text_format.Parse(
            """
        features {
          feature {
            key: "token_ids"
            value {
              int64_list {
                value: [1, 2, 3, 4, 5, 6, 0, 0, 0, 0]
              }
            }
          }
          feature {
            key: "global_token_ids"
            value {
              int64_list {
                value: [1, 0, 0, 0]
              }
            }
          }
          feature {
            key: "packing_status"
            value {
              bytes_list {
                value: ['untouched']
              }
            }
          }
        }
        """, tf.train.Example()),
        make_test_example(
            6,
            token_pad_num=10,
            global_token_pad_num=4,
            packing_status=b'untouched'))

    # Proceed to test `PriorityExamplePacker`.

    # `breakpoint_features` and `cumulative_features` are tested in
    # `test_pack_examples()` above, so we omit them here for brevity.
    packer = beam_utils.PriorityExamplePacker(
        priority_feature='token_ids',
        max_lengths=dict(token_ids=10, global_token_ids=4),
        breakpoint_features={},
        cumulative_features=[],
        min_packing_fraction=0.85,
        max_cache_len=2)
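    # With max_cache_len=2, a third example that packs with neither cached
    # entry forces an eviction: below, the 8-token example is emitted padded
    # to the max lengths with status 'evicted'.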

    self.assertEqual([], packer.add_example(make_test_example(8)))
    self.assertEqual([], packer.add_example(make_test_example(4)))
    self.assertEqual([
        make_test_example(
            8,
            token_pad_num=10,
            global_token_pad_num=4,
            packing_status=b'evicted')
    ], packer.add_example(make_test_example(7)))
    self.assertEqual([
        text_format.Parse(
            """
            features {
              feature {
                key: "token_ids"
                value {
                  int64_list {
                    value: [1, 2, 3, 4, 5, 6, 7, 1, 2, 0]
                  }
                }
              }
              feature {
                key: "global_token_ids"
                value {
                  int64_list {
                    value: [1, 1, 0, 0]
                  }
                }
              }
              feature {
                key: "packing_status"
                value {
                  bytes_list {
                    value: ['packed']
                  }
                }
              }
            }
            """, tf.train.Example())
    ], packer.add_example(make_test_example(2)))
    self.assertEqual([], packer.add_example(make_test_example(1)))
    self.assertEqual([], packer.add_example(make_test_example(1)))
    self.assertEqual([], packer.add_example(make_test_example(5)))

    # Here the packing threshold is reached via `global_token_ids` (4 of 4)
    # rather than `token_ids` (7 of 10).
    self.assertEqual([
        text_format.Parse(
            """
            features {
              feature {
                key: "token_ids"
                value {
                  int64_list {
                    value: [1, 2, 3, 4, 1, 1, 1, 0, 0, 0]
                  }
                }
              }
              feature {
                key: "global_token_ids"
                value {
                  int64_list {
                    value: [1, 1, 1, 1]
                  }
                }
              }
              feature {
                key: "packing_status"
                value {
                  bytes_list {
                    value: ['packed']
                  }
                }
              }
            }
            """, tf.train.Example())
    ], packer.add_example(make_test_example(1)))

    self.assertEqual([
        make_test_example(
            9,
            token_pad_num=10,
            global_token_pad_num=4,
            packing_status=b'untouched')
    ], packer.add_example(make_test_example(9)))

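    # Examples that exceed any entry of `max_lengths` on their own can never
    # be packed, so `add_example` raises instead of caching them.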
    with self.assertRaises(ValueError):
      packer.add_example(make_test_example(11))

    with self.assertRaises(ValueError):
      packer.add_example(make_test_example(3, global_token_pad_num=5))

    self.assertEqual([], packer.add_example(make_test_example(8)))
    self.assertEqual([
        make_test_example(
            8,
            token_pad_num=10,
            global_token_pad_num=4,
            packing_status=b'flushed'),
        make_test_example(
            5,
            token_pad_num=10,
            global_token_pad_num=4,
            packing_status=b'flushed')
    ], packer.flush_examples())
    self.assertEqual([], packer.flush_examples())

Example #5
    def pipeline(root):
        question_answers = read_question_answer_json(input_file)
        question_answers = (
            root | 'CreateQuestionAnswers' >> beam.Create(question_answers))

        outputs = (
            question_answers
            | 'ReadEvidence' >> beam.ParDo(
                ReadEvidence(wikipedia_dir=wikipedia_dir, web_dir=web_dir))
            | 'ShuffleBeforeMakeExamples' >> beam.Reshuffle()
            | 'MakeExamples' >> beam.ParDo(
                MakeExamples(
                    spm_model_path=spm_model_path,
                    num_blocks_per_example=num_blocks_per_example,
                    block_overlap_length=block_overlap_length,
                    block_length=block_length,
                    max_num_annotations_per_block=max_num_annotations_per_block,
                    padding_token_id=padding_token_id,
                    cls_token_id=cls_token_id,
                    sep_token_id=sep_token_id,
                    generate_answers=generate_answers,
                    nltk_data_path=nltk_data_path)).with_outputs())

        if generate_answers:

            # Write failure cases where no answer was found
            _ = (outputs[MakeExampleOutput.NO_ANSWER]
                 | 'WriteNoAnswer' >> write_to_file_fn(output_prefix,
                                                       'no_answer.jsonl'))

            _ = (outputs[MakeExampleOutput.NO_ANSWER_TOKENIZED]
                 | 'WriteNoAnswerTokenized' >> write_to_file_fn(
                     output_prefix, 'no_answer_tokenized.jsonl'))

            # Write annotations that have been filtered out after tokenization
            _ = (outputs[MakeExampleOutput.SUCCESS_FILTERED_ANNOTATIONS]
                 | 'FlattenSuccessFilteredAnnotations' >>
                 beam.FlatMap(lambda x: x)
                 | 'WriteSuccessFilteredAnnotations' >> write_to_file_fn(
                     output_prefix, 'success.filtered_annotations.txt'))

            _ = (outputs[
                MakeExampleOutput.NO_ANSWER_TOKENIZED_FILTERED_ANNOTATIONS]
                 | 'FlattenNoAnswerTokenizedFilteredAnnotations' >>
                 beam.FlatMap(lambda x: x)
                 | 'WriteNoAnswerTokenizedFilteredAnnotations' >>
                 write_to_file_fn(
                     output_prefix,
                     'no_answer_tokenized.filtered_annotations.txt'))

            # Write cases where too many answer spans were found
            _ = (outputs[MakeExampleOutput.TOO_MANY_ANSWERS]
                 | 'WriteTooManyAnswers' >> write_to_file_fn(
                     output_prefix, 'too_many_answers.jsonl'))

        max_tokens = num_blocks_per_example * block_length
        max_num_annotations = num_blocks_per_example * max_num_annotations_per_block
        example_packer = beam_utils.PriorityExamplePacker(
            priority_feature='token_ids',
            max_lengths=dict(token_ids=max_tokens,
                             is_continuation=max_tokens,
                             block_ids=num_blocks_per_example,
                             answer_annotation_begins=max_num_annotations,
                             answer_annotation_ends=max_num_annotations,
                             answer_annotation_labels=max_num_annotations,
                             entity_annotation_begins=max_num_annotations,
                             entity_annotation_ends=max_num_annotations,
                             entity_annotation_labels=max_num_annotations,
                             prefix_length=num_blocks_per_example),
            breakpoint_features=dict(),
            cumulative_features=[],
            min_packing_fraction=1.0,
            max_cache_len=num_blocks_per_example)
        _ = (outputs[MakeExampleOutput.SUCCESS]
             | 'ShuffleBeforePacking' >> beam.Reshuffle()
             | 'PackExamples' >> beam_utils.PackExamples(example_packer)
             | 'ShuffleAfterPacking' >> beam.Reshuffle()
             | 'WriteTfExamples' >> beam.io.WriteToTFRecord(
                 output_prefix + '.tfrecord',
                 coder=beam.coders.ProtoCoder(tf.train.Example),
                 num_shards=output_num_shards))
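
A minimal sketch of inspecting the packed TFRecord output afterwards (assuming standard TensorFlow APIs; the glob pattern follows from the sharded `WriteToTFRecord` call above):

    import tensorflow as tf

    # WriteToTFRecord writes sharded files named like
    # '<output_prefix>.tfrecord-00000-of-000NN'.
    dataset = tf.data.TFRecordDataset(
        tf.io.gfile.glob(output_prefix + '.tfrecord-*'))
    for raw_record in dataset.take(1):
      packed = tf.train.Example.FromString(raw_record.numpy())
      print(packed.features.feature['token_ids'])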