def pipeline(root):
  """Packs `input_examples` and asserts the packed output matches expectations."""
  packer = beam_utils.PriorityExamplePacker(
      priority_feature='token_ids',
      max_lengths=dict(token_ids=10, sentence_ids=10, global_token_ids=5),
      breakpoint_features=dict(
          token_ids='long_breakpoints',
          global_token_ids='global_breakpoints'),
      cumulative_features=['sentence_ids'],
      min_packing_fraction=0.75,
      max_cache_len=5)
  # Build the pipeline in named stages rather than one chained expression.
  examples = root | beam.Create(input_examples)
  packed = examples | beam_utils.PackExamples(packer)
  stringified = packed | beam.Map(str)
  # Compare stringified protos so ordering/equality semantics are simple.
  beam_testing.assert_that(
      stringified,
      beam_testing.equal_to([str(x) for x in expected_packed_examples]))
def pipeline(root):
  """Builds the CSV-based question-answering preprocessing pipeline.

  Reads question/answer rows from CSV, joins each with document evidence via
  `ReadEvidence`, converts successful reads into tf.train.Examples with
  `MakeExamples`, packs them with `PriorityExamplePacker`, and writes the
  result as TFRecord shards. Evidence-read failures are written to side
  text files for inspection.
  """
  question_answers = read_question_answer_csv(input_qaps, input_documents,
                                              data_split)
  question_answers = (
      root
      | 'CreateQuestionAnswers' >> beam.Create(question_answers)
      | 'ShuffleAfterCreatingQA' >> beam.Reshuffle())
  # ReadEvidence emits tagged outputs: SUCCESS plus two failure categories.
  read_outputs = (
      question_answers
      | 'ReadEvidence' >> beam.ParDo(
          ReadEvidence(stories_dir=stories_dir,
                       summaries_path=summaries_path)).with_outputs())
  _ = (read_outputs[ReadEvidenceOutput.NO_STAR_END_CONTENT]
       | 'ShuffleNoStarEndContent' >> beam.Reshuffle()
       | 'WriteNoStarEndContent' >> write_to_file_fn(
           output_prefix, 'no_star_end_content.txt'))
  _ = (read_outputs[ReadEvidenceOutput.TOO_SHORT_CONTENT]
       | 'ShuffleTooShortContent' >> beam.Reshuffle()
       | 'WriteTooShortContent' >> write_to_file_fn(
           output_prefix, 'too_short_content.txt'))
  outputs = (
      read_outputs[ReadEvidenceOutput.SUCCESS]
      | 'ShuffleBeforeMakeExamples' >> beam.Reshuffle()
      | 'MakeExamples' >> beam.ParDo(
          MakeExamples(
              spm_model_path=spm_model_path,
              num_blocks_per_example=num_blocks_per_example,
              block_overlap_length=block_overlap_length,
              block_length=block_length,
              max_num_annotations_per_block=max_num_annotations_per_block,
              padding_token_id=padding_token_id,
              cls_token_id=cls_token_id,
              sep_token_id=sep_token_id,
              generate_answers=generate_answers,
              generate_summaries=generate_summaries,
              min_rouge_l_oracle_score=min_rouge_l_oracle_score,
              nltk_data_path=nltk_data_path)).with_outputs())
  # NOTE(review): a large commented-out section that dumped MakeExamples
  # failure outputs (NO_ANSWER, filtered annotations, TOO_MANY_ANSWERS) was
  # removed as dead code; recover it from version control if those debug
  # dumps are needed again.
  max_tokens = num_blocks_per_example * block_length
  max_num_annotations = num_blocks_per_example * max_num_annotations_per_block
  max_lengths = dict(
      token_ids=max_tokens,
      is_continuation=max_tokens,
      block_ids=num_blocks_per_example,
      answer_annotation_begins=max_num_annotations,
      answer_annotation_ends=max_num_annotations,
      answer_annotation_labels=max_num_annotations,
      entity_annotation_begins=max_num_annotations,
      entity_annotation_ends=max_num_annotations,
      entity_annotation_labels=max_num_annotations,
      prefix_length=num_blocks_per_example)
  if generate_summaries:
    max_lengths['summary_token_ids'] = max_tokens
  # min_packing_fraction=1.0: only emit examples packed to full capacity;
  # the rest are held until flushed.
  example_packer = beam_utils.PriorityExamplePacker(
      priority_feature='token_ids',
      max_lengths=max_lengths,
      breakpoint_features=dict(),
      cumulative_features=[],
      min_packing_fraction=1.0,
      max_cache_len=num_blocks_per_example)
  _ = (outputs[MakeExampleOutput.SUCCESS]
       | 'ShuffleBeforePacking' >> beam.Reshuffle()
       | 'PackExamples' >> beam_utils.PackExamples(example_packer)
       | 'ShuffleAfterPacking' >> beam.Reshuffle()
       | 'WriteTfExamples' >> beam.io.WriteToTFRecord(
           # was os.path.join(output_prefix + '.tfrecord'): a single-argument
           # os.path.join is a no-op, so use plain concatenation.
           output_prefix + '.tfrecord',
           coder=beam.coders.ProtoCoder(tf.train.Example),
           num_shards=output_num_shards))
def test_priority_example_packer_for_read_it_twice_model(self):
  """Test for PriorityExamplePacker with read-it-twice model's data format.

  For brevity reasons, we omit `is_continuation` feature. Note that in all
  examples below the `block_length` is 5
  """
  # Test case 1: Simple combination of blocks
  packer = beam_utils.PriorityExamplePacker(
      priority_feature='token_ids',
      max_lengths=dict(token_ids=20, block_ids=4),
      breakpoint_features={},
      cumulative_features=[],
      min_packing_fraction=1.0,
      max_cache_len=2)
  # A 2-block example (10 tokens) does not fill `token_ids` (max 20), so the
  # packer caches it and emits nothing yet.
  self.assertEqual(
      [],
      packer.add_example(
          text_format.Parse(
              """
              features {
                feature { key: "token_ids" value { int64_list { value: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] } } }
                feature { key: "block_ids" value { int64_list { value: [1, 1] } } }
              }
              """, tf.train.Example())))
  # A single padded block (5 tokens) still leaves room; also cached.
  self.assertEqual(
      [],
      packer.add_example(
          text_format.Parse(
              """
              features {
                feature { key: "token_ids" value { int64_list { value: [1, 0, 0, 0, 0] } } }
                feature { key: "block_ids" value { int64_list { value: [2] } } }
              }
              """, tf.train.Example())))
  # The third block brings `token_ids` to exactly 20; with
  # min_packing_fraction=1.0 a fully packed example is emitted, marked
  # with packing_status 'packed'.
  self.assertEqual(
      [
          text_format.Parse(
              """
              features {
                feature { key: "token_ids" value { int64_list { value: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0] } } }
                feature { key: "block_ids" value { int64_list { value: [1, 1, 2, 5] } } }
                feature { key: "packing_status" value { bytes_list { value: ['packed'] } } }
              }
              """, tf.train.Example())
      ],
      packer.add_example(
          text_format.Parse(
              """
              features {
                feature { key: "token_ids" value { int64_list { value: [1, 0, 0, 0, 0] } } }
                feature { key: "block_ids" value { int64_list { value: [5] } } }
              }
              """, tf.train.Example())))
  # Test case 2: The block is expected to be filled in `flush_examples`
  packer = beam_utils.PriorityExamplePacker(
      priority_feature='token_ids',
      max_lengths=dict(token_ids=20, block_ids=4),
      breakpoint_features={},
      cumulative_features=[],
      min_packing_fraction=1.0,
      max_cache_len=2,
      padding_token_ids=dict(token_ids=-1, block_ids=0))
  self.assertEqual(
      [],
      packer.add_example(
          text_format.Parse(
              """
              features {
                feature { key: "token_ids" value { int64_list { value: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] } } }
                feature { key: "block_ids" value { int64_list { value: [1, 1] } } }
              }
              """, tf.train.Example())))
  self.assertEqual(
      [],
      packer.add_example(
          text_format.Parse(
              """
              features {
                feature { key: "token_ids" value { int64_list { value: [1, 0, 0, 0, 0] } } }
                feature { key: "block_ids" value { int64_list { value: [2] } } }
              }
              """, tf.train.Example())))
  # `flush_examples` pads the remaining space with the configured
  # `padding_token_ids` (-1 for token_ids, 0 for block_ids) and marks the
  # result 'flushed' rather than 'packed'.
  self.assertEqual(
      [
          text_format.Parse(
              """
              features {
                feature { key: "token_ids" value { int64_list { value: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 0, 0, 0, 0, -1, -1, -1, -1, -1] } } }
                feature { key: "block_ids" value { int64_list { value: [1, 1, 2, 0] } } }
                feature { key: "packing_status" value { bytes_list { value: ['flushed'] } } }
              }
              """, tf.train.Example())
      ],
      packer.flush_examples())
def test_priority_example_packer(self):

  def make_test_example(num_values,
                        token_pad_num=None,
                        global_token_pad_num=None,
                        packing_status=None):
    """Makes a tf.Example for testing packing logic.

    The result will have the following features:
      1. An int64 `token_ids` feature containing `num_values` values counting
         from 1 upwards. If `token_pad_num` is supplied, additional `0` values
         will be appended to the right until the number of values reaches
         `token_pad_num`.
      2. An int64 `global_token_ids` feature containing a single `1` value.
         If `global_token_pad_num` is supplied, additional `0` values will be
         appended to the right until the number of values reaches
         `global_token_pad_num`.
      3. If `packing_status` is supplied, a feature by that name will be added
         with the corresponding bytes value.

    Args:
      num_values: Positive integer number of `token_ids` in the example.
      token_pad_num: Optional length to pad `token_ids` to. Must not be less
        than `num_values`.
      global_token_pad_num: Optional length to pad `global_token_ids` to. Must
        be positive.
      packing_status: Optional bytes value to record in a `packing_status`
        feature.

    Returns:
      The example.
    """
    values = [x + 1 for x in range(num_values)]
    if token_pad_num is not None:
      # Right-pad token_ids with zeros up to `token_pad_num`.
      while len(values) < token_pad_num:
        values.append(0)
    result = tf.train.Example()
    result.features.feature['token_ids'].int64_list.value.extend(values)
    global_values = [1]
    if global_token_pad_num is not None:
      # Right-pad global_token_ids with zeros up to `global_token_pad_num`.
      remaining_len = global_token_pad_num - len(global_values)
      global_values.extend([0] * remaining_len)
    result.features.feature['global_token_ids'].int64_list.value.extend(
        global_values)
    if packing_status is not None:
      result.features.feature['packing_status'].bytes_list.value.append(
          packing_status)
    return result

  # Some example calls to `make_test_example()` for illustration:
  self.assertEqual(
      text_format.Parse(
          """
          features {
            feature { key: "token_ids" value { int64_list { value: [1, 2, 3, 4, 5] } } }
            feature { key: "global_token_ids" value { int64_list { value: [1] } } }
          }
          """, tf.train.Example()), make_test_example(5))
  self.assertEqual(
      text_format.Parse(
          """
          features {
            feature { key: "token_ids" value { int64_list { value: [1, 2, 3, 4, 5, 6, 0, 0, 0, 0] } } }
            feature { key: "global_token_ids" value { int64_list { value: [1, 0, 0, 0] } } }
            feature { key: "packing_status" value { bytes_list { value: ['untouched'] } } }
          }
          """, tf.train.Example()),
      make_test_example(
          6,
          token_pad_num=10,
          global_token_pad_num=4,
          packing_status=b'untouched'))

  # Proceed to test `PriorityExamplePacker`.
  # `breakpoint_features` and `cumulative_features` are tested in
  # `test_pack_examples()` above, so we omit them here for brevity.
  packer = beam_utils.PriorityExamplePacker(
      priority_feature='token_ids',
      max_lengths=dict(token_ids=10, global_token_ids=4),
      breakpoint_features={},
      cumulative_features=[],
      min_packing_fraction=0.85,
      max_cache_len=2)
  # Neither 8 nor 8+4 tokens triggers emission, so both are cached.
  self.assertEqual([], packer.add_example(make_test_example(8)))
  self.assertEqual([], packer.add_example(make_test_example(4)))
  # Adding a third example exceeds max_cache_len=2: the 8-token example is
  # emitted unpacked, marked 'evicted'.
  self.assertEqual([
      make_test_example(
          8,
          token_pad_num=10,
          global_token_pad_num=4,
          packing_status=b'evicted')
  ], packer.add_example(make_test_example(7)))
  # 7 + 2 = 9 tokens >= 0.85 * 10, so the pair is emitted as 'packed'.
  self.assertEqual([
      text_format.Parse(
          """
          features {
            feature { key: "token_ids" value { int64_list { value: [1, 2, 3, 4, 5, 6, 7, 1, 2, 0] } } }
            feature { key: "global_token_ids" value { int64_list { value: [1, 1, 0, 0] } } }
            feature { key: "packing_status" value { bytes_list { value: ['packed'] } } }
          }
          """, tf.train.Example())
  ], packer.add_example(make_test_example(2)))
  self.assertEqual([], packer.add_example(make_test_example(1)))
  self.assertEqual([], packer.add_example(make_test_example(1)))
  self.assertEqual([], packer.add_example(make_test_example(5)))
  # We satisfy minimum `global_token_ids` instead of `token_ids` here.
  self.assertEqual([
      text_format.Parse(
          """
          features {
            feature { key: "token_ids" value { int64_list { value: [1, 2, 3, 4, 1, 1, 1, 0, 0, 0] } } }
            feature { key: "global_token_ids" value { int64_list { value: [1, 1, 1, 1] } } }
            feature { key: "packing_status" value { bytes_list { value: ['packed'] } } }
          }
          """, tf.train.Example())
  ], packer.add_example(make_test_example(1)))
  # A 9-token example already satisfies 0.85 * 10 on its own, so it is
  # emitted immediately as 'untouched'.
  self.assertEqual([
      make_test_example(
          9,
          token_pad_num=10,
          global_token_pad_num=4,
          packing_status=b'untouched')
  ], packer.add_example(make_test_example(9)))
  # Examples exceeding any max_lengths entry are rejected outright.
  with self.assertRaises(ValueError):
    packer.add_example(make_test_example(11))
  with self.assertRaises(ValueError):
    packer.add_example(make_test_example(3, global_token_pad_num=5))
  self.assertEqual([], packer.add_example(make_test_example(8)))
  # Flushing emits all cached examples (padded) with status 'flushed';
  # a second flush has nothing left to emit.
  self.assertEqual([
      make_test_example(
          8,
          token_pad_num=10,
          global_token_pad_num=4,
          packing_status=b'flushed'),
      make_test_example(
          5,
          token_pad_num=10,
          global_token_pad_num=4,
          packing_status=b'flushed')
  ], packer.flush_examples())
  self.assertEqual([], packer.flush_examples())
def pipeline(root):
  """Builds the JSON-based question-answering preprocessing pipeline.

  Reads question/answer pairs from a JSON file, joins each with evidence
  from the Wikipedia/web directories via `ReadEvidence`, converts them into
  tf.train.Examples with `MakeExamples`, packs them with
  `PriorityExamplePacker`, and writes the result as TFRecord shards. When
  `generate_answers` is set, the various failure outputs of `MakeExamples`
  are written to side files for inspection.
  """
  question_answers = read_question_answer_json(input_file)
  question_answers = (
      root
      | 'CreateQuestionAnswers' >> beam.Create(question_answers))
  outputs = (
      question_answers
      | 'ReadEvidence' >> beam.ParDo(
          ReadEvidence(wikipedia_dir=wikipedia_dir, web_dir=web_dir))
      | 'ShuffleBeforeMakeExamples' >> beam.Reshuffle()
      | 'MakeExamples' >> beam.ParDo(
          MakeExamples(
              spm_model_path=spm_model_path,
              num_blocks_per_example=num_blocks_per_example,
              block_overlap_length=block_overlap_length,
              block_length=block_length,
              max_num_annotations_per_block=max_num_annotations_per_block,
              padding_token_id=padding_token_id,
              cls_token_id=cls_token_id,
              sep_token_id=sep_token_id,
              generate_answers=generate_answers,
              nltk_data_path=nltk_data_path)).with_outputs())
  if generate_answers:
    # Write failure cases, when no answer was found
    _ = (outputs[MakeExampleOutput.NO_ANSWER]
         | 'WriteNoAnswer' >> write_to_file_fn(output_prefix,
                                               'no_answer.jsonl'))
    _ = (outputs[MakeExampleOutput.NO_ANSWER_TOKENIZED]
         | 'WriteNoAnswerTokenized' >> write_to_file_fn(
             output_prefix, 'no_answer_tokenized.jsonl'))
    # Write annotations that have been filtered out after tokenization
    _ = (outputs[MakeExampleOutput.SUCCESS_FILTERED_ANNOTATIONS]
         | 'FlattenSuccessFilteredAnnotations' >> beam.FlatMap(lambda x: x)
         | 'WriteSuccessFilteredAnnotations' >> write_to_file_fn(
             output_prefix, 'success.filtered_annotations.txt'))
    _ = (outputs[MakeExampleOutput.NO_ANSWER_TOKENIZED_FILTERED_ANNOTATIONS]
         | 'FlattenNoAnswerTokenizedFilteredAnnotations' >>
         beam.FlatMap(lambda x: x)
         | 'WriteNoAnswerTokenizedFilteredAnnotations' >> write_to_file_fn(
             output_prefix, 'no_answer_tokenized.filtered_annotations.txt'))
    # Write cases where the too many answer spans were found
    _ = (outputs[MakeExampleOutput.TOO_MANY_ANSWERS]
         | 'WriteTooManyAnswers' >> write_to_file_fn(
             output_prefix, 'too_many_answers.jsonl'))
  max_tokens = num_blocks_per_example * block_length
  max_num_annotations = num_blocks_per_example * max_num_annotations_per_block
  # Build `max_lengths` as a named dict first (consistent with the CSV-based
  # pipeline in this file) instead of inline in the constructor call.
  max_lengths = dict(
      token_ids=max_tokens,
      is_continuation=max_tokens,
      block_ids=num_blocks_per_example,
      answer_annotation_begins=max_num_annotations,
      answer_annotation_ends=max_num_annotations,
      answer_annotation_labels=max_num_annotations,
      entity_annotation_begins=max_num_annotations,
      entity_annotation_ends=max_num_annotations,
      entity_annotation_labels=max_num_annotations,
      prefix_length=num_blocks_per_example)
  # min_packing_fraction=1.0: only emit examples packed to full capacity.
  example_packer = beam_utils.PriorityExamplePacker(
      priority_feature='token_ids',
      max_lengths=max_lengths,
      breakpoint_features=dict(),
      cumulative_features=[],
      min_packing_fraction=1.0,
      max_cache_len=num_blocks_per_example)
  _ = (outputs[MakeExampleOutput.SUCCESS]
       | 'ShuffleBeforePacking' >> beam.Reshuffle()
       | 'PackExamples' >> beam_utils.PackExamples(example_packer)
       | 'ShuffleAfterPacking' >> beam.Reshuffle()
       | 'WriteTfExamples' >> beam.io.WriteToTFRecord(
           # was os.path.join(output_prefix + '.tfrecord'): a single-argument
           # os.path.join is a no-op, so use plain concatenation.
           output_prefix + '.tfrecord',
           coder=beam.coders.ProtoCoder(tf.train.Example),
           num_shards=output_num_shards))