Example #1
0
 def test_minimal_examples(self):
     """Every minimal input example should yield exactly one output example."""
     count = 20
     example = {
         "annotations": [],
         "long_answer_candidates": [],
         "question_text": "",
         "document_url": "",
         "document_title": "",
         "example_id": 1
     }
     # Write `count` identical copies, then confirm they all round-trip.
     self.write_examples([example] * count)
     produced = nq_data_utils.get_nq_examples(self.test_file)
     self.assertEqual(count, len(list(produced)))
Example #2
0
def main(_):
  """Splits one NQ jsonl.gz input file into smaller sequential shards.

  Reads every example line from the task's input file and writes them to
  sequentially numbered output shards of FLAGS.split_size lines each.
  """
  shard_counter = 0
  input_file = nq_data_utils.get_nq_filename(FLAGS.input_data_dir, FLAGS.split,
                                             FLAGS.task_id, "jsonl.gz")
  op = get_output_fp(shard_counter)
  counter = 0
  try:
    for line in nq_data_utils.get_nq_examples(input_file):
      # Round-trip through UTF-8 so invalid bytes fail loudly before writing.
      op.write((line.decode("utf-8")).encode("utf-8"))
      counter += 1
      if counter % FLAGS.split_size == 0:
        # Shard is full: roll over to the next numbered output file.
        op.close()
        shard_counter += 1
        op = get_output_fp(shard_counter)
  finally:
    # Bug fix: the final (possibly partial) shard was never closed, which
    # could lose buffered data; always close the current handle.
    op.close()
Example #3
0
 def test_example_metadata(self):
     """Metadata fields should be carried through into the parsed example."""
     input_example = {
         "annotations": [],
         "long_answer_candidates": [],
         "question_text": "test_q",
         "document_url": "test_url",
         "document_title": "test_title",
         "example_id": 10
     }
     self.write_examples([input_example])
     result = next(nq_data_utils.get_nq_examples(self.test_file))
     # Each (actual, expected) pair checks one metadata mapping.
     checks = [
         (result["name"], "test_title"),
         (result["id"], "10"),
         (result["questions"][0]["input_text"], "test_q"),
         (result["answers"][0]["input_text"], "long"),
     ]
     for actual, expected in checks:
         self.assertEqual(actual, expected)
Example #4
0
def main(_):
  """Converts one sharded NQ file into TF records, or merges eval shards.

  Mode is selected by flags:
    * FLAGS.is_training: converts one jsonl.gz shard into shuffled
      tf-record training instances.
    * not training and not FLAGS.merge_eval: featurizes one eval shard
      with a FeatureWriter.
    * otherwise: concatenates every per-(task, shard_split) eval
      tf-record file into a single eval.tf-record.
  """
  examples_processed = 0
  instances_processed = 0
  num_examples_with_correct_context = 0

  if FLAGS.is_training:
    creator_fn = run_nq.CreateTFExampleFn(is_training=FLAGS.is_training)
    instances = []
    input_file = nq_data_utils.get_sharded_filename(FLAGS.input_data_dir,
                                                    FLAGS.split, FLAGS.task_id,
                                                    FLAGS.shard_split_id,
                                                    "jsonl.gz")
    tf.logging.info("Reading file %s", input_file)
    for example in nq_data_utils.get_nq_examples(input_file):
      # One NQ example may yield several training instances.
      for instance in creator_fn.process(example):
        instances.append(instance)
        instances_processed += 1
      if example["has_correct_context"]:
        num_examples_with_correct_context += 1
      if examples_processed % 100 == 0:
        tf.logging.info("Examples processed: %d", examples_processed)
        tf.logging.info("Instances processed: %d", instances_processed)
      examples_processed += 1
      # Optional cap on work per shard for debugging / sampling runs.
      if FLAGS.max_examples > 0 and examples_processed >= FLAGS.max_examples:
        break
    tf.logging.info("Examples with correct context retained: %d of %d",
                    num_examples_with_correct_context, examples_processed)
    # Shuffle within the shard so downstream training does not see file order.
    random.shuffle(instances)
    tf.logging.info("Total no: of instances in current shard: %d",
                    len(instances))
    output_file = nq_data_utils.get_sharded_filename(FLAGS.output_data_dir,
                                                     FLAGS.split, FLAGS.task_id,
                                                     FLAGS.shard_split_id,
                                                     "tf-record")
    with tf.python_io.TFRecordWriter(output_file) as writer:
      for instance in instances:
        writer.write(instance)

  # For eval - First process every shard in parallel
  elif not FLAGS.is_training and not FLAGS.merge_eval:
    input_file = nq_data_utils.get_sharded_filename(FLAGS.input_data_dir,
                                                    FLAGS.split, FLAGS.task_id,
                                                    FLAGS.shard_split_id,
                                                    "jsonl.gz")
    tf.logging.info("Reading file %s", input_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    eval_examples = run_nq.read_nq_examples(
        input_file=input_file, is_training=False)
    output_file = nq_data_utils.get_sharded_filename(FLAGS.output_data_dir,
                                                     FLAGS.split, FLAGS.task_id,
                                                     FLAGS.shard_split_id,
                                                     "tf-record")
    eval_writer = run_nq.FeatureWriter(filename=output_file, is_training=False)
    eval_features = []
    examples_processed = 0

    def append_feature(feature):
      # Callback invoked by convert_examples_to_features for every feature:
      # collects it and streams it to the tf-record writer.
      eval_features.append(feature)
      eval_writer.process_feature(feature)
      # NOTE(review): this assignment creates a function-local
      # `examples_processed`; the outer counter above is never updated, but
      # the local value is what the progress log below reads.
      examples_processed = len(eval_features)
      if examples_processed % 100 == 0:
        tf.logging.info("Examples processed: %d", examples_processed)

    _ = run_nq.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        is_training=False,
        output_fn=append_feature)
    eval_writer.close()

  # For eval - Finally merge all shards into 1
  else:
    instances = []
    for task in range(FLAGS.max_dev_tasks):
      for shard_split in range(FLAGS.max_dev_shard_splits):
        input_file = nq_data_utils.get_sharded_filename(FLAGS.input_data_dir,
                                                        FLAGS.split, task,
                                                        shard_split,
                                                        "tf-record")
        tf.logging.info("Reading file %s", input_file)
        # Parse each serialized record so it can be re-serialized below.
        instances.extend([
            tf.train.Example.FromString(r)
            for r in tf.python_io.tf_record_iterator(input_file)
        ])

    output_file = os.path.join(FLAGS.output_data_dir, "eval.tf-record")
    with tf.python_io.TFRecordWriter(output_file) as writer:
      for instance in instances:
        writer.write(instance.SerializeToString())
Example #5
0
    def test_multi_candidate_document(self):
        """Checks context assembly and windowing for a two-candidate document.

        The document has one HTML token ("<P>") and two word tokens; both
        long-answer candidates cover the same token span, and the annotation
        points at candidate index 1 with a short answer on the last token.
        """
        example = {
            "annotations": [{
                "long_answer": {
                    "candidate_index": 1,
                    "start_token": 0,
                    "end_token": 3,
                    "entity_map": {}
                },
                "short_answers": [{
                    "start_token": 2,
                    "end_token": 3,
                    "entity_map": {}
                }],
                "yes_no_answer":
                "NONE"
            }],
            "long_answer_candidates": [{
                "start_token": 0,
                "end_token": 3,
                "top_level": True,
                "entity_map": {}
            }, {
                "start_token": 0,
                "end_token": 3,
                "top_level": True
            }],
            "document_tokens": [{
                "token": "<P>",
                "html_token": True
            }, {
                "token": "the",
                "html_token": False
            }, {
                "token": "document",
                "html_token": False
            }],
            "question_text":
            "the question",
            "document_url":
            "",
            "document_title":
            "",
            "example_id":
            1
        }
        self.write_examples([example])

        # The document in this case should be a single string with all contexts.
        output_example = next(nq_data_utils.get_nq_examples(self.test_file))
        self.assertEqual(
            "[ContextId=-1] [NoLongAnswer] [ContextId=0] [Paragraph=1] the document "
            "[ContextId=1] [Paragraph=2] the document",
            output_example["contexts"])

        # Training mode: the contexts overflow one window, so the text is
        # split across two padded passages (note "docum"/"ent" split).
        passages, spans, _ = self.make_tf_examples(output_example,
                                                   is_training=True)
        self.assertEqual([
            "[CLS] [Q] the question [SEP] [ContextId=-1] [NoLongAnswer] "
            "[ContextId=0] [Paragraph=1] the docum [SEP] [SEP] [PAD] [PAD] "
            "[PAD] [PAD] [PAD] [PAD] [PAD]",
            "[CLS] [Q] the question [SEP]ent [ContextId=1] [Paragraph=2] the document [SEP] "
            "[SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"
        ], passages)
        self.assertEqual(["[CLS]", "document"], spans)

        # Eval mode: same passages, plus token maps aligning wordpieces back
        # to original document token indices (-1 marks non-document tokens).
        passages, _, tok_maps = self.make_tf_examples(output_example,
                                                      is_training=False)
        self.assertEqual([
            "[CLS] [Q] the question [SEP] [ContextId=-1] [NoLongAnswer] "
            "[ContextId=0] [Paragraph=1] the docum [SEP] [SEP] [PAD] [PAD] "
            "[PAD] [PAD] [PAD] [PAD] [PAD]",
            "[CLS] [Q] the question [SEP]ent [ContextId=1] [Paragraph=2] the document [SEP] "
            "[SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"
        ], passages)
        self.assertEqual([[-1] * 9 + [1, 2] + [-1] * 9,
                          [-1] * 5 + [2, -1, -1, 1, 2, 2] + [-1] * 9],
                         tok_maps)
Example #6
0
def main(_):
    """Builds NQ TF examples for one shard, or merges eval shards.

    Mode is selected by flags:
      * FLAGS.is_training: converts one jsonl.gz shard into shuffled
        tf-record instances and writes a stats.txt summary alongside.
      * not training and not FLAGS.merge_eval: featurizes one eval shard.
      * otherwise: merges every per-(task, shard_split) eval tf-record
        into a single eval.tf-record file.
    """

    def accumulate_stats(counters, stats_count):
        """Folds one instance's shortest-path/answer stats into `counters`.

        Extracted because the identical accumulation logic previously
        appeared twice (passage-level and question-level fact paths).
        """
        counters['example_count'] += 1
        counters['sp_recall_sum'] += stats_count['fact_recall_counter']
        if len(stats_count['answers_reached']) > 0:
            counters['answer_reach_counter'] += 1
            if len(stats_count['answer_entity_ids']) > 1:
                counters['multi_answer_recall'] += stats_count[
                    'answer_recall_counter']
            else:
                counters['single_answer_reach_counter'] += 1
        if len(stats_count['answer_entity_ids']) > 1:
            counters['multi_answer_counter'] += 1
            counters['multi_answer_size_counter'] += len(
                stats_count['answer_entity_ids'])
        else:
            counters['single_answer_counter'] += 1

    examples_processed = 0
    instances_processed = 0
    examples_with_instances = 0
    num_examples_with_correct_context = 0

    # Pretraining text sink, written by creator_fn.process (training) and
    # convert_examples_to_features (eval featurization).
    pretrain_file = open(
        nq_data_utils.get_sharded_filename(FLAGS.pretrain_data_dir,
                                           FLAGS.split, FLAGS.task_id,
                                           FLAGS.shard_split_id, "txt"), 'w')
    try:
        if FLAGS.is_training:
            fixed_train_list = None
            if FLAGS.use_fixed_training_data:
                # Bug fix: this file handle was previously never closed.
                with open(FLAGS.fixed_training_data_filepath) as fixed_fp:
                    fixed_train_list = [int(line) for line in fixed_fp]
            creator_fn = run_nq.CreateTFExampleFn(is_training=FLAGS.is_training)
            instances = []
            input_file = nq_data_utils.get_sharded_filename(
                FLAGS.input_data_dir, FLAGS.split, FLAGS.task_id,
                FLAGS.shard_split_id, "jsonl.gz")
            # Bug fix: print() does not %-interpolate its arguments the way
            # tf.logging does, so the original logged the raw "%s" template.
            print("Reading file %s" % input_file)
            file_stats_counter = {
                'example_count': 0,
                'sp_recall_sum': 0,
                'answer_reach_counter': 0,
                'single_answer_reach_counter': 0,
                'multi_answer_recall': 0,
                'single_answer_counter': 0,
                'multi_answer_counter': 0,
                'multi_answer_size_counter': 0
            }
            for example in nq_data_utils.get_nq_examples(input_file):
                ins_count = 0
                stats_count = None
                for instance, stats_count in creator_fn.process(
                        example, pretrain_file, fixed_train_list):
                    instances.append(instance)
                    instances_processed += 1
                    ins_count = 1
                    if (FLAGS.use_passage_rw_facts_in_shortest_path or
                            FLAGS.use_question_to_passage_facts_in_shortest_path):
                        accumulate_stats(file_stats_counter, stats_count)

                # Question-level fact mode: count stats once per example,
                # using the stats from the last yielded instance.
                if stats_count is not None and FLAGS.use_question_rw_facts_in_shortest_path:
                    accumulate_stats(file_stats_counter, stats_count)

                if example["has_correct_context"]:
                    num_examples_with_correct_context += 1
                if examples_processed % 100 == 0:
                    print("Examples processed: %d" % examples_processed)
                    print("Instances processed: %d" % instances_processed)
                examples_processed += 1
                examples_with_instances += ins_count
                if FLAGS.max_examples > 0 and examples_processed >= FLAGS.max_examples:
                    break
            print("Examples with correct context retained: %d of %d" %
                  (num_examples_with_correct_context, examples_processed))
            # Shuffle within the shard so training does not see file order.
            random.shuffle(instances)
            print("Total no: of instances in current shard: %d" %
                  len(instances))
            output_file = nq_data_utils.get_sharded_filename(
                FLAGS.output_data_dir, FLAGS.split, FLAGS.task_id,
                FLAGS.shard_split_id, "tf-record")
            stats_file = nq_data_utils.get_sharded_filename(
                FLAGS.output_data_dir, FLAGS.split, FLAGS.task_id,
                FLAGS.shard_split_id, "stats.txt")
            with tf.python_io.TFRecordWriter(output_file) as writer:
                for instance in instances:
                    writer.write(instance)
            # Emit the shard statistics both to stdout and to stats.txt.
            stat_rows = [
                ("Example count", file_stats_counter['example_count']),
                ("Fact Recall sum", file_stats_counter['sp_recall_sum']),
                ("Count with answers reached",
                 file_stats_counter['answer_reach_counter']),
                ("Single Answer Example count",
                 file_stats_counter['single_answer_counter']),
                ("Single Answer Reached count",
                 file_stats_counter['single_answer_reach_counter']),
                ("Multi Answer Example count",
                 file_stats_counter['multi_answer_counter']),
                ("Multi Answer recall sum",
                 file_stats_counter['multi_answer_recall']),
                ("Multi Answer Size counter",
                 file_stats_counter['multi_answer_size_counter']),
            ]
            with open(stats_file, 'w') as fp:
                for label, value in stat_rows:
                    print("%s: %d" % (label, value))
                    fp.write("%s: %d\n" % (label, value))
                fp.write("Examples with instances Processed: " +
                         str(examples_with_instances) + "\n")

        # For eval - First process every shard in parallel
        elif not FLAGS.is_training and not FLAGS.merge_eval:
            input_file = nq_data_utils.get_sharded_filename(
                FLAGS.input_data_dir, FLAGS.split, FLAGS.task_id,
                FLAGS.shard_split_id, "jsonl.gz")
            print("Reading file %s" % input_file)
            tokenizer = tokenization.FullTokenizer(
                vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
            eval_examples = run_nq.read_nq_examples(input_file=input_file,
                                                    is_training=False)
            output_file = nq_data_utils.get_sharded_filename(
                FLAGS.output_data_dir, FLAGS.split, FLAGS.task_id,
                FLAGS.shard_split_id, "tf-record")
            eval_writer = run_nq.FeatureWriter(filename=output_file,
                                               is_training=False)
            eval_features = []

            def append_feature(feature):
                # Callback for every converted feature: collect it, stream it
                # to the writer, and log progress every 10 features.
                eval_features.append(feature)
                eval_writer.process_feature(feature)
                processed = len(eval_features)
                if processed % 10 == 0:
                    print("Examples processed: %d" % processed)

            _ = run_nq.convert_examples_to_features(examples=eval_examples,
                                                    tokenizer=tokenizer,
                                                    is_training=False,
                                                    output_fn=append_feature,
                                                    pretrain_file=pretrain_file)
            eval_writer.close()

        # For eval - Finally merge all shards into 1
        else:
            instances = []
            for task in range(FLAGS.max_dev_tasks):
                for shard_split in range(FLAGS.max_dev_shard_splits):
                    input_file = nq_data_utils.get_sharded_filename(
                        FLAGS.input_data_dir, FLAGS.split, task, shard_split,
                        "tf-record")
                    print("Reading file %s" % input_file)
                    # Parse each serialized record so it can be re-serialized.
                    instances.extend([
                        tf.train.Example.FromString(r)
                        for r in tf.python_io.tf_record_iterator(input_file)
                    ])

            output_file = os.path.join(FLAGS.output_data_dir, "eval.tf-record")
            with tf.python_io.TFRecordWriter(output_file) as writer:
                for instance in instances:
                    writer.write(instance.SerializeToString())
    finally:
        # Bug fix: pretrain_file was opened unconditionally but never closed.
        pretrain_file.close()