Example #1
def get_examples(data_dir, output_data_dir, mode, task_id, shard_id):
    """Reads NQ data, does sling entity linking and returns augmented data."""
    file_path = nq_data_utils.get_sharded_filename(data_dir, mode, task_id,
                                                   shard_id, 'jsonl.gz')
    output_file_path = nq_data_utils.get_sharded_filename(
        output_data_dir, mode, task_id, shard_id, 'tsv')
    print(file_path)
    tf.logging.info("Reading file: %s" % (file_path))
    if not os.path.exists(file_path):
        return None, None
    nq_data, entities = extract_nq_data(file_path, output_file_path)
    tf.logging.info("NQ data Size: " + str(len(nq_data.keys())))
    return nq_data, entities
def get_output_fp(shard_counter):
    output_file = nq_data_utils.get_sharded_filename(FLAGS.output_data_dir,
                                                     FLAGS.split,
                                                     FLAGS.task_id,
                                                     shard_counter, "jsonl.gz")
    op = gzip.GzipFile(fileobj=tf.gfile.Open(output_file, "w"))
    return op
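Every example in this listing builds shard paths through nq_data_utils.get_sharded_filename. For readers who do not have the fat_bert_nq utilities handy, here is a minimal stand-in sketch; the exact naming scheme below (zero-padded task and shard ids in an NQ-style filename) is an assumption, not the library's actual implementation.

import os


def get_sharded_filename(data_dir, mode, task_id, shard_id, ext):
    """Hypothetical stand-in for nq_data_utils.get_sharded_filename.

    Assumes a layout like <data_dir>/<mode>/nq-<mode>-XXYY.<ext>; the real
    helper in fat_bert_nq may compose the path differently.
    """
    return os.path.join(data_dir, mode,
                        'nq-%s-%02d%02d.%s' % (mode, task_id, shard_id, ext))


# Example: get_sharded_filename('/data', 'train', 3, 5, 'jsonl.gz')
# -> '/data/train/nq-train-0305.jsonl.gz'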
Example #3
def main(_):
  # input_file = nq_data_utils.get_sharded_filename(FLAGS.input_data_dir,
  #                                               FLAGS.split, FLAGS.task_id,
  #                                               FLAGS.shard_split_id,
  #                                               "jsonl.gz")
  # print("Reading file %s", input_file)
  # output_file = nq_data_utils.get_sharded_filename(FLAGS.output_data_dir,
  #                                                FLAGS.split, FLAGS.task_id,
  #                                                FLAGS.shard_split_id,
  #                                                "jsonl")
  # count = nq_jsonl_to_tsv(input_file, output_file)
  # print(count)
  input_data_dir = '/remote/bones/user/vbalacha/fact_augmented_text/fat/fat_bert_nq/generated_files/nq_t5_data_new/'
  stats = {'train':(50,7), 'dev':(4,16)}
  for split in ['train', 'dev']:
    if split == 'train':
      output_file = os.path.join(input_data_dir, "all_train.jsonl")
    else:
      output_file = os.path.join(input_data_dir, "all_dev.jsonl")
    op = open(output_file, 'w')
    count = 0
    for task in range(stats[split][0]):
      for shard_split in range(stats[split][1]):
        input_file = nq_data_utils.get_sharded_filename(input_data_dir,
                                                      split, task,
                                                      shard_split,
                                                      "jsonl")
        if not os.path.exists(input_file):
            continue
        print("Reading file %s", input_file)
        fp = open(input_file)
        for line in fp:
            count += 1
            op.write(line+"\n")
    print(count)
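A quick way to sanity-check the merged output of this example is to re-read all_train.jsonl / all_dev.jsonl and confirm that every non-empty line parses as JSON. A minimal verification sketch, assuming each line is a standalone JSON object as in the sharded inputs; the function and its name are illustrative, not part of the original script.

import json


def count_valid_jsonl_lines(path):
    """Count the lines in a merged .jsonl file that parse as JSON."""
    valid = 0
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate stray blank lines
            json.loads(line)  # raises ValueError if a line is malformed
            valid += 1
    return valid


# e.g. count_valid_jsonl_lines('.../all_train.jsonl') should equal the
# count printed by main() above.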
Example #4
def main(_):
  examples_processed = 0
  instances_processed = 0
  num_examples_with_correct_context = 0

  if FLAGS.is_training:
    creator_fn = run_nq.CreateTFExampleFn(is_training=FLAGS.is_training)
    instances = []
    input_file = nq_data_utils.get_sharded_filename(FLAGS.input_data_dir,
                                                    FLAGS.split, FLAGS.task_id,
                                                    FLAGS.shard_split_id,
                                                    "jsonl.gz")
    tf.logging.info("Reading file %s", input_file)
    for example in nq_data_utils.get_nq_examples(input_file):
      for instance in creator_fn.process(example):
        instances.append(instance)
        instances_processed += 1
      if example["has_correct_context"]:
        num_examples_with_correct_context += 1
      if examples_processed % 100 == 0:
        tf.logging.info("Examples processed: %d", examples_processed)
        tf.logging.info("Instances processed: %d", instances_processed)
      examples_processed += 1
      if FLAGS.max_examples > 0 and examples_processed >= FLAGS.max_examples:
        break
    tf.logging.info("Examples with correct context retained: %d of %d",
                    num_examples_with_correct_context, examples_processed)
    random.shuffle(instances)
    tf.logging.info("Total no: of instances in current shard: %d",
                    len(instances))
    output_file = nq_data_utils.get_sharded_filename(FLAGS.output_data_dir,
                                                     FLAGS.split, FLAGS.task_id,
                                                     FLAGS.shard_split_id,
                                                     "tf-record")
    with tf.python_io.TFRecordWriter(output_file) as writer:
      for instance in instances:
        writer.write(instance)

  # For eval - First process every shard in parallel
  elif not FLAGS.is_training and not FLAGS.merge_eval:
    input_file = nq_data_utils.get_sharded_filename(FLAGS.input_data_dir,
                                                    FLAGS.split, FLAGS.task_id,
                                                    FLAGS.shard_split_id,
                                                    "jsonl.gz")
    tf.logging.info("Reading file %s", input_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    eval_examples = run_nq.read_nq_examples(
        input_file=input_file, is_training=False)
    output_file = nq_data_utils.get_sharded_filename(FLAGS.output_data_dir,
                                                     FLAGS.split, FLAGS.task_id,
                                                     FLAGS.shard_split_id,
                                                     "tf-record")
    eval_writer = run_nq.FeatureWriter(filename=output_file, is_training=False)
    eval_features = []
    examples_processed = 0

    def append_feature(feature):
      eval_features.append(feature)
      eval_writer.process_feature(feature)
      examples_processed = len(eval_features)
      if examples_processed % 100 == 0:
        tf.logging.info("Examples processed: %d", examples_processed)

    _ = run_nq.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        is_training=False,
        output_fn=append_feature)
    eval_writer.close()

  # For eval - Finally merge all shards into one
  else:
    instances = []
    for task in range(FLAGS.max_dev_tasks):
      for shard_split in range(FLAGS.max_dev_shard_splits):
        input_file = nq_data_utils.get_sharded_filename(FLAGS.input_data_dir,
                                                        FLAGS.split, task,
                                                        shard_split,
                                                        "tf-record")
        tf.logging.info("Reading file %s", input_file)
        instances.extend([
            tf.train.Example.FromString(r)
            for r in tf.python_io.tf_record_iterator(input_file)
        ])

    output_file = os.path.join(FLAGS.output_data_dir, "eval.tf-record")
    with tf.python_io.TFRecordWriter(output_file) as writer:
      for instance in instances:
        writer.write(instance.SerializeToString())
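The training branch above writes already-serialized instances, and the merge branch writes tf.train.Example protos re-serialized with SerializeToString(). A minimal sketch for spot-checking one written shard, using the same TF1 tf.python_io API as the example; the function and the path in the usage comment are hypothetical.

import tensorflow as tf


def inspect_tf_record(path, limit=5):
    """Print the feature keys of the first few records in a tf-record file."""
    for i, record in enumerate(tf.python_io.tf_record_iterator(path)):
        if i >= limit:
            break
        example = tf.train.Example.FromString(record)
        print(sorted(example.features.feature.keys()))


# e.g. inspect_tf_record('/path/to/nq-train-0000.tf-record')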
Example #5
                    'Read nq data to extract entities')
flags.DEFINE_string('relation_emb_file', '',
                    'Read nq data to extract entities')
#flags.DEFINE_string('relations_file', None, 'input relations dict')
#flags.DEFINE_string('rel2id_file', None, 'input relations dict')
flags.DEFINE_string('output_file', None, '')

flags.DEFINE_integer("shard_id", None,
                     "Train and dev shard to read from and write to.")
flags.DEFINE_string(
    "mode", "train",
    "Train and dev split to read from and write to. Accepted values: ['train', 'dev', 'test']"
)

input_file = nq_data_utils.get_sharded_filename(FLAGS.nq_dir, FLAGS.mode,
                                                FLAGS.task_id, FLAGS.shard_id,
                                                'jsonl.gz')
embeddings_file = "/remote/bones/user/vbalacha/datasets/glove/glove.6B.300d.txt"
#with gzip.GzipFile(fileobj=tf.gfile.Open(FLAGS.rel2id_file, 'rb')) as op4:
#    rel2id = json.load(op4)
#    op4.close()
#id2rel = {str(idx): ent for ent, idx in rel2id.items()}

dim = 300

apr_obj = ApproximatePageRank(mode='train',
                              task_id=FLAGS.task_id,
                              shard_id=FLAGS.shard_id)
#apr_obj = ApproximatePageRank()

#fp = open("/remote/bones/user/vbalacha/google-research/fat/fat/fat_bert_nq/generated_files/tmpdir_unk-1_/unique_id_list_0000.txt", "w")

name_to_features = {
    "unique_ids": tf.FixedLenFeature([], tf.int64),
    "input_ids": tf.FixedLenFeature([512], tf.int64),
    "input_mask": tf.FixedLenFeature([512], tf.int64),
    "segment_ids": tf.FixedLenFeature([512], tf.int64),
}

#instances = []
train_count = 0
for task in range(max_train_tasks):
    for shard in range(max_shard_splits):
        #for task in [0]:
        #  for shard in [0]:
        input_file = nq_utils.get_sharded_filename(input_data_dir, "train",
                                                   task, shard, 'tf-record')
        print("Reading file %s", input_file)
        if not os.path.exists(input_file):
            continue
        #instances.extend([
        #    tf.train.Example.FromString(r)
        #    for r in tf.python_io.tf_record_iterator(input_file)
        #])
        #example = tf.parse_single_example(record, name_to_features)
        for record in tf.python_io.tf_record_iterator(input_file):
            #example = tf.parse_single_example(record, name_to_features)
            example = tf.train.Example.FromString(record)
            unique_id = example.features.feature[
                'unique_ids'].int64_list.value[0]
            fp.write(str(unique_id) + "\n")
            train_count += 1
fp.close()
print("Training size: " + str(train_count))
Example #7
                                                   shard_id, 'jsonl.gz')
    print(file_path)
    tf.logging.info("Reading file: %s" % (file_path))
    if not os.path.exists(file_path):
        return None, None
    nq_data, entities = extract_nq_data(file_path)
    tf.logging.info("NQ data Size: " + str(len(nq_data.keys())))
    return nq_data, entities


if __name__ == '__main__':
    print(FLAGS.full_wiki)
    print(FLAGS.decompose_ppv)
    print(FLAGS.apr_files_dir)
    question_emb_file = nq_data_utils.get_sharded_filename(
        FLAGS.question_emb_dir, FLAGS.split, FLAGS.task_id,
        FLAGS.shard_split_id, 'pkl')
    question_embeddings = pkl.load(open(question_emb_file, 'rb'))
    relation_embeddings = pkl.load(open(FLAGS.relation_emb_file, 'rb'))
    max_tasks = {"train": 50, "dev": 5}
    max_shards = {"train": 7, "dev": 17}
    apr = ApproximatePageRank(FLAGS.split, FLAGS.task_id, FLAGS.shard_split_id)
    empty_ents = 0
    for mode in [FLAGS.split]:
        # Parse all shards in each mode
        # Currently sequentially, can be parallelized later
        for task_id in [FLAGS.task_id]:  #range(0, max_tasks[mode]):
            for shard_id in [FLAGS.shard_split_id]:  #range(0, max_shards[mode]):
                # if task_id == 0 and shard_id in range(0, 16):
                #     print("skipping finished job")
Example #8
flags.DEFINE_string('nq_dir',
                    '/remote/bones/user/vbalacha/datasets/ent_linked_nq_new/',
                    'Read nq data to extract entities')
#flags.DEFINE_string('output_dir', '/remote/bones/user/vbalacha/datasets/nq_question_embeddings/', 'Read nq data to extract entities')
flags.DEFINE_integer("shard_id", None,
                     "Train and dev shard to read from and write to.")
#flags.DEFINE_integer("task_id", None,
#                     "Train and dev shard to read from and write to.")
flags.DEFINE_string(
    "mode", "train",
    "Train and dev split to read from and write to. Accepted values: ['train', 'dev', 'test']"
)
flags.DEFINE_bool("filter_sw", False, "")

questions_file = nq_data_utils.get_sharded_filename(FLAGS.nq_dir, FLAGS.mode,
                                                    FLAGS.task_id,
                                                    FLAGS.shard_id, 'jsonl.gz')
embeddings_file = "/remote/bones/user/vbalacha/datasets/glove/glove.6B.300d.txt"
output_file = nq_data_utils.get_sharded_filename(FLAGS.output_dir, FLAGS.mode,
                                                 FLAGS.task_id, FLAGS.shard_id,
                                                 'pkl')
dim = 300
stopwords = set(SW.words("english"))
stopwords.add("'s")

word_to_question = {}
question_lens = {}


def extract_keywords(text):
    """Remove wh-words and stop words from text."""
Example #9
def main(_):
    examples_processed = 0
    instances_processed = 0
    examples_with_instances = 0
    num_examples_with_correct_context = 0

    # if FLAGS.create_pretrain_data or FLAGS.create_fact_annotation_data:
    #     pretrain_file = open(nq_data_utils.get_sharded_filename(FLAGS.pretrain_data_dir,
    #                                                             FLAGS.split, FLAGS.task_id,
    #                                                             FLAGS.shard_split_id,
    #                                                             "txt"), 'w')
    # else:
    #     pretrain_file = None
    pretrain_file = open(
        nq_data_utils.get_sharded_filename(FLAGS.pretrain_data_dir,
                                           FLAGS.split, FLAGS.task_id,
                                           FLAGS.shard_split_id, "txt"), 'w')

    if FLAGS.is_training:
        fixed_train_list = None
        if FLAGS.use_fixed_training_data:
            fp = open(FLAGS.fixed_training_data_filepath)
            fixed_train_list = []
            for line in fp:
                fixed_train_list.append(int(line))
        creator_fn = run_nq.CreateTFExampleFn(is_training=FLAGS.is_training)
        instances = []
        input_file = nq_data_utils.get_sharded_filename(
            FLAGS.input_data_dir, FLAGS.split, FLAGS.task_id,
            FLAGS.shard_split_id, "jsonl.gz")
        print("Reading file %s", input_file)
        file_stats_counter = {
            'example_count': 0,
            'sp_recall_sum': 0,
            'answer_reach_counter': 0,
            'single_answer_reach_counter': 0,
            'multi_answer_recall': 0,
            'single_answer_counter': 0,
            'multi_answer_counter': 0,
            'multi_answer_size_counter': 0
        }
        for example in nq_data_utils.get_nq_examples(input_file):
            ins_count = 0
            stats_count = None
            for instance, stats_count in creator_fn.process(
                    example, pretrain_file, fixed_train_list):
                instances.append(instance)
                instances_processed += 1
                ins_count = 1
                if FLAGS.use_passage_rw_facts_in_shortest_path or FLAGS.use_question_to_passage_facts_in_shortest_path:
                    file_stats_counter['example_count'] += 1
                    file_stats_counter['sp_recall_sum'] += stats_count[
                        'fact_recall_counter']
                    if len(stats_count['answers_reached']) > 0:
                        file_stats_counter['answer_reach_counter'] += 1
                        if len(stats_count['answer_entity_ids']) > 1:
                            file_stats_counter[
                                'multi_answer_recall'] += stats_count[
                                    'answer_recall_counter']
                        else:
                            file_stats_counter[
                                'single_answer_reach_counter'] += 1
                    if len(stats_count['answer_entity_ids']) > 1:
                        file_stats_counter['multi_answer_counter'] += 1
                        file_stats_counter['multi_answer_size_counter'] += len(
                            stats_count['answer_entity_ids'])
                    else:
                        file_stats_counter['single_answer_counter'] += 1

            if stats_count is not None and FLAGS.use_question_rw_facts_in_shortest_path:
                file_stats_counter['example_count'] += 1
                file_stats_counter['sp_recall_sum'] += stats_count[
                    'fact_recall_counter']
                if len(stats_count['answers_reached']) > 0:
                    file_stats_counter['answer_reach_counter'] += 1
                    if len(stats_count['answer_entity_ids']) > 1:
                        file_stats_counter[
                            'multi_answer_recall'] += stats_count[
                                'answer_recall_counter']
                    else:
                        file_stats_counter['single_answer_reach_counter'] += 1
                if len(stats_count['answer_entity_ids']) > 1:
                    file_stats_counter['multi_answer_counter'] += 1
                    file_stats_counter['multi_answer_size_counter'] += len(
                        stats_count['answer_entity_ids'])
                else:
                    file_stats_counter['single_answer_counter'] += 1

            if example["has_correct_context"]:
                num_examples_with_correct_context += 1
            if examples_processed % 100 == 0:
                print("Examples processed: %d", examples_processed)
                print("Instances processed: %d", instances_processed)
            examples_processed += 1
            examples_with_instances += ins_count
            if FLAGS.max_examples > 0 and examples_processed >= FLAGS.max_examples:
                break
            #time.sleep(5)
        print("Examples with correct context retained: %d of %d",
              num_examples_with_correct_context, examples_processed)
        random.shuffle(instances)
        print("Total no: of instances in current shard: %d", len(instances))
        output_file = nq_data_utils.get_sharded_filename(
            FLAGS.output_data_dir, FLAGS.split, FLAGS.task_id,
            FLAGS.shard_split_id, "tf-record")
        stats_file = nq_data_utils.get_sharded_filename(
            FLAGS.output_data_dir, FLAGS.split, FLAGS.task_id,
            FLAGS.shard_split_id, "stats.txt")
        with tf.python_io.TFRecordWriter(output_file) as writer:
            for instance in instances:
                writer.write(instance)
        with open(stats_file, 'w') as fp:
            print("Example count: %d", file_stats_counter['example_count'])
            print("Fact Recall sum: %d", file_stats_counter['sp_recall_sum'])
            print("Count with answers reached: %d",
                  file_stats_counter['answer_reach_counter'])
            print("Single Answer Example count: %d",
                  file_stats_counter['single_answer_counter'])
            print("Single Answer Reached count: %d",
                  file_stats_counter['single_answer_reach_counter'])
            print("Multi Answer Example count: %d",
                  file_stats_counter['multi_answer_counter'])
            print("Multi Answer recall sum: %d",
                  file_stats_counter['multi_answer_recall'])
            print("Multi Answer Size counter: %d",
                  file_stats_counter['multi_answer_size_counter'])
            fp.write("Example count: " +
                     str(file_stats_counter['example_count']) + "\n")
            fp.write("Fact Recall sum: " +
                     str(file_stats_counter['sp_recall_sum']) + "\n")
            fp.write("Count with answers reached: " +
                     str(file_stats_counter['answer_reach_counter']) + "\n")
            fp.write("Single Answer Example count: " +
                     str(file_stats_counter['single_answer_counter']) + "\n")
            fp.write("Single Answer Reached count: " +
                     str(file_stats_counter['single_answer_reach_counter']) +
                     "\n")
            fp.write("Multi Answer Example count: " +
                     str(file_stats_counter['multi_answer_counter']) + "\n")
            fp.write("Multi Answer recall sum: " +
                     str(file_stats_counter['multi_answer_recall']) + "\n")
            fp.write("Multi Answer Size counter: " +
                     str(file_stats_counter['multi_answer_size_counter']) +
                     "\n")
            fp.write("Examples with instances Processed: " +
                     str(examples_with_instances) + "\n")

    # For eval - First process every shard in parallel
    elif not FLAGS.is_training and not FLAGS.merge_eval:
        input_file = nq_data_utils.get_sharded_filename(
            FLAGS.input_data_dir, FLAGS.split, FLAGS.task_id,
            FLAGS.shard_split_id, "jsonl.gz")
        print("Reading file %s", input_file)
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        eval_examples = run_nq.read_nq_examples(input_file=input_file,
                                                is_training=False)
        output_file = nq_data_utils.get_sharded_filename(
            FLAGS.output_data_dir, FLAGS.split, FLAGS.task_id,
            FLAGS.shard_split_id, "tf-record")
        eval_writer = run_nq.FeatureWriter(filename=output_file,
                                           is_training=False)
        eval_features = []
        examples_processed = 0

        def append_feature(feature):
            eval_features.append(feature)
            eval_writer.process_feature(feature)
            examples_processed = len(eval_features)
            if examples_processed % 10 == 0:
                print("Examples processed: %d", examples_processed)

        _ = run_nq.convert_examples_to_features(examples=eval_examples,
                                                tokenizer=tokenizer,
                                                is_training=False,
                                                output_fn=append_feature,
                                                pretrain_file=pretrain_file)
        eval_writer.close()

    # For eval - Finally merge all shards into one
    else:
        instances = []
        for task in range(FLAGS.max_dev_tasks):
            for shard_split in range(FLAGS.max_dev_shard_splits):
                input_file = nq_data_utils.get_sharded_filename(
                    FLAGS.input_data_dir, FLAGS.split, task, shard_split,
                    "tf-record")
                print("Reading file %s", input_file)
                instances.extend([
                    tf.train.Example.FromString(r)
                    for r in tf.python_io.tf_record_iterator(input_file)
                ])

        output_file = os.path.join(FLAGS.output_data_dir, "eval.tf-record")
        with tf.python_io.TFRecordWriter(output_file) as writer:
            for instance in instances:
                writer.write(instance.SerializeToString())