def get_examples(data_dir, output_data_dir, mode, task_id, shard_id):
  """Reads NQ data, does sling entity linking and returns augmented data."""
  file_path = nq_data_utils.get_sharded_filename(data_dir, mode, task_id,
                                                 shard_id, 'jsonl.gz')
  output_file_path = nq_data_utils.get_sharded_filename(
      output_data_dir, mode, task_id, shard_id, 'tsv')
  tf.logging.info("Reading file: %s", file_path)
  if not os.path.exists(file_path):
    return None, None
  nq_data, entities = extract_nq_data(file_path, output_file_path)
  tf.logging.info("NQ data size: %d", len(nq_data))
  return nq_data, entities
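# Hedged driver sketch (an assumption, not part of the original file): sweep all
# shards of a split through get_examples, skipping shards whose input file is
# missing (get_examples returns (None, None) for those). The shard counts below
# (50 tasks x 7 shard splits for train) mirror the stats dict used elsewhere in
# this repo and are assumptions here.
def collect_linked_shards(data_dir, output_data_dir, mode='train',
                          num_tasks=50, num_shards=7):
  shard_entities = []
  for task_id in range(num_tasks):
    for shard_id in range(num_shards):
      nq_data, entities = get_examples(data_dir, output_data_dir, mode,
                                       task_id, shard_id)
      if nq_data is None:
        continue
      shard_entities.append((task_id, shard_id, entities))
  return shard_entities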
def get_output_fp(shard_counter):
  output_file = nq_data_utils.get_sharded_filename(FLAGS.output_data_dir,
                                                   FLAGS.split, FLAGS.task_id,
                                                   shard_counter, "jsonl.gz")
  op = gzip.GzipFile(fileobj=tf.gfile.Open(output_file, "w"))
  return op
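# Hedged usage sketch (an assumption, not part of the original script): rotate
# the gzip writer returned by get_output_fp every `shard_size` examples so no
# single output shard grows unbounded. Under Python 3, lines may need to be
# encoded to bytes before writing (op.write(line.encode("utf-8"))).
def write_in_shards(lines, shard_size=5000):
  shard_counter = 0
  op = get_output_fp(shard_counter)
  for i, line in enumerate(lines):
    if i > 0 and i % shard_size == 0:
      op.close()
      shard_counter += 1
      op = get_output_fp(shard_counter)
    op.write(line)
  op.close()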
def main(_):
  input_data_dir = '/remote/bones/user/vbalacha/fact_augmented_text/fat/fat_bert_nq/generated_files/nq_t5_data_new/'
  # (num_tasks, num_shard_splits) per split.
  stats = {'train': (50, 7), 'dev': (4, 16)}
  for split in ['train', 'dev']:
    if split == 'train':
      output_file = os.path.join(input_data_dir, "all_train.jsonl")
    else:
      output_file = os.path.join(input_data_dir, "all_dev.jsonl")
    op = open(output_file, 'w')
    count = 0
    for task in range(stats[split][0]):
      for shard_split in range(stats[split][1]):
        input_file = nq_data_utils.get_sharded_filename(input_data_dir, split,
                                                        task, shard_split,
                                                        "jsonl")
        if not os.path.exists(input_file):
          continue
        print("Reading file %s" % input_file)
        with open(input_file) as fp:
          for line in fp:
            count += 1
            # Lines read from the shard already end with "\n"; appending another
            # newline would leave blank lines in the merged file.
            op.write(line)
    op.close()
    print(count)
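# Hedged usage sketch (an assumption, not from the original file): consume the
# merged all_train.jsonl / all_dev.jsonl files one JSON object per line. The
# json import is kept local so the sketch stays self-contained.
def read_merged_jsonl(path):
  import json
  with open(path) as f:
    for line in f:
      line = line.strip()
      if line:
        yield json.loads(line)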
def main(_):
  examples_processed = 0
  instances_processed = 0
  num_examples_with_correct_context = 0

  if FLAGS.is_training:
    creator_fn = run_nq.CreateTFExampleFn(is_training=FLAGS.is_training)
    instances = []
    input_file = nq_data_utils.get_sharded_filename(FLAGS.input_data_dir,
                                                    FLAGS.split, FLAGS.task_id,
                                                    FLAGS.shard_split_id,
                                                    "jsonl.gz")
    tf.logging.info("Reading file %s", input_file)
    for example in nq_data_utils.get_nq_examples(input_file):
      for instance in creator_fn.process(example):
        instances.append(instance)
        instances_processed += 1
      if example["has_correct_context"]:
        num_examples_with_correct_context += 1
      if examples_processed % 100 == 0:
        tf.logging.info("Examples processed: %d", examples_processed)
        tf.logging.info("Instances processed: %d", instances_processed)
      examples_processed += 1
      if FLAGS.max_examples > 0 and examples_processed >= FLAGS.max_examples:
        break
    tf.logging.info("Examples with correct context retained: %d of %d",
                    num_examples_with_correct_context, examples_processed)
    random.shuffle(instances)
    tf.logging.info("Total no. of instances in current shard: %d",
                    len(instances))
    output_file = nq_data_utils.get_sharded_filename(FLAGS.output_data_dir,
                                                     FLAGS.split, FLAGS.task_id,
                                                     FLAGS.shard_split_id,
                                                     "tf-record")
    with tf.python_io.TFRecordWriter(output_file) as writer:
      for instance in instances:
        writer.write(instance)

  # For eval - first process every shard in parallel.
  elif not FLAGS.is_training and not FLAGS.merge_eval:
    input_file = nq_data_utils.get_sharded_filename(FLAGS.input_data_dir,
                                                    FLAGS.split, FLAGS.task_id,
                                                    FLAGS.shard_split_id,
                                                    "jsonl.gz")
    tf.logging.info("Reading file %s", input_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    eval_examples = run_nq.read_nq_examples(
        input_file=input_file, is_training=False)
    output_file = nq_data_utils.get_sharded_filename(FLAGS.output_data_dir,
                                                     FLAGS.split, FLAGS.task_id,
                                                     FLAGS.shard_split_id,
                                                     "tf-record")
    eval_writer = run_nq.FeatureWriter(filename=output_file, is_training=False)
    eval_features = []
    examples_processed = 0

    def append_feature(feature):
      eval_features.append(feature)
      eval_writer.process_feature(feature)
      examples_processed = len(eval_features)
      if examples_processed % 100 == 0:
        tf.logging.info("Examples processed: %d", examples_processed)

    _ = run_nq.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        is_training=False,
        output_fn=append_feature)
    eval_writer.close()

  # For eval - finally merge all shards into one file.
  else:
    instances = []
    for task in range(FLAGS.max_dev_tasks):
      for shard_split in range(FLAGS.max_dev_shard_splits):
        input_file = nq_data_utils.get_sharded_filename(FLAGS.input_data_dir,
                                                        FLAGS.split, task,
                                                        shard_split,
                                                        "tf-record")
        tf.logging.info("Reading file %s", input_file)
        instances.extend([
            tf.train.Example.FromString(r)
            for r in tf.python_io.tf_record_iterator(input_file)
        ])
    output_file = os.path.join(FLAGS.output_data_dir, "eval.tf-record")
    with tf.python_io.TFRecordWriter(output_file) as writer:
      for instance in instances:
        writer.write(instance.SerializeToString())
                    'Read nq data to extract entities')
flags.DEFINE_string('relation_emb_file', '',
                    'Path to the relation embeddings pickle file.')
flags.DEFINE_string('output_file', None, 'Output file path.')
flags.DEFINE_integer("shard_id", None,
                     "Train and dev shard to read from and write to.")
flags.DEFINE_string(
    "mode", "train",
    "Train and dev split to read from and write to. "
    "Accepted values: ['train', 'dev', 'test']")

input_file = nq_data_utils.get_sharded_filename(FLAGS.nq_dir, FLAGS.mode,
                                                FLAGS.task_id, FLAGS.shard_id,
                                                'jsonl.gz')
embeddings_file = "/remote/bones/user/vbalacha/datasets/glove/glove.6B.300d.txt"
dim = 300
# Note: the APR object is loaded with mode='train' regardless of FLAGS.mode.
apr_obj = ApproximatePageRank(mode='train', task_id=FLAGS.task_id,
                              shard_id=FLAGS.shard_id)
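# Hedged sketch (an assumption, not part of the original script): embeddings_file
# points at a GloVe text file where each line is a token followed by `dim`
# space-separated floats. A loader along these lines would turn it into a dict
# of token -> vector.
def load_glove_embeddings(path, dim):
  embeddings = {}
  with open(path) as f:
    for line in f:
      parts = line.rstrip().split(' ')
      if len(parts) != dim + 1:
        continue  # skip malformed lines
      embeddings[parts[0]] = [float(x) for x in parts[1:]]
  return embeddings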
# Collect the unique_ids of every training feature across all tf-record shards.
fp = open("/remote/bones/user/vbalacha/google-research/fat/fat/fat_bert_nq/generated_files/tmpdir_unk-1_/unique_id_list_0000.txt", "w")
name_to_features = {
    "unique_ids": tf.FixedLenFeature([], tf.int64),
    "input_ids": tf.FixedLenFeature([512], tf.int64),
    "input_mask": tf.FixedLenFeature([512], tf.int64),
    "segment_ids": tf.FixedLenFeature([512], tf.int64),
}

train_count = 0
for task in range(max_train_tasks):
  for shard in range(max_shard_splits):
    input_file = nq_utils.get_sharded_filename(input_data_dir, "train", task,
                                               shard, 'tf-record')
    print("Reading file %s" % input_file)
    if not os.path.exists(input_file):
      continue
    for record in tf.python_io.tf_record_iterator(input_file):
      example = tf.train.Example.FromString(record)
      unique_id = example.features.feature['unique_ids'].int64_list.value[0]
      fp.write(str(unique_id) + "\n")
      train_count += 1
print("Training size: %d" % train_count)
fp.close()
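# Hedged sketch (an assumption): name_to_features above is the schema one would
# pass to tf.parse_single_example when reading the same tf-record shards through
# a tf.data pipeline, instead of decoding records eagerly with
# tf.train.Example.FromString as the loop above does.
def make_dataset(input_files, batch_size=32):
  d = tf.data.TFRecordDataset(input_files)
  d = d.map(lambda record: tf.parse_single_example(record, name_to_features))
  return d.batch(batch_size)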
                                                 shard_id, 'jsonl.gz')
  tf.logging.info("Reading file: %s", file_path)
  if not os.path.exists(file_path):
    return None, None
  nq_data, entities = extract_nq_data(file_path)
  tf.logging.info("NQ data size: %d", len(nq_data))
  return nq_data, entities


if __name__ == '__main__':
  print(FLAGS.full_wiki)
  print(FLAGS.decompose_ppv)
  print(FLAGS.apr_files_dir)
  question_emb_file = nq_data_utils.get_sharded_filename(
      FLAGS.question_emb_dir, FLAGS.split, FLAGS.task_id, FLAGS.shard_split_id,
      'pkl')
  question_embeddings = pkl.load(open(question_emb_file, 'rb'))
  relation_embeddings = pkl.load(open(FLAGS.relation_emb_file, 'rb'))
  max_tasks = {"train": 50, "dev": 5}
  max_shards = {"train": 7, "dev": 17}
  apr = ApproximatePageRank(FLAGS.split, FLAGS.task_id, FLAGS.shard_split_id)
  empty_ents = 0
  # Parse all shards in each mode. Currently this runs sequentially over the
  # single (task_id, shard_id) given by the flags; it can be parallelized later.
  for mode in [FLAGS.split]:
    for task_id in [FLAGS.task_id]:  # range(0, max_tasks[mode])
      for shard_id in [FLAGS.shard_split_id]:  # range(0, max_shards[mode])
        # if task_id == 0 and shard_id in range(0, 16):
        #   print("skipping finished job")
flags.DEFINE_string('nq_dir',
                    '/remote/bones/user/vbalacha/datasets/ent_linked_nq_new/',
                    'Read nq data to extract entities')
flags.DEFINE_integer("shard_id", None,
                     "Train and dev shard to read from and write to.")
flags.DEFINE_string(
    "mode", "train",
    "Train and dev split to read from and write to. "
    "Accepted values: ['train', 'dev', 'test']")
flags.DEFINE_bool("filter_sw", False,
                  "If true, filter stop words from questions.")

questions_file = nq_data_utils.get_sharded_filename(FLAGS.nq_dir, FLAGS.mode,
                                                    FLAGS.task_id,
                                                    FLAGS.shard_id, 'jsonl.gz')
embeddings_file = "/remote/bones/user/vbalacha/datasets/glove/glove.6B.300d.txt"
output_file = nq_data_utils.get_sharded_filename(FLAGS.output_dir, FLAGS.mode,
                                                 FLAGS.task_id, FLAGS.shard_id,
                                                 'pkl')
dim = 300
stopwords = set(SW.words("english"))
stopwords.add("'s")
word_to_question = {}
question_lens = {}


def extract_keywords(text):
  """Remove wh-words and stop words from text."""
def main(_):
  examples_processed = 0
  instances_processed = 0
  examples_with_instances = 0
  num_examples_with_correct_context = 0

  pretrain_file = open(
      nq_data_utils.get_sharded_filename(FLAGS.pretrain_data_dir, FLAGS.split,
                                         FLAGS.task_id, FLAGS.shard_split_id,
                                         "txt"), 'w')

  if FLAGS.is_training:
    fixed_train_list = None
    if FLAGS.use_fixed_training_data:
      fp = open(FLAGS.fixed_training_data_filepath)
      fixed_train_list = []
      for line in fp:
        fixed_train_list.append(int(line))
    creator_fn = run_nq.CreateTFExampleFn(is_training=FLAGS.is_training)
    instances = []
    input_file = nq_data_utils.get_sharded_filename(
        FLAGS.input_data_dir, FLAGS.split, FLAGS.task_id, FLAGS.shard_split_id,
        "jsonl.gz")
    print("Reading file %s" % input_file)
    file_stats_counter = {
        'example_count': 0,
        'sp_recall_sum': 0,
        'answer_reach_counter': 0,
        'single_answer_reach_counter': 0,
        'multi_answer_recall': 0,
        'single_answer_counter': 0,
        'multi_answer_counter': 0,
        'multi_answer_size_counter': 0
    }
    for example in nq_data_utils.get_nq_examples(input_file):
      ins_count = 0
      stats_count = None
      for instance, stats_count in creator_fn.process(example, pretrain_file,
                                                      fixed_train_list):
        instances.append(instance)
        instances_processed += 1
        ins_count = 1
      # stats_count is None when no instance was produced for this example.
      if stats_count is not None and (
          FLAGS.use_passage_rw_facts_in_shortest_path or
          FLAGS.use_question_to_passage_facts_in_shortest_path):
        file_stats_counter['example_count'] += 1
        file_stats_counter['sp_recall_sum'] += stats_count['fact_recall_counter']
        if len(stats_count['answers_reached']) > 0:
          file_stats_counter['answer_reach_counter'] += 1
          if len(stats_count['answer_entity_ids']) > 1:
            file_stats_counter['multi_answer_recall'] += stats_count[
                'answer_recall_counter']
          else:
            file_stats_counter['single_answer_reach_counter'] += 1
        if len(stats_count['answer_entity_ids']) > 1:
          file_stats_counter['multi_answer_counter'] += 1
          file_stats_counter['multi_answer_size_counter'] += len(
              stats_count['answer_entity_ids'])
        else:
          file_stats_counter['single_answer_counter'] += 1
      if stats_count is not None and FLAGS.use_question_rw_facts_in_shortest_path:
        file_stats_counter['example_count'] += 1
        file_stats_counter['sp_recall_sum'] += stats_count['fact_recall_counter']
        if len(stats_count['answers_reached']) > 0:
          file_stats_counter['answer_reach_counter'] += 1
          if len(stats_count['answer_entity_ids']) > 1:
            file_stats_counter['multi_answer_recall'] += stats_count[
                'answer_recall_counter']
          else:
            file_stats_counter['single_answer_reach_counter'] += 1
        if len(stats_count['answer_entity_ids']) > 1:
          file_stats_counter['multi_answer_counter'] += 1
          file_stats_counter['multi_answer_size_counter'] += len(
              stats_count['answer_entity_ids'])
        else:
          file_stats_counter['single_answer_counter'] += 1
      if example["has_correct_context"]:
        num_examples_with_correct_context += 1
      if examples_processed % 100 == 0:
        print("Examples processed: %d" % examples_processed)
        print("Instances processed: %d" % instances_processed)
      examples_processed += 1
      examples_with_instances += ins_count
      if FLAGS.max_examples > 0 and examples_processed >= FLAGS.max_examples:
        break
    print("Examples with correct context retained: %d of %d" %
          (num_examples_with_correct_context, examples_processed))
    random.shuffle(instances)
    print("Total no. of instances in current shard: %d" % len(instances))
    output_file = nq_data_utils.get_sharded_filename(
        FLAGS.output_data_dir, FLAGS.split, FLAGS.task_id,
        FLAGS.shard_split_id, "tf-record")
    stats_file = nq_data_utils.get_sharded_filename(
        FLAGS.output_data_dir, FLAGS.split, FLAGS.task_id,
        FLAGS.shard_split_id, "stats.txt")
    with tf.python_io.TFRecordWriter(output_file) as writer:
      for instance in instances:
        writer.write(instance)
    with open(stats_file, 'w') as fp:
      print("Example count: %d" % file_stats_counter['example_count'])
      print("Fact Recall sum: %d" % file_stats_counter['sp_recall_sum'])
      print("Count with answers reached: %d" %
            file_stats_counter['answer_reach_counter'])
      print("Single Answer Example count: %d" %
            file_stats_counter['single_answer_counter'])
      print("Single Answer Reached count: %d" %
            file_stats_counter['single_answer_reach_counter'])
      print("Multi Answer Example count: %d" %
            file_stats_counter['multi_answer_counter'])
      print("Multi Answer recall sum: %d" %
            file_stats_counter['multi_answer_recall'])
      print("Multi Answer Size counter: %d" %
            file_stats_counter['multi_answer_size_counter'])
      fp.write("Example count: " +
               str(file_stats_counter['example_count']) + "\n")
      fp.write("Fact Recall sum: " +
               str(file_stats_counter['sp_recall_sum']) + "\n")
      fp.write("Count with answers reached: " +
               str(file_stats_counter['answer_reach_counter']) + "\n")
      fp.write("Single Answer Example count: " +
               str(file_stats_counter['single_answer_counter']) + "\n")
      fp.write("Single Answer Reached count: " +
               str(file_stats_counter['single_answer_reach_counter']) + "\n")
      fp.write("Multi Answer Example count: " +
               str(file_stats_counter['multi_answer_counter']) + "\n")
      fp.write("Multi Answer recall sum: " +
               str(file_stats_counter['multi_answer_recall']) + "\n")
      fp.write("Multi Answer Size counter: " +
               str(file_stats_counter['multi_answer_size_counter']) + "\n")
      fp.write("Examples with instances Processed: " +
               str(examples_with_instances) + "\n")

  # For eval - first process every shard in parallel.
  elif not FLAGS.is_training and not FLAGS.merge_eval:
    input_file = nq_data_utils.get_sharded_filename(
        FLAGS.input_data_dir, FLAGS.split, FLAGS.task_id, FLAGS.shard_split_id,
        "jsonl.gz")
    print("Reading file %s" % input_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    eval_examples = run_nq.read_nq_examples(input_file=input_file,
                                            is_training=False)
    output_file = nq_data_utils.get_sharded_filename(
        FLAGS.output_data_dir, FLAGS.split, FLAGS.task_id, FLAGS.shard_split_id,
        "tf-record")
    eval_writer = run_nq.FeatureWriter(filename=output_file, is_training=False)
    eval_features = []
    examples_processed = 0

    def append_feature(feature):
      eval_features.append(feature)
      eval_writer.process_feature(feature)
      examples_processed = len(eval_features)
      if examples_processed % 10 == 0:
        print("Examples processed: %d" % examples_processed)

    _ = run_nq.convert_examples_to_features(examples=eval_examples,
                                            tokenizer=tokenizer,
                                            is_training=False,
                                            output_fn=append_feature,
                                            pretrain_file=pretrain_file)
    eval_writer.close()

  # For eval - finally merge all shards into one file.
  else:
    instances = []
    for task in range(FLAGS.max_dev_tasks):
      for shard_split in range(FLAGS.max_dev_shard_splits):
        input_file = nq_data_utils.get_sharded_filename(
            FLAGS.input_data_dir, FLAGS.split, task, shard_split, "tf-record")
        print("Reading file %s" % input_file)
        instances.extend([
            tf.train.Example.FromString(r)
            for r in tf.python_io.tf_record_iterator(input_file)
        ])
    output_file = os.path.join(FLAGS.output_data_dir, "eval.tf-record")
    with tf.python_io.TFRecordWriter(output_file) as writer:
      for instance in instances:
        writer.write(instance.SerializeToString())
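# Hedged sanity-check sketch (an assumption, not part of the original script):
# read the merged eval.tf-record back and count the records that were written.
def count_merged_eval_records(output_data_dir):
  merged_file = os.path.join(output_data_dir, "eval.tf-record")
  return sum(1 for _ in tf.python_io.tf_record_iterator(merged_file))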