import argparse
from os.path import exists

# Project-level imports (TriviaQaOpenDataset, get_evidence_voc) are assumed to come
# from the repository's own triviaqa corpus modules.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("output")
    parser.add_argument("-m", "--min_count", type=int, default=1)
    parser.add_argument("-n", "--n_processes", type=int, default=1)
    args = parser.parse_args()

    if exists(args.output):
        raise ValueError("Output file %s already exists" % args.output)

    # Count word occurrences over the evidence corpus, optionally in parallel
    data = TriviaQaOpenDataset()
    corpus_voc = get_evidence_voc(data.evidence, args.n_processes)

    # Add the tokens of the training questions to the counts
    print("Adding question voc...")
    train = data.get_train()
    for q in train:
        corpus_voc.update(q.question)

    # Write one word per line, keeping only words seen at least `min_count` times
    print("Saving...")
    with open(args.output, "w") as f:
        for word, c in corpus_voc.items():
            if c >= args.min_count:
                f.write(word)
                f.write("\n")


if __name__ == "__main__":
    main()
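The script above depends on get_evidence_voc, which is defined elsewhere in the repository. Below is a minimal sketch of what such a function could look like; the list_documents and get_document accessors, and the assumption that each document is returned as a list of tokenized sentences, are illustrative rather than the repository's actual API.

from collections import Counter
from multiprocessing import Pool


def _count_doc_words(job):
    # Hypothetical helper: count the tokens of a single evidence document.
    evidence, doc_id = job
    voc = Counter()
    for sentence in evidence.get_document(doc_id):  # assumed accessor
        voc.update(sentence)
    return voc


def get_evidence_voc(evidence, n_processes=1):
    # Build a Counter mapping word -> occurrence count over the whole evidence
    # corpus, optionally splitting the documents across worker processes.
    doc_ids = evidence.list_documents()  # assumed accessor
    voc = Counter()
    if n_processes == 1:
        for doc_id in doc_ids:
            voc.update(_count_doc_words((evidence, doc_id)))
    else:
        # Assumes the evidence corpus object is a lightweight, picklable
        # wrapper (e.g. it only stores a directory path).
        with Pool(n_processes) as pool:
            jobs = [(evidence, doc_id) for doc_id in doc_ids]
            for doc_voc in pool.imap_unordered(_count_doc_words, jobs):
                voc.update(doc_voc)
    return voc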
all_train_questions = []
all_dev_questions = []
all_filemaps = {}
for ind, dataset_name in enumerate(args.datasets.split(',')):
    print('loading ' + dataset_name)
    source_dir = join(CORPUS_DIR, "triviaqa", "web-open", dataset_name)
    # Just loading the pkl that was saved in build_span_corpus
    dataset = TriviaQaOpenDataset(source_dir)

    if args.sample_first == 1.0 or ind == 0:
        all_dev_questions += dataset.get_dev()

    # How many datasets in the list share this name once the -G / -O suffixes are stripped
    num_of_contexts = (pd.Series(args.datasets.replace('-G', '').replace('-O', '').split(','))
                       == dataset_name.replace('-G', '').replace('-O', '')).sum()

    train = dataset.get_train()

    # Filter out questions for which no document contains an answer span
    train_with_ans = []
    for question in train:
        if pd.Series([len(doc.answer_spans) for doc in question.all_docs]).sum() > 0:
            train_with_ans.append(question)
    print("number of questions with an answer is %d" % len(train_with_ans))

    # sample_first assumes the first dataset in the list is our target dataset; for ablations we may
    # wish to train on only a sample of it. sample_first is in (0, 1].
    if args.sample_first <= 1.0 and ind == 0:
        all_train_questions += list(pd.Series(train_with_ans).sample(frac=args.sample_first))
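This block sits in the middle of a training-preparation script, so the args it reads are presumably defined by an argument parser earlier in the file. A hedged sketch of that surrounding setup is shown below; the flag names follow the attributes used above, but the defaults and help text are chosen here for illustration only (CORPUS_DIR, pd, join, and TriviaQaOpenDataset come from the script's own imports).

import argparse
from os.path import join

import pandas as pd

parser = argparse.ArgumentParser()
# Comma-separated list of dataset names; the first one is treated as the target dataset.
parser.add_argument("--datasets", type=str, default="web-open")
# Fraction of the target dataset's answerable training questions to keep, in (0, 1].
parser.add_argument("--sample_first", type=float, default=1.0)
args = parser.parse_args()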