def make_para_dataset(): embedding_file = "../glove.840B.300d.txt" embedding = "./embedding.pkl" src_word2idx_file = "./word2idx.pkl" train_squad = "../squad/train-v1.1.json" dev_squad = "../squad/dev-v1.1.json" train_src_file = "../squad/para-train.txt" train_trg_file = "../squad/tgt-train.txt" dev_src_file = "../squad/para-dev.txt" dev_trg_file = "../squad/tgt-dev.txt" test_src_file = "../squad/para-test.txt" test_trg_file = "../squad/tgt-test.txt" # pre-process training data train_examples, counter = process_file(train_squad) make_conll_format(train_examples, train_src_file, train_trg_file) word2idx = make_vocab_from_squad(src_word2idx_file, counter, config.vocab_size) make_embedding(embedding_file, embedding, word2idx) # split dev into dev and test dev_test_examples, _ = process_file(dev_squad) # random.shuffle(dev_test_examples) num_dev = len(dev_test_examples) // 2 dev_examples = dev_test_examples[:num_dev] test_examples = dev_test_examples[num_dev:] make_conll_format(dev_examples, dev_src_file, dev_trg_file) make_conll_format(test_examples, test_src_file, test_trg_file)
def make_sent_dataset():
    embedding_file = "./glove/glove.840B.300d.txt"
    embedding = "./hotpot/embedding.pkl"
    src_word2idx_file = "./hotpot/word2idx.pkl"

    train_hotpot = "./hotpot/data/hotpot_train_v1.1.json"
    dev_hotpot = "./hotpot/data/hotpot_dev_distractor_v1.json"

    train_src_file = "./hotpot-sent/para-train.txt"
    train_trg_file = "./hotpot-sent/tgt-train.txt"
    dev_src_file = "./hotpot-sent/para-dev.txt"
    dev_trg_file = "./hotpot-sent/tgt-dev.txt"
    test_src_file = "./hotpot-sent/para-test.txt"
    test_trg_file = "./hotpot-sent/tgt-test.txt"

    # pre-process training data
    train_examples, counter = process_file(train_hotpot, "sent")
    make_conll_format(train_examples, train_src_file, train_trg_file)
    word2idx = make_vocab_from_hotpot(src_word2idx_file, counter, config.vocab_size)
    make_embedding(embedding_file, embedding, word2idx)

    # split dev into dev and test
    dev_test_examples, _ = process_file(dev_hotpot, "sent")
    # random.shuffle(dev_test_examples)
    num_dev = len(dev_test_examples) // 2
    dev_examples = dev_test_examples[:num_dev]
    test_examples = dev_test_examples[num_dev:]
    make_conll_format(dev_examples, dev_src_file, dev_trg_file)
    make_conll_format(test_examples, test_src_file, test_trg_file)
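# Hypothetical driver, assuming the two builders above live in one module:
# make_para_dataset() prepares paragraph-level SQuAD data, while
# make_sent_dataset() prepares HotpotQA data (the "sent" argument to
# process_file presumably selects sentence-level granularity).
if __name__ == "__main__":
    make_para_dataset()
    make_sent_dataset()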
def make_para_dataset(): embedding_file = "./glove.840B.300d.txt" embedding = "./embedding.pkl" src_word2idx_file = "./word2idx.pkl" ent2idx_file = "./ent2idx.pkl" rel2idx_file = "./rel2idx.pkl" entity_embedding = "./entity.pkl" relation_embedding = "./relation.pkl" train_squad = "../squad/train-v1.1.json" dev_squad = "../squad/dev-v1.1.json" train_src_file = "../squad/para-train.txt" train_trg_file = "../squad/tgt-train.txt" train_cs_file = "./paracs-train.json" dev_src_file = "../squad/para-dev.txt" dev_trg_file = "../squad/tgt-dev.txt" dev_cs_file = "./paracs-dev.json" test_src_file = "../squad/para-test.txt" test_trg_file = "../squad/tgt-test.txt" test_cs_file = "./paracs-test.json" ent_vector = "./entity_transE.txt" rel_vector = "./relation_transE.txt" ent_file = "./entity.txt" rel_file = "./relation.txt" cs_file = "./resource.json" database = dict() with open(cs_file, "r") as f: d = json.load(f) if d["dict_csk"] is not None: database = d["dict_csk"] # process the graph vector through the static attention mechanism _, _, ent2idx, rel2idx = make_graph_vector(entity_embedding, relation_embedding, ent_vector, ent_file, rel_vector, rel_file, ent2idx_file, rel2idx_file ) # pre-process training data train_examples, counter, num = process_file(train_squad, ent2idx, rel2idx, database) make_conll_format(train_examples, train_src_file, train_trg_file, train_cs_file, num) word2idx = make_vocab_from_squad(src_word2idx_file, counter, config.vocab_size) make_embedding(embedding_file, embedding, word2idx) # split dev into dev and test dev_test_examples, _, num = process_file(dev_squad, ent2idx, rel2idx, database) # random.shuffle(dev_test_examples) num_dev = len(dev_test_examples) // 2 dev_examples = dev_test_examples[:num_dev] test_examples = dev_test_examples[num_dev:] make_conll_format(dev_examples, dev_src_file, dev_trg_file, dev_cs_file, num) make_conll_format(test_examples, test_src_file, test_trg_file, test_cs_file, num)
def make_sent_dataset():
    train_src_file = "./para-train.txt"
    train_trg_file = "./tgt-train.txt"
    embedding_file = "./glove.840B.300d.txt"
    embedding = "./embedding.pkl"
    word2idx_file = "./word2idx.pkl"

    # make vocab file
    word2idx = make_vocab(train_src_file, train_trg_file, word2idx_file, config.vocab_size)
    make_embedding(embedding_file, embedding, word2idx)
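# make_vocab is implemented elsewhere in the repo; this is a minimal sketch of
# the usual recipe, assuming whitespace-tokenized text files and reserved
# special tokens (the token names and their indices are assumptions).
from collections import Counter

def make_vocab_sketch(src_file, trg_file, word2idx_file, vocab_size):
    counter = Counter()
    for path in (src_file, trg_file):
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                counter.update(line.split())
    # Reserve ids for special tokens first, then add the most frequent words.
    word2idx = {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3}
    for word, _ in counter.most_common(vocab_size - len(word2idx)):
        word2idx[word] = len(word2idx)
    with open(word2idx_file, "wb") as f:
        pickle.dump(word2idx, f)
    return word2idx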
def make_para_dataset():
    embedding_file = "./glove.840B.300d.txt"
    embedding = "./embedding.pkl"
    src_word2idx_file = "./word2idx.pkl"

    train_squad = "../squad/train-v1.1.json"
    dev_squad = "../squad/dev-v1.1.json"

    train_src_file = "../squad/para-train.txt"
    train_trg_file = "../squad/tgt-train.txt"
    dev_src_file = "../squad/para-dev.txt"
    dev_trg_file = "../squad/tgt-dev.txt"
    test_src_file = "../squad/para-test.txt"
    test_trg_file = "../squad/tgt-test.txt"

    # pre-process training data: each example is a passage-question pair,
    # with passages and questions represented as lists of tokens; counter
    # tracks word frequency across all passages
    with open('../cnn-dailymail/cnn_examples.pkl', 'rb') as f:
        cnn = pickle.load(f)
    print('loaded cnn')
    # with open('../cnn-dailymail/dm_examples.pkl', 'rb') as f:
    #     dm = pickle.load(f)
    # print('loaded dailymail')

    counter = defaultdict(int)
    examples = cnn
    shuffle(examples)

    # hold out 8% of the examples for dev and test
    train_size = int(len(examples) * 0.92)
    train_examples = examples[:train_size]
    dev_test_examples = examples[train_size:]
    print(len(train_examples), len(dev_test_examples))

    for e in train_examples:
        for token in e['context_tokens']:
            counter[token] += 1
    make_conll_format2(train_examples, train_src_file, train_trg_file)
    # make a dict mapping each word to a unique index
    word2idx = make_vocab_from_dm(src_word2idx_file, counter, config.vocab_size)
    # map every in-vocabulary word to its GloVe embedding vector
    make_embedding(embedding_file, embedding, word2idx)

    # split dev into dev and test
    # random.shuffle(dev_test_examples)
    num_dev = len(dev_test_examples) // 2
    dev_examples = dev_test_examples[:num_dev]
    test_examples = dev_test_examples[num_dev:]
    make_conll_format2(dev_examples, dev_src_file, dev_trg_file)
    make_conll_format2(test_examples, test_src_file, test_trg_file)
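# The schema of cnn_examples.pkl is not shown in this file; the loop above
# only relies on each example exposing a 'context_tokens' list. An
# illustrative record might look like the following (the 'question_tokens'
# field is an assumption about what make_conll_format2 consumes):
example = {
    "context_tokens": ["the", "storm", "made", "landfall", "on", "monday"],
    "question_tokens": ["when", "did", "the", "storm", "make", "landfall", "?"],
}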
def make_sent_dataset(): train_src_file = "../squad/train_src50.txt" train_trg_file = "../squad/train_tgt50.txt" # dev file dev_src_file = "../squad/dev_src50.txt" dev_trg_file = "../squad/dev_tgt50.txt" embedding_file = "./glove.840B.300d.txt" embedding = "./embedding.pkl" word2idx_file = "./word2idx.pkl" # make vocab file word2idx = make_vocab(train_src_file, train_trg_file, dev_src_file, dev_trg_file, word2idx_file, config.vocab_size) make_embedding(embedding_file, embedding, word2idx)