Example No. 1
def make_para_dataset():
    embedding_file = "../glove.840B.300d.txt"
    embedding = "./embedding.pkl"
    src_word2idx_file = "./word2idx.pkl"

    train_squad = "../squad/train-v1.1.json"
    dev_squad = "../squad/dev-v1.1.json"

    train_src_file = "../squad/para-train.txt"
    train_trg_file = "../squad/tgt-train.txt"
    dev_src_file = "../squad/para-dev.txt"
    dev_trg_file = "../squad/tgt-dev.txt"

    test_src_file = "../squad/para-test.txt"
    test_trg_file = "../squad/tgt-test.txt"

    # pre-process training data
    train_examples, counter = process_file(train_squad)
    make_conll_format(train_examples, train_src_file, train_trg_file)
    word2idx = make_vocab_from_squad(src_word2idx_file, counter,
                                     config.vocab_size)
    make_embedding(embedding_file, embedding, word2idx)

    # split dev into dev and test
    dev_test_examples, _ = process_file(dev_squad)
    # random.shuffle(dev_test_examples)
    num_dev = len(dev_test_examples) // 2
    dev_examples = dev_test_examples[:num_dev]
    test_examples = dev_test_examples[num_dev:]
    make_conll_format(dev_examples, dev_src_file, dev_trg_file)
    make_conll_format(test_examples, test_src_file, test_trg_file)
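
The helpers used above (process_file, make_conll_format, make_vocab_from_squad, make_embedding) are defined elsewhere in the repository. As a rough guide to the embedding step, here is a minimal, hypothetical sketch of a make_embedding that maps each word2idx entry to its GloVe row and pickles the resulting matrix; the real helper's signature and initialization may differ.

import pickle

import numpy as np

def make_embedding(embedding_file, output_file, word2idx, embedding_dim=300):
    # Random init, then overwrite rows for words found in the GloVe file.
    embedding = np.random.normal(0.0, 0.1, (len(word2idx), embedding_dim))
    with open(embedding_file, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            word, values = parts[0], parts[1:]
            if word in word2idx and len(values) == embedding_dim:
                embedding[word2idx[word]] = np.asarray(values, dtype=np.float32)
    with open(output_file, "wb") as f:
        pickle.dump(embedding, f)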
Example No. 2
def make_sent_dataset():
    embedding_file = "./glove/glove.840B.300d.txt"
    embedding = "./hotpot/embedding.pkl"
    src_word2idx_file = "./hotpot/word2idx.pkl"

    train_hotpot = "./hotpot/data/hotpot_train_v1.1.json"
    dev_hotpot = "./hotpot/data/hotpot_dev_distractor_v1.json"

    train_src_file = "./hotpot-sent/para-train.txt"
    train_trg_file = "./hotpot-sent/tgt-train.txt"
    dev_src_file = "./hotpot-sent/para-dev.txt"
    dev_trg_file = "./hotpot-sent/tgt-dev.txt"

    test_src_file = "./hotpot-sent/para-test.txt"
    test_trg_file = "./hotpot-sent/tgt-test.txt"

    # pre-process training data
    train_examples, counter = process_file(train_hotpot, "sent")
    make_conll_format(train_examples, train_src_file, train_trg_file)
    word2idx = make_vocab_from_hotpot(src_word2idx_file, counter, config.vocab_size)
    make_embedding(embedding_file, embedding, word2idx)

    # split dev into dev and test
    dev_test_examples, _ = process_file(dev_hotpot, "sent")
    # random.shuffle(dev_test_examples)
    num_dev = len(dev_test_examples) // 2
    dev_examples = dev_test_examples[:num_dev]
    test_examples = dev_test_examples[num_dev:]
    make_conll_format(dev_examples, dev_src_file, dev_trg_file)
    make_conll_format(test_examples, test_src_file, test_trg_file)
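
make_vocab_from_squad and make_vocab_from_hotpot both appear to reduce a token-frequency counter to a fixed-size word index. A plausible sketch follows; the helper name and the special tokens are assumptions of this illustration.

import pickle

def make_vocab_from_counter(word2idx_file, counter, vocab_size,
                            specials=("<pad>", "<unk>", "<s>", "</s>")):
    # Reserve the lowest indices for special symbols, then add the
    # most frequent corpus tokens until vocab_size is reached.
    word2idx = {tok: i for i, tok in enumerate(specials)}
    for word, _ in sorted(counter.items(), key=lambda kv: kv[1], reverse=True):
        if len(word2idx) >= vocab_size:
            break
        if word not in word2idx:
            word2idx[word] = len(word2idx)
    with open(word2idx_file, "wb") as f:
        pickle.dump(word2idx, f)
    return word2idx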
Example No. 3
import json

def make_para_dataset():
    embedding_file = "./glove.840B.300d.txt"
    embedding = "./embedding.pkl"
    src_word2idx_file = "./word2idx.pkl"
    ent2idx_file = "./ent2idx.pkl"
    rel2idx_file = "./rel2idx.pkl"
    entity_embedding = "./entity.pkl"
    relation_embedding = "./relation.pkl"

    train_squad = "../squad/train-v1.1.json"
    dev_squad = "../squad/dev-v1.1.json"

    train_src_file = "../squad/para-train.txt"
    train_trg_file = "../squad/tgt-train.txt"
    train_cs_file = "./paracs-train.json"
    dev_src_file = "../squad/para-dev.txt"
    dev_trg_file = "../squad/tgt-dev.txt"
    dev_cs_file = "./paracs-dev.json"

    test_src_file = "../squad/para-test.txt"
    test_trg_file = "../squad/tgt-test.txt"
    test_cs_file = "./paracs-test.json"
    ent_vector = "./entity_transE.txt"
    rel_vector = "./relation_transE.txt"
    ent_file = "./entity.txt"
    rel_file = "./relation.txt"
    cs_file = "./resource.json"

    database = dict()
    with open(cs_file, "r") as f:
        d = json.load(f)
        if d["dict_csk"] is not None:
            database = d["dict_csk"]

    # process the graph vector through the static attention mechanism
    _, _, ent2idx, rel2idx = make_graph_vector(entity_embedding,
                                               relation_embedding,
                                               ent_vector,
                                               ent_file,
                                               rel_vector,
                                               rel_file,
                                               ent2idx_file,
                                               rel2idx_file
                                               )
    # pre-process training data
    train_examples, counter, num = process_file(train_squad, ent2idx, rel2idx, database)
    make_conll_format(train_examples, train_src_file, train_trg_file, train_cs_file, num)
    word2idx = make_vocab_from_squad(src_word2idx_file, counter, config.vocab_size)
    make_embedding(embedding_file, embedding, word2idx)

    # split dev into dev and test
    dev_test_examples, _, num = process_file(dev_squad, ent2idx, rel2idx, database)
    # random.shuffle(dev_test_examples)
    num_dev = len(dev_test_examples) // 2
    dev_examples = dev_test_examples[:num_dev]
    test_examples = dev_test_examples[num_dev:]
    make_conll_format(dev_examples, dev_src_file, dev_trg_file, dev_cs_file, num)
    make_conll_format(test_examples, test_src_file, test_trg_file, test_cs_file, num)
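
make_graph_vector is not shown here. Assuming entity_transE.txt and relation_transE.txt hold one TransE vector per line, aligned with the names in entity.txt and relation.txt, one side of that helper might look like the following sketch (load_transe_side is a hypothetical name):

import pickle

import numpy as np

def load_transe_side(vector_file, name_file, idx_file, pkl_file):
    # Pair the i-th vector with the i-th name, then pickle the matrix
    # and the name-to-row index separately.
    with open(name_file, "r", encoding="utf-8") as f:
        names = [line.strip() for line in f]
    vectors = np.loadtxt(vector_file, dtype=np.float32)
    name2idx = {name: i for i, name in enumerate(names)}
    with open(pkl_file, "wb") as f:
        pickle.dump(vectors, f)
    with open(idx_file, "wb") as f:
        pickle.dump(name2idx, f)
    return vectors, name2idx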
Example No. 4
def make_sent_dataset():
    train_src_file = "./para-train.txt"
    train_trg_file = "./tgt-train.txt"

    embedding_file = "./glove.840B.300d.txt"
    embedding = "./embedding.pkl"
    word2idx_file = "./word2idx.pkl"
    # make vocab file
    word2idx = make_vocab(train_src_file, train_trg_file, word2idx_file, config.vocab_size)
    make_embedding(embedding_file, embedding, word2idx)
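
In this variant, make_vocab builds the index straight from the tokenized text files rather than from a counter returned by process_file. A minimal sketch consistent with the four-argument call above (the special tokens are assumptions):

import pickle
from collections import Counter

def make_vocab(src_file, trg_file, word2idx_file, vocab_size):
    # Count whitespace-separated tokens in both files, then keep the
    # most frequent entries after the special symbols.
    counter = Counter()
    for path in (src_file, trg_file):
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                counter.update(line.split())
    word2idx = {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3}
    for word, _ in counter.most_common():
        if len(word2idx) >= vocab_size:
            break
        word2idx.setdefault(word, len(word2idx))
    with open(word2idx_file, "wb") as f:
        pickle.dump(word2idx, f)
    return word2idx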
Example No. 5
import pickle
from collections import defaultdict
from random import shuffle

def make_para_dataset():
    embedding_file = "./glove.840B.300d.txt"
    embedding = "./embedding.pkl"
    src_word2idx_file = "./word2idx.pkl"

    train_squad = "../squad/train-v1.1.json"
    dev_squad = "../squad/dev-v1.1.json"

    train_src_file = "../squad/para-train.txt"
    train_trg_file = "../squad/tgt-train.txt"
    dev_src_file = "../squad/para-dev.txt"
    dev_trg_file = "../squad/tgt-dev.txt"

    test_src_file = "../squad/para-test.txt"
    test_trg_file = "../squad/tgt-test.txt"

    # pre-process training data
    # train_examples holds passage-question pairs; counter is the word
    # frequency across all passages. Questions and passages are lists of tokens.
    with open('../cnn-dailymail/cnn_examples.pkl', 'rb') as f:
        cnn = pickle.load(f)
    print('loaded cnn')
    # with open('../cnn-dailymail/dm_examples.pkl', 'rb') as f:
    #     dm = pickle.load(f)
    #     print('loaded dailymail')

    counter = defaultdict(int)

    examples = cnn
    shuffle(examples)
    train_size = int(len(examples) * 0.92)
    train_examples = examples[:train_size]
    print(len(train_examples))
    dev_test_examples = examples[train_size:]
    print(len(train_examples), len(dev_test_examples))
    for e in train_examples:
        for token in e['context_tokens']:
            counter[token] += 1
    make_conll_format2(train_examples, train_src_file, train_trg_file)
    # make a dict mapping word to unique index
    word2idx = make_vocab_from_dm(src_word2idx_file, counter,
                                  config.vocab_size)
    # makes a dict mapping words from all passages to embedding vectors
    make_embedding(embedding_file, embedding, word2idx)

    # split dev into dev and test
    # random.shuffle(dev_test_examples)
    num_dev = len(dev_test_examples) // 2
    dev_examples = dev_test_examples[:num_dev]
    test_examples = dev_test_examples[num_dev:]
    make_conll_format2(dev_examples, dev_src_file, dev_trg_file)
    make_conll_format2(test_examples, test_src_file, test_trg_file)
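
make_conll_format2 is only called here, with examples that carry a context_tokens field. A plausible sketch of the writer (the question_tokens key is an assumption of this illustration):

def make_conll_format2(examples, src_file, trg_file):
    # Keep source and target files line-aligned: passage i in src_file
    # pairs with question i in trg_file.
    with open(src_file, "w", encoding="utf-8") as src, \
            open(trg_file, "w", encoding="utf-8") as trg:
        for e in examples:
            src.write(" ".join(e["context_tokens"]) + "\n")
            trg.write(" ".join(e["question_tokens"]) + "\n")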
Example No. 6
def make_sent_dataset():

    train_src_file = "../squad/train_src50.txt"
    train_trg_file = "../squad/train_tgt50.txt"
    # dev file
    dev_src_file = "../squad/dev_src50.txt"
    dev_trg_file = "../squad/dev_tgt50.txt"

    embedding_file = "./glove.840B.300d.txt"
    embedding = "./embedding.pkl"
    word2idx_file = "./word2idx.pkl"
    # make vocab file
    word2idx = make_vocab(train_src_file, train_trg_file, dev_src_file, dev_trg_file, word2idx_file, config.vocab_size)
    make_embedding(embedding_file, embedding, word2idx)
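
Note that unlike Example No. 4, this make_vocab variant also receives the dev source and target files, so dev tokens are counted toward the vocabulary before the cutoff at config.vocab_size.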