Exemplo n.º 1
0
def main():
    """Build the vocabulary and GloVe embedding artifacts for training.

    Reads the raw training corpus, builds a vocabulary capped at
    ``max_vocab_size`` entries, and writes the vocab file, the pickled
    dictionary, and the extracted embedding matrix under ``data/``.

    Relies on ``Vocab.build_vocab`` (defined elsewhere) for all of the
    actual work; this function only wires up the file paths.
    """
    input_file = "data/train.txt"          # raw training corpus
    vocab_file = "data/vocab"              # vocabulary listing (output)
    embedding_file = "data/glove.npz"      # extracted embedding matrix (output)
    glove_file = "data/glove.840B.300d.txt"  # pretrained GloVe vectors (input)
    dict_file = "data/dict.p"              # pickled word->index dict (output)
    # A vocabulary size is a count: use an int literal. The original
    # `5e4` is a float, which breaks int-only uses such as slicing
    # (e.g. `words[:max_vocab_size]` raises TypeError on a float).
    max_vocab_size = 50000
    Vocab.build_vocab(input_file, vocab_file, dict_file, glove_file,
                      embedding_file, max_vocab_size)
Exemplo n.º 2
0
if __name__ == '__main__':
    vocab_num = 100000
    pubmed_w2v_path = 'pubmed_w2v.txt'
    emb_path = 'emb_cnn.pt'
    opt = Options(config_vocab=False)
    pubmedreader = PubMedReader(opt)
    print('loding text data')
    train_sents, train_labels, test_sents, test_labels, valid_sents, valid_labels = pubmedreader.get_data(
    )

    print('read vocab')
    fixed_vocab_set = read_vocab(pubmed_w2v_path)
    print('fixed vocab set size {}'.format(len(fixed_vocab_set)))
    print('build vocab')
    vocab = Vocab.build_vocab(train_sents, fixed_vocab_set=fixed_vocab_set)
    #
    vocab.append_sents(valid_sents, fixed_vocab_set=fixed_vocab_set)
    vocab.append_sents(test_sents, fixed_vocab_set=fixed_vocab_set)
    #
    print('vocab size {} before shrink'.format(vocab.vocab_len))
    vocab.shrink_vocab(2)
    print('vocab size {} after shrink'.format(vocab.vocab_len))

    print('read vec')
    word_list = [vocab.idx2word[i] for i in range(len(vocab.idx2word))]
    vec = read_vec(pubmed_w2v_path, word_list)
    assert vec.shape[0] == vocab.vocab_len

    print('build emb layer')
    emb = Embedding(vocab.vocab_len,