def use_glove(sentences, index2word, model_file='glove_model/glove_50_iter2900.model',emb_dim=50):
    """Build an embedding matrix for ``index2word`` from a saved GloVe model.

    A vocabulary is rebuilt from ``sentences`` with ``glove.build_vocab`` and
    the matrix stored in ``model_file`` is loaded with ``glove.load_model``.
    Every word of ``index2word`` found in the vocabulary is mapped to the row
    of the loaded matrix selected by the first element of its vocabulary
    entry (presumably the word id -- confirm against ``glove.build_vocab``).
    Any other word gets a zero vector as long as the matrix's second
    dimension.

    Args:
        sentences: Corpus handed to ``glove.build_vocab``.
        index2word: Iterable of words; output rows follow its order.
        model_file: Path passed to ``glove.load_model``.
        emb_dim: Not used by this function.

    Returns:
        ``np.asarray`` of the collected vectors, one per word.
    """
    glove.logger.setLevel(logging.INFO)
    known_words = glove.build_vocab(sentences)
    weights = glove.load_model(model_file)

    def vector_for(token):
        # Out-of-vocabulary words fall back to an all-zero vector.
        if token not in known_words:
            return np.zeros(shape=(weights.shape[1],))
        return weights[known_words[token][0]]

    return np.asarray([vector_for(token) for token in index2word])
def train_glove(sentences, emb_dim=50):
    """Train GloVe vectors on ``sentences`` and checkpoint them periodically.

    The vocabulary and co-occurrence data (``window_size=10``) are built from
    ``sentences``, then ``glove.train_glove`` runs for 3000 iterations with
    ``save_per`` as its per-iteration callback.

    Args:
        sentences: Corpus handed to ``glove.build_vocab`` and
            ``glove.build_cooccur``.
        emb_dim: Passed as ``vector_size``; also embedded in checkpoint
            file names.

    Returns:
        Whatever ``glove.train_glove`` returns. Note that the checkpoints
        written by the callback hold the output of
        ``evaluate.merge_main_context`` instead, so the returned value is
        not necessarily in the same form as the files on disk.
    """
    glove.logger.setLevel(logging.INFO)
    vocab = glove.build_vocab(sentences)
    cooccur = glove.build_cooccur(vocab, sentences, window_size=10)
    id2word = evaluate.make_id2word(vocab)

    def evaluate_word(W):
        # Print the neighbours of a few probe words as a quick sanity check.
        words = ['good', 'movie', 'bad', 'worth', 'dog']
        for word in words:
            # The parenthesised single-argument form prints the same thing
            # under the Python 2 print statement and the Python 3 function.
            print(evaluate.most_similar(W, vocab, id2word, word))

    def save_per(W, i):
        # Only act on iteration indices 100, 200, 300, ...
        if i % 100 != 0 or i < 100:
            return
        filename = "log/glove_%d_iter%d.model" % (emb_dim, i)
        merged = evaluate.merge_main_context(W)
        glove.save_model(merged, filename)
        evaluate_word(merged)

    # Previously the result was bound to a local and dropped, so callers
    # always received None; hand it back instead.
    return glove.train_glove(vocab, cooccur, vector_size=emb_dim, iterations=3000, iter_callback=save_per)
Пример #3
0
def use_glove(sentences,
              index2word,
              model_file='glove_model/glove_50_iter2900.model',
              emb_dim=50):
    """Look up a vector for every word in ``index2word``.

    The vocabulary comes from ``glove.build_vocab(sentences)`` and the
    matrix from ``glove.load_model(model_file)``. A word present in the
    vocabulary takes the matrix row indexed by the first element of its
    vocabulary entry (presumably its id -- confirm against
    ``glove.build_vocab``). A word absent from it takes a zero vector of
    length ``W.shape[1]``.

    Args:
        sentences: Corpus used to rebuild the vocabulary.
        index2word: Iterable of words, in the desired row order.
        model_file: Location of the saved model to load.
        emb_dim: Accepted but not read anywhere in the body.

    Returns:
        The per-word vectors stacked with ``np.asarray``.
    """
    glove.logger.setLevel(logging.INFO)
    vocab = glove.build_vocab(sentences)
    W = glove.load_model(model_file)

    rows = [
        W[vocab[word][0]] if word in vocab else np.zeros(shape=(W.shape[1], ))
        for word in index2word
    ]
    return np.asarray(rows)
Пример #4
0
def train_glove(sentences, emb_dim=50):
    """Train GloVe vectors on ``sentences``, saving a checkpoint every 100 iterations.

    Builds the vocabulary and the co-occurrence data (``window_size=10``),
    then calls ``glove.train_glove`` for 3000 iterations with ``save_per``
    as the per-iteration callback.

    Args:
        sentences: Corpus passed to ``glove.build_vocab`` and
            ``glove.build_cooccur``.
        emb_dim: Used as ``vector_size`` and as part of each checkpoint
            file name.

    Returns:
        The value returned by ``glove.train_glove``. The checkpoint files
        contain the ``evaluate.merge_main_context`` output instead, so the
        two may differ in form.
    """
    glove.logger.setLevel(logging.INFO)
    vocab = glove.build_vocab(sentences)
    cooccur = glove.build_cooccur(vocab, sentences, window_size=10)
    id2word = evaluate.make_id2word(vocab)

    def evaluate_word(W):
        # Show the neighbours of a handful of probe words.
        for word in ('good', 'movie', 'bad', 'worth', 'dog'):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(evaluate.most_similar(W, vocab, id2word, word))

    def save_per(W, i):
        # Checkpoint when the iteration index is a positive multiple of 100.
        if i % 100 == 0 and i >= 100:
            filename = "log/glove_%d_iter%d.model" % (emb_dim, i)
            merged = evaluate.merge_main_context(W)
            glove.save_model(merged, filename)
            evaluate_word(merged)

    # The original bound this result to an unused local, so the function
    # returned None; return it to the caller instead.
    return glove.train_glove(vocab,
                             cooccur,
                             vector_size=emb_dim,
                             iterations=3000,
                             iter_callback=save_per)
Пример #5
0
# Tiny in-memory corpus: the literal is split on newlines, giving one string
# per sentence.
test_corpus = ("""human interface computer
survey user computer system response time
eps user interface system
system human system eps
user response time
trees
graph trees
graph minors trees
graph minors survey
I like graph and stuff
I like trees and stuff
Sometimes I build a graph
Sometimes I build trees""").split("\n")

# Restrict the glove logger to ERROR and above while the fixtures are built.
glove.logger.setLevel(logging.ERROR)
# Module-level fixtures read by test_similarity: vocabulary, co-occurrence
# data (window_size=10) and the result of evaluate.make_id2word (presumably
# an id -> word lookup; confirm against the evaluate module).
vocab = glove.build_vocab(test_corpus)
cooccur = glove.build_cooccur(vocab, test_corpus, window_size=10)
id2word = evaluate.make_id2word(vocab)

# Training runs at import time: vector_size=10, 500 iterations.
W = glove.train_glove(vocab, cooccur, vector_size=10, iterations=500)

# Merge and normalize word vectors
W = evaluate.merge_main_context(W)


def test_similarity():
    """Check that 'trees' is the first result ``evaluate.most_similar`` gives for 'graph'."""
    neighbours = evaluate.most_similar(W, vocab, id2word, 'graph')
    logging.debug(neighbours)

    closest = neighbours[0]
    assert_equal('trees', closest)
Пример #6
0
    with open(path) as f:
        return f.read().split("\n")


# Script stage 1: load the inputs. Each print below announces the stage that
# is about to run.
print("Loading corpus and lexica")

#test_corpus = read_lines("../../datasets/snli_1.0/snli_sentenceA_72k_train.txt")
# NOTE(review): the assignment above is commented out, yet test_corpus is
# read further down. Confirm it is defined elsewhere in the file, otherwise
# glove.build_vocab below raises NameError.

# Synonym / antonym lexica, read with read_lines from fixed relative paths.
synonyms = read_lines("../../datasets/antonym_synonym/synonym_200.txt")
antonyms = read_lines("../../datasets/antonym_synonym/antonym_200.txt")

# Restrict the glove logger to ERROR and above.
glove.logger.setLevel(logging.ERROR)

print("Building Vocab")

# Here build_vocab receives the corpus together with both lexica.
vocab = glove.build_vocab(test_corpus, synonyms, antonyms)

# The raw lexica are replaced by whatever build_syncab / build_antcab return
# for them given the vocabulary (their exact form is not visible here).
synonyms = glove.build_syncab(synonyms, vocab)
antonyms = glove.build_antcab(antonyms, vocab)

print("Building Cooccur")

# Co-occurrence data is built with window_size=1 here.
cooccur = glove.build_cooccur(vocab, test_corpus, window_size=1)
id2word = evaluate.make_id2word(vocab)

print("Training vectors...")

W = glove.train_glove(vocab,
                      synonyms,
                      antonyms,
                      cooccur,
Пример #7
0
# Small hard-coded corpus; splitting the literal on "\n" yields one string
# per sentence.
test_corpus = ("""human interface computer
survey user computer system response time
eps user interface system
system human system eps
user response time
trees
graph trees
graph minors trees
graph minors survey
I like graph and stuff
I like trees and stuff
Sometimes I build a graph
Sometimes I build trees""").split("\n")

# Keep the glove logger quiet (ERROR and above only) during setup.
glove.logger.setLevel(logging.ERROR)
# Fixtures used by test_similarity: the vocabulary, the co-occurrence data
# built with window_size=10, and the output of evaluate.make_id2word
# (presumably mapping ids back to words; confirm against evaluate).
vocab = glove.build_vocab(test_corpus)
cooccur = glove.build_cooccur(vocab, test_corpus, window_size=10)
id2word = evaluate.make_id2word(vocab)

# Runs when the module is imported: vector_size=10, iterations=500.
W = glove.train_glove(vocab, cooccur, vector_size=10, iterations=500)

# Merge and normalize word vectors
W = evaluate.merge_main_context(W)


def test_similarity():
    """Assert that the first ``evaluate.most_similar`` result for 'graph' is 'trees'."""
    ranked = evaluate.most_similar(W, vocab, id2word, 'graph')
    logging.debug(ranked)

    best_match = ranked[0]
    assert_equal('trees', best_match)