def main():
    """Run the whole pipeline, from raw text files to a submission file.

    The stages are executed strictly in order, each one handing its result to
    the next through a file on disk: ``build_vocab`` -> ``build_cooc`` ->
    ``train_glove`` -> ``train_kmeans`` -> ``train_xgb``. All paths and random
    seeds are fixed here; the helpers themselves are defined elsewhere.
    """
    # Inputs.
    positive_path = './data/train_pos.txt'
    negative_path = './data/train_neg.txt'
    validation_path = './data/test_data.txt'
    stopwords_path = './data/stopwords.txt'

    # Intermediate artefacts written by one stage and read by a later one.
    vocab_path = 'vocab.dat'
    inv_vocab_path = 'inv_vocab.dat'
    cooc_path = 'cooc.dat'
    embeddings_path = 'embeddings.dat'
    labels_path = 'labels.dat'

    # Final output.
    submission_path = 'submission.csv'

    # One fixed seed per randomised stage.
    seed_glove = 1234
    seed_kmeans = 4321
    seed_xgb = 1337
    seed_sampler = 7331

    build_vocab([positive_path, negative_path], stopwords_path,
                vocab_path, inv_vocab_path, cutoff=5)

    vocab = load_pickled(vocab_path)
    # NOTE(review): the inverse vocabulary is loaded but not used below.
    inv_vocab = load_pickled(inv_vocab_path)

    build_cooc([positive_path, negative_path], vocab, cooc_path)
    train_glove(cooc_path, embeddings_path, seed_glove)
    train_kmeans(embeddings_path, labels_path, seed_kmeans)
    train_xgb(vocab_path, positive_path, negative_path, labels_path,
              validation_path, submission_path, seed_xgb, seed_sampler)
def train_glove(sentences, emb_dim=50):
    """Train GloVe vectors on ``sentences``, checkpointing every 100 iterations.

    The vocabulary and co-occurrence data are built with the ``glove`` module
    (window size 10), then ``glove.train_glove`` is run for 3000 iterations
    with ``save_per`` as its per-iteration callback.

    Args:
        sentences: corpus, passed unchanged to ``glove.build_vocab`` and
            ``glove.build_cooccur``.
        emb_dim: vector size handed to ``glove.train_glove``; it is also
            embedded in the checkpoint file names.

    Returns:
        The value returned by ``glove.train_glove`` (presumably the trained
        weight matrix -- confirm against the ``glove`` module). Previously the
        result was bound to a local and discarded, so callers got ``None``.
    """
    glove.logger.setLevel(logging.INFO)
    vocab = glove.build_vocab(sentences)
    cooccur = glove.build_cooccur(vocab, sentences, window_size=10)
    id2word = evaluate.make_id2word(vocab)

    def evaluate_word(W):
        """Print ``evaluate.most_similar`` for a handful of probe words."""
        words = ['good', 'movie', 'bad', 'worth', 'dog']
        for word in words:
            # Parenthesised form prints the same on Python 2 and also
            # compiles on Python 3 (the bare print statement does not).
            print(evaluate.most_similar(W, vocab, id2word, word))

    def save_per(W, i):
        """Iteration callback: save and evaluate at iterations 100, 200, ..."""
        if i % 100 == 0 and i >= 100:
            filename = "log/glove_%d_iter%d.model" % (emb_dim, i)
            # Rebinds only the local name; the matrix being trained is the
            # one the caller passed in.
            W = evaluate.merge_main_context(W)
            glove.save_model(W, filename)
            evaluate_word(W)

    W = glove.train_glove(vocab, cooccur, vector_size=emb_dim,
                          iterations=3000, iter_callback=save_per)
    return W
Пример #3
0
def main(args):
    """Load the Stanford Sentiment dataset and train word vectors on it.

    Prints progress banners, makes sure ``args.vector_path`` exists, then
    trains either word2vec vectors (``args.model == 'word2vec'``) or, for any
    other value, GloVe vectors.

    Args:
        args: object providing ``model`` and ``vector_path``; the GloVe branch
            additionally reads ``vector_size`` and ``iterations``. The whole
            object is forwarded to ``word2vec_model`` on the word2vec branch.
    """
    # print(...) with a single argument prints the same on Python 2 and is
    # also valid on Python 3, unlike the bare print statement.
    banner = 80 * "="
    print(banner)
    print("INITIALIZING")
    print(banner)
    dataset = StanfordSentiment()
    print("Done, read total %d windows" % dataset.word_count())
    print(banner)
    print("TRAINING")
    print(banner)
    print("Training %s word vectors" % args.model)
    if not os.path.exists(args.vector_path):
        os.makedirs(args.vector_path)

    # NOTE(review): word_vectors is not used or returned in the visible code.
    if args.model == 'word2vec':
        word_vectors = word2vec_model(args, dataset)
    else:
        # glove model
        vocab = dataset.tokens()
        word_freq = dataset.tokenfreq()
        cooccur = build_cooccur(vocab, word_freq, dataset, window_size=10)
        word_vectors = train_glove(vocab, cooccur, args.vector_size,
                                   args.vector_path,
                                   iterations=args.iterations)
Пример #4
0
def train_glove(sentences, emb_dim=50):
    """Train GloVe vectors on ``sentences``, checkpointing every 100 iterations.

    The vocabulary and co-occurrence data are built with the ``glove`` module
    (window size 10), then ``glove.train_glove`` is run for 3000 iterations
    with ``save_per`` as its per-iteration callback.

    Args:
        sentences: corpus, passed unchanged to ``glove.build_vocab`` and
            ``glove.build_cooccur``.
        emb_dim: vector size handed to ``glove.train_glove``; it is also
            embedded in the checkpoint file names.

    Returns:
        The value returned by ``glove.train_glove`` (presumably the trained
        weight matrix -- confirm against the ``glove`` module). Previously the
        result was bound to a local and discarded, so callers got ``None``.
    """
    glove.logger.setLevel(logging.INFO)
    vocab = glove.build_vocab(sentences)
    cooccur = glove.build_cooccur(vocab, sentences, window_size=10)
    id2word = evaluate.make_id2word(vocab)

    def evaluate_word(W):
        """Print ``evaluate.most_similar`` for a handful of probe words."""
        words = ['good', 'movie', 'bad', 'worth', 'dog']
        for word in words:
            # Parenthesised form prints the same on Python 2 and also
            # compiles on Python 3 (the bare print statement does not).
            print(evaluate.most_similar(W, vocab, id2word, word))

    def save_per(W, i):
        """Iteration callback: save and evaluate at iterations 100, 200, ..."""
        if i % 100 == 0 and i >= 100:
            filename = "log/glove_%d_iter%d.model" % (emb_dim, i)
            # Rebinds only the local name; the matrix being trained is the
            # one the caller passed in.
            W = evaluate.merge_main_context(W)
            glove.save_model(W, filename)
            evaluate_word(W)

    W = glove.train_glove(vocab,
                          cooccur,
                          vector_size=emb_dim,
                          iterations=3000,
                          iter_callback=save_per)
    return W
Пример #5
0
# Toy corpus used as a module-level fixture: one short document per line.
_corpus_text = """human interface computer
survey user computer system response time
eps user interface system
system human system eps
user response time
trees
graph trees
graph minors trees
graph minors survey
I like graph and stuff
I like trees and stuff
Sometimes I build a graph
Sometimes I build trees"""
test_corpus = _corpus_text.split("\n")

# Raise the glove module's logger threshold to ERROR while the fixture trains.
glove.logger.setLevel(logging.ERROR)

# Fixture pipeline: vocabulary -> co-occurrences -> id-to-word lookup.
vocab = glove.build_vocab(test_corpus)
cooccur = glove.build_cooccur(vocab, test_corpus, window_size=10)
id2word = evaluate.make_id2word(vocab)

# Training runs once, at import time; the test below reads the result.
W = glove.train_glove(
    vocab,
    cooccur,
    vector_size=10,
    iterations=500,
)

# Merge and normalize word vectors
W = evaluate.merge_main_context(W)


def test_similarity():
    """Assert that 'trees' is the first result of most_similar for 'graph'."""
    nearest = evaluate.most_similar(W, vocab, id2word, 'graph')
    logging.debug(nearest)

    top_match = nearest[0]
    assert_equal('trees', top_match)
Пример #6
0
# The vocabulary is built from the corpus together with the synonym and
# antonym inputs (all three are defined before this point in the script).
vocab = glove.build_vocab(test_corpus, synonyms, antonyms)

# Rebind both names to the vocabulary-based structures used for training.
synonyms = glove.build_syncab(synonyms, vocab)
antonyms = glove.build_antcab(antonyms, vocab)

print("Building Cooccur")

# A window size of 1 is used for the co-occurrence data.
cooccur = glove.build_cooccur(vocab, test_corpus, window_size=1)
id2word = evaluate.make_id2word(vocab)

print("Training vectors...")

W = glove.train_glove(vocab, synonyms, antonyms, cooccur,
                      vector_size=100, iterations=25)

print("Evaluation:")
# Merge and normalize word vectors
W = evaluate.merge_main_context(W)


def test_similarity():
    """Print and log the most_similar result for 'trees'; asserts nothing."""
    nearest = evaluate.most_similar(W, vocab, id2word, 'trees')
    print(nearest)
    logger.info(nearest)

    # The equality check is currently disabled:
    # assert_equal('trees', nearest[0])
Пример #7
0
# Toy corpus used as a module-level fixture: one short document per line.
_corpus_text = """human interface computer
survey user computer system response time
eps user interface system
system human system eps
user response time
trees
graph trees
graph minors trees
graph minors survey
I like graph and stuff
I like trees and stuff
Sometimes I build a graph
Sometimes I build trees"""
test_corpus = _corpus_text.split("\n")

# Raise the glove module's logger threshold to ERROR while the fixture trains.
glove.logger.setLevel(logging.ERROR)

# Fixture pipeline: vocabulary -> co-occurrences -> id-to-word lookup.
vocab = glove.build_vocab(test_corpus)
cooccur = glove.build_cooccur(vocab, test_corpus, window_size=10)
id2word = evaluate.make_id2word(vocab)

# Training runs once, at import time; the test below reads the result.
W = glove.train_glove(
    vocab,
    cooccur,
    vector_size=10,
    iterations=500,
)

# Merge and normalize word vectors
W = evaluate.merge_main_context(W)


def test_similarity():
    """Assert that 'trees' is the first result of most_similar for 'graph'."""
    nearest = evaluate.most_similar(W, vocab, id2word, 'graph')
    logging.debug(nearest)

    top_match = nearest[0]
    assert_equal('trees', top_match)