def main():
    """Run the full sentiment pipeline: vocab -> co-occurrence -> GloVe -> k-means -> XGBoost."""
    # Input corpora and auxiliary resources.
    positives = './data/train_pos.txt'
    negatives = './data/train_neg.txt'
    validation = './data/test_data.txt'
    stopwords = './data/stopwords.txt'

    # Intermediate artifacts written by each stage.
    vocab_file = 'vocab.dat'
    inv_vocab_file = 'inv_vocab.dat'
    cooc_file = 'cooc.dat'
    embeddings_file = 'embeddings.dat'
    label_file = 'labels.dat'
    submission_file = 'submission.csv'

    # Fixed seeds so every stage is reproducible.
    glove_seed = 1234
    kmeans_seed = 4321
    xgb_seed = 1337
    sampler_seed = 7331

    # Stage 1: vocabulary (words below the cutoff frequency are dropped).
    build_vocab([positives, negatives], stopwords, vocab_file,
                inv_vocab_file, cutoff=5)
    vocab = load_pickled(vocab_file)
    inv_vocab = load_pickled(inv_vocab_file)

    # Stage 2: co-occurrence counts over both corpora.
    build_cooc([positives, negatives], vocab, cooc_file)

    # Stage 3-5: embeddings, cluster labels, final classifier + submission.
    train_glove(cooc_file, embeddings_file, glove_seed)
    train_kmeans(embeddings_file, label_file, kmeans_seed)
    train_xgb(vocab_file, positives, negatives, label_file, validation,
              submission_file, xgb_seed, sampler_seed)
def train_glove(sentences, emb_dim=50):
    """Train GloVe word vectors on *sentences* and return the weight matrix.

    Every 100 iterations the model is snapshotted under log/ and the nearest
    neighbours of a few probe words are printed as a training sanity check.

    Args:
        sentences: corpus accepted by glove.build_vocab / glove.build_cooccur.
        emb_dim: dimensionality of the trained word vectors.

    Returns:
        The raw weight matrix produced by glove.train_glove.
        (Bug fix: the original computed this matrix but never returned it.)
    """
    glove.logger.setLevel(logging.INFO)
    vocab = glove.build_vocab(sentences)
    cooccur = glove.build_cooccur(vocab, sentences, window_size=10)
    id2word = evaluate.make_id2word(vocab)

    def evaluate_word(W):
        # Print neighbours of a handful of sentiment-relevant probe words.
        for word in ['good', 'movie', 'bad', 'worth', 'dog']:
            print(evaluate.most_similar(W, vocab, id2word, word))

    def save_per(W, i):
        # Checkpoint every 100 iterations, skipping iteration 0.
        if i % 100 == 0 and i >= 100:
            filename = "log/glove_%d_iter%d.model" % (emb_dim, i)
            # Merge into a fresh name instead of rebinding the W parameter.
            merged = evaluate.merge_main_context(W)
            glove.save_model(merged, filename)
            evaluate_word(merged)

    W = glove.train_glove(vocab, cooccur, vector_size=emb_dim,
                          iterations=3000, iter_callback=save_per)
    return W
def main(args):
    """Entry point: load the Stanford Sentiment dataset and train word vectors
    with the model selected by ``args.model`` ('word2vec' or GloVe)."""

    def _banner(title):
        # Section header framed by '=' rules, matching the original output.
        print(80 * "=")
        print(title)
        print(80 * "=")

    _banner("INITIALIZING")
    dataset = StanfordSentiment()
    print("Done, read total %d windows" % dataset.word_count())

    _banner("TRAINING")
    print("Training %s word vectors" % args.model)

    # Make sure the output directory for vectors exists.
    if not os.path.exists(args.vector_path):
        os.makedirs(args.vector_path)

    if args.model == 'word2vec':
        word_vectors = word2vec_model(args, dataset)
    else:  # glove model
        vocab = dataset.tokens()
        word_freq = dataset.tokenfreq()
        cooccur = build_cooccur(vocab, word_freq, dataset, window_size=10)
        word_vectors = train_glove(vocab, cooccur, args.vector_size,
                                   args.vector_path,
                                   iterations=args.iterations)
def train_glove(sentences, emb_dim=50):
    """Train GloVe word vectors on *sentences* and return the weight matrix.

    Every 100 iterations the model is snapshotted under log/ and the nearest
    neighbours of a few probe words are printed as a training sanity check.

    Args:
        sentences: corpus accepted by glove.build_vocab / glove.build_cooccur.
        emb_dim: dimensionality of the trained word vectors.

    Returns:
        The raw weight matrix produced by glove.train_glove.
        (Bug fix: the original computed this matrix but never returned it.)
    """
    glove.logger.setLevel(logging.INFO)
    vocab = glove.build_vocab(sentences)
    cooccur = glove.build_cooccur(vocab, sentences, window_size=10)
    id2word = evaluate.make_id2word(vocab)

    def evaluate_word(W):
        # Print neighbours of a handful of sentiment-relevant probe words.
        for word in ['good', 'movie', 'bad', 'worth', 'dog']:
            print(evaluate.most_similar(W, vocab, id2word, word))

    def save_per(W, i):
        # Checkpoint every 100 iterations, skipping iteration 0.
        if i % 100 == 0 and i >= 100:
            filename = "log/glove_%d_iter%d.model" % (emb_dim, i)
            # Merge into a fresh name instead of rebinding the W parameter.
            merged = evaluate.merge_main_context(W)
            glove.save_model(merged, filename)
            evaluate_word(merged)

    W = glove.train_glove(vocab, cooccur, vector_size=emb_dim,
                          iterations=3000, iter_callback=save_per)
    return W
# Tiny synthetic corpus: one "document" per line (hence the split on "\n").
# NOTE(review): line breaks reconstructed from the classic test corpus — the
# original file's newlines were lost; confirm against the upstream source.
test_corpus = ("""human interface computer
survey user computer system response time
eps user interface system
system human system eps
user response time
trees graph
trees graph minors
trees graph minors survey
I like graph and stuff
I like trees and stuff
Sometimes I build a graph
Sometimes I build trees""").split("\n")

# Keep the GloVe trainer quiet during the test run.
glove.logger.setLevel(logging.ERROR)

# Build vocab + co-occurrence and train a small 10-d model for the test.
vocab = glove.build_vocab(test_corpus)
cooccur = glove.build_cooccur(vocab, test_corpus, window_size=10)
id2word = evaluate.make_id2word(vocab)
W = glove.train_glove(vocab, cooccur, vector_size=10, iterations=500)

# Merge and normalize word vectors
W = evaluate.merge_main_context(W)


def test_similarity():
    # In this corpus 'graph' and 'trees' co-occur heavily, so 'trees'
    # should be the nearest neighbour of 'graph'.
    similar = evaluate.most_similar(W, vocab, id2word, 'graph')
    logging.debug(similar)
    assert_equal('trees', similar[0])
# NOTE(review): `synonyms`, `antonyms`, `test_corpus` and `logger` must be
# defined earlier in the file — not visible in this chunk; verify upstream.
vocab = glove.build_vocab(test_corpus, synonyms, antonyms)

# Map the raw synonym/antonym pairs onto vocabulary entries. Order matters:
# both rebindings must happen before the glove.train_glove call below.
synonyms = glove.build_syncab(synonyms, vocab)
antonyms = glove.build_antcab(antonyms, vocab)

print("Building Cooccur")
cooccur = glove.build_cooccur(vocab, test_corpus, window_size=1)
id2word = evaluate.make_id2word(vocab)

print("Training vectors...")
# Train 100-d vectors with the synonym/antonym constraints folded in.
W = glove.train_glove(vocab, synonyms, antonyms, cooccur, vector_size=100, iterations=25)

print("Evaluation:")
# Merge and normalize word vectors
W = evaluate.merge_main_context(W)


def test_similarity():
    # Print/log the nearest neighbours of 'trees'; the exact-match assertion
    # below was deliberately left disabled by the author.
    similar = evaluate.most_similar(W, vocab, id2word, 'trees')
    print(similar)
    logger.info(similar)
    #assert_equal('trees', similar[0])