# Train word2vec vectors with the assignment's own SGD implementation,
# check a few vectors for the autograder, then visualize a word subset.
# NOTE(review): this line was collapsed during extraction; structure
# reconstructed from syntax. Requires project-local names (sgd,
# word2vec_sgd_wrapper, skipgram, negSamplingCostAndGradient, nWords,
# dimVectors, tokens, load_saved_params) defined earlier in the file.

# Context size
C = 5

# Train word vectors (this could take a while!)
# Reset the random seeds to make sure that everyone gets the same results.
random.seed(31415)
np.random.seed(9265)

# Rows [0, nWords) are the "input" vectors, rows [nWords, 2*nWords) the
# "output" vectors; both are trained jointly.
wordVectors = normalizeRows(np.random.randn(nWords * 2, dimVectors))
wordVectors0 = sgd(
    lambda wordVectors: word2vec_sgd_wrapper(
        skipgram, C, negSamplingCostAndGradient, wordVectors),
    wordVectors, 10.0, 200000, normalizeRows, True)

# Average the input and output vector halves into one vector per word.
# (The original comment said "just use the output vectors", but the code
# actually averages both halves — comment corrected to match the code.)
wordVectors = (wordVectors0[:nWords, :] + wordVectors0[nWords:, :]) / 2.0

print("\n=== For autograder ===")
checkWords = ["the", "a", "an", "movie", "ordinary", "but", "and"]
checkIdx = [tokens[word] for word in checkWords]
checkVecs = wordVectors[checkIdx, :]
print(checkVecs)

# Visualize the word vectors you trained (reloaded from the saved params).
_, wordVectors0 = load_saved_params()
wordVectors = (wordVectors0[:nWords, :] + wordVectors0[nWords:, :]) / 2.0
visualizeWords = [
    "the", "a", "an", ",", ".", "?", "!", "``", "''", "--",
    "good", "great", "cool", "brilliant", "wonderful", "well", "amazing",
    "worth", "sweet", "warm", "enjoyable",
    "boring", "bad", "garbage", "waste", "disaster", "dumb",
    "embarrassment", "annoying", "disgusting"]
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]

import visualizing as vs
vs.visualize(visualizeVecs, visualizeWords, "word2vec")
# Train a gensim word2vec baseline on the Stanford Sentiment corpus,
# print the autograder check vectors, and visualize a word subset.
# NOTE(review): this line was collapsed during extraction; structure
# reconstructed from syntax.
__author__ = 'dy'

from gensim.models.word2vec import Word2Vec
import numpy as np
import matplotlib.pyplot as plt
from cs224d.datasets.data_utils import *

dataset = StanfordSentiment()
sentences = dataset.sentences()
# NOTE(review): `size=` and `model[word]` are gensim 3.x-era API
# (gensim 4 renamed them to `vector_size=` and `model.wv[word]`) — confirm
# against the pinned gensim version before upgrading.
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
# model.save_word2vec_format("baseline.model")

print("\n=== For autograder ===")
checkWords = ["the", "a", "an", "movie", "ordinary", "but", "and"]
checkVecs = np.array([model[w] for w in checkWords])
print(checkVecs)

# Visualize the word vectors you trained
# model = model.load_word2vec_format("baseline.model")
visualizeWords = [
    "the", "a", "an", ",", ".", "?", "!", "``", "''", "--",
    "good", "great", "cool", "brilliant", "wonderful", "well", "amazing",
    "worth", "sweet", "warm", "enjoyable",
    "boring", "bad", "garbage", "waste", "disaster", "dumb",
    "embarrassment", "annoying", "disgusting"]
visualizeVecs = np.array([model[w] for w in visualizeWords])

import visualizing as vs
vs.visualize(visualizeVecs, visualizeWords, "baseline")