def initialize():
    # Reset the random seed to make sure that everyone gets the same results
    random.seed(314)
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    print(tokens)
    json.dump(tokens, open("tokens.json", "w"))
    nWords = len(tokens)
    print(nWords, "words")

    # We are going to train 50-dimensional vectors, initialized from GloVe where possible
    dimVectors = 50
    EPOCH = 100

    # Context size
    C = 5

    # Reset the random seed to make sure that everyone gets the same results
    random.seed(31415)
    np.random.seed(9265)

    in_glove = 0
    wordVectors = np.zeros((2 * nWords, dimVectors))
    for i in range(0, nWords):
        if list(tokens.keys())[i] in wv_from_bin.vocab.keys():
            wordVectors[i] = np.array(
                wv_from_bin.word_vec(list(tokens.keys())[i]))
            in_glove += 1
        else:
            wordVectors[i] = (np.random.rand(1, dimVectors) - 0.5) / dimVectors
    for i in range(nWords, 2 * nWords):
        if list(tokens.keys())[i - nWords] in wv_from_bin.vocab.keys():
            wordVectors[i] = np.array(
                wv_from_bin.word_vec(list(tokens.keys())[i - nWords]))
    print(wordVectors)
    print(in_glove, "in GloVe")

    wordVectors = sgd(lambda vec: word2vec_sgd_wrapper(
        skipgram, tokens, vec, dataset, C, negSamplingLossAndGradient),
        wordVectors, 0.3, EPOCH, None, True, PRINT_EVERY=1)
    # Note that normalization is not called here. This is not a bug,
    # normalizing during training loses the notion of length.

    print("sanity check: cost at convergence should be around or below 10")

    # Concatenate the input and output word vectors
    wordVectors = np.concatenate(
        (wordVectors[:nWords, :], wordVectors[nWords:, :]),
        axis=0)
    print(wordVectors.shape)

    # %%
    np.save("wordVectors", wordVectors)
def run():
    dataset = StanfordSentiment()
    tokens_encoded = dataset.tokens()
    # Re-key string tokens as latin1-encoded bytes; iterate over a copy so the
    # dict is not mutated while it is being traversed.
    for k, v in list(tokens_encoded.items()):
        if type(k) == str:
            tokens_encoded.pop(k)
            tokens_encoded[k.encode('latin1')] = v
    tokens = dict((k.decode('latin1'), v) for k, v in tokens_encoded.items())

    V, D = len(tokens), 10
    random.seed(319)
    np.random.seed(419)
    vectors = np.concatenate((np.random.randn(V, D), np.zeros((V, D))), axis=0)
    vectors = sgd(lambda vecs: sgd_wrapper(tokens_encoded, vecs, 7, dataset),
                  vectors, 4001, 3e-1)
def main(args): """ Train a model to do sentiment analyis""" # Load the dataset dataset = StanfordSentiment() tokens = dataset.tokens() nWords = len(tokens) if args.yourvectors: _, wordVectors, _ = load_saved_params() wordVectors = np.concatenate( (wordVectors[:nWords,:], wordVectors[nWords:,:]), axis=1) elif args.pretrained: wordVectors = glove.loadWordVectors(tokens) dimVectors = wordVectors.shape[1]
def run():
    random.seed(314)
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    # Train 10-dimensional vectors
    dimVectors = 10
    # Context size
    C = 5

    random.seed(31415)
    np.random.seed(9265)
    startTime = time.time()
    wordVectors = np.concatenate(
        ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
         np.zeros((nWords, dimVectors))),
        axis=0)
    wordVectors = sgd(
        lambda vec: test_word2vec_sgd_wrapper(skipgram, tokens, vec, dataset,
                                              C, negSamplingCostAndGradient),
        wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)
    print "Sanity check: cost at convergence should be around or below 10"
    print "Training took %d seconds" % (time.time() - startTime)

    # Concatenate the input and output word vectors
    wordVectors = np.concatenate(
        (wordVectors[:nWords, :], wordVectors[nWords:, :]),
        axis=0)
    # wordVectors = wordVectors[:nWords,:] + wordVectors[nWords:,:]

    visualizeWords = [
        "the", "a", "an", ",", ".", "?", "!", "``", "''", "--",
        "good", "great", "cool", "brilliant", "wonderful", "well", "amazing",
        "worth", "sweet", "enjoyable", "boring", "bad", "waste", "dumb",
        "annoying"]

    visualizeIdx = [tokens[word] for word in visualizeWords]
    visualizeVecs = wordVectors[visualizeIdx, :]
    temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
    covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
    U, S, V = np.linalg.svd(covariance)
    coord = temp.dot(U[:, 0:2])

    for i in xrange(len(visualizeWords)):
        plt.text(coord[i, 0], coord[i, 1], visualizeWords[i],
                 bbox=dict(facecolor='green', alpha=0.1))

    plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
    plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))

    # Save a visualization for the word vectors
    plt.savefig('q3_word_vectors.png')
def run():
    random.seed(319)
    dataset = StanfordSentiment()
    tokens_encoded = dataset.tokens()
    # Re-key string tokens as latin1 bytes; iterate over a copy so the dict is
    # not mutated while being traversed.
    for k, v in list(tokens_encoded.items()):
        if type(k) == str:
            tokens_encoded.pop(k)
            tokens_encoded[k.encode('latin1')] = v
    tokens = dict((k.decode('latin1'), v) for k, v in tokens_encoded.items())

    V, D = len(tokens), 10
    random.seed(31919)
    np.random.seed(41717)
    vectors = np.concatenate((np.random.randn(V, D), np.zeros((V, D))), axis=0)

    start_time = time.time()
    vectors = sgd(
        lambda vecs: sgd_wrapper(tokens_encoded, vecs, dataset, 5,
                                 w2vmodel=skipgram),
        vectors, 14001, 3e-1)
    print("w2v run in (%f) seconds" % (time.time() - start_time))
def main(args):
    print 80 * "="
    print "INITIALIZING"
    print 80 * "="
    dataset = StanfordSentiment()
    print "Done, read total %d windows" % dataset.word_count()

    print 80 * "="
    print "TRAINING"
    print 80 * "="
    print "Training %s word vectors" % args.model
    if not os.path.exists(args.vector_path):
        os.makedirs(args.vector_path)

    if args.model == 'word2vec':
        word_vectors = word2vec_model(args, dataset)
    else:  # glove model
        vocab = dataset.tokens()
        word_freq = dataset.tokenfreq()
        cooccur = build_cooccur(vocab, word_freq, dataset, window_size=10)
        word_vectors = train_glove(vocab, cooccur, args.vector_size,
                                   args.vector_path,
                                   iterations=args.iterations)
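# `build_cooccur` and `train_glove` above come from the project's GloVe code.
# For orientation only, a minimal co-occurrence counter in the same spirit is
# sketched here; the name `build_cooccur_sketch`, the use of
# `dataset.sentences()`, and the 1/distance weighting are assumptions, not the
# project's actual implementation.
from collections import defaultdict

def build_cooccur_sketch(vocab, dataset, window_size=10):
    # vocab maps each token to an integer index, as elsewhere in these files.
    cooccur = defaultdict(float)
    for sentence in dataset.sentences():
        ids = [vocab[w] for w in sentence if w in vocab]
        for center_pos, center_id in enumerate(ids):
            lo = max(0, center_pos - window_size)
            for ctx_pos in range(lo, center_pos):
                # Closer context words get more weight (standard GloVe choice).
                distance = center_pos - ctx_pos
                cooccur[(center_id, ids[ctx_pos])] += 1.0 / distance
                cooccur[(ids[ctx_pos], center_id)] += 1.0 / distance
    return cooccur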
def run():
    random.seed(319)
    dataset = StanfordSentiment()
    tokens_encoded = dataset.tokens()
    # Re-key string tokens as latin1 bytes; iterate over a copy so the dict is
    # not mutated while being traversed.
    for k, v in list(tokens_encoded.items()):
        if type(k) == str:
            tokens_encoded.pop(k)
            tokens_encoded[k.encode('latin1')] = v
    tokens = dict((k.decode('latin1'), v) for k, v in tokens_encoded.items())

    V, D = len(tokens), 10
    random.seed(31919)
    np.random.seed(419)
    vectors = np.concatenate((np.random.randn(V, D), np.zeros((V, D))), axis=0)

    st = time.time()
    # The wrapper must receive the vectors passed in by sgd (`vecs`), not the
    # initial `vectors` captured from the enclosing scope.
    vectors = sgd(
        lambda vecs: sgd_wrapper(tokens_encoded, vecs, 5, dataset,
                                 w2vModel=skipgram, w2vCAG=negSamplingCAG),
        vectors, 5001, 3e-1)
    print("run-sgd finished in (%f) seconds" % (time.time() - st))
def do_train(args):
    # Set up some parameters.
    config = Config(args)

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.vector == "yourvectors":
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]),
            axis=1)
    elif args.vector == "pretrained":
        wordVectors = glove.loadWordVectors(tokens)

    # Load the train set
    trainset = dataset.getTrainSentences()
    train_max_length, train, train_raw = word2index(tokens, trainset)
    print(train_raw[0])
    print(train[0])

    # Prepare dev set features
    devset = dataset.getDevSentences()
    _, dev, dev_raw = word2index(tokens, devset)

    # Prepare test set features
    testset = dataset.getTestSentences()
    _, test, test_raw = word2index(tokens, testset)

    config.max_length = train_max_length
    config.embed_size = wordVectors.shape[1]

    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    report = None  # Report(Config.eval_output)

    with tf.Graph().as_default():
        logger.info("Building model...", )
        start = time.time()
        model = RNNModel(config, wordVectors, tokens)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            model.fit(session, saver, train, dev)

            # do some error analysis
            if args.vector == "pretrained":
                y_true, preds = model.output(session, dev_raw)
                outputConfusionMatrix(preds, y_true, "q5_dev_conf.png")
def run():
    random.seed(319)
    dataset = StanfordSentiment()
    tokens_encoded = dataset.tokens()
    # Re-key string tokens as latin1 bytes; iterate over a copy so the dict is
    # not mutated while being traversed.
    for k, v in list(tokens_encoded.items()):
        if type(k) == str:
            tokens_encoded.pop(k)
            tokens_encoded[k.encode('latin1')] = v
    tokens = dict((k.decode('latin1'), v) for k, v in tokens_encoded.items())

    V, D = len(tokens), 10
    random.seed(31919)
    np.random.seed(41717)
    vectors = np.concatenate((np.random.randn(V, D), np.zeros((V, D))), axis=0)

    start_time = time.time()
    vectors = sgd(
        lambda vecs: sgd_wrapper(tokens_encoded, vecs, dataset, 5,
                                 w2vmodel=skipgram),
        vectors, 24001, 3e-1)
    print("w2v run in (%f) seconds" % (time.time() - start_time))

    visualize_words = [
        'smart', 'dumb', 'tall', 'short', 'good', 'bad', 'king', 'queen',
        'man', 'woman'
    ]
    visualize_indices = [tokens[w] for w in visualize_words]
    visualize_vecs = vectors[visualize_indices, :]
    temp = (visualize_vecs - np.mean(visualize_vecs, axis=0))
    covariance = 1.0 / len(visualize_indices) * temp.T.dot(temp)
    U, S, V = np.linalg.svd(covariance)
    coord = temp.dot(U[:, 0:2])

    for i in range(len(visualize_words)):
        plt.text(coord[i, 0], coord[i, 1], visualize_words[i],
                 bbox=dict(facecolor='green', alpha=0.1))

    plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
    plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))
    plt.savefig('q3_word_vectors.png')
def run():
    random.seed(319)
    dataset = StanfordSentiment()
    tokens_encoded = dataset.tokens()
    # Re-key string tokens as latin1 bytes; iterate over a copy so the dict is
    # not mutated while being traversed.
    for k, v in list(tokens_encoded.items()):
        if type(k) == str:
            tokens_encoded.pop(k)
            tokens_encoded[k.encode('latin1')] = v
    tokens = dict((k.decode('latin1'), v) for k, v in tokens_encoded.items())

    nWords = len(tokens)
    dimVectors = 10
    C = 5

    random.seed(31919)
    np.random.seed(41717)
    start_time = time.time()
    vectors = np.concatenate(
        (np.random.randn(nWords, dimVectors), np.zeros((nWords, dimVectors))),
        axis=0)
    vectors, cost = sgd(
        lambda vecs: sgd_wrapper(tokens_encoded, vecs, C, dataset,
                                 soc=skipgram),
        vectors, 40000, 3e-1)
    print("SGD finished in ({}) seconds with cost ({})".format(
        time.time() - start_time, cost))
def main(args):
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]),
            axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    dimVectors = wordVectors.shape[1]
    print dimVectors

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain,), dtype=np.int32)
    for i in xrange(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev,), dtype=np.int32)
    for i in xrange(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest,), dtype=np.int32)
    for i in xrange(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)
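# `getSentenceFeatures` is imported from the assignment's sentiment module.
# The sketch below shows the behaviour these callers appear to rely on
# (averaging the word vectors of the sentence); skipping out-of-vocabulary
# tokens is an assumption, not necessarily the original implementation.
import numpy as np

def getSentenceFeatures_sketch(tokens, wordVectors, sentence):
    # Average the vectors of the in-vocabulary words of the sentence.
    sentVector = np.zeros((wordVectors.shape[1],))
    count = 0
    for word in sentence:
        if word in tokens:
            sentVector += wordVectors[tokens[word], :]
            count += 1
    if count > 0:
        sentVector /= count
    return sentVector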
from utils.utils import get_relative_path
from loguru import logger
from knn import run_knn
from matplotlib import use as use_matplotlib
from typing import Dict, cast, List

use_matplotlib('agg')

# Check Python Version
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment(
    path=get_relative_path('data/stanfordSentimentTreebank'))
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
     np.zeros((nWords, dimVectors))),
    axis=0)
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *
from itertools import islice


def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))


# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens_encoded = dataset.tokens()

strcnt, bytcnt, othcnt, unks = 0, 0, 0, 0
# Iterate over a copy so the dict is not mutated while being traversed.
for k, v in list(tokens_encoded.items()):
    if type(k) == str:
        strcnt += 1
        print("the string is (%s)" % (k))
        tokens_encoded.pop(k)
        tokens_encoded[k.encode('latin1')] = v
    elif type(k) == bytes:
        bytcnt += 1
        if k == b'unk':
            print("UNKUNKUNKUNKUNKUNKUNKUNKUNKUNKUNK")
    else:
        othcnt += 1
print("str(%d)byt(%d)oth(%d)" % (strcnt, bytcnt, othcnt))

tokens = dict((k.decode('latin1'), v) for (k, v) in tokens_encoded.items())
nWords = len(tokens)
#!/usr/bin/env python

import random
import numpy as np

from utils.treebank import StanfordSentiment

dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)
# print(dataset.type)
print(dataset.getRandomContext(5))
import codecs
import _pickle as pickle

from q3_sgd import load_saved_params

path = "utils/datasets/stanfordSentimentTreebank"

from utils.treebank import StanfordSentiment
dataset = StanfordSentiment()
sentences = dataset.sentences()
sentence = [codecs.decode(word, 'latin1') for word in sentences[0]]
" ".join(sentence)

dictionary = dict()
phrases = 0
with open(path + "/dictionary.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        splitted = line.split("|")
        dictionary[splitted[0].lower()] = int(splitted[1])
        phrases += 1

# sentences = []
# with open(path + "/datasetSentences.txt", "r", encoding='utf-8') as f:
#     for line in f:
#         # splitted = line.strip().split()[1:]
#         # print(splitted)
#         # Deal with some peculiar encoding issues with this file
#
#!/usr/bin/env python

import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)
startTime = time.time()
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
     np.zeros((nWords, dimVectors))),
    axis=0)
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from word2vec import *
from sgd import *

# Check Python version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
datasets = StanfordSentiment()
tokens = datasets.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)
startTime = time.time()
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
     np.zeros((nWords, dimVectors))),
    axis=0)
#!/usr/bin/env python

import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
dataset.sentences()
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)
startTime = time.time()
def main(args): """ Train a model to do sentiment analyis""" # Load the dataset dataset = StanfordSentiment() tokens = dataset.tokens() nWords = len(tokens) if args.yourvectors: _, wordVectors, _ = load_saved_params() wordVectors = np.concatenate( (wordVectors[:nWords,:], wordVectors[nWords:,:]), axis=1) elif args.pretrained: wordVectors = glove.loadWordVectors(tokens) dimVectors = wordVectors.shape[1] # Load the train set trainset = dataset.getTrainSentences() nTrain = len(trainset) trainFeatures = np.zeros((nTrain, dimVectors)) trainLabels = np.zeros((nTrain,), dtype=np.int32) #frequency counting freq = Counter() Sum = 0 for sen in trainset: for word in sen[0]: Sum += 1 freq[word]+=1 for word,tf in freq.items(): freq[word] = tf/Sum #generate all sentence features for i in range(nTrain): words, trainLabels[i] = trainset[i] trainFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors, words, freq) #svd in training set svd = TruncatedSVD(n_components=1, n_iter=5, random_state=0) u = svd.fit(trainFeatures).components_[0] # the first singular vector # remove the projections of the sentence embeddings to their first principal component for i in range(trainFeatures.shape[0]): trainFeatures[i] = trainFeatures[i] - np.dot(trainFeatures[i],u.T) * u # Prepare dev set features devset = dataset.getDevSentences() nDev = len(devset) devFeatures = np.zeros((nDev, dimVectors)) devLabels = np.zeros((nDev,), dtype=np.int32) for i in range(nDev): words, devLabels[i] = devset[i] devFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors, words, freq) for i in range(devFeatures.shape[0]): devFeatures[i] = devFeatures[i] - np.dot(devFeatures[i],u.T) * u # Prepare test set features testset = dataset.getTestSentences() nTest = len(testset) testFeatures = np.zeros((nTest, dimVectors)) testLabels = np.zeros((nTest,), dtype=np.int32) for i in range(nTest): words, testLabels[i] = testset[i] testFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors, words, freq) for i in range(testFeatures.shape[0]): testFeatures[i] = testFeatures[i] - np.dot(testFeatures[i],u.T) * u # We will save our results from each run results = [] regValues = getRegularizationValues() for reg in regValues: print("Training for reg=%f" % reg) # Note: add a very small number to regularization to please the library clf = LogisticRegression(C=1.0/(reg + 1e-12)) clf.fit(trainFeatures, trainLabels) # Test on train set pred = clf.predict(trainFeatures) trainAccuracy = accuracy(trainLabels, pred) print("Train accuracy (%%): %f" % trainAccuracy) # Test on dev set pred = clf.predict(devFeatures) devAccuracy = accuracy(devLabels, pred) print("Dev accuracy (%%): %f" % devAccuracy) # Test on test set # Note: always running on test is poor style. Typically, you should # do this only after validation. 
pred = clf.predict(testFeatures) testAccuracy = accuracy(testLabels, pred) print("Test accuracy (%%): %f" % testAccuracy) results.append({ "reg": reg, "clf": clf, "train": trainAccuracy, "dev": devAccuracy, "test": testAccuracy}) # Print the accuracies print ("") print ("=== Recap ===") print ("Reg\t\tTrain\tDev\tTest") for result in results: print ("%.2E\t%.3f\t%.3f\t%.3f" % ( result["reg"], result["train"], result["dev"], result["test"])) print ("") bestResult = chooseBestModel(results) print ("Best regularization value: %0.2E" % bestResult["reg"]) print ("Test accuracy (%%): %f" % bestResult["test"]) # do some error analysis if args.pretrained: plotRegVsAccuracy(regValues, results, "q4_sif_reg_v_acc.png") outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"], "q4_sif_dev_conf.png") outputPredictions(devset, devFeatures, devLabels, bestResult["clf"], "q4_sif_dev_pred.txt")
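# `getSentenceFeaturesSIF` is defined elsewhere in this project. For
# reference, a minimal sketch of SIF (smooth inverse frequency) sentence
# features is given below. The a / (a + p(w)) weighting follows Arora et al.
# (2017); the function name, the a=1e-3 default, and the OOV handling are
# assumptions rather than the project's actual code.
import numpy as np

def getSentenceFeaturesSIF_sketch(tokens, wordVectors, sentence, freq, a=1e-3):
    # Weighted average of word vectors, down-weighting frequent words.
    sentVector = np.zeros((wordVectors.shape[1],))
    count = 0
    for word in sentence:
        if word in tokens:
            weight = a / (a + freq.get(word, 0.0))
            sentVector += weight * wordVectors[tokens[word], :]
            count += 1
    if count > 0:
        sentVector /= count
    return sentVector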
def main(args): """ Train a model to do sentiment analyis""" # Load the dataset dataset = StanfordSentiment() tokens = dataset.tokens() nWords = len(tokens) if args.yourvectors: _, wordVectors, _ = load_saved_params() wordVectors = np.concatenate( (wordVectors[:nWords,:], wordVectors[nWords:,:]), axis=1) elif args.pretrained: wordVectors = glove.loadWordVectors(tokens) dimVectors = wordVectors.shape[1] # Load the train set trainset = dataset.getTrainSentences() nTrain = len(trainset) trainFeatures = np.zeros((nTrain, dimVectors)) trainLabels = np.zeros((nTrain,), dtype=np.int32) for i in xrange(nTrain): words, trainLabels[i] = trainset[i] trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # Prepare dev set features devset = dataset.getDevSentences() nDev = len(devset) devFeatures = np.zeros((nDev, dimVectors)) devLabels = np.zeros((nDev,), dtype=np.int32) for i in xrange(nDev): words, devLabels[i] = devset[i] devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # Prepare test set features testset = dataset.getTestSentences() nTest = len(testset) testFeatures = np.zeros((nTest, dimVectors)) testLabels = np.zeros((nTest,), dtype=np.int32) for i in xrange(nTest): words, testLabels[i] = testset[i] testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # We will save our results from each run results = [] regValues = getRegularizationValues() for reg in regValues: print "Training for reg=%f" % reg # Note: add a very small number to regularization to please the library clf = LogisticRegression(C=1.0/(reg + 1e-12)) clf.fit(trainFeatures, trainLabels) # Test on train set pred = clf.predict(trainFeatures) trainAccuracy = accuracy(trainLabels, pred) print "Train accuracy (%%): %f" % trainAccuracy # Test on dev set pred = clf.predict(devFeatures) devAccuracy = accuracy(devLabels, pred) print "Dev accuracy (%%): %f" % devAccuracy # Test on test set # Note: always running on test is poor style. Typically, you should # do this only after validation. pred = clf.predict(testFeatures) testAccuracy = accuracy(testLabels, pred) print "Test accuracy (%%): %f" % testAccuracy results.append({ "reg": reg, "clf": clf, "train": trainAccuracy, "dev": devAccuracy, "test": testAccuracy}) # Print the accuracies print "" print "=== Recap ===" print "Reg\t\tTrain\tDev\tTest" for result in results: print "%.2E\t%.3f\t%.3f\t%.3f" % ( result["reg"], result["train"], result["dev"], result["test"]) print "" bestResult = chooseBestModel(results) print "Best regularization value: %0.2E" % bestResult["reg"] print "Test accuracy (%%): %f" % bestResult["test"] # do some error analysis if args.pretrained: plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png") outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"], "q4_dev_conf.png") outputPredictions(devset, devFeatures, devLabels, bestResult["clf"], "q4_dev_pred.txt") else: # plotRegVsAccuracy(regValues, results, "q4_reg_v_acc_your.png") outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"], "q4_dev_conf_your.png")
#!/usr/bin/env python

import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)
startTime = time.time()
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
     np.zeros((nWords, dimVectors))),
    axis=0)
def run():
    ### Here is the main body of this file. We initialize the model and clean
    ### up the dataset.

    ### Reset the random seed to make sure that everyone gets the same results
    random.seed(314)
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    ### We are going to train 10-dimensional vectors for this assignment
    dimVectors = 10

    ### The maximum half context size
    C = 5

    ### Reset the random seed to make sure that everyone gets the same results
    random.seed(31415)
    np.random.seed(9265)

    ### Start the clock when we begin to train this model
    startTime = time.time()

    ### The initial point to start SGD from
    wordVectors = np.concatenate(
        ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
         np.zeros((nWords, dimVectors))),
        axis=0)

    ### Call the sgd function to train our model
    wordVectors = sgd(lambda vec: word2vec_sgd_wrapper(
        skipgram, tokens, vec, dataset, C, negSamplingCostAndGradient),
        wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)
    ### Note that normalization is not called here. This is not a bug,
    ### normalizing during training loses the notion of length.

    print("sanity check: cost at convergence should be around or below 10")
    print("training took %d seconds" % (time.time() - startTime))

    ### Concatenate the input and output word vectors
    wordVectors = np.concatenate(
        (wordVectors[:nWords, :], wordVectors[nWords:, :]),
        axis=0)
    ### wordVectors = wordVectors[:nWords,:] + wordVectors[nWords:,:]

    ### Visualize word embeddings
    visualizeWords = [
        "the", "a", "an", ",", ".", "?", "!", "``", "''", "--",
        "good", "great", "cool", "brilliant", "wonderful", "well", "amazing",
        "worth", "sweet", "enjoyable", "boring", "bad", "waste", "dumb",
        "annoying"
    ]

    visualizeIdx = [tokens[word] for word in visualizeWords]
    visualizeVecs = wordVectors[visualizeIdx, :]
    temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
    covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
    U, S, V = np.linalg.svd(covariance)
    coord = temp.dot(U[:, 0:2])

    for i in range(len(visualizeWords)):
        plt.text(coord[i, 0], coord[i, 1], visualizeWords[i],
                 bbox=dict(facecolor='green', alpha=0.1))

    plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
    plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))
    plt.savefig('q3_word_vectors.png')
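### `negSamplingCostAndGradient` is supplied by q3_word2vec. For orientation,
### a minimal sketch of the negative-sampling loss for a single
### (center, outside) pair is shown below:
###     J = -log sigmoid(u_o . v_c) - sum_k log sigmoid(-u_k . v_c)
### The helper name, the K=10 default, and the use of dataset.sampleTokenIdx()
### mirror the assignment scaffold but are assumptions here, not this file's
### actual implementation.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def negSamplingCostAndGradient_sketch(predicted, target, outputVectors,
                                      dataset, K=10):
    # Draw K negative sample indices, re-sampling whenever we hit the target.
    indices = []
    while len(indices) < K:
        idx = dataset.sampleTokenIdx()
        if idx != target:
            indices.append(idx)

    u_o = outputVectors[target]    # outside vector of the true context word
    u_k = outputVectors[indices]   # K x D matrix of negative samples
    z_o = sigmoid(np.dot(u_o, predicted))
    z_k = sigmoid(-np.dot(u_k, predicted))

    cost = -np.log(z_o) - np.sum(np.log(z_k))
    gradPred = -(1.0 - z_o) * u_o + np.dot(1.0 - z_k, u_k)

    grad = np.zeros_like(outputVectors)
    grad[target] += -(1.0 - z_o) * predicted
    for j, idx in enumerate(indices):
        grad[idx] += (1.0 - z_k[j]) * predicted
    return cost, gradPred, grad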
import random

import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time
import argparse

from sgd import *
from word2vec import *

random.seed(314)
dataset = StanfordSentiment()
word2Ind = dataset.tokens()
nWords = len(word2Ind)


def word2vec_sgd_wrapper(word2vecModel, word2Ind, wordVectors, dataset,
                         windowSize,
                         word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
    batchsize = 50
    loss = 0.0
    grad = np.zeros(wordVectors.shape)

    N = wordVectors.shape[0]
    centerWordVectors = wordVectors[:int(N/2), :]
    outsideVectors = wordVectors[int(N/2):, :]
    for i in range(batchsize):
        windowSize1 = random.randint(1, windowSize)
        centerWord, context = dataset.getRandomContext(windowSize1)

        c, gin, gout = word2vecModel(
            centerWord, windowSize1, context, word2Ind, centerWordVectors,
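# The wrapper above is cut off mid-call. For reference, a self-contained
# version of the same batching pattern (average the loss and gradients over
# `batchsize` randomly sampled contexts) is sketched here; it is modeled on
# the standard assignment scaffold, not the exact remainder of the file.
import random
import numpy as np

def word2vec_sgd_wrapper_sketch(word2vecModel, word2Ind, wordVectors, dataset,
                                windowSize, word2vecLossAndGradient):
    batchsize = 50
    loss = 0.0
    grad = np.zeros(wordVectors.shape)
    N = wordVectors.shape[0]
    centerWordVectors = wordVectors[:N // 2, :]
    outsideVectors = wordVectors[N // 2:, :]
    for _ in range(batchsize):
        windowSize1 = random.randint(1, windowSize)
        centerWord, context = dataset.getRandomContext(windowSize1)
        c, gin, gout = word2vecModel(centerWord, windowSize1, context,
                                     word2Ind, centerWordVectors,
                                     outsideVectors, dataset,
                                     word2vecLossAndGradient)
        # Average the loss and the gradients of both vector blocks.
        loss += c / batchsize
        grad[:N // 2, :] += gin / batchsize
        grad[N // 2:, :] += gout / batchsize
    return loss, grad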
import random

import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

# Check Python Version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

dimVectors = 1024
sparseness = 0.03

# Context size
C = 5

SPARSENESS = 0.03
LAMBDA = 0.05
GAMMA = 0.05

startTime = time.time()
wordVectors = np.random.rand(nWords, dimVectors) - (1 - SPARSENESS)
import numpy as np
from time import time

from utils.treebank import StanfordSentiment
from libNN.network import Word2Vec

# load data sets
dataset = StanfordSentiment()

# model
model = Word2Vec(word_dim=50)

# training
start_time = time()
word_vectors = model.fit(dataset=dataset)

# save
print("Saving word vectors...")
np.save("word_vectors", word_vectors)
print("Training took {0} seconds".format(time() - start_time))
print(word_vectors[0:5])
from q4_softmaxreg import softmaxRegression, getSentenceFeature, accuracy, \
    softmax_wrapper

import seaborn as sns
sns.set(style='whitegrid', context='talk')

# Try different regularizations and pick the best!
# NOTE: fill in one more "your code here" below before running!
REGULARIZATION = None   # Assign a list of floats in the block below
### YOUR CODE HERE
REGULARIZATION = np.logspace(-6, 0.1, 21)
REGULARIZATION = np.hstack([0, REGULARIZATION])
### END YOUR CODE

# Load the dataset
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# Load the word vectors we trained earlier
_, wordVectors0, _ = load_saved_params()
N = wordVectors0.shape[0] // 2
# assert nWords == N
wordVectors = (wordVectors0[:N, :] + wordVectors0[N:, :])
dimVectors = wordVectors.shape[1]

# Load the train set
trainset = dataset.getTrainSentences()
nTrain = len(trainset)
trainFeatures = np.zeros((nTrain, dimVectors))
trainLabels = np.zeros((nTrain,), dtype=np.int32)
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from word2vec import *
from sgd import *

# Check Python version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()  # load the dataset
tokens = dataset.tokens()      # get the tokens
nWords = len(tokens)           # number of words
# print("dataset:", dataset)
# print("tokens:", tokens)  # tokens maps each word to its row index in the
#     embedding matrix, e.g. {'the': 0, 'rock': 1, 'is': 2, 'destined': 3,
#     'to': 4, 'be': 5, '21st': ...} and so on
# print("nWords", nWords)   # 19539 words in total

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size (sliding window)
C = 5

# Reset the random seed (numpy counterpart below) to make sure that everyone
# gets the same results
random.seed(31415)
def get_glove_data():
    embedding_dimension = 100
    x_text, y = load_data_and_labels_bow("thread_content.npy",
                                         "thread_labels.npy")
    # num_recipients_features = np.array(np.load("num_recipients_features_nodup.npy"))
    # # avgNumRecipients = np.array(np.load("avg_num_recipients.npy"))
    # avgNumTokensPerEmail = np.array(np.load("avg_num_tokens_per_email.npy"))

    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    # Initialize word vectors with glove.
    embedded_vectors = glove.loadWordVectors(tokens)
    print("The shape of embedding matrix is:")
    print(embedded_vectors.shape)  # Should be number of e-mails, number of embeddings

    nTrain = len(x_text)
    trainFeatures = np.zeros((nTrain, embedding_dimension))
    # 5 is the number of slots the extra features take up
    toRemove = []
    for i in xrange(nTrain):
        words = x_text[i]

        num_words = len(words)
        # place number of words in buckets
        if num_words < 10:
            num_words_bucket = 0
        elif num_words >= 10 and num_words < 100:
            num_words_bucket = 1
        elif num_words >= 100 and num_words < 500:
            num_words_bucket = 2
        elif num_words >= 500 and num_words < 1000:
            num_words_bucket = 3
        elif num_words >= 1000 and num_words < 2000:
            num_words_bucket = 4
        elif num_words >= 2000:
            num_words_bucket = 5

        sentenceFeatures = getSentenceFeatures(tokens, embedded_vectors, words)
        if sentenceFeatures is None:
            toRemove.append(i)
        else:
            featureVector = sentenceFeatures
            # num_words = avgNumTokensPerEmail[i]
            # place number of words in buckets
            # if num_words < 10:
            #     num_words_bucket = 0
            # elif num_words >= 10 and num_words < 100:
            #     num_words_bucket = 1
            # elif num_words >= 100 and num_words < 500:
            #     num_words_bucket = 2
            # elif num_words >= 500 and num_words < 1000:
            #     num_words_bucket = 3
            # elif num_words >= 1000 and num_words < 2000:
            #     num_words_bucket = 4
            # elif num_words >= 2000:
            #     num_words_bucket = 5
            # featureVector = np.hstack((featureVector, num_words_bucket))
            # featureVector = np.hstack((featureVector, avgNumRecipients[i]))
            trainFeatures[i, :] = featureVector

    print(len(toRemove))
    y = np.delete(y, toRemove, axis=0)
    trainFeatures = np.delete(trainFeatures, toRemove, axis=0)

    # Randomly shuffle data
    np.random.seed(10)
    # Array of random numbers from 1 to # of labels.
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = trainFeatures[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    train = 0.6
    dev = 0.2
    test = 0.2
    # train x, dev x, test x, train y, dev y, test y
    train_cutoff = int(0.6 * len(x_shuffled))
    dev_cutoff = int(0.8 * len(x_shuffled))
    test_cutoff = int(len(x_shuffled))

    return x_shuffled[0:train_cutoff], \
        x_shuffled[train_cutoff:dev_cutoff], \
        x_shuffled[dev_cutoff:test_cutoff], \
        y_shuffled[0:train_cutoff], \
        y_shuffled[train_cutoff:dev_cutoff], \
        y_shuffled[dev_cutoff:test_cutoff]
def main(args): """ Train a model to do sentiment analyis""" # Load the dataset dataset = StanfordSentiment() tokens = dataset.tokens() nWords = len(tokens) if args.yourvectors: _, wordVectors, _ = load_saved_params() wordVectors = np.concatenate( (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1) elif args.pretrained: wordVectors = glove.loadWordVectors(tokens) dimVectors = wordVectors.shape[1] # Load the train set trainset = dataset.getTrainSentences() nTrain = len(trainset) trainFeatures = np.zeros((nTrain, dimVectors)) trainLabels = np.zeros((nTrain, ), dtype=np.int32) for i in xrange(nTrain): words, trainLabels[i] = trainset[i] trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # Prepare dev set features devset = dataset.getDevSentences() nDev = len(devset) devFeatures = np.zeros((nDev, dimVectors)) devLabels = np.zeros((nDev, ), dtype=np.int32) for i in xrange(nDev): words, devLabels[i] = devset[i] devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # Prepare test set features testset = dataset.getTestSentences() nTest = len(testset) testFeatures = np.zeros((nTest, dimVectors)) testLabels = np.zeros((nTest, ), dtype=np.int32) for i in xrange(nTest): words, testLabels[i] = testset[i] testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # We will save our results from each run results = [] regValues = getRegularizationValues() for reg in regValues: print("Training for reg=%f" % reg) # Note: add a very small number to regularization to please the library clf = LogisticRegression(C=1.0 / (reg + 1e-12)) clf.fit(trainFeatures, trainLabels) # Test on train set pred = clf.predict(trainFeatures) trainAccuracy = accuracy(trainLabels, pred) print("Train accuracy (%%): %f" % trainAccuracy) # Test on dev set pred = clf.predict(devFeatures) devAccuracy = accuracy(devLabels, pred) print("Dev accuracy (%%): %f" % devAccuracy) # Test on test set # Note: always running on test is poor style. Typically, you should # do this only after validation. pred = clf.predict(testFeatures) testAccuracy = accuracy(testLabels, pred) print("Test accuracy (%%): %f" % testAccuracy) results.append({ "reg": reg, "clf": clf, "train": trainAccuracy, "dev": devAccuracy, "test": testAccuracy }) # Print the accuracies print("") print("=== Recap ===") print("Reg\t\tTrain\tDev\tTest") for result in results: print("%.2E\t%.3f\t%.3f\t%.3f" % (result["reg"], result["train"], result["dev"], result["test"])) print("") bestResult = chooseBestModel(results) print("Best regularization value: %0.2E" % bestResult["reg"]) print("Test accuracy (%%): %f" % bestResult["test"]) # do some error analysis if args.pretrained: plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png") outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"], "q4_dev_conf.png") outputPredictions(devset, devFeatures, devLabels, bestResult["clf"], "q4_dev_pred.txt")
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from word2vec import *
from sgd import *

# Check Python Version
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment(
    path=r'C:\Users\msingleton\Documents\XCS224N-A2/utils/datasets/stanfordSentimentTreebank')
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)
startTime = time.time()
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
     np.zeros((nWords, dimVectors))),
    axis=0)