def main(args): """ Train a model to do sentiment analyis""" # Load the dataset dataset = StanfordSentiment() tokens = dataset.tokens() nWords = len(tokens) if args.yourvectors: _, wordVectors, _ = load_saved_params() wordVectors = np.concatenate( (wordVectors[:nWords,:], wordVectors[nWords:,:]), axis=1) elif args.pretrained: wordVectors = glove.loadWordVectors(tokens) dimVectors = wordVectors.shape[1] # Load the train set trainset = dataset.getTrainSentences() nTrain = len(trainset) trainFeatures = np.zeros((nTrain, dimVectors)) trainLabels = np.zeros((nTrain,), dtype=np.int32) for i in xrange(nTrain): words, trainLabels[i] = trainset[i] trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # Prepare dev set features devset = dataset.getDevSentences() nDev = len(devset) devFeatures = np.zeros((nDev, dimVectors)) devLabels = np.zeros((nDev,), dtype=np.int32) for i in xrange(nDev): words, devLabels[i] = devset[i] devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # Prepare test set features testset = dataset.getTestSentences() nTest = len(testset) testFeatures = np.zeros((nTest, dimVectors)) testLabels = np.zeros((nTest,), dtype=np.int32) for i in xrange(nTest): words, testLabels[i] = testset[i] testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # We will save our results from each run results = [] regValues = getRegularizationValues() for reg in regValues: print "Training for reg=%f" % reg # Note: add a very small number to regularization to please the library clf = LogisticRegression(C=1.0/(reg + 1e-12)) clf.fit(trainFeatures, trainLabels) # Test on train set pred = clf.predict(trainFeatures) trainAccuracy = accuracy(trainLabels, pred) print "Train accuracy (%%): %f" % trainAccuracy # Test on dev set pred = clf.predict(devFeatures) devAccuracy = accuracy(devLabels, pred) print "Dev accuracy (%%): %f" % devAccuracy # Test on test set # Note: always running on test is poor style. 
Typically, you should # do this only after validation. pred = clf.predict(testFeatures) testAccuracy = accuracy(testLabels, pred) print "Test accuracy (%%): %f" % testAccuracy results.append({ "reg": reg, "clf": clf, "train": trainAccuracy, "dev": devAccuracy, "test": testAccuracy}) # Print the accuracies print "" print "=== Recap ===" print "Reg\t\tTrain\tDev\tTest" for result in results: print "%.2E\t%.3f\t%.3f\t%.3f" % ( result["reg"], result["train"], result["dev"], result["test"]) print "" bestResult = chooseBestModel(results) print "Best regularization value: %0.2E" % bestResult["reg"] print "Test accuracy (%%): %f" % bestResult["test"] # do some error analysis if args.pretrained: plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png") outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"], "q4_dev_conf.png") outputPredictions(devset, devFeatures, devLabels, bestResult["clf"], "q4_dev_pred.txt") else: # plotRegVsAccuracy(regValues, results, "q4_reg_v_acc_your.png") outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"], "q4_dev_conf_your.png")
def main(args): """ Train a model to do sentiment analyis""" # Load the dataset dataset = StanfordSentiment() tokens = dataset.tokens() nWords = len(tokens) if args.yourvectors: _, wordVectors, _ = load_saved_params() wordVectors = np.concatenate( (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1) elif args.pretrained: wordVectors = glove.loadWordVectors(tokens) dimVectors = wordVectors.shape[1] # Load the train set trainset = dataset.getTrainSentences() nTrain = len(trainset) trainFeatures = np.zeros((nTrain, dimVectors)) # sentence-level features trainLabels = np.zeros((nTrain, ), dtype=np.int32) for i in xrange(nTrain): words, trainLabels[i] = trainset[i] trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # Prepare dev set features devset = dataset.getDevSentences() nDev = len(devset) devFeatures = np.zeros((nDev, dimVectors)) devLabels = np.zeros((nDev, ), dtype=np.int32) for i in xrange(nDev): words, devLabels[i] = devset[i] devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # Prepare test set features testset = dataset.getTestSentences() nTest = len(testset) testFeatures = np.zeros((nTest, dimVectors)) testLabels = np.zeros((nTest, ), dtype=np.int32) for i in xrange(nTest): words, testLabels[i] = testset[i] testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # We will save our results from each run results = [] regValues = getRegularizationValues() for reg in regValues: print "Training for reg=%f" % reg # Note: add a very small number to regularization to please the library clf = LogisticRegression(C=1.0 / (reg + 1e-12)) clf.fit(trainFeatures, trainLabels) # Test on train set pred = clf.predict(trainFeatures) trainAccuracy = accuracy(trainLabels, pred) print "Train accuracy (%%): %f" % trainAccuracy # Test on dev set pred = clf.predict(devFeatures) devAccuracy = accuracy(devLabels, pred) print "Dev accuracy (%%): %f" % devAccuracy # Test on test set # Note: always running on test is poor style. 
Typically, you should # do this only after validation. pred = clf.predict(testFeatures) testAccuracy = accuracy(testLabels, pred) print "Test accuracy (%%): %f" % testAccuracy results.append({ "reg": reg, "clf": clf, "train": trainAccuracy, "dev": devAccuracy, "test": testAccuracy }) # Print the accuracies print "" print "=== Recap ===" print "Reg\t\tTrain\tDev\tTest" for result in results: print "%.2E\t%.3f\t%.3f\t%.3f" % (result["reg"], result["train"], result["dev"], result["test"]) print "" bestResult = chooseBestModel(results) print "Best regularization value: %0.2E" % bestResult["reg"] print "Test accuracy (%%): %f" % bestResult["test"] # do some error analysis if args.pretrained: plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png") outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"], "q4_dev_conf.png") outputPredictions(devset, devFeatures, devLabels, bestResult["clf"], "q4_dev_pred.txt")
def trial2():
    """Scratch/debug driver: unrolls a tiny RNN in TensorFlow 1.x over dummy
    GloVe vectors and prints every intermediate tensor, to hand-verify the
    masking and loss computation.  Not part of the training pipeline."""
    # Two toy sentences as token-index rows.  The first assignment is dead —
    # it is immediately overwritten by the second.
    sentences = np.array([[0, 1, 2, 4], [0, 1, 3, 0]])
    sentences = np.array([[0, 1, 2, 4], [0, 1, 3, 5]])
    # Boolean masks selecting which timesteps contribute to the loss.
    mask = np.array([[0, 0, 0, 1], [0, 0, 1, 0]])
    mask2 = np.array([[0, 0, 1, 1], [0, 0, 1, 0]])
    labels = np.array([[1, 0, 0], [0, 1, 0]])
    # Toy hyper-parameters.
    n_classes = 3
    embed_size = 6
    max_length = 4
    batch_size = 1
    lr = 0.001
    n_features = 6
    hidden_size = 10
    DUMMY_PATH = "utils/glove/glove_dummy.txt"
    # Build the token -> index dictionary from the dummy vector file.
    token_list = glove.loadWordTokens(DUMMY_PATH)
    tokens = {}
    for i in range(len(token_list)):
        tokens[token_list[i]] = i
    wordVectors = glove.loadWordVectors(tokens, DUMMY_PATH, embed_size)
    # Append a sentinel token with an all-zero vector (padding slot).
    token_list.append("cqian23th7zhangrao")
    tokens["cqian23th7zhangrao"] = len(token_list) - 1
    print 'WV', np.shape(wordVectors)
    wordVectors = np.append(wordVectors, [np.zeros(embed_size)], axis=0)
    print 'WV', np.shape(wordVectors)
    wordVectors2 = data_util.load_embeddings(DUMMY_PATH, embed_size)
    # NOTE(review): .all() reduces each array to one bool, so this compares
    # True == True rather than the embeddings element-wise — it cannot catch
    # a mismatch.  Left as-is; confirm intent.
    assert (wordVectors.all() == wordVectors2.all())
    # start building model
    #cell=RNNCell(n_features,hidden_size)
    input_placeholder = tf.placeholder(tf.int32, [None, max_length])
    labels_placeholder = tf.placeholder(tf.int32, [None, n_classes])
    mask_placeholder = tf.placeholder(tf.bool, [None, max_length])
    # Output projection: hidden state -> class logits.
    U = tf.Variable(
        np.random.rand(hidden_size, n_classes).astype(np.float32),
        tf.float32)
    # feed dict
    feed_dict = {
        input_placeholder: sentences,
        labels_placeholder: labels,
        mask_placeholder: mask
    }
    feed_dict2 = {
        input_placeholder: sentences,
        labels_placeholder: labels,
        mask_placeholder: mask2
    }
    emb = tf.Variable(wordVectors, dtype=tf.float32)
    x = tf.nn.embedding_lookup(emb, input_placeholder)
    # Initial hidden state: zeros, one row per batch element.
    h = tf.zeros([tf.shape(x)[0], hidden_size], tf.float32)
    preds = []
    # RNN parameters: hidden-to-hidden, input-to-hidden, bias.
    W_h = tf.Variable(
        np.random.rand(hidden_size, hidden_size).astype(np.float32),
        tf.float32)
    W_x = tf.Variable(
        np.random.rand(n_features, hidden_size).astype(np.float32),
        tf.float32)
    b1 = tf.Variable(
        np.random.rand(hidden_size).astype(np.float32), tf.float32)
    # run through rnn: unrolled for max_length steps, collecting per-step
    # logits.
    for i in range(max_length):
        if i >= 1:
            tf.get_variable_scope().reuse_variables()
        h = tf.nn.sigmoid(tf.matmul(h, W_h) + tf.matmul(x[:, i, :], W_x) + b1)
        p = tf.matmul(h, U)
        # NOTE: prints the shape *op*, not concrete values (graph mode).
        print 'p', tf.shape(p)
        preds.append(p)
    # prediction: stack per-step logits, then reshape to
    # (batch, max_length, n_classes).  tf.pack is the pre-1.0 name of
    # tf.stack.
    preds = tf.pack(preds)
    preds2 = tf.reshape(preds, [-1, max_length, n_classes])
    # these are for verification
    preds3 = tf.nn.softmax(preds2)
    preds4 = tf.log(preds3)
    # loss calculation: tile the labels across timesteps so each step is
    # scored against the sentence label, then mask out the padded steps.
    labels_to_loss = tf.tile(labels_placeholder, [max_length, 1])
    labels_to_loss = tf.reshape(labels_to_loss, [-1, max_length, n_classes])
    loss = tf.nn.softmax_cross_entropy_with_logits(preds2, labels_to_loss)
    loss2 = tf.boolean_mask(loss, mask_placeholder)
    loss3 = tf.reduce_mean(loss2)
    # training op
    # NOTE(review): minimizes the unreduced, unmasked `loss` rather than the
    # masked mean `loss3` — presumably fine for this smoke test; confirm.
    train_op = tf.train.AdamOptimizer(lr).minimize(loss)
    # test implementation: run every intermediate tensor and print it.
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    xx = sess.run(x, feed_dict=feed_dict)
    print 'embedding', xx
    print 'embedding shape', np.shape(xx)
    pp = sess.run(preds, feed_dict=feed_dict)
    print 'preds after pack', pp
    pp2 = sess.run(preds2, feed_dict=feed_dict)
    print 'preds after reshape', pp2
    pp3 = sess.run(preds3, feed_dict=feed_dict)
    print 'preds after softmax', pp3
    # Rebinds the name mask2 to a stacked mask; feed_dict2 still references
    # the original array object, so its behavior is unchanged.
    mask2 = np.stack([mask for i in range(n_classes)], 2)
    pred6 = np.sum(np.multiply(pp3, mask2), 1)
    print 'test batch_pred', pred6
    pp4 = sess.run(preds4, feed_dict=feed_dict)
    print 'preds after log', pp4
    lalo = sess.run(labels_to_loss, feed_dict=feed_dict)
    print 'labels to loss', lalo.shape, lalo
    ll = sess.run(loss, feed_dict=feed_dict)
    print 'after softmax loss', ll.shape, ll
    ll2 = sess.run(loss2, feed_dict=feed_dict)
    print 'after boolean_mask loss', ll2
    print np.shape(ll2)
    ll2 = sess.run(loss2, feed_dict=feed_dict2)
    print 'after boolean_mask loss', ll2
    print np.shape(ll2)
    ll3 = sess.run(loss3, feed_dict=feed_dict)
    print 'final loss', ll3
def main(args): """ Train a model to do sentiment analyis""" dataset, tokens, num_labels = getToxicDataMultilabel() target_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'non_toxic'] # Shuffle data shuffle(dataset) num_data = len(dataset) # Create train, dev, and test train_cutoff = int(0.6 * num_data) dev_start = int(0.6 * num_data) + 1 dev_cutoff = int(0.8 * num_data) trainset = dataset[:train_cutoff] devset = dataset[dev_start:dev_cutoff] testset = dataset[dev_cutoff + 1:] nWords = len(tokens) wordVectors = glove.loadWordVectors(tokens) dimVectors = wordVectors.shape[1] # Load the train set #trainset = dataset.getTrainSentences() nTrain = len(trainset) trainFeatures = np.zeros((nTrain, dimVectors)) trainLabels = [] for i in xrange(nTrain): words = trainset[i][0] trainLabels.append(trainset[i][1]) trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # Prepare dev set features nDev = len(devset) devFeatures = np.zeros((nDev, dimVectors)) devLabels = [] for i in xrange(nDev): words = devset[i][0] devLabels.append(devset[i][1]) devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # Prepare test set features #testset = dataset.getTestSentences() nTest = len(testset) testFeatures = np.zeros((nTest, dimVectors)) testLabels = [] for i in xrange(nTest): words = testset[i][0] testLabels.append(testset[i][1]) testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # We will save our results from each run results = [] regValues = getRegularizationValues() print "LR Results:" classifier = Pipeline([ ('vectorizer', CountVectorizer(min_n=1,max_n=2)), ('tfidf', TfidfTransformer()), ('clf', OneVsRestClassifier(LinearSVC()))]) classifier.fit(trainFeatures, trainLabels) predicted = classifier.predict(devFeatures) clf = SVC() clf.fit(trainFeatures, trainLabels) # Test on train set pred = clf.predict(trainFeatures) trainAccuracy = accuracy(trainLabels, pred) print "Train accuracy (%%): %f" % 
trainAccuracy # Test on dev set pred = clf.predict(devFeatures) devAccuracy = accuracy(devLabels, pred) print "Dev accuracy (%%): %f" % devAccuracy # Test on test set # Note: always running on test is poor style. Typically, you should # do this only after validation. pred = clf.predict(testFeatures) testAccuracy = accuracy(testLabels, pred) print "Test accuracy (%%): %f" % testAccuracy results.append({ "reg": 0.0, "clf": clf, "train": trainAccuracy, "dev": devAccuracy, "test": testAccuracy}) # Print the accuracies print "" print "=== Recap ===" print "Reg\t\tTrain\tDev\tTest" for result in results: print "%.2E\t%.3f\t%.3f\t%.3f" % ( result["reg"], result["train"], result["dev"], result["test"]) print "" bestResult = chooseBestModel(results) # print "Best regularization value: %0.2E" % bestResult["reg"] # print "Test accuracy (%%): %f" % bestResult["test"] # do some error analysis if args.pretrained: plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png") outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"], "q4_dev_svm_conf.png") outputPredictions(devset, devFeatures, devLabels, bestResult["clf"], "q4_dev_svm_pred.txt")
def getLSTMMultiData():
    """Load multilabel toxic data and build padded LSTM feature tensors.

    Splits the shuffled dataset 60/20/20 into train/dev/test and converts
    each split into (n, maxLength, dimVectors) feature tensors with (n, 5)
    label arrays (label slots 1..5 of each example; a sixth label column is
    intentionally unused).

    Returns:
        (trainFeatures, trainLabels, devFeatures, devLabels, testFeatures,
         testLabels, maxLength, dimVectors)
    """
    dataset, tokens, maxLength = getMLToxicData()
    # Shuffle data
    shuffle(dataset)
    # Cap sequence length at 100 tokens regardless of the dataset maximum.
    maxLength = 100
    num_data = len(dataset)

    # Create train, dev, and test splits (60/20/20).
    # BUG FIX: the original started dev/test one past each cutoff, silently
    # dropping one example at each boundary; use contiguous slices.
    train_cutoff = int(0.6 * num_data)
    dev_cutoff = int(0.8 * num_data)
    trainset = dataset[:train_cutoff]
    devset = dataset[train_cutoff:dev_cutoff]
    testset = dataset[dev_cutoff:]

    wordVectors = glove.loadWordVectors(tokens)
    dimVectors = wordVectors.shape[1]

    def _featurize(split):
        """Build (features, labels) arrays for one split (dedupes the three
        previously copy-pasted loops)."""
        n = len(split)
        features = np.zeros((n, maxLength, dimVectors))
        labels = np.zeros((n, 5), dtype=np.int32)
        for i in xrange(n):
            words = split[i][0]
            # Label slots are columns 1..5 of each example row.
            for j in xrange(5):
                labels[i][j] = split[i][j + 1]
            features[i, :] = getLSTMSentenceFeatures(
                tokens, wordVectors, dimVectors, words, maxLength)
        return features, labels

    trainFeatures, trainLabels = _featurize(trainset)
    devFeatures, devLabels = _featurize(devset)
    testFeatures, testLabels = _featurize(testset)

    return (trainFeatures, trainLabels, devFeatures, devLabels,
            testFeatures, testLabels, maxLength, dimVectors)
def main(args): """ Train a model to do sentiment analyis""" dataset, tokens, maxSentence = getToxicData() print len(dataset) # Shuffle data shuffle(dataset) num_data = len(dataset) # Create train, dev, and test train_cutoff = int(0.6 * num_data) dev_start = int(0.6 * num_data) + 1 dev_cutoff = int(0.8 * num_data) trainset = dataset[:train_cutoff] devset = dataset[dev_start:dev_cutoff] testset = dataset[dev_cutoff + 1:] nWords = len(tokens) if args.yourvectors: _, wordVectors, _ = load_saved_params() wordVectors = np.concatenate( (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1) elif args.pretrained: wordVectors = glove.loadWordVectors(tokens) dimVectors = wordVectors.shape[1] # Load the train set #trainset = dataset.getTrainSentences() nTrain = len(trainset) trainFeatures = np.zeros((nTrain, dimVectors)) trainLabels = np.zeros((nTrain, ), dtype=np.int32) for i in xrange(nTrain): words, trainLabels[i] = trainset[i] trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # Prepare dev set features #devset = dataset.getDevSentences() nDev = len(devset) devFeatures = np.zeros((nDev, dimVectors)) devLabels = np.zeros((nDev, ), dtype=np.int32) for i in xrange(nDev): words, devLabels[i] = devset[i] devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # Prepare test set features #testset = dataset.getTestSentences() nTest = len(testset) testFeatures = np.zeros((nTest, dimVectors)) testLabels = np.zeros((nTest, ), dtype=np.int32) for i in xrange(nTest): words, testLabels[i] = testset[i] testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # We will save our results from each run results = [] regValues = getRegularizationValues() print "SVM Results:" clf = SVC() clf.fit(trainFeatures, trainLabels) # Test on train set pred = clf.predict(trainFeatures) trainAccuracy = accuracy(trainLabels, pred) print "Train accuracy (%%): %f" % trainAccuracy # Test on dev set pred = clf.predict(devFeatures) devAccuracy = 
accuracy(devLabels, pred) print "Dev accuracy (%%): %f" % devAccuracy # Test on test set # Note: always running on test is poor style. Typically, you should # do this only after validation. pred = clf.predict(testFeatures) testAccuracy = accuracy(testLabels, pred) print "Test accuracy (%%): %f" % testAccuracy results.append({ "reg": 0.0, "clf": clf, "train": trainAccuracy, "dev": devAccuracy, "test": testAccuracy }) # Print the accuracies print "" print "=== Recap ===" print "Reg\t\tTrain\tDev\tTest" for result in results: print "%.2E\t%.3f\t%.3f\t%.3f" % (result["reg"], result["train"], result["dev"], result["test"]) print "" bestResult = chooseBestModel(results) # print "Best regularization value: %0.2E" % bestResult["reg"] # print "Test accuracy (%%): %f" % bestResult["test"] # do some error analysis if args.pretrained: plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png") outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"], "q4_dev_svm_conf.png") outputPredictions(devset, devFeatures, devLabels, bestResult["clf"], "q4_dev_svm_pred.txt")
from prep import clean

# Source CSVs: one file of tweets per Twitter handle.
files = [
    'data/hillaryclinton.csv',
    'data/realdonaldtrump.csv',
    'data/jimmyfallon.csv',
    'data/barackobama.csv',
    'data/conanobrien.csv'
]
res, labels, vocab_dict, handle_dict = clean(files)
print('len(vocab_dict):', len(vocab_dict))

import json
import utils.glove as glove

# Replace each token index in every tweet with its GloVe vector.
wordVectors = glove.loadWordVectors(vocab_dict)
print(wordVectors.shape)

# BUG FIX: the loop variable was named `re`, shadowing the `re` module;
# renamed and the append loop folded into a comprehension.
res = np.stack([np.array([wordVectors[idx] for idx in seq]) for seq in res])
print(res.shape, labels.shape)

# Deliberate debug stop left in the script: execution always halts here.
# Remove this line to enable the train/test split below.
assert 1 == 2

ratio = .0  # proportion of test data (currently 0: everything is train)
cutoff = int(len(res) * ratio)
X_test, X_train = res[:cutoff], res[cutoff:]
y_test, y_train = labels[:cutoff], labels[cutoff:]
def main(args):
    """Train a sentiment classifier using SIF sentence embeddings.

    Builds smoothed-inverse-frequency weighted sentence features, removes
    each embedding's projection onto the first principal component of the
    training features (SIF post-processing), then sweeps logistic-regression
    regularization values and reports accuracies.

    Args:
        args: parsed CLI namespace with boolean flags ``yourvectors`` and
            ``pretrained`` (exactly one expected to be set).
    """
    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]),
            axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    else:
        # BUG FIX: the original fell through with wordVectors undefined
        # (NameError below); fail fast with a clear message.
        raise ValueError("expected args.yourvectors or args.pretrained to be set")
    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain,), dtype=np.int32)

    # Relative word frequencies over the training corpus (SIF weights).
    freq = Counter()
    total = 0
    for sen in trainset:
        for word in sen[0]:
            total += 1
            freq[word] += 1
    for word, count in freq.items():
        # float() guards against integer division if run under Python 2,
        # where count/total would silently round every frequency to 0.
        freq[word] = count / float(total)

    # Generate all sentence features
    for i in range(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors,
                                                     words, freq)

    # SVD on the training features; u is the first singular vector.
    svd = TruncatedSVD(n_components=1, n_iter=5, random_state=0)
    u = svd.fit(trainFeatures).components_[0]
    # Remove the projection of each sentence embedding onto its first
    # principal component (vectorized; the original looped row by row).
    trainFeatures -= np.outer(trainFeatures.dot(u), u)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev,), dtype=np.int32)
    for i in range(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors,
                                                   words, freq)
    devFeatures -= np.outer(devFeatures.dot(u), u)

    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest,), dtype=np.int32)
    for i in range(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors,
                                                    words, freq)
    testFeatures -= np.outer(testFeatures.dot(u), u)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0/(reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print("Train accuracy (%%): %f" % trainAccuracy)

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print("Dev accuracy (%%): %f" % devAccuracy)

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print("Test accuracy (%%): %f" % testAccuracy)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy})

    # Print the accuracies
    print("")
    print("=== Recap ===")
    print("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print("%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"]))
    print("")

    bestResult = chooseBestModel(results)
    print("Best regularization value: %0.2E" % bestResult["reg"])
    print("Test accuracy (%%): %f" % bestResult["test"])

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_sif_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_sif_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_sif_dev_pred.txt")
def get_glove_data():
    """Build GloVe-averaged features for e-mail threads and split 60/20/20.

    Rows whose sentence features cannot be computed are dropped from both
    features and labels.  Dead word-count bucketing code (computed but only
    consumed by commented-out feature stacking) has been removed.

    Returns:
        (x_train, x_dev, x_test, y_train, y_dev, y_test)
    """
    embedding_dimension = 100
    x_text, y = load_data_and_labels_bow("thread_content.npy",
                                         "thread_labels.npy")
    dataset = StanfordSentiment()
    tokens = dataset.tokens()

    # Initialize word vectors with glove.
    embedded_vectors = glove.loadWordVectors(tokens)
    print("The shape of embedding matrix is:")
    print(embedded_vectors.shape)

    nTrain = len(x_text)
    trainFeatures = np.zeros((nTrain, embedding_dimension))
    # Indices whose features could not be computed; deleted below.
    toRemove = []
    for i in xrange(nTrain):
        words = x_text[i]
        sentenceFeatures = getSentenceFeatures(tokens, embedded_vectors, words)
        if sentenceFeatures is None:
            toRemove.append(i)
        else:
            trainFeatures[i, :] = sentenceFeatures

    print(len(toRemove))
    y = np.delete(y, toRemove, axis=0)
    trainFeatures = np.delete(trainFeatures, toRemove, axis=0)

    # Randomly shuffle data (fixed seed for reproducibility).
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = trainFeatures[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # train/dev/test = 0.6/0.2/0.2
    train_cutoff = int(0.6 * len(x_shuffled))
    dev_cutoff = int(0.8 * len(x_shuffled))
    return (x_shuffled[:train_cutoff],
            x_shuffled[train_cutoff:dev_cutoff],
            x_shuffled[dev_cutoff:],
            y_shuffled[:train_cutoff],
            y_shuffled[train_cutoff:dev_cutoff],
            y_shuffled[dev_cutoff:])
# Words to project onto two dimensions: function words/punctuation plus
# positive and negative sentiment terms.
visualizeWords = [
    "the", "a", "an", ",", ".", "?", "!", "``", "''", "--",
    "good", "great", "cool", "brilliant", "wonderful", "well", "amazing",
    "worth", "sweet", "enjoyable", "boring", "bad", "waste", "dumb",
    "annoying"]
key_words = ["the", "unique", "superb", "comedy", "surprisingly"]

# Assign each distinct word its first-seen index ("the" appears twice and
# keeps its first index).
tokens = {}
for word in visualizeWords + key_words:
    tokens.setdefault(word, len(tokens))

wordVectors = loadWordVectors(tokens)
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]

# Center the vectors and take the top two principal directions via SVD of
# the covariance matrix.
centered = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * centered.T.dot(centered)
U, S, V = np.linalg.svd(covariance)
coord = centered.dot(U[:, 0:2])

# Draw each word at its 2-D coordinate and fit the axes to the data.
for i in xrange(len(visualizeWords)):
    plt.text(coord[i, 0], coord[i, 1], visualizeWords[i],
             bbox=dict(facecolor='green', alpha=0.1))
plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))
# the path of the file where the word vectors are stored DUMMY_PATH = "utils/glove/glove_dummy.txt" option = 2 # token_list is the list containing all the tokens # tokens is a dictionary that maps a token to its index if option == 1: token_list = ["is", "this", "a", "file", "dummy"] elif option == 2: token_list = glove.loadWordTokens(DUMMY_PATH) else: assert false, 'Not a valid option' # create an empty dictionary tokens = {} for i in range(len(token_list)): tokens[token_list[i]] = i # read in word vectors # the function takes 3 arguments: """ tokens: a dictionary maps the token to their index in token_list filepath: a string, the path of the word vector file to be read dimension: integer, the length of the vector (it must be consistent with the file) """ dummy_vectors = glove.loadWordVectors(tokens, DUMMY_PATH, 6) for i in range(len(dummy_vectors)): # print the words (formatted to have a tab behind them) and the word vectors print "{0}\t".format(token_list[i]), dummy_vectors[i]