def read_vectors(infile): _, word_vectors0, _ = load_saved_params(infile) num_words = word_vectors0.shape[0] / 2 print "loaded vectors for %d words" % num_words word_vectors = (word_vectors0[:num_words,:] + word_vectors0[num_words:,:]) #dim_vectors = word_vectors.shape[1] return word_vectors
# NOTE(review): the lines below are the tail of a training call (presumably
# sgd(...)) whose opening is outside this chunk; the argument list continues
# from the preceding, unseen line — confirm against the full file.
    word_vectors, params, postprocessing=normalize_rows, use_saved=True,
    print_every=100, save_params_every=5000)
# sanity check: cost at convergence should be around or below 10

# sum the input and output word vectors
word_vectors = (word_vectors0[:num_words,:] + word_vectors0[num_words:,:])

print "\n=== For autograder ==="
check_words = ["the", "a", "an", "movie", "ordinary", "but", "and"]
check_idx = [tokens[word] for word in check_words]
check_vecs = word_vectors[check_idx, :]
print check_vecs

# Visualize the word vectors you trained
_, word_vectors0, _ = load_saved_params()
word_vectors = (word_vectors0[:num_words,:] + word_vectors0[num_words:,:])
visualize_words = [
    "the", "a", "an", "good", "great", "cool", "brilliant", "wonderful",
    "well", "amazing", "worth", "sweet", "enjoyable", "boring", "bad",
    "waste", "dumb", "annoying", "london", "england", "york", "yorker",
    "winter", "car", "automatically"]
visualize_idx = [tokens[word] for word in visualize_words]
visualize_vecs = word_vectors[visualize_idx, :]
# Center the selected vectors, then take the top-2 principal directions of
# their covariance via SVD to get 2-D plotting coordinates.
temp = (visualize_vecs - np.mean(visualize_vecs, axis=0))
covariance = 1.0 / len(visualize_idx) * temp.T.dot(temp)
U, S, V = np.linalg.svd(covariance)
coord = temp.dot(U[:,0:2])

print "SVD coordinates"
for i in xrange(len(visualize_words)):
    plt.text(coord[i,0], coord[i,1], visualize_words[i],
        bbox=dict(facecolor='green', alpha=0.1))
plt.xlim((np.min(coord[:,0]), np.max(coord[:,0])))
plt.ylim((np.min(coord[:,1]), np.max(coord[:,1])))
def main(args): """ Train a model to do sentiment analyis""" # Load the dataset dataset = StanfordSentiment() tokens = dataset.tokens() nWords = len(tokens) if args.yourvectors: _, wordVectors, _ = load_saved_params() wordVectors = np.concatenate( (wordVectors[:nWords,:], wordVectors[nWords:,:]), axis=1) elif args.pretrained: wordVectors = glove.loadWordVectors(tokens) dimVectors = wordVectors.shape[1] # Load the train set trainset = dataset.getTrainSentences() nTrain = len(trainset) trainFeatures = np.zeros((nTrain, dimVectors)) trainLabels = np.zeros((nTrain,), dtype=np.int32) for i in xrange(nTrain): words, trainLabels[i] = trainset[i] trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # Prepare dev set features devset = dataset.getDevSentences() nDev = len(devset) devFeatures = np.zeros((nDev, dimVectors)) devLabels = np.zeros((nDev,), dtype=np.int32) for i in xrange(nDev): words, devLabels[i] = devset[i] devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # Prepare test set features testset = dataset.getTestSentences() nTest = len(testset) testFeatures = np.zeros((nTest, dimVectors)) testLabels = np.zeros((nTest,), dtype=np.int32) for i in xrange(nTest): words, testLabels[i] = testset[i] testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words) # We will save our results from each run results = [] regValues = getRegularizationValues() for reg in regValues: print "Training for reg=%f" % reg # Note: add a very small number to regularization to please the library clf = LogisticRegression(C=1.0/(reg + 1e-12)) clf.fit(trainFeatures, trainLabels) # Test on train set pred = clf.predict(trainFeatures) trainAccuracy = accuracy(trainLabels, pred) print "Train accuracy (%%): %f" % trainAccuracy # Test on dev set pred = clf.predict(devFeatures) devAccuracy = accuracy(devLabels, pred) print "Dev accuracy (%%): %f" % devAccuracy # Test on test set # Note: always running on test is poor style. 
Typically, you should # do this only after validation. pred = clf.predict(testFeatures) testAccuracy = accuracy(testLabels, pred) print "Test accuracy (%%): %f" % testAccuracy results.append({ "reg": reg, "clf": clf, "train": trainAccuracy, "dev": devAccuracy, "test": testAccuracy}) # Print the accuracies print "" print "=== Recap ===" print "Reg\t\tTrain\tDev\tTest" for result in results: print "%.2E\t%.3f\t%.3f\t%.3f" % ( result["reg"], result["train"], result["dev"], result["test"]) print "" bestResult = chooseBestModel(results) print "Best regularization value: %0.2E" % bestResult["reg"] print "Test accuracy (%%): %f" % bestResult["test"] # do some error analysis if args.pretrained: plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png") outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"], "q4_dev_conf.png") outputPredictions(devset, devFeatures, devLabels, bestResult["clf"], "q4_dev_pred.txt")