def sanity_check():
    """
    Run python q4_softmaxreg.py.

    Gradient-checks softmaxRegression on ten randomly drawn training
    sentences and then prints the result of one full softmaxRegression call
    on the same data.
    """
    # Fixed seeds keep the sampled sentences and dummy weights reproducible.
    random.seed(314159)
    np.random.seed(265)

    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    # Add the first nWords rows of the saved matrix to the remaining rows to
    # obtain a single vector per word.
    _, wordVectors0, _ = load_saved_params()
    wordVectors = wordVectors0[:nWords, :] + wordVectors0[nWords:, :]
    dimVectors = wordVectors.shape[1]

    # Ten samples, five output classes.
    dummy_weights = 0.1 * np.random.randn(dimVectors, 5)
    dummy_features = np.zeros((10, dimVectors))
    dummy_labels = np.zeros((10,), dtype=np.int32)
    # range/print() are used so the check runs on both Python 2 and Python 3.
    for i in range(10):
        words, dummy_labels[i] = dataset.getRandomTrainSentence()
        dummy_features[i, :] = getSentenceFeature(tokens, wordVectors, words)

    print("==== Gradient check for softmax regression ====")
    gradcheck_naive(
        lambda weights: softmaxRegression(
            dummy_features, dummy_labels, weights, 1.0, nopredictions=True),
        dummy_weights)

    print("\n=== Results ===")
    print(softmaxRegression(dummy_features, dummy_labels, dummy_weights, 1.0))
def sanity_check():
    """
    Run python q4_softmaxreg.py.

    Gradient-checks softmaxRegression on ten randomly drawn training
    sentences, prints one full result, and then compares the unregularized
    cost on purely random data with the chance-level value -log(1 / C) for
    C = 10, 80 and 1000 classes.
    """
    # Fixed seeds keep the sampled sentences and random data reproducible.
    random.seed(314159)
    np.random.seed(265)

    dataset = StanfordSentiment()
    tokens = dataset.tokens()

    _, wordVectors0, _ = load_saved_params()
    # Split the saved matrix at its midpoint and add the two halves.
    # NOTE(review): N presumably equals len(tokens); an assert to that effect
    # was disabled in the original code -- confirm.
    N = wordVectors0.shape[0] // 2
    wordVectors = wordVectors0[:N, :] + wordVectors0[N:, :]
    dimVectors = wordVectors.shape[1]

    dummy_weights = 0.1 * np.random.randn(dimVectors, 5)
    dummy_features = np.zeros((10, dimVectors))
    dummy_labels = np.zeros((10,), dtype=np.int32)
    for i in range(10):
        words, dummy_labels[i] = dataset.getRandomTrainSentence()
        dummy_features[i, :] = getSentenceFeature(tokens, wordVectors, words)

    print("==== Gradient check for softmax regression ====")
    gradcheck_naive(
        lambda weights: softmaxRegression(
            dummy_features, dummy_labels, weights, 1.0, nopredictions=True),
        dummy_weights)

    print("\n=== Results ===")
    print(softmaxRegression(dummy_features, dummy_labels, dummy_weights, 1.0))

    # Random-data baselines: with labels drawn at random, a correct
    # classification is expected 1 time in nClasses, so the cost should be
    # close to -log(1 / nClasses).  The draws happen in the same order as in
    # the original straight-line code (weights, features, labels).
    cost = None
    for nSamples, nClasses in ((2000, 10), (2000, 80), (40000, 1000)):
        rand_weights = 0.1 * np.random.randn(40, nClasses) + 1.0
        rand_features = np.random.randn(nSamples, 40)
        rand_labels = np.argmax(np.random.randn(nSamples, nClasses), axis=1)
        print(-np.log(1. / nClasses))
        cost = softmaxRegression(
            rand_features, rand_labels, rand_weights, 0.0)[0]
        print(cost)

    # exp(-cost) of the last (1000-class) case; the cost is reused instead of
    # running the most expensive configuration a second time.
    print(np.exp(-cost))
def main(args):
    """
    Train a model to do sentiment analysis.

    Builds sentence features for the train/dev/test splits, fits one
    LogisticRegression per regularization value, prints the accuracies and,
    for pretrained vectors, writes the analysis artifacts.

    Arguments:
    args -- object with boolean attributes `yourvectors` and `pretrained`
            selecting which word vectors to use

    Raises:
    ValueError -- if neither `yourvectors` nor `pretrained` is set
    """
    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        # Place the first nWords rows and the remaining rows side by side.
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    else:
        # Previously this fell through and failed later with an unbound
        # `wordVectors`; fail early with a clear message instead.
        raise ValueError(
            "Expected args.yourvectors or args.pretrained to be set")
    dimVectors = wordVectors.shape[1]

    def featurize(sentences):
        """Return (features, labels) arrays for a list of (words, label)."""
        features = np.zeros((len(sentences), dimVectors))
        labels = np.zeros((len(sentences), ), dtype=np.int32)
        for i, (words, label) in enumerate(sentences):
            labels[i] = label
            features[i, :] = getSentenceFeatures(tokens, wordVectors, words)
        return features, labels

    # Load the train set
    trainset = dataset.getTrainSentences()
    trainFeatures, trainLabels = featurize(trainset)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    devFeatures, devLabels = featurize(devset)

    # Prepare test set features
    testset = dataset.getTestSentences()
    testFeatures, testLabels = featurize(testset)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to regularization to please the
        # library (keeps C finite when reg is 0).
        clf = LogisticRegression(C=1.0 / (reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print("Train accuracy (%%): %f" % trainAccuracy)

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print("Dev accuracy (%%): %f" % devAccuracy)

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print("Test accuracy (%%): %f" % testAccuracy)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy
        })

    # Print the accuracies
    print("")
    print("=== Recap ===")
    print("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print("%.2E\t%.3f\t%.3f\t%.3f" % (result["reg"], result["train"],
                                          result["dev"], result["test"]))
    print("")

    bestResult = chooseBestModel(results)
    print("Best regularization value: %0.2E" % bestResult["reg"])
    print("Test accuracy (%%): %f" % bestResult["test"])

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_dev_pred.txt")
from q4_softmaxreg import softmaxRegression, getSentenceFeature, accuracy, softmax_wrapper

# Try different regularizations and pick the best!
# NOTE: fill in one more "your code here" below before running!
# Assign a list of floats in the block below
### YOUR CODE HERE
# 20 log-spaced values from 1e-5 to 10**0.5, preceded by 0 (no regularization).
REGULARIZATION = np.logspace(-5, 0.5, 20)
REGULARIZATION = np.hstack([0, REGULARIZATION])
### END YOUR CODE

# Load the dataset
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# Load the word vectors we trained earlier
_, wordVectors0, _ = load_saved_params()
# Add the first nWords rows to the remaining rows: one vector per word.
# (The closing parenthesis was missing here, which made the script fail to parse.)
wordVectors = (wordVectors0[:nWords, :] + wordVectors0[nWords:, :])
dimVectors = wordVectors.shape[1]

# Load the train set
trainset = dataset.getTrainSentences()
nTrain = len(trainset)
trainFeatures = np.zeros((nTrain, dimVectors))
trainLabels = np.zeros((nTrain,), dtype=np.int32)
for i in range(nTrain):
    words, trainLabels[i] = trainset[i]
    trainFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)

# Prepare dev set features
devset = dataset.getDevSentences()
nDev = len(devset)
# Assign a list of floats in the block below
### YOUR CODE HERE
# Coarse sweep kept for reference:
#REGULARIZATION = 10 ** np.arange(-10.,1.,1)
# Look closer at these values before the model drops off
REGULARIZATION = 10 ** np.arange(-5, -3, .2)
### END YOUR CODE

# Load the dataset
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# Load the word vectors we trained earlier
_, wordVectors0, _ = load_saved_params()
# Add the first nWords rows to the remaining rows: one vector per word.
wordVectors = (wordVectors0[:nWords, :] + wordVectors0[nWords:, :])
dimVectors = wordVectors.shape[1]

# Load the train set
trainset = dataset.getTrainSentences()
nTrain = len(trainset)
trainFeatures = np.zeros((nTrain, dimVectors))
trainLabels = np.zeros((nTrain,), dtype=np.int32)
# range (not xrange) so the script also runs under Python 3.
for i in range(nTrain):
    words, trainLabels[i] = trainset[i]
    trainFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)

# Prepare dev set features
devset = dataset.getDevSentences()
nDev = len(devset)
def sanity_check():
    """
    Run python q4_softmaxreg.py.

    Performs a gradient check of softmaxRegression on ten random training
    sentences, prints one full result, and then prints, for random data with
    10, 80 and 1000 classes, the chance-level cost -log(1 / C) next to the
    unregularized cost actually computed.
    """
    random.seed(314159)
    np.random.seed(265)

    dataset = StanfordSentiment()
    tokens = dataset.tokens()

    _, wordVectors0, _ = load_saved_params()
    # The saved matrix is split at its midpoint and the halves are summed.
    # NOTE(review): N is presumably len(tokens); the original carried a
    # disabled `assert N == nWords` -- confirm before relying on it.
    N = wordVectors0.shape[0] // 2
    wordVectors = wordVectors0[:N, :] + wordVectors0[N:, :]
    dimVectors = wordVectors.shape[1]

    dummy_weights = 0.1 * np.random.randn(dimVectors, 5)
    dummy_features = np.zeros((10, dimVectors))
    dummy_labels = np.zeros((10, ), dtype=np.int32)
    for i in range(10):
        words, dummy_labels[i] = dataset.getRandomTrainSentence()
        dummy_features[i, :] = getSentenceFeature(tokens, wordVectors, words)

    print("==== Gradient check for softmax regression ====")
    gradcheck_naive(
        lambda weights: softmaxRegression(
            dummy_features, dummy_labels, weights, 1.0, nopredictions=True),
        dummy_weights)

    print("\n=== Results ===")
    print(softmaxRegression(dummy_features, dummy_labels, dummy_weights, 1.0))

    def random_baseline(nSamples, nClasses):
        """Print expected and measured cost on random data; return the cost."""
        # Draw order (weights, features, labels) matches the original code so
        # the seeded output is unchanged.
        weights = 0.1 * np.random.randn(40, nClasses) + 1.0
        features = np.random.randn(nSamples, 40)
        labels = np.argmax(np.random.randn(nSamples, nClasses), axis=1)
        # Expected correct classification (random) = 1 in nClasses;
        # the cost then becomes -np.log(1. / nClasses).
        print(-np.log(1. / nClasses))
        cost = softmaxRegression(features, labels, weights, 0.0)[0]
        print(cost)
        return cost

    random_baseline(2000, 10)
    random_baseline(2000, 80)
    lastCost = random_baseline(40000, 1000)
    # Reuse the 1000-class cost rather than recomputing it.
    print(np.exp(-lastCost))
def main(args):
    """
    Train a model to do sentiment analysis.

    Fits one LogisticRegression per regularization value on sentence
    features, reports train/dev/test accuracy for each, and writes a
    confusion matrix (plus a plot and predictions for pretrained vectors).

    Arguments:
    args -- object with boolean attributes `yourvectors` and `pretrained`

    Raises:
    ValueError -- if neither `yourvectors` nor `pretrained` is set
    """
    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        # Join the first nWords rows and the remaining rows along axis 1.
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    else:
        # Without this guard the code below failed on an unbound name.
        raise ValueError(
            "Expected args.yourvectors or args.pretrained to be set")
    dimVectors = wordVectors.shape[1]

    def buildSplit(sentences):
        """Turn a list of (words, label) pairs into feature/label arrays."""
        count = len(sentences)
        features = np.zeros((count, dimVectors))
        labels = np.zeros((count,), dtype=np.int32)
        for i in range(count):
            words, labels[i] = sentences[i]
            features[i, :] = getSentenceFeatures(tokens, wordVectors, words)
        return features, labels

    # Load the train set, then the dev and test sets
    trainset = dataset.getTrainSentences()
    trainFeatures, trainLabels = buildSplit(trainset)

    devset = dataset.getDevSentences()
    devFeatures, devLabels = buildSplit(devset)

    testset = dataset.getTestSentences()
    testFeatures, testLabels = buildSplit(testset)

    # (result key, printed title, features, labels) in evaluation order.
    # Note: always running on test is poor style. Typically, you should
    # do this only after validation.
    splits = (
        ("train", "Train", trainFeatures, trainLabels),
        ("dev", "Dev", devFeatures, devLabels),
        ("test", "Test", testFeatures, testLabels),
    )

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to regularization to please the
        # library (avoids dividing by zero when reg is 0).
        clf = LogisticRegression(C=1.0/(reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        entry = {"reg": reg, "clf": clf}
        for key, title, features, labels in splits:
            entry[key] = accuracy(labels, clf.predict(features))
            print("%s accuracy (%%): %f" % (title, entry[key]))
        results.append(entry)

    # Print the accuracies
    print("")
    print("=== Recap ===")
    print("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print("%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"]))
    print("")

    bestResult = chooseBestModel(results)
    print("Best regularization value: %0.2E" % bestResult["reg"])
    print("Test accuracy (%%): %f" % bestResult["test"])

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_dev_pred.txt")
    else:
        # Only the confusion matrix is written for self-trained vectors; the
        # regularization plot is not produced on this path.
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf_your.png")
def main(args):
    """
    Train an SVM classifier on the toxic-comment data.

    Shuffles the data, splits it 60/20/20 into train/dev/test, builds
    sentence features, fits a default SVC and reports its accuracies.

    Arguments:
    args -- object with boolean attributes `yourvectors` and `pretrained`

    Raises:
    ValueError -- if neither `yourvectors` nor `pretrained` is set
    """
    dataset, tokens, _ = getToxicData()
    print(len(dataset))

    # Shuffle data
    shuffle(dataset)
    num_data = len(dataset)

    # Create train, dev, and test.  The slices are contiguous: the earlier
    # `+ 1` offsets silently dropped the sample at each boundary.
    train_cutoff = int(0.6 * num_data)
    dev_cutoff = int(0.8 * num_data)
    trainset = dataset[:train_cutoff]
    devset = dataset[train_cutoff:dev_cutoff]
    testset = dataset[dev_cutoff:]

    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        # Join the first nWords rows and the remaining rows along axis 1.
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    else:
        # Without this guard the code below failed on an unbound name.
        raise ValueError(
            "Expected args.yourvectors or args.pretrained to be set")
    dimVectors = wordVectors.shape[1]

    def featurize(sentences):
        """Return (features, labels) arrays for a list of (words, label)."""
        features = np.zeros((len(sentences), dimVectors))
        labels = np.zeros((len(sentences), ), dtype=np.int32)
        for i, (words, label) in enumerate(sentences):
            labels[i] = label
            features[i, :] = getSentenceFeatures(tokens, wordVectors, words)
        return features, labels

    trainFeatures, trainLabels = featurize(trainset)
    devFeatures, devLabels = featurize(devset)
    testFeatures, testLabels = featurize(testset)

    # We will save our results from each run
    results = []

    print("SVM Results:")
    clf = SVC()
    clf.fit(trainFeatures, trainLabels)

    # Test on train set
    pred = clf.predict(trainFeatures)
    trainAccuracy = accuracy(trainLabels, pred)
    print("Train accuracy (%%): %f" % trainAccuracy)

    # Test on dev set
    pred = clf.predict(devFeatures)
    devAccuracy = accuracy(devLabels, pred)
    print("Dev accuracy (%%): %f" % devAccuracy)

    # Test on test set
    # Note: always running on test is poor style. Typically, you should
    # do this only after validation.
    pred = clf.predict(testFeatures)
    testAccuracy = accuracy(testLabels, pred)
    print("Test accuracy (%%): %f" % testAccuracy)

    # The SVC is trained once with default settings, so it is recorded with
    # a placeholder regularization value of 0.0.
    results.append({
        "reg": 0.0,
        "clf": clf,
        "train": trainAccuracy,
        "dev": devAccuracy,
        "test": testAccuracy
    })

    # Print the accuracies
    print("")
    print("=== Recap ===")
    print("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print("%.2E\t%.3f\t%.3f\t%.3f" % (result["reg"], result["train"],
                                          result["dev"], result["test"]))
    print("")

    bestResult = chooseBestModel(results)

    # do some error analysis
    if args.pretrained:
        # Pass the regularization values that actually produced `results`
        # so both sequences have the same length.
        # NOTE(review): assumes plotRegVsAccuracy pairs its first two
        # arguments element-wise -- confirm against its definition.
        plotRegVsAccuracy([result["reg"] for result in results], results,
                          "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_svm_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_dev_svm_pred.txt")
def main(args):
    """
    Train a sentiment model on SIF-style sentence features.

    Sentence features come from getSentenceFeaturesSIF using relative word
    frequencies counted on the training set.  The first singular vector of
    the training features is then projected out of every split before one
    LogisticRegression per regularization value is fitted and evaluated.

    Arguments:
    args -- object with boolean attributes `yourvectors` and `pretrained`

    Raises:
    ValueError -- if neither `yourvectors` nor `pretrained` is set
    """
    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        # Join the first nWords rows and the remaining rows along axis 1.
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    else:
        # Without this guard the code below failed on an unbound name.
        raise ValueError(
            "Expected args.yourvectors or args.pretrained to be set")
    dimVectors = wordVectors.shape[1]

    trainset = dataset.getTrainSentences()

    # Relative frequency of every word in the training sentences.
    freq = Counter()
    for words, _ in trainset:
        freq.update(words)
    # float() keeps this a true division on Python 2 as well; with integer
    # division every frequency would collapse to 0.
    totalWords = float(sum(freq.values()))
    for word in freq:
        freq[word] = freq[word] / totalWords

    def featurize(sentences):
        """Return (features, labels) arrays for a list of (words, label)."""
        features = np.zeros((len(sentences), dimVectors))
        labels = np.zeros((len(sentences),), dtype=np.int32)
        for i, (words, label) in enumerate(sentences):
            labels[i] = label
            features[i, :] = getSentenceFeaturesSIF(
                tokens, wordVectors, words, freq)
        return features, labels

    def removeFirstComponent(features, direction):
        """Subtract, in place, each row's projection onto `direction`."""
        features -= np.outer(features.dot(direction), direction)

    # Generate all training sentence features
    trainFeatures, trainLabels = featurize(trainset)

    # SVD on the training set only; u is the first singular vector and is
    # reused unchanged for the dev and test splits.
    svd = TruncatedSVD(n_components=1, n_iter=5, random_state=0)
    u = svd.fit(trainFeatures).components_[0]
    removeFirstComponent(trainFeatures, u)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    devFeatures, devLabels = featurize(devset)
    removeFirstComponent(devFeatures, u)

    # Prepare test set features
    testset = dataset.getTestSentences()
    testFeatures, testLabels = featurize(testset)
    removeFirstComponent(testFeatures, u)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to regularization to please the
        # library (avoids dividing by zero when reg is 0).
        clf = LogisticRegression(C=1.0/(reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print("Train accuracy (%%): %f" % trainAccuracy)

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print("Dev accuracy (%%): %f" % devAccuracy)

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print("Test accuracy (%%): %f" % testAccuracy)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy})

    # Print the accuracies
    print("")
    print("=== Recap ===")
    print("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print("%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"]))
    print("")

    bestResult = chooseBestModel(results)
    print("Best regularization value: %0.2E" % bestResult["reg"])
    print("Test accuracy (%%): %f" % bestResult["test"])

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_sif_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_sif_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_sif_dev_pred.txt")