def _lr_fit_predict(trainMatrix, trainLabels, testMatrix):
    """Fit a fresh LogisticRegression on the train data and return the
    predicted labels for testMatrix as a plain Python list."""
    clf = LogisticRegression()
    print("\tTraining logistic regression....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting logistic regression....")
    return clf.predict(testMatrix).tolist()


def _lr_feature_column(clean_vec, insult_vec):
    """Concatenate per-document scalar features for the clean then insult
    corpora (matching the label-vector ordering) into an (n, 1) column
    suitable for np.hstack."""
    column = np.concatenate((np.array(clean_vec), np.array(insult_vec)),
                            axis=0)
    return column.reshape((-1, 1))


def main():
    """Incrementally evaluate LogisticRegression on insult detection.

    Stages: NB baseline probabilities -> +PoS -> +TF-IDF -> +sentiment ->
    +misspellings.  Each stage's test-set predictions are written, with
    interpret_results(), to LOG_REG_output_file_w_SB.txt.

    Depends on module-level names: LanguageModel, baselineNaiveBayes,
    tfidf, Sentiment, Misspellings, interpret_results, LogisticRegression,
    np, and the *_FILE path constants.
    """
    print("Generating language models....")
    trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE)
    trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE)
    trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE)
    trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE)
    testCleanLM = LanguageModel(CLEAN_TEST_FILE)
    testInsultLM = LanguageModel(INSULT_TEST_FILE)

    # 0 = clean, 1 = insult; every feature matrix below stacks clean rows
    # first, then insult rows, to match this ordering.
    trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) +
                           ([1] * trainABInsultLM.getDocCount()))
    testLabels = np.array(([0] * testCleanLM.getDocCount()) +
                          ([1] * testInsultLM.getDocCount()))

    ### Stage 1: baseline Naive Bayes class-probability features.
    # NB is trained on the AA split and scored on the held-out AB split.
    print("Running baseline....")
    NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM)
    print("\tTraining NB....")
    NB.train()
    print("\tTesting NB....")
    trainMatrix = np.array(
        NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents()))
    testMatrix = np.array(
        NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents()))
    output1 = _lr_fit_predict(trainMatrix, trainLabels, testMatrix)

    ### Stage 2: + part-of-speech features.
    print("Running baseline + PoS Features....")
    posFeatures = np.array(trainABCleanLM.getPosMatrix() +
                           trainABInsultLM.getPosMatrix())
    testPosFeatures = np.array(testCleanLM.getPosMatrix() +
                               testInsultLM.getPosMatrix())
    trainMatrix = np.hstack((trainMatrix, posFeatures))
    testMatrix = np.hstack((testMatrix, testPosFeatures))
    output2 = _lr_fit_predict(trainMatrix, trainLabels, testMatrix)

    ### Stage 3: + TF-IDF features.
    print("Running baseline + PoS Features + TF-IDF Features")
    tfidf_train_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, trainABCleanLM, trainABInsultLM)
    tfidf_test_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, testCleanLM, testInsultLM)
    print(tfidf_test_features.shape, tfidf_train_features.shape)
    print(testMatrix.shape, trainMatrix.shape)
    trainMatrix = np.hstack((trainMatrix, tfidf_train_features))
    testMatrix = np.hstack((testMatrix, tfidf_test_features))
    output3 = _lr_fit_predict(trainMatrix, trainLabels, testMatrix)

    ### Stage 4: + sentiment features (one scalar per document).
    print("Running baseline + PoS Features + TF-IDF Features"
          " + Sentiment Features")
    s = Sentiment()
    sentiment_train_features = _lr_feature_column(
        s.get_clean_train_vector(), s.get_insult_train_vector())
    print(sentiment_train_features.shape)
    sentiment_test_features = _lr_feature_column(
        s.get_clean_test_vector(), s.get_insult_test_vector())
    print(sentiment_test_features.shape)
    trainMatrix = np.hstack((trainMatrix, sentiment_train_features))
    testMatrix = np.hstack((testMatrix, sentiment_test_features))
    output4 = _lr_fit_predict(trainMatrix, trainLabels, testMatrix)

    ### Stage 5: + misspellings features (one scalar per document).
    print("Running baseline + PoS Features + TF-IDF Features"
          " + Sentiment Features + Misspellings features")
    m = Misspellings()
    misspellings_train_features = _lr_feature_column(
        m.get_clean_misspellings(False), m.get_insult_misspellings(False))
    print(misspellings_train_features.shape)
    misspellings_test_features = _lr_feature_column(
        m.get_clean_misspellings(), m.get_insult_misspellings())
    print(misspellings_test_features.shape)
    # BUG FIX: this stage previously re-stacked the sentiment columns, so
    # the misspellings features were computed but never used.
    trainMatrix = np.hstack((trainMatrix, misspellings_train_features))
    testMatrix = np.hstack((testMatrix, misspellings_test_features))
    output5 = _lr_fit_predict(trainMatrix, trainLabels, testMatrix)

    # Log every stage's raw predictions plus interpreted metrics.
    with open('LOG_REG_output_file_w_SB.txt', 'w+') as f:
        outputs = (output1, output2, output3, output4, output5)
        for i, output in enumerate(outputs, start=1):
            if i > 1:
                f.write("\n")
            f.write("Output {}\n".format(i))
            f.write("{}\n".format(output))
            interpret_results(output, testLabels, f)
def main():
    """Evaluate a linear SVM on the fully stacked insult-detection feature
    set (NB probs + PoS + TF-IDF + sentiment + misspellings), then plot a
    learning curve (cv=2) and PCA projections of the final matrices.

    Only the final classifier is fit here; the intermediate stages just
    accumulate feature columns.  Depends on module-level names:
    LanguageModel, baselineNaiveBayes, tfidf, Sentiment, Misspellings,
    svm, learning_curve, shuffle, plt, get_pca_graph, np, and the *_FILE
    path constants.
    """
    print("Generating language models....")
    trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE)
    trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE)
    trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE)
    trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE)
    testCleanLM = LanguageModel(CLEAN_TEST_FILE)
    testInsultLM = LanguageModel(INSULT_TEST_FILE)

    # 0 = clean, 1 = insult; feature rows are stacked clean-first to match.
    trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) +
                           ([1] * trainABInsultLM.getDocCount()))
    testLabels = np.array(([0] * testCleanLM.getDocCount()) +
                          ([1] * testInsultLM.getDocCount()))

    ### Baseline Naive Bayes probability features.
    print("Running baseline....")
    NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM)
    print("\tTraining NB....")
    NB.train()
    print("\tTesting NB....")
    trainMatrix = np.array(
        NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents()))
    testMatrix = np.array(
        NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents()))

    ### + PoS features.
    print("Running baseline + PoS Features....")
    posFeatures = np.array(trainABCleanLM.getPosMatrix() +
                           trainABInsultLM.getPosMatrix())
    testPosFeatures = np.array(testCleanLM.getPosMatrix() +
                               testInsultLM.getPosMatrix())
    trainMatrix = np.hstack((trainMatrix, posFeatures))
    testMatrix = np.hstack((testMatrix, testPosFeatures))

    ### + TF-IDF features.
    print("Running baseline + PoS Features + TF-IDF Features")
    tfidf_train_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, trainABCleanLM, trainABInsultLM)
    tfidf_test_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, testCleanLM, testInsultLM)
    print(tfidf_test_features.shape, tfidf_train_features.shape)
    print(testMatrix.shape, trainMatrix.shape)
    trainMatrix = np.hstack((trainMatrix, tfidf_train_features))
    testMatrix = np.hstack((testMatrix, tfidf_test_features))

    ### + sentiment features (one scalar per document, as a column).
    print("Running baseline + PoS Features + TF-IDF Features"
          " + Sentiment Features")
    s = Sentiment()
    sentiment_train_features = np.concatenate(
        (np.array(s.get_clean_train_vector()),
         np.array(s.get_insult_train_vector())),
        axis=0).reshape((-1, 1))
    print(sentiment_train_features.shape)
    sentiment_test_features = np.concatenate(
        (np.array(s.get_clean_test_vector()),
         np.array(s.get_insult_test_vector())),
        axis=0).reshape((-1, 1))
    print(sentiment_test_features.shape)
    trainMatrix = np.hstack((trainMatrix, sentiment_train_features))
    testMatrix = np.hstack((testMatrix, sentiment_test_features))

    ### + misspellings features (one scalar per document, as a column).
    print("Running baseline + PoS Features + TF-IDF Features"
          " + Sentiment Features + Misspellings features")
    m = Misspellings()
    misspellings_train_features = np.concatenate(
        (np.array(m.get_clean_misspellings(False)),
         np.array(m.get_insult_misspellings(False))),
        axis=0).reshape((-1, 1))
    print(misspellings_train_features.shape)
    misspellings_test_features = np.concatenate(
        (np.array(m.get_clean_misspellings()),
         np.array(m.get_insult_misspellings())),
        axis=0).reshape((-1, 1))
    print(misspellings_test_features.shape)
    # BUG FIX: this stage previously re-stacked the sentiment columns, so
    # the misspellings features were computed but never used.
    trainMatrix = np.hstack((trainMatrix, misspellings_train_features))
    testMatrix = np.hstack((testMatrix, misspellings_test_features))

    clf = svm.SVC(kernel='linear')
    print("\tTraining SVM....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting SVM....")
    output5 = clf.predict(testMatrix).tolist()
    print("SVM predictions on test set: {}".format(output5))

    # Learning curve on a shuffled copy of the training data.
    # list() is required: Python 3's range cannot be shuffled in place.
    index_shuf = list(range(len(trainMatrix)))
    shuffle(index_shuf)
    trainMatrix_shuf = [trainMatrix[i] for i in index_shuf]
    trainLabel_shuf = [trainLabels[i] for i in index_shuf]
    train_sizes, train_scores, valid_scores = learning_curve(
        svm.SVC(), trainMatrix_shuf, trainLabel_shuf,
        train_sizes=[100, 300, 500, 700, 900], cv=2)
    average_train_scores = [sum(row) / float(len(row)) for row in train_scores]
    average_valid_scores = [sum(row) / float(len(row)) for row in valid_scores]
    plt.plot(train_sizes, average_train_scores)
    plt.plot(train_sizes, average_valid_scores)
    plt.legend(['Training score', 'Cross-validation score'],
               loc='center left', bbox_to_anchor=(0.85, 0.5))
    plt.ylabel('Score')
    plt.xlabel('Training examples')
    plt.show()

    # PCA visualisations of the final stacked feature matrices.
    get_pca_graph(trainMatrix, trainLabels, "train_pca.png",
                  title="PCA of Training Set")
    get_pca_graph(testMatrix, testLabels, "test_pca.png",
                  title="PCA of Test Set")
    get_pca_graph(trainMatrix, trainLabels, "train_pca2.png",
                  title="PCA of Training Set (Insults Only)",
                  plot_negative=False)
    get_pca_graph(testMatrix, testLabels, "test_pca2.png",
                  title="PCA of Test Set (Insults Only)",
                  plot_negative=False)
def _rf_fit_predict(trainMatrix, trainLabels, testMatrix):
    """Fit a fresh RandomForestClassifier on the train data and return the
    predicted labels for testMatrix as a plain Python list."""
    clf = RandomForestClassifier()
    print("\tTraining random forest....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting random forest....")
    return clf.predict(testMatrix).tolist()


def _rf_feature_column(clean_vec, insult_vec):
    """Concatenate per-document scalar features for the clean then insult
    corpora (matching the label-vector ordering) into an (n, 1) column
    suitable for np.hstack."""
    column = np.concatenate((np.array(clean_vec), np.array(insult_vec)),
                            axis=0)
    return column.reshape((-1, 1))


def main():
    """Incrementally evaluate a random forest on insult detection.

    Stages: NB baseline probabilities -> +PoS -> +TF-IDF -> +sentiment ->
    +misspellings.  Each stage's test-set predictions are written, with
    interpret_results(), to RANDOM_FOREST_output_file_without_SB.txt.

    Depends on module-level names: LanguageModel, baselineNaiveBayes,
    tfidf, Sentiment, Misspellings, interpret_results,
    RandomForestClassifier, np, and the *_FILE path constants.
    """
    print("Generating language models....")
    trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE)
    trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE)
    trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE)
    trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE)
    testCleanLM = LanguageModel(CLEAN_TEST_FILE)
    testInsultLM = LanguageModel(INSULT_TEST_FILE)

    # 0 = clean, 1 = insult; feature rows are stacked clean-first to match.
    trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) +
                           ([1] * trainABInsultLM.getDocCount()))
    testLabels = np.array(([0] * testCleanLM.getDocCount()) +
                          ([1] * testInsultLM.getDocCount()))

    ### Stage 1: baseline Naive Bayes class-probability features.
    print("Running baseline....")
    NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM)
    print("\tTraining NB....")
    NB.train()
    print("\tTesting NB....")
    trainMatrix = np.array(
        NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents()))
    testMatrix = np.array(
        NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents()))
    output1 = _rf_fit_predict(trainMatrix, trainLabels, testMatrix)

    ### Stage 2: + part-of-speech features.
    print("Running baseline + PoS Features....")
    posFeatures = np.array(trainABCleanLM.getPosMatrix() +
                           trainABInsultLM.getPosMatrix())
    testPosFeatures = np.array(testCleanLM.getPosMatrix() +
                               testInsultLM.getPosMatrix())
    trainMatrix = np.hstack((trainMatrix, posFeatures))
    testMatrix = np.hstack((testMatrix, testPosFeatures))
    output2 = _rf_fit_predict(trainMatrix, trainLabels, testMatrix)

    ### Stage 3: + TF-IDF features.
    print("Running baseline + PoS Features + TF-IDF Features")
    tfidf_train_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, trainABCleanLM, trainABInsultLM)
    tfidf_test_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, testCleanLM, testInsultLM)
    print(tfidf_test_features.shape, tfidf_train_features.shape)
    print(testMatrix.shape, trainMatrix.shape)
    trainMatrix = np.hstack((trainMatrix, tfidf_train_features))
    testMatrix = np.hstack((testMatrix, tfidf_test_features))
    output3 = _rf_fit_predict(trainMatrix, trainLabels, testMatrix)

    ### Stage 4: + sentiment features (one scalar per document).
    print("Running baseline + PoS Features + TF-IDF Features"
          " + Sentiment Features")
    s = Sentiment()
    sentiment_train_features = _rf_feature_column(
        s.get_clean_train_vector(), s.get_insult_train_vector())
    print(sentiment_train_features.shape)
    sentiment_test_features = _rf_feature_column(
        s.get_clean_test_vector(), s.get_insult_test_vector())
    print(sentiment_test_features.shape)
    trainMatrix = np.hstack((trainMatrix, sentiment_train_features))
    testMatrix = np.hstack((testMatrix, sentiment_test_features))
    output4 = _rf_fit_predict(trainMatrix, trainLabels, testMatrix)

    ### Stage 5: + misspellings features (one scalar per document).
    print("Running baseline + PoS Features + TF-IDF Features"
          " + Sentiment Features + Misspellings features")
    m = Misspellings()
    misspellings_train_features = _rf_feature_column(
        m.get_clean_misspellings(False), m.get_insult_misspellings(False))
    print(misspellings_train_features.shape)
    misspellings_test_features = _rf_feature_column(
        m.get_clean_misspellings(), m.get_insult_misspellings())
    print(misspellings_test_features.shape)
    # BUG FIX: this stage previously re-stacked the sentiment columns, so
    # the misspellings features were computed but never used.
    trainMatrix = np.hstack((trainMatrix, misspellings_train_features))
    testMatrix = np.hstack((testMatrix, misspellings_test_features))
    output5 = _rf_fit_predict(trainMatrix, trainLabels, testMatrix)

    # Log every stage's raw predictions plus interpreted metrics.
    with open('RANDOM_FOREST_output_file_without_SB.txt', 'w+') as f:
        outputs = (output1, output2, output3, output4, output5)
        for i, output in enumerate(outputs, start=1):
            if i > 1:
                f.write("\n")
            f.write("Output {}\n".format(i))
            f.write("{}\n".format(output))
            interpret_results(output, testLabels, f)
def main():
    """Evaluate a linear SVM on the fully stacked insult-detection feature
    set (NB probs + PoS + TF-IDF + sentiment + misspellings), then plot a
    learning curve (cv=2) and PCA projections of the final matrices.

    Only the final classifier is fit; intermediate stages just accumulate
    feature columns.  Depends on module-level names: LanguageModel,
    baselineNaiveBayes, tfidf, Sentiment, Misspellings, svm,
    learning_curve, shuffle, plt, get_pca_graph, np, and the *_FILE path
    constants.
    """
    print("Generating language models....")
    trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE)
    trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE)
    trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE)
    trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE)
    testCleanLM = LanguageModel(CLEAN_TEST_FILE)
    testInsultLM = LanguageModel(INSULT_TEST_FILE)

    # 0 = clean, 1 = insult; every feature matrix stacks clean rows first.
    trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) +
                           ([1] * trainABInsultLM.getDocCount()))
    testLabels = np.array(([0] * testCleanLM.getDocCount()) +
                          ([1] * testInsultLM.getDocCount()))

    ### Baseline Naive Bayes probability features.
    print("Running baseline....")
    NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM)
    print("\tTraining NB....")
    NB.train()
    print("\tTesting NB....")
    trainMatrix = np.array(
        NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents()))
    testMatrix = np.array(
        NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents()))

    ### + PoS features.
    print("Running baseline + PoS Features....")
    posFeatures = np.array(trainABCleanLM.getPosMatrix() +
                           trainABInsultLM.getPosMatrix())
    testPosFeatures = np.array(testCleanLM.getPosMatrix() +
                               testInsultLM.getPosMatrix())
    trainMatrix = np.hstack((trainMatrix, posFeatures))
    testMatrix = np.hstack((testMatrix, testPosFeatures))

    ### + TF-IDF features.
    print("Running baseline + PoS Features + TF-IDF Features")
    tfidf_train_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, trainABCleanLM, trainABInsultLM)
    tfidf_test_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, testCleanLM, testInsultLM)
    print(tfidf_test_features.shape, tfidf_train_features.shape)
    print(testMatrix.shape, trainMatrix.shape)
    trainMatrix = np.hstack((trainMatrix, tfidf_train_features))
    testMatrix = np.hstack((testMatrix, tfidf_test_features))

    ### + sentiment features (one scalar per document, as a column).
    print("Running baseline + PoS Features + TF-IDF Features"
          " + Sentiment Features")
    s = Sentiment()
    sentiment_train_features = np.concatenate(
        (np.array(s.get_clean_train_vector()),
         np.array(s.get_insult_train_vector())),
        axis=0).reshape((-1, 1))
    print(sentiment_train_features.shape)
    sentiment_test_features = np.concatenate(
        (np.array(s.get_clean_test_vector()),
         np.array(s.get_insult_test_vector())),
        axis=0).reshape((-1, 1))
    print(sentiment_test_features.shape)
    trainMatrix = np.hstack((trainMatrix, sentiment_train_features))
    testMatrix = np.hstack((testMatrix, sentiment_test_features))

    ### + misspellings features (one scalar per document, as a column).
    print("Running baseline + PoS Features + TF-IDF Features"
          " + Sentiment Features + Misspellings features")
    m = Misspellings()
    misspellings_train_features = np.concatenate(
        (np.array(m.get_clean_misspellings(False)),
         np.array(m.get_insult_misspellings(False))),
        axis=0).reshape((-1, 1))
    print(misspellings_train_features.shape)
    misspellings_test_features = np.concatenate(
        (np.array(m.get_clean_misspellings()),
         np.array(m.get_insult_misspellings())),
        axis=0).reshape((-1, 1))
    print(misspellings_test_features.shape)
    # BUG FIX: this stage previously re-stacked the sentiment columns, so
    # the misspellings features were computed but never used.
    trainMatrix = np.hstack((trainMatrix, misspellings_train_features))
    testMatrix = np.hstack((testMatrix, misspellings_test_features))

    clf = svm.SVC(kernel='linear')
    print("\tTraining SVM....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting SVM....")
    output5 = clf.predict(testMatrix).tolist()
    print("SVM predictions on test set: {}".format(output5))

    # Learning curve on a shuffled copy of the training data.
    # list() is required: Python 3's range cannot be shuffled in place.
    index_shuf = list(range(len(trainMatrix)))
    shuffle(index_shuf)
    trainMatrix_shuf = [trainMatrix[i] for i in index_shuf]
    trainLabel_shuf = [trainLabels[i] for i in index_shuf]
    train_sizes, train_scores, valid_scores = learning_curve(
        svm.SVC(), trainMatrix_shuf, trainLabel_shuf,
        train_sizes=[100, 300, 500, 700, 900], cv=2)
    average_train_scores = [sum(row) / float(len(row)) for row in train_scores]
    average_valid_scores = [sum(row) / float(len(row)) for row in valid_scores]
    plt.plot(train_sizes, average_train_scores)
    plt.plot(train_sizes, average_valid_scores)
    plt.legend(['Training score', 'Cross-validation score'],
               loc='center left', bbox_to_anchor=(0.85, 0.5))
    plt.ylabel('Score')
    plt.xlabel('Training examples')
    plt.show()

    # PCA visualisations of the final stacked feature matrices.
    get_pca_graph(trainMatrix, trainLabels, "train_pca.png",
                  title="PCA of Training Set")
    get_pca_graph(testMatrix, testLabels, "test_pca.png",
                  title="PCA of Test Set")
    get_pca_graph(trainMatrix, trainLabels, "train_pca2.png",
                  title="PCA of Training Set (Insults Only)",
                  plot_negative=False)
    get_pca_graph(testMatrix, testLabels, "test_pca2.png",
                  title="PCA of Test Set (Insults Only)",
                  plot_negative=False)