def applyRules(IDsFilename):
    """Classify the reviews of the given set with the rule-based approach.

    Builds the corpus from *IDsFilename*, extracts features, classifies every
    review, shows feature occurrence statistics, and prints performance
    metrics against the gold standard.
    """
    print("Using the set at '{path}{file}'".format(path=CORPUS_PATH, file=IDsFilename))
    print("Creating reviews...(this may take a while)")
    dataSet = Corpus(IDsFilename, corpusPath=CORPUS_PATH)
    # print("Loading reviews...")
    # dataSet = Corpus.loadCorpus(filename="training_set.pk")
    print("Extracting features...")
    features, featureVectors = extractFeatures(dataSet.reviewIDs, dataSet.reviews)
    gold = dataSet.goldStandard
    classification = classify(features, featureVectors)
    showFeatureOccurrence(features, featureVectors, gold, classification)
    # Align gold labels and predictions by review ID before scoring.
    targets = list(gold.values())
    cls = [classification[reviewID] for reviewID in gold]
    showPerformance(targets, cls)
def applyDecisionTree(trainData, trainTargets, testData, testTargets, featureNames):
    """Train a Decision Tree, export its structure, and report performance.

    Side effects: prints the feature names, writes "DecisionTree.dot" with a
    graphviz description of the tree (depth-limited to 5), and prints
    performance metrics for the test set.

    Args:
        trainData / trainTargets: training samples and their labels.
        testData / testTargets: held-out samples and their labels.
        featureNames: names used to label tree nodes in the export.
    """
    decisionTree = DecisionTreeClassifier()
    model = decisionTree.fit(trainData, trainTargets)
    # Create a graph description of the Decision Tree, both in-memory and on disk.
    dot_data = StringIO()
    print("Feature names:", featureNames)
    export_graphviz(model, out_file=dot_data, feature_names=featureNames, max_depth=5)
    export_graphviz(model, out_file="DecisionTree.dot", feature_names=featureNames, max_depth=5)
    # Build the graph object from the dot description (PDF rendering disabled).
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # graph.write_pdf("DecisionTree.pdf")
    # Predict the whole test set in one vectorized call instead of looping
    # one sample at a time (per-sample 1-D input is rejected by modern sklearn
    # and the loop is needlessly slow).
    classification = list(model.predict(testData))
    print("\nUsing a Decision Tree:")
    showPerformance(testTargets, classification)
def applyClassifier(classifier, trainData, trainTargets, testData, testTargets):
    """Fit the given scikit-learn classifier and print test-set performance."""
    model = classifier.fit(trainData, trainTargets)
    predictions = model.predict(testData)
    print("\nUsing {0}".format(classifier))
    showPerformance(testTargets, predictions)
def applyClassifier(classifier, trainData, trainTargets, testData, testTargets):
    """Fit the given scikit-learn classifier and print test-set performance.

    Args:
        classifier: an unfitted scikit-learn estimator.
        trainData / trainTargets: training samples and labels.
        testData / testTargets: held-out samples and labels.
    """
    model = classifier.fit(trainData, trainTargets)
    # Predict the whole test set in one vectorized call instead of looping
    # one sample at a time (per-sample 1-D input is rejected by modern
    # sklearn); this also matches the sibling applyClassifier variant.
    classification = list(model.predict(testData))
    print("\nUsing {0}".format(classifier))
    showPerformance(testTargets, classification)
def applySVM(trainData, trainTargets, testData, testTargets):
    """Train and classify using a Support Vector Machine (linear kernel).

    Args:
        trainData / trainTargets: training samples and labels.
        testData / testTargets: held-out samples and labels.
    """
    svm = SVC(kernel="linear")
    model = svm.fit(trainData, trainTargets)
    # Predict the whole test set in one vectorized call instead of looping
    # one sample at a time (per-sample 1-D input is rejected by modern sklearn).
    classification = list(model.predict(testData))
    print("\nUsing a Support Vector Machine:")
    showPerformance(testTargets, classification)
def applyNaiveBayes2(trainData, trainTargets, testData, testTargets):
    """Train and classify using Multinomial Naive Bayes.

    Args:
        trainData / trainTargets: training samples and labels.
        testData / testTargets: held-out samples and labels.
    """
    gnb = MultinomialNB()
    model = gnb.fit(trainData, trainTargets)
    # Predict the whole test set in one vectorized call instead of looping
    # one sample at a time (per-sample 1-D input is rejected by modern sklearn).
    classification = list(model.predict(testData))
    print("\nUsing Naive Bayes:")
    showPerformance(testTargets, classification)
def applyMLP(trainData, trainTargets, testData, testTargets):
    """Train and classify using a Multi-Layer Perceptron.

    (Docstring fixed: previously claimed "Naive Bayes" although an
    MLPClassifier is trained.)

    Args:
        trainData / trainTargets: training samples and labels.
        testData / testTargets: held-out samples and labels.
    """
    mlp = MLPClassifier()
    model = mlp.fit(trainData, trainTargets)
    # Predict the whole test set in one vectorized call instead of looping
    # one sample at a time (per-sample 1-D input is rejected by modern sklearn).
    classification = list(model.predict(testData))
    print("\nUsing MLP:")
    showPerformance(testTargets, classification)
def testRules():
    """Classify the hand-crafted test reviews with the rule-based approach."""
    ironicIDs, regularIDs, reviews = createTestReviews()
    allIDs = ironicIDs + regularIDs
    features, featureVectors = extractFeatures(allIDs, reviews)
    # Gold standard: the known irony flag of each test review.
    gold = {reviewID: reviews[reviewID].ironic for reviewID in allIDs}
    classification = ruleClassify(features, featureVectors)
    showFeatureOccurrence(features, featureVectors, gold, classification)
    showPerformance(gold, classification)
def applyClassifier(class1, class2, doc_name, id_map, classifier, name, trainData, trainTargets, testData, testTargets):
    """Fit *classifier*, persist the model, classify the test set, and write
    per-tweet prediction reports.

    Files written (all keyed by *doc_name*):
      - model_<doc_name>.pkl              fitted model (joblib pickle)
      - info_<doc_name>.txt               appended classifier + accuracy info
      - mispredicted_ids_<doc_name>.txt   IDs of misclassified tweets
      - predicted_as_<class1>_<doc_name>.txt  IDs predicted as class 1
      - predicted_as_<class2>_<doc_name>.txt  IDs predicted as class 2

    Args:
        class1 / class2: human-readable names of the two classes.
        doc_name: suffix used for every output file name.
        id_map: maps test-sample index -> tweet ID.
        classifier: unfitted scikit-learn estimator; name: run label.
        trainData / trainTargets / testData / testTargets: data splits.
    """
    # Feed the model with data from feature extraction.
    model = classifier.fit(trainData, trainTargets)
    # Drop references to the (potentially large) training data.
    trainData = []
    trainTargets = []
    # Save the created model for re-use.
    joblib.dump(model, "model_" + doc_name + '.pkl')
    # Predict the whole test set in one vectorized call instead of looping
    # one sample at a time (per-sample 1-D input is rejected by modern sklearn).
    classification = list(model.predict(testData))
    testData = []
    y_pred = np.asarray(classification)
    y = np.asarray(testTargets)
    classif_rate = np.mean(y_pred.ravel() == y.ravel()) * 100
    # Record IDs of 'problematic' tweets in additional files for further analysis.
    with open("info_" + doc_name + ".txt", "a") as info, open(
            "mispredicted_ids_" + doc_name + ".txt", "w") as mis, open(
            "predicted_as_" + class1 + "_" + doc_name + ".txt", "w") as as_class1, open(
            "predicted_as_" + class2 + "_" + doc_name + ".txt", "w") as as_class2:
        info.write("\nUsing {0}".format(classifier) + "\n")
        print("\nUsing {0}".format(classifier))
        info.write("classifier_rate for " + name + ": " + str(classif_rate) + "\n")
        print("classif_rate for %s : %f " % (name, classif_rate))
        showPerformance(doc_name, name, testTargets, classification, [class1, class2])
        for i in range(len(classification)):
            # IDs of the mispredicted tweets:
            if classification[i] != testTargets[i]:
                mis.write(str(id_map[i]) + "\n")
            # All tweets predicted as class 1 - regardless of correctness.
            if classification[i] == 1:
                as_class1.write(str(id_map[i]) + "\n")
            # All tweets predicted as class 2.
            else:
                as_class2.write(str(id_map[i]) + "\n")
    # Drop remaining large references before returning.
    testTargets = []
    id_map = {}
    classification = []
def applySingleRules(IDsFilename):
    """Apply each feature individually as a classification rule.

    Originally meant to apply a single rule; now iterates over every
    extracted feature, classifies the corpus using only that feature, and
    reports how often it occurs in ironic versus regular reviews.
    """
    print("Using the set at '{path}{file}'".format(path=CORPUS_PATH, file=IDsFilename))
    print("Creating reviews...(this may take a while)")
    dataSet = Corpus(IDsFilename, corpusPath=CORPUS_PATH)
    print("Loading reviews...")
    # dataSet = Corpus.loadCorpus(filename="training_set.pk")
    # dataSet = Corpus.loadCorpus(filename="training_and_validation_set.pk")
    print("Extracting features...")
    features, featureVectors = extractFeatures(dataSet.reviewIDs, dataSet.reviews)
    showFeatureOccurrence(features, featureVectors)
    gold = dataSet.goldStandard
    # Use every extracted feature in turn as the sole decision rule.
    decisiveFeatureNames = [feature.name for feature in features]
    for ruleName in decisiveFeatureNames:
        classification = classify(features, featureVectors, [ruleName])
        # Align gold labels and predictions by review ID before scoring.
        targets = list(gold.values())
        cls = [classification[reviewID] for reviewID in gold]
        print("\nClassifying by rule: ", ruleName)
        showPerformance(targets, cls)
def applyClassifier(class1, class2, doc_name, id_map, classifier, name, trainData, trainTargets, testData, testTargets):
    """Fit *classifier*, persist the model, classify the test set, and write
    per-tweet prediction reports.

    Files written (all keyed by *doc_name*):
      - model_<doc_name>.pkl              fitted model (joblib pickle)
      - info_<doc_name>.txt               appended classifier + accuracy info
      - mispredicted_ids_<doc_name>.txt   IDs of misclassified tweets
      - predicted_as_<class1>_<doc_name>.txt  IDs predicted as class 1
      - predicted_as_<class2>_<doc_name>.txt  IDs predicted as class 2

    Args:
        class1 / class2: human-readable names of the two classes.
        doc_name: suffix used for every output file name.
        id_map: maps test-sample index -> tweet ID.
        classifier: unfitted scikit-learn estimator; name: run label.
        trainData / trainTargets / testData / testTargets: data splits.
    """
    # Feed the model with data from feature extraction.
    model = classifier.fit(trainData, trainTargets)
    # Drop references to the (potentially large) training data.
    trainData = []
    trainTargets = []
    # Save the created model for re-use.
    joblib.dump(model, "model_" + doc_name + '.pkl')
    # Predict the whole test set in one vectorized call instead of looping
    # one sample at a time (per-sample 1-D input is rejected by modern sklearn).
    classification = list(model.predict(testData))
    testData = []
    y_pred = np.asarray(classification)
    y = np.asarray(testTargets)
    classif_rate = np.mean(y_pred.ravel() == y.ravel()) * 100
    # Record IDs of 'problematic' tweets in additional files for further analysis.
    with open("info_" + doc_name + ".txt", "a") as info, open(
            "mispredicted_ids_" + doc_name + ".txt", "w") as mis, open(
            "predicted_as_" + class1 + "_" + doc_name + ".txt", "w") as as_class1, open(
            "predicted_as_" + class2 + "_" + doc_name + ".txt", "w") as as_class2:
        info.write("\nUsing {0}".format(classifier) + "\n")
        print("\nUsing {0}".format(classifier))
        info.write("classifier_rate for " + name + ": " + str(classif_rate) + "\n")
        print("classif_rate for %s : %f " % (name, classif_rate))
        showPerformance(doc_name, name, testTargets, classification, [class1, class2])
        for i in range(len(classification)):
            # IDs of the mispredicted tweets:
            if classification[i] != testTargets[i]:
                mis.write(str(id_map[i]) + "\n")
            # All tweets predicted as class 1 - regardless of correctness.
            if classification[i] == 1:
                as_class1.write(str(id_map[i]) + "\n")
            # All tweets predicted as class 2.
            else:
                as_class2.write(str(id_map[i]) + "\n")
    # Drop remaining large references before returning.
    testTargets = []
    id_map = {}
    classification = []