Example #1
def applyRules(IDsFilename):
    """Uses rule based approach to classify the reviews from the given set."""
    print("Using the set at '{path}{file}'".format(path=CORPUS_PATH, 
                                                    file=IDsFilename))
    
    print("Creating reviews...(this may take a while)")
    dataSet = Corpus(IDsFilename, corpusPath=CORPUS_PATH)
    
    # print("Loading reviews...")
    # dataSet = Corpus.loadCorpus(filename="training_set.pk")

    print("Extracting features...")
    features, featureVectors = extractFeatures(dataSet.reviewIDs, 
                                                dataSet.reviews)

    gold = dataSet.goldStandard
    classification = classify(features, featureVectors)

    showFeatureOccurrence(features, featureVectors, gold, classification)
    
    targets = []
    cls = []

    for ID, g in gold.items():
        targets.append(g)
        cls.append(classification[ID])

    showPerformance(targets, cls)
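
showPerformance itself is not shown in these excerpts; the sketch below is only a guess at the kind of report such a helper might produce, using scikit-learn's metrics module. The helper name is hypothetical.

# Hypothetical sketch of a showPerformance-style helper; the real
# implementation is not part of these excerpts.
from sklearn.metrics import accuracy_score, classification_report

def showPerformanceSketch(targets, predictions):
    """Print overall accuracy plus per-class precision, recall and F1."""
    print("Accuracy: {0:.3f}".format(accuracy_score(targets, predictions)))
    print(classification_report(targets, predictions))
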
def applyDecisionTree(trainData, trainTargets, testData, testTargets, featureNames):
    """Train and classify using a Decision Tree and prints the decision Tree."""
    decisionTree = DecisionTreeClassifier()
    model = decisionTree.fit(trainData, trainTargets)

    # Create graph description of the Decision Tree
    dot_data = StringIO()
    #export_graphviz(model, out_file=dot_data, max_depth=5)
    print("Feature names:", featureNames)
    export_graphviz(model, out_file=dot_data, feature_names=featureNames,
                    max_depth=5)
    export_graphviz(model, out_file="DecisionTree.dot", feature_names=featureNames,
                    max_depth=5)
    #with open("DecisionTree.dot", 'r') as dotFile:
    #    dotFile.write(exportFile)
    # Create PDF from dot
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    #path = "/Users/konstantin/Documents/University/Bachelorthesis/paper/src/DecisionTree.dot"
    #graph = pydot.graph_from_dot_file(path)
    #graph.write_pdf("DecisionTree.pdf")


    classification = [model.predict(d)[0] for d in testData]

    print("\nUsing a Decision Tree:")
    showPerformance(testTargets, classification)
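
The PDF export in applyDecisionTree is commented out above; the sketch below shows one way the in-memory DOT data (dot_data.getvalue()) could be rendered with pydot, assuming Graphviz is installed. The helper name is hypothetical, and newer pydot versions return a list from graph_from_dot_data.

# Hypothetical sketch: render a DOT graph description to a PDF with pydot.
import pydot

def renderDotSketch(dot_string, pdf_path="DecisionTree.pdf"):
    """Write the given DOT graph description to a PDF file."""
    graphs = pydot.graph_from_dot_data(dot_string)
    # pydot >= 1.2 returns a list of graphs; older versions return one graph.
    graph = graphs[0] if isinstance(graphs, list) else graphs
    graph.write_pdf(pdf_path)
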
Example #3
def applyRules(IDsFilename):
    """Uses rule based approach to classify the reviews from the given set."""
    print("Using the set at '{path}{file}'".format(path=CORPUS_PATH,
                                                   file=IDsFilename))

    print("Creating reviews...(this may take a while)")
    dataSet = Corpus(IDsFilename, corpusPath=CORPUS_PATH)

    # print("Loading reviews...")
    # dataSet = Corpus.loadCorpus(filename="training_set.pk")

    print("Extracting features...")
    features, featureVectors = extractFeatures(dataSet.reviewIDs,
                                               dataSet.reviews)

    gold = dataSet.goldStandard
    classification = classify(features, featureVectors)

    showFeatureOccurrence(features, featureVectors, gold, classification)

    targets = []
    cls = []

    for ID, g in gold.items():
        targets.append(g)
        cls.append(classification[ID])

    showPerformance(targets, cls)
def applyDecisionTree(trainData, trainTargets, testData, testTargets, featureNames):
    """Train and classify using a Decision Tree and prints the decision Tree."""
    decisionTree = DecisionTreeClassifier()
    model = decisionTree.fit(trainData, trainTargets)

    # Create graph description of the Decision Tree
    dot_data = StringIO() 
    #export_graphviz(model, out_file=dot_data, max_depth=5)
    print("Feature names:", featureNames)
    export_graphviz(model, out_file=dot_data, feature_names=featureNames, 
                    max_depth=5)
    export_graphviz(model, out_file="DecisionTree.dot", feature_names=featureNames, 
                    max_depth=5)
    #with open("DecisionTree.dot", 'r') as dotFile:
    #    dotFile.write(exportFile)
    # Create PDF from dot
    graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
    #path = "/Users/konstantin/Documents/University/Bachelorthesis/paper/src/DecisionTree.dot"
    #graph = pydot.graph_from_dot_file(path) 
    #graph.write_pdf("DecisionTree.pdf")


    classification = [model.predict(d)[0] for d in testData]

    print("\nUsing a Decision Tree:")
    showPerformance(testTargets, classification)
def applyClassifier(classifier, trainData, trainTargets, testData, testTargets):
    """Train and classify using a Support Vector Machine."""
    model = classifier.fit(trainData, trainTargets)

    classification = model.predict(testData)

    print("\nUsing {0}".format(classifier))
    showPerformance(testTargets, classification)
def applyClassifier(classifier, trainData, trainTargets, testData, testTargets):
    """Train and classify using a Support Vector Machine."""
    model = classifier.fit(trainData, trainTargets)

    classification = [model.predict(d)[0] for d in testData]

    print("\nUsing {0}".format(classifier))
    showPerformance(testTargets, classification)    
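
The two applyClassifier variants above differ only in how predict is called. scikit-learn estimators accept the whole test matrix in a single call, so the per-sample loop is redundant; recent scikit-learn versions also reject 1-D samples, so a single row has to be wrapped as a 2-D array. A self-contained sketch with toy data (not part of the original code):

# Sketch: a vectorized predict call gives the same result as the per-sample loop.
import numpy as np
from sklearn.svm import SVC

def predictionStylesSketch():
    """Compare vectorized prediction with the per-sample loop on toy data."""
    rng = np.random.RandomState(0)
    trainData, trainTargets = rng.rand(20, 4), rng.randint(0, 2, 20)
    testData = rng.rand(5, 4)
    model = SVC(kernel="linear").fit(trainData, trainTargets)

    vectorized = model.predict(testData)                # one call for all rows
    looped = [model.predict([d])[0] for d in testData]  # [d] keeps each sample 2-D
    assert list(vectorized) == looped
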
def applySVM(trainData, trainTargets, testData, testTargets):
    """Train and classify using a Support Vector Machine (linear kernel)."""
    svm = SVC(kernel="linear")
    model = svm.fit(trainData, trainTargets)

    classification = [model.predict(d)[0] for d in testData]

    print("\nUsing a Support Vector Machine:")
    showPerformance(testTargets, classification)
def applyNaiveBayes2(trainData, trainTargets, testData, testTargets):
    """Train and classify using Naive Bayes."""
    gnb = MultinomialNB()
    model = gnb.fit(trainData, trainTargets)

    classification = [model.predict(d)[0] for d in testData]

    print("\nUsing Naive Bayes:")
    showPerformance(testTargets, classification)
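
One detail worth noting: the variable is called gnb but the estimator is MultinomialNB, which only accepts non-negative feature values (counts or frequencies). A hedged sketch of how one might pick an NB variant depending on the feature values; the helper is hypothetical:

# Hypothetical sketch: choose an NB variant based on the feature values.
from sklearn.naive_bayes import GaussianNB, MultinomialNB

def chooseNaiveBayesSketch(trainData):
    """Return MultinomialNB for non-negative features, otherwise GaussianNB."""
    hasNegative = any(value < 0 for row in trainData for value in row)
    return GaussianNB() if hasNegative else MultinomialNB()
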
Example #10
def applySVM(trainData, trainTargets, testData, testTargets):
    """Train and classify using a Support Vector Machine (linear kernel)."""
    svm = SVC(kernel="linear")
    model = svm.fit(trainData, trainTargets)

    classification = [model.predict(d)[0] for d in testData]

    print("\nUsing a Support Vector Machine:")
    showPerformance(testTargets, classification)
Example #11
def applyMLP(trainData, trainTargets, testData, testTargets):
    """Train and classify using Naive Bayes."""
    mlp = MLPClassifier()
    model = mlp.fit(trainData, trainTargets)

    classification = [model.predict(d)[0] for d in testData]

    print("\nUsing MLP:")
    showPerformance(testTargets, classification)
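
A hedged sketch of how the helpers above might be driven end to end; the feature matrix, the targets and the 80/20 split are assumptions and not part of the original code:

# Hypothetical driver: split extracted feature vectors and run the helpers above.
from sklearn.model_selection import train_test_split

def runClassifiersSketch(featureVectors, targets):
    """Split the data and evaluate SVM, Naive Bayes and MLP on the same split."""
    trainData, testData, trainTargets, testTargets = train_test_split(
        featureVectors, targets, test_size=0.2, random_state=42)
    applySVM(trainData, trainTargets, testData, testTargets)
    applyNaiveBayes2(trainData, trainTargets, testData, testTargets)  # needs non-negative features
    applyMLP(trainData, trainTargets, testData, testTargets)
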
Example #12
def testRules():
    """Uses rule based approach to classify reviews."""
    ironicIDs, regularIDs, reviews = createTestReviews()
    features, featureVectors = extractFeatures(ironicIDs + regularIDs, reviews)

    gold = {ID: reviews[ID].ironic for ID in ironicIDs + regularIDs}
    classification = ruleClassify(features, featureVectors)

    showFeatureOccurrence(features, featureVectors, gold, classification)
    showPerformance(gold, classification)
Example #13
def testRules():
    """Uses rule based approach to classify reviews."""
    ironicIDs, regularIDs, reviews = createTestReviews()
    features, featureVectors = extractFeatures(ironicIDs + regularIDs, reviews)

    gold = {ID: reviews[ID].ironic for ID in ironicIDs + regularIDs}
    classification = ruleClassify(features, featureVectors)

    showFeatureOccurrence(features, featureVectors, gold, classification)
    showPerformance(gold, classification)
def applyClassifier(class1, class2, doc_name, id_map, classifier, name,
                    trainData, trainTargets, testData, testTargets):
    """Train and classify using a Support Vector Machine."""

    # feed model with data from feature extraction.
    model = classifier.fit(trainData, trainTargets)

    trainData = []
    trainTargets = []

    # save created model for re-use.
    joblib.dump(model, "model_" + doc_name + '.pkl')

    classification = [model.predict(d)[0] for d in testData]
    testData = []

    y_pred = np.asarray(classification)
    y = np.asarray(testTargets)
    classif_rate = np.mean(y_pred.ravel() == y.ravel()) * 100

    # get IDs of 'problematic' tweets in additional file for further analysis.
    with open("info_" + doc_name + ".txt", "a") as info, open(
            "mispredicted_ids_" + doc_name + ".txt", "w") as mis, open(
                "predicted_as_" + class1 + "_" + doc_name + ".txt",
                "w") as correct, open(
                    "predicted_as_" + class2 + "_" + doc_name + ".txt",
                    "w") as wrong:
        info.write("\nUsing {0}".format(classifier) + "\n")
        print("\nUsing {0}".format(classifier))
        info.write("classifier_rate for " + name + ": " + str(classif_rate) +
                   "\n")
        print("classif_rate for %s : %f " % (name, classif_rate))

        showPerformance(doc_name, name, testTargets, classification,
                        [class1, class2])

        for i in range(len(classification)):
            # Get IDs of the mispredicted tweets:
            if classification[i] != testTargets[i]:
                mis.write(str(id_map[i]) + "\n")

            # Get all tweets predicted as class 1 - does not matter if prediction is correct
            if classification[i] == 1:
                correct.write(str(id_map[i]) + "\n")

            # Get all tweets predicted as class 2
            else:
                wrong.write(str(id_map[i]) + "\n")

        testTargets = []
        id_map = {}
        classification = []
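
Since the model is persisted with joblib.dump above, it can be reloaded later without retraining. A short hedged sketch; the helper name is hypothetical and the import may be sklearn.externals.joblib in the scikit-learn version the original code targets:

# Hypothetical sketch: reload a model dumped by applyClassifier and reuse it.
import joblib

def reloadAndClassifySketch(doc_name, newFeatureVectors):
    """Load the persisted model and classify freshly extracted feature vectors."""
    model = joblib.load("model_" + doc_name + ".pkl")
    return model.predict(newFeatureVectors)
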
Example #15
def applySingleRules(IDsFilename):
    """
    Should originally just apply one rule.
    Is now used to apply one feature to the given corpus.
    So it basically shows how often each feature occurs in ironic and regular 
    reviews.
    """
    print("Using the set at '{path}{file}'".format(path=CORPUS_PATH, 
                                                    file=IDsFilename))
    
    print("Creating reviews...(this may take a while)")
    dataSet = Corpus(IDsFilename, corpusPath=CORPUS_PATH)
    print("Loading reviews...")
#   dataSet = Corpus.loadCorpus(filename="training_set.pk")
    # dataSet = Corpus.loadCorpus(filename="training_and_validation_set.pk")


    print("Extracting features...")
    features, featureVectors = extractFeatures(dataSet.reviewIDs, 
                                                dataSet.reviews)

    showFeatureOccurrence(features, featureVectors)

    gold = dataSet.goldStandard
    
    # decisiveFeatureNames = ["Scare quotes", 
    #                         "Positive star polarity discrepancy",
    #                         "Negative star polarity discrepancy",
    #                         "Positive Ppunctuation",
    #                         "Negative Ppunctuation",
    #                         "Streak of Positive Words",
    #                         "Ellipsis and Punctuation",
    #                         "Emoticon Happy", "Emoticon Laughing", 
    #                         "Emoticon Winking", "Emotion Tongue", 
    #                         "LoLAcroym", "GrinAcronym", "Onomatopoeia",
    #                         "Interrobang"]

    decisiveFeatureNames = [f.name for f in features]

    for d in decisiveFeatureNames:
        classification = classify(features, featureVectors, [d])

        targets = []
        cls = []

        for ID, g in gold.items():
            targets.append(g)
            cls.append(classification[ID])

        print("\nClassifying by rule: ", d)

        showPerformance(targets, cls)
Example #16
def applySingleRules(IDsFilename):
    """
    Should originally just apply one rule.
    Is now used to apply one feature to the given corpus.
    So it basically shows how often each feature occurs in ironic and regular
    reviews.
    """
    print("Using the set at '{path}{file}'".format(path=CORPUS_PATH,
                                                   file=IDsFilename))

    print("Creating reviews...(this may take a while)")
    dataSet = Corpus(IDsFilename, corpusPath=CORPUS_PATH)
    print("Loading reviews...")
    #   dataSet = Corpus.loadCorpus(filename="training_set.pk")
    # dataSet = Corpus.loadCorpus(filename="training_and_validation_set.pk")

    print("Extracting features...")
    features, featureVectors = extractFeatures(dataSet.reviewIDs,
                                               dataSet.reviews)

    showFeatureOccurrence(features, featureVectors)

    gold = dataSet.goldStandard

    # decisiveFeatureNames = ["Scare quotes",
    #                         "Positive star polarity discrepancy",
    #                         "Negative star polarity discrepancy",
    #                         "Positive Ppunctuation",
    #                         "Negative Ppunctuation",
    #                         "Streak of Positive Words",
    #                         "Ellipsis and Punctuation",
    #                         "Emoticon Happy", "Emoticon Laughing",
    #                         "Emoticon Winking", "Emotion Tongue",
    #                         "LoLAcroym", "GrinAcronym", "Onomatopoeia",
    #                         "Interrobang"]

    decisiveFeatureNames = [f.name for f in features]

    for d in decisiveFeatureNames:
        classification = classify(features, featureVectors, [d])

        targets = []
        cls = []

        for ID, g in gold.items():
            targets.append(g)
            cls.append(classification[ID])

        print("\nClassifying by rule: ", d)

        showPerformance(targets, cls)
def applyClassifier(class1, class2, doc_name, id_map, classifier, name,
                    trainData, trainTargets, testData, testTargets):
    """Train and classify using the given classifier."""
    
    # feed model with data from feature extraction.
    model = classifier.fit(trainData, trainTargets)
    
    trainData = []
    trainTargets = []
    
    # save created model for re-use.
    joblib.dump(model, "model_" + doc_name + '.pkl')
    
    classification = [model.predict(d)[0] for d in testData]
    testData = []
    
    y_pred = np.asarray(classification)          
    y = np.asarray(testTargets)
    classif_rate = np.mean(y_pred.ravel() == y.ravel()) * 100
    
    # get IDs of 'problematic' tweets in additional file for further analysis.
    with open("info_" + doc_name + ".txt", "a") as info, open("mispredicted_ids_" + doc_name + ".txt", "w") as mis, open("predicted_as_" + class1 + "_" + doc_name + ".txt", "w") as correct, open("predicted_as_" + class2 + "_" + doc_name + ".txt", "w") as wrong:
        info.write("\nUsing {0}".format(classifier) + "\n")
        print("\nUsing {0}".format(classifier))
        info.write("classifier_rate for " + name + ": " + str(classif_rate) + "\n")
        print("classif_rate for %s : %f " % (name, classif_rate))
        
        showPerformance(doc_name, name, testTargets, classification, [class1, class2]) 
        
        
        for i in range(len(classification)):
            # Get IDs of the mispredicted tweets:
            if classification[i] != testTargets[i]:
                mis.write(str(id_map[i]) + "\n")
            
            # Get all tweets predicted as class 1 - does not matter if prediction is correct
            if classification[i] == 1:
                correct.write(str(id_map[i]) + "\n")
                
            # Get all tweets predicted as class 2
            else:
                wrong.write(str(id_map[i]) + "\n")
        
        testTargets = []
        id_map = {}
        classification = []