示例#1
0
class NaiveBayesClassifier(object):
    """Interactive driver that ties together dictionary, preprocessor,
    vectorizer and classifier for labelling forum posts.

    Category numbers stored in ``self.categories`` are 1-based positions in
    the module-level ``label_list`` (they are turned back into labels with
    ``label_list[category - 1]``).
    """

    def __init__(self):
        self.dictionary = Dictionary()
        self.classifier = Classifier()
        self.preprocessor = Preprocessor()
        # True only after preprocess_and_fit() has trained the classifier.
        self.fit = False
        # Prompts the user on stdin and fills self.categories.
        self.set_categories()

    @staticmethod
    def _ask_yes_no(prompt):
        """Repeat ``prompt`` until the user answers y or n; return True for y."""
        while True:
            choice = input(prompt).lower()
            if choice in ("y", "n"):
                return choice == "y"
            print("You must input y or n.")

    def set_categories(self):
        """Ask the user which labels of ``label_list`` to include.

        Clears the dictionary, stores the chosen 1-based category numbers in
        ``self.categories`` and prints a summary of the selection.  The whole
        questionnaire is repeated until at least one category is chosen.

        Raises:
            ValueError: if ``label_list`` is empty, since no selection could
                ever succeed.
        """
        if not label_list:
            raise ValueError("label_list is empty; there is nothing to select")

        # The dictionary is cleared below, so any earlier fit no longer
        # matches it; force a new preprocess_and_fit() before classify().
        self.fit = False

        total = len(label_list)
        # A loop (rather than a recursive retry) guarantees the summary
        # below is printed exactly once, however many attempts were needed.
        while True:
            self.dictionary.clear()
            self.categories = []
            print("Dictionary cleared")
            print("\nDecide which categories to use:")
            for number, label in enumerate(label_list, start=1):
                prompt = "Include category: {}/{} {}? (y/n)\n".format(
                    number, total, label)
                if self._ask_yes_no(prompt):
                    self.categories.append(number)
            if self.categories:
                break
            print("Error! At least one category must be selected!")

        print("You have selected the following categories:\n")
        for category in self.categories:
            print(str(category) + ". " + label_list[category - 1])

    def extract(self):
        """Extract forum posts from the .json files generated by Scraper.

        For every selected category N, ``flashbackN.json`` is extracted into
        ``extractedN.txt`` and the result is handed to the dataset divider.
        """
        for category in self.categories:
            source_path = 'flashback' + str(category) + '.json'
            extracted_path = 'extracted' + str(category) + '.txt'
            # NOTE(review): the return value is presumably the number of
            # extracted lines; it is simply forwarded to Divider.divide.
            extracted = Extractor.extract(source_path, extracted_path)
            dataset_divider.Divider.divide(extracted_path, extracted)

    def classify(self):
        """Classify unknown forum posts read from a user-supplied file.

        Does nothing but print a message if the classifier has not been
        fitted.  Otherwise asks for a file name (re-asking when the file
        does not exist, unless the user types 'm' to go back) and writes one
        predicted label per line to ``result.txt``.
        """
        if not self.fit:
            print("Fitting must be performed before classifying")
            return

        vectorizer = Vectorizer(self.dictionary.dictionary)
        while True:
            input_file = input(
                "Enter the name of the .txt file containing the unknown posts (including file-ending: "
            )
            try:
                # NOTE(review): an open file object is passed here, while
                # preprocess_and_fit() passes file names -- confirm that
                # Preprocessor.preprocess accepts both.
                with open(input_file, "r") as file:
                    vectors = vectorizer.vectorize(
                        self.preprocessor.preprocess(file))
                break
            except FileNotFoundError:
                answer = input("File not found. Press enter to try again or type 'm' and press enter to return to menu.").lower()
                if answer == "m":
                    return

        with open("result.txt", "w") as result_file:
            # NOTE(review): predictions are used directly as indices into
            # label_list, whereas self.categories is 1-based -- confirm what
            # Classifier.classify returns.
            for line in self.classifier.classify(vectors):
                result_file.write((label_list[line] + "\n"))
        print(
            "Result saved in result.txt. " +
            "The predicted label of each post is printed on the corresponding line of the document."
        )

    def preprocess_and_fit(self):
        """Preprocess data, index words, vectorize posts, then train and
        evaluate the classifier.

        Reads ``trainingN.txt`` and ``testingN.txt`` for every selected
        category N and sets ``self.fit`` once training has completed.
        """
        processed = []
        processed_test = []
        # Training and testing files are preprocessed alternately, category
        # by category, to keep the preprocessor's call order unchanged.
        for category in self.categories:
            processed.append(
                self.preprocessor.preprocess('training' + str(category) +
                                             ".txt"))
            processed_test.append(
                self.preprocessor.preprocess('testing' + str(category) +
                                             ".txt"))

        # Word indexing: only the training data feeds the dictionary.
        for category in processed:
            self.dictionary.index_words(category)
        print("Words indexed. Dictionary size: ",
              len(self.dictionary.dictionary), " words")

        # Vectorization, using the dictionary that was just built.
        vectorizer = Vectorizer(self.dictionary.dictionary)
        vector_start = time.time()
        print("Vectorizing...")
        training_vectors = [
            vectorizer.vectorize(category) for category in processed
        ]
        testing_vectors = [
            vectorizer.vectorize(category) for category in processed_test
        ]
        vector_time = time.time() - vector_start
        print("Vectorization completed in ", ("%.2f" % vector_time), "seconds")

        # Training and evaluation
        self.classifier.train(training_vectors)
        self.fit = True
        self.classifier.evaluate(testing_vectors)
示例#2
0
#!/usr/bin/env python3
#-*- coding: utf-8 -*-

from dictionary import Dictionary
import sys


# Expect exactly two arguments: the word-list file and the dictionary path.
if len(sys.argv) != 3:
    sys.exit("Usage: generate_dictionary.py <input file> <output dictionary>")

with open(sys.argv[1], 'r') as input_file:
    d = Dictionary()
    d.open(sys.argv[2])
    try:
        # NOTE(review): clear() presumably drops whatever the opened
        # dictionary already contained, so the result holds only the words
        # from the input file.
        d.clear()

        # One word per line; surrounding whitespace is stripped and blank
        # lines are skipped so no empty-string entry is added.
        for line in input_file:
            word = line.strip()
            if word:
                d.add_word(word)

        d.save()
    finally:
        # Close the dictionary even if reading or adding a word fails.
        d.close()