class NaiveBayesClassifier(object):
    """Interactive Naive Bayes text classifier for forum posts.

    Wires together the project's Dictionary, Classifier, Preprocessor,
    Vectorizer, Extractor and Divider components and drives them through
    console prompts. Typical flow: construct (prompts for categories),
    extract(), preprocess_and_fit(), then classify().
    """

    def __init__(self):
        self.dictionary = Dictionary()
        self.classifier = Classifier()
        self.preprocessor = Preprocessor()
        self.set_categories()
        # Set to True by preprocess_and_fit(); classify() refuses to run
        # until fitting has happened.
        self.fit = False

    def set_categories(self):
        """Ask the user which labels from ``label_list`` to include.

        Clears the dictionary and fills ``self.categories`` with the
        1-based indices of the chosen labels. Keeps re-prompting until at
        least one category has been selected.
        """
        self.dictionary.clear()
        print("Dictionary cleared")
        # Loop instead of recursing on an empty selection: avoids a
        # redundant stack frame and a doubled summary printout.
        while True:
            self.categories = []
            print("\nDecide which categories to use:")
            # enumerate gives the 1-based position directly; the original
            # label_list.index() lookup was O(n^2) and wrong on duplicates.
            for position, label in enumerate(label_list, start=1):
                choice = ""
                while choice not in ("y", "n"):
                    choice = input(
                        "Include category: " + str(position) + "/"
                        + str(len(label_list)) + " " + label
                        + "? (y/n)\n").lower()
                    # Validity check belongs inside the prompt loop;
                    # after the loop it could never fire.
                    if choice not in ("y", "n"):
                        print("You must input y or n.")
                if choice == "y":
                    self.categories.append(position)
            if self.categories:
                break
            print("Error! At least one category must be selected!")
        print("You have selected the following categories:\n")
        for category in self.categories:
            print(str(category) + ". " + label_list[category - 1])

    def extract(self):
        """Extract forum posts from Scraper-generated .json files.

        For every selected category N, extracts flashbackN.json into
        extractedN.txt, then immediately divides that category's posts
        into training/testing sets.
        """
        for category in self.categories:
            # Extractor.extract returns the line count that Divider needs;
            # act on it per category instead of collecting a throwaway list.
            line_count = Extractor.extract(
                'flashback' + str(category) + '.json',
                'extracted' + str(category) + '.txt')
            dataset_divider.Divider.divide(
                'extracted' + str(category) + '.txt', line_count)

    def classify(self):
        """Classify unknown forum posts from a user-supplied .txt file.

        Requires a fitted classifier. Writes one predicted label per post
        to result.txt, line-aligned with the input.
        """
        if not self.fit:
            print("Fitting must be performed before classifying")
            return
        vectorizer = Vectorizer(self.dictionary.dictionary)
        input_file = input(
            "Enter the name of the .txt file containing the unknown posts "
            "(including file-ending): ")
        try:
            with open(input_file, "r") as file:
                vectors = vectorizer.vectorize(
                    self.preprocessor.preprocess(file))
        except FileNotFoundError:
            # Offer a retry or a way back to the menu instead of crashing.
            answer = input(
                "File not found. Press enter to try again or type 'm' "
                "and press enter to return to menu.").lower()
            if answer == "m":
                return
            self.classify()
            return
        with open("result.txt", "w") as result_file:
            for line in self.classifier.classify(vectors):
                result_file.write(label_list[line] + "\n")
        print(
            "Result saved in result.txt. "
            + "The predicted label of each post is printed on the "
            "corresponding line of the document.")

    def preprocess_and_fit(self):
        """Preprocess data, index all words, vectorize posts, then train
        and evaluate the classifier.
        """
        processed = []
        processed_test = []
        for category in self.categories:
            processed.append(self.preprocessor.preprocess(
                'training' + str(category) + ".txt"))
            processed_test.append(self.preprocessor.preprocess(
                'testing' + str(category) + ".txt"))

        # Word indexing: the dictionary is built from training data only,
        # so the test set cannot leak vocabulary into the model.
        for category in processed:
            self.dictionary.index_words(category)
        print("Words indexed. Dictionary size: ",
              len(self.dictionary.dictionary), " words")

        # Vectorization
        vectorizer = Vectorizer(self.dictionary.dictionary)
        vector_start = time.time()
        print("Vectorizing...")
        training_vectors = [vectorizer.vectorize(c) for c in processed]
        testing_vectors = [vectorizer.vectorize(c) for c in processed_test]
        vector_time = time.time() - vector_start
        print("Vectorization completed in ", ("%.2f" % vector_time),
              "seconds")

        # Training and evaluation
        self.classifier.train(training_vectors)
        self.fit = True
        self.classifier.evaluate(testing_vectors)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Build a persistent word dictionary from a plain-text word list.

Reads one word per line from <input file>, strips surrounding whitespace,
and stores the words into <output dictionary> via the project's Dictionary
class, replacing any previous contents.
"""
from dictionary import Dictionary
import sys

if len(sys.argv) != 3:
    # NOTE: fixed typo in the usage message ("ouput" -> "output").
    sys.exit("Usage: generate_dictionary.py <input file> <output dictionary>")

with open(sys.argv[1], 'r') as input_file:
    d = Dictionary()
    d.open(sys.argv[2])
    # Clear first so reruns produce a fresh dictionary, not an append.
    d.clear()
    for word in input_file:
        d.add_word(word.strip())
    d.save()
    d.close()