def __init__(self, wd=None, save=None):
    """Initialize per-subreddit classifier state.

    Args:
        wd: Optional working-directory path of the form
            ".../<subreddit>/<split>". When given, the process chdirs into
            it, the subreddit name is taken from the second-to-last path
            component, and the popularity cutoff is computed from the
            directory's contents via compute_cutoff().
        save: Optional answer string; any value starting with "y"/"Y"
            enables saving of results.
    """
    if wd:
        os.chdir(wd)
        # Path layout assumed to be ".../<subreddit>/<split>", so the
        # subreddit name is the second-to-last component — TODO confirm
        # against callers.
        self.reddit = str(wd).split("/")[-2]
        self.cutoff = compute_cutoff(wd)
    else:
        # No working directory supplied: set the wd-derived fields to None
        # so later access fails with an obvious value rather than an
        # AttributeError on a missing attribute.
        self.reddit = None
        self.cutoff = None
    # Text buckets populated during classification.
    self.popular_texts = []
    self.unpopular_texts = []
    self.corpus = []
    self.all_texts = []
    # Per-word frequency table; missing words count as 0.
    self.word_counts = defaultdict(int)
    self.vectorizer = None
    # Treat any answer beginning with "y" (case-insensitive) as yes;
    # everything else — including None — leaves saving disabled.
    self.save = bool(save) and save.lower().startswith("y")
print 'F1 Score:', F1score print 'Confusion matrix:' print confusion return F1score if __name__ == "__main__": parser = argparse.ArgumentParser(description="Builds a Naive Bayes model for classification.") parser.add_argument("filepath", help="Argument must be the filepath where the text files are located") parser.add_argument("topic_type", help="topic_type is either bow, tfidf or lda") parser.add_argument("valid_or_test", help="Either v or t to test against validation or test set") parser.add_argument("--num_topics", default=10, help="The amount of topics to be grabbed from the LDA model") args = parser.parse_args() cutoff = compute_cutoff(args.filepath) print "Classifying the initial data." classify_initial_data(args.filepath, cutoff, SOURCES) training_data = make_data(SOURCES, args.filepath) if args.valid_or_test[0].lower() == "v": print "Classifying the validation data." validation_filepath = args.filepath + "/validation" classify_initial_data(validation_filepath, cutoff, VALIDATION) validation_data = make_data(VALIDATION, validation_filepath) create_Naive_Bayes(training_data, validation_data, args.topic_type, args.num_topics) else: print "Classifying the testing data." testing_filepath = args.filepath + "/testing"