class Glados(object): def __init__(self, data_filename): self.logger = logging.getLogger("Glados") self.file = File() self.parser = SentenceParser() self.data_filename = data_filename self.classifier = self.train_and_get_classifier(self.data_filename) """ Public api input: user question text output: answer """ def get_help(self, question): features = self.parser.extract_feature(question) print('features', features) answer = self.classifier.classify(features) prob = self.classifier.prob_classify(features).prob(answer) # self.logger.info('features for question are %s', features) print('Answer:', answer, "(", prob, ")") response = dict(question=question, answer=answer, probability=prob) return response def train_and_get_classifier(self, data_filename): split_ratio = 0.8 data = self.parser.get_content(data_filename) data_set = self.parser.extract_feature_from_doc(data) random.shuffle(data_set) data_length = len(data) train_split = int(data_length * split_ratio) training_data = data_set[:train_split] self.error_analysis_for_text_input_to_feature_extraction(data) self.error_analysis_for_features_to_predicted_answer(training_data) test_data = data_set[train_split:] # self.logger.debug('\n'.join([str(x) for x in data_set])) classifier, classifier_name, test_set_accuracy, training_set_accuracy = self.train_using_naive_bayes( training_data, test_data) self.file.append( get_module_path("output/accuracy.txt"), "\n%s\t\t%s\t\t\t%.8f\t\t%.8f" % (classifier_name, data_length, training_set_accuracy, test_set_accuracy)) return classifier def error_analysis_for_text_input_to_feature_extraction(self, data): text_features_answer = [] for (text, category, answer) in data: sent_features = self.parser.extract_feature(text) text_features_answer.append((text, sent_features, answer)) # logging.debug(datetime.datetime.now()) # print('>>>>>>>>TEXT>>>>>>>>>FEATURES>>>>>>ANSWER:\n ', str(json.dumps(text_features_answer))) def error_analysis_for_features_to_predicted_answer(self, test_data): classifier = nltk.NaiveBayesClassifier.train(test_data) classifier.show_most_informative_features() errors = [] for (feature, actual_output) in test_data: guess = classifier.classify(feature) if (guess != actual_output): errors.append((feature, actual_output, guess)) print( '>>>>>FEATURE>>>>>>>>>ACUTUAL OUTPUT><<<<<<<<PREDICTION<<<<<<<<<<<<<<<' ) print('Errors:', json.dumps(errors)) def train_using_naive_bayes(self, training_data, test_data): classifier = nltk.NaiveBayesClassifier.train(training_data) classifier_name = type(classifier).__name__ training_set_accuracy = nltk.classify.accuracy(classifier, training_data) test_set_accuracy = nltk.classify.accuracy(classifier, test_data) return classifier, classifier_name, test_set_accuracy, training_set_accuracy