def main():
    """Train one Classifier per sentiment and score every test entry.

    Builds three Classifier objects from ``training_negative.txt``,
    ``training_neutral.txt`` and ``training_positive.txt`` located in the
    ``resources`` directory under ``sys.path[0]``, then reads word lists from
    ``test_set.txt`` through a Parser until an empty list is returned.  For
    each test entry the absolute value of every classifier's
    ``classification_probability`` result is printed with a ``%`` suffix.
    """
    # Join every path component with os.path.join instead of appending to a
    # hard-coded "resources\\" prefix, so the files are also found on
    # platforms whose path separator is not a backslash.
    res_dir = os.path.join(sys.path[0], "resources")

    # load training files into Classifier
    neg_classif = Classifier(os.path.join(res_dir, "training_negative.txt"))
    neu_classif = Classifier(os.path.join(res_dir, "training_neutral.txt"))
    pos_classif = Classifier(os.path.join(res_dir, "training_positive.txt"))
    total_entries = (
        neu_classif.get_entries()
        + neg_classif.get_entries()
        + pos_classif.get_entries()
    )

    # load test files
    test_parser = Parser(os.path.join(res_dir, "test_set.txt"))

    counter = 1
    while True:
        word_list = test_parser.giveWordList()
        if not word_list:
            # An empty word list is the Parser's end-of-input signal here.
            break

        neg_p = neg_classif.classification_probability(word_list, total_entries)
        neu_p = neu_classif.classification_probability(word_list, total_entries)
        pos_p = pos_classif.classification_probability(word_list, total_entries)

        print("Test " + str(counter) + ":\n")
        for label, score in (
            ("Negative", neg_p),
            ("Neutral", neu_p),
            ("Positive", pos_p),
        ):
            # The score is shown as a magnitude only; its sign is discarded.
            print("\t" + label + ": " + str(math.fabs(score)) + "%\n")
        counter += 1
class Classifier(object):
    """Statistical data gathered from one training text.

    One Classifier object is created per classifiable category.  On
    construction it reads every entry from a Parser opened on the training
    file and records how often each word occurs.
    """

    def __init__(self, training_file_path):
        """Parse ``training_file_path`` and learn its word counts."""
        self._dict = {}          # word -> number of occurrences seen so far
        self._entries = 0        # number of non-empty word lists consumed
        self._total_words = 0    # total number of word occurrences recorded
        self.parser = Parser(training_file_path)
        self._learn()

    def _learn(self):
        """Consume word lists from the parser until an empty one is returned.

        Every word of every entry is counted via ``_add_word`` and the entry
        counter is incremented once per non-empty word list.
        """
        while True:
            word_list = self.parser.giveWordList()
            if not word_list:
                return
            for word in word_list:
                self._add_word(word)
            self._entries += 1

    def _add_word(self, word):
        """Add ``word`` to the dictionary, or increment its count if present."""
        self._dict[word] = self._dict.get(word, 0) + 1
        self._total_words += 1

    def likelihood(self, word):
        """Return the conditional likelihood estimate of ``word``, p(xi | Ck).

        Computed as the word's occurrence count divided by the number of
        training entries; 0 is returned for a word never seen in training.

        NOTE(review): the denominator is the entry count, not
        ``_total_words``, so a word occurring more than once per entry on
        average yields a value above 1 -- confirm this is intended.
        """
        occurrences = self._dict.get(word, 0)
        if occurrences == 0:
            return 0
        return occurrences / self._entries

    def classification_probability(self, word_list, total_entries):
        """Return this category's score for the given entry.

        word_list: list of words, one entry from the test Parser's
            giveWordList()
        total_entries: int, sum of all Classifiers' entry counts

        The value computed is::

            (entries / total_entries) * (1 + SUM_i log(likelihood(x_i)))

        where words with a likelihood of 0 (unseen in training) are skipped.
        When ``total_entries`` is 0 there is no training data to derive a
        prior from, so 0.0 is returned instead of dividing by zero.

        NOTE(review): this is a relative score, not a normalised probability;
        the usual log-space naive Bayes form is
        log p(Ck) + SUM_i log p(x_i | Ck).
        """
        if total_entries == 0:
            return 0.0

        prob_c = self._entries / total_entries
        total_sum = 1
        for word in word_list:
            # Evaluate the likelihood once per word and reuse it for the log.
            word_likelihood = self.likelihood(word)
            if word_likelihood != 0:
                total_sum += math.log(word_likelihood)
        return prob_c * total_sum

    def get_entries(self):
        """Return the number of training entries learned."""
        return self._entries