def kfold_average_score(learner, files, dirs, k=5, min_word_len=2, min_freq=10,
                        feature_size=160, weight='tfidf', sw=True):
    """Return the mean F1 score of `learner` under k-fold cross validation.

    :param learner: estimator with sklearn-style fit/predict
    :param files: document file names
    :param dirs: class label (directory) for each document
    :param k: number of folds
    :param min_word_len: minimum word length passed to Features
    :param min_freq: minimum word frequency passed to Features
    :param feature_size: number of feature words to select
    :param weight: 'tf' or 'tfidf' weighting for the document vectors
    :param sw: stop-word flag forwarded to Features
    :return: mean of the per-fold F1 scores
    """
    scores = [0] * k
    kfold = KFold(files, dirs, k)
    for ii in range(k):
        train_X, train_Y, test_X, test_Y = kfold.kth(ii)
        # Build features from the training fold only, so the test fold
        # never leaks into vocabulary selection.
        features = Features(train_X, train_Y, min_word_len, min_freq,
                            feature_size, sw)
        # Separate names for the two vector sets (the old code reused `x`).
        train_vecs = [features.get_x_vector(f_name, weight) for f_name in train_X]
        learner.fit(train_vecs, features.y_transform(train_Y))
        test_vecs = [features.get_x_vector(f_name, weight) for f_name in test_X]
        scores[ii] = f1_score(features.y_transform(test_Y),
                              learner.predict(test_vecs).tolist())
    return mean(scores)
class NaiveBayes:
    """Multinomial naive Bayes text classifier over Features-selected words.

    Probabilities are stored as log2 values so per-document scores are sums,
    not products (avoids float underflow on long documents).
    """

    def __init__(self):
        self.keys = {}     # class labels; replaced by set(y) in fit()
        self.Pv = {}       # log2 prior probability of every classification
        self.Pwv = {}      # conditional log-probs: Pwv[y][x] represents log2 P(x|y)
        self.features = None  # Features object, built in fit()

    def fit(self, x, y, num=500):
        """
        fit to get the probabilities
        :param x: docs
        :param y: classification
        :param num: feature vocabulary size passed to Features
        :return: None
        """
        self.features = Features(x, y, 3, 10, num)
        N = len(y)
        self.keys = set(y)
        docs = Counter(y)
        num_of_words = len(self.features.features)
        for j in docs:
            self.Pv[j] = log2(docs[j] / N)
            self.Pwv[j] = {}
            word_nums = sum(self.features.class_word_count[j].values())
            for word in self.features.features:
                # Laplace (add-one) smoothing keeps log2 defined for words
                # that never occur in class j.
                self.Pwv[j][word] = log2(
                    (self.features.class_word_count[j][word] + 1)
                    / (word_nums + num_of_words))

    def predict_list(self, names):
        """Predict a class for each file name; returns a list of labels."""
        # Fixed: the previous version accumulated into a local named `list`,
        # shadowing the builtin.
        return [self.predict(name) for name in names]

    def predict(self, name):
        """
        predict a file's class
        :param name: filename
        :return: class we predicted
        """
        words = self.features.read_file(name)
        max_prob = float('-inf')
        result = ''
        for cls in self.Pv:
            # Start from the class log-prior, then add log-likelihoods of the
            # words that are in the feature vocabulary for this class.
            tmp_prob = self.Pv[cls]
            for word in words:
                word = word.lower()
                if word in self.Pwv[cls]:
                    tmp_prob += self.Pwv[cls][word]
            if tmp_prob > max_prob:
                max_prob = tmp_prob
                result = cls
        return result
def fit(self, x, y, num=500):
    """Estimate log2 class priors and smoothed word likelihoods.

    :param x: training documents
    :param y: class label for each document
    :param num: feature vocabulary size handed to Features
    :return: None
    """
    self.features = Features(x, y, 3, 10, num)
    total_docs = len(y)
    self.keys = set(y)
    label_counts = Counter(y)
    vocab_size = len(self.features.features)
    for label, doc_count in label_counts.items():
        # log2 of the class prior P(v).
        self.Pv[label] = log2(doc_count / total_docs)
        words_in_class = sum(self.features.class_word_count[label].values())
        # Add-one-smoothed log2 P(w|v) for every selected feature word.
        self.Pwv[label] = {
            word: log2((self.features.class_word_count[label][word] + 1)
                       / (words_in_class + vocab_size))
            for word in self.features.features
        }
# NOTE(review): the next four statements appear to be a duplicated tail of
# kfold_average_score (the same code exists inside that function); here
# `test_X`, `x`, `scores`, `ii`, `features`, `weight`, `learner` and the bare
# `return` have no visible enclosing function — confirm against the original
# file layout before relying on this span.
for f_name in test_X:
    x.append(features.get_x_vector(f_name, weight))
scores[ii] = f1_score(features.y_transform(test_Y), learner.predict(x).tolist())
return mean(scores)


if __name__ == '__main__':
    # Command-line entry point: read a corpus directory, train an RBF-kernel
    # SVM on a 75/25 train/test split, and print the F1 score on the test set.
    parser = argparse.ArgumentParser("Read in data directory")
    parser.add_argument('data_dir')
    print('Reading Data Path')
    files, dirs = fe.get_file_name_and_path(parser.parse_args().data_dir)
    print('Spliting Train-Test Set ')
    train_x, train_y, test_x, test_y = train_test_split(files, dirs, 0.25)
    learner = svm.SVC(kernel='rbf', C=1)
    # min_freq is 0 here, while kfold_average_score defaults to 10 —
    # presumably intentional for the final run; verify.
    features = Features(train_x, train_y, 3, 0, 160)
    # Training-document feature vectors.
    x = []
    for f_name in train_x:
        x.append(features.get_x_vector(f_name, 'tfidf'))
    learner.fit(x, features.y_transform(train_y))
    # Reuse `x` for the test-document vectors.
    x = []
    for f_name in test_x:
        x.append(features.get_x_vector(f_name, 'tfidf'))
    print('Score:', f1_score(features.y_transform(test_y), learner.predict(x).tolist()))
    # print('Test if "TFIDF" is better than "TF"')
    # print('TF:', kfold_average_score(learner, train_x, train_y, weight='tf',feature_size=2))
    # print('TFIDF:', kfold_average_score(learner, train_x, train_y, weight='tfidf',feature_size=2))
    # #