Пример #1
0
def kfold_average_score(learner, files, dirs, k=5, min_word_len=2, min_freq=10, feature_size=160, weight='tfidf', sw=True):
    """Run k-fold cross-validation and return the mean F1 score.

    For each fold the feature extractor is fitted on the training split only,
    then the learner is trained on that split and scored on the held-out one.

    :param learner: estimator exposing fit() / predict()
    :param files: document file names
    :param dirs: class label for each document
    :param k: number of folds
    :param min_word_len: minimum word length kept by Features
    :param min_freq: minimum word frequency kept by Features
    :param feature_size: vocabulary size used by Features
    :param weight: term-weighting scheme ('tf' or 'tfidf')
    :param sw: stop-word flag forwarded to Features
    :return: mean F1 score across the k folds
    """
    splitter = KFold(files, dirs, k)
    fold_scores = []
    for fold in range(k):
        train_files, train_labels, test_files, test_labels = splitter.kth(fold)
        # Fit features on the training split only to avoid test leakage.
        features = Features(train_files, train_labels, min_word_len, min_freq, feature_size, sw)
        train_vectors = [features.get_x_vector(name, weight) for name in train_files]
        learner.fit(train_vectors, features.y_transform(train_labels))
        test_vectors = [features.get_x_vector(name, weight) for name in test_files]
        predicted = learner.predict(test_vectors).tolist()
        fold_scores.append(f1_score(features.y_transform(test_labels), predicted))
    return mean(fold_scores)
Пример #2
0
class NaiveBayes:
    """Multinomial naive Bayes text classifier with Laplace (add-one) smoothing.

    All probabilities are stored as log2 values so that per-document scores
    can be accumulated by addition instead of multiplication, which avoids
    floating-point underflow on long documents.
    """

    def __init__(self):
        self.keys = {}  # set of class labels observed during fit
        self.Pv = {}  # log2 prior probability of every class
        self.Pwv = {}  # conditional log-probabilities: Pwv[y][x] = log2 P(x|y)
        self.features = None  # Features object built during fit

    def fit(self, x, y, num=500):
        """Estimate class priors (Pv) and smoothed word likelihoods (Pwv).

        :param x: training documents (file names consumed by Features)
        :param y: class label for each document
        :param num: vocabulary / feature size passed to Features
        :return: None
        """
        self.features = Features(x, y, 3, 10, num)
        N = len(y)
        self.keys = set(y)
        docs = Counter(y)  # number of training documents per class
        num_of_words = len(self.features.features)
        for j in docs:
            # log prior: fraction of the training documents in class j
            self.Pv[j] = log2(docs[j] / N)
            self.Pwv[j] = {}
            word_nums = sum(self.features.class_word_count[j].values())
            for word in self.features.features:
                # Laplace (add-one) smoothing: a word unseen in class j
                # still gets a finite log-probability, never log2(0).
                self.Pwv[j][word] = log2((self.features.class_word_count[j][word] + 1)
                                         / (word_nums + num_of_words))

    def predict_list(self, names):
        """Predict a class label for each file name.

        :param names: iterable of file names
        :return: list of predicted class labels, one per input name
        """
        # Local renamed from `list` — the original shadowed the builtin.
        return [self.predict(name) for name in names]

    def predict(self, name):
        """Predict the most likely class for a single file.

        :param name: filename
        :return: the class label with the highest posterior log-probability
        """
        words = self.features.read_file(name)
        max_prob = float('-inf')
        result = ''
        for cls in self.Pv:
            tmp_prob = self.Pv[cls]
            for word in words:
                word = word.lower()
                # Words outside the fitted vocabulary are simply ignored.
                if word in self.Pwv[cls]:
                    tmp_prob += self.Pwv[cls][word]
            if tmp_prob > max_prob:
                max_prob = tmp_prob
                result = cls
        return result
Пример #3
0
 def fit(self, x, y, num=500):
     """
     Estimate class priors (Pv) and smoothed word likelihoods (Pwv).

     Both are stored as log2 probabilities so prediction can accumulate
     them by addition without floating-point underflow.
     :param x: training documents (file names consumed by Features)
     :param y: class label for each document
     :param num: vocabulary / feature size passed to Features
     :return: None
     """
     self.features = Features(x, y, 3, 10, num)
     # vocabularies = fe.read_in_all_words(x, 3, 5)
     N = len(y)
     self.keys = set(y)
     docs = Counter(y)  # number of training documents per class
     num_of_words = len(self.features.features)
     for j in docs:
         # log prior: fraction of the training documents in class j
         self.Pv[j] = log2(docs[j] / N)
         self.Pwv[j] = {}
         word_nums = sum(self.features.class_word_count[j].values())
         for word in self.features.features:
             # print(word)
             # Laplace (add-one) smoothing keeps unseen counts out of log2(0)
             self.Pwv[j][word] = log2((self.features.class_word_count[j][word] + 1) / \
                                   (word_nums + num_of_words))
Пример #4
0
        for f_name in test_X:
            x.append(features.get_x_vector(f_name, weight))
        scores[ii] = f1_score(features.y_transform(test_Y), learner.predict(x).tolist())
    return mean(scores)


if __name__ == '__main__':
    # Command line: one positional argument naming the data directory.
    arg_parser = argparse.ArgumentParser("Read in data directory")
    arg_parser.add_argument('data_dir')
    print('Reading Data Path')
    files, dirs = fe.get_file_name_and_path(arg_parser.parse_args().data_dir)
    print('Spliting Train-Test Set ')
    train_x, train_y, test_x, test_y = train_test_split(files, dirs, 0.25)

    learner = svm.SVC(kernel='rbf', C=1)
    # Fit the feature extractor on the training split only.
    features = Features(train_x, train_y, 3, 0, 160)
    train_vectors = [features.get_x_vector(name, 'tfidf') for name in train_x]
    learner.fit(train_vectors, features.y_transform(train_y))

    test_vectors = [features.get_x_vector(name, 'tfidf') for name in test_x]
    print('Score:', f1_score(features.y_transform(test_y), learner.predict(test_vectors).tolist()))

    # Experiment left for reference: compare 'tf' vs 'tfidf' weighting.
    # print('Test if "TFIDF" is better than "TF"')
    # print('TF:', kfold_average_score(learner, train_x, train_y, weight='tf', feature_size=2))
    # print('TFIDF:', kfold_average_score(learner, train_x, train_y, weight='tfidf', feature_size=2))