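# Module-level setup (a minimal sketch of what the original module is assumed
# to define elsewhere). `r` is assumed to be a compiled regex used to split
# lines into words, and `stopwords` an assumed stop-word set -- adjust both to
# match the real project.
import re
from collections import Counter
from heapq import heappushpop
from math import log2

from sklearn.preprocessing import LabelEncoder

r = re.compile(r'\W+')  # assumed: split on runs of non-word characters
stopwords = set()       # assumed: fill with a real stop-word list (e.g. from NLTK)
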
class Features:
    def __init__(self, x, y, min_word_length=0, min_word_freq=0, num_of_features_of_each_class=100, sw=True):
        self.weights = {'tf', 'tfidf'}  # supported weighting schemes
        self.num_of_docs = len(x)
        self.classes = set(y)
        word_count = self.read_in_all_words(x, min_word_length, min_word_freq, sw)
        self.class_word_count = self.get_class_word_count(x, y, word_count)
        self.features = self.select_features(word_count, self.class_word_count, num_of_features_of_each_class)
        self.x_le = LabelEncoder().fit(list(self.features))  # LabelEncoder expects a sequence, not a dict
        self.y_le = LabelEncoder().fit(y)

    def read_in_all_words(self, x, min_word_len=0, min_to_count=0, sw=True):
        """
        Build the candidate vocabulary from all documents.

        :param x: a list of file paths to read
        :param min_word_len: ignore words shorter than this
        :param min_to_count: ignore words whose total count is below this
        :param sw: if True, filter out stop words
        :return: a Counter mapping word -> [total count, document frequency]
        """
        stopwords.add("")  # drop the empty tokens produced by the split
        vocabularies = Counter()  # every word seen, with [total count, document frequency]
        voc = Counter()  # the words that pass the frequency threshold
        for name in x:
            with open(name, 'r', errors='ignore') as file:  # some files have decoding errors
                word_mark = set()  # words already counted for this document
                for line in file:
                    for word in r.split(line):  # strip the punctuation
                        word = word.lower()
                        if (not sw or word not in stopwords) and len(word) >= min_word_len:  # ignore words that are too short
                            if word not in vocabularies:
                                vocabularies[word] = [1, 1]
                            else:
                                vocabularies[word][0] += 1
                                if word not in word_mark:
                                    vocabularies[word][1] += 1  # first occurrence in this document
                            word_mark.add(word)
                            if vocabularies[word][0] >= min_to_count:  # ignore low-frequency words
                                voc[word] = vocabularies[word]
        return voc

    @staticmethod
    def cal_chi_square(n11, n10, n01, n00):
        """Chi-square statistic for the 2x2 term/class contingency table."""
        denominator = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00)
        if denominator == 0:  # degenerate table: the term carries no information
            return 0
        return (n11 + n10 + n01 + n00) * (n11 * n00 - n10 * n01) ** 2 / denominator
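
    # A minimal worked illustration with hypothetical counts (not taken from
    # the original project). For one term/class pair:
    #   n11 = 4   occurrences of the term inside the class
    #   n10 = 1   occurrences of the term outside the class
    #   n01 = 2   occurrences of other terms inside the class
    #   n00 = 13  occurrences of other terms outside the class
    # Features.cal_chi_square(4, 1, 2, 13) is roughly 7.94, i.e. the term and
    # the class co-occur far more often than independence would predict.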

    def get_class_word_count(self, x, y, voc=None):
        """
        Count word frequencies per class.

        :param x: a list of file paths
        :param y: the class labels corresponding to x
        :param voc: optional vocabulary; if given, only its words are counted
        :return: a dict mapping each class to a Counter of word frequencies
        """
        class_word_count = {key: Counter() for key in set(y)}
        for ii in range(len(x)):
            with open(x[ii], 'r', errors='ignore') as file:
                for line in file:
                    for word in r.split(line):
                        word = word.lower()
                        if not voc or word in voc:
                            class_word_count[y[ii]][word] += 1
        return class_word_count

    def select_features(self, word_count, class_word_count, num):
        """Select the top `num` features per class by chi-square score.

        :param word_count: Counter mapping word -> [total count, document frequency]
        :param class_word_count: dict mapping class -> Counter of word frequencies
        :param num: number of features to keep for each class
        :return: a dict mapping each selected word to its document frequency
        """
        features = {}
        total_word_count = {}
        for cls in self.classes:
            features[cls] = [(0, '')] * num  # min-heap of (score, word), fixed size num
            total_word_count[cls] = sum(class_word_count[cls].values())
        for word in word_count:
            for cls in self.classes:
                n11 = class_word_count[cls][word]
                n10 = word_count[word][0] - n11  # this word, outside the class
                n01 = total_word_count[cls] - n11  # other words, inside the class
                n00 = 0  # other words, outside the class
                for c in self.classes:
                    if c != cls:
                        n00 += total_word_count[c] - class_word_count[c][word]
                chi_s = Features.cal_chi_square(n11, n10, n01, n00)
                if chi_s > features[cls][0][0]:  # better than the worst score kept so far
                    heappushpop(features[cls], (chi_s, word))
        feature_set = {}
        for cls in self.classes:
            for c_s, _word in features[cls]:
                if _word:  # skip unused placeholder slots
                    feature_set[_word] = word_count[_word][1]  # keep the document frequency for idf
        return feature_set

    def read_file(self, name):
        """Return the set of feature words that appear in the given file."""
        word_set = set()
        with open(name, 'r', errors='ignore') as file:
            for line in file:
                for word in r.split(line):
                    word = word.lower()
                    if word in self.features:
                        word_set.add(word)
        return word_set

    def get_x_vector(self, name, weight='tf'):
        """Build the feature vector for one file, using 'tf' or 'tfidf' weighting."""
        if weight not in self.weights:
            raise ValueError("unknown weight '%s'; expected one of %s" % (weight, self.weights))
        with open(name, 'r', errors='ignore') as file:
            word_array = self.cal_tf(file)
        if weight == 'tfidf':
            word_array = self.cal_tfidf(word_array)
        return word_array

    def cal_tf(self, file):
        """Count raw term frequencies of the feature words in an open file."""
        word_array = [0] * len(self.features)
        for line in file:
            for word in r.split(line):
                word = word.lower()
                if word in self.features:
                    word_array[self.x_transform(word)] += 1
        return word_array

    def cal_tfidf(self, word_array):
        """Convert raw term counts in place to sublinear tf-idf weights."""
        for i in range(len(self.features)):
            if word_array[i]:
                word = self.x_le.inverse_transform([i])[0]
                # self.features[word] is the document frequency of the word
                word_array[i] = (log2(word_array[i]) + 1) * log2(self.num_of_docs / self.features[word])
        return word_array

    def x_transform(self, word):
        """Map a feature word to its column index (LabelEncoder expects array-like input)."""
        return int(self.x_le.transform([word])[0])

    def y_transform(self, label):
        """Map a class label to its encoded integer."""
        return int(self.y_le.transform([label])[0])