示例#1
0
 def test_clf(self, clf, clf_name):
     """Evaluate `clf` against a separate, fixed test corpus.

     Trains on (pos_file, neg_file) and tests on (pos_test, neg_test),
     sweeping every combination of ngram type, feature-selection method,
     feature fraction and term-weighting scheme; one APRF result row is
     appended per combination via `write_result`.
     """
     pre_train = Preprocesser(self.pos_file, self.neg_file)
     pre_test = Preprocesser(self.pos_test, self.neg_test)
     ngrams = ('unigram', )  # alternatives: 'bigram', 'dictWords'
     selector_methods = ('df', 'mi')
     weight_methods = ('tf_idf', )
     for ngram in ngrams:
         train_data, train_target = pre_train.get_ngram(ngram)
         test_data, test_target = pre_test.get_ngram(ngram)
         selector = FeatureSelector(train_data, train_target)
         for selector_method in selector_methods:
             # Rank the complete feature set once per selection method.
             all_features = selector.select(selector_method,
                                            selector.all_features_size)
             print(len(all_features))
             for count in self.feature_range:
                 # `count` is a fraction of the ranked feature list.
                 keep = int(len(all_features) * count)
                 print(keep)
                 features = all_features[:keep]
                 train_vectorizer = TermWeight(train_data, train_target,
                                               features)
                 for weight_method in weight_methods:
                     weighted_train = train_vectorizer.weight(weight_method)
                     test_vectorizer = TermWeight(test_data, test_target,
                                                  features)
                     weighted_test = test_vectorizer.weight(weight_method)
                     clf.fit(weighted_train, train_target)
                     predicted = clf.predict(weighted_test)
                     score = compute_aprf(test_target, predicted)
                     self.write_result(clf_name, ngram, selector_method,
                                       len(features), weight_method, score)
示例#2
0
 def test_clf(self, clf, clf_name):
     """Evaluate `clf` using a random split of a single corpus.

     Splits the corpus with ``test_size=0.70`` (only 30% used for
     training), then sweeps ngram type, selection method, absolute
     feature count and weighting scheme, appending an APRF result row
     per combination via `write_result`.
     """
     pre_train = Preprocesser(self.pos_file, self.neg_file)
     ngrams = ('unigram', )  # alternative: 'bigram'
     selector_methods = ('df', )
     weight_methods = ('tf_idf', )
     for ngram in ngrams:
         data, target = pre_train.get_ngram(ngram)
         train_data, test_data, train_target, test_target = train_test_split(
             data, target, test_size=0.70)
         selector = FeatureSelector(train_data, train_target)
         for selector_method in selector_methods:
             ranked = selector.select(selector_method,
                                      selector.all_features_size)
             for count in self.feature_range:
                 features = ranked[:count]  # top-`count` ranked features
                 train_vectorizer = TermWeight(train_data, train_target,
                                               features)
                 for weight_method in weight_methods:
                     weighted_train = train_vectorizer.weight(weight_method)
                     test_vectorizer = TermWeight(test_data, test_target,
                                                  features)
                     weighted_test = test_vectorizer.weight(weight_method)
                     clf.fit(weighted_train, train_target)
                     predicted = clf.predict(weighted_test)
                     score = compute_aprf(test_target, predicted)
                     self.write_result(clf_name, ngram, selector_method,
                                       count, weight_method, score)
示例#3
0
def test_unigram(clf, clf_name, pos_file, neg_file, data_type):
    """Train and score `clf` on the full unigram feature set.

    Uses a random train/test split, tf-idf weighting and *all* extracted
    unigram features (no selection), then appends the accuracy to
    ``../data/result/<clf_name>_unigram.txt``.
    """
    pre = Preprocesser(pos_file, neg_file)
    ngrams = ('unigram', )
    weight_methods = ('tf_idf', )
    for ngram in ngrams:
        data, target = pre.get_ngram(ngram)
        train_data, test_data, train_target, test_target = train_test_split(
            data, target)
        selector = FeatureSelector(train_data, train_target)
        all_features_count = selector.all_features_size
        features = selector.all_features
        train_vectorizer = TermWeight(train_data, train_target, features)
        for weight_method in weight_methods:
            train_weighted_data = train_vectorizer.weight(weight_method)
            test_vectorizer = TermWeight(test_data, test_target, features)
            test_weighted_data = test_vectorizer.weight(weight_method)
            clf.fit(train_weighted_data, train_target)
            score = clf.score(test_weighted_data, test_target)
            result_dir = '../data/result/'
            # makedirs(..., exist_ok=True) also creates missing parents and
            # avoids the check-then-create race that exists()/mkdir had.
            os.makedirs(result_dir, exist_ok=True)
            with open(os.path.join(result_dir, clf_name + '_unigram.txt'),
                      'at',
                      encoding='utf-8') as f:
                # BUG FIX: the template referenced fields {1}-{4} but only
                # four positional args (indices 0-3) were supplied, which
                # raises IndexError; fields renumbered to {0}-{3}.
                f.write(
                    'clf={0}\t data_type={1}\t count={2}\t score={3:.2f}\n'.
                    format(clf_name, data_type, all_features_count,
                           score * 100))
示例#4
0
 def test_clf_by_percent(self,
                         clf,
                         clf_name,
                         feature_count_range=(1000, 2000, 3000, 4000, 5000,
                                              6000, 7000, 8000, 9000, 10000),
                         bigram_range=(0.1, 0.3, 0.5, 0.7, 0.9)):
     """Evaluate `clf` on mixed unigram+bigram feature sets.

     For each total feature budget `count` and each bigram share
     `bigram_size`, the top-ranked bigrams fill ``count * bigram_size``
     slots and the top unigrams fill the remainder; an APRF result row
     is appended per combination via `write_result`.

     Note: defaults are tuples rather than lists so callers can never
     mutate a shared default value (mutable-default-argument pitfall).
     """
     pre = Preprocesser(self.pos_file, self.neg_file)
     selector_methods = ('df', )
     weight_methods = ('tf_idf', )
     # is_shuffle=False keeps unigram and bigram lines aligned so the two
     # representations of each document can be zipped back together.
     unigram_data, unigram_target = pre.get_unigram(is_shuffle=False)
     bigram_data, bigram_target = pre.get_bigram(is_shuffle=False)
     train_unigram, test_unigram, train_bigram, test_bigram, train_target, \
                                     test_target = self.split_data(unigram_data, bigram_data, unigram_target)
     unigram_selector = FeatureSelector(train_unigram, train_target)
     bigram_selector = FeatureSelector(train_bigram, train_target)
     train_data = [
         unigram_line + bigram_line
         for unigram_line, bigram_line in zip(train_unigram, train_bigram)
     ]
     test_data = [
         unigram_line + bigram_line
         for unigram_line, bigram_line in zip(test_unigram, test_bigram)
     ]
     for selector_method in selector_methods:
         unigram_features = unigram_selector.select(
             selector_method, unigram_selector.all_features_size)
         bigram_features = bigram_selector.select(
             selector_method, bigram_selector.all_features_size)
         for count in feature_count_range:
             for bigram_size in bigram_range:
                 bigram_count = int(count * bigram_size)
                 # Combined feature list: top unigrams first, then bigrams.
                 features = (unigram_features[:count - bigram_count] +
                             bigram_features[:bigram_count])
                 train_vectorizer = TermWeight(train_data, train_target,
                                               features)
                 for weight_method in weight_methods:
                     train_weighted_data = train_vectorizer.weight(
                         weight_method)
                     test_vectorizer = TermWeight(test_data, test_target,
                                                  features)
                     test_weighted_data = test_vectorizer.weight(
                         weight_method)
                     clf.fit(train_weighted_data, train_target)
                     test_result = clf.predict(test_weighted_data)
                     score = compute_aprf(test_target, test_result)
                     self.write_result(clf_name + '_uni_bi',
                                       'unigram+bigram', selector_method,
                                       str(count) + ':' + str(bigram_size),
                                       weight_method, score)
示例#5
0
def test_rule(clf, clf_name, train_unigram, train_bigram, train_target,
              test_unigram, test_bigram, test_target, data_type):
    """Evaluate `clf` on combined unigram+bigram feature sets.

    Sweeps the total feature budget, the bigram share of it, and the
    selection method; each accuracy is appended to
    ``../data/result/<clf_name>_unigram_and_rule.txt``.
    """
    feature_count_range = [
        500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 5000, 6000
    ]
    bigram_range = [0.1, 0.3, 0.5, 0.7, 0.9, 1]
    selector_methods = ('mi', 'df', 'ig')
    weight_methods = ('tf_idf', )

    unigram_selector = FeatureSelector(train_unigram, train_target)
    bigram_selector = FeatureSelector(train_bigram, train_target)
    # Each document's combined representation = its unigrams + bigrams.
    train_data = [
        unigram_line + bigram_line
        for unigram_line, bigram_line in zip(train_unigram, train_bigram)
    ]
    test_data = [
        unigram_line + bigram_line
        for unigram_line, bigram_line in zip(test_unigram, test_bigram)
    ]
    for selector_method in selector_methods:
        unigram_features = unigram_selector.select(
            selector_method, unigram_selector.all_features_size)
        bigram_features = bigram_selector.select(selector_method,
                                                 bigram_selector.all_features_size)
        for count in feature_count_range:
            for bigram_size in bigram_range:
                bigram_count = int(count * bigram_size)
                # Top unigrams fill whatever slots the bigrams do not.
                features = (unigram_features[:count - bigram_count] +
                            bigram_features[:bigram_count])
                train_vectorizer = TermWeight(train_data, train_target,
                                              features)
                for weight_method in weight_methods:
                    train_weighted_data = train_vectorizer.weight(
                        weight_method)
                    test_vectorizer = TermWeight(test_data, test_target,
                                                 features)
                    test_weighted_data = test_vectorizer.weight(weight_method)
                    clf.fit(train_weighted_data, train_target)
                    score = clf.score(test_weighted_data, test_target)
                    result_dir = '../data/result/'
                    # makedirs(..., exist_ok=True) also creates missing
                    # parents and avoids the check-then-create race.
                    os.makedirs(result_dir, exist_ok=True)
                    with open(os.path.join(result_dir,
                                           clf_name + '_unigram_and_rule.txt'),
                              'at',
                              encoding='utf-8') as f:
                        # BUG FIX: the template referenced fields {1}-{5}
                        # but only five positional args (indices 0-4) were
                        # supplied, which raises IndexError; fields
                        # renumbered to {0}-{4}.
                        f.write(
                            'clf={0}\t data_type={1}\t count={2}\t rule_size={3}\t score={4:.2f}\n'
                            .format(clf_name, data_type, count, bigram_size,
                                    score * 100))
示例#6
0
def test_unigram(pos_file, neg_file, clf):
    """Fit `clf` on tf-idf weighted unigrams and return its test accuracy.

    The leading 20% of each class's documents is held out for testing;
    the remainder is used for training. Every feature present in the
    training set is kept (no feature selection).
    """
    pos_data, _ = read_word_rule(pos_file)
    neg_data, _ = read_word_rule(neg_file)
    test_size = 0.2
    pos_split = int(len(pos_data) * test_size)
    neg_split = int(len(neg_data) * test_size)
    # Leading slice of each class is the test set; the rest is training.
    pos_test, pos_train = pos_data[:pos_split], pos_data[pos_split:]
    neg_test, neg_train = neg_data[:neg_split], neg_data[neg_split:]
    train_data = pos_train + neg_train
    test_data = pos_test + neg_test
    train_target = [1] * len(pos_train) + [0] * len(neg_train)
    test_target = [1] * len(pos_test) + [0] * len(neg_test)
    features = get_all_features(train_data)
    weight_method = 'tf_idf'
    train_vectorizer = TermWeight(train_data, train_target, features)
    weighted_train = train_vectorizer.weight(weight_method)
    test_vectorizer = TermWeight(test_data, test_target, features)
    weighted_test = test_vectorizer.weight(weight_method)
    clf.fit(weighted_train, train_target)
    return clf.score(weighted_test, test_target)
示例#7
0
    def test_clf_uni_bi(
            self,
            clf,
            clf_name,
            feature_count_range=(100, 300, 500, 700, 1000, 1500, 2000, 2500,
                                 3000, 4000, 5000),
            uni_bi_range=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)):
        """Evaluate `clf` on mixed unigram+bigram features using a fixed
        training corpus and a separate test corpus.

        For each total feature budget `count` and each bigram fraction
        `uni_bi_size`, the top-ranked bigrams take ``count * uni_bi_size``
        slots and the top unigrams take the remainder; an APRF result row
        is appended per combination via `write_result`.

        Note: defaults are tuples rather than lists so callers can never
        mutate a shared default value (mutable-default-argument pitfall).
        """
        pre_train = Preprocesser(self.pos_file, self.neg_file)
        pre_test = Preprocesser(self.pos_test, self.neg_test)
        selector_methods = ('df', 'ig', 'mi', 'chi')
        weight_methods = ('tf_idf', )

        # is_shuffle=False keeps unigram and bigram lines aligned so the
        # two representations of each document can be zipped together.
        train_unigram, train_target = pre_train.get_unigram(is_shuffle=False)
        train_bigram, train_target = pre_train.get_bigram(is_shuffle=False)

        test_unigram, test_target = pre_test.get_unigram(is_shuffle=False)
        test_bigram, test_target = pre_test.get_bigram(is_shuffle=False)

        unigram_selector = FeatureSelector(train_unigram, train_target)
        bigram_selector = FeatureSelector(train_bigram, train_target)
        train_data = [
            unigram_line + bigram_line
            for unigram_line, bigram_line in zip(train_unigram, train_bigram)
        ]
        test_data = [
            unigram_line + bigram_line
            for unigram_line, bigram_line in zip(test_unigram, test_bigram)
        ]
        for selector_method in selector_methods:
            unigram_features = unigram_selector.select(
                selector_method, unigram_selector.all_features_size)
            bigram_features = bigram_selector.select(
                selector_method, bigram_selector.all_features_size)
            for count in feature_count_range:
                for uni_bi_size in uni_bi_range:
                    bi_size = int(count * uni_bi_size)
                    # Combined list: top unigrams first, then top bigrams.
                    features = (unigram_features[:count - bi_size] +
                                bigram_features[:bi_size])
                    train_vectorizer = TermWeight(train_data, train_target,
                                                  features)
                    for weight_method in weight_methods:
                        train_weighted_data = train_vectorizer.weight(
                            weight_method)
                        test_vectorizer = TermWeight(test_data, test_target,
                                                     features)
                        test_weighted_data = test_vectorizer.weight(
                            weight_method)
                        clf.fit(train_weighted_data, train_target)
                        test_result = clf.predict(test_weighted_data)
                        score = compute_aprf(test_target, test_result)
                        self.write_result(clf_name + '_uni_bi',
                                          'unigram+bigram', selector_method,
                                          str(count) + ':' + str(uni_bi_size),
                                          weight_method, score)
示例#8
0
def test_unigram_rule(pos_file, neg_file, clf, clf_name, data_type):
    """Evaluate `clf` on combined word (unigram) + rule features.

    Holds out the leading 20% of each class's documents for testing,
    then sweeps the total feature budget and the fraction of it given
    to rule features; each APRF score is recorded via `write_result`.
    """
    selector_methods = ('df', )
    weight_methods = ('tf_idf', )
    feature_count_range = [
        1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000
    ]
    rule_range = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)
    pos_data, rule_pos_data = read_word_rule(pos_file)
    neg_data, rule_neg_data = read_word_rule(neg_file)
    test_size = 0.2
    pos_split = int(len(pos_data) * test_size)
    neg_split = int(len(neg_data) * test_size)
    # Word features: leading slice of each class is the test set.
    train_data = pos_data[pos_split:] + neg_data[neg_split:]
    test_data = pos_data[:pos_split] + neg_data[:neg_split]
    train_target = ([1] * (len(pos_data) - pos_split) +
                    [0] * (len(neg_data) - neg_split))
    test_target = [1] * pos_split + [0] * neg_split

    # Rule features: the identical split keeps lines document-aligned.
    rule_train_data = rule_pos_data[pos_split:] + rule_neg_data[neg_split:]
    rule_test_data = rule_pos_data[:pos_split] + rule_neg_data[:neg_split]

    def merge_data(data_a, data_b):
        # Concatenate the two token lists of each document pairwise.
        return [data_a[i] + data_b[i] for i in range(len(data_a))]

    all_train_data = merge_data(train_data, rule_train_data)
    all_test_data = merge_data(test_data, rule_test_data)

    unigram_selector = FeatureSelector(train_data, train_target)
    rule_selector = FeatureSelector(rule_train_data, train_target)

    for selector_method in selector_methods:
        unigram_features = unigram_selector.select(
            selector_method, unigram_selector.all_features_size)
        rule_features = rule_selector.select(selector_method,
                                             rule_selector.all_features_size)
        for count in feature_count_range:
            for rule_size in rule_range:
                rule_count = int(count * rule_size)
                # Top-ranked unigrams fill whatever the rules do not.
                features = (unigram_features[:count - rule_count] +
                            rule_features[:rule_count])
                train_vectorizer = TermWeight(all_train_data, train_target,
                                              features)
                for weight_method in weight_methods:
                    weighted_train = train_vectorizer.weight(weight_method)
                    test_vectorizer = TermWeight(all_test_data, test_target,
                                                 features)
                    weighted_test = test_vectorizer.weight(weight_method)
                    clf.fit(weighted_train, train_target)
                    predicted = clf.predict(weighted_test)
                    score = compute_aprf(test_target, predicted)
                    write_result(clf_name, data_type, selector_method, count,
                                 rule_size, score)