示例#1
0
文件: corpus.py 项目: uukuguy/digger
    def show_keywords_matrix(self):
        """Print a per-term report of how TF-IDF weight is spread over categories.

        Terms whose total use count (as reported by the term matrix) is below
        50 are skipped.  For every remaining term the method accumulates, per
        category, the summed TF-IDF weight and the summed in-sample use count,
        converts the weights into per-category ratios, computes a standard
        deviation of those ratios, and prints the terms ordered by that
        standard deviation, largest first.

        Output goes to stdout; nothing is returned.
        """
        categories = self.get_categories()
        tsm = self.tsm
        sfm_tfidf = FeatureWeight.transform(tsm, FeatureWeight.TFIDF)

        # Pass 1: accumulate weight and usage of every term per category.
        # {term_id: (term_used, standard_deviation, category_info)}
        # category_info - {category_id: (term_weight, term_used_in_category, term_ratio)}
        term_category_matrix = {}
        for (term_id, term_info) in tsm.term_matrix_iterator():
            (_, (term_used, _, sample_map)) = term_info
            if term_used < 50:
                continue

            if term_id in term_category_matrix:
                category_info = term_category_matrix[term_id][2]
            else:
                category_info = {}

            for sample_id in sample_map:
                term_used_in_sample = sample_map[sample_id]
                (category_id, _, _) = tsm.get_sample_row(sample_id)

                v = sfm_tfidf.get_sample_feature(sample_id, term_id)
                if v is None:
                    continue

                (term_weight, term_used_in_category, term_ratio) = \
                    category_info.get(category_id, (0.0, 0, 0.0))
                category_info[category_id] = (
                    term_weight + v,
                    term_used_in_category + term_used_in_sample,
                    term_ratio)

                # A term is registered only once at least one sample has
                # contributed a weight, so category_info is never empty below.
                term_category_matrix[term_id] = (term_used, 0.0, category_info)

        # Pass 2: per-category ratio of the term weight and its standard deviation.
        for term_id in term_category_matrix:
            (term_used, _, category_info) = term_category_matrix[term_id]
            category_count = len(categories.categories_2)

            term_weight_sum = sum(
                (info[0] for info in category_info.values()), 0.0)

            ratio_sum = 0.0
            for category_id in category_info:
                (term_weight, term_used_in_category, _) = category_info[category_id]
                # Guard: when every accumulated weight is zero the ratio is
                # undefined; report 0.0 instead of raising ZeroDivisionError.
                if term_weight_sum:
                    term_ratio = term_weight / term_weight_sum
                else:
                    term_ratio = 0.0
                category_info[category_id] = (term_weight, term_used_in_category, term_ratio)
                ratio_sum += term_ratio

            ratio_mean = ratio_sum / category_count

            # NOTE(review): the mean and the divisor use the size of
            # categories.categories_2, but the squared deviations are summed
            # only over categories in which the term occurs - confirm that
            # leaving out the remaining categories is intended.
            squared_deviation_sum = 0.0
            for (_, _, term_ratio) in category_info.values():
                deviation = term_ratio - ratio_mean
                squared_deviation_sum += deviation * deviation
            standard_deviation = math.sqrt(squared_deviation_sum / category_count)

            term_category_matrix[term_id] = (term_used, standard_deviation, category_info)

        # Pass 3: print the terms ordered by standard deviation, largest first.
        terms_by_sd = dict(
            (term_id, entry[1]) for (term_id, entry) in term_category_matrix.items())
        terms_by_sd_list = sorted_dict_by_values(terms_by_sd, reverse = True)
        total_terms = len(terms_by_sd_list)

        for (rowidx, (term_id, standard_deviation)) in enumerate(terms_by_sd_list):
            (term_used, _, category_info) = term_category_matrix[term_id]
            term_text = self.corpus.vocabulary.get_term_text(term_id)

            category_info_list = sorted_dict_by_values(category_info, reverse = True)
            category_parts = []
            for (category_id, (term_weight, term_used_in_category, term_ratio)) in category_info_list:
                category_name = categories.get_category_name(category_id)
                category_parts.append(
                    " <%s[%d]: %.2f%% (%d)> " % (category_name, category_id, term_ratio * 100, term_used_in_category))
            str_term_categories = u"".join(category_parts)

            # Single-argument parenthesised print behaves the same on
            # Python 2 and also parses on Python 3.
            print("--------------------------------")
            print("<%d/%d> %s(%d) sd:%.6f %d used. %s" % (rowidx, total_terms, term_text, term_id, standard_deviation, term_used, str_term_categories))
示例#2
0
def export_samples_to_svm(samples, svm_file):
    """Load *samples*, weight its term matrix with TF-IDF and save it to *svm_file*.

    The weighted matrix is written through its ``save_to_svmfile`` method.
    NOTE(review): the meaning of the ``None`` argument to ``transform`` and of
    the ``False`` flag to ``save_to_svmfile`` is not visible here - confirm
    against their definitions.
    """
    samples.load()
    tfidf_matrix = FeatureWeight.transform(
        samples.tsm,
        None,
        FeatureWeight.TFIDF)
    tfidf_matrix.save_to_svmfile(svm_file, False)