def show_keywords_matrix(self):
    categories = self.get_categories()

    # Accumulate, for every term, how heavily it is used in each category.
    # term_category_matrix: {term_id: (term_used, standard_deviation, category_info)}
    # category_info: {category_id: (term_weight, term_used_in_category, term_ratio)}
    term_category_matrix = {}
    tsm = self.tsm
    sfm_tfidf = FeatureWeight.transform(tsm, FeatureWeight.TFIDF)
    for (term_id, term_info) in tsm.term_matrix_iterator():
        (_, (term_used, term_samples, sample_map)) = term_info
        # Skip rare terms (used fewer than 50 times in the whole corpus).
        if term_used < 50:
            continue
        category_info = {}
        if term_id in term_category_matrix:
            (_, _, category_info) = term_category_matrix[term_id]
        for sample_id in sample_map:
            term_used_in_sample = sample_map[sample_id]
            (category_id, sample_terms, term_map) = tsm.get_sample_row(sample_id)
            term_weight = 0.0
            term_used_in_category = 0
            term_ratio = 0.0
            term_ratio_variance = 0.0
            if category_id in category_info:
                (term_weight, term_used_in_category, term_ratio) = category_info[category_id]
            v = sfm_tfidf.get_sample_feature(sample_id, term_id)
            if v is None:
                continue
            # Add this sample's TF-IDF weight and raw count to the term's totals
            # for the sample's category.
            category_info[category_id] = (term_weight + v, term_used_in_category + term_used_in_sample, term_ratio)
        term_category_matrix[term_id] = (term_used, 0.0, category_info)

    # Compute each term's usage ratio per category, and the standard deviation
    # of these ratios across categories.
    for term_id in term_category_matrix:
        (term_used, _, category_info) = term_category_matrix[term_id]

        # Total weight of the term over all categories (denominator of the ratio).
        term_weight_sum = 0.0
        for category_id in category_info:
            (term_weight, term_used_in_category, _) = category_info[category_id]
            term_weight_sum += term_weight
            #term_weight_sum += term_used_in_category

        ratio_sum = 0.0
        for category_id in category_info:
            (term_weight, term_used_in_category, _) = category_info[category_id]
            term_ratio = term_weight / term_weight_sum
            category_info[category_id] = (term_weight, term_used_in_category, term_ratio)
            ratio_sum += term_ratio
        term_category_matrix[term_id] = (term_used, 0.0, category_info)

        #ratio_mean = ratio_sum / len(category_info)
        ratio_mean = ratio_sum / len(categories.categories_2)

        # Standard deviation of the per-category ratios.
        sum_0 = 0.0
        for category_id in category_info:
            (term_weight, term_used_in_category, term_ratio) = category_info[category_id]
            x = term_ratio - ratio_mean
            sum_0 += x * x
        #standard_deviation = math.sqrt(sum_0 / len(category_info))
        standard_deviation = math.sqrt(sum_0 / len(categories.categories_2))

        term_category_matrix[term_id] = (term_used, standard_deviation, category_info)

    # Print the results, sorted by standard deviation in descending order.
    terms_by_sd = {}
    for term_id in term_category_matrix:
        (term_used, standard_deviation, category_info) = term_category_matrix[term_id]
        terms_by_sd[term_id] = standard_deviation

    rowidx = 0
    terms_by_sd_list = sorted_dict_by_values(terms_by_sd, reverse = True)
    for (term_id, standard_deviation) in terms_by_sd_list:
        (term_used, _, category_info) = term_category_matrix[term_id]
        term_text = self.corpus.vocabulary.get_term_text(term_id)
        str_term_categories = u""
        category_info_list = sorted_dict_by_values(category_info, reverse = True)
        for (category_id, (term_weight, term_used_in_category, term_ratio)) in category_info_list:
            category_name = categories.get_category_name(category_id)
            str_term_categories += " <%s[%d]: %.2f%% (%d)> " % (category_name, category_id, term_ratio * 100, term_used_in_category)
        print "--------------------------------"
        print "<%d/%d> %s(%d) sd:%.6f %d used. %s" % (rowidx, len(terms_by_sd_list), term_text, term_id, standard_deviation, term_used, str_term_categories)
        rowidx += 1
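
# NOTE: sorted_dict_by_values() is used above but not defined in this excerpt; it is
# presumably imported from a project utility module. The following is only a minimal
# sketch of the behavior the calls above rely on (items returned as (key, value)
# pairs ordered by value, descending when reverse=True); the project's real
# implementation may differ.
def sorted_dict_by_values(d, reverse=False):
    # Sort the dict items by value and return them as a list of (key, value) tuples.
    return sorted(d.items(), key=lambda kv: kv[1], reverse=reverse)
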
# Export the samples' TF-IDF weighted term-sample matrix to an SVM-format file.
def export_samples_to_svm(samples, svm_file):
    samples.load()
    sfm = FeatureWeight.transform(samples.tsm, None, FeatureWeight.TFIDF)
    sfm.save_to_svmfile(svm_file, False)
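
# Usage sketch (hypothetical call site; how the Samples object is constructed is not
# shown in this excerpt, only export_samples_to_svm() itself comes from the code above):
#
#   export_samples_to_svm(samples, "samples.tfidf.svm")
#
# Assuming save_to_svmfile() writes the standard LIBSVM text format, each output line
# would look like "<category_id> <term_id>:<tfidf_weight> ...", ready to be consumed
# by libsvm/liblinear-style trainers.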