def cal_weight_improve(self, key_words, class_label):
    """
    Compute the weight matrix for the extracted feature words.
    :param key_words: [{'sentence': {}}, ...] or [{}, ...]; may be training-set or test-set data
    :return: the tf-idf weight matrix
    """
    print "Cal Improve Weight: ", time.strftime('%Y-%m-%d %H:%M:%S')

    if not self.istrain:
        # For test data, read the training key words back from file to fit the idf.
        dir_ = os.path.join(TEXT_OUT, "key_words")
        filename = self.__class__.__name__ + ".txt" if self.subjective else self.__class__.__name__ + "_objective.txt"
        url = os.path.join(dir_, filename)
        train_key_words = FileUtil.read(url)
        train_class_label = [d.get("emotion-1-type") for d in train_key_words]
    else:
        train_key_words = key_words
        train_class_label = class_label

    train_key_words = [d.get("sentence") if "sentence" in d else d for d in train_key_words]
    key_words = [d.get("sentence") if "sentence" in d else d for d in key_words]
    # Term frequency: divide each count by the sentence total.
    # float() guards against integer division under Python 2.
    key_words = [{k: v / float(sum(d.values())) for k, v in d.items()} for d in key_words]

    fit_train_key_words = Feature_Hasher.transform(train_key_words)
    fit_key_words = Feature_Hasher.transform(key_words)

    tfidf = TfidfImprove()
    # Fit the idf on the training set, then weight the current data.
    tfidf.fit(fit_train_key_words, train_class_label)
    weight_matrix = tfidf.transform(fit_key_words, class_label)

    print "Cal Weight Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    print
    return weight_matrix
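# A minimal sketch of the weighting pipeline in cal_weight_improve, using
# standard scikit-learn parts as stand-ins: FeatureHasher for the project's
# Feature_Hasher, and the stock TfidfTransformer for TfidfImprove (whose
# class-conditional fit/transform signature is not shown in this module,
# so the class labels are omitted here). The toy dicts are hypothetical
# data, not taken from the corpus.
def _demo_weight_pipeline():
    from sklearn.feature_extraction import FeatureHasher
    from sklearn.feature_extraction.text import TfidfTransformer

    train_key_words = [{"good": 2, "happy": 1}, {"bad": 3}]
    test_key_words = [{"good": 1, "sad": 1}]

    # Term frequency: each count divided by the sentence total
    # (float() avoids integer division under Python 2).
    test_tf = [{k: v / float(sum(d.values())) for k, v in d.items()}
               for d in test_key_words]

    hasher = FeatureHasher(n_features=2 ** 10)
    tfidf = TfidfTransformer()
    tfidf.fit(hasher.transform(train_key_words))   # learn idf on the training set
    return tfidf.transform(hasher.transform(test_tf))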
def _check_feature_size(url):
    """Count the number of distinct feature words stored in the file at `url`."""
    l = []
    for line in FileUtil.read(url):
        # line["sentence"] is a {word: value} dict; joining its keys with ","
        # and splitting again collects them as a flat list of words.
        line = ",".join(line.get("sentence"))
        line = line.split(",")
        l.append(line)
    feature_size = set(flatten(l))
    return len(feature_size)
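# What _check_feature_size computes, shown on in-memory data: the count of
# distinct feature words across all records. The records below are
# hypothetical but mirror the {"sentence": {word: value}} shape written by
# _collect; taking the dict keys directly is equivalent to the join/split
# round-trip above.
def _demo_check_feature_size():
    records = [{"sentence": {"good": 2, "happy": 1}},
               {"sentence": {"good": 1, "bad": 3}}]
    vocab = set()
    for record in records:
        vocab.update(record["sentence"].keys())
    return len(vocab)  # 3 distinct feature words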
def _get_splited_train(self):
    """
    Prefer reading the segmented training set from file; recompute only when
    forced or when the cached file is missing or empty.
    :return:
    """
    dir_ = os.path.join(TEXT_OUT, "split")
    if self.subjective:
        split_txt = os.path.join(dir_, self.__class__.__name__ + ".txt")
        training_datas = Load.load_training_balance()
    else:
        split_txt = os.path.join(dir_, self.__class__.__name__ + "_objective.txt")
        training_datas = Load.load_training_objective_balance()

    if self.f or not FileUtil.isexist(split_txt) or FileUtil.isempty(split_txt):
        # Load and segment the training set; each sentence keeps its class label.
        splited_words_list = Feature.__split(flatten(training_datas))
        # splited_words_list = Feature.__del_low_frequency_word(splited_words_list)
        FileUtil.write(split_txt, splited_words_list)
    else:
        splited_words_list = FileUtil.read(split_txt)
    return splited_words_list
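# The cache-or-recompute pattern behind _get_splited_train, reduced to a
# skeleton. json stands in for FileUtil's serialization (not shown in this
# module), and load_or_build/build are hypothetical names for illustration.
def load_or_build(path, build, force=False):
    """Return cached results from path; rebuild when forced, missing, or empty."""
    import json
    if force or not os.path.exists(path) or os.path.getsize(path) == 0:
        data = build()              # the expensive step (word segmentation here)
        with open(path, "w") as f:
            json.dump(data, f)
        return data
    with open(path) as f:
        return json.load(f)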
def _collect(self, splited_words_list, sentence_size):
    dir_ = os.path.join(TEXT_OUT, "key_words")
    if self.subjective:
        key_words_txt = os.path.join(dir_, self.__class__.__name__ + ".txt")
    else:
        key_words_txt = os.path.join(dir_, self.__class__.__name__ + "_objective.txt")

    # Two in-method helpers used to live here as commented-out code:
    # norm(), which divided every score in a sample by the sample's p-norm
    # ||X||_p = (|x1|^p + ... + |xn|^p)^(1/p) so the normalized p-norm is 1,
    # and reduce_dim(), which kept the highest-scoring words until their
    # cumulative score exceeded sqrt(n) * 0.85. The self.norm() and
    # self.reduce_dim() calls below replace them (see the sketch after this
    # method).

    if not self.istrain or self.f or not FileUtil.isexist(key_words_txt) or FileUtil.isempty(key_words_txt):
        print "Cal Scores: ", time.strftime('%Y-%m-%d %H:%M:%S')

        if len(splited_words_list) == sentence_size:
            train_range = slice(sentence_size)
        else:
            train_range = slice(sentence_size, len(splited_words_list))

        # Texts grouped by class.
        all_class_datas = Feature.all_class_text(splited_words_list[train_range], self.getclasses())
        # Class labels.
        class_label = [d.get("emotion-1-type") for d in splited_words_list[: sentence_size]]

        res = []
        for splited_words_dict in splited_words_list[0: sentence_size]:
            splited_words = splited_words_dict.get("sentence")
            label = splited_words_dict.get("emotion-1-type")
            # Score every word: {word: [score, frequency], ...}
            scores = {splited_word: [self.cal_score(splited_word, splited_words, label,
                                                    all_class_datas,
                                                    [d.get("sentence") for d in splited_words_list[train_range]]),
                                     frequency]
                      for splited_word, frequency in splited_words.items()}
            res.append({"sentence": scores,
                        "emotion-1-type": splited_words_dict.get("emotion-1-type")})
        print "Cal Scores Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # FileUtil.write(TEST_BASE_URL + "scores.txt", res)

        print "Begin Normalization: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # Normalize each sample.
        self.norm(res)
        # FileUtil.write(TEST_BASE_URL + "norm.txt", res)
        print "Normalization Done: ", time.strftime('%Y-%m-%d %H:%M:%S')

        print "Begin Reduce: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # Dimensionality reduction.
        self.reduce_dim(res)
        print "Reduce Done: ", time.strftime('%Y-%m-%d %H:%M:%S')

        # Collapse each [score, frequency] pair to a single value:
        # v[1] keeps the term frequency; use v[0] instead for the term score.
        for d in res:
            ws = d.get("sentence")
            for k, v in ws.items():
                ws[k] = v[1]

        # Segmentation or dimensionality reduction can leave a sample with no
        # key words at all; drop those samples and record their indices.
        danger_index = []
        res = filter(lambda x: danger_index.append(x[0]) if not x[1].get("sentence") else x,
                     enumerate(res))
        res = list(zip(*res)[1])
        class_label = [c for i, c in enumerate(class_label) if i not in danger_index]

        # Persist the training result.
        if self.istrain:
            FileUtil.write(key_words_txt, res)
    else:
        res = FileUtil.read(key_words_txt)
        class_label = [r["emotion-1-type"] for r in res]
        danger_index = []

    # Print top key-word statistics (disabled debug output).
    if False:
        self.__print_top_key_word(res)
    return res, class_label, danger_index
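# The two helpers that _collect's commented-out code described, reconstructed
# as standalone functions. This is a sketch of what self.norm and
# self.reduce_dim plausibly do per sample; the actual methods live elsewhere
# in the class (and operate on [score, frequency] pairs rather than bare
# scores), so treat this as an illustration of the math only.
import math

def _norm_sample(word_scores, p=2):
    """Divide every score by ||X||_p = (|x1|^p + ... + |xn|^p)^(1/p),
    so the sample's p-norm becomes 1."""
    norm_p = math.pow(sum(math.pow(abs(v), p) for v in word_scores.values()), 1.0 / p)
    for k, v in word_scores.items():
        word_scores[k] = v / norm_p

def _reduce_dim_sample(word_scores):
    """Keep the highest-scoring words until their cumulative score exceeds
    sqrt(n) * 0.85, discarding the long tail."""
    cap = math.sqrt(len(word_scores)) * 0.85
    kept, total = {}, 0.0
    for k, v in sorted(word_scores.items(), key=lambda x: x[1], reverse=True):
        if total > cap:
            break
        kept[k] = v
        total += v
    return kept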