import math  # needed by the TF-IDF computations below


def _get_key_sentence(self, contents, query_cut):
    """
    Pick key sentences to serve as the answer.
    :param contents: list of candidate sentences
    :param query_cut: keywords extracted from the question
    :return: the top-n sentences by score
    """
    # A sentence containing more of the corpus's high-frequency words is
    # considered more important.
    split_result = []  # per-sentence tokenization results
    TF = {}
    IDF = {}
    TF_IDF = {}
    for s in contents:
        word_list = TextProcess.cut(s)
        # Drop stop words and dedupe. Note: deduping per sentence makes the
        # TF count a per-sentence presence count.
        word_list = list(
            set([word for word in word_list if word not in self.stop_word]))
        split_result.append(word_list)
        for word in word_list:
            TF[word] = TF.get(word, 0) + 1
        for word in set(word_list):
            # number of sentences containing the word, not occurrence count
            IDF[word] = IDF.get(word, 0) + 1
    for k in TF:
        # Normalize by vocabulary size (a constant, so ranking is unaffected).
        TF[k] = TF[k] / len(TF)
        IDF[k] = math.log(len(contents) / IDF[k])
        TF_IDF[k] = TF[k] * IDF[k]
    topic_word = sorted(TF_IDF, key=lambda k: TF_IDF[k], reverse=True)
    topic_word = topic_word[:self.topic]
    # Score: word importance (measured by TF-IDF), weighted up when the word
    # also appears in the query or among the topic words.
    score = []
    for i, word_list in enumerate(split_result):
        s = 0.
        if len(word_list) <= 1 or (len(word_list) == 2 and " " in word_list):
            # A single word, or one word plus a space, is unlikely to be an
            # answer. (word_list comes from a set, so we test membership
            # rather than relying on token order.)
            continue
        for word in word_list:
            w = 0
            if word in query_cut:
                w += 0.5
            if word in topic_word:
                w += 0.5
            s += TF_IDF[word] * w
        # s = s / len(word_list)  # length normalization intentionally disabled
        score.append((i, s))
    score = sorted(score, key=lambda x: x[1], reverse=True)
    result = []
    if len(score) > self.n:
        score = score[:self.n]
    for pair in score:
        result.append(contents[pair[0]])
    return result
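# For reference, a minimal self-contained sketch of the same query-aware
# TF-IDF ranking. A plain whitespace split stands in for TextProcess.cut,
# and the hypothetical parameters topic_k / top_n replace self.topic /
# self.n; this illustrates the scoring, not the class's actual API.
def rank_sentences_sketch(contents, query_words, topic_k=5, top_n=3):
    tf, df = {}, {}
    tokenized = [s.split() for s in contents]  # simplified tokenizer
    for words in tokenized:
        for w in words:
            tf[w] = tf.get(w, 0) + 1
        for w in set(words):
            df[w] = df.get(w, 0) + 1
    tf_idf = {w: (tf[w] / len(tf)) * math.log(len(contents) / df[w])
              for w in tf}
    topic = set(sorted(tf_idf, key=tf_idf.get, reverse=True)[:topic_k])
    scored = []
    for i, words in enumerate(tokenized):
        if len(words) <= 1:
            continue
        s = sum(tf_idf[w] * ((0.5 if w in query_words else 0.0) +
                             (0.5 if w in topic else 0.0))
                for w in words)
        scored.append((i, s))
    scored.sort(key=lambda x: x[1], reverse=True)
    return [contents[i] for i, _ in scored[:top_n]]

# Example: the code-bearing sentences outrank the bare "thanks".
# rank_sentences_sketch(
#     ["use pip install requests to get it", "thanks",
#      "pip install requests worked for me"],
#     query_words={"pip", "install"})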
def _get_key_sentence(self, contents):
    """
    Pick key sentences to serve as the answer (variant without a query).
    :param contents: list of candidate sentences
    :return: the top-n sentences by score
    """
    # A sentence containing more of the corpus's high-frequency words is
    # considered more important.
    split_result = []  # per-sentence tokenization results
    TF = {}
    IDF = {}
    for s in contents:
        word_list = TextProcess.cut(s)
        word_list = [
            word for word in word_list if word not in self.stop_word
        ]
        split_result.append(word_list)
        for word in word_list:
            TF[word] = TF.get(word, 0) + 1
        for word in set(word_list):
            # number of sentences containing the word, not occurrence count
            IDF[word] = IDF.get(word, 0) + 1
    for k in TF:
        TF[k] = TF[k] / len(TF)
        IDF[k] = math.log(len(contents) / IDF[k])
    # Score: mean TF-IDF over sentence length, adjusted by the share of
    # English/symbol tokens, since answers are often code or commands.
    score = []
    for i, word_list in enumerate(split_result):
        s = 0.
        if len(word_list) <= 1 or (len(word_list) == 2
                                   and word_list[1] == " "):
            # A single word, or one word plus a space, is unlikely to be an answer.
            continue
        alpha_num = 0.  # number of pure-English tokens
        for word in word_list:
            if self._judge_pure_english(word):
                alpha_num += 1
            s += TF[word] * IDF[word]
        if alpha_num == 0:
            s = 0  # no English tokens at all: certainly not a code/command answer
        else:
            s = s / len(word_list)
            ratio = alpha_num / len(word_list)
            if ratio > 0.5:
                s = s * (1. + ratio)  # boost mostly-English sentences
            else:
                s = s * ratio  # damp mostly non-English sentences
        score.append((i, s))
    score = sorted(score, key=lambda x: x[1], reverse=True)
    result = []
    if len(score) > self.n:
        score = score[:self.n]
    for pair in score:
        result.append(contents[pair[0]])
    return result
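# self._judge_pure_english is referenced above but not shown in this
# snippet. A plausible reconstruction (an assumption, not necessarily the
# repository's actual implementation): a token counts as "pure English"
# when every character is ASCII, which also covers the digits and symbols
# common in code and shell commands.
def _judge_pure_english(self, word):
    # Hypothetical helper sketch; see the note above.
    return all(ord(ch) < 128 for ch in word)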
def _similarity(self, t1, t2):
    """
    Compute sentence similarity locally with word embeddings; the Baidu
    similarity API is rate-limited (QPS), so it is not called here.
    :param t1: first sentence
    :param t2: second sentence
    :return: a constant status flag and a similarity score in [0, 1]
    """
    t1_list = [
        word for word in TextProcess.cut(t1) if word not in self.stop_word
    ]
    t2_list = [
        word for word in TextProcess.cut(t2) if word not in self.stop_word
    ]
    em1 = self.sentence_emb(t1_list)
    em2 = self.sentence_emb(t2_list)
    score = self.cos(em1, em2)
    # score = self.vector_similarity(t1_list, t2_list)  # alternative scorer
    score = score * 0.5 + 0.5  # map cosine from [-1, 1] into [0, 1]
    return 1, score
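# self.sentence_emb and self.cos are defined elsewhere in the class. A
# minimal sketch under common assumptions (sentence embedding = mean of
# word vectors, cos = cosine similarity); w2v is a hypothetical
# word -> numpy vector mapping, not an attribute from this snippet.
import numpy as np

def sentence_emb_sketch(word_list, w2v, dim=300):
    # Average the word vectors; skip out-of-vocabulary words.
    vecs = [w2v[w] for w in word_list if w in w2v]
    if not vecs:
        return np.zeros(dim)
    return np.mean(vecs, axis=0)

def cos_sketch(a, b):
    # Cosine similarity in [-1, 1]; 0.0 for zero-length vectors.
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0

# _similarity then maps the cosine into [0, 1] via score * 0.5 + 0.5.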