Пример #1
0
 def _get_key_sentence(self, contents, query_cut):
     """
     Pick the highest-scoring sentences from ``contents`` as the answer.

     Every sentence is tokenised with ``TextProcess.cut``; stop words are
     removed and the remaining words are de-duplicated. Each word gets a
     TF-IDF style weight computed over all sentences, and the
     ``self.topic`` heaviest words become the topic words. A sentence's
     score is the sum of its word weights, each multiplied by 0.5 if the
     word occurs in ``query_cut`` plus 0.5 if it is a topic word. The
     score is not normalised by sentence length.

     :param contents: collection of candidate sentences
     :param query_cut: keywords extracted from the question
     :return: sentences from ``contents`` ordered by descending score,
         truncated to ``self.n`` entries
     """
     tokenised = []  # de-duplicated word list per sentence
     # Words are unique within a sentence, so the raw term count and the
     # number of sentences containing the word are the same number; one
     # counter serves both the TF and the IDF part of the weight.
     sentence_count = {}
     for sentence in contents:
         unique_words = list({
             token for token in TextProcess.cut(sentence)
             if token not in self.stop_word
         })
         tokenised.append(unique_words)
         for token in unique_words:
             sentence_count[token] = sentence_count.get(token, 0) + 1

     # The frequency part is divided by the vocabulary size (number of
     # distinct words), not by the total number of tokens.
     vocab_size = len(sentence_count)
     weight = {
         token: (hits / vocab_size) * math.log(len(contents) / hits)
         for token, hits in sentence_count.items()
     }

     ranked_words = sorted(weight, key=weight.get, reverse=True)
     topic_words = set(ranked_words[:self.topic])

     scored = []
     for index, words in enumerate(tokenised):
         # A single word, or one word followed by a lone space, is
         # unlikely to be an answer.
         word_plus_space = len(words) == 2 and words[1] == " "
         if len(words) <= 1 or word_plus_space:
             continue
         total = 0.
         for token in words:
             boost = 0.5 if token in query_cut else 0
             if token in topic_words:
                 boost += 0.5
             total += weight[token] * boost
         scored.append((index, total))

     scored.sort(key=lambda entry: entry[1], reverse=True)
     if len(scored) > self.n:
         scored = scored[:self.n]
     return [contents[index] for index, _ in scored]
Пример #2
0
    def _get_key_sentence(self, contents):
        """
        Pick the highest-scoring sentences from ``contents`` as the answer.

        Every sentence is tokenised with ``TextProcess.cut`` and stop words
        are removed (repeated words are kept). A sentence's base score is
        the mean TF-IDF style weight of its words. It is then adjusted by
        the share of words accepted by ``self._judge_pure_english``: the
        score is forced to 0 when there is no such word, multiplied by
        ``1 + share`` when the share exceeds 0.5, and multiplied by
        ``share`` otherwise. The selected sentences and their scores are
        also printed to stdout.

        :param contents: collection of candidate sentences
        :return: sentences from ``contents`` ordered by descending score,
            truncated to ``self.n`` entries
        """
        tokenised = []  # word list per sentence, stop words removed
        term_count = {}  # word -> occurrences over all sentences
        sentence_count = {}  # word -> number of sentences containing it
        for sentence in contents:
            words = [
                token for token in TextProcess.cut(sentence)
                if token not in self.stop_word
            ]
            tokenised.append(words)
            for token in words:
                term_count[token] = term_count.get(token, 0) + 1
            for token in set(words):
                sentence_count[token] = sentence_count.get(token, 0) + 1

        # The frequency part is divided by the vocabulary size (number of
        # distinct words), not by the total number of tokens.
        vocab_size = len(term_count)
        weight = {
            token: (count / vocab_size)
            * math.log(len(contents) / sentence_count[token])
            for token, count in term_count.items()
        }

        scored = []
        for index, words in enumerate(tokenised):
            size = len(words)
            # A single word, or one word followed by a lone space, is
            # unlikely to be an answer.
            if size <= 1 or (size == 2 and words[1] == " "):
                continue

            total = 0.
            english = 0.  # words accepted by _judge_pure_english
            for token in words:
                if self._judge_pure_english(token):
                    english += 1
                total += weight[token]

            if english == 0:
                # No English token at all: the sentence is not wanted.
                final = 0
            else:
                # Answers are mostly code/commands, so a sentence that is
                # predominantly English gets boosted and others are damped.
                ratio = english / size
                scale = 1. + ratio if ratio > 0.5 else ratio
                final = total / size * scale
            scored.append((index, final))

        scored.sort(key=lambda entry: entry[1], reverse=True)
        if len(scored) > self.n:
            scored = scored[:self.n]

        result = []
        for index, value in scored:
            sentence = contents[index]
            # NOTE(review): looks like leftover debug output -- confirm
            # whether anything relies on it before removing.
            print(sentence, value)
            result.append(sentence)
        return result
Пример #3
0
 def _similarity(self, t1, t2):
     """
     Score how similar two texts are using sentence embeddings.

     Both texts are tokenised with ``TextProcess.cut`` and stripped of
     stop words, embedded with ``self.sentence_emb`` and compared with
     ``self.cos``. The original note here says the Baidu API has QPS
     problems -- presumably the reason this is computed locally; confirm.

     :param t1: first text
     :param t2: second text
     :return: tuple ``(1, score)`` where ``score`` is the ``self.cos``
         result mapped through ``x * 0.5 + 0.5``; the leading ``1`` is
         constant (presumably a status flag -- verify against callers)
     """
     def content_words(text):
         # Tokenise and drop stop words.
         return [
             token for token in TextProcess.cut(text)
             if token not in self.stop_word
         ]

     first_words = content_words(t1)
     second_words = content_words(t2)
     cosine = self.cos(self.sentence_emb(first_words),
                       self.sentence_emb(second_words))
     # Normalise: shifts a [-1, 1] cosine into [0, 1].
     return 1, cosine * 0.5 + 0.5