Example #1
    def summarize(self, text, num=8, alpha=0.6):
        """
        Summarize text with MMR (Maximal Marginal Relevance) over tf-idf scores.
        :param text: str or list, raw text or pre-split sentences
        :param num: int, maximum number of sentences to return
        :param alpha: float in [0, 1], trade-off between relevance and redundancy
        :return: list of (score, sentence) tuples, best first
        """
        # split into sentences
        if isinstance(text, str):
            self.sentences = cut_sentence(text)
        elif isinstance(text, list):
            self.sentences = text
        else:
            raise RuntimeError("text type must be list or str")
        # tokenize each sentence
        sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
                          if word.strip()] for sentence in self.sentences]
        # remove stop words etc.
        self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
        self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
        # # count the words in each sentence
        # sen_word_len = [len(sc)+1 for sc in sentences_cut]
        # compute the tf-idf vector of each sentence
        sen_tfidf = tfidf_fit(self.sentences_cut)
        # pairwise sentence similarity matrix
        sim_matrix = (sen_tfidf * sen_tfidf.T).A  # e.g. sim_matrix[1, 3] is the similarity of sentence 2 and sentence 4
        # number of sentences in the input
        len_sen = len(self.sentences)
        # sentence indices
        sen_idx = list(range(len_sen))
        summary_set = []
        mmr = {}
        for i in range(len_sen):
            if self.sentences[i] not in summary_set:
                sen_idx_pop = sen_idx[:]  # a shallow copy is enough for a flat list of ints
                sen_idx_pop.pop(i)
                # similarity of sentence i to every other sentence
                sim_i_j = [sim_matrix[i, j] for j in sen_idx_pop]
                score_tfidf = sen_tfidf[i].toarray()[0].sum()  # dividing by sen_word_len[i] here would make the score less accurate
                # MMR: reward relevance, penalize redundancy with similar sentences
                mmr[self.sentences[i]] = alpha * score_tfidf - (1 - alpha) * max(sim_i_j)
                summary_set.append(self.sentences[i])
        # (score, sentence) pairs sorted by score, descending
        score_sen = [(rc[1], rc[0]) for rc in sorted(mmr.items(), key=lambda d: d[1], reverse=True)]
        return score_sen[:num]
Example #2
    def summarize(self, text, num=8, topic_min=6, judge_topic=None):
        """
        Summarize text by LDA (Latent Dirichlet Allocation) topic modeling.
        :param text: str or list, raw text or pre-split sentences
        :param num: int, maximum number of sentences to return
        :param topic_min: int, upper bound on the number of topics
        :param judge_topic: boolean, if truthy take sentences only from the dominant topic
        :return: list of (score, sentence) tuples, best first
        """
        # split into sentences
        if isinstance(text, str):
            self.sentences = cut_sentence(text)
        elif isinstance(text, list):
            self.sentences = text
        else:
            raise RuntimeError("text type must be list or str")
        len_sentences_cut = len(self.sentences)
        # tokenize each sentence
        sentences_cut = [[
            word for word in macropodus_cut(extract_chinese(sentence))
            if word.strip()
        ] for sentence in self.sentences]
        # remove stop words etc.
        self.sentences_cut = [
            list(filter(lambda x: x not in self.stop_words, sc))
            for sc in sentences_cut
        ]
        self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
        # # compute the term-frequency matrix of each sentence instead of tf-idf
        # vector_c = CountVectorizer(ngram_range=(1, 2), stop_words=self.stop_words)
        # tf_ngram = vector_c.fit_transform(self.sentences_cut)
        # compute the tf-idf vector of each sentence
        tf_ngram = tfidf_fit(self.sentences_cut)
        # number of topics: heuristic cap at topic_min and at half the sentence count
        topic_num = min(topic_min, int(len(sentences_cut) / 2))
        lda = LatentDirichletAllocation(n_components=topic_num,  # n_topics was renamed n_components in newer scikit-learn
                                        max_iter=32,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=2019)
        res_lda_u = lda.fit_transform(tf_ngram.T)  # input is terms x sentences, so components_ is topics x sentences
        res_lda_v = lda.components_

        if judge_topic:
            ### Option 1: take the top-k sentences of the single dominant topic
            ##################################################################################
            topic_t_score = np.sum(res_lda_v, axis=-1)
            # rank the topic scores of each column (one sentence, topic_num topics); row 0 holds the top topic index
            res_nmf_h_soft = res_lda_v.argsort(axis=0)[-topic_num:][::-1]
            # count how many sentences have each topic as their maximum
            exist = (res_nmf_h_soft <= 0) * 1.0
            factor = np.ones(res_nmf_h_soft.shape[1])
            topic_t_count = np.dot(exist, factor)
            # normalize
            topic_t_count /= np.sum(topic_t_count, axis=-1)
            topic_t_score /= np.sum(topic_t_score, axis=-1)
            # pick the topic that maximizes the sum of its sentence-count share and its total-score share
            topic_t_tc = topic_t_count + topic_t_score
            topic_t_tc_argmax = np.argmax(topic_t_tc)
            # the final scores are that dominant topic's weights
            res_nmf_h_soft_argmax = res_lda_v[topic_t_tc_argmax].tolist()
            res_combine = {}
            for idx in range(len_sentences_cut):
                res_combine[self.sentences[idx]] = res_nmf_h_soft_argmax[idx]
            score_sen = [(rc[1], rc[0]) for rc in sorted(
                res_combine.items(), key=lambda d: d[1], reverse=True)]
            #####################################################################################
        else:
            ### Option 2: score every sentence by its highest topic probability, ignoring topic grouping
            res_combine = {}
            for i in range(len_sentences_cut):
                res_row_i = res_lda_v[:, i]
                res_row_i_argmax = np.argmax(res_row_i)
                res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
            score_sen = [(rc[1], rc[0]) for rc in sorted(
                res_combine.items(), key=lambda d: d[1], reverse=True)]
        # return at most num sentences, capped at 60% of the input length
        num_min = min(num, int(len_sentences_cut * 0.6))
        return score_sen[0:num_min]
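
For readers who want to run the LDA branch in isolation, here is a minimal sketch of the "Option 2" scoring. As before, scikit-learn's TfidfVectorizer stands in for tfidf_fit, and the toy sentences are assumed pre-tokenized; neither is the project's own preprocessing:

# Minimal LDA scoring sketch ("Option 2"). Assumptions: TfidfVectorizer
# replaces tfidf_fit; sentences are pre-tokenized, space-separated strings.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

sentences = ["the cat sat on the mat",
             "a dog chased the cat",
             "stock prices fell sharply today",
             "the market rallied after the news"]

tf_ngram = TfidfVectorizer().fit_transform(sentences)
topic_num = min(6, len(sentences) // 2)

lda = LatentDirichletAllocation(n_components=topic_num, max_iter=32,
                                learning_method="online",
                                learning_offset=50., random_state=2019)
lda.fit(tf_ngram.T)          # fit on terms x sentences
res_lda_v = lda.components_  # topics x sentences

# score each sentence by its highest topic weight
scores = {s: res_lda_v[:, i].max() for i, s in enumerate(sentences)}
for sen, score in sorted(scores.items(), key=lambda d: d[1], reverse=True):
    print(round(score, 3), sen)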
Example #3
    def summarize(self, text, num=320, topic_min=5, judge_topic='all'):
        """
        Summarize text by LSI: TruncatedSVD over the sentence tf-idf matrix.
        :param text: str or list, raw text or pre-split sentences
        :param num: int, maximum number of sentences to return
        :param topic_min: int, upper bound on the number of topics
        :param judge_topic: if truthy, take sentences only from the dominant topic
        :return: list of (score, sentence) tuples, best first
        """
        # split into sentences
        if isinstance(text, str):
            self.sentences = cut_sentence(text)
        elif isinstance(text, list):
            self.sentences = text
        else:
            raise RuntimeError("text type must be list or str")
        len_sentences_cut = len(self.sentences)
        # tokenize each sentence
        sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
                          if word.strip()] for sentence in self.sentences]
        # remove stop words etc.
        self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
        self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
        # compute the tf-idf vector of each sentence
        sen_tfidf = tfidf_fit(self.sentences_cut)
        # number of topics: heuristic cap at topic_min and at half the sentence count
        topic_num = min(topic_min, int(len(sentences_cut) / 2))
        svd_tfidf = TruncatedSVD(n_components=topic_num, n_iter=32)
        res_svd_u = svd_tfidf.fit_transform(sen_tfidf.T)  # input is terms x sentences, so components_ is topics x sentences
        res_svd_v = svd_tfidf.components_

        if judge_topic:
            ### Option 1: take the top-k sentences of the single dominant topic
            ##################################################################################
            topic_t_score = np.sum(res_svd_v, axis=-1)
            # rank the topic scores of each column (one sentence, topic_num topics); row 0 holds the top topic index
            res_nmf_h_soft = res_svd_v.argsort(axis=0)[-topic_num:][::-1]
            # count how many sentences have each topic as their maximum
            exist = (res_nmf_h_soft <= 0) * 1.0
            factor = np.ones(res_nmf_h_soft.shape[1])
            topic_t_count = np.dot(exist, factor)
            # normalize
            topic_t_count /= np.sum(topic_t_count, axis=-1)
            topic_t_score /= np.sum(topic_t_score, axis=-1)
            # pick the topic that maximizes the sum of its sentence-count share and its total-score share
            topic_t_tc = topic_t_count + topic_t_score
            topic_t_tc_argmax = np.argmax(topic_t_tc)
            # the final scores are that dominant topic's weights
            res_nmf_h_soft_argmax = res_svd_v[topic_t_tc_argmax].tolist()
            res_combine = {}
            for idx in range(len_sentences_cut):
                res_combine[self.sentences[idx]] = res_nmf_h_soft_argmax[idx]
            score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
            #####################################################################################
        else:
            ### Option 2: score every sentence by its highest topic loading, ignoring topic grouping
            res_combine = {}
            for i in range(len_sentences_cut):
                res_row_i = res_svd_v[:, i]
                res_row_i_argmax = np.argmax(res_row_i)
                res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
            score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
        # return at most num sentences, capped at 60% of the input length
        num_min = min(num, int(len_sentences_cut * 0.6))
        return score_sen[0:num_min]
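
The same isolation test works for the SVD variant. Here is a minimal sketch of its "Option 2" scoring under the same assumptions (TfidfVectorizer in place of tfidf_fit, toy pre-tokenized sentences):

# Minimal LSI scoring sketch ("Option 2"). Assumptions: TfidfVectorizer
# replaces tfidf_fit; sentences are pre-tokenized, space-separated strings.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

sentences = ["the cat sat on the mat",
             "a dog chased the cat",
             "stock prices fell sharply today",
             "the market rallied after the news"]

sen_tfidf = TfidfVectorizer().fit_transform(sentences)
topic_num = min(5, len(sentences) // 2)

svd = TruncatedSVD(n_components=topic_num, n_iter=32, random_state=2019)
svd.fit(sen_tfidf.T)         # fit on terms x sentences
res_svd_v = svd.components_  # topics x sentences

# score each sentence by its largest topic loading
scores = {s: res_svd_v[:, i].max() for i, s in enumerate(sentences)}
for sen, score in sorted(scores.items(), key=lambda d: d[1], reverse=True):
    print(round(score, 3), sen)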