Example #1
    def _swap_position(self, char_list, orig_pos):
        # Find the range of contiguous Chinese characters available for swapping
        start_pos = 0
        end_pos = -1
        while orig_pos + start_pos > 0 \
                and check_chinese_char(char_list[orig_pos + start_pos - 1]):
            start_pos -= 1

        while orig_pos + end_pos < len(char_list) - 1 \
                and check_chinese_char(char_list[orig_pos + end_pos + 1]):
            end_pos += 1

        if orig_pos + start_pos == orig_pos + end_pos:
            # Isolated Chinese character: nothing to swap with, return as-is
            return orig_pos

        # Sample the swap offset from a Gaussian distribution: offsets closer
        # to the original position are more likely to be chosen
        while True:
            res = round(np.random.normal(0, self.scale))
            if res == 0:  # a zero offset would swap a character with itself
                continue
            if start_pos <= res <= end_pos:
                break

        return res + orig_pos
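
The rejection loop above biases swaps toward nearby positions. A minimal
standalone sketch (assuming `import numpy as np`; the range values and
`scale = 1.0` are illustrative, not taken from the class) that samples
offsets the same way and tallies them:

import numpy as np
from collections import Counter

scale = 1.0                  # stands in for self.scale
start_pos, end_pos = -3, 2   # an example swappable range around orig_pos

def sample_offset():
    # Rejection-sample a nonzero offset within [start_pos, end_pos],
    # mirroring the loop in _swap_position
    while True:
        res = round(np.random.normal(0, scale))
        if res != 0 and start_pos <= res <= end_pos:
            return res

counts = Counter(sample_offset() for _ in range(10000))
print(sorted(counts.items()))
# offsets of +/-1 dominate; +/-2 and beyond fall off sharply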
Example #2
    def _prepare(self, add_ratio=0.02, delete_ratio=0.02, seed=1):
        orig_char_distribution = char_distribution_loader()
        char_distribution = dict()
        for char, distribution in orig_char_distribution.items():
            is_chinese = check_chinese_char(char)
            # Inserted characters must not be Chinese characters.
            # 1. Special case: drop the distributions of some common punctuation
            #    and common characters, since they easily distort the results.
            # 2. High-frequency symbols occur extremely often; to balance high-
            #    and low-frequency characters, the distribution is smoothed
            #    with an exponential function.
            if not is_chinese and char not in ',:。;“”;…!!??':
                char_distribution.update(
                    {char: np.exp(np.log10(distribution['total_num']))})

        # Normalize counts into a probability distribution for sampling
        total_num = sum(char_distribution.values())
        self.char_keys = list(char_distribution.keys())
        self.char_probs = [count / total_num
                           for count in char_distribution.values()]

        self.add_ratio = add_ratio
        self.delete_ratio = delete_ratio

        self.random = np.random
        self.seed = seed
        if seed != 0:  # seed == 0 leaves the RNG unseeded
            self.random.seed(seed)
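
The normalized `char_keys`/`char_probs` pair is evidently meant for weighted
sampling of insertion characters. A hedged sketch of how the insertion step
might consume it (the method name `_augment_add` is hypothetical; it assumes
`self.add_ratio`, `self.char_keys`, and `self.char_probs` as built above):

import numpy as np

def _augment_add(self, text):
    out = list()
    for char in text:
        out.append(char)
        if np.random.uniform(0, 1) < self.add_ratio:
            # draw a non-Chinese character according to the smoothed,
            # normalized frequency distribution prepared in _prepare
            out.append(np.random.choice(self.char_keys, p=self.char_probs))
    return ''.join(out)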
Example #3
    def _augment_one(self, text):
        char_list = list(text)
        for i in range(len(char_list)):
            if np.random.uniform(0, 1) < self.swap_ratio:
                if not check_chinese_char(char_list[i]):
                    continue  # only Chinese characters are swapped
                change_i = self._swap_position(char_list, i)
                # Swap the character with its Gaussian-sampled partner
                char_list[i], char_list[change_i] = \
                    char_list[change_i], char_list[i]

        return ''.join(char_list)
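
Assuming the surrounding class wires these pieces together with `swap_ratio`
and `scale` set at construction (the class name `SwapCharAugmentation` and its
constructor are hypothetical), usage would look like:

aug = SwapCharAugmentation(swap_ratio=0.1, scale=1.0)
print(aug._augment_one('自然语言处理很有趣'))
# e.g. '自然语言处理有很趣'; adjacent Chinese characters are the most likely
# to be swapped, and non-Chinese characters are left untouched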
Example #4
    def __call__(self,
                 text,
                 summary_length=200,
                 lead_3_weight=1.2,
                 topic_theta=0.2,
                 allow_topic_weight=True):

        # Input validation
        if not isinstance(text, str):
            raise ValueError('type of `text` should only be str')
        try:
            # Lazy initialization on first call
            if self.unk_topic_prominence_value == 0.:
                self._prepare()

            if lead_3_weight < 1:
                raise ValueError(
                    'the parameter `lead_3_weight` should not be less than 1.0')
            if len(text) <= summary_length:
                return text

            # step 0: clean the text
            text = clean_text(text)

            # step 1: split into sentences, cleaning impurities from each
            sentences_list = split_sentence(text)

            # step 2: word segmentation and part-of-speech tagging
            sentences_segs_dict = dict()
            counter_segs_list = list()
            for idx, sen in enumerate(sentences_list):
                if not check_chinese_char(sen):  # skip sentences with no Chinese characters
                    continue

                sen_segs = self.seg.cut(sen)
                # value layout: [sentence index, (word, pos) pairs,
                #                word weights, sentence weight]
                sentences_segs_dict.update({sen: [idx, sen_segs, list(), 0]})
                counter_segs_list.extend(sen_segs)

            # step 3: compute word frequencies
            total_length = len(counter_segs_list)
            freq_dict = dict()
            for word_pos in counter_segs_list:
                word, pos = word_pos
                if word in freq_dict:
                    freq_dict[word][1] += 1
                else:
                    freq_dict.update({word: [pos, 1]})

            # step 4: compute a TF-IDF weight for every word
            for sen, sen_segs in sentences_segs_dict.items():
                sen_segs_weights = list()
                for word_pos in sen_segs[1]:
                    word, pos = word_pos
                    # function words get zero weight
                    if pos not in self.pos_name and word in self.stop_words:
                        weight = 0.0
                    else:
                        weight = freq_dict[word][1] * self.idf_dict.get(
                            word, self.median_idf) / total_length
                    sen_segs_weights.append(weight)

                sen_segs[2] = sen_segs_weights
                # ratio of words with nonzero weight (guard against empty lists)
                sen_segs[3] = len([w for w in sen_segs_weights if w != 0]) / len(sen_segs_weights) \
                    if len(sen_segs_weights) != 0 else 0

            # step 5: compute a weight for every sentence
            for sen, sen_segs in sentences_segs_dict.items():
                # TF-IDF weight: mean of the word weights
                tfidf_weight = sum(sen_segs[2]) / len(sen_segs[2])

                # topic-model weight
                if allow_topic_weight:
                    topic_weight = 0.0
                    for item in sen_segs[1]:
                        topic_weight += self.topic_prominence_dict.get(
                            item[0], self.unk_topic_prominence_value)
                    topic_weight = topic_weight / len(sen_segs[1])
                else:
                    topic_weight = 0.0

                sen_weight = topic_weight * topic_theta + tfidf_weight

                # penalize sentences outside the preferred length range
                if len(sen) < 15 or len(sen) > 70:
                    sen_weight = 0.7 * sen_weight

                # LEAD-3 weight: boost the first three sentences
                if sen_segs[0] < 3:
                    sen_weight *= lead_3_weight

                sen_segs[3] = sen_weight

            # step 6: re-weight sentences with the MMR algorithm,
            # down-weighting redundant ones
            sentences_info_list = sorted(sentences_segs_dict.items(),
                                         key=lambda item: item[1][3],
                                         reverse=True)

            mmr_list = list()
            for sentence_info in sentences_info_list:
                # similarity to sentences already added to the summary pool
                sim_ratio = self._mmr_similarity(sentence_info, mmr_list)
                sentence_info[1][3] = (1 - sim_ratio) * sentence_info[1][3]
                mmr_list.append(sentence_info)

            # step 7: take sentences in order of importance until the
            # summary length budget is reached
            if len(sentences_info_list) == 1:
                return sentences_info_list[0][0]
            total_length = 0
            summary_list = list()
            for idx, item in enumerate(sentences_info_list):
                if len(item[0]) + total_length > summary_length:
                    if idx == 0:
                        return item[0]
                    else:
                        # restore original sentence order
                        summary_list = sorted(summary_list,
                                              key=lambda it: it[1][0])
                        summary = ''.join([item[0] for item in summary_list])
                        return summary
                else:
                    summary_list.append(item)
                    total_length += len(item[0])
                    if idx == len(sentences_info_list) - 1:
                        # restore original sentence order
                        summary_list = sorted(summary_list,
                                              key=lambda it: it[1][0])
                        summary = ''.join([item[0] for item in summary_list])
                        return summary

            return text[:summary_length]
        except Exception as e:
            logging.error('failed to summarize the given text.\n{}'.format(e))
            return ''
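
`_mmr_similarity` is referenced in step 6 but not shown. A plausible minimal
sketch (an assumption, not the library's actual implementation) would measure
the maximum word overlap between the candidate and the sentences already kept:

def _mmr_similarity(self, sentence_info, mmr_list):
    # sentence_info[1][1] holds the (word, pos) pairs produced in step 2
    if not mmr_list:
        return 0.0
    candidate_words = {w for w, _ in sentence_info[1][1]}
    max_sim = 0.0
    for chosen in mmr_list:
        chosen_words = {w for w, _ in chosen[1][1]}
        union = candidate_words | chosen_words
        if union:
            overlap = len(candidate_words & chosen_words)
            max_sim = max(max_sim, overlap / len(union))
    return max_sim

A similarity of 1.0 would zero out a fully redundant sentence in step 6, while
dissimilar sentences keep their original weight.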