Example #1
def multi_threads_correct(text):
    threads_list = []
    text_new = ''
    details = []
    # Normalize encoding: utf-8 bytes to unicode
    text = convert_to_unicode(text)
    # Split the long text into short blocks, keeping symbols as separate blocks
    blocks = bertCorrector.split_2_short_text(text, include_symbol=True)
    try:
        # Merge each text block with the symbol block that follows it
        blocks = [(blocks[i][0] + blocks[i + 1][0], blocks[i][1])
                  for i in range(0, len(blocks), 2)]
    except IndexError:
        # Odd number of segments (no trailing symbol): keep blocks as-is
        pass
    # Prefix each block with the previous block's trailing punctuation
    # (starting with '。') so the model sees one char of sentence context
    punc = '。'
    for blk, start_idx in blocks:
        threads_list.append(MyThread(punc + blk, start_idx))
        punc = blk[-1]
    for thread in threads_list:
        thread.start()

    for thread in threads_list:
        thread.join()
        pred_text, pred_details = thread.get_result()
        text_new += pred_text
        details.extend(pred_details)
    return text_new, details
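
Example #1 depends on a MyThread wrapper that is not shown. A minimal sketch, assuming each thread simply runs the corrector's correct_short (Example #7) on its block and caches the result; the call target and attribute names are inferred from the surrounding examples, not confirmed:

import threading

class MyThread(threading.Thread):
    # Hypothetical implementation of the wrapper used above
    def __init__(self, text, start_idx):
        super().__init__()
        self.text = text
        self.start_idx = start_idx
        self.result = ('', [])

    def run(self):
        # Assumed to delegate to correct_short, which expects the one-char
        # punctuation prefix added by the caller
        self.result = bertCorrector.correct_short(self.text, self.start_idx)

    def get_result(self):
        # Valid after join(): (corrected_text, details)
        return self.result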
Example #2
    def generate_bertScore_sound_shape_file(self, text, right_sentence='', id_lists=None):
        """
        Generate the bert_score / sound_score / shape_score records
        :param text: sentence text
        :param right_sentence: reference (corrected) sentence
        :param id_lists: optional ids written alongside each record
        :return: corrected text and error details; records are written via write2scorefile
        """
        # Avoid the mutable-default-argument pitfall
        id_lists = id_lists if id_lists is not None else []
        text_new = ''
        details = []
        self.check_corrector_initialized()
        # Normalize encoding: utf-8 bytes to unicode
        text = convert_to_unicode(text)
        # Split the long text into short blocks
        blocks = self.split_2_short_text(text, include_symbol=True)
        try:
            # Merge each text block with the symbol block that follows it
            blocks = [(blocks[i][0] + blocks[i + 1][0], blocks[i][1]) for i in range(0, len(blocks), 2)]
        except IndexError:
            pass
        punc = '。'
        if self.is_char_error_detect:
            for blk, start_idx in blocks:
                # Prepend the previous block's trailing punctuation for context;
                # this shifts every in-block index by one
                blk = punc + blk
                blk_new = ''
                for idx, s in enumerate(blk):
                    # Leave non-Chinese characters untouched
                    if is_chinese_string(s):
                        # Skip positions already covered by a recorded error
                        maybe_err = [s, idx, idx + 1, ErrorType.char]
                        if not self._check_contain_details_error(maybe_err, details):
                            sentence_lst = list(blk_new + blk[idx:])
                            sentence_lst[idx] = self.mask
                            sentence_new = ''.join(sentence_lst)
                            predicts = self.model(sentence_new)
                            top_tokens = []
                            ssc_s = self._getSSC(s)
                            for p in predicts:
                                token_id = p.get('token', 0)
                                token_score = p.get('score', 0)
                                token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                                ssc_token = self._getSSC(token_str)
                                soundSimi = computeSoundCodeSimilarity(ssc_s[:4], ssc_token[:4])
                                shapeSimi = computeShapeCodeSimilarity(ssc_s[4:], ssc_token[4:])
                                ssc_similarity = computeSSCSimilarity(ssc_s, ssc_token)
                                top_tokens.append({'bert_score': token_score, 'token_str': token_str,
                                                   'ssc_similar': ssc_similarity, 'sound_similar': soundSimi,
                                                   'shape_similar': shapeSimi})

                            if top_tokens and (s not in [token.get('token_str') for token in top_tokens]):
                                # start_idx + idx - 1 undoes the one-char punctuation offset
                                self.write2scorefile(s, top_tokens, start_idx + idx - 1, id_lists, right_sentence[start_idx + idx - 1])
                                # correct_item = self.ssc_correct_item(s, top_tokens)
                                correct_item = right_sentence[start_idx + idx - 1]
                                if correct_item != s:
                                    details.append([s, correct_item, idx + start_idx, idx + start_idx + 1, ErrorType.char])
                                    s = correct_item
                    blk_new += s
                text_new += blk_new
                punc = blk_new[-1]
        details = sorted(details, key=operator.itemgetter(2))
        return text_new, details
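
The block-merge comprehension used in Examples #1 and #2 pairs each text segment with the symbol segment that follows it. A small illustration, assuming split_2_short_text(include_symbol=True) yields alternating (segment, start_offset) tuples as the code implies; the sample sentence is made up:

blocks = [('今天天气', 0), (',', 4), ('很好', 5), ('。', 7)]
merged = [(blocks[i][0] + blocks[i + 1][0], blocks[i][1])
          for i in range(0, len(blocks), 2)]
print(merged)  # [('今天天气,', 0), ('很好。', 5)]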
Example #3
    def bert_correct_ssc_origin(self, text):
        """
        Correct a sentence using SSC sound-shape codes
        :param text: sentence text
        :return: corrected_text, list[list], [error_word, correct_word, begin_pos, end_pos, error_type]
        """
        text_new = ''
        details = []
        self.check_corrector_initialized()
        # Normalize encoding: utf-8 bytes to unicode
        text = convert_to_unicode(text)
        if self.is_char_error_detect:
            text_new = ""
            for idx, s in enumerate(text):
                # Leave non-Chinese characters untouched
                if is_chinese_string(s):
                    # Skip positions already covered by a recorded error
                    maybe_err = [s, idx, idx + 1, ErrorType.char]
                    if not self._check_contain_details_error(maybe_err, details):
                        # Mask the current char and let the model predict it
                        sentence_lst = list(text_new + text[idx:])
                        sentence_lst[idx] = self.mask
                        sentence_new = ''.join(sentence_lst)
                        predicts = self.model(sentence_new)
                        top_tokens = []
                        ssc_s = self._getSSC(s)
                        for p in predicts:
                            token_id = p.get('token', 0)
                            token_score = p.get('score', 0)
                            token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                            ssc_token = self._getSSC(token_str)
                            soundSimi = computeSoundCodeSimilarity(ssc_s[:4], ssc_token[:4])
                            shapeSimi = computeShapeCodeSimilarity(ssc_s[4:], ssc_token[4:])
                            ssc_similarity = computeSSCSimilarity(ssc_s, ssc_token)
                            top_tokens.append({'bert_score': token_score, 'token_str': token_str,
                                               'ssc_similar': ssc_similarity, 'sound_similar': soundSimi,
                                               'shape_similar': shapeSimi})

                        if top_tokens and (s not in [token.get('token_str') for token in top_tokens]):
                            # correct_item = self.ssc_correct_item(s, top_tokens)
                            correct_item = self.neural_ssc_correct_item(s, top_tokens)
                            if correct_item != s:
                                details.append([s, correct_item, idx, idx + 1, ErrorType.char])
                            s = correct_item
                            # Get all plausible corrections
                            # candidates = self.generate_items(s)
                            # if candidates:
                            #     for token_str in top_tokens:
                            #         if token_str in candidates:
                            #             details.append([s, token_str, idx, idx + 1, ErrorType.char])
                            #             s = token_str
                            #             break
                text_new += s

        details = sorted(details, key=operator.itemgetter(2))
        return text_new, details
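
Examples #3, #5, and #7 delegate the final choice to neural_ssc_correct_item, which is not shown. Purely as a hypothetical sketch, assuming it reranks top_tokens by blending the language-model score with SSC similarity; the weights, threshold, and body below are invented for illustration:

def neural_ssc_correct_item(self, s, top_tokens, lm_weight=0.7, threshold=0.6):
    # Hypothetical reranker; the real method may differ substantially.
    best = max(top_tokens,
               key=lambda t: lm_weight * t['bert_score']
                             + (1 - lm_weight) * t['ssc_similar'])
    # Keep the original char unless the winner is also similar in
    # sound/shape, guarding against unrelated high-probability tokens.
    return best['token_str'] if best['ssc_similar'] >= threshold else s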
Example #4
    def detect(self, sentence):
        """
        Detect suspected errors in a sentence: [word, position, error type]
        :param sentence: sentence text
        :return: list[list], [error_word, begin_pos, end_pos, error_type]
        """
        maybe_errors = []
        if not sentence.strip():
            return maybe_errors
        # Initialization
        self.check_detector_initialized()
        # Normalize encoding: utf-8 bytes to unicode
        sentence = convert_to_unicode(sentence)
        # Text normalization
        sentence = uniform(sentence)
        # Tokenize
        tokens = self.tokenizer.tokenize(sentence)

        # Add hits from the custom confusion set as suspected errors
        for confuse in self.custom_confusion:
            idx = sentence.find(confuse)
            if idx > -1:
                maybe_err = [confuse, idx, idx + len(confuse), ErrorType.confusion]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_word_error_detect:
            # Add out-of-vocabulary words as suspected errors
            for word, begin_idx, end_idx in tokens:
                # Skip filtered tokens
                if self.is_filter_token(word):
                    continue
                # Token is present in the word-frequency dict
                if word in self.word_freq:
                    # Multi-char words and common single chars can be skipped;
                    # rare single chars (frequency below 10000) are suspicious
                    if len(word) == 1 and word in self.char_freq and self.char_freq.get(word) < 10000:
                        maybe_err = [word, begin_idx, end_idx, ErrorType.word_char]
                        self._add_maybe_error_item(maybe_err, maybe_errors)
                        continue
                    # Repeated character: possibly a redundant char
                    # (guard begin_idx > 0 so index -1 does not wrap around)
                    if len(word) == 1 and begin_idx > 0 and sentence[begin_idx - 1] == word:
                        maybe_err = [word, begin_idx, end_idx, ErrorType.redundancy]
                        self._add_maybe_error_item(maybe_err, maybe_errors)
                    continue

                # Fragment single chars: possibly redundant, missing, or wrong
                if len(word) == 1:
                    maybe_err = [word, begin_idx, end_idx, ErrorType.word_char]
                    self._add_maybe_error_item(maybe_err, maybe_errors)
                    continue
                # maybe_err = [word, begin_idx, end_idx, ErrorType.word]
                # self._add_maybe_error_item(maybe_err, maybe_errors)
        return sorted(maybe_errors, key=lambda k: k[1])
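
A usage sketch for detect; the instance name is arbitrary and the concrete spans depend on the tokenizer and frequency dictionaries, so the output shown is only indicative:

errors = detector.detect('我今天吃苹果果')
# e.g. [['果', 6, 7, ErrorType.redundancy]]
for word, begin_idx, end_idx, err_type in errors:
    print(word, begin_idx, end_idx, err_type)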
Example #5
    def bert_correct_ssc(self, text):
        """
        Correct a sentence using SSC sound-shape codes
        :param text: sentence text
        :return: corrected_text, list[list], [error_word, correct_word, begin_pos, end_pos, error_type]
        """
        text_new = ''
        details = []
        self.check_corrector_initialized()
        # Normalize encoding: utf-8 bytes to unicode
        text = convert_to_unicode(text)
        # Split the long text into short blocks
        blocks = self.split_2_short_text(text, include_symbol=True)
        # blocks = self.split_2_short_text(text)
        if self.is_word_error_detect:
            # Word-level detection is not handled in this variant
            pass

        if self.is_char_error_detect:
            for blk, start_idx in blocks:
                blk_new = ''
                for idx, s in enumerate(blk):
                    # Leave non-Chinese characters untouched
                    if is_chinese_string(s):
                        # Mask the current char and let the model predict it
                        sentence_lst = list(blk_new + blk[idx:])
                        sentence_lst[idx] = self.mask
                        sentence_new = ''.join(sentence_lst)
                        predicts = self.model(sentence_new)
                        top_tokens = []
                        ssc_s = self._getSSC(s)
                        for p in predicts:
                            token_id = p.get('token', 0)
                            token_score = p.get('score', 0)
                            token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                            ssc_token = self._getSSC(token_str)
                            soundSimi = computeSoundCodeSimilarity(ssc_s[:4], ssc_token[:4])
                            shapeSimi = computeShapeCodeSimilarity(ssc_s[4:], ssc_token[4:])
                            ssc_similarity = computeSSCSimilarity(ssc_s, ssc_token)
                            top_tokens.append({'bert_score': token_score, 'token_str': token_str,
                                               'ssc_similar': ssc_similarity, 'sound_similar': soundSimi,
                                               'shape_similar': shapeSimi})

                        if top_tokens and (s not in [token.get('token_str') for token in top_tokens]):
                            # correct_item = self.ssc_correct_item(s, top_tokens)
                            correct_item = self.neural_ssc_correct_item(s, top_tokens)
                            if correct_item != s:
                                details.append([s, correct_item, idx + start_idx, idx + start_idx + 1, ErrorType.char])
                                s = correct_item
                    blk_new += s
                text_new += blk_new

        details = sorted(details, key=operator.itemgetter(2))
        return text_new, details
Example #6
    def electra_correct(self, text):
        """
        Sentence correction
        :param text: sentence text
        :return: corrected_text, list[list], [error_word, correct_word, begin_pos, end_pos]
        """
        text_new = ''
        details = []
        # Normalize encoding: utf-8 bytes to unicode
        text = convert_to_unicode(text)
        # Split the long text into short blocks
        blocks = self.split_2_short_text(text, include_symbol=True)
        for blk, start_idx in blocks:
            # Discriminator flags suspicious positions in the block
            error_ids = self.electra_detect(blk)
            sentence_lst = list(blk)
            for idx in error_ids:
                s = sentence_lst[idx]
                if is_chinese_string(s):
                    # Handle Chinese-character errors
                    sentence_lst[idx] = self.mask
                    sentence_new = ''.join(sentence_lst)
                    # Generator fill-mask predicts [MASK], top-5 by default
                    predicts = self.g_model(sentence_new)
                    top_tokens = []
                    for p in predicts:
                        token_id = p.get('token', 0)
                        token_str = self.g_model.tokenizer.convert_ids_to_tokens(
                            token_id)
                        top_tokens.append(token_str)

                    if top_tokens and (s not in top_tokens):
                        # Get all plausible corrections
                        candidates = self.generate_items(s)
                        if candidates:
                            for token_str in top_tokens:
                                if token_str in candidates:
                                    details.append([
                                        s, token_str, start_idx + idx,
                                        start_idx + idx + 1
                                    ])
                                    sentence_lst[idx] = token_str
                                    break
                    # Restore the original char if no candidate was accepted
                    if sentence_lst[idx] == self.mask:
                        sentence_lst[idx] = s

            blk_new = ''.join(sentence_lst)
            text_new += blk_new
        details = sorted(details, key=operator.itemgetter(2))
        return text_new, details
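
The way these examples call self.model(...) / self.g_model(...) and read p.get('token') / p.get('score'), then map ids back via tokenizer.convert_ids_to_tokens, matches the Hugging Face fill-mask pipeline interface. One plausible way such a model could be constructed; the checkpoint name is an assumption, and top_k requires a reasonably recent transformers version:

from transformers import pipeline

g_model = pipeline('fill-mask', model='bert-base-chinese', top_k=5)
predicts = g_model('今天天气很[MASK]。')
# Each prediction is a dict with 'token' (vocab id), 'score', 'token_str'
mask_token = g_model.tokenizer.mask_token  # '[MASK]', i.e. self.mask above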
Example #7
    def correct_short(self, text, start_idx=0):
        """
        Correct a short text block. `text` is expected to carry a one-char
        punctuation prefix (see multi_threads_correct in Example #1); the
        prefix is stripped from the returned text and the reported
        positions are shifted back accordingly.
        """
        text_new = ''
        details = []
        self.check_corrector_initialized()
        # Normalize encoding: utf-8 bytes to unicode
        text = convert_to_unicode(text)
        if self.is_word_error_detect:
            maybe_errors = self.detect(text)
            # Trick: process in reverse order, as in translation models,
            # so earlier replacements do not invalidate later offsets
            maybe_errors = sorted(maybe_errors, key=operator.itemgetter(2), reverse=True)
            for cur_item, begin_idx, end_idx, err_type in maybe_errors:
                # Correct errors one by one
                before_sent = text[:begin_idx]
                after_sent = text[end_idx:]

                # Leave non-Chinese items untouched
                if not is_chinese_string(cur_item):
                    continue
                # Words from the custom confusion set: take the mapped result directly
                if err_type == ErrorType.confusion:
                    corrected_item = (self.custom_confusion[cur_item], ErrorType.confusion)
                # Fragment, uncommon single chars: possibly redundant or missing chars
                elif err_type == ErrorType.word_char:
                    maybe_right_items = self.generate_items_word_char(cur_item, before_sent, after_sent, begin_idx, end_idx)
                    corrected_item = self.lm_correct_item(cur_item, maybe_right_items, before_sent, after_sent)
                # Redundant char
                elif err_type == ErrorType.redundancy:
                    maybe_right_items = [('', ErrorType.redundancy)]
                    corrected_item = self.lm_correct_item(cur_item, maybe_right_items, before_sent, after_sent)
                else:
                    # Other error types are not handled here; skip so
                    # corrected_item is never left unbound
                    continue

                # output
                if corrected_item[0] != cur_item:
                    text = before_sent + corrected_item[0] + after_sent
                    detail_word = [cur_item, corrected_item[0], start_idx + begin_idx, start_idx + end_idx, corrected_item[1]]
                    details.append(detail_word)

        if self.is_char_error_detect:
            for idx, s in enumerate(text):
                # Leave non-Chinese characters untouched
                if is_chinese_string(s):
                    sentence_lst = list(text_new + text[idx:])
                    sentence_lst[idx] = self.mask
                    sentence_new = ''.join(sentence_lst)
                    predicts = self.model(sentence_new)
                    top_tokens = []
                    ssc_s = self._getSSC(s)
                    for p in predicts:
                        token_id = p.get('token', 0)
                        token_score = p.get('score', 0)
                        token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                        ssc_token = self._getSSC(token_str)
                        soundSimi = computeSoundCodeSimilarity(ssc_s[:4], ssc_token[:4])
                        shapeSimi = computeShapeCodeSimilarity(ssc_s[4:], ssc_token[4:])
                        ssc_similarity = computeSSCSimilarity(ssc_s, ssc_token)
                        top_tokens.append({'bert_score': token_score, 'token_str': token_str,
                                           'ssc_similar': ssc_similarity, 'sound_similar': soundSimi,
                                           'shape_similar': shapeSimi})

                    if top_tokens and (s not in [token.get('token_str') for token in top_tokens]):
                        # correct_item = self.ssc_correct_item(s, top_tokens)
                        correct_item = self.neural_ssc_correct_item(s, top_tokens)
                        if correct_item != s:
                            # The -1 compensates for the one-char prefix
                            # details.append([s, correct_item, idx + start_idx, idx + start_idx + 1, ErrorType.char])
                            details.append([s, correct_item, idx + start_idx - 1, idx + start_idx, ErrorType.char])
                            s = correct_item
                text_new += s
        details = sorted(details, key=operator.itemgetter(2))
        # Strip the prepended context character
        return text_new[1:], details
Example #8
    def bert_correct(self, text):
        """
        Sentence correction
        :param text: sentence text
        :return: corrected_text, list[list], [error_word, correct_word, begin_pos, end_pos, error_type]
        """
        text_new = ''
        details = []
        self.check_corrector_initialized()
        # Normalize encoding: utf-8 bytes to unicode
        text = convert_to_unicode(text)
        if self.is_word_error_detect:
            maybe_errors = self.detect(text)
            # Trick: process in reverse order, as in translation models,
            # so earlier replacements do not invalidate later offsets
            maybe_errors = sorted(maybe_errors, key=operator.itemgetter(2), reverse=True)
            for cur_item, begin_idx, end_idx, err_type in maybe_errors:
                # Correct errors one by one
                before_sent = text[:begin_idx]
                after_sent = text[end_idx:]

                # Leave non-Chinese items untouched
                if not is_chinese_string(cur_item):
                    continue
                # Words from the custom confusion set: take the mapped result directly
                if err_type == ErrorType.confusion:
                    corrected_item = (self.custom_confusion[cur_item], ErrorType.confusion)
                # Fragment, uncommon single chars: possibly redundant or missing chars
                elif err_type == ErrorType.word_char:
                    maybe_right_items = self.generate_items_word_char(cur_item, before_sent, after_sent, begin_idx, end_idx)
                    corrected_item = self.lm_correct_item(cur_item, maybe_right_items, before_sent, after_sent)
                # Redundant char
                elif err_type == ErrorType.redundancy:
                    maybe_right_items = [('', ErrorType.redundancy)]
                    corrected_item = self.lm_correct_item(cur_item, maybe_right_items, before_sent, after_sent)
                elif err_type == ErrorType.word:
                    # Get all plausible corrections
                    candidates = self.generate_items(cur_item)
                    if not candidates:
                        continue
                    candidates = [(item, ErrorType.word) for item in candidates]
                    corrected_item = self.lm_correct_item(cur_item, candidates, before_sent, after_sent)
                    # Second-pass check for ErrorType.word errors:
                    # re-examine multi-char results that are still OOV
                    if len(corrected_item[0]) > 2 and corrected_item[0] not in self.word_freq:
                        candidates = self.generate_items_for_word(corrected_item[0])
                        if not candidates:
                            continue
                        candidates = [(item, ErrorType.word) for item in candidates]
                        corrected_item = self.lm_correct_item(corrected_item[0], candidates, before_sent, after_sent)

                # output
                if corrected_item[0] != cur_item:
                    text = before_sent + corrected_item[0] + after_sent
                    detail_word = [cur_item, corrected_item[0], begin_idx, end_idx, corrected_item[1]]
                    details.append(detail_word)

        if self.is_char_error_detect:
            text_new = ""
            for idx, s in enumerate(text):
                # Leave non-Chinese characters untouched
                if is_chinese_string(s):
                    # Skip positions already covered by a recorded error
                    maybe_err = [s, idx, idx + 1, ErrorType.char]
                    if not self._check_contain_details_error(maybe_err, details):
                        sentence_lst = list(text_new + text[idx:])
                        sentence_lst[idx] = self.mask
                        sentence_new = ''.join(sentence_lst)
                        predicts = self.model(sentence_new)
                        top_tokens = []
                        for p in predicts:
                            token_id = p.get('token', 0)
                            token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                            top_tokens.append(token_str)

                        if top_tokens and (s not in top_tokens):
                            # Get all plausible corrections
                            candidates = self.generate_items(s)
                            if candidates:
                                for token_str in top_tokens:
                                    if token_str in candidates:
                                        details.append([s, token_str, idx, idx + 1, ErrorType.char])
                                        s = token_str
                                        break
                text_new += s

        details = sorted(details, key=operator.itemgetter(2))
        return text_new, details
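
An end-to-end usage sketch; the class name and construction are assumptions based on the method names, and the sample sentence and output are illustrative only:

corrector = BertCorrector()  # hypothetical class hosting the methods above
text_new, details = corrector.bert_correct('少先队员因该为老人让坐')
print(text_new)  # corrected text
print(details)   # [[error_word, correct_word, begin_pos, end_pos, error_type], ...]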