Example #1
def detect(sentence):
    maybe_error_indices = set()
    # Normalize the text
    sentence = uniform(sentence)
    # Tokenize
    tokens = tokenize(sentence)
    # Add out-of-vocabulary words to the suspected-error set
    for word, begin_idx, end_idx in tokens:
        if word not in PUNCTUATION_LIST and word not in word_freq:
            for i in range(begin_idx, end_idx):
                maybe_error_indices.add(i)
    # Use the character language model to flag suspected wrong characters
    ngram_avg_scores = []
    for n in [2, 3]:
        scores = []
        for i in range(len(sentence) - n + 1):
            word = sentence[i:i + n]
            score = get_ngram_score(list(word), mode=trigram_char)
            scores.append(score)
        if not scores:
            continue
        # Pad both ends so the moving window covers every character
        for _ in range(n - 1):
            scores.insert(0, scores[0])
            scores.append(scores[-1])
        avg_scores = [sum(scores[i:i + n]) / len(scores[i:i + n]) for i in range(len(sentence))]
        ngram_avg_scores.append(avg_scores)
    # Average the n-gram scores across window sizes
    sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
    maybe_error_char_indices = _get_maybe_error_index(sent_scores)
    # Merge character- and word-level suspects
    maybe_error_indices |= set(maybe_error_char_indices)
    return sorted(maybe_error_indices)
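
The moving-window scoring used above is easier to follow in isolation. Below is a minimal sketch of the same idea; toy_score is a hypothetical stand-in for the real get_ngram_score language-model call, and everything else mirrors the loop above.

import numpy as np

def toy_score(chars):
    # hypothetical scorer; the real code queries an n-gram language model
    return float(len(set(chars)))

def char_scores(sentence, orders=(2, 3)):
    ngram_avg_scores = []
    for n in orders:
        # one raw score per n-character window
        scores = [toy_score(sentence[i:i + n]) for i in range(len(sentence) - n + 1)]
        if not scores:
            continue
        # pad both ends so every character is covered by exactly n windows
        scores = [scores[0]] * (n - 1) + scores + [scores[-1]] * (n - 1)
        # per-character score = mean of the n windows touching that character
        ngram_avg_scores.append([sum(scores[i:i + n]) / n for i in range(len(sentence))])
    if not ngram_avg_scores:
        return []
    # average across the n-gram orders
    return list(np.average(np.array(ngram_avg_scores), axis=0))
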
Example #2
    def ccm_sort(self, sentence):
        """
        """
        # 加载排序词典
        name_model = self.load_ccm_word_freq_dict(self.name_sort_path)
        maybe_errors = []
        if not sentence.strip():
            return maybe_errors
        # Normalize the text
        sentence = uniform(sentence)
        # Tokenize
        tokens = self.tokenizer.tokenize(sentence)
        print(tokens)
        temp = None  # rank value of the previous name seen
        error_list = []
        correct_list = []
        new = []  # working copy of the token strings
        i = -1    # token index; incremented at the top of the loop
        for word, begin_idx, end_idx in tokens:
            new.append(word)
            i += 1
            if word in LINK_WORD:
                temp = None  # a linking word resets the comparison chain
            if name_model.get(word):
                if not temp:
                    temp = name_model.get(word)
                    continue
                else:
                    if temp > name_model.get(word):
                        # out of rank order: swap with the name two tokens
                        # back (names are separated by one delimiter token)
                        p = tokens[i]
                        tokens[i] = tokens[i - 2]
                        tokens[i - 2] = p
                        print(tokens[i][0])
                        print(tokens[i - 2][0])
                        correct_list.append((tokens[i][0], i))
                        correct_list.append((tokens[i - 2][0], i - 2))
                        error_list.append((tokens[i][0], i))
        # Apply the swaps to the working copy of the tokens
        for word, p in correct_list:
            new[p] = word
        print(new)
        print("ls:" + str(correct_list))
        correct = ''.join(new)
        print("correct:" + correct)
        # NOTE: maybe_errors is never populated in this routine
        return sorted(maybe_errors, key=lambda k: k[1], reverse=False)
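
The rank-compare-and-swap at the heart of ccm_sort, reduced to a toy example. The names and rank values below are made up, lower rank is assumed to sort first, and (unlike the snippet above) the remembered rank is refreshed after each swap.

rank = {"张三": 1, "李四": 2, "王五": 3}  # made-up ranks; lower sorts first
tokens = ["李四", "、", "张三", "、", "王五"]

prev = None
for i, tok in enumerate(tokens):
    r = rank.get(tok)
    if r is None:
        continue  # delimiters and unknown words take no part
    if prev is not None and prev > r:
        # out of rank order: swap with the name two positions back
        tokens[i], tokens[i - 2] = tokens[i - 2], tokens[i]
    prev = rank[tokens[i]]

print("".join(tokens))  # 张三、李四、王五
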
Example #3
 def detect(self, text):
     maybe_errors = []
     if not text.strip():
         return maybe_errors
     # Initialize the detector
     self.check_detector_initialized()
     # Unify the encoding: UTF-8 bytes to unicode
     text = convert_to_unicode(text)
     # Normalize the text
     text = uniform(text)
     # Split long text into short blocks
     blocks = self.split_2_short_text(text)
     for blk, idx in blocks:
         maybe_errors += self.detect_short(blk, idx)
     return maybe_errors
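
split_2_short_text is not shown in this snippet. A plausible shape for it (an assumption, not the original implementation) pairs each block with its global start offset, so that detect_short can report absolute positions:

import re

def split_2_short_text(text):
    # split on sentence-ending punctuation, keeping global offsets
    result = []
    start = 0
    for blk in re.split(r'([。!?!?;;\n]+)', text):
        if blk:
            result.append((blk, start))
        start += len(blk)
    return result
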
Example #4
def detect(sentence):
    maybe_error_indices = set()
    # Normalize the text
    sentence = uniform(sentence)
    # Tokenize
    tokens = tokenize(sentence)
    # Add out-of-vocabulary words to the suspected-error set
    for word, begin_idx, end_idx in tokens:
        # fixed: skip alphanumeric tokens
        if word.isalnum(): continue
        # skip punctuation
        if word in PUNCTUATION_LIST: continue
        # skip words already in the dictionary
        if word in word_freq: continue
        for i in range(begin_idx, end_idx):
            maybe_error_indices.add(i)
    # Use the language model to flag suspected wrong characters
    ngram_avg_scores = []
    try:
        for n in [2, 3]:
            scores = []
            for i in range(len(sentence) - n + 1):
                word = sentence[i:i + n]
                score = get_ngram_score(list(word), mode=trigram_char)
                scores.append(score)
            if not scores: continue
            # Pad both ends so the moving window covers every character
            for _ in range(n - 1):
                scores.insert(0, scores[0])
                scores.append(scores[-1])
            avg_scores = [
                sum(scores[i:i + n]) / len(scores[i:i + n])
                for i in range(len(sentence))
            ]
            ngram_avg_scores.append(avg_scores)

        # Average the n-gram scores across window sizes
        sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
        maybe_error_char_indices = _get_maybe_error_index(sent_scores)
        # Merge character- and word-level suspects
        maybe_error_indices |= set(maybe_error_char_indices)
    except IndexError as ie:
        default_logger.warn("index error, sentence:" + sentence + str(ie))
    except Exception as e:
        default_logger.warn("detect error, sentence:" + sentence + str(e))
    return sorted(maybe_error_indices)
Example #5
 def detect(self, sentence):
     maybe_errors = []
     if not sentence.strip():
         return maybe_errors
     # Initialize the detector
     self.check_detector_initialized()
     # Normalize the text
     sentence = uniform(sentence)
     # Split the long sentence into short blocks
     blocks = re_han.split(sentence)
     start_idx = 0
     for blk in blocks:
         if not blk:
             continue
         if re_han.match(blk):
             maybe_errors += self._detect_short(blk, start_idx)
         start_idx += len(blk)
     return maybe_errors
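
re_han is defined elsewhere in the module; a plausible definition (an assumption) is a regex with a capturing group over Han characters, so that split keeps the non-matching delimiters and the running offset stays exact:

import re

re_han = re.compile(r"([\u4e00-\u9fa5]+)", re.U)  # assumed definition

sentence = "这是中文, then English, 再中文。"
start_idx = 0
for blk in re_han.split(sentence):
    if blk and re_han.match(blk):
        print(start_idx, blk)  # Han blocks get checked; others only advance the offset
    start_idx += len(blk)
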
Example #6
def detect(sentence):
    maybe_error_indices = set()
    # Normalize the text
    sentence = uniform(sentence)
    # Tokenize
    tokens = tokenize(sentence)
    # Add out-of-vocabulary words to the suspected-error set
    for word, begin_idx, end_idx in tokens:
        if word not in PUNCTUATION_LIST and word not in word_freq:
            for i in range(begin_idx, end_idx):
                maybe_error_indices.add(i)
    # Use the language model to flag suspected wrong characters
    ngram_avg_scores = []
    try:
        for n in [2, 3]:
            scores = []
            for i in range(len(sentence) - n + 1):
                word = sentence[i:i + n]
                score = get_ngram_score(list(word), mode=trigram_char)
                scores.append(score)
            # Pad both ends so the moving window covers every character
            for _ in range(n - 1):
                scores.insert(0, scores[0])
                scores.append(scores[-1])
            avg_scores = [sum(scores[i:i + n]) / len(scores[i:i + n]) for i in range(len(sentence))]
            ngram_avg_scores.append(avg_scores)

        # Average the n-gram scores across window sizes
        sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
        maybe_error_char_indices = _get_maybe_error_index(sent_scores)
        # Merge character- and word-level suspects
        maybe_error_indices |= set(maybe_error_char_indices)
    except IndexError as ie:
        print("index error, sentence:", sentence, ie)
    except Exception as e:
        print("detect error, sentence:", sentence, e)
    return sorted(maybe_error_indices)
Example #7
 def name_job(self, sentence):
     """
     """
     # 加载人名-职务词典
     job_model = self.load_ccm_job_freq_dict(self.leader_job_path)
     print(job_model)
     maybe_errors = []
     if not sentence.strip():
         return maybe_errors
     # Normalize the text
     sentence = uniform(sentence)
     # Tokenize
     tokens = self.tokenizer.tokenize(sentence)
     print(tokens)
     error_list = []
     correct_list = []
     new = []
     i = 0  # index of the current token
     j = 0  # start of the scan window before the current name
     for word, begin_idx, end_idx in tokens:
         if job_model.get(word):
             print(i)  # a name was found; i is its token index
             a = job_model.get(word)
             front = a.get('1')
             temp_list = []
             for x in range(j, i):  # scan tokens between the previous name and this one
                 if self.leader_job_freq_dict.get(tokens[x][0]):
                     if tokens[x][0] not in front:
                         temp_list.append(tokens[x][0])
             if temp_list:
                 error_list.append({word: temp_list})
             j = i + 1  # move the scan start just past this name
         i += 1
     print(error_list)
     return error_list
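
The dictionary shape name_job expects can be read off the code: job_model maps a person's name to a dict whose '1' key lists the job titles allowed in front of that name. A toy run with made-up data:

job_model = {"张三": {"1": ["市长", "书记"]}}  # made-up entry
title_vocab = {"市长", "书记", "县长"}          # stands in for leader_job_freq_dict

tokens = ["县长", "张三"]
error_list = []
for i, tok in enumerate(tokens):
    entry = job_model.get(tok)
    if entry:
        front = entry.get("1")
        # titles seen before the name that are not in the allowed list
        bad = [t for t in tokens[:i] if t in title_vocab and t not in front]
        if bad:
            error_list.append({tok: bad})
print(error_list)  # [{'张三': ['县长']}]
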
Example #8
    def detect(self, sentence):
        """
        检测句子中的疑似错误信息,包括[词、位置、错误类型]
        :param sentence:
        :return: list[list], [error_word, begin_pos, end_pos, error_type]
        """
        maybe_errors = []
        if not sentence.strip():
            return maybe_errors
        # Initialize the detector
        self.check_detector_initialized()
        # Normalize the text
        sentence = uniform(sentence)
        # Tokenize
        tokens = self.tokenizer.tokenize(sentence)
        # Add custom confusion-set entries to the suspected errors
        for confuse in self.custom_confusion:
            idx = sentence.find(confuse)
            if idx > -1:
                maybe_err = [confuse, idx, idx + len(confuse), ErrorType.confusion]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_word_error_detect:
            # Add out-of-vocabulary words to the suspected errors
            for word, begin_idx, end_idx in tokens:
                # skip filtered tokens
                if self.is_filter_token(word):
                    continue
                # skip words already in the dictionary
                if word in self.word_freq:
                    continue
                maybe_err = [word, begin_idx, end_idx, ErrorType.word]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_char_error_detect:
            # Use the language model to flag suspected wrong characters
            if self.enable_rnnlm:
                scores = self.char_scores(sentence)
                # Collect the suspected wrong characters
                for i in self._get_maybe_error_index_by_rnnlm(scores):
                    token = sentence[i]
                    # skip filtered tokens
                    if self.is_filter_token(token):
                        continue
                    maybe_err = [token, i, i + 1, ErrorType.char]  # token, begin_idx, end_idx, error_type
                    self._add_maybe_error_item(maybe_err, maybe_errors)
            else:
                try:
                    ngram_avg_scores = []
                    for n in [2, 3]:
                        scores = []
                        for i in range(len(sentence) - n + 1):
                            word = sentence[i:i + n]
                            score = self.ngram_score(list(word))
                            scores.append(score)
                        if not scores:
                            continue
                        # Pad both ends so the moving window covers every character
                        for _ in range(n - 1):
                            scores.insert(0, scores[0])
                            scores.append(scores[-1])
                        avg_scores = [sum(scores[i:i + n]) / len(scores[i:i + n]) for i in range(len(sentence))]
                        ngram_avg_scores.append(avg_scores)

                    # Average the n-gram scores across window sizes
                    sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
                    # Collect the suspected wrong characters
                    for i in self._get_maybe_error_index(sent_scores):
                        token = sentence[i]
                        # skip filtered tokens
                        if self.is_filter_token(token):
                            continue
                        maybe_err = [token, i, i + 1, ErrorType.char]  # token, begin_idx, end_idx, error_type
                        self._add_maybe_error_item(maybe_err, maybe_errors)
                except IndexError as ie:
                    logger.warn("index error, sentence:" + sentence + str(ie))
                except Exception as e:
                    logger.warn("detect error, sentence:" + sentence + str(e))
        return sorted(maybe_errors, key=lambda k: k[1], reverse=False)
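
A hedged usage sketch, assuming a pycorrector-style Detector class is importable as below (the import path and constructor arguments may differ between versions):

from pycorrector.detector import Detector

d = Detector()
errors = d.detect('少先队员因该为老人让座')
# expected shape: [[error_word, begin_pos, end_pos, error_type], ...]
for word, begin, end, err_type in errors:
    print(word, begin, end, err_type)
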
Example #9
    def detect(self, sentence):
        """
        检测句子中的疑似错误信息,包括[词、位置、错误类型]
        :param sentence:
        :return: [error_word, begin_pos, end_pos, error_type]
        """
        maybe_errors = []
        if not sentence.strip():
            return maybe_errors
        self.check_detector_initialized()
        # Normalize the text
        sentence = uniform(sentence)
        # Tokenize
        tokens = self.tokenizer.tokenize(sentence)
        # Add custom confusion-set entries to the suspected errors
        for confuse in self.custom_confusion:
            idx = sentence.find(confuse)
            if idx > -1:
                maybe_err = [
                    confuse, idx, idx + len(confuse), error_type["confusion"]
                ]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_word_error_detect:
            # Add out-of-vocabulary words to the suspected errors
            for word, begin_idx, end_idx in tokens:
                # skip blanks
                if not word.strip():
                    continue
                # skip punctuation
                if word in PUNCTUATION_LIST:
                    continue
                # skip pure numbers
                if word.isdigit():
                    continue
                # skip pure alphabetic strings
                if is_alphabet_string(word.lower()):
                    continue
                # skip words already in the dictionary
                if word in self.word_freq:
                    continue
                maybe_err = [word, begin_idx, end_idx, error_type["word"]]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_char_error_detect:
            # Use the language model to flag suspected wrong characters
            ngram_avg_scores = []
            try:
                for n in [2, 3]:
                    scores = []
                    for i in range(len(sentence) - n + 1):
                        word = sentence[i:i + n]
                        score = self.ngram_score(list(word))
                        scores.append(score)
                    if not scores:
                        continue
                    # Pad both ends so the moving window covers every character
                    for _ in range(n - 1):
                        scores.insert(0, scores[0])
                        scores.append(scores[-1])
                    avg_scores = [
                        sum(scores[i:i + n]) / len(scores[i:i + n])
                        for i in range(len(sentence))
                    ]
                    ngram_avg_scores.append(avg_scores)

                # Average the n-gram scores across window sizes
                sent_scores = list(
                    np.average(np.array(ngram_avg_scores), axis=0))
                # Collect the suspected wrong characters
                for i in self._get_maybe_error_index(sent_scores):
                    maybe_err = [sentence[i], i, i + 1, error_type["char"]]
                    self._add_maybe_error_item(maybe_err, maybe_errors)
            except IndexError as ie:
                logger.warn("index error, sentence:" + sentence + str(ie))
            except Exception as e:
                logger.warn("detect error, sentence:" + sentence + str(e))
        return sorted(maybe_errors, key=lambda k: k[1], reverse=False)
Example #10
def detect(sentence):
    maybe_error_indices = set()

    sentence = uniform(sentence)

    tokens = tokenize(sentence)

    # add out-of-vocabulary words to the suspected-error set
    for word, begin_idx, end_idx in tokens:
        if word not in PUNCTUATION_LIST and word not in word_freq:
            for i in range(begin_idx, end_idx):
                maybe_error_indices.add(i)

    ngram_avg_scores = []
    try:
        for n in [1, 2, 3]:
            scores = []
            for i in range(len(sentence) - n + 1):
                word = sentence[i:i + n]
                score = get_ngram_score(list(word), mode=trigram_char)
                scores.append(score)

            for _ in range(n - 1):
                scores.insert(0, scores[0])
                scores.append(scores[-1])

            avg_scores = [
                sum(scores[i:i + n]) / len(scores[i:i + n])
                for i in range(len(sentence))
            ]
            ngram_avg_scores.append(avg_scores)

        sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
        maybe_error_char_indices = _get_maybe_error_index(sent_scores)

        maybe_error_indices |= set(maybe_error_char_indices)
    except IndexError as ie:
        print("index error, sentence:", sentence, ie)
    except Exception as e:
        print("detect error, sentence:", sentence, e)

    # strip proper nouns (names, organizations, places) from the suspects
    seg = pseg.lcut(sentence)
    # pseg yields pairs of (word, POS flag)
    word = [w.word for w in seg]
    tag = [w.flag for w in seg]

    for i in range(len(tag)):
        if tag[i] in {'nz', 'nr', 'nt', 'ns'}:
            # keep the suspect if the proper noun follows an adverb
            if i > 0 and tag[i - 1] == 'd':
                continue
            if len(word[i]) > 1:
                # multi-character proper noun: drop its character indices
                maybe_error_indices -= set(range(len(''.join(word[:i])),
                                                 len(''.join(word[:i + 1]))))
            elif i + 1 < len(tag) and tag[i + 1] in {'nz', 'nr', 'nt', 'ns'}:
                # two adjacent single-character proper nouns: drop both
                maybe_error_indices -= set(range(len(''.join(word[:i])),
                                                 len(''.join(word[:i + 2]))))
    return sorted(maybe_error_indices)
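
The POS filter at the end relies on jieba's part-of-speech tagger; pseg is presumably jieba.posseg. A quick look at what it yields:

import jieba.posseg as pseg

for w in pseg.lcut("李小明去了北京天安门"):
    print(w.word, w.flag)  # e.g. nr = person name, ns = place name, d = adverb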