Пример #1
0
def load_same_pinyin(path, sep='\t'):
    """
    Load the homophone (same-pinyin) confusion table.

    Each non-comment line maps one key character to characters sharing its
    pronunciation, e.g. "佑<sep>右祐<sep>侑又" (extra columns hold the
    same-tone and different-tone groups; all of them are merged).

    :param path: path to the homophone data file (UTF-8 text)
    :param sep: column separator, tab by default
    :return: dict mapping a single character to the set of its homophone
             characters; empty dict when the file does not exist
    """
    result = dict()
    if not os.path.exists(path):
        # logger.warn is deprecated in the stdlib logging API
        logger.warning("file not exists:" + path)
        return result
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # skip comment lines
            if line.startswith('#'):
                continue
            parts = line.split(sep)
            # require a key column plus at least two value columns
            if parts and len(parts) > 2:
                key_char = parts[0]
                # merge every character of every value column into one set
                value = set()
                for part in parts[1:]:
                    value.update(part)
                # keep only single-character keys with a non-empty value set
                if len(key_char) > 1 or not value:
                    continue
                result[key_char] = value
    return result
Пример #2
0
def load_same_stroke(path, sep='\t'):
    """
    Load the similar-shape (similar-stroke) character table.

    Each non-comment line maps one key character to a string of visually
    similar characters, e.g. "末<sep>未".

    :param path: path to the similar-stroke data file (UTF-8 text)
    :param sep: column separator, tab by default
    :return: dict mapping a character to the set of its similar-shape
             characters; empty dict when the file does not exist
    """
    result = dict()
    if not os.path.exists(path):
        # logger.warn is deprecated in the stdlib logging API
        logger.warning("file not exists:" + path)
        return result
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # skip comment lines
            if line.startswith('#'):
                continue
            parts = line.split(sep)
            # need a key column plus at least one value column
            if parts and len(parts) > 1:
                # NOTE(review): only the second column is used — any extra
                # columns are silently ignored (kept to preserve behavior)
                result[parts[0]] = set(parts[1])
    return result
Пример #3
0
    def detect(self, sentence):
        """
        Detect suspected errors in a sentence.

        Combines word-level checks (out-of-vocabulary words, low-frequency
        single chars, repeated-char redundancy) with a language-model pass
        that flags low-probability characters.

        :param sentence: input text
        :return: list[list], each [error_word, begin_pos, end_pos, error_type],
                 sorted by begin position
        """
        maybe_errors = []
        if not sentence.strip():
            return maybe_errors
        # lazy initialization of detector resources
        self.check_detector_initialized()
        # text normalization
        sentence = uniform(sentence)
        # word segmentation: yields (word, begin_idx, end_idx) triples
        tokens = self.tokenizer.tokenize(sentence)
        self.tokens = [token[0] for token in tokens]

        if self.is_word_error_detect:
            # add out-of-vocabulary words as suspected errors
            for word, begin_idx, end_idx in tokens:
                # pass filter word
                if self.is_filter_token(word):
                    continue
                # pass in dict
                if word in self.word_freq:
                    if self.is_redundancy_miss_error_detect:
                        # low-frequency single char (< 50000): suspected
                        # word_char error; multi-char words just pass
                        if len(
                                word
                        ) == 1 and word in self.char_freq and self.char_freq.get(
                                word) < 50000:
                            maybe_err = [
                                word, begin_idx, end_idx, ErrorType.word_char
                            ]
                            self._add_maybe_error_item(maybe_err, maybe_errors)
                            continue
                        # repeated char: suspected redundancy.
                        # begin_idx > 0 guards the first token — without it,
                        # sentence[-1] wraps to the LAST character and can
                        # falsely flag position 0 as a repetition.
                        if len(word) == 1 and begin_idx > 0 and sentence[
                                begin_idx - 1] == word:
                            maybe_err = [
                                word, begin_idx, end_idx, ErrorType.redundancy
                            ]
                            self._add_maybe_error_item(maybe_err, maybe_errors)
                            continue
                    continue
                # fragment single char: may be redundant, missing or wrong
                if self.is_redundancy_miss_error_detect:
                    if len(word) == 1:
                        maybe_err = [
                            word, begin_idx, end_idx, ErrorType.word_char
                        ]
                        self._add_maybe_error_item(maybe_err, maybe_errors)
                        continue
                maybe_err = [word, begin_idx, end_idx, ErrorType.word]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_char_error_detect:
            # language model pass: flag characters with low predicted probability
            try:
                for prob, f in self.predict_token_prob(sentence):
                    if prob < self.threshold:
                        maybe_err = [f.token, f.id, f.id + 1, ErrorType.char]
                        self._add_maybe_error_item(maybe_err, maybe_errors)
            except IndexError as ie:
                logger.warning("index error, sentence:" + sentence + str(ie))
            except Exception as e:
                logger.warning("detect error, sentence:" + sentence + str(e))
        return sorted(maybe_errors, key=lambda k: k[1], reverse=False)
    def detect(self, sentence):
        """
        Detect suspected errors in a sentence.

        Combines a custom confusion set, out-of-vocabulary word detection,
        and 2/3-gram score dips to flag suspicious characters.

        :param sentence: input text
        :return: list[list], each [error_word, begin_pos, end_pos, error_type],
                 sorted by begin position
        """
        maybe_errors = []
        if not sentence.strip():
            return maybe_errors
        # lazy initialization of detector resources
        self.check_detector_initialized()
        # text normalization
        sentence = uniform(sentence)
        # word segmentation: yields (word, begin_idx, end_idx) triples
        tokens = self.tokenizer.tokenize(sentence)
        # add custom confusion-set entries as suspected errors
        # NOTE(review): str.find flags only the FIRST occurrence of each
        # confusion entry — later occurrences go unreported; confirm intended
        for confuse in self.custom_confusion:
            idx = sentence.find(confuse)
            if idx > -1:
                maybe_err = [
                    confuse, idx, idx + len(confuse), ErrorType.confusion
                ]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_word_error_detect:
            # add out-of-vocabulary words as suspected errors
            for word, begin_idx, end_idx in tokens:
                # pass filter word
                if self.is_filter_token(word):
                    continue
                # pass in dict
                if word in self.word_freq:
                    continue
                maybe_err = [word, begin_idx, end_idx, ErrorType.word]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_char_error_detect:
            try:
                ngram_avg_scores = []
                for n in [2, 3]:
                    # score every n-character window of the sentence
                    scores = []
                    for i in range(len(sentence) - n + 1):
                        word = sentence[i:i + n]
                        score = self.ngram_score(list(word))
                        scores.append(score)
                    if not scores:
                        continue
                    # replicate the edge scores (n-1 copies each side) so the
                    # sliding average below is defined for every position
                    for _ in range(n - 1):
                        scores.insert(0, scores[0])
                        scores.append(scores[-1])
                    # per-character average over the n windows covering it
                    avg_scores = [
                        sum(scores[i:i + n]) / len(scores[i:i + n])
                        for i in range(len(sentence))
                    ]
                    ngram_avg_scores.append(avg_scores)

                # combine the 2-gram and 3-gram per-character scores
                sent_scores = list(
                    np.average(np.array(ngram_avg_scores), axis=0))
                # indices flagged as score anomalies (presumably abnormally
                # low — see _get_maybe_error_index) become char suspects
                for i in self._get_maybe_error_index(sent_scores):
                    token = sentence[i]
                    # pass filter word
                    if self.is_filter_token(token):
                        continue
                    # token, begin_idx, end_idx, error_type
                    maybe_err = [token, i, i + 1, ErrorType.char]
                    self._add_maybe_error_item(maybe_err, maybe_errors)
            except IndexError as ie:
                logger.warn("index error, sentence:" + sentence + str(ie))
            except Exception as e:
                logger.warn("detect error, sentence:" + sentence + str(e))
        return sorted(maybe_errors, key=lambda k: k[1], reverse=False)