import codecs
import os

import numpy as np

# `logger`, `uniform`, and `ErrorType` are assumed to be imported from the
# project's utility modules elsewhere in these files.


def load_same_pinyin(path, sep='\t'):
    """
    Load homophone characters (chars sharing the same pinyin).
    :param path: path to the homophone dict file; each line is
                 `char<sep>same-pinyin-same-tone chars<sep>same-pinyin-different-tone chars`
    :param sep: column separator, tab by default
    :return: dict, char -> set of homophone chars
    """
    result = dict()
    if not os.path.exists(path):
        logger.warning("file not exists:" + path)
        return result
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # skip comment lines
            if line.startswith('#'):
                continue
            parts = line.split(sep)
            if parts and len(parts) > 2:
                key_char = parts[0]
                # merge all candidate columns (same tone and different tone)
                # into one homophone set
                value = set()
                for part in parts[1:]:
                    value |= set(part)
                # keep only single-char keys with a non-empty candidate set
                if len(key_char) > 1 or not value:
                    continue
                result[key_char] = value
    return result
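# A minimal, self-contained sketch of load_same_pinyin on hypothetical data;
# the real same_pinyin.txt ships with the project, and the chars below are
# illustrative only.
def _demo_load_same_pinyin():
    import tempfile
    with tempfile.NamedTemporaryFile('w', suffix='.txt', encoding='utf-8',
                                     delete=False) as f:
        f.write("# char\tsame tone\tdifferent tone\n")
        f.write("他\t她它\t塔踏\n")
        path = f.name
    same_pinyin = load_same_pinyin(path)
    # all candidate columns are merged into one set
    assert same_pinyin['他'] == {'她', '它', '塔', '踏'}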
def load_same_stroke(path, sep='\t'):
    """
    Load visually similar (similar-stroke) characters.
    :param path: path to the similar-stroke dict file; each line is
                 `char<sep>similar chars`
    :param sep: column separator, tab by default
    :return: dict, char -> set of similar-stroke chars
    """
    result = dict()
    if not os.path.exists(path):
        logger.warning("file not exists:" + path)
        return result
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # skip comment lines
            if line.startswith('#'):
                continue
            parts = line.split(sep)
            if parts and len(parts) > 1:
                # map the first char to the chars in the second column;
                # a symmetric variant would map every char on the line to
                # all of the others
                result[parts[0]] = set(parts[1])
    return result
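# A minimal sketch of load_same_stroke on hypothetical data; the real
# same_stroke.txt ships with the project.
def _demo_load_same_stroke():
    import tempfile
    with tempfile.NamedTemporaryFile('w', suffix='.txt', encoding='utf-8',
                                     delete=False) as f:
        f.write("末\t未耒\n")
        path = f.name
    same_stroke = load_same_stroke(path)
    # only the first char is used as a key; the second column becomes its set
    assert same_stroke['末'] == {'未', '耒'}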
def detect(self, sentence):
    """
    Detect suspected errors in a sentence: [word, position, error type].
    :param sentence: input text
    :return: list[list], [error_word, begin_pos, end_pos, error_type]
    """
    maybe_errors = []
    if not sentence.strip():
        return maybe_errors
    # initialization
    self.check_detector_initialized()
    # text normalization
    sentence = uniform(sentence)
    # word segmentation
    tokens = self.tokenizer.tokenize(sentence)
    self.tokens = [token[0] for token in tokens]
    # custom confusion set handling is disabled in this detector
    if self.is_word_error_detect:
        # add out-of-vocabulary words to the suspected-error list
        for word, begin_idx, end_idx in tokens:
            # skip filtered tokens (punctuation, digits, etc.)
            if self.is_filter_token(word):
                continue
            # word is in the frequency dict
            if word in self.word_freq:
                if self.is_redundancy_miss_error_detect:
                    # multi-char words and single chars with frequency above
                    # 50000 are trusted; a rarer single char is a suspected
                    # wrong char inside a word
                    if len(word) == 1 and word in self.char_freq \
                            and self.char_freq.get(word) < 50000:
                        maybe_err = [word, begin_idx, end_idx, ErrorType.word_char]
                        self._add_maybe_error_item(maybe_err, maybe_errors)
                        continue
                    # a repeated char may indicate a redundant char
                    # (guard begin_idx > 0 so sentence[-1] is never compared)
                    if len(word) == 1 and begin_idx > 0 \
                            and sentence[begin_idx - 1] == word:
                        maybe_err = [word, begin_idx, end_idx, ErrorType.redundancy]
                        self._add_maybe_error_item(maybe_err, maybe_errors)
                        continue
                continue
            # check fragment single chars: may be redundant, missing or wrong
            if self.is_redundancy_miss_error_detect:
                if len(word) == 1:
                    maybe_err = [word, begin_idx, end_idx, ErrorType.word_char]
                    self._add_maybe_error_item(maybe_err, maybe_errors)
                    continue
            maybe_err = [word, begin_idx, end_idx, ErrorType.word]
            self._add_maybe_error_item(maybe_err, maybe_errors)

    if self.is_char_error_detect:
        # use the language model to flag suspected wrong chars
        try:
            for prob, f in self.predict_token_prob(sentence):
                if prob < self.threshold:
                    maybe_err = [f.token, f.id, f.id + 1, ErrorType.char]
                    self._add_maybe_error_item(maybe_err, maybe_errors)
        except IndexError as ie:
            logger.warning("index error, sentence:" + sentence + str(ie))
        except Exception as e:
            logger.warning("detect error, sentence:" + sentence + str(e))
    return sorted(maybe_errors, key=lambda k: k[1])
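# Expected shape of detect()'s output, assuming a fully initialized detector
# instance `d` (construction depends on the concrete detector class and its
# model/dict files, so this is not run here):
#
#   d.detect('少先队员因该为老人让坐')
#   # -> [['因该', 4, 6, 'word'], ['坐', 10, 11, 'char']]
#
# The exact spans and error types depend on the loaded dicts, the language
# model and the threshold; the values above are illustrative only.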
def detect(self, sentence):
    """
    Detect suspected errors in a sentence: [word, position, error type].
    :param sentence: input text
    :return: list[list], [error_word, begin_pos, end_pos, error_type]
    """
    maybe_errors = []
    if not sentence.strip():
        return maybe_errors
    # initialization
    self.check_detector_initialized()
    # text normalization
    sentence = uniform(sentence)
    # word segmentation
    tokens = self.tokenizer.tokenize(sentence)
    # add custom confusion-set entries to the suspected-error list
    # (only the first occurrence of each entry is flagged)
    for confuse in self.custom_confusion:
        idx = sentence.find(confuse)
        if idx > -1:
            maybe_err = [confuse, idx, idx + len(confuse), ErrorType.confusion]
            self._add_maybe_error_item(maybe_err, maybe_errors)

    if self.is_word_error_detect:
        # add out-of-vocabulary words to the suspected-error list
        for word, begin_idx, end_idx in tokens:
            # skip filtered tokens (punctuation, digits, etc.)
            if self.is_filter_token(word):
                continue
            # skip words present in the frequency dict
            if word in self.word_freq:
                continue
            maybe_err = [word, begin_idx, end_idx, ErrorType.word]
            self._add_maybe_error_item(maybe_err, maybe_errors)

    if self.is_char_error_detect:
        try:
            ngram_avg_scores = []
            for n in [2, 3]:
                scores = []
                for i in range(len(sentence) - n + 1):
                    word = sentence[i:i + n]
                    score = self.ngram_score(list(word))
                    scores.append(score)
                if not scores:
                    continue
                # moving-window score completion: pad (n - 1) copies at each
                # end so every char position is covered by n windows
                for _ in range(n - 1):
                    scores.insert(0, scores[0])
                    scores.append(scores[-1])
                # score of char i is the mean of the n windows covering it
                avg_scores = [sum(scores[i:i + n]) / len(scores[i:i + n])
                              for i in range(len(sentence))]
                ngram_avg_scores.append(avg_scores)

            if ngram_avg_scores:
                # average the 2-gram and 3-gram scores per char
                sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
                # collect suspected wrong-char positions
                for i in self._get_maybe_error_index(sent_scores):
                    token = sentence[i]
                    # skip filtered tokens
                    if self.is_filter_token(token):
                        continue
                    # token, begin_idx, end_idx, error_type
                    maybe_err = [token, i, i + 1, ErrorType.char]
                    self._add_maybe_error_item(maybe_err, maybe_errors)
        except IndexError as ie:
            logger.warning("index error, sentence:" + sentence + str(ie))
        except Exception as e:
            logger.warning("detect error, sentence:" + sentence + str(e))
    return sorted(maybe_errors, key=lambda k: k[1])
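# A self-contained numeric sketch of the moving-window score completion used
# above, with made-up 2-gram scores for a 5-char sentence (no language model
# involved): pad (n - 1) copies at each end, then average the n windows
# covering each char position.
def _demo_window_average():
    n = 2
    sent_len = 5
    scores = [0.5, 0.1, 0.6, 0.4]  # one made-up score per 2-gram window
    for _ in range(n - 1):
        scores.insert(0, scores[0])
        scores.append(scores[-1])
    # scores is now [0.5, 0.5, 0.1, 0.6, 0.4, 0.4], length sent_len + n - 1
    avg = [sum(scores[i:i + n]) / n for i in range(sent_len)]
    # char 1 gets (0.5 + 0.1) / 2 = 0.3: a low window score pulls down every
    # char it covers, which is how suspect positions surface
    assert [round(x, 2) for x in avg] == [0.5, 0.3, 0.35, 0.5, 0.4]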