import numpy as np
# uniform, tokenize, get_ngram_score, PUNCTUATION_LIST, word_freq and
# _get_maybe_error_index come from the surrounding pycorrector module

def detect(sentence):
    maybe_error_indices = set()
    # normalize the text
    sentence = uniform(sentence)
    # segment into words
    tokens = tokenize(sentence)
    # add out-of-vocabulary words to the suspected-error set
    for word, begin_idx, end_idx in tokens:
        if word not in PUNCTUATION_LIST and word not in word_freq:
            for i in range(begin_idx, end_idx):
                maybe_error_indices.add(i)
    # use the language model to detect suspected wrong characters
    ngram_avg_scores = []
    for n in [2, 3]:
        scores = []
        for i in range(len(sentence) - n + 1):
            word = sentence[i:i + n]
            score = get_ngram_score(list(word), mode=trigram_char)
            scores.append(score)
        # pad the scores so the sliding window covers every character
        for _ in range(n - 1):
            scores.insert(0, scores[0])
            scores.append(scores[-1])
        avg_scores = [sum(scores[i:i + n]) / len(scores[i:i + n])
                      for i in range(len(sentence))]
        ngram_avg_scores.append(avg_scores)
    # average the n-gram scores per character
    sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
    maybe_error_char_indices = _get_maybe_error_index(sent_scores)
    # merge character-level and word-level errors
    maybe_error_indices |= set(maybe_error_char_indices)
    return sorted(maybe_error_indices)
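The helper _get_maybe_error_index is never shown. A minimal sketch of what it plausibly does, assuming a character is flagged when its averaged n-gram score falls well below the sentence mean; the one-standard-deviation threshold is an assumption, not pycorrector's actual rule:

# Sketch of _get_maybe_error_index (not shown above).
# Assumption: a character is suspicious when its averaged n-gram score
# falls more than `threshold` standard deviations below the sentence mean.
import numpy as np

def _get_maybe_error_index(scores, threshold=1.0):
    scores = np.array(scores)
    mean, std = scores.mean(), scores.std()
    # guard against flat score vectors, e.g. one-character sentences
    if std == 0:
        return []
    return [i for i, s in enumerate(scores) if s < mean - threshold * std]

Any outlier rule would work here; the language model only ranks characters, and this helper is what turns scores into concrete indices.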
def detect(sentence):
    maybe_error_indices = set()
    # normalize the text
    sentence = uniform(sentence)
    # segment into words
    tokens = tokenize(sentence)
    # add out-of-vocabulary words to the suspected-error set
    for word, begin_idx, end_idx in tokens:
        # fixed: skip numbers and English tokens
        # note: check for ASCII, since str.isalnum() is also True for CJK characters
        if word.isascii() and word.isalnum():
            continue
        # skip punctuation
        if word in PUNCTUATION_LIST:
            continue
        # skip words already in the dictionary
        if word in word_freq:
            continue
        for i in range(begin_idx, end_idx):
            maybe_error_indices.add(i)
    # use the language model to detect suspected wrong characters
    ngram_avg_scores = []
    try:
        for n in [2, 3]:
            scores = []
            for i in range(len(sentence) - n + 1):
                word = sentence[i:i + n]
                score = get_ngram_score(list(word), mode=trigram_char)
                scores.append(score)
            # guard against sentences shorter than n
            if not scores:
                continue
            # pad the scores so the sliding window covers every character
            for _ in range(n - 1):
                scores.insert(0, scores[0])
                scores.append(scores[-1])
            avg_scores = [sum(scores[i:i + n]) / len(scores[i:i + n])
                          for i in range(len(sentence))]
            ngram_avg_scores.append(avg_scores)
        # average the n-gram scores per character
        sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
        maybe_error_char_indices = _get_maybe_error_index(sent_scores)
        # merge character-level and word-level errors
        maybe_error_indices |= set(maybe_error_char_indices)
    except IndexError as ie:
        default_logger.warn("index error, sentence:" + sentence + str(ie))
    except Exception as e:
        default_logger.warn("detect error, sentence:" + sentence + str(e))
    return sorted(maybe_error_indices)
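To see what the padding loop buys, take n = 2 over a five-character sentence: there are only four bigram scores, so one copy is duplicated at each end, and every character then averages the bigrams that cover it. A standalone sketch with made-up scores:

# Worked example of the sliding-window padding; the scores are made up.
n = 2
sentence = '少先队员因'                  # any 5-character string
scores = [0.5, 0.4, 0.1, 0.4]           # one score per bigram: len - n + 1 = 4
for _ in range(n - 1):
    scores.insert(0, scores[0])         # pad front: [0.5, 0.5, 0.4, 0.1, 0.4]
    scores.append(scores[-1])           # pad back:  [..., 0.1, 0.4, 0.4]
avg = [sum(scores[i:i + n]) / n for i in range(len(sentence))]
print(avg)  # [0.5, 0.45, 0.25, 0.25, 0.4] -> the dip points at characters 2-3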
def detect(sentence):
    maybe_error_indices = set()
    # normalize the text
    sentence = uniform(sentence)
    # segment into words
    tokens = tokenize(sentence)
    # add out-of-vocabulary words to the suspected-error set
    for word, begin_idx, end_idx in tokens:
        if word not in PUNCTUATION_LIST and word not in word_freq:
            for i in range(begin_idx, end_idx):
                maybe_error_indices.add(i)
    # use the language model to detect suspected wrong characters
    ngram_avg_scores = []
    try:
        for n in [2, 3]:
            scores = []
            for i in range(len(sentence) - n + 1):
                word = sentence[i:i + n]
                score = get_ngram_score(list(word), mode=trigram_char)
                scores.append(score)
            # pad the scores so the sliding window covers every character
            for _ in range(n - 1):
                scores.insert(0, scores[0])
                scores.append(scores[-1])
            avg_scores = [sum(scores[i:i + n]) / len(scores[i:i + n])
                          for i in range(len(sentence))]
            ngram_avg_scores.append(avg_scores)
        # average the n-gram scores per character
        sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
        maybe_error_char_indices = _get_maybe_error_index(sent_scores)
        # merge character-level and word-level errors
        maybe_error_indices |= set(maybe_error_char_indices)
    except IndexError as ie:
        print("index error, sentence:", sentence, ie)
    except Exception as e:
        print("detect error, sentence:", sentence, e)
    return sorted(maybe_error_indices)
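Assuming the n-gram model, word_freq, and the helpers above are loaded, usage is a single call; the sentence is pycorrector's usual demo input, and the return value is a list of character offsets into the normalized sentence:

# Usage sketch: assumes the language model and dictionaries are already loaded.
indices = detect('少先队员因该为老人让坐')
print(indices)  # offsets of the characters the model finds suspicious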
import enchant
from pypinyin import lazy_pinyin
from pycorrector.utils.text_utils import traditional2simplified, simplified2traditional
from pycorrector.utils.text_utils import tokenize, get_homophones_by_char, get_homophones_by_pinyin

traditional_sentence = '憂郁的臺灣烏龜'
simplified_sentence = traditional2simplified(traditional_sentence)
print(simplified_sentence)

simplified_sentence = '忧郁的台湾乌龟'
traditional_sentence = simplified2traditional(simplified_sentence)
print(traditional_sentence)

print(lazy_pinyin('中心'))  # pinyin without tone marks
print(tokenize('小姑娘蹦蹦跳跳的去了她外公家'))

# decide whether a string is English or pinyin
en_dict = enchant.Dict("en_US")
print(en_dict.check("hello"))
print(en_dict.check("hello boy what is your name"))  # a whole phrase is not one dictionary word; split it instead
strs = "hello boy what is your name"
flag = False
for word in strs.split():  # fixed: iterate over words, not characters
    if en_dict.check(word):
        flag = True
    else:
        flag = False
        break
print(flag)
print(en_dict.check("zhangsan"))
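The two homophone helpers are imported above but never exercised. A sketch of how they might be called, with call signatures assumed from their names rather than verified against pycorrector's documentation:

# Sketch only: these call signatures are assumptions.
print(get_homophones_by_char('长'))       # characters that share a pinyin with 长
print(get_homophones_by_pinyin('zhang'))  # characters pronounced "zhang"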
def detect(self, sentence):
    maybe_errors = []
    if not sentence.strip():
        return maybe_errors
    self.check_detector_initialized()
    # normalize the text
    sentence = uniform(sentence)
    # segment into words
    tokens = tokenize(sentence)
    # add custom confusion-set entries to the suspected-error list
    for confuse in self.custom_confusion:
        idx = sentence.find(confuse)
        if idx > -1:
            maybe_err = [confuse, idx, idx + len(confuse)]
            self._add_maybe_error_item(maybe_err, maybe_errors)
    # add out-of-vocabulary words to the suspected-error list
    for word, begin_idx, end_idx in tokens:
        # pass blank
        if not word.strip():
            continue
        # punctuation
        if word in PUNCTUATION_LIST:
            continue
        # pass num
        if word.isdigit():
            continue
        # pass alpha
        if is_alphabet_string(word.lower()):
            continue
        # in dict
        if word in self.word_freq:
            continue
        maybe_err = [word, begin_idx, end_idx]
        self._add_maybe_error_item(maybe_err, maybe_errors)
    # use the language model to detect suspected wrong characters
    ngram_avg_scores = []
    try:
        for n in [2, 3]:
            scores = []
            for i in range(len(sentence) - n + 1):
                word = sentence[i:i + n]
                score = self.ngram_score(list(word))
                scores.append(score)
            if not scores:
                continue
            # pad the scores so the sliding window covers every character
            for _ in range(n - 1):
                scores.insert(0, scores[0])
                scores.append(scores[-1])
            avg_scores = [sum(scores[i:i + n]) / len(scores[i:i + n])
                          for i in range(len(sentence))]
            ngram_avg_scores.append(avg_scores)
        # average the n-gram scores per character
        sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
        # collect the suspected wrong characters
        for i in self._get_maybe_error_index(sent_scores):
            maybe_err = [sentence[i], i, i + 1]
            self._add_maybe_error_item(maybe_err, maybe_errors)
    except IndexError as ie:
        default_logger.warn("index error, sentence:" + sentence + str(ie))
    except Exception as e:
        default_logger.warn("detect error, sentence:" + sentence + str(e))
    return sorted(maybe_errors, key=lambda k: k[1], reverse=False)
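_add_maybe_error_item is not shown either. A minimal sketch, assuming it merely deduplicates [token, begin_idx, end_idx] triples before appending:

# Minimal sketch of _add_maybe_error_item (not shown above).
# Assumption: it only guards against duplicate entries.
def _add_maybe_error_item(self, maybe_err, maybe_errors):
    if maybe_err not in maybe_errors:
        maybe_errors.append(maybe_err)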
import numpy as np
import jieba.posseg as pseg

def detect(sentence):
    maybe_error_indices = set()
    sentence = uniform(sentence)
    tokens = tokenize(sentence)
    # unknown chars
    for word, begin_idx, end_idx in tokens:
        if word not in PUNCTUATION_LIST and word not in word_freq:
            for i in range(begin_idx, end_idx):
                maybe_error_indices.add(i)
    ngram_avg_scores = []
    try:
        for n in [1, 2, 3]:
            scores = []
            for i in range(len(sentence) - n + 1):
                word = sentence[i:i + n]
                score = get_ngram_score(list(word), mode=trigram_char)
                scores.append(score)
            for _ in range(n - 1):
                scores.insert(0, scores[0])
                scores.append(scores[-1])
            avg_scores = [sum(scores[i:i + n]) / len(scores[i:i + n])
                          for i in range(len(sentence))]
            ngram_avg_scores.append(avg_scores)
        sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
        maybe_error_char_indices = _get_maybe_error_index(sent_scores)
        maybe_error_indices |= set(maybe_error_char_indices)
    except IndexError as ie:
        print("index error, sentence:", sentence, ie)
    except Exception as e:
        print("detect error, sentence:", sentence, e)
    # drop proper nouns (names, places, organizations) from the candidates
    seg = pseg.lcut(sentence)  # list of (w.word, w.flag) pairs
    word = [w.word for w in seg]
    tag = [w.flag for w in seg]
    for i in range(len(tag)):
        if tag[i] in {'nz', 'nr', 'nt', 'ns'}:
            # a "proper noun" right after an adverb is likely mis-tagged; keep it as a candidate
            if i > 0 and tag[i - 1] == 'd':
                continue
            if len(word[i]) > 1:
                maybe_error_indices -= set(range(len(''.join(word[:i])),
                                                 len(''.join(word[:i + 1]))))
            elif i + 1 < len(tag) and tag[i + 1] in {'nz', 'nr', 'nt', 'ns'}:
                maybe_error_indices -= set(range(len(''.join(word[:i])),
                                                 len(''.join(word[:i + 2]))))
            # if tag[i] == 'j' and len(word[i]) > 1:
            #     maybe_error_indices -= set(range(len(''.join(word[:i])),
            #                                      len(''.join(word[:i + 1]))))
    return sorted(maybe_error_indices)
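The len(''.join(word[:i])) arithmetic converts a token index from pseg back into character offsets, since jieba does not report positions. A small demonstration (the segmentation shown is typical but not guaranteed):

# How the offset arithmetic maps a pseg token to character positions.
import jieba.posseg as pseg

seg = pseg.lcut('我爱北京天安门')
words = [w.word for w in seg]          # typically ['我', '爱', '北京', '天安门']
for i, w in enumerate(words):
    begin = len(''.join(words[:i]))    # characters consumed before token i
    end = len(''.join(words[:i + 1]))  # exclusive end offset
    print(w, begin, end)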