result[c] |= set(parts) return result char_dict_path = os.path.join(pwd_path, config.char_dict_path) cn_char_set = load_char_dict(char_dict_path) two_char_dict = load_2char_dict(pwd_path + '/data/char_two_set.txt') # # word dictionary word_dict_text_path = os.path.join(pwd_path, config.word_dict_path) word_dict_model_path = os.path.join(pwd_path, config.word_dict_model_path) if os.path.exists(word_dict_model_path): cn_word_set = load_pkl(word_dict_model_path) else: default_logger.debug('load word dict from text file:', word_dict_model_path) cn_word_set = load_word_dict(word_dict_text_path) dump_pkl(cn_word_set, word_dict_model_path) # similar pronuciation same_pinyin_text_path = os.path.join(pwd_path, config.same_pinyin_text_path) same_pinyin_model_path = os.path.join(pwd_path, config.same_pinyin_model_path) # same_pinyin = load_same_pinyin(same_pinyin_text_path) if os.path.exists(same_pinyin_model_path): same_pinyin = load_pkl(same_pinyin_model_path) else: default_logger.debug('load same pinyin from text file:', same_pinyin_text_path) same_pinyin = load_same_pinyin(same_pinyin_text_path) dump_pkl(same_pinyin, same_pinyin_model_path) # similar shape same_stroke_text_path = os.path.join(pwd_path, config.same_stroke_text_path) same_stroke_model_path = os.path.join(pwd_path, config.same_stroke_model_path)
info = line.split() word = info[0] freq = int(info[1]) word_freq[word] = freq return word_freq # 字频统计 word_dict_path = os.path.join(pwd_path, config.word_dict_path) word_dict_model_path = os.path.join(pwd_path, config.word_dict_model_path) if os.path.exists(word_dict_model_path): word_freq = load_pkl(word_dict_model_path) else: default_logger.debug('load word freq from text file:', word_dict_path) word_freq = load_word_freq_dict(word_dict_path) dump_pkl(word_freq, word_dict_model_path) def get_ngram_score(chars, mode=trigram_char): """ 取n元文法得分 :param chars: list, 以词或字切分 :param mode: :return: """ return mode.score(' '.join(chars), bos=False, eos=False) def get_ppl_score(words, mode=trigram_char): """ 取语言模型困惑度得分,越小句子越通顺
info = line.split() word = info[0] freq = int(info[1]) word_freq[word] = freq return word_freq # 字频统计 word_freq_path = os.path.join(pwd_path, config.word_freq_path) word_freq_model_path = os.path.join(pwd_path, config.word_freq_model_path) if os.path.exists(word_freq_model_path): word_freq = load_pkl(word_freq_model_path) else: default_logger.debug('load word freq from text file:', word_freq_path) word_freq = load_word_freq_dict(word_freq_path) dump_pkl(word_freq, word_freq_model_path) def get_ngram_score(chars, mode=trigram_char): """ 取n元文法得分 :param chars: list, 以词或字切分 :param mode: :return: """ return mode.score(' '.join(chars), bos=False, eos=False) def get_ppl_score(words, mode=trigram_char): """ 取语言模型困惑度得分,越小句子越通顺
for i, c in enumerate(parts): result[c] = set(list(parts[:i] + parts[i + 1:])) return result cn_char_set = load_word_dict(char_file_path) same_pinyin_text_path = os.path.join(pwd_path, config.same_pinyin_text_path) same_pinyin_model_path = os.path.join(pwd_path, config.same_pinyin_model_path) # 同音字 if os.path.exists(same_pinyin_model_path): same_pinyin = load_pkl(same_pinyin_model_path) else: default_logger.debug('load same pinyin from text file:', same_pinyin_text_path) same_pinyin = load_same_pinyin(same_pinyin_text_path) dump_pkl(same_pinyin, same_pinyin_model_path) # 形似字 same_stroke_text_path = os.path.join(pwd_path, config.same_stroke_text_path) same_stroke_model_path = os.path.join(pwd_path, config.same_stroke_model_path) if os.path.exists(same_stroke_model_path): same_stroke = load_pkl(same_stroke_model_path) else: default_logger.debug('load same stroke from text file:', same_stroke_text_path) same_stroke = load_same_stroke(same_stroke_text_path) dump_pkl(same_stroke, same_stroke_model_path) def get_same_pinyin(char): """
if parts and len(parts) > 1: for i, c in enumerate(parts): result[c] = set(list(parts[:i] + parts[i + 1:])) return result cn_char_set = load_word_dict(char_file_path) same_pinyin_text_path = os.path.join(pwd_path, config.same_pinyin_text_path) same_pinyin_model_path = os.path.join(pwd_path, config.same_pinyin_model_path) # 同音字 if os.path.exists(same_pinyin_model_path): same_pinyin = load_pkl(same_pinyin_model_path) else: default_logger.debug('load same pinyin from text file:', same_pinyin_text_path) same_pinyin = load_same_pinyin(same_pinyin_text_path) dump_pkl(same_pinyin, same_pinyin_model_path) # 形似字 same_stroke_text_path = os.path.join(pwd_path, config.same_stroke_text_path) same_stroke_model_path = os.path.join(pwd_path, config.same_stroke_model_path) if os.path.exists(same_stroke_model_path): same_stroke = load_pkl(same_stroke_model_path) else: default_logger.debug('load same stroke from text file:', same_stroke_text_path) same_stroke = load_same_stroke(same_stroke_text_path) dump_pkl(same_stroke, same_stroke_model_path) def get_same_pinyin(char): """ 取同音字