Пример #1
0
                    result[c] |= set(parts)
    return result

char_dict_path = os.path.join(pwd_path, config.char_dict_path)
cn_char_set = load_char_dict(char_dict_path)
two_char_dict = load_2char_dict(pwd_path + '/data/char_two_set.txt')

# # word dictionary
word_dict_text_path = os.path.join(pwd_path, config.word_dict_path)
word_dict_model_path = os.path.join(pwd_path, config.word_dict_model_path)
if os.path.exists(word_dict_model_path):
    cn_word_set = load_pkl(word_dict_model_path)
else:
    default_logger.debug('load word dict from text file:', word_dict_model_path)
    cn_word_set = load_word_dict(word_dict_text_path)
    dump_pkl(cn_word_set, word_dict_model_path)

# similar pronuciation
same_pinyin_text_path = os.path.join(pwd_path, config.same_pinyin_text_path)
same_pinyin_model_path = os.path.join(pwd_path, config.same_pinyin_model_path)
# same_pinyin = load_same_pinyin(same_pinyin_text_path)
if os.path.exists(same_pinyin_model_path):
    same_pinyin = load_pkl(same_pinyin_model_path)
else:
    default_logger.debug('load same pinyin from text file:', same_pinyin_text_path)
    same_pinyin = load_same_pinyin(same_pinyin_text_path)
    dump_pkl(same_pinyin, same_pinyin_model_path)

# similar shape
same_stroke_text_path = os.path.join(pwd_path, config.same_stroke_text_path)
same_stroke_model_path = os.path.join(pwd_path, config.same_stroke_model_path)
Пример #2
0
            info = line.split()
            word = info[0]
            freq = int(info[1])
            word_freq[word] = freq
    return word_freq


# 字频统计
word_dict_path = os.path.join(pwd_path, config.word_dict_path)
word_dict_model_path = os.path.join(pwd_path, config.word_dict_model_path)
if os.path.exists(word_dict_model_path):
    word_freq = load_pkl(word_dict_model_path)
else:
    default_logger.debug('load word freq from text file:', word_dict_path)
    word_freq = load_word_freq_dict(word_dict_path)
    dump_pkl(word_freq, word_dict_model_path)


def get_ngram_score(chars, mode=trigram_char):
    """
    取n元文法得分
    :param chars: list, 以词或字切分
    :param mode:
    :return:
    """
    return mode.score(' '.join(chars), bos=False, eos=False)


def get_ppl_score(words, mode=trigram_char):
    """
    取语言模型困惑度得分,越小句子越通顺
Пример #3
0
            info = line.split()
            word = info[0]
            freq = int(info[1])
            word_freq[word] = freq
    return word_freq


# 字频统计
word_freq_path = os.path.join(pwd_path, config.word_freq_path)
word_freq_model_path = os.path.join(pwd_path, config.word_freq_model_path)
if os.path.exists(word_freq_model_path):
    word_freq = load_pkl(word_freq_model_path)
else:
    default_logger.debug('load word freq from text file:', word_freq_path)
    word_freq = load_word_freq_dict(word_freq_path)
    dump_pkl(word_freq, word_freq_model_path)


def get_ngram_score(chars, mode=trigram_char):
    """
    取n元文法得分
    :param chars: list, 以词或字切分
    :param mode:
    :return:
    """
    return mode.score(' '.join(chars), bos=False, eos=False)


def get_ppl_score(words, mode=trigram_char):
    """
    取语言模型困惑度得分,越小句子越通顺
Пример #4
0
                for i, c in enumerate(parts):
                    result[c] = set(list(parts[:i] + parts[i + 1:]))
    return result


cn_char_set = load_word_dict(char_file_path)
same_pinyin_text_path = os.path.join(pwd_path, config.same_pinyin_text_path)
same_pinyin_model_path = os.path.join(pwd_path, config.same_pinyin_model_path)
# 同音字
if os.path.exists(same_pinyin_model_path):
    same_pinyin = load_pkl(same_pinyin_model_path)
else:
    default_logger.debug('load same pinyin from text file:',
                         same_pinyin_text_path)
    same_pinyin = load_same_pinyin(same_pinyin_text_path)
    dump_pkl(same_pinyin, same_pinyin_model_path)

# 形似字
same_stroke_text_path = os.path.join(pwd_path, config.same_stroke_text_path)
same_stroke_model_path = os.path.join(pwd_path, config.same_stroke_model_path)
if os.path.exists(same_stroke_model_path):
    same_stroke = load_pkl(same_stroke_model_path)
else:
    default_logger.debug('load same stroke from text file:',
                         same_stroke_text_path)
    same_stroke = load_same_stroke(same_stroke_text_path)
    dump_pkl(same_stroke, same_stroke_model_path)


def get_same_pinyin(char):
    """
Пример #5
0
            if parts and len(parts) > 1:
                for i, c in enumerate(parts):
                    result[c] = set(list(parts[:i] + parts[i + 1:]))
    return result


cn_char_set = load_word_dict(char_file_path)
same_pinyin_text_path = os.path.join(pwd_path, config.same_pinyin_text_path)
same_pinyin_model_path = os.path.join(pwd_path, config.same_pinyin_model_path)
# 同音字
if os.path.exists(same_pinyin_model_path):
    same_pinyin = load_pkl(same_pinyin_model_path)
else:
    default_logger.debug('load same pinyin from text file:', same_pinyin_text_path)
    same_pinyin = load_same_pinyin(same_pinyin_text_path)
    dump_pkl(same_pinyin, same_pinyin_model_path)

# 形似字
same_stroke_text_path = os.path.join(pwd_path, config.same_stroke_text_path)
same_stroke_model_path = os.path.join(pwd_path, config.same_stroke_model_path)
if os.path.exists(same_stroke_model_path):
    same_stroke = load_pkl(same_stroke_model_path)
else:
    default_logger.debug('load same stroke from text file:', same_stroke_text_path)
    same_stroke = load_same_stroke(same_stroke_text_path)
    dump_pkl(same_stroke, same_stroke_model_path)


def get_same_pinyin(char):
    """
    取同音字
Пример #6
0
            info = line.split()
            word = info[0]
            freq = int(info[1])
            word_freq[word] = freq
    return word_freq


# 字频统计
word_freq_path = os.path.join(pwd_path, config.word_freq_path)
word_freq_model_path = os.path.join(pwd_path, config.word_freq_model_path)
if os.path.exists(word_freq_model_path):
    word_freq = load_pkl(word_freq_model_path)
else:
    default_logger.debug('load word freq from text file:', word_freq_path)
    word_freq = load_word_freq_dict(word_freq_path)
    dump_pkl(word_freq, word_freq_model_path)


def get_ngram_score(chars, mode=trigram_char):
    """
    取n元文法得分
    :param chars: list, 以词或字切分
    :param mode:
    :return:
    """
    return mode.score(' '.join(chars), bos=False, eos=False)


def get_ppl_score(words, mode=trigram_char):
    """
    取语言模型困惑度得分,越小句子越通顺