예제 #1
0
def main():
    """
    Predict every Wade-Giles initial/final combination.

    IN: more/韦氏拼音表_{声母,韵母,零声母}.txt
    OT: 非标准韦氏拼音表.txt
            Warning: the generated file contains a large number of
            unreliable Wade-Giles spellings. Some merely look plausible and
            some follow no logic at all, so they may collide with English
            words. Use with care.
            After this script has generated the file, clean it up by hand.
            Re-running this script afterwards is not recommended: from then
            on "非标准韦氏拼音表.txt" itself is maintained manually.

    Every initial is concatenated with every final, the zero-initial
    syllables and a few hand-picked extras are merged in, and the sorted,
    duplicate-free result is written to the output file.
    """
    initials = read_and_write.read_lines('more/韦氏拼音表_声母.txt')
    finals = read_and_write.read_lines('more/韦氏拼音表_韵母.txt')
    aoe = read_and_write.read_lines('more/韦氏拼音表_零声母.txt')

    # A set drops duplicates as they are produced.
    combo = {i + j for i in initials for j in finals}
    combo.update(aoe)

    # extra (customs) -- merged into the set (rather than appended after
    # de-duplication) so an extra that already exists is not written twice.
    combo.update(['xean'])

    read_and_write.write_file(sorted(combo), '非标准韦氏拼音表.txt')
예제 #2
0
def generate_trie_file(*ifiles, ofile, check_duplicate=False):
    """
    Load words from one or more files into the shared ``trie`` and save it.

    Args:
        *ifiles: Paths of the input word lists, read with
            ``read_and_write.read_lines``.
        ofile: Path handed to ``trie.save``.
        check_duplicate: When true, each word seen again (within one file or
            across files) is reported through ``lk.logt`` with code
            ``[W5317]``. Only honoured when more than one input file is
            given.

    With exactly one input file its lines are passed to ``trie.add_list``
    unchanged (no de-duplication, no sorting). Otherwise the words of all
    files are merged, de-duplicated and sorted before being added. The trie
    is cleared after saving so the next call starts empty.
    """
    if len(ifiles) == 1:
        words = read_and_write.read_lines(ifiles[0])
        trie.add_list(words)
    else:
        # A set gives O(1) membership tests; the previous list-based check
        # was quadratic in the total number of words.
        # NOTE(review): assumes the lines are hashable (str) -- confirm.
        seen = set()
        for f in ifiles:
            for w in read_and_write.read_lines(f):
                if w in seen:
                    if check_duplicate:
                        lk.logt('[W5317]', w, f, h='parent')
                else:
                    seen.add(w)
        trie.add_list(sorted(seen))
    trie.save(ofile)
    trie.clear()
    lk.loga('file saved', ofile)
예제 #3
0
def generate_postrie_file(*ifiles, ofile, check_duplicate=False):
    """
    Load reversed words from one or more files into the shared ``trie`` and
    save it.

    Every word is reversed ('ABC' -> 'CBA') before it is added.

    Args:
        *ifiles: Paths of the input word lists, read with
            ``read_and_write.read_lines``.
        ofile: Path handed to ``trie.save``.
        check_duplicate: When true, each reversed word seen again is
            reported through ``lk.logt`` with code ``[W5318]`` (the logged
            word is the reversed form). Only honoured when more than one
            input file is given.

    With exactly one input file the reversed lines are added as-is (no
    de-duplication, no sorting). Otherwise the reversed words of all files
    are merged, de-duplicated and sorted before being added. The trie is
    cleared after saving so the next call starts empty.
    """
    if len(ifiles) == 1:
        words = read_and_write.read_lines(ifiles[0])
        nwords = [x[::-1] for x in words]  # reverse string: 'ABC' -> 'CBA'
        trie.add_list(nwords)
    else:
        # A set gives O(1) membership tests; the previous list-based check
        # was quadratic in the total number of words.
        seen = set()
        for f in ifiles:
            for w in read_and_write.read_lines(f):
                w = w[::-1]
                if w in seen:
                    if check_duplicate:
                        lk.logt('[W5318]', w, f, h='parent')
                else:
                    seen.add(w)
        trie.add_list(sorted(seen))
    trie.save(ofile)
    trie.clear()
    lk.loga('file saved', ofile)
예제 #4
0
def generate_plain_file(*ifiles, ofile, check_duplicate=False):
    """
    Merge one or more line-based files into a single plain output file.

    Args:
        *ifiles: Paths of the input files.
        ofile: Path of the file to produce.
        check_duplicate: When true, each line seen again (within one file or
            across files) is reported through ``lk.logt`` with code
            ``[W5316]``. Only honoured when more than one input file is
            given.

    With exactly one input file it is copied to ``ofile`` verbatim via
    ``copyfile``. Otherwise the lines of all files are merged,
    de-duplicated, sorted and written with ``read_and_write.write_file``.
    """
    if len(ifiles) == 1:
        copyfile(ifiles[0], ofile)
        lk.loga('file saved', ofile)
        return

    # A set gives O(1) membership tests; the previous list-based check was
    # quadratic in the total number of lines.
    # NOTE(review): assumes the lines are hashable (str) -- confirm.
    seen = set()
    for f in ifiles:
        for d in read_and_write.read_lines(f):
            if d in seen:
                if check_duplicate:
                    lk.logt('[W5316]', d, f, h='parent')
            else:
                seen.add(d)
    read_and_write.write_file(sorted(seen), ofile)
    lk.loga('file saved', ofile)
예제 #5
0
def main():
    """
    Predict the features of potentially "ambiguous" pinyin from a large
    number of samples.

    Reads the entries of 'train_in.txt'. For each entry, the zero-initial
    syllables in ``aoe`` are tried in order; the first one that is a suffix
    of the entry AND whose remaining stem is itself an entry of the input
    produces one output row ``<entry>\\t<stem>\\t<suffix>``. The rows are
    written to 'train_out.txt'.
    """
    i_file = 'train_in.txt'
    o_file = 'train_out.txt'

    aoe = ("a", "ai", "an", "ang", "ao", "o", "ou", "e", "ei", "en", "eng",
           "er")

    r = read_and_write.read_lines(i_file)
    # Build the lookup once: testing membership against the list inside the
    # nested loop was a linear scan per candidate.
    # NOTE(review): assumes read_lines returns a re-iterable sequence
    # (e.g. a list) -- confirm.
    known = set(r)
    w = []

    for i in r:
        for j in aoe:
            if not i.endswith(j):
                continue
            # Because i ends with j, cutting len(j) characters off the end
            # is the same as splitting on the right-most occurrence of j.
            stem = i[:-len(j)]
            if stem in known:
                w.append(f'{i}\t{stem}\t{j}')
                break

    read_and_write.write_file(w, o_file)