def main():
    """
    Predict every Wade-Giles initial + final combination.

    IN: more/韦氏拼音表_{声母,韵母,零声母}.txt
    OT: 非标准韦氏拼音表.txt

    Note:
        The generated file contains a large number of unreliable Wade-Giles
        spellings. Some merely look plausible, some have no logic behind
        them at all, and they may clash with English words. Use with care.
        After this script has produced the file, clean it up by hand.
        Re-generating it with this script afterwards is not recommended:
        from then on the output file itself is maintained manually.
    """
    onsets = read_and_write.read_lines('more/韦氏拼音表_声母.txt')
    rhymes = read_and_write.read_lines('more/韦氏拼音表_韵母.txt')
    standalone = read_and_write.read_lines('more/韦氏拼音表_零声母.txt')

    # Every initial paired with every final, plus the zero-initial
    # syllables, collapsed to unique spellings.
    unique = {onset + rhyme for onset in onsets for rhyme in rhymes}
    unique.update(standalone)

    # extra (customs) -- appended after de-duplication, then sorted with the
    # rest.
    spellings = list(unique) + ['xean']
    spellings.sort()

    read_and_write.write_file(spellings, '非标准韦氏拼音表.txt')
def generate_trie_file(*ifiles, ofile, check_duplicate=False):
    """
    Load words from one or more files into `trie` and save it to `ofile`.

    With exactly one input file its lines are added as they are (no
    de-duplication, no sorting). Otherwise the lines of all files are merged
    in order, only the first occurrence of each word is kept, and the merged
    list is sorted before being added.

    Args:
        *ifiles: paths of the input word lists, read with
            `read_and_write.read_lines`.
        ofile: path handed to `trie.save`.
        check_duplicate: when True, every repeated word is reported through
            `lk.logt` together with the file it was found in.
    """
    if len(ifiles) == 1:
        words = read_and_write.read_lines(ifiles[0])
        trie.add_list(words)
    else:
        words_sum = []
        # Companion set: O(1) membership instead of rescanning `words_sum`
        # for every word. Assumes the lines are hashable (strings).
        seen = set()
        for f in ifiles:
            for w in read_and_write.read_lines(f):
                if w in seen:
                    if check_duplicate:
                        lk.logt('[W5317]', w, f, h='parent')
                else:
                    seen.add(w)
                    words_sum.append(w)
        words_sum.sort()
        trie.add_list(words_sum)

    trie.save(ofile)
    # `trie` is defined outside this function; it is cleared after saving,
    # presumably so the next generate_* call starts from an empty trie.
    trie.clear()

    lk.loga('file saved', ofile)
def generate_postrie_file(*ifiles, ofile, check_duplicate=False):
    """
    Load reversed words from one or more files into `trie` and save it.

    Every word is reversed before insertion ('ABC' -> 'CBA'). With exactly
    one input file the reversed lines are added as they are (no
    de-duplication, no sorting). Otherwise the reversed lines of all files
    are merged in order, only the first occurrence of each is kept, and the
    merged list is sorted before being added.

    Args:
        *ifiles: paths of the input word lists, read with
            `read_and_write.read_lines`.
        ofile: path handed to `trie.save`.
        check_duplicate: when True, every repeated word is reported through
            `lk.logt` together with the file it was found in. The word is
            logged in its reversed form.
    """
    if len(ifiles) == 1:
        words = read_and_write.read_lines(ifiles[0])
        nwords = [x[::-1] for x in words]  # reverse: 'ABC' -> 'CBA'
        trie.add_list(nwords)
    else:
        words_sum = []
        # Companion set: O(1) membership instead of rescanning `words_sum`
        # for every word. Assumes the lines are hashable (strings).
        seen = set()
        for f in ifiles:
            for w in read_and_write.read_lines(f):
                w = w[::-1]
                if w in seen:
                    if check_duplicate:
                        lk.logt('[W5318]', w, f, h='parent')
                else:
                    seen.add(w)
                    words_sum.append(w)
        words_sum.sort()
        trie.add_list(words_sum)

    trie.save(ofile)
    # `trie` is defined outside this function; it is cleared after saving,
    # presumably so the next generate_* call starts from an empty trie.
    trie.clear()

    lk.loga('file saved', ofile)
def generate_plain_file(*ifiles, ofile, check_duplicate=False):
    """
    Produce a plain text list at `ofile` from one or more input files.

    With exactly one input file it is copied to `ofile` unchanged.
    Otherwise the lines of all files are merged in order, only the first
    occurrence of each line is kept, and the sorted result is written with
    `read_and_write.write_file`.

    Args:
        *ifiles: paths of the input files.
        ofile: path of the file to create.
        check_duplicate: when True, every repeated line is reported through
            `lk.logt` together with the file it was found in. It has no
            effect in the single-file case.
    """
    if len(ifiles) == 1:
        copyfile(ifiles[0], ofile)
        lk.loga('file saved', ofile)
        return

    data_sum = []
    # Companion set: O(1) membership instead of rescanning `data_sum` for
    # every line. Assumes the lines are hashable (strings).
    seen = set()
    for f in ifiles:
        for d in read_and_write.read_lines(f):
            if d in seen:
                if check_duplicate:
                    lk.logt('[W5316]', d, f, h='parent')
            else:
                seen.add(d)
                data_sum.append(d)
    data_sum.sort()

    read_and_write.write_file(data_sum, ofile)
    lk.loga('file saved', ofile)
def main():
    """
    Use a large sample to predict the traits of "ambiguous" pinyin.

    Reads the sample from 'train_in.txt'. A line is reported when it ends
    with one of the zero-initial syllables in `aoe` and the part before
    that syllable is itself a line of the sample. Suffixes are tried in the
    order listed in `aoe`, and only the first one that qualifies is
    reported. Each reported line is written to 'train_out.txt' as
    `<line>\\t<prefix>\\t<suffix>`.
    """
    i_file = 'train_in.txt'
    o_file = 'train_out.txt'

    aoe = ("a", "ai", "an", "ang", "ao", "o", "ou",
           "e", "ei", "en", "eng", "er")

    r = read_and_write.read_lines(i_file)
    # Built once: O(1) prefix lookups instead of scanning the whole list `r`
    # for every (line, suffix) pair. Assumes the lines are strings.
    known = set(r)
    w = []

    for i in r:
        for j in aoe:
            if not i.endswith(j):
                continue
            # `i` ends with `j`, so cutting off len(j) characters yields
            # the same prefix as splitting on the last occurrence of `j`.
            head = i[:-len(j)]
            if head in known:
                w.append(f'{i}\t{head}\t{j}')
                break

    read_and_write.write_file(w, o_file)