def fill_lu(item: CharPhoneTable, transformer: Dict[str, str]) -> CharPhoneTable:
    """Populate ``item.lu`` from the two parts of ``item.full``.

    ``item.full`` is split with ``split_sy``; the first two elements of the
    result are each looked up in ``transformer`` and the two mapped values
    are concatenated into ``item.lu``.

    Args:
        item: Record whose ``full`` attribute is read and whose ``lu``
            attribute is overwritten in place.
        transformer: Mapping from each split part to its replacement code.

    Returns:
        The same ``item`` object, after mutation.

    Raises:
        RuntimeError: If either split part is missing from ``transformer``.
    """
    # NOTE(review): presumably split_sy yields (initial, final) of a pinyin
    # syllable -- confirm against its definition.
    parts = split_sy(item.full)
    if not (parts[0] in transformer and parts[1] in transformer):
        raise RuntimeError(f"{parts} not in transformer")

    first_code = transformer[parts[0]]
    second_code = transformer[parts[1]]
    item.lu = first_code + second_code
    return item
def fill_zrm(item: CharPhoneTable, transformer: Dict[str, str]) -> Tuple[CharPhoneTable, bool]:
    """Populate ``item.zrm`` from the two parts of ``item.full``.

    Unlike a hard failure, a missing mapping is reported on stderr and
    signalled through the boolean in the return value, so the caller can
    decide what to do with the untouched item.

    Args:
        item: Record whose ``full`` attribute is read and whose ``zrm``
            attribute is overwritten in place on success.
        transformer: Mapping from each split part to its replacement code.

    Returns:
        ``(item, True)`` when ``item.zrm`` was filled, or ``(item, False)``
        when either split part is absent from ``transformer`` (``item`` is
        left unmodified in that case).
    """
    # NOTE(review): presumably split_sy yields (initial, final) of a pinyin
    # syllable -- confirm against its definition.
    parts = split_sy(item.full)
    both_known = parts[0] in transformer and parts[1] in transformer
    if not both_known:
        print(f"{parts} not in transformer", file=sys.stderr)
        return item, False

    first_code = transformer[parts[0]]
    second_code = transformer[parts[1]]
    item.zrm = first_code + second_code
    return item, True
def fix_diff_s_same_y_full(item: CharPhoneTable) -> CharPhoneTable:
    """Rebuild ``item.full`` using the first character of ``item.phones``.

    The second element of ``split_sy(item.full)`` is kept, while the leading
    part is replaced by the first character of ``item.phones``.  The
    single-letter codes ``u``, ``i`` and ``v`` are expanded to ``sh``,
    ``ch`` and ``zh`` respectively; any other character is used unchanged.

    Args:
        item: Record whose ``full`` and ``phones`` attributes are read and
            whose ``full`` attribute is overwritten in place.

    Returns:
        The same ``item`` object, after mutation.
    """
    expansions = {"u": "sh", "i": "ch", "v": "zh"}

    split_full = split_sy(item.full)
    leading = item.phones[0]
    # Characters without an expansion are passed through as-is.
    leading = expansions.get(leading, leading)

    item.full = leading + split_full[1]
    return item
map(lambda e: e.word), set) with open(words_path, "r", encoding='utf8') as fin: #FIXME: bug to fix, we have more phone type now. ft_dict = get_double_dict() to_add_words = pipe( fin, map(lambda e: e.strip().split('\t')), filter(lambda e: len(e) in (1, 2)), filter(lambda e: len(e[0]) <= 5), filter(lambda e: not contain_alpha(e[0]) and not contain_symbols(e[ 0])), filter(lambda e: e[0] not in exist_words), map(cols_to_item), map(lambda e: (e, map(lambda e: split_sy(e), lazy_pinyin(e.word)))), map(lambda e: attr.evolve( e[0], phones=''.join(full_to_double(e[1], ft_dict)))), map(lambda e: WordPhoneTable(word=e.word, phones=e.phones, priority=e.priority, updatedt=datetime.now())), ) with db.atomic(): WordPhoneTable.bulk_create(to_add_words, batch_size=100) print('done')
def full_to_double(pinyin, full_to_two):
    """Map each split syllable to its concatenated two-part code.

    Args:
        pinyin: Iterable of indexable items; elements ``[0]`` and ``[1]`` of
            each item are used as lookup keys.
        full_to_two: Mapping from a key to its replacement string.

    Returns:
        A list with one concatenated string per input item.

    Raises:
        KeyError: If a key is missing from ``full_to_two``.
    """
    codes = []
    for syllable in pinyin:
        head = full_to_two[syllable[0]]
        tail = full_to_two[syllable[1]]
        codes.append(head + tail)
    return codes


def get_double_dict():
    """Load every ``FullToTwoTable`` row into a ``full -> two`` dictionary.

    A duplicated ``full`` value is treated as fatal: a message is printed
    and the process exits with status 1.

    Returns:
        Dictionary mapping each row's ``full`` value to its ``two`` value.
    """
    mapping = {}
    for row in FullToTwoTable.select():
        if row.full in mapping:
            print(f"ERROR in {row.full}")
            sys.exit(1)
        mapping[row.full] = row.two
    return mapping


if __name__ == "__main__":
    # Recompute the code of every stored word and delete rows whose stored
    # ``phones`` value disagrees with the recomputed one.
    lookup = get_double_dict()
    for row in WordPhoneTable.select():
        text = row.word
        stored = row.phones
        syllables = list(map(split_sy, lazy_pinyin(text)))
        expected = ''.join(full_to_double(syllables, lookup))
        if stored == expected:
            continue
        print(f"diff in {row.id}, {text}, {stored}, {expected}")
        row.delete_instance()
    print("done")