def part(text, pinyin2cmu_dict):
    """Translate space-separated pinyin tokens into CMU phones and tone ids.

    Non-ASCII characters are removed from ``text`` before it is split on
    single spaces.  Empty tokens and tokens containing ``#`` are skipped.
    For every remaining token the first number found is taken as its tone
    (``5`` when there is none), shifted by 3, and removed from the token.
    The token's initial and final (non-strict ``get_initials`` /
    ``get_finals``) are upper-cased and looked up in ``pinyin2cmu_dict``;
    parts that are not in the mapping are silently left out.

    NOTE(review): the layout of this function was rebuilt from syntax alone.
    The ``"$"`` / ``"8"`` boundary entry is assumed to be appended once per
    processed syllable -- confirm against the intended behaviour.

    Args:
        text: Space-separated pinyin tokens, optionally with tone numbers.
        pinyin2cmu_dict: Mapping from upper-cased initials/finals to a CMU
            phone string whose phones are separated by single spaces.

    Returns:
        A ``(phone_list, tone_list)`` tuple.  ``phone_list`` holds one mapped
        CMU string per recognised part plus ``"$"`` boundaries; ``tone_list``
        holds one tone string per individual CMU phone plus ``"8"`` for each
        boundary.
    """
    phones = []
    tones = []
    ascii_text = re.sub(r'[^\x00-\x7F]+', '', text)

    for syllable in ascii_text.split(" "):
        # Tokens produced by split(" ") never contain a space, so testing for
        # emptiness and the "#" marker covers every skipped case.
        if not syllable or "#" in syllable:
            continue

        numbers = re.findall(r"\d+\.?\d*", syllable)
        tone = int(numbers[0] if numbers else "5") + 3
        syllable = syllable.replace(str(tone - 3), "")

        # Debug trace, kept exactly as emitted by the original code.
        print(syllable, len(syllable))
        initial = get_initials(syllable, False)
        initial_missing = initial.upper() not in pinyin2cmu_dict
        print('test', initial.upper(), initial_missing)
        final = get_finals(syllable, False)
        final_missing = final.upper() not in pinyin2cmu_dict
        print('tste', final.upper(), final_missing)
        print(initial_missing and final_missing)
        print("===================")

        for piece in (initial, final):
            print("pin_part", piece)
            key = piece.upper()
            if key not in pinyin2cmu_dict:
                continue
            cmu = pinyin2cmu_dict[key]
            phones.append(cmu)
            # One tone entry per individual CMU phone in the mapped string.
            tones.extend(str(tone) for _ in cmu.split(" "))
            print("cmu", cmu)

        phones.append("$")
        tones.append(str(8))

    print(phones)
    return phones, tones
def _nonstd_style(pinyin, **kwargs):
    """Rebuild ``pinyin`` from its strict initial and its tone3 final.

    The initial comes from ``get_initials(..., strict=True)`` and the final
    from ``finals_converter.to_finals_tone3(..., strict=True)``.  When the
    final is listed in ``sil_finals`` and the initial is listed in
    ``sil_finals_initials``, every ``i`` is removed from the final before
    the two parts are concatenated.

    :param pinyin: pinyin string to convert
    :param kwargs: accepted but not used
    :return: concatenation of the initial and the (possibly trimmed) final
    """
    head = get_initials(pinyin, strict=True)
    tail = finals_converter.to_finals_tone3(pinyin, strict=True)

    # process silent finals
    has_silent_final = tail in sil_finals and head in sil_finals_initials
    if has_silent_final:
        tail = tail.replace('i', '')

    return '' + head + tail
def pypinyin_g2p_phone(text) -> List[str]:
    """Split ``text`` into a flat list of pinyin initials and finals.

    ``text`` is converted with ``pypinyin.pinyin`` in ``Style.TONE3``; the
    first reading of every item is split with the strict ``get_initials`` /
    ``get_finals`` helpers, and empty parts are dropped.

    NOTE(review): ``get_finals`` receives the syllable with its trailing tone
    digit still attached.  Presumably some pypinyin releases return an empty
    final for such input, which would silently drop every toned final --
    verify against the pypinyin version in use.

    Args:
        text: Input text handed to ``pypinyin.pinyin``.

    Returns:
        Non-empty initials and finals, in reading order.
    """
    from pypinyin import pinyin
    from pypinyin import Style
    from pypinyin.style._utils import get_finals
    from pypinyin.style._utils import get_initials

    phones = []
    for candidates in pinyin(text, style=Style.TONE3):
        syllable = candidates[0]
        initial = get_initials(syllable, strict=True)
        final = get_finals(syllable, strict=True)
        phones.extend(piece for piece in (initial, final) if len(piece) != 0)
    return phones
def part2(text, pinyin2cmu_dict):
    """Rewrite space-separated pinyin tokens as CMU phone strings.

    Non-ASCII characters are removed from ``text`` before it is split on
    single spaces.  Each token is handled as follows:

    * tokens containing ``#`` are copied to the output unchanged;
    * otherwise the token is split with the non-strict ``get_initials`` /
      ``get_finals`` helpers and both parts are upper-cased;
    * a tone number found in the final is stripped before the dictionary
      lookup and re-attached to the mapped final;
    * if a non-empty part is missing from ``pinyin2cmu_dict`` (or the token
      has no usable part at all) the raw token is copied unchanged;
    * successfully mapped syllables are followed by a ``" "`` separator.

    The membership test uses the toneless final, i.e. the same key that is
    used for the lookup, so toned syllables without an initial are mapped
    and partially known syllables fall back to the raw token instead of
    raising ``KeyError``.

    Args:
        text: Space-separated pinyin tokens, optionally with tone numbers.
        pinyin2cmu_dict: Mapping from upper-cased, toneless initials/finals
            to CMU phone strings.

    Returns:
        List of mapped phone strings, raw fall-back tokens and ``" "``
        separators, in input order.
    """
    ascii_text = re.sub(r'[^\x00-\x7F]+', '', text)
    new_phone_list = []

    for pinyin in ascii_text.split(" "):
        # Marker tokens are passed through verbatim.
        if "#" in pinyin:
            new_phone_list.append(pinyin)
            continue

        head = get_initials(pinyin, False).upper()
        tail = get_finals(pinyin, False).upper()

        # Separate the tone number from the final so the lookup key matches
        # the toneless keys of the dictionary.
        tone_matches = re.findall(r"\d+\.?\d*", tail)
        tone = tone_matches[0] if tone_matches else ""
        bare_tail = tail.replace(tone, "") if tone else tail

        nothing_to_map = not head and not tail
        head_unknown = bool(head) and head not in pinyin2cmu_dict
        tail_unknown = bool(tail) and bare_tail not in pinyin2cmu_dict
        if nothing_to_map or head_unknown or tail_unknown:
            new_phone_list.append(pinyin)
            continue

        if head:
            new_phone_list.append(pinyin2cmu_dict[head])
        if tail:
            new_phone_list.append(pinyin2cmu_dict[bare_tail] + tone)
        new_phone_list.append(" ")

    return new_phone_list
def pypinyin_g2p_phone(text) -> List[str]:
    """Split ``text`` into pinyin initials and tone-carrying finals.

    ``text`` is converted with ``pypinyin.pinyin`` in ``Style.TONE3``.  For
    the first reading of every item:

    * the initial is ``get_initials(..., strict=True)``;
    * if the reading ends in a digit, the final is computed from the reading
      without that digit and the digit is appended afterwards;
    * if it ends in another alphanumeric character, the final is computed
      from the whole reading;
    * otherwise the reading itself is used in place of a final.

    Empty parts and parts consisting only of digits are dropped.

    Args:
        text: Input text handed to ``pypinyin.pinyin``.

    Returns:
        The remaining initials and finals, in reading order.
    """
    from pypinyin import Style, pinyin
    from pypinyin.style._utils import get_finals, get_initials

    phones = []
    for candidates in pinyin(text, style=Style.TONE3):
        syllable = candidates[0]
        initial = get_initials(syllable, strict=True)

        last_char = syllable[-1]
        if last_char.isdigit():
            final = get_finals(syllable[:-1], strict=True) + last_char
        elif last_char.isalnum():
            final = get_finals(syllable, strict=True)
        else:
            final = syllable

        for piece in (initial, final):
            # Remove the case of individual tones as a phoneme
            if len(piece) != 0 and not piece.isdigit():
                phones.append(piece)
    return phones
def to_wade_glides(pinyin, **kwargs):
    """Convert ``pinyin`` using the module's conversion tables.

    The input is first passed through ``replace_symbol_to_no_symbol`` and
    every ``v`` is replaced by ``ü``.  The whole syllable is then looked up
    in ``_except_table`` and, failing that, in ``_convert_table``; the first
    table that changes the syllable decides the result.  If neither does,
    the non-strict initial and the remainder of the syllable are converted
    separately with ``_initial_table`` and ``_tone_table`` and joined.

    :param pinyin: pinyin string to convert
    :param kwargs: accepted but not used
    :return: the converted text after ``_fixed_result`` has been applied
    """
    normalized = replace_symbol_to_no_symbol(pinyin).replace('v', 'ü')

    # Whole-syllable tables take precedence, exceptions first.
    for table in (_except_table, _convert_table):
        candidate = _convert_whole(normalized, table)
        if candidate != normalized:
            return _fixed_result(candidate)

    head = get_initials(normalized, strict=False)
    rest = normalized[len(head):]
    converted_head = _convert_whole(head, _initial_table)
    converted_rest = _convert_whole(rest, _tone_table)
    return _fixed_result('{}{}'.format(converted_head, converted_rest))
def frontend(text):
    """Clean text and then convert to id sequence.

    ``text`` is converted with ``pinyin(..., style=Style.TONE3)`` and the
    first reading of every item is split into a strict initial and final.
    Each non-empty part is mapped through ``char_to_id``; parts that are not
    in the table produce a warning on stdout and the ``"<unk>"`` id.  The id
    ``idim - 1`` is appended as ``<eos>``.

    Returns:
        A flattened ``torch.LongTensor`` of ids, moved to ``device``.
    """
    syllables = [candidates[0] for candidates in pinyin(text, style=Style.TONE3)]
    print(f"Cleaned text: {syllables}")

    idseq = []
    for syllable in syllables:
        pieces = (
            get_initials(syllable, strict=True),
            get_finals(syllable, strict=True),
        )
        for piece in pieces:
            if not piece:
                continue
            if piece in char_to_id.keys():
                idseq.append(char_to_id[piece])
            else:
                print(f"WARN: {piece} is not included in dict.")
                idseq.append(char_to_id["<unk>"])

    idseq.append(idim - 1)  # <eos>
    return torch.LongTensor(idseq).view(-1).to(device)
def get_pinyin(content):
    """Convert ``content`` into a space-separated initial/final sequence.

    ``content`` is converted with ``pinyin(..., style=Style.TONE3)``; the
    first reading of every item is split into a strict initial and final
    (``ü`` written as ``v``) and then rewritten by the rules below, which
    the original author describes as matching CSMSC pinyin.  The rules are
    applied in order, each one seeing the result of the previous ones.

    NOTE(review): most rewrite rules require a tone digit right after the
    final, so they only fire if ``get_finals`` keeps that digit -- confirm
    with the pypinyin version in use.

    Args:
        content: Text handed to ``pinyin``.

    Returns:
        The non-empty initials and all finals joined by single spaces.
    """
    # Some special rules to match CSMSC pinyin
    syllables = [candidates[0] for candidates in pinyin(content, style=Style.TONE3)]
    clean_content = []

    for syllable in syllables:
        c_init = get_initials(syllable, strict=True)
        c_final = get_finals(syllable, strict=True).replace("ü", "v")

        # "w" is dropped as an initial; its final gains a leading "u" unless
        # the final is exactly "u".
        if c_init == 'w':
            c_init = ''
            if c_final != 'u':
                c_final = 'u' + c_final

        # "y" is dropped as an initial; a final starting with "u" has its
        # "u" turned into "v", any other final not starting with "i" gains one.
        if c_init == 'y':
            c_init = ''
            if c_final.startswith("u"):
                c_final = c_final.replace('u', 'v')
            elif not c_final.startswith('i'):
                c_final = 'i' + c_final

        # Expand abbreviated finals (pattern must be followed by a digit).
        if re.match(r"iu\d", c_final):
            c_final = c_final.replace("iu", "iou")
        if re.match(r"ui\d", c_final):
            c_final = c_final.replace("ui", "uei")
        if re.match(r"ue\d", c_final):
            c_final = c_final.replace("ue", "ve")

        # A bare "i" final is written "ii" / "iii" depending on the initial.
        if re.match(r"i\d", c_final):
            if c_init in ['z', 'c', 's']:
                c_final = c_final.replace("i", "ii")
            elif c_init in ['zh', 'ch', 'sh', 'r']:
                c_final = c_final.replace("i", "iii")

        # "u", "un", "uan": "u" becomes "v" after j/q/x/y, otherwise a bare
        # "un" is expanded to "uen".
        if re.match(r"(u|un|uan)\d", c_final):
            if c_init in ['j', 'q', 'x', 'y']:
                c_final = c_final.replace("u", "v")
            else:
                if re.match(r"un\d", c_final):
                    c_final = c_final.replace("un", "uen")

        if c_init:
            clean_content.append(c_init)
        clean_content.append(c_final)

    return ' '.join(clean_content)
def to_initials(pinyin, strict=True):
    """Convert pinyin in :py:attr:`~pypinyin.Style.TONE`,
    :py:attr:`~pypinyin.Style.TONE2`, :py:attr:`~pypinyin.Style.TONE3` or
    :py:attr:`~pypinyin.Style.NORMAL` style into
    :py:attr:`~pypinyin.Style.INITIALS` style.

    :param pinyin: pinyin in :py:attr:`~pypinyin.Style.TONE`,
                   :py:attr:`~pypinyin.Style.TONE2`,
                   :py:attr:`~pypinyin.Style.TONE3` or
                   :py:attr:`~pypinyin.Style.NORMAL` style
    :param strict: whether initials and finals are handled in strict
                   accordance with the Hanyu Pinyin scheme,
                   see :ref:`strict`
    :return: pinyin in :py:attr:`~pypinyin.Style.INITIALS` style

    Usage::

      >>> from pypinyin.contrib.tone_convert import to_initials
      >>> to_initials('zhōng')
      'zh'

    """
    # Thin wrapper: the work is done entirely by get_initials.
    return get_initials(pinyin, strict=strict)
def translate_pinyin(sentence):
    """
    Convert a Chinese sentence (which may only contain Chinese characters and
    Chinese punctuation) into the pinyin format required by the model.

    ``sentence`` is decoded with ``.decode``, so it has to be a UTF-8 encoded
    byte string.

    NOTE(review): the filter pattern keeps ``,``, ``!`` and ``?`` exactly as
    written in its character class, whereas the punctuation check in the loop
    compares against U+FF0C, U+FF1F and U+FF01 -- confirm which forms are
    intended.

    :param sentence: the input Chinese sentence
    :return: the converted pinyin data, tokens joined by single spaces
    """
    # Decode the incoming sentence from UTF-8.
    decoded = sentence.decode(encoding='utf-8')

    # Keep only the characters matched by the pattern and drop everything else.
    keep_pattern = u".*?([\u2E80-\u9FFF,!。?、]+).*?"
    kept_text = ''.join(re.findall(keep_pattern, decoded)).encode(encoding='utf-8')

    punctuation = [u'\uff0c', u'\u3002', u'\uff1f', u'\uff01', u'\u3001']
    neutral_tone_free = ['1', '2', '3', '4']

    tokens = []
    # Convert the filtered sentence and reformat each resulting item.
    for syllable in lazy_pinyin(kept_text, style=Style.TONE3):
        # Punctuation marks are passed through unchanged.
        if syllable in punctuation:
            tokens.append(syllable)
            continue

        # Normalise the syllable: swap the spelling of some initials/finals
        # and give toneless (neutral) syllables an explicit fifth tone.
        if syllable[-1] not in neutral_tone_free:
            syllable = syllable + '5'

        # Replace the toneless spelling according to TRANSLATE_DICT.
        stem, tone = syllable[:-1], syllable[-1]
        if stem in TRANSLATE_DICT.keys():
            syllable = TRANSLATE_DICT[stem] + tone

        # Initial and final of the (possibly replaced) syllable.
        shengmu = get_initials(syllable, strict=False)
        yunmu = get_finals(syllable, strict=False)

        # Emit each syllable as "{initial final}".
        tokens.append('{' + shengmu + ' ' + yunmu + '}')

    # Join everything into the final pinyin string.
    return ' '.join(tokens)
# clean every line in transcription file first transcription_dict = {} with codecs.open(args.transcription_path, "r", "utf-8") as fid: for line in fid.readlines(): segments = line.split(" ") lang_char = args.transcription_path.split("/")[-1][0] id = args.spk + "_" + lang_char + segments[0] # ex. TMF1_M10001 content = segments[1].replace("\n", "") # Some special rules to match CSMSC pinyin text = pinyin(content, style=Style.TONE3) text = [c[0] for c in text] clean_content = [] for c in text: c_init = get_initials(c, strict=True) c_final = get_finals(c, strict=True) for c in [c_init, c_final]: if len(c) == 0: continue c = c.replace("ü", "v") c = c.replace("ui", "uei") c = c.replace("un", "uen") c = c.replace("iu", "iou") # Special rule: "e5n" -> "en5" if "5" in c: c = c.replace("5", "") + "5" clean_content.append(c) transcription_dict[id] = " ".join(["<" + args.lang_tag + ">"] +
def convert(pinyin, **kwargs):
    """Return the result of ``get_initials`` for ``pinyin``.

    :param pinyin: pinyin string handed to ``get_initials``
    :param kwargs: only ``strict`` is read (default ``True``); it is passed
                   on as the second positional argument of ``get_initials``
    :return: whatever ``get_initials`` returns
    """
    # Callers may omit ``strict``; it then defaults to True.
    strict = kwargs.get('strict', True)
    return get_initials(pinyin, strict)