Exemplo n.º 1
0
def part(text, pinyin2cmu_dict):
    """Map a space-separated, tone-numbered pinyin string to phones and tones.

    Non-ASCII characters are stripped from ``text`` first. Tokens that are
    empty or contain ``#`` are skipped. Each remaining syllable is split
    with ``get_initials`` / ``get_finals`` (non-strict); every part whose
    upper-cased form is a key of ``pinyin2cmu_dict`` contributes its mapped
    phone string, and parts without a mapping are silently dropped.

    Args:
        text: Space-separated pinyin syllables, optionally carrying a tone
            number (e.g. ``"ni3 hao3"``).
        pinyin2cmu_dict: Mapping from upper-case initial/final to a
            space-separated phone string.

    Returns:
        A ``(phone_list, tone_list)`` tuple. ``phone_list`` holds the mapped
        phone strings followed by ``"$"`` after every syllable.
        ``tone_list`` holds the tone id (tone number + 3, or 8 when the
        syllable has no number) as a string, once per space-separated phone,
        followed by ``"8"`` for every ``"$"``.
    """
    phone_list = []
    tone_list = []
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    for pinyin in text.split(" "):
        if not pinyin or "#" in pinyin:
            continue

        # A syllable without a tone number is treated as tone 5.
        numbers = re.findall(r"\d+\.?\d*", pinyin)
        tone = int(numbers[0]) + 3 if numbers else 8

        # Drop the tone number so only the letters are split below.
        pinyin = pinyin.replace(str(tone - 3), "")

        for pin_part in (get_initials(pinyin, False),
                         get_finals(pinyin, False)):
            key = pin_part.upper()
            if key not in pinyin2cmu_dict:
                continue
            cmu = pinyin2cmu_dict[key]
            phone_list.append(cmu)
            # One tone entry per individual phone in the mapped string.
            tone_list.extend([str(tone)] * len(cmu.split(" ")))

        # Syllable boundary marker and its fixed tone id.
        phone_list.append("$")
        tone_list.append(str(8))

    return phone_list, tone_list
Exemplo n.º 2
0
def _nonstd_style(pinyin, **kwargs):
    """Return the strict initials followed by the TONE3-style finals.

    When the finals are listed in ``sil_finals`` and the initials are listed
    in ``sil_finals_initials``, every ``i`` is removed from the finals
    before the two parts are joined.
    """
    head = get_initials(pinyin, strict=True)
    tail = finals_converter.to_finals_tone3(pinyin, strict=True)

    # process silent finals
    has_silent_final = tail in sil_finals and head in sil_finals_initials
    if has_silent_final:
        tail = tail.replace('i', '')

    return ''.join([head, tail])
Exemplo n.º 3
0
def pypinyin_g2p_phone(text) -> List[str]:
    """Split ``text`` into a flat list of pinyin initials and finals.

    The text is converted with ``pinyin(text, style=Style.TONE3)``; for the
    first reading of every item the strict initial and the strict final are
    emitted in that order, and empty parts are left out.
    """
    from pypinyin import Style
    from pypinyin import pinyin
    from pypinyin.style._utils import get_finals
    from pypinyin.style._utils import get_initials

    phones = []
    for readings in pinyin(text, style=Style.TONE3):
        syllable = readings[0]
        initial = get_initials(syllable, strict=True)
        final = get_finals(syllable, strict=True)
        phones.extend(part for part in (initial, final) if part)
    return phones
Exemplo n.º 4
0
def part2(text, pinyin2cmu_dict):
    """Convert a space-separated pinyin string to a list of CMU-style phones.

    Non-ASCII characters are stripped from ``text`` first. Every token is
    then handled as follows:

    * tokens containing ``#`` are appended unchanged;
    * otherwise the token is split with ``get_initials`` / ``get_finals``
      (non-strict, upper-cased) and the tone number is removed from the
      final before it is looked up;
    * if the token has nothing to look up, or a non-empty part is missing
      from ``pinyin2cmu_dict``, the raw token is appended unchanged instead
      of raising ``KeyError``;
    * otherwise the mapped initial (if any), the mapped final with the tone
      number re-attached (if any) and a ``" "`` separator are appended.

    Args:
        text: Space-separated pinyin syllables, optionally tone-numbered.
        pinyin2cmu_dict: Mapping from upper-case initial/final (without tone
            number) to its phone string.

    Returns:
        The list of phone strings, pass-through tokens and separators.
    """
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    new_phone_list = []

    for pinyin in text.split(" "):
        # Tokens containing "#" are passed through verbatim.
        if "#" in pinyin:
            new_phone_list.append(pinyin)
            continue

        head = get_initials(pinyin, False).upper()
        tail = get_finals(pinyin, False).upper()

        # The final still carries the tone number; split it off so the
        # dictionary is queried with the same key that is looked up later.
        tone_matches = re.findall(r"\d+\.?\d*", tail)
        tone = tone_matches[0] if tone_matches else ""
        tail_base = tail.replace(tone, "") if tone else tail

        head_ok = head == "" or head in pinyin2cmu_dict
        tail_ok = tail_base == "" or tail_base in pinyin2cmu_dict
        nothing_to_map = head == "" and tail_base == ""
        if nothing_to_map or not (head_ok and tail_ok):
            # Unknown syllable: keep the raw token rather than crashing.
            new_phone_list.append(pinyin)
            continue

        if head:
            new_phone_list.append(pinyin2cmu_dict[head])
        if tail_base:
            new_phone_list.append(pinyin2cmu_dict[tail_base] + tone)
        new_phone_list.append(" ")

    return new_phone_list
Exemplo n.º 5
0
def pypinyin_g2p_phone(text) -> List[str]:
    """Split ``text`` into pinyin initials and tone-carrying finals.

    The text is converted with ``pinyin(text, style=Style.TONE3)``. For the
    first reading of every item the strict initial is emitted, followed by
    the final: when the reading ends in a digit, the final is taken from the
    reading without that digit and the digit is appended again; when it ends
    in another alphanumeric character, the final of the whole reading is
    used; otherwise the reading itself is emitted. Empty parts and parts
    consisting only of digits are left out.
    """
    from pypinyin import Style, pinyin
    from pypinyin.style._utils import get_finals, get_initials

    def _final_part(reading):
        last_char = reading[-1]
        if last_char.isdigit():
            return get_finals(reading[:-1], strict=True) + last_char
        if last_char.isalnum():
            return get_finals(reading, strict=True)
        return reading

    phones = []
    for readings in pinyin(text, style=Style.TONE3):
        reading = readings[0]
        initial = get_initials(reading, strict=True)
        final = _final_part(reading)
        for part in (initial, final):
            # Remove the case of individual tones as a phoneme
            if part and not part.isdigit():
                phones.append(part)
    return phones
Exemplo n.º 6
0
def to_wade_glides(pinyin, **kwargs):
    """Convert ``pinyin`` using the module's conversion tables.

    After symbol replacement and mapping ``v`` to ``ü``, the whole syllable
    is tried against ``_except_table`` and then ``_convert_table``; the
    first table that changes it decides the result. Otherwise the non-strict
    initials and the remainder of the syllable are converted separately with
    ``_initial_table`` and ``_tone_table`` and joined. Every result is
    passed through ``_fixed_result``.
    """
    pinyin = replace_symbol_to_no_symbol(pinyin).replace('v', 'ü')

    # Whole-syllable tables are consulted first, exceptions before the rest.
    for table in (_except_table, _convert_table):
        converted = _convert_whole(pinyin, table)
        if converted != pinyin:
            return _fixed_result(converted)

    head = get_initials(pinyin, strict=False)
    rest = pinyin[len(head):]

    head = _convert_whole(head, _initial_table)
    rest = _convert_whole(rest, _tone_table)

    return _fixed_result('{}{}'.format(head, rest))
Exemplo n.º 7
0
def frontend(text):
    """Clean text and then convert to id sequence.

    The text is converted to TONE3 pinyin, each syllable is split into its
    strict initial and final, and every non-empty part is mapped through
    ``char_to_id`` (falling back to the ``"<unk>"`` entry with a warning).
    ``idim - 1`` is appended as the end-of-sequence id and the result is
    returned as a flat ``torch.LongTensor`` moved to ``device``.
    """
    syllables = [readings[0] for readings in pinyin(text, style=Style.TONE3)]
    print(f"Cleaned text: {syllables}")

    idseq = []
    for syllable in syllables:
        parts = (
            get_initials(syllable, strict=True),
            get_finals(syllable, strict=True),
        )
        for part in parts:
            if not part:
                continue
            if part in char_to_id:
                idseq.append(char_to_id[part])
            else:
                print(f"WARN: {part} is not included in dict.")
                idseq.append(char_to_id["<unk>"])

    idseq.append(idim - 1)  # <eos>
    return torch.LongTensor(idseq).view(-1).to(device)
Exemplo n.º 8
0
def get_pinyin(content):
    """Convert ``content`` to a space-separated string of initials and finals.

    ``content`` is converted with ``pinyin(..., style=Style.TONE3)``. The
    first reading of every item is split into its strict initial and final,
    and the final is rewritten by a set of special rules meant to match
    CSMSC pinyin. The initial is emitted only when it is non-empty; the
    final is always emitted.

    Args:
        content: Text to convert.

    Returns:
        The initials and finals joined by single spaces.
    """
    # Some special rules to match CSMSC pinyin
    text = pinyin(content, style=Style.TONE3)
    text = [c[0] for c in text]
    clean_content = []
    for c in text:
        c_init = get_initials(c, strict=True)
        c_final = get_finals(c, strict=True).replace("ü", "v")

        # A "w" initial is dropped and folded into the final as a leading "u".
        if c_init == 'w':
            c_init = ''
            if c_final != 'u':
                c_final = 'u' + c_final

        # A "y" initial is dropped; the final gets "v" for "u" or a leading "i".
        if c_init == 'y':
            c_init = ''
            if c_final.startswith("u"):
                c_final = c_final.replace('u', 'v')
            elif not c_final.startswith('i'):
                c_final = 'i' + c_final

        # The patterns are raw strings so that "\d" reaches the regex engine
        # without being an invalid string escape. re.match anchors at the
        # start, so each rule applies only when the final begins with the
        # given letters immediately followed by a digit.
        if re.match(r"iu\d", c_final):
            c_final = c_final.replace("iu", "iou")
        if re.match(r"ui\d", c_final):
            c_final = c_final.replace("ui", "uei")
        if re.match(r"ue\d", c_final):
            c_final = c_final.replace("ue", "ve")

        # A bare "i" final is written "ii" or "iii" depending on the initial.
        if re.match(r"i\d", c_final):
            if c_init in ['z', 'c', 's']:
                c_final = c_final.replace("i", "ii")
            elif c_init in ['zh', 'ch', 'sh', 'r']:
                c_final = c_final.replace("i", "iii")

        # "u"/"un"/"uan" become "v..." after j/q/x; otherwise "un" -> "uen".
        # NOTE(review): 'y' can no longer match here because a "y" initial
        # was already reset to '' above.
        if re.match(r"(u|un|uan)\d", c_final):
            if c_init in ['j', 'q', 'x', 'y']:
                c_final = c_final.replace("u", "v")
            else:
                if re.match(r"un\d", c_final):
                    c_final = c_final.replace("un", "uen")
        if c_init:
            clean_content.append(c_init)
        clean_content.append(c_final)
    return ' '.join(clean_content)
Exemplo n.º 9
0
def to_initials(pinyin, strict=True):
    """Convert pinyin in :py:attr:`~pypinyin.Style.TONE`,
    :py:attr:`~pypinyin.Style.TONE2`,
    :py:attr:`~pypinyin.Style.TONE3` or
    :py:attr:`~pypinyin.Style.NORMAL` style to
    :py:attr:`~pypinyin.Style.INITIALS` style.

    :param pinyin: pinyin in :py:attr:`~pypinyin.Style.TONE`,
                   :py:attr:`~pypinyin.Style.TONE2`,
                   :py:attr:`~pypinyin.Style.TONE3` or
                   :py:attr:`~pypinyin.Style.NORMAL` style
    :param strict: whether initials and finals are handled strictly
                   according to the Scheme for the Chinese Phonetic
                   Alphabet (Hanyu Pinyin), see :ref:`strict`
    :return: pinyin in :py:attr:`~pypinyin.Style.INITIALS` style

    Usage::

      >>> from pypinyin.contrib.tone_convert import to_initials
      >>> to_initials('zhōng')
      'zh'

    """
    initials = get_initials(pinyin, strict=strict)
    return initials
def translate_pinyin(sentence):
    """Convert a Chinese sentence into the pinyin string the model expects.

    Characters not matched by the filter regex are dropped. The remaining
    text is converted with ``lazy_pinyin(..., style=Style.TONE3)``. Items
    that are one of the five listed punctuation marks are kept as they are;
    every other item gets tone ``5`` appended when it does not end in 1-4,
    is optionally rewritten through ``TRANSLATE_DICT`` (keyed by the
    syllable without its tone), and is emitted as ``{initial final}`` using
    the non-strict initial/final split.

    :param sentence: the Chinese sentence, as text or as UTF-8 encoded bytes
    :return: the converted items joined by single spaces
    """
    # Only bytes need decoding; text input is used as-is. Calling .decode()
    # unconditionally fails on Python 3 str objects.
    if isinstance(sentence, bytes):
        sentence = sentence.decode(encoding='utf-8')

    # Keep only the characters matched by the filter: Chinese characters
    # and the punctuation listed in the character class.
    regex = u".*?([\u2E80-\u9FFF,!。?、]+).*?"
    filtered = ''.join(re.findall(regex, sentence))

    # Pass text, not encoded bytes, so the punctuation comparison below
    # operates on the same type that lazy_pinyin returns.
    pinyin_list = lazy_pinyin(filtered, style=Style.TONE3)

    punctuation = [u'\uff0c', u'\u3002', u'\uff1f', u'\uff01', u'\u3001']
    result = []
    for syllable in pinyin_list:
        # Punctuation marks are passed through unchanged.
        if syllable in punctuation:
            result.append(syllable)
            continue

        # Normalise the syllable: neutral-tone syllables without a tone
        # number get tone 5.
        if syllable[-1] not in ['1', '2', '3', '4']:
            syllable = syllable + '5'

        # Replace the initial/final spelling for syllables in TRANSLATE_DICT.
        base, tone = syllable[:-1], syllable[-1]
        if base in TRANSLATE_DICT:
            syllable = TRANSLATE_DICT[base] + tone

        shengmu = get_initials(syllable, strict=False)  # initial
        yunmu = get_finals(syllable, strict=False)  # final
        # Emit each syllable as "{initial final}".
        result.append('{' + shengmu + ' ' + yunmu + '}')

    return ' '.join(result)
Exemplo n.º 11
0
    # clean every line in transcription file first
    transcription_dict = {}
    with codecs.open(args.transcription_path, "r", "utf-8") as fid:
        for line in fid.readlines():
            segments = line.split(" ")
            lang_char = args.transcription_path.split("/")[-1][0]
            id = args.spk + "_" + lang_char + segments[0]  # ex. TMF1_M10001
            content = segments[1].replace("\n", "")

            # Some special rules to match CSMSC pinyin
            text = pinyin(content, style=Style.TONE3)
            text = [c[0] for c in text]
            clean_content = []
            for c in text:
                c_init = get_initials(c, strict=True)
                c_final = get_finals(c, strict=True)
                for c in [c_init, c_final]:
                    if len(c) == 0:
                        continue
                    c = c.replace("ü", "v")
                    c = c.replace("ui", "uei")
                    c = c.replace("un", "uen")
                    c = c.replace("iu", "iou")

                    # Special rule: "e5n" -> "en5"
                    if "5" in c:
                        c = c.replace("5", "") + "5"
                    clean_content.append(c)

            transcription_dict[id] = " ".join(["<" + args.lang_tag + ">"] +
Exemplo n.º 12
0
def convert(pinyin, **kwargs):
    """Return the initials of ``pinyin``.

    The optional ``strict`` keyword (default ``True``) is forwarded to
    ``get_initials``; other keyword arguments are ignored.
    """
    return get_initials(pinyin, kwargs.get('strict', True))