def reset(self):
    """Discard the current quiz and build a new one from up to ten poems.

    ``self.poem_list`` and ``self.sentence_list`` are shuffled in place.
    For each of the first ten poems after shuffling, the poem text is cut
    into couplets, one couplet is drawn at random, one of its clauses is
    blanked out, and a four-option multiple-choice question is appended
    to ``self.question_list``.  Poems that yield no usable couplet are
    skipped, so fewer than ten questions may be produced.
    """
    self.question_list = []
    self.question_id = 0

    # Pick the poems that will be examined.
    random.shuffle(self.poem_list)
    for poem in self.poem_list[:10]:
        # Break the poem into couplets: newlines count as full stops.
        raw_couplets = poem["content"].replace("\n", u"。").split(u"。")
        banned_chars = set(u"()()?!:?;")
        couplets = []
        for raw in raw_couplets:
            couplet = raw.strip()
            # Keep only non-empty couplets that contain a comma and none
            # of the banned punctuation characters.
            if couplet and not (banned_chars & set(couplet)) and u"," in couplet:
                couplets.append(couplet)
        if not couplets:
            continue

        # Draw one couplet and blank out one of its clauses.
        clauses = random.choice(couplets).split(u",")
        blank_at = random.randint(0, len(clauses) - 1)
        target_sentence = clauses[blank_at]
        clauses[blank_at] = "_" * 14

        # Collect up to three distractors: different text, same length,
        # and the same final (rhyme) on the last character.
        random.shuffle(self.sentence_list)
        choices = []
        for candidate in self.sentence_list:
            is_same_text = candidate.strip() == target_sentence.strip()
            if is_same_text or len(candidate) != len(target_sentence):
                continue
            candidate_rhyme = get_finals(lazy_pinyin(candidate[-1])[0], True)
            target_rhyme = get_finals(
                lazy_pinyin(target_sentence[-1])[0], True)
            if candidate_rhyme == target_rhyme:
                choices.append(candidate)
                if len(choices) == 3:
                    break

        # Pad with a fixed filler sentence when too few distractors exist.
        choices += ["达拉崩吧公主米亚幸福的像个童话"] * (3 - len(choices))

        # Slot the correct answer in at a random position.
        answer_at = random.randint(0, 3)
        choices.insert(answer_at, target_sentence)

        # Add the question to the paper.
        self.question_list.append({
            "question": u",".join(clauses),
            "choice_list": choices,
            "answer": "ABCD"[answer_at],
        })
def pypinyin_g2p_phone(text) -> List[str]:
    """Convert ``text`` into a flat list of initial/final phones.

    Each item produced by ``pinyin(text, style=Style.TONE3)`` contributes
    its initial followed by its final.  The final is derived as follows:

    * last character is a digit: the final of the syllable without that
      digit, with the digit appended again;
    * last character is otherwise alphanumeric: the final of the whole
      syllable;
    * anything else: the item itself, unchanged.

    Empty strings and strings made only of digits are dropped.
    """
    from pypinyin import Style, pinyin
    from pypinyin.style._utils import get_finals, get_initials

    phones = []
    for entry in pinyin(text, style=Style.TONE3):
        syllable = entry[0]
        initial = get_initials(syllable, strict=True)
        last_char = syllable[-1]
        if last_char.isdigit():
            final = get_finals(syllable[:-1], strict=True) + last_char
        elif last_char.isalnum():
            final = get_finals(syllable, strict=True)
        else:
            final = syllable
        for phone in (initial, final):
            # Remove the case of individual tones as a phoneme
            if len(phone) != 0 and not phone.isdigit():
                phones.append(phone)
    return phones
def to_finals(pinyin, strict=True, v_to_u=False):
    """Convert pinyin in :py:attr:`~pypinyin.Style.TONE`,
    :py:attr:`~pypinyin.Style.TONE2`, :py:attr:`~pypinyin.Style.TONE3` or
    :py:attr:`~pypinyin.Style.NORMAL` style into
    :py:attr:`~pypinyin.Style.FINALS` style.

    :param pinyin: pinyin in :py:attr:`~pypinyin.Style.TONE`,
                   :py:attr:`~pypinyin.Style.TONE2`,
                   :py:attr:`~pypinyin.Style.TONE3` or
                   :py:attr:`~pypinyin.Style.NORMAL` style
    :param strict: whether initials and finals are handled strictly
                   according to the Hanyu Pinyin scheme,
                   see :ref:`strict`
    :param v_to_u: whether to use ``ü`` instead of the original ``v``;
                   when False the result uses ``v`` to represent ``ü``
    :return: pinyin in :py:attr:`~pypinyin.Style.FINALS` style

    Usage::

      >>> from pypinyin.contrib.tone_convert import to_finals
      >>> to_finals('zhōng')
      'ong'

    """
    # Strip tone marks, then spell "v" as "ü" before extracting the final.
    toneless = replace_symbol_to_no_symbol(pinyin)
    finals = get_finals(toneless.replace('v', 'ü'), strict=strict)
    return _fix_v_u(finals, finals, v_to_u)
def get_rhyme(word):
    """Return the 1-based index of the rhyme group matching ``word``.

    The final of the first syllable of ``word`` (non-strict mode) is
    looked up in the groups returned by ``read_rhyme_finals()``.  The
    position of the first group that contains it is returned, counting
    from 1; ``0`` means no group contains it.
    """
    final = get_finals(lazy_pinyin(word)[0], strict=False)
    rhyme_groups = read_rhyme_finals()
    return next(
        (position
         for position, group in enumerate(rhyme_groups, start=1)
         if final in group),
        0,
    )
def generate_words_by_rhyme(input_word):
    """Print up to three words that rhyme with ``input_word``.

    The rhyme key of ``input_word`` is the non-strict final of each of its
    syllables, joined by ``'``.  The key is looked up as a column of
    ``./dataset/rhymes-table.csv`` (first column dropped).  Every non-NaN
    cell of that column is parsed with ``ast.literal_eval``; element 0 is
    used as the word and element 1 as its frequency.  Up to three distinct
    words are sampled with probability proportional to frequency and
    printed, one per line.

    :param input_word: the word to find rhymes for
    :return: None; the result is written to stdout
    """
    rhymes_df = pd.read_csv('./dataset/rhymes-table.csv', sep=',',
                            header=0, encoding='utf-8')
    rhymes_df = rhymes_df.iloc[:, 1:]

    word_rhyme = "'".join(
        get_finals(syllable, strict=False)
        for syllable in lazy_pinyin(input_word))

    # Parse each populated cell once; NaN cells are only column padding.
    entries = []
    if word_rhyme in rhymes_df.columns:
        entries = [ast.literal_eval(cell)
                   for cell in rhymes_df[word_rhyme].dropna()]

    if not entries:
        print('Sorry! 好像没有找到押韵的词语哦~')
        return

    candidates = [entry[0] for entry in entries]
    frequencies = np.asarray([entry[1] for entry in entries], dtype=float)
    sample_num = min(3, len(candidates))

    # np.random.choice needs non-negative probabilities that sum to 1, and
    # sampling without replacement needs at least ``sample_num`` non-zero
    # entries.  Weight by frequency when that is possible; otherwise fall
    # back to uniform sampling (p=None).
    weights = np.clip(frequencies, 0.0, None)
    usable = (np.all(np.isfinite(weights))
              and np.count_nonzero(weights) >= sample_num)
    probabilities = weights / weights.sum() if usable else None

    words = np.random.choice(candidates, size=sample_num, replace=False,
                             p=probabilities)
    print('匹配的韵脚是:')
    for word in words:
        print(word)
def part(text, pinyin2cmu_dict):
    """Map space-separated pinyin tokens to CMU phones and tone labels.

    Non-ASCII characters are removed from ``text`` first.  Empty tokens
    and tokens containing ``#`` are skipped.  For every remaining token
    the first number found is taken as its tone (5 when there is none),
    stored as ``tone + 3`` and removed from the token.  The non-strict
    initial and final are then looked up, upper-cased, in
    ``pinyin2cmu_dict``: each hit appends the mapped string to the phone
    list and one tone label per space-separated phone to the tone list.
    After each token a ``"$"`` phone with tone ``"8"`` is appended.

    Diagnostic output is printed along the way.

    :return: ``(phone_list, tone_list)``
    """
    phone_list = []
    tone_list = []
    # Remove every run of non-ASCII characters before tokenising.
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    for pinyin in text.split(" "):
        if not pinyin or "#" in pinyin:
            continue

        numbers = re.findall(r"\d+\.?\d*", pinyin)
        tone = int(numbers[0] if numbers else "5") + 3
        pinyin = pinyin.replace(str(tone - 3), "")
        print(pinyin, len(pinyin))

        initial = get_initials(pinyin, False)
        initial_missing = initial.upper() not in pinyin2cmu_dict
        print('test', initial.upper(), initial_missing)
        final = get_finals(pinyin, False)
        final_missing = final.upper() not in pinyin2cmu_dict
        print('tste', final.upper(), final_missing)
        print(initial_missing and final_missing)
        print("===================")

        for pin_part in (initial, final):
            print("pin_part", pin_part)
            key = pin_part.upper()
            if key not in pinyin2cmu_dict:
                continue
            cmu = pinyin2cmu_dict[key]
            phone_list.append(cmu)
            # One tone label for every phone in the mapped string.
            tone_list.extend(str(tone) for _ in cmu.split(" "))
            print("cmu", cmu)

        # Syllable separator with its own tone label.
        phone_list.append("$")
        tone_list.append(str(8))
    print(phone_list)
    return phone_list, tone_list
def to_finals_tone2(self, pinyin, **kwargs):
    """Return the finals of ``pinyin`` with tones written as digits.

    When ``strict`` is truthy in ``kwargs`` the pinyin is first passed
    through ``convert_finals``.  If ``has_finals`` reports no finals, the
    digit-toned pinyin is returned as is.
    """
    if kwargs.get('strict'):
        pinyin = convert_finals(pinyin)

    finals_present = has_finals(pinyin)

    # Express the tone with a digit.
    numbered = replace_symbol_to_number(pinyin)
    if finals_present:
        # Extract the finals part.
        return get_finals(numbered, strict=False)
    return numbered
def to_finals(self, pinyin, **kwargs):
    """Return the finals of ``pinyin`` with tone marks removed.

    When ``strict`` is truthy in ``kwargs`` the pinyin is first passed
    through ``convert_finals``.  If ``has_finals`` reports no finals, the
    tone-free pinyin is returned as is.
    """
    if kwargs.get('strict'):
        pinyin = convert_finals(pinyin)

    finals_present = has_finals(pinyin)

    # Replace tone-marked characters with their unmarked counterparts.
    toneless = replace_symbol_to_no_symbol(pinyin)
    if finals_present:
        # Extract the finals part.
        return get_finals(toneless, strict=False)
    return toneless
def pypinyin_g2p_phone(text) -> List[str]:
    """Convert ``text`` into a flat list of initial/final phones.

    Each item produced by ``pinyin(text, style=Style.TONE3)`` contributes
    its strict initial followed by its strict final; empty parts are
    left out.
    """
    from pypinyin import pinyin
    from pypinyin import Style
    from pypinyin.style._utils import get_finals
    from pypinyin.style._utils import get_initials

    phones = []
    for entry in pinyin(text, style=Style.TONE3):
        syllable = entry[0]
        parts = (
            get_initials(syllable, strict=True),
            get_finals(syllable, strict=True),
        )
        phones.extend(part for part in parts if len(part) != 0)
    return phones
def to_finals_tone3(self, pinyin, **kwargs):
    """Return the finals of ``pinyin`` with the tone digit moved to the end.

    When ``strict`` is truthy in ``kwargs`` the pinyin is first passed
    through ``convert_finals``.  If ``has_finals`` reports no finals, the
    converted pinyin is returned as is.
    """
    if kwargs.get('strict'):
        pinyin = convert_finals(pinyin)

    finals_present = has_finals(pinyin)

    # Express the tone with a digit, then move that digit to the end.
    numbered = replace_symbol_to_number(pinyin)
    tone_last = RE_TONE3.sub(r'\1\3\2', numbered)
    if finals_present:
        # Extract the finals part.
        return get_finals(tone_last, strict=False)
    return tone_last
def part2(text, pinyin2cmu_dict):
    """Map space-separated pinyin tokens to CMU phone strings.

    Non-ASCII characters are removed from ``text`` first.  Each token is
    then handled as follows:

    * a token containing ``#`` is copied to the output unchanged;
    * a token whose upper-cased non-strict initial and final are both
      missing from ``pinyin2cmu_dict`` is copied unchanged;
    * otherwise the mapped initial (if non-empty) and the mapped final
      (if non-empty) are appended, followed by a ``" "`` separator.  When
      the final contains a number, the number is stripped before the
      lookup and re-attached to the mapped string.

    :param text: space-separated pinyin tokens
    :param pinyin2cmu_dict: mapping from upper-case initial/final to CMU
        phone string
    :return: list of phone strings, raw tokens and ``" "`` separators
    :raises KeyError: when exactly one of a token's non-empty parts is
        missing from ``pinyin2cmu_dict``, or the final is missing once its
        number has been stripped
    """
    # Remove every run of non-ASCII characters before tokenising.
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    new_phone_list = []
    for pinyin in text.split(" "):
        # Marker tokens pass through untouched; checking this first avoids
        # splitting them into initial/final for nothing.
        if "#" in pinyin:
            new_phone_list.append(pinyin)
            continue

        head = get_initials(pinyin, False).upper()
        tail = get_finals(pinyin, False).upper()
        if head not in pinyin2cmu_dict and tail not in pinyin2cmu_dict:
            new_phone_list.append(pinyin)
            continue

        if head != "":
            new_phone_list.append(pinyin2cmu_dict[head])
        if tail != "":
            tone = re.findall(r"\d+\.?\d*", tail)
            if not tone:
                new_phone_list.append(pinyin2cmu_dict[tail])
            else:
                # Look the final up without its number, then put the
                # number back on the mapped phone string.
                tail = tail.replace(tone[0], "")
                new_phone_list.append(pinyin2cmu_dict[tail] + tone[0])
        new_phone_list.append(" ")
    return new_phone_list
def frontend(text):
    """Clean text and then convert to id sequence.

    The text is converted to TONE3 pinyin, every syllable is split into
    its strict initial and final, and each non-empty part is looked up in
    ``char_to_id``.  Parts missing from the dictionary produce a warning
    and the ``"<unk>"`` id.  ``idim - 1`` is appended as the ``<eos>`` id
    and the sequence is returned as a 1-D ``torch.LongTensor`` moved to
    ``device``.
    """
    text = [entry[0] for entry in pinyin(text, style=Style.TONE3)]
    print(f"Cleaned text: {text}")

    idseq = []
    for syllable in text:
        units = (
            get_initials(syllable, strict=True),
            get_finals(syllable, strict=True),
        )
        for c in units:
            if not c:
                continue
            if c in char_to_id.keys():
                idseq.append(char_to_id[c])
            else:
                print(f"WARN: {c} is not included in dict.")
                idseq.append(char_to_id["<unk>"])
    idseq.append(idim - 1)  # <eos>
    return torch.LongTensor(idseq).view(-1).to(device)
def get_pinyin(content):
    """Convert ``content`` to a space-separated string of initials and finals.

    The text is converted to TONE3 pinyin and every syllable is split into
    its strict initial and final.  A set of special rewrite rules is then
    applied so that the result matches CSMSC pinyin; the initial (when
    non-empty) and the final of every syllable are emitted in order.

    :param content: the text to convert
    :return: initials and finals joined by single spaces
    """
    # Some special rules to match CSMSC pinyin
    text = pinyin(content, style=Style.TONE3)
    text = [c[0] for c in text]
    clean_content = []
    for c in text:
        c_init = get_initials(c, strict=True)
        c_final = get_finals(c, strict=True).replace("ü", "v")

        # "w" and "y" are folded into the final instead of being emitted
        # as initials.
        if c_init == 'w':
            c_init = ''
            if c_final != 'u':
                c_final = 'u' + c_final
        if c_init == 'y':
            c_init = ''
            if c_final.startswith("u"):
                c_final = c_final.replace('u', 'v')
            elif not c_final.startswith('i'):
                c_final = 'i' + c_final

        # Expand abbreviated finals; each pattern requires the final to be
        # immediately followed by a digit.
        if re.match(r"iu\d", c_final):
            c_final = c_final.replace("iu", "iou")
        if re.match(r"ui\d", c_final):
            c_final = c_final.replace("ui", "uei")
        if re.match(r"ue\d", c_final):
            c_final = c_final.replace("ue", "ve")

        # A bare "i" final is written differently after z/c/s and after
        # zh/ch/sh/r.
        if re.match(r"i\d", c_final):
            if c_init in ['z', 'c', 's']:
                c_final = c_final.replace("i", "ii")
            elif c_init in ['zh', 'ch', 'sh', 'r']:
                c_final = c_final.replace("i", "iii")

        if re.match(r"(u|un|uan)\d", c_final):
            # NOTE: 'y' can no longer match here because a 'y' initial was
            # cleared above; it is kept to leave the rule table unchanged.
            if c_init in ['j', 'q', 'x', 'y']:
                c_final = c_final.replace("u", "v")
            elif re.match(r"un\d", c_final):
                c_final = c_final.replace("un", "uen")

        if c_init:
            clean_content.append(c_init)
        clean_content.append(c_final)
    return ' '.join(clean_content)
def get_rhymes_table(input_filepath, output_filepath):
    """Group the words of a tab-separated lexicon by rhyme and save as CSV.

    ``input_filepath`` is read as a header-less TSV with the columns
    ``words``, ``pronunciation`` and ``frequency``.  The pronunciation is
    split on ``'``; for each piece the non-strict final is taken and its
    last character is dropped.  The pieces joined by ``'`` form the rhyme
    key.  Every row is stored under its key as ``[words, frequency]`` and
    the result is written to ``output_filepath`` with one column per key.
    """
    dataset = pd.read_csv(
        input_filepath,
        header=None,
        sep='\t',
        encoding='utf-8',
        names=['words', 'pronunciation', 'frequency'],
    )

    rhymes = {}
    for _, row in dataset.iterrows():
        # [0:-1] drops the last character of each final.
        toneless_finals = [
            get_finals(syllable, strict=False)[0:-1]
            for syllable in row['pronunciation'].split("'")
        ]
        rhyme_key = "'".join(toneless_finals)
        rhymes.setdefault(rhyme_key, []).append(
            [row['words'], row['frequency']])

    # Columns may differ in length; wrapping each in a Series lets pandas
    # pad the shorter ones.
    table = pd.DataFrame(
        {key: pd.Series(entries) for key, entries in rhymes.items()})
    table.to_csv(output_filepath, sep=",", header=True)
def translate_pinyin(sentence):
    """Convert a Chinese sentence into the pinyin format the model expects.

    Only characters in ``\\u2E80-\\u9FFF`` and the full-width punctuation
    marks ``，。？！、`` are kept; everything else is discarded.  Punctuation
    items are emitted unchanged.  Every other item is converted to TONE3
    pinyin, given tone ``5`` when it carries no tone 1-4, remapped through
    ``TRANSLATE_DICT`` when its toneless form is a key, and emitted as
    ``{initial final}``.

    :param sentence: the input sentence (Chinese characters and Chinese
        punctuation only), either text or UTF-8 encoded bytes
    :return: the converted items joined by single spaces; an empty string
        when nothing in ``sentence`` is kept
    """
    # Decode only real byte strings; text input is used as is.
    if isinstance(sentence, bytes):
        sentence = sentence.decode('utf-8')

    # Keep Chinese characters and exactly the punctuation marks that the
    # loop below recognises.
    kept = re.findall(
        u"[\u2E80-\u9FFF\uff0c\uff01\u3002\uff1f\u3001]+", sentence)
    text = u''.join(kept)
    if not text:
        return ''

    punctuation = (u'\uff0c', u'\u3002', u'\uff1f', u'\uff01', u'\u3001')
    result = []
    for syllable in lazy_pinyin(text, style=Style.TONE3):
        # Punctuation is passed through untouched.
        if syllable in punctuation:
            result.append(syllable)
            continue

        # Normalise the pinyin: neutral-tone syllables carry no digit, so
        # mark them as tone 5.
        if syllable[-1] not in ('1', '2', '3', '4'):
            syllable = syllable + '5'

        # Swap the spelling of selected syllables, keeping the tone digit.
        if syllable[:-1] in TRANSLATE_DICT:
            syllable = TRANSLATE_DICT[syllable[:-1]] + syllable[-1]

        shengmu = get_initials(syllable, strict=False)  # initial
        yunmu = get_finals(syllable, strict=False)  # final
        result.append('{' + shengmu + ' ' + yunmu + '}')

    return ' '.join(result)
def to_finals_tone(self, pinyin, **kwargs):
    """Return the finals of ``pinyin``, leaving its tone marks in place.

    If ``has_finals`` reports no finals, ``pinyin`` is returned unchanged.
    The ``strict`` entry of ``kwargs`` is forwarded to ``get_finals``.
    """
    if has_finals(pinyin):
        # Extract the finals part.
        return get_finals(pinyin, strict=kwargs.get('strict'))
    return pinyin
# clean every line in transcription file first transcription_dict = {} with codecs.open(args.transcription_path, "r", "utf-8") as fid: for line in fid.readlines(): segments = line.split(" ") lang_char = args.transcription_path.split("/")[-1][0] id = args.spk + "_" + lang_char + segments[0] # ex. TMF1_M10001 content = segments[1].replace("\n", "") # Some special rules to match CSMSC pinyin text = pinyin(content, style=Style.TONE3) text = [c[0] for c in text] clean_content = [] for c in text: c_init = get_initials(c, strict=True) c_final = get_finals(c, strict=True) for c in [c_init, c_final]: if len(c) == 0: continue c = c.replace("ü", "v") c = c.replace("ui", "uei") c = c.replace("un", "uen") c = c.replace("iu", "iou") # Special rule: "e5n" -> "en5" if "5" in c: c = c.replace("5", "") + "5" clean_content.append(c) transcription_dict[id] = " ".join(["<" + args.lang_tag + ">"] + clean_content)