def evaluate(model: tf.keras.Model, data_path: str, max_len: int, vocab_size: int,
             max_train_data_size: int, batch_size: int, buffer_size: int,
             tokenized_type: str = "phoneme"):
    """Evaluation module: computes the Euclidean (spectral) distance between the
    model's predicted mel spectrograms and the ground-truth mel spectrograms,
    printing the per-sample distance and the overall average.

    :param model: model to evaluate
    :param data_path: path to the text data
    :param max_len: maximum text sequence length
    :param vocab_size: vocabulary size
    :param tokenized_type: tokenization type, defaults to phoneme-based;
        modes: phoneme / word / char
    :param buffer_size: Dataset shuffle buffer size
    :param batch_size: Dataset batch size
    :param max_train_data_size: maximum amount of training data to load
    :return: no return value
    """
    dataset, _, steps_per_epoch, _ = \
        _dataset.load_data(train_data_path=data_path, max_len=max_len,
                           vocab_size=vocab_size, batch_size=batch_size,
                           buffer_size=buffer_size, tokenized_type=tokenized_type,
                           max_train_data_size=max_train_data_size)

    sample_count = 0
    score_sum = 0
    for (mel, stop_token, sentence) in dataset.take(steps_per_epoch):
        for i in range(sentence.shape[0]):
            new_input_ids = tf.expand_dims(sentence[i], axis=0)
            mel_outputs, mel_outputs_postnet, gate_outputs, alignments = model.inference(
                new_input_ids)

            # Ground truth mel: add batch dim and transpose to (batch, n_mels, frames)
            # to match the postnet output layout before computing the distance.
            mel2 = tf.expand_dims(mel[i], axis=0)
            mel2 = tf.transpose(mel2, [0, 2, 1])

            score = spec_distance(mel_outputs_postnet, mel2)
            score_sum += score
            sample_count += 1
            print('第{}个样本的欧式距离为:{}'.format(sample_count, score))

    # Guard against an empty dataset to avoid ZeroDivisionError.
    if sample_count == 0:
        print("样本平均欧式距离为:", 0)
    else:
        print("样本平均欧式距离为:", score_sum / sample_count)
def generate(model: tf.keras.Model, max_db: int, ref_db: int, sr: int, max_len: int,
             wave_save_dir: str, n_fft: int, n_mels: int, pre_emphasis: float,
             n_iter: int, hop_length: int, cmu_dict_path: str, win_length: int,
             dict_path: str = "", tokenized_type: str = "phoneme"):
    """Speech generation method: interactively reads text from stdin, runs the
    model's inference to produce a mel spectrogram, converts it to a waveform
    via Griffin-Lim, saves the wav file and plays it back.

    :param model: model used for inference
    :param max_len: maximum sentence sequence length
    :param wave_save_dir: directory where synthesized audio is saved
    :param n_fft: FFT window size
    :param n_mels: number of mel bands to generate
    :param hop_length: hop length between frames
    :param n_iter: number of Griffin-Lim iterations
    :param win_length: each audio frame is windowed with window() of length
        win_length, then zero-padded to match n_fft
    :param max_db: peak decibel value
    :param ref_db: reference decibel value
    :param sr: sampling rate
    :param pre_emphasis: pre-emphasis coefficient
    :param dict_path: path to the word/char dictionary (tokenizer JSON)
    :param cmu_dict_path: path to the phoneme dictionary
    :param tokenized_type: tokenization type
    :return: no return value
    """
    if not os.path.exists(wave_save_dir):
        os.makedirs(wave_save_dir)

    i = 0
    # Read text interactively until the user types ESC.
    while True:
        i = i + 1
        b = str(i)
        print()
        seq = input("请输入您要合成的话,输入ESC结束:")
        if seq == 'ESC':
            break

        sequences_list = [text_to_phonemes(text=seq, cmu_dict_path=cmu_dict_path)]
        if tokenized_type == "phoneme":
            input_ids = text_to_sequence_phoneme(texts=sequences_list, max_len=max_len)
        else:
            with open(dict_path, 'r', encoding="utf-8") as dict_file:
                json_string = dict_file.read().strip().strip("\n")
            tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(json_string)
            input_ids = tokenizer.texts_to_sequences(sequences_list)
            # BUG FIX: pad_sequences takes `maxlen`, not `max_len` — the
            # original keyword raised TypeError in this branch.
            input_ids = tf.keras.preprocessing.sequence.pad_sequences(
                input_ids, maxlen=max_len, padding="post")
        input_ids = tf.convert_to_tensor(input_ids)

        # Inference
        mel_outputs, mel_outputs_postnet, gate_outputs, alignments = model.inference(
            input_ids)

        # Synthesize the predicted waveform via Griffin-Lim reconstruction.
        wav = melspectrogram2wav(mel_outputs_postnet[0].numpy(), max_db, ref_db, sr,
                                 n_fft, n_mels, pre_emphasis, n_iter, hop_length,
                                 win_length)

        # Portable path construction instead of hard-coded '\\' separator.
        name = os.path.join(wave_save_dir, 'generated' + b + '.wav')
        wave.write(name, rate=sr, data=wav)
        playsound(name)
        print("已合成,路径:{}".format(name))
    print("合成结束")