def __init__(self):
    # Restrict TensorFlow to at most the first two physical GPUs, and turn on
    # on-demand memory growth so the process does not grab all GPU memory.
    physical_gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
    tf.config.experimental.set_visible_devices(devices=physical_gpus[0:2], device_type='GPU')
    for device in physical_gpus:
        tf.config.experimental.set_memory_growth(device, True)
    # Build/convert the underlying model, then the text front-end helpers.
    self._converter_model()
    self.tts_pause = TTSSegPause()
    self.tts_py = TTSPinYin()
class TTSModel(): def __init__(self): gpus = tf.config.experimental.list_physical_devices(device_type='GPU') tf.config.experimental.set_visible_devices(devices=gpus[0:2], device_type='GPU') for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) self.__init_model() self.tts_pause = TTSSegPause() self.tts_py = TTSPinYin() def __init_model(self): tacotron2_config = AutoConfig.from_pretrained(config.tacotron2_baker) self.tacotron2 = TFAutoModel.from_pretrained( config=tacotron2_config, pretrained_path=config.tacotron2_pretrained_path, training=False, name="tacotron2") self.tacotron2.setup_window(win_front=5, win_back=5) mb_melgan_config = AutoConfig.from_pretrained( config.multiband_melgan_baker) self.mb_melgan = TFAutoModel.from_pretrained( config=mb_melgan_config, pretrained_path=config.multiband_melgan_pretrained_path, name="mb_melgan") self.processor = AutoProcessor.from_pretrained( pretrained_path=config.baker_mapper_pretrained_path) def text_to_pinyin_sequence(self, text): # pinyin = self.processor.pinyin_parser(text, style=Style.TONE3, errors="ignore") pinyin, text = self.tts_py.get_pyin(text) new_pinyin = [] for x in str(pinyin).split(" "): if "#" not in x: new_pinyin.append(x) phonemes = self.processor.get_phoneme_from_char_and_pinyin( text, new_pinyin) text = " ".join(phonemes) print("phoneme seq: {}".format(text)) logging.info( "[TTSModel] [text_to_pinyin_sequence] phoneme seq:{}".format(text)) input_ids = self.processor.text_to_sequence(text, inference=False) return input_ids def do_synthesis(self, input_text): input_text = self.tts_pause.add_pause(input_text) print("input_text>>>>", input_text) logging.info( "[TTSModel] [do_synthesis] input_text:{}".format(input_text)) input_ids = self.processor.text_to_sequence(input_text, inference=True) _, mel_outputs, stop_token_prediction, alignment_history = self.tacotron2.inference( tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0), tf.convert_to_tensor([len(input_ids)], tf.int32), 
tf.convert_to_tensor([0], dtype=tf.int32)) remove_end = 1024 audio = self.mb_melgan.inference(mel_outputs)[0, :-remove_end, 0] return mel_outputs.numpy(), alignment_history.numpy(), audio.numpy() '''
# -*- coding:utf-8 -*-
import re
import yaml
import numpy as np
import pandas as pd
from pypinyin import Style
from conf.config import config
from core.parse_text_add_pause import TTSSegPause
from core.tftts_pinyin import TTSModel

if __name__ == "__main__":
    tts_model = TTSModel()
    tts_seg_pause = TTSSegPause()
    # Mixed-voice text metadata (index column + text column).
    data_pd = pd.read_csv(config.MIX_VOICE_TEXT_DATA_PATH,
                          sep=',', encoding='utf-8')
    mix_voice_text_index_list = list(
        data_pd[config.MIX_VOICE_TEXT_INDEX].values)
    mix_voice_text_list = list(data_pd[config.MIX_VOICE_TEXT].values)
    f2 = "./data/010001-020000.txt"
    # BUG FIX: the input file was opened with a bare open() and never closed;
    # use a context manager so the handle is always released.
    with open("./data/000001-010000.txt") as f1:
        lines = f1.readlines()
    with open(f2, "w") as file:
        # The label file interleaves two lines per utterance:
        # even line = "<utt_id> <characters>", odd line = pinyin tokens.
        for idx in range(0, len(lines), 2):
            utt_id, chn_char = lines[idx].strip().split()
            per_text_pinyin = lines[idx + 1].strip().split()
            # Skip entries containing the "IY1" phone or a "B" in the text
            # (NOTE(review): filter criteria kept as-is — confirm intent;
            # the visible chunk never writes to `file`, which looks truncated).
            if "IY1" in per_text_pinyin or "B" in chn_char:
                print(f"Skip this: {utt_id} {chn_char} {per_text_pinyin}")
class TTSModel(): def __init__(self): gpus = tf.config.experimental.list_physical_devices(device_type='GPU') tf.config.experimental.set_visible_devices(devices=gpus[0:2], device_type='GPU') for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) self._converter_model() self.tts_pause = TTSSegPause() self.tts_py = TTSPinYin() def _converter_model(self): with open(config.tacotron2_baker) as f: conf = yaml.load(f, Loader=yaml.Loader) conf = Tacotron2Config(**conf["tacotron2_params"]) self.tacotron2 = TFTacotron2(config=conf, training=False, name="tacotron2", enable_tflite_convertible=True) self.tacotron2.setup_window(win_front=5, win_back=5) self.tacotron2.setup_maximum_iterations(1000) # be careful self.tacotron2._build() self.tacotron2.load_weights(config.tacotron2_pretrained_path) tacotron2_concrete_function = self.tacotron2.inference_tflite.get_concrete_function( ) converter = tf.lite.TFLiteConverter.from_concrete_functions( [tacotron2_concrete_function]) converter.optimizations = [tf.lite.Optimize.DEFAULT] converter.target_spec.supported_ops = [ tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS ] tflite_model = converter.convert() with open('tacotron2.tflite', 'wb') as f: f.write(tflite_model) print('Model size is %f MBs.' 
% (len(tflite_model) / 1024 / 1024.0)) #tacotron2_config = AutoConfig.from_pretrained( config.tacotron2_baker ) #self.tacotron2 = TFAutoModel.from_pretrained( config=tacotron2_config, pretrained_path='tacotron2.tflite', training=False, name="tacotron2" ) #self.tacotron2.setup_window(win_front=5, win_back=5) self.interpreter = tf.lite.Interpreter(model_path='tacotron2.tflite') self.interpreter.allocate_tensors() self.input_details = self.interpreter.get_input_details() self.output_details = self.interpreter.get_output_details() mb_melgan_config = AutoConfig.from_pretrained( config.multiband_melgan_baker) self.mb_melgan = TFAutoModel.from_pretrained( config=mb_melgan_config, pretrained_path=config.multiband_melgan_pretrained_path, name="mb_melgan") self.processor = AutoProcessor.from_pretrained( pretrained_path=config.baker_mapper_pretrained_path) def prepare_input(self, input_ids): return (tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0), tf.convert_to_tensor([len(input_ids)], tf.int32), tf.convert_to_tensor([0], dtype=tf.int32)) def do_synthesis(self, input_text): input_text = self.tts_pause.add_pause(input_text) print("input_text>>>>", input_text) logging.info( "[TTSModel] [do_synthesis] input_text:{}".format(input_text)) input_ids = self.processor.text_to_sequence(input_text, inference=True) # nput_ids = np.concatenate([input_ids, [219 - 1]], -1) self.interpreter.resize_tensor_input(self.input_details[0]['index'], [1, len(input_ids)]) self.interpreter.allocate_tensors() input_data = self.prepare_input(input_ids) for i, detail in enumerate(self.input_details): input_shape = detail['shape'] self.interpreter.set_tensor(detail['index'], input_data[i]) # self.interpreter.invoke() decoder_output_tflite, mel_outputs = self.interpreter.get_tensor( self.output_details[0]['index']), interpreter.get_tensor( self.output_details[1]['index']) remove_end = 1024 audio = self.mb_melgan.inference(mel_outputs)[0, :-remove_end, 0] return mel_outputs.numpy(), 
decoder_output_tflite.numpy(), audio.numpy( ) '''
# print("interpreter_mb_melgan:",interpreter_mb_melgan) interpreter_mb_melgan.allocate_tensors() # Get input and output tensors. input_details_mb_melgan = interpreter_mb_melgan.get_input_details() # print("input_details_mb_melgan:",input_details_mb_melgan) output_details_mb_melgan = interpreter_mb_melgan.get_output_details() # print("output_details_mb_melgan:",output_details_mb_melgan) # Prepare input data. def prepare_input(input_ids): return (tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0), tf.convert_to_tensor([len(input_ids)], tf.int32), tf.convert_to_tensor([0], dtype=tf.int32)) tts_pause = TTSSegPause() # Test the model on random input data. def infer(input_text): processor = AutoProcessor.from_pretrained(pretrained_path=config_lp.baker_mapper_pretrained_path) input_text = tts_pause.add_pause(input_text) # logging.info( "[TTSModel] [do_synthesis] input_text:{}".format( input_text ) ) input_ids = processor.text_to_sequence(input_text, inference=True) # input_ids = np.concatenate([input_ids, [len(symbols) - 1]], -1) # eos. # interpreter_tacotron.resize_tensor_input(input_details_tacotron[0]['index'], [1, len(input_ids)]) interpreter_tacotron.allocate_tensors() input_data = prepare_input(input_ids) for i, detail in enumerate(input_details_tacotron): print(detail) input_shape = detail['shape']