def get_forward_model(model_path):
    """Build a ForwardTacotron from the active hyperparameters and load weights.

    The network is configured from the ``hp.forward_*`` hyperparameters and the
    size of ``symbols``, moved to the selected device, and then populated via
    ``model.load(model_path)``.

    The model is placed on CUDA when it is available; otherwise it falls back
    to the CPU instead of raising on hosts without a GPU.

    Args:
        model_path: Checkpoint location handed to ``model.load``.

    Returns:
        The ``ForwardTacotron`` instance with weights loaded.
    """
    # Prefer the GPU, but do not crash on CPU-only machines.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = ForwardTacotron(
        embed_dims=hp.forward_embed_dims,
        num_chars=len(symbols),
        durpred_rnn_dims=hp.forward_durpred_rnn_dims,
        durpred_conv_dims=hp.forward_durpred_conv_dims,
        rnn_dim=hp.forward_rnn_dims,
        postnet_k=hp.forward_postnet_K,
        postnet_dims=hp.forward_postnet_dims,
        prenet_k=hp.forward_prenet_K,
        prenet_dims=hp.forward_prenet_dims,
        highways=hp.forward_num_highways,
        dropout=hp.forward_dropout,
        n_mels=hp.num_mels,
    ).to(device)
    model.load(model_path)
    return model
pitch_rnn_dims=hp.forward_pitch_rnn_dims, pitch_conv_dims=hp.forward_pitch_conv_dims, pitch_dropout=hp.forward_pitch_dropout, pitch_emb_dims=hp.forward_pitch_emb_dims, pitch_proj_dropout=hp.forward_pitch_proj_dropout, rnn_dim=hp.forward_rnn_dims, postnet_k=hp.forward_postnet_K, postnet_dims=hp.forward_postnet_dims, prenet_k=hp.forward_prenet_K, prenet_dims=hp.forward_prenet_dims, highways=hp.forward_num_highways, dropout=hp.forward_dropout, n_mels=hp.num_mels).to(device) tts_load_path = tts_weights if tts_weights else paths.forward_latest_weights tts_model.load(tts_load_path) if input_text: text = clean_text(input_text.strip()) inputs = [text_to_sequence(text)] else: with open('sentences.txt') as f: inputs = [clean_text(l.strip()) for l in f] inputs = [text_to_sequence(t) for t in inputs] tts_k = tts_model.get_step() // 1000 if args.vocoder == 'wavernn': voc_k = voc_model.get_step() // 1000 simple_table([('Forward Tacotron', str(tts_k) + 'k'), ('Vocoder Type', 'WaveRNN'),
def main():
    """Export the Forward Tacotron model as two ONNX graphs under ``./onnx``.

    Steps performed:
      1. Parse command-line arguments and configure hyperparameters.
      2. Build the ``ForwardTacotron`` model on the CPU and load its weights
         (``--tts_weights`` if given, else ``paths.forward_latest_weights``).
      3. Wrap the model in ``DurationPredictor`` and ``Tacotron`` and export
         each wrapper with ``torch.onnx.export`` using a fixed sample sentence
         as the tracing input.

    Output files:
      ./onnx/forward_tacotron_duration_prediction.onnx
      ./onnx/forward_tacotron_regression.onnx
    """
    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS Generator')
    parser.add_argument(
        '--tts_weights',
        type=str,
        help='[string/path] Load in different FastSpeech weights',
    )
    parser.add_argument(
        '--hp_file',
        metavar='FILE',
        default='hparams.py',
        help='The file to use for the hyperparameters',
    )
    # NOTE(review): --alpha is accepted but never read below; kept so existing
    # command lines keep working.
    parser.add_argument(
        '--alpha',
        type=float,
        default=1.,
        help='Parameter for controlling length regulator for speedup '
             'or slow-down of generated speech, e.g. alpha=2.0 is double-time',
    )
    args = parser.parse_args()

    # Create the output directory only after argument parsing succeeded, so
    # `--help` or a bad flag leaves nothing behind. exist_ok avoids the
    # check-then-create race.
    os.makedirs('onnx', exist_ok=True)

    hp.configure(args.hp_file)

    # Fixed sentence used as the example input for the ONNX exports.
    input_text = "the forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."
    tts_weights = args.tts_weights

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Forward TTS Model...\n')
    tts_model = ForwardTacotron(
        embed_dims=hp.forward_embed_dims,
        num_chars=len(symbols),
        durpred_rnn_dims=hp.forward_durpred_rnn_dims,
        durpred_conv_dims=hp.forward_durpred_conv_dims,
        rnn_dim=hp.forward_rnn_dims,
        postnet_k=hp.forward_postnet_K,
        postnet_dims=hp.forward_postnet_dims,
        prenet_k=hp.forward_prenet_K,
        prenet_dims=hp.forward_prenet_dims,
        highways=hp.forward_num_highways,
        dropout=hp.forward_dropout,
        n_mels=hp.num_mels,
    ).to(device)

    # Fall back to the latest saved weights when no explicit path is given.
    tts_load_path = tts_weights or paths.forward_latest_weights
    tts_model.load(tts_load_path)

    # Two wrappers around the same model, exported as separate graphs.
    encoder = DurationPredictor(tts_model)
    decoder = Tacotron(tts_model)

    tts_model.eval()
    encoder.eval()
    decoder.eval()

    opset_version = 10

    with torch.no_grad():
        input_seq = text_to_sequence(input_text.strip(), hp.tts_cleaner_names)
        # unsqueeze(0) adds a leading dimension to the 1-D id sequence.
        input_seq = torch.as_tensor(
            input_seq, dtype=torch.long, device=device
        ).unsqueeze(0)

        # FIRST STEP: predict symbols duration
        torch.onnx.export(
            encoder,
            input_seq,
            "./onnx/forward_tacotron_duration_prediction.onnx",
            opset_version=opset_version,
            do_constant_folding=True,
            input_names=["input_seq"],
            output_names=["embeddings", "duration"],
        )

        x, durations = encoder(input_seq)

        # SECOND STEP: expand symbols by durations
        x = encoder.lr(x, durations)

        # THIRD STEP: generate mel
        torch.onnx.export(
            decoder,
            x,
            "./onnx/forward_tacotron_regression.onnx",
            opset_version=opset_version,
            do_constant_folding=True,
            input_names=["data"],
            output_names=["mel"],
        )

    print('Done!')