def main():
    from utils import hparams as hp
    parser = argparse.ArgumentParser()
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py')
    parser.add_argument('--train_script', default=None)
    args = parser.parse_args()

    hp.configure(args.hp_file)
    if args.train_script is not None:
        hp.train_script = args.train_script
    print(f'train script = {hp.train_script}')

    datasets = TrainDatasets(hp.train_script, hp)
    sampler = LengthsBatchSampler(datasets, hp.max_seqlen, hp.lengths_file,
                                  shuffle=True, shuffle_one_time=False,
                                  shuffle_all=False)
    dataloader = DataLoader(datasets, batch_sampler=sampler, num_workers=4,
                            collate_fn=collate_fn)
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    from tqdm import tqdm
    pbar = tqdm(dataloader)
    for d in pbar:
        text, wav_input, pos_text, pos_wav2vec2, text_lengths, wav2vec2_lengths = d
        text = text.to(DEVICE, non_blocking=True)
        wav_input = wav_input.to(DEVICE, non_blocking=True)
        pos_text = pos_text.to(DEVICE, non_blocking=True)
        pos_wav2vec2 = pos_wav2vec2.to(DEVICE, non_blocking=True)
        text_lengths = text_lengths.to(DEVICE, non_blocking=True)
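# A minimal sketch (an assumption, not the repo's actual LengthsBatchSampler) of
# the idea behind length-based batching used above: sort utterances by length,
# then greedily pack consecutive items into a batch while the padded cost stays
# under max_seqlen.
import numpy as np

def make_length_buckets(lengths, max_seqlen):
    """Group example indices so that len(batch) * max(batch lengths) <= max_seqlen."""
    order = np.argsort(lengths)
    batches, batch = [], []
    for idx in order:
        candidate = batch + [idx]
        # padded cost of the batch is its size times its longest member
        # (the last index, since we iterate in ascending length order)
        if len(candidate) * lengths[candidate[-1]] > max_seqlen and batch:
            batches.append(batch)
            batch = [idx]
        else:
            batch = candidate
    if batch:
        batches.append(batch)
    return batches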
def synthesize(text):
    model_input = text + "|00-" + lang + "|" + lang  # `lang` is expected as a module-level name

    # Change to Multi_TTS path
    sys.path.append(
        os.path.join(os.path.dirname(__file__),
                     "dependencies/Multilingual_Text_to_Speech"))
    if "utils" in sys.modules:
        del sys.modules["utils"]
    from synthesize import synthesize as synthesize_spectrogram
    from utils import build_model

    # Load Multilingual pretrained model
    model = build_model(
        os.path.abspath("./dependencies/checkpoints/generated_switching.pyt"))
    model.eval()

    # Generate spectrogram
    spectrogram = synthesize_spectrogram(model, "|" + model_input)

    # Change to WaveRNN path
    sys.path.append(
        os.path.join(os.path.dirname(__file__), "dependencies/WaveRNN"))
    if "utils" in sys.modules:
        del sys.modules["utils"]
    from models.fatchord_version import WaveRNN
    from utils import hparams as hp
    from gen_wavernn import generate
    import torch

    # Load WaveRNN pretrained model
    hp.configure("hparams.py")
    model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                    fc_dims=hp.voc_fc_dims,
                    bits=hp.bits,
                    pad=hp.voc_pad,
                    upsample_factors=hp.voc_upsample_factors,
                    feat_dims=hp.num_mels,
                    compute_dims=hp.voc_compute_dims,
                    res_out_dims=hp.voc_res_out_dims,
                    res_blocks=hp.voc_res_blocks,
                    hop_length=hp.hop_length,
                    sample_rate=hp.sample_rate,
                    mode=hp.voc_mode).to(
                        torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
    model.load(
        os.path.join(os.path.dirname(__file__),
                     "dependencies/checkpoints/wavernn_weight.pyt"))

    waveform = generate(model, spectrogram, hp.voc_gen_batched,
                        hp.voc_target, hp.voc_overlap)
    # The original wrote the raw sample array through a text-mode file handle,
    # which does not produce a valid WAV; write a proper WAV container instead
    # (assuming `generate` returns a sample array).
    from scipy.io.wavfile import write
    write("./temp/result.wav", hp.sample_rate, waveform)
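# Hypothetical usage of synthesize() above: `lang` is read as a module-level
# name inside the function, and ./temp must exist before the WAV is written.
lang = "en"
os.makedirs("./temp", exist_ok=True)
synthesize("Printing, in the only sense with which we are at present concerned.")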
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--hp_file', type=str, default='hparams.py')
    args = parser.parse_args()

    hp.configure(args.hp_file)
    fill_variables(hp)
    log_config(hp)
    os.makedirs(hp.save_dir, exist_ok=True)

    n_gpus = torch.cuda.device_count()
    args.n_gpus = n_gpus
    if n_gpus > 1:
        run_distributed(run_training, args, hp)
    else:
        run_training(0, args, hp, None)
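# A plausible shape for run_distributed (an assumption; the real helper lives
# elsewhere in the repo): spawn one training process per visible GPU with
# torch.multiprocessing and hand each worker its rank, matching the
# run_training(rank, args, hp, ...) call signature used above.
import torch.multiprocessing as mp

def run_distributed_sketch(fn, args, hp):
    # mp.spawn calls fn(rank, *spawn_args) once per process
    mp.spawn(fn, args=(args, hp, None), nprocs=args.n_gpus, join=True)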
def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train Tacotron TTS')
    parser.add_argument('--force_train', '-f', action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--force_gta', '-g', action='store_true',
                        help='Force the model to create GTA features')
    parser.add_argument('--force_cpu', '-c', action='store_true',
                        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_train = args.force_train
    force_gta = args.force_gta

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        for session in hp.tts_schedule:
            _, _, _, batch_size = session
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError('`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Tacotron Model
    print('\nInitialising Tacotron Model...\n')
    model = Tacotron(embed_dims=hp.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hp.tts_encoder_dims,
                     decoder_dims=hp.tts_decoder_dims,
                     n_mels=hp.num_mels,
                     fft_bins=hp.num_mels,
                     postnet_dims=hp.tts_postnet_dims,
                     encoder_K=hp.tts_encoder_K,
                     lstm_dims=hp.tts_lstm_dims,
                     postnet_K=hp.tts_postnet_K,
                     num_highways=hp.tts_num_highways,
                     dropout=hp.tts_dropout,
                     stop_threshold=hp.tts_stop_threshold).to(device)

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('tts', paths, model, optimizer, create_if_missing=True)

    if not force_gta:
        for i, session in enumerate(hp.tts_schedule):
            current_step = model.get_step()
            r, lr, max_step, batch_size = session
            training_steps = max_step - current_step

            # Do we need to change to the next session?
            if current_step >= max_step:
                # Are there no further sessions than the current one?
                if i == len(hp.tts_schedule) - 1:
                    # There are no more sessions. Check if we force training.
                    if force_train:
                        # Don't finish the loop - train forever
                        training_steps = 999_999_999
                    else:
                        # We have completed training - stop
                        break
                else:
                    # There is a following session, go to it
                    continue

            model.r = r
            simple_table([(f'Steps with r={r}', str(training_steps // 1000) + 'k Steps'),
                          ('Batch Size', batch_size),
                          ('Learning Rate', lr),
                          ('Outputs/Step (r)', model.r)])

            train_set, attn_example = get_tts_datasets(paths.data, batch_size, r)
            tts_train_loop(paths, model, optimizer, train_set, lr,
                           training_steps, attn_example)

        print('Training Complete.')
        print('To continue training increase tts_total_steps in hparams.py or use --force_train\n')

    print('Creating Ground Truth Aligned Dataset...\n')
    train_set, attn_example = get_tts_datasets(paths.data, 8, model.r)
    create_gta_features(model, train_set, paths.gta)
    print('\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n')
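# For reference, hp.tts_schedule as consumed above is a list of sessions of the
# form (reduction factor r, learning rate, max step, batch size). The values
# below are illustrative, not the repo's defaults:
tts_schedule_example = [(7, 1e-3, 10_000, 32),
                        (5, 3e-4, 100_000, 32),
                        (2, 1e-4, 180_000, 16)]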
def __iter__(self):
    if self.shuffle:
        np.random.shuffle(self.all_indices)
    for indices in self.all_indices:
        yield indices

def __len__(self):
    return len(self.all_indices)


def get_dataset(script_file='examples/LJSpeech/data/train/script_16000/train_id_sort_xlen.txt'):
    print(f'script_file = {script_file}')
    return TrainDatasets(script_file)


def get_test_dataset(script_file='examples/LJSpeech/data/dev/script_16000/dev_id.txt'):
    print(f'script_file = {script_file}')
    return TestDatasets(script_file)


if __name__ == '__main__':
    hp.configure('configs/hparams_LJSpeech.py')
    datasets = get_test_dataset('examples/LJSpeech/data/dev/script_16000/dev_id.txt')
    sampler = NumBatchSampler(datasets, 1, shuffle=False)
    dataloader = DataLoader(datasets, batch_sampler=sampler, num_workers=4,
                            collate_fn=collate_fn_test)

    from tqdm import tqdm
    pbar = tqdm(dataloader)
    for d in pbar:
        print(d[1])
    '-u', dest='batched', action='store_false',
    help='Slower Unbatched Generation (better quality)')
parser.add_argument('--force_cpu', '-c', action='store_true',
                    help='Forces CPU-only training, even when in CUDA capable environment')
parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                    help='The file to use for the hyperparameters')
# set_defaults must run before parse_args to have any effect
parser.set_defaults(batched=True)
parser.set_defaults(input_text=None)
args = parser.parse_args()

hp.configure(args.hp_file)  # Load hparams from file (the original hardcoded 'hparams.py', ignoring --hp_file)

batched = args.batched
input_text = args.input_text

if not args.force_cpu and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

print('\nInitialising WaveRNN Model...\n')
parser.add_argument('--beam_width', type=int, default=None)
parser.add_argument('--log_params', action='store_true')
parser.add_argument('--calc_wer', action='store_true')
parser.add_argument('--segment', type=int, default=10000)
parser.add_argument('--silence_file', type=str, default=None)
parser.add_argument('--lm_type', type=str, default='LSTM')
args = parser.parse_args()

hp_file = args.hp_file
model_name = args.load_name  # save dir name
model_path = os.path.dirname(model_name)
if hp_file is None:
    hp_file = os.path.join(model_path, 'hparams.py')

hp.configure(hp_file)
fill_variables(hp)
setattr(hp, 'silence_file', args.silence_file)

if args.beam_width is not None:
    print(f'beam width is set to {args.beam_width}')
    hp.beam_width = args.beam_width

script_file = hp.eval_file
if args.test_script is not None:
    script_file = args.test_script

if hp.lm_weight is not None:
    args.lm_weight = hp.lm_weight
print(f'lm weight = {args.lm_weight}')
    default='.wav',
    help='file extension to search for in dataset folder')
parser.add_argument('--num_workers', '-w', metavar='N', type=valid_n_workers,
                    default=cpu_count() - 1,
                    help='The number of worker threads to use for preprocessing')
parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                    help='The file to use for the hyperparameters')
args = parser.parse_args()

hp.configure(args.hp_file)  # Load hparams from file
if args.path is None:
    args.path = hp.wav_path

extension = args.extension
path = args.path


def convert_file(path: Path):
    y = load_wav(path)
    peak = np.abs(y).max()
    if hp.peak_norm or peak > 1.0:
        y /= peak
    mel = melspectrogram(y)
    if hp.voc_mode == 'RAW':
        # the excerpt is cut off mid-call; the mu argument below is an assumed completion
        quant = encode_mu_law(y, mu=2 ** hp.bits)
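# For reference, mu-law companding as typically used for RAW-mode WaveRNN
# targets (a standard formula, not copied from this repo's utils):
import numpy as np

def encode_mu_law_ref(x, mu):
    """Compand x in [-1, 1] and quantize to integer levels in [0, mu)."""
    mu = mu - 1
    fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
    return np.floor((fx + 1) / 2 * mu + 0.5).astype(np.int64)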
def thak():
    class Tshamsoo():
        # Note: os.getenv returns strings when the variable is set, so boolean
        # env values like BATCHED='False' need explicit parsing by the caller.
        force_cpu = os.getenv('FORCE_CPU', False)
        hp_file = 'hparams.py'
        vocoder = os.getenv('VOCODER', 'wavernn')
        batched = os.getenv('BATCHED', True)
        target = os.getenv('TARGET', None)
        overlap = os.getenv('OVERLAP', None)
        tts_weights = None
        save_attn = os.getenv('SAVE_ATTN', False)
        voc_weights = None
        iters = os.getenv('GL_ITERS', 32)

    args = Tshamsoo()
    if args.vocoder in ['griffinlim', 'gl']:
        args.vocoder = 'griffinlim'
    elif args.vocoder in ['wavernn', 'wr']:
        args.vocoder = 'wavernn'
    else:
        # argparse.ArgumentError requires an argument object; a plain ValueError is correct here
        raise ValueError('Must provide a valid vocoder type!')

    hp.configure(args.hp_file)  # Load hparams from file

    tts_weights = args.tts_weights
    save_attn = args.save_attn

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    if args.vocoder == 'wavernn':
        # set defaults for any arguments that depend on hparams
        if args.target is None:
            args.target = hp.voc_target
        if args.overlap is None:
            args.overlap = hp.voc_overlap
        if args.batched is None:
            args.batched = hp.voc_gen_batched

        batched = args.batched
        target = int(args.target)
        overlap = int(args.overlap)

        print('\nInitialising WaveRNN Model...\n')
        # Instantiate WaveRNN Model
        voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                            fc_dims=hp.voc_fc_dims,
                            bits=hp.bits,
                            pad=hp.voc_pad,
                            upsample_factors=hp.voc_upsample_factors,
                            feat_dims=hp.num_mels,
                            compute_dims=hp.voc_compute_dims,
                            res_out_dims=hp.voc_res_out_dims,
                            res_blocks=hp.voc_res_blocks,
                            hop_length=hp.hop_length,
                            sample_rate=hp.sample_rate,
                            mode=hp.voc_mode).to(device)

        voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights
        voc_model.load(voc_load_path)
    else:
        voc_model = None
        batched = None
        target = None
        overlap = None

    print('\nInitialising Tacotron Model...\n')
    # Instantiate Tacotron Model
    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout,
                         stop_threshold=hp.tts_stop_threshold).to(device)

    tts_load_path = tts_weights if tts_weights else paths.tts_latest_weights
    tts_model.load(tts_load_path)

    return args, voc_model, tts_model, batched, target, overlap, save_attn
def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS Generator')
    parser.add_argument('--tts_weights', type=str,
                        help='[string/path] Load in different FastSpeech weights')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                        help='The file to use for the hyperparameters')
    parser.add_argument('--alpha', type=float, default=1.,
                        help='Parameter for controlling length regulator for speedup '
                             'or slow-down of generated speech, e.g. alpha=2.0 is double-time')

    if not os.path.exists('onnx'):
        os.mkdir('onnx')

    args = parser.parse_args()
    hp.configure(args.hp_file)

    input_text = ("the forms of printed letters should be beautiful, and that their "
                  "arrangement on the page should be reasonable and a help to the "
                  "shapeliness of the letters themselves.")

    tts_weights = args.tts_weights

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Forward TTS Model...\n')
    tts_model = ForwardTacotron(embed_dims=hp.forward_embed_dims,
                                num_chars=len(symbols),
                                durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                                durpred_conv_dims=hp.forward_durpred_conv_dims,
                                rnn_dim=hp.forward_rnn_dims,
                                postnet_k=hp.forward_postnet_K,
                                postnet_dims=hp.forward_postnet_dims,
                                prenet_k=hp.forward_prenet_K,
                                prenet_dims=hp.forward_prenet_dims,
                                highways=hp.forward_num_highways,
                                dropout=hp.forward_dropout,
                                n_mels=hp.num_mels).to(device)

    tts_load_path = tts_weights or paths.forward_latest_weights
    tts_model.load(tts_load_path)

    encoder = DurationPredictor(tts_model)
    decoder = Tacotron(tts_model)

    tts_model.eval()
    encoder.eval()
    decoder.eval()

    opset_version = 10

    with torch.no_grad():
        input_seq = text_to_sequence(input_text.strip(), hp.tts_cleaner_names)
        input_seq = torch.as_tensor(input_seq, dtype=torch.long,
                                    device=device).unsqueeze(0)

        # FIRST STEP: predict symbols duration
        torch.onnx.export(encoder, input_seq,
                          "./onnx/forward_tacotron_duration_prediction.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["input_seq"],
                          output_names=["embeddings", "duration"])
        x, durations = encoder(input_seq)

        # SECOND STEP: expand symbols by durations
        x = encoder.lr(x, durations)

        # THIRD STEP: generate mel
        torch.onnx.export(decoder, x,
                          "./onnx/forward_tacotron_regression.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["data"],
                          output_names=["mel"])
    print('Done!')
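# A minimal sketch of driving the exported duration model with onnxruntime
# (using onnxruntime here is an assumption; any ONNX-compatible runtime works).
# Note the export above declared no dynamic axes, so the input must match the
# length of the example sequence used during export.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("./onnx/forward_tacotron_duration_prediction.onnx")
dummy_ids = np.zeros((1, 158), dtype=np.int64)  # 158 stands in for the export length (illustrative)
embeddings, duration = sess.run(None, {"input_seq": dummy_ids})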
def get_dataset(script_file='examples/LJSpeech/data/train/script_16000/train_id_sort_xlen.txt'):
    print(f'script_file = {script_file}')
    return TrainDatasets(script_file)


def get_test_dataset(script_file='examples/LJSpeech/data/dev/script_16000/dev_id.txt'):
    print(f'script_file = {script_file}')
    return TestDatasets(script_file)


if __name__ == '__main__':
    hp.configure('configs/fastSpeech2/hparams_LJ_melgan.py')
    datasets = get_dataset(hp.train_script)
    sampler = NumBatchSampler(datasets, 3, shuffle=False)
    dataloader = DataLoader(datasets, batch_sampler=sampler, num_workers=4,
                            collate_fn=collate_fn)

    from tqdm import tqdm
    pbar = tqdm(dataloader)
    for dic in pbar:
        import pdb
        pdb.set_trace()  # debugging breakpoint for inspecting each batch interactively
        print(dic[1])
def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train Tacotron TTS')
    parser.add_argument('--force_train', '-f', action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--force_gta', '-g', action='store_true',
                        help='Force the model to create GTA features')
    parser.add_argument('--force_cpu', '-c', action='store_true',
                        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_gta = args.force_gta

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        for session in hp.forward_schedule:
            _, _, batch_size = session
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError('`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Forward TTS Model
    print('\nInitialising Forward TTS Model...\n')
    model = ForwardTacotron(embed_dims=hp.forward_embed_dims,
                            num_chars=len(symbols),
                            durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                            durpred_conv_dims=hp.forward_durpred_conv_dims,
                            rnn_dim=hp.forward_rnn_dims,
                            postnet_k=hp.forward_postnet_K,
                            postnet_dims=hp.forward_postnet_dims,
                            prenet_k=hp.forward_prenet_K,
                            prenet_dims=hp.forward_prenet_dims,
                            highways=hp.forward_num_highways,
                            dropout=hp.forward_dropout,
                            n_mels=hp.num_mels).to(device)

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum(np.prod(p.size()) for p in model_parameters)
    print(f'num params {params}')

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('forward', paths, model, optimizer, create_if_missing=True)

    if not force_gta:
        for i, session in enumerate(hp.forward_schedule):
            current_step = model.get_step()
            lr, max_step, batch_size = session
            training_steps = max_step - current_step

            simple_table([('Steps', str(training_steps // 1000) + 'k Steps'),
                          ('Batch Size', batch_size),
                          ('Learning Rate', lr)])

            train_set, mel_example = get_tts_datasets(paths.data, batch_size, 1,
                                                      alignments=True)
            train_loop(paths, model, optimizer, train_set, lr,
                       training_steps, mel_example)

    train_set, mel_example = get_tts_datasets(paths.data, 8, 1, alignments=True)
    create_gta_features(model, train_set, paths.gta)
    print('Training Complete.')
def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS Generator')
    parser.add_argument('--mel', type=str,
                        help='[string/path] path to test mel file')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                        help='The file to use for the hyperparameters')
    parser.add_argument('--batched', '-b', dest='batched', action='store_true',
                        help='Fast Batched Generation')
    parser.add_argument('--voc_weights', type=str,
                        help='[string/path] Load in different FastSpeech weights',
                        default="pretrained/wave_800K.pyt")
    args = parser.parse_args()

    if not os.path.exists('onnx'):
        os.mkdir('onnx')

    hp.configure(args.hp_file)

    device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising WaveRNN Model...\n')
    # Instantiate WaveRNN Model
    voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                        fc_dims=hp.voc_fc_dims,
                        bits=hp.bits,
                        pad=hp.voc_pad,
                        upsample_factors=hp.voc_upsample_factors,
                        feat_dims=hp.num_mels,
                        compute_dims=hp.voc_compute_dims,
                        res_out_dims=hp.voc_res_out_dims,
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode=hp.voc_mode).to(device)

    voc_load_path = args.voc_weights
    voc_model.load(voc_load_path)

    voc_upsampler = WaveRNNUpsamplerONNX(voc_model, args.batched,
                                         hp.voc_target, hp.voc_overlap)
    voc_infer = WaveRNNONNX(voc_model)

    voc_model.eval()
    voc_upsampler.eval()
    voc_infer.eval()

    opset_version = 11

    with torch.no_grad():
        mels = np.load(args.mel)
        mels = torch.from_numpy(mels)
        mels = mels.unsqueeze(0)
        mels = voc_upsampler.pad_tensor(mels)

        mels_onnx = mels.clone()
        torch.onnx.export(voc_upsampler, mels_onnx,
                          "./onnx/wavernn_upsampler.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["mels"],
                          output_names=["upsample_mels", "aux"])

        mels, aux = voc_upsampler(mels)
        mels = mels[:, 550:-550, :]
        mels, aux = voc_upsampler.fold(mels, aux)

        h1, h2, x = voc_infer.get_initial_parameters(mels)
        aux_split = voc_infer.split_aux(aux)

        b_size, seq_len, _ = mels.size()
        if seq_len:
            m_t = mels[:, 0, :]
            a1_t, a2_t, a3_t, a4_t = (a[:, 0, :] for a in aux_split)

            rnn_input = (m_t, a1_t, a2_t, a3_t, a4_t, h1, h2, x)
            torch.onnx.export(voc_infer, rnn_input,
                              "./onnx/wavernn_rnn.onnx",
                              opset_version=opset_version,
                              do_constant_folding=True,
                              input_names=["m_t", "a1_t", "a2_t", "a3_t", "a4_t",
                                           "h1", "h2", "x"],
                              output_names=["logits", "h1", "h2"])
    print('Done!')
def init_hparams(hp_file):
    hp.configure(hp_file)
def __init__(self):
    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS')
    self.args = parser.parse_args()
    self.args.vocoder = 'wavernn'
    self.args.hp_file = 'hparams.py'
    self.args.voc_weights = False
    self.args.tts_weights = False
    self.args.save_attn = False
    self.args.batched = True
    self.args.target = None
    self.args.overlap = None
    self.args.force_cpu = False
    self.args.iters = 32  # assumed Griffin-Lim default; the original never set this attribute

    #================ vocoder ================#
    if self.args.vocoder in ['griffinlim', 'gl']:
        self.args.vocoder = 'griffinlim'
    elif self.args.vocoder in ['wavernn', 'wr']:
        self.args.vocoder = 'wavernn'
    else:
        # argparse.ArgumentError requires an argument object; ValueError is correct here
        raise ValueError('Must provide a valid vocoder type!')

    hp.configure(self.args.hp_file)  # Load hparams from file

    # set defaults for any arguments that depend on hparams
    if self.args.vocoder == 'wavernn':
        if self.args.target is None:
            self.args.target = hp.voc_target
        if self.args.overlap is None:
            self.args.overlap = hp.voc_overlap
        if self.args.batched is None:
            self.args.batched = hp.voc_gen_batched

    #================ others ================#
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
    print(paths.base)

    if not self.args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # === WaveRNN === #
    if self.args.vocoder == 'wavernn':
        print('\nInitialising WaveRNN Model...\n')
        self.voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                                 fc_dims=hp.voc_fc_dims,
                                 bits=hp.bits,
                                 pad=hp.voc_pad,
                                 upsample_factors=hp.voc_upsample_factors,
                                 feat_dims=hp.num_mels,
                                 compute_dims=hp.voc_compute_dims,
                                 res_out_dims=hp.voc_res_out_dims,
                                 res_blocks=hp.voc_res_blocks,
                                 hop_length=hp.hop_length,
                                 sample_rate=hp.sample_rate,
                                 mode=hp.voc_mode).to(device)

        voc_load_path = self.args.voc_weights if self.args.voc_weights else paths.voc_latest_weights
        self.voc_model.load(voc_load_path)

    # === Tacotron === #
    if hp.tts_model == 'tacotron':
        print('\nInitialising Tacotron Model...\n')
        self.tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                                  num_chars=len(symbols),
                                  encoder_dims=hp.tts_encoder_dims,
                                  decoder_dims=hp.tts_decoder_dims,
                                  n_mels=hp.num_mels,
                                  fft_bins=hp.num_mels,
                                  postnet_dims=hp.tts_postnet_dims,
                                  encoder_K=hp.tts_encoder_K,
                                  lstm_dims=hp.tts_lstm_dims,
                                  postnet_K=hp.tts_postnet_K,
                                  num_highways=hp.tts_num_highways,
                                  dropout=hp.tts_dropout,
                                  stop_threshold=hp.tts_stop_threshold).to(device)

        tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
        self.tts_model.load(tts_load_path)

    # === Tacotron2 === #
    elif hp.tts_model == 'tacotron2':
        print('\nInitializing Tacotron2 Model...\n')
        self.tts_model = Tacotron2().to(device)
        tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
        self.tts_model.load(tts_load_path)

    # === Information === #
    if hp.tts_model == 'tacotron':
        if self.args.vocoder == 'wavernn':
            voc_k = self.voc_model.get_step() // 1000
            tts_k = self.tts_model.get_step() // 1000
            simple_table([
                ('Tacotron', str(tts_k) + 'k'),
                ('r', self.tts_model.r),
                ('Vocoder Type', 'WaveRNN'),
                ('WaveRNN', str(voc_k) + 'k'),
                ('Generation Mode', 'Batched' if self.args.batched else 'Unbatched'),
                ('Target Samples', self.args.target if self.args.batched else 'N/A'),
                ('Overlap Samples', self.args.overlap if self.args.batched else 'N/A')
            ])
        elif self.args.vocoder == 'griffinlim':
            tts_k = self.tts_model.get_step() // 1000
            simple_table([('Tacotron', str(tts_k) + 'k'),
                          ('r', self.tts_model.r),
                          ('Vocoder Type', 'Griffin-Lim'),
                          ('GL Iters', self.args.iters)])
    elif hp.tts_model == 'tacotron2':
        if self.args.vocoder == 'wavernn':
            voc_k = self.voc_model.get_step() // 1000
            tts_k = self.tts_model.get_step() // 1000
            simple_table([
                ('Tacotron2', str(tts_k) + 'k'),
                ('Vocoder Type', 'WaveRNN'),
                ('WaveRNN', str(voc_k) + 'k'),
                ('Generation Mode', 'Batched' if self.args.batched else 'Unbatched'),
                ('Target Samples', self.args.target if self.args.batched else 'N/A'),
                ('Overlap Samples', self.args.overlap if self.args.batched else 'N/A')
            ])
        elif self.args.vocoder == 'griffinlim':
            tts_k = self.tts_model.get_step() // 1000
            simple_table([('Tacotron2', str(tts_k) + 'k'),
                          ('Vocoder Type', 'Griffin-Lim'),
                          ('GL Iters', self.args.iters)])
def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train WaveRNN Vocoder')
    parser.add_argument('--lr', '-l', type=float,
                        help='[float] override hparams.py learning rate')
    parser.add_argument('--batch_size', '-b', type=int,
                        help='[int] override hparams.py batch size')
    parser.add_argument('--force_train', '-f', action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--gta', '-g', action='store_true',
                        help='train wavernn on GTA features')
    parser.add_argument('--force_cpu', '-c', action='store_true',
                        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # load hparams from file
    if args.lr is None:
        args.lr = hp.voc_lr
    if args.batch_size is None:
        args.batch_size = hp.voc_batch_size

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    batch_size = args.batch_size
    force_train = args.force_train
    train_gta = args.gta
    lr = args.lr

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        if batch_size % torch.cuda.device_count() != 0:
            raise ValueError('`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Model...\n')
    # Instantiate WaveRNN Model
    voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                        fc_dims=hp.voc_fc_dims,
                        bits=hp.bits,
                        pad=hp.voc_pad,
                        upsample_factors=hp.voc_upsample_factors,
                        feat_dims=hp.num_mels,
                        compute_dims=hp.voc_compute_dims,
                        res_out_dims=hp.voc_res_out_dims,
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode=hp.voc_mode).to(device)

    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    optimizer = optim.Adam(voc_model.parameters())
    restore_checkpoint('voc', paths, voc_model, optimizer, create_if_missing=True)

    train_set, test_set = get_vocoder_datasets(paths.data, batch_size, train_gta)

    total_steps = 10_000_000 if force_train else hp.voc_total_steps

    simple_table([
        ('Remaining', str((total_steps - voc_model.get_step()) // 1000) + 'k Steps'),
        ('Batch Size', batch_size),
        ('LR', lr),
        ('Sequence Len', hp.voc_seq_len),
        ('GTA Train', train_gta)
    ])

    loss_func = F.cross_entropy if voc_model.mode == 'RAW' else discretized_mix_logistic_loss

    voc_train_loop(paths, voc_model, loss_func, optimizer, train_set, test_set,
                   lr, total_steps)

    print('Training Complete.')
    print('To continue training increase voc_total_steps in hparams.py or use --force_train')
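# Quick sanity check of the assertion above: the upsample factors must multiply
# out to the hop length. With the common defaults (5, 5, 11) and hop_length 275:
import numpy as np
assert np.cumprod((5, 5, 11))[-1] == 275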
        wav = wav / 32768.0
    elif wav.dtype == np.int32:
        wav = wav / 2147483648.0
    elif wav.dtype == np.uint8:
        wav = (wav - 128) / 128.0
    wav = wav.astype(np.float32)
    return sr, wav


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py')
    parser.add_argument('-d', '--data_path', type=str, required=True,
                        help="root directory of wav files")
    parser.add_argument('-o', '--out_path', type=str, default=None,
                        help="save directory of mel files")
    args = parser.parse_args()

    if args.out_path is None:
        args.out_path = args.data_path

    hp.configure(args.hp_file)
    main(hp, args)
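# Self-contained reference for the dtype normalisation above (the int16 branch
# that heads the elif chain is cut off in this excerpt):
import numpy as np

def normalize_wav(wav):
    """Map integer PCM to float32 in [-1, 1)."""
    if wav.dtype == np.int16:
        wav = wav / 32768.0
    elif wav.dtype == np.int32:
        wav = wav / 2147483648.0
    elif wav.dtype == np.uint8:
        wav = (wav - 128) / 128.0
    return wav.astype(np.float32)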
os.chdir(CHECKPOINTS_FOLDER)
os.system("curl -O -L 'https://github.com/Tomiinek/Multilingual_Text_to_Speech/releases/download/v1.0/"
          + wavernn_chpt + "'")
print("Cur Dir", os.getcwd())

if "utils" in sys.modules:
    del sys.modules["utils"]
sys.path.append(WAVERNN_FOLDER)
from gen_wavernn import generate
from utils import hparams as hp
from models.fatchord_version import WaveRNN

hp.configure(WAVERNN_FOLDER + '/hparams.py')
model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                fc_dims=hp.voc_fc_dims,
                bits=hp.bits,
                pad=hp.voc_pad,
                upsample_factors=hp.voc_upsample_factors,
                feat_dims=hp.num_mels,
                compute_dims=hp.voc_compute_dims,
                res_out_dims=hp.voc_res_out_dims,
                res_blocks=hp.voc_res_blocks,
                hop_length=hp.hop_length,
                sample_rate=hp.sample_rate,
                mode=hp.voc_mode).to('cpu')
model.load(CHECKPOINTS_FOLDER + "/" + wavernn_chpt)

# Collect every numbered spectrogram (1.npy, 2.npy, ...) in order
y = []
ix = 1
while os.path.exists(CHR_FOLDER + "/" + str(ix) + ".npy"):
    print("Found", CHR_FOLDER + "/" + str(ix) + ".npy")
    y.append(np.load(CHR_FOLDER + "/" + str(ix) + ".npy"))
    ix += 1

idx = 1
for s in y:
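    # Assumed continuation (the excerpt cuts off at the loop header): vocode
    # each stored spectrogram and write one WAV per file. The output naming is
    # hypothetical; `generate` returning a sample array matches its use above.
    waveform = generate(model, s, hp.voc_gen_batched, hp.voc_target, hp.voc_overlap)
    from scipy.io.wavfile import write
    write("output_" + str(idx) + ".wav", hp.sample_rate, waveform)
    idx += 1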
def main_work():
    parser = argparse.ArgumentParser(description='Get durations for Tacotron')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                        help='The file to use for the hyperparameters')
    parser.add_argument('--model_name', default='taco', help='taco or dctts')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file

    model = args.model_name
    if model == "dctts":
        time_step = 50  # ms per frame
    else:
        time_step = 12.5  # ms per frame

    transcript_file = Path(f'{hp.data_path}/{hp.metadata}')
    outfile = Path(f'{hp.data_path}/train_durations.csv')
    transcript = read_transcript(transcript_file)

    # check if label files exist
    if not os.path.exists(f'{hp.data_path}/labels/label_state_align/'):
        print("No label_state_align directory found!")
        exit()
    if len(os.listdir(f'{hp.data_path}/labels/label_state_align/')) == 0:
        print(f'{hp.data_path}/labels/label_state_align/ is empty')
        exit()
    if not os.path.exists(f'{hp.data_path}/mel'):
        print("No mel directory found!")
        exit()
    if len(os.listdir(f'{hp.data_path}/mel')) == 0:
        print(f'{hp.data_path}/mel is empty')
        exit()

    for labfile in os.listdir(f'{hp.data_path}/labels/label_state_align/'):
        print(f'Processing {labfile} ... ')
        labfile = Path(labfile)

        #os.makedirs(f'{hp.data_path}/attention_guides_dctts', exist_ok=True)
        #out_guide_file = Path(f'{hp.data_path}/attention_guides_dctts/{labfile.stem}.npy')
        os.makedirs(f'{hp.data_path}/attention_guides', exist_ok=True)
        out_guide_file = Path(f'{hp.data_path}/attention_guides/{labfile.stem}.npy')
        labfile = Path(os.path.join(f'{hp.data_path}/labels/label_state_align/', labfile))

        (mono, lengths) = merlin_state_label_to_monophones(labfile)
        mel_file = labfile.stem

        # NOTE THE DIMENSIONS -- dctts nframes is in [0] and taco is in [1]
        mel_features = np.load(f'{hp.data_path}/mel/{mel_file}.npy')
        if model == "dctts":
            audio_msec_length = mel_features.shape[0] * time_step
        else:
            audio_msec_length = mel_features.shape[1] * time_step

        resampled_lengths = resample_timings(lengths, 5.0, time_step,
                                             total_duration=audio_msec_length)
        if resampled_lengths is not None:
            resampled_lengths_in_frames = (resampled_lengths / time_step).astype(int)
            timings = match_up((mono, resampled_lengths_in_frames),
                               transcript[labfile.stem]['phones'])
            assert len(transcript[labfile.stem]['phones']) == len(timings), (
                len(transcript[labfile.stem]['phones']), len(timings),
                transcript[labfile.stem]['phones'], timings)
            transcript[labfile.stem]['duration'] = timings
            guided_attention_matrix = durations_to_attention_matrix(np.array(timings))
            save_guided_attention(guided_attention_matrix, out_guide_file)
        else:
            print(f'{labfile} was not successfully processed!')

    write_transcript(transcript, outfile, duration=True)
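# A minimal sketch of what durations_to_attention_matrix plausibly computes (an
# assumption based on how its result is saved as an attention guide): phone i
# is marked active for its span of mel frames, yielding a monotonic 0/1 guide.
import numpy as np

def durations_to_attention_matrix_ref(durations):
    n_frames = int(durations.sum())
    guide = np.zeros((len(durations), n_frames), dtype=np.float32)
    start = 0
    for i, d in enumerate(durations.astype(int)):
        guide[i, start:start + d] = 1.0  # phone i attends to its own frames
        start += d
    return guide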