def __init__(self, cfg: Config):
    """Set up paths, audio helper, TensorBoard writer and loss for a run.

    Args:
        cfg: Configuration object. Its ``config_id`` attribute names the
            checkpoint sub-directory used by this instance.
    """
    self.cfg = cfg
    self.paths = Paths()
    self.audio = Audio(cfg)

    # Everything belonging to this configuration lives in <ckpt>/<config_id>.
    self.ckpt_path = self.paths.ckpt / cfg.config_id

    # NOTE(review): if this is torch.utils.tensorboard.SummaryWriter, its
    # documentation says `comment` has no effect once `log_dir` is given -
    # confirm whether the 'v1' suffix is still expected anywhere.
    tensorboard_dir = self.ckpt_path / 'tensorboard'
    self.writer = SummaryWriter(log_dir=tensorboard_dir, comment='v1')

    self.criterion = MaskedL1()
elif hp.voc_mode == 'MOL': quant = float_2_label(y, bits=16) return mel.astype(np.float32), quant.astype(np.int64) def process_wav(path: Path): wav_id = path.stem m, x = convert_file(path) np.save(paths.mel / f'{wav_id}.npy', m, allow_pickle=False) np.save(paths.quant / f'{wav_id}.npy', x, allow_pickle=False) return wav_id, m.shape[-1] wav_files = get_files(path, extension) paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id) print(f'\n{len(wav_files)} {extension[1:]} files found in "{path}"\n') if len(wav_files) == 0: print('Please point wav_path in hparams.py to your dataset,') print('or use the --path option.\n') else: text_dict = ljspeech(path) with open(paths.data / 'text_dict.pkl', 'wb') as f: pickle.dump(text_dict, f) n_workers = max(1, args.num_workers)
def main():
    """Train the WaveRNN vocoder from the command line.

    Loads hyperparameters from ``--hp_file``, lets ``--lr`` and
    ``--batch_size`` override the matching hparams values, builds the model
    and optimizer, restores (or creates) the 'voc' checkpoint and hands
    everything to ``voc_train_loop``.

    Raises:
        ValueError: If CUDA is used and the batch size is not a multiple of
            the number of visible GPUs.
        AssertionError: If the product of ``hp.voc_upsample_factors`` does
            not equal ``hp.hop_length``.
    """
    # Parse Arguments
    cli = argparse.ArgumentParser(description='Train WaveRNN Vocoder')
    cli.add_argument('--lr', '-l', type=float,
                     help='[float] override hparams.py learning rate')
    cli.add_argument('--batch_size', '-b', type=int,
                     help='[int] override hparams.py batch size')
    cli.add_argument('--force_train', '-f', action='store_true',
                     help='Forces the model to train past total steps')
    cli.add_argument('--gta', '-g', action='store_true',
                     help='train wavernn on GTA features')
    cli.add_argument(
        '--force_cpu', '-c', action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    cli.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                     help='The file to use for the hyperparameters')
    args = cli.parse_args()

    hp.configure(args.hp_file)  # load hparams from file

    # Command-line values win; hparams supply the fallback.
    lr = hp.voc_lr if args.lr is None else args.lr
    batch_size = hp.voc_batch_size if args.batch_size is None else args.batch_size

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
    force_train = args.force_train
    train_gta = args.gta

    if args.force_cpu or not torch.cuda.is_available():
        device = torch.device('cpu')
    else:
        device = torch.device('cuda')
        if batch_size % torch.cuda.device_count() != 0:
            raise ValueError(
                '`batch_size` must be evenly divisible by n_gpus!')
    print('Using device:', device)

    print('\nInitialising Model...\n')

    # Instantiate WaveRNN Model
    wavernn_kwargs = dict(
        rnn_dims=hp.voc_rnn_dims,
        fc_dims=hp.voc_fc_dims,
        bits=hp.bits,
        pad=hp.voc_pad,
        upsample_factors=hp.voc_upsample_factors,
        feat_dims=hp.num_mels,
        compute_dims=hp.voc_compute_dims,
        res_out_dims=hp.voc_res_out_dims,
        res_blocks=hp.voc_res_blocks,
        hop_length=hp.hop_length,
        sample_rate=hp.sample_rate,
        mode=hp.voc_mode,
    )
    voc_model = WaveRNN(**wavernn_kwargs).to(device)

    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    optimizer = optim.Adam(voc_model.parameters())
    restore_checkpoint('voc', paths, voc_model, optimizer,
                       create_if_missing=True)

    train_set, test_set = get_vocoder_datasets(paths.data, batch_size,
                                               train_gta)

    if force_train:
        total_steps = 10_000_000
    else:
        total_steps = hp.voc_total_steps

    remaining_k = (total_steps - voc_model.get_step()) // 1000
    simple_table([
        ('Remaining', f'{remaining_k}k Steps'),
        ('Batch Size', batch_size),
        ('LR', lr),
        ('Sequence Len', hp.voc_seq_len),
        ('GTA Train', train_gta),
    ])

    # The loss follows the model's output mode.
    if voc_model.mode == 'RAW':
        loss_func = F.cross_entropy
    else:
        loss_func = discretized_mix_logistic_loss

    voc_train_loop(paths, voc_model, loss_func, optimizer, train_set,
                   test_set, lr, total_steps)

    print('Training Complete.')
    print(
        'To continue training increase voc_total_steps in hparams.py or use --force_train'
    )
def main():
    """Train Tacotron through ``hp.tts_schedule``, then write GTA features.

    Each schedule entry is unpacked as ``(r, lr, max_step, batch_size)``.
    Sessions whose ``max_step`` has already been reached are skipped; when
    the last session is already complete, training only continues if
    ``--force_train`` was given. With ``--force_gta`` the training phase is
    skipped entirely and only the GTA features are generated.

    Raises:
        ValueError: If CUDA is used and a scheduled batch size is not a
            multiple of the number of visible GPUs.
    """
    # Parse Arguments
    cli = argparse.ArgumentParser(description='Train Tacotron TTS')
    cli.add_argument('--force_train', '-f', action='store_true',
                     help='Forces the model to train past total steps')
    cli.add_argument('--force_gta', '-g', action='store_true',
                     help='Force the model to create GTA features')
    cli.add_argument(
        '--force_cpu', '-c', action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    cli.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                     help='The file to use for the hyperparameters')
    args = cli.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_train = args.force_train
    force_gta = args.force_gta

    if args.force_cpu or not torch.cuda.is_available():
        device = torch.device('cpu')
    else:
        device = torch.device('cuda')
        # Every scheduled batch size has to split evenly across the GPUs.
        for _, _, _, batch_size in hp.tts_schedule:
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError(
                    '`batch_size` must be evenly divisible by n_gpus!')
    print('Using device:', device)

    # Instantiate Tacotron Model
    print('\nInitialising Tacotron Model...\n')
    tacotron_kwargs = dict(
        embed_dims=hp.tts_embed_dims,
        num_chars=len(symbols),
        encoder_dims=hp.tts_encoder_dims,
        decoder_dims=hp.tts_decoder_dims,
        n_mels=hp.num_mels,
        fft_bins=hp.num_mels,
        postnet_dims=hp.tts_postnet_dims,
        encoder_K=hp.tts_encoder_K,
        lstm_dims=hp.tts_lstm_dims,
        postnet_K=hp.tts_postnet_K,
        num_highways=hp.tts_num_highways,
        dropout=hp.tts_dropout,
        stop_threshold=hp.tts_stop_threshold,
    )
    model = Tacotron(**tacotron_kwargs).to(device)

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('tts', paths, model, optimizer, create_if_missing=True)

    if not force_gta:
        for idx, session in enumerate(hp.tts_schedule):
            current_step = model.get_step()
            r, lr, max_step, batch_size = session
            training_steps = max_step - current_step

            if current_step >= max_step:
                # This session is already finished.
                if idx != len(hp.tts_schedule) - 1:
                    # A later session exists, move on to it.
                    continue
                if not force_train:
                    # Last session and nothing forces us on: we are done.
                    break
                # Last session, but training is forced: run (practically)
                # without an upper bound.
                training_steps = 999_999_999

            model.r = r

            # NOTE(review): `repr1` is not defined in the visible source -
            # confirm it is provided elsewhere in the file.
            simple_table([('Steps with r=%s' % (repr1(r)),
                           str(training_steps // 1000) + 'k Steps'),
                          ('Batch Size', batch_size),
                          ('Learning Rate', lr),
                          ('Outputs/Step (r)', model.r)])

            train_set, attn_example = get_tts_datasets(paths.data, batch_size,
                                                       r)
            tts_train_loop(paths, model, optimizer, train_set, lr,
                           training_steps, attn_example)

        print('Training Complete.')
        print(
            'To continue training increase tts_total_steps in hparams.py or use --force_train\n'
        )

    print('Creating Ground Truth Aligned Dataset...\n')

    train_set, attn_example = get_tts_datasets(paths.data, 8, model.r)
    create_gta_features(model, train_set, paths.gta)

    print(
        '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n'
    )
parser = argparse.ArgumentParser( description='Preprocessing script that generates mel spectrograms.') parser.add_argument( '--path', '-p', help='Point to the data path, expects LJSpeech-like folder.') parser.add_argument('--config', '-c', help='Point to the config.', default='config.yaml') args = parser.parse_args() cfg = Config.load(args.config) audio = Audio(cfg) paths = Paths() preprocessor = Preprocessor(audio, paths.mel) files = get_files(args.path) n_workers = min(cpu_count() - 1, cfg.n_workers) pool = Pool(processes=n_workers) map_func = pool.imap_unordered(preprocessor.process_wav, files) dataset = [] text_dict = read_metafile(args.path) display_params([ ('Num Train', len(files) - cfg.n_val), ('Num Val', cfg.n_val), ('Num Mels', cfg.n_mels), ('Win Length', cfg.win_length), ('Hop Length', cfg.hop_length),
quant = encode_mu_law(wav, mu=2**hp.bits) else: quant = float_2_label(wav, bits=hp.bits) return mel.astype(np.float32), quant.astype(np.int16) def process_wav(path): id = path.split('/')[-1][:-4] m, x = convert_file(path) np.save(f'{paths.mel}{id}.npy', m) np.save(f'{paths.quant}{id}.npy', x) return id wav_files = get_files(hp.wav_path) paths = Paths(hp.data_path, hp.model_id) print(f'\n{len(wav_files)} wav files found in hparams.wav_path\n') if len(wav_files) == 0: print('Please point wav_path in hparams.py to your dataset\n') else: print('+--------------------+--------------+---------+-----------------+') print( f'| Sample Rate: {hp.sample_rate} | Mu Law: {hp.mu_law} | Bits: {hp.bits} | Hop Length: {hp.hop_length} |' ) print('+--------------------+--------------+---------+-----------------+') pool = Pool(processes=cpu_count())
def __init__(self):
    """Load the configured vocoder and TTS models for inference.

    The settings are hard-coded on ``self.args`` (WaveRNN vocoder, batched
    generation, ``hparams.py``); target/overlap fall back to the hparams
    values. Depending on ``hp.tts_model`` either a Tacotron or a Tacotron2
    model is created and its weights are loaded, and a summary table is
    printed.

    Raises:
        argparse.ArgumentError: If ``self.args.vocoder`` is not one of the
            supported vocoder names.
    """
    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS')
    self.args = parser.parse_args()
    self.args.vocoder = 'wavernn'
    self.args.hp_file = 'hparams.py'
    self.args.voc_weights = False
    self.args.tts_weights = False
    self.args.save_attn = False
    self.args.batched = True
    self.args.target = None
    self.args.overlap = None
    self.args.force_cpu = False

    # ================ vocoder ================ #
    # Normalise the short aliases to the canonical vocoder names.
    if self.args.vocoder in ['griffinlim', 'gl']:
        self.args.vocoder = 'griffinlim'
    elif self.args.vocoder in ['wavernn', 'wr']:
        self.args.vocoder = 'wavernn'
    else:
        # ArgumentError takes (argument, message); passing the message alone
        # raised a TypeError instead of the intended error.
        raise argparse.ArgumentError(None,
                                     'Must provide a valid vocoder type!')

    hp.configure(self.args.hp_file)  # Load hparams from file

    # set defaults for any arguments that depend on hparams
    if self.args.vocoder == 'wavernn':
        if self.args.target is None:
            self.args.target = hp.voc_target
        if self.args.overlap is None:
            self.args.overlap = hp.voc_overlap
        if self.args.batched is None:
            self.args.batched = hp.voc_gen_batched

    # ================ others ================ #
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
    # NOTE(review): the two prints below look like leftover debug output.
    print("hello")
    print(paths.base)

    if not self.args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # === Wavernn === #
    if self.args.vocoder == 'wavernn':
        print('\nInitialising WaveRNN Model...\n')
        self.voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                                 fc_dims=hp.voc_fc_dims,
                                 bits=hp.bits,
                                 pad=hp.voc_pad,
                                 upsample_factors=hp.voc_upsample_factors,
                                 feat_dims=hp.num_mels,
                                 compute_dims=hp.voc_compute_dims,
                                 res_out_dims=hp.voc_res_out_dims,
                                 res_blocks=hp.voc_res_blocks,
                                 hop_length=hp.hop_length,
                                 sample_rate=hp.sample_rate,
                                 mode=hp.voc_mode).to(device)

        # Explicit weights win; otherwise use the latest checkpoint.
        voc_load_path = self.args.voc_weights if self.args.voc_weights else paths.voc_latest_weights
        self.voc_model.load(voc_load_path)

    # === Tacotron === #
    if hp.tts_model == 'tacotron':
        print('\nInitialising Tacotron Model...\n')
        self.tts_model = Tacotron(
            embed_dims=hp.tts_embed_dims,
            num_chars=len(symbols),
            encoder_dims=hp.tts_encoder_dims,
            decoder_dims=hp.tts_decoder_dims,
            n_mels=hp.num_mels,
            fft_bins=hp.num_mels,
            postnet_dims=hp.tts_postnet_dims,
            encoder_K=hp.tts_encoder_K,
            lstm_dims=hp.tts_lstm_dims,
            postnet_K=hp.tts_postnet_K,
            num_highways=hp.tts_num_highways,
            dropout=hp.tts_dropout,
            stop_threshold=hp.tts_stop_threshold).to(device)

        tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
        self.tts_model.load(tts_load_path)

    # === Tacotron2 === #
    elif hp.tts_model == 'tacotron2':
        print('\nInitializing Tacotron2 Model...\n')
        self.tts_model = Tacotron2().to(device)

        tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
        self.tts_model.load(tts_load_path)

    # === Information === #
    # NOTE(review): the Griffin-Lim branches read `self.args.iters`, which
    # this method never sets; they are unreachable while the vocoder is
    # hard-coded to 'wavernn'.
    if hp.tts_model == 'tacotron':
        if self.args.vocoder == 'wavernn':
            voc_k = self.voc_model.get_step() // 1000
            tts_k = self.tts_model.get_step() // 1000
            simple_table([
                ('Tacotron', str(tts_k) + 'k'),
                ('r', self.tts_model.r),
                ('Vocoder Type', 'WaveRNN'),
                ('WaveRNN', str(voc_k) + 'k'),
                ('Generation Mode',
                 'Batched' if self.args.batched else 'Unbatched'),
                ('Target Samples',
                 self.args.target if self.args.batched else 'N/A'),
                ('Overlap Samples',
                 self.args.overlap if self.args.batched else 'N/A')
            ])
        elif self.args.vocoder == 'griffinlim':
            tts_k = self.tts_model.get_step() // 1000
            simple_table([('Tacotron', str(tts_k) + 'k'),
                          ('r', self.tts_model.r),
                          ('Vocoder Type', 'Griffin-Lim'),
                          ('GL Iters', self.args.iters)])
    elif hp.tts_model == 'tacotron2':
        if self.args.vocoder == 'wavernn':
            voc_k = self.voc_model.get_step() // 1000
            tts_k = self.tts_model.get_step() // 1000
            simple_table([
                ('Tacotron2', str(tts_k) + 'k'),
                ('Vocoder Type', 'WaveRNN'),
                ('WaveRNN', str(voc_k) + 'k'),
                ('Generation Mode',
                 'Batched' if self.args.batched else 'Unbatched'),
                ('Target Samples',
                 self.args.target if self.args.batched else 'N/A'),
                ('Overlap Samples',
                 self.args.overlap if self.args.batched else 'N/A')
            ])
        elif self.args.vocoder == 'griffinlim':
            tts_k = self.tts_model.get_step() // 1000
            simple_table([('Tacotron2', str(tts_k) + 'k'),
                          ('Vocoder Type', 'Griffin-Lim'),
                          ('GL Iters', self.args.iters)])
def thak():
    """Build the inference settings from the environment and load the models.

    Settings are read from the environment variables ``FORCE_CPU``,
    ``VOCODER``, ``BATCHED``, ``TARGET``, ``OVERLAP``, ``SAVE_ATTN`` and
    ``GL_ITERS``. Boolean variables are parsed textually, so values such as
    ``0`` or ``false`` switch a flag off instead of counting as a truthy
    non-empty string.

    Returns:
        Tuple ``(args, voc_model, tts_model, batched, target, overlap,
        save_attn)``. ``voc_model``, ``batched``, ``target`` and ``overlap``
        are ``None`` unless the vocoder is WaveRNN.

    Raises:
        argparse.ArgumentError: If ``VOCODER`` is not a supported name.
        ValueError: If ``GL_ITERS``, ``TARGET`` or ``OVERLAP`` is set to a
            value that is not an integer.
    """

    class Tshamsoo():
        # os.getenv always yields strings, so booleans and integers are
        # converted explicitly rather than relying on truthiness.
        force_cpu = _env_flag('FORCE_CPU', False)
        hp_file = 'hparams.py'
        vocoder = os.getenv('VOCODER', 'wavernn')
        batched = _env_flag('BATCHED', True)
        target = os.getenv('TARGET', None)
        overlap = os.getenv('OVERLAP', None)
        tts_weights = None
        save_attn = _env_flag('SAVE_ATTN', False)
        voc_weights = None
        iters = int(os.getenv('GL_ITERS', 32))

    args = Tshamsoo()

    # Normalise the short aliases to the canonical vocoder names.
    if args.vocoder in ['griffinlim', 'gl']:
        args.vocoder = 'griffinlim'
    elif args.vocoder in ['wavernn', 'wr']:
        args.vocoder = 'wavernn'
    else:
        # ArgumentError takes (argument, message); passing the message alone
        # raised a TypeError instead of the intended error.
        raise argparse.ArgumentError(None,
                                     'Must provide a valid vocoder type!')

    hp.configure(args.hp_file)  # Load hparams from file

    tts_weights = args.tts_weights
    save_attn = args.save_attn

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    if args.vocoder == 'wavernn':
        # set defaults for any arguments that depend on hparams
        if args.target is None:
            args.target = hp.voc_target
        if args.overlap is None:
            args.overlap = hp.voc_overlap
        if args.batched is None:
            args.batched = hp.voc_gen_batched

        batched = args.batched
        target = int(args.target)
        overlap = int(args.overlap)

        print('\nInitialising WaveRNN Model...\n')
        # Instantiate WaveRNN Model
        voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                            fc_dims=hp.voc_fc_dims,
                            bits=hp.bits,
                            pad=hp.voc_pad,
                            upsample_factors=hp.voc_upsample_factors,
                            feat_dims=hp.num_mels,
                            compute_dims=hp.voc_compute_dims,
                            res_out_dims=hp.voc_res_out_dims,
                            res_blocks=hp.voc_res_blocks,
                            hop_length=hp.hop_length,
                            sample_rate=hp.sample_rate,
                            mode=hp.voc_mode).to(device)

        # Explicit weights win; otherwise use the latest checkpoint.
        voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights
        voc_model.load(voc_load_path)
    else:
        voc_model = None
        batched = None
        target = None
        overlap = None

    print('\nInitialising Tacotron Model...\n')

    # Instantiate Tacotron Model
    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout,
                         stop_threshold=hp.tts_stop_threshold).to(device)

    tts_load_path = tts_weights if tts_weights else paths.tts_latest_weights
    tts_model.load(tts_load_path)

    return args, voc_model, tts_model, batched, target, overlap, save_attn


def _env_flag(name, default):
    """Return the boolean value of environment variable *name*.

    An unset variable yields *default*. An empty value or one of ``0``,
    ``false``, ``no``, ``off`` (case-insensitive) yields ``False``; every
    other value yields ``True``.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() not in ('', '0', 'false', 'no', 'off')
from utils.paths import Paths if __name__ == '__main__': parser = argparse.ArgumentParser(description='Train WaveRNN Vocoder') parser.add_argument('--gta', '-g', action='store_true', help='train wavernn on GTA features') parser.add_argument('--config', metavar='FILE', default='config.yaml', help='The config containing all hyperparams.') args = parser.parse_args() config = read_config(args.config) paths = Paths(config['data_path'], config['voc_model_id'], config['tts_model_id']) device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') print('Using device:', device) print('\nInitialising Model...\n') voc_model = WaveRNN.from_config(config).to(device) dsp = DSP.from_config(config) assert np.cumprod( config['vocoder']['model']['upsample_factors'])[-1] == dsp.hop_length optimizer = optim.Adam(voc_model.parameters()) restore_checkpoint(model=voc_model, optim=optimizer, path=paths.voc_checkpoints / 'latest_model.pt', device=device)
def main():
    """Export a ForwardTacotron model to two ONNX graphs under ``./onnx``.

    The model is loaded on CPU from ``--tts_weights`` (or the latest forward
    checkpoint) and exported in two parts, traced with a fixed example
    sentence: the duration predictor and the mel regression network.
    """
    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS Generator')
    parser.add_argument(
        '--tts_weights', type=str,
        help='[string/path] Load in different FastSpeech weights')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                        help='The file to use for the hyperparameters')
    parser.add_argument(
        '--alpha', type=float, default=1.,
        help='Parameter for controlling length regulator for speedup '
             'or slow-down of generated speech, e.g. alpha=2.0 is double-time')
    args = parser.parse_args()

    # Create the output directory only after argument parsing succeeded (so
    # `--help` has no side effects); exist_ok avoids the check-then-create
    # race of os.path.exists + os.mkdir.
    os.makedirs('onnx', exist_ok=True)

    hp.configure(args.hp_file)

    input_text = "the forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."
    tts_weights = args.tts_weights

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Forward TTS Model...\n')
    tts_model = ForwardTacotron(embed_dims=hp.forward_embed_dims,
                                num_chars=len(symbols),
                                durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                                durpred_conv_dims=hp.forward_durpred_conv_dims,
                                rnn_dim=hp.forward_rnn_dims,
                                postnet_k=hp.forward_postnet_K,
                                postnet_dims=hp.forward_postnet_dims,
                                prenet_k=hp.forward_prenet_K,
                                prenet_dims=hp.forward_prenet_dims,
                                highways=hp.forward_num_highways,
                                dropout=hp.forward_dropout,
                                n_mels=hp.num_mels).to(device)

    tts_load_path = tts_weights or paths.forward_latest_weights
    tts_model.load(tts_load_path)

    # The two wrappers expose the halves of the model that get exported.
    encoder = DurationPredictor(tts_model)
    decoder = Tacotron(tts_model)

    tts_model.eval()
    encoder.eval()
    decoder.eval()

    opset_version = 10

    with torch.no_grad():
        input_seq = text_to_sequence(input_text.strip(), hp.tts_cleaner_names)
        input_seq = torch.as_tensor(input_seq, dtype=torch.long,
                                    device=device).unsqueeze(0)

        # FIRST STEP: predict symbols duration
        torch.onnx.export(encoder, input_seq,
                          "./onnx/forward_tacotron_duration_prediction.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["input_seq"],
                          output_names=["embeddings", "duration"])

        x, durations = encoder(input_seq)

        # SECOND STEP: expand symbols by durations
        x = encoder.lr(x, durations)

        # THIRD STEP: generate mel
        torch.onnx.export(decoder, x,
                          "./onnx/forward_tacotron_regression.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["data"],
                          output_names=["mel"])

    print('Done!')
def main():
    """Train ForwardTacotron through ``hp.forward_schedule``, then write GTA features.

    Each schedule entry is unpacked as ``(lr, max_step, batch_size)`` and
    trained for ``max_step - current_step`` steps. With ``--force_gta`` the
    training phase is skipped and only the GTA features are generated.

    Raises:
        ValueError: If CUDA is used and a scheduled batch size is not a
            multiple of the number of visible GPUs.
    """
    # Parse Arguments
    cli = argparse.ArgumentParser(description='Train Tacotron TTS')
    cli.add_argument('--force_train', '-f', action='store_true',
                     help='Forces the model to train past total steps')
    cli.add_argument('--force_gta', '-g', action='store_true',
                     help='Force the model to create GTA features')
    cli.add_argument(
        '--force_cpu', '-c', action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    cli.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                     help='The file to use for the hyperparameters')
    args = cli.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    # NOTE(review): --force_train is accepted but never read in this function.
    force_gta = args.force_gta

    if args.force_cpu or not torch.cuda.is_available():
        device = torch.device('cpu')
    else:
        device = torch.device('cuda')
        # Every scheduled batch size has to split evenly across the GPUs.
        for _, _, batch_size in hp.forward_schedule:
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError(
                    '`batch_size` must be evenly divisible by n_gpus!')
    print('Using device:', device)

    # Instantiate Forward TTS Model
    print('\nInitialising Forward TTS Model...\n')
    forward_kwargs = dict(
        embed_dims=hp.forward_embed_dims,
        num_chars=len(symbols),
        durpred_rnn_dims=hp.forward_durpred_rnn_dims,
        durpred_conv_dims=hp.forward_durpred_conv_dims,
        rnn_dim=hp.forward_rnn_dims,
        postnet_k=hp.forward_postnet_K,
        postnet_dims=hp.forward_postnet_dims,
        prenet_k=hp.forward_prenet_K,
        prenet_dims=hp.forward_prenet_dims,
        highways=hp.forward_num_highways,
        dropout=hp.forward_dropout,
        n_mels=hp.num_mels,
    )
    model = ForwardTacotron(**forward_kwargs).to(device)

    # Report the number of trainable parameters.
    params = sum(np.prod(p.size())
                 for p in model.parameters() if p.requires_grad)
    print(f'num params {params}')

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('forward', paths, model, optimizer,
                       create_if_missing=True)

    if not force_gta:
        for session in hp.forward_schedule:
            current_step = model.get_step()
            lr, max_step, batch_size = session
            training_steps = max_step - current_step

            simple_table([('Steps', str(training_steps // 1000) + 'k Steps'),
                          ('Batch Size', batch_size),
                          ('Learning Rate', lr)])

            train_set, mel_example = get_tts_datasets(paths.data, batch_size,
                                                      1, alignments=True)
            train_loop(paths, model, optimizer, train_set, lr, training_steps,
                       mel_example)

    train_set, mel_example = get_tts_datasets(paths.data, 8, 1,
                                              alignments=True)
    create_gta_features(model, train_set, paths.gta)
    print('Training Complete.')
bits=hp.bits, pad=hp.voc_pad, upsample_factors=hp.voc_upsample_factors, feat_dims=hp.num_mels, compute_dims=hp.voc_compute_dims, res_out_dims=hp.voc_res_out_dims, res_blocks=hp.voc_res_blocks, hop_length=hp.hop_length, sample_rate=hp.sample_rate, adaptnet=adaptnet, mode=hp.voc_mode).to(device) print(voc_model) trainable_params = list(voc_model.parameters()) paths = Paths(hp.data_path, hp.voc_model_id, '') # Load pase model print('Building PASE...') if hp.pase_cfg is not None: # 2 PASEs: (1) Identifier extractor, (2) Content extractor pase_cntnt = wf_builder(hp.pase_cfg) if hp.pase_ckpt is not None: pase_cntnt.load_pretrained(hp.pase_ckpt, load_last=True, verbose=True) pase_cntnt.to(device) if conversion: pase_id = wf_builder(hp.pase_cfg) if hp.pase_ckpt is not None: pase_id.load_pretrained(hp.pase_ckpt,