def train(self, model: ForwardTacotron, optimizer: Optimizer) -> None:
    """Run every ForwardTacotron training session from the configured schedule.

    Each schedule entry supplies (learning rate, step limit, batch size);
    sessions the model has already trained past are skipped.
    """
    schedule = parse_schedule(self.train_cfg['schedule'])
    for session_index, (learning_rate, step_limit, batch_size) in enumerate(schedule, 1):
        # Nothing to do for sessions the checkpoint has already completed.
        if model.get_step() >= step_limit:
            continue
        train_set, val_set = get_tts_datasets(
            path=self.paths.data,
            batch_size=batch_size,
            r=1,
            model_type='forward',
            max_mel_len=self.train_cfg['max_mel_len'],
            filter_attention=self.train_cfg['filter_attention'],
            filter_min_alignment=self.train_cfg['min_attention_alignment'],
            filter_min_sharpness=self.train_cfg['min_attention_sharpness'])
        session = TTSSession(index=session_index,
                             r=1,
                             lr=learning_rate,
                             max_step=step_limit,
                             bs=batch_size,
                             train_set=train_set,
                             val_set=val_set)
        self.train_session(model, optimizer, session)
def train(self, model: ForwardTacotron, optimizer: Optimizer) -> None:
    """Train through each session of hp.forward_schedule, skipping sessions
    whose step limit the model has already reached."""
    for idx, (lr, max_step, bs) in enumerate(hp.forward_schedule, 1):
        # Session already finished in a previous run — move on.
        if model.get_step() >= max_step:
            continue
        train_set, val_set = get_tts_datasets(path=self.paths.data,
                                              batch_size=bs,
                                              r=1,
                                              model_type='forward')
        self.train_session(model, optimizer,
                           TTSSession(index=idx, r=1, lr=lr,
                                      max_step=max_step, bs=bs,
                                      train_set=train_set,
                                      val_set=val_set))
def train(self, model_tts: ForwardTacotron, model_asr: Wav2Vec2ForCTC,
          optimizer_tts: Optimizer, optimizer_asr: Optimizer) -> None:
    """Run joint TTS/ASR training over the sessions in hp.forward_schedule.

    Loads pickled ASR train/test splits from hard-coded paths, then for each
    schedule entry builds one TTS session and one ASR session and delegates
    the actual training to self.train_session.
    """
    print("Loading ASR training data...")
    # Hard-coded dataset locations for the speech-sme (Sami) corpus.
    asr_train_set = unpickle_binary('./data/speech-sme-asr/train_asr.pkl')
    asr_test_set = unpickle_binary('./data/speech-sme-asr/test_asr.pkl')
    asr_trainer = init_trainer(asr_train_set, asr_test_set)
    for i, session_params in enumerate(hp.forward_schedule, 1):
        lr, max_step, bs = session_params
        # Only run sessions the TTS model has not yet trained past.
        if model_tts.get_step() < max_step:
            path = self.paths.data
            tts_train_set, tts_val_set = get_tts_datasets(
                path=self.paths.data, batch_size=bs, r=1, model_type='forward')
            # NOTE(review): asr_train_set/asr_test_set are rebound to dataloaders
            # here; on a second loop iteration the already-wrapped asr_test_set is
            # fed back into get_test_dataloader — confirm that is intended.
            asr_train_set = asr_trainer.get_train_dataloader()
            asr_test_set = asr_trainer.get_test_dataloader(asr_test_set)
            # Processor is reloaded from disk every session — presumably cheap,
            # but could be hoisted out of the loop; TODO confirm.
            asr_pr = Wav2Vec2Processor.from_pretrained(
                './asr_output/pretrained_processor')
            tts_session = ForwardSession(
                path,
                index=i, r=1, lr=lr, max_step=max_step, bs=bs,
                train_set=tts_train_set, val_set=tts_val_set,
            )
            # ASR batch size is fixed at 4 regardless of the schedule's bs.
            asr_session = ASRSession(asr_pr, index=i, r=1, lr=lr,
                                     max_step=max_step, bs=4,
                                     train_set=asr_train_set,
                                     test_set=asr_test_set)
            self.train_session(model_tts, model_asr, optimizer_tts,
                               tts_session, asr_session, asr_trainer,
                               optimizer_asr)
def train(self, model: Tacotron, optimizer: Optimizer) -> None:
    """Run every Tacotron training session defined in the config schedule.

    Each entry supplies (reduction factor, learning rate, step limit,
    batch size); sessions already completed by the checkpoint are skipped.
    """
    schedule = parse_schedule(self.train_cfg['schedule'])
    for session_index, (r, learning_rate, step_limit, batch_size) in enumerate(schedule, 1):
        # Model already trained past this session's step limit.
        if model.get_step() >= step_limit:
            continue
        train_set, val_set = get_tts_datasets(
            path=self.paths.data,
            batch_size=batch_size,
            r=r,
            model_type='tacotron',
            max_mel_len=self.train_cfg['max_mel_len'],
            filter_attention=False)
        session = TTSSession(index=session_index,
                             r=r,
                             lr=learning_rate,
                             max_step=step_limit,
                             bs=batch_size,
                             train_set=train_set,
                             val_set=val_set)
        self.train_session(model, optimizer, session=session)
prenet_k=hp.forward_prenet_K, prenet_dims=hp.forward_prenet_dims, highways=hp.forward_num_highways, dropout=hp.forward_dropout, n_mels=hp.num_mels).to(device) model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) print(f'num params {params}') optimizer = optim.Adam(model.parameters()) restore_checkpoint('forward', paths, model, optimizer, create_if_missing=True) if force_gta: print('Creating Ground Truth Aligned Dataset...\n') train_set, val_set = get_tts_datasets(paths.data, 8, r=1, model_type='forward') create_gta_features(model, train_set, val_set, paths.gta) print( '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n' ) else: trainer = ForwardTrainer(paths) trainer.train(model, optimizer)
def main():
    """Script entry point: train Tacotron through hp.tts_schedule, then write
    Ground-Truth-Aligned (GTA) mel features for vocoder training.
    """
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train Tacotron TTS')
    parser.add_argument('--force_train', '-f', action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--force_gta', '-g', action='store_true',
                        help='Force the model to create GTA features')
    parser.add_argument(
        '--force_cpu', '-c', action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_train = args.force_train
    force_gta = args.force_gta

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        # Each batch is split across GPUs, so every scheduled batch size must
        # divide evenly by the GPU count.
        for session in hp.tts_schedule:
            _, _, _, batch_size = session
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError(
                    '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Tacotron Model
    print('\nInitialising Tacotron Model...\n')
    model = Tacotron(embed_dims=hp.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hp.tts_encoder_dims,
                     decoder_dims=hp.tts_decoder_dims,
                     n_mels=hp.num_mels,
                     fft_bins=hp.num_mels,
                     postnet_dims=hp.tts_postnet_dims,
                     encoder_K=hp.tts_encoder_K,
                     lstm_dims=hp.tts_lstm_dims,
                     postnet_K=hp.tts_postnet_K,
                     num_highways=hp.tts_num_highways,
                     dropout=hp.tts_dropout,
                     stop_threshold=hp.tts_stop_threshold).to(device)

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('tts', paths, model, optimizer, create_if_missing=True)

    if not force_gta:
        for i, session in enumerate(hp.tts_schedule):
            current_step = model.get_step()
            r, lr, max_step, batch_size = session
            training_steps = max_step - current_step

            # Do we need to change to the next session?
            if current_step >= max_step:
                # Are there no further sessions than the current one?
                if i == len(hp.tts_schedule) - 1:
                    # There are no more sessions. Check if we force training.
                    if force_train:
                        # Don't finish the loop - train forever
                        training_steps = 999_999_999
                    else:
                        # We have completed training. Breaking is same as continue
                        break
                else:
                    # There is a following session, go to it
                    continue

            # Reduction factor: decoder outputs per step (see table row below).
            model.r = r

            simple_table([('Steps with r=%s' % (repr1(r)),
                           str(training_steps // 1000) + 'k Steps'),
                          ('Batch Size', batch_size),
                          ('Learning Rate', lr),
                          ('Outputs/Step (r)', model.r)])

            train_set, attn_example = get_tts_datasets(paths.data, batch_size, r)
            tts_train_loop(paths, model, optimizer, train_set, lr,
                           training_steps, attn_example)

        print('Training Complete.')
        print(
            'To continue training increase tts_total_steps in hparams.py or use --force_train\n'
        )

    # GTA features are produced unconditionally once training finishes
    # (or immediately when --force_gta is given).
    print('Creating Ground Truth Aligned Dataset...\n')
    train_set, attn_example = get_tts_datasets(paths.data, 8, model.r)
    create_gta_features(model, train_set, paths.gta)
    print(
        '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n'
    )
stop_threshold=hp.tts_stop_threshold).to(device) model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) print(f'Num Params: {params}') optimizer = optim.Adam(model.parameters()) restore_checkpoint('tts', paths, model, optimizer, create_if_missing=True, device=device) if args.force_gta: print('Creating Ground Truth Aligned Dataset...\n') train_set, val_set = get_tts_datasets(paths.data, 8, model.r) create_gta_features(model, train_set, val_set, paths.gta) print( '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n' ) elif args.force_align: print('Creating Attention Alignments and Pitch Values...') train_set, val_set = get_tts_datasets(paths.data, 1, model.r) create_align_features(model, train_set, val_set, paths.alg) # paths.phon_pitch) print( '\n\nYou can now train ForwardTacotron - use python train_forward.py\n' ) elif args.fa_dt: print('Creating Attention Alignments for DT...')
force_gta = args.force_gta device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') print('Using device:', device) # Instantiate Forward TTS Model print('\nInitialising Forward TTS Model...\n') model = ForwardTacotron.from_config(config).to(device) optimizer = optim.Adam(model.parameters()) restore_checkpoint(model=model, optim=optimizer, path=paths.forward_checkpoints / 'latest_model.pt', device=device) if force_gta: print('Creating Ground Truth Aligned Dataset...\n') train_set, val_set = get_tts_datasets(paths.data, 8, r=1, model_type='forward', filter_attention=False, max_mel_len=None) create_gta_features(model, train_set, val_set, paths.gta) print( '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n' ) else: trainer = ForwardTrainer(paths=paths, dsp=dsp, config=config) trainer.train(model, optimizer)
print('Device:', device) model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) print(f'Num Params: {params}') optimizer = optim.Adam(model.parameters()) restore_checkpoint('tts', paths, model, optimizer, create_if_missing=True, device=device) model = model if args.force_gta: print('Creating Ground Truth Aligned Dataset...\n') train_set, val_set = get_tts_datasets(paths.data, 8, model.r) create_gta_features(model, train_set, val_set, paths.gta) print( '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n' ) elif args.force_align: print('Creating Attention Alignments and Pitch Values...') train_set, val_set = get_tts_datasets(paths.data, 1, model.r) create_align_features(model, train_set, val_set, paths.alg, paths.phon_pitch) extract_pitch(paths.phon_pitch) print( '\n\nYou can now train ForwardTacotron - use python train_forward.py\n' ) else: trainer = TacoTrainer(paths)
def dual_transform(self, model_tts, model_asr, optimizer_tts, optimizer_asr,
                   asr_test_set, m_loss_avg, dur_loss_avg, device,
                   asr_current_step, e, epochs, duration_avg, total_iters,
                   tts_s_loss, asr_s_loss, tts_lr, tts_dt_path):
    """One Dual-Transformation round: TTS synthesizes audio for unpaired text
    to train ASR, ASR transcribes audio to create data for TTS, and a combined
    supervised + unsupervised loss drives one update of both optimizers.

    Returns (tts_u_loss, asr_u_loss): the unsupervised losses from the last
    processed TTS / ASR batch respectively.
    """
    print('\n\nStarting DualTransformation loop...\n')
    # Scratch directory for the synthesized wavs of this round.
    tmp_dir = './checkpoints/sme_speech_tts.asr_forward/dual_transform_tmp'
    os.makedirs(tmp_dir, exist_ok=True)

    # --- generate tmp ASR training data from unpaired text via TTS ---
    asr_train_data = []
    input_set = get_unpaired_txt(35)  # 35 presumably caps the number of sentences — TODO confirm
    text = [clean_text(v) for v in input_set]
    inputs = [text_to_sequence(t) for t in text]
    # generate unpaired data for ASR from TTS: synthesize each sentence,
    # vocode with Griffin-Lim (n_iter=32) and save as a wav/text pair.
    for i, x in enumerate(inputs, 1):
        _, m, dur = model_tts.generate(x, alpha=1.)
        wav = reconstruct_waveform(m, n_iter=32)
        wav_path = os.path.join(tmp_dir, f'{i}.wav')
        save_wav(wav, wav_path)
        asr_train_data.append((wav_path, text[i - 1]))
    dt_asr_data = load_dt_data(asr_train_data)
    # reinit trainer with only tmp train data (no eval set)
    asr_trainer_dt = init_trainer(dt_asr_data, None)
    dt_train = asr_trainer_dt.get_train_dataloader()

    # --- unsupervised train loop for ASR (forward passes only; the backward
    # happens later through the combined loss) ---
    for step, inputs in enumerate(dt_train, 1):
        model_asr.train()
        model_asr.to(device)
        # Move every tensor in the batch dict onto the training device.
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.to(device)
        outputs = model_asr(**inputs)
        # HF models may return a dict or a tuple; loss is first either way.
        asr_u_loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        msg_asr = f'| ASR MODEL (unsupervised training) : '\
            f'| Epoch: {e}/{epochs} ({step}/{len(dt_train)}) | Loss ASR: {asr_u_loss:#.4} '\
            f' ||||||||||||||||||||||||||||||||||||||||||||||||'
        stream(msg_asr)

    # --- generate tmp TTS data from ASR: transcribe, then re-run the
    # preprocessing pipeline over the transcriptions ---
    asr_predict_for_dt(model_asr)
    subprocess.check_output(
        'python preprocess.py -p "./data/speech-sme-tts" -d=True',
        shell=True, stderr=subprocess.STDOUT)
    print('Finished preprocessing for tmp data!')
    # NOTE(review): unlike the 'forward' dataset type, 'forward_dt' is used
    # directly as an iterable (no train/val tuple unpacking) — confirm.
    tmp_tts_train = get_tts_datasets(tts_dt_path, batch_size=2, r=1,
                                     model_type='forward_dt')
    print("Loaded tmp dataset!")

    # --- unsupervised TTS loop (again forward-only; no zero_grad/backward here) ---
    for i, (x, m, ids, x_lens, mel_lens, dur) in enumerate(tmp_tts_train, 1):
        start = time.time()
        model_tts.to(device)
        model_tts.train()
        x, m, dur, x_lens, mel_lens = x.to(device), m.to(device), dur.to(device),\
            x_lens.to(device), mel_lens.to(device)
        m1_hat, m2_hat, dur_hat = model_tts(x, m, dur, mel_lens)
        m1_loss = self.l1_loss(m1_hat, m, mel_lens)
        m2_loss = self.l1_loss(m2_hat, m, mel_lens)
        dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)
        # Duration loss is down-weighted relative to the two mel losses.
        tts_u_loss = m1_loss + m2_loss + 0.1 * dur_loss
        # NOTE(review): gradients are clipped here but backward() only runs
        # later on the combined loss — clipping before backward has no effect
        # on the upcoming gradients; confirm intent.
        torch.nn.utils.clip_grad_norm_(model_tts.parameters(),
                                       hp.tts_clip_grad_norm)
        m_loss_avg.add(m1_loss.item() + m2_loss.item())
        dur_loss_avg.add(dur_loss.item())
        step = model_tts.get_step()
        k = step // 1000
        duration_avg.add(time.time() - start)
        speed = 1. / duration_avg.get()
        msg_tts = f'| TTS MODEL (unsupervised training ): '\
            f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
            f'| Dur Loss: {dur_loss_avg.get():#.4} ' \
            f'| {speed:#.2} steps/s | Step: {k}k | '
        stream(msg_tts)

    # --- combined update (once per dual-transform round) ---
    # NOTE(review): placement reconstructed — tts_u_loss/asr_u_loss carry the
    # graphs of the LAST unsupervised batches only, so backward() flows
    # gradients through those plus the supervised losses; confirm against the
    # original training recipe.
    combined_loss = 0.5 * (tts_s_loss + asr_s_loss) + (tts_u_loss + asr_u_loss)
    # NOTE(review): Tensor.to() is not in-place; this result is discarded.
    combined_loss.to(device)
    combined_loss.backward()
    optimizer_tts.step()
    # Move any optimizer state (e.g. Adam moments restored on CPU) to the
    # training device before stepping the ASR optimizer.
    for state in optimizer_asr.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.to(device)
    optimizer_asr.step()
    m_loss_avg.reset()
    duration_avg.reset()
    dt_msg = f'\n\nFinished DT loop in epoch {e}!\n'
    stream(dt_msg)
    print(' ')
    return tts_u_loss, asr_u_loss
def main():
    """Script entry point: train ForwardTacotron through hp.forward_schedule,
    then write Ground-Truth-Aligned (GTA) features for vocoder training.
    """
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train Tacotron TTS')
    parser.add_argument('--force_train', '-f', action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--force_gta', '-g', action='store_true',
                        help='Force the model to create GTA features')
    parser.add_argument(
        '--force_cpu', '-c', action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_gta = args.force_gta

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        # Each batch is split across GPUs, so every scheduled batch size must
        # divide evenly by the GPU count.
        for session in hp.forward_schedule:
            _, _, batch_size = session
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError(
                    '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Forward TTS Model
    print('\nInitialising Forward TTS Model...\n')
    model = ForwardTacotron(embed_dims=hp.forward_embed_dims,
                            num_chars=len(symbols),
                            durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                            durpred_conv_dims=hp.forward_durpred_conv_dims,
                            rnn_dim=hp.forward_rnn_dims,
                            postnet_k=hp.forward_postnet_K,
                            postnet_dims=hp.forward_postnet_dims,
                            prenet_k=hp.forward_prenet_K,
                            prenet_dims=hp.forward_prenet_dims,
                            highways=hp.forward_num_highways,
                            dropout=hp.forward_dropout,
                            n_mels=hp.num_mels).to(device)

    # Count trainable parameters for the startup report.
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print(f'num params {params}')

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('forward', paths, model, optimizer,
                       create_if_missing=True)

    if not force_gta:
        for i, session in enumerate(hp.forward_schedule):
            current_step = model.get_step()
            lr, max_step, batch_size = session
            training_steps = max_step - current_step
            # NOTE(review): unlike the Tacotron variant of this script there is
            # no skip/break for sessions already completed, so training_steps
            # can go negative for a restored checkpoint — confirm intended.
            simple_table([(f'Steps', str(training_steps // 1000) + 'k Steps'),
                          ('Batch Size', batch_size),
                          ('Learning Rate', lr)])
            train_set, mel_example = get_tts_datasets(paths.data, batch_size, 1,
                                                      alignments=True)
            train_loop(paths, model, optimizer, train_set, lr, training_steps,
                       mel_example)

    # GTA features are produced once training finishes (or immediately when
    # --force_gta is given).
    train_set, mel_example = get_tts_datasets(paths.data, 8, 1,
                                              alignments=True)
    create_gta_features(model, train_set, paths.gta)
    print('Training Complete.')