def create_eval_dags(
    neural_factory,
    neural_modules,
    waveglow_params,
    eval_datasets,
    eval_batch_size,
    eval_freq,
    cpu_per_dl=1,
):
    """Build one evaluation DAG and one ``EvaluatorCallback`` per dataset.

    Args:
        neural_factory: Factory object; only its ``tb_writer`` attribute is
            read here and handed to every callback.
        neural_modules: Three-item sequence unpacked as
            ``(data_preprocessor, waveglow, <ignored>)``.
        waveglow_params: Config mapping. ``waveglow_params["AudioDataLayer"]``
            holds the shared data layer keyword arguments plus an ``"eval"``
            sub-mapping whose entries override the shared ones. A ``"train"``
            sub-mapping may be present and is discarded.
        eval_datasets: Iterable of manifest file paths, one DAG per entry.
        eval_batch_size: Batch size passed to every evaluation data layer.
        eval_freq: Passed to each callback as ``eval_step``.
        cpu_per_dl: Passed to every data layer as ``num_workers``.

    Returns:
        list: The created callbacks, in the order of ``eval_datasets``.
        Empty when ``eval_datasets`` is empty.

    Raises:
        KeyError: If ``waveglow_params`` has no ``"AudioDataLayer"`` entry or
            that entry has no ``"eval"`` section.
    """
    data_preprocessor, waveglow, _ = neural_modules

    # Work on a deep copy so the caller's config is never modified.
    eval_dl_params = copy.deepcopy(waveglow_params["AudioDataLayer"])
    eval_dl_params.update(waveglow_params["AudioDataLayer"]["eval"])
    # The mode-specific sections are not data layer arguments. ``pop`` with a
    # default keeps an eval-only config (no "train" section) working.
    eval_dl_params.pop("train", None)
    eval_dl_params.pop("eval", None)

    callbacks = []
    # assemble eval DAGs
    for eval_dataset in eval_datasets:
        data_layer_eval = nemo_tts.AudioDataLayer(
            manifest_filepath=eval_dataset,
            batch_size=eval_batch_size,
            num_workers=cpu_per_dl,
            **eval_dl_params,
        )

        audio, audio_len = data_layer_eval()
        spec_target, spec_target_len = data_preprocessor(input_signal=audio, length=audio_len)
        # Only the predicted audio is evaluated; the other two outputs are unused.
        audio_pred, _, _ = waveglow(mel_spectrogram=spec_target, audio=audio)

        # create corresponding eval callback
        # Tag is the manifest file name up to its first dot.
        tagname = os.path.basename(eval_dataset).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[audio_pred, spec_target, spec_target_len],
            user_iter_callback=waveglow_process_eval_batch,
            user_epochs_done_callback=lambda x: x,
            tb_writer_func=partial(
                waveglow_eval_log_to_tb_func,
                tag=tagname,
                mel_fb=data_preprocessor.filter_banks,
            ),
            eval_step=eval_freq,
            tb_writer=neural_factory.tb_writer,
        )
        callbacks.append(eval_callback)

    return callbacks
def test_waveglow_training(self):
    """Smoke-test WaveGlow training on the manifest at ``self.manifest_filepath``.

    Builds a small WaveGlow training graph and runs SGD for 10 epochs. There
    are no assertions: the test passes when training finishes without raising.
    """
    # --- neural modules -------------------------------------------------
    data_layer = nemo_tts.AudioDataLayer(
        manifest_filepath=self.manifest_filepath,
        n_segments=4000,
        batch_size=4,
    )
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
        window_size=None,
        window_stride=None,
        n_window_size=512,
        n_window_stride=128,
        normalize=None,
        preemph=None,
        dither=0,
        mag_power=1.0,
        pad_value=-11.52,
    )
    waveglow = nemo_tts.WaveGlowNM(
        n_mel_channels=64,
        n_flows=6,
        n_group=4,
        n_early_every=4,
        n_early_size=2,
        n_wn_layers=4,
        n_wn_channels=256,
        wn_kernel_size=3,
    )
    waveglow_loss = nemo_tts.WaveGlowLoss()

    # --- wire the training graph ----------------------------------------
    audio, audio_len = data_layer()
    spec_target, _ = preprocessing(input_signal=audio, length=audio_len)
    z, log_s_list, log_det_W_list = waveglow(mel_spectrogram=spec_target, audio=audio)
    loss_t = waveglow_loss(z=z, log_s_list=log_s_list, log_det_W_list=log_det_W_list)

    def log_train_loss(tensors):
        # The first (and only) tracked tensor is the loss.
        return logging.info(f'Train Loss: {str(tensors[0].item())}')

    loss_logger = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t],
        print_func=log_train_loss,
    )

    # --- run training ----------------------------------------------------
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=None,
        create_tb_writer=False,
    )
    trainer = neural_factory.get_trainer()
    hyperparams = {"num_epochs": 10, "lr": 0.0003}
    trainer.train(
        [loss_t],
        callbacks=[loss_logger],
        optimizer="sgd",
        optimization_params=hyperparams,
    )
def test_waveglow_training(self):
    """Integration test: a reduced-size WaveGlow model must lower its loss.

    Builds a small WaveGlow training graph on the manifest at
    ``self.manifest_filepath``, runs SGD with ``max_steps=3``, and asserts
    that the last recorded loss is smaller than the first one. Losses are
    collected through ``self.print_and_log_loss``, which is expected to
    append to the list it receives as ``loss_log_list``.
    """
    # --- neural modules -------------------------------------------------
    data_layer = nemo_tts.AudioDataLayer(
        manifest_filepath=self.manifest_filepath,
        n_segments=4000,
        batch_size=4,
        sample_rate=16000,
    )
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
        window_size=None,
        window_stride=None,
        n_window_size=512,
        n_window_stride=128,
        normalize=None,
        preemph=None,
        dither=0,
        mag_power=1.0,
        pad_value=-11.52,
    )
    waveglow = nemo_tts.WaveGlowNM(
        n_mel_channels=64,
        n_flows=6,
        n_group=4,
        n_early_every=4,
        n_early_size=2,
        n_wn_layers=4,
        n_wn_channels=256,
        wn_kernel_size=3,
        sample_rate=16000,
    )
    waveglow_loss = nemo_tts.WaveGlowLoss(sample_rate=16000)

    # --- wire the training graph ----------------------------------------
    audio, audio_len = data_layer()
    spec_target, _ = preprocessing(input_signal=audio, length=audio_len)
    z, log_s_list, log_det_W_list = waveglow(mel_spectrogram=spec_target, audio=audio)
    loss_t = waveglow_loss(z=z, log_s_list=log_s_list, log_det_W_list=log_det_W_list)

    # Every logged loss lands in this list (step_freq=1 -> one entry per step).
    loss_list = []
    record_loss = partial(self.print_and_log_loss, loss_log_list=loss_list)
    loss_logger = SimpleLossLoggerCallback(
        tensors=[loss_t],
        print_func=record_loss,
        step_freq=1,
    )

    # --- run training ----------------------------------------------------
    trainer = PtActions()
    hyperparams = {"max_steps": 3, "lr": 0.01}
    trainer.train(
        [loss_t],
        callbacks=[loss_logger],
        optimizer="sgd",
        optimization_params=hyperparams,
    )

    # Training must have reduced the loss between the first and last step.
    assert loss_list[-1] < loss_list[0]
def create_train_dag(
    neural_factory,
    neural_modules,
    waveglow_params,
    train_dataset,
    batch_size,
    checkpoint_save_freq,
    cpu_per_dl=1,
):
    """Build the WaveGlow training DAG together with its callbacks.

    Args:
        neural_factory: Factory object; its ``world_size``, ``tb_writer`` and
            ``checkpoint_dir`` attributes are read.
        neural_modules: Three-item sequence unpacked as
            ``(data_preprocessor, waveglow, waveglow_loss)``.
        waveglow_params: Config mapping. ``waveglow_params["AudioDataLayer"]``
            holds the shared data layer keyword arguments plus a ``"train"``
            sub-mapping whose entries override the shared ones. An ``"eval"``
            sub-mapping may be present and is discarded.
        train_dataset: Manifest file path passed to the data layer.
        batch_size: Batch size passed to the data layer.
        checkpoint_save_freq: Passed to the checkpoint callback as
            ``step_freq``.
        cpu_per_dl: Passed to the data layer as ``num_workers``.

    Returns:
        tuple: ``(loss_t, callbacks, steps_per_epoch)`` where ``callbacks`` is
        ``[train_callback, chpt_callback]`` and ``steps_per_epoch`` is
        ``len(data_layer)`` divided by ``batch_size * world_size``, truncated
        to an int.

    Raises:
        KeyError: If ``waveglow_params`` has no ``"AudioDataLayer"`` entry or
            that entry has no ``"train"`` section.
    """
    data_preprocessor, waveglow, waveglow_loss = neural_modules

    # Work on a deep copy so the caller's config is never modified.
    train_dl_params = copy.deepcopy(waveglow_params["AudioDataLayer"])
    train_dl_params.update(waveglow_params["AudioDataLayer"]["train"])
    # The mode-specific sections are not data layer arguments. ``pop`` with a
    # default keeps a train-only config (no "eval" section) working.
    train_dl_params.pop("train", None)
    train_dl_params.pop("eval", None)

    data_layer = nemo_tts.AudioDataLayer(
        manifest_filepath=train_dataset,
        batch_size=batch_size,
        num_workers=cpu_per_dl,
        **train_dl_params,
    )

    num_examples = len(data_layer)
    steps_per_epoch = int(num_examples / (batch_size * neural_factory.world_size))
    logging.info('Have {0} examples to train on.'.format(num_examples))

    # Train DAG
    audio, audio_len = data_layer()
    spec_target, spec_target_len = data_preprocessor(input_signal=audio, length=audio_len)
    z, log_s_list, log_det_W_list = waveglow(mel_spectrogram=spec_target, audio=audio)
    loss_t = waveglow_loss(z=z, log_s_list=log_s_list, log_det_W_list=log_det_W_list)

    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, z, spec_target, spec_target_len],
        print_func=lambda x: logging.info(f"Loss: {x[0].data}"),
        log_to_tb_func=partial(waveglow_log_to_tb_func, log_images=False),
        tb_writer=neural_factory.tb_writer,
    )

    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir,
        step_freq=checkpoint_save_freq,
    )

    callbacks = [train_callback, chpt_callback]
    return loss_t, callbacks, steps_per_epoch