def evaluate_master(model, num_gpus, output_directory, epochs, learning_rate,
                    lr_decay_step, lr_decay_gamma, sigma, iters_per_checkpoint,
                    batch_size, seed, fp16_run, checkpoint_path, with_tensorboard):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        if args.average_checkpoint == 0:
            model, _, _, iteration = load_checkpoint(checkpoint_path, model, None, None)
        else:
            print("INFO: --average_checkpoint > 0. loading an averaged weight of last {} checkpoints...".format(args.average_checkpoint))
            model, iteration = load_averaged_checkpoint(checkpoint_path, model, args.average_checkpoint)

    if fp16_run:
        raise NotImplementedError("do not run evaluation loop with fp16 mode!")

    testset = Mel2Samp("test", True, True, **data_config)
    test_sampler = None
    test_loader = DataLoader(testset, num_workers=4, shuffle=False,
                             sampler=test_sampler, batch_size=1,
                             pin_memory=False, drop_last=False)

    # Get shared output_directory ready
    if not os.path.isdir(os.path.join(output_directory, waveflow_config["model_name"])):
        os.makedirs(os.path.join(output_directory, waveflow_config["model_name"]), exist_ok=True)
        os.chmod(os.path.join(output_directory, waveflow_config["model_name"]), 0o775)
    print("output directory", os.path.join(output_directory, waveflow_config["model_name"]))

    if not os.path.isdir(os.path.join(output_directory, "samples")):
        os.makedirs(os.path.join(output_directory, "samples"), exist_ok=True)
        os.chmod(os.path.join(output_directory, "samples"), 0o775)
    os.makedirs(os.path.join(output_directory, "samples", waveflow_config["model_name"]), exist_ok=True)
    os.chmod(os.path.join(output_directory, "samples", waveflow_config["model_name"]), 0o775)

    criterion = WaveFlowLossDataParallel(sigma)

    model.eval()
    epoch_eval_loss = 0
    for i, batch in enumerate(test_loader):
        with torch.no_grad():
            mel, audio, filename = batch
            mel, audio = mel.cuda(), audio.cuda()
            outputs = model(audio, mel)

            loss = criterion(outputs)
            reduced_loss = loss.item()
            epoch_eval_loss += reduced_loss
            print("eval data {}: {:.9f}".format(i, reduced_loss))

    epoch_eval_loss = epoch_eval_loss / len(test_loader)
    print("EVAL_FULL {}:\t{:.9f}".format(iteration, epoch_eval_loss))
    model.train()
def load_LJSpeech(trainset_config, batch_size=4, num_gpus=1):
    LJSpeech_dataset = Mel2Samp(**trainset_config)

    # distributed sampler
    train_sampler = DistributedSampler(LJSpeech_dataset) if num_gpus > 1 else None

    trainloader = torch.utils.data.DataLoader(LJSpeech_dataset,
                                              batch_size=batch_size,
                                              sampler=train_sampler,
                                              num_workers=4,
                                              pin_memory=False,
                                              drop_last=True)
    return trainloader
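A minimal usage sketch for load_LJSpeech, assuming trainset_config carries the keyword arguments this repo's Mel2Samp constructor expects; the key names and filelist path below are illustrative placeholders, not the repo's actual config.

# Hypothetical config; real key names must match Mel2Samp's constructor.
example_trainset_config = {
    "training_files": "filelists/ljs_audio_train.txt",  # assumed filelist path
    "segment_length": 16000,
    "filter_length": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "sampling_rate": 22050,
    "mel_fmin": 0.0,
    "mel_fmax": 8000.0,
}
trainloader = load_LJSpeech(example_trainset_config, batch_size=4, num_gpus=1)
mel, audio = next(iter(trainloader))  # one (mel, audio) training batch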
def test(sigma, batch_size, seed, checkpoint_path):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda().eval()

    # Load checkpoint if one exists
    model, iteration = load_checkpoint(checkpoint_path, model)
    model.eval()

    testset = Mel2Samp(data_config['testing_files'],
                       data_config['segment_length'],
                       data_config['filter_length'],
                       data_config['hop_length'],
                       data_config['win_length'],
                       data_config['sampling_rate'],
                       data_config['mel_fmin'],
                       data_config['mel_fmax'])
    test_loader = DataLoader(testset, num_workers=1, shuffle=False,
                             sampler=None, batch_size=batch_size,
                             pin_memory=False, drop_last=True)

    with torch.no_grad():
        val_loss = 0.0
        for j, batch in enumerate(test_loader):
            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            val_loss += loss.item()
        val_loss = val_loss / (j + 1)

    model.train()
    print("test loss: {}:\t{:.9f}".format(iteration, val_loss))
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Load checkpoint if one exists
    iteration = 0
    print("checkpoint path", checkpoint_path)
    #model = warm_load_checkpoint(checkpoint_path, model)
    model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer)
    iteration += 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=True,
                              sampler=train_sampler, batch_size=batch_size,
                              pin_memory=False, drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            loss.backward()
            optimizer.step()

            if (iteration % iters_per_checkpoint == 0):
                print("{}:\t{:.9f}".format(iteration, reduced_loss))
                checkpoint_path = "{}/waveglow".format(output_directory)
                save_checkpoint(model, optimizer, learning_rate, iteration,
                                checkpoint_path)

            iteration += 1
def main(waveglow_path, sigma, output_dir, is_fp16, denoiser_strength):
    # mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    testset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    # train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    test_loader = DataLoader(
        testset,
        num_workers=32,
        shuffle=False,
        # sampler=train_sampler,
        batch_size=12,
        pin_memory=False,
        drop_last=True)

    speakers_to_sids = deepcopy(testset.speakers)
    sids_to_speakers = create_reverse_dict(speakers_to_sids)
    ut_to_uids = deepcopy(testset.utterances)
    uids_to_ut = create_reverse_dict(ut_to_uids)

    sid_target = np.random.randint(len(speakers_to_sids))
    speaker_target = sids_to_speakers[sid_target]
    sid_target = torch.LongTensor([[sid_target] * test_loader.batch_size
                                   ]).view(test_loader.batch_size, 1).to(device)

    audios = []
    n_audios = 0
    for i, batch in enumerate(test_loader):
        mel_source, _, sid_source, uid_source, is_last = batch
        mel_source = mel_source.to(device)
        with torch.no_grad():
            predicted = waveglow.infer(mel_source, sid_target, sigma=sigma)
            if denoiser_strength > 0:
                predicted = denoiser(predicted, denoiser_strength)
            predicted = predicted * MAX_WAV_VALUE

        for j in range(len(predicted)):
            # p = predicted[j].squeeze().cpu().numpy().astype('int16')
            p = predicted[j].cpu()
            audios.append(p)
            speaker_source = sids_to_speakers[sid_source[j].data.item()]
            ut_source = uids_to_ut[uid_source[j].data.item()]
            last = is_last[j].data.item()
            if last:
                audio_path = os.path.join(
                    output_dir,
                    "{}_{}_to_{}_synthesis.wav".format(speaker_source,
                                                       ut_source,
                                                       speaker_target))
                print("Synthesizing file No.{} at {}".format(n_audios, audio_path))
                save_audio_chunks(audios, audio_path, data_config['stride'],
                                  data_config['sampling_rate'])

                audios = []
                n_audios += 1
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, num_workers=2):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer)
        iteration += 1  # next iteration is iteration + 1

    # HACK: setup separate training and eval sets
    training_files = data_config['training_files']
    eval_files = data_config['eval_files']
    del data_config['training_files']
    del data_config['eval_files']
    data_config['audio_files'] = training_files
    trainset = Mel2Samp(**data_config)
    data_config['audio_files'] = eval_files
    evalset = Mel2Samp(**data_config)

    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======

    print("Creating dataloaders with " + str(num_workers) + " workers")
    train_loader = DataLoader(trainset, num_workers=num_workers, shuffle=True,
                              sampler=train_sampler, batch_size=batch_size,
                              pin_memory=False, drop_last=True)
    eval_loader = DataLoader(evalset, num_workers=num_workers, shuffle=True,
                             sampler=eval_sampler, batch_size=batch_size,
                             pin_memory=False, drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger_train = SummaryWriter(
            os.path.join(output_directory, 'logs', 'train'))
        logger_eval = SummaryWriter(
            os.path.join(output_directory, 'logs', 'eval'))

    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        model.train()
        with tqdm(total=len(train_loader)) as train_pbar:
            for i, batch in enumerate(train_loader):
                model.zero_grad()

                mel, audio = batch
                mel = torch.autograd.Variable(mel.cuda())
                audio = torch.autograd.Variable(audio.cuda())
                outputs = model((mel, audio))

                loss = criterion(outputs)
                if num_gpus > 1:
                    reduced_loss = reduce_tensor(loss.data, num_gpus).item()
                else:
                    reduced_loss = loss.item()

                if fp16_run:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                optimizer.step()

                train_pbar.set_description(
                    "Epoch {} Iter {} Loss {:.3f}".format(
                        epoch, iteration, reduced_loss))

                if with_tensorboard and rank == 0 and iteration % 10 == 0:
                    logger_train.add_scalar('loss', reduced_loss,
                                            i + len(train_loader) * epoch)
                    # adding logging for GPU utilization and memory usage
                    gpu_memory_used, gpu_utilization = get_gpu_stats()
                    k = 'gpu' + str(0)
                    logger_train.add_scalar(k + '/memory', gpu_memory_used, iteration)
                    logger_train.add_scalar(k + '/load', gpu_utilization, iteration)
                    logger_train.flush()

                if (iteration % iters_per_checkpoint == 0):
                    if rank == 0:
                        checkpoint_path = "{}/waveglow_{}".format(
                            output_directory, iteration)
                        save_checkpoint(model, optimizer, learning_rate,
                                        iteration, checkpoint_path)

                iteration += 1
                train_pbar.update(1)

        # Eval
        model.eval()
        torch.cuda.empty_cache()
        with torch.no_grad():
            tensorboard_mel, tensorboard_audio = None, None
            loss_accum = []
            with tqdm(total=len(eval_loader)) as eval_pbar:
                for i, batch in enumerate(eval_loader):
                    model.zero_grad()
                    mel, audio = batch
                    mel = torch.autograd.Variable(mel.cuda())
                    audio = torch.autograd.Variable(audio.cuda())
                    outputs = model((mel, audio))
                    loss = criterion(outputs).item()
                    loss_accum.append(loss)
                    eval_pbar.set_description("Epoch {} Eval {:.3f}".format(epoch, loss))
                    outputs = None

                    # use the first batch for tensorboard audio samples
                    if i == 0:
                        tensorboard_mel = mel
                        tensorboard_audio = audio
                    eval_pbar.update(1)

            if with_tensorboard and rank == 0:
                loss_avg = statistics.mean(loss_accum)
                tqdm.write("Epoch {} Eval AVG {}".format(epoch, loss_avg))
                logger_eval.add_scalar('loss', loss_avg, iteration)

                # log audio samples to tensorboard
                tensorboard_audio_generated = model.infer(tensorboard_mel)
                for i in range(0, 5):
                    ta = tensorboard_audio[i].cpu().numpy()
                    tag = tensorboard_audio_generated[i].cpu().numpy()
                    logger_eval.add_audio("sample " + str(i) + "/orig", ta, epoch,
                                          sample_rate=data_config['sampling_rate'])
                    logger_eval.add_audio("sample " + str(i) + "/gen", tag, epoch,
                                          sample_rate=data_config['sampling_rate'])
                logger_eval.flush()
def train(num_gpus, rank, group_name, prj_name, run_name, output_directory,
          epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed,
          fp16_run, grad_clip_thresh, checkpoint_path, pretrained_path,
          with_tensorboard, with_wandb):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer)
        iteration += 1  # next iteration is iteration + 1

    if pretrained_path != "":
        model = load_pretrained(pretrained_path, model)

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        train_sampler = DistributedSampler(trainset)
        shuffle_at_dataloader = False
    else:
        train_sampler = None
        shuffle_at_dataloader = True
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle_at_dataloader,
                              sampler=train_sampler, batch_size=batch_size,
                              pin_memory=False, drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            iter_start = time.perf_counter()
            float_epoch = float(iteration) / len(train_loader)

            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss, etc = criterion(outputs)
            (z_L2_normalized, neg_log_s_total, neg_log_det_W_total) = etc
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            is_overflow = False
            if fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
                if not is_overflow:
                    clipped_grad_norm = get_clip_grad_norm(grad_norm, grad_clip_thresh)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), grad_clip_thresh)
                clipped_grad_norm = get_clip_grad_norm(grad_norm, grad_clip_thresh)

            optimizer.step()

            iter_duration = time.perf_counter() - iter_start
            print("{}:\t{:.9f}".format(iteration, reduced_loss))

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if with_wandb and rank == 0:
                wandb.log(
                    {
                        'iteration': iteration,
                        'epoch': float_epoch,
                        'iter_duration': iter_duration,
                        'training_loss': reduced_loss,
                        'training_loss/z_L2_normalized': z_L2_normalized,
                        'training_loss/neg_log_s_total': neg_log_s_total,
                        'training_loss/neg_log_det_W_total': neg_log_det_W_total,
                    },
                    step=iteration)
                if not is_overflow:
                    wandb.log(
                        {
                            'grad_norm': grad_norm,
                            'clipped_grad_norm': clipped_grad_norm,
                        },
                        step=iteration)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/{}/{}/waveglow_{}".format(
                        output_directory, prj_name, run_name, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
def train(
    num_gpus,
    rank,
    group_name,
    output_directory,
    epochs,
    learning_rate,
    sigma,
    iters_per_checkpoint,
    batch_size,
    seed,
    fp16_run,
    checkpoint_path,
    with_tensorboard,
):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp

        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(
        trainset,
        num_workers=1,
        shuffle=False,
        sampler=train_sampler,
        batch_size=batch_size,
        pin_memory=False,
        drop_last=True,
    )

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter

        logger = SummaryWriter(os.path.join(output_directory, "logs"))

        # fixed for visualization
        real_mels, real_audios = zip(*[trainset[i] for i in range(8)])
        real_mel = torch.cat(real_mels, dim=-1)
        real_audio = torch.cat(real_audios, dim=0)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                step = i + len(train_loader) * epoch
                logger.add_scalar("training_loss", reduced_loss, step)
                if step % 500 == 0:
                    # generate audio from the first eight (fixed) data samples
                    model.eval()
                    with torch.no_grad():
                        device = mel.device
                        fake_audio = (
                            model.infer(torch.stack(real_mels).to(device))
                            .flatten(0, 1)
                            .cpu())
                    model.train()
                    fake_mel = trainset.get_mel(fake_audio)

                    logger.add_image(
                        "training_mel_real",
                        plot_spectrogram_to_numpy(real_mel),
                        step,
                        dataformats="HWC",
                    )
                    logger.add_audio("training_audio_real", real_audio, step, 22050)
                    logger.add_image(
                        "training_mel_fake",
                        plot_spectrogram_to_numpy(fake_mel),
                        step,
                        dataformats="HWC",
                    )
                    logger.add_audio("training_audio_fake", fake_audio, step, 22050)
                    logger.flush()

            if iteration % iters_per_checkpoint == 0:
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, loss_empthasis, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, logdirname, datedlogdir,
          warm_start=False, optimizer='ADAM', start_zero=False):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    global WaveGlow
    global WaveGlowLoss

    ax = True  # this is **really** bad coding practice :D
    if ax:
        from efficient_model_ax import WaveGlow
        from efficient_loss import WaveGlowLoss
    else:
        if waveglow_config["yoyo"]:  # efficient_mode  # TODO: Add to Config File
            from efficient_model import WaveGlow
            from efficient_loss import WaveGlowLoss
        else:
            from glow import WaveGlow, WaveGlowLoss

    criterion = WaveGlowLoss(sigma, loss_empthasis)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    STFTs = [STFT.TacotronSTFT(filter_length=window,
                               hop_length=data_config['hop_length'],
                               win_length=window,
                               sampling_rate=data_config['sampling_rate'],
                               n_mel_channels=160,
                               mel_fmin=0, mel_fmax=16000)
             for window in data_config['validation_windows']]

    loader_STFT = STFT.TacotronSTFT(filter_length=data_config['filter_length'],
                                    hop_length=data_config['hop_length'],
                                    win_length=data_config['win_length'],
                                    sampling_rate=data_config['sampling_rate'],
                                    n_mel_channels=data_config['n_mel_channels'] if 'n_mel_channels' in data_config.keys() else 160,
                                    mel_fmin=data_config['mel_fmin'],
                                    mel_fmax=data_config['mel_fmax'])

    #optimizer = "Adam"
    optimizer = optimizer.lower()
    optimizer_fused = bool(0)  # use Apex fused optimizer, should be identical to normal but slightly faster and only works on RTX cards
    if optimizer_fused:
        from apex import optimizers as apexopt
        if optimizer == "adam":
            optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate)
        elif optimizer == "lamb":
            optimizer = apexopt.FusedLAMB(model.parameters(), lr=learning_rate, max_grad_norm=200)
    else:
        if optimizer == "adam":
            optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        elif optimizer == "lamb":
            from lamb import Lamb as optLAMB
            optimizer = optLAMB(model.parameters(), lr=learning_rate)
            #import torch_optimizer as optim
            #optimizer = optim.Lamb(model.parameters(), lr=learning_rate)
            #raise  # PyTorch doesn't currently include LAMB optimizer.

    if fp16_run:
        global amp
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    else:
        amp = None

    ## LEARNING RATE SCHEDULER
    if True:
        from torch.optim.lr_scheduler import ReduceLROnPlateau
        min_lr = 1e-8
        factor = 0.1**(1/5)  # amount to scale the LR by on Validation Loss plateau
        scheduler = ReduceLROnPlateau(optimizer, 'min', factor=factor, patience=20, cooldown=2, min_lr=min_lr, verbose=True, threshold=0.0001, threshold_mode='abs')
        print("ReduceLROnPlateau used as Learning Rate Scheduler.")
    else:
        scheduler = False

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration, scheduler = load_checkpoint(checkpoint_path, model, optimizer, scheduler, fp16_run, warm_start=warm_start)
        iteration += 1  # next iteration is iteration + 1
    if start_zero:
        iteration = 0

    trainset = Mel2Samp(**data_config, check_files=True)
    speaker_lookup = trainset.speaker_ids
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        train_sampler = DistributedSampler(trainset, shuffle=True)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=3, shuffle=shuffle,
                              sampler=train_sampler, batch_size=batch_size,
                              pin_memory=False, drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        if datedlogdir:
            timestr = time.strftime("%Y_%m_%d-%H_%M_%S")
            log_directory = os.path.join(output_directory, logdirname, timestr)
        else:
            log_directory = os.path.join(output_directory, logdirname)
        logger = SummaryWriter(log_directory)

    moving_average = int(min(len(train_loader), 100))  # average loss over (up to) an entire epoch
    rolling_sum = StreamingMovingAverage(moving_average)
    start_time = time.time()
    start_time_iter = time.time()
    start_time_dekaiter = time.time()
    model.train()

    # best (averaged) training loss
    if os.path.exists(os.path.join(output_directory, "best_model") + ".txt"):
        best_model_loss = float(str(open(os.path.join(output_directory, "best_model") + ".txt", "r", encoding="utf-8").read()).split("\n")[0])
    else:
        best_model_loss = -6.20

    # best (validation) MSE on inferred spectrogram.
    if os.path.exists(os.path.join(output_directory, "best_val_model") + ".txt"):
        best_MSE = float(str(open(os.path.join(output_directory, "best_val_model") + ".txt", "r", encoding="utf-8").read()).split("\n")[0])
    else:
        best_MSE = 9e9

    epoch_offset = max(0, int(iteration / len(train_loader)))

    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print("{:,} total parameters in model".format(pytorch_total_params))
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("{:,} trainable parameters.".format(pytorch_total_params))

    print(f"Segment Length: {data_config['segment_length']:,}\nBatch Size: {batch_size:,}\nNumber of GPUs: {num_gpus:,}\nSamples/Iter: {data_config['segment_length']*batch_size*num_gpus:,}")

    training = True
    while training:
        try:
            if rank == 0:
                epochs_iterator = tqdm(range(epoch_offset, epochs), initial=epoch_offset, total=epochs, smoothing=0.01, desc="Epoch", position=1, unit="epoch")
            else:
                epochs_iterator = range(epoch_offset, epochs)
            # ================ MAIN TRAINING LOOP! ===================
            for epoch in epochs_iterator:
                print(f"Epoch: {epoch}")
                if num_gpus > 1:
                    train_sampler.set_epoch(epoch)

                if rank == 0:
                    iters_iterator = tqdm(enumerate(train_loader), desc=" Iter", smoothing=0, total=len(train_loader), position=0, unit="iter", leave=True)
                else:
                    iters_iterator = enumerate(train_loader)
                for i, batch in iters_iterator:
                    # run external code every iter, allows the run to be adjusted without restarts
                    if (i == 0 or iteration % param_interval == 0):
                        try:
                            with open("run_every_epoch.py") as f:
                                internal_text = str(f.read())
                                if len(internal_text) > 0:
                                    #code = compile(internal_text, "run_every_epoch.py", 'exec')
                                    ldict = {'iteration': iteration, 'seconds_elapsed': time.time() - start_time}
                                    exec(internal_text, globals(), ldict)
                                else:
                                    print("No Custom code found, continuing without changes.")
                        except Exception as ex:
                            print(f"Custom code FAILED to run!\n{ex}")
                        globals().update(ldict)
                        locals().update(ldict)
                        if show_live_params:
                            print(internal_text)
                    if not iteration % 50:  # refresh the actual learning rate every 50 iters (the learning_rate variable can go out-of-sync with the real LR)
                        learning_rate = optimizer.param_groups[0]['lr']

                    # Learning Rate Schedule
                    if custom_lr:
                        old_lr = learning_rate
                        if iteration < warmup_start:
                            learning_rate = warmup_start_lr
                        elif iteration < warmup_end:
                            learning_rate = (iteration - warmup_start) * ((A_ + C_) - warmup_start_lr) / (warmup_end - warmup_start) + warmup_start_lr  # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations.
                        else:
                            if iteration < decay_start:
                                learning_rate = A_ + C_
                            else:
                                iteration_adjusted = iteration - decay_start
                                learning_rate = (A_ * (e ** (-iteration_adjusted / B_))) + C_
                        assert learning_rate > -1e-8, "Negative Learning Rate."
                        if old_lr != learning_rate:
                            for param_group in optimizer.param_groups:
                                param_group['lr'] = learning_rate
                    else:
                        scheduler.patience = scheduler_patience
                        scheduler.cooldown = scheduler_cooldown
                        if override_scheduler_last_lr:
                            scheduler._last_lr = override_scheduler_last_lr
                        if override_scheduler_best:
                            scheduler.best = override_scheduler_best
                        if override_scheduler_last_lr or override_scheduler_best:
                            print("scheduler._last_lr =", scheduler._last_lr, "scheduler.best =", scheduler.best, " |", end='')

                    model.zero_grad()
                    mel, audio, speaker_ids = batch
                    mel = torch.autograd.Variable(mel.cuda(non_blocking=True))
                    audio = torch.autograd.Variable(audio.cuda(non_blocking=True))
                    speaker_ids = speaker_ids.cuda(non_blocking=True).long().squeeze(1)
                    outputs = model(mel, audio, speaker_ids)

                    loss = criterion(outputs)
                    if num_gpus > 1:
                        reduced_loss = reduce_tensor(loss.data, num_gpus).item()
                    else:
                        reduced_loss = loss.item()

                    if fp16_run:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()

                    if (reduced_loss > LossExplosionThreshold) or (math.isnan(reduced_loss)):
                        model.zero_grad()
                        raise LossExplosion(f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n")

                    if use_grad_clip:
                        if fp16_run:
                            grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), grad_clip_thresh)
                        else:
                            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_thresh)
                        if type(grad_norm) == torch.Tensor:
                            grad_norm = grad_norm.item()
                        is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)
                    else:
                        is_overflow = False
                        grad_norm = 0.00001

                    optimizer.step()

                    if not is_overflow and rank == 0:
                        # get current Loss Scale of first optimizer
                        loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if fp16_run else 32768

                        if with_tensorboard:
                            if (iteration % 100000 == 0):
                                # plot distribution of parameters
                                for tag, value in model.named_parameters():
                                    tag = tag.replace('.', '/')
                                    logger.add_histogram(tag, value.data.cpu().numpy(), iteration)
                            logger.add_scalar('training_loss', reduced_loss, iteration)
                            logger.add_scalar('training_loss_samples', reduced_loss, iteration * batch_size)
                            if (iteration % 20 == 0):
                                logger.add_scalar('learning.rate', learning_rate, iteration)
                            if (iteration % 10 == 0):
                                logger.add_scalar('duration', ((time.time() - start_time_dekaiter) / 10), iteration)

                        average_loss = rolling_sum.process(reduced_loss)
                        if (iteration % 10 == 0):
                            tqdm.write("{} {}: {:.3f} {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {:.2f}s/iter {:.4f}s/item".format(time.strftime("%H:%M:%S"), iteration, reduced_loss, average_loss, best_MSE, round(grad_norm, 3), learning_rate, min((grad_clip_thresh / grad_norm) * learning_rate, learning_rate), (time.time() - start_time_dekaiter) / 10, ((time.time() - start_time_dekaiter) / 10) / (batch_size * num_gpus)))
                            start_time_dekaiter = time.time()
                        else:
                            tqdm.write("{} {}: {:.3f} {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {}LS".format(time.strftime("%H:%M:%S"), iteration, reduced_loss, average_loss, best_MSE, round(grad_norm, 3), learning_rate, min((grad_clip_thresh / grad_norm) * learning_rate, learning_rate), loss_scale))
                        start_time_iter = time.time()

                    if rank == 0 and (len(rolling_sum.values) > moving_average - 2):
                        if (average_loss + best_model_margin) < best_model_loss:
                            checkpoint_path = os.path.join(output_directory, "best_model")
                            try:
                                save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path)
                            except KeyboardInterrupt:  # Avoid corrupting the model.
                                save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path)
                            text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8")
                            text_file.write(str(average_loss) + "\n" + str(iteration))
                            text_file.close()
                            best_model_loss = average_loss  # Only save the model if X better than the current loss.

                    if rank == 0 and iteration > 0 and ((iteration % iters_per_checkpoint == 0) or (os.path.exists(save_file_check_path))):
                        checkpoint_path = f"{output_directory}/waveglow_{iteration}"
                        save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path)
                        if (os.path.exists(save_file_check_path)):
                            os.remove(save_file_check_path)

                    if (iteration % validation_interval == 0):
                        if rank == 0:
                            MSE, MAE = validate(model, loader_STFT, STFTs, logger, iteration, data_config['validation_files'], speaker_lookup, sigma, output_directory, data_config)
                            if scheduler:
                                MSE = torch.tensor(MSE, device='cuda')
                                if num_gpus > 1:
                                    broadcast(MSE, 0)
                                scheduler.step(MSE.item())
                                if MSE < best_MSE:
                                    checkpoint_path = os.path.join(output_directory, "best_val_model")
                                    try:
                                        save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path)
                                    except KeyboardInterrupt:  # Avoid corrupting the model.
                                        save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path)
                                    text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8")
                                    text_file.write(str(MSE.item()) + "\n" + str(iteration))
                                    text_file.close()
                                    best_MSE = MSE.item()  # Only save the model if X better than the current loss.
                        else:
                            if scheduler:
                                MSE = torch.zeros(1, device='cuda')
                                broadcast(MSE, 0)
                                scheduler.step(MSE.item())
                        learning_rate = optimizer.param_groups[0]['lr']  # refresh the actual learning rate (it can go out-of-sync with the learning_rate variable)

                    iteration += 1
            training = False  # exit the while loop
        except LossExplosion as ex:  # print the exception and continue from the best checkpoint (a restart like this takes < 4 seconds)
            print(ex)  # print Loss
            checkpoint_path = os.path.join(output_directory, "best_model")
            assert os.path.exists(checkpoint_path), "best_model must exist for automatic restarts"

            # clearing VRAM for load checkpoint
            audio = mel = speaker_ids = loss = None
            torch.cuda.empty_cache()

            model.eval()
            model, optimizer, iteration, scheduler = load_checkpoint(checkpoint_path, model, optimizer, scheduler, fp16_run)
            learning_rate = optimizer.param_groups[0]['lr']
            epoch_offset = max(0, int(iteration / len(train_loader)))
            model.train()
            iteration += 1
            pass  # and continue training.
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, weight_sharing, optimizer_type,
          dataloader_type):
    ws = weight_sharing
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer_type = optimizer_type.lower()
    if optimizer_type == "sgd":
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    elif optimizer_type == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    else:
        print("Unsupported optimizer: %s. Aborting." % optimizer_type)
        return None

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer)
        iteration += 1  # next iteration is iteration + 1

    dataloader_type = dataloader_type.lower()
    if dataloader_type == "vanilla":
        trainset = Mel2Samp(**data_config)
    elif dataloader_type == "split":
        trainset = Mel2SampSplit(**data_config)
    else:
        print("Unsupported dataloader type: %s. Aborting." % dataloader_type)
        return None

    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=(num_gpus == 1),
                              sampler=train_sampler, batch_size=batch_size,
                              pin_memory=False, drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    name = "waveglow_ws%d_%s_%s_batch%d" % (ws, optimizer_type, dataloader_type, batch_size)
    if learning_rate != 1e-4:
        name = name + "_lr{:.0e}".format(learning_rate)
    if num_gpus > 1:
        name = name + "_x%d" % num_gpus

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join("./logs", name))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    stime2 = None
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        stime = time()
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            if (iteration % 100 == 0):
                if stime2 is not None:
                    tot_time2 = time() - stime2
                    print("{}:\t{:.9f}, time: {}".format(iteration, reduced_loss, int(tot_time2)))
                stime2 = time()

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}_{}".format(output_directory, name, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path)

            iteration += 1

        tot_time = time() - stime
        print("Epoch %d completed. Time: %d seconds" % (epoch, int(tot_time)))
def train(num_gpus, rank, group_name, stage, output_directory, epochs,
          learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, logdirname, datedlogdir,
          warm_start=False, optimizer='ADAM', start_zero=False):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    from model import HiFiGAN, HiFiGANLoss
    criterion = HiFiGANLoss(**hifigan_config).cuda()
    model = HiFiGAN(**hifigan_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
        if stage >= 2:
            criterion = apply_gradient_allreduce(criterion)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion, optimizer_d = get_optimizer(criterion, optimizer, fp16_run, optimizer_fused=True) if stage >= 2 else (criterion, None)
    model, optimizer = get_optimizer(model, optimizer, fp16_run, optimizer_fused=True)

    ## LEARNING RATE SCHEDULER
    if True:
        from torch.optim.lr_scheduler import ReduceLROnPlateau
        min_lr = 1e-8
        factor = 0.1**(1/5)  # amount to scale the LR by on Validation Loss plateau
        scheduler = ReduceLROnPlateau(optimizer, 'min', factor=factor, patience=20, cooldown=2, min_lr=min_lr, verbose=True, threshold=0.0001, threshold_mode='abs')
        print("ReduceLROnPlateau used as Learning Rate Scheduler.")
    else:
        scheduler = False

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, criterion, optimizer_d, iteration, scheduler = load_checkpoint(checkpoint_path, model, optimizer, criterion, optimizer_d, scheduler, fp16_run, stage, warm_start=warm_start)
        iteration += 1  # next iteration is iteration + 1
    if start_zero:
        iteration = 0

    trainset = Mel2Samp(**data_config, check_files=True)
    speaker_lookup = trainset.speaker_ids
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        train_sampler = DistributedSampler(trainset, shuffle=True)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=3, shuffle=shuffle,
                              sampler=train_sampler, batch_size=batch_size,
                              pin_memory=False, drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        if datedlogdir:
            timestr = time.strftime("%Y_%m_%d-%H_%M_%S")
            log_directory = os.path.join(output_directory, logdirname, timestr)
        else:
            log_directory = os.path.join(output_directory, logdirname)
        logger = SummaryWriter(log_directory)

    moving_average = int(min(len(train_loader), 200))  # average loss over (up to) an entire epoch
    rolling_sum = StreamingMovingAverage(moving_average)
    start_time = time.time()
    start_time_iter = time.time()
    start_time_dekaiter = time.time()
    model.train()

    # best (averaged) training loss
    if os.path.exists(os.path.join(output_directory, "best_model") + ".txt"):
        best_model_loss = float(str(open(os.path.join(output_directory, "best_model") + ".txt", "r", encoding="utf-8").read()).split("\n")[0])
    else:
        best_model_loss = 9e9

    # best (validation) MSE on inferred spectrogram.
    if os.path.exists(os.path.join(output_directory, "best_val_model") + ".txt"):
        best_MSE = float(str(open(os.path.join(output_directory, "best_val_model") + ".txt", "r", encoding="utf-8").read()).split("\n")[0])
    else:
        best_MSE = 9e9

    epoch_offset = max(0, int(iteration / len(train_loader)))

    print_params(model, name='generator')

    print(f"Segment Length: {data_config['segment_length']:,}\nBatch Size: {batch_size:,}\nNumber of GPUs: {num_gpus:,}\nSamples/Iter: {data_config['segment_length']*batch_size*num_gpus:,}")

    training = True
    while training:
        try:
            if rank == 0:
                epochs_iterator = tqdm(range(epoch_offset, epochs), initial=epoch_offset, total=epochs, smoothing=0.01, desc="Epoch", position=1, unit="epoch")
            else:
                epochs_iterator = range(epoch_offset, epochs)
            # ================ MAIN TRAINING LOOP! ===================
            for epoch in epochs_iterator:
                print(f"Epoch: {epoch}")
                if num_gpus > 1:
                    train_sampler.set_epoch(epoch)

                if rank == 0:
                    iters_iterator = tqdm(enumerate(train_loader), desc=" Iter", smoothing=0, total=len(train_loader), position=0, unit="iter", leave=True)
                else:
                    iters_iterator = enumerate(train_loader)
                for i, batch in iters_iterator:
                    # run external code every iter, allows the run to be adjusted without restarts
                    if (i == 0 or iteration % param_interval == 0):
                        try:
                            with open("run_every_epoch.py") as f:
                                internal_text = str(f.read())
                                if len(internal_text) > 0:
                                    #code = compile(internal_text, "run_every_epoch.py", 'exec')
                                    ldict = {'iteration': iteration, 'seconds_elapsed': time.time() - start_time}
                                    exec(internal_text, globals(), ldict)
                                else:
                                    print("No Custom code found, continuing without changes.")
                        except Exception as ex:
                            print(f"Custom code FAILED to run!\n{ex}")
                        globals().update(ldict)
                        locals().update(ldict)
                        if show_live_params:
                            print(internal_text)

                    # Learning Rate Schedule
                    if custom_lr:
                        old_lr = learning_rate
                        if iteration < warmup_start:
                            learning_rate = warmup_start_lr
                        elif iteration < warmup_end:
                            learning_rate = (iteration - warmup_start) * ((A_ + C_) - warmup_start_lr) / (warmup_end - warmup_start) + warmup_start_lr  # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations.
                        else:
                            if iteration < decay_start:
                                learning_rate = A_ + C_
                            else:
                                iteration_adjusted = iteration - decay_start
                                learning_rate = (A_ * (e ** (-iteration_adjusted / B_))) + C_
                        assert learning_rate > -1e-8, "Negative Learning Rate."
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = learning_rate
                        if optimizer_d is not None:
                            for param_group in optimizer_d.param_groups:
                                param_group['lr'] = learning_rate * d_lr_scale
                    else:
                        scheduler.patience = scheduler_patience
                        scheduler.cooldown = scheduler_cooldown
                        if override_scheduler_last_lr:
                            scheduler._last_lr = override_scheduler_last_lr
                        if override_scheduler_best:
                            scheduler.best = override_scheduler_best
                        if override_scheduler_last_lr or override_scheduler_best:
                            print(f"scheduler._last_lr = {scheduler._last_lr} scheduler.best = {scheduler.best} |", end='')

                    model.zero_grad()
                    noisy_audio, gt_audio, speaker_ids = batch
                    noisy_audio = torch.autograd.Variable(noisy_audio.cuda(non_blocking=True))
                    gt_audio = torch.autograd.Variable(gt_audio.cuda(non_blocking=True))
                    speaker_ids = speaker_ids.cuda(non_blocking=True).long().squeeze(1)
                    pred_audio = model(noisy_audio)  # , speaker_ids)

                    metrics = criterion(pred_audio, gt_audio, amp, model, optimizer, optimizer_d, num_gpus, use_grad_clip, grad_clip_thresh)

                    if not metrics['is_overflow'] and rank == 0:
                        # get current Loss Scale of first optimizer
                        loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if fp16_run else 32768

                        if with_tensorboard:
                            if (iteration % 100000 == 0):
                                # plot distribution of parameters
                                for tag, value in model.named_parameters():
                                    tag = tag.replace('.', '/')
                                    logger.add_histogram(tag, value.data.cpu().numpy(), iteration)
                            for key, value in metrics.items():
                                if key not in ['is_overflow', ]:
                                    logger.add_scalar(key, value, iteration)
                            if (iteration % 20 == 0):
                                logger.add_scalar('learning.rate', learning_rate, iteration)
                            if (iteration % 10 == 0):
                                logger.add_scalar('duration', ((time.time() - start_time_dekaiter) / 10), iteration)

                        logged_loss = metrics['g_train_loss'] if stage >= 2 else metrics['train_loss']
                        grad_norm = metrics['grad_norm']
                        average_loss = rolling_sum.process(logged_loss)
                        if (iteration % 10 == 0):
                            tqdm.write("{} {}: {:.3f} {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {:.2f}s/iter {:.4f}s/item".format(time.strftime("%H:%M:%S"), iteration, logged_loss, average_loss, best_MSE, round(grad_norm, 3), learning_rate, min((grad_clip_thresh / grad_norm) * learning_rate, learning_rate), (time.time() - start_time_dekaiter) / 10, ((time.time() - start_time_dekaiter) / 10) / (batch_size * num_gpus)))
                            start_time_dekaiter = time.time()
                        else:
                            tqdm.write("{} {}: {:.3f} {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {}LS".format(time.strftime("%H:%M:%S"), iteration, logged_loss, average_loss, best_MSE, round(grad_norm, 3), learning_rate, min((grad_clip_thresh / grad_norm) * learning_rate, learning_rate), loss_scale))
                        start_time_iter = time.time()

                    if rank == 0 and (len(rolling_sum.values) > moving_average - 2):
                        if (average_loss + best_model_margin) < best_model_loss:
                            checkpoint_path = os.path.join(output_directory, "best_model")
                            try:
                                save_checkpoint(model, optimizer, criterion, optimizer_d, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path)
                            except KeyboardInterrupt:  # Avoid corrupting the model.
                                save_checkpoint(model, optimizer, criterion, optimizer_d, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path)
                            text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8")
                            text_file.write(str(average_loss) + "\n" + str(iteration))
                            text_file.close()
                            best_model_loss = average_loss  # Only save the model if X better than the current loss.

                    if rank == 0 and iteration > 0 and ((iteration % iters_per_checkpoint == 0) or (os.path.exists(save_file_check_path))):
                        checkpoint_path = f"{output_directory}/waveglow_{iteration}"
                        save_checkpoint(model, optimizer, criterion, optimizer_d, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path)
                        if (os.path.exists(save_file_check_path)):
                            os.remove(save_file_check_path)

                    if iteration % validation_interval == 0:
                        if rank == 0:
                            MSE, MAE = validate(model, trainset, logger, iteration, data_config['validation_files'], speaker_lookup, output_directory, data_config)
                            if scheduler:
                                MSE = torch.tensor(MSE, device='cuda')
                                if num_gpus > 1:
                                    broadcast(MSE, 0)
                                scheduler.step(MSE.item())
                                if MSE < best_MSE:
                                    checkpoint_path = os.path.join(output_directory, "best_val_model")
                                    try:
                                        save_checkpoint(model, optimizer, criterion, optimizer_d, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path)
                                    except KeyboardInterrupt:  # Avoid corrupting the model.
                                        save_checkpoint(model, optimizer, criterion, optimizer_d, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path)
                                    text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8")
                                    text_file.write(str(MSE.item()) + "\n" + str(iteration))
                                    text_file.close()
                                    best_MSE = MSE.item()
                        else:
                            if scheduler:
                                MSE = torch.zeros(1, device='cuda')
                                broadcast(MSE, 0)
                                scheduler.step(MSE.item())

                    iteration += 1
            training = False  # exit the training while loop
        except LossExplosion as ex:  # print the exception and continue from the best checkpoint (a restart like this takes < 4 seconds)
            print(ex)  # print Loss
            checkpoint_path = os.path.join(output_directory, "best_model")
            assert os.path.exists(checkpoint_path), "best_model must exist for automatic restarts"

            # clearing VRAM for load checkpoint
            audio = mel = speaker_ids = loss = None
            torch.cuda.empty_cache()

            model.eval()
            model, optimizer, iteration, scheduler = load_checkpoint(checkpoint_path, model, optimizer, scheduler, fp16_run)
            learning_rate = optimizer.param_groups[0]['lr']
            epoch_offset = max(0, int(iteration / len(train_loader)))
            model.train()
            iteration += 1
            pass  # and continue training.
def train(output_directory, epochs, learning_rate, sigma, iters_per_checkpoint,
          batch_size, seed, fp16_run, checkpoint_path, with_tensorboard):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config,
                     filter_length=data_config["filter_length"],
                     hop_length=data_config["hop_length"]).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer)

    trainset = Mel2Samp(**data_config)
    train_loader = DataLoader(trainset, num_workers=6, sampler=RandomSampler(0, 14),
                              batch_size=batch_size, pin_memory=True, drop_last=False)

    # Get shared output_directory ready
    # NOTE: `rank` is not a parameter of this single-process variant; it is assumed
    # to be defined at module level (e.g. rank = 0) for parity with the distributed scripts.
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    model = model.cuda()
    s = time()
    reduced_loss = 0
    for i, batch in enumerate(train_loader):
        model.zero_grad()

        mel, audio = batch
        mel = torch.autograd.Variable(mel.cuda())
        audio = torch.autograd.Variable(audio.cuda())
        outputs = model((mel, audio))

        loss = criterion(outputs)
        reduced_loss += loss.item()

        if fp16_run:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        optimizer.step()

        denominator = i % iters_per_checkpoint + 1
        print("iteration:{}, loss:{:.4f}, time:{:.2f} "
              "".format(iteration + 1, reduced_loss / denominator,
                        (time() - s) / denominator), end="\r")
        if with_tensorboard and rank == 0:
            logger.add_scalar('training_loss', reduced_loss / denominator, iteration + 1)

        if (iteration + 1) % iters_per_checkpoint == 0:
            s = time()
            reduced_loss = 0
            if rank == 0:
                checkpoint_path = "{}/waveglow_it{}.pt".format(output_directory, iteration + 1)
                save_checkpoint(model, optimizer, learning_rate, iteration + 1, checkpoint_path)

        iteration += 1
def main(squeezewave_path, sigma, output_dir, is_fp16, denoiser_strength):
    # mel_files = files_to_list(mel_files)
    squeezewave = torch.load(squeezewave_path)['model']
    squeezewave = squeezewave.remove_weightnorm(squeezewave)
    squeezewave.cuda().eval()
    if is_fp16:
        from apex import amp
        squeezewave, _ = amp.initialize(squeezewave, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(squeezewave).cuda()

    n_audio_channel = squeezewave_config["n_audio_channel"]
    testset = Mel2Samp(n_audio_channel, frame_energy_thres=0.02, **data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    # train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    test_loader = DataLoader(
        testset,
        num_workers=0,
        shuffle=False,
        # sampler=train_sampler,
        batch_size=1 if data_config['split'] == 'test' else 12,
        pin_memory=False,
        drop_last=True)

    speakers_to_sids = deepcopy(testset.speakers)
    sids_to_speakers = create_reverse_dict(speakers_to_sids)
    ut_to_uids = deepcopy(testset.utterances)
    uids_to_ut = create_reverse_dict(ut_to_uids)

    # sid_target = np.random.randint(len(speakers_to_sids))
    # speaker_target = sids_to_speakers[sid_target]
    # sid_target = torch.LongTensor([[sid_target] *
    #                                test_loader.batch_size]).view(
    #                                    test_loader.batch_size, 1).to('cuda')

    audios = []
    mels = []
    n_audios = 0
    for i, batch in enumerate(test_loader):
        audio_source, sid_source, uid_source, is_last = batch
        mel_source = get_mel(audio_source)
        mel_source = mel_source.to('cuda')
        with torch.no_grad():
            predicted = squeezewave.infer(mel_source, sigma=sigma)
            if denoiser_strength > 0:
                predicted = denoiser(predicted, denoiser_strength)
            predicted = predicted.squeeze(1)
            # predicted = predicted * MAX_WAV_VALUE

        for j in range(len(predicted)):
            p = predicted[j].cpu()
            audios.append(p)
            mels.append(mel_source[j].cpu())
            speaker_source = sids_to_speakers[sid_source[j].data.item()]
            ut_source = uids_to_ut[uid_source[j].data.item()]
            last = is_last[j].data.item()
            if last:
                # Hack: also dump the source mel-spectrograms here
                fname = os.path.join(output_dir, "{}_{}_mel.pt".format(speaker_source, ut_source))
                torch.save(mels, fname)
                print("Saved mel to {}".format(fname))

                # audio_path = os.path.join(
                #     output_dir,
                #     "{}_{}_to_{}_synthesis.wav".format(speaker_source,
                #                                        ut_source,
                #                                        speaker_target))
                audio_path = os.path.join(output_dir, "{}_{}_synthesis.wav".format(speaker_source, ut_source))
                print("Synthesizing file No.{} at {}".format(n_audios, audio_path))
                save_audio_chunks(audios, audio_path, data_config['stride'],
                                  data_config['sampling_rate'])

                audios = []
                mels = []
                n_audios += 1
# NOTE: this excerpt begins inside an outer `if` block whose opening condition is
# not included here; the dangling `else:` below belongs to that outer `if`.
    if args.average_checkpoint == 0:
        model, optimizer, scheduler, iteration = load_checkpoint_warm_start(
            checkpoint_path, model, optimizer, scheduler, fp16_run)
    else:
        print("INFO: --average_checkpoint > 0. loading an averaged weight of last {} checkpoints...".format(args.average_checkpoint))
        model, optimizer, scheduler, iteration = load_averaged_checkpoint_warm_start(
            checkpoint_path, model, optimizer, scheduler, fp16_run)
else:
    model, optimizer, scheduler, iteration = load_checkpoint(
        checkpoint_path, model, optimizer, scheduler, fp16_run)
iteration += 1  # next iteration is iteration + 1

if distributed_run:
    model = DistributedDataParallel(
        model, device_ids=[args.local_rank], output_device=args.local_rank,
        find_unused_parameters=True)
model = torch.nn.DataParallel(model)

trainset = Mel2Samp("train", False, False, **data_config)
testset = Mel2Samp("test", False, False, **data_config)
if distributed_run:
    train_sampler, shuffle = DistributedSampler(trainset), False
    test_sampler, shuffle = DistributedSampler(testset), False
else:
    train_sampler, shuffle = None, True
    test_sampler, shuffle = None, True
train_loader = DataLoader(trainset, num_workers=16, shuffle=shuffle,
                          sampler=train_sampler, batch_size=batch_size,
                          pin_memory=False, drop_last=True)
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, warm_start):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    optimizer = Over9000(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    else:
        amp = None

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer, warm_start)
        if fp16_run and not warm_start:
            amp.load_state_dict(torch.load(checkpoint_path)['amp'])
        iteration += 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=16, shuffle=True,
                              sampler=train_sampler, batch_size=batch_size,
                              pin_memory=False, drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.999,
                                                           patience=250, cooldown=250,
                                                           verbose=True, min_lr=1e-5)
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = mel.cuda()
            audio = audio.cuda()
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.0)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            if epoch > 1:
                scheduler.step(loss)

            print("{}:\t{:.9f}\t{:.9f}".format(iteration, reduced_loss, grad_norm))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(output_directory, iteration)
                    save_checkpoint(model, optimizer, amp, iteration, checkpoint_path)

            iteration += 1
def test_mel2samp(): """Test mel2samp modules on example data.""" from mel2samp import Mel2Samp hparams = hparams_class() passed = 0 # test filelist loader try: from mel2samp import load_filepaths_and_text audio_files = load_filepaths_and_text("code_tests/test_materials/filelists/validation_utf8.txt") passed+=1 print("--PASSED--\n") except Exception as ex: print("--EXCEPTION-- @ Load Filepaths and Text (UTF-8)") traceback.print_exc(file=sys.stdout) print("\n") # test filelist checker try: assert audio_files from mel2samp import check_files audio_files = check_files(audio_files, hparams) assert len(audio_files) == 1 passed+=1 print("--PASSED--\n") del audio_files except Exception as ex: print("--EXCEPTION-- @ Load Filepaths and Text (UTF-8)") traceback.print_exc(file=sys.stdout) print("\n") # test initalization try: trainset = Mel2Samp(hparams) passed+=1 print("--PASSED--\n") except Exception as ex: print("--EXCEPTION-- @ Mel2Samp Initialization") traceback.print_exc(file=sys.stdout) print("\n") # test 16-BIT .wav to torch try: from mel2samp import load_wav_to_torch x, sr = load_wav_to_torch("code_tests/test_materials/audio_0/example_16bits.wav") assert len(x) assert x.max() <= 2**15 assert x.min() >= -(2**15) assert sr == 48000 passed+=1 print("--PASSED--\n") except Exception as ex: print("--EXCEPTION-- @ Load 16-BIT .wav to Pytorch") traceback.print_exc(file=sys.stdout) print("\n") # test 24-BIT .wav to torch try: from mel2samp import load_wav_to_torch x, sr = load_wav_to_torch("code_tests/test_materials/audio_0/example_24bits.wav") assert len(x) assert x.max() <= 2**23 assert x.min() >= -(2**23) assert sr == 48000 passed+=1 print("--PASSED--\n") except Exception as ex: print("--EXCEPTION-- @ Load 24-BIT .wav to Pytorch") traceback.print_exc(file=sys.stdout) print("\n") # test 32-BIT .wav to torch try: from mel2samp import load_wav_to_torch x, sr = load_wav_to_torch("code_tests/test_materials/audio_0/example_32bits.wav") assert len(x) assert x.max() <= 2**31 assert x.min() >= -(2**31) assert sr == 48000 passed+=1 print("--PASSED--\n") except Exception as ex: print("--EXCEPTION-- @ Load 32-BIT .wav to Pytorch") traceback.print_exc(file=sys.stdout) print("\n") # test 32-BIT .mp3 to torch try: from mel2samp import load_wav_to_torch x, sr = load_wav_to_torch("code_tests/test_materials/audio_0/example_32bits.mp3") assert len(x) assert x.max() <= 2**31 assert x.min() >= -(2**31) assert sr == 48000 passed+=1 print("--PASSED--\n") except Exception as ex: print("--EXCEPTION-- @ Load 32-BIT .mp3 to Pytorch") traceback.print_exc(file=sys.stdout) print("\n") # test 16-BIT .wav to mel try: x, sr = load_wav_to_torch("code_tests/test_materials/audio_0/example_16bits.wav") x = trainset.get_mel(x) passed+=1 print("--PASSED--\n") except Exception as ex: print("--EXCEPTION-- @ 16-BIT .wav to Mel-spec") traceback.print_exc(file=sys.stdout) print("\n") # test 24-BIT .wav to mel try: x, sr = load_wav_to_torch("code_tests/test_materials/audio_0/example_24bits.wav") x = trainset.get_mel(x) passed+=1 print("--PASSED--\n") except Exception as ex: print("--EXCEPTION-- @ 24-BIT .wav to Mel-spec") traceback.print_exc(file=sys.stdout) print("\n") # test 32-BIT .wav to mel try: x, sr = load_wav_to_torch("code_tests/test_materials/audio_0/example_32bits.wav") x = trainset.get_mel(x) passed+=1 print("--PASSED--\n") except Exception as ex: print("--EXCEPTION-- @ 32-BIT .wav to Mel-spec") traceback.print_exc(file=sys.stdout) print("\n") # test 32-BIT .mp3 to mel try: x, sr = 
load_wav_to_torch("code_tests/test_materials/audio_0/example_32bits.mp3") x = trainset.get_mel(x) passed+=1 print("--PASSED--\n") except Exception as ex: print("--EXCEPTION-- @ 32-BIT .mp3 to Mel-spec") traceback.print_exc(file=sys.stdout) print("\n") # test __getitem__ with load_mel_from_disk = False try: assert trainset # This test will fail if Mel2Samp cannot initalize trainset.load_mel_from_disk = False trainset.__getitem__(0) passed+=1 print("--PASSED--\n") except Exception as ex: print("--EXCEPTION-- @ __getitem__ with load_mel_from_disk = False") traceback.print_exc(file=sys.stdout) print("\n") # test __getitem__ with load_mel_from_disk = True try: assert trainset # This test will fail if Mel2Samp cannot initalize trainset.load_mel_from_disk = True trainset.__getitem__(0) passed+=1 print("--PASSED--\n") except Exception as ex: print("--EXCEPTION-- @ __getitem__ with load_mel_from_disk = True") traceback.print_exc(file=sys.stdout) print("\n") # test initalization with Pre-empthasis try: trainset = None hparams.preempthasis = 0.98 trainset = Mel2Samp(hparams) passed+=1 print("--PASSED--\n") except Exception as ex: print("--EXCEPTION-- @ Mel2Samp with Pre-empthasis Initialization") traceback.print_exc(file=sys.stdout) print("\n") # test __getitem__ with Pre-empthasis try: assert trainset # This test will fail if Mel2Samp cannot initalize trainset.load_mel_from_disk = False trainset.__getitem__(0) passed+=1 print("--PASSED--\n") except Exception as ex: print("--EXCEPTION-- @ __getitem__ with Pre-empthasis") traceback.print_exc(file=sys.stdout) print("\n")
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard): torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 trainset = Mel2Samp(data_config['training_files'], data_config['segment_length'], data_config['filter_length'], data_config['hop_length'], data_config['win_length'], data_config['sampling_rate'], data_config['mel_fmin'], data_config['mel_fmax'], debug=False) if 'testing_files' in data_config: testset = Mel2Samp(data_config['testing_files'], data_config['segment_length'], data_config['filter_length'], data_config['hop_length'], data_config['win_length'], data_config['sampling_rate'], data_config['mel_fmin'], data_config['mel_fmax'], debug=True) else: testset = None # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=1, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, 'logs')) else: logger = None model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start = time.perf_counter() model.zero_grad() print("train batch loaded, {} ({} of {})".format( iteration, i, len(train_loader))) mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) outputs = model((mel, audio)) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() is_overflow = False if fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), 1.0) is_overflow = math.isnan(grad_norm) or math.isinf(grad_norm) optimizer.step() duration = time.perf_counter() - start print( "train batch done, {} ({} of {}): {:.9f} (took {:.2f})".format( iteration, i, len(train_loader), reduced_loss, duration)) if logger: logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) logger.add_scalar('duration', duration, i + len(train_loader) * epoch) if testset and not is_overflow and (iteration % iters_per_checkpoint == 0): validate(model, criterion, testset, iteration, batch_size, num_gpus, logger) if rank == 0: rotate_checkpoints(output_directory) checkpoint_path = "{}/waveglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
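# --- Hedged sketch of the rotate_checkpoints() helper called above ----------
# Its definition is not shown in this file; the usage suggests it simply keeps
# the newest few "waveglow_<iteration>" files and deletes the rest. The
# keep_last count and filename pattern are assumptions.
import glob
import os


def rotate_checkpoints(output_directory, keep_last=3):
    checkpoints = sorted(glob.glob(os.path.join(output_directory, "waveglow_*")),
                         key=lambda p: int(p.rsplit("_", 1)[-1]))
    for path in checkpoints[:-keep_last]:
        print("removing old checkpoint", path)
        os.remove(path)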
def train(model, num_gpus, output_directory, epochs, learning_rate, lr_decay_step, lr_decay_gamma, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard): # local eval and synth functions def evaluate(): # eval loop model.eval() epoch_eval_loss = 0 for i, batch in enumerate(test_loader): with torch.no_grad(): mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) outputs = model(audio, mel) loss = criterion(outputs) if num_gpus > 1: reduced_loss = loss.mean().item() else: reduced_loss = loss.item() epoch_eval_loss += reduced_loss epoch_eval_loss = epoch_eval_loss / len(test_loader) print("EVAL {}:\t{:.9f}".format(iteration, epoch_eval_loss)) if with_tensorboard: logger.add_scalar('eval_loss', epoch_eval_loss, iteration) logger.flush() model.train() def synthesize(sigma): model.eval() # synthesize loop for i, batch in enumerate(synth_loader): if i == 0: with torch.no_grad(): mel, _, filename = batch mel = torch.autograd.Variable(mel.cuda()) try: audio = model.reverse(mel, sigma) except AttributeError: audio = model.module.reverse(mel, sigma) except NotImplementedError: print("reverse not implemented for this model. skipping synthesize!") model.train() return audio = audio * MAX_WAV_VALUE audio = audio.squeeze() audio = audio.cpu().numpy() audio = audio.astype('int16') audio_path = os.path.join( os.path.join(output_directory, "samples", waveflow_config["model_name"]), "generate_{}.wav".format(iteration)) write(audio_path, data_config["sampling_rate"], audio) model.train() torch.manual_seed(seed) torch.cuda.manual_seed(seed) criterion = WaveFlowLossDataParallel(sigma) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) scheduler = torch.optim.lr_scheduler.StepLR( optimizer, step_size=lr_decay_step, gamma=lr_decay_gamma) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if args.resume: model_directory = os.path.join( output_directory, waveflow_config["model_name"] ) logging.info("--resume. Resuming the training from the last " "checkpoint found in {}.".format(model_directory)) last_checkpoint = last_n_checkpoints(model_directory, 1)[0] model, optimizer, scheduler, iteration = \ load_checkpoint(last_checkpoint, model, optimizer, scheduler) elif checkpoint_path != "": # Warm-start if args.warm_start and args.average_checkpoint == 0: print("INFO: --warm_start. optimizer and scheduler are initialized and strict=False for load_state_dict().") model, optimizer, scheduler, iteration = load_checkpoint_warm_start( checkpoint_path, model, optimizer, scheduler) elif args.warm_start and args.average_checkpoint != 0: print("INFO: --average_checkpoint > 0. loading an averaged " "weight of last {} checkpoints...".format(args.average_checkpoint)) model, optimizer, scheduler, iteration = load_averaged_checkpoint_warm_start( checkpoint_path, model, optimizer, scheduler ) else: model, optimizer, scheduler, iteration = \ load_checkpoint(checkpoint_path, model, optimizer, scheduler) iteration += 1 # next iteration is iteration + 1 if num_gpus > 1: print("num_gpus > 1. 
converting the model to DataParallel...") model = torch.nn.DataParallel(model) trainset = Mel2Samp("train", False, False, **data_config) train_loader = DataLoader(trainset, num_workers=4, shuffle=True, batch_size=batch_size, pin_memory=False, drop_last=True) testset = Mel2Samp("test", False, False, **data_config) test_sampler = None test_loader = DataLoader(testset, num_workers=4, shuffle=False, sampler=test_sampler, batch_size=batch_size, pin_memory=False, drop_last=False) synthset = Mel2Samp("test", True, True, **data_config) synth_sampler = None synth_loader = DataLoader(synthset, num_workers=4, shuffle=False, sampler=synth_sampler, batch_size=1, pin_memory=False, drop_last=False) # Get shared output_directory ready if not os.path.isdir(os.path.join(output_directory, waveflow_config["model_name"])): os.makedirs(os.path.join(output_directory, waveflow_config["model_name"]), exist_ok=True) os.chmod(os.path.join(output_directory, waveflow_config["model_name"]), 0o775) print("output directory", os.path.join(output_directory, waveflow_config["model_name"])) if not os.path.isdir(os.path.join(output_directory, "samples")): os.makedirs(os.path.join(output_directory, "samples"), exist_ok=True) os.chmod(os.path.join(output_directory, "samples"), 0o775) os.makedirs(os.path.join(output_directory, "samples", waveflow_config["model_name"]), exist_ok=True) os.chmod(os.path.join(output_directory, "samples", waveflow_config["model_name"]), 0o775) if with_tensorboard: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, waveflow_config["model_name"], 'logs')) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in tqdm.tqdm(enumerate(train_loader), total=len(train_loader)): tic = time.time() model.zero_grad() mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) outputs = model(audio, mel) loss = criterion(outputs) if num_gpus > 1: reduced_loss = loss.mean().item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.mean().backward() if fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 5.) else: grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 5.) optimizer.step() toc = time.time() - tic #print("{}:\t{:.9f}, {:.4f} seconds".format(iteration, reduced_loss, toc)) if with_tensorboard: logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) logger.add_scalar('lr', get_lr(optimizer), i + len(train_loader) * epoch) logger.add_scalar('grad_norm', grad_norm, i + len(train_loader) * epoch) logger.flush() if (iteration % iters_per_checkpoint == 0): checkpoint_path = "{}/waveflow_{}".format( os.path.join(output_directory, waveflow_config["model_name"]), iteration) save_checkpoint(model, optimizer, scheduler, learning_rate, iteration, checkpoint_path) if iteration != 0: evaluate() del mel, audio, outputs, loss gc.collect() synthesize(sigma) iteration += 1 scheduler.step() evaluate()
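# get_lr() is logged to TensorBoard above but not defined in this file; with a
# single optimizer parameter group (as used throughout these scripts) a minimal
# version is:
def get_lr(optimizer):
    return optimizer.param_groups[0]['lr']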
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard, num_workers=4): print("num_workers", num_workers) torch.manual_seed(seed) torch.cuda.manual_seed(seed) # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) # =====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) # =====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) scheduler = StepLR(optimizer, step_size=1, gamma=0.96) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 trainset = Mel2Samp(**data_config) evalset = Mel2Samp(**eval_data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=num_workers, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) eval_loader = DataLoader(evalset, num_workers=num_workers, shuffle=False, sampler=eval_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, 'logs')) epoch_offset = max(1, int(iteration / len(train_loader))) start_time = datetime.datetime.now() # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, epochs): print('Epoch:', epoch, 'LR:', scheduler.get_lr()) elapsed = datetime.datetime.now() - start_time print("Epoch: [{}][els: {}] {}".format( datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed, epoch)) model.train() total_loss = 0. 
for i, batch in enumerate(train_loader): model.zero_grad() if waveglow_config["multi_speaker_config"]["use_multi_speaker"]: mel, audio, spk_embed_or_id = batch spk_embed_or_id = torch.autograd.Variable( spk_embed_or_id.cuda()) else: mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) if waveglow_config["multi_speaker_config"]["use_multi_speaker"]: outputs = model((mel, audio, spk_embed_or_id)) else: outputs = model((mel, audio)) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() total_loss += reduced_loss if i > 0 and i % 10 == 0: elapsed = datetime.datetime.now() - start_time print( "[{}][els: {}] epoch {},total steps{}, {}/{} steps:\t{:.9f}" .format( datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed, epoch, iteration, i, len(train_loader), reduced_loss)) if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) if (iteration % iters_per_checkpoint == 0): if rank == 0: checkpoint_path = "{}/waveglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1 elapsed = datetime.datetime.now() - start_time print("[{}][els: {}] {} epoch :\tavg loss {:.9f}".format( datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed, epoch, total_loss / len(train_loader))) scheduler.step() eval.eval(eval_loader, model, criterion, num_gpus, start_time, epoch, waveglow_config["multi_speaker_config"]["use_multi_speaker"])
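# --- Hedged sketch of reduce_tensor(), used above to average the loss across
# ranks. This is the common all-reduce-then-divide pattern and assumes
# torch.distributed has already been initialized by init_distributed().
import torch.distributed as dist


def reduce_tensor(tensor, num_gpus):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    return rt / num_gpus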
def synthesize_master(model, num_gpus, temp, output_directory, epochs, learning_rate, lr_decay_step, lr_decay_gamma, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard): torch.manual_seed(seed) torch.cuda.manual_seed(seed) # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, _, _, iteration = load_checkpoint(checkpoint_path, model, None, None) # remove all weight_norm from the model model.remove_weight_norm() # fuse mel-spec conditioning layer weights to maximize speed model.fuse_conditioning_layers() if fp16_run: from apex import amp model, _ = amp.initialize(model, [], opt_level="O3") synthset = Mel2Samp("test", True, True, **data_config) synth_sampler = None synth_loader = DataLoader(synthset, num_workers=4, shuffle=False, sampler=synth_sampler, batch_size=1, pin_memory=False, drop_last=False) # Get shared output_directory ready if not os.path.isdir(os.path.join(output_directory, waveflow_config["model_name"])): os.makedirs(os.path.join(output_directory, waveflow_config["model_name"]), exist_ok=True) os.chmod(os.path.join(output_directory, waveflow_config["model_name"]), 0o775) print("output directory", os.path.join(output_directory, waveflow_config["model_name"])) if not os.path.isdir(os.path.join(output_directory, "samples")): os.makedirs(os.path.join(output_directory, "samples"), exist_ok=True) os.chmod(os.path.join(output_directory, "samples"), 0o775) os.makedirs(os.path.join(output_directory, "samples", waveflow_config["model_name"]), exist_ok=True) os.chmod(os.path.join(output_directory, "samples", waveflow_config["model_name"]), 0o775) # synthesize loop model.eval() for i, batch in enumerate(synth_loader): with torch.no_grad(): mel, _, filename = batch mel = torch.autograd.Variable(mel.cuda()) if fp16_run: mel = mel.half() torch.cuda.synchronize() tic = time.time() audio = model.reverse_fast(mel, temp) torch.cuda.synchronize() toc = time.time() - tic print('{}: {:.4f} seconds, {:.4f}kHz'.format(i, toc, audio.shape[1] / toc / 1000)) audio = audio * MAX_WAV_VALUE audio = audio.squeeze() audio = audio.cpu().numpy() audio = audio.astype('int16') audio_path = os.path.join( os.path.join(output_directory, "samples", waveflow_config["model_name"]), "generate_{}_{}_t{}.wav".format(iteration, i, temp)) write(audio_path, data_config["sampling_rate"], audio) model.train()
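# The throughput printed above is samples generated per wall-clock second
# (shown in kHz). Dividing by the sampling rate gives the real-time factor,
# i.e. seconds of audio produced per second of compute. Small helper (sketch):
def real_time_factor(num_samples, synthesis_seconds, sampling_rate):
    """e.g. 220500 samples in 2.0 s at 22050 Hz -> RTF of 5.0x real time."""
    return (num_samples / sampling_rate) / synthesis_seconds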
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard): torch.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cpu() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 trainset = Mel2Samp(**data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=1, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, 'logs')) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): model.zero_grad() mel, audio = batch mel = torch.autograd.Variable(mel.cpu()) audio = torch.autograd.Variable(audio.cpu()) outputs = model((mel, audio)) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() print("{}:\t{:.9f}".format(iteration, reduced_loss)) if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) if (iteration % iters_per_checkpoint == 0): if rank == 0: checkpoint_path = "{}/waveglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
def train(output_directory, log_directory, checkpoint_path, warm_start, warm_start_force, n_gpus, rank, group_name, hparams): torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) #=====START: ADDED FOR DISTRIBUTED====== if n_gpus > 1: init_distributed(rank, n_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== model, criterion = getCore(hparams) #=====START: ADDED FOR DISTRIBUTED====== if n_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== STFT = [ TacotronSTFT(filter_length=window, hop_length=hparams.hop_length, win_length=window, sampling_rate=hparams.sampling_rate, n_mel_channels=160, mel_fmin=hparams.mel_fmin, mel_fmax=hparams.mel_fmax) for window in hparams.validation_windows ] optimizer = getOptimizer(model, hparams) if hparams.fp16_run: global amp from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level=hparams.fp16_opt_level, min_loss_scale=2.0) else: amp = None # LEARNING RATE SCHEDULER if hparams.LRScheduler.lower() == "ReduceLROnPlateau".lower(): from torch.optim.lr_scheduler import ReduceLROnPlateau min_lr = 1e-5 factor = 0.1**( 1 / 5) # amount to scale the LR by on Validation Loss plateau scheduler = ReduceLROnPlateau(optimizer, 'min', factor=factor, patience=20, cooldown=2, min_lr=min_lr, verbose=True) print("ReduceLROnPlateau used as Learning Rate Scheduler.") else: scheduler = None # Load checkpoint if one exists iteration = 0 if checkpoint_path: model, optimizer, iteration, scheduler = load_checkpoint( warm_start, warm_start_force, checkpoint_path, model, optimizer, scheduler, hparams.fp16_run) iteration += 1 # next iteration is iteration + 1 trainset = Mel2Samp(hparams) speaker_lookup = trainset.speaker_ids # =====START: ADDED FOR DISTRIBUTED====== if n_gpus > 1: train_sampler = DistributedSampler(trainset, shuffle=True) shuffle = False else: train_sampler = None shuffle = True # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=hparams.n_dataloader_workers, shuffle=shuffle, sampler=train_sampler, batch_size=hparams.batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if rank == 0: from tensorboardX import SummaryWriter if False: # dated and seperated log dirs for each run timestr = time.strftime("%Y_%m_%d-%H_%M_%S") log_directory = os.path.join(output_directory, log_directory, timestr) else: log_directory = os.path.join(output_directory, log_directory) logger = SummaryWriter(log_directory) moving_average = int(min(len(train_loader), 100)) # average loss over 100 iters rolling_sum = StreamingMovingAverage(moving_average) start_time = time.time() start_time_single_batch = time.time() model.train() if os.path.exists(os.path.join(output_directory, "best_train_model")): best_model_loss = float( str( open(os.path.join(output_directory, "best_train_model") + ".txt", "r", encoding="utf-8").read()).split("\n")[0]) else: best_model_loss = -4.20 if os.path.exists(os.path.join(output_directory, "best_val_model")): best_MSE = float( str( open(os.path.join(output_directory, "best_val_model") + ".txt", "r", encoding="utf-8").read()).split("\n")[0]) else: best_MSE = 9e9 epoch_offset = max(0, int(iteration / len(train_loader))) pytorch_total_params = sum(p.numel() for p in model.parameters()) print("{:,} total parameters.".format(pytorch_total_params)) 
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) print("{:,} trainable parameters.".format(pytorch_total_params)) learning_rate = hparams.learning_rate # ================ MAIN TRAINING LOOP! =================== for epoch in get_progress_bar(range(epoch_offset, hparams.epochs), dict(initial=epoch_offset, total=hparams.epochs, smoothing=0.01, desc="Epoch", position=1, unit="epoch"), hparams, rank=rank): cprint(f"Epoch: {epoch}", b_tqdm=hparams.tqdm) if n_gpus > 1: train_sampler.set_epoch(epoch) for i, batch in get_progress_bar(enumerate(train_loader), dict(desc=" Iter", smoothing=0, total=len(train_loader), position=0, unit="iter", leave=True), hparams, rank=rank): # run external code every iter, allows the run to be adjusted without restarts if (i == 0 or iteration % param_interval == 0): try: with open("hparams_realtime.py") as f: internal_text = str(f.read()) ldict = {'iteration': iteration} exec(internal_text, globals(), ldict) except Exception as ex: cprint(f"Custom code FAILED to run!\n{ex}", b_tqdm=hparams.tqdm) globals().update(ldict) locals().update(ldict) if show_live_params: cprint(internal_text, b_tqdm=hparams.tqdm) assert warmup_start <= iteration, "Current iteration less than warmup_start." # Learning Rate Schedule if custom_lr: old_lr = learning_rate if iteration < warmup_end: learning_rate = (iteration - warmup_start) * ( (A_ + C_) - warmup_start_lr ) / ( warmup_end - warmup_start ) + warmup_start_lr # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations. else: if iteration < decay_start: learning_rate = A_ + C_ else: iteration_adjusted = iteration - decay_start learning_rate = (A_ * (e**(-iteration_adjusted / B_))) + C_ assert learning_rate > -1e-8, "Negative Learning Rate." if old_lr != learning_rate: for param_group in optimizer.param_groups: param_group['lr'] = learning_rate else: scheduler.patience = scheduler_patience scheduler.cooldown = scheduler_cooldown if override_scheduler_last_lr: scheduler._last_lr = override_scheduler_last_lr cprint("Scheduler last_lr overriden. scheduler._last_lr =", scheduler._last_lr, b_tqdm=hparams.tqdm) if not iteration % 20: # check actual learning rate every 20 iters (because I sometimes see learning_rate variable go out-of-sync with real LR) learning_rate = optimizer.param_groups[0]['lr'] if override_scheduler_best: scheduler.best = override_scheduler_best cprint("Scheduler best metric overriden. scheduler.best =", override_scheduler_best, b_tqdm=hparams.tqdm) model.zero_grad() mel, audio, speaker_ids = batch mel = torch.autograd.Variable(mel.cuda(non_blocking=True)) audio = torch.autograd.Variable(audio.cuda(non_blocking=True)) if model.multispeaker: speaker_ids = torch.autograd.Variable( speaker_ids.cuda(non_blocking=True)).long().squeeze(1) outputs = model(mel, audio, speaker_ids) else: outputs = model(mel, audio) loss = criterion(outputs) if n_gpus > 1: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() assert reduced_loss < 1e5, "Model Diverged. 
Loss > 1e5" if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.b_grad_clip: if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm) else: is_overflow = False grad_norm = 0.00001 optimizer.step() if not is_overflow and rank == 0: if (iteration % 100000 == 0): # plot distribution of parameters for tag, value in model.named_parameters(): tag = tag.replace('.', '/') logger.add_histogram(tag, value.data.cpu().numpy(), iteration) logger.add_scalar('training_loss', reduced_loss, iteration) if (iteration % 20 == 0): logger.add_scalar('learning.rate', learning_rate, iteration) if (iteration % 10 == 0): logger.add_scalar('duration', ((time.time() - start_time) / 10), iteration) start_time_single_batch = time.time() average_loss = rolling_sum.process(reduced_loss) if rank == 0: if (iteration % 10 == 0): cprint( "{} {}: {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {:.2f}s/iter {:.4f}s/item" .format( time.strftime("%H:%M:%S"), iteration, reduced_loss, average_loss, round(grad_norm, 3), learning_rate, min((hparams.grad_clip_thresh / grad_norm) * learning_rate, learning_rate), (time.time() - start_time) / 10, ((time.time() - start_time) / 10) / (hparams.batch_size * n_gpus)), b_tqdm=hparams.tqdm) start_time = time.time() else: cprint( "{} {}: {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective)" .format( time.strftime("%H:%M:%S"), iteration, reduced_loss, average_loss, round(grad_norm, 3), learning_rate, min((hparams.grad_clip_thresh / grad_norm) * learning_rate, learning_rate)), b_tqdm=hparams.tqdm) if rank == 0 and (len(rolling_sum.values) > moving_average - 2): if (average_loss + best_model_margin) < best_model_loss: checkpoint_path = os.path.join(output_directory, "best_train_model") try: save_checkpoint(model, optimizer, hparams, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) except KeyboardInterrupt: # Avoid corrupting the model. save_checkpoint(model, optimizer, hparams, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8") text_file.write(str(average_loss) + "\n" + str(iteration)) text_file.close() best_model_loss = average_loss #Only save the model if X better than the current loss. if rank == 0 and ((iteration % hparams.iters_per_checkpoint == 0) or (os.path.exists(save_file_check_path))): checkpoint_path = f"{output_directory}/waveglow_{iteration}" save_checkpoint(model, optimizer, hparams, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) start_time_single_batch = time.time() if (os.path.exists(save_file_check_path)): os.remove(save_file_check_path) if (iteration % validation_interval == 0): if rank == 0: MSE, MAE = validate(model, STFT, logger, iteration, speaker_lookup, hparams, output_directory) if scheduler and n_gpus > 1: MSE = torch.tensor(MSE, device='cuda') broadcast(MSE, 0) scheduler.step(MSE.item()) if MSE < best_MSE: checkpoint_path = os.path.join( output_directory, "best_val_model") try: save_checkpoint(model, optimizer, hparams, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) except KeyboardInterrupt: # Avoid corrupting the model. 
save_checkpoint(model, optimizer, hparams, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8") text_file.write( str(MSE.item()) + "\n" + str(iteration)) text_file.close() best_MSE = MSE.item() # Only save the model if the validation MSE improved on the current best. else: if scheduler: MSE = torch.zeros(1, device='cuda') broadcast(MSE, 0) scheduler.step(MSE.item()) iteration += 1
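# --- Hedged sketch of StreamingMovingAverage, instantiated as rolling_sum in
# the training setup above but defined elsewhere. A minimal class that matches
# how it is used here (.process() returns the running mean over the last
# window_size values, .values holds the retained window) could be:
class StreamingMovingAverage:
    def __init__(self, window_size):
        self.window_size = window_size
        self.values = []
        self.sum = 0.0

    def process(self, value):
        self.values.append(value)
        self.sum += value
        if len(self.values) > self.window_size:
            self.sum -= self.values.pop(0)
        return self.sum / len(self.values)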