def load_model(hparams):
    model = Tacotron2(hparams).cuda()
    if hparams.fp16_run:
        model.decoder.attention_layer.score_mask_value = finfo('float16').min
    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)
    return model
def load_model(hparams, use_cuda=True):
    device = torch.device('cuda' if use_cuda else 'cpu')
    model = Tacotron2(hparams).to(device)
    if hparams.fp16_run:
        model.decoder.attention_layer.score_mask_value = finfo('float16').min
    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)
    return model
def load_model(hparams):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Tacotron2(hparams).to(device)
    if hparams.fp16_run:
        model.decoder.attention_layer.score_mask_value = finfo('float16').min
    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)
    return model
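# The load_model variants above all defer distributed wrapping to apply_gradient_allreduce,
# which is not defined in this section. A minimal sketch of what such a wrapper is assumed
# to do (this is not the NVIDIA implementation): broadcast the initial weights from rank 0
# so every worker starts from the same state, then register per-parameter hooks that average
# gradients across workers after each backward pass.
import torch.distributed as dist


def apply_gradient_allreduce_sketch(module):
    # Synchronize initial parameters across ranks.
    for p in module.parameters():
        dist.broadcast(p.data, src=0)

    def _average_grad(grad):
        # Sum the gradient over all workers, then divide to get the mean.
        dist.all_reduce(grad, op=dist.ReduceOp.SUM)
        return grad / dist.get_world_size()

    for p in module.parameters():
        if p.requires_grad:
            p.register_hook(_average_grad)
    return module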
def load_model(hparams):
    # The argument is hparams; Cascaded_Tacotron2 inherits from nn.Module, so .cuda() is available.
    model = Cascaded_Tacotron2(hparams).cuda()
    if hparams.fp16_run:  # False
        model.decoder.attention_layer.score_mask_value = finfo('float16').min  # ?
    if hparams.distributed_run:  # False
        model = apply_gradient_allreduce(model)
    return model
def load_Tacotron2(hparams, device=torch.device('cuda')):
    model = Tacotron2(hparams).to(device)
    if hparams.fp16_run:
        model = batchnorm_to_float(model.half())
        model = lstmcell_to_float(model)
        model.decoder.attention_layer.score_mask_value = float(finfo('float16').min)
    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)
    return model, Tacotron2Loss()
def load_model(hyper_params):
    # according to the documentation, it is recommended to move a model to GPU
    # before constructing the optimizer
    model = tacotron_2(hyper_params).cuda()
    if hyper_params['fp16_run']:  # converts everything into half type (16 bits)
        model = batchnorm_to_float(model.half())
        model.decoder.attention_layer.score_mask_value = float(finfo('float16').min)
    if hyper_params['distributed_run']:
        model = apply_gradient_allreduce(model)
    return model
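# The two fp16 loaders above call batchnorm_to_float (and, in one case, lstmcell_to_float)
# after model.half(). A hedged sketch of what these helpers are assumed to do: walk the
# module tree and cast normalization (or LSTMCell) layers back to fp32, since those layers
# are commonly kept in full precision for numerical stability.
import torch.nn as nn


def batchnorm_to_float(module):
    # Recursively cast BatchNorm layers back to fp32.
    if isinstance(module, nn.modules.batchnorm._BatchNorm):
        module.float()
    for child in module.children():
        batchnorm_to_float(child)
    return module


def lstmcell_to_float(module):
    # Assumed analogue of the helper above for LSTMCell layers.
    if isinstance(module, nn.LSTMCell):
        module.float()
    for child in module.children():
        lstmcell_to_float(child)
    return module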
def load_model(hparams, device="cuda"):
    if hparams.model_type == "tacotron2":
        model = Tacotron2(hparams).to(device)
        model.requires_durations = False
    elif hparams.model_type == "forwardtacotron":
        model = ForwardTacotron(hparams,
                                num_chars=hparams.n_symbols,
                                n_mels=hparams.n_mel_channels).to(device)
        model.requires_durations = True
    elif hparams.model_type == "durationtacotron2":
        model = DurationTacotron2().to(device)
        model.requires_durations = True
    if hparams.fp16_run:
        model.decoder.attention_layer.score_mask_value = finfo('float16').min
    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)
    return model
def setup_model(distributed: bool, rank: int, world_size: int, group_name: str,
                checkpoint: str, learning_rate: float, fp16: bool, sync_bn: bool,
                warm_load_keys: tuple):
    """Create model, cast to fp16 if needed, load checkpoint, apply DDP and sync_bn if needed"""
    torch.cuda.set_device(rank)
    model = Model(fp16=fp16)
    # move model to appropriate gpu when using distributed training
    model = model.to(rank)
    opt = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # order is important: cast to fp16 first, load fp16 checkpoint (with amp weights),
    # apply DDP, apply sync_bn
    # as stated here: https://github.com/NVIDIA/apex/tree/master/examples/imagenet
    if fp16:
        # Initialization with apex
        opt_level = 'O2'
        model, opt = amp.initialize(model, opt, opt_level=opt_level)

    iteration = 0
    best_metric = 1e3
    if checkpoint:
        iteration, best_metric = load_checkpoint(checkpoint, model, opt, fp16,
                                                 rank, warm_load_keys)
        print(f"resuming from {iteration} iteration")

    if distributed:
        # set default .to('cuda') behavior to current local rank
        setup_distributed(rank, world_size, group_name)
        # run with python -m torch.distributed.launch --nproc_per_node=NUM_GPUS train.py
        model = apply_gradient_allreduce(model)

    if sync_bn:
        from apex.parallel import convert_syncbn_model
        if rank == 0:
            print("using apex synced BN")
        model = convert_syncbn_model(model)

    return model, opt, iteration, best_metric
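# Hedged usage sketch for setup_model above (everything here is assumed, not part of the
# original code): with the classic torch.distributed.launch launcher, each process receives
# a --local_rank argument, which is used here as both the device index and the process rank.
import argparse
import torch

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()

    model, opt, iteration, best_metric = setup_model(
        distributed=torch.cuda.device_count() > 1,
        rank=args.local_rank,
        world_size=torch.cuda.device_count(),  # single-node assumption
        group_name='group_name',
        checkpoint='',
        learning_rate=1e-3,
        fp16=True,
        sync_bn=False,
        warm_load_keys=())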
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    torch.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cpu()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler, batch_size=batch_size,
                              pin_memory=False, drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cpu())
            audio = torch.autograd.Variable(audio.cpu())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
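# The loops in this section average the loss across workers with reduce_tensor. A hedged
# sketch of that helper, following the pattern commonly used in NVIDIA's distributed
# training examples (assumed, since the helper is not defined in this section):
import torch.distributed as dist


def reduce_tensor(tensor, n_gpus):
    # Sum the tensor over all workers, then divide by the number of GPUs to get the mean.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n_gpus
    return rt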
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) model = initiate_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() logger = prepare_directories_and_logger(output_directory, log_directory, rank) single_train_loader, single_valset, single_collate_fn, single_train_sampler = prepare_single_dataloaders( hparams, output_directory) train_loader, valset, collate_fn, train_sampler = prepare_dataloaders( hparams, output_directory) single_train_loader.dataset.speaker_ids = train_loader.dataset.speaker_ids single_valset.speaker_ids = train_loader.dataset.speaker_ids # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: # model = torch.nn.DataParallel(model) model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(single_train_loader))) model = torch.nn.DataParallel(model) model.train() is_overflow = False # init training loop with single speaker for epoch in range(epoch_offset, 30): print("Epoch: {}".format(epoch)) if single_train_sampler is not None: single_train_sampler.set_epoch(epoch) for i, batch in enumerate(single_train_loader): start = time.perf_counter() if iteration > 0 and iteration % hparams.learning_rate_anneal == 0: learning_rate = max(hparams.learning_rate_min, learning_rate * 0.5) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = parse_batch(batch) mel_outputs, mel_outputs_postnet, gate_outputs, alignments, length = model( x) y_pred = parse_output( [mel_outputs, mel_outputs_postnet, gate_outputs, alignments], length) loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) if not is_overflow and (iteration % 
hparams.iters_per_checkpoint == 0): validate(model, criterion, single_valset, iteration, hparams.batch_size, n_gpus, single_collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model.module, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1 # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(30, hparams.epochs): print("Epoch: {}".format(epoch)) if train_sampler is not None: train_sampler.set_epoch(epoch) for i, batch in enumerate(train_loader): start = time.perf_counter() if iteration > 0 and iteration % hparams.learning_rate_anneal == 0: learning_rate = max(hparams.learning_rate_min, learning_rate * 0.5) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = parse_batch(batch) mel_outputs, mel_outputs_postnet, gate_outputs, alignments, length = model( x) y_pred = parse_output( [mel_outputs, mel_outputs_postnet, gate_outputs, alignments], length) loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model.module, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
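# The Tacotron2-style loops above save and resume with save_checkpoint / load_checkpoint.
# A hedged sketch of the minimal versions these calls appear to assume (signatures vary
# slightly between the variants in this section):
import torch


def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    torch.save({'iteration': iteration,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, filepath)


def load_checkpoint(checkpoint_path, model, optimizer):
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint_dict['state_dict'])
    optimizer.load_state_dict(checkpoint_dict['optimizer'])
    learning_rate = checkpoint_dict['learning_rate']
    iteration = checkpoint_dict['iteration']
    print("Loaded checkpoint '{}' from iteration {}".format(
        checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration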
def load_model(hparams):
    model = Parrot(hparams).cuda()
    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)
    return model
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) model = load_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() logger = prepare_directories_and_logger(output_directory, log_directory, rank) train_loader, valset, collate_fn = prepare_dataloaders(hparams) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() is_overflow = False skipped = 0 # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start = time.perf_counter() for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) try: y_pred = model(x) except ValueError: skipped += 1 print( 'Skipped an iteration due to value error, you have now skipped {} iterations' .format(skipped)) iteration += 1 continue loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0) and iteration > 0: validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
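# Several of the training functions in this section support --warm_start via
# warm_start_model. A hedged sketch of the usual implementation (assumed; the variant
# further down that also returns an iteration and a speaker lookup would differ): load only
# the model weights from a checkpoint and drop any layers listed in ignore_layers, e.g.
# embeddings whose shape no longer matches the new symbol set.
import torch


def warm_start_model(checkpoint_path, model, ignore_layers):
    print("Warm starting model from checkpoint '{}'".format(checkpoint_path))
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    model_dict = checkpoint_dict['state_dict']
    if len(ignore_layers) > 0:
        model_dict = {k: v for k, v in model_dict.items()
                      if k not in ignore_layers}
        dummy_dict = model.state_dict()
        dummy_dict.update(model_dict)
        model_dict = dummy_dict
    model.load_state_dict(model_dict)
    return model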
def train(n_gpus, rank, output_directory, epochs, optim_algo, learning_rate, weight_decay, sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path, ignore_layers, include_layers, finetune_layers, warmstart_checkpoint_path, with_tensorboard, grad_clip_val, fp16_run, tensorboard_path=None): fp16_run = bool(fp16_run) torch.manual_seed(seed) torch.cuda.manual_seed(seed) if n_gpus > 1: init_distributed(rank, n_gpus, **dist_config) criterion = FlowtronLoss(sigma, bool(model_config['n_components']), bool(model_config['use_gate_layer'])) model = Flowtron(**model_config).cuda() if len(finetune_layers): for name, param in model.named_parameters(): if name in finetune_layers: param.requires_grad = True else: param.requires_grad = False print("Initializing %s optimizer" % (optim_algo)) if optim_algo == 'Adam': optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) elif optim_algo == 'RAdam': optimizer = RAdam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) else: print("Unrecognized optimizer %s!" % (optim_algo)) exit(1) # Load checkpoint if one exists iteration = 0 if warmstart_checkpoint_path != "": model = warmstart(warmstart_checkpoint_path, model) if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer, ignore_layers) iteration += 1 # next iteration is iteration + 1 if n_gpus > 1: model = apply_gradient_allreduce(model) print(model) scaler = amp.GradScaler(enabled=fp16_run) train_loader, valset, collate_fn = prepare_dataloaders( data_config, n_gpus, batch_size) # Get shared output_directory ready if rank == 0 and not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("Output directory", output_directory) if with_tensorboard and rank == 0: tboard_out_path = tensorboard_path if tensorboard_path is None: tboard_out_path = os.path.join(output_directory, "logs/run1") print("Setting up Tensorboard log in %s" % (tboard_out_path)) logger = FlowtronLogger(tboard_out_path) # force set the learning rate to what is specified for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for batch in train_loader: model.zero_grad() mel, speaker_vecs, text, in_lens, out_lens, gate_target, attn_prior = batch mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda( ), text.cuda() in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda( ), gate_target.cuda() attn_prior = attn_prior.cuda() if valset.use_attn_prior else None with amp.autocast(enabled=fp16_run): z, log_s_list, gate_pred, attn, mean, log_var, prob = model( mel, speaker_vecs, text, in_lens, out_lens, attn_prior) loss_nll, loss_gate = criterion( (z, log_s_list, gate_pred, mean, log_var, prob), gate_target, out_lens) loss = loss_nll + loss_gate if n_gpus > 1: reduced_loss = reduce_tensor(loss.data, n_gpus).item() reduced_gate_loss = reduce_tensor(loss_gate.data, n_gpus).item() reduced_nll_loss = reduce_tensor(loss_nll.data, n_gpus).item() else: reduced_loss = loss.item() reduced_gate_loss = loss_gate.item() reduced_nll_loss = loss_nll.item() scaler.scale(loss).backward() if grad_clip_val > 0: scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_val) scaler.step(optimizer) scaler.update() if rank == 0: print("{}:\t{:.9f}".format(iteration, reduced_loss), flush=True) if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, iteration) logger.add_scalar('training_loss_gate', reduced_gate_loss, iteration) logger.add_scalar('training_loss_nll', reduced_nll_loss, iteration) logger.add_scalar('learning_rate', learning_rate, iteration) if iteration % iters_per_checkpoint == 0: val_loss, val_loss_nll, val_loss_gate, attns, gate_pred, gate_target = compute_validation_loss( model, criterion, valset, collate_fn, batch_size, n_gpus) if rank == 0: print("Validation loss {}: {:9f} ".format( iteration, val_loss)) if with_tensorboard: logger.log_validation(val_loss, val_loss_nll, val_loss_gate, attns, gate_pred, gate_target, iteration) checkpoint_path = "{}/model_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
def train(output_directory, log_directory, checkpoint_path, warm_start, warm_start_force, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ # setup distributed hparams.n_gpus = n_gpus hparams.rank = rank if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) # reproducablilty stuffs torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) # initialize blank model model = load_model(hparams) model.eval() # initialize blank discriminator discriminator = load_model_d(hparams) discriminator.eval() # (optional) show the names of each layer in model, mainly makes it easier to copy/paste what you want to adjust if hparams.print_layer_names_during_startup: print(*[f"Layer{i} = "+str(x[0])+" "+str(x[1].shape) for i,x in enumerate(list(model.named_parameters()))], sep="\n") # (optional) Freeze layers by disabling grads if len(hparams.frozen_modules): for layer, params in list(model.named_parameters()): if any(layer.startswith(module) for module in hparams.frozen_modules): params.requires_grad = False print(f"Layer: {layer} has been frozen") # define optimizer (any params without requires_grad are ignored) if True: optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.0, weight_decay=hparams.weight_decay) discriminator_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, discriminator.parameters()), lr=0.0, weight_decay=hparams.weight_decay) else: optimizer = apexopt.FusedAdam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.0, weight_decay=hparams.weight_decay) discriminator_optimizer = apexopt.FusedAdam(filter(lambda p: p.requires_grad, discriminator.parameters()), lr=0.0, weight_decay=hparams.weight_decay) if hparams.fp16_run: opt_level = 'O1' model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) discriminator, discriminator_optimizer = amp.initialize(discriminator, discriminator_optimizer, opt_level=opt_level) if hparams.distributed_run: model = apply_gradient_allreduce(model) discriminator = apply_gradient_allreduce(discriminator) logger = prepare_directories_and_logger( output_directory, log_directory, rank, hparams.sampling_rate) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 saved_lookup = None if checkpoint_path is not None: if warm_start: model, iteration, saved_lookup = warm_start_model(checkpoint_path, model, hparams.ignore_layers) elif warm_start_force: model, iteration, saved_lookup = warm_start_force_model(checkpoint_path, model) else: model, optimizer, discriminator, discriminator_optimizer, iteration, saved_lookup = load_checkpoint( checkpoint_path, model, optimizer, discriminator, discriminator_optimizer) iteration += 1 # next iteration is iteration + 1 print('Model Loaded') # define datasets/dataloaders train_loader, valset, collate_fn, train_sampler, trainset = prepare_dataloaders(hparams, saved_lookup) epoch_offset = max(0, int(iteration / len(train_loader))) speaker_lookup = trainset.speaker_ids model.train() discriminator.train() is_overflow = False rolling_loss = StreamingMovingAverage(min(int(len(train_loader)), 200)) rolling_d_loss = StreamingMovingAverage(min(int(len(train_loader)), 200)) # 
================ MAIN TRAINNIG LOOP! =================== for epoch in tqdm(range(epoch_offset, hparams.epochs), initial=epoch_offset, total=hparams.epochs, desc="Epoch:", position=1, unit="epoch"): tqdm.write("Epoch:{}".format(epoch)) if hparams.distributed_run: # shuffles the train_loader when doing multi-gpu training train_sampler.set_epoch(epoch) start_time = time.time() # start iterating through the epoch for i, batch in tqdm(enumerate(train_loader), desc="Iter: ", smoothing=0, total=len(train_loader), position=0, unit="iter"): ################################### ### Live Learning Rate & Params ### ################################### if (iteration % 10 == 0 or i==0): try: with open("run_every_epoch.py") as f: internal_text = str(f.read()) if len(internal_text) > 0: ldict = {'iteration': iteration} exec(internal_text, globals(), ldict) else: print("[info] tried to execute 'run_every_epoch.py' but it is empty") except Exception as ex: print(f"[warning] 'run_every_epoch.py' FAILED to execute!\nException:\n{ex}") globals().update(ldict) locals().update(ldict) if iteration < decay_start: learning_rate = A_ + C_ else: iteration_adjusted = iteration - decay_start learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_ learning_rate = max(min_learning_rate, learning_rate) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate for param_group in discriminator_optimizer.param_groups: param_group['lr'] = learning_rate * descriminator_loss_scale # /run external code every epoch, allows the run to be adjusting without restarts/ ######################### ### Model Stuff ### ######################### model.zero_grad() x = model.parse_batch(batch) # move batch to GPU (async) true_labels = torch.zeros(hparams.batch_size, device=x[0].device, dtype=x[0].dtype)# [B] fake_labels = torch.ones( hparams.batch_size, device=x[0].device, dtype=x[0].dtype)# [B] pred_audio, pred_durations = model(x) model_fakeness = discriminator(pred_audio, x[1]) # [B] -> [] predict fakeness of generated samples model_loss = nn.BCELoss()(model_fakeness, true_labels) # calc loss to decrease fakeness of model reduced_model_loss = reduce_tensor(model_loss.data, n_gpus).item() if hparams.distributed_run else model_loss.item() if hparams.fp16_run: with amp.scale_loss(model_loss, optimizer) as scaled_loss: scaled_loss.backward() else: model_loss.backward() if grad_clip_thresh > 0.0: if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), grad_clip_thresh) is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), grad_clip_thresh) optimizer.step() ############################# ### Discriminator Stuff ### ############################# # Train the discriminator on the true/generated data discriminator_optimizer.zero_grad() true_fakeness = discriminator(x[0], x[1])# predicted fakeness of the actual audio sample. true_discriminator_loss = nn.BCELoss()(true_fakeness, true_labels)# loss for predicted fakeness of actual real audio. 
# add .detach() here think about this model_fakeness = discriminator(pred_audio.detach(), x[1]) fake_discriminator_loss = nn.BCELoss()(model_fakeness, fake_labels)# calc loss to increase fakeness of discriminator when it sees these samples discriminator_loss = (true_discriminator_loss + fake_discriminator_loss) / 2 reduced_discriminator_loss = reduce_tensor(discriminator_loss.data, n_gpus).item() if hparams.distributed_run else discriminator_loss.item() if hparams.fp16_run: with amp.scale_loss(discriminator_loss, discriminator_optimizer) as scaled_d_loss: scaled_d_loss.backward() else: discriminator_loss.backward() if grad_clip_thresh > 0.0: if hparams.fp16_run: grad_norm_d = torch.nn.utils.clip_grad_norm_( amp.master_params(discriminator_optimizer), grad_clip_thresh) is_overflow = math.isinf(grad_norm_d) or math.isnan(grad_norm_d) else: grad_norm_d = torch.nn.utils.clip_grad_norm_( discriminator.parameters(), grad_clip_thresh) discriminator_optimizer.step() ######################### ### Logging Metrics ### ######################### if not is_overflow and rank == 0: duration = time.time() - start_time average_loss = rolling_loss.process(reduced_model_loss) average_d_loss = rolling_d_loss.process(reduced_discriminator_loss) tqdm.write("{} [Train_loss {:.4f} Avg {:.4f}] [Descrim_loss {:.4f} Avg {:.4f}] [Grad Norm {:.4f} D {:.4f}] [{:.2f}s/it] [{:.3f}s/file] [{:.7f} LR]".format( iteration, reduced_model_loss, average_loss, reduced_discriminator_loss, average_d_loss, grad_norm, grad_norm_d, duration, (duration/(hparams.batch_size*n_gpus)), learning_rate) ) logger.log_training(iteration, reduced_model_loss, reduced_discriminator_loss, grad_norm, grad_norm_d, learning_rate, duration) start_time = time.time() elif is_overflow and rank == 0: tqdm.write("Gradient Overflow! Skipping Step") ######################### ### Save Checkpoints? ### ######################### if not is_overflow and (iteration%hparams.iters_per_checkpoint == 0 or os.path.exists(save_file_check_path)): # save model checkpoint like normal if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, discriminator, discriminator_optimizer, learning_rate, iteration, hparams, speaker_lookup, checkpoint_path) if rank == 0 and os.path.exists(save_file_check_path): os.remove(save_file_check_path) ################################################ ### Valiation (Pred Spectrograms / MOSNet) ### ################################################ if not is_overflow and (iteration%hparams.iters_per_validation==0 or (iteration < 1000 and iteration%250==0)): # perform validation and save "best_val_model" depending on validation loss val_loss = validate(model, valset, iteration, hparams.val_batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) #if rank == 0 and val_loss < best_validation_loss: # checkpoint_path = os.path.join(output_directory, "best_val_model") # save_checkpoint(model, optimizer, discriminator, discriminator_optimizer, learning_rate, iteration, hparams, speaker_lookup, checkpoint_path) #best_validation_loss = min(val_loss, best_validation_loss) iteration += 1
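# The GAN-style loop above smooths its logged losses with StreamingMovingAverage. A hedged
# sketch of such a helper (assumed): keep the last window_size values and return their mean.
from collections import deque


class StreamingMovingAverage:
    def __init__(self, window_size):
        self.values = deque(maxlen=window_size)

    def process(self, value):
        self.values.append(value)
        return sum(self.values) / len(self.values)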
def train(n_gpus, rank, output_directory, epochs, learning_rate, weight_decay, sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path, ignore_layers, include_layers, warmstart_checkpoint_path, with_tensorboard, fp16_run): torch.manual_seed(seed) torch.cuda.manual_seed(seed) if n_gpus > 1: init_distributed(rank, n_gpus, **dist_config) criterion = FlowtronLoss(sigma, bool(model_config['n_components']), model_config['use_gate_layer']) model = Flowtron(**model_config).cuda() optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) # Load checkpoint if one exists iteration = 0 if warmstart_checkpoint_path != "": model = warmstart(warmstart_checkpoint_path, model) if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer, ignore_layers) iteration += 1 # next iteration is iteration + 1 if n_gpus > 1: model = apply_gradient_allreduce(model) print(model) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') train_loader, valset, collate_fn = prepare_dataloaders( data_config, n_gpus, batch_size) # Get shared output_directory ready if rank == 0 and not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: logger = FlowtronLogger(os.path.join(output_directory, 'logs')) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for batch in train_loader: model.zero_grad() mel, speaker_vecs, text, in_lens, out_lens, gate_target = batch mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda( ), text.cuda() in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda( ), gate_target.cuda() z, log_s_list, gate_pred, attn, mean, log_var, prob = model( mel, speaker_vecs, text, in_lens, out_lens) loss = criterion((z, log_s_list, gate_pred, mean, log_var, prob), gate_target, out_lens) if n_gpus > 1: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() if rank == 0: print("{}:\t{:.9f}".format(iteration, reduced_loss), flush=True) if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, iteration) logger.add_scalar('learning_rate', learning_rate, iteration) if (iteration % iters_per_checkpoint == 0): val_loss, attns, gate_pred, gate_target = compute_validation_loss( model, criterion, valset, collate_fn, batch_size, n_gpus) if rank == 0: print("Validation loss {}: {:9f} ".format( iteration, val_loss)) if with_tensorboard: logger.log_validation(val_loss, attns, gate_pred, gate_target, iteration) checkpoint_path = "{}/model_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
def infer(output_directory, checkpoint_path, warm_start, hparams, debug=False): """Inference with teaching force Params ------ output_directory (string): directory to the spectrograms checkpoint_path(string): checkpoint path hparams (object): comma separated list of "name=value" pairs. """ os.makedirs(output_directory, exist_ok=True) taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length, sampling_rate=hparams.sampling_rate) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) model = load_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) return_file_name = True trainset = TextMelLoader(hparams.training_files, hparams, return_file_name=return_file_name) collate_fn = TextMelCollate(hparams.n_frames_per_step, return_file_name=return_file_name) train_sampler = None train_loader = DataLoader(trainset, num_workers=1, shuffle=False, sampler=train_sampler, batch_size=hparams.batch_size, pin_memory=False, collate_fn=collate_fn) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.eval() for i, batch in enumerate(train_loader): x, y = model.parse_batch(batch[:][:-1]) files_name = batch[:][-1] mel_outputs, mel_outputs_postnet, _, alignments = model(x) _, _, mel_expected_padded, _, mel_lengths = x for idx in range(mel_outputs_postnet.size(0)): name = os.path.basename(files_name[idx]).replace(".wav", '') mel_padded = mel_outputs_postnet[idx] mel_length = mel_lengths[idx] mel = mel_padded[:, :mel_length] np.save(os.path.join(output_directory, name + '.npy'), mel.detach().cpu().numpy()) if debug: print( "Debug Mode ON: Saving Wave files and Spectrograms Plot in:", output_directory) # plot audios librosa.output.write_wav( os.path.join(output_directory, name + '.wav'), spec_to_waveform(taco_stft, mel).detach().cpu().numpy(), sr=hparams.sampling_rate) librosa.output.write_wav( os.path.join(output_directory, name + '_padded.wav'), spec_to_waveform(taco_stft, mel_padded).detach().cpu().numpy(), sr=hparams.sampling_rate) librosa.output.write_wav( os.path.join(output_directory, name + '_expected_padded.wav'), spec_to_waveform( taco_stft, mel_expected_padded[idx]).detach().cpu().numpy(), sr=hparams.sampling_rate) # plot figures plot_spectrogram(mel.detach().cpu().numpy(), ) plot_spectrogram( mel_padded.detach().cpu().numpy(), os.path.join(output_directory, name + '_padded.png')) plot_spectrogram( mel_expected_padded[idx].detach().cpu().numpy(), os.path.join(output_directory, name + '_expect_padded.png'))
def train(experiment, output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams, max_steps=150000): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): hparams object containing configuration. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) # create model - does not load weights yet model = load_model(hparams) global_mean_path = os.path.join(experiment.paths["acoustic_features"], "global_mean.npy") train_loader, trainset, valset, collate_fn = prepare_dataloaders( experiment, hparams, model.requires_durations) if hparams.drop_frame_rate > 0.: global_mean = calculate_global_mean(train_loader, global_mean_path) hparams.global_mean = global_mean learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) if hparams.model_type == "forwardtacotron": print("Using ForwardTacotronLoss") criterion = ForwardTacotronLoss() else: print("Using TacotronLoss") criterion = Tacotron2Loss() logger = prepare_directories_and_logger(output_directory, log_directory, rank, hparams.model_type) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() is_overflow = False # ================ MAIN TRAINNIG LOOP! =================== #for epoch in range(epoch_offset, hparams.epochs): epoch = epoch_offset while iteration < max_steps: print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start = time.perf_counter() for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) mel_lens = x[4] if model.requires_durations: dur = x[7] else: dur = None y_pred = model(x) loss, loginfo = criterion(y_pred, y, mel_lens, dur) if model.mi is not None: # transpose to [b, T, dim] decoder_outputs = y_pred[0].transpose(2, 1) ctc_text, ctc_text_lengths, aco_lengths = x[-2], x[-1], x[4] taco_loss = loss mi_loss = model.mi(decoder_outputs, ctc_text, aco_lengths, ctc_text_lengths, dur) if hparams.use_gaf: if i % gradient_adaptive_factor.UPDATE_GAF_EVERY_N_STEP == 0: safe_loss = 0. 
* sum( [x.sum() for x in model.parameters()]) gaf = gradient_adaptive_factor.calc_grad_adapt_factor( taco_loss + safe_loss, mi_loss + safe_loss, model.parameters(), optimizer) gaf = min(gaf, hparams.max_gaf) else: gaf = 1.0 loss = loss + gaf * mi_loss else: taco_loss = loss mi_loss = torch.tensor([-1.0]) gaf = -1.0 if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() taco_loss = reduce_tensor(taco_loss.data, n_gpus).item() mi_loss = reduce_tensor(mi_loss.data, n_gpus).item() else: reduced_loss = loss.item() taco_loss = taco_loss.item() mi_loss = mi_loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print("Train loss {} {:.4f} mi_loss {:.4f} Grad Norm {:.4f} " "gaf {:.4f} {:.2f}s/it".format(iteration, taco_loss, mi_loss, grad_norm, gaf, duration)) logger.log_training(loginfo, reduced_loss, taco_loss, mi_loss, grad_norm, gaf, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) best_checkpoint_path = os.path.join( output_directory, "checkpoint_best".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, best_checkpoint_path) iteration += 1 epoch += 1 # generate GTA features and leave train_loader_tmp = DataLoader(trainset, num_workers=0, shuffle=False, batch_size=hparams.batch_size, pin_memory=False, drop_last=False, collate_fn=collate_fn) val_loader = DataLoader(valset, num_workers=0, shuffle=False, batch_size=hparams.batch_size, pin_memory=False, collate_fn=collate_fn, drop_last=False) create_gta_features(experiment, model, train_loader_tmp, val_loader)
def train(num_gpus, rank, group_name, output_directory, log_directory, checkpoint_path, hparams): torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(hparams.sigma) model = WaveGlow(hparams).cuda() Taco2 = load_pretrained_taco('tacotron2.pt', hparams) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path: model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 trainset = TextMelLoader(hparams.training_files, hparams) collate_fn = TextMelCollate() # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== batch_size = hparams.batch_size train_loader = DataLoader(trainset, num_workers=0, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True, collate_fn=collate_fn) # Get shared output_directory readya if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if hparams.with_tensorboard and rank == 0: logger = prepare_directories_and_logger(output_directory, log_directory) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) print("Total Epochs: {}".format(hparams.epochs)) print("Batch Size: {}".format(hparams.batch_size)) print("learning rate: {}".format(hparams.learning_rate)) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): model.zero_grad() text_padded, input_lengths, mel_padded, max_len, output_lengths = parse_batch( batch) with torch.no_grad(): enc_outputs, alignments = Taco2( (text_padded, input_lengths, mel_padded, max_len, output_lengths)) # mel_padded = mel_padded.transpose(1, 2) # mel_padded = mel_padded / torch.abs(mel_padded).max().item() mel_pos = torch.arange(1000) mel_pos = to_gpu(mel_pos).long().unsqueeze(0) mel_pos = mel_pos.expand(hparams.batch_size, -1) src_pos = torch.arange(hparams.n_position) src_pos = to_gpu(src_pos).long().unsqueeze(0) src_pos = src_pos.expand(hparams.batch_size, -1) mel_padded = (mel_padded + 5) / 10 z, log_s_list, log_det_w_list, dec_enc_attn = model( mel_padded, enc_outputs, mel_pos, src_pos, input_lengths) outputs = (z, log_s_list, log_det_w_list, dec_enc_attn) loss = criterion(outputs, alignments) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() print("{}:\t{:.9f}".format(iteration, reduced_loss)) if hparams.with_tensorboard and rank == 0: logger.log_training(reduced_loss, grad_norm, learning_rate, iteration) if (iteration % hparams.iters_per_checkpoint == 0): if rank == 0: mel_predict, test_attn = model.test( mel_padded, enc_outputs, mel_pos, src_pos, input_lengths) logger.log_alignment(model, dec_enc_attn, alignments, mel_padded, mel_predict, test_attn, iteration) checkpoint_path = "{}/waveglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
def train(num_gpus, rank, group_name, output_directory, log_directory, checkpoint_path): # Get device device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') torch.manual_seed(hp.seed) torch.cuda.manual_seed(hp.seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(hp.sigma) model = WaveGlow().cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== learning_rate = hp.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if hp.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path: model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 # Get dataset dataset = FastSpeechDataset() # Get training loader print("Get Training Loader") training_loader = DataLoader(dataset, batch_size=hp.batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True, num_workers=cpu_count()) if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if hp.with_tensorboard and rank == 0: logger = prepare_directories_and_logger(output_directory, log_directory) model = model.train() epoch_offset = max(0, int(iteration / len(training_loader))) beta = hp.batch_size print("Total Epochs: {}".format(hp.epochs)) print("Batch Size: {}".format(hp.batch_size)) # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, hp.epochs): print("Epoch: {}".format(epoch)) for i, data_of_batch in enumerate(training_loader): model.zero_grad() if not hp.pre_target: # Prepare Data src_seq = data_of_batch["texts"] src_pos = data_of_batch["pos"] mel_tgt = data_of_batch["mels"] src_seq = torch.from_numpy(src_seq).long().to(device) src_pos = torch.from_numpy(src_pos).long().to(device) mel_tgt = torch.from_numpy(mel_tgt).float().to(device) alignment_target = get_alignment(src_seq, tacotron2).float().to(device) # For Data Parallel mel_max_len = mel_tgt.size(1) else: # Prepare Data src_seq = data_of_batch["texts"] src_pos = data_of_batch["pos"] mel_tgt = data_of_batch["mels"] alignment_target = data_of_batch["alignment"] src_seq = torch.from_numpy(src_seq).long().to(device) src_pos = torch.from_numpy(src_pos).long().to(device) mel_tgt = torch.from_numpy(mel_tgt).float().to(device) alignment_target = torch.from_numpy( alignment_target).float().to(device) # For Data Parallel mel_max_len = mel_tgt.size(1) outputs = model(src_seq, src_pos, mel_tgt, mel_max_len, alignment_target) _, _, _, duration_predictor = outputs mel_tgt = mel_tgt.transpose(1, 2) max_like, dur_loss = criterion(outputs, alignment_target, mel_tgt) if beta > 1 and iteration % 10000 == 0: beta = beta // 2 loss = max_like + dur_loss if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if hp.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() #grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.grad_clip_thresh) optimizer.step() print("{}:\t{:.9f}".format(iteration, reduced_loss)) if hp.with_tensorboard and rank == 0: logger.log_training(reduced_loss, dur_loss, learning_rate, iteration) if 
(iteration % hp.save_step == 0): if rank == 0: # logger.log_alignment(model, mel_predict, mel_tgt, iteration) checkpoint_path = "{}/TTSglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
def train(input_directory, output_directory, log_directory, checkpoint_path,
          warm_start, n_gpus, rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string) directory to save tensorboard logs
    checkpoint_path(string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    # torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory, rank)

    train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(
        input_directory, hparams)

    # Record the training metadata.
    meta_folder = os.path.join(output_directory, 'metadata')
    os.makedirs(meta_folder, exist_ok=True)

    path = os.path.join(meta_folder, "speakers.json")
    obj = dict(valset.speaker_ids)
    json_dump(obj, path)

    path = os.path.join(meta_folder, "hparams.json")
    obj = {k: v for k, v in hparams.items()}
    json_dump(obj, path)

    path = os.path.join(meta_folder, "symbols.json")
    from text.symbols import symbols
    obj = {w: i for i, w in enumerate(symbols)}
    json_dump(obj, path)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model, hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            if iteration > 0 and iteration % hparams.learning_rate_anneal == 0:
                learning_rate = max(hparams.learning_rate_min,
                                    learning_rate * 0.5)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                    iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0):
                validate(model, criterion, valset, iteration,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank,
                         outdir=Path(output_directory))
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint-{:06d}.pt".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
def train(num_gpus, rank, group_name, output_directory, epochs, init_lr, final_lr, sigma, epochs_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard): os.makedirs(output_directory, exist_ok=True) torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=init_lr) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists epoch_offset = 1 if checkpoint_path != "": model, optimizer, epoch_offset = load_checkpoint( checkpoint_path, model, optimizer) epoch_offset += 1 # next epoch is epoch_offset + 1 trainset = Mel2Samp(**data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=8, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, 'logs')) model.train() # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, epochs + 1): print(f'Epoch: {epoch}') adjust_learning_rate(optimizer, epoch, init_lr, final_lr, epochs) for i, batch in enumerate(tqdm.tqdm(train_loader)): optimizer.zero_grad() batch = model.pre_process(batch) outputs = model(batch) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, i + 1 + len(train_loader) * epoch) if epoch % epochs_per_checkpoint == 0: if rank == 0: # Keep only one checkpoint last_chkpt = os.path.join( output_directory, f'waveglow_{epoch - epochs_per_checkpoint:06d}.pt') if os.path.exists(last_chkpt): os.remove(last_chkpt) checkpoint_path = os.path.join(output_directory, f'waveglow_{epoch:06d}.pt') save_checkpoint(model, optimizer, epoch, checkpoint_path)
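# The epoch loop above updates the optimizer through adjust_learning_rate(optimizer, epoch,
# init_lr, final_lr, epochs). The schedule itself is not defined in this section; a minimal
# sketch, assuming a simple linear interpolation from init_lr to final_lr over training:
def adjust_learning_rate(optimizer, epoch, init_lr, final_lr, epochs):
    lr = init_lr + (final_lr - init_lr) * min(epoch / epochs, 1.0)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr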
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) model = load_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() logger = prepare_directories_and_logger(output_directory, log_directory, rank) #train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(hparams) train_loader, train_sampler, val_loader, val_sampler = prepare_dataloaders( hparams) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() is_overflow = False # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) if train_sampler is not None: train_sampler.set_epoch(epoch) for i, batch in enumerate(train_loader): start = time.perf_counter() if iteration > 0 and iteration % hparams.learning_rate_anneal == 0: learning_rate = max(hparams.learning_rate_min, learning_rate * 0.5) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() dplist = batch['support']['datapath'] logstr = '||STEP {}, rank {} ||'.format(i, rank) logstr += 'SUPPORTS: ' + '\n'.join(dplist) + '\n' dplist = batch['query']['datapath'] logstr += 'QUERIES: ' + '\n'.join(dplist) + '\n' with open('logs/rk{}.logs'.format(rank), 'at') as f: f.writelines(logstr + '\n\n') x, y = model.parse_batch(batch) y_pred = model(x) loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): # validate(model, criterion, valset, iteration, # hparams.batch_size, n_gpus, collate_fn, 
logger, # hparams.distributed_run, rank) validate(model, val_sampler, val_loader, criterion, iteration, n_gpus, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
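The loop above anneals the optimizer by halving the learning rate every hparams.learning_rate_anneal iterations, floored at hparams.learning_rate_min. A minimal, self-contained sketch of that schedule as a pure function (the helper name and signature are illustrative, not from the original code):

def annealed_lr(base_lr, lr_min, anneal_interval, iteration):
    """Return the learning rate after `iteration` steps of halve-every-interval decay."""
    if anneal_interval <= 0:
        return base_lr
    n_halvings = iteration // anneal_interval
    return max(lr_min, base_lr * (0.5 ** n_halvings))

# e.g. annealed_lr(1e-3, 1e-5, 50000, 120000) -> 2.5e-4 (two halvings applied)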
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) model = load_model(hparams) learning_rate = hparams.learning_rate parameters_main, parameters_sc = model.grouped_parameters() optimizer_main = torch.optim.Adam(parameters_main, lr=learning_rate, weight_decay=hparams.weight_decay) optimizer_sc = torch.optim.Adam(parameters_sc, lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = ParrotLoss(hparams).cuda() logger = prepare_directories_and_logger(output_directory, log_directory, rank) train_loader, valset, collate_fn = prepare_dataloaders(hparams) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model) else: model, optimizer_main, optimizer_sc, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer_main, optimizer_sc) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, hparams.epochs): if epoch > hparams.warmup: learning_rate = hparams.learning_rate * hparams.decay_rate**( (epoch - hparams.warmup) // hparams.decay_every + 1) for i, batch in enumerate(train_loader): start = time.time() for param_group in optimizer_main.param_groups: param_group['lr'] = learning_rate for param_group in optimizer_sc.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) if i % 2 == 0: y_pred = model(x, True) else: y_pred = model(x, False) losses, acces, l_main, l_sc = criterion(y_pred, y) if hparams.distributed_run: reduced_losses = [] for l in losses: reduced_losses.append(reduce_tensor(l.data, n_gpus).item()) reduced_acces = [] for a in acces: reduced_acces.append(reduce_tensor(a.data, n_gpus).item()) redl_main = reduce_tensor(l_main.data, n_gpus).item() redl_sc = reduce_tensor(l_sc.data, n_gpus).item() else: reduced_losses = [l.item() for l in losses] reduced_acces = [a.item() for a in acces] redl_main = l_main.item() redl_sc = l_sc.item() for p in parameters_sc: p.requires_grad_(requires_grad=False) if hparams.fp16_run: optimizer.backward(loss) grad_norm = optimizer.clip_fp32_grads(hparams.grad_clip_thresh) else: l_main.backward(retain_graph=True) grad_norm_main = torch.nn.utils.clip_grad_norm_( parameters_main, hparams.grad_clip_thresh) optimizer_main.step() for p in parameters_sc: p.requires_grad_(requires_grad=True) for p in parameters_main: p.requires_grad_(requires_grad=False) l_sc.backward() grad_norm_sc = torch.nn.utils.clip_grad_norm_( parameters_sc, hparams.grad_clip_thresh) optimizer_sc.step() for p in parameters_main: p.requires_grad_(requires_grad=True) if not math.isnan(redl_main) and rank == 0: duration = time.time() - start task = 'TTS' if i % 2 == 0 else 'VC' 
print("Train {} {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( task, iteration, redl_main + redl_sc, grad_norm_main, duration)) logger.log_training(redl_main + redl_sc, reduced_losses, reduced_acces, grad_norm_main, learning_rate, duration, iteration) if (iteration % hparams.iters_per_checkpoint == 0): validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( os.path.join(output_directory, log_directory), "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer_main, optimizer_sc, learning_rate, iteration, checkpoint_path) iteration += 1
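The snippet above calls model.grouped_parameters() to split the weights into a main group and a speaker-classifier ("sc") group, each driven by its own Adam optimizer. A hedged sketch of one way such a split could be implemented, assuming the classifier lives under a module prefix such as "speaker_classifier" (the prefix and helper are assumptions, not taken from the original model):

def grouped_parameters(model, sc_prefix="speaker_classifier"):
    # split named parameters by module-name prefix into (main, speaker-classifier) groups
    parameters_main, parameters_sc = [], []
    for name, param in model.named_parameters():
        (parameters_sc if name.startswith(sc_prefix) else parameters_main).append(param)
    return parameters_main, parameters_sc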
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard): torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 trainset = Mel2Samp(data_config['training_files'], data_config['segment_length'], data_config['filter_length'], data_config['hop_length'], data_config['win_length'], data_config['sampling_rate'], data_config['mel_fmin'], data_config['mel_fmax'], debug=False) if 'testing_files' in data_config: testset = Mel2Samp(data_config['testing_files'], data_config['segment_length'], data_config['filter_length'], data_config['hop_length'], data_config['win_length'], data_config['sampling_rate'], data_config['mel_fmin'], data_config['mel_fmax'], debug=True) else: testset = None # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=1, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, 'logs')) else: logger = None model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start = time.perf_counter() model.zero_grad() print("train batch loaded, {} ({} of {})".format( iteration, i, len(train_loader))) mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) outputs = model((mel, audio)) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() is_overflow = False if fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), 1.0) is_overflow = math.isnan(grad_norm) optimizer.step() duration = time.perf_counter() - start print( "train batch done, {} ({} of {}): {:.9f} (took {:.2f})".format( iteration, i, len(train_loader), reduced_loss, duration)) if logger: logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) logger.add_scalar('duration', duration, i + len(train_loader) * epoch) if testset and not is_overflow and (iteration % iters_per_checkpoint == 0): if testset: validate(model, criterion, testset, iteration, batch_size, num_gpus, logger) if rank == 0: rotate_checkpoints(output_directory) checkpoint_path = "{}/waveglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
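save_checkpoint and load_checkpoint are called here (and in several other snippets) but are not defined in this collection. A minimal sketch of helpers compatible with the call sites above; the checkpoint field names are assumptions, not the original implementation:

import torch

def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    # persist model and optimizer state plus bookkeeping needed to resume
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate,
                'iteration': iteration}, filepath)

def load_checkpoint(filepath, model, optimizer):
    # restore weights and optimizer state; caller resumes at iteration + 1
    checkpoint = torch.load(filepath, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['iteration']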
def _Set_Distribution(self): if self.num_gpus > 1: self.model = apply_gradient_allreduce(self.model)
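apply_gradient_allreduce is the NVIDIA Tacotron2/WaveGlow distributed helper. For comparison, a hedged sketch of the same guard written with the built-in torch.nn.parallel.DistributedDataParallel wrapper instead (a different mechanism, not the original helper); it assumes the process group is already initialized and rank is the local GPU index:

from torch.nn.parallel import DistributedDataParallel as DDP

def set_distribution_ddp(model, num_gpus, rank):
    # wrap the model only when more than one GPU participates
    if num_gpus > 1:
        model = DDP(model, device_ids=[rank], output_device=rank)
    return model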
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ #rank += 4 if hparams.distributed_run: init_distributed(hparams, rank, group_name) print('checkpoint path: {}'.format(checkpoint_path)) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) model = load_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize( model, optimizer, opt_level='O1') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() logger = prepare_directories_and_logger( output_directory, log_directory, rank) train_loader, valset, collate_fn = prepare_dataloaders(hparams) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model( checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.fp16_run: checkpoint = torch.load(checkpoint_path, map_location='cpu') amp_state_dict = checkpoint['amp'] amp.load_state_dict(checkpoint['amp']) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) #print('HERE') model.train() is_overflow = False # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start = time.perf_counter() for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) #print('X value') #from hashlib import sha1 #np_x = x[0].data.cpu().numpy() #foo = sha1(np_x) #print(foo.hexdigest()) y_pred = model(x) loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, hparams.world_size).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training( reduced_loss, grad_norm, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): #validate(model, criterion, valset, iteration, # hparams.batch_size, hparams.world_size, collate_fn, logger, # hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path, hparams.fp16_run, amp) wandb.save(checkpoint_path) iteration += 1
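This variant restores the Apex AMP loss-scaler state with amp.load_state_dict(checkpoint['amp']), so the matching save_checkpoint must have written that state. A hedged sketch following the argument order of the save_checkpoint call above (the original implementation is not shown in this collection):

import torch

def save_checkpoint(model, optimizer, learning_rate, iteration, filepath, fp16_run, amp):
    state = {'state_dict': model.state_dict(),
             'optimizer': optimizer.state_dict(),
             'learning_rate': learning_rate,
             'iteration': iteration}
    if fp16_run:
        state['amp'] = amp.state_dict()  # Apex AMP exposes state_dict()/load_state_dict() for the loss scaler
    torch.save(state, filepath)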
def train(output_directory, log_directory, checkpoint_path, warm_start, warm_start_force, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ # setup distributed hparams.n_gpus = n_gpus hparams.rank = rank if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) # reproducablilty stuffs torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) # initialize blank model model = load_model(hparams) model.eval() learning_rate = hparams.learning_rate # (optional) show the names of each layer in model, mainly makes it easier to copy/paste what you want to adjust if hparams.print_layer_names_during_startup: print(*[ f"Layer{i} = " + str(x[0]) + " " + str(x[1].shape) for i, x in enumerate(list(model.named_parameters())) ], sep="\n") # (optional) Freeze layers by disabling grads if len(hparams.frozen_modules): for layer, params in list(model.named_parameters()): if any( layer.startswith(module) for module in hparams.frozen_modules): params.requires_grad = False print(f"Layer: {layer} has been frozen") # define optimizer (any params without requires_grad are ignored) optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, weight_decay=hparams.weight_decay) #optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss(hparams) logger = prepare_directories_and_logger(output_directory, log_directory, rank) # Load checkpoint if one exists best_validation_loss = 0.8 # used to see when "best_model" should be saved, default = 0.4, load_checkpoint will update to last best value. iteration = 0 epoch_offset = 0 _learning_rate = 1e-3 saved_lookup = None if checkpoint_path is not None: if warm_start: model, iteration, saved_lookup = warm_start_model( checkpoint_path, model, hparams.ignore_layers) elif warm_start_force: model, iteration, saved_lookup = warm_start_force_model( checkpoint_path, model) else: model, optimizer, _learning_rate, iteration, best_validation_loss, saved_lookup = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 print('Model Loaded') # define datasets/dataloaders train_loader, valset, collate_fn, train_sampler, trainset = prepare_dataloaders( hparams, saved_lookup) epoch_offset = max(0, int(iteration / len(train_loader))) speaker_lookup = trainset.speaker_ids # define scheduler use_scheduler = 0 if use_scheduler: scheduler = ReduceLROnPlateau(optimizer, factor=0.1**(1 / 5), patience=10) model.train() is_overflow = False validate_then_terminate = 0 if validate_then_terminate: val_loss = validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) raise Exception("Finished Validation") for param_group in optimizer.param_groups: param_group['lr'] = learning_rate rolling_loss = StreamingMovingAverage(min(int(len(train_loader)), 200)) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in tqdm(range(epoch_offset, hparams.epochs), initial=epoch_offset, total=hparams.epochs, desc="Epoch:", position=1, unit="epoch"): tqdm.write("Epoch:{}".format(epoch)) if hparams.distributed_run: # shuffles the train_loader when doing multi-gpu training train_sampler.set_epoch(epoch) start_time = time.time() # start iterating through the epoch for i, batch in tqdm(enumerate(train_loader), desc="Iter: ", smoothing=0, total=len(train_loader), position=0, unit="iter"): # run external code every iter, allows the run to be adjusted without restarts if (i == 0 or iteration % param_interval == 0): try: with open("run_every_epoch.py") as f: internal_text = str(f.read()) if len(internal_text) > 0: #code = compile(internal_text, "run_every_epoch.py", 'exec') ldict = {'iteration': iteration} exec(internal_text, globals(), ldict) else: print( "No Custom code found, continuing without changes." ) except Exception as ex: print(f"Custom code FAILED to run!\n{ex}") globals().update(ldict) locals().update(ldict) if show_live_params: print(internal_text) if not iteration % 50: # check actual learning rate every 20 iters (because I sometimes see learning_rate variable go out-of-sync with real LR) learning_rate = optimizer.param_groups[0]['lr'] # Learning Rate Schedule if custom_lr: old_lr = learning_rate if iteration < warmup_start: learning_rate = warmup_start_lr elif iteration < warmup_end: learning_rate = (iteration - warmup_start) * ( (A_ + C_) - warmup_start_lr ) / ( warmup_end - warmup_start ) + warmup_start_lr # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations. else: if iteration < decay_start: learning_rate = A_ + C_ else: iteration_adjusted = iteration - decay_start learning_rate = (A_ * (e**(-iteration_adjusted / B_))) + C_ assert learning_rate > -1e-8, "Negative Learning Rate." 
if old_lr != learning_rate: for param_group in optimizer.param_groups: param_group['lr'] = learning_rate # /run external code every epoch, allows the run to be adjusting without restarts/ model.zero_grad() x, y = model.parse_batch(batch) y_pred = model(x) loss, len_loss, loss_z, loss_w, loss_s, loss_att = criterion( y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() reduced_len_loss = reduce_tensor(len_loss.data, n_gpus).item() reduced_loss_z = reduce_tensor(loss_z.data, n_gpus).item() reduced_loss_w = reduce_tensor(loss_w.data, n_gpus).item() reduced_loss_s = reduce_tensor(loss_s.data, n_gpus).item() reduced_loss_att = reduce_tensor( loss_att.data, n_gpus).item() if (loss_att is not None) else 0 else: reduced_loss = loss.item() reduced_len_loss = len_loss.item() reduced_loss_z = loss_z.item() reduced_loss_w = loss_w.item() reduced_loss_s = loss_s.item() reduced_loss_att = loss_att.item() if (loss_att is not None) else 0 if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), grad_clip_thresh) is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.time() - start_time average_loss = rolling_loss.process(reduced_loss) loss_scale = amp._amp_state.loss_scalers[ 0]._loss_scale if hparams.fp16_run else 0 # get current Loss Scale of first optimizer tqdm.write( "{} [Train_loss:{:.4f} Avg:{:.4f} Len:{:.4f} z:{:.4f} w:{:.4f} s:{:.4f} att:{:.4f}] [Grad Norm {:.4f}] " "[{:.2f}s/it] [{:.3f}s/file] [{:.7f} LR] [{} LS]".format( iteration, reduced_loss, average_loss, reduced_len_loss, reduced_loss_z, reduced_loss_w, reduced_loss_s, reduced_loss_att, grad_norm, duration, (duration / (hparams.batch_size * n_gpus)), learning_rate, round(loss_scale))) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) start_time = time.time() #from time import sleep #sleep(2.5) if is_overflow and rank == 0: tqdm.write("Gradient Overflow, Skipping Step") if not is_overflow and ((iteration % (hparams.iters_per_checkpoint / 1) == 0) or (os.path.exists(save_file_check_path))): # save model checkpoint like normal if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, average_loss, speaker_lookup, checkpoint_path) if not is_overflow and ( (iteration % int(validation_interval) == 0) or (os.path.exists(save_file_check_path)) or (iteration < 1000 and (iteration % 250 == 0))): if rank == 0 and os.path.exists(save_file_check_path): os.remove(save_file_check_path) # perform validation and save "best_model" depending on validation loss val_loss = validate(model, criterion, valset, iteration, hparams.val_batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) #validate (0.8 forcing) if use_scheduler: scheduler.step(val_loss) if (val_loss < best_validation_loss): best_validation_loss = val_loss if rank == 0: checkpoint_path = os.path.join(output_directory, "best_model") save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, average_loss, speaker_lookup, checkpoint_path) iteration += 1
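The custom schedule driven by run_every_epoch.py holds the learning rate at warmup_start_lr, ramps linearly to A_ + C_ between warmup_start and warmup_end, holds it until decay_start, then decays exponentially toward C_ with time constant B_. The same piecewise logic as a self-contained function (the function name is illustrative):

from math import e

def custom_lr(iteration, warmup_start, warmup_end, warmup_start_lr, A_, B_, C_, decay_start):
    if iteration < warmup_start:
        return warmup_start_lr
    if iteration < warmup_end:
        # linear ramp from warmup_start_lr up to A_ + C_
        return (iteration - warmup_start) * ((A_ + C_) - warmup_start_lr) / (warmup_end - warmup_start) + warmup_start_lr
    if iteration < decay_start:
        return A_ + C_
    # exponential decay toward the floor C_
    return A_ * (e ** (-(iteration - decay_start) / B_)) + C_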
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, loss_empthasis, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard, logdirname, datedlogdir, warm_start=False, optimizer='ADAM', start_zero=False): torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== global WaveGlow global WaveGlowLoss ax = True # this is **really** bad coding practice :D if ax: from efficient_model_ax import WaveGlow from efficient_loss import WaveGlowLoss else: if waveglow_config["yoyo"]: # efficient_mode # TODO: Add to Config File from efficient_model import WaveGlow from efficient_loss import WaveGlowLoss else: from glow import WaveGlow, WaveGlowLoss criterion = WaveGlowLoss(sigma, loss_empthasis) model = WaveGlow(**waveglow_config).cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== STFTs = [STFT.TacotronSTFT(filter_length=window, hop_length=data_config['hop_length'], win_length=window, sampling_rate=data_config['sampling_rate'], n_mel_channels=160, mel_fmin=0, mel_fmax=16000) for window in data_config['validation_windows']] loader_STFT = STFT.TacotronSTFT(filter_length=data_config['filter_length'], hop_length=data_config['hop_length'], win_length=data_config['win_length'], sampling_rate=data_config['sampling_rate'], n_mel_channels=data_config['n_mel_channels'] if 'n_mel_channels' in data_config.keys() else 160, mel_fmin=data_config['mel_fmin'], mel_fmax=data_config['mel_fmax']) #optimizer = "Adam" optimizer = optimizer.lower() optimizer_fused = bool( 0 ) # use Apex fused optimizer, should be identical to normal but slightly faster and only works on RTX cards if optimizer_fused: from apex import optimizers as apexopt if optimizer == "adam": optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate) elif optimizer == "lamb": optimizer = apexopt.FusedLAMB(model.parameters(), lr=learning_rate, max_grad_norm=200) else: if optimizer == "adam": optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) elif optimizer == "lamb": from lamb import Lamb as optLAMB optimizer = optLAMB(model.parameters(), lr=learning_rate) #import torch_optimizer as optim #optimizer = optim.Lamb(model.parameters(), lr=learning_rate) #raise# PyTorch doesn't currently include LAMB optimizer. 
if fp16_run: global amp from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') else: amp = None ## LEARNING RATE SCHEDULER if True: from torch.optim.lr_scheduler import ReduceLROnPlateau min_lr = 1e-8 factor = 0.1**(1/5) # amount to scale the LR by on Validation Loss plateau scheduler = ReduceLROnPlateau(optimizer, 'min', factor=factor, patience=20, cooldown=2, min_lr=min_lr, verbose=True, threshold=0.0001, threshold_mode='abs') print("ReduceLROnPlateau used as Learning Rate Scheduler.") else: scheduler=False # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration, scheduler = load_checkpoint(checkpoint_path, model, optimizer, scheduler, fp16_run, warm_start=warm_start) iteration += 1 # next iteration is iteration + 1 if start_zero: iteration = 0 trainset = Mel2Samp(**data_config, check_files=True) speaker_lookup = trainset.speaker_ids # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: train_sampler = DistributedSampler(trainset, shuffle=True) shuffle = False else: train_sampler = None shuffle = True # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=3, shuffle=shuffle, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter if datedlogdir: timestr = time.strftime("%Y_%m_%d-%H_%M_%S") log_directory = os.path.join(output_directory, logdirname, timestr) else: log_directory = os.path.join(output_directory, logdirname) logger = SummaryWriter(log_directory) moving_average = int(min(len(train_loader), 100)) # average loss over entire Epoch rolling_sum = StreamingMovingAverage(moving_average) start_time = time.time() start_time_iter = time.time() start_time_dekaiter = time.time() model.train() # best (averaged) training loss if os.path.exists(os.path.join(output_directory, "best_model")+".txt"): best_model_loss = float(str(open(os.path.join(output_directory, "best_model")+".txt", "r", encoding="utf-8").read()).split("\n")[0]) else: best_model_loss = -6.20 # best (validation) MSE on inferred spectrogram. if os.path.exists(os.path.join(output_directory, "best_val_model")+".txt"): best_MSE = float(str(open(os.path.join(output_directory, "best_val_model")+".txt", "r", encoding="utf-8").read()).split("\n")[0]) else: best_MSE = 9e9 epoch_offset = max(0, int(iteration / len(train_loader))) pytorch_total_params = sum(p.numel() for p in model.parameters()) print("{:,} total parameters in model".format(pytorch_total_params)) pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) print("{:,} trainable parameters.".format(pytorch_total_params)) print(f"Segment Length: {data_config['segment_length']:,}\nBatch Size: {batch_size:,}\nNumber of GPUs: {num_gpus:,}\nSamples/Iter: {data_config['segment_length']*batch_size*num_gpus:,}") training = True while training: try: if rank == 0: epochs_iterator = tqdm(range(epoch_offset, epochs), initial=epoch_offset, total=epochs, smoothing=0.01, desc="Epoch", position=1, unit="epoch") else: epochs_iterator = range(epoch_offset, epochs) # ================ MAIN TRAINING LOOP! 
=================== for epoch in epochs_iterator: print(f"Epoch: {epoch}") if num_gpus > 1: train_sampler.set_epoch(epoch) if rank == 0: iters_iterator = tqdm(enumerate(train_loader), desc=" Iter", smoothing=0, total=len(train_loader), position=0, unit="iter", leave=True) else: iters_iterator = enumerate(train_loader) for i, batch in iters_iterator: # run external code every iter, allows the run to be adjusted without restarts if (i==0 or iteration % param_interval == 0): try: with open("run_every_epoch.py") as f: internal_text = str(f.read()) if len(internal_text) > 0: #code = compile(internal_text, "run_every_epoch.py", 'exec') ldict = {'iteration': iteration, 'seconds_elapsed': time.time()-start_time} exec(internal_text, globals(), ldict) else: print("No Custom code found, continuing without changes.") except Exception as ex: print(f"Custom code FAILED to run!\n{ex}") globals().update(ldict) locals().update(ldict) if show_live_params: print(internal_text) if not iteration % 50: # check actual learning rate every 20 iters (because I sometimes see learning_rate variable go out-of-sync with real LR) learning_rate = optimizer.param_groups[0]['lr'] # Learning Rate Schedule if custom_lr: old_lr = learning_rate if iteration < warmup_start: learning_rate = warmup_start_lr elif iteration < warmup_end: learning_rate = (iteration-warmup_start)*((A_+C_)-warmup_start_lr)/(warmup_end-warmup_start) + warmup_start_lr # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations. else: if iteration < decay_start: learning_rate = A_ + C_ else: iteration_adjusted = iteration - decay_start learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_ assert learning_rate > -1e-8, "Negative Learning Rate." if old_lr != learning_rate: for param_group in optimizer.param_groups: param_group['lr'] = learning_rate else: scheduler.patience = scheduler_patience scheduler.cooldown = scheduler_cooldown if override_scheduler_last_lr: scheduler._last_lr = override_scheduler_last_lr if override_scheduler_best: scheduler.best = override_scheduler_best if override_scheduler_last_lr or override_scheduler_best: print("scheduler._last_lr =", scheduler._last_lr, "scheduler.best =", scheduler.best, " |", end='') model.zero_grad() mel, audio, speaker_ids = batch mel = torch.autograd.Variable(mel.cuda(non_blocking=True)) audio = torch.autograd.Variable(audio.cuda(non_blocking=True)) speaker_ids = speaker_ids.cuda(non_blocking=True).long().squeeze(1) outputs = model(mel, audio, speaker_ids) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if (reduced_loss > LossExplosionThreshold) or (math.isnan(reduced_loss)): model.zero_grad() raise LossExplosion(f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n") if use_grad_clip: if fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), grad_clip_thresh) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), grad_clip_thresh) if type(grad_norm) == torch.Tensor: grad_norm = grad_norm.item() is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm) else: is_overflow = False; grad_norm=0.00001 optimizer.step() if not is_overflow and rank == 0: # get current Loss Scale of first optimizer loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if fp16_run else 32768 
if with_tensorboard: if (iteration % 100000 == 0): # plot distribution of parameters for tag, value in model.named_parameters(): tag = tag.replace('.', '/') logger.add_histogram(tag, value.data.cpu().numpy(), iteration) logger.add_scalar('training_loss', reduced_loss, iteration) logger.add_scalar('training_loss_samples', reduced_loss, iteration*batch_size) if (iteration % 20 == 0): logger.add_scalar('learning.rate', learning_rate, iteration) if (iteration % 10 == 0): logger.add_scalar('duration', ((time.time() - start_time_dekaiter)/10), iteration) average_loss = rolling_sum.process(reduced_loss) if (iteration % 10 == 0): tqdm.write("{} {}: {:.3f} {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {:.2f}s/iter {:.4f}s/item".format(time.strftime("%H:%M:%S"), iteration, reduced_loss, average_loss, best_MSE, round(grad_norm,3), learning_rate, min((grad_clip_thresh/grad_norm)*learning_rate,learning_rate), (time.time() - start_time_dekaiter)/10, ((time.time() - start_time_dekaiter)/10)/(batch_size*num_gpus))) start_time_dekaiter = time.time() else: tqdm.write("{} {}: {:.3f} {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {}LS".format(time.strftime("%H:%M:%S"), iteration, reduced_loss, average_loss, best_MSE, round(grad_norm,3), learning_rate, min((grad_clip_thresh/grad_norm)*learning_rate,learning_rate), loss_scale)) start_time_iter = time.time() if rank == 0 and (len(rolling_sum.values) > moving_average-2): if (average_loss+best_model_margin) < best_model_loss: checkpoint_path = os.path.join(output_directory, "best_model") try: save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) except KeyboardInterrupt: # Avoid corrupting the model. save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8") text_file.write(str(average_loss)+"\n"+str(iteration)) text_file.close() best_model_loss = average_loss #Only save the model if X better than the current loss. if rank == 0 and iteration > 0 and ((iteration % iters_per_checkpoint == 0) or (os.path.exists(save_file_check_path))): checkpoint_path = f"{output_directory}/waveglow_{iteration}" save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) if (os.path.exists(save_file_check_path)): os.remove(save_file_check_path) if (iteration % validation_interval == 0): if rank == 0: MSE, MAE = validate(model, loader_STFT, STFTs, logger, iteration, data_config['validation_files'], speaker_lookup, sigma, output_directory, data_config) if scheduler: MSE = torch.tensor(MSE, device='cuda') if num_gpus > 1: broadcast(MSE, 0) scheduler.step(MSE.item()) if MSE < best_MSE: checkpoint_path = os.path.join(output_directory, "best_val_model") try: save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) except KeyboardInterrupt: # Avoid corrupting the model. save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8") text_file.write(str(MSE.item())+"\n"+str(iteration)) text_file.close() best_MSE = MSE.item() #Only save the model if X better than the current loss. 
else: if scheduler: MSE = torch.zeros(1, device='cuda') broadcast(MSE, 0) scheduler.step(MSE.item()) learning_rate = optimizer.param_groups[0]['lr'] # check actual learning rate (because I sometimes see learning_rate variable go out-of-sync with real LR) iteration += 1 training = False # exit the While loop except LossExplosion as ex: # print Exception and continue from checkpoint (restarting like this takes under 4 seconds) print(ex) # print Loss checkpoint_path = os.path.join(output_directory, "best_model") assert os.path.exists(checkpoint_path), "best_model must exist for automatic restarts" # clearing VRAM for load checkpoint audio = mel = speaker_ids = loss = None torch.cuda.empty_cache() model.eval() model, optimizer, iteration, scheduler = load_checkpoint(checkpoint_path, model, optimizer, scheduler, fp16_run) learning_rate = optimizer.param_groups[0]['lr'] epoch_offset = max(0, int(iteration / len(train_loader))) model.train() iteration += 1 pass # and continue training.
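StreamingMovingAverage and LossExplosion are used above but not defined in this collection. A minimal sketch consistent with the call sites (.process() returns the running mean over the last window_size values, .values exposes the current window, LossExplosion is a plain exception used to trigger the restart path); the details are assumptions:

class StreamingMovingAverage:
    def __init__(self, window_size):
        self.window_size = window_size
        self.values = []
        self.sum = 0.0

    def process(self, value):
        # push the new value, drop the oldest once the window is full, return the window mean
        self.values.append(value)
        self.sum += value
        if len(self.values) > self.window_size:
            self.sum -= self.values.pop(0)
        return self.sum / len(self.values)

class LossExplosion(Exception):
    """Raised when the loss exceeds LossExplosionThreshold so training can restart from the best checkpoint."""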
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) torch.nn.functional.sigmoid model = load_model(hparams) learning_rate = hparams.learning_rate #lr = args.lr * (0.1 ** (epoch // 30)) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) # optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, dampening=0, weight_decay=hparams.weight_decay) if hparams.fp16_run: optimizer = FP16_Optimizer( optimizer, dynamic_loss_scale=hparams.dynamic_loss_scaling) if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss(hparams) logger = prepare_directories_and_logger(output_directory, log_directory, rank) train_loader, valset, collate_fn = prepare_dataloaders(hparams) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() # ================ MAIN TRAINNIG LOOP! =================== step = 0 for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start = time.perf_counter() for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) y_pred = model(x) loss, recon_loss, S_kl_loss, R_kl_loss, speaker_loss, augment_loss, alignment_loss = criterion( y_pred, y, iteration) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: optimizer.backward(loss) grad_norm = optimizer.clip_fp32_grads(hparams.grad_clip_thresh) else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() overflow = optimizer.overflow if hparams.fp16_run else False if not overflow and not math.isnan(reduced_loss) and rank == 0: duration = time.perf_counter() - start print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, recon_loss, S_kl_loss, R_kl_loss, \ speaker_loss, augment_loss, alignment_loss, iteration) if not overflow and (iteration % hparams.iters_per_checkpoint == 0): validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
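The commented-out schedule above, lr = args.lr * (0.1 ** (epoch // 30)), is the classic step decay: divide the learning rate by ten every 30 epochs. The same rule as a small helper (the name and defaults simply mirror the comment):

def step_decay_lr(base_lr, epoch, period=30, factor=0.1):
    # e.g. step_decay_lr(1e-3, 65) -> 1e-5 after two decay steps
    return base_lr * (factor ** (epoch // period))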
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard, num_workers=4): print("num_workers", num_workers) torch.manual_seed(seed) torch.cuda.manual_seed(seed) # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) # =====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) # =====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) scheduler = StepLR(optimizer, step_size=1, gamma=0.96) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 trainset = Mel2Samp(**data_config) evalset = Mel2Samp(**eval_data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=num_workers, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) eval_loader = DataLoader(evalset, num_workers=num_workers, shuffle=False, sampler=eval_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, 'logs')) epoch_offset = max(1, int(iteration / len(train_loader))) start_time = datetime.datetime.now() # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, epochs): print('Epoch:', epoch, 'LR:', scheduler.get_lr()) elapsed = datetime.datetime.now() - start_time print("Epoch: [{}][els: {}] {}".format( datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed, epoch)) model.train() total_loss = 0. 
for i, batch in enumerate(train_loader): model.zero_grad() if waveglow_config["multi_speaker_config"]["use_multi_speaker"]: mel, audio, spk_embed_or_id = batch spk_embed_or_id = torch.autograd.Variable( spk_embed_or_id.cuda()) else: mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) if waveglow_config["multi_speaker_config"]["use_multi_speaker"]: outputs = model((mel, audio, spk_embed_or_id)) else: outputs = model((mel, audio)) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() total_loss += reduced_loss if i > 0 and i % 10 == 0: elapsed = datetime.datetime.now() - start_time print( "[{}][els: {}] epoch {},total steps{}, {}/{} steps:\t{:.9f}" .format( datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed, epoch, iteration, i, len(train_loader), reduced_loss)) if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) if (iteration % iters_per_checkpoint == 0): if rank == 0: checkpoint_path = "{}/waveglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1 elapsed = datetime.datetime.now() - start_time print("[{}][els: {}] {} epoch :\tavg loss {:.9f}".format( datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed, epoch, total_loss / len(train_loader))) scheduler.step() eval.eval(eval_loader, model, criterion, num_gpus, start_time, epoch, waveglow_config["multi_speaker_config"]["use_multi_speaker"])
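reduce_tensor is used throughout these snippets to average a scalar loss across ranks before logging. A minimal illustrative version built on torch.distributed.all_reduce, assuming the default process group is already initialized (the NVIDIA reference code defines it in essentially this way):

import torch.distributed as dist

def reduce_tensor(tensor, n_gpus):
    # sum the tensor across all ranks, then divide by the number of GPUs to get the mean
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n_gpus
    return rt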