def __init__(self, device='cpu', jit=False):
    """ Required """
    self.device = device
    self.jit = jit
    self.hparams = self.create_hparams()
    self.model = load_model(self.hparams).to(device=device)
    self.optimizer = torch.optim.Adam(self.model.parameters(),
                                      lr=self.hparams.learning_rate,
                                      weight_decay=self.hparams.weight_decay)
    self.criterion = Tacotron2Loss().to(device=device)
    train_loader, valset, collate_fn = prepare_dataloaders(self.hparams)
    self.example_input, self.target = self.model.parse_batch(
        list(train_loader)[0], device=self.device)
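# NOTE (added sketch): this looks like a benchmark-harness wrapper that caches one
# parsed batch as example_input/target. A minimal sketch of how such a wrapper is
# typically driven; the helper name and the single-batch step are assumptions, not
# part of the original snippet.
def run_one_train_step(m):
    """One optimization step on the wrapper's cached example batch (sketch)."""
    m.model.train()
    m.optimizer.zero_grad()
    y_pred = m.model(m.example_input)      # example_input is the parsed batch tuple
    loss = m.criterion(y_pred, m.target)
    loss.backward()
    m.optimizer.step()
    return loss.item()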
def load_Tacotron2(hparams, device=torch.device('cuda')):
    model = Tacotron2(hparams).to(device)
    if hparams.fp16_run:
        model = batchnorm_to_float(model.half())
        model = lstmcell_to_float(model)
        model.decoder.attention_layer.score_mask_value = float(
            finfo('float16').min)
    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)
    return model, Tacotron2Loss()
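# NOTE (added sketch): batchnorm_to_float / lstmcell_to_float are not shown in this
# snippet. The usual idea is to keep numerically sensitive layers in fp32 while the
# rest of the model runs in half precision. This is my own guess at what such a
# helper does, not the original implementation.
import torch.nn as nn

def batchnorm_to_float(module):
    """Recursively cast BatchNorm layers back to fp32 inside a half-precision model."""
    if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
        module.float()
    for child in module.children():
        batchnorm_to_float(child)
    return module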
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    GPU_NUM = 0  # index of the GPU to use
    device = torch.device(
        f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
    torch.cuda.set_device(device)  # change allocation of current GPU
    print('Current cuda device ', torch.cuda.current_device())  # check

    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    train_loader, valset, collate_fn, train_loader2, train_loader3 = prepare_dataloaders(
        hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP!
=================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) if epoch % 3 == 0: train_loader = train_loader3 print('3sentence synth') elif epoch % 3 == 1: train_loader = train_loader2 print('2sentence synth') else: train_loader = train_loader print('1sentence synth') for i, batch in enumerate(train_loader): start = time.perf_counter() for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) y_pred = model(x) loss, d_loss = criterion(y_pred, y) #for plot encoder attention # if iteration % 1000 == 0: # _, mel, _, _, attns, attns_dec = y_pred # import matplotlib.pylab as plt # plt.figure() # plt.imshow(mel[0].T.cpu().detach().numpy()) # alignment_path = os.path.join("/media/qw/data/Experiment/Encoder_selfAtt/outdir/Encoder_alignment", # "mel_{}".format(iteration)) # plt.savefig(alignment_path) # for j in range(3): # for i in range(4): # plt.imshow(attns[j][i*hparams.batch_size].T.cpu().detach().numpy()) # alignment_path = os.path.join("/media/qw/data/Experiment/Encoder_selfAtt/outdir/Encoder_alignment","alignment_{}_{}_{}".format(iteration, j, i)) # plt.savefig(alignment_path) # plt.imshow(attns_dec[2][i*hparams.batch_size].T.cpu().detach().numpy()) # alignment_path = os.path.join("/media/qw/data/Experiment/Encoder_selfAtt/outdir/Decoder_alignment","alignment_{}_{}".format(iteration, i)) # plt.savefig(alignment_path) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration, d_loss.item()) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
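# NOTE (added sketch): the commented-out block above saves mel and encoder-attention
# plots every 1000 iterations. A cleaned-up version of that idea, assuming y_pred
# unpacks the same way as in the comment and using a generic output directory instead
# of the hard-coded absolute paths.
import os
import matplotlib
matplotlib.use('Agg')  # write figures to files without a display
import matplotlib.pyplot as plt

def save_alignment_plots(y_pred, iteration, out_dir):
    """Save the first sample's mel and encoder attention maps as PNGs (sketch)."""
    _, mel, _, _, attns, attns_dec = y_pred  # layout assumed from the commented code
    os.makedirs(out_dir, exist_ok=True)
    plt.figure()
    plt.imshow(mel[0].T.cpu().detach().numpy(), aspect='auto', origin='lower')
    plt.savefig(os.path.join(out_dir, f"mel_{iteration}.png"))
    plt.close()
    for j, attn in enumerate(attns):
        plt.figure()
        plt.imshow(attn[0].T.cpu().detach().numpy(), aspect='auto', origin='lower')
        plt.savefig(os.path.join(out_dir, f"alignment_{iteration}_{j}.png"))
        plt.close()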
def train(output_directory, log_directory, checkpoint_path, warm_start, warm_start_force, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ # setup distributed hparams.n_gpus = n_gpus hparams.rank = rank if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) # reproducablilty stuffs torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) # initialize blank model model = load_model(hparams) model.eval() learning_rate = hparams.learning_rate # (optional) show the names of each layer in model, mainly makes it easier to copy/paste what you want to adjust if hparams.print_layer_names_during_startup: print(*[ f"Layer{i} = " + str(x[0]) + " " + str(x[1].shape) for i, x in enumerate(list(model.named_parameters())) ], sep="\n") # (optional) Freeze layers by disabling grads if len(hparams.frozen_modules): for layer, params in list(model.named_parameters()): if any( layer.startswith(module) for module in hparams.frozen_modules): params.requires_grad = False print(f"Layer: {layer} has been frozen") # define optimizer (any params without requires_grad are ignored) optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, weight_decay=hparams.weight_decay) #optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss(hparams) logger = prepare_directories_and_logger(output_directory, log_directory, rank) # Load checkpoint if one exists best_validation_loss = 0.8 # used to see when "best_model" should be saved, default = 0.4, load_checkpoint will update to last best value. iteration = 0 epoch_offset = 0 _learning_rate = 1e-3 saved_lookup = None if checkpoint_path is not None: if warm_start: model, iteration, saved_lookup = warm_start_model( checkpoint_path, model, hparams.ignore_layers) elif warm_start_force: model, iteration, saved_lookup = warm_start_force_model( checkpoint_path, model) else: model, optimizer, _learning_rate, iteration, best_validation_loss, saved_lookup = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 print('Model Loaded') # define datasets/dataloaders train_loader, valset, collate_fn, train_sampler, trainset = prepare_dataloaders( hparams, saved_lookup) epoch_offset = max(0, int(iteration / len(train_loader))) speaker_lookup = trainset.speaker_ids # define scheduler use_scheduler = 0 if use_scheduler: scheduler = ReduceLROnPlateau(optimizer, factor=0.1**(1 / 5), patience=10) model.train() is_overflow = False validate_then_terminate = 0 if validate_then_terminate: val_loss = validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) raise Exception("Finished Validation") for param_group in optimizer.param_groups: param_group['lr'] = learning_rate rolling_loss = StreamingMovingAverage(min(int(len(train_loader)), 200)) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in tqdm(range(epoch_offset, hparams.epochs), initial=epoch_offset, total=hparams.epochs, desc="Epoch:", position=1, unit="epoch"): tqdm.write("Epoch:{}".format(epoch)) if hparams.distributed_run: # shuffles the train_loader when doing multi-gpu training train_sampler.set_epoch(epoch) start_time = time.time() # start iterating through the epoch for i, batch in tqdm(enumerate(train_loader), desc="Iter: ", smoothing=0, total=len(train_loader), position=0, unit="iter"): # run external code every iter, allows the run to be adjusted without restarts if (i == 0 or iteration % param_interval == 0): try: with open("run_every_epoch.py") as f: internal_text = str(f.read()) if len(internal_text) > 0: #code = compile(internal_text, "run_every_epoch.py", 'exec') ldict = {'iteration': iteration} exec(internal_text, globals(), ldict) else: print( "No Custom code found, continuing without changes." ) except Exception as ex: print(f"Custom code FAILED to run!\n{ex}") globals().update(ldict) locals().update(ldict) if show_live_params: print(internal_text) if not iteration % 50: # check actual learning rate every 20 iters (because I sometimes see learning_rate variable go out-of-sync with real LR) learning_rate = optimizer.param_groups[0]['lr'] # Learning Rate Schedule if custom_lr: old_lr = learning_rate if iteration < warmup_start: learning_rate = warmup_start_lr elif iteration < warmup_end: learning_rate = (iteration - warmup_start) * ( (A_ + C_) - warmup_start_lr ) / ( warmup_end - warmup_start ) + warmup_start_lr # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations. else: if iteration < decay_start: learning_rate = A_ + C_ else: iteration_adjusted = iteration - decay_start learning_rate = (A_ * (e**(-iteration_adjusted / B_))) + C_ assert learning_rate > -1e-8, "Negative Learning Rate." 
if old_lr != learning_rate: for param_group in optimizer.param_groups: param_group['lr'] = learning_rate # /run external code every epoch, allows the run to be adjusting without restarts/ model.zero_grad() x, y = model.parse_batch(batch) y_pred = model(x) loss, len_loss, loss_z, loss_w, loss_s, loss_att = criterion( y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() reduced_len_loss = reduce_tensor(len_loss.data, n_gpus).item() reduced_loss_z = reduce_tensor(loss_z.data, n_gpus).item() reduced_loss_w = reduce_tensor(loss_w.data, n_gpus).item() reduced_loss_s = reduce_tensor(loss_s.data, n_gpus).item() reduced_loss_att = reduce_tensor( loss_att.data, n_gpus).item() if (loss_att is not None) else 0 else: reduced_loss = loss.item() reduced_len_loss = len_loss.item() reduced_loss_z = loss_z.item() reduced_loss_w = loss_w.item() reduced_loss_s = loss_s.item() reduced_loss_att = loss_att.item() if (loss_att is not None) else 0 if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), grad_clip_thresh) is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.time() - start_time average_loss = rolling_loss.process(reduced_loss) loss_scale = amp._amp_state.loss_scalers[ 0]._loss_scale if hparams.fp16_run else 0 # get current Loss Scale of first optimizer tqdm.write( "{} [Train_loss:{:.4f} Avg:{:.4f} Len:{:.4f} z:{:.4f} w:{:.4f} s:{:.4f} att:{:.4f}] [Grad Norm {:.4f}] " "[{:.2f}s/it] [{:.3f}s/file] [{:.7f} LR] [{} LS]".format( iteration, reduced_loss, average_loss, reduced_len_loss, reduced_loss_z, reduced_loss_w, reduced_loss_s, reduced_loss_att, grad_norm, duration, (duration / (hparams.batch_size * n_gpus)), learning_rate, round(loss_scale))) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) start_time = time.time() #from time import sleep #sleep(2.5) if is_overflow and rank == 0: tqdm.write("Gradient Overflow, Skipping Step") if not is_overflow and ((iteration % (hparams.iters_per_checkpoint / 1) == 0) or (os.path.exists(save_file_check_path))): # save model checkpoint like normal if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, average_loss, speaker_lookup, checkpoint_path) if not is_overflow and ( (iteration % int(validation_interval) == 0) or (os.path.exists(save_file_check_path)) or (iteration < 1000 and (iteration % 250 == 0))): if rank == 0 and os.path.exists(save_file_check_path): os.remove(save_file_check_path) # perform validation and save "best_model" depending on validation loss val_loss = validate(model, criterion, valset, iteration, hparams.val_batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) #validate (0.8 forcing) if use_scheduler: scheduler.step(val_loss) if (val_loss < best_validation_loss): best_validation_loss = val_loss if rank == 0: checkpoint_path = os.path.join(output_directory, "best_model") save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, average_loss, speaker_lookup, checkpoint_path) iteration += 1
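# NOTE (added sketch): the loop above exec()s "run_every_epoch.py" every
# param_interval iterations so the schedule can be edited while training runs.
# A plausible example of such a file, using the variable names the loop reads back;
# all values here are hypothetical and the exact set expected by the original repo
# may differ.
#
# --- run_every_epoch.py (example contents) ---
# `iteration` is injected by the training loop before exec().
param_interval = 5            # how often the training loop re-reads this file
show_live_params = False
custom_lr = True
warmup_start = 0
warmup_end = 1000
warmup_start_lr = 1e-5
A_ = 1e-3                     # peak learning rate
B_ = 40000                    # exponential decay time constant (iterations)
C_ = 0.0                      # learning-rate floor offset
decay_start = 20000
grad_clip_thresh = 1.0
validation_interval = 1000
save_file_check_path = "save"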
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) print("Loading models...") model = load_model(hparams) print("Initializing optimizer...") learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: optimizer = FP16_Optimizer( optimizer, dynamic_loss_scale=hparams.dynamic_loss_scaling) criterion = Tacotron2Loss() print("Initializing logger...") logger = prepare_directories_and_logger(output_directory, log_directory, rank) print("Initializing dataloader...") train_loader, valset, collate_fn = prepare_dataloaders(hparams) print("Loading checkpoints...") # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() if hparams.distributed_run or torch.cuda.device_count() > 1: batch_parser = model.module.parse_batch else: batch_parser = model.parse_batch # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start = time.perf_counter() for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = batch_parser(batch) y_pred = model(x) loss = criterion(y_pred, y) reduced_loss = reduce_tensor(loss.data, n_gpus)[0] \ if hparams.distributed_run else loss.data[0] if hparams.fp16_run: optimizer.backward(loss) grad_norm = optimizer.clip_fp32_grads(hparams.grad_clip_thresh) else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm( model.parameters(), hparams.grad_clip_thresh) optimizer.step() overflow = optimizer.overflow if hparams.fp16_run else False if not overflow and not math.isnan(reduced_loss) and rank == 0: duration = time.perf_counter() - start print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) if not overflow and (iteration % hparams.iters_per_checkpoint == 0): reduced_val_loss = validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0: print("Validation loss {}: {:9f} ".format( iteration, reduced_val_loss)) checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) logger.log_validation(reduced_val_loss, model, x, y, y_pred, iteration, hparams) iteration += 1
from model import Tacotron2
from loss_function import Tacotron2Loss

hparams = create_hparams()
text_loader = TextMelLoader(hparams.training_lst, hparams)
collate_fn = TextMelCollate(hparams.n_frames_per_step)

text, mel = text_loader[0]
# mel.shape == (80, frame_num)
plt.matshow(mel, origin='lower')
plt.colorbar()
plt.savefig('mel_demo.png')

train_loader = torch.utils.data.DataLoader(text_loader,
                                           num_workers=1,
                                           shuffle=False,
                                           batch_size=3,
                                           pin_memory=False,
                                           drop_last=True,
                                           collate_fn=collate_fn)
print(len(train_loader))

tacotron = Tacotron2(hparams)
criterion = Tacotron2Loss()
for batch in train_loader:
    text_padded, text_alignment_padded, input_lengths, mel_padded, alignments, \
        alignments_weights_padded, output_lengths = batch
    max_len = torch.max(input_lengths.data).item()
    x = (text_padded, input_lengths, mel_padded, max_len, output_lengths)
    y = (mel_padded, alignments, alignments_weights_padded, text_alignment_padded)
    y_pred = tacotron(x)
    print(criterion(y_pred, y))
    break
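# NOTE (added sketch): TextMelCollate is assumed to zero-pad variable-length text and
# mel sequences to the batch maximum. A minimal illustration of that padding logic,
# not the project's actual implementation (which also sorts by length and handles
# alignments and n_frames_per_step).
import torch

def pad_text_mel_batch(batch, n_mel_channels=80):
    """Minimal pad-to-max collate for (text_ids, mel) pairs (illustrative only)."""
    input_lengths = torch.LongTensor([len(text) for text, _ in batch])
    output_lengths = torch.LongTensor([mel.shape[1] for _, mel in batch])
    max_text = int(input_lengths.max())
    max_frames = int(output_lengths.max())

    text_padded = torch.zeros(len(batch), max_text, dtype=torch.long)
    mel_padded = torch.zeros(len(batch), n_mel_channels, max_frames)
    for i, (text, mel) in enumerate(batch):
        text_padded[i, :len(text)] = torch.as_tensor(text, dtype=torch.long)
        mel_padded[i, :, :mel.shape[1]] = torch.as_tensor(mel)
    return text_padded, input_lengths, mel_padded, output_lengths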
def train(experiment, output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams, max_steps=150000): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): hparams object containing configuration. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) # create model - does not load weights yet model = load_model(hparams) global_mean_path = os.path.join(experiment.paths["acoustic_features"], "global_mean.npy") train_loader, trainset, valset, collate_fn = prepare_dataloaders( experiment, hparams, model.requires_durations) if hparams.drop_frame_rate > 0.: global_mean = calculate_global_mean(train_loader, global_mean_path) hparams.global_mean = global_mean learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) if hparams.model_type == "forwardtacotron": print("Using ForwardTacotronLoss") criterion = ForwardTacotronLoss() else: print("Using TacotronLoss") criterion = Tacotron2Loss() logger = prepare_directories_and_logger(output_directory, log_directory, rank, hparams.model_type) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() is_overflow = False # ================ MAIN TRAINNIG LOOP! =================== #for epoch in range(epoch_offset, hparams.epochs): epoch = epoch_offset while iteration < max_steps: print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start = time.perf_counter() for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) mel_lens = x[4] if model.requires_durations: dur = x[7] else: dur = None y_pred = model(x) loss, loginfo = criterion(y_pred, y, mel_lens, dur) if model.mi is not None: # transpose to [b, T, dim] decoder_outputs = y_pred[0].transpose(2, 1) ctc_text, ctc_text_lengths, aco_lengths = x[-2], x[-1], x[4] taco_loss = loss mi_loss = model.mi(decoder_outputs, ctc_text, aco_lengths, ctc_text_lengths, dur) if hparams.use_gaf: if i % gradient_adaptive_factor.UPDATE_GAF_EVERY_N_STEP == 0: safe_loss = 0. 
* sum( [x.sum() for x in model.parameters()]) gaf = gradient_adaptive_factor.calc_grad_adapt_factor( taco_loss + safe_loss, mi_loss + safe_loss, model.parameters(), optimizer) gaf = min(gaf, hparams.max_gaf) else: gaf = 1.0 loss = loss + gaf * mi_loss else: taco_loss = loss mi_loss = torch.tensor([-1.0]) gaf = -1.0 if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() taco_loss = reduce_tensor(taco_loss.data, n_gpus).item() mi_loss = reduce_tensor(mi_loss.data, n_gpus).item() else: reduced_loss = loss.item() taco_loss = taco_loss.item() mi_loss = mi_loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print("Train loss {} {:.4f} mi_loss {:.4f} Grad Norm {:.4f} " "gaf {:.4f} {:.2f}s/it".format(iteration, taco_loss, mi_loss, grad_norm, gaf, duration)) logger.log_training(loginfo, reduced_loss, taco_loss, mi_loss, grad_norm, gaf, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) best_checkpoint_path = os.path.join( output_directory, "checkpoint_best".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, best_checkpoint_path) iteration += 1 epoch += 1 # generate GTA features and leave train_loader_tmp = DataLoader(trainset, num_workers=0, shuffle=False, batch_size=hparams.batch_size, pin_memory=False, drop_last=False, collate_fn=collate_fn) val_loader = DataLoader(valset, num_workers=0, shuffle=False, batch_size=hparams.batch_size, pin_memory=False, collate_fn=collate_fn, drop_last=False) create_gta_features(experiment, model, train_loader_tmp, val_loader)
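# NOTE (added sketch): calc_grad_adapt_factor is not shown in these snippets. The
# usual idea behind a gradient adaptive factor (GAF) is to rescale the auxiliary
# (MI/CTC) loss so its gradient norm does not swamp the main Tacotron loss. This is
# my own rough approximation, not the repo's implementation.
import torch

def calc_grad_adapt_factor(main_loss, aux_loss, parameters, eps=1e-8):
    """Return ||grad(main_loss)|| / ||grad(aux_loss)|| over the given parameters."""
    params = [p for p in parameters if p.requires_grad]
    g_main = torch.autograd.grad(main_loss, params, retain_graph=True, allow_unused=True)
    g_aux = torch.autograd.grad(aux_loss, params, retain_graph=True, allow_unused=True)

    def _norm(grads):
        total = torch.zeros((), device=main_loss.device)
        for g in grads:
            if g is not None:
                total = total + (g ** 2).sum()
        return torch.sqrt(total)

    return (_norm(g_main) / (_norm(g_aux) + eps)).item()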
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) model = load_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() logger = prepare_directories_and_logger(output_directory, log_directory, rank) #train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(hparams) train_loader, train_sampler, val_loader, val_sampler = prepare_dataloaders( hparams) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() is_overflow = False # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) if train_sampler is not None: train_sampler.set_epoch(epoch) for i, batch in enumerate(train_loader): start = time.perf_counter() if iteration > 0 and iteration % hparams.learning_rate_anneal == 0: learning_rate = max(hparams.learning_rate_min, learning_rate * 0.5) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() dplist = batch['support']['datapath'] logstr = '||STEP {}, rank {} ||'.format(i, rank) logstr += 'SUPPORTS: ' + '\n'.join(dplist) + '\n' dplist = batch['query']['datapath'] logstr += 'QUERIES: ' + '\n'.join(dplist) + '\n' with open('logs/rk{}.logs'.format(rank), 'at') as f: f.writelines(logstr + '\n\n') x, y = model.parse_batch(batch) y_pred = model(x) loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): # validate(model, criterion, valset, iteration, # hparams.batch_size, n_gpus, collate_fn, 
                #          logger, hparams.distributed_run, rank)
                validate(model, val_sampler, val_loader, criterion, iteration,
                         n_gpus, logger, hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
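# NOTE (added sketch): the loop above halves the learning rate every
# hparams.learning_rate_anneal iterations, with a floor at hparams.learning_rate_min.
# The same rule factored into a small helper; the original keeps it inline.
def anneal_learning_rate(lr, iteration, anneal_every, lr_min):
    """Halve lr every `anneal_every` iterations, never going below lr_min."""
    if iteration > 0 and iteration % anneal_every == 0:
        lr = max(lr_min, lr * 0.5)
    return lr

# usage inside the loop (sketch):
# learning_rate = anneal_learning_rate(learning_rate, iteration,
#                                      hparams.learning_rate_anneal,
#                                      hparams.learning_rate_min)
# for param_group in optimizer.param_groups:
#     param_group['lr'] = learning_rate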
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams, ax_max_run_timer, parameters): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) train_loader, valset, collate_fn, train_sampler = prepare_dataloaders( hparams) model = load_model(hparams) model.train() learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize( model, optimizer, opt_level='O2', min_loss_scale=1.0) #, loss_scale=256.0) if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() log_dir_counter = 0 starting_log_directory = log_directory while os.path.exists(log_directory): log_dir_counter += 1 log_directory = starting_log_directory + "_" + str(log_dir_counter) logger = prepare_directories_and_logger(output_directory, log_directory, rank) # Load checkpoint if one exists best_validation_loss = 0.6 # used to see when "best_model" should be saved, default = 0.4, load_checkpoint will update to last best value. val_avg_prob = 0.0 iteration = 0 epoch_offset = 0 _learning_rate = 1e-3 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration, best_validation_loss = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) # define scheduler use_scheduler = 0 if use_scheduler: scheduler = ReduceLROnPlateau(optimizer, factor=0.562341325, patience=15) model.train() is_overflow = False validate_then_terminate = 0 if validate_then_terminate: val_loss = validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) raise Exception("Finished Validation") for param_group in optimizer.param_groups: param_group['lr'] = learning_rate decay_start = parameters["decay_start"] A_ = parameters["lr_A"] B_ = parameters["lr_B"] C_ = 0 min_learning_rate = 1e-6 epochs_between_updates = parameters["epochs_between_updates"] p_teacher_forcing = 1.00 teacher_force_till = 30 rolling_loss = StreamingMovingAverage(int(len(train_loader))) ax_start_time = time.time() # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in tqdm(range(epoch_offset, hparams.epochs), initial=epoch_offset, total=hparams.epochs, desc="Epoch:", position=1, unit="epoch"): tqdm.write("Epoch:{}".format(epoch)) # run external code every epoch, allows the run to be adjusting without restarts try: with open("run_every_epoch.py") as f: internal_text = str(f.read()) if len(internal_text) > 0: print(internal_text) #code = compile(internal_text, "run_every_epoch.py", 'exec') ldict = {} exec(internal_text, globals(), ldict) C_ = ldict['C_'] min_learning_rate = ldict['min_learning_rate'] p_teacher_forcing = ldict['p_teacher_forcing'] teacher_force_till = ldict['teacher_force_till'] print( "Custom code excecuted\nPlease remove code if it was intended to be ran once." ) else: print("No Custom code found, continuing without changes.") except Exception as ex: print(f"Custom code FAILED to run!\n{ex}") print("decay_start is ", decay_start) print("A_ is ", A_) print("B_ is ", B_) print("C_ is ", C_) print("min_learning_rate is ", min_learning_rate) print("epochs_between_updates is ", epochs_between_updates) print("p_teacher_forcing is ", p_teacher_forcing) print("teacher_force_till is ", teacher_force_till) if epoch % epochs_between_updates == 0 or epoch_offset == epoch: #if None: tqdm.write("Old learning rate [{:.6f}]".format(learning_rate)) if iteration < decay_start: learning_rate = A_ else: iteration_adjusted = iteration - decay_start learning_rate = (A_ * (e**(-iteration_adjusted / B_))) + C_ learning_rate = max(min_learning_rate, learning_rate) # output the largest number #if epoch_offset == epoch: # hold learning rate low during first pass to let optimizer rebuild # learning_rate = 1e-5 tqdm.write( "Changing Learning Rate to [{:.6f}]".format(learning_rate)) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate if hparams.distributed_run: # shuffles the train_loader when doing multi-gpu training train_sampler.set_epoch(epoch) start_time = time.time() # start iterating through the epoch for i, batch in tqdm(enumerate(train_loader), desc="Iter: ", smoothing=0, total=len(train_loader), position=0, unit="iter"): model.zero_grad() x, y = model.parse_batch(batch) # move batch to GPU (async) y_pred = model(x, teacher_force_till=teacher_force_till, p_teacher_forcing=p_teacher_forcing) loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() for j, param_group in enumerate(optimizer.param_groups): learning_rate = (float(param_group['lr'])) break if not is_overflow and rank == 0: duration = time.time() - start_time average_loss = rolling_loss.process(reduced_loss) tqdm.write( "{} [Train_loss {:.4f} Avg {:.4f}] [Grad Norm {:.4f}] " "[{:.2f}s/it] [{:.3f}s/file] [{:.7f} LR]".format( iteration, reduced_loss, average_loss, grad_norm, duration, (duration / (hparams.batch_size * n_gpus)), learning_rate)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) start_time = time.time() if is_overflow and rank == 0: tqdm.write("Gradient Overflow, Skipping Step") if rank == 0: if 
                if os.path.exists(save_file_check_path):
                    os.remove(save_file_check_path)
            if (time.time() - ax_start_time) > ax_max_run_timer:
                break
            iteration += 1
            # end of iteration loop
        # end of epoch loop

    # perform validation and save "ax_model"
    val_loss, val_avg_prob = validate(model, criterion, valset, iteration,
                                      hparams.batch_size, n_gpus, collate_fn,
                                      logger, hparams.distributed_run, rank)
    if use_scheduler:
        scheduler.step(val_loss)
    if rank == 0:
        checkpoint_path = os.path.join(output_directory, "ax_model")
        save_checkpoint(model, optimizer, learning_rate, iteration,
                        best_validation_loss, checkpoint_path)
    # let's pretend this code is actually able to finish
    return val_avg_prob
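# NOTE (added sketch): StreamingMovingAverage (used above as rolling_loss.process)
# is not defined in these snippets. A minimal sketch of what such a class usually
# looks like, assuming a fixed-size window average.
from collections import deque

class StreamingMovingAverage:
    """Fixed-window running mean; process() returns the average so far (sketch)."""

    def __init__(self, window_size):
        self.values = deque(maxlen=window_size)

    def process(self, value):
        self.values.append(float(value))
        return sum(self.values) / len(self.values)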
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams, args): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ tstart = time.time() if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) model = load_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() logger = prepare_directories_and_logger(output_directory, log_directory, rank) train_loader, valset, collate_fn, train_sampler = prepare_dataloaders( hparams) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: if checkpoint_path.startswith('pid'): checkpoint = os.path.basename(checkpoint_path) checkpoint_path = download_checkpoints(args.pid, checkpoint, output_directory) model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() is_overflow = False unsaved_data = False # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, hparams.epochs): if args.max_duration and time.time() - tstart > args.max_duration: if unsaved_data: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) unsaved_data = False if args.pid: try: log_files = glob.glob( os.path.join(output_directory, log_directory, '*')) upload_to_drive([checkpoint_path] + log_files, args.pid) except Exception as e: print('error while uploading to drive\n%s' % str(e)) break print("Epoch: {}".format(epoch)) unsaved_data = True if train_sampler is not None: train_sampler.set_epoch(epoch) for i, batch in enumerate(train_loader): start = time.perf_counter() if iteration > 0 and iteration % hparams.learning_rate_anneal == 0: learning_rate = max(hparams.learning_rate_min, learning_rate * 0.5) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) y_pred = model(x) loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) unsaved_data = False if args.pid: try: log_files = glob.glob( os.path.join(output_directory, log_directory, '*')) upload_to_drive([checkpoint_path] + log_files, args.pid) except Exception as e: print('error while uploading to drive\n%s' % str(e)) iteration += 1
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hyper_params, train_loader, valset, collate_fn): """Training and validation method with logging results to tensorboard and stdout :param output_directory (string): directory to save checkpoints :param log_directory (string): directory to save tensorboard logs :param checkpoint_path (string): checkpoint path :param n_gpus (int): number of gpus :param rank (int): rank of current gpu :param hyper_params (object dictionary): dictionary with all hyper parameters """ # Check whether is a distributed running if hyper_params['distributed_run']: init_distributed(hyper_params, n_gpus, rank, group_name) # set the same fixed seed to reproduce same results everytime we train torch.manual_seed(hyper_params['seed']) torch.cuda.manual_seed(hyper_params['seed']) model = load_model(hyper_params) learning_rate = hyper_params['learning_rate'] optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hyper_params['weight_decay']) if hyper_params['fp16_run']: optimizer = FP16_Optimizer( optimizer, dynamic_loss_scale=hyper_params['dynamic_loss_scaling']) # Define the criterion of the loss function. The objective. criterion = Tacotron2Loss() logger = prepare_directories_and_logger(output_directory, log_directory, rank) # logger = '' iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: # Re-start the model from the last checkpoint if we save the parameters and don't want to start from 0 model = warm_start_model(checkpoint_path, model) else: # CHECK THIS OUT!!! model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hyper_params['use_saved_learning_rate']: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) # Set this to make all modules and regularization aware this is the training stage: model.train() # MAIN LOOP for epoch in range(epoch_offset, hyper_params['epochs']): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start = time.perf_counter() # CHECK THIS OUT!!! for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() input_data, output_target = model.parse_batch(batch) output_predicted = model(input_data) loss = criterion(output_predicted, output_target) if hyper_params['distributed_run']: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hyper_params['fp16_run']: optimizer.backward( loss) # transformed optimizer into fp16 type grad_norm = optimizer.clip_fp32_grads( hyper_params['grad_clip_thresh']) else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hyper_params['grad_clip_thresh']) # Performs a single optimization step (parameter update) optimizer.step() # This boolean controls overflow when running in fp16 optimizer overflow = optimizer.overflow if hyper_params['fp16_run'] else False # If overflow is True, it will not enter. If isnan is True, it will not enter neither. 
            if not overflow and not math.isnan(reduced_loss) and rank == 0:
                duration = time.perf_counter() - start
                print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                    iteration, reduced_loss, grad_norm, duration))
                # logs training information of the current iteration
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            # Every iters_per_checkpoint steps the model and its updated
            # parameters are validated
            if not overflow and (iteration % hyper_params['iters_per_checkpoint'] == 0):
                validate(model, criterion, valset, iteration,
                         hyper_params['batch_size'], n_gpus, collate_fn, logger,
                         hyper_params['distributed_run'], rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
    model.eval()  # test if this is needed anymore
    learning_rate = hparams.learning_rate
    if hparams.Apex_optimizer:
        # apex optimizer is slightly faster with slightly more vram usage in my
        # testing. Helps in both fp32 and fp16.
        optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate,
                                      weight_decay=hparams.weight_decay)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                     weight_decay=hparams.weight_decay)
    if hparams.fp16_run:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss(hparams)

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    # Load checkpoint if one exists
    best_validation_loss = 0.8  # used to decide when "best_model" should be saved;
                                # load_checkpoint will update this to the last best value.
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model, iteration = warm_start_model(checkpoint_path, model,
                                                hparams.ignore_layers)
        elif warm_start_force:
            model, iteration = warm_start_force_model(checkpoint_path, model)
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) # else: # torch.cuda.set_device('cuda:1') torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) model = load_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() waveglow_path = 'waveglow_256channels_universal_v5.pt' waveglow = torch.load(waveglow_path)['model'] waveglow.cuda().eval().float() # waveglow.cuda().eval().half() for k in waveglow.convinv: k.float() # ---------------------- MELLOTRON CODE BLOCK -------------------------- arpabet_dict = cmudict.CMUDict('data/cmu_dictionary') audio_paths = 'data/examples_filelist.txt' dataloader = TextMelLoader(audio_paths, hparams) datacollate = TextMelCollate(hparams.n_frames_per_step) file_idx = 0 audio_path, text, sid = dataloader.audiopaths_and_text[file_idx] stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length, hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin, hparams.mel_fmax) def load_mel(path): audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate) audio = torch.from_numpy(audio) if sampling_rate != hparams.sampling_rate: raise ValueError("{} SR doesn't match target {} SR".format( sampling_rate, stft.sampling_rate)) audio_norm = audio.unsqueeze(0) audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) melspec = stft.mel_spectrogram(audio_norm) melspec = melspec.cuda() return melspec # get audio path, encoded text, pitch contour and mel for gst text_encoded = torch.LongTensor( text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :].cuda() mel = load_mel(audio_path) print(audio_path, text) inference_batch = datacollate([dataloader[file_idx]]) # ---------------------- MELLOTRON CODE BLOCK (END) -------------------------- logger = prepare_directories_and_logger(output_directory, log_directory, rank) train_loader, valset, collate_fn, train_sampler = prepare_dataloaders( hparams) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() is_overflow = False # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) if train_sampler is not None: train_sampler.set_epoch(epoch) for i, batch in enumerate(train_loader): start = time.perf_counter() if iteration > 0 and iteration % hparams.learning_rate_anneal == 0: learning_rate = max(hparams.learning_rate_min, learning_rate * 0.5) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) y_pred = model(x) loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) # if not is_overflow and (iteration % 2 == 0): log_audio(model, iteration, logger, waveglow, inference_batch, text_encoded, mel) iteration += 1
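# NOTE (added sketch): log_audio is called above but not shown. One plausible
# minimal version: vocode a mel spectrogram with WaveGlow and write it to
# tensorboard. waveglow.infer(mel, sigma=...) is the standard NVIDIA WaveGlow call;
# whether the project's logger exposes SummaryWriter.add_audio is an assumption, and
# model/inference_batch/text_encoded are unused in this minimal sketch.
def log_audio(model, iteration, logger, waveglow, inference_batch, text_encoded,
              mel, sampling_rate=22050):
    """Vocode one mel (shape (1, n_mel_channels, T)) and log the audio (sketch)."""
    with torch.no_grad():
        audio = waveglow.infer(mel, sigma=0.666)
    logger.add_audio('eval_audio', audio[0].float().cpu(), iteration,
                     sample_rate=sampling_rate)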
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) model = load_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize( model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) if hparams.reverse: criterion = TacotronAsrLoss(hparams) else: criterion = Tacotron2Loss(hparams) logger = prepare_directories_and_logger( output_directory, log_directory, rank) train_loader, valset, collate_fn = prepare_dataloaders(hparams) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 def load_ckpt(checkpoint_path, model, optimizer): model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate else: learning_rate = hparams.learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) return model, optimizer, learning_rate, iteration, epoch_offset if checkpoint_path is not None: if warm_start: model = warm_start_model( checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, learning_rate, iteration, epoch_offset = load_ckpt(checkpoint_path, model, optimizer) else: ckpt_paths = glob.glob(os.path.join(output_directory, 'checkpoint_*')) if len(ckpt_paths) > 0: last_ckpt_path = sorted(ckpt_paths, key=lambda x: int(x.split("_")[-1]))[-1] model, optimizer, learning_rate, iteration, epoch_offset = load_ckpt(last_ckpt_path, model, optimizer) # print(">>>>", model.wavenet.first_conv.weight.data[0]) model.train() if hparams.save_mels: model.eval() print(model) parameters = filter(lambda p: p.requires_grad, model.parameters()) parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000 print('Trainable Parameters: %.3fM' % parameters) # if hparams.full_song: # splits_fn = os.path.join(output_directory, 'splits_info.json') # if not os.path.exists(splits_fn): # splits_info = {} # for idx in np.load(hparams.ds_name + '.npz', allow_pickle=True).keys(): # splits_info[idx] = [[0, 0]] # with open(splits_fn, 'w') as f: # json.dump(splits_info, f) is_overflow = False # ================ MAIN TRAINNIG LOOP! 
=================== all_alignments = {} all_mels = {} all_linears = {} for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) t = tqdm(enumerate(train_loader)) all_reduced_loss = 0 for i, batch in t: current_lr = hparams.learning_rate if hparams.lr_schedule is not None: lr_schedule_f = getattr(lrschedule, hparams.lr_schedule) current_lr = lr_schedule_f( hparams.learning_rate, iteration, **hparams.lr_schedule_kwargs) for param_group in optimizer.param_groups: param_group['lr'] = current_lr if hparams.test_mode or (not is_overflow and (iteration % hparams.iters_per_checkpoint == 0)): if hparams.do_validation: validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank, hparams, output_directory) if hparams.test_mode: exit() if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path, output_directory) start = time.perf_counter() for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) y_pred = model(x, iteration) losses = criterion(y_pred, y, x) loss = sum(losses.values()) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if not hparams.save_mels: if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) if not is_overflow: optimizer.step() else: optimizer.zero_grad() print(loss, "grad overflow!!") input_lengths, output_lengths, uttids, mel_outputs, linear_outputs, alignments = \ x[1], x[4], x[7], y_pred[1], y_pred[2], y_pred[3] if rank == 0: # if not is_overflow and rank == 0: duration = time.perf_counter() - start all_reduced_loss += reduced_loss t.set_description("iter:{},loss:{:.6f},GN:{:.6f},{:.2f}s/it,lr:{:.6f}," "details:{},shape:{}".format( iteration, all_reduced_loss / (i + 1), grad_norm, duration, current_lr, "".join(["[{}]:{:.4f}".format(k, v.item()) for k, v in losses.items()]), list(mel_outputs.data.shape))) logger.log_training( reduced_loss, grad_norm, learning_rate, duration, iteration) iteration += 1 # save alignments input_lengths = input_lengths.data.cpu().numpy() output_lengths = output_lengths.data.cpu().numpy() uttids = uttids.data.cpu().numpy() if hparams.save_attn: alignments = alignments.data.cpu().numpy() for uttid, alignment, input_length, output_length \ in zip(uttids, alignments, input_lengths, output_lengths): if hparams.reverse: all_alignments[str(uttid)] = alignment[:input_length, :output_length] else: all_alignments[str(uttid)] = alignment[:output_length, :input_length] if hparams.save_mels: mel_outputs = mel_outputs.data.cpu().numpy() linear_outputs = linear_outputs.data.cpu().numpy() for uttid, mel_output, linear_output, input_length, output_length \ in zip(uttids, mel_outputs, linear_outputs, input_lengths, output_lengths): all_mels[str(uttid)] = mel_output[:, :output_length] all_linears[str(uttid)] = linear_output[:, :output_length] if hparams.save_attn: np.savez(os.path.join(output_directory, "all_alignments"), **all_alignments) exit() if hparams.save_mels: np.savez(os.path.join(output_directory, "all_mels"), 
                 **all_mels)
        np.savez(os.path.join(output_directory, "all_linears"), **all_linears)
        exit()
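# NOTE (added sketch): the .npz archives written above can later be reloaded by
# utterance id; the file names follow the save calls in this snippet.
import os
import numpy as np

all_mels = np.load(os.path.join(output_directory, "all_mels.npz"))
all_linears = np.load(os.path.join(output_directory, "all_linears.npz"))

uttid = all_mels.files[0]
mel = all_mels[uttid]          # shape: (n_mel_channels, n_frames), trimmed to true length
linear = all_linears[uttid]
print(uttid, mel.shape, linear.shape)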
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) train_loader, valset, collate_fn = prepare_dataloaders(hparams) if hparams.drop_frame_rate > 0.: global_mean = calculate_global_mean(train_loader, hparams.global_mean_npy) hparams.global_mean = global_mean model = load_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() if hparams.use_guided_attn_loss: criterion_attn = GuidedAttentionLoss( sigma=hparams.guided_attn_loss_sigma, alpha=hparams.guided_attn_loss_lambda, ) logger = prepare_directories_and_logger(hparams, output_directory, log_directory, rank) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() is_overflow = False # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start = time.perf_counter() if iteration < 50000: learning_rate = 1e-3 elif iteration >= 50000 and iteration < 100000: learning_rate = 5e-4 elif iteration >= 100000 and iteration < 150000: learning_rate = 3e-4 elif iteration >= 150000 and iteration < 200000: learning_rate = 1e-4 elif iteration >= 200000 and iteration < 250000: learning_rate = 5e-5 elif iteration >= 250000 and iteration < 300000: learning_rate = 3e-5 else: learning_rate = 1e-5 for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) y_pred = model(x) loss = criterion(y_pred, y) if hparams.use_guided_attn_loss is not None: alignments, r_len_pad, ilens, olens = y_pred[-1], x[2], x[ 1], x[5] attn_loss = criterion_attn(alignments, ilens, (olens + r_len_pad) // hparams.n_frames_per_step) loss = loss + attn_loss if model.mi is not None: # transpose to [b, T, dim] decoder_outputs = y_pred[0].transpose(2, 1) ctc_text, ctc_text_lengths, aco_lengths = x[-2], x[-1], x[5] taco_loss = loss mi_loss = model.mi(decoder_outputs, ctc_text, aco_lengths, ctc_text_lengths) if hparams.use_gaf: if i % gradient_adaptive_factor.UPDATE_GAF_EVERY_N_STEP == 0: safe_loss = 0. 
* sum( [x.sum() for x in model.parameters()]) gaf = gradient_adaptive_factor.calc_grad_adapt_factor( taco_loss + safe_loss, mi_loss + safe_loss, model.parameters(), optimizer) gaf = min(gaf, hparams.max_gaf) else: gaf = 1.0 loss = loss + gaf * mi_loss else: taco_loss = loss mi_loss = torch.tensor([-1.0]) gaf = -1.0 if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() taco_loss = reduce_tensor(taco_loss.data, n_gpus).item() mi_loss = reduce_tensor(mi_loss.data, n_gpus).item() attn_loss = reduce_tensor(attn_loss.data, n_gpus).item() else: reduced_loss = loss.item() taco_loss = taco_loss.item() mi_loss = mi_loss.item() attn_loss = attn_loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start logger.log_training(reduced_loss, taco_loss, attn_loss, mi_loss, grad_norm, gaf, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_validate == 0): print("Train loss {} {:.4f} mi_loss {:.4f} Grad Norm {:.4f} " "gaf {:.4f} {:.2f}s/it".format(iteration, taco_loss, mi_loss, grad_norm, gaf, duration)) validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0 and (iteration % hparams.iters_per_checkpoint == 0): checkpoint_path = os.path.join( output_directory, "checkpoint_{}_{}".format( iteration, output_directory.split('/')[-1].replace( 'outdir_', ''))) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
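# --- Hedged sketch (illustration, not the original code): the if/elif ladder above
# implements a piecewise-constant learning-rate schedule keyed on the global iteration.
# The same schedule can be written as a lookup table, which is easier to edit; the
# boundaries and values below mirror the ones hard-coded in the loop.
from bisect import bisect_right

LR_BOUNDARIES = [50_000, 100_000, 150_000, 200_000, 250_000, 300_000]
LR_VALUES = [1e-3, 5e-4, 3e-4, 1e-4, 5e-5, 3e-5, 1e-5]  # one more value than boundaries

def piecewise_lr(iteration):
    """Return the learning rate for the current iteration."""
    return LR_VALUES[bisect_right(LR_BOUNDARIES, iteration)]

# e.g. piecewise_lr(0) == 1e-3, piecewise_lr(120_000) == 3e-4, piecewise_lr(10**6) == 1e-5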
def main(args): # Get device device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Define model model = nn.DataParallel(BERT_Tacotron2(hp)).to(device) # model = Tacotron2(hp).to(device) print("Model Have Been Defined") num_param = sum(param.numel() for param in model.parameters()) print('Number of Tacotron Parameters:', num_param) # Get dataset dataset = BERTTacotron2Dataset() # Optimizer optimizer = torch.optim.Adam(model.parameters(), lr=hp.learning_rate, weight_decay=hp.weight_decay) # Criterion criterion = Tacotron2Loss() # Load checkpoint if exists try: checkpoint = torch.load( os.path.join(hp.checkpoint_path, 'checkpoint_%d.pth.tar' % args.restore_step)) model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) print("\n---Model Restored at Step %d---\n" % args.restore_step) except: print("\n---Start New Training---\n") if not os.path.exists(hp.checkpoint_path): os.mkdir(hp.checkpoint_path) # Init logger if not os.path.exists(hp.logger_path): os.mkdir(hp.logger_path) # Define Some Information Time = np.array([]) Start = time.clock() # Training model = model.train() for epoch in range(hp.epochs): # Get training loader training_loader = DataLoader(dataset, batch_size=hp.batch_size**2, shuffle=True, collate_fn=collate_fn, drop_last=True, num_workers=0) total_step = hp.epochs * len(training_loader) * hp.batch_size for i, batchs in enumerate(training_loader): for j, data_of_batch in enumerate(batchs): start_time = time.clock() current_step = i * hp.batch_size + j + args.restore_step + \ epoch * len(training_loader)*hp.batch_size + 1 # Init optimizer.zero_grad() # Get Data character = torch.from_numpy( data_of_batch["text"]).long().to(device) mel_target = torch.from_numpy( data_of_batch["mel_target"]).float().to( device).contiguous().transpose(1, 2) stop_target = torch.from_numpy( data_of_batch["stop_token"]).float().to(device) embeddings = data_of_batch["bert_embeddings"].float().to( device) input_lengths = torch.from_numpy( data_of_batch["length_text"]).long().to(device) output_lengths = torch.from_numpy( data_of_batch["length_mel"]).long().to(device) # Forward batch = character, input_lengths, mel_target, stop_target, output_lengths, embeddings x, y = model.module.parse_batch(batch) y_pred = model(x) # Cal Loss mel_loss, mel_postnet_loss, stop_pred_loss = criterion( y_pred, y) total_loss = mel_loss + mel_postnet_loss + stop_pred_loss # Logger t_l = total_loss.item() m_l = mel_loss.item() m_p_l = mel_postnet_loss.item() s_l = stop_pred_loss.item() with open(os.path.join("logger", "total_loss.txt"), "a") as f_total_loss: f_total_loss.write(str(t_l) + "\n") with open(os.path.join("logger", "mel_loss.txt"), "a") as f_mel_loss: f_mel_loss.write(str(m_l) + "\n") with open(os.path.join("logger", "mel_postnet_loss.txt"), "a") as f_mel_postnet_loss: f_mel_postnet_loss.write(str(m_p_l) + "\n") with open(os.path.join("logger", "stop_pred_loss.txt"), "a") as f_s_loss: f_s_loss.write(str(s_l) + "\n") # Backward total_loss.backward() # Clipping gradients to avoid gradient explosion nn.utils.clip_grad_norm_(model.parameters(), 1.) 
# Update weights optimizer.step() adjust_learning_rate(optimizer, current_step) # Print if current_step % hp.log_step == 0: Now = time.clock() str1 = "Epoch [{}/{}], Step [{}/{}], Mel Loss: {:.4f}, Mel PostNet Loss: {:.4f};".format( epoch + 1, hp.epochs, current_step, total_step, mel_loss.item(), mel_postnet_loss.item()) str2 = "Stop Predicted Loss: {:.4f}, Total Loss: {:.4f}.".format( stop_pred_loss.item(), total_loss.item()) str3 = "Time Used: {:.3f}s, Estimated Time Remaining: {:.3f}s.".format( (Now - Start), (total_step - current_step) * np.mean(Time)) print("\n" + str1) print(str2) print(str3) with open(os.path.join("logger", "logger.txt"), "a") as f_logger: f_logger.write(str1 + "\n") f_logger.write(str2 + "\n") f_logger.write(str3 + "\n") f_logger.write("\n") if current_step % hp.save_step == 0: torch.save( { 'model': model.state_dict(), 'optimizer': optimizer.state_dict() }, os.path.join(hp.checkpoint_path, 'checkpoint_%d.pth.tar' % current_step)) print("save model at step %d ..." % current_step) end_time = time.clock() Time = np.append(Time, end_time - start_time) if len(Time) == hp.clear_Time: temp_value = np.mean(Time) Time = np.delete(Time, [i for i in range(len(Time))], axis=None) Time = np.append(Time, temp_value)
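# --- Hedged sketch (illustration only): the loop above keeps per-step durations in the
# `Time` array, periodically collapses it to its mean, and estimates remaining time as
# (total_step - current_step) * mean(step time). A bounded window gives the same estimate
# without the manual "collapse to mean" bookkeeping.
from collections import deque

class StepTimer:
    def __init__(self, window=100):
        self.durations = deque(maxlen=window)  # only the most recent steps are kept

    def add(self, seconds):
        self.durations.append(seconds)

    def eta(self, current_step, total_step):
        if not self.durations:
            return float("inf")
        mean_step = sum(self.durations) / len(self.durations)
        return (total_step - current_step) * mean_step

# Usage sketch: timer.add(end_time - start_time); print(timer.eta(current_step, total_step))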
def main(args): # Get device device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Define model model = Tacotron2(hp).to(device) model_SpeakerEncoder = SpeakerEncoder.get_model().to(device) # model = Tacotron2(hp).to(device) print("All Models Have Been Defined") # Get dataset dataset = Tacotron2DataLoader() # Optimizer optimizer = torch.optim.Adam( model.parameters(), lr=hp.learning_rate, weight_decay=hp.weight_decay) # Criterion criterion = Tacotron2Loss() # Get training loader print("Get Training Loader") training_loader = DataLoader(dataset, batch_size=hp.batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True, num_workers=cpu_count()) # Load checkpoint if exists try: checkpoint = torch.load(os.path.join( hp.checkpoint_path, 'checkpoint_%d.pth.tar' % args.restore_step)) model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) print("\n---Model Restored at Step %d---\n" % args.restore_step) except: print("\n---Start New Training---\n") if not os.path.exists(hp.checkpoint_path): os.mkdir(hp.checkpoint_path) # Define Some Information total_step = hp.epochs * len(training_loader) Time = np.array([]) Start = time.perf_counter() # Training model = model.train() for epoch in range(hp.epochs): for i, batch in enumerate(training_loader): start_time = time.perf_counter() # Count step current_step = i + args.restore_step + \ epoch * len(training_loader) + 1 # Init optimizer.zero_grad() # Load Data text_padded, input_lengths, mel_padded, gate_padded, output_lengths, mel_for_SE = batch # Get Speaker Embedding # print(np.shape(mel_for_SE)) mel_for_SE = torch.from_numpy(mel_for_SE).float().to(device) # print(mel_for_SE.size()) with torch.no_grad(): SpeakerEmbedding = model_SpeakerEncoder(mel_for_SE) # print(SpeakerEmbedding.size()) # print(SpeakerEmbedding) # print(SpeakerEmbedding.grad) if cuda_available: text_padded = torch.from_numpy(text_padded).type( torch.cuda.LongTensor).to(device) else: text_padded = torch.from_numpy(text_padded).type( torch.LongTensor).to(device) mel_padded = torch.from_numpy(mel_padded).to(device) gate_padded = torch.from_numpy(gate_padded).to(device) input_lengths = torch.from_numpy(input_lengths).to(device) output_lengths = torch.from_numpy(output_lengths).to(device) # print("mel", mel_padded.size()) # print("text", text_padded.size()) # print("gate", gate_padded.size()) batch = text_padded, input_lengths, mel_padded, gate_padded, output_lengths x, y = model.parse_batch(batch) y_pred = model(x, SpeakerEmbedding) # Loss loss, mel_loss, gate_loss = criterion(y_pred, y) # Backward loss.backward() # Clipping gradients to avoid gradient explosion nn.utils.clip_grad_norm_(model.parameters(), hp.grad_clip_thresh) # Update weights optimizer.step() if current_step % hp.log_step == 0: Now = time.perf_counter() str_loss = "Epoch [{}/{}], Step [{}/{}], Mel Loss: {:.4f}, Gate Loss: {:.4f}, Total Loss: {:.4f}.".format( epoch + 1, hp.epochs, current_step, total_step, mel_loss.item(), gate_loss.item(), loss.item()) str_time = "Time Used: {:.3f}s, Estimated Time Remaining: {:.3f}s.".format( (Now - Start), (total_step - current_step) * np.mean(Time)) print(str_loss) print(str_time) with open("logger.txt", "a")as f_logger: f_logger.write(str_loss + "\n") f_logger.write(str_time + "\n") f_logger.write("\n") if current_step % hp.save_step == 0: torch.save({'model': model.state_dict(), 'optimizer': optimizer.state_dict( )}, os.path.join(hp.checkpoint_path, 'checkpoint_%d.pth.tar' % current_step)) print("\nsave model at step %d 
...\n" % current_step)

                end_time = time.perf_counter()
                Time = np.append(Time, end_time - start_time)
                if len(Time) == hp.clear_Time:
                    temp_value = np.mean(Time)
                    Time = np.delete(
                        Time, [i for i in range(len(Time))], axis=None)
                    Time = np.append(Time, temp_value)
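# --- Hedged sketch of the conditioning pattern used above: the speaker embedding is
# produced by a separate (pre-trained) encoder under torch.no_grad(), so only the
# synthesizer receives gradients. DummySpeakerEncoder is a stand-in for illustration,
# not the real SpeakerEncoder model.
import torch
import torch.nn as nn

class DummySpeakerEncoder(nn.Module):
    def __init__(self, n_mels=80, embed_dim=256):
        super().__init__()
        self.proj = nn.Linear(n_mels, embed_dim)

    def forward(self, mel):            # mel: [batch, frames, n_mels]
        return self.proj(mel).mean(1)  # [batch, embed_dim], mean-pooled over time

speaker_encoder = DummySpeakerEncoder().eval()
mel_for_SE = torch.randn(4, 120, 80)
with torch.no_grad():                        # keep the speaker encoder frozen
    speaker_embedding = speaker_encoder(mel_for_SE)
assert not speaker_embedding.requires_grad   # safe to feed into the synthesizer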
def train(n_gpus, rank, group_name): if n_gpus > 1: if rank == 0: print('Synchronizing distributed flow...') init_distributed(rank, n_gpus, group_name, config['dist_config']) torch.manual_seed(config['seed']) torch.cuda.manual_seed(config['seed']) if rank == 0: print('Initializing model, optimizer and loss...') model = Tacotron2(config).cuda() criterion = Tacotron2Loss() learning_rate = config['learning_rate'] optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate, weight_decay=config['weight_decay']) if config['fp16_run']: if rank == 0: print('Using FP16...') from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') if rank == 0: print('Preparing dirs, data loaders and logger...') logger = prepare_directories_and_logger(config['output_directory'], config['log_directory'], rank) train_loader, valset, collate_fn = prepare_dataloaders( config['training_files'], config['validation_files'], config['n_frames_per_step'], n_gpus) iteration = 0 epoch_offset = 0 if not config['warm_up_checkpoint'] is None: if rank == 0: print('Loading checkpoint from {}...'.format( config['warm_up_checkpoint'])) model = load_checkpoint(config['warm_up_checkpoint'], model, optimizer) iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.compress_factorize(config=config['compress_config']) model.train() # Main training loop for epoch in range(epoch_offset, config['epochs']): print("Epoch: {}".format(epoch)) for _, batch in enumerate(train_loader): start = time.perf_counter() x, y = model.parse_batch(batch) y_pred = model(x) loss = criterion(y_pred, y) if n_gpus > 1: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if config['fp16_run']: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if iteration % config['iters_per_grad_acc'] == 0: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), config['grad_clip_thresh']) optimizer.step() model.zero_grad() if rank == 0: duration = time.perf_counter() - start print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it". format(iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) if iteration % config['iters_per_validation'] == 0: validate(model, criterion, valset, iteration, config['batch_size'], n_gpus, collate_fn, logger, rank) if iteration % config['iters_per_checkpoint'] == 0: if rank == 0: checkpoint_path = os.path.join( config['output_directory'], "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, iteration, checkpoint_path) iteration += 1
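# --- Hedged sketch (illustration, not the original code): the loop above only clips
# gradients and calls optimizer.step()/model.zero_grad() every `iters_per_grad_acc`
# iterations, i.e. plain gradient accumulation. The helper below shows the same pattern;
# one common refinement (shown here, not done above) is dividing each loss by the
# accumulation factor so the accumulated gradient is an average rather than a sum.
import torch

def train_one_epoch_with_accumulation(model, optimizer, criterion, train_loader,
                                      accum_steps=4, grad_clip_thresh=1.0):
    optimizer.zero_grad()
    for iteration, batch in enumerate(train_loader):
        x, y = model.parse_batch(batch)
        loss = criterion(model(x), y) / accum_steps  # scale so gradients average over the window
        loss.backward()                              # gradients accumulate between optimizer steps
        if (iteration + 1) % accum_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_thresh)
            optimizer.step()
            optimizer.zero_grad()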
def train_tts(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed( hparams.seed) ## set the seed for CPU random-number generation, so that rerunning the program gives the same results from the same random-number code. torch.cuda.manual_seed( hparams.seed ) ## set the random seed for the current GPU; torch.cuda.manual_seed_all(seed) sets the seed for all GPUs ## manually setting seeds is usually done to pin the randomly initialized weights, so that every fresh training run starts from initial weights that are randomly generated yet identical. model = load_model(hparams) learning_rate = hparams.learning_rate # optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, # weight_decay=hparams.weight_decay) for name, param in model.named_parameters(): # frozen except tts # if name.split('.')[0] == 'poly_phoneme_classifier': # param.requires_grad = False # frozen poly module except tone sandhi & tts # if name.split('.')[0] == 'poly_phoneme_classifier': # if name.split('.')[1] != 'linear_pre' and name.split('.')[1] != 'conv_layers' and name.split('.')[1] != 'linear_aft': # param.requires_grad = False # frozen except structure CNN & tonesandhi & tts if name.split('.')[0] == 'poly_phoneme_classifier': if name.split('.')[1] == 'g2ptransformermask': if name.split('.')[2] != 'structure_cnn_tts': param.requires_grad = False elif name.split('.')[1] != 'linear_pre' and name.split('.')[ 1] != 'conv_layers' and name.split('.')[1] != 'linear_aft': param.requires_grad = False # else: # param.requires_grad = False training_parameters_list = [ p for p in model.parameters() if p.requires_grad ] optimizer = torch.optim.Adam(training_parameters_list, lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O2') ## apex is NVIDIA's PyTorch extension for mixed-precision training; a few lines of code enable different levels of mixed-precision acceleration and can roughly halve training time. ## fp16: half-precision floating point, a binary floating-point format stored in 2 bytes (16 bits). ## fp16 pros: lower GPU memory use; faster training and inference; broad Tensor Core support. Con: quantization error. if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() logger = prepare_directories_and_logger(output_directory, log_directory, rank) train_loader, valset, collate_fn = prepare_dataloaders(hparams) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() is_overflow = False # ================ MAIN TRAINING LOOP!
=================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start = time.perf_counter() ## record the current performance-counter time for param_group in optimizer.param_groups: param_group['lr'] = learning_rate # print('CHECK batch:', batch) model.zero_grad() x, y = model.parse_batch(batch) y_pred = model(x) mask_padded = x[3] loss, mel_loss, gate_loss, select_loss = criterion( y_pred, y, mask_padded ) ## Tacotron2Loss(model_output,targets,mask_padded) ## split out the individual loss terms if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() reduced_val_mel_loss = reduce_tensor(mel_loss.data, n_gpus).item() reduced_val_gate_loss = reduce_tensor(gate_loss.data, n_gpus).item() reduced_val_select_loss = reduce_tensor( select_loss.data, n_gpus).item() else: reduced_loss = loss.item() reduced_val_mel_loss = mel_loss.item() reduced_val_gate_loss = gate_loss.item() reduced_val_select_loss = select_loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # print('CHECK structure_cnn.convs.0.weight IS CHANGE:', model.structure_cnn.convolutions[0][0].conv.weight) if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() ## when training a PyTorch model, each pass through the epoch loop typically calls optimizer.zero_grad(), loss.backward() and optimizer.step() in turn: first zero the gradients (optimizer.zero_grad()), ## then back-propagate to compute every parameter's gradient (loss.backward()), and finally apply one gradient-descent parameter update (optimizer.step()) if not is_overflow and rank == 0: duration = time.perf_counter( ) - start ## time.perf_counter() returns a performance counter; only the difference between two consecutive calls is meaningful, so it is normally used to measure how long code takes to run. print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, reduced_val_mel_loss, reduced_val_gate_loss, reduced_val_select_loss, grad_norm, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
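# --- Hedged sketch: `reduce_tensor` above averages a scalar loss across GPUs for logging.
# A typical implementation (similar to NVIDIA's Tacotron2 reference code; not necessarily
# the exact function used here) sums the tensor with an all-reduce and divides by the
# number of processes.
import torch
import torch.distributed as dist

def reduce_tensor(tensor, n_gpus):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)  # sum across all ranks
    return rt / n_gpus                         # average over ranks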
def train(args, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ args.output_directory (string): directory to save checkpoints args.log_directory (string) directory to save tensorboard logs args.checkpoint_path(string): checkpoint path args.n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ # setup distributed hparams.n_gpus = args.n_gpus hparams.rank = rank if hparams.distributed_run: init_distributed(hparams, args.n_gpus, rank, group_name) # reproducablilty stuffs torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) # initialize blank model print('Initializing Tacotron2...') model = load_model(hparams) print('Done') global model_args model_args = get_args(model.forward) model.eval() learning_rate = hparams.learning_rate # (optional) show the names of each layer in model, mainly makes it easier to copy/paste what you want to adjust if hparams.print_layer_names_during_startup: print(*[f"Layer{i} = "+str(x[0])+" "+str(x[1].shape) for i,x in enumerate(list(model.named_parameters()))], sep="\n") # (optional) Freeze layers by disabling grads if len(hparams.frozen_modules): for layer, params in list(model.named_parameters()): if any(layer.startswith(module) for module in hparams.frozen_modules): params.requires_grad = False print(f"Layer: {layer} has been frozen") if len(hparams.unfrozen_modules): for layer, params in list(model.named_parameters()): if any(layer.startswith(module) for module in hparams.frozen_modules): params.requires_grad = True print(f"Layer: {layer} has been unfrozen") # define optimizer (any params without requires_grad are ignored) #optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, weight_decay=hparams.weight_decay) optimizer = apexopt.FusedAdam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, weight_decay=hparams.weight_decay) if True and rank == 0: pytorch_total_params = sum(p.numel() for p in model.parameters()) print("{:,} total parameters in model".format(pytorch_total_params)) pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) print("{:,} trainable parameters.".format(pytorch_total_params)) print("Initializing AMP Model / Optimzier") if hparams.fp16_run: model, optimizer = amp.initialize(model, optimizer, opt_level=f'O{hparams.fp16_run_optlvl}') print("Initializing Gradient AllReduce model wrapper.") if hparams.distributed_run: model = apply_gradient_allreduce(model) print("Initializing Tacotron2 Loss func.") criterion = Tacotron2Loss(hparams) print("Initializing Tacotron2 Logger.") logger = prepare_directories_and_logger(hparams, args) # Load checkpoint if one exists best_validation_loss = 1e3# used to see when "best_val_model" should be saved best_inf_attsc = -99# used to see when "best_inf_attsc" should be saved n_restarts = 0 checkpoint_iter = 0 iteration = 0 epoch_offset = 0 _learning_rate = 1e-3 saved_lookup = None original_filelist = None global file_losses file_losses = {} global file_losses_smoothness file_losses_smoothness = 0.6 global best_val_loss_dict best_val_loss_dict = None global best_loss_dict best_loss_dict = None global expavg_loss_dict expavg_loss_dict = None expavg_loss_dict_iters = 0# initial iters expavg_loss_dict has been fitted loss_dict_smoothness = 0.95 # smoothing factor if args.checkpoint_path is not None: if args.warm_start: model, iteration, saved_lookup = warm_start_model( 
args.checkpoint_path, model, hparams.ignore_layers) elif args.warm_start_force: model, iteration, saved_lookup = warm_start_force_model( args.checkpoint_path, model) else: _ = load_checkpoint(args.checkpoint_path, model, optimizer, best_val_loss_dict, best_loss_dict) model, optimizer, _learning_rate, iteration, best_validation_loss, best_inf_attsc, saved_lookup, best_val_loss_dict, best_loss_dict = _ if hparams.use_saved_learning_rate: learning_rate = _learning_rate checkpoint_iter = iteration iteration += 1 # next iteration is iteration + 1 print('Model Loaded') # define datasets/dataloaders dataloader_args = [*get_args(criterion.forward), *model_args] if rank == 0: dataloader_args.extend(get_args(logger.log_training)) train_loader, valset, collate_fn, train_sampler, trainset = prepare_dataloaders(hparams, dataloader_args, args, saved_lookup) epoch_offset = max(0, int(iteration / len(train_loader))) speaker_lookup = trainset.speaker_ids # load and/or generate global_mean if hparams.drop_frame_rate > 0.: if rank != 0: # if global_mean not yet calcuated, wait for main thread to do it while not os.path.exists(hparams.global_mean_npy): time.sleep(1) global_mean = calculate_global_mean(train_loader, hparams.global_mean_npy, hparams) hparams.global_mean = global_mean model.global_mean = global_mean # define scheduler use_scheduler = 0 if use_scheduler: scheduler = ReduceLROnPlateau(optimizer, factor=0.1**(1/5), patience=10) model.train() is_overflow = False validate_then_terminate = 0 if validate_then_terminate: val_loss = validate(model, criterion, valset, iteration, hparams.batch_size, args.n_gpus, collate_fn, logger, hparams.distributed_run, rank) raise Exception("Finished Validation") for param_group in optimizer.param_groups: param_group['lr'] = learning_rate just_did_val = True rolling_loss = StreamingMovingAverage(min(int(len(train_loader)), 200)) # ================ MAIN TRAINNIG LOOP! 
=================== training = True while training: try: for epoch in tqdm(range(epoch_offset, hparams.epochs), initial=epoch_offset, total=hparams.epochs, desc="Epoch:", position=1, unit="epoch"): tqdm.write("Epoch:{}".format(epoch)) train_loader.dataset.shuffle_dataset()# Shuffle Dataset dataset_len = len(train_loader) start_time = time.time() # start iterating through the epoch for i, batch in tqdm(enumerate(train_loader), desc="Iter: ", smoothing=0, total=len(train_loader), position=0, unit="iter"): # run external code every epoch or 1000 iters, allows the run to be adjusted without restarts if (i==0 or iteration % param_interval == 0): try: with open("run_every_epoch.py", encoding='utf-8') as f: internal_text = str(f.read()) if len(internal_text) > 0: #code = compile(internal_text, "run_every_epoch.py", 'exec') ldict = {'iteration': iteration, 'checkpoint_iter': checkpoint_iter, 'n_restarts': n_restarts} exec(internal_text, globals(), ldict) else: print("[info] tried to execute 'run_every_epoch.py' but it is empty") except Exception as ex: print(f"[warning] 'run_every_epoch.py' FAILED to execute!\nException:\n{ex}") globals().update(ldict) locals().update(ldict) if show_live_params: print(internal_text) n_restarts = n_restarts_override if (n_restarts_override is not None) else n_restarts or 0 # Learning Rate Schedule if custom_lr: if iteration < warmup_start: learning_rate = warmup_start_lr elif iteration < warmup_end: learning_rate = (iteration-warmup_start)*((A_+C_)-warmup_start_lr)/(warmup_end-warmup_start) + warmup_start_lr # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations. else: if iteration < decay_start: learning_rate = A_ + C_ else: iteration_adjusted = iteration - decay_start learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_ assert learning_rate > -1e-8, "Negative Learning Rate." if decrease_lr_on_restart: learning_rate = learning_rate/(2**(n_restarts/3)) if just_did_val: learning_rate = 0.0 just_did_val=False for param_group in optimizer.param_groups: param_group['lr'] = learning_rate # /run external code every epoch, allows the run to be adjusting without restarts/ model.zero_grad() y = model.parse_batch(batch) # move batch to GPU (async) y_pred = force(model, valid_kwargs=model_args, **{**y, "teacher_force_till": teacher_force_till, "p_teacher_forcing": p_teacher_forcing, "drop_frame_rate": drop_frame_rate}) loss_scalars = { "spec_MSE_weight": spec_MSE_weight, "spec_MFSE_weight": spec_MFSE_weight, "postnet_MSE_weight": postnet_MSE_weight, "postnet_MFSE_weight": postnet_MFSE_weight, "gate_loss_weight": gate_loss_weight, "sylps_kld_weight": sylps_kld_weight, "sylps_MSE_weight": sylps_MSE_weight, "sylps_MAE_weight": sylps_MAE_weight, "diag_att_weight": diag_att_weight, } loss_dict, file_losses_batch = criterion(y_pred, y, loss_scalars) file_losses = update_smoothed_dict(file_losses, file_losses_batch, file_losses_smoothness) loss = loss_dict['loss'] if hparams.distributed_run: reduced_loss_dict = {k: reduce_tensor(v.data, args.n_gpus).item() if v is not None else 0. for k, v in loss_dict.items()} else: reduced_loss_dict = {k: v.item() if v is not None else 0. 
for k, v in loss_dict.items()} reduced_loss = reduced_loss_dict['loss'] if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if grad_clip_thresh: if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), grad_clip_thresh) is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), grad_clip_thresh) else: grad_norm = 0.0 optimizer.step() # get current Loss Scale of first optimizer loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if hparams.fp16_run else 32768. # restart if training/model has collapsed if (iteration > 1e3 and (reduced_loss > LossExplosionThreshold)) or (math.isnan(reduced_loss)) or (loss_scale < 1/4): raise LossExplosion(f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n") if expavg_loss_dict is None: expavg_loss_dict = reduced_loss_dict else: expavg_loss_dict = {k: (reduced_loss_dict[k]*(1-loss_dict_smoothness))+(expavg_loss_dict[k]*loss_dict_smoothness) for k in expavg_loss_dict.keys()} expavg_loss_dict_iters += 1 if expavg_loss_dict_iters > 100: if best_loss_dict is None: best_loss_dict = expavg_loss_dict else: best_loss_dict = {k: min(best_loss_dict[k], expavg_loss_dict[k]) for k in best_loss_dict.keys()} if rank == 0: duration = time.time() - start_time if not is_overflow: average_loss = rolling_loss.process(reduced_loss) tqdm.write( f"{iteration} [Train_loss:{reduced_loss:.4f} Avg:{average_loss:.4f}] " f"[Grad Norm {grad_norm:.4f}] [{duration:.2f}s/it] " f"[{(duration/(hparams.batch_size*args.n_gpus)):.3f}s/file] " f"[{learning_rate:.7f} LR] [{loss_scale:.0f} LS]") logger.log_training(reduced_loss_dict, expavg_loss_dict, best_loss_dict, grad_norm, learning_rate, duration, iteration, teacher_force_till, p_teacher_forcing, drop_frame_rate) else: tqdm.write("Gradient Overflow, Skipping Step") start_time = time.time() if iteration%checkpoint_interval==0 or os.path.exists(save_file_check_path): # save model checkpoint like normal if rank == 0: checkpoint_path = os.path.join(args.output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, best_inf_attsc, average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, checkpoint_path) if iteration%dump_filelosses_interval==0: print("Updating File_losses dict!") file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'), args.n_gpus, rank) if (iteration % int(validation_interval) == 0) or (os.path.exists(save_file_check_path)) or (iteration < 1000 and (iteration % 250 == 0)): if rank == 0 and os.path.exists(save_file_check_path): os.remove(save_file_check_path) # perform validation and save "best_val_model" depending on validation loss val_loss, best_val_loss_dict, file_losses = validate(hparams, args, file_losses, model, criterion, valset, best_val_loss_dict, iteration, collate_fn, logger, val_teacher_force_till, val_p_teacher_forcing, teacher_force=0)# validate/teacher_force file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'), args.n_gpus, rank) valatt_loss, *_ = validate(hparams, args, file_losses, model, criterion, valset, best_val_loss_dict, iteration, collate_fn, logger, 0, 0.0, teacher_force=2)# infer if use_scheduler: scheduler.step(val_loss) if (val_loss < best_validation_loss): best_validation_loss = val_loss 
if rank == 0 and hparams.save_best_val_model: checkpoint_path = os.path.join(args.output_directory, "best_val_model") save_checkpoint( model, optimizer, learning_rate, iteration, hparams, best_validation_loss, max(best_inf_attsc, val_loss), average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, checkpoint_path) if (valatt_loss > best_inf_attsc): best_inf_attsc = valatt_loss if rank == 0 and hparams.save_best_inf_attsc: checkpoint_path = os.path.join(args.output_directory, "best_inf_attsc") save_checkpoint( model, optimizer, learning_rate, iteration, hparams, best_validation_loss, best_inf_attsc, average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, checkpoint_path) just_did_val = True iteration += 1 # end of iteration loop # update filelist of training dataloader if (iteration > hparams.min_avg_max_att_start) and (iteration-checkpoint_iter >= dataset_len): print("Updating File_losses dict!") file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'), args.n_gpus, rank) print("Done!") print("Updating dataloader filtered paths!") bad_file_paths = [k for k in list(file_losses.keys()) if file_losses[k]['avg_max_attention'] < hparams.min_avg_max_att or# if attention stength if too weak file_losses[k]['att_diagonality'] > hparams.max_diagonality or# or diagonality is too high file_losses[k]['spec_MSE'] > hparams.max_spec_mse] # or audio quality is too low # then add to bad files list bad_file_paths = set(bad_file_paths) # and remove from dataset filted_filelist = [x for x in train_loader.dataset.filelist if not (x[0] in bad_file_paths)] train_loader.dataset.update_filelist(filted_filelist) print(f"Done! {len(bad_file_paths)} Files removed from dataset. {len(filted_filelist)} Files remain.") del filted_filelist, bad_file_paths if iteration > hparams.speaker_mse_sampling_start: print("Updating dataset with speaker MSE Sampler!") if original_filelist is None: original_filelist = train_loader.dataset.filelist train_loader.dataset.update_filelist(get_mse_sampled_filelist( original_filelist, file_losses, hparams.speaker_mse_exponent, seed=iteration)) print("Done!") # end of epoch loop training = False # exit the While loop #except Exception as ex: # print Exception and continue from checkpoint. (turns out it takes < 4 seconds to restart like this, f*****g awesome) except LossExplosion as ex: # print Exception and continue from checkpoint. (turns out it takes < 4 seconds to restart like this, f*****g awesome) print(ex) # print Loss checkpoint_path = os.path.join(args.output_directory, "best_val_model") assert os.path.exists(checkpoint_path), "best_val_model checkpoint must exist for automatic restarts" if hparams.fp16_run: amp._amp_state.loss_scalers[0]._loss_scale = 32768 # clearing VRAM for load checkpoint model.zero_grad() x=y=y_pred=loss=len_loss=loss_z=loss_w=loss_s=loss_att=dur_loss_z=dur_loss_w=dur_loss_s=None torch.cuda.empty_cache() model.eval() model, optimizer, _learning_rate, iteration, best_validation_loss, saved_lookup = load_checkpoint(checkpoint_path, model, optimizer) learning_rate = optimizer.param_groups[0]['lr'] epoch_offset = max(0, int(iteration / len(train_loader))) model.train() checkpoint_iter = iteration iteration += 1 n_restarts += 1 except KeyboardInterrupt as ex: print(ex)
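# --- Hedged sketch of the auto-restart pattern used above: training raises a custom
# LossExplosion exception when the loss diverges (or the AMP loss scale collapses), and the
# outer while-loop catches it, reloads the "best_val_model" checkpoint, and continues.
# `train_some_epochs` and the checkpoint reload are stand-ins for the real loops above.
class LossExplosion(Exception):
    """Raised when the loss diverges and the run should restart from a checkpoint."""

def train_some_epochs(restart_count):
    # stand-in for the epoch/iteration loops; pretend the loss explodes on the first attempt
    if restart_count == 0:
        raise LossExplosion("Loss reached 1e6, restarting from best_val_model")

def run_training():
    training, n_restarts = True, 0
    while training:
        try:
            train_some_epochs(n_restarts)
            training = False                 # finished normally, leave the while loop
        except LossExplosion as ex:
            print(ex)                        # reload the best checkpoint here, then continue
            n_restarts += 1

run_training()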
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ #rank += 4 if hparams.distributed_run: init_distributed(hparams, rank, group_name) print('checkpoint path: {}'.format(checkpoint_path)) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) model = load_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize( model, optimizer, opt_level='O1') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() logger = prepare_directories_and_logger( output_directory, log_directory, rank) train_loader, valset, collate_fn = prepare_dataloaders(hparams) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model( checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.fp16_run: checkpoint = torch.load(checkpoint_path, map_location='cpu') amp_state_dict = checkpoint['amp'] amp.load_state_dict(checkpoint['amp']) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) #print('HERE') model.train() is_overflow = False # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start = time.perf_counter() for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) #print('X value') #from hashlib import sha1 #np_x = x[0].data.cpu().numpy() #foo = sha1(np_x) #print(foo.hexdigest()) y_pred = model(x) loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, hparams.world_size).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training( reduced_loss, grad_norm, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): #validate(model, criterion, valset, iteration, # hparams.batch_size, hparams.world_size, collate_fn, logger, # hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path, hparams.fp16_run, amp) wandb.save(checkpoint_path) iteration += 1
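# --- Hedged sketch: this variant restores apex AMP state with
# amp.load_state_dict(checkpoint['amp']) above, so its checkpoints must also contain the
# AMP (loss-scaler) state. One plausible save_checkpoint consistent with the call
# save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path,
# hparams.fp16_run, amp) above, following the apex checkpointing recipe (not the repo's
# actual function), is:
import torch

def save_checkpoint(model, optimizer, learning_rate, iteration, filepath, fp16_run, amp):
    state = {'iteration': iteration,
             'state_dict': model.state_dict(),
             'optimizer': optimizer.state_dict(),
             'learning_rate': learning_rate}
    if fp16_run:
        state['amp'] = amp.state_dict()  # loss-scaler state, needed to resume fp16 runs cleanly
    torch.save(state, filepath)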
def train(output_directory, log_directory, checkpoint_path, warm_start, warm_start_force, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ # setup distributed hparams.n_gpus = n_gpus hparams.rank = rank if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) # reproducablilty stuffs torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) # initialize blank model model = load_model(hparams) model.eval() learning_rate = hparams.learning_rate # (optional) show the names of each layer in model, mainly makes it easier to copy/paste what you want to adjust if hparams.print_layer_names_during_startup: print(*[f"Layer{i} = "+str(x[0])+" "+str(x[1].shape) for i,x in enumerate(list(model.named_parameters()))], sep="\n") # (optional) Freeze layers by disabling grads if len(hparams.frozen_modules): for layer, params in list(model.named_parameters()): if any(layer.startswith(module) for module in hparams.frozen_modules): params.requires_grad = False print(f"Layer: {layer} has been frozen") # define optimizer (any params without requires_grad are ignored) optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, weight_decay=hparams.weight_decay) #optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss(hparams) logger = prepare_directories_and_logger( output_directory, log_directory, rank) # Load checkpoint if one exists best_validation_loss = 0.8 # used to see when "best_model" should be saved, default = 0.4, load_checkpoint will update to last best value. 
iteration = 0 epoch_offset = 0 _learning_rate = 1e-3 saved_lookup = None if checkpoint_path is not None: if warm_start: model, iteration, saved_lookup = warm_start_model( checkpoint_path, model, hparams.ignore_layers) elif warm_start_force: model, iteration, saved_lookup = warm_start_force_model( checkpoint_path, model) else: model, optimizer, _learning_rate, iteration, best_validation_loss, saved_lookup = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 print('Model Loaded') # define datasets/dataloaders train_loader, valset, collate_fn, train_sampler, trainset = prepare_dataloaders(hparams, saved_lookup) epoch_offset = max(0, int(iteration / len(train_loader))) speaker_lookup = trainset.speaker_ids # load and/or generate global_mean if hparams.drop_frame_rate > 0.: if rank != 0: # if global_mean not yet calcuated, wait for main thread to do it while not os.path.exists(hparams.global_mean_npy): time.sleep(1) global_mean = calculate_global_mean(train_loader, hparams.global_mean_npy, hparams) hparams.global_mean = global_mean model.global_mean = global_mean # define scheduler use_scheduler = 0 if use_scheduler: scheduler = ReduceLROnPlateau(optimizer, factor=0.1**(1/5), patience=10) model.train() is_overflow = False validate_then_terminate = 0 if validate_then_terminate: val_loss = validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) raise Exception("Finished Validation") for param_group in optimizer.param_groups: param_group['lr'] = learning_rate rolling_loss = StreamingMovingAverage(min(int(len(train_loader)), 200)) # ================ MAIN TRAINNIG LOOP! =================== for epoch in tqdm(range(epoch_offset, hparams.epochs), initial=epoch_offset, total=hparams.epochs, desc="Epoch:", position=1, unit="epoch"): tqdm.write("Epoch:{}".format(epoch)) if hparams.distributed_run: # shuffles the train_loader when doing multi-gpu training train_sampler.set_epoch(epoch) start_time = time.time() # start iterating through the epoch for i, batch in tqdm(enumerate(train_loader), desc="Iter: ", smoothing=0, total=len(train_loader), position=0, unit="iter"): # run external code every epoch, allows the run to be adjusting without restarts if (iteration % 1000 == 0 or i==0): try: with open("run_every_epoch.py") as f: internal_text = str(f.read()) if len(internal_text) > 0: print(internal_text) #code = compile(internal_text, "run_every_epoch.py", 'exec') ldict = {'iteration': iteration} exec(internal_text, globals(), ldict) print("Custom code excecuted\nPlease remove code if it was intended to be ran once.") else: print("No Custom code found, continuing without changes.") except Exception as ex: print(f"Custom code FAILED to run!\n{ex}") globals().update(ldict) locals().update(ldict) print("decay_start is ",decay_start) print("A_ is ",A_) print("B_ is ",B_) print("C_ is ",C_) print("min_learning_rate is ",min_learning_rate) print("epochs_between_updates is ",epochs_between_updates) print("drop_frame_rate is ",drop_frame_rate) print("p_teacher_forcing is ",p_teacher_forcing) print("teacher_force_till is ",teacher_force_till) print("val_p_teacher_forcing is ",val_p_teacher_forcing) print("val_teacher_force_till is ",val_teacher_force_till) print("grad_clip_thresh is ",grad_clip_thresh) if epoch % epochs_between_updates == 0 or epoch_offset == epoch: #if None: tqdm.write("Old learning rate 
[{:.6f}]".format(learning_rate)) if iteration < decay_start: learning_rate = A_ + C_ else: iteration_adjusted = iteration - decay_start learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_ learning_rate = max(min_learning_rate, learning_rate) # output the largest number tqdm.write("Changing Learning Rate to [{:.6f}]".format(learning_rate)) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate # /run external code every epoch, allows the run to be adjusting without restarts/ model.zero_grad() x, y = model.parse_batch(batch) # move batch to GPU (async) y_pred = model(x, teacher_force_till=teacher_force_till, p_teacher_forcing=p_teacher_forcing, drop_frame_rate=drop_frame_rate) loss, gate_loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() reduced_gate_loss = reduce_tensor(gate_loss.data, n_gpus).item() else: reduced_loss = loss.item() reduced_gate_loss = gate_loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), grad_clip_thresh) is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), grad_clip_thresh) optimizer.step() for j, param_group in enumerate(optimizer.param_groups): learning_rate = (float(param_group['lr'])); break if iteration < decay_start: learning_rate = A_ + C_ else: iteration_adjusted = iteration - decay_start learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_ learning_rate = max(min_learning_rate, learning_rate) # output the largest number for param_group in optimizer.param_groups: param_group['lr'] = learning_rate if not is_overflow and rank == 0: duration = time.time() - start_time average_loss = rolling_loss.process(reduced_loss) tqdm.write("{} [Train_loss {:.4f} Avg {:.4f}] [Gate_loss {:.4f}] [Grad Norm {:.4f}] " "[{:.2f}s/it] [{:.3f}s/file] [{:.7f} LR]".format( iteration, reduced_loss, average_loss, reduced_gate_loss, grad_norm, duration, (duration/(hparams.batch_size*n_gpus)), learning_rate)) if iteration % 20 == 0: diagonality, avg_prob = alignment_metric(x, y_pred) logger.log_training( reduced_loss, grad_norm, learning_rate, duration, iteration, teacher_force_till, p_teacher_forcing, diagonality=diagonality, avg_prob=avg_prob) else: logger.log_training( reduced_loss, grad_norm, learning_rate, duration, iteration, teacher_force_till, p_teacher_forcing) start_time = time.time() if is_overflow and rank == 0: tqdm.write("Gradient Overflow, Skipping Step") if not is_overflow and ((iteration % (hparams.iters_per_checkpoint/1) == 0) or (os.path.exists(save_file_check_path))): # save model checkpoint like normal if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, average_loss, speaker_lookup, checkpoint_path) if not is_overflow and ((iteration % int((hparams.iters_per_validation)/1) == 0) or (os.path.exists(save_file_check_path)) or (iteration < 1000 and (iteration % 250 == 0))): if rank == 0 and os.path.exists(save_file_check_path): os.remove(save_file_check_path) # perform validation and save "best_model" depending on validation loss val_loss = validate(model, criterion, valset, iteration, hparams.val_batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank, val_teacher_force_till, 
val_p_teacher_forcing, teacher_force=1) #teacher_force val_loss = validate(model, criterion, valset, iteration, hparams.val_batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank, val_teacher_force_till, val_p_teacher_forcing, teacher_force=2) #infer val_loss = validate(model, criterion, valset, iteration, hparams.val_batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank, val_teacher_force_till, val_p_teacher_forcing, teacher_force=0) #validate (0.8 forcing) if use_scheduler: scheduler.step(val_loss) if (val_loss < best_validation_loss): best_validation_loss = val_loss if rank == 0: checkpoint_path = os.path.join(output_directory, "best_model") save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, average_loss, speaker_lookup, checkpoint_path) iteration += 1
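# --- Hedged sketch: the schedule applied above (and re-applied after each optimizer.step)
# is an exponential decay from A_ + C_ toward the floor C_, clamped at min_learning_rate:
#   lr(t) = A_ * exp(-(t - decay_start) / B_) + C_   for t >= decay_start, else A_ + C_.
# The helper below just restates that formula; A_, B_, C_ etc. come from run_every_epoch.py.
from math import e

def decayed_lr(iteration, A_, B_, C_, decay_start, min_learning_rate):
    if iteration < decay_start:
        lr = A_ + C_
    else:
        lr = A_ * (e ** (-(iteration - decay_start) / B_)) + C_
    return max(min_learning_rate, lr)

# e.g. decayed_lr(0, 1e-3, 5e4, 1e-5, 1e4, 1e-5) == 1e-3 + 1e-5  (before decay_start)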
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) model = load_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() logger = prepare_directories_and_logger(output_directory, log_directory, rank) train_loader, valset, collate_fn = prepare_dataloaders(hparams) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() is_overflow = False skipped = 0 # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start = time.perf_counter() for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) try: y_pred = model(x) except ValueError: skipped += 1 print( 'Skipped an iteration due to value error, you have now skipped {} iterations' .format(skipped)) iteration += 1 continue loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0) and iteration > 0: validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
def train(output_directory, log_directory, checkpoint_path, warm_start, warm_start_force, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ # setup distributed hparams.n_gpus = n_gpus hparams.rank = rank if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) # reproducablilty stuffs torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) # initialize blank model print('Initializing UnTTS...') model = load_model(hparams) print('Done') model.eval() learning_rate = hparams.learning_rate # (optional) show the names of each layer in model, mainly makes it easier to copy/paste what you want to adjust if hparams.print_layer_names_during_startup: print(*[ f"Layer{i} = " + str(x[0]) + " " + str(x[1].shape) for i, x in enumerate(list(model.named_parameters())) ], sep="\n") # (optional) Freeze layers by disabling grads if len(hparams.frozen_modules): for layer, params in list(model.named_parameters()): if any( layer.startswith(module) for module in hparams.frozen_modules): params.requires_grad = False print(f"Layer: {layer} has been frozen") # define optimizer (any params without requires_grad are ignored) #optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, weight_decay=hparams.weight_decay) optimizer = apexopt.FusedAdam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, weight_decay=hparams.weight_decay) if True and rank == 0: pytorch_total_params = sum(p.numel() for p in model.parameters()) print("{:,} total parameters in model".format(pytorch_total_params)) pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) print("{:,} trainable parameters.".format(pytorch_total_params)) if hparams.fp16_run: model, optimizer = amp.initialize( model, optimizer, opt_level=f'O{hparams.fp16_run_optlvl}') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss(hparams) logger = prepare_directories_and_logger(hparams, output_directory, log_directory, rank) # Load checkpoint if one exists best_validation_loss = 1e3 # used to see when "best_model" should be saved n_restarts = 0 checkpoint_iter = 0 iteration = 0 epoch_offset = 0 _learning_rate = 1e-3 saved_lookup = None global best_val_loss_dict best_val_loss_dict = None global best_loss_dict best_loss_dict = None global expavg_loss_dict expavg_loss_dict = None expavg_loss_dict_iters = 0 # initial iters expavg_loss_dict has been fitted loss_dict_smoothness = 0.95 # smoothing factor if checkpoint_path is not None: if warm_start: model, iteration, saved_lookup = warm_start_model( checkpoint_path, model, hparams.ignore_layers) elif warm_start_force: model, iteration, saved_lookup = warm_start_force_model( checkpoint_path, model) else: _ = load_checkpoint(checkpoint_path, model, optimizer, best_val_loss_dict, best_loss_dict) model, optimizer, _learning_rate, iteration, best_validation_loss, saved_lookup, best_val_loss_dict, best_loss_dict = _ if hparams.use_saved_learning_rate: learning_rate = _learning_rate checkpoint_iter = iteration iteration += 1 # next iteration is iteration + 1 print('Model Loaded') # define datasets/dataloaders train_loader, 
valset, collate_fn, train_sampler, trainset = prepare_dataloaders( hparams, saved_lookup) epoch_offset = max(0, int(iteration / len(train_loader))) speaker_lookup = trainset.speaker_ids # load and/or generate global_mean if hparams.drop_frame_rate > 0.: if rank != 0: # if global_mean not yet calcuated, wait for main thread to do it while not os.path.exists(hparams.global_mean_npy): time.sleep(1) global_mean = calculate_global_mean(train_loader, hparams.global_mean_npy, hparams) hparams.global_mean = global_mean model.global_mean = global_mean # define scheduler use_scheduler = 0 if use_scheduler: scheduler = ReduceLROnPlateau(optimizer, factor=0.1**(1 / 5), patience=10) model.train() is_overflow = False validate_then_terminate = 0 if validate_then_terminate: val_loss = validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) raise Exception("Finished Validation") for param_group in optimizer.param_groups: param_group['lr'] = learning_rate rolling_loss = StreamingMovingAverage(min(int(len(train_loader)), 200)) # ================ MAIN TRAINNIG LOOP! =================== training = True while training: try: for epoch in tqdm(range(epoch_offset, hparams.epochs), initial=epoch_offset, total=hparams.epochs, desc="Epoch:", position=1, unit="epoch"): tqdm.write("Epoch:{}".format(epoch)) if hparams.distributed_run: # shuffles the train_loader when doing multi-gpu training train_sampler.set_epoch(epoch) start_time = time.time() # start iterating through the epoch for i, batch in tqdm(enumerate(train_loader), desc="Iter: ", smoothing=0, total=len(train_loader), position=0, unit="iter"): # run external code every iter, allows the run to be adjusted without restarts if (i == 0 or iteration % param_interval == 0): try: with open("run_every_epoch.py") as f: internal_text = str(f.read()) if len(internal_text) > 0: #code = compile(internal_text, "run_every_epoch.py", 'exec') ldict = { 'iteration': iteration, 'checkpoint_iter': checkpoint_iter, 'n_restarts': n_restarts } exec(internal_text, globals(), ldict) else: print( "[info] tried to execute 'run_every_epoch.py' but it is empty" ) except Exception as ex: print( f"[warning] 'run_every_epoch.py' FAILED to execute!\nException:\n{ex}" ) globals().update(ldict) locals().update(ldict) if show_live_params: print(internal_text) n_restarts = n_restarts_override if ( n_restarts_override is not None) else n_restarts or 0 # Learning Rate Schedule if custom_lr: if iteration < warmup_start: learning_rate = warmup_start_lr elif iteration < warmup_end: learning_rate = (iteration - warmup_start) * ( (A_ + C_) - warmup_start_lr ) / ( warmup_end - warmup_start ) + warmup_start_lr # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations. else: if iteration < decay_start: learning_rate = A_ + C_ else: iteration_adjusted = iteration - decay_start learning_rate = ( A_ * (e**(-iteration_adjusted / B_))) + C_ assert learning_rate > -1e-8, "Negative Learning Rate." 
if decrease_lr_on_restart: learning_rate = learning_rate / (2**(n_restarts / 3)) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate # /run external code every epoch, allows the run to be adjusting without restarts/ model.zero_grad() x, y = model.parse_batch( batch) # move batch to GPU (async) y_pred = model(x) loss_scalars = { "MelGlow_ls": MelGlow_ls, "DurGlow_ls": DurGlow_ls, "VarGlow_ls": VarGlow_ls, "Sylps_ls": Sylps_ls, } loss_dict = criterion(y_pred, y, loss_scalars) loss = loss_dict['loss'] if hparams.distributed_run: reduced_loss_dict = { k: reduce_tensor(v.data, n_gpus).item() if v is not None else 0. for k, v in loss_dict.items() } else: reduced_loss_dict = { k: v.item() if v is not None else 0. for k, v in loss_dict.items() } reduced_loss = reduced_loss_dict['loss'] if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if grad_clip_thresh: if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), grad_clip_thresh) is_overflow = math.isinf(grad_norm) or math.isnan( grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), grad_clip_thresh) else: grad_norm = 0.0 optimizer.step() # get current Loss Scale of first optimizer loss_scale = amp._amp_state.loss_scalers[ 0]._loss_scale if hparams.fp16_run else 32768. # restart if training/model has collapsed if (iteration > 1e3 and (reduced_loss > LossExplosionThreshold)) or ( math.isnan(reduced_loss)) or (loss_scale < 1 / 4): raise LossExplosion( f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n" ) if expavg_loss_dict is None: expavg_loss_dict = reduced_loss_dict else: expavg_loss_dict = { k: (reduced_loss_dict[k] * (1 - loss_dict_smoothness)) + (expavg_loss_dict[k] * loss_dict_smoothness) for k in expavg_loss_dict.keys() } expavg_loss_dict_iters += 1 if expavg_loss_dict_iters > 100: if best_loss_dict is None: best_loss_dict = expavg_loss_dict else: best_loss_dict = { k: min(best_loss_dict[k], expavg_loss_dict[k]) for k in best_loss_dict.keys() } if rank == 0: duration = time.time() - start_time if not is_overflow: average_loss = rolling_loss.process(reduced_loss) tqdm.write( "{} [Train_loss:{:.4f} Avg:{:.4f}] [Grad Norm {:.4f}] " "[{:.2f}s/it] [{:.3f}s/file] [{:.7f} LR] [{} LS]" .format(iteration, reduced_loss, average_loss, grad_norm, duration, (duration / (hparams.batch_size * n_gpus)), learning_rate, round(loss_scale))) logger.log_training(reduced_loss_dict, expavg_loss_dict, best_loss_dict, grad_norm, learning_rate, duration, iteration) else: tqdm.write("Gradient Overflow, Skipping Step") start_time = time.time() if not is_overflow and ( (iteration % (hparams.iters_per_checkpoint / 1) == 0) or (os.path.exists(save_file_check_path))): # save model checkpoint like normal if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, checkpoint_path) if not is_overflow and ( (iteration % int(validation_interval) == 0) or (os.path.exists(save_file_check_path)) or (iteration < 1000 and (iteration % 250 == 0))): if rank == 0 and os.path.exists(save_file_check_path): os.remove(save_file_check_path) # perform validation and save "best_model" depending on validation loss val_loss, best_val_loss_dict = validate( model, criterion, valset, 
loss_scalars, best_val_loss_dict, iteration, hparams.val_batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) #validate (0.8 forcing) if use_scheduler: scheduler.step(val_loss) if (val_loss < best_validation_loss): best_validation_loss = val_loss if rank == 0: checkpoint_path = os.path.join( output_directory, "best_model") save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, checkpoint_path) iteration += 1 # end of iteration loop # end of epoch loop training = False # exit the While loop #except Exception as ex: # print Exception and continue from checkpoint. (turns out it takes < 4 seconds to restart like this, f*****g awesome) except LossExplosion as ex: # print Exception and continue from checkpoint. (turns out it takes < 4 seconds to restart like this, f*****g awesome) print(ex) # print Loss checkpoint_path = os.path.join(output_directory, "best_model") assert os.path.exists( checkpoint_path ), "best_model checkpoint must exist for automatic restarts" if hparams.fp16_run: amp._amp_state.loss_scalers[0]._loss_scale = 32768 # clearing VRAM for load checkpoint model.zero_grad() x = y = y_pred = loss = len_loss = loss_z = loss_w = loss_s = loss_att = dur_loss_z = dur_loss_w = dur_loss_s = None torch.cuda.empty_cache() model.eval() model, optimizer, _learning_rate, iteration, best_validation_loss, saved_lookup = load_checkpoint( checkpoint_path, model, optimizer) learning_rate = optimizer.param_groups[0]['lr'] epoch_offset = max(0, int(iteration / len(train_loader))) model.train() checkpoint_iter = iteration iteration += 1 n_restarts += 1
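# ---------------------------------------------------------------------------
# Hedged sketch: the exponential-moving-average loss tracking used in the
# training loop above (expavg_loss_dict / best_loss_dict with
# loss_dict_smoothness = 0.95), pulled out into a standalone helper so it can
# be tested in isolation.  The names `LossTracker`, `smoothing` and
# `warmup_iters` are illustrative, not part of the original code, which keeps
# the same state in module-level globals instead.
# ---------------------------------------------------------------------------
class LossTracker:
    """Tracks a smoothed copy of every loss term and the best smoothed value
    seen so far, mirroring expavg_loss_dict / best_loss_dict above."""

    def __init__(self, smoothing=0.95, warmup_iters=100):
        self.smoothing = smoothing        # same role as loss_dict_smoothness
        self.warmup_iters = warmup_iters  # iters before "best" is trusted
        self.expavg = None                # smoothed loss dict
        self.best = None                  # per-key minimum of the smoothed dict
        self.iters = 0

    def update(self, reduced_loss_dict):
        if self.expavg is None:
            # first iteration: adopt the raw losses as the initial average
            self.expavg = dict(reduced_loss_dict)
        else:
            self.expavg = {
                k: reduced_loss_dict[k] * (1 - self.smoothing)
                   + self.expavg[k] * self.smoothing
                for k in self.expavg
            }
            self.iters += 1
            # only start recording "best" once the average has settled
            if self.iters > self.warmup_iters:
                if self.best is None:
                    self.best = dict(self.expavg)
                else:
                    self.best = {k: min(self.best[k], self.expavg[k])
                                 for k in self.best}
        return self.expavg, self.best


if __name__ == "__main__":
    # toy usage: feed a decaying loss and watch the smoothed/best values follow
    tracker = LossTracker(smoothing=0.95, warmup_iters=5)
    for it in range(20):
        expavg, best = tracker.update({"loss": 1.0 / (it + 1), "gate_loss": 0.1})
    print(expavg, best)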
def train(input_directory, output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) # torch.cuda.manual_seed(hparams.seed) model = load_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() logger = prepare_directories_and_logger(output_directory, log_directory, rank) # 记录训练的元数据。 meta_folder = os.path.join(output_directory, 'metadata') os.makedirs(meta_folder, exist_ok=True) trpath = os.path.join(meta_folder, "train.txt") vapath = os.path.join(meta_folder, "validation.txt") with open(trpath, 'wt', encoding='utf8') as fout_tr, open(vapath, 'wt', encoding='utf8') as fout_va: lines = open(input_directory, encoding='utf8').readlines() val_ids = set( np.random.choice(list(range(len(lines))), hparams.batch_size * 2, replace=False)) for num, line in enumerate(lines): parts = line.strip().split('\t') abspath = os.path.join( os.path.dirname(os.path.abspath(input_directory)), parts[0]).replace('\\', '/') text = parts[1] if len(parts) >= 3: speaker = parts[2] else: speaker = '0' out = f'{abspath}\t{text}\t{speaker}\n' if num in val_ids: fout_va.write(out) else: fout_tr.write(out) train_loader, valset, collate_fn, train_sampler = prepare_dataloaders( meta_folder, hparams) path = os.path.join(meta_folder, "speakers.json") obj = dict(valset.speaker_ids) json_dump(obj, path) path = os.path.join(meta_folder, "hparams.json") obj = {k: v for k, v in hparams.items()} json_dump(obj, path) path = os.path.join(meta_folder, "symbols.json") from text.symbols import symbols obj = {w: i for i, w in enumerate(symbols)} json_dump(obj, path) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() is_overflow = False # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) if train_sampler is not None: train_sampler.set_epoch(epoch) for i, batch in enumerate( tqdm(train_loader, desc=f"Epoch-{epoch}", ncols=100)): start = time.perf_counter() if iteration > 0 and iteration % hparams.learning_rate_anneal == 0: learning_rate = max(hparams.learning_rate_min, learning_rate * 0.5) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) y_pred = model(x) loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() duration = time.perf_counter() - start if not is_overflow and rank == 0: logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank, outdir=Path(output_directory), hparams=hparams) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint-{:06d}.pt".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
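# ---------------------------------------------------------------------------
# Hedged sketch: the step-wise learning-rate anneal used in the loop above
# (halve the LR every hparams.learning_rate_anneal iterations, never dropping
# below hparams.learning_rate_min), written as a pure function of the
# iteration counter.  `base_lr`, `lr_min` and `anneal_every` stand in for the
# corresponding hparams; the closed form is my own framing — the original
# mutates `learning_rate` in place once per threshold crossing.
# ---------------------------------------------------------------------------
def annealed_lr(iteration, base_lr=1e-3, lr_min=1e-5, anneal_every=50000):
    """Return the learning rate after `iteration` steps of repeated halving."""
    n_halvings = iteration // anneal_every
    return max(lr_min, base_lr * (0.5 ** n_halvings))


# usage inside a training loop (optimizer assumed to be a torch optimizer):
#   for param_group in optimizer.param_groups:
#       param_group['lr'] = annealed_lr(iteration)
if __name__ == "__main__":
    for it in (0, 50000, 100000, 400000):
        print(it, annealed_lr(it))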
def load_checkpoint(checkpoint_path, model, optimizer): assert os.path.isfile(checkpoint_path) print("Loading checkpoint '{}'".format(checkpoint_path)) checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') state_dict = {k.replace("encoder_speaker_embedding.weight","encoder.encoder_speaker_embedding.weight"): v for k,v in torch.load(checkpoint_path)['state_dict'].items()} model.load_state_dict(state_dict) # tmp for updating old models #model.load_state_dict(checkpoint_dict['state_dict']) # original if 'optimizer' in checkpoint_dict.keys(): optimizer.load_state_dict(checkpoint_dict['optimizer']) if 'amp' in checkpoint_dict.keys(): amp.load_state_dict(checkpoint_dict['amp']) if 'learning_rate' in checkpoint_dict.keys(): learning_rate = checkpoint_dict['learning_rate'] #if 'hparams' in checkpoint_dict.keys(): hparams = checkpoint_dict['hparams'] if 'best_validation_loss' in checkpoint_dict.keys(): best_validation_loss = checkpoint_dict['best_validation_loss'] if 'average_loss' in checkpoint_dict.keys(): average_loss = checkpoint_dict['average_loss'] if (start_from_checkpoints_from_zero): iteration = 0 else: iteration = checkpoint_dict['iteration'] print("Loaded checkpoint '{}' from iteration {}" .format( checkpoint_path, iteration)) return model, optimizer, learning_rate, iteration, best_validation_loss def save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, average_loss, speaker_id_lookup, filepath): from utils import load_filepaths_and_text tqdm.write("Saving model and optimizer state at iteration {} to {}".format( iteration, filepath)) # get speaker names to ID speakerlist = load_filepaths_and_text(hparams.speakerlist) speaker_name_lookup = {x[2]: speaker_id_lookup[x[3]] for x in speakerlist} torch.save({'iteration': iteration, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'learning_rate': learning_rate, #'amp': amp.state_dict(), 'hparams': hparams, 'speaker_id_lookup': speaker_id_lookup, 'speaker_name_lookup': speaker_name_lookup, 'best_validation_loss': best_validation_loss, 'average_loss': average_loss}, filepath) tqdm.write("Saving Complete") def validate(model, criterion, valset, iteration, batch_size, n_gpus, collate_fn, logger, distributed_run, rank, val_teacher_force_till, val_p_teacher_forcing, teacher_force=1): """Handles all the validation scoring and printing""" model.eval() with torch.no_grad(): val_sampler = DistributedSampler(valset) if distributed_run else None val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1, shuffle=False, batch_size=batch_size, pin_memory=False, drop_last=True, collate_fn=collate_fn) if teacher_force == 1: val_teacher_force_till = 0 val_p_teacher_forcing = 1.0 elif teacher_force == 2: val_teacher_force_till = 0 val_p_teacher_forcing = 0.0 val_loss = 0.0 diagonality = torch.zeros(1) avg_prob = torch.zeros(1) for i, batch in tqdm(enumerate(val_loader), desc="Validation", total=len(val_loader), smoothing=0): # i = index, batch = stuff in array[i] x, y = model.parse_batch(batch) y_pred = model(x, teacher_force_till=val_teacher_force_till, p_teacher_forcing=val_p_teacher_forcing) rate, prob = alignment_metric(x, y_pred) diagonality += rate avg_prob += prob loss, gate_loss = criterion(y_pred, y) if distributed_run: reduced_val_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_val_loss = loss.item() val_loss += reduced_val_loss # end forloop val_loss = val_loss / (i + 1) diagonality = (diagonality / (i + 1)).item() avg_prob = (avg_prob / (i + 1)).item() # end 
torch.no_grad() model.train() if rank == 0: tqdm.write("Validation loss {}: {:9f} Average Max Attention: {:9f}".format(iteration, val_loss, avg_prob)) #logger.log_validation(val_loss, model, y, y_pred, iteration) if True:#iteration != 0: if teacher_force == 1: logger.log_teacher_forced_validation(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob) elif teacher_force == 2: logger.log_infer(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob) else: logger.log_validation(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob) return val_loss def calculate_global_mean(data_loader, global_mean_npy, hparams): if global_mean_npy and os.path.exists(global_mean_npy): global_mean = np.load(global_mean_npy) return to_gpu(torch.tensor(global_mean).half()) if hparams.fp16_run else to_gpu(torch.tensor(global_mean).float()) sums = [] frames = [] print('calculating global mean...') for i, batch in tqdm(enumerate(data_loader), total=len(data_loader), smoothing=0.001): text_padded, input_lengths, mel_padded, gate_padded,\ output_lengths, speaker_ids, torchmoji_hidden, preserve_decoder_states = batch # padded values are 0. sums.append(mel_padded.double().sum(dim=(0, 2))) frames.append(output_lengths.double().sum()) global_mean = sum(sums) / sum(frames) global_mean = to_gpu(global_mean.half()) if hparams.fp16_run else to_gpu(global_mean.float()) if global_mean_npy: np.save(global_mean_npy, global_mean.cpu().numpy()) return global_mean def train(output_directory, log_directory, checkpoint_path, warm_start, warm_start_force, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ hparams.n_gpus = n_gpus hparams.rank = rank if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) train_loader, valset, collate_fn, train_sampler, trainset = prepare_dataloaders(hparams) speaker_lookup = trainset.speaker_ids if hparams.drop_frame_rate > 0.: if rank != 0: # if global_mean not yet calcuated, wait for main thread to do it while not os.path.exists(hparams.global_mean_npy): time.sleep(1) global_mean = calculate_global_mean(train_loader, hparams.global_mean_npy, hparams) hparams.global_mean = global_mean model = load_model(hparams) model.eval() # test if this is needed anymore learning_rate = hparams.learning_rate if hparams.Apex_optimizer: # apex optimizer is slightly faster with slightly more vram usage in my testing. Helps in both fp32 and fp16. 
optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) else: optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss(hparams) logger = prepare_directories_and_logger( output_directory, log_directory, rank) # Load checkpoint if one exists best_validation_loss = 0.8 # used to see when "best_model" should be saved, default = 0.8, load_checkpoint will update to last best value. iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model, iteration = warm_start_model( checkpoint_path, model, hparams.ignore_layers) elif warm_start_force: model, iteration = warm_start_force_model( checkpoint_path, model) else: model, optimizer, _learning_rate, iteration, best_validation_loss = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) print('Model Loaded') ## LEARNING RATE SCHEDULER if True: from torch.optim.lr_scheduler import ReduceLROnPlateau min_lr = 1e-5 factor = 0.1**(1/5) # amount to scale the LR by on Validation Loss plateau scheduler = ReduceLROnPlateau(optimizer, 'min', factor=factor, patience=20, cooldown=2, min_lr=min_lr, verbose=True) print("ReduceLROnPlateau used as (optional) Learning Rate Scheduler.") else: scheduler=False model.train() is_overflow = False validate_then_terminate = 0 # I use this for testing old models with new metrics if validate_then_terminate: val_loss = validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) raise Exception("Finished Validation") for param_group in optimizer.param_groups: param_group['lr'] = learning_rate rolling_loss = StreamingMovingAverage(min(int(len(train_loader)), 200)) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in tqdm(range(epoch_offset, hparams.epochs), initial=epoch_offset, total=hparams.epochs, desc="Epoch:", position=1, unit="epoch"): tqdm.write("Epoch:{}".format(epoch)) if hparams.distributed_run: # shuffles the train_loader when doing multi-gpu training train_sampler.set_epoch(epoch) start_time = time.time() # start iterating through the epoch for i, batch in tqdm(enumerate(train_loader), desc="Iter: ", smoothing=0, total=len(train_loader), position=0, unit="iter"): # run external code every iter, allows the run to be adjusted without restarts if (i==0 or iteration % param_interval == 0): try: with open("run_every_epoch.py") as f: internal_text = str(f.read()) if len(internal_text) > 0: #code = compile(internal_text, "run_every_epoch.py", 'exec') ldict = {'iteration': iteration} exec(internal_text, globals(), ldict) else: print("No Custom code found, continuing without changes.") except Exception as ex: print(f"Custom code FAILED to run!\n{ex}") globals().update(ldict) locals().update(ldict) if show_live_params: print(internal_text) if not iteration % 50: # check actual learning rate every 20 iters (because I sometimes see learning_rate variable go out-of-sync with real LR) learning_rate = optimizer.param_groups[0]['lr'] # Learning Rate Schedule if custom_lr: old_lr = learning_rate if iteration < warmup_start: learning_rate = warmup_start_lr elif iteration < warmup_end: learning_rate = (iteration-warmup_start)*((A_+C_)-warmup_start_lr)/(warmup_end-warmup_start) + warmup_start_lr # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations. else: if iteration < decay_start: learning_rate = A_ + C_ else: iteration_adjusted = iteration - decay_start learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_ assert learning_rate > -1e-8, "Negative Learning Rate." if old_lr != learning_rate: for param_group in optimizer.param_groups: param_group['lr'] = learning_rate else: scheduler.patience = scheduler_patience scheduler.cooldown = scheduler_cooldown if override_scheduler_last_lr: scheduler._last_lr = override_scheduler_last_lr print("Scheduler last_lr overriden. scheduler._last_lr =", scheduler._last_lr) if override_scheduler_best: scheduler.best = override_scheduler_best print("Scheduler best metric overriden. scheduler.best =", override_scheduler_best)
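# ---------------------------------------------------------------------------
# Hedged sketch: the "custom_lr" schedule driven by run_every_epoch.py in the
# loops above, as a standalone function.  Linear warm-up from warmup_start_lr
# to A_ + C_, a flat region until decay_start, then exponential decay
# A_ * e**(-t / B_) + C_.  The parameter names match the injected globals
# (A_, B_, C_, warmup_start, warmup_end, warmup_start_lr, decay_start); the
# default values are placeholders and packaging them as function arguments is
# my own framing, not how the original wires them in.
# ---------------------------------------------------------------------------
from math import e


def custom_lr(iteration, A_=1e-3, B_=30000, C_=0.0,
              warmup_start=0, warmup_end=1000, warmup_start_lr=1e-4,
              decay_start=40000):
    if iteration < warmup_start:
        lr = warmup_start_lr
    elif iteration < warmup_end:
        # linear ramp from warmup_start_lr up to A_ + C_
        lr = (iteration - warmup_start) * ((A_ + C_) - warmup_start_lr) \
             / (warmup_end - warmup_start) + warmup_start_lr
    elif iteration < decay_start:
        lr = A_ + C_
    else:
        # exponential decay towards the floor C_
        lr = A_ * (e ** (-(iteration - decay_start) / B_)) + C_
    assert lr > -1e-8, "Negative Learning Rate."
    return lr


if __name__ == "__main__":
    for it in (0, 500, 1000, 40000, 70000, 130000):
        print(it, custom_lr(it))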
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) model = initiate_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() logger = prepare_directories_and_logger(output_directory, log_directory, rank) single_train_loader, single_valset, single_collate_fn, single_train_sampler = prepare_single_dataloaders( hparams, output_directory) train_loader, valset, collate_fn, train_sampler = prepare_dataloaders( hparams, output_directory) single_train_loader.dataset.speaker_ids = train_loader.dataset.speaker_ids single_valset.speaker_ids = train_loader.dataset.speaker_ids # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: # model = torch.nn.DataParallel(model) model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(single_train_loader))) model = torch.nn.DataParallel(model) model.train() is_overflow = False # init training loop with single speaker for epoch in range(epoch_offset, 30): print("Epoch: {}".format(epoch)) if single_train_sampler is not None: single_train_sampler.set_epoch(epoch) for i, batch in enumerate(single_train_loader): start = time.perf_counter() if iteration > 0 and iteration % hparams.learning_rate_anneal == 0: learning_rate = max(hparams.learning_rate_min, learning_rate * 0.5) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = parse_batch(batch) mel_outputs, mel_outputs_postnet, gate_outputs, alignments, length = model( x) y_pred = parse_output( [mel_outputs, mel_outputs_postnet, gate_outputs, alignments], length) loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) if not is_overflow and (iteration % 
hparams.iters_per_checkpoint == 0): validate(model, criterion, single_valset, iteration, hparams.batch_size, n_gpus, single_collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model.module, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1 # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(30, hparams.epochs): print("Epoch: {}".format(epoch)) if train_sampler is not None: train_sampler.set_epoch(epoch) for i, batch in enumerate(train_loader): start = time.perf_counter() if iteration > 0 and iteration % hparams.learning_rate_anneal == 0: learning_rate = max(hparams.learning_rate_min, learning_rate * 0.5) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = parse_batch(batch) mel_outputs, mel_outputs_postnet, gate_outputs, alignments, length = model( x) y_pred = parse_output( [mel_outputs, mel_outputs_postnet, gate_outputs, alignments], length) loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model.module, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
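# ---------------------------------------------------------------------------
# Hedged sketch of why the loop above saves `model.module` rather than
# `model`: wrapping in torch.nn.DataParallel prefixes every state_dict key
# with "module.", so saving the wrapper would make checkpoints incompatible
# with an unwrapped model.  The tiny Linear model and the `unwrap` helper
# below are illustrative only.
# ---------------------------------------------------------------------------
import io

import torch


def unwrap(model):
    """Return the underlying module if `model` is a DataParallel wrapper."""
    return model.module if isinstance(model, torch.nn.DataParallel) else model


if __name__ == "__main__":
    net = torch.nn.Linear(4, 2)
    wrapped = torch.nn.DataParallel(net)
    print(list(net.state_dict())[0])      # 'weight'
    print(list(wrapped.state_dict())[0])  # 'module.weight'
    # saving the unwrapped module keeps checkpoint keys loadable either way
    buf = io.BytesIO()
    torch.save(unwrap(wrapped).state_dict(), buf)
    buf.seek(0)
    net.load_state_dict(torch.load(buf))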
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams, run_name, prj_name, resume): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) model = load_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() criterion_dom = torch.nn.CrossEntropyLoss() logger = prepare_directories_and_logger(output_directory, log_directory, rank, run_name, prj_name, resume) train_loader, valset, collate_fn = prepare_dataloaders(hparams) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() is_overflow = False for param_group in optimizer.param_groups: param_group['initial_lr'] = learning_rate scheduler = torch.optim.lr_scheduler.ExponentialLR( optimizer, 0.5**(1 / (125000 * (64 / hparams.batch_size))), last_epoch=-1) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): batches_per_epoch = len(train_loader) float_epoch = iteration / batches_per_epoch start = time.perf_counter() for param_group in optimizer.param_groups: param_group['lr'] = scheduler.get_lr()[0] model.zero_grad() x, y = model.parse_batch(batch) y_pred = model(x) loss = criterion(y_pred, y) if prj_name == "forward_attention_loss": input_lengths = x[1] alignments = y_pred[3] mean_far, _ = forward_attention_ratio(alignments, input_lengths) if mean_far > 0.95: fa_loss = forward_attention_loss(alignments, input_lengths) loss += fa_loss float_fa_loss = fa_loss.item() else: float_fa_loss = None else: float_fa_loss = None if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) learning_rate = scheduler.get_lr()[0] print("learning_rate:", learning_rate) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, x, y_pred, iteration, float_epoch, float_fa_loss) if not is_overflow and ( (iteration % hparams.iters_per_checkpoint == 0) or (i + 1 == batches_per_epoch)): validate(model, criterion, valset, iteration, float_epoch, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank, hparams.sampling_rate) if rank == 0 and (iteration % hparams.iters_per_checkpoint == 0): checkpoint_path = os.path.join( os.path.join(output_directory, prj_name, run_name), "checkpoint_{}-epoch_{:.4}".format( iteration, float_epoch)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) if rank == 0 and (i + 1 == batches_per_epoch): checkpoint_path = os.path.join( os.path.join(output_directory, prj_name, run_name), "checkpoint_{}-epoch_{:.4}_end-epoch_{}".format( iteration, float_epoch, epoch)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) if iteration > round(50000 * (64 / hparams.batch_size)): scheduler.step() iteration += 1
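# ---------------------------------------------------------------------------
# Hedged sketch: my reading of the ExponentialLR setup above.  The gamma
# 0.5 ** (1 / (125000 * (64 / batch_size))) halves the learning rate every
# 125,000 "batch-size-64-equivalent" iterations, and the scheduler is only
# stepped once the iteration count passes the scaled 50,000-step hold-off.
# The function below reproduces that schedule in closed form so the numbers
# can be checked; the names and defaults are illustrative.
# ---------------------------------------------------------------------------
def exp_decay_lr(iteration, base_lr=1e-3, batch_size=64,
                 hold_iters=50000, half_life=125000):
    scale = 64 / batch_size
    gamma = 0.5 ** (1 / (half_life * scale))
    decay_steps = max(0, iteration - round(hold_iters * scale))
    return base_lr * (gamma ** decay_steps)


if __name__ == "__main__":
    # after the hold-off plus one half-life, the LR should be ~half of base_lr
    print(exp_decay_lr(50000))            # still 1e-3 (hold-off)
    print(exp_decay_lr(50000 + 125000))   # ~5e-4
    print(exp_decay_lr(50000 + 250000))   # ~2.5e-4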
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) torch.nn.functional.sigmoid model = load_model(hparams) learning_rate = hparams.learning_rate #lr = args.lr * (0.1 ** (epoch // 30)) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) # optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, dampening=0, weight_decay=hparams.weight_decay) if hparams.fp16_run: optimizer = FP16_Optimizer( optimizer, dynamic_loss_scale=hparams.dynamic_loss_scaling) if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss(hparams) logger = prepare_directories_and_logger(output_directory, log_directory, rank) train_loader, valset, collate_fn = prepare_dataloaders(hparams) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model(checkpoint_path, model) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() # ================ MAIN TRAINNIG LOOP! =================== step = 0 for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start = time.perf_counter() for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) y_pred = model(x) loss, recon_loss, S_kl_loss, R_kl_loss, speaker_loss, augment_loss, alignment_loss = criterion( y_pred, y, iteration) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: optimizer.backward(loss) grad_norm = optimizer.clip_fp32_grads(hparams.grad_clip_thresh) else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() overflow = optimizer.overflow if hparams.fp16_run else False if not overflow and not math.isnan(reduced_loss) and rank == 0: duration = time.perf_counter() - start print( "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, recon_loss, S_kl_loss, R_kl_loss, \ speaker_loss, augment_loss, alignment_loss, iteration) if not overflow and (iteration % hparams.iters_per_checkpoint == 0): validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
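# ---------------------------------------------------------------------------
# Hedged sketch: the fp16 branch above relies on apex's legacy FP16_Optimizer
# (and other snippets use apex amp.scale_loss).  The same pattern — scale the
# loss, unscale before clipping, skip the step on overflow — can be written
# with PyTorch's built-in torch.cuda.amp.GradScaler, shown here under that
# assumption.  This is an equivalent sketch, not the code the snippets above
# actually run.
# ---------------------------------------------------------------------------
import math

import torch


def fp16_step(model, optimizer, scaler, loss, grad_clip_thresh):
    """One optimizer step with loss scaling, grad clipping and overflow skip."""
    optimizer.zero_grad()
    scaler.scale(loss).backward()   # backprop on the scaled loss
    scaler.unscale_(optimizer)      # grads back in fp32 units for clipping
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               grad_clip_thresh)
    is_overflow = math.isnan(grad_norm) or math.isinf(grad_norm)
    scaler.step(optimizer)          # skips the step internally on overflow
    scaler.update()                 # adjusts the loss scale
    return grad_norm, is_overflow


if __name__ == "__main__" and torch.cuda.is_available():
    net = torch.nn.Linear(8, 1).cuda()
    opt = torch.optim.Adam(net.parameters(), lr=1e-3)
    scaler = torch.cuda.amp.GradScaler()
    x = torch.randn(4, 8, device="cuda")
    with torch.cuda.amp.autocast():
        loss = net(x).pow(2).mean()
    print(fp16_step(net, opt, scaler, loss, grad_clip_thresh=1.0))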
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) model = load_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize( model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) if hparams.use_vae: criterion = Tacotron2Loss_VAE(hparams) else: criterion = Tacotron2Loss() logger = prepare_directories_and_logger( output_directory, log_directory, rank, hparams.use_vae) train_loader, valset, collate_fn = prepare_dataloaders(hparams) valset_csv = os.path.join(output_directory, log_directory, 'valset.csv') # list2csv(flatten_list(valset.audiopaths_and_text), valset_csv, delimiter='|') list2csv(valset.audiopaths_and_text, valset_csv, delimiter='|') # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model( checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration, epoch, step = \ load_checkpoint(checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate if epoch == 0: iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) else: epoch_offset = epoch print('epoch offset: {}'.format(epoch_offset)) train_loader = prepare_dataloaders(hparams, epoch_offset, valset, collate_fn['train'])[0] print('completing loading model ...') model.train() is_overflow = False # ================ MAIN TRAINNIG LOOP! 
=================== track_csv = os.path.join(output_directory, log_directory, 'track.csv') track_header = ['padding-rate-txt', 'max-len-txt', 'top-len-txt', 'padding-rate-mel', 'max-len-mel', 'top-len-mel', 'batch-size', 'batch-length', 'batch-area', 'mem-use', 'mem-all', 'mem-cached', 'duration', 'iteration', 'epoch', 'step'] if os.path.isfile(track_csv) and checkpoint_path is not None: print('loading existing {} ...'.format(track_csv)) track = csv2dict(track_csv, header=track_header) else: track = {k:[] for k in track_header} print('start training in epoch {} ~ {} ...'.format(epoch_offset, hparams.epochs)) nbatches = len(train_loader) for epoch in range(epoch_offset, hparams.epochs): #if epoch >= 10: break print("Epoch: {}, #batches: {}".format(epoch, nbatches)) batch_sizes, batch_lengths = [0] * nbatches, [0] * nbatches for i, batch in enumerate(train_loader): start = time.perf_counter() for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) y_pred = model(x) if hparams.use_vae: loss, recon_loss, kl, kl_weight = criterion(y_pred, y, iteration) else: loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start batch_sizes[i], batch_lengths[i] = batch[0].size(0), batch[2].size(2) batch_capacity = batch_sizes[i] * batch_lengths[i] mem_all = torch.cuda.memory_allocated() / (1024**2) mem_cached = torch.cuda.memory_cached() / (1024**2) mem_use = mem_all + mem_cached print("{} ({}:{}/{}): ".format(iteration, epoch, i, nbatches), end='') print("Batch {} ({}X{}) ".format(batch_capacity, batch_sizes[i], batch_lengths[i]), end='') print("Mem {:.1f} ({:.1f}+{:.1f}) ".format(mem_use, mem_all, mem_cached), end='') print("Train loss {:.3f} Grad Norm {:.3f} {:.2f}s/it".format( reduced_loss, grad_norm, duration)) input_lengths, gate_padded = batch[1], batch[4] metadata = (duration, iteration, epoch, i) track_seq(track, input_lengths, gate_padded, metadata) padding_rate_txt = track['padding-rate-txt'][-1] max_len_txt = track['max-len-txt'][-1] padding_rate_mel = track['padding-rate-mel'][-1] max_len_mel = track['max-len-mel'][-1] if hparams.use_vae: logger.log_training( reduced_loss, grad_norm, learning_rate, duration, padding_rate_txt, max_len_txt, padding_rate_mel, max_len_mel, iteration, recon_loss, kl, kl_weight) else: logger.log_training( reduced_loss, grad_norm, learning_rate, duration, padding_rate_txt, max_len_txt, padding_rate_mel, max_len_mel, iteration) check_by_iter = (hparams.check_by == 'iter') and \ (iteration % hparams.iters_per_checkpoint == 0) check_by_epoch = (hparams.check_by == 'epoch') and i == 0 and \ (epoch % hparams.epochs_per_checkpoint == 0) if not is_overflow and (check_by_iter or check_by_epoch): dict2col(track, track_csv, verbose=True) val_loss, (mus, emotions) = validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn['val'], logger, hparams.distributed_run, rank, hparams.use_vae, pre_batching=False) if rank == 0: checkpoint_path = 
os.path.join(
                        output_directory,
                        "checkpoint_{}-{}-{}_{:.3f}".format(iteration, epoch, i, val_loss))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    epoch, i, checkpoint_path)
                    if hparams.use_vae:
                        image_scatter_path = os.path.join(
                            output_directory,
                            "checkpoint_{0}_scatter_val.png".format(iteration))
                        image_tsne_path = os.path.join(
                            output_directory,
                            "checkpoint_{0}_tsne_val.png".format(iteration))
                        imageio.imwrite(image_scatter_path, plot_scatter(mus, emotions))
                        imageio.imwrite(image_tsne_path, plot_tsne(mus, emotions))
            iteration += 1
        if hparams.prep_trainset_per_epoch:
            train_loader = prepare_dataloaders(hparams, epoch + 1, valset,
                                               collate_fn['train'])[0]
            nbatches = len(train_loader)
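# ---------------------------------------------------------------------------
# Hedged sketch: the "padding-rate-txt" / "padding-rate-mel" numbers tracked
# above measure how much of each padded batch is wasted on padding.  track_seq
# itself is not shown in this file, so the helper below is a guess at the
# quantity being logged: 1 - (sum of true lengths) / (batch_size * max length).
# ---------------------------------------------------------------------------
import torch


def padding_rate(lengths):
    """Fraction of a length-padded batch occupied by padding.

    `lengths` is a 1-D tensor of per-item true lengths (e.g. input_lengths
    for text tokens, output_lengths for mel frames)."""
    lengths = lengths.float()
    max_len = lengths.max()
    rate = 1.0 - lengths.sum() / (lengths.numel() * max_len)
    return float(rate), int(max_len)


if __name__ == "__main__":
    input_lengths = torch.tensor([120, 90, 60, 30])
    rate, max_len = padding_rate(input_lengths)
    print(f"padding-rate-txt={rate:.3f} max-len-txt={max_len}")  # 0.375, 120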