def after_run(self, run_context, run_values):
    """Hook callback invoked after each session step.

    Unpacks the fetched tensors from ``run_values.results``, logs the
    per-iteration metrics (throughput, losses, learning rate) through the
    module-level LOGGER, and — on the last step of an epoch — re-logs the
    same loss/lr values under ``final_*`` keys inside an epoch scope.
    Increments ``self.current_step`` every call and ``self.current_epoch``
    at epoch boundaries.
    """
    # global_step is fetched but intentionally unused here.
    global_step, cross_entropy, l2_loss, total_loss, learning_rate = run_values.results

    # Wall-clock time since before_run stamped self.t0 -> images/sec.
    elapsed = time.time() - self.t0
    images_per_second = self.global_batch_size / elapsed

    # Single source of truth for the per-iteration metric values; the
    # epoch-final block below reuses the loss/lr entries (index 2 onward).
    iteration_metrics = (
        ("iteration", int(self.current_step)),
        ("imgs_per_sec", float(images_per_second)),
        ("cross_entropy", float(cross_entropy)),
        ("l2_loss", float(l2_loss)),
        ("total_loss", float(total_loss)),
        ("learning_rate", float(learning_rate)),
    )
    for key, value in iteration_metrics:
        LOGGER.log(key, value)
    LOGGER.iteration_stop()

    self.current_step += 1

    if self._last_step_of_epoch():
        LOGGER.epoch_start()
        LOGGER.log("epoch", int(self.current_epoch))
        # Re-emit cross_entropy/l2_loss/total_loss/learning_rate with a
        # "final_" prefix, in the same order as above.
        for key, value in iteration_metrics[2:]:
            LOGGER.log("final_" + key, value)
        LOGGER.epoch_stop()
        self.current_epoch += 1
def main():
    """Train a Tacotron2 or WaveGlow model (fp16 via FP16_Optimizer).

    Parses CLI arguments, configures the dllg LOGGER backends and metrics,
    builds the model (optionally resuming from a checkpoint), then runs the
    epoch/iteration training loop with per-iteration and per-epoch metric
    logging, validation, and periodic checkpoint + audio-sample saving.
    """
    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    # First pass with parse_known_args: only the generic args are needed to
    # configure logging before the model-specific args are known.
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    # JSON log file only on rank 0 to avoid multi-process write collisions.
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file if args.rank == 0 else None,
                         logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
    ])

    LOGGER.timed_block_start("run")
    LOGGER.register_metric(tags.TRAIN_ITERATION_LOSS,
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("iter_time", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("epoch_time", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("run_time", metric_scope=dllg.RUN_SCOPE)
    LOGGER.register_metric("val_iter_loss", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_items/sec", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_avg_loss", metric_scope=dllg.EPOCH_SCOPE)

    log_hardware()

    model_name = args.model_name
    # Second pass: add model-specific arguments and re-parse everything.
    parser = models.parse_model_args(model_name, parser)
    # NOTE(review): this first parse_args() call's result is discarded;
    # the call is redundant with the assignment on the next line.
    parser.parse_args()
    args = parser.parse_args()
    log_args(args)

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    distributed_run = args.world_size > 1
    if distributed_run:
        init_distributed(args, args.world_size, args.rank, args.group_name)

    LOGGER.log(key=tags.RUN_START)
    run_start_time = time.time()

    model_config = models.get_model_config(model_name, args)
    model = models.get_model(model_name, model_config,
                             to_fp16=args.fp16_run, to_cuda=True)

    epoch_start = 0
    if args.resume:
        # Resume path depends on which of the two models is being trained.
        resume_model_path = args.resume_tacotron2_path if args.model_name == "Tacotron2" else args.resume_waveglow_path
        checkpoint = torch.load(resume_model_path, map_location='cpu')
        epoch_start = checkpoint["epoch"]
        state_dict = checkpoint['state_dict']
        # Strip the DistributedDataParallel "module." prefix if present.
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)
        model.load_state_dict(state_dict)
        print("restore model %s" % resume_model_path)

    if distributed_run:
        model = DDP(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    if args.fp16_run:
        # Wrap the optimizer for mixed-precision with dynamic loss scaling.
        optimizer = FP16_Optimizer(
            optimizer, dynamic_loss_scale=args.dynamic_loss_scaling)

    # sigma is only defined for the WaveGlow arg set; absent for Tacotron2.
    try:
        sigma = args.sigma
    except AttributeError:
        sigma = None
    criterion = loss_functions.get_loss_function(model_name, sigma)

    # n_frames_per_step is only defined for the Tacotron2 arg set.
    try:
        n_frames_per_step = args.n_frames_per_step
    except AttributeError:
        n_frames_per_step = None
    collate_fn = data_functions.get_collate_function(model_name, n_frames_per_step)

    trainset = data_functions.get_data_loader(model_name, args.dataset_path,
                                              args.training_files, args)
    train_sampler = DistributedSampler(trainset) if distributed_run else None
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler, batch_size=args.batch_size,
                              pin_memory=False, drop_last=True,
                              collate_fn=collate_fn)

    valset = data_functions.get_data_loader(model_name, args.dataset_path,
                                            args.validation_files, args)
    batch_to_gpu = data_functions.get_batch_to_gpu(model_name)

    iteration = 0
    model.train()

    LOGGER.log(key=tags.TRAIN_LOOP)
    for epoch in range(epoch_start, args.epochs):
        LOGGER.epoch_start()
        epoch_start_time = time.time()
        LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)

        # used to calculate avg items/sec over epoch
        reduced_num_items_epoch = 0

        # used to calculate avg loss over epoch
        train_epoch_avg_loss = 0.0
        num_iters = 0

        # if overflow at the last iteration then do not save checkpoint
        overflow = False

        for i, batch in enumerate(train_loader):
            LOGGER.iteration_start()
            iter_start_time = time.time()
            LOGGER.log(key=tags.TRAIN_ITER_START, value=i)
            print("Batch: {}/{} epoch {}".format(i, len(train_loader), epoch))

            # NOTE(review): `start` is assigned but never read.
            start = time.perf_counter()
            adjust_learning_rate(epoch, optimizer, args.learning_rate,
                                 args.anneal_steps, args.anneal_factor)

            model.zero_grad()
            x, y, num_items = batch_to_gpu(batch)

            if args.fp16_run:
                # Forward in fp16, compute the loss in fp32.
                y_pred = model(fp32_to_fp16(x))
                loss = criterion(fp16_to_fp32(y_pred), y)
            else:
                y_pred = model(x)
                loss = criterion(y_pred, y)

            if distributed_run:
                # Average loss / sum item count across workers for logging.
                reduced_loss = reduce_tensor(loss.data, args.world_size).item()
                reduced_num_items = reduce_tensor(num_items.data, 1).item()
            else:
                reduced_loss = loss.item()
                reduced_num_items = num_items.item()
            if np.isnan(reduced_loss):
                raise Exception("loss is NaN")

            LOGGER.log(key=tags.TRAIN_ITERATION_LOSS, value=reduced_loss)

            train_epoch_avg_loss += reduced_loss
            num_iters += 1

            # accumulate number of items processed in this epoch
            reduced_num_items_epoch += reduced_num_items

            if args.fp16_run:
                # FP16_Optimizer owns the backward + master-grad clipping.
                optimizer.backward(loss)
                grad_norm = optimizer.clip_master_grads(args.grad_clip_thresh)
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.grad_clip_thresh)

            optimizer.step()

            # Loss-scale overflow means this step's update was skipped.
            overflow = optimizer.overflow if args.fp16_run else False
            iteration += 1

            LOGGER.log(key=tags.TRAIN_ITER_STOP, value=i)
            iter_stop_time = time.time()
            iter_time = iter_stop_time - iter_start_time
            LOGGER.log(key="train_iter_items/sec",
                       value=(reduced_num_items / iter_time))
            LOGGER.log(key="iter_time", value=iter_time)
            LOGGER.iteration_stop()

        LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
        epoch_stop_time = time.time()
        epoch_time = epoch_stop_time - epoch_start_time

        LOGGER.log(key="train_epoch_items/sec",
                   value=(reduced_num_items_epoch / epoch_time))
        LOGGER.log(key="train_epoch_avg_loss",
                   value=(train_epoch_avg_loss / num_iters if num_iters > 0 else 0.0))
        LOGGER.log(key="epoch_time", value=epoch_time)

        LOGGER.log(key=tags.EVAL_START, value=epoch)
        validate(model, criterion, valset, iteration, args.batch_size,
                 args.world_size, collate_fn, distributed_run, args.rank,
                 batch_to_gpu, args.fp16_run)
        LOGGER.log(key=tags.EVAL_STOP, value=epoch)

        # Skip checkpointing after an fp16 overflow step; only rank 0 writes.
        if not overflow and (epoch % args.epochs_per_checkpoint == 0) and args.rank == 0:
            checkpoint_path = os.path.join(
                args.output_directory,
                "checkpoint_{}_{}".format(model_name, epoch))
            save_checkpoint(model, epoch, model_config, checkpoint_path)
            save_sample(
                model_name, model, args.waveglow_checkpoint,
                args.tacotron2_checkpoint, args.phrase_path,
                os.path.join(args.output_directory,
                             "sample_{}_{}.wav".format(model_name, iteration)),
                args.sampling_rate, args.fp16_run)

        LOGGER.epoch_stop()

    run_stop_time = time.time()
    run_time = run_stop_time - run_start_time
    LOGGER.log(key="run_time", value=run_time)
    LOGGER.log(key=tags.RUN_FINAL)

    print("training time", run_stop_time - run_start_time)
    LOGGER.timed_block_stop("run")

    if args.rank == 0:
        LOGGER.finish()
def main():
    """Train Tacotron2 (apex amp variant, multi-anchor-dir datasets).

    Parses CLI arguments, configures dllg logging, restores the latest
    checkpoint via the model's own restore_checkpoint, then runs the
    epoch/iteration loop with frames/sec metric logging, per-epoch latest
    checkpointing, periodic alignment plotting, and numbered checkpoints.
    """
    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    # First pass with parse_known_args: only the generic args are needed to
    # configure logging before the Tacotron2-specific args are added.
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    # JSON log file only on rank 0 to avoid multi-process write collisions.
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
        dllg.JsonBackend(log_file=os.path.join(
            args.output_directory, args.log_file) if args.rank == 0 else None,
            logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
    ])

    LOGGER.timed_block_start("run")
    LOGGER.register_metric(tags.TRAIN_ITERATION_LOSS,
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("iter_time", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("epoch_time", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("run_time", metric_scope=dllg.RUN_SCOPE)
    LOGGER.register_metric("val_iter_loss", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_frames/sec", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_avg_frames/sec", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_avg_loss", metric_scope=dllg.EPOCH_SCOPE)

    log_hardware()

    # Second pass: add Tacotron2-specific arguments and re-parse everything.
    parser = parse_tacotron2_args(parser)
    args = parser.parse_args()
    log_args(args)

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    distributed_run = args.world_size > 1
    if distributed_run:
        init_distributed(args, args.world_size, args.rank, args.group_name)

    os.makedirs(args.output_directory, exist_ok=True)

    LOGGER.log(key=tags.RUN_START)
    run_start_time = time.time()

    model = get_tacotron2_model(args, len(args.training_anchor_dirs),
                                is_training=True)

    # Non-amp distributed path wraps in DDP before checkpoint restore.
    if not args.amp_run and distributed_run:
        model = DDP(model)

    model.restore_checkpoint(
        os.path.join(args.output_directory, args.latest_checkpoint_file))

    optimizer = torch.optim.Adam(model.parameters(), lr=args.init_lr,
                                 weight_decay=args.weight_decay)

    if args.amp_run:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        # NOTE(review): placement reconstructed from mangled source — DDP
        # wrap here assumed nested under the amp branch; confirm original.
        if distributed_run:
            model = DDP(model)

    criterion = Tacotron2Loss()
    collate_fn = TextMelCollate(args)
    train_dataset = TextMelDataset(args, args.training_anchor_dirs)
    # Per-loader batch size is the global batch divided across anchor dirs.
    train_loader = DataLoader(train_dataset, num_workers=2, shuffle=False,
                              batch_size=args.batch_size // len(args.training_anchor_dirs),
                              pin_memory=False, drop_last=True,
                              collate_fn=collate_fn)
    # valate_dataset = TextMelDataset(args, args.validation_anchor_dirs)

    model.train()

    # Resume bookkeeping: epochs already trained are read from the model.
    elapsed_epochs = model.get_elapsed_epochs()
    epochs = args.epochs - elapsed_epochs
    iteration = elapsed_epochs * len(train_loader)

    LOGGER.log(key=tags.TRAIN_LOOP)
    for epoch in range(1, epochs + 1):
        LOGGER.epoch_start()
        epoch_start_time = time.time()
        # Shift to the absolute epoch number for logging/checkpoint names.
        epoch += elapsed_epochs
        LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)

        # used to calculate avg frames/sec over epoch
        reduced_num_frames_epoch = 0

        # used to calculate avg loss over epoch
        train_epoch_avg_loss = 0.0
        train_epoch_avg_frames_per_sec = 0.0
        num_iters = 0

        adjust_learning_rate(optimizer, epoch, args)

        for i, batch in enumerate(train_loader):
            print(f"Batch: {i}/{len(train_loader)} epoch {epoch}")
            LOGGER.iteration_start()
            iter_start_time = time.time()
            LOGGER.log(key=tags.TRAIN_ITER_START, value=i)
            # start = time.perf_counter()

            optimizer.zero_grad()
            x, y, num_frames = batch_to_gpu(batch)

            y_pred = model(x)
            loss = criterion(y_pred, y)

            if distributed_run:
                # Average loss / sum frame count across workers for logging.
                reduced_loss = reduce_tensor(loss.data, args.world_size).item()
                reduced_num_frames = reduce_tensor(num_frames.data, 1).item()
            else:
                reduced_loss = loss.item()
                reduced_num_frames = num_frames.item()
            if np.isnan(reduced_loss):
                raise Exception("loss is NaN")

            LOGGER.log(key=tags.TRAIN_ITERATION_LOSS, value=reduced_loss)

            train_epoch_avg_loss += reduced_loss
            num_iters += 1

            # accumulate number of frames processed in this epoch
            reduced_num_frames_epoch += reduced_num_frames

            if args.amp_run:
                # amp scales the loss; clip the fp32 master grads.
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), args.grad_clip_thresh)
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.grad_clip_thresh)

            optimizer.step()

            iteration += 1
            LOGGER.log(key=tags.TRAIN_ITER_STOP, value=i)
            iter_stop_time = time.time()
            iter_time = iter_stop_time - iter_start_time
            frames_per_sec = reduced_num_frames / iter_time
            train_epoch_avg_frames_per_sec += frames_per_sec
            LOGGER.log(key="train_iter_frames/sec", value=frames_per_sec)
            LOGGER.log(key="iter_time", value=iter_time)
            LOGGER.iteration_stop()

        LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
        epoch_stop_time = time.time()
        epoch_time = epoch_stop_time - epoch_start_time

        LOGGER.log(key="train_epoch_frames/sec",
                   value=(reduced_num_frames_epoch / epoch_time))
        LOGGER.log(key="train_epoch_avg_frames/sec",
                   value=(train_epoch_avg_frames_per_sec / num_iters if num_iters > 0 else 0.0))
        LOGGER.log(key="train_epoch_avg_loss",
                   value=(train_epoch_avg_loss / num_iters if num_iters > 0 else 0.0))
        LOGGER.log(key="epoch_time", value=epoch_time)

        LOGGER.log(key=tags.EVAL_START, value=epoch)
        # validate(model, criterion, valate_dataset, iteration, collate_fn, distributed_run, args)
        LOGGER.log(key=tags.EVAL_STOP, value=epoch)

        # Store latest checkpoint in each epoch
        model.elapse_epoch()
        checkpoint_path = os.path.join(args.output_directory,
                                       args.latest_checkpoint_file)
        torch.save(model.state_dict(), checkpoint_path)

        # Plot alignment
        if epoch % args.epochs_per_alignment == 0 and args.rank == 0:
            # NOTE(review): .data.numpy() assumes the alignment tensor is on
            # CPU (no .cpu() call here) — TODO confirm.
            alignments = y_pred[3].data.numpy()
            index = np.random.randint(len(alignments))
            plot_alignment(
                alignments[index].transpose(0, 1),  # [enc_step, dec_step]
                os.path.join(args.output_directory,
                             f"align_{epoch:04d}_{iteration}.png"),
                info=f"{datetime.now().strftime('%Y-%m-%d %H:%M')} Epoch={epoch:04d} Iteration={iteration} Average loss={train_epoch_avg_loss/num_iters:.5f}")

        # Save checkpoint
        if epoch % args.epochs_per_checkpoint == 0 and args.rank == 0:
            checkpoint_path = os.path.join(args.output_directory,
                                           f"checkpoint_{epoch:04d}.pt")
            print(f"Saving model and optimizer state at epoch {epoch:04d} to {checkpoint_path}")
            torch.save(model.state_dict(), checkpoint_path)

            # Save evaluation
            # save_sample(model, args.tacotron2_checkpoint, args.phrase_path,
            #             os.path.join(args.output_directory, f"sample_{epoch:04d}_{iteration}.wav"), args.sampling_rate)

        LOGGER.epoch_stop()

    run_stop_time = time.time()
    run_time = run_stop_time - run_start_time
    LOGGER.log(key="run_time", value=run_time)
    LOGGER.log(key=tags.RUN_FINAL)

    print("training time", run_stop_time - run_start_time)
    LOGGER.timed_block_stop("run")

    if args.rank == 0:
        LOGGER.finish()
def main():
    """Train a model with restore-from-checkpoint / warm-start support.

    Parses CLI arguments, configures dllg logging, optionally restores model
    (and optimizer) state from ``args.restore_from`` — with an optional warm
    start that drops the embedding layer — then runs the epoch/iteration
    training loop with items/sec metric logging, validation, and periodic
    checkpoint + audio-sample saving.
    """
    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    # First pass with parse_known_args: only the generic args are needed to
    # configure logging before the model-specific args are known.
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    # JSON log file only on rank 0 to avoid multi-process write collisions.
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file if args.rank == 0 else None,
                         logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
    ])

    LOGGER.timed_block_start("run")
    LOGGER.register_metric(tags.TRAIN_ITERATION_LOSS,
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("iter_time", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("epoch_time", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("run_time", metric_scope=dllg.RUN_SCOPE)
    LOGGER.register_metric("val_iter_loss", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_items/sec", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_avg_items/sec", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_avg_loss", metric_scope=dllg.EPOCH_SCOPE)

    log_hardware()

    # Restore training from checkpoint logic
    checkpoint = None
    start_epoch = 0

    model_name = args.model_name
    # Second pass: add model-specific arguments and re-parse everything.
    parser = models.parse_model_args(model_name, parser)
    # NOTE(review): this first parse_args() call's result is discarded;
    # the call is redundant with the assignment on the next line.
    parser.parse_args()
    args = parser.parse_args()
    log_args(args)

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    # NOTE(review): distributed_run is derived from the local GPU count here,
    # while the loss reduction below uses args.world_size — confirm these
    # agree in multi-node setups.
    num_gpus = torch.cuda.device_count()
    print("gpus", num_gpus)
    distributed_run = num_gpus > 1
    if distributed_run:
        init_distributed(args, args.world_size, args.rank, args.group_name)

    LOGGER.log(key=tags.RUN_START)
    run_start_time = time.time()

    # Restore training from checkpoint logic
    if args.restore_from:
        print('Restoring from {} checkpoint'.format(args.restore_from))
        checkpoint = torch.load(args.restore_from, map_location='cpu')
        start_epoch = checkpoint['epoch'] + 1
        # Rebuild the model from the configuration saved in the checkpoint.
        model_config = checkpoint['config']
        model = models.get_model(model_name, model_config, to_cuda=True)

        # Strip the DistributedDataParallel "module." prefix if present.
        new_state_dict = {}
        for key, value in checkpoint['state_dict'].items():
            new_key = key.replace('module.', '')
            new_state_dict[new_key] = value
        model_dict = new_state_dict

        if args.warm_start:
            # Warm start: keep pretrained weights except the listed layers,
            # which fall back to the freshly initialized model's values.
            ignore_layers = ['embedding.weight']
            print('Warm start')
            if len(ignore_layers) > 0:
                model_dict = {k: v for k, v in model_dict.items()
                              if k not in ignore_layers}
                dummy_dict = model.state_dict()
                dummy_dict.update(model_dict)
                model_dict = dummy_dict

        model.load_state_dict(model_dict)
    else:
        model_config = models.get_model_config(model_name, args)
        model = models.get_model(model_name, model_config, to_cuda=True)

    print("model configured")
    #model.cuda(4)
    model.cuda()

    # if not args.amp_run and distributed_run:
    #     model = DDP(model, delay_allreduce=True)
    #
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    # Restore training from checkpoint logic
    if checkpoint and 'optimizer_state_dict' in checkpoint and not args.warm_start:
        # TODO: think about this more
        print('Restoring optimizer state')
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    if args.amp_run:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        print("amp initialized")
        # NOTE(review): placement reconstructed from mangled source — the
        # apex DDP wrap is assumed to sit inside the amp branch (the non-amp
        # DDP branch above is commented out); confirm against the original.
        model = DDP(model, delay_allreduce=True)
        print("ddpmodel")

    # sigma is only defined for the WaveGlow arg set; absent for Tacotron2.
    try:
        sigma = args.sigma
    except AttributeError:
        sigma = None
    print("train starting")
    criterion = loss_functions.get_loss_function(model_name, sigma)

    # n_frames_per_step is only defined for the Tacotron2 arg set.
    try:
        n_frames_per_step = args.n_frames_per_step
    except AttributeError:
        n_frames_per_step = None
    print("data loading start")
    collate_fn = data_functions.get_collate_function(model_name, n_frames_per_step)

    trainset = data_functions.get_data_loader(model_name,
                                              args.training_files, args)
    train_sampler = DistributedSampler(trainset) if distributed_run else None
    print("train loader started")
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler, batch_size=args.batch_size,
                              pin_memory=False, drop_last=True,
                              collate_fn=collate_fn)

    valset = data_functions.get_data_loader(model_name,
                                            args.validation_files, args)
    batch_to_gpu = data_functions.get_batch_to_gpu(model_name)

    iteration = 0
    model.train()

    LOGGER.log(key=tags.TRAIN_LOOP)

    # Restore training from checkpoint logic
    if start_epoch >= args.epochs:
        print('Checkpoint epoch {} >= total epochs {}'.format(
            start_epoch, args.epochs))
    else:
        for epoch in range(start_epoch, args.epochs):
            LOGGER.epoch_start()
            epoch_start_time = time.time()
            LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)

            # used to calculate avg items/sec over epoch
            reduced_num_items_epoch = 0

            # used to calculate avg loss over epoch
            train_epoch_avg_loss = 0.0
            train_epoch_avg_items_per_sec = 0.0
            num_iters = 0

            # if overflow at the last iteration then do not save checkpoint
            # NOTE(review): overflow is never updated in the amp path and the
            # checkpoint condition below does not consult it.
            overflow = False

            for i, batch in enumerate(train_loader):
                print("Batch: {}/{} epoch {}".format(i, len(train_loader), epoch))
                LOGGER.iteration_start()
                iter_start_time = time.time()
                LOGGER.log(key=tags.TRAIN_ITER_START, value=i)

                # NOTE(review): `start` is assigned but never read.
                start = time.perf_counter()
                adjust_learning_rate(epoch, optimizer, args.learning_rate,
                                     args.anneal_steps, args.anneal_factor)

                model.zero_grad()
                x, y, num_items = batch_to_gpu(batch)

                y_pred = model(x)
                loss = criterion(y_pred, y)

                if distributed_run:
                    # Average loss / sum item count across workers.
                    reduced_loss = reduce_tensor(loss.data, args.world_size).item()
                    reduced_num_items = reduce_tensor(num_items.data, 1).item()
                else:
                    reduced_loss = loss.item()
                    reduced_num_items = num_items.item()
                if np.isnan(reduced_loss):
                    raise Exception("loss is NaN")

                LOGGER.log(key=tags.TRAIN_ITERATION_LOSS, value=reduced_loss)

                train_epoch_avg_loss += reduced_loss
                num_iters += 1

                # accumulate number of items processed in this epoch
                reduced_num_items_epoch += reduced_num_items

                if args.amp_run:
                    # amp scales the loss; clip the fp32 master grads.
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    grad_norm = torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.grad_clip_thresh)
                else:
                    loss.backward()
                    grad_norm = torch.nn.utils.clip_grad_norm_(
                        model.parameters(), args.grad_clip_thresh)

                optimizer.step()

                iteration += 1
                LOGGER.log(key=tags.TRAIN_ITER_STOP, value=i)
                iter_stop_time = time.time()
                iter_time = iter_stop_time - iter_start_time
                items_per_sec = reduced_num_items / iter_time
                train_epoch_avg_items_per_sec += items_per_sec
                LOGGER.log(key="train_iter_items/sec", value=items_per_sec)
                LOGGER.log(key="iter_time", value=iter_time)
                LOGGER.iteration_stop()

            LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
            epoch_stop_time = time.time()
            epoch_time = epoch_stop_time - epoch_start_time

            LOGGER.log(key="train_epoch_items/sec",
                       value=(reduced_num_items_epoch / epoch_time))
            LOGGER.log(key="train_epoch_avg_items/sec",
                       value=(train_epoch_avg_items_per_sec / num_iters if num_iters > 0 else 0.0))
            LOGGER.log(key="train_epoch_avg_loss",
                       value=(train_epoch_avg_loss / num_iters if num_iters > 0 else 0.0))
            LOGGER.log(key="epoch_time", value=epoch_time)

            LOGGER.log(key=tags.EVAL_START, value=epoch)
            validate(model, criterion, valset, iteration, args.batch_size,
                     args.world_size, collate_fn, distributed_run, args.rank,
                     batch_to_gpu)
            LOGGER.log(key=tags.EVAL_STOP, value=epoch)

            # Only rank 0 writes checkpoints and samples.
            if (epoch % args.epochs_per_checkpoint == 0) and args.rank == 0:
                checkpoint_path = os.path.join(
                    args.output_directory,
                    "checkpoint_{}_{}".format(model_name, epoch))
                save_checkpoint(model, epoch, model_config, optimizer,
                                checkpoint_path)
                save_sample(
                    model_name, model, args.waveglow_checkpoint,
                    args.tacotron2_checkpoint, args.phrase_path,
                    os.path.join(
                        args.output_directory,
                        "sample_{}_{}.wav".format(model_name, iteration)),
                    args.sampling_rate)

            LOGGER.epoch_stop()

    run_stop_time = time.time()
    run_time = run_stop_time - run_start_time
    LOGGER.log(key="run_time", value=run_time)
    LOGGER.log(key=tags.RUN_FINAL)

    print("training time", run_stop_time - run_start_time)
    LOGGER.timed_block_stop("run")

    if args.rank == 0:
        LOGGER.finish()