def train_and_eval(rank, n_gpus, hps):
    global global_step
    if rank == 0:
        logger = utils.get_logger(hps.model_dir)
        logger.info(hps)
        utils.check_git_hash(hps.model_dir)
        writer = SummaryWriter(log_dir=hps.model_dir)
        writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))

    dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
    torch.manual_seed(hps.train.seed)
    torch.cuda.set_device(rank)

    train_dataset = TextMelLoader(hps.data.training_files, hps.data)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=n_gpus, rank=rank, shuffle=True)
    collate_fn = TextMelCollate(1)
    train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False,
                              batch_size=hps.train.batch_size, pin_memory=True,
                              drop_last=True, collate_fn=collate_fn, sampler=train_sampler)
    if rank == 0:
        val_dataset = TextMelLoader(hps.data.validation_files, hps.data)
        val_loader = DataLoader(val_dataset, num_workers=8, shuffle=False,
                                batch_size=hps.train.batch_size, pin_memory=True,
                                drop_last=True, collate_fn=collate_fn)

    generator = models.FlowGenerator(
        n_vocab=len(symbols),
        out_channels=hps.data.n_mel_channels,
        **hps.model).cuda(rank)
    optimizer_g = commons.Adam(generator.parameters(), scheduler=hps.train.scheduler,
                               dim_model=hps.model.hidden_channels, warmup_steps=hps.train.warmup_steps,
                               lr=hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps)
    if hps.train.fp16_run:
        generator, optimizer_g._optim = amp.initialize(generator, optimizer_g._optim, opt_level="O1")
    generator = DDP(generator)

    epoch_str = 1
    global_step = 0
    try:
        # Resume from the latest generator checkpoint if one exists.
        _, _, _, epoch_str = utils.load_checkpoint(
            utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), generator, optimizer_g)
        epoch_str += 1
        optimizer_g.step_num = (epoch_str - 1) * len(train_loader)
        optimizer_g._update_learning_rate()
        global_step = (epoch_str - 1) * len(train_loader)
    except:
        # No checkpoint found: optionally warm-start from data-dependent initialization.
        if hps.train.ddi and os.path.isfile(os.path.join(hps.model_dir, "ddi_G.pth")):
            _ = utils.load_checkpoint(os.path.join(hps.model_dir, "ddi_G.pth"), generator, optimizer_g)

    for epoch in range(epoch_str, hps.train.epochs + 1):
        if rank == 0:
            train(rank, epoch, hps, generator, optimizer_g, train_loader, logger, writer)
            evaluate(rank, epoch, hps, generator, optimizer_g, val_loader, logger, writer_eval)
            if epoch % 50 == 0:
                utils.save_checkpoint(generator, optimizer_g, hps.train.learning_rate, epoch,
                                      os.path.join(hps.model_dir, "G_{}.pth".format(epoch)))
        else:
            train(rank, epoch, hps, generator, optimizer_g, train_loader, None, None)
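
# --- Usage sketch (not from the original script) ----------------------------
# train_and_eval() uses init_method='env://', so every spawned process expects
# MASTER_ADDR and MASTER_PORT in the environment. A single-node launcher would
# look roughly like the following; the entry-point name `main`, the port value,
# and utils.get_hparams() as the hparams loader are illustrative assumptions.
def main():
    import torch.multiprocessing as mp

    assert torch.cuda.is_available(), "Distributed NCCL training requires CUDA GPUs."
    n_gpus = torch.cuda.device_count()
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8000'

    hps = utils.get_hparams()
    # spawn() calls train_and_eval(rank, n_gpus, hps) once per GPU.
    mp.spawn(train_and_eval, nprocs=n_gpus, args=(n_gpus, hps))
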
def init(checkpoint_path, config_path, device="cuda"):
    hps = glow_utils.get_hparams_from_json(checkpoint_path, config_path)
    model = models.FlowGenerator(
        len(symbols),
        out_channels=hps.data.n_mel_channels,
        **hps.model).to(device)
    if os.path.isdir(checkpoint_path):
        checkpoint_path = glow_utils.latest_checkpoint_path(checkpoint_path)
    glow_utils.load_checkpoint(checkpoint_path, model)
    model.decoder.store_inverse()  # do not calculate Jacobians, for fast decoding
    _ = model.eval()
    cmu_dict = cmudict.CMUDict(hps.data.cmudict_path)
    return cmu_dict, model
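
# --- Usage sketch (not from the original script) ----------------------------
# Rough example of running inference with the objects returned by init().
# The helper name `synthesize`, the text front-end (text_to_sequence with a CMU
# dictionary), and the forward keywords (gen=True, noise_scale, length_scale)
# follow the public Glow-TTS inference example and are assumptions here.
def synthesize(model, cmu_dict, text, device="cuda"):
    sequence = text_to_sequence(text.strip(), ['english_cleaners'], cmu_dict)
    x = torch.LongTensor(sequence).unsqueeze(0).to(device)
    x_lengths = torch.LongTensor([x.size(1)]).to(device)
    with torch.no_grad():
        # First element of the first output group is the generated spectrogram.
        (mel, *_), *_ = model(x, x_lengths, gen=True, noise_scale=0.667, length_scale=1.0)
    return mel  # predicted mel-spectrogram; a separate vocoder turns this into audio
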
def run(rank, n_gpus, hps):
    global global_step
    if rank == 0:
        logger = utils.get_logger(hps.model_dir)
        logger.info(hps)
        utils.check_git_hash(hps.model_dir)
        writer = SummaryWriter(log_dir=hps.model_dir)
        writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))

    dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
    torch.manual_seed(hps.train.seed)
    torch.cuda.set_device(rank)

    train_dataset = AudioSpecLoader(hps.data.training_files, hps.data)
    train_sampler = DistributedBucketSampler(
        train_dataset,
        hps.train.batch_size,
        [32, 300, 400, 500, 600, 700, 800, 900, 1000],
        num_replicas=n_gpus,
        rank=rank,
        shuffle=True)
    collate_fn = AudioSpecCollate()
    train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True,
                              collate_fn=collate_fn, batch_sampler=train_sampler)
    if rank == 0:
        eval_dataset = AudioSpecLoader(hps.data.validation_files, hps.data)
        eval_loader = DataLoader(eval_dataset, num_workers=8, shuffle=False,
                                 batch_size=hps.train.batch_size, pin_memory=True,
                                 drop_last=False, collate_fn=collate_fn)

    net_g = SynthesizerTrn(
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).cuda(rank)
    net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
    optim_g = torch.optim.AdamW(
        net_g.parameters(),
        hps.train.learning_rate,
        betas=hps.train.betas,
        eps=hps.train.eps)
    optim_d = torch.optim.AdamW(
        net_d.parameters(),
        hps.train.learning_rate,
        betas=hps.train.betas,
        eps=hps.train.eps)
    net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
    net_d = DDP(net_d, device_ids=[rank])

    try:
        # Resume both generator and discriminator from the latest checkpoints.
        _, _, _, epoch_str = utils.load_checkpoint(
            utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g)
        _, _, _, epoch_str = utils.load_checkpoint(
            utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d)
        global_step = (epoch_str - 1) * len(train_loader)
    except:
        epoch_str = 1
        global_step = 0

    # last_epoch=epoch_str-2 keeps the exponential decay aligned with the resumed epoch.
    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)

    scaler = GradScaler(enabled=hps.train.fp16_run)

    for epoch in range(epoch_str, hps.train.epochs + 1):
        if rank == 0:
            train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d],
                               [scheduler_g, scheduler_d], scaler,
                               [train_loader, eval_loader], logger, [writer, writer_eval])
        else:
            train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d],
                               [scheduler_g, scheduler_d], scaler,
                               [train_loader, None], None, None)
        scheduler_g.step()
        scheduler_d.step()
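
# --- Sketch of the mixed-precision step assumed inside train_and_evaluate() --
# run() only builds the GradScaler and hands it to train_and_evaluate(), whose
# body is not shown here. The minimal autocast/GradScaler pattern it would
# follow looks roughly like this; the helper name, the clip value, and the
# single-loss setup are illustrative assumptions (the real loop also updates
# the discriminator, logs, and checkpoints).
def amp_train_step(model, optim, scaler, x, y, loss_fn, fp16_run):
    with torch.cuda.amp.autocast(enabled=fp16_run):
        loss = loss_fn(model(x), y)              # forward pass in (possibly) fp16
    optim.zero_grad()
    scaler.scale(loss).backward()                # backward on the scaled loss
    scaler.unscale_(optim)                       # gradients back in fp32 before clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)  # illustrative clip value
    scaler.step(optim)                           # skips the step if grads overflowed
    scaler.update()
    return loss.detach()
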
def run(rank, n_gpus, hps):
    global global_step
    if rank == 0:
        writer = SummaryWriter(log_dir='./')
        writer_eval = SummaryWriter(log_dir='./eval')

    dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
    torch.manual_seed(hps.train.seed)
    torch.cuda.set_device(rank)

    train_dataset = ImageTextLoader(hps.data.training_file_path, hps.data)
    train_sampler = DistributedBucketSampler(train_dataset, hps.train.num_tokens,
                                             num_replicas=n_gpus, rank=rank, shuffle=True)
    collate_fn = ImageTextCollate()
    train_loader = DataLoader(train_dataset, num_workers=4, shuffle=False, pin_memory=True,
                              collate_fn=collate_fn, batch_sampler=train_sampler)
    if rank == 0:
        eval_dataset = ImageTextLoader(hps.data.validation_file_path, hps.data)
        eval_sampler = DistributedBucketSampler(eval_dataset, hps.train.num_tokens,
                                                num_replicas=1, rank=rank, shuffle=True)
        eval_loader = DataLoader(eval_dataset, num_workers=0, shuffle=False, pin_memory=False,
                                 collate_fn=collate_fn, batch_sampler=eval_sampler)

    model = TableRecognizer(len(train_dataset.vocab), 3 * (hps.data.patch_length**2), **hps.model).cuda(rank)
    optim = torch.optim.Adam(model.parameters(), hps.train.learning_rate,
                             betas=hps.train.betas, eps=hps.train.eps)
    model = DDP(model, device_ids=[rank])

    try:
        _, _, _, epoch_str = utils.load_checkpoint(
            utils.latest_checkpoint_path('./', "model_*.pth"), model, optim)
        global_step = (epoch_str - 1) * len(train_loader)
    except:
        epoch_str = 1
        global_step = 0

    scaler = GradScaler(enabled=hps.train.fp16_run)

    for epoch in range(epoch_str, hps.train.epochs + 1):
        if rank == 0:
            train_and_evaluate(rank, epoch, hps, model, optim, scaler,
                               [train_loader, eval_loader], [writer, writer_eval])
        else:
            train_and_evaluate(rank, epoch, hps, model, optim, scaler, [train_loader, None], None)