Example #1
def train(num_gpus,
          rank,
          group_name,
          output_directory,
          epochs,
          learning_rate,
          sigma,
          iters_per_checkpoint,
          batch_size,
          seed,
          fp16_run,
          checkpoint_path,
          with_tensorboard,
          num_workers=2):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    # HACK: setup separate training and eval sets
    training_files = data_config['training_files']
    eval_files = data_config['eval_files']
    del data_config['training_files']
    del data_config['eval_files']
    data_config['audio_files'] = training_files
    trainset = Mel2Samp(**data_config)
    data_config['audio_files'] = eval_files
    evalset = Mel2Samp(**data_config)

    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======

    print("Creating dataloaders with " + str(num_workers) + " workers")
    train_loader = DataLoader(trainset,
                              num_workers=num_workers,
                              # shuffle only when no DistributedSampler is used
                              shuffle=(train_sampler is None),
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    eval_loader = DataLoader(evalset,
                             num_workers=num_workers,
                             shuffle=(eval_sampler is None),
                             sampler=eval_sampler,
                             batch_size=batch_size,
                             pin_memory=False,
                             drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger_train = SummaryWriter(
            os.path.join(output_directory, 'logs', 'train'))
        logger_eval = SummaryWriter(
            os.path.join(output_directory, 'logs', 'eval'))

    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        model.train()
        with tqdm(total=len(train_loader)) as train_pbar:
            for i, batch in enumerate(train_loader):
                model.zero_grad()

                mel, audio = batch
                mel = torch.autograd.Variable(mel.cuda())
                audio = torch.autograd.Variable(audio.cuda())
                outputs = model((mel, audio))

                loss = criterion(outputs)
                if num_gpus > 1:
                    reduced_loss = reduce_tensor(loss.data, num_gpus).item()
                else:
                    reduced_loss = loss.item()

                if fp16_run:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                optimizer.step()

                train_pbar.set_description(
                    "Epoch {} Iter {} Loss {:.3f}".format(
                        epoch, iteration, reduced_loss))
                if with_tensorboard and rank == 0 and iteration % 10 == 0:
                    logger_train.add_scalar('loss', reduced_loss,
                                            i + len(train_loader) * epoch)
                    # adding logging for GPU utilization and memory usage
                    gpu_memory_used, gpu_utilization = get_gpu_stats()
                    k = 'gpu' + str(0)
                    logger_train.add_scalar(k + '/memory', gpu_memory_used,
                                            iteration)
                    logger_train.add_scalar(k + '/load', gpu_utilization,
                                            iteration)
                    logger_train.flush()

                if (iteration % iters_per_checkpoint == 0):
                    if rank == 0:
                        checkpoint_path = "{}/waveglow_{}".format(
                            output_directory, iteration)
                        save_checkpoint(model, optimizer, learning_rate,
                                        iteration, checkpoint_path)

                iteration += 1
                train_pbar.update(1)

        # Eval
        model.eval()
        torch.cuda.empty_cache()

        with torch.no_grad():
            tensorboard_mel, tensorboard_audio = None, None
            loss_accum = []
            with tqdm(total=len(eval_loader)) as eval_pbar:
                for i, batch in enumerate(eval_loader):
                    model.zero_grad()
                    mel, audio = batch
                    mel = torch.autograd.Variable(mel.cuda())
                    audio = torch.autograd.Variable(audio.cuda())
                    outputs = model((mel, audio))
                    loss = criterion(outputs).item()
                    loss_accum.append(loss)
                    eval_pbar.set_description("Epoch {} Eval {:.3f}".format(
                        epoch, loss))
                    outputs = None

                    # use the first batch for tensorboard audio samples
                    if i == 0:
                        tensorboard_mel = mel
                        tensorboard_audio = audio
                    eval_pbar.update(1)

            if with_tensorboard and rank == 0:
                loss_avg = statistics.mean(loss_accum)
                tqdm.write("Epoch {} Eval AVG {}".format(epoch, loss_avg))
                logger_eval.add_scalar('loss', loss_avg, iteration)

                # log audio samples to tensorboard; kept inside the guard so
                # logger_eval is only used when it was created above
                tensorboard_audio_generated = model.infer(tensorboard_mel)
                for i in range(0, 5):
                    ta = tensorboard_audio[i].cpu().numpy()
                    tag = tensorboard_audio_generated[i].cpu().numpy()
                    logger_eval.add_audio("sample " + str(i) + "/orig",
                                          ta,
                                          epoch,
                                          sample_rate=data_config['sampling_rate'])
                    logger_eval.add_audio("sample " + str(i) + "/gen",
                                          tag,
                                          epoch,
                                          sample_rate=data_config['sampling_rate'])
                logger_eval.flush()
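
Note: `reduce_tensor` is called in this example but not shown. A minimal sketch of what such a loss-averaging helper typically looks like in multi-GPU training, assuming the default `torch.distributed` process group has already been set up by `init_distributed`:

import torch.distributed as dist

def reduce_tensor(tensor, num_gpus):
    # Sum the tensor (e.g. the loss) across all ranks and average it so
    # every worker logs the same value.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= num_gpus
    return rt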
Example #2
            except KeyboardInterrupt:
                self.Save_Checkpoint()
                exit(1)

        self.tqdm.close()
        logging.info('Finished training.')


if __name__ == '__main__':
    argParser = argparse.ArgumentParser()
    argParser.add_argument('-hp',
                           '--hyper_parameters',
                           required=True,
                           type=str)
    argParser.add_argument('-s', '--steps', default=0, type=int)
    argParser.add_argument('-p', '--port', default=54321, type=int)
    argParser.add_argument('-r', '--local_rank', default=0, type=int)
    args = argParser.parse_args()

    hp = Recursive_Parse(
        yaml.load(open(args.hyper_parameters, encoding='utf-8'),
                  Loader=yaml.Loader))
    os.environ['CUDA_VISIBLE_DEVICES'] = hp.Device

    if hp.Use_Multi_GPU:
        init_distributed(rank=int(os.getenv('RANK', '0')),
                         num_gpus=int(os.getenv("WORLD_SIZE", '1')),
                         dist_backend='nccl',
                         dist_url='tcp://127.0.0.1:{}'.format(args.port))
    new_Trainer = Trainer(hp_path=args.hyper_parameters, steps=args.steps)
    new_Trainer.Train()
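
`init_distributed` is referenced here but not defined. A minimal sketch of such a setup routine under the same parameter names, assuming one process per GPU and a TCP rendezvous:

import torch
import torch.distributed as dist

def init_distributed(rank, num_gpus, dist_backend='nccl',
                     dist_url='tcp://127.0.0.1:54321'):
    # Pin this process to one GPU and join the default process group.
    assert torch.cuda.is_available(), "Distributed training requires CUDA."
    torch.cuda.set_device(rank % torch.cuda.device_count())
    dist.init_process_group(backend=dist_backend,
                            init_method=dist_url,
                            world_size=num_gpus,
                            rank=rank)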
Example #3
def train(
    num_gpus,
    rank,
    group_name,
    output_directory,
    epochs,
    learning_rate,
    sigma,
    iters_per_checkpoint,
    batch_size,
    seed,
    fp16_run,
    checkpoint_path,
    with_tensorboard,
):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp

        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(
        trainset,
        num_workers=1,
        shuffle=False,
        sampler=train_sampler,
        batch_size=batch_size,
        pin_memory=False,
        drop_last=True,
    )

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter

        logger = SummaryWriter(os.path.join(output_directory, "logs"))

    # fixed for visualization
    real_mels, real_audios = zip(*[trainset[i] for i in range(8)])
    real_mel = torch.cat(real_mels, dim=-1)
    real_audio = torch.cat(real_audios, dim=0)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                step = i + len(train_loader) * epoch
                logger.add_scalar("training_loss", reduced_loss, step)
                if step % 500 == 0:
                    # visualize the eight fixed samples selected above

                    model.eval()
                    with torch.no_grad():
                        device = mel.device
                        fake_audio = (model.infer(
                            torch.stack(real_mels).to(device)).flatten(
                                0, 1).cpu())
                    model.train()
                    fake_mel = trainset.get_mel(fake_audio)

                    logger.add_image(
                        "training_mel_real",
                        plot_spectrogram_to_numpy(real_mel),
                        step,
                        dataformats="HWC",
                    )
                    logger.add_audio(
                        "training_audio_real",
                        real_audio,
                        step,
                        22050,
                    )
                    logger.add_image(
                        "training_mel_fake",
                        plot_spectrogram_to_numpy(fake_mel),
                        step,
                        dataformats="HWC",
                    )
                    logger.add_audio(
                        "training_audio_fake",
                        fake_audio,
                        step,
                        22050,
                    )
                    logger.flush()

            if iteration % iters_per_checkpoint == 0:
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
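
`load_checkpoint` and `save_checkpoint` are not part of the snippet. A minimal sketch, assuming a checkpoint dictionary that stores the model state, the optimizer state, the learning rate and the iteration counter (the actual repositories may store the model object itself):

import torch

def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    # Persist everything needed to resume training from this iteration.
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate,
                'iteration': iteration}, filepath)

def load_checkpoint(checkpoint_path, model, optimizer):
    # Restore model/optimizer state and return the stored iteration.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['iteration']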
Example #4
def train(num_gpus,
          rank,
          group_name,
          output_directory,
          epochs,
          learning_rate,
          sigma,
          iters_per_checkpoint,
          batch_size,
          seed,
          fp16_run,
          checkpoint_path,
          with_tensorboard,
          num_workers=4):
    print("num_workers", num_workers)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.96)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    evalset = Mel2Samp(**eval_data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=num_workers,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    eval_loader = DataLoader(evalset,
                             num_workers=num_workers,
                             shuffle=False,
                             sampler=eval_sampler,
                             batch_size=batch_size,
                             pin_memory=False,
                             drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    epoch_offset = max(1, int(iteration / len(train_loader)))
    start_time = datetime.datetime.now()
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print('Epoch:', epoch, 'LR:', scheduler.get_lr())
        elapsed = datetime.datetime.now() - start_time
        print("Epoch: [{}][els: {}] {}".format(
            datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed,
            epoch))
        model.train()
        total_loss = 0.
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            if waveglow_config["multi_speaker_config"]["use_multi_speaker"]:
                mel, audio, spk_embed_or_id = batch
                spk_embed_or_id = torch.autograd.Variable(
                    spk_embed_or_id.cuda())
            else:
                mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())

            if waveglow_config["multi_speaker_config"]["use_multi_speaker"]:
                outputs = model((mel, audio, spk_embed_or_id))
            else:
                outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()
            total_loss += reduced_loss
            if i > 0 and i % 10 == 0:
                elapsed = datetime.datetime.now() - start_time
                print(
                    "[{}][els: {}] epoch {},total steps{}, {}/{} steps:\t{:.9f}"
                    .format(
                        datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"),
                        elapsed, epoch, iteration, i, len(train_loader),
                        reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
        elapsed = datetime.datetime.now() - start_time
        print("[{}][els: {}] {} epoch :\tavg loss {:.9f}".format(
            datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed,
            epoch, total_loss / len(train_loader)))
        scheduler.step()
        eval.eval(eval_loader, model, criterion, num_gpus, start_time, epoch,
                  waveglow_config["multi_speaker_config"]["use_multi_speaker"])
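
`apply_gradient_allreduce` appears to be a custom distributed wrapper and is not reproduced in these snippets. A hedged sketch of the equivalent effect using the built-in `torch.nn.parallel.DistributedDataParallel`, assuming the process group has already been initialized and one process drives one GPU:

from torch.nn.parallel import DistributedDataParallel as DDP

def wrap_model_for_ddp(model, rank):
    # DDP averages gradients across ranks during backward(), which is the
    # same effect the custom all-reduce wrapper provides.
    model = model.cuda(rank)
    return DDP(model, device_ids=[rank])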
Example #5
def train(n_gpus,
          rank,
          output_directory,
          epochs,
          optim_algo,
          learning_rate,
          weight_decay,
          sigma,
          iters_per_checkpoint,
          batch_size,
          seed,
          checkpoint_path,
          ignore_layers,
          include_layers,
          finetune_layers,
          warmstart_checkpoint_path,
          with_tensorboard,
          grad_clip_val,
          fp16_run,
          tensorboard_path=None):
    fp16_run = bool(fp16_run)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    if n_gpus > 1:
        init_distributed(rank, n_gpus, **dist_config)

    criterion = FlowtronLoss(sigma, bool(model_config['n_components']),
                             bool(model_config['use_gate_layer']))
    model = Flowtron(**model_config).cuda()

    if len(finetune_layers):
        for name, param in model.named_parameters():
            if name in finetune_layers:
                param.requires_grad = True
            else:
                param.requires_grad = False

    print("Initializing %s optimizer" % (optim_algo))
    if optim_algo == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=learning_rate,
                                     weight_decay=weight_decay)
    elif optim_algo == 'RAdam':
        optimizer = RAdam(model.parameters(),
                          lr=learning_rate,
                          weight_decay=weight_decay)
    else:
        print("Unrecognized optimizer %s!" % (optim_algo))
        exit(1)

    # Load checkpoint if one exists
    iteration = 0
    if warmstart_checkpoint_path != "":
        model = warmstart(warmstart_checkpoint_path, model)

    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer, ignore_layers)
        iteration += 1  # next iteration is iteration + 1

    if n_gpus > 1:
        model = apply_gradient_allreduce(model)
    print(model)
    scaler = amp.GradScaler(enabled=fp16_run)

    train_loader, valset, collate_fn = prepare_dataloaders(
        data_config, n_gpus, batch_size)

    # Get shared output_directory ready
    if rank == 0 and not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
        print("Output directory", output_directory)

    if with_tensorboard and rank == 0:
        tboard_out_path = tensorboard_path
        if tensorboard_path is None:
            tboard_out_path = os.path.join(output_directory, "logs/run1")
        print("Setting up Tensorboard log in %s" % (tboard_out_path))
        logger = FlowtronLogger(tboard_out_path)

    # force set the learning rate to what is specified
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))

    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for batch in train_loader:
            model.zero_grad()

            mel, speaker_vecs, text, in_lens, out_lens, gate_target, attn_prior = batch
            mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda(
            ), text.cuda()
            in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda(
            ), gate_target.cuda()
            attn_prior = attn_prior.cuda() if valset.use_attn_prior else None
            with amp.autocast(enabled=fp16_run):
                z, log_s_list, gate_pred, attn, mean, log_var, prob = model(
                    mel, speaker_vecs, text, in_lens, out_lens, attn_prior)

                loss_nll, loss_gate = criterion(
                    (z, log_s_list, gate_pred, mean, log_var, prob),
                    gate_target, out_lens)
                loss = loss_nll + loss_gate

            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
                reduced_gate_loss = reduce_tensor(loss_gate.data,
                                                  n_gpus).item()
                reduced_nll_loss = reduce_tensor(loss_nll.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
                reduced_gate_loss = loss_gate.item()
                reduced_nll_loss = loss_nll.item()

            scaler.scale(loss).backward()
            if grad_clip_val > 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               grad_clip_val)

            scaler.step(optimizer)
            scaler.update()

            if rank == 0:
                print("{}:\t{:.9f}".format(iteration, reduced_loss),
                      flush=True)

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, iteration)
                logger.add_scalar('training_loss_gate', reduced_gate_loss,
                                  iteration)
                logger.add_scalar('training_loss_nll', reduced_nll_loss,
                                  iteration)
                logger.add_scalar('learning_rate', learning_rate, iteration)

            if iteration % iters_per_checkpoint == 0:
                val_loss, val_loss_nll, val_loss_gate, attns, gate_pred, gate_target = compute_validation_loss(
                    model, criterion, valset, collate_fn, batch_size, n_gpus)
                if rank == 0:
                    print("Validation loss {}: {:9f}  ".format(
                        iteration, val_loss))
                    if with_tensorboard:
                        logger.log_validation(val_loss, val_loss_nll,
                                              val_loss_gate, attns, gate_pred,
                                              gate_target, iteration)

                    checkpoint_path = "{}/model_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
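
`warmstart` is used above to initialize from a pre-trained checkpoint but is not shown. A sketch of what such a warm-start loader typically does; the checkpoint key layout here is an assumption:

import torch

def warmstart(checkpoint_path, model, ignore_layers=None):
    # Copy matching weights from a pre-trained checkpoint, skipping layers
    # listed in ignore_layers and any parameters whose shapes differ.
    ignore_layers = ignore_layers or []
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    pretrained = checkpoint['state_dict'] if 'state_dict' in checkpoint else checkpoint['model']
    if hasattr(pretrained, 'state_dict'):  # a whole model object was saved
        pretrained = pretrained.state_dict()
    model_dict = model.state_dict()
    filtered = {k: v for k, v in pretrained.items()
                if k in model_dict
                and k not in ignore_layers
                and v.shape == model_dict[k].shape}
    model_dict.update(filtered)
    model.load_state_dict(model_dict)
    return model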
Example #6
def train(n_gpus, rank, output_directory, epochs, learning_rate, weight_decay,
          sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path,
          ignore_layers, include_layers, warmstart_checkpoint_path,
          with_tensorboard, fp16_run):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    if n_gpus > 1:
        init_distributed(rank, n_gpus, **dist_config)

    criterion = FlowtronLoss(sigma, bool(model_config['n_components']),
                             model_config['use_gate_layer'])
    model = Flowtron(**model_config).cuda()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)

    # Load checkpoint if one exists
    iteration = 0
    if warmstart_checkpoint_path != "":
        model = warmstart(warmstart_checkpoint_path, model)

    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer, ignore_layers)
        iteration += 1  # next iteration is iteration + 1

    if n_gpus > 1:
        model = apply_gradient_allreduce(model)
    print(model)
    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    train_loader, valset, collate_fn = prepare_dataloaders(
        data_config, n_gpus, batch_size)

    # Get shared output_directory ready
    if rank == 0 and not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
    print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        logger = FlowtronLogger(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for batch in train_loader:
            model.zero_grad()

            mel, speaker_vecs, text, in_lens, out_lens, gate_target = batch
            mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda(
            ), text.cuda()
            in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda(
            ), gate_target.cuda()

            z, log_s_list, gate_pred, attn, mean, log_var, prob = model(
                mel, speaker_vecs, text, in_lens, out_lens)
            loss = criterion((z, log_s_list, gate_pred, mean, log_var, prob),
                             gate_target, out_lens)

            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

            if rank == 0:
                print("{}:\t{:.9f}".format(iteration, reduced_loss),
                      flush=True)

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, iteration)
                logger.add_scalar('learning_rate', learning_rate, iteration)

            if (iteration % iters_per_checkpoint == 0):
                val_loss, attns, gate_pred, gate_target = compute_validation_loss(
                    model, criterion, valset, collate_fn, batch_size, n_gpus)
                if rank == 0:
                    print("Validation loss {}: {:9f}  ".format(
                        iteration, val_loss))
                    if with_tensorboard:
                        logger.log_validation(val_loss, attns, gate_pred,
                                              gate_target, iteration)

                    checkpoint_path = "{}/model_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
Example #7
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(data_config['training_files'],
                        data_config['segment_length'],
                        data_config['filter_length'],
                        data_config['hop_length'],
                        data_config['win_length'],
                        data_config['sampling_rate'],
                        data_config['mel_fmin'],
                        data_config['mel_fmax'],
                        debug=False)

    if 'testing_files' in data_config:
        testset = Mel2Samp(data_config['testing_files'],
                           data_config['segment_length'],
                           data_config['filter_length'],
                           data_config['hop_length'],
                           data_config['win_length'],
                           data_config['sampling_rate'],
                           data_config['mel_fmin'],
                           data_config['mel_fmax'],
                           debug=True)
    else:
        testset = None

    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))
    else:
        logger = None

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()

            model.zero_grad()

            print("train batch loaded, {} ({} of {})".format(
                iteration, i, len(train_loader)))
            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            is_overflow = False
            if fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), 1.0)
                is_overflow = math.isnan(grad_norm)

            optimizer.step()

            duration = time.perf_counter() - start

            print(
                "train batch done, {} ({} of {}): {:.9f} (took {:.2f})".format(
                    iteration, i, len(train_loader), reduced_loss, duration))

            if logger:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)
                logger.add_scalar('duration', duration,
                                  i + len(train_loader) * epoch)

            if testset and not is_overflow and (iteration %
                                                iters_per_checkpoint == 0):
                validate(model, criterion, testset, iteration, batch_size,
                         num_gpus, logger)

                if rank == 0:
                    rotate_checkpoints(output_directory)
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
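
`rotate_checkpoints` is not defined in the snippet; presumably it prunes old checkpoints so the output directory stays bounded. A minimal sketch assuming the `waveglow_<iteration>` naming used above and a hypothetical `keep_last` parameter:

import glob
import os

def rotate_checkpoints(output_directory, keep_last=5):
    # Keep only the most recent checkpoints, ordered by iteration number.
    paths = glob.glob(os.path.join(output_directory, "waveglow_*"))

    def iteration_of(path):
        try:
            return int(path.rsplit("_", 1)[-1])
        except ValueError:
            return -1

    for path in sorted(paths, key=iteration_of)[:-keep_last]:
        os.remove(path)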
Example #8
    def __init__(self,
                 opt=None,
                 train_dt=None,
                 train_dt_warm=None,
                 dis_list=[],
                 val_dt_warm=None):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.opt = opt

        self.visualizer = Visualizer(opt)

        num_gpus = torch.cuda.device_count()
        #dis_list[1]
        print(dis_list)
        #torch.cuda.device_count()
        self.rank = dis_list[0]
        print(self.rank)

        #=====START: ADDED FOR DISTRIBUTED======
        if num_gpus > 1:
            #init_distributed(rank, num_gpus, group_name, **dist_config)
            dist_config = dis_list[3]
            init_distributed(dis_list[0], dis_list[1], dis_list[2],
                             **dist_config)
        #=====END:   ADDED FOR DISTRIBUTED======

        if opt.ge_net == "srfeat":
            self.netG = model.G()
        elif opt.ge_net == "carn":
            self.netG = model.G1()
        elif opt.ge_net == "carnm":
            self.netG = model.G2()
        else:
            raise Exception("unknow ")

        self.netD_vgg = model.D(input_c=512, input_width=18)

        self.netD = model.D()

        if opt.vgg_type == "style":
            self.vgg = load_vgg16(opt.vgg_model_path + '/models')
        elif opt.vgg_type == "classify":
            self.vgg = model.vgg19_withoutbn_customefinetune()

        self.vgg.eval()
        for param in self.vgg.parameters():
            param.requires_grad = False

#         for p in self.vgg.parameters():
#             p.requires_grad = False

        init_weights(self.netD, init_type=opt.init)
        init_weights(self.netD_vgg, init_type=opt.init)
        init_weights(self.netG, init_type=opt.init)

        self.vgg = self.vgg.to(self.device)
        self.netD = self.netD.to(self.device)
        self.netD_vgg = self.netD_vgg.to(self.device)
        self.netG = self.netG.to(self.device)

        #=====START: ADDED FOR DISTRIBUTED======
        if num_gpus > 1:
            #self.vgg = apply_gradient_allreduce(self.vgg)
            self.netD_vgg = apply_gradient_allreduce(self.netD_vgg)
            self.netD = apply_gradient_allreduce(self.netD)
            self.netG = apply_gradient_allreduce(self.netG)

        #=====END:   ADDED FOR DISTRIBUTED======

        print(opt)

        self.optim_G = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.netG.parameters()),
            lr=opt.warm_opt.lr,
            betas=opt.warm_opt.betas,
            weight_decay=0.0)

        #        self.optim_G= torch.optim.Adam(filter(lambda p: p.requires_grad, self.netG.parameters()),\
        #         lr=opt.gen.lr, betas=opt.gen.betas, weight_decay=0.0)

        if opt.dis.optim == "sgd":
            self.optim_D= torch.optim.SGD( filter(lambda p: p.requires_grad, \
                itertools.chain(self.netD_vgg.parameters(),self.netD.parameters() ) ),\
                lr=opt.dis.lr,
             )
        elif opt.dis.optim == "adam":
            self.optim_D= torch.optim.Adam( filter(lambda p: p.requires_grad, \
                itertools.chain(self.netD_vgg.parameters(),self.netD.parameters() ) ),\
                lr=opt.dis.lr,betas=opt.dis.betas, weight_decay=0.0
             )
        else:
            raise Exception("unknown")

        print("create schedule ")

        lr_sc_G = get_scheduler(self.optim_G, opt.gen)
        lr_sc_D = get_scheduler(self.optim_D, opt.dis)

        self.schedulers = []

        self.schedulers.append(lr_sc_G)
        self.schedulers.append(lr_sc_D)

        # =====START: ADDED FOR DISTRIBUTED======
        train_dt = torch.utils.data.ConcatDataset([train_dt, train_dt_warm])

        train_sampler = DistributedSampler(train_dt) if num_gpus > 1 else None
        val_sampler_warm = DistributedSampler(
            val_dt_warm) if num_gpus > 1 else None
        # =====END:   ADDED FOR DISTRIBUTED======

        kw = {
            "pin_memory": True,
            "num_workers": 8
        } if torch.cuda.is_available() else {}
        dl_c = t_data.DataLoader(train_dt,
                                 batch_size=opt.batch_size,
                                 sampler=train_sampler,
                                 drop_last=True,
                                 **kw)

        dl_val_warm = t_data.DataLoader(
            val_dt_warm,
            batch_size=opt.batch_size
            if not hasattr(opt, "batch_size_warm") else opt.batch_size_warm,
            sampler=val_sampler_warm,
            drop_last=True,
            **kw)

        self.dt_train = dl_c
        self.dt_val_warm = dl_val_warm

        if opt.warm_opt.loss_fn == "mse":
            self.critic_pixel = torch.nn.MSELoss()
        elif opt.warm_opt.loss_fn == "l1":
            self.critic_pixel = torch.nn.L1Loss()
        elif opt.warm_opt.loss_fn == "smooth_l1":
            self.critic_pixel = torch.nn.SmoothL1Loss()
        else:
            raise Exception("unknown")

        self.critic_pixel = self.critic_pixel.to(self.device)

        self.gan_loss = GANLoss(gan_mode=opt.gan_loss_fn).to(self.device)
        print("init ....")

        self.save_dir = os.path.dirname(self.visualizer.log_name)
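
`GANLoss` is constructed above from `opt.gan_loss_fn` but not included. A hedged sketch of the common pattern such a wrapper follows (least-squares or BCE adversarial loss against constant targets); the supported mode names are assumptions:

import torch
import torch.nn as nn

class GANLoss(nn.Module):
    def __init__(self, gan_mode='lsgan'):
        super().__init__()
        if gan_mode == 'lsgan':
            self.loss = nn.MSELoss()
        elif gan_mode == 'vanilla':
            self.loss = nn.BCEWithLogitsLoss()
        else:
            raise ValueError('unknown gan_mode: {}'.format(gan_mode))

    def forward(self, prediction, target_is_real):
        # Compare discriminator scores against an all-ones or all-zeros target.
        target = torch.ones_like(prediction) if target_is_real \
            else torch.zeros_like(prediction)
        return self.loss(prediction, target)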
Example #9
def train(n_gpus, rank, group_name):
    if n_gpus > 1:
        if rank == 0: print('Synchronizing distributed flow...')
        init_distributed(rank, n_gpus, group_name, config['dist_config'])

    torch.manual_seed(config['seed'])
    torch.cuda.manual_seed(config['seed'])

    if rank == 0: print('Initializing model, optimizer and loss...')
    model = Tacotron2(config).cuda()
    criterion = Tacotron2Loss()
    learning_rate = config['learning_rate']
    optimizer = torch.optim.Adam(params=model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=config['weight_decay'])
    if config['fp16_run']:
        if rank == 0: print('Using FP16...')
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    if rank == 0: print('Preparing dirs, data loaders and logger...')
    logger = prepare_directories_and_logger(config['output_directory'],
                                            config['log_directory'], rank)
    train_loader, valset, collate_fn = prepare_dataloaders(
        config['training_files'], config['validation_files'],
        config['n_frames_per_step'], n_gpus)

    iteration = 0
    epoch_offset = 0
    if config['warm_up_checkpoint'] is not None:
        if rank == 0:
            print('Loading checkpoint from {}...'.format(
                config['warm_up_checkpoint']))

        model = load_checkpoint(config['warm_up_checkpoint'], model, optimizer)

        iteration += 1  # next iteration is iteration + 1
        epoch_offset = max(0, int(iteration / len(train_loader)))

    model.compress_factorize(config=config['compress_config'])
    model.train()

    # Main training loop
    for epoch in range(epoch_offset, config['epochs']):
        print("Epoch: {}".format(epoch))
        for _, batch in enumerate(train_loader):
            start = time.perf_counter()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
            if config['fp16_run']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if iteration % config['iters_per_grad_acc'] == 0:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), config['grad_clip_thresh'])

                optimizer.step()
                model.zero_grad()

                if rank == 0:
                    duration = time.perf_counter() - start
                    print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".
                          format(iteration, reduced_loss, grad_norm, duration))
                    logger.log_training(reduced_loss, grad_norm, learning_rate,
                                        duration, iteration)

            if iteration % config['iters_per_validation'] == 0:
                validate(model, criterion, valset, iteration,
                         config['batch_size'], n_gpus, collate_fn, logger,
                         rank)

            if iteration % config['iters_per_checkpoint'] == 0:
                if rank == 0:
                    checkpoint_path = os.path.join(
                        config['output_directory'],
                        "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, iteration,
                                    checkpoint_path)

            iteration += 1
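
This example steps the optimizer only every `iters_per_grad_acc` iterations, i.e. it accumulates gradients across micro-batches. A standalone sketch of that pattern with the loss scaled by the accumulation factor (the snippet above accumulates unscaled gradients, which effectively enlarges the step); the function and argument names are illustrative:

import torch

def train_with_grad_accumulation(model, criterion, optimizer, loader,
                                 accum_steps=4, grad_clip=1.0):
    model.train()
    optimizer.zero_grad()
    for iteration, (x, y) in enumerate(loader):
        # Scale the loss so the accumulated gradient matches a larger batch.
        loss = criterion(model(x), y) / accum_steps
        loss.backward()
        if (iteration + 1) % accum_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            optimizer.zero_grad()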
Example #10
def fit(a, epochs):
    if h.num_gpus > 1:
        init_distributed(a.rank, h.num_gpus, a.group_name, h.dist_config['dist_backend'], h.dist_config['dist_url'])

    generator = Generator().to(device)
    discriminator = MultiScaleDiscriminator().to(device)

    if h.num_gpus > 1:
        generator = apply_gradient_allreduce(generator)
        discriminator = apply_gradient_allreduce(discriminator)

    g_optim = torch.optim.Adam(generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2])
    d_optim = torch.optim.Adam(discriminator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2])

    steps = 0
    if a.cp_g != "" and  a.cp_d != "":
        generator, g_optim, steps = load_checkpoint(a.cp_g, generator, g_optim)
        discriminator, d_optim, steps = load_checkpoint(a.cp_d, discriminator, d_optim)
        steps += 1

    with open(a.input_train_metafile, 'r', encoding='utf-8') as fi:
        training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav')
                          for x in fi.read().split('\n') if len(x) > 0]

    with open(a.input_valid_metafile, 'r', encoding='utf-8') as fi:
        validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav')
                            for x in fi.read().split('\n') if len(x) > 0]

    trainset = MelDataset(training_files, h.segment_size, h.n_fft, h.num_mels,
                        h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax)

    train_sampler = DistributedSampler(trainset) if h.num_gpus > 1 else None

    train_loader = DataLoader(trainset, num_workers=h.num_workers, shuffle=False,
                              sampler=train_sampler,
                              batch_size=h.batch_size,
                              pin_memory=False,
                              drop_last=True)

    if a.rank == 0:
        validset = MelDataset(validation_files, h.segment_size, h.n_fft, h.num_mels,
                            h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, False, False, n_cache_reuse=0)
        valid_loader = DataLoader(validset, num_workers=1, shuffle=False,
                                  sampler=None,
                                  batch_size=1,
                                  pin_memory=False,
                                  drop_last=True)

    if a.rank == 0:
        os.makedirs(a.cps, exist_ok=True)
        print("checkpoints directory : ", a.cps)
        sw = SummaryWriter(os.path.join(a.cps, 'logs'))

    epoch_offset = max(0, int(steps / len(train_loader)))
    generator.train()
    discriminator.train()
    for epoch in range(epoch_offset, epochs):
        start = time.time()

        if a.rank == 0:
            print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start_b = time.time()
            x, y, _ = batch
            x = torch.autograd.Variable(x.to(device))
            y = torch.autograd.Variable(y.to(device))
            y = y.unsqueeze(1)

            g_optim.zero_grad()
            y_ghat = generator(x)
            y_dhat_r, y_dhat_g, fmap_r, fmap_g = discriminator(y, y_ghat)
            loss_fm = feature_loss(fmap_r, fmap_g)
            loss_gen = generator_loss(y_dhat_g) + loss_fm
            if h.num_gpus > 1:
                reduced_loss_gen = reduce_tensor(loss_gen.data, h.num_gpus).item()
            else:
                reduced_loss_gen = loss_gen.item()
            loss_gen.backward()
            g_optim.step()

            d_optim.zero_grad()
            y_ghat = y_ghat.detach()
            y_dhat_r, y_dhat_g, _, _ = discriminator(y, y_ghat)
            loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_dhat_r, y_dhat_g)
            if h.num_gpus > 1:
                reduced_loss_disc = reduce_tensor(loss_disc.data, h.num_gpus).item()
            else:
                reduced_loss_disc = loss_disc.item()
            loss_disc.backward()
            d_optim.step()

            if a.rank == 0 and steps % a.stdout_interval == 0:
                print('Steps : {:d}, Gen Loss : {:4.3f}, Disc Loss : {:4.3f}, s/b : {:4.3f}'.
                      format(steps, reduced_loss_gen, reduced_loss_disc, time.time() - start_b))

            if a.rank == 0 and steps % a.checkpoint_interval == 0 and steps != 0:
                checkpoint_path = "{}/g_{:08d}".format(a.cps, steps)
                save_checkpoint(generator, g_optim, h.learning_rate, steps, checkpoint_path)
                checkpoint_path = "{}/d_{:08d}".format(a.cps, steps)
                save_checkpoint(discriminator, d_optim, h.learning_rate, steps, checkpoint_path)

            if a.rank == 0 and steps % a.summary_interval == 0:
                sw.add_scalar("training/gen_loss", reduced_loss_gen, steps)
                sw.add_scalar("training/disc_loss", reduced_loss_disc, steps)
                for i, (r, g) in enumerate(zip(losses_disc_r, losses_disc_g)):
                    sw.add_scalar("training/disc{:d}_loss_r".format(i+1), r, steps)
                    sw.add_scalar("training/disc{:d}_loss_g".format(i+1), g, steps)
                for i, (r, g) in enumerate(zip(y_dhat_r, y_dhat_g)):
                    sw.add_histogram("training/disc{:d}_r_output".format(i+1), r, steps)
                    sw.add_histogram("training/disc{:d}_g_output".format(i+1), g, steps)
                sw.add_histogram("training/gen_output", y_ghat, steps)
                sw.add_audio('training_gt/y', y[0], steps, h.sampling_rate)
                sw.add_audio('training_predicted/y_hat', y_ghat[0], steps, h.sampling_rate)

            if a.rank == 0 and steps % a.validation_interval == 0: # and steps != 0:
                for i, batch in enumerate(valid_loader):
                    x, y, _ = batch
                    y_ghat = generator(x.to(device))

                    sw.add_audio('validation_gt/y_{}'.format(i), y[0], steps, h.sampling_rate)
                    sw.add_audio('validation_predicted/y_hat_{}'.format(i), y_ghat[0], steps, h.sampling_rate)

                    # print(plot_spectrogram(x[i]))
                    sw.add_figure('validation_gt/y_spec_{}'.format(i), plot_spectrogram(x[0]), steps)
                    y_hat_spec = mel_spectrogram(y_ghat.detach().cpu().numpy()[0][0], h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size,
                              h.fmin, h.fmax, center=False)
                    sw.add_figure('validation_predicted/y_hat_spec_{}'.format(i), plot_spectrogram(y_hat_spec), steps)
                    if i == 4:
                        break

            steps += 1

        if a.rank == 0:
            print('Time taken for epoch {} is {} sec\n'.format(epoch + 1, int(time.time()-start)))
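
The loss helpers `feature_loss`, `generator_loss` and `discriminator_loss` are referenced but not shown. A sketch of the multi-scale least-squares GAN formulation they typically implement; the exact weighting is an assumption:

import torch

def feature_loss(fmap_r, fmap_g):
    # L1 distance between real and generated feature maps of every layer
    # of every sub-discriminator.
    loss = 0.0
    for dr, dg in zip(fmap_r, fmap_g):
        for rl, gl in zip(dr, dg):
            loss = loss + torch.mean(torch.abs(rl - gl))
    return loss

def generator_loss(disc_generated_outputs):
    # Push scores of generated audio toward 1.
    return sum(torch.mean((1.0 - dg) ** 2) for dg in disc_generated_outputs)

def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    # Real scores toward 1, generated scores toward 0; also return the
    # per-scale terms for logging, matching how the snippet unpacks them.
    losses_r, losses_g = [], []
    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
        losses_r.append(torch.mean((1.0 - dr) ** 2))
        losses_g.append(torch.mean(dg ** 2))
    return sum(losses_r) + sum(losses_g), losses_r, losses_g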
Example #11
def train(num_gpus, rank, group_name, output_directory, epochs,
          g_learning_rate, d_learning_rate, adv_ag, adv_fd, lamda_adv,
          lamda_feat, warmup_steps, decay_learning_rate, iters_per_checkpoint,
          batch_size, seed, checkpoint_path):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    model = torch.nn.Module()
    model.add_module('encoder', Encoder(**encoder_config))
    model.add_module('generator',
                     Generator(sum(encoder_config['n_out_channels'])))
    model.add_module('discriminator',
                     MultiScaleDiscriminator(**discriminator_config))
    model.add_module(
        'disentangler',
        Disentangler(encoder_config['n_out_channels'][0],
                     sum(encoder_config['n_out_channels'][1:])))
    model = model.cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    # Using RAdam as optimizer
    # Lookahead has resume training issues:
    # lr schedule doesn't affect nested RAdam of Lookahead
    g_parameters = list(model.generator.parameters())
    g_parameters = list(model.encoder.parameters()) + g_parameters
    g_optimizer = RAdam(g_parameters, lr=g_learning_rate)

    d_parameters = list(model.discriminator.parameters())
    d_parameters = list(model.disentangler.parameters()) + d_parameters
    d_optimizer = RAdam(d_parameters, lr=d_learning_rate)

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, g_optimizer, d_optimizer, iteration = load_checkpoint(
            checkpoint_path, model, g_optimizer, d_optimizer)
        iteration += 1  # next iteration is iteration + 1

    customer_g_optimizer = Optimizer(g_optimizer, g_learning_rate, iteration,
                                     warmup_steps, decay_learning_rate)
    customer_d_optimizer = Optimizer(d_optimizer, d_learning_rate, iteration,
                                     warmup_steps, decay_learning_rate)

    criterion = nn.MSELoss()
    l1_loss = nn.L1Loss()
    stft_criterion = MultiResolutionSTFTLoss()

    trainset = Dataset(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=(train_sampler is None),
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
        logdir = os.path.join(
            output_directory,
            time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()))
        os.makedirs(logdir, exist_ok=True)
        writer = SummaryWriter(logdir=logdir)
        anchors = [
            'loss_g', 'loss_g_sc', 'loss_g_mag', 'loss_g_adv', 'loss_g_feat',
            'loss_g_fd', 'loss_d', 'loss_d_real', 'loss_d_fake', 'loss_d_fd'
        ]
        meters = {
            x: LossMeter(x, writer, 100, iteration, True)
            for x in anchors
        }

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))

    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        tbar = tqdm(
            enumerate(train_loader)) if rank == 0 else enumerate(train_loader)
        for i, batch in tbar:
            model.zero_grad()

            cond, a = [to_gpu(x) for x in batch]
            # Get generator outputs
            x = model.encoder(cond)
            g_outputs = model.generator(x)

            losses = {}

            # Get discriminator loss
            customer_d_optimizer.zero_grad()
            d_loss = []
            # Adversarial training for audio generation
            if adv_ag == True:
                real_scores, _ = model.discriminator(a.unsqueeze(1))
                fake_scores, _ = model.discriminator(g_outputs.detach())

                d_loss_fake_list, d_loss_real_list = [], []
                for (real_score, fake_score) in zip(real_scores, fake_scores):
                    d_loss_real_list.append(
                        criterion(real_score, torch.ones_like(real_score)))
                    d_loss_fake_list.append(
                        criterion(fake_score, torch.zeros_like(fake_score)))

                d_loss_real = sum(d_loss_real_list) / len(d_loss_real_list)
                d_loss_fake = sum(d_loss_fake_list) / len(d_loss_fake_list)
                d_loss = d_loss + [d_loss_real, d_loss_fake]
                losses.update({
                    'loss_d_real': d_loss_real,
                    'loss_d_fake': d_loss_fake
                })
            # Adversarial training for feature disentanglement
            if adv_fd == True:
                split_x = torch.split(x.detach(),
                                      encoder_config['n_out_channels'],
                                      dim=1)
                pred = model.disentangler(split_x[0])
                d_loss_fd = F.l1_loss(pred, torch.cat((split_x[1:]), dim=1))
                d_loss = d_loss + [d_loss_fd]
                losses.update({'loss_d_fd': d_loss_fd})
            if len(d_loss) > 0:
                d_loss = sum(d_loss)
                d_loss.backward()
                nn.utils.clip_grad_norm_(d_parameters, max_norm=10)
                customer_d_optimizer.step_and_update_lr()
                losses.update({'loss_d': d_loss})

            # Get generator loss
            customer_g_optimizer.zero_grad()
            g_clip_norm_scale = 10
            # STFT Loss
            sc_loss, mag_loss = stft_criterion(g_outputs.squeeze(1), a)
            g_loss = sc_loss + mag_loss
            losses.update({'loss_g_sc': sc_loss, 'loss_g_mag': mag_loss})
            # Adversarial training for audio generation
            if adv_ag == True:
                fake_scores, fake_feats = model.discriminator(g_outputs)
                real_scores, real_feats = model.discriminator(a.unsqueeze(1))

                adv_loss_list, feat_loss_list = [], []
                for fake_score in fake_scores:
                    adv_loss_list.append(
                        criterion(fake_score, torch.ones_like(fake_score)))
                adv_loss = sum(adv_loss_list) / len(adv_loss_list)

                for i in range(len(fake_feats)):
                    for j in range(len(fake_feats[i])):
                        feat_loss_list.append(
                            l1_loss(fake_feats[i][j],
                                    real_feats[i][j].detach()))
                feat_loss = sum(feat_loss_list) / len(feat_loss_list)

                g_loss = g_loss + adv_loss * lamda_adv + feat_loss * lamda_feat
                losses.update({'loss_g_adv': adv_loss})
                losses.update({'loss_g_feat': feat_loss})
                g_clip_norm_scale = 0.5
            # Adversarial training for feature disentanglement
            if adv_fd == True:
                split_x = torch.split(x,
                                      encoder_config['n_out_channels'],
                                      dim=1)
                pred = model.disentangler(split_x[0])
                g_loss_fd = F.l1_loss(pred,
                                      torch.cat((split_x[1:]), dim=1).detach())
                g_loss = g_loss + (-1.0) * g_loss_fd
                losses.update({'loss_g_fd': g_loss_fd})
            g_loss.backward()
            nn.utils.clip_grad_norm_(g_parameters, max_norm=g_clip_norm_scale)
            customer_g_optimizer.step_and_update_lr()
            losses.update({'loss_g': g_loss})

            # only write logs from the rank-0 GPU
            if rank == 0:
                tbar.set_description("{:>7}:  ".format(iteration) + ', '.join([
                    "{}: {:.1e}".format(x[5:], losses[x].item())
                    for x in losses.keys()
                ]))
                for x in losses:
                    meters[x].add(losses[x].item())
                if (iteration % iters_per_checkpoint == 0):
                    checkpoint_path = "{}/model_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, g_optimizer, d_optimizer, iteration,
                                    checkpoint_path)

            iteration += 1
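
Note: the `MultiResolutionSTFTLoss` used as `stft_criterion` above is not defined in this snippet. A minimal sketch of what such a loss typically computes (a spectral-convergence term plus a log-magnitude term, averaged over several STFT resolutions) is shown below; the resolutions, window choice and clamping epsilon are illustrative assumptions.

import torch
import torch.nn.functional as F


def _stft_magnitude(x, fft_size, hop_size, win_length):
    window = torch.hann_window(win_length, device=x.device)
    spec = torch.stft(x, fft_size, hop_size, win_length, window,
                      return_complex=True)
    return spec.abs().clamp(min=1e-7)


def multi_resolution_stft_loss(pred, target,
                               resolutions=((1024, 256, 1024),
                                            (2048, 512, 2048),
                                            (512, 128, 512))):
    sc_loss, mag_loss = 0.0, 0.0
    for fft_size, hop_size, win_length in resolutions:
        p = _stft_magnitude(pred, fft_size, hop_size, win_length)
        t = _stft_magnitude(target, fft_size, hop_size, win_length)
        # Spectral convergence: relative Frobenius error of the magnitudes.
        sc_loss = sc_loss + torch.norm(t - p) / torch.norm(t)
        # Log-magnitude loss: L1 distance in the log domain.
        mag_loss = mag_loss + F.l1_loss(torch.log(p), torch.log(t))
    n = len(resolutions)
    return sc_loss / n, mag_loss / n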
Example #12
def train(model_name, train_list, max_seq_len, batch_size, train_epoch,
          learning_rate, iters_per_checkpoint, iters_per_eval,
          n_warm_up_epoch, warm_up_lr, checkpoint_dir, use_f0=True, preload_data=False,
          checkpoint_path="", seed=12345, num_gpus=1, rank=0, group_name=""):
    torch.manual_seed(seed)
    if num_gpus > 1:
        init_distributed(rank=rank, num_gpus=num_gpus, group_name=group_name, **dist_configs)

    timestamp = strftime("%Y%m%d_%H%M_" + checkpoint_dir, localtime())
    output_path = join("checkpoints/", timestamp)
    dataset = MelCepstrumDataset(train_list, use_f0=use_f0, preload_data=preload_data)

    if rank == 0:
        print("Checkpoint dir: %s" % output_path)
        if not exists(output_path):
            os.makedirs(output_path)
        subprocess.run(["cp", "-r", args.config, "modules", "models", output_path])
        with open(join(output_path, "speaker_label.json"), "w") as f:
            json.dump(dataset.speaker_label, f)

    train_sampler = DistributedSampler(dataset) if num_gpus > 1 else None
    print("Data directory: ", train_list)
    print("No. training data: ", len(dataset))
    print("No. speakers:", dataset.n_speaker)
    print("Normalize: ", model_configs["norm"])
    print("Use F0: ", use_f0)
    collate_fn = MelCepstrumCollateFn(max_seq_len=max_seq_len)
    dataloader = DataLoader(dataset=dataset,
                            sampler=train_sampler,
                            batch_size=batch_size//num_gpus,
                            collate_fn=collate_fn,
                            num_workers=4,
                            pin_memory=True,
                            shuffle=False)
    model = None
    if model_name == "VQVAE3Stage":
        model = VQVAE3Stage(n_speaker=dataset.n_speaker, **model_configs).cuda()
    elif model_name == "VQVAE2Stage":
        model = VQVAE2Stage(n_speaker=dataset.n_speaker, **model_configs).cuda()
    elif model_name == "VQVAE1Stage":
        model = VQVAE1Stage(n_speaker=dataset.n_speaker, **model_configs).cuda()
    else:
        raise ValueError("Unsupported model name: %s" % model_name)
    if checkpoint_path != "":
        print(checkpoint_path)
        model.load_state_dict(torch.load(checkpoint_path))

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END:   ADDED FOR DISTRIBUTED======

    optimizer = Adam(model.parameters(), lr=warm_up_lr)
    if rank == 0:
        logger = DataLogger(logdir=join(output_path, "logs"))
        validator = Validator(logger=logger,
                              speaker_label=dataset.speaker_label,
                              use_f0=use_f0,
                              **validation_configs)

    else:
        logger = None
        validator = None
    iteration = 0
    for epoch in range(train_epoch):
        model.train()
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        if rank == 0:
            iterator = progressbar(dataloader, redirect_stdout=True)
        else:
            iterator = dataloader

        for batch in iterator:
            model.zero_grad()
            batch = [batch[0].cuda(), batch[1].cuda()]
            loss, loss_components = model(batch)

            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
                for i in range(len(loss_components)):
                    if isinstance(loss_components[i], list):
                        for j in range(len(loss_components[i])):
                            loss_components[i][j] = reduce_tensor(loss_components[i][j].data, num_gpus).item()
                    else:
                        loss_components[i] = reduce_tensor(loss_components[i].data, num_gpus).item()
            else:
                reduced_loss = loss.item()
                for i in range(len(loss_components)):
                    if isinstance(loss_components[i], list):
                        for j in range(len(loss_components[i])):
                            loss_components[i][j] = loss_components[i][j].item()
                    else:
                        loss_components[i] = loss_components[i].item()
            loss.backward()
            optimizer.step()
            if rank == 0:
                rc_loss, mel_loss, vq_loss, commitment_loss, perplexity = loss_components
                print("%d|%d: loss=%.2e, rc_loss=%.2e, mel_loss=%.2e, vq_loss=%.2e" %
                      (epoch, iteration, reduced_loss, rc_loss, mel_loss, vq_loss))
                perplexity_tag = ["training/perplexity"] + [str(i) for i in range(len(perplexity))]

                if logger is not None:
                    logger.log_training([reduced_loss, rc_loss, mel_loss, vq_loss, perplexity],
                                        ["training/loss", "training/rc_loss", "training/mel_loss",
                                         "training/vq_loss", perplexity_tag],
                                        iteration)

                if (iteration % iters_per_eval) == 0:
                    torch.save(model.state_dict(), join(output_path, "weight_latest.pt"))
                    if validator is not None:
                        validator(model, iteration)

                if (iteration % iters_per_checkpoint) == 0 and iteration > 0:
                    torch.save(model.state_dict(),
                               join(output_path, "weight_%d.pt" % iteration))
            iteration += 1
        if epoch < n_warm_up_epoch:
            lr = min(learning_rate,
                     warm_up_lr - epoch * (warm_up_lr - learning_rate)/n_warm_up_epoch)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

    print("Finished!")
    return
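
The `reduce_tensor` helper that this and several later examples call to average the loss across GPUs is not shown. Assuming it follows the common NVIDIA-style distributed utilities, a minimal version would be:

import torch.distributed as dist


def reduce_tensor(tensor, num_gpus):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)  # sum the value from every rank
    return rt / num_gpus                       # average so logs match single-GPU runs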
Example #13
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, warm_start):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END:   ADDED FOR DISTRIBUTED======
    optimizer = Over9000(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    else:
        amp = None

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer, warm_start)
        if fp16_run and not warm_start:
            amp.load_state_dict(torch.load(checkpoint_path)['amp'])
        iteration += 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=16,
                              shuffle=True,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.999,
                                                           patience=250,
                                                           cooldown=250,
                                                           verbose=True,
                                                           min_lr=1e-5)
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = mel.cuda()
            audio = audio.cuda()
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), 1.0)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), 1.0)

            optimizer.step()

            if epoch > 1:
                scheduler.step(loss)

            print("{}:\t{:.9f}\t{:.9f}".format(iteration, reduced_loss,
                                               grad_norm))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, amp, iteration,
                                    checkpoint_path)

            iteration += 1
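
The `save_checkpoint(model, optimizer, amp, iteration, checkpoint_path)` used above is not defined here; since the resume path reads `torch.load(checkpoint_path)['amp']`, it presumably stores the amp loss-scaler state alongside the model and optimizer. A hedged sketch, assuming plain state_dict checkpoints:

import torch


def save_checkpoint(model, optimizer, amp, iteration, filepath):
    checkpoint = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'iteration': iteration,
    }
    if amp is not None:
        # Needed so amp.load_state_dict() can restore the fp16 loss scale on resume.
        checkpoint['amp'] = amp.state_dict()
    torch.save(checkpoint, filepath)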
Example #14
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, batch_size, seed, checkpoint_path):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = CrossEntropyLoss()
    model = WaveNet(**wavenet_config).cpu()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    print(f"receptive_field: {model.receptive_field()}")
    trainset = WavenetDataset(
        dataset_file='data/dataset.npz',
        item_length=model.receptive_field() + 1000 + model.output_length - 1,
        target_length=model.output_length,
        file_location='data/',
        test_stride=500,
    )
    print(trainset._length)
    print('the dataset has ' + str(len(trainset)) + ' items')
    train_loader = DataLoader(
        trainset,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=False,
    )

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    start = time.time()
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()
            y, target = batch
            y = to_gpu(y).float()
            target = to_gpu(target)
            y_pred = model((None, y))
            loss = criterion(y_pred[:, :, -model.output_length:], target)
            loss.backward()
            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, loss))
            print_etr(start,
                      total_iterations=(epochs - epoch_offset) *
                      len(train_loader),
                      current_iteration=epoch * len(train_loader) + i + 1)
            writer.add_scalar('Loss/train', loss, global_step=iteration)

            if (iteration % iters_per_checkpoint == 0):
                y_choice = y_pred[0].detach().cpu().transpose(0, 1)
                y_prob = F.softmax(y_choice, dim=1)
                y_prob_collapsed = torch.multinomial(y_prob,
                                                     num_samples=1).squeeze(1)
                y_pred_audio = mu_law_decode_numpy(y_prob_collapsed.numpy(),
                                                   model.n_out_channels)
                import torchaudio
                y_audio = mu_law_decode_numpy(y.cpu().numpy(), model.n_out_channels)
                torchaudio.save("test_in.wav", torch.tensor(y_audio), 16000)
                torchaudio.save("test_out.wav", torch.tensor(y_pred_audio),
                                16000)
                writer.add_audio('Audio',
                                 y_pred_audio,
                                 global_step=iteration,
                                 sample_rate=data_config['sampling_rate'])
                checkpoint_path = "{}/wavenet_{}".format(
                    output_directory, iteration)
                save_checkpoint(model, optimizer, learning_rate, iteration,
                                checkpoint_path)

            writer.flush()
            iteration += 1
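
`mu_law_decode_numpy` is not included in the snippet; the standard inverse mu-law companding it presumably implements maps integer codes in [0, channels-1] back to waveform amplitudes in [-1, 1]:

import numpy as np


def mu_law_decode_numpy(encoded, quantization_channels=256):
    mu = quantization_channels - 1
    # Map the integer codes back to [-1, 1] ...
    signal = 2.0 * (encoded.astype(np.float32) / mu) - 1.0
    # ... then invert the mu-law companding curve.
    return np.sign(signal) * np.expm1(np.abs(signal) * np.log1p(mu)) / mu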
Example #15
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, batch_size, seed, checkpoint_path):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = CrossEntropyLoss()
    model = WaveNet(**wavenet_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    #trainset = Mel2SampOnehot(**data_config)
    trainset = DeepMels(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        total_loss = 0
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            x, y = batch
            x = to_gpu(x).float()
            y = to_gpu(y)
            x = (x, y)  # auto-regressive takes outputs as inputs
            y_pred = model(x)
            loss = criterion(y_pred, y)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            loss.backward()
            optimizer.step()

            total_loss += reduced_loss

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/wavenet_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
        print("epoch:{}, total epoch loss:{}".format(epoch, total_loss))
Example #16
def train(num_gpus,
          rank,
          group_name,
          device,
          output_directory,
          epochs,
          learning_rate,
          iters_per_checkpoint,
          batch_size,
          seed,
          checkpoint_path,
          use_scheduled_sampling=False,
          use_wavenet_autoencoder=True,
          use_variational_autoencoder=False,
          diversity_scale=0.005,
          use_logistic_mixtures=False,
          n_mixtures=3,
          audio_hz=16000,
          midi_hz=250,
          aggressive_loss_threshold=3.0,
          encoder_error_thresh=0.0):

    assert use_wavenet_autoencoder is True

    if num_gpus > 1:
        device = init_distributed(rank, num_gpus, group_name, **dist_config)
    device = torch.device(device)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    if use_logistic_mixtures:
        sampler = DML.SampleDiscretizedMixLogistics()
        criterion = DML.DiscretizedMixLogisticLoss()
    else:
        sampler = utils.CategoricalSampler()
        criterion = CrossEntropyLoss()

    model = WavenetAutoencoder(wavenet_config, cond_wavenet_config,
                               use_variational_autoencoder).to(device)
    if use_variational_autoencoder:
        diversity_loss = L2DiversityLoss()

    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if use_scheduled_sampling:
        scheduled_sampler = ScheduledSamplerWithPatience(
            model, sampler, **scheduled_sampler_config)

    encoder_optimizer = torch.optim.Adam(model.encoder_wavenet.parameters(),
                                         lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(model.wavenet.parameters(),
                                         lr=learning_rate)

    # Train state params
    aggressive = True
    train_encoder = False

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, encoder_optimizer, decoder_optimizer, aggressive, iteration = load_checkpoint(
            checkpoint_path, model, encoder_optimizer, decoder_optimizer)
        iteration += 1

    # Dataloader
    trainset = MaestroDataloader(**data_config)
    if num_gpus > 1:
        train_sampler = DistributedSampler(trainset)
    else:
        train_sampler = None
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready for distributed
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    # Initialize training variables
    epoch_offset = max(0, int(iteration / len(train_loader)))
    start_iter = iteration

    loss_idx = 0
    loss_sum = 0
    prev_loss = 999999999

    print("output directory: " + output_directory)

    # write loss to csv file
    loss_writer = DictWriter(open(output_directory + "/train.csv",
                                  'w',
                                  newline=''),
                             fieldnames=['iteration', 'loss'])
    loss_writer.writeheader()

    signal_writer = DictWriter(open(output_directory + "/signal.csv",
                                    "w",
                                    newline=''),
                               fieldnames=[
                                   'iteration', 'cosim', 'p-dist',
                                   'forwardMagnitude', 'midiMagnitude'
                               ])
    signal_writer.writeheader()

    model.train()
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            x, y = batch

            x = as_variable(x, device)
            y = as_variable(y, device)
            y_true = y.clone()

            if use_scheduled_sampling:
                y = scheduled_sampler(x, y)

            y_preds = model((x, y))

            if use_wavenet_autoencoder:
                q_bar = y_preds[1]
                y_preds = y_preds[0]

            loss = criterion(y_preds, y_true)
            if use_variational_autoencoder:
                div_loss = diversity_loss(q_bar)
                loss = loss + (diversity_scale * div_loss)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.data.item()
            loss.backward()

            if aggressive and train_encoder:
                encoder_optimizer.step()
                print("Encoder step")

            elif aggressive:
                decoder_optimizer.step()
                print("Decoder step")

            else:  # normal training
                encoder_optimizer.step()
                decoder_optimizer.step()

            print("total loss:     {}:\t{:.9f}".format(iteration,
                                                       reduced_loss))
            if use_variational_autoencoder:
                print("    diversity loss: {:.9f}".format(div_loss))

            if use_scheduled_sampling:
                scheduled_sampler.update(reduced_loss)

            # record running average of loss
            loss_sum += reduced_loss
            loss_idx += 1
            if (iteration % 10 == 0):
                loss_avg = loss_sum / loss_idx
                print("floating avg of 10: " + str(loss_avg))
                #loss_writer.writerow({"iteration": str(i),
                #                     "loss": str(reduced_loss)})

                if aggressive and loss_avg < aggressive_loss_threshold:
                    aggressive = False
                elif aggressive and train_encoder and loss_avg >= (
                        prev_loss + encoder_error_thresh):
                    train_encoder = False
                elif aggressive:
                    train_encoder = True

                prev_loss = loss_avg
                loss_sum = 0
                loss_idx = 0

            # save model
            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/wavenet_{}".format(
                        output_directory, iteration)
                    save_checkpoint_autoencoder(model, device,
                                                use_variational_autoencoder,
                                                encoder_optimizer,
                                                decoder_optimizer, aggressive,
                                                learning_rate, iteration,
                                                checkpoint_path)

            iteration += 1
            del loss
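
`utils.CategoricalSampler`, which drives the scheduled sampling above, is not shown. Assuming WaveNet logits of shape (batch, channels, time), a minimal stand-in that samples one code per timestep could look like this:

import torch
import torch.nn.functional as F


class CategoricalSampler:
    def __call__(self, logits):
        # logits: (batch, channels, time) -> sampled codes: (batch, time)
        probs = F.softmax(logits, dim=1)
        b, c, t = probs.shape
        flat = probs.permute(0, 2, 1).reshape(-1, c)       # (batch*time, channels)
        samples = torch.multinomial(flat, num_samples=1)   # one draw per timestep
        return samples.view(b, t)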
Example #17
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Load checkpoint if one exists
    iteration = 0
    print("checkpoint path", checkpoint_path)
    #model = warm_load_checkpoint(checkpoint_path, model)
    model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                  optimizer)
    iteration += 1
    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=True,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            loss.backward()
            optimizer.step()
            if (iteration % iters_per_checkpoint == 0):
                print("{}:\t{:.9f}".format(iteration, reduced_loss))
                checkpoint_path = "{}/waveglow".format(output_directory)
                save_checkpoint(model, optimizer, learning_rate, iteration,
                                checkpoint_path)
            iteration += 1
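
`WaveGlowLoss` as used here follows the published WaveGlow objective: the Gaussian log-likelihood of the latent z plus the log-determinant terms of the flow, normalized by the number of elements. A minimal sketch, assuming the model returns z, log_s_list and log_det_W_list:

import torch


class WaveGlowLoss(torch.nn.Module):
    def __init__(self, sigma=1.0):
        super().__init__()
        self.sigma = sigma

    def forward(self, model_output):
        z, log_s_list, log_det_w_list = model_output
        log_s_total = sum(log_s.sum() for log_s in log_s_list)
        log_det_w_total = sum(log_det_w_list)
        # Negative log-likelihood under a zero-mean Gaussian prior with std sigma,
        # minus the change-of-variables terms from the couplings and 1x1 convolutions.
        loss = z.pow(2).sum() / (2 * self.sigma ** 2) - log_s_total - log_det_w_total
        return loss / (z.size(0) * z.size(1) * z.size(2))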
Example #18
def train(num_gpus, rank, group_name, stage, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard, logdirname, datedlogdir, warm_start=False, optimizer='ADAM', start_zero=False):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======
    
    from model import HiFiGAN, HiFiGANLoss
    criterion = HiFiGANLoss(**hifigan_config).cuda()
    model = HiFiGAN(**hifigan_config).cuda()
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
        if stage >= 2:
            criterion = apply_gradient_allreduce(criterion)
    #=====END:   ADDED FOR DISTRIBUTED======
    
    criterion, optimizer_d = get_optimizer(criterion, optimizer, fp16_run, optimizer_fused=True) if stage >= 2 else (criterion, None)
    model, optimizer = get_optimizer(model, optimizer, fp16_run, optimizer_fused=True)
    
    ## LEARNING RATE SCHEDULER
    if True:
        from torch.optim.lr_scheduler import ReduceLROnPlateau
        min_lr = 1e-8
        factor = 0.1**(1/5) # amount to scale the LR by on Validation Loss plateau
        scheduler = ReduceLROnPlateau(optimizer, 'min', factor=factor, patience=20, cooldown=2, min_lr=min_lr, verbose=True, threshold=0.0001, threshold_mode='abs')
        print("ReduceLROnPlateau used as Learning Rate Scheduler.")
    else: scheduler=False
    
    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, criterion, optimizer_d, iteration, scheduler = load_checkpoint(checkpoint_path, model,
                                                      optimizer, criterion, optimizer_d, scheduler, fp16_run, stage, warm_start=warm_start)
        iteration += 1  # next iteration is iteration + 1
    if start_zero:
        iteration = 0
    
    trainset = Mel2Samp(**data_config, check_files=True)
    speaker_lookup = trainset.speaker_ids
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        train_sampler = DistributedSampler(trainset, shuffle=True)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=3, shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    
    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
    
    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        if datedlogdir:
            timestr = time.strftime("%Y_%m_%d-%H_%M_%S")
            log_directory = os.path.join(output_directory, logdirname, timestr)
        else:
            log_directory = os.path.join(output_directory, logdirname)
        logger = SummaryWriter(log_directory)
    
    moving_average = int(min(len(train_loader), 200)) # window for the rolling average of training loss
    rolling_sum = StreamingMovingAverage(moving_average)
    start_time = time.time()
    start_time_iter = time.time()
    start_time_dekaiter = time.time()
    model.train()
    
    # best (averaged) training loss
    if os.path.exists(os.path.join(output_directory, "best_model")+".txt"):
        best_model_loss = float(str(open(os.path.join(output_directory, "best_model")+".txt", "r", encoding="utf-8").read()).split("\n")[0])
    else:
        best_model_loss = 9e9
    
    # best (validation) MSE on inferred spectrogram.
    if os.path.exists(os.path.join(output_directory, "best_val_model")+".txt"):
        best_MSE = float(str(open(os.path.join(output_directory, "best_val_model")+".txt", "r", encoding="utf-8").read()).split("\n")[0])
    else:
        best_MSE = 9e9
    
    epoch_offset = max(0, int(iteration / len(train_loader)))
    
    print_params(model, name='generator')
    
    print(f"Segment Length: {data_config['segment_length']:,}\nBatch Size: {batch_size:,}\nNumber of GPUs: {num_gpus:,}\nSamples/Iter: {data_config['segment_length']*batch_size*num_gpus:,}")
    
    training = True
    while training:
        try:
            if rank == 0:
                epochs_iterator = tqdm(range(epoch_offset, epochs), initial=epoch_offset, total=epochs, smoothing=0.01, desc="Epoch", position=1, unit="epoch")
            else:
                epochs_iterator = range(epoch_offset, epochs)
            # ================ MAIN TRAINING LOOP! ===================
            for epoch in epochs_iterator:
                print(f"Epoch: {epoch}")
                if num_gpus > 1:
                    train_sampler.set_epoch(epoch)
                
                if rank == 0:
                    iters_iterator = tqdm(enumerate(train_loader), desc=" Iter", smoothing=0, total=len(train_loader), position=0, unit="iter", leave=True)
                else:
                    iters_iterator = enumerate(train_loader)
                for i, batch in iters_iterator:
                    # run external code every param_interval iterations, allowing the run to be adjusted without restarts
                    if (i==0 or iteration % param_interval == 0):
                        try:
                            with open("run_every_epoch.py") as f:
                                internal_text = str(f.read())
                                if len(internal_text) > 0:
                                    #code = compile(internal_text, "run_every_epoch.py", 'exec')
                                    ldict = {'iteration': iteration, 'seconds_elapsed': time.time()-start_time}
                                    exec(internal_text, globals(), ldict)
                                else:
                                    print("No Custom code found, continuing without changes.")
                        except Exception as ex:
                            print(f"Custom code FAILED to run!\n{ex}")
                        globals().update(ldict)
                        locals().update(ldict)
                        if show_live_params:
                            print(internal_text)
                    # Learning Rate Schedule
                    if custom_lr:
                        old_lr = learning_rate
                        if iteration < warmup_start:
                            learning_rate = warmup_start_lr
                        elif iteration < warmup_end:
                            learning_rate = (iteration-warmup_start)*((A_+C_)-warmup_start_lr)/(warmup_end-warmup_start) + warmup_start_lr # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations.
                        else:
                            if iteration < decay_start:
                                learning_rate = A_ + C_
                            else:
                                iteration_adjusted = iteration - decay_start
                                learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_
                        assert learning_rate > -1e-8, "Negative Learning Rate."
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = learning_rate
                        if optimizer_d is not None:
                            for param_group in optimizer_d.param_groups:
                                param_group['lr'] = learning_rate*d_lr_scale
                    else:
                        scheduler.patience = scheduler_patience
                        scheduler.cooldown = scheduler_cooldown
                        if override_scheduler_last_lr:
                            scheduler._last_lr = override_scheduler_last_lr
                        if override_scheduler_best:
                            scheduler.best = override_scheduler_best
                        if override_scheduler_last_lr or override_scheduler_best:
                            print(f"scheduler._last_lr = {scheduler._last_lr} scheduler.best = {scheduler.best}  |", end='')
                    model.zero_grad()
                    noisy_audio, gt_audio, speaker_ids = batch
                    noisy_audio = torch.autograd.Variable(noisy_audio.cuda(non_blocking=True))
                    gt_audio = torch.autograd.Variable(gt_audio.cuda(non_blocking=True))
                    speaker_ids = speaker_ids.cuda(non_blocking=True).long().squeeze(1)
                    pred_audio = model(noisy_audio)#, speaker_ids)
                    
                    metrics = criterion(pred_audio, gt_audio, amp, model, optimizer, optimizer_d, num_gpus, use_grad_clip, grad_clip_thresh)
                    
                    if not metrics['is_overflow'] and rank == 0:
                        # get current Loss Scale of first optimizer
                        loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if fp16_run else 32768
                        
                        if with_tensorboard:
                            if (iteration % 100000 == 0):
                                # plot distribution of parameters
                                for tag, value in model.named_parameters():
                                    tag = tag.replace('.', '/')
                                    logger.add_histogram(tag, value.data.cpu().numpy(), iteration)
                            for key, value in metrics.items():
                                if key not in ['is_overflow',]:
                                    logger.add_scalar(key, value, iteration)
                            if (iteration % 20 == 0):
                                logger.add_scalar('learning.rate', learning_rate, iteration)
                            if (iteration % 10 == 0):
                                logger.add_scalar('duration', ((time.time() - start_time_dekaiter)/10), iteration)
                        
                        logged_loss = metrics['g_train_loss'] if stage >= 2 else metrics['train_loss']
                        grad_norm = metrics['grad_norm']
                        average_loss = rolling_sum.process(logged_loss)
                        if (iteration % 10 == 0):
                            tqdm.write("{} {}:  {:.3f} {:.3f}  {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective)  {:.2f}s/iter {:.4f}s/item".format(time.strftime("%H:%M:%S"), iteration, logged_loss, average_loss, best_MSE, round(grad_norm,3), learning_rate, min((grad_clip_thresh/grad_norm)*learning_rate,learning_rate), (time.time() - start_time_dekaiter)/10, ((time.time() - start_time_dekaiter)/10)/(batch_size*num_gpus)))
                            start_time_dekaiter = time.time()
                        else:
                            tqdm.write("{} {}:  {:.3f} {:.3f}  {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {}LS".format(time.strftime("%H:%M:%S"), iteration, logged_loss, average_loss, best_MSE, round(grad_norm,3), learning_rate, min((grad_clip_thresh/grad_norm)*learning_rate,learning_rate), loss_scale))
                        start_time_iter = time.time()
                    
                    if rank == 0 and (len(rolling_sum.values) > moving_average-2):
                        if (average_loss+best_model_margin) < best_model_loss:
                            checkpoint_path = os.path.join(output_directory, "best_model")
                            try:
                                save_checkpoint(model, optimizer, criterion, optimizer_d, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path)
                            except KeyboardInterrupt: # Avoid corrupting the model.
                                save_checkpoint(model, optimizer, criterion, optimizer_d, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path)
                            text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8")
                            text_file.write(str(average_loss)+"\n"+str(iteration))
                            text_file.close()
                            best_model_loss = average_loss #Only save the model if X better than the current loss.
                    if rank == 0 and iteration > 0 and ((iteration % iters_per_checkpoint == 0) or (os.path.exists(save_file_check_path))):
                        checkpoint_path = f"{output_directory}/waveglow_{iteration}"
                        save_checkpoint(model, optimizer, criterion, optimizer_d, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path)
                        if (os.path.exists(save_file_check_path)):
                            os.remove(save_file_check_path)
                    
                    if iteration%validation_interval == 0:
                        if rank == 0:
                            MSE, MAE = validate(model, trainset, logger, iteration, data_config['validation_files'], speaker_lookup, output_directory, data_config)
                            if scheduler:
                                MSE = torch.tensor(MSE, device='cuda')
                                if num_gpus > 1:
                                    broadcast(MSE, 0)
                                scheduler.step(MSE.item())
                                if MSE < best_MSE:
                                    checkpoint_path = os.path.join(output_directory, "best_val_model")
                                    try:
                                        save_checkpoint(model, optimizer, criterion, optimizer_d, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path)
                                    except KeyboardInterrupt: # Avoid corrupting the model.
                                        save_checkpoint(model, optimizer, criterion, optimizer_d, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path)
                                    text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8")
                                    text_file.write(str(MSE.item())+"\n"+str(iteration))
                                    text_file.close()
                                    best_MSE = MSE.item()
                        else:
                            if scheduler:
                                MSE = torch.zeros(1, device='cuda')
                                broadcast(MSE, 0)
                                scheduler.step(MSE.item())
                    iteration += 1
            training = False # exit the training While loop
        
        except LossExplosion as ex: # print the exception and continue from the best checkpoint (restarting this way takes only a few seconds)
            print(ex) # print Loss
            checkpoint_path = os.path.join(output_directory, "best_model")
            assert os.path.exists(checkpoint_path), "best_model must exist for automatic restarts"
            
            # clearing VRAM for load checkpoint
            noisy_audio = gt_audio = pred_audio = metrics = None
            torch.cuda.empty_cache()
            
            model.eval()
            model, optimizer, criterion, optimizer_d, iteration, scheduler = load_checkpoint(checkpoint_path, model, optimizer, criterion, optimizer_d, scheduler, fp16_run, stage)
            learning_rate = optimizer.param_groups[0]['lr']
            epoch_offset = max(0, int(iteration / len(train_loader)))
            model.train()
            iteration += 1
            pass # and continue training.
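
`StreamingMovingAverage`, which the loop above uses both to smooth the logged loss and (via its `.values` list) to decide when a `best_model` may be saved, is not defined in the snippet. A minimal sketch consistent with that usage:

class StreamingMovingAverage:
    def __init__(self, window_size):
        self.window_size = window_size
        self.values = []
        self.sum = 0.0

    def process(self, value):
        # Keep the last `window_size` losses and return their running mean.
        self.values.append(value)
        self.sum += value
        if len(self.values) > self.window_size:
            self.sum -= self.values.pop(0)
        return self.sum / len(self.values)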
Example #19
def train(num_gpus, rank, group_name, output_directory, log_directory,
          checkpoint_path, hparams):
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(hparams.sigma)
    model = WaveGlow(hparams).cuda()

    Taco2 = load_pretrained_taco('tacotron2.pt', hparams)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path:
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = TextMelLoader(hparams.training_files, hparams)
    collate_fn = TextMelCollate()
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    batch_size = hparams.batch_size
    train_loader = DataLoader(trainset,
                              num_workers=0,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)

    # Get shared output_directory ready

    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if hparams.with_tensorboard and rank == 0:
        logger = prepare_directories_and_logger(output_directory,
                                                log_directory)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    print("Total Epochs: {}".format(hparams.epochs))
    print("Batch Size: {}".format(hparams.batch_size))
    print("learning rate: {}".format(hparams.learning_rate))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            text_padded, input_lengths, mel_padded, max_len, output_lengths = parse_batch(
                batch)
            with torch.no_grad():
                enc_outputs, alignments = Taco2(
                    (text_padded, input_lengths, mel_padded, max_len,
                     output_lengths))

            # mel_padded = mel_padded.transpose(1, 2)
            # mel_padded = mel_padded / torch.abs(mel_padded).max().item()
            mel_pos = torch.arange(1000)
            mel_pos = to_gpu(mel_pos).long().unsqueeze(0)
            mel_pos = mel_pos.expand(hparams.batch_size, -1)
            src_pos = torch.arange(hparams.n_position)
            src_pos = to_gpu(src_pos).long().unsqueeze(0)
            src_pos = src_pos.expand(hparams.batch_size, -1)

            mel_padded = (mel_padded + 5) / 10

            z, log_s_list, log_det_w_list, dec_enc_attn = model(
                mel_padded, enc_outputs, mel_pos, src_pos, input_lengths)
            outputs = (z, log_s_list, log_det_w_list, dec_enc_attn)
            loss = criterion(outputs, alignments)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            grad_norm = torch.nn.utils.clip_grad_norm_(
                model.parameters(), hparams.grad_clip_thresh)
            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if hparams.with_tensorboard and rank == 0:
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    iteration)

            if (iteration % hparams.iters_per_checkpoint == 0):
                if rank == 0:
                    mel_predict, test_attn = model.test(
                        mel_padded, enc_outputs, mel_pos, src_pos,
                        input_lengths)
                    logger.log_alignment(model, dec_enc_attn, alignments,
                                         mel_padded, mel_predict, test_attn,
                                         iteration)
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
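
`load_pretrained_taco` is not shown; it apparently returns a frozen Tacotron 2 whose encoder outputs and alignments condition the flow. A hypothetical sketch, assuming a state_dict-style checkpoint and taking the Tacotron 2 class as a parameter (`taco_cls`, the checkpoint key and the freezing policy are assumptions, not the original implementation):

import torch


def load_pretrained_taco(checkpoint_path, hparams, taco_cls):
    model = taco_cls(hparams).cuda()
    state = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state['state_dict'])
    model.eval()                      # used only under torch.no_grad() above
    for p in model.parameters():
        p.requires_grad = False       # the acoustic model provides conditioning only
    return model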
Example #20
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, batch_size, seed, checkpoint_path, hparams):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    if num_gpus >= 1:
        model = WaveGlow(**waveglow_config, hparams=hparams).cuda()
    else:
        model = WaveGlow(**waveglow_config, hparams=hparams)

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Load checkpoint if one exists
    iteration, eval_iteration = 0, 0

    if checkpoint_path != "":
        model, optimizer, iteration, eval_iteration = load_checkpoint(checkpoint_path, model, optimizer)
        iteration += 1  # next iteration is iteration + 1
        eval_iteration += 1
    # trainset = Mel2Samp(**data_config)

    trainset = TextMelLoader(
        audiopaths_and_text='./filelists/ljs_audio_text_train_filelist.txt', hparams=hparams)
    testset = TextMelLoader(
        audiopaths_and_text='./filelists/ljs_audio_text_test_filelist.txt', hparams=hparams)


    collate_fn = TextMelCollate(hparams, fixed_length=True)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1,
                              collate_fn=collate_fn,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    test_loader = DataLoader(testset, num_workers=1,
                             collate_fn=collate_fn,
                             shuffle=False,
                             sampler=train_sampler,
                             batch_size=batch_size,
                             pin_memory=False,
                             drop_last=True)
    log_path = os.path.join(output_directory, 'log-event')
    os.makedirs(log_path, exist_ok=True)
    logger = WaveGlowLogger(log_path)
    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    model.train()
    tacotron2 = Tacotron2(hparams)
    batch_parser = tacotron2.parse_batch
    # we use tacotron-2's pipeline
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        model.train()
        for i, batch in enumerate(train_loader):
            model.zero_grad()
            x, y = batch_parser(batch)
            text_padded, input_lengths, mel_padded, max_len, output_lengths = x
            # print(text_padded.size(), mel_padded.size())
            mel_padded, gate_padded = y
            outputs = model((text_padded, mel_padded))

            loss = criterion(outputs)
            logger.log_loss('train/loss', loss, iteration)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            loss.backward()
            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            iteration += 1

        # model.eval()
        # for i, batch in enumerate(test_loader):
        #     x, y = batch_parser(batch)
        #     text_padded, input_lengths, mel_padded, max_len, output_lengths = x
        #     mel_padded, gate_padded = y
        #     outputs = model((text_padded, mel_padded))
        #     loss = criterion(outputs)
        #     logger.log_loss('eval/loss', loss, iteration)
        #     eval_iteration += 1

        if rank == 0:
            checkpoint_path = "{}/waveglow_epoch_{}".format(output_directory, epoch)
            save_checkpoint(model, optimizer, learning_rate, iteration, eval_iteration, checkpoint_path,
                            hparams=hparams)
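
`WaveGlowLogger` is not defined in this snippet; assuming it is a thin wrapper around tensorboardX's SummaryWriter, a minimal version supporting the `log_loss` call above would be:

from tensorboardX import SummaryWriter


class WaveGlowLogger(SummaryWriter):
    def __init__(self, logdir):
        super().__init__(logdir)

    def log_loss(self, tag, loss, iteration):
        # Accept either a tensor or a plain float.
        value = loss.item() if hasattr(loss, 'item') else float(loss)
        self.add_scalar(tag, value, iteration)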
Example #21
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, loss_empthasis, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, logdirname, datedlogdir, warm_start=False, optimizer='ADAM', start_zero=False):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======
    
    global WaveGlow
    global WaveGlowLoss
    
    ax = True # this is **really** bad coding practice :D
    if ax:
        from efficient_model_ax import WaveGlow
        from efficient_loss import WaveGlowLoss
    else:
        if waveglow_config["yoyo"]: # efficient_mode # TODO: Add to Config File
            from efficient_model import WaveGlow
            from efficient_loss import WaveGlowLoss
        else:
            from glow import WaveGlow, WaveGlowLoss
    
    criterion = WaveGlowLoss(sigma, loss_empthasis)
    model = WaveGlow(**waveglow_config).cuda()
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======
    STFTs = [STFT.TacotronSTFT(filter_length=window,
                                 hop_length=data_config['hop_length'],
                                 win_length=window,
                                 sampling_rate=data_config['sampling_rate'],
                                 n_mel_channels=160,
                                 mel_fmin=0, mel_fmax=16000) for window in data_config['validation_windows']]
    
    loader_STFT = STFT.TacotronSTFT(filter_length=data_config['filter_length'],
                                 hop_length=data_config['hop_length'],
                                 win_length=data_config['win_length'],
                                 sampling_rate=data_config['sampling_rate'],
                                 n_mel_channels=data_config['n_mel_channels'] if 'n_mel_channels' in data_config.keys() else 160,
                                 mel_fmin=data_config['mel_fmin'], mel_fmax=data_config['mel_fmax'])
    
    #optimizer = "Adam"
    optimizer = optimizer.lower()
    optimizer_fused = False # use Apex fused optimizer, should be identical to normal but slightly faster and only works on RTX cards
    if optimizer_fused:
        from apex import optimizers as apexopt
        if optimizer == "adam":
            optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate)
        elif optimizer == "lamb":
            optimizer = apexopt.FusedLAMB(model.parameters(), lr=learning_rate, max_grad_norm=200)
    else:
        if optimizer == "adam":
            optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        elif optimizer == "lamb":
            from lamb import Lamb as optLAMB
            optimizer = optLAMB(model.parameters(), lr=learning_rate)
            #import torch_optimizer as optim
            #optimizer = optim.Lamb(model.parameters(), lr=learning_rate)
            #raise# PyTorch doesn't currently include LAMB optimizer.
    
    if fp16_run:
        global amp
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    else:
        amp = None
    
    ## LEARNING RATE SCHEDULER
    if True:
        from torch.optim.lr_scheduler import ReduceLROnPlateau
        min_lr = 1e-8
        factor = 0.1**(1/5) # amount to scale the LR by on Validation Loss plateau
        scheduler = ReduceLROnPlateau(optimizer, 'min', factor=factor, patience=20, cooldown=2, min_lr=min_lr, verbose=True, threshold=0.0001, threshold_mode='abs')
        print("ReduceLROnPlateau used as Learning Rate Scheduler.")
    else:
        scheduler = False
    
    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration, scheduler = load_checkpoint(checkpoint_path, model,
                                                      optimizer, scheduler, fp16_run, warm_start=warm_start)
        iteration += 1  # next iteration is iteration + 1
    if start_zero:
        iteration = 0
    
    trainset = Mel2Samp(**data_config, check_files=True)
    speaker_lookup = trainset.speaker_ids
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        train_sampler = DistributedSampler(trainset, shuffle=True)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=3, shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    
    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
    
    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        if datedlogdir:
            timestr = time.strftime("%Y_%m_%d-%H_%M_%S")
            log_directory = os.path.join(output_directory, logdirname, timestr)
        else:
            log_directory = os.path.join(output_directory, logdirname)
        logger = SummaryWriter(log_directory)
    
    moving_average = int(min(len(train_loader), 100)) # rolling-average window: up to 100 iterations (one full epoch if the epoch is shorter)
    rolling_sum = StreamingMovingAverage(moving_average)
    start_time = time.time()
    start_time_iter = time.time()
    start_time_dekaiter = time.time()
    model.train()
    
    # best (averaged) training loss
    if os.path.exists(os.path.join(output_directory, "best_model")+".txt"):
        best_model_loss = float(str(open(os.path.join(output_directory, "best_model")+".txt", "r", encoding="utf-8").read()).split("\n")[0])
    else:
        best_model_loss = -6.20
    
    # best (validation) MSE on inferred spectrogram.
    if os.path.exists(os.path.join(output_directory, "best_val_model")+".txt"):
        best_MSE = float(str(open(os.path.join(output_directory, "best_val_model")+".txt", "r", encoding="utf-8").read()).split("\n")[0])
    else:
        best_MSE = 9e9
    
    epoch_offset = max(0, int(iteration / len(train_loader)))
    
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print("{:,} total parameters in model".format(pytorch_total_params))
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("{:,} trainable parameters.".format(pytorch_total_params))
    
    print(f"Segment Length: {data_config['segment_length']:,}\nBatch Size: {batch_size:,}\nNumber of GPUs: {num_gpus:,}\nSamples/Iter: {data_config['segment_length']*batch_size*num_gpus:,}")
    
    training = True
    while training:
        try:
            if rank == 0:
                epochs_iterator = tqdm(range(epoch_offset, epochs), initial=epoch_offset, total=epochs, smoothing=0.01, desc="Epoch", position=1, unit="epoch")
            else:
                epochs_iterator = range(epoch_offset, epochs)
            # ================ MAIN TRAINING LOOP! ===================
            for epoch in epochs_iterator:
                print(f"Epoch: {epoch}")
                if num_gpus > 1:
                    train_sampler.set_epoch(epoch)
                
                if rank == 0:
                    iters_iterator = tqdm(enumerate(train_loader), desc=" Iter", smoothing=0, total=len(train_loader), position=0, unit="iter", leave=True)
                else:
                    iters_iterator = enumerate(train_loader)
                for i, batch in iters_iterator:
                    # run external code every iter, allows the run to be adjusted without restarts
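                    # run_every_epoch.py is expected to live next to this script and simply assign
                    # the tuning globals read below; illustrative contents (values are not from the repo):
                    #   param_interval = 100
                    #   show_live_params = False
                    #   custom_lr = True
                    #   warmup_start, warmup_end, warmup_start_lr = 0, 1000, 1e-6
                    #   A_, B_, C_ = 1e-4, 40000, 0.0
                    #   scheduler_patience, scheduler_cooldown = 20, 2
                    #   override_scheduler_last_lr = override_scheduler_best = False
                    # whatever it defines is merged into globals()/locals() after the exec() below.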
                    if (i==0 or iteration % param_interval == 0):
                        try:
                            with open("run_every_epoch.py") as f:
                                internal_text = str(f.read())
                                if len(internal_text) > 0:
                                    #code = compile(internal_text, "run_every_epoch.py", 'exec')
                                    ldict = {'iteration': iteration, 'seconds_elapsed': time.time()-start_time}
                                    exec(internal_text, globals(), ldict)
                                else:
                                    print("No Custom code found, continuing without changes.")
                        except Exception as ex:
                            print(f"Custom code FAILED to run!\n{ex}")
                        globals().update(ldict)
                        locals().update(ldict)
                        if show_live_params:
                            print(internal_text)
                    if not iteration % 50: # check actual learning rate every 50 iters (because I sometimes see learning_rate variable go out-of-sync with real LR)
                        learning_rate = optimizer.param_groups[0]['lr']
                    # Learning Rate Schedule
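                    # With custom_lr enabled the schedule below is piecewise:
                    #   iteration < warmup_start     : lr = warmup_start_lr
                    #   warmup_start .. warmup_end   : linear ramp from warmup_start_lr up to A_ + C_
                    #   warmup_end .. decay_start    : constant at A_ + C_
                    #   iteration >= decay_start     : lr = A_ * exp(-(iteration - decay_start) / B_) + C_
                    # i.e. an exponential decay towards the floor C_.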
                    if custom_lr:
                        old_lr = learning_rate
                        if iteration < warmup_start:
                            learning_rate = warmup_start_lr
                        elif iteration < warmup_end:
                            learning_rate = (iteration-warmup_start)*((A_+C_)-warmup_start_lr)/(warmup_end-warmup_start) + warmup_start_lr # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations.
                        else:
                            if iteration < decay_start:
                                learning_rate = A_ + C_
                            else:
                                iteration_adjusted = iteration - decay_start
                                learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_
                        assert learning_rate > -1e-8, "Negative Learning Rate."
                        if old_lr != learning_rate:
                            for param_group in optimizer.param_groups:
                                param_group['lr'] = learning_rate
                    else:
                        scheduler.patience = scheduler_patience
                        scheduler.cooldown = scheduler_cooldown
                        if override_scheduler_last_lr:
                            scheduler._last_lr = override_scheduler_last_lr
                        if override_scheduler_best:
                            scheduler.best = override_scheduler_best
                        if override_scheduler_last_lr or override_scheduler_best:
                            print("scheduler._last_lr =", scheduler._last_lr, "scheduler.best =", scheduler.best, "  |", end='')
                    model.zero_grad()
                    mel, audio, speaker_ids = batch
                    mel = torch.autograd.Variable(mel.cuda(non_blocking=True))
                    audio = torch.autograd.Variable(audio.cuda(non_blocking=True))
                    speaker_ids = speaker_ids.cuda(non_blocking=True).long().squeeze(1)
                    outputs = model(mel, audio, speaker_ids)
                    
                    loss = criterion(outputs)
                    if num_gpus > 1:
                        reduced_loss = reduce_tensor(loss.data, num_gpus).item()
                    else:
                        reduced_loss = loss.item()
                    
                    if fp16_run:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    
                    if (reduced_loss > LossExplosionThreshold) or (math.isnan(reduced_loss)):
                        model.zero_grad()
                        raise LossExplosion(f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n")
                    
                    if use_grad_clip:
                        if fp16_run:
                            grad_norm = torch.nn.utils.clip_grad_norm_(
                                amp.master_params(optimizer), grad_clip_thresh)
                        else:
                            grad_norm = torch.nn.utils.clip_grad_norm_(
                                model.parameters(), grad_clip_thresh)
                        if type(grad_norm) == torch.Tensor:
                            grad_norm = grad_norm.item()
                        is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)
                    else:
                        is_overflow = False
                        grad_norm = 0.00001
                    
                    optimizer.step()
                    if not is_overflow and rank == 0:
                        # get current Loss Scale of first optimizer
                        loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if fp16_run else 32768
                        
                        if with_tensorboard:
                            if (iteration % 100000 == 0):
                                # plot distribution of parameters
                                for tag, value in model.named_parameters():
                                    tag = tag.replace('.', '/')
                                    logger.add_histogram(tag, value.data.cpu().numpy(), iteration)
                            logger.add_scalar('training_loss', reduced_loss, iteration)
                            logger.add_scalar('training_loss_samples', reduced_loss, iteration*batch_size)
                            if (iteration % 20 == 0):
                                logger.add_scalar('learning.rate', learning_rate, iteration)
                            if (iteration % 10 == 0):
                                logger.add_scalar('duration', ((time.time() - start_time_dekaiter)/10), iteration)
                        
                        average_loss = rolling_sum.process(reduced_loss)
                        if (iteration % 10 == 0):
                            tqdm.write("{} {}:  {:.3f} {:.3f}  {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective)  {:.2f}s/iter {:.4f}s/item".format(time.strftime("%H:%M:%S"), iteration, reduced_loss, average_loss, best_MSE, round(grad_norm,3), learning_rate, min((grad_clip_thresh/grad_norm)*learning_rate,learning_rate), (time.time() - start_time_dekaiter)/10, ((time.time() - start_time_dekaiter)/10)/(batch_size*num_gpus)))
                            start_time_dekaiter = time.time()
                        else:
                            tqdm.write("{} {}:  {:.3f} {:.3f}  {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {}LS".format(time.strftime("%H:%M:%S"), iteration, reduced_loss, average_loss, best_MSE, round(grad_norm,3), learning_rate, min((grad_clip_thresh/grad_norm)*learning_rate,learning_rate), loss_scale))
                        start_time_iter = time.time()
                    
                    if rank == 0 and (len(rolling_sum.values) > moving_average-2):
                        if (average_loss+best_model_margin) < best_model_loss:
                            checkpoint_path = os.path.join(output_directory, "best_model")
                            try:
                                save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup,
                                            checkpoint_path)
                            except KeyboardInterrupt: # Avoid corrupting the model.
                                save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup,
                                            checkpoint_path)
                            text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8")
                            text_file.write(str(average_loss)+"\n"+str(iteration))
                            text_file.close()
                            best_model_loss = average_loss # only update best_model when the average loss improves by at least best_model_margin
                    if rank == 0 and iteration > 0 and ((iteration % iters_per_checkpoint == 0) or (os.path.exists(save_file_check_path))):
                        checkpoint_path = f"{output_directory}/waveglow_{iteration}"
                        save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup,
                                        checkpoint_path)
                        if (os.path.exists(save_file_check_path)):
                            os.remove(save_file_check_path)
                    
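                    # Validation: rank 0 computes the spectrogram MSE/MAE, then the MSE is broadcast
                    # so every rank steps its ReduceLROnPlateau scheduler with the same value.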
                    if (iteration % validation_interval == 0):
                        if rank == 0:
                            MSE, MAE = validate(model, loader_STFT, STFTs, logger, iteration, data_config['validation_files'], speaker_lookup, sigma, output_directory, data_config)
                            if scheduler:
                                MSE = torch.tensor(MSE, device='cuda')
                                if num_gpus > 1:
                                    broadcast(MSE, 0)
                                scheduler.step(MSE.item())
                                if MSE < best_MSE:
                                    checkpoint_path = os.path.join(output_directory, "best_val_model")
                                    try:
                                        save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup,
                                                    checkpoint_path)
                                    except KeyboardInterrupt: # Avoid corrupting the model.
                                        save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup,
                                                    checkpoint_path)
                                    text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8")
                                    text_file.write(str(MSE.item())+"\n"+str(iteration))
                                    text_file.close()
                                    best_MSE = MSE.item() # only update best_val_model when the validation MSE improves
                        else:
                            if scheduler:
                                MSE = torch.zeros(1, device='cuda')
                                broadcast(MSE, 0)
                                scheduler.step(MSE.item())
                        learning_rate = optimizer.param_groups[0]['lr'] #check actual learning rate (because I sometimes see learning_rate variable go out-of-sync with real LR)
                    iteration += 1
            training = False # exit the While loop
        
        except LossExplosion as ex: # print the exception and continue from the best checkpoint (restarting like this takes under 4 seconds).
            print(ex) # print Loss
            checkpoint_path = os.path.join(output_directory, "best_model")
            assert os.path.exists(checkpoint_path), "best_model must exist for automatic restarts"
            
            # clearing VRAM for load checkpoint
            audio = mel = speaker_ids = loss = None
            torch.cuda.empty_cache()
            
            model.eval()
            model, optimizer, iteration, scheduler = load_checkpoint(checkpoint_path, model, optimizer, scheduler, fp16_run)
            learning_rate = optimizer.param_groups[0]['lr']
            epoch_offset = max(0, int(iteration / len(train_loader)))
            model.train()
            iteration += 1
            pass # and continue training.
Example #22
0
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, weight_sharing, optimizer_type,
          dataloader_type):

    ws = weight_sharing
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer_type = optimizer_type.lower()
    if optimizer_type == "sgd":
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    elif optimizer_type == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    else:
        print("Unsupported optimizer: %s. Aborting." % optimizer_type)
        return None

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    dataloader_type = dataloader_type.lower()
    if dataloader_type == "vanilla":
        trainset = Mel2Samp(**data_config)
    elif dataloader_type == "split":
        trainset = Mel2SampSplit(**data_config)
    else:
        print("Unsupported dataloader type: %s. Aborting." % dataloader_type)
        return None

    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=(num_gpus == 1),
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    name = "waveglow_ws%d_%s_%s_batch%d" % (ws, optimizer_type,
                                            dataloader_type, batch_size)

    if learning_rate != 1e-4:
        name = name + "_lr{:.0e}".format(learning_rate)

    if num_gpus > 1:
        name = name + "_x%d" % num_gpus
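    # e.g. name == "waveglow_ws1_adam_vanilla_batch12_lr3e-04_x2" for a 2-GPU run with
    # weight sharing, the vanilla dataloader and a non-default learning rate (illustrative).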

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join("./logs", name))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    stime2 = None
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        stime = time()
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            if (iteration % 100 == 0):
                if stime2 is not None:
                    tot_time2 = time() - stime2
                    print("{}:\t{:.9f}, time: {}".format(
                        iteration, reduced_loss, int(tot_time2)))
                stime2 = time()
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}_{}".format(
                        output_directory, name, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
        tot_time = time() - stime
        print("Epoch %d completed. Time: %d seconds" % (epoch, int(tot_time)))
Example #23
0
def train(num_gpus, rank, group_name, output_directory, log_directory,
          checkpoint_path):
    # Get device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    torch.manual_seed(hp.seed)
    torch.cuda.manual_seed(hp.seed)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(hp.sigma)
    model = WaveGlow().cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    learning_rate = hp.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if hp.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path:
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    # Get dataset
    dataset = FastSpeechDataset()

    # Get training loader
    print("Get Training Loader")
    training_loader = DataLoader(dataset,
                                 batch_size=hp.batch_size,
                                 shuffle=True,
                                 collate_fn=collate_fn,
                                 drop_last=True,
                                 num_workers=cpu_count())

    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if hp.with_tensorboard and rank == 0:
        logger = prepare_directories_and_logger(output_directory,
                                                log_directory)

    model = model.train()
    epoch_offset = max(0, int(iteration / len(training_loader)))
    beta = hp.batch_size
    print("Total Epochs: {}".format(hp.epochs))
    print("Batch Size: {}".format(hp.batch_size))

    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hp.epochs):
        print("Epoch: {}".format(epoch))
        for i, data_of_batch in enumerate(training_loader):
            model.zero_grad()

            if not hp.pre_target:
                # Prepare Data
                src_seq = data_of_batch["texts"]
                src_pos = data_of_batch["pos"]
                mel_tgt = data_of_batch["mels"]

                src_seq = torch.from_numpy(src_seq).long().to(device)
                src_pos = torch.from_numpy(src_pos).long().to(device)
                mel_tgt = torch.from_numpy(mel_tgt).float().to(device)
                alignment_target = get_alignment(src_seq,
                                                 tacotron2).float().to(device)
                # For Data Parallel
                mel_max_len = mel_tgt.size(1)
            else:
                # Prepare Data
                src_seq = data_of_batch["texts"]
                src_pos = data_of_batch["pos"]
                mel_tgt = data_of_batch["mels"]
                alignment_target = data_of_batch["alignment"]

                src_seq = torch.from_numpy(src_seq).long().to(device)
                src_pos = torch.from_numpy(src_pos).long().to(device)
                mel_tgt = torch.from_numpy(mel_tgt).float().to(device)
                alignment_target = torch.from_numpy(
                    alignment_target).float().to(device)
                # For Data Parallel
                mel_max_len = mel_tgt.size(1)

            outputs = model(src_seq, src_pos, mel_tgt, mel_max_len,
                            alignment_target)
            _, _, _, duration_predictor = outputs
            mel_tgt = mel_tgt.transpose(1, 2)
            max_like, dur_loss = criterion(outputs, alignment_target, mel_tgt)
            if beta > 1 and iteration % 10000 == 0:
                beta = beta // 2
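            # beta starts at hp.batch_size and is halved every 10,000 iterations,
            # but it is not applied to the loss combination within this function.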
            loss = max_like + dur_loss

            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if hp.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            #grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.grad_clip_thresh)

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if hp.with_tensorboard and rank == 0:
                logger.log_training(reduced_loss, dur_loss, learning_rate,
                                    iteration)

            if (iteration % hp.save_step == 0):
                if rank == 0:
                    # logger.log_alignment(model, mel_predict, mel_tgt, iteration)
                    checkpoint_path = "{}/TTSglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
Example #24
0
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, iters_per_eval, batch_size, seed, checkpoint_path, log_dir, ema_decay=0.9999):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    if train_data_config["no_chunks"]:
        criterion = MaskedCrossEntropyLoss()
    else:
        criterion = CrossEntropyLoss()
    model = WaveNet(**wavenet_config).cuda()
    ema = ExponentialMovingAverage(ema_decay)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
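    # ExponentialMovingAverage is assumed to keep a `shadow` dict of parameter copies and,
    # on update(), blend in the new value with the usual EMA rule, roughly:
    #     shadow[name] = decay * shadow[name] + (1.0 - decay) * param
    # (a sketch based on how ema is used here; the actual implementation is not shown).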

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=200000, gamma=0.5)

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, scheduler, iteration, ema = load_checkpoint(checkpoint_path, model,
                                                                      optimizer, scheduler, ema)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2SampOnehot(audio_config=audio_config, verbose=True, **train_data_config)
    validset = Mel2SampOnehot(audio_config=audio_config, verbose=False, **valid_data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    valid_sampler = DistributedSampler(validset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    print(train_data_config)
    if train_data_config["no_chunks"]:
        collate_fn = utils.collate_fn
    else:
        collate_fn = torch.utils.data.dataloader.default_collate
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              collate_fn=collate_fn,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(validset, num_workers=1, shuffle=False,
                              sampler=valid_sampler, batch_size=1, pin_memory=True)
    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
    
    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    writer = SummaryWriter(log_dir)
    print("Checkpoints writing to: {}".format(log_dir))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            if low_memory:
                torch.cuda.empty_cache()
            scheduler.step()
            model.zero_grad()

            if train_data_config["no_chunks"]:
                x, y, seq_lens = batch
                seq_lens = to_gpu(seq_lens)
            else:
                x, y = batch
            x = to_gpu(x).float()
            y = to_gpu(y)
            x = (x, y)  # auto-regressive takes outputs as inputs
            y_pred = model(x)
            if train_data_config["no_chunks"]:
                loss = criterion(y_pred, y, seq_lens)
            else:
                loss = criterion(y_pred, y)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            loss.backward()
            optimizer.step()

            for name, param in model.named_parameters():
                if name in ema.shadow:
                    ema.update(name, param.data)

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if rank == 0:
                writer.add_scalar('loss', reduced_loss, iteration)
            if (iteration % iters_per_checkpoint == 0 and iteration):
                if rank == 0:
                    checkpoint_path = "{}/wavenet_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, scheduler, learning_rate, iteration,
                                    checkpoint_path, ema, wavenet_config)
            if (iteration % iters_per_eval == 0 and iteration > 0 and not config["no_validation"]):
                if low_memory:
                    torch.cuda.empty_cache()
                if rank == 0:
                    model_eval = nv_wavenet.NVWaveNet(**(model.export_weights()))
                    for j, valid_batch in enumerate(valid_loader):
                        mel, audio = valid_batch
                        mel = to_gpu(mel).float()
                        cond_input = model.get_cond_input(mel)
                        predicted_audio = model_eval.infer(cond_input, nv_wavenet.Impl.AUTO)
                        predicted_audio = utils.mu_law_decode_numpy(predicted_audio[0, :].cpu().numpy(), 256)
                        writer.add_audio("valid/predicted_audio_{}".format(j),
                                         predicted_audio,
                                         iteration,
                                         22050)
                        audio = utils.mu_law_decode_numpy(audio[0, :].cpu().numpy(), 256)
                        writer.add_audio("valid_true/audio_{}".format(j),
                                         audio,
                                         iteration,
                                         22050)
                        if low_memory:
                            torch.cuda.empty_cache()
            iteration += 1
Example #25
0
def train(num_gpus, rank, group_name, output_directory, epochs, init_lr,
          final_lr, sigma, epochs_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    os.makedirs(output_directory, exist_ok=True)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=init_lr)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    epoch_offset = 1
    if checkpoint_path != "":
        model, optimizer, epoch_offset = load_checkpoint(
            checkpoint_path, model, optimizer)
        epoch_offset += 1  # next epoch is epoch_offset + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=8,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs + 1):
        print(f'Epoch: {epoch}')
        adjust_learning_rate(optimizer, epoch, init_lr, final_lr, epochs)
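        # adjust_learning_rate is not shown here; judging by its arguments it presumably
        # interpolates the LR from init_lr towards final_lr as `epoch` approaches `epochs`
        # and writes the result into optimizer.param_groups.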

        for i, batch in enumerate(tqdm.tqdm(train_loader)):
            optimizer.zero_grad()

            batch = model.pre_process(batch)
            outputs = model(batch)

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + 1 + len(train_loader) * epoch)

        if epoch % epochs_per_checkpoint == 0:
            if rank == 0:
                # Keep only one checkpoint
                last_chkpt = os.path.join(
                    output_directory,
                    f'waveglow_{epoch - epochs_per_checkpoint:06d}.pt')
                if os.path.exists(last_chkpt):
                    os.remove(last_chkpt)

                checkpoint_path = os.path.join(output_directory,
                                               f'waveglow_{epoch:06d}.pt')
                save_checkpoint(model, optimizer, epoch, checkpoint_path)
Example #26
0
def train(num_gpus, rank, group_name, prj_name, run_name, output_directory,
          epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed,
          fp16_run, grad_clip_thresh, checkpoint_path, pretrained_path,
          with_tensorboard, with_wandb):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    if pretrained_path != "":
        model = load_pretrained(pretrained_path, model)

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        train_sampler = DistributedSampler(trainset)
        shuffle_at_dataloader = False
    else:
        train_sampler = None
        shuffle_at_dataloader = True
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=shuffle_at_dataloader,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            iter_start = time.perf_counter()

            float_epoch = float(iteration) / len(train_loader)

            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss, etc = criterion(outputs)
            (z_L2_normalized, neg_log_s_total, neg_log_det_W_total) = etc
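            # The extras mirror the usual WaveGlow negative log-likelihood terms:
            # the z**2 / (2 * sigma**2) prior term, the -sum(log s) coupling term and
            # the -sum(log det W) invertible-convolution term that add up to the total loss.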
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            is_overflow = False
            if fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
                if not is_overflow:
                    clipped_grad_norm = get_clip_grad_norm(
                        grad_norm, grad_clip_thresh)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), grad_clip_thresh)
                clipped_grad_norm = get_clip_grad_norm(grad_norm,
                                                       grad_clip_thresh)

            optimizer.step()
            iter_duration = time.perf_counter() - iter_start

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if with_wandb and rank == 0:
                wandb.log(
                    {
                        'iteration': iteration,
                        'epoch': float_epoch,
                        'iter_duration': iter_duration,
                        'training_loss': reduced_loss,
                        'training_loss/z_L2_normalized': z_L2_normalized,
                        'training_loss/neg_log_s_total': neg_log_s_total,
                        'training_loss/neg_log_det_W_total':
                        neg_log_det_W_total,
                    },
                    step=iteration)
                if not is_overflow:
                    wandb.log(
                        {
                            'grad_norm': grad_norm,
                            'clipped_grad_norm': clipped_grad_norm,
                        },
                        step=iteration)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/{}/{}/waveglow_{}".format(
                        output_directory, prj_name, run_name, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
Example #27
0
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    torch.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cpu()
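    # CPU-only variant: the model and batches stay on the CPU, so fp16_run is presumably
    # left disabled here (the Apex AMP initialization below assumes CUDA tensors).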

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cpu())
            audio = torch.autograd.Variable(audio.cpu())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
Example #28
0
def train(output_directory, log_directory, checkpoint_path, warm_start,
          warm_start_force, n_gpus, rank, group_name, hparams):
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if n_gpus > 1:
        init_distributed(rank, n_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    model, criterion = getCore(hparams)

    #=====START: ADDED FOR DISTRIBUTED======
    if n_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    STFT = [
        TacotronSTFT(filter_length=window,
                     hop_length=hparams.hop_length,
                     win_length=window,
                     sampling_rate=hparams.sampling_rate,
                     n_mel_channels=160,
                     mel_fmin=hparams.mel_fmin,
                     mel_fmax=hparams.mel_fmax)
        for window in hparams.validation_windows
    ]

    optimizer = getOptimizer(model, hparams)

    if hparams.fp16_run:
        global amp
        from apex import amp
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=hparams.fp16_opt_level,
                                          min_loss_scale=2.0)
    else:
        amp = None

    # LEARNING RATE SCHEDULER
    if hparams.LRScheduler.lower() == "ReduceLROnPlateau".lower():
        from torch.optim.lr_scheduler import ReduceLROnPlateau
        min_lr = 1e-5
        factor = 0.1**(
            1 / 5)  # amount to scale the LR by on Validation Loss plateau
        scheduler = ReduceLROnPlateau(optimizer,
                                      'min',
                                      factor=factor,
                                      patience=20,
                                      cooldown=2,
                                      min_lr=min_lr,
                                      verbose=True)
        print("ReduceLROnPlateau used as Learning Rate Scheduler.")
    else:
        scheduler = None

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path:
        model, optimizer, iteration, scheduler = load_checkpoint(
            warm_start, warm_start_force, checkpoint_path, model, optimizer,
            scheduler, hparams.fp16_run)
    iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(hparams)
    speaker_lookup = trainset.speaker_ids
    # =====START: ADDED FOR DISTRIBUTED======
    if n_gpus > 1:
        train_sampler = DistributedSampler(trainset, shuffle=True)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=hparams.n_dataloader_workers,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if rank == 0:
        from tensorboardX import SummaryWriter
        if False:  # dated and separated log dirs for each run
            timestr = time.strftime("%Y_%m_%d-%H_%M_%S")
            log_directory = os.path.join(output_directory, log_directory,
                                         timestr)
        else:
            log_directory = os.path.join(output_directory, log_directory)
        logger = SummaryWriter(log_directory)

    moving_average = int(min(len(train_loader),
                             100))  # average loss over 100 iters
    rolling_sum = StreamingMovingAverage(moving_average)
    start_time = time.time()
    start_time_single_batch = time.time()

    model.train()

    if os.path.exists(os.path.join(output_directory, "best_train_model")):
        best_model_loss = float(
            str(
                open(os.path.join(output_directory, "best_train_model") +
                     ".txt",
                     "r",
                     encoding="utf-8").read()).split("\n")[0])
    else:
        best_model_loss = -4.20
    if os.path.exists(os.path.join(output_directory, "best_val_model")):
        best_MSE = float(
            str(
                open(os.path.join(output_directory, "best_val_model") + ".txt",
                     "r",
                     encoding="utf-8").read()).split("\n")[0])
    else:
        best_MSE = 9e9
    epoch_offset = max(0, int(iteration / len(train_loader)))

    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print("{:,} total parameters.".format(pytorch_total_params))
    pytorch_total_params = sum(p.numel() for p in model.parameters()
                               if p.requires_grad)
    print("{:,} trainable parameters.".format(pytorch_total_params))

    learning_rate = hparams.learning_rate
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in get_progress_bar(range(epoch_offset, hparams.epochs),
                                  dict(initial=epoch_offset,
                                       total=hparams.epochs,
                                       smoothing=0.01,
                                       desc="Epoch",
                                       position=1,
                                       unit="epoch"),
                                  hparams,
                                  rank=rank):
        cprint(f"Epoch: {epoch}", b_tqdm=hparams.tqdm)
        if n_gpus > 1: train_sampler.set_epoch(epoch)

        for i, batch in get_progress_bar(enumerate(train_loader),
                                         dict(desc=" Iter",
                                              smoothing=0,
                                              total=len(train_loader),
                                              position=0,
                                              unit="iter",
                                              leave=True),
                                         hparams,
                                         rank=rank):
            # run external code every iter, allows the run to be adjusted without restarts
            if (i == 0 or iteration % param_interval == 0):
                try:
                    with open("hparams_realtime.py") as f:
                        internal_text = str(f.read())
                        ldict = {'iteration': iteration}
                        exec(internal_text, globals(), ldict)
                except Exception as ex:
                    cprint(f"Custom code FAILED to run!\n{ex}",
                           b_tqdm=hparams.tqdm)
                globals().update(ldict)
                locals().update(ldict)
                if show_live_params:
                    cprint(internal_text, b_tqdm=hparams.tqdm)
            assert warmup_start <= iteration, "Current iteration less than warmup_start."
            # Learning Rate Schedule
            if custom_lr:
                old_lr = learning_rate
                if iteration < warmup_end:
                    learning_rate = (iteration - warmup_start) * (
                        (A_ + C_) - warmup_start_lr
                    ) / (
                        warmup_end - warmup_start
                    ) + warmup_start_lr  # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations.
                else:
                    if iteration < decay_start:
                        learning_rate = A_ + C_
                    else:
                        iteration_adjusted = iteration - decay_start
                        learning_rate = (A_ *
                                         (e**(-iteration_adjusted / B_))) + C_
                assert learning_rate > -1e-8, "Negative Learning Rate."
                if old_lr != learning_rate:
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = learning_rate
            else:
                scheduler.patience = scheduler_patience
                scheduler.cooldown = scheduler_cooldown
                if override_scheduler_last_lr:
                    scheduler._last_lr = override_scheduler_last_lr
                    cprint(f"Scheduler last_lr overridden. scheduler._last_lr = {scheduler._last_lr}",
                           b_tqdm=hparams.tqdm)
                # Re-read the actual LR every 20 iterations, because the local
                # learning_rate variable can drift out of sync with the optimizer.
                if not iteration % 20:
                    learning_rate = optimizer.param_groups[0]['lr']
                if override_scheduler_best:
                    scheduler.best = override_scheduler_best
                    cprint(f"Scheduler best metric overridden. scheduler.best = {override_scheduler_best}",
                           b_tqdm=hparams.tqdm)

            model.zero_grad()
            mel, audio, speaker_ids = batch
            # torch.autograd.Variable is a no-op wrapper since PyTorch 0.4; move to GPU directly
            mel = mel.cuda(non_blocking=True)
            audio = audio.cuda(non_blocking=True)
            if model.multispeaker:
                speaker_ids = speaker_ids.cuda(non_blocking=True).long().squeeze(1)
                outputs = model(mel, audio, speaker_ids)
            else:
                outputs = model(mel, audio)

            loss = criterion(outputs)
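            # for logging, reduce_tensor presumably averages the loss across GPUs so the
            # reported value is comparable between single- and multi-GPU runs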
            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            assert reduced_loss < 1e5, "Model Diverged. Loss > 1e5"
            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.b_grad_clip:
                if hparams.fp16_run:
                    grad_norm = torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), hparams.grad_clip_thresh)
                else:
                    grad_norm = torch.nn.utils.clip_grad_norm_(
                        model.parameters(), hparams.grad_clip_thresh)
                is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)
            else:
                is_overflow = False
                grad_norm = 0.00001
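                # tiny placeholder keeps the "Effective" LR term
                # (grad_clip_thresh / grad_norm) logged below finite when clipping is off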

            optimizer.step()
            if not is_overflow and rank == 0:
                if (iteration % 100000 == 0):
                    # plot distribution of parameters
                    for tag, value in model.named_parameters():
                        tag = tag.replace('.', '/')
                        logger.add_histogram(tag,
                                             value.data.cpu().numpy(),
                                             iteration)
                logger.add_scalar('training_loss', reduced_loss, iteration)
                if (iteration % 20 == 0):
                    logger.add_scalar('learning.rate', learning_rate,
                                      iteration)
                if (iteration % 10 == 0):
                    logger.add_scalar('duration',
                                      ((time.time() - start_time) / 10),
                                      iteration)
                start_time_single_batch = time.time()

            average_loss = rolling_sum.process(reduced_loss)
            if rank == 0:
                if (iteration % 10 == 0):
                    cprint(
                        "{} {}:  {:.3f}  {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective)  {:.2f}s/iter {:.4f}s/item"
                        .format(
                            time.strftime("%H:%M:%S"), iteration, reduced_loss,
                            average_loss, round(grad_norm, 3), learning_rate,
                            min((hparams.grad_clip_thresh / grad_norm) *
                                learning_rate, learning_rate),
                            (time.time() - start_time) / 10,
                            ((time.time() - start_time) / 10) /
                            (hparams.batch_size * n_gpus)),
                        b_tqdm=hparams.tqdm)
                    start_time = time.time()
                else:
                    cprint(
                        "{} {}:  {:.3f}  {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective)"
                        .format(
                            time.strftime("%H:%M:%S"), iteration, reduced_loss,
                            average_loss, round(grad_norm, 3), learning_rate,
                            min((hparams.grad_clip_thresh / grad_norm) *
                                learning_rate, learning_rate)),
                        b_tqdm=hparams.tqdm)

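            # Best-train checkpoint: rolling_sum is assumed to be a moving average over the
            # last `moving_average` losses; once its window is (nearly) full, save
            # "best_train_model" whenever the smoothed loss beats the previous best by at
            # least best_model_margin.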
            if rank == 0 and (len(rolling_sum.values) > moving_average - 2):
                if (average_loss + best_model_margin) < best_model_loss:
                    checkpoint_path = os.path.join(output_directory,
                                                   "best_train_model")
                    try:
                        save_checkpoint(model, optimizer, hparams,
                                        learning_rate, iteration, amp,
                                        scheduler, speaker_lookup,
                                        checkpoint_path)
                    except KeyboardInterrupt:  # Avoid corrupting the model.
                        save_checkpoint(model, optimizer, hparams,
                                        learning_rate, iteration, amp,
                                        scheduler, speaker_lookup,
                                        checkpoint_path)
                    with open(f"{checkpoint_path}.txt", "w", encoding="utf-8") as text_file:
                        text_file.write(str(average_loss) + "\n" + str(iteration))
                    # only save again once the smoothed loss improves by best_model_margin
                    best_model_loss = average_loss
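            # Periodic checkpoint: every iters_per_checkpoint iterations, or on demand
            # when the save_file_check_path trigger file exists (deleted after saving).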
            if rank == 0 and ((iteration % hparams.iters_per_checkpoint == 0)
                              or (os.path.exists(save_file_check_path))):
                checkpoint_path = f"{output_directory}/waveglow_{iteration}"
                save_checkpoint(model, optimizer, hparams, learning_rate,
                                iteration, amp, scheduler, speaker_lookup,
                                checkpoint_path)
                start_time_single_batch = time.time()
                if (os.path.exists(save_file_check_path)):
                    os.remove(save_file_check_path)

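            # Validation: rank 0 evaluates the model (validate() presumably compares STFT
            # spectrograms of reconstructed audio); the resulting MSE drives the LR
            # scheduler and the "best_val_model" checkpoint.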
            if (iteration % validation_interval == 0):
                if rank == 0:
                    MSE, MAE = validate(model, STFT, logger, iteration,
                                        speaker_lookup, hparams,
                                        output_directory)
                    if scheduler:
                        MSE = torch.tensor(MSE, device='cuda')
                        if n_gpus > 1:
                            # share rank 0's validation MSE so every rank steps its scheduler in sync
                            broadcast(MSE, 0)
                        scheduler.step(MSE.item())
                        if MSE < best_MSE:
                            checkpoint_path = os.path.join(
                                output_directory, "best_val_model")
                            try:
                                save_checkpoint(model, optimizer, hparams,
                                                learning_rate, iteration, amp,
                                                scheduler, speaker_lookup,
                                                checkpoint_path)
                            except KeyboardInterrupt:  # Avoid corrupting the model.
                                save_checkpoint(model, optimizer, hparams,
                                                learning_rate, iteration, amp,
                                                scheduler, speaker_lookup,
                                                checkpoint_path)
                            with open(f"{checkpoint_path}.txt", "w", encoding="utf-8") as text_file:
                                text_file.write(str(MSE.item()) + "\n" + str(iteration))
                            # only save again once the validation MSE improves on this value
                            best_MSE = MSE.item()
                else:
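                    # non-zero ranks receive rank 0's validation MSE via broadcast and step
                    # their schedulers with the same value, keeping learning rates in sync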
                    if scheduler:
                        MSE = torch.zeros(1, device='cuda')
                        broadcast(MSE, 0)
                        scheduler.step(MSE.item())
            iteration += 1