예제 #1
0
파일: train.py 프로젝트: Flinst0n/tacotron2
def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
          iteration, filepath))
    model_for_saving = WaveGlow(**waveglow_config).cpu()
    model_for_saving.load_state_dict(model.state_dict())
    torch.save({'model': model_for_saving,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, filepath)
예제 #2
0
def save_checkpoint(model, optimizer, epoch, filepath):
    print(f'Saving model and optimizer state at epoch {epoch} to {filepath}')
    model_for_saving = WaveGlow(**waveglow_config).cuda()
    model_for_saving.load_state_dict(model.state_dict())
    torch.save(
        {
            'model': model_for_saving,
            'epoch': epoch,
            'optimizer': optimizer.state_dict()
        }, filepath)
예제 #3
0
파일: tts.py 프로젝트: malarinv/tacotron2
 def __init__(self, tacotron2_path, waveglow_path, **kwargs):
     super(TTSModel, self).__init__()
     hparams = HParams(**kwargs)
     self.hparams = hparams
     self.model = Tacotron2(hparams)
     if torch.cuda.is_available():
         self.model.load_state_dict(
             torch.load(tacotron2_path)["state_dict"])
         self.model.cuda().eval()
     else:
         self.model.load_state_dict(
             torch.load(tacotron2_path, map_location="cpu")["state_dict"])
         self.model.eval()
     self.k_cache = klepto.archives.file_archive(cached=False)
     if waveglow_path:
         if torch.cuda.is_available():
             wave_params = torch.load(waveglow_path)
         else:
             wave_params = torch.load(waveglow_path, map_location="cpu")
         try:
             self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
             self.waveglow.load_state_dict(wave_params)
         except:
             self.waveglow = wave_params["model"]
             self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
         if torch.cuda.is_available():
             self.waveglow.cuda().eval()
         else:
             self.waveglow.eval()
         # workaround from
         # https://github.com/NVIDIA/waveglow/issues/127
         for m in self.waveglow.modules():
             if "Conv" in str(type(m)):
                 setattr(m, "padding_mode", "zeros")
         for k in self.waveglow.convinv:
             k.float().half()
         self.denoiser = Denoiser(self.waveglow,
                                  n_mel_channels=hparams.n_mel_channels)
         self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
             self._synth_speech)
     else:
         self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
             self._synth_speech_fast)
     self.taco_stft = TacotronSTFT(
         hparams.filter_length,
         hparams.hop_length,
         hparams.win_length,
         n_mel_channels=hparams.n_mel_channels,
         sampling_rate=hparams.sampling_rate,
         mel_fmax=4000,
     )
예제 #4
0
def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    model_for_saving = WaveGlow(**waveglow_config).cuda()
    model_for_saving.load_state_dict(model.state_dict())
    torch.save(
        {
            "model": model_for_saving,
            "iteration": iteration,
            "optimizer": optimizer.state_dict(),
            "learning_rate": learning_rate,
        },
        filepath,
    )
예제 #5
0
def save_checkpoint(model, optimizer, amp, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    model_for_saving = WaveGlow(**waveglow_config).cuda()
    model_for_saving.load_state_dict(model.state_dict())
    checkpoint = {
        'model': model_for_saving,
        'iteration': iteration,
        'optimizer': optimizer.state_dict(),
        'cuda_rng_state_all': torch.cuda.get_rng_state_all(),
        'random_rng_state': torch.random.get_rng_state()
    }

    if amp is not None:
        checkpoint['amp'] = amp.state_dict()

    torch.save(checkpoint, filepath)
예제 #6
0
def main():
    try:
        args = get_arguments()

        lc = read_binary_lc(args.lc, hparams.num_mels)

        if hparams.lc_encode or hparams.transposed_upsampling:
            lc = np.reshape(lc, [1, -1, hparams.num_mels])
        else:
            # upsampling local condition
            lc = np.tile(lc, [1, 1, hparams.upsampling_rate])
            lc = np.reshape(lc, [1, -1, hparams.num_mels])

        print(lc.shape)

        glow = WaveGlow(lc_dim=hparams.num_mels,
                        n_flows=hparams.n_flows,
                        n_group=hparams.n_group,
                        n_early_every=hparams.n_early_every,
                        n_early_size=hparams.n_early_size)

        lc_placeholder = tf.placeholder(tf.float32,
                                        shape=[1, None, hparams.num_mels],
                                        name='lc')
        audio = glow.infer(lc_placeholder, sigma=args.sigma)

        sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                                allow_soft_placement=True))
        print("restore model")
        saver = tf.train.Saver(var_list=tf.trainable_variables())
        saver.restore(sess, args.restore_from)
        print('restore model successfully!')

        audio_output = sess.run(audio, feed_dict={lc_placeholder: lc})
        audio_output = audio_output.flatten()
        print(audio_output)
        write_wav(audio_output, hparams.sample_rate, args.wave_name)
    except Exception:
        raise
예제 #7
0
def test(sigma, batch_size, seed, checkpoint_path):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda().eval()

    # Load checkpoint if one exists
    model, iteration = load_checkpoint(checkpoint_path, model)

    model.eval()

    testset = Mel2Samp(data_config['testing_files'],
                       data_config['segment_length'],
                       data_config['filter_length'], data_config['hop_length'],
                       data_config['win_length'], data_config['sampling_rate'],
                       data_config['mel_fmin'], data_config['mel_fmax'])
    test_loader = DataLoader(testset,
                             num_workers=1,
                             shuffle=False,
                             sampler=None,
                             batch_size=batch_size,
                             pin_memory=False,
                             drop_last=True)

    with torch.no_grad():
        val_loss = 0.0
        for j, batch in enumerate(test_loader):
            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            val_loss += loss.item()
        val_loss = val_loss / (j + 1)
        model.train()

        print("test loss: {}:\t{:.9f}".format(iteration, val_loss))
예제 #8
0
def waveglow_infer(mel, config):
    print(
        colored('Running WaveGlow with ', 'blue', attrs=['bold']) +
        config.vocoder_path)

    waveglow = WaveGlow(config)
    waveglow, _, _ = load_checkpoint(config.vocoder_path, waveglow)

    #waveglow = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_waveglow')
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow = set_device(waveglow, config.device)
    waveglow.eval()

    denoiser = Denoiser(waveglow, config)
    denoiser = set_device(denoiser, config.device)

    with torch.no_grad():
        wave = waveglow.infer(mel, config.sigma).float()
        wave = denoiser(wave, strength=config.denoising_strength)

    wave = wave / torch.max(torch.abs(wave))

    return wave.cpu()
예제 #9
0
    if not os.path.exists("results"):
        os.mkdir("results")
    audio.save_wav(wav[0].data.cpu().numpy(),
                   os.path.join("results",
                                str(num) + ".wav"))


if __name__ == "__main__":
    # Test

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    torch.manual_seed(hp.seed)
    torch.cuda.manual_seed(hp.seed)
    model = WaveGlow().cuda()
    checkpoint = torch.load('test/TTSglow_130000')
    model.load_state_dict(checkpoint['model'].state_dict())

    dataset = FastSpeechDataset()
    testing_loader = DataLoader(dataset,
                                batch_size=1,
                                shuffle=False,
                                collate_fn=collate_fn,
                                drop_last=True,
                                num_workers=4)
    model = model.eval()

    for i, data_of_batch in enumerate(testing_loader):
        src_seq = data_of_batch["texts"]
        src_pos = data_of_batch["pos"]
예제 #10
0
def train(num_gpus,
          rank,
          group_name,
          output_directory,
          epochs,
          learning_rate,
          sigma,
          iters_per_checkpoint,
          batch_size,
          seed,
          fp16_run,
          checkpoint_path,
          with_tensorboard,
          num_workers=4):
    print("num_workers", num_workers)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.96)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    evalset = Mel2Samp(**eval_data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=num_workers,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    eval_loader = DataLoader(evalset,
                             num_workers=num_workers,
                             shuffle=False,
                             sampler=eval_sampler,
                             batch_size=batch_size,
                             pin_memory=False,
                             drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    epoch_offset = max(1, int(iteration / len(train_loader)))
    start_time = datetime.datetime.now()
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print('Epoch:', epoch, 'LR:', scheduler.get_lr())
        elapsed = datetime.datetime.now() - start_time
        print("Epoch: [{}][els: {}] {}".format(
            datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed,
            epoch))
        model.train()
        total_loss = 0.
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            if waveglow_config["multi_speaker_config"]["use_multi_speaker"]:
                mel, audio, spk_embed_or_id = batch
                spk_embed_or_id = torch.autograd.Variable(
                    spk_embed_or_id.cuda())
            else:
                mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())

            if waveglow_config["multi_speaker_config"]["use_multi_speaker"]:
                outputs = model((mel, audio, spk_embed_or_id))
            else:
                outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()
            total_loss += reduced_loss
            if i > 0 and i % 10 == 0:
                elapsed = datetime.datetime.now() - start_time
                print(
                    "[{}][els: {}] epoch {},total steps{}, {}/{} steps:\t{:.9f}"
                    .format(
                        datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"),
                        elapsed, epoch, iteration, i, len(train_loader),
                        reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
        elapsed = datetime.datetime.now() - start_time
        print("[{}][els: {}] {} epoch :\tavg loss {:.9f}".format(
            datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed,
            epoch, total_loss / len(train_loader)))
        scheduler.step()
        eval.eval(eval_loader, model, criterion, num_gpus, start_time, epoch,
                  waveglow_config["multi_speaker_config"]["use_multi_speaker"])
예제 #11
0
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(data_config['training_files'],
                        data_config['segment_length'],
                        data_config['filter_length'],
                        data_config['hop_length'],
                        data_config['win_length'],
                        data_config['sampling_rate'],
                        data_config['mel_fmin'],
                        data_config['mel_fmax'],
                        debug=False)

    if 'testing_files' in data_config:
        testset = Mel2Samp(data_config['testing_files'],
                           data_config['segment_length'],
                           data_config['filter_length'],
                           data_config['hop_length'],
                           data_config['win_length'],
                           data_config['sampling_rate'],
                           data_config['mel_fmin'],
                           data_config['mel_fmax'],
                           debug=True)
    else:
        testset = None

    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))
    else:
        logger = None

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()

            model.zero_grad()

            print("train batch loaded, {} ({} of {})".format(
                iteration, i, len(train_loader)))
            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            is_overflow = False
            if fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), 1.0)
                is_overflow = math.isnan(grad_norm)

            optimizer.step()

            duration = time.perf_counter() - start

            print(
                "train batch done, {} ({} of {}): {:.9f} (took {:.2f})".format(
                    iteration, i, len(train_loader), reduced_loss, duration))

            if logger:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)
                logger.add_scalar('duration', duration,
                                  i + len(train_loader) * epoch)

            if testset and not is_overflow and (iteration %
                                                iters_per_checkpoint == 0):
                if testset:
                    validate(model, criterion, testset, iteration, batch_size,
                             num_gpus, logger)

                if rank == 0:
                    rotate_checkpoints(output_directory)
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
예제 #12
0
def train(num_gpus, rank, group_name, output_directory, log_directory,
          checkpoint_path, hparams):
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(hparams.sigma)
    model = WaveGlow(hparams).cuda()

    Taco2 = load_pretrained_taco('tacotron2.pt', hparams)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path:
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = TextMelLoader(hparams.training_files, hparams)
    collate_fn = TextMelCollate()
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    batch_size = hparams.batch_size
    train_loader = DataLoader(trainset,
                              num_workers=0,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)

    # Get shared output_directory readya

    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if hparams.with_tensorboard and rank == 0:
        logger = prepare_directories_and_logger(output_directory,
                                                log_directory)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    print("Total Epochs: {}".format(hparams.epochs))
    print("Batch Size: {}".format(hparams.batch_size))
    print("learning rate: {}".format(hparams.learning_rate))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            text_padded, input_lengths, mel_padded, max_len, output_lengths = parse_batch(
                batch)
            with torch.no_grad():
                enc_outputs, alignments = Taco2(
                    (text_padded, input_lengths, mel_padded, max_len,
                     output_lengths))

            # mel_padded = mel_padded.transpose(1, 2)
            # mel_padded = mel_padded / torch.abs(mel_padded).max().item()
            mel_pos = torch.arange(1000)
            mel_pos = to_gpu(mel_pos).long().unsqueeze(0)
            mel_pos = mel_pos.expand(hparams.batch_size, -1)
            src_pos = torch.arange(hparams.n_position)
            src_pos = to_gpu(src_pos).long().unsqueeze(0)
            src_pos = src_pos.expand(hparams.batch_size, -1)

            mel_padded = (mel_padded + 5) / 10

            z, log_s_list, log_det_w_list, dec_enc_attn = model(
                mel_padded, enc_outputs, mel_pos, src_pos, input_lengths)
            outputs = (z, log_s_list, log_det_w_list, dec_enc_attn)
            loss = criterion(outputs, alignments)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            grad_norm = torch.nn.utils.clip_grad_norm_(
                model.parameters(), hparams.grad_clip_thresh)
            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if hparams.with_tensorboard and rank == 0:
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    iteration)

            if (iteration % hparams.iters_per_checkpoint == 0):
                if rank == 0:
                    mel_predict, test_attn = model.test(
                        mel_padded, enc_outputs, mel_pos, src_pos,
                        input_lengths)
                    logger.log_alignment(model, dec_enc_attn, alignments,
                                         mel_padded, mel_predict, test_attn,
                                         iteration)
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
예제 #13
0
def main(files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength, args):
    #mel_files = files_to_list(mel_files)
    #print(mel_files)
    files = ['/local-scratch/fuyang/cmpt726/final_project/cremad/1091_WSI_SAD_XX.wav']
    #files = ['/local-scratch/fuyang/cmpt726/waveglow/data/LJSpeech-1.1/LJ001-0001.wav']
    with open('config.json') as f:
        data = f.read()
    config = json.loads(data)
    waveglow_config = config["waveglow_config"]
    model = WaveGlow(**waveglow_config)
    checkpoint_dict = torch.load('waveglow_256channels_universal_v5.pt', map_location='cpu')
    model_for_loading = checkpoint_dict['model']
    model.load_state_dict(model_for_loading.state_dict())
    model.cuda()
    #waveglow = torch.load(waveglow_path)['model']
    #waveglow = waveglow.remove_weightnorm(waveglow)
    #waveglow.cuda()
    waveglow = model
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O1")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    mel_extractor = Get_mel(1024, 256, 1024, args.sampling_rate, 0.0, 8000.0)

    for i, file_path in enumerate(files):
        audio, rate = load_wav_to_torch(file_path)
        if rate != sampling_rate:
            audio = resampy.resample(audio.numpy(), rate, sampling_rate)
            audio = torch.from_numpy(audio).float()
        #if audio.size(0) >= args.segment_length:
        #    max_audio_start = audio.size(0) - args.segment_length
        #    audio_start = random.randint(0, max_audio_start)
        #    audio = audio[audio_start:audio_start+args.segment_length]
        #else:
        #    audio = torch.nn.functional.pad(audio, (0, args.segment_length-audio.size(0)), 'constant').data
        mel = mel_extractor.get_mel(audio)
        audio = audio / MAX_WAV_VALUE

        mel = torch.autograd.Variable(mel.cuda().unsqueeze(0))
        audio = torch.autograd.Variable(audio.cuda().unsqueeze(0))
        audio = audio.half() if is_fp16 else audio
        mel = mel.half() if is_fp16 else mel
        outputs = waveglow((mel, audio))
        z = outputs[0][:,4:]
        print(outputs)
        mel_up = waveglow.upsample(mel)
        time_cutoff = waveglow.upsample.kernel_size[0]-waveglow.upsample.stride[0]
        mel_up = mel_up[:,:,:-time_cutoff]
        #mel_up = mel_up[:,:,:-(time_cutoff+128)]

        mel_up = mel_up.unfold(2, waveglow.n_group, waveglow.n_group).permute(0,2,1,3)
        mel_up = mel_up.contiguous().view(mel_up.size(0), mel_up.size(1), -1).permute(0, 2, 1)
        audio = z
        mel_up = mel_up[:,:,:audio.size(2)]

        sigma = 0.7
        z_i = 0
        for k in reversed(range(waveglow.n_flows)):
            n_half = int(audio.size(1)/2)
            audio_0 = audio[:,:n_half, :]
            audio_1 = audio[:, n_half:, :]

            output = waveglow.WN[k]((audio_0, mel_up))

            s = output[:,n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = (audio_1-b)/torch.exp(s)
            audio = torch.cat([audio_0, audio_1],1)

            audio = waveglow.convinv[k](audio, reverse=True)

            if k % waveglow.n_early_every == 0 and k > 0:
                z = outputs[0][:, 2-z_i:4-z_i]
                #if mel_up.type() == 'torch.cuda.HalfTensor':
                #    z = torch.cuda.HalfTensor(mel_up.size(0), waveglow.n_early_size, mel_up.size(2)).normal_()
                #else:
                #    z = torch.cuda.FloatTensor(mel_up.size(0), waveglow.n_early_size, mel_up.size(2)).normal_()
                audio = torch.cat((sigma*z, audio),1)
        audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data
        audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(
            output_dir, "{}_synthesis.wav".format('fuyangz'))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
예제 #14
0
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Load checkpoint if one exists
    iteration = 0
    print("checkpoint path", checkpoint_path)
    #model = warm_load_checkpoint(checkpoint_path, model)
    model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                  optimizer)
    iteration += 1
    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=True,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            loss.backward()
            optimizer.step()
            if (iteration % iters_per_checkpoint == 0):
                print("{}:\t{:.9f}".format(iteration, reduced_loss))
                checkpoint_path = "{}/waveglow".format(output_directory)
                save_checkpoint(model, optimizer, learning_rate, iteration,
                                checkpoint_path)
            iteration += 1
예제 #15
0
def main(style, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength, args):
    #mel_files = files_to_list(mel_files)
    #print(mel_files)
    dataset = voice_dataset(dataBase={
        'ravdess': './our_data/ravdess',
        'cremad': './our_data/cremad'
    },
                            style=('happy', 'sad', 'angry'))
    #print(len(dataset.final_data['happy']))

    #sample = dataset.pick_one_random_sample('happy')
    styles = ['happy', 'sad', 'angry']
    with open('config.json') as f:
        data = f.read()
    config = json.loads(data)
    waveglow_config = config["waveglow_config"]
    model = WaveGlow(**waveglow_config)
    checkpoint_dict = torch.load('waveglow_256channels_universal_v5.pt',
                                 map_location='cpu')
    model_for_loading = checkpoint_dict['model']
    model.load_state_dict(model_for_loading.state_dict())
    model.cuda()
    waveglow = model
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O1")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    mel_extractor = Get_mel(1024, 256, 1024, args.sampling_rate, 0.0, 8000.0)

    vector_all = {}
    for style in styles:
        files = dataset.final_data[style].copy()
        random.shuffle(files)

        vectors = []
        for i, (_, file_path) in enumerate(files):
            if i > 200:
                break
            try:
                audio, rate = load_wav_to_torch(file_path)
                if rate != sampling_rate:
                    audio = resampy.resample(audio.numpy(), rate,
                                             sampling_rate)
                    audio = torch.from_numpy(audio).float()
                #if audio.size(0) >= args.segment_length:
                #    max_audio_start = audio.size(0) - args.segment_length
                #    audio_start = random.randint(0, max_audio_start)
                #    audio = audio[audio_start:audio_start+args.segment_length]
                #else:
                #    audio = torch.nn.functional.pad(audio, (0, args.segment_length-audio.size(0)), 'constant').data
                mel = mel_extractor.get_mel(audio)
                audio = audio / MAX_WAV_VALUE

                mel = torch.autograd.Variable(mel.cuda().unsqueeze(0))
                audio = torch.autograd.Variable(audio.cuda().unsqueeze(0))
                audio = audio.half() if is_fp16 else audio
                mel = mel.half() if is_fp16 else mel
                outputs = waveglow((mel, audio))
                vectors.append(
                    outputs[0].squeeze(0).mean(1).detach().cpu().numpy())
                print(style, i)
            except:
                continue

        vector_all[style] = vectors

    np.save('all_style_vector', vector_all)
예제 #16
0
def train(num_gpus,
          rank,
          group_name,
          output_directory,
          epochs,
          learning_rate,
          sigma,
          iters_per_checkpoint,
          batch_size,
          seed,
          fp16_run,
          checkpoint_path,
          with_tensorboard,
          num_workers=2):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    # HACK: setup separate training and eval sets
    training_files = data_config['training_files']
    eval_files = data_config['eval_files']
    del data_config['training_files']
    del data_config['eval_files']
    data_config['audio_files'] = training_files
    trainset = Mel2Samp(**data_config)
    data_config['audio_files'] = eval_files
    evalset = Mel2Samp(**data_config)

    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======

    print("Creating dataloaders with " + str(num_workers) + " workers")
    train_loader = DataLoader(trainset,
                              num_workers=num_workers,
                              shuffle=True,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    eval_loader = DataLoader(evalset,
                             num_workers=num_workers,
                             shuffle=True,
                             sampler=eval_sampler,
                             batch_size=batch_size,
                             pin_memory=False,
                             drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger_train = SummaryWriter(
            os.path.join(output_directory, 'logs', 'train'))
        logger_eval = SummaryWriter(
            os.path.join(output_directory, 'logs', 'eval'))

    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        model.train()
        with tqdm(total=len(train_loader)) as train_pbar:
            for i, batch in enumerate(train_loader):
                model.zero_grad()

                mel, audio = batch
                mel = torch.autograd.Variable(mel.cuda())
                audio = torch.autograd.Variable(audio.cuda())
                outputs = model((mel, audio))

                loss = criterion(outputs)
                if num_gpus > 1:
                    reduced_loss = reduce_tensor(loss.data, num_gpus).item()
                else:
                    reduced_loss = loss.item()

                if fp16_run:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                optimizer.step()

                train_pbar.set_description(
                    "Epoch {} Iter {} Loss {:.3f}".format(
                        epoch, iteration, reduced_loss))
                if with_tensorboard and rank == 0 and iteration % 10 == 0:
                    logger_train.add_scalar('loss', reduced_loss,
                                            i + len(train_loader) * epoch)
                    # adding logging for GPU utilization and memory usage
                    gpu_memory_used, gpu_utilization = get_gpu_stats()
                    k = 'gpu' + str(0)
                    logger_train.add_scalar(k + '/memory', gpu_memory_used,
                                            iteration)
                    logger_train.add_scalar(k + '/load', gpu_utilization,
                                            iteration)
                    logger_train.flush()

                if (iteration % iters_per_checkpoint == 0):
                    if rank == 0:
                        checkpoint_path = "{}/waveglow_{}".format(
                            output_directory, iteration)
                        save_checkpoint(model, optimizer, learning_rate,
                                        iteration, checkpoint_path)

                iteration += 1
                train_pbar.update(1)

        # Eval
        model.eval()
        torch.cuda.empty_cache()

        with torch.no_grad():
            tensorboard_mel, tensorboard_audio = None, None
            loss_accum = []
            with tqdm(total=len(eval_loader)) as eval_pbar:
                for i, batch in enumerate(eval_loader):
                    model.zero_grad()
                    mel, audio = batch
                    mel = torch.autograd.Variable(mel.cuda())
                    audio = torch.autograd.Variable(audio.cuda())
                    outputs = model((mel, audio))
                    loss = criterion(outputs).item()
                    loss_accum.append(loss)
                    eval_pbar.set_description("Epoch {} Eval {:.3f}".format(
                        epoch, loss))
                    outputs = None

                    # use the first batch for tensorboard audio samples
                    if i == 0:
                        tensorboard_mel = mel
                        tensorboard_audio = audio
                    eval_pbar.update(1)

            if with_tensorboard and rank == 0:
                loss_avg = statistics.mean(loss_accum)
                tqdm.write("Epoch {} Eval AVG {}".format(epoch, loss_avg))
                logger_eval.add_scalar('loss', loss_avg, iteration)

            # log audio samples to tensorboard
            tensorboard_audio_generated = model.infer(tensorboard_mel)
            for i in range(0, 5):
                ta = tensorboard_audio[i].cpu().numpy()
                tag = tensorboard_audio_generated[i].cpu().numpy()
                logger_eval.add_audio("sample " + str(i) + "/orig",
                                      ta,
                                      epoch,
                                      sample_rate=data_config['sampling_rate'])
                logger_eval.add_audio("sample " + str(i) + "/gen",
                                      tag,
                                      epoch,
                                      sample_rate=data_config['sampling_rate'])
            logger_eval.flush()
예제 #17
0
def train(num_gpus, rank, group_name, output_directory, epochs, init_lr,
          final_lr, sigma, epochs_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    os.makedirs(output_directory, exist_ok=True)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=init_lr)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    epoch_offset = 1
    if checkpoint_path != "":
        model, optimizer, epoch_offset = load_checkpoint(
            checkpoint_path, model, optimizer)
        epoch_offset += 1  # next epoch is epoch_offset + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=8,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs + 1):
        print(f'Epoch: {epoch}')
        adjust_learning_rate(optimizer, epoch, init_lr, final_lr, epochs)

        for i, batch in enumerate(tqdm.tqdm(train_loader)):
            optimizer.zero_grad()

            batch = model.pre_process(batch)
            outputs = model(batch)

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + 1 + len(train_loader) * epoch)

        if epoch % epochs_per_checkpoint == 0:
            if rank == 0:
                # Keep only one checkpoint
                last_chkpt = os.path.join(
                    output_directory,
                    f'waveglow_{epoch - epochs_per_checkpoint:06d}.pt')
                if os.path.exists(last_chkpt):
                    os.remove(last_chkpt)

                checkpoint_path = os.path.join(output_directory,
                                               f'waveglow_{epoch:06d}.pt')
                save_checkpoint(model, optimizer, epoch, checkpoint_path)
예제 #18
0
def train(
    num_gpus,
    rank,
    group_name,
    output_directory,
    epochs,
    learning_rate,
    sigma,
    iters_per_checkpoint,
    batch_size,
    seed,
    fp16_run,
    checkpoint_path,
    with_tensorboard,
):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp

        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(
        trainset,
        num_workers=1,
        shuffle=False,
        sampler=train_sampler,
        batch_size=batch_size,
        pin_memory=False,
        drop_last=True,
    )

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter

        logger = SummaryWriter(os.path.join(output_directory, "logs"))

    # fixed for visualization
    real_mels, real_audios = zip(*[trainset[i] for i in range(8)])
    real_mel = torch.cat(real_mels, dim=-1)
    real_audio = torch.cat(real_audios, dim=0)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                step = i + len(train_loader) * epoch
                logger.add_scalar("training_loss", reduced_loss, step)
                if step % 500 == 0:
                    # select the first eight data sample

                    model.eval()
                    with torch.no_grad():
                        device = mel.device
                        fake_audio = (model.infer(
                            torch.stack(real_mels).to(device)).flatten(
                                0, 1).cpu())
                    model.train()
                    fake_mel = trainset.get_mel(fake_audio)

                    logger.add_image(
                        "training_mel_real",
                        plot_spectrogram_to_numpy(real_mel),
                        step,
                        dataformats="HWC",
                    )
                    logger.add_audio(
                        "training_audio_real",
                        real_audio,
                        step,
                        22050,
                    )
                    logger.add_image(
                        "training_mel_fake",
                        plot_spectrogram_to_numpy(fake_mel),
                        step,
                        dataformats="HWC",
                    )
                    logger.add_audio(
                        "training_audio_fake",
                        fake_audio,
                        step,
                        22050,
                    )
                    logger.flush()

            if iteration % iters_per_checkpoint == 0:
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
예제 #19
0
def train(output_directory, epochs, learning_rate, sigma, iters_per_checkpoint,
          batch_size, seed, fp16_run, checkpoint_path, with_tensorboard):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config,
                     filter_length=data_config["filter_length"],
                     hop_length=data_config["hop_length"]).cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)

    trainset = Mel2Samp(**data_config)
    train_loader = DataLoader(trainset,
                              num_workers=6,
                              sampler=RandomSampler(0, 14),
                              batch_size=batch_size,
                              pin_memory=True,
                              drop_last=False)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    model = model.cuda()

    s = time()
    reduced_loss = 0
    for i, batch in enumerate(train_loader):
        model.zero_grad()

        mel, audio = batch
        mel = torch.autograd.Variable(mel.cuda())
        audio = torch.autograd.Variable(audio.cuda())
        outputs = model((mel, audio))

        loss = criterion(outputs)
        reduced_loss += loss.item()

        if fp16_run:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        optimizer.step()
        denominator = i % iters_per_checkpoint + 1
        print("iteration:{}, loss:{:.4f}, time:{:.2f}            "
              "".format(iteration + 1, reduced_loss / denominator,
                        (time() - s) / denominator),
              end="\r")

        if with_tensorboard and rank == 0:
            logger.add_scalar('training_loss', reduced_loss / denominator,
                              iteration + 1)

        if (iteration + 1) % iters_per_checkpoint == 0:
            s = time()
            reduced_loss = 0
            if rank == 0:
                checkpoint_path = "{}/waveglow_it{}.pt".format(
                    output_directory, iteration + 1)
                save_checkpoint(model, optimizer, learning_rate, iteration + 1,
                                checkpoint_path)
        iteration += 1
예제 #20
0
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, warm_start):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END:   ADDED FOR DISTRIBUTED======
    optimizer = Over9000(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    else:
        amp = None

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer, warm_start)
        if fp16_run and not warm_start:
            amp.load_state_dict(torch.load(checkpoint_path)['amp'])
        iteration += 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=16,
                              shuffle=True,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.999,
                                                           patience=250,
                                                           cooldown=250,
                                                           verbose=True,
                                                           min_lr=1e-5)
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = mel.cuda()
            audio = audio.cuda()
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), 1.0)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), 1.0)

            optimizer.step()

            if epoch > 1:
                scheduler.step(loss)

            print("{}:\t{:.9f}\t{:.9f}".format(iteration, reduced_loss,
                                               grad_norm))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, amp, iteration,
                                    checkpoint_path)

            iteration += 1
예제 #21
0
파일: tts.py 프로젝트: malarinv/tacotron2
class TTSModel(object):
    """docstring for TTSModel."""
    def __init__(self, tacotron2_path, waveglow_path, **kwargs):
        super(TTSModel, self).__init__()
        hparams = HParams(**kwargs)
        self.hparams = hparams
        self.model = Tacotron2(hparams)
        if torch.cuda.is_available():
            self.model.load_state_dict(
                torch.load(tacotron2_path)["state_dict"])
            self.model.cuda().eval()
        else:
            self.model.load_state_dict(
                torch.load(tacotron2_path, map_location="cpu")["state_dict"])
            self.model.eval()
        self.k_cache = klepto.archives.file_archive(cached=False)
        if waveglow_path:
            if torch.cuda.is_available():
                wave_params = torch.load(waveglow_path)
            else:
                wave_params = torch.load(waveglow_path, map_location="cpu")
            try:
                self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
                self.waveglow.load_state_dict(wave_params)
            except:
                self.waveglow = wave_params["model"]
                self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
            if torch.cuda.is_available():
                self.waveglow.cuda().eval()
            else:
                self.waveglow.eval()
            # workaround from
            # https://github.com/NVIDIA/waveglow/issues/127
            for m in self.waveglow.modules():
                if "Conv" in str(type(m)):
                    setattr(m, "padding_mode", "zeros")
            for k in self.waveglow.convinv:
                k.float().half()
            self.denoiser = Denoiser(self.waveglow,
                                     n_mel_channels=hparams.n_mel_channels)
            self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
                self._synth_speech)
        else:
            self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
                self._synth_speech_fast)
        self.taco_stft = TacotronSTFT(
            hparams.filter_length,
            hparams.hop_length,
            hparams.win_length,
            n_mel_channels=hparams.n_mel_channels,
            sampling_rate=hparams.sampling_rate,
            mel_fmax=4000,
        )

    def _generate_mel_postnet(self, text):
        sequence = np.array(text_to_sequence(text,
                                             ["english_cleaners"]))[None, :]
        if torch.cuda.is_available():
            sequence = torch.autograd.Variable(
                torch.from_numpy(sequence)).cuda().long()
        else:
            sequence = torch.autograd.Variable(
                torch.from_numpy(sequence)).long()
        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
                sequence)
        return mel_outputs_postnet

    def synth_speech_array(self, text, vocoder):
        mel_outputs_postnet = self._generate_mel_postnet(text)

        if vocoder == VOCODER_WAVEGLOW:
            with torch.no_grad():
                audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
                audio_t = self.denoiser(audio_t, 0.1)[0]
            audio = audio_t[0].data
        elif vocoder == VOCODER_GL:
            mel_decompress = self.taco_stft.spectral_de_normalize(
                mel_outputs_postnet)
            mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
            spec_from_mel_scaling = 1000
            spec_from_mel = torch.mm(mel_decompress[0],
                                     self.taco_stft.mel_basis)
            spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
            spec_from_mel = spec_from_mel * spec_from_mel_scaling
            spec_from_mel = (spec_from_mel.cuda()
                             if torch.cuda.is_available() else spec_from_mel)
            audio = griffin_lim(
                torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                self.taco_stft.stft_fn,
                GL_ITERS,
            )
            audio = audio.squeeze()
        else:
            raise ValueError("vocoder arg should be one of [wavglow|gl]")
        audio = audio.cpu().numpy()
        return audio

    def _synth_speech(self,
                      text,
                      speed: float = 1.0,
                      sample_rate: int = OUTPUT_SAMPLE_RATE):
        audio = self.synth_speech_array(text, VOCODER_WAVEGLOW)

        return postprocess_audio(
            audio,
            src_rate=self.hparams.sampling_rate,
            dst_rate=sample_rate,
            tempo=speed,
        )

    def _synth_speech_fast(self,
                           text,
                           speed: float = 1.0,
                           sample_rate: int = OUTPUT_SAMPLE_RATE):
        audio = self.synth_speech_array(text, VOCODER_GL)

        return postprocess_audio(
            audio,
            tempo=speed,
            src_rate=self.hparams.sampling_rate,
            dst_rate=sample_rate,
        )
예제 #22
0
def train(num_gpus, rank, group_name, output_directory, log_directory,
          checkpoint_path):
    # Get device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    torch.manual_seed(hp.seed)
    torch.cuda.manual_seed(hp.seed)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(hp.sigma)
    model = WaveGlow().cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    learning_rate = hp.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if hp.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path:
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    # Get dataset
    dataset = FastSpeechDataset()

    # Get training loader
    print("Get Training Loader")
    training_loader = DataLoader(dataset,
                                 batch_size=hp.batch_size,
                                 shuffle=True,
                                 collate_fn=collate_fn,
                                 drop_last=True,
                                 num_workers=cpu_count())

    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if hp.with_tensorboard and rank == 0:
        logger = prepare_directories_and_logger(output_directory,
                                                log_directory)

    model = model.train()
    epoch_offset = max(0, int(iteration / len(training_loader)))
    beta = hp.batch_size
    print("Total Epochs: {}".format(hp.epochs))
    print("Batch Size: {}".format(hp.batch_size))

    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, hp.epochs):
        print("Epoch: {}".format(epoch))
        for i, data_of_batch in enumerate(training_loader):
            model.zero_grad()

            if not hp.pre_target:
                # Prepare Data
                src_seq = data_of_batch["texts"]
                src_pos = data_of_batch["pos"]
                mel_tgt = data_of_batch["mels"]

                src_seq = torch.from_numpy(src_seq).long().to(device)
                src_pos = torch.from_numpy(src_pos).long().to(device)
                mel_tgt = torch.from_numpy(mel_tgt).float().to(device)
                alignment_target = get_alignment(src_seq,
                                                 tacotron2).float().to(device)
                # For Data Parallel
                mel_max_len = mel_tgt.size(1)
            else:
                # Prepare Data
                src_seq = data_of_batch["texts"]
                src_pos = data_of_batch["pos"]
                mel_tgt = data_of_batch["mels"]
                alignment_target = data_of_batch["alignment"]

                src_seq = torch.from_numpy(src_seq).long().to(device)
                src_pos = torch.from_numpy(src_pos).long().to(device)
                mel_tgt = torch.from_numpy(mel_tgt).float().to(device)
                alignment_target = torch.from_numpy(
                    alignment_target).float().to(device)
                # For Data Parallel
                mel_max_len = mel_tgt.size(1)

            outputs = model(src_seq, src_pos, mel_tgt, mel_max_len,
                            alignment_target)
            _, _, _, duration_predictor = outputs
            mel_tgt = mel_tgt.transpose(1, 2)
            max_like, dur_loss = criterion(outputs, alignment_target, mel_tgt)
            if beta > 1 and iteration % 10000 == 0:
                beta = beta // 2
            loss = max_like + dur_loss

            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if hp.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            #grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.grad_clip_thresh)

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if hp.with_tensorboard and rank == 0:
                logger.log_training(reduced_loss, dur_loss, learning_rate,
                                    iteration)

            if (iteration % hp.save_step == 0):
                if rank == 0:
                    # logger.log_alignment(model, mel_predict, mel_tgt, iteration)
                    checkpoint_path = "{}/TTSglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
def main(style, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength, args):
    #mel_files = files_to_list(mel_files)
    #print(mel_files)
    dataset = voice_dataset(dataBase={
        'ravdess': './our_data/ravdess',
        'cremad': './our_data/cremad'
    },
                            style=('happy', 'sad', 'angry'))
    #print(len(dataset.final_data['happy']))

    #sample = dataset.pick_one_random_sample('happy')
    files = dataset.final_data[style]
    #files = ['/local-scratch/fuyang/cmpt726/waveglow/data/LJSpeech-1.1/LJ001-0001.wav']
    with open('config.json') as f:
        data = f.read()
    config = json.loads(data)
    waveglow_config = config["waveglow_config"]
    model = WaveGlow(**waveglow_config)
    checkpoint_dict = torch.load('waveglow_256channels_universal_v5.pt',
                                 map_location='cpu')
    model_for_loading = checkpoint_dict['model']
    model.load_state_dict(model_for_loading.state_dict())
    model.cuda()
    waveglow = model
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O1")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    mel_extractor = Get_mel(1024, 256, 1024, args.sampling_rate, 0.0, 8000.0)
    avg_z = np.zeros(8)
    _count = 0
    for i, (_, file_path) in enumerate(files):
        if i > 50:
            break
        try:
            audio, rate = load_wav_to_torch(file_path)
            if rate != sampling_rate:
                audio = resampy.resample(audio.numpy(), rate, sampling_rate)
                audio = torch.from_numpy(audio).float()
            #if audio.size(0) >= args.segment_length:
            #    max_audio_start = audio.size(0) - args.segment_length
            #    audio_start = random.randint(0, max_audio_start)
            #    audio = audio[audio_start:audio_start+args.segment_length]
            #else:
            #    audio = torch.nn.functional.pad(audio, (0, args.segment_length-audio.size(0)), 'constant').data
            mel = mel_extractor.get_mel(audio)
            audio = audio / MAX_WAV_VALUE

            mel = torch.autograd.Variable(mel.cuda().unsqueeze(0))
            audio = torch.autograd.Variable(audio.cuda().unsqueeze(0))
            audio = audio.half() if is_fp16 else audio
            mel = mel.half() if is_fp16 else mel
            outputs = waveglow((mel, audio))
            avg_z += outputs[0].squeeze(0).mean(1).detach().cpu().numpy()
            _count += 1
            z = outputs[0][:, 4:]

            #print(outputs)
            mel_up = waveglow.upsample(mel)
            time_cutoff = waveglow.upsample.kernel_size[
                0] - waveglow.upsample.stride[0]
            mel_up = mel_up[:, :, :-time_cutoff]
            #mel_up = mel_up[:,:,:-(time_cutoff+128)]

            mel_up = mel_up.unfold(2, waveglow.n_group,
                                   waveglow.n_group).permute(0, 2, 1, 3)
            mel_up = mel_up.contiguous().view(mel_up.size(0), mel_up.size(1),
                                              -1).permute(0, 2, 1)
            audio = z
            mel_up = mel_up[:, :, :audio.size(2)]

            sigma = 0.7
            z_i = 0
            for k in reversed(range(waveglow.n_flows)):
                n_half = int(audio.size(1) / 2)
                audio_0 = audio[:, :n_half, :]
                audio_1 = audio[:, n_half:, :]

                output = waveglow.WN[k]((audio_0, mel_up))

                s = output[:, n_half:, :]
                b = output[:, :n_half, :]
                audio_1 = (audio_1 - b) / torch.exp(s)
                audio = torch.cat([audio_0, audio_1], 1)

                audio = waveglow.convinv[k](audio, reverse=True)

                if k % waveglow.n_early_every == 0 and k > 0:
                    z = outputs[0][:, 2 - z_i:4 - z_i]
                    #if mel_up.type() == 'torch.cuda.HalfTensor':
                    #    z = torch.cuda.HalfTensor(mel_up.size(0), waveglow.n_early_size, mel_up.size(2)).normal_()
                    #else:
                    #    z = torch.cuda.FloatTensor(mel_up.size(0), waveglow.n_early_size, mel_up.size(2)).normal_()
                    audio = torch.cat((sigma * z, audio), 1)
            audio = audio.permute(0, 2,
                                  1).contiguous().view(audio.size(0), -1).data
            audio = audio * MAX_WAV_VALUE
            audio = audio.squeeze()
            audio = audio.cpu().numpy()
            audio = audio.astype('int16')
            audio_path = os.path.join(
                output_dir, "{}_synthesis.wav".format(file_path[:-4]))
            if os.path.exists(
                    os.path.join(*audio_path.split('/')[:-1])) is False:
                os.makedirs(os.path.join(*audio_path.split('/')[:-1]),
                            exist_ok=True)
            write(audio_path, sampling_rate, audio)
            print(audio_path)
        except:
            continue

    avg_z = avg_z / _count
    np.save(style, avg_z)
예제 #24
0
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, weight_sharing, optimizer_type,
          dataloader_type):

    ws = weight_sharing
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer_type = optimizer_type.lower()
    if optimizer_type == "sgd":
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    elif optimizer_type == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    else:
        print("Unsupported optimizer: %s. Aborting." % optimizer_type)
        return None

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    dataloader_type = dataloader_type.lower()
    if dataloader_type == "vanilla":
        trainset = Mel2Samp(**data_config)
    elif dataloader_type == "split":
        trainset = Mel2SampSplit(**data_config)
    else:
        print("Unsupported dataloader type: %s. Aborting." % dataloader_type)
        return None

    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=(num_gpus == 1),
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    name = "waveglow_ws%d_%s_%s_batch%d" % (ws, optimizer_type,
                                            dataloader_type, batch_size)

    if learning_rate != 1e-4:
        name = name + "_lr{:.0e}".format(learning_rate)

    if num_gpus > 1:
        name = name + "_x%d" % num_gpus

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join("./logs", name))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    stime2 = None
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        stime = time()
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            if (iteration % 100 == 0):
                if not stime2 is None:
                    tot_time2 = time() - stime2
                    print("{}:\t{:.9f}, time: {}".format(
                        iteration, reduced_loss, int(tot_time2)))
                stime2 = time()
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}_{}".format(
                        output_directory, name, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
        tot_time = time() - stime
        print("Epoch %d completed. Time: %d seconds" % (epoch, int(tot_time)))
예제 #25
0
def save_checkpoint(model, optimizer, learning_rate, iteration, filepath,
                    drive_fid):
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    model_for_saving = WaveGlow(**waveglow_config).cuda()
    model_for_saving.load_state_dict(model.state_dict())
    torch.save(
        {
            'model': model_for_saving,
            'iteration': iteration,
            'optimizer': optimizer.state_dict(),
            'learning_rate': learning_rate
        }, filepath)
    uploaded = False
    attempt = 0
    file_title = filepath[filepath.find("/") + 1:]
    while not uploaded and attempt < 10:
        attempt += 1
        try:
            if gauth.credentials is None:
                # Authenticate if they're not there
                gauth.LocalWebserverAuth()
            elif gauth.access_token_expired:
                # Refresh them if expired
                print("Google Drive Token Expired, Refreshing")
                gauth.Refresh()
            else:
                # Initialize the saved creds
                gauth.Authorize()
            # Save the current credentials to a file
            # gauth.SaveCredentialsFile("GoogleDriveCredentials.txt")
            f = drive.CreateFile({
                'title':
                file_title,
                "parents": [{
                    "kind": "drive#fileLink",
                    "id": drive_fid
                }]
            })
            f.SetContentFile(filepath)
            f.Upload()
            uploaded = True
            break
        except:
            print("Failed uploading to drive at attempt #{}".format(attempt))
            sleep(30)
        if uploaded:
            try:
                ok = False
                for file in drive.ListFile({
                        'q':
                        "'" + drive_fid + "' in parents"
                }).GetList():
                    if file['title'] == file_title:
                        if file["fileSize"] > 4000000:
                            ok = True
                            print("File was successfully uploaded")
                        else:
                            file.Delete()
                            uploaded = False
                            print("File was not uploaded normally. Deleting")
                            sleep(30)
                        break
                if ok:
                    for file in drive.ListFile({
                            'q':
                            "'" + drive_fid + "' in parents"
                    }).GetList():
                        if file['title'] != file_title:
                            file.Delete()
                            sleep(
                                30
                            )  #make sure the file is deleted from drive first
            except:
                pass
예제 #26
0
def train(num_gpus, rank, group_name, prj_name, run_name, output_directory,
          epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed,
          fp16_run, grad_clip_thresh, checkpoint_path, pretrained_path,
          with_tensorboard, with_wandb):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    if pretrained_path != "":
        model = load_pretrained(pretrained_path, model)

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        train_sampler = DistributedSampler(trainset)
        shuffle_at_dataloader = False
    else:
        train_sampler = None
        shuffle_at_dataloader = True
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=shuffle_at_dataloader,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            iter_start = time.perf_counter()

            float_epoch = float(iteration) / len(train_loader)

            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss, etc = criterion(outputs)
            (z_L2_normalized, neg_log_s_total, neg_log_det_W_total) = etc
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            is_overflow = False
            if fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
                if not is_overflow:
                    clipped_grad_norm = get_clip_grad_norm(
                        grad_norm, grad_clip_thresh)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), grad_clip_thresh)
                clipped_grad_norm = get_clip_grad_norm(grad_norm,
                                                       grad_clip_thresh)

            optimizer.step()
            iter_duration = time.perf_counter() - iter_start

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if with_wandb and rank == 0:
                wandb.log(
                    {
                        'iteration': iteration,
                        'epoch': float_epoch,
                        'iter_duration': iter_duration,
                        'training_loss': reduced_loss,
                        'training_loss/z_L2_normalized': z_L2_normalized,
                        'training_loss/neg_log_s_total': neg_log_s_total,
                        'training_loss/neg_log_det_W_total':
                        neg_log_det_W_total,
                    },
                    step=iteration)
                if not is_overflow:
                    wandb.log(
                        {
                            'grad_norm': grad_norm,
                            'clipped_grad_norm': clipped_grad_norm,
                        },
                        step=iteration)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/{}/{}/waveglow_{}".format(
                        output_directory, prj_name, run_name, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
예제 #27
0
파일: train.py 프로젝트: mcf330/WaveGlow
def main():
    args = get_arguments()
    args.logdir = os.path.join(hparams.logdir_root, args.run_name)
    if not os.path.exists(args.logdir):
        os.makedirs(args.logdir)

    assert hparams.upsampling_rate == hparams.hop_length, 'upsamling rate should be same as hop_length'

    # Create coordinator.
    coord = tf.train.Coordinator()
    global_step = tf.get_variable("global_step", [], initializer=tf.constant_initializer(0), trainable=False)
    learning_rate = tf.train.exponential_decay(hparams.lr, global_step, hparams.decay_steps, 0.95, staircase=True)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

    with tf.device('/cpu:0'):
        with tf.name_scope('inputs'):
            reader = DataReader(coord, args.filelist, args.wave_dir, args.lc_dir)

    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False, allow_soft_placement=True))
    reader.start_threads()

    audio_placeholder = tf.placeholder(tf.float32, shape=[None, None, 1], name='audio')
    lc_placeholder = tf.placeholder(tf.float32, shape=[None, None, hparams.num_mels], name='lc')

    tower_losses = []
    tower_grads = []
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        for i in range(args.ngpu):
            with tf.device('/gpu:%d' % i), tf.name_scope('tower_%d' % i):
                glow = WaveGlow(lc_dim=hparams.num_mels,
                                n_flows=hparams.n_flows,
                                n_group=hparams.n_group,
                                n_early_every=hparams.n_early_every,
                                n_early_size=hparams.n_early_size)
                print('create network %i' % i)

                local_audio_placeholder = audio_placeholder[i * hparams.batch_size:(i + 1) * hparams.batch_size, :, :]
                local_lc_placeholder = lc_placeholder[i * hparams.batch_size:(i + 1) * hparams.batch_size, :, :]

                output_audio, log_s_list, log_det_W_list = glow.create_forward_network(local_audio_placeholder,
                                                                                       local_lc_placeholder)
                loss = compute_waveglow_loss(output_audio, log_s_list, log_det_W_list, sigma=hparams.sigma)
                grads = optimizer.compute_gradients(loss, var_list=tf.trainable_variables())

                tower_losses.append(loss)
                tower_grads.append(grads)

                tf.summary.scalar('loss_tower_%d' % i, loss)

    # # gradient clipping
    # gradients = [grad for grad, var in averaged_gradients]
    # params = [var for grad, var in averaged_gradients]
    # clipped_gradients, norm = tf.clip_by_global_norm(gradients, 1.0)
    #
    # with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
    #     train_ops = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=global_step)

    print("create network finished")
    loss = tf.reduce_mean(tower_losses)
    averaged_gradients = average_gradients(tower_grads)

    train_ops = optimizer.apply_gradients(averaged_gradients, global_step=global_step)

    tf.summary.scalar('loss', loss)

    # Set up logging for TensorBoard.
    writer = tf.summary.FileWriter(args.logdir)
    writer.add_graph(tf.get_default_graph())
    run_metadata = tf.RunMetadata()
    summaries = tf.summary.merge_all()

    # Set up session
    init = tf.global_variables_initializer()
    sess.run(init)
    print('parameters initialization finished')

    saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=30)

    saved_global_step = 0
    if args.restore_from is not None:
        try:
            saved_global_step = load(saver, sess, args.restore_from)
            if saved_global_step is None:
                # The first training step will be saved_global_step + 1,
                # therefore we put -1 here for new or overwritten trainings.
                saved_global_step = 0
        except Exception:
            print("Something went wrong while restoring checkpoint. "
                  "We will terminate training to avoid accidentally overwriting "
                  "the previous model.")
            raise

        print("restore model successfully!")

    print('start training.')
    last_saved_step = saved_global_step
    try:
        for step in range(saved_global_step + 1, hparams.train_steps):
            audio, lc = reader.dequeue(num_elements=hparams.batch_size * args.ngpu)

            if hparams.lc_encode or hparams.transposed_upsampling:
                # if using local condition bi-lstm encoding or tranposed conv upsampling, no need to upsample
                # bi-lstm, upsamle will be done in the tf code
                lc = np.reshape(lc, [hparams.batch_size * args.ngpu, -1, hparams.num_mels])
            else:
                # upsampling by directly repeat
                lc = np.tile(lc, [1, 1, hparams.upsampling_rate])
                lc = np.reshape(lc, [hparams.batch_size * args.ngpu, -1, hparams.num_mels])

            start_time = time.time()
            if step % 50 == 0 and args.store_metadata:
                # Slow run that stores extra information for debugging.
                print('Storing metadata')
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                summary, loss_value, _, lr = sess.run(
                    [summaries, loss, train_ops, learning_rate],
                    feed_dict={audio_placeholder: audio, lc_placeholder: lc},
                    options=run_options,
                    run_metadata=run_metadata)
                writer.add_summary(summary, step)
                writer.add_run_metadata(run_metadata,
                                        'step_{:04d}'.format(step))
                tl = timeline.Timeline(run_metadata.step_stats)
                timeline_path = os.path.join(args.logdir, 'timeline.trace')
                with open(timeline_path, 'w') as f:
                    f.write(tl.generate_chrome_trace_format(show_memory=True))
            else:
                summary, loss_value, _, lr = sess.run([summaries, loss, train_ops, learning_rate],
                                                      feed_dict={audio_placeholder: audio, lc_placeholder: lc})
                writer.add_summary(summary, step)

            duration = time.time() - start_time
            step_log = 'step {:d} - loss = {:.3f}, lr={:.8f}, time cost={:4f}'\
                .format(step, loss_value, lr, duration)
            print(step_log)

            if step % hparams.save_model_every == 0:
                save(saver, sess, args.logdir, step)
                last_saved_step = step

    except KeyboardInterrupt:
        # Introduce a line break after ^C is displayed so save message
        # is on its own line.
        print()
    finally:
        if step > last_saved_step:
            save(saver, sess, args.logdir, step)
        coord.request_stop()
        coord.join()
예제 #28
0
파일: train.py 프로젝트: Flinst0n/tacotron2
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    torch.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cpu()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cpu())
            audio = torch.autograd.Variable(audio.cpu())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
예제 #29
0
from dataset import FastSpeechDataset, collate_fn, DataLoader
from scipy.io.wavfile import write
import sys

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

MAX_WAV_VALUE = 32768.0

if __name__ == "__main__":
    # Test
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    sampling_rate = 22050

    torch.manual_seed(hp.seed)
    torch.cuda.manual_seed(hp.seed)
    model = WaveGlow().cuda()
    checkpoint = torch.load('test/TTSglow_67000')
    model.load_state_dict(checkpoint['model'].state_dict())
    model = model.remove_weightnorm(model)

    dataset = FastSpeechDataset()
    testing_loader = DataLoader(dataset,
                                batch_size=1,
                                shuffle=False,
                                collate_fn=collate_fn,
                                drop_last=True,
                                num_workers=4)
    model = model.train()

    for i, data_of_batch in enumerate(testing_loader):
        audio_tgt = data_of_batch["audios"]