Example #1
def main(files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength, args):
    #mel_files = files_to_list(mel_files)
    #print(mel_files)
    files = ['/local-scratch/fuyang/cmpt726/final_project/cremad/1091_WSI_SAD_XX.wav']
    #files = ['/local-scratch/fuyang/cmpt726/waveglow/data/LJSpeech-1.1/LJ001-0001.wav']
    with open('config.json') as f:
        data = f.read()
    config = json.loads(data)
    waveglow_config = config["waveglow_config"]
    model = WaveGlow(**waveglow_config)
    checkpoint_dict = torch.load('waveglow_256channels_universal_v5.pt', map_location='cpu')
    model_for_loading = checkpoint_dict['model']
    model.load_state_dict(model_for_loading.state_dict())
    model.cuda()
    #waveglow = torch.load(waveglow_path)['model']
    #waveglow = waveglow.remove_weightnorm(waveglow)
    #waveglow.cuda()
    waveglow = model
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O1")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    mel_extractor = Get_mel(1024, 256, 1024, args.sampling_rate, 0.0, 8000.0)

    for i, file_path in enumerate(files):
        audio, rate = load_wav_to_torch(file_path)
        if rate != sampling_rate:
            audio = resampy.resample(audio.numpy(), rate, sampling_rate)
            audio = torch.from_numpy(audio).float()
        #if audio.size(0) >= args.segment_length:
        #    max_audio_start = audio.size(0) - args.segment_length
        #    audio_start = random.randint(0, max_audio_start)
        #    audio = audio[audio_start:audio_start+args.segment_length]
        #else:
        #    audio = torch.nn.functional.pad(audio, (0, args.segment_length-audio.size(0)), 'constant').data
        mel = mel_extractor.get_mel(audio)
        audio = audio / MAX_WAV_VALUE

        mel = torch.autograd.Variable(mel.cuda().unsqueeze(0))
        audio = torch.autograd.Variable(audio.cuda().unsqueeze(0))
        audio = audio.half() if is_fp16 else audio
        mel = mel.half() if is_fp16 else mel
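        # Forward pass: with the 256-channel universal config, outputs[0] is the
        # 8-channel latent z (channels 0:4 come from the two early outputs,
        # channels 4:8 from the final flow), alongside the log-s and log-det terms.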
        outputs = waveglow((mel, audio))
        z = outputs[0][:, 4:]
        print(outputs)
        mel_up = waveglow.upsample(mel)
        time_cutoff = waveglow.upsample.kernel_size[0] - waveglow.upsample.stride[0]
        mel_up = mel_up[:, :, :-time_cutoff]
        #mel_up = mel_up[:,:,:-(time_cutoff+128)]

        mel_up = mel_up.unfold(2, waveglow.n_group, waveglow.n_group).permute(0, 2, 1, 3)
        mel_up = mel_up.contiguous().view(mel_up.size(0), mel_up.size(1), -1).permute(0, 2, 1)
        audio = z
        mel_up = mel_up[:, :, :audio.size(2)]

        sigma = 0.7
        z_i = 0
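        # Manually run WaveGlow's flows in reverse: invert each affine coupling
        # and invertible 1x1 convolution, re-injecting the stored early-output
        # latents (scaled by sigma) instead of fresh noise, so the waveform is
        # rebuilt from this clip's own z.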
        for k in reversed(range(waveglow.n_flows)):
            n_half = int(audio.size(1) / 2)
            audio_0 = audio[:, :n_half, :]
            audio_1 = audio[:, n_half:, :]

            output = waveglow.WN[k]((audio_0, mel_up))

            s = output[:, n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = (audio_1 - b) / torch.exp(s)
            audio = torch.cat([audio_0, audio_1], 1)

            audio = waveglow.convinv[k](audio, reverse=True)

            if k % waveglow.n_early_every == 0 and k > 0:
                z = outputs[0][:, 2 - z_i:4 - z_i]
                #if mel_up.type() == 'torch.cuda.HalfTensor':
                #    z = torch.cuda.HalfTensor(mel_up.size(0), waveglow.n_early_size, mel_up.size(2)).normal_()
                #else:
                #    z = torch.cuda.FloatTensor(mel_up.size(0), waveglow.n_early_size, mel_up.size(2)).normal_()
                audio = torch.cat((sigma * z, audio), 1)
                z_i += 2  # advance to the earlier early-output channels for the next injection
        audio = audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data
        audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(
            output_dir, "{}_synthesis.wav".format('fuyangz'))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
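A minimal sketch of how this entry point might be invoked (hypothetical values; in the original project the arguments come from argparse, module-level imports such as WaveGlow, Get_mel, and load_wav_to_torch are already in scope, and the files argument is ignored because the function overwrites it with the hardcoded path above):

from types import SimpleNamespace

if __name__ == "__main__":
    # Hypothetical invocation; only args.sampling_rate is read by main().
    args = SimpleNamespace(sampling_rate=22050)
    main(files=None, waveglow_path=None, sigma=0.7, output_dir='.',
         sampling_rate=22050, is_fp16=False, denoiser_strength=0.0, args=args)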
Example #2
def main(style, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength, args):
    #mel_files = files_to_list(mel_files)
    #print(mel_files)
    dataset = voice_dataset(dataBase={
        'ravdess': './our_data/ravdess',
        'cremad': './our_data/cremad'
    },
                            style=('happy', 'sad', 'angry'))
    #print(len(dataset.final_data['happy']))

    #sample = dataset.pick_one_random_sample('happy')
    files = dataset.final_data[style]
    #files = ['/local-scratch/fuyang/cmpt726/waveglow/data/LJSpeech-1.1/LJ001-0001.wav']
    with open('config.json') as f:
        data = f.read()
    config = json.loads(data)
    waveglow_config = config["waveglow_config"]
    model = WaveGlow(**waveglow_config)
    checkpoint_dict = torch.load('waveglow_256channels_universal_v5.pt',
                                 map_location='cpu')
    model_for_loading = checkpoint_dict['model']
    model.load_state_dict(model_for_loading.state_dict())
    model.cuda()
    waveglow = model
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O1")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    mel_extractor = Get_mel(1024, 256, 1024, args.sampling_rate, 0.0, 8000.0)
    avg_z = np.zeros(8)
    _count = 0
    for i, (_, file_path) in enumerate(files):
        if i > 50:
            break
        try:
            audio, rate = load_wav_to_torch(file_path)
            if rate != sampling_rate:
                audio = resampy.resample(audio.numpy(), rate, sampling_rate)
                audio = torch.from_numpy(audio).float()
            #if audio.size(0) >= args.segment_length:
            #    max_audio_start = audio.size(0) - args.segment_length
            #    audio_start = random.randint(0, max_audio_start)
            #    audio = audio[audio_start:audio_start+args.segment_length]
            #else:
            #    audio = torch.nn.functional.pad(audio, (0, args.segment_length-audio.size(0)), 'constant').data
            mel = mel_extractor.get_mel(audio)
            audio = audio / MAX_WAV_VALUE

            mel = torch.autograd.Variable(mel.cuda().unsqueeze(0))
            audio = torch.autograd.Variable(audio.cuda().unsqueeze(0))
            audio = audio.half() if is_fp16 else audio
            mel = mel.half() if is_fp16 else mel
            outputs = waveglow((mel, audio))
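            # Time-average the 8-channel latent of this clip and accumulate it;
            # dividing by _count at the end yields the mean latent for the style.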
            avg_z += outputs[0].squeeze(0).mean(1).detach().cpu().numpy()
            _count += 1
            z = outputs[0][:, 4:]

            #print(outputs)
            mel_up = waveglow.upsample(mel)
            time_cutoff = waveglow.upsample.kernel_size[
                0] - waveglow.upsample.stride[0]
            mel_up = mel_up[:, :, :-time_cutoff]
            #mel_up = mel_up[:,:,:-(time_cutoff+128)]

            mel_up = mel_up.unfold(2, waveglow.n_group,
                                   waveglow.n_group).permute(0, 2, 1, 3)
            mel_up = mel_up.contiguous().view(mel_up.size(0), mel_up.size(1),
                                              -1).permute(0, 2, 1)
            audio = z
            mel_up = mel_up[:, :, :audio.size(2)]

            sigma = 0.7
            z_i = 0
            for k in reversed(range(waveglow.n_flows)):
                n_half = int(audio.size(1) / 2)
                audio_0 = audio[:, :n_half, :]
                audio_1 = audio[:, n_half:, :]

                output = waveglow.WN[k]((audio_0, mel_up))

                s = output[:, n_half:, :]
                b = output[:, :n_half, :]
                audio_1 = (audio_1 - b) / torch.exp(s)
                audio = torch.cat([audio_0, audio_1], 1)

                audio = waveglow.convinv[k](audio, reverse=True)

                if k % waveglow.n_early_every == 0 and k > 0:
                    z = outputs[0][:, 2 - z_i:4 - z_i]
                    #if mel_up.type() == 'torch.cuda.HalfTensor':
                    #    z = torch.cuda.HalfTensor(mel_up.size(0), waveglow.n_early_size, mel_up.size(2)).normal_()
                    #else:
                    #    z = torch.cuda.FloatTensor(mel_up.size(0), waveglow.n_early_size, mel_up.size(2)).normal_()
                    audio = torch.cat((sigma * z, audio), 1)
                    z_i += 2  # advance to the earlier early-output channels for the next injection
            audio = audio.permute(0, 2,
                                  1).contiguous().view(audio.size(0), -1).data
            audio = audio * MAX_WAV_VALUE
            audio = audio.squeeze()
            audio = audio.cpu().numpy()
            audio = audio.astype('int16')
            audio_path = os.path.join(
                output_dir, "{}_synthesis.wav".format(file_path[:-4]))
            os.makedirs(os.path.dirname(audio_path), exist_ok=True)
            write(audio_path, sampling_rate, audio)
            print(audio_path)
        except Exception:
            continue

    avg_z = avg_z / _count
    np.save(style, avg_z)
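Since np.save appends the .npy extension, the per-style average latent written above can be read back later, e.g.:

avg_z = np.load('sad.npy')  # shape (8,): time-averaged WaveGlow latent for the 'sad' clips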
Example #3
def train(output_directory, epochs, learning_rate, sigma, iters_per_checkpoint,
          batch_size, seed, fp16_run, checkpoint_path, with_tensorboard):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    criterion = WaveGlowLoss(sigma)
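    # NOTE: waveglow_config, data_config, and rank are assumed to be module-level
    # globals in the original script (rank is effectively 0 in this
    # single-process variant).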
    model = WaveGlow(**waveglow_config,
                     filter_length=data_config["filter_length"],
                     hop_length=data_config["hop_length"]).cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)

    trainset = Mel2Samp(**data_config)
    train_loader = DataLoader(trainset,
                              num_workers=6,
                              sampler=RandomSampler(0, 14),
                              batch_size=batch_size,
                              pin_memory=True,
                              drop_last=False)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    model = model.cuda()

    s = time()
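    # reduced_loss accumulates the loss since the last checkpoint; the printed
    # value is its running mean over denominator = i % iters_per_checkpoint + 1
    # iterations, and both it and the timer are reset at every checkpoint.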
    reduced_loss = 0
    for i, batch in enumerate(train_loader):
        model.zero_grad()

        mel, audio = batch
        mel = torch.autograd.Variable(mel.cuda())
        audio = torch.autograd.Variable(audio.cuda())
        outputs = model((mel, audio))

        loss = criterion(outputs)
        reduced_loss += loss.item()

        if fp16_run:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        optimizer.step()
        denominator = i % iters_per_checkpoint + 1
        print("iteration:{}, loss:{:.4f}, time:{:.2f}            "
              "".format(iteration + 1, reduced_loss / denominator,
                        (time() - s) / denominator),
              end="\r")

        if with_tensorboard and rank == 0:
            logger.add_scalar('training_loss', reduced_loss / denominator,
                              iteration + 1)

        if (iteration + 1) % iters_per_checkpoint == 0:
            s = time()
            reduced_loss = 0
            if rank == 0:
                checkpoint_path = "{}/waveglow_it{}.pt".format(
                    output_directory, iteration + 1)
                save_checkpoint(model, optimizer, learning_rate, iteration + 1,
                                checkpoint_path)
        iteration += 1
Example #4
File: tts.py  Project: malarinv/tacotron2
class TTSModel(object):
    """docstring for TTSModel."""
    def __init__(self, tacotron2_path, waveglow_path, **kwargs):
        super(TTSModel, self).__init__()
        hparams = HParams(**kwargs)
        self.hparams = hparams
        self.model = Tacotron2(hparams)
        if torch.cuda.is_available():
            self.model.load_state_dict(
                torch.load(tacotron2_path)["state_dict"])
            self.model.cuda().eval()
        else:
            self.model.load_state_dict(
                torch.load(tacotron2_path, map_location="cpu")["state_dict"])
            self.model.eval()
        self.k_cache = klepto.archives.file_archive(cached=False)
        if waveglow_path:
            if torch.cuda.is_available():
                wave_params = torch.load(waveglow_path)
            else:
                wave_params = torch.load(waveglow_path, map_location="cpu")
            try:
                self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
                self.waveglow.load_state_dict(wave_params)
            except Exception:
                self.waveglow = wave_params["model"]
                self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
            if torch.cuda.is_available():
                self.waveglow.cuda().eval()
            else:
                self.waveglow.eval()
            # workaround from
            # https://github.com/NVIDIA/waveglow/issues/127
            for m in self.waveglow.modules():
                if "Conv" in str(type(m)):
                    setattr(m, "padding_mode", "zeros")
            for k in self.waveglow.convinv:
                k.float().half()
            self.denoiser = Denoiser(self.waveglow,
                                     n_mel_channels=hparams.n_mel_channels)
            self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
                self._synth_speech)
        else:
            self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
                self._synth_speech_fast)
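        # klepto's inf_cache memoizes the synthesis function, so repeated calls
        # with the same text return the cached audio (backed by the archive above).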
        self.taco_stft = TacotronSTFT(
            hparams.filter_length,
            hparams.hop_length,
            hparams.win_length,
            n_mel_channels=hparams.n_mel_channels,
            sampling_rate=hparams.sampling_rate,
            mel_fmax=4000,
        )

    def _generate_mel_postnet(self, text):
        sequence = np.array(text_to_sequence(text,
                                             ["english_cleaners"]))[None, :]
        if torch.cuda.is_available():
            sequence = torch.autograd.Variable(
                torch.from_numpy(sequence)).cuda().long()
        else:
            sequence = torch.autograd.Variable(
                torch.from_numpy(sequence)).long()
        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
                sequence)
        return mel_outputs_postnet

    def synth_speech_array(self, text, vocoder):
        mel_outputs_postnet = self._generate_mel_postnet(text)

        if vocoder == VOCODER_WAVEGLOW:
            with torch.no_grad():
                audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
                audio_t = self.denoiser(audio_t, 0.1)[0]
            audio = audio_t[0].data
        elif vocoder == VOCODER_GL:
            mel_decompress = self.taco_stft.spectral_de_normalize(
                mel_outputs_postnet)
            mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
            spec_from_mel_scaling = 1000
            spec_from_mel = torch.mm(mel_decompress[0],
                                     self.taco_stft.mel_basis)
            spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
            spec_from_mel = spec_from_mel * spec_from_mel_scaling
            spec_from_mel = (spec_from_mel.cuda()
                             if torch.cuda.is_available() else spec_from_mel)
            audio = griffin_lim(
                torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                self.taco_stft.stft_fn,
                GL_ITERS,
            )
            audio = audio.squeeze()
        else:
            raise ValueError("vocoder arg should be one of [wavglow|gl]")
        audio = audio.cpu().numpy()
        return audio

    def _synth_speech(self,
                      text,
                      speed: float = 1.0,
                      sample_rate: int = OUTPUT_SAMPLE_RATE):
        audio = self.synth_speech_array(text, VOCODER_WAVEGLOW)

        return postprocess_audio(
            audio,
            src_rate=self.hparams.sampling_rate,
            dst_rate=sample_rate,
            tempo=speed,
        )

    def _synth_speech_fast(self,
                           text,
                           speed: float = 1.0,
                           sample_rate: int = OUTPUT_SAMPLE_RATE):
        audio = self.synth_speech_array(text, VOCODER_GL)

        return postprocess_audio(
            audio,
            tempo=speed,
            src_rate=self.hparams.sampling_rate,
            dst_rate=sample_rate,
        )
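A minimal usage sketch (both checkpoint paths are hypothetical; module-level names such as OUTPUT_SAMPLE_RATE, VOCODER_WAVEGLOW, and the WAVEGLOW_CONFIG dict are assumed to be defined as in the original project):

tts = TTSModel("models/tacotron2_statedict.pt", "models/waveglow_256channels.pt")
wav = tts.synth_speech("The birch canoe slid on the smooth planks.")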
Example #5
def main(style, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength, args):
    #mel_files = files_to_list(mel_files)
    #print(mel_files)
    dataset = voice_dataset(dataBase={
        'ravdess': './our_data/ravdess',
        'cremad': './our_data/cremad'
    },
                            style=('happy', 'sad', 'angry'))
    #print(len(dataset.final_data['happy']))

    #sample = dataset.pick_one_random_sample('happy')
    styles = ['happy', 'sad', 'angry']
    with open('config.json') as f:
        data = f.read()
    config = json.loads(data)
    waveglow_config = config["waveglow_config"]
    model = WaveGlow(**waveglow_config)
    checkpoint_dict = torch.load('waveglow_256channels_universal_v5.pt',
                                 map_location='cpu')
    model_for_loading = checkpoint_dict['model']
    model.load_state_dict(model_for_loading.state_dict())
    model.cuda()
    waveglow = model
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O1")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    mel_extractor = Get_mel(1024, 256, 1024, args.sampling_rate, 0.0, 8000.0)

    vector_all = {}
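    # For each style, collect the time-averaged 8-dim latent of up to ~200 clips;
    # the per-clip vectors are kept individually rather than averaged.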
    for style in styles:
        files = dataset.final_data[style].copy()
        random.shuffle(files)

        vectors = []
        for i, (_, file_path) in enumerate(files):
            if i > 200:
                break
            try:
                audio, rate = load_wav_to_torch(file_path)
                if rate != sampling_rate:
                    audio = resampy.resample(audio.numpy(), rate,
                                             sampling_rate)
                    audio = torch.from_numpy(audio).float()
                #if audio.size(0) >= args.segment_length:
                #    max_audio_start = audio.size(0) - args.segment_length
                #    audio_start = random.randint(0, max_audio_start)
                #    audio = audio[audio_start:audio_start+args.segment_length]
                #else:
                #    audio = torch.nn.functional.pad(audio, (0, args.segment_length-audio.size(0)), 'constant').data
                mel = mel_extractor.get_mel(audio)
                audio = audio / MAX_WAV_VALUE

                mel = torch.autograd.Variable(mel.cuda().unsqueeze(0))
                audio = torch.autograd.Variable(audio.cuda().unsqueeze(0))
                audio = audio.half() if is_fp16 else audio
                mel = mel.half() if is_fp16 else mel
                outputs = waveglow((mel, audio))
                vectors.append(
                    outputs[0].squeeze(0).mean(1).detach().cpu().numpy())
                print(style, i)
            except Exception:
                continue

        vector_all[style] = vectors

    np.save('all_style_vector', vector_all)
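Because vector_all is a plain Python dict, reading the saved file back later needs allow_pickle (a usage note, not part of the original script):

vector_all = np.load('all_style_vector.npy', allow_pickle=True).item()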
Example #6
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config)
    checkpoint_dict = torch.load('waveglow_256channels_universal_v5.pt',
                                 map_location='cpu')
    model_for_loading = checkpoint_dict['model']
    model.load_state_dict(model_for_loading.state_dict())
    model.cuda()
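    # Training starts from the pretrained 256-channel universal checkpoint, so
    # this run fine-tunes that model rather than training from scratch.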

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=0,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
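    # When resuming from a checkpoint, skip the epochs already covered by the
    # restored iteration counter.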
    # ================ MAIN TRAINING LOOP ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))
            #print(outputs[0])
            #print(outputs[0].shape)

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1