Example #1
    def prepare_test(self, dim_spec, source_path=None, target_path=None):
        if source_path is None:
            source_path = "/mnt/lustre/dengkangle/cmu/datasets/audio/test/trump_02.wav"
        if target_path is None:
            target_path = "/mnt/lustre/dengkangle/cmu/datasets/audio/test/female.wav"
        # source_path = "/home/kangled/datasets/audio/Chaplin_01.wav"
        # target_path = "/home/kangled/datasets/audio/Obama_01.wav"

        # 80-band mel filterbank (keyword arguments are required in librosa >= 0.10)
        mel_basis80 = librosa.filters.mel(sr=hparams.sample_rate,
                                          n_fft=hparams.n_fft,
                                          n_mels=80)

        wav, sr = librosa.load(source_path, sr=hparams.sample_rate)
        wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
        linear_spec = np.abs(
            librosa.stft(wav,
                         n_fft=hparams.n_fft,
                         hop_length=hparams.hop_size,
                         win_length=hparams.win_size))
        mel_spec = mel_basis80.dot(linear_spec)
        mel_db = 20 * np.log10(mel_spec)
        # map [-120 dB, +5 dB] onto [0, 1]
        source_spec = np.clip((mel_db + 120) / 125, 0, 1)
        # source_spec = mel_spec

        # one-hot speaker embedding: [0, 1] marks the source speaker
        self.source_embed = torch.from_numpy(np.array([0, 1])).float().unsqueeze(0)
        self.source_wav = wav

        wav, sr = librosa.load(target_path, sr=hparams.sample_rate)
        wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
        linear_spec = np.abs(
            librosa.stft(wav,
                         n_fft=hparams.n_fft,
                         hop_length=hparams.hop_size,
                         win_length=hparams.win_size))
        mel_spec = mel_basis80.dot(linear_spec)
        mel_db = 20 * np.log10(mel_spec)
        target_spec = np.clip((mel_db + 120) / 125, 0, 1)
        # target_spec = mel_spec

        # one-hot speaker embedding: [1, 0] marks the target speaker
        self.target_embed = torch.from_numpy(np.array([1, 0])).float().unsqueeze(0)
        self.target_wav = wav

        self.source_spec = torch.Tensor(pad_seq(source_spec.T, hparams.freq)).unsqueeze(0)
        self.target_spec = torch.Tensor(pad_seq(target_spec.T, hparams.freq)).unsqueeze(0)

    def __init__(self, npy_path=None, wav_path=None, speaker_id=0, speaker_nums=2, sample_frames=128, length=-1):
        super(SampleDataset, self).__init__()
        if npy_path is not None:
            self.raw_data = np.load(npy_path)
            print('Loading ', npy_path, "\tshape:", self.raw_data.shape)

        elif wav_path is not None:
            print('Encoding ', wav_path)
            wav, sr = librosa.load(wav_path, sr=hparams.sample_rate)
            wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
            wav = wav / (np.abs(wav).max() * 1.1)
            self.wav = audio.encode_mu_law(wav, mu=2 ** hparams.bits)
    
            mel_basis = librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.n_fft, n_mels=hparams.num_mels)
            linear_spec = np.abs(
                librosa.stft(wav, n_fft=hparams.n_fft, hop_length=hparams.hop_size, win_length=hparams.win_size))
            mel_spec = mel_basis.dot(linear_spec)
            mel_db = 20 * np.log10(mel_spec)
            # print(in_fpath, mel_db.min(), mel_db.max())
            self.raw_data = np.clip((mel_db + 120) / 125, 0, 1)
            print('Raw_Data Shape:', self.raw_data.shape)
            # (num_mels, num_frames)
        else:
            raise ValueError("No data input: provide either npy_path or wav_path")
        # one-hot speaker vector
        self.speaker = np.zeros(speaker_nums)
        self.speaker[speaker_id % speaker_nums] = 1
        self.sample_frames = sample_frames
        if length > 0:
            self.length = length
        else:
            # default: one pass over the file, with a floor of 50 * 32 samples
            self.length = max(self.raw_data.shape[1] // sample_frames, 50 * 32)
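
Every snippet on this page repeats the same load -> preemphasis -> magnitude STFT -> mel -> dB -> clip chain. A minimal consolidation sketch (assuming the same hparams fields and the project's preemphasis helper; the function name is hypothetical):

def wav_to_normalized_mel(path, n_mels=80):
    # load -> preemphasis -> magnitude STFT -> mel -> dB -> clip to [0, 1]
    wav, _ = librosa.load(path, sr=hparams.sample_rate)
    wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
    linear_spec = np.abs(librosa.stft(wav, n_fft=hparams.n_fft,
                                      hop_length=hparams.hop_size,
                                      win_length=hparams.win_size))
    mel_basis = librosa.filters.mel(sr=hparams.sample_rate,
                                    n_fft=hparams.n_fft, n_mels=n_mels)
    mel_db = 20 * np.log10(np.maximum(mel_basis.dot(linear_spec), 1e-6))  # avoid log(0)
    return np.clip((mel_db + 120) / 125, 0, 1)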
Example #3
def voice_conversion(G, input_wavfile, parallel=True):
    source_path = input_wavfile
    wav, sr = librosa.load(source_path, sr=hparams.sample_rate)
    wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
    # mel_basis is a module-level global in the original file; defined here so the snippet runs standalone
    mel_basis = librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.n_fft, n_mels=hparams.num_mels)
    linear_spec = np.abs(
        librosa.stft(wav, n_fft=hparams.n_fft, hop_length=hparams.hop_size, win_length=hparams.win_size))
    mel_spec = mel_basis.dot(linear_spec)
    mel_db = 20 * np.log10(mel_spec)
    source_spec = np.clip((mel_db + 120) / 125, 0, 1)
    source_embed = torch.from_numpy(np.array([0, 1])).float()

    source_spec, _ = pad_seq(source_spec.T, hparams.freq)

    with torch.no_grad():
        # note: the original passes the source embedding for both speaker arguments
        s2t_spec = G.conversion(source_embed.unsqueeze(0), source_embed.unsqueeze(0),
                                torch.Tensor(source_spec).unsqueeze(0), device).cpu()

    if parallel:
        # batched vocoder generation: 8000-sample folds with 800-sample overlap
        s2t_wav = G.vocoder.generate(s2t_spec.transpose(1, 2), True, 8000, 800, mu_law=True)
    else:
        s2t_wav = G.vocoder.generate(s2t_spec.transpose(1, 2), False, None, None, mu_law=True)

    s2t_wav = inv_preemphasis(s2t_wav, hparams.preemphasis, hparams.preemphasize)
    # librosa.output.write_wav was removed in librosa 0.8; soundfile (import soundfile as sf) is the usual replacement
    sf.write(args.output_file, s2t_wav.astype(np.float32), hparams.sample_rate)
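
pad_seq is imported from elsewhere and not shown in these excerpts. Judging from the tuple unpacking above (source_spec, _ = pad_seq(...)), this file uses the AutoVC-style variant that pads the time axis to a multiple of hparams.freq and also returns the pad length; a sketch under that assumption (Example #1 apparently uses a variant that returns only the padded array):

def pad_seq(x, base=32):
    # pad a (num_frames, num_mels) array along the time axis to a multiple of `base`
    len_out = int(base * np.ceil(x.shape[0] / base))
    len_pad = len_out - x.shape[0]
    return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad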
    # --- separate fragment: remap legacy checkpoint keys before loading ---
    # (the loop header is elided in the excerpt; `newdict` is the checkpoint state_dict)
    for key in list(newdict.keys()):
        newkey = key
        if 'wavenet' in key:
            newdict[key.replace('wavenet', 'vocoder')] = newdict.pop(key)
            newkey = key.replace('wavenet', 'vocoder')
        if not args.multigpu and 'module' in key:
            newdict[newkey.replace('module.', '', 1)] = newdict.pop(newkey)
            newkey = newkey.replace('module.', '', 1)
        if newkey not in G.state_dict():
            newdict.pop(newkey)
    G.load_state_dict(newdict)
    print("AutoVC Model Loaded")

    wav, sr = librosa.load(args.wav_path, sr=hparams.sample_rate)

    mel_basis = librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.n_fft, n_mels=hparams.num_mels)
    wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
    linear_spec = np.abs(
        librosa.stft(wav, n_fft=hparams.n_fft, hop_length=hparams.hop_size, win_length=hparams.win_size))
    mel_spec = mel_basis.dot(linear_spec)
    mel_db = 20 * np.log10(mel_spec)
    # print(in_fpath, mel_db.min(), mel_db.max())
    test_data = np.clip((mel_db + 120) / 125, 0, 1)
    # pad_seq returns (padded, pad_len) in this file (see voice_conversion above)
    test_data, _ = pad_seq(test_data.T, hparams.freq)
    test_data = torch.Tensor(test_data).unsqueeze(0).to(device)

    speaker = torch.from_numpy(np.array([0, 1])).float()

    with torch.no_grad():
        mel_outputs_postnet, v_stage1, v_stage2 = G.generate(test_data, speaker, device)

    print(v_stage2.shape)
    # (frames, C, H, W) -> (frames, H, W, C) for GIF writing
    gif_list = list(v_stage2.squeeze().cpu().numpy().transpose(0, 2, 3, 1))
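
The key-remapping loop in this example is a common pattern when a checkpoint was saved under older module names ('wavenet' vs. 'vocoder') or wrapped in DataParallel (the 'module.' prefix). The same idea as a standalone, hypothetical helper:

def remap_state_dict(state_dict, model, rename=(('wavenet', 'vocoder'),), strip_module=True):
    # rename legacy keys, optionally drop the DataParallel 'module.' prefix,
    # and keep only keys that exist in the target model
    out = {}
    for key, value in state_dict.items():
        for old, new in rename:
            key = key.replace(old, new)
        if strip_module:
            key = key.replace('module.', '', 1)
        if key in model.state_dict():
            out[key] = value
    return out

# usage: G.load_state_dict(remap_state_dict(checkpoint_dict, G))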
Example #5
    def test_audiovideo(self, device, writer, niter):
        source_path = self.test_path

        mel_basis80 = librosa.filters.mel(sr=hparams.sample_rate,
                                          n_fft=hparams.n_fft,
                                          n_mels=80)

        wav, sr = librosa.load(source_path, sr=hparams.sample_rate)
        wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

        linear_spec = np.abs(
            librosa.stft(wav,
                         n_fft=hparams.n_fft,
                         hop_length=hparams.hop_size,
                         win_length=hparams.win_size))
        mel_spec = mel_basis80.dot(linear_spec)
        mel_db = 20 * np.log10(mel_spec)
        source_spec = np.clip((mel_db + 120) / 125, 0, 1)

        source_embed = torch.from_numpy(np.array([0, 1])).float().unsqueeze(0)
        source_wav = wav

        source_spec = torch.Tensor(pad_seq(source_spec.T, hparams.freq)).unsqueeze(0)
        # print(source_spec.shape)

        with torch.no_grad():
            generated_spec, v_mid, v_hat = self.generate(
                source_spec, source_embed, device)

        generated_spec, v_mid, v_hat = generated_spec.cpu(), v_mid.cpu(), v_hat.cpu()

        print("Generating Wavfile...")
        with torch.no_grad():
            if not self.multigpu:
                generated_wav = inv_preemphasis(
                    self.vocoder.generate(generated_spec.to(device).transpose(2, 1),
                                          False, None, None, mu_law=True),
                    hparams.preemphasis, hparams.preemphasize)
            else:
                # DataParallel wraps the vocoder, so generate() lives on .module
                generated_wav = inv_preemphasis(
                    self.vocoder.module.generate(generated_spec.to(device).transpose(2, 1),
                                                 False, None, None, mu_law=True),
                    hparams.preemphasis, hparams.preemphasize)

        # videos are in [-1, 1]; rescale to [0, 1] for TensorBoard
        writer.add_video('generated', (v_hat.numpy() + 1) / 2, global_step=niter)
        writer.add_video('mid', (v_mid.numpy() + 1) / 2, global_step=niter)
        writer.add_audio('ground_truth',
                         source_wav,
                         niter,
                         sample_rate=hparams.sample_rate)
        writer.add_audio('generated_wav',
                         generated_wav,
                         niter,
                         sample_rate=hparams.sample_rate)
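
preemphasis and inv_preemphasis come from the project's audio utilities and are not shown in these examples. Matching the call signatures used above, the usual Tacotron-style definitions look like this (a sketch, assuming scipy):

from scipy import signal

def preemphasis(wav, k, preemphasize=True):
    # high-pass filter y[n] = x[n] - k * x[n-1], applied before the STFT
    if preemphasize:
        return signal.lfilter([1, -k], [1], wav)
    return wav

def inv_preemphasis(wav, k, inv_preemphasize=True):
    # inverse filter, applied to vocoder output before writing audio
    if inv_preemphasize:
        return signal.lfilter([1], [1, -k], wav)
    return wav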