def save_world_wav(feats, model_name, filename):

    # feats = [f0, sp, ap, sp_coded, labels]

    if isinstance(feats[3], torch.Tensor):
        feats[3] = feats[3].cpu().numpy()
    if hp.normalise_mels:
        feats[3] = _unnormalise_coded_sp(feats[3])

    path = os.path.join(hp.sample_set_dir, model_name)

    if not os.path.exists(path):
        os.makedirs(path)

    path = os.path.join(path, filename)

    # print("Made path.")
    feats[3] = np.ascontiguousarray(feats[3], dtype=np.float64)
    # print("Made contiguous.")
    # print(feats[3].shape)
    decoded_sp = decode_spectral_envelope(feats[3], hp.sr, fft_size=hp.n_fft)
    # print("Decoded.")
    # f0_converted = norm.pitch_conversion(f0, speaker, target)
    wav = synthesize(feats[0], decoded_sp, feats[1], hp.sr)
    # Audio(wav,rate=hp.sr)
    # librosa.display.waveplot(y=wav, sr=hp.sr)
    # print("Sythesized wav.")
    save_wav(wav, path)
Example #2
def world_decode_spectral_envelop(coded_sp, fs):

    # Restore the previously encoded, dimension-reduced data to its original dimensionality
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    decoded_sp = pyworld.decode_spectral_envelope(coded_sp, fs, fftlen)

    return decoded_sp
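
A minimal round-trip sketch of how a helper like this might be used (the 36-dimension coding, the test tone, and the 16 kHz sample rate are illustrative assumptions, not taken from the snippet above):

import numpy as np
import pyworld

fs = 16000
x = 0.1 * np.sin(2 * np.pi * 220.0 * np.arange(fs) / fs)    # 1 s test tone, float64
f0, sp, ap = pyworld.wav2world(x, fs)                        # WORLD analysis
coded_sp = pyworld.code_spectral_envelope(sp, fs, 36)        # compress sp to 36 dims
coded_sp = np.ascontiguousarray(coded_sp, dtype=np.float64)  # pyworld expects contiguous float64
decoded_sp = world_decode_spectral_envelop(coded_sp, fs)     # back to fftlen // 2 + 1 bins
y = pyworld.synthesize(f0, decoded_sp, ap, fs)               # resynthesized waveform
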
Example #3
def world_decode_spectral_envelop(coded_sp, fs):
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    # coded_sp = coded_sp.astype(np.float32)
    # coded_sp = np.ascontiguousarray(coded_sp)
    decoded_sp = pyworld.decode_spectral_envelope(coded_sp, fs, fftlen)

    return decoded_sp
def world_decode_spectral_env(spectral_env_mel, settings):
    mfcc = dct(spectral_env_mel) / np.sqrt(settings['coded_dim'] * 2)
    fftlen = pyworld.get_cheaptrick_fft_size(settings['sample_rate'])
    spectral_env = pyworld.decode_spectral_envelope(mfcc,
                                                    settings['sample_rate'],
                                                    fftlen)
    return spectral_env
Example #5
def synthesis(ori_path, aim_sp, aim_spkid):
    print('synthesizing ...')
    wav, _ = librosa.load(ori_path, sr=hp.SR, mono=True, dtype=np.float64)
    f0, timeaxis = pw.harvest(wav, hp.SR)
    sp_per_timeaxis_before = pw.cheaptrick(wav,
                                           f0,
                                           timeaxis,
                                           hp.SR,
                                           fft_size=hp.N_FFT)  # N_FFT=1024 yields a 513-dim spectral envelope

    # ori_decoded_sp = pw.decode_spectral_envelope(ori_sp, hp.SR, fft_size=hp.N_FFT)

    # print('f0.shape = ')
    # print(f0)

    ap = pw.d4c(wav, f0, timeaxis, hp.SR, fft_size=hp.N_FFT)
    aim_decoded_sp = pw.decode_spectral_envelope(
        aim_sp, hp.SR, fft_size=hp.N_FFT)  # sp after conversion/decoding
    print('decoded 513-dim aim_decoded_sp = ')
    print(aim_decoded_sp.shape)
    print(aim_decoded_sp[399][:])

    synwav = pw.synthesize(f0, aim_decoded_sp, ap, hp.SR)
    print(f'synthesize done. path : ./convert_to_{aim_spkid}_test1.wav')
    librosa.output.write_wav(f'./convert_to_{aim_spkid}_test1.wav',
                             synwav,
                             sr=hp.SR)
Example #6
def convertFeaturesIntoWav(f0seq, MCEPseq, APseq, fs, frame_period=5.0):
    contNumpy_MCEPseq = np.ascontiguousarray(MCEPseq.T, dtype=np.float64)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    spectrogram = pyworld.decode_spectral_envelope(contNumpy_MCEPseq, fs,
                                                   fftlen)
    # print(f"dtypes. f0seq:{f0seq.dtype}, spectrogram:{spectrogram.dtype}, APseq:{APseq.dtype}")
    wav = pyworld.synthesize(f0seq, spectrogram, APseq, fs, frame_period)
    return wav.astype(np.float32)
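
A hedged usage sketch for convertFeaturesIntoWav above (the test tone, the 16 kHz rate, and the 36-dim MCEP coding are assumptions; note the helper expects MCEPseq with shape (dims, frames)):

import numpy as np
import pyworld

fs = 16000
x = 0.1 * np.sin(2 * np.pi * 220.0 * np.arange(fs) / fs)  # 1 s test tone, float64
f0seq, t = pyworld.harvest(x, fs, frame_period=5.0)        # F0 contour
sp = pyworld.cheaptrick(x, f0seq, t, fs)                   # spectral envelope
APseq = pyworld.d4c(x, f0seq, t, fs)                       # aperiodicity
MCEPseq = pyworld.code_spectral_envelope(sp, fs, 36).T     # (dims, frames), as expected above
wav = convertFeaturesIntoWav(f0seq, MCEPseq, APseq, fs, frame_period=5.0)
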
def world_decode_spectral_envelop(coded_sp, fs):

    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    #coded_sp = coded_sp.astype(np.float32)
    #coded_sp = np.ascontiguousarray(coded_sp)
    decoded_sp = pyworld.decode_spectral_envelope(coded_sp, fs, fftlen)

    return decoded_sp
Example #8
    def test(self):
        """Translate speech using StarGAN ."""
        # Load the trained generator.
        self.restore_model(self.test_iters)
        norm = Normalizer()

        # Set data loader.
        d, speaker = TestSet(self.test_dir).test_data(self.src_speaker)
        targets = self.trg_speaker

        for target in targets:
            print(target)
            assert target in speakers
            label_t = self.spk_enc.transform([target])[0]
            if label_t == [0]: label_t = [1, 0]
            elif label_t == [1]: label_t = [0, 1]
            label_t = np.asarray([label_t])

            with torch.no_grad():

                for filename, content in d.items():
                    f0 = content['f0']
                    ap = content['ap']
                    sp_norm_pad = self.pad_coded_sp(content['coded_sp_norm'])

                    convert_result = []
                    for start_idx in range(0,
                                           sp_norm_pad.shape[1] - FRAMES + 1,
                                           FRAMES):
                        one_seg = sp_norm_pad[:, start_idx:start_idx + FRAMES]

                        one_seg = torch.FloatTensor(one_seg).to(self.device)
                        one_seg = one_seg.view(1, 1, one_seg.size(0),
                                               one_seg.size(1))
                        l = torch.FloatTensor(label_t)
                        one_seg = one_seg.to(self.device)
                        l = l.to(self.device)
                        one_set_return = self.G(one_seg, l).data.cpu().numpy()
                        one_set_return = np.squeeze(one_set_return)
                        one_set_return = norm.backward_process(
                            one_set_return, target)
                        convert_result.append(one_set_return)

                    convert_con = np.concatenate(convert_result, axis=1)
                    convert_con = convert_con[:, 0:content['coded_sp_norm'].
                                              shape[1]]
                    contigu = np.ascontiguousarray(convert_con.T,
                                                   dtype=np.float64)
                    decoded_sp = decode_spectral_envelope(contigu,
                                                          SAMPLE_RATE,
                                                          fft_size=FFTSIZE)
                    f0_converted = norm.pitch_conversion(f0, speaker, target)
                    wav = synthesize(f0_converted, decoded_sp, ap, SAMPLE_RATE)

                    name = f'{speaker}-{target}_iter{self.test_iters}_{filename}'
                    path = os.path.join(self.result_dir, name)
                    print(f'[save]:{path}')
                    librosa.output.write_wav(path, wav, SAMPLE_RATE)
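
norm.pitch_conversion used above is not shown in this snippet; in voice-conversion code it is commonly a log-Gaussian normalized F0 transform. A standalone sketch under that assumption (the per-speaker log-F0 statistics are passed explicitly here, whereas the Normalizer presumably looks them up by speaker name):

import numpy as np

def log_gaussian_pitch_conversion(f0, mean_log_src, std_log_src, mean_log_trg, std_log_trg):
    """Shift an F0 contour from the source speaker's log-F0 distribution to the target's."""
    f0 = np.asarray(f0, dtype=np.float64)
    converted = np.zeros_like(f0)
    voiced = f0 > 0.0  # unvoiced frames (f0 == 0) are left at zero
    converted[voiced] = np.exp(
        (np.log(f0[voiced]) - mean_log_src) / std_log_src * std_log_trg + mean_log_trg)
    return converted
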
Example #9
def inv_world_spectrogram(f0, sp, ap, sr=_sr, **kwargs):
    """world声码器频谱转为语音。"""
    frame_period = kwargs.get("frame_period", pw.default_frame_period)
    f0_floor = kwargs.get("f0_floor", pw.default_f0_floor)
    fft_size = kwargs.get("fft_size", pw.get_cheaptrick_fft_size(sr, f0_floor))
    sp_dec = pw.decode_spectral_envelope(sp, sr, fft_size=fft_size)
    ap_dec = pw.decode_aperiodicity(ap, sr, fft_size=fft_size)
    y = pw.synthesize(f0, sp_dec, ap_dec, sr, frame_period=frame_period)
    return y
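
For symmetry, a hedged sketch of the encoding side that produces the coded sp/ap this function decodes (the function name, the 40-dimension sp coding, and the reuse of the module-level pw and _sr are assumptions):

import numpy as np

def world_spectrogram(wav, sr=_sr, sp_dims=40, **kwargs):
    """Encode a waveform into the (f0, coded sp, coded ap) triple that inv_world_spectrogram expects."""
    frame_period = kwargs.get("frame_period", pw.default_frame_period)
    wav = np.ascontiguousarray(wav, dtype=np.float64)
    f0, t = pw.harvest(wav, sr, frame_period=frame_period)
    sp = pw.cheaptrick(wav, f0, t, sr)
    ap = pw.d4c(wav, f0, t, sr)
    coded_sp = pw.code_spectral_envelope(sp, sr, sp_dims)
    coded_ap = pw.code_aperiodicity(ap, sr)
    return f0, coded_sp, coded_ap
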
Example #10
def mcep2wav(mcep, f0, ap):
    f0 = f0.astype(np.float64)
    ap = ap.astype(np.float64)
    mcep = mcep.astype(np.float64)
    decoded_sp = pyworld.decode_spectral_envelope(mcep,
                                                  sampling_rate,
                                                  fft_size=n_fft)
    wav = pyworld.synthesize(f0, decoded_sp, ap, sampling_rate)
    return wav
Example #11
def worldDecodeSpectralEnvelop(coded_sp: np.ndarray,
                               fs: int = SAMPLE_RATE) -> np.ndarray:
    '''
    Convert MCEPs back into a spectral envelope.

    Parameters
    ----------
    coded_sp: np.ndarray
        MCEP data
    fs: int, default SAMPLE_RATE
        Sampling frequency

    Returns
    -------
    decoded_sp: np.ndarray
        Spectral envelope
    '''
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    decoded_sp = pyworld.decode_spectral_envelope(coded_sp, fs, fftlen)
    return decoded_sp
def save_world_wav(feats, filename):

    # feats = [f0, sp, ap, sp_coded, labels]

    if isinstance(feats[3], torch.Tensor):
        feats[3] = feats[3].cpu().numpy()
    if hp.normalise:
        feats[3] = _unnormalise_coded_sp(feats[3])

    # path = os.path.join(hp.sample_set_dir, model_name)

    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))

    # path = os.path.join(path, filename)

    feats[3] = np.ascontiguousarray(feats[3], dtype=np.float64)
    decoded_sp = decode_spectral_envelope(feats[3], hp.sr, fft_size=hp.n_fft)
    wav = synthesize(feats[0], decoded_sp, feats[1], hp.sr)

    save_wav(wav, filename)
Example #13
def synthesis(ori_path, aim_sp, aim_spkid):
    print('synthesizing ...')
    wav, _ = librosa.load(ori_path, sr=hp.SR, mono=True, dtype=np.float64)
    f0, timeaxis = pw.harvest(wav, hp.SR, frame_period=10)
    sp_per_timeaxis_before = pw.cheaptrick(wav,
                                           f0,
                                           timeaxis,
                                           hp.SR,
                                           fft_size=hp.N_FFT)  # N_FFT=1024 yields a 513-dim spectral envelope

    ap = pw.d4c(wav, f0, timeaxis, hp.SR, fft_size=hp.N_FFT)
    aim_decoded_sp = pw.decode_spectral_envelope(
        aim_sp, hp.SR, fft_size=hp.N_FFT)  # decoded sp after conversion: dimensionality goes from 60 to 513

    print('line23: f0.shape = ' + str(f0.shape) + 'aim_decoded_sp.shape = ' +
          str(aim_decoded_sp.shape) + 'ap.shape = ' + str(ap.shape))
    print('\n line26 : aim_sp.shape = ' + str(aim_sp.shape))

    synwav = pw.synthesize(f0, aim_decoded_sp, ap, hp.SR)
    print(f'synthesize done. path : ./convert_to_{aim_spkid}_test1.wav')
    librosa.output.write_wav(f'./convert_to_{aim_spkid}_test1.wav',
                             synwav,
                             sr=hp.SR)
Example #14
def world_decode_spectral_envelop(coded_sp, fs):
    # Decode Mel-cepstral to sp
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    decoded_sp = pyworld.decode_spectral_envelope(coded_sp, fs, fftlen)

    return decoded_sp
Example #15
    def train(self):
        # Learning rate cache for decaying.
        g_lr = self.g_lr
        d_lr = self.d_lr
        c_lr = self.c_lr
        # Start from iteration 0.
        start_iters = 0
        # Skip if resuming from a checkpoint.
        if self.resume_iters:
            pass
        # Instantiate the custom normalizer.
        norm = Normalizer()
        # iter() builds an iterator; here it is used to step through the data loader.
        data_iter = iter(self.data_loader)
        print('Start training......')
        # Record the current time.
        start_time = datetime.now()
        # Loop over the total number of iterations.
        for i in range(start_iters, self.num_iters):
            # =================================================================================== #
            #                             1. Preprocess input data                                #
            # =================================================================================== #
            # Fetch real samples and the corresponding labels.
            try:
                # next() advances the iterator.
                # The custom loader yields the real input, the speaker index, and the source label.
                x_real, speaker_idx_org, label_org = next(data_iter)
            except:
                # If the iterator is exhausted, rebuild it and fetch again.
                data_iter = iter(self.data_loader)
                x_real, speaker_idx_org, label_org = next(data_iter)

            # Generate target domain labels randomly.
            # torch.randperm returns a random permutation of 0..n-1;
            # the labels are binarized, so there are label_org.size(0) of them,
            # and this yields random indices into the batch.
            rand_idx = torch.randperm(label_org.size(0))
            # Index the source labels with the random permutation to get the target labels.
            label_trg = label_org[rand_idx]
            # Likewise pick the random target speakers.
            speaker_idx_trg = speaker_idx_org[rand_idx]
            # .to() places the tensors on the CPU or GPU.
            x_real = x_real.to(self.device)  # Input data.
            label_org = label_org.to(self.device)  # Original domain one-hot labels.
            label_trg = label_trg.to(self.device)  # Target domain one-hot labels.
            speaker_idx_org = speaker_idx_org.to(self.device)  # Original domain labels.
            speaker_idx_trg = speaker_idx_trg.to(self.device)  # Target domain labels.

            # =================================================================================== #
            #                             2. Train the discriminator                              #
            # =================================================================================== #
            # Compute loss with real audio frames.
            # nn.CrossEntropyLoss combines log-softmax and NLLLoss into a single criterion.
            CELoss = nn.CrossEntropyLoss()
            # Run the classifier on the real data.
            cls_real = self.C(x_real)
            # Domain-classification loss on real data, i.e. cross entropy.
            cls_loss_real = CELoss(input=cls_real, target=speaker_idx_org)
            # Reset the gradient buffers (implementation below).
            self.reset_grad()
            # tensor.backward() runs autograd.
            cls_loss_real.backward()
            # optimizer.step() updates the parameters from the gradients computed by backward().
            self.c_optimizer.step()
            # Logging.
            loss = {}
            # item() extracts the scalar value from a one-element tensor.
            loss['C/C_loss'] = cls_loss_real.item()

            # Discriminator output on the real data.
            out_r = self.D(x_real, label_org)
            # Compute loss with fake audio frames.
            # Generate fake samples from the real samples and the target labels.
            x_fake = self.G(x_real, label_trg)
            # detach() cuts the gradient flow so this pass does not backpropagate into G.
            # Discriminator output on the generated samples.
            out_f = self.D(x_fake.detach(), label_trg)
            # F.binary_cross_entropy_with_logits measures the binary cross entropy between the
            # target and the output logits; the target must have the same shape as the input.
            # The "_with_logits" variant applies the sigmoid internally, so the inputs need not
            # be mapped to [0, 1] beforehand.
            # torch.zeros_like / torch.ones_like return tensors of the same shape filled with 0 / 1.
            d_loss_t = F.binary_cross_entropy_with_logits(input=out_f,target=torch.zeros_like(out_f, dtype=torch.float)) + \
                F.binary_cross_entropy_with_logits(input=out_r, target=torch.ones_like(out_r, dtype=torch.float))
            # Classification result for the generated samples.
            out_cls = self.C(x_fake)
            # Domain-classification loss for the generated samples (cross entropy).
            d_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

            # Compute loss for the gradient penalty.
            alpha = torch.rand(x_real.size(0), 1, 1, 1).to(self.device)
            # Build x_hat by mixing real and fake samples with a random weight;
            # requires_grad_(True) marks it so gradients can be taken with respect to it.
            x_hat = (alpha * x_real.data +
                     (1 - alpha) * x_fake.data).requires_grad_(True)
            # Discriminator output for the mixed samples and the target labels.
            out_src = self.D(x_hat, label_trg)
            # Gradient penalty term computed by the helper method.
            d_loss_gp = self.gradient_penalty(out_src, x_hat)
            # Total discriminator loss.
            d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5 * d_loss_gp
            # Reset the gradient buffers.
            self.reset_grad()
            # Backpropagate the discriminator loss.
            d_loss.backward()
            # Update the discriminator parameters.
            self.d_optimizer.step()

            # loss['D/d_loss_t'] = d_loss_t.item()
            # loss['D/loss_cls'] = d_loss_cls.item()
            # loss['D/D_gp'] = d_loss_gp.item()
            # Log the discriminator loss.
            loss['D/D_loss'] = d_loss.item()

            # =================================================================================== #
            #                               3. Train the generator                                #
            # =================================================================================== #
            # Only update the generator every n_critic iterations.
            if (i + 1) % self.n_critic == 0:
                # Original-to-target domain.
                # Generate fake samples from the real samples and the target labels.
                x_fake = self.G(x_real, label_trg)
                # Discriminate the generated samples against the target labels.
                g_out_src = self.D(x_fake, label_trg)
                # Adversarial loss for G: cross entropy against an all-ones target.
                g_loss_fake = F.binary_cross_entropy_with_logits(
                    input=g_out_src,
                    target=torch.ones_like(g_out_src, dtype=torch.float))
                # Class predicted by the domain classifier for the real samples.
                out_cls = self.C(x_real)
                # Classification loss for G: cross entropy between the prediction and the source class.
                g_loss_cls = CELoss(input=out_cls, target=speaker_idx_org)

                # Target-to-original domain.
                # Map the fake samples back to the source label with G.
                x_reconst = self.G(x_fake, label_org)
                # Cycle-consistency loss: the reconstruction should match the original input.
                # l1_loss is the mean absolute error.
                g_loss_rec = F.l1_loss(x_reconst, x_real)

                # Original-to-original domain (identity loss).
                # Generating from the real samples with the source label should reproduce x_real.
                x_fake_iden = self.G(x_real, label_org)
                # L1 loss between the identity mapping and the input.
                id_loss = F.l1_loss(x_fake_iden, x_real)

                # Backward and optimize.
                # Total generator loss.
                g_loss = g_loss_fake + self.lambda_cycle * g_loss_rec +\
                 self.lambda_cls * g_loss_cls + self.lambda_identity * id_loss
                # Reset the gradient buffers.
                self.reset_grad()
                # Backpropagate the generator loss.
                g_loss.backward()
                # Update the generator parameters.
                self.g_optimizer.step()

                # Log the corresponding losses.
                loss['G/loss_fake'] = g_loss_fake.item()
                loss['G/loss_rec'] = g_loss_rec.item()
                loss['G/loss_cls'] = g_loss_cls.item()
                loss['G/loss_id'] = id_loss.item()
                loss['G/g_loss'] = g_loss.item()
            # =================================================================================== #
            #                                 4. Miscellaneous                                    #
            # =================================================================================== #
            # Print out training information.
            if (i + 1) % self.log_step == 0:
                # Elapsed training time.
                et = datetime.now() - start_time
                # Drop the sub-second part of the string.
                et = str(et)[:-7]
                # Elapsed time and iteration count.
                log = "Elapsed [{}], Iteration [{}/{}]".format(et, i + 1, self.num_iters)
                # Append each loss value.
                for tag, value in loss.items():
                    log += ", {}: {:.4f}".format(tag, value)
                print(log)
                # If tensorboard is enabled, record the training progress.
                if self.use_tensorboard:
                    for tag, value in loss.items():
                        # Add to the log.
                        self.logger.scalar_summary(tag, value, i + 1)

            # Translate fixed data for debugging.
            if (i + 1) % self.sample_step == 0:
                # torch.no_grad is a context manager; operations inside it are not tracked by
                # autograd, so no gradients are stored and memory use is greatly reduced.
                with torch.no_grad():
                    # Build the test set and pick a random speaker as test data.
                    d, speaker = TestSet(self.test_dir).test_data()
                    # random.choice returns a random element;
                    # pick a speaker from speakers that is not the source speaker.
                    target = random.choice(
                        [x for x in speakers if x != speaker])
                    # LabelBinarizer.transform converts class labels to binary labels;
                    # take the first row as the target label.
                    label_t = self.spk_enc.transform([target])[0]
                    # np.asarray builds a numpy array from a native Python list.
                    label_t = np.asarray([label_t])
                    # Iterate over the filenames and contents in the dictionary.
                    for filename, content in d.items():
                        f0 = content['f0']
                        ap = content['ap']
                        # Pad the coded spectral envelope with the helper method.
                        sp_norm_pad = self.pad_coded_sp(
                            content['coded_sp_norm'])

                        convert_result = []
                        for start_idx in range(
                                0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                            one_seg = sp_norm_pad[:,
                                                  start_idx:start_idx + FRAMES]

                            one_seg = torch.FloatTensor(one_seg).to(
                                self.device)
                            one_seg = one_seg.view(1, 1, one_seg.size(0),
                                                   one_seg.size(1))
                            l = torch.FloatTensor(label_t)
                            one_seg = one_seg.to(self.device)
                            l = l.to(self.device)
                            one_set_return = self.G(one_seg,
                                                    l).data.cpu().numpy()
                            one_set_return = np.squeeze(one_set_return)
                            one_set_return = norm.backward_process(
                                one_set_return, target)
                            convert_result.append(one_set_return)

                        convert_con = np.concatenate(convert_result, axis=1)
                        convert_con = convert_con[:,
                                                  0:content['coded_sp_norm'].
                                                  shape[1]]
                        contigu = np.ascontiguousarray(convert_con.T,
                                                       dtype=np.float64)
                        decoded_sp = decode_spectral_envelope(contigu,
                                                              SAMPLE_RATE,
                                                              fft_size=FFTSIZE)
                        f0_converted = norm.pitch_conversion(
                            f0, speaker, target)
                        wav = synthesize(f0_converted, decoded_sp, ap,
                                         SAMPLE_RATE)

                        name = f'{speaker}-{target}_iter{i+1}_{filename}'
                        path = os.path.join(self.sample_dir, name)
                        print(f'[save]:{path}')
                        librosa.output.write_wav(path, wav, SAMPLE_RATE)

            # Save model checkpoints.
            if (i + 1) % self.model_save_step == 0:
                G_path = os.path.join(self.model_save_dir,
                                      '{}-G.ckpt'.format(i + 1))
                D_path = os.path.join(self.model_save_dir,
                                      '{}-D.ckpt'.format(i + 1))
                C_path = os.path.join(self.model_save_dir,
                                      '{}-C.ckpt'.format(i + 1))
                torch.save(self.G.state_dict(), G_path)
                torch.save(self.D.state_dict(), D_path)
                torch.save(self.C.state_dict(), C_path)
                print('Saved model checkpoints into {}...'.format(
                    self.model_save_dir))

            # Decay learning rates.
            if (i + 1) % self.lr_update_step == 0 and (i + 1) > (
                    self.num_iters - self.num_iters_decay):
                g_lr -= (self.g_lr / float(self.num_iters_decay))
                d_lr -= (self.d_lr / float(self.num_iters_decay))
                c_lr -= (self.c_lr / float(self.num_iters_decay))
                self.update_lr(g_lr, d_lr, c_lr)
                print('Decayed learning rates, g_lr: {}, d_lr: {}.'.format(
                    g_lr, d_lr))
Example #16
def main(args):
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    #x, fs = sf.read('utterance/vaiueo2d.wav')
    x, fs = sf.read('utterance/p226_002.wav')
    # x, fs = librosa.load('utterance/vaiueo2d.wav', dtype=np.float64)

    # 1. A convenient way
    f0, sp, ap = pw.wav2world(x, fs)  # use default options
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x,
                    fs,
                    f0_floor=50.0,
                    f0_ceil=600.0,
                    channels_in_octave=2,
                    frame_period=args.frame_period,
                    speed=args.speed)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_without_f0_refinement.wav', _y, fs)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_with_f0_refinement.wav', y, fs)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    # librosa.output.write_wav('test/y_harvest_with_f0_refinement.wav', y_h, fs)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # 2-4 DIO with F0 refinement (using Stonemask). Code and restore sp, ap.
    code_sp = pw.code_spectral_envelope(sp, fs, 80)
    code_ap = pw.code_aperiodicity(ap, fs)
    fft_size = (sp.shape[1] - 1) * 2
    rest_sp = pw.decode_spectral_envelope(code_sp, fs, fft_size)
    rest_ap = pw.decode_aperiodicity(code_ap, fs, fft_size)
    y_r = pw.synthesize(f0, rest_sp, rest_ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement_code_and_restore.wav', y_r, fs)
    print("fft size: {:d}".format(fft_size))
    print("coded sp shape: ({:d}, {:d})".format(code_sp.shape[0],
                                                code_sp.shape[1]))
    print("coded ap shape: ({:d}, {:d})".format(code_ap.shape[0],
                                                code_ap.shape[1]))

    # 2-5 DIO with F0 refinement (using Stonemask). Code and restore sp, ap. frame_shift: 12.5 ms, frame_length: 50.0 ms
    f0_xx, t_xx = pw.dio(x,
                         fs,
                         f0_floor=50.0,
                         f0_ceil=600.0,
                         channels_in_octave=2,
                         frame_period=12.5,
                         speed=args.speed)
    f0_xx = pw.stonemask(x, f0_xx, t_xx, fs)
    sp_xx = pw.cheaptrick(x, f0_xx, t_xx, fs)
    ap_xx = pw.d4c(x, f0_xx, t_xx, fs)
    code_sp_xx = pw.code_spectral_envelope(sp_xx, fs, 80)
    code_ap_xx = pw.code_aperiodicity(ap_xx, fs)
    fft_size = (sp_xx.shape[1] - 1) * 2
    rest_sp_xx = pw.decode_spectral_envelope(code_sp_xx, fs, fft_size)
    rest_ap_xx = pw.decode_aperiodicity(code_ap_xx, fs, fft_size)
    y_r_xx = pw.synthesize(f0_xx, rest_sp_xx, rest_ap_xx, fs, 12.5)
    sf.write(
        'test/y_with_f0_refinement_code_and_restore_frame_period_12.5.wav',
        y_r_xx, fs)
    print("coded sp_xx shape: ({:d}, {:d})".format(code_sp_xx.shape[0],
                                                   code_sp_xx.shape[1]))
    print("coded ap_xx shape: ({:d}, {:d})".format(code_ap_xx.shape[0],
                                                   code_ap_xx.shape[1]))

    # Comparison
    savefig('test/wavform.png', [x, _y, y, y_h, y_r, y_r_xx])
    savefig('test/sp.png', [_sp, sp, sp_h, rest_sp, rest_sp_xx])
    savefig('test/ap.png', [_ap, ap, ap_h, rest_ap, rest_ap_xx], log=False)
    savefig('test/f0.png', [_f0, f0, f0_h, f0_xx])

    print('Please check "test" directory for output files')
def decode_spectral_envelop(coded_spect, sampling_rate):
    fftlen = pyworld.get_cheaptrick_fft_size(sampling_rate)
    decoded_spect = pyworld.decode_spectral_envelope(coded_spect,
                                                     sampling_rate, fftlen)
    return decoded_spect
Example #18
sp = pw.cheaptrick(x, f0, t, fs)  # extract smoothed spectrogram
ap = pw.d4c(x, f0, t, fs)  # extract aperiodicity
end = timer()
print('Feature Extraction:', end - start, 'seconds')

# f0_new
from copy import deepcopy  # to avoid call by reference!!
f0_new = deepcopy(f0)  # 1-58 59-138 139-198 // 269-360 // 429-522
f0_new[1:198] = np.flip(f0_new[1:198], 0)  # reverse pitch
f0_new[269:360] = f0_new[269:360] + 62  #E(330hz) -> G (392hz)
f0_new[429:522] = f0_new[429:522] + 193  #E(330hz) -> G(523hz)

#%% reduce dimension of spectral envelope and aperiodicity.
enc_sp = pw.code_spectral_envelope(sp, fs, number_of_dimensions=32)
dec_sp = pw.decode_spectral_envelope(enc_sp,
                                     fs,
                                     fft_size=(sp.shape[1] - 1) * 2)

enc_ap = pw.code_aperiodicity(ap, fs)
dec_ap = pw.decode_aperiodicity(enc_ap, fs, fft_size=(ap.shape[1] - 1) * 2)

#%%
y = pw.synthesize(f0, sp, ap, fs)
librosa.output.write_wav('y_EyesNose_short_resynthesis.wav', y, fs)
#%%
y = pw.synthesize(f0, dec_sp, ap, fs)
librosa.output.write_wav('y_EyesNose_short_resynthesis_sp_decode_32.wav', y,
                         fs)

#%% synthesis using new f0
y = pw.synthesize(f0_new, sp, ap, fs)
Example #19
    def train(self):
        # Learning rate cache for decaying.
        g_lr = self.g_lr
        d_lr = self.d_lr
        c_lr = self.c_lr

        start_iters = 0
        if self.resume_iters:
            pass

        norm = Normalizer()
        data_iter = iter(self.data_loader)

        print("Start training......")
        start_time = datetime.now()

        for i in range(start_iters, self.num_iters):
            # Preprocess input data
            # Fetch real images and labels.
            try:
                x_real, speaker_idx_org, label_org = next(data_iter)
            except:
                data_iter = iter(self.data_loader)
                x_real, speaker_idx_org, label_org = next(data_iter)

            # Generate target domain labels randomly.
            rand_idx = flow.randperm(label_org.size(0))
            label_trg = label_org[rand_idx]
            speaker_idx_trg = speaker_idx_org[rand_idx]

            x_real = x_real.to(self.device)
            # Original domain one-hot labels.
            label_org = label_org.to(self.device)
            # Target domain one-hot labels.
            label_trg = label_trg.to(self.device)
            speaker_idx_org = speaker_idx_org.to(self.device)
            speaker_idx_trg = speaker_idx_trg.to(self.device)

            # Train the discriminator
            # Compute loss with real audio frame.
            CELoss = nn.CrossEntropyLoss()
            cls_real = self.C(x_real)
            cls_loss_real = CELoss(input=cls_real, target=speaker_idx_org)

            self.reset_grad()
            cls_loss_real.backward()
            self.c_optimizer.step()
            # Logging.
            loss = {}
            loss["C/C_loss"] = cls_loss_real.item()

            out_r = self.D(x_real, label_org)
            # Compute loss with fake audio frame.
            x_fake = self.G(x_real, label_trg)
            out_f = self.D(x_fake.detach(), label_trg)
            d_loss_t = nn.BCEWithLogitsLoss()(
                input=out_f, target=flow.zeros_like(
                    out_f).float()) + nn.BCEWithLogitsLoss()(
                        input=out_r, target=flow.ones_like(out_r).float())

            out_cls = self.C(x_fake)
            d_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

            # Compute loss for gradient penalty.
            alpha = flow.rand(x_real.size(0), 1, 1, 1).to(self.device)
            x_hat = ((alpha * x_real +
                      (1 - alpha) * x_fake).detach().requires_grad_(True))
            out_src = self.D(x_hat, label_trg)

            # TODO: Second-order derivation is not currently supported in oneflow, so gradient penalty cannot be used temporarily.
            if self.use_gradient_penalty:
                d_loss_gp = self.gradient_penalty(out_src, x_hat)
                d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5 * d_loss_gp
            else:
                d_loss = d_loss_t + self.lambda_cls * d_loss_cls

            self.reset_grad()
            d_loss.backward()
            self.d_optimizer.step()

            loss["D/D_loss"] = d_loss.item()

            # Train the generator
            if (i + 1) % self.n_critic == 0:
                # Original-to-target domain.
                x_fake = self.G(x_real, label_trg)
                g_out_src = self.D(x_fake, label_trg)
                g_loss_fake = nn.BCEWithLogitsLoss()(
                    input=g_out_src, target=flow.ones_like(g_out_src).float())

                out_cls = self.C(x_real)
                g_loss_cls = CELoss(input=out_cls, target=speaker_idx_org)

                # Target-to-original domain.
                x_reconst = self.G(x_fake, label_org)
                g_loss_rec = nn.L1Loss()(x_reconst, x_real)

                # Original-to-Original domain(identity).
                x_fake_iden = self.G(x_real, label_org)
                id_loss = nn.L1Loss()(x_fake_iden, x_real)

                # Backward and optimize.
                g_loss = (g_loss_fake + self.lambda_cycle * g_loss_rec +
                          self.lambda_cls * g_loss_cls +
                          self.lambda_identity * id_loss)

                self.reset_grad()
                g_loss.backward()
                self.g_optimizer.step()

                # Logging.
                loss["G/loss_fake"] = g_loss_fake.item()
                loss["G/loss_rec"] = g_loss_rec.item()
                loss["G/loss_cls"] = g_loss_cls.item()
                loss["G/loss_id"] = id_loss.item()
                loss["G/g_loss"] = g_loss.item()

            # Miscellaneous
            # Print out training information.
            if (i + 1) % self.log_step == 0:
                et = datetime.now() - start_time
                et = str(et)[:-7]
                log = "Elapsed [{}], Iteration [{}/{}]".format(
                    et, i + 1, self.num_iters)
                for tag, value in loss.items():
                    log += ", {}: {:.4f}".format(tag, value)
                print(log)

            # Translate fixed images for debugging.
            if (i + 1) % self.sample_step == 0:
                with flow.no_grad():
                    d, speaker = TestSet(self.test_dir).test_data()
                    target = random.choice(
                        [x for x in speakers if x != speaker])
                    label_t = self.spk_enc.transform([target])[0]
                    label_t = np.asarray([label_t])

                    for filename, content in d.items():
                        f0 = content["f0"]
                        ap = content["ap"]
                        sp_norm_pad = self.pad_coded_sp(
                            content["coded_sp_norm"])

                        convert_result = []
                        for start_idx in range(
                                0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                            one_seg = sp_norm_pad[:,
                                                  start_idx:start_idx + FRAMES]

                            one_seg = flow.Tensor(one_seg).to(self.device)
                            one_seg = one_seg.view(1, 1, one_seg.size(0),
                                                   one_seg.size(1))
                            l = flow.Tensor(label_t)
                            one_seg = one_seg.to(self.device)
                            l = l.to(self.device)
                            one_set_return = self.G(one_seg,
                                                    l).detach().cpu().numpy()
                            one_set_return = np.squeeze(one_set_return)
                            one_set_return = norm.backward_process(
                                one_set_return, target)
                            convert_result.append(one_set_return)

                        convert_con = np.concatenate(convert_result, axis=1)
                        convert_con = convert_con[:,
                                                  0:content["coded_sp_norm"].
                                                  shape[1]]
                        contigu = np.ascontiguousarray(convert_con.T,
                                                       dtype=np.float64)
                        decoded_sp = decode_spectral_envelope(contigu,
                                                              SAMPLE_RATE,
                                                              fft_size=FFTSIZE)
                        f0_converted = norm.pitch_conversion(
                            f0, speaker, target)
                        wav = synthesize(f0_converted, decoded_sp, ap,
                                         SAMPLE_RATE)

                        name = f"{speaker}-{target}_iter{i+1}_{filename}"
                        path = os.path.join(self.sample_dir, name)
                        print(f"[save]:{path}")
                        sf.write(path, wav, SAMPLE_RATE)

            # Save model checkpoints.
            if (i + 1) % self.model_save_step == 0:
                G_path = os.path.join(self.model_save_dir,
                                      "{}-G".format(i + 1))
                D_path = os.path.join(self.model_save_dir,
                                      "{}-D".format(i + 1))
                C_path = os.path.join(self.model_save_dir,
                                      "{}-C".format(i + 1))
                flow.save(self.G.state_dict(), G_path)
                flow.save(self.D.state_dict(), D_path)
                flow.save(self.C.state_dict(), C_path)
                print("Saved model checkpoints into {}...".format(
                    self.model_save_dir))

            # Decay learning rates.
            if (i + 1) % self.lr_update_step == 0 and (i + 1) > (
                    self.num_iters - self.num_iters_decay):
                g_lr -= self.g_lr / float(self.num_iters_decay)
                d_lr -= self.d_lr / float(self.num_iters_decay)
                c_lr -= self.c_lr / float(self.num_iters_decay)
                self.update_lr(g_lr, d_lr, c_lr)
                print("Decayed learning rates, g_lr: {}, d_lr: {}.".format(
                    g_lr, d_lr))
Example #20
def save_states(global_step, writer, mel_outputs, linear_outputs, attn, mel, y,
                input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment
    # Multi-hop attention
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment = alignment[idx].cpu().data.numpy()
            tag = "alignment_layer{}".format(i + 1)
            writer.add_image(tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255), global_step)

            # save files as well for now
            alignment_dir = join(checkpoint_dir, "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(alignment_dir, "step{:09d}_layer_{}_alignment.png".format(
                global_step, i + 1))
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(checkpoint_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir, "step{:09d}_alignment.png".format(global_step))
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)

        tag = "averaged_alignment"
        writer.add_image(tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255), global_step)

    # Predicted mel spectrogram
    if mel_outputs is not None:
        mel_output = mel_outputs[idx].cpu().data.numpy()
        if hparams.vocoder != "world":
            mel_output = prepare_spec_image(audio._denormalize(mel_output))
            writer.add_image("Predicted mel spectrogram", mel_output, global_step)
        else:
            mel_output_prep = mel_output
            try:
                writer.add_image("Predicted WORLD output", mel_output_prep, global_step)
            except:
                pass

            mel_output = denormalize(mel_output)
            nfft = pw.get_cheaptrick_fft_size(hparams.sample_rate)
            f0 = mel_output[:,0].astype(np.float64)
            sp = pw.decode_spectral_envelope(mel_output[:,1:(hparams.coded_env_dim+1)].astype(np.float64), hparams.sample_rate, nfft)
            ap = pw.decode_aperiodicity(mel_output[:,(hparams.coded_env_dim+1):hparams.num_mels].astype(np.float64), hparams.sample_rate, nfft)

            signal = pw.synthesize(f0, sp, ap, hparams.sample_rate, pw.default_frame_period)
            path = join(checkpoint_dir, "step{:09d}_out.wav".format(
                        global_step))
            audio.save_wav(signal, path)

            try:
                signal /= np.max(np.abs(signal))
                writer.add_audio("Target audio signal", signal, global_step, sample_rate=fs)
            except:
                print("Unexpected error :", sys.exc_info())

            mel_tgt = mel[idx].cpu().data.numpy()
            mel_tgt = denormalize(mel_tgt)

            f0 = mel_tgt[:,0].astype(np.float64)
            sp = pw.decode_spectral_envelope(mel_tgt[:,1:(hparams.coded_env_dim+1)].astype(np.float64), hparams.sample_rate, nfft)
            ap = pw.decode_aperiodicity(mel_tgt[:,(hparams.coded_env_dim+1):hparams.num_mels].astype(np.float64), hparams.sample_rate, nfft)

            signal = pw.synthesize(f0, sp, ap, hparams.sample_rate, pw.default_frame_period)
            try:
                signal /= np.max(np.abs(signal))
                writer.add_audio("Target audio signal", signal, global_step, sample_rate=hparams.sample_rate)
            except:
                print("Unexpected error :", sys.exc_info())
    # Predicted spectrogram
    if linear_outputs is not None:
        linear_output = linear_outputs[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Predicted linear spectrogram", spectrogram, global_step)

        # Predicted audio signal
        signal = audio.inv_spectrogram(linear_output.T)
        signal /= np.max(np.abs(signal))
        path = join(checkpoint_dir, "step{:09d}_predicted.wav".format(
            global_step))
        try:
            writer.add_audio("Predicted audio signal", signal, global_step, sample_rate=fs)
        except Exception as e:
            warn(str(e))
            pass
        audio.save_wav(signal, path)

    # Target mel spectrogram
    if mel_outputs is not None:
        mel_output = mel[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Target mel spectrogram", mel_output, global_step)

    # Target spectrogram
    if linear_outputs is not None:
        linear_output = y[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Target linear spectrogram", spectrogram, global_step)

    #ei
    path = join(checkpoint_dir, "step{:09d}_mel_target.npy".format(
                global_step))
    mel_output = mel[idx].cpu().data.numpy()
    np.save(path, denormalize(mel_output))

    path = join(checkpoint_dir, "step{:09d}_mel_out.npy".format(
                global_step))
    mel_output = denormalize(mel_outputs[idx].cpu().data.numpy())
    np.save(path, mel_output)
Example #21
    def train(self):
        # Learning rate cache for decaying.
        g_lr = self.g_lr
        d_lr = self.d_lr
        c_lr = self.c_lr

        start_iters = 0
        if self.resume_iters:
            pass

        norm = Normalizer()
        data_iter = iter(self.data_loader)

        print('Start training......')
        start_time = datetime.now()

        for i in range(start_iters, self.num_iters):
            # =================================================================================== #
            #                             1. Preprocess input data                                #
            # =================================================================================== #
            # Fetch real images and labels.
            try:
                x_real, speaker_idx_org, label_org = next(data_iter)
            except:
                data_iter = iter(self.data_loader)
                x_real, speaker_idx_org, label_org = next(data_iter)

            # Generate target domain labels randomly.
            rand_idx = torch.randperm(label_org.size(0))
            label_trg = label_org[rand_idx]
            speaker_idx_trg = speaker_idx_org[rand_idx]

            x_real = x_real.to(self.device)  # Input images.
            label_org = label_org.to(
                self.device)  # Original domain one-hot labels.
            label_trg = label_trg.to(
                self.device)  # Target domain one-hot labels.
            speaker_idx_org = speaker_idx_org.to(
                self.device)  # Original domain labels
            speaker_idx_trg = speaker_idx_trg.to(
                self.device)  #Target domain labels

            # =================================================================================== #
            #                             2. Train the discriminator                              #
            # =================================================================================== #
            # Compute loss with real audio frame.
            CELoss = nn.CrossEntropyLoss()
            cls_real = self.C(x_real)
            cls_loss_real = CELoss(input=cls_real, target=speaker_idx_org)

            self.reset_grad()
            cls_loss_real.backward()
            self.c_optimizer.step()
            # Logging.
            loss = {}
            loss['C/C_loss'] = cls_loss_real.item()

            out_r = self.D(x_real, label_org)
            # Compute loss with fake audio frame.
            x_fake = self.G(x_real, label_trg)
            out_f = self.D(x_fake.detach(), label_trg)
            d_loss_t = F.binary_cross_entropy_with_logits(input=out_f,target=torch.zeros_like(out_f, dtype=torch.float)) + \
                F.binary_cross_entropy_with_logits(input=out_r, target=torch.ones_like(out_r, dtype=torch.float))

            out_cls = self.C(x_fake)
            d_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

            # Compute loss for gradient penalty.
            alpha = torch.rand(x_real.size(0), 1, 1, 1).to(self.device)
            x_hat = (alpha * x_real.data +
                     (1 - alpha) * x_fake.data).requires_grad_(True)
            out_src = self.D(x_hat, label_trg)
            d_loss_gp = self.gradient_penalty(out_src, x_hat)

            d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5 * d_loss_gp

            self.reset_grad()
            d_loss.backward()
            self.d_optimizer.step()

            # loss['D/d_loss_t'] = d_loss_t.item()
            # loss['D/loss_cls'] = d_loss_cls.item()
            # loss['D/D_gp'] = d_loss_gp.item()
            loss['D/D_loss'] = d_loss.item()

            # =================================================================================== #
            #                               3. Train the generator                                #
            # =================================================================================== #
            if (i + 1) % self.n_critic == 0:
                # Original-to-target domain.
                x_fake = self.G(x_real, label_trg)
                g_out_src = self.D(x_fake, label_trg)
                g_loss_fake = F.binary_cross_entropy_with_logits(
                    input=g_out_src,
                    target=torch.ones_like(g_out_src, dtype=torch.float))

                out_cls = self.C(x_fake)
                g_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

                # Target-to-original domain.
                x_reconst = self.G(x_fake, label_org)
                g_loss_rec = F.l1_loss(x_reconst, x_real)

                # Original-to-Original domain(identity).
                x_fake_iden = self.G(x_real, label_org)
                id_loss = F.l1_loss(x_fake_iden, x_real)

                # Backward and optimize.
                g_loss = g_loss_fake + self.lambda_cycle * g_loss_rec +\
                 self.lambda_cls * g_loss_cls + self.lambda_identity * id_loss

                self.reset_grad()
                g_loss.backward()
                self.g_optimizer.step()

                # Logging.
                loss['G/loss_fake'] = g_loss_fake.item()
                loss['G/loss_rec'] = g_loss_rec.item()
                loss['G/loss_cls'] = g_loss_cls.item()
                loss['G/loss_id'] = id_loss.item()
                loss['G/g_loss'] = g_loss.item()
            # =================================================================================== #
            #                                 4. Miscellaneous                                    #
            # =================================================================================== #
            # Print out training information.
            if (i + 1) % self.log_step == 0:
                et = datetime.now() - start_time
                et = str(et)[:-7]
                log = "Elapsed [{}], Iteration [{}/{}]".format(
                    et, i + 1, self.num_iters)
                for tag, value in loss.items():
                    log += ", {}: {:.4f}".format(tag, value)
                print(log)

                if self.use_tensorboard:
                    for tag, value in loss.items():
                        self.logger.scalar_summary(tag, value, i + 1)

            # Translate fixed images for debugging.
            if (i + 1) % self.sample_step == 0:
                with torch.no_grad():
                    d, speaker = TestSet(self.test_dir).test_data()
                    target = random.choice(
                        [x for x in speakers if x != speaker])
                    label_t = self.spk_enc.transform([target])[0]
                    label_t = np.asarray([label_t])

                    for filename, content in d.items():
                        f0 = content['f0']
                        ap = content['ap']
                        sp_norm_pad = self.pad_coded_sp(
                            content['coded_sp_norm'])

                        convert_result = []
                        for start_idx in range(
                                0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                            one_seg = sp_norm_pad[:,
                                                  start_idx:start_idx + FRAMES]

                            one_seg = torch.FloatTensor(one_seg).to(
                                self.device)
                            one_seg = one_seg.view(1, 1, one_seg.size(0),
                                                   one_seg.size(1))
                            l = torch.FloatTensor(label_t)
                            one_seg = one_seg.to(self.device)
                            l = l.to(self.device)
                            one_set_return = self.G(one_seg,
                                                    l).data.cpu().numpy()
                            one_set_return = np.squeeze(one_set_return)
                            one_set_return = norm.backward_process(
                                one_set_return, target)
                            convert_result.append(one_set_return)

                        convert_con = np.concatenate(convert_result, axis=1)
                        convert_con = convert_con[:,
                                                  0:content['coded_sp_norm'].
                                                  shape[1]]
                        contigu = np.ascontiguousarray(convert_con.T,
                                                       dtype=np.float64)
                        decoded_sp = decode_spectral_envelope(contigu,
                                                              SAMPLE_RATE,
                                                              fft_size=FFTSIZE)
                        f0_converted = norm.pitch_conversion(
                            f0, speaker, target)
                        wav = synthesize(f0_converted, decoded_sp, ap,
                                         SAMPLE_RATE)

                        name = f'{speaker}-{target}_iter{i+1}_{filename}'
                        path = os.path.join(self.sample_dir, name)
                        print(f'[save]:{path}')
                        librosa.output.write_wav(path, wav, SAMPLE_RATE)

            # Save model checkpoints.
            if (i + 1) % self.model_save_step == 0:
                G_path = os.path.join(self.model_save_dir,
                                      '{}-G.ckpt'.format(i + 1))
                D_path = os.path.join(self.model_save_dir,
                                      '{}-D.ckpt'.format(i + 1))
                C_path = os.path.join(self.model_save_dir,
                                      '{}-C.ckpt'.format(i + 1))
                torch.save(self.G.state_dict(), G_path)
                torch.save(self.D.state_dict(), D_path)
                torch.save(self.C.state_dict(), C_path)
                print('Saved model checkpoints into {}...'.format(
                    self.model_save_dir))

            # Decay learning rates.
            if (i + 1) % self.lr_update_step == 0 and (i + 1) > (
                    self.num_iters - self.num_iters_decay):
                g_lr -= (self.g_lr / float(self.num_iters_decay))
                d_lr -= (self.d_lr / float(self.num_iters_decay))
                c_lr -= (self.c_lr / float(self.num_iters_decay))
                self.update_lr(g_lr, d_lr, c_lr)
                print('Decayed learning rates, g_lr: {}, d_lr: {}.'.format(
                    g_lr, d_lr))
Example #22
    # output_dir = './data/processed'
    #
    # parser.add_argument('--input_dir', type = str, help = 'the direcotry contains data need to be processed', default = input_dir)
    # parser.add_argument('--output_dir', type = str, help = 'the directory stores the processed data', default = output_dir)
    #
    # argv = parser.parse_args()
    # input_dir = argv.input_dir
    # output_dir = argv.output_dir
    #
    # os.makedirs(output_dir, exist_ok=True)
    #
    # wav_to_mcep_file(input_dir, SAMPLE_RATE,  processed_filepath=output_dir)
    #
    # #input_dir is train dataset. we need to calculate and save the speech\
    # # statistical characteristics for each speaker.
    # generator = GenerateStatistics(output_dir)
    # generator.generate_stats()
    # generator.normalize_dataset()
    # end = datetime.now()
    # print(f"[Runing Time]: {end-start}")

    data_dir = '../data/audio/'
    sample = data_dir + 'Ses01F_impro01_F000.wav'
    wav = librosa.load(sample, sr=SAMPLE_RATE, mono=True, dtype=np.float64)[0]
    f0, ap, sp, coded_sp = call_mcep(wav)

    decoded_sp = decode_spectral_envelope(coded_sp, SAMPLE_RATE, fft_size=FFTSIZE)
    # f0_converted = norm.pitch_conversion(f0, speaker, target)
    wav2 = synthesize(f0, decoded_sp, ap, SAMPLE_RATE)
    audio_utils.save(wav2, './')
Example #23
                out_name = key.replace("/", "_")
            else:
                out_name = key
            out_wavfile = tspk_dir / f"{out_name}.wav"
            out_wavfile.parent.mkdir(exist_ok=True, parents=True)

            # test generator
            mcep_T = np.asarray(mcep.T, dtype=np.float32)
            mcep_T = mcep_T.reshape((1, *mcep_T.shape))
            gen_mcep_var = generator(mcep_T, tspk_lab)
            gen_mcep = gen_mcep_var[0].data.T
            denorm_gen_mcep = denorm_mcep(gen_mcep, f0, mcep_mean[tspk], mcep_std[tspk])
            denorm_gen_mcep = signal.medfilt(denorm_gen_mcep, (5, 1))
            conved_f0 = conv_f0(f0, logf0_mean[tspk], logf0_std[tspk])

            specenv = pw.decode_spectral_envelope(denorm_gen_mcep, args.samplerate, args.fftsize)
            x = pw.synthesize(conved_f0, specenv, ap, args.samplerate, frame_period=args.frame_period*1000)
            x = x / max(abs(x)) * 30000
            x = x.astype(np.int16)

            wavfile.write(out_wavfile, args.samplerate, x)

            # test discriminator
            # test real data
            if fspk not in real_flags:
                real_datas[key] = np.squeeze(adverserial_discriminator(mcep_T, fspk_lab, dp_ratio=0.0)[1].data)
            # test fake data
            fake_datas[key] = np.squeeze(adverserial_discriminator(gen_mcep_var, tspk_lab, dp_ratio=0.0)[1].data)

        # save values of discriminator of fake data (fspk -> tspk)
        plt.clf()
import numpy as np
import pyworld
from scipy.io import wavfile

IN_WAVE_FILE = "in.wav"  # input speech
OUT_WAVE_FILE = "out.wav"  # analysis-resynthesized speech

SP_DIM = 50  # dimensionality of the compressed spectral envelope

# Load the speech
fs, x = wavfile.read(IN_WAVE_FILE)
x = x.astype(np.float64)

# Analyze the speech (F0, spectral envelope, aperiodicity)
f0, sp, ap = pyworld.wav2world(x, fs)
fft_size = pyworld.get_cheaptrick_fft_size(fs)

# Encode / decode the spectral envelope
# https://www.isca-speech.org/archive/Interspeech_2017/abstracts/0067.html
code_sp = pyworld.code_spectral_envelope(sp, fs, SP_DIM)
decode_sp = pyworld.decode_spectral_envelope(code_sp, fs, fft_size)

# Encode / decode the aperiodicity
code_ap = pyworld.code_aperiodicity(ap, fs)
decode_ap = pyworld.decode_aperiodicity(code_ap, fs, fft_size)

# Resynthesize the speech
y = pyworld.synthesize(f0, decode_sp, decode_ap, fs)
y = y.astype(np.int16)

# Write the output speech
wavfile.write(OUT_WAVE_FILE, fs, y)
Example #25
    def inference(self, dataset, rank_size=1):
        if dataset is None:
            print("convert dataset error!")
            return
        for i, samples in enumerate(dataset):
            samples = self.model.prepare_samples(samples)
            src_coded_sp, src_speaker_onehot, src_f0, src_ap = samples["src_coded_sp"], \
                 samples["src_speaker"], samples["src_f0"], samples["src_ap"]
            tar_speaker_onehot = samples["tar_speaker"]
            # Map the ids to the speakers name
            src_id, tar_id = samples["src_id"], samples["tar_id"]
            src_id, tar_id = int(src_id), int(tar_id)
            src_speaker = self.speakers_ids_dict[src_id]
            tar_speaker = self.speakers_ids_dict[tar_id]

            src_wav_filename = samples["src_wav_filename"]
            src_filename = src_wav_filename.numpy()[0].decode().replace(
                ".npz", "")

            gen_coded_sp = self.model.convert(src_coded_sp, tar_speaker_onehot)
            gen_coded_sp = tf.transpose(tf.squeeze(gen_coded_sp), [1, 0])
            coded_sp = self.feature_normalizer(gen_coded_sp,
                                               str(tar_speaker),
                                               reverse=True)

            def apply_f0_cmvn(cmvn_dict, feat_data, src_speaker, tar_speaker):
                if tar_speaker not in cmvn_dict:
                    print("tar_speaker not in cmvn_dict!")
                    return feat_data
                f0 = feat_data.numpy()
                src_mean = cmvn_dict[src_speaker][2]
                src_var = cmvn_dict[src_speaker][3]
                tar_mean = cmvn_dict[tar_speaker][2]
                tar_var = cmvn_dict[tar_speaker][3]
                f0_converted = np.exp((np.ma.log(f0) - src_mean) /
                                      np.sqrt(src_var) * np.sqrt(tar_var) +
                                      tar_mean)
                return f0_converted

            f0 = apply_f0_cmvn(self.feature_normalizer.cmvn_dict, src_f0,
                               str(src_speaker), str(tar_speaker))

            # Restoration of sp characteristics
            c = []
            for one_slice in coded_sp:
                one_slice = np.ascontiguousarray(one_slice,
                                                 dtype=np.float64).reshape(
                                                     1, -1)
                decoded_sp = pyworld.decode_spectral_envelope(
                    one_slice, self.fs, fft_size=self.fft_size)
                c.append(decoded_sp)
            sp = np.concatenate((c), axis=0)
            f0 = np.squeeze(f0, axis=(0, )).astype(np.float64)
            src_ap = np.squeeze(src_ap.numpy(), axis=(0, )).astype(np.float64)

            # Remove the extra padding at the end of the sp feature
            sp = sp[:src_ap.shape[0], :]
            # sp: T,fft_size//2+1   f0: T   ap: T,fft_size//2+1
            synwav = pyworld.synthesize(f0, sp, src_ap, self.fs)

            wavname = src_speaker + "_" + tar_speaker + "_" + src_filename + ".wav"
            wavfolder = os.path.join(self.hparams.output_directory)
            if not os.path.exists(wavfolder):
                os.makedirs(wavfolder)
            wavpath = os.path.join(wavfolder, wavname)

            librosa.output.write_wav(wavpath, synwav, sr=self.fs)
            print("generate wav:", wavpath)
Example #26
else:
    sentences = args.sentences
print(f"sentences: {sentences}")

for s, snt in enumerate(sentences):
    feature, gen_letter_stateseq = feat_generator.generate(snt)

    mcep = mcep_generator.generate(feature, args.target_speaker)
    ap = ap_generator.generate(gen_letter_stateseq)
    f0 = f0_generator.generate(gen_letter_stateseq)
    f0[f0 < 0] = 0

    mcep = denorm_mcep(mcep, mcep_min, mcep_max)
    mcep = signal.medfilt(mcep, (5, 1))
    mcep = mcep.astype(float, order="C")

    decoded_sp = pw.decode_spectral_envelope(mcep, args.samplerate,
                                             args.fftsize)
    synthesized = pw.synthesize(f0,
                                decoded_sp,
                                ap,
                                args.samplerate,
                                frame_period=args.frame_period * 1000)
    synthesized = synthesized / max(abs(synthesized)) * 30000

    args.output_prefix.parent.mkdir(parents=True, exist_ok=True)
    out_file = args.output_prefix.with_name(
        f"{args.output_prefix.name}_{s:02d}_({'_'.join(map(str, snt))}).wav")

    wavfile.write(out_file, args.samplerate, synthesized.astype(np.int16))