Example #1
    def __getitem__(self, key):
        # key is a (utt_key, pitch_aug_factor, time_aug_factor) tuple
        key, pitch_aug_factor, time_aug_factor = key
        wav = self.data[key]
        if self.normalize:
            # soundfile.read normalizes data to [-1,1] if dtype is not given
            array, rate = soundfile.read(wav, always_2d=self.always_2d)
        else:
            array, rate = soundfile.read(wav,
                                         dtype=self.dtype,
                                         always_2d=self.always_2d)

        if pitch_aug_factor != 0:
            # Pitch augmentation
            ratio = pow(2, 1 / 12)
            import pyworld as pw

            f0_pw, sp, ap = pw.wav2world(array, rate)  # use default options
            array = pw.synthesize(
                f0_pw * (ratio**pitch_aug_factor),
                sp,
                ap,
                rate,
                pw.default_frame_period,
            )

        if time_aug_factor != 1:
            # Time augmentation (WSOLA time-scale modification; tsm assumed to be a library such as pytsmod)
            array = tsm.wsola(array, time_aug_factor)

        return rate, array
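For reference, the pitch branch above as a standalone helper: a minimal sketch assuming only pyworld and a mono float64 signal, with the same 2**(1/12) semitone ratio as the code above.

import numpy as np
import pyworld as pw

def pitch_shift(array, rate, semitones):
    # Analyze, scale f0 by 2**(semitones / 12), and resynthesize.
    f0, sp, ap = pw.wav2world(array.astype(np.float64), rate)
    ratio = 2.0 ** (semitones / 12.0)
    return pw.synthesize(f0 * ratio, sp, ap, rate, pw.default_frame_period)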
Example #2
def get_world_feats(vocals):
    vocals = np.float64(vocals)
    feats = pw.wav2world(vocals, config.fs, frame_period=config.hoptime * 1000)

    ap = feats[2].reshape([feats[1].shape[0],
                           feats[1].shape[1]]).astype(np.float32)
    ap = 10. * np.log10(ap**2)
    harm = 10 * np.log10(feats[1].reshape(
        [feats[2].shape[0], feats[2].shape[1]]))
    harm += config.world_offset
    f0 = feats[0]

    # f0 = pitch.extract_f0_sac(vocals, fs, config.hoptime)

    y = f0_to_hertz(f0)
    # y = hertz_to_new_base(f0)
    nans, x = utils.nan_helper(y)
    naners = np.isinf(y)
    y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    # y=[float(x-(min_note-1))/float(max_note-(min_note-1)) for x in y]
    y = np.array(y).reshape([len(y), 1])
    guy = np.array(naners).reshape([len(y), 1])
    y = np.concatenate((y, guy), axis=-1)

    # if config.comp_mode == 'mfsc':
    harmy = sp_to_mfsc(harm, 60, 0.45)
    apy = sp_to_mfsc(ap, 4, 0.45)
    # elif config.comp_mode == 'mgc':
    #     harmy=sp_to_mgc(harm,60,0.45)
    #     apy=sp_to_mgc(ap,4,0.45)

    out_feats = np.concatenate((harmy, apy, y.reshape((-1, 2))), axis=1)

    return out_feats
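The nan_helper used here is not shown; a common numpy implementation consistent with how it is called (a boolean mask plus an index function) is sketched below. Note that unvoiced frames can yield -inf rather than NaN after the log conversion, which these examples track separately via np.isinf.

import numpy as np

def nan_helper(y):
    # Returns a NaN mask and a function mapping a mask to integer indices,
    # so that: y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    nans = np.isnan(y)
    return nans, lambda z: z.nonzero()[0]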
Example #3
def get_mgc(audio,
            sample_rate,
            frame_period,
            fft_size=512,
            mcep_size=60,
            alpha=0.65):
    if isinstance(audio, torch.Tensor):
        if audio.ndim > 1:
            audio = audio[0]

        audio = audio.numpy()

    _, sp, _ = pw.wav2world(audio.astype(np.double),
                            fs=sample_rate,
                            frame_period=frame_period,
                            fft_size=fft_size)
    mgc = pysptk.sptk.mcep(sp,
                           order=mcep_size,
                           alpha=alpha,
                           maxiter=0,
                           etype=1,
                           eps=1.0E-8,
                           min_det=0.0,
                           itype=3)

    return mgc
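A usage sketch (file name hypothetical; a mono file and soundfile for reading are assumed):

import soundfile as sf

audio, sr = sf.read('sample.wav')           # hypothetical input file
mgc = get_mgc(audio, sr, frame_period=5.0)  # -> (n_frames, mcep_size + 1)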
Example #4
def input_to_feats(input_file, mode=config.comp_mode):
    audio, fs = sf.read(input_file)
    vocals = np.array(audio[:, 1])  # second channel of a stereo recording
    feats = pw.wav2world(vocals, fs, frame_period=5.80498866)

    ap = feats[2].reshape([feats[1].shape[0],
                           feats[1].shape[1]]).astype(np.float32)
    ap = 10. * np.log10(ap**2)
    harm = 10 * np.log10(feats[1].reshape(
        [feats[2].shape[0], feats[2].shape[1]]))

    y = 69 + 12 * np.log2(feats[0] / 440)
    nans, x = nan_helper(y)
    naners = np.isinf(y)
    y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    # y=[float(x-(min_note-1))/float(max_note-(min_note-1)) for x in y]
    y = np.array(y).reshape([len(y), 1])
    guy = np.array(naners).reshape([len(y), 1])
    y = np.concatenate((y, guy), axis=-1)

    if mode == 'mfsc':
        harmy = sp_to_mfsc(harm, 60, 0.45)
        apy = sp_to_mfsc(ap, 4, 0.45)
    elif mode == 'mgc':
        harmy = sp_to_mgc(harm, 60, 0.45)
        apy = sp_to_mgc(ap, 4, 0.45)
    else:
        raise ValueError('unknown comp_mode: {}'.format(mode))

    out_feats = np.concatenate((harmy, apy, y.reshape((-1, 2))), axis=1)

    # harm_in=mgc_to_sp(harmy, 1025, 0.45)
    # ap_in=mgc_to_sp(apy, 1025, 0.45)

    return out_feats
Example #5
def mcep_dir(srcroot, tgtroot, n_mcep=40, alpha=0.42):
    src = pathlib.Path(srcroot)
    tgt = pathlib.Path(tgtroot)
    if not src.exists():
        raise ValueError('src does not exist: {}'.format(src))

    for p in sorted(src.glob('**/*.wav')):
        print(p)
        tgt_dir = tgt / p.parent.relative_to(src)
        tgt_stem = (tgt_dir / p.name).with_suffix('')
        tgt_dir.mkdir(parents=True, exist_ok=True)
        mcep_path = tgt_stem.with_suffix('.mcep.npy')
        c0_path = tgt_stem.with_suffix('.c0.npy')
        f0_path = tgt_stem.with_suffix('.f0.npy')
        ap_path = tgt_stem.with_suffix('.ap.npy')
        if mcep_path.exists() and c0_path.exists() and f0_path.exists() and ap_path.exists():
            print('skip')
            continue

        sr, wav = wavfile.read(p)
        x = (wav / 32768.0).astype(np.float64)  # assumes 16-bit PCM input
        f0, sp, ap = pyworld.wav2world(x, sr)
        mcep = pysptk.sptk.mcep(sp, order=n_mcep, alpha=alpha, itype=4)
        f0, mcep, ap = f0.astype(np.float32), mcep.T.astype(np.float32), ap.T.astype(np.float32)
        c0 = mcep[0, :]
        mcep = np.ascontiguousarray(mcep[1:, :])
        ap = ap[192, :]  # keep a single aperiodicity band (frequency bin 192)

        np.save(mcep_path, mcep)
        np.save(c0_path, c0)
        np.save(f0_path, f0)
        np.save(ap_path, ap)
        print(tgt_stem, flush=True)
Example #6
def extract_feats(file, feats_dir):
    fname = os.path.basename(file).split('.wav')[0]
    x, fs = sf.read(file)
    f0, sp, ap = pw.wav2world(x, fs, frame_period=20)

    np.savetxt(feats_dir + '/' + fname + '.f0_ascii', f0)
    np.savetxt(feats_dir + '/' + fname + '.sp_ascii', sp)
    np.savetxt(feats_dir + '/' + fname + '.ap_ascii', ap)
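These ASCII dumps reload with np.loadtxt; a sketch using the same path layout as above:

import numpy as np

f0 = np.loadtxt(feats_dir + '/' + fname + '.f0_ascii')
sp = np.loadtxt(feats_dir + '/' + fname + '.sp_ascii')
ap = np.loadtxt(feats_dir + '/' + fname + '.ap_ascii')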
Example #7
def wav2world(wavfile):
    wav, fs = sf.read(wavfile)
    f0, sp, ap = vocoder.wav2world(wav, fs, hp.n_fft, ap_depth=hp.num_bap)
    # feature normalization
    lf0 = f0_normalize(f0)
    mgc = sp_normalize(sp)
    bap = ap_normalize(ap)
    return np.array(world_features_to_one_tensor(lf0, mgc, bap))
Example #8
def basic_analysis(wav, sample_rate):
    nbits = wav.itemsize * 8
    int_ceiling = 2**(nbits - 1)
    float_data = wav.astype(np.float64) / int_ceiling

    f0, smoothed_spectrogram, aperiodicity = pyworld.wav2world(
        float_data, sample_rate)

    f0 = f0.reshape((-1, 1))
    return f0, smoothed_spectrogram, aperiodicity
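Because f0 is returned as a column vector here, it must be flattened before resynthesis; a round-trip sketch under that assumption, with wav and sample_rate as above:

import pyworld

f0, sp, ap = basic_analysis(wav, sample_rate)  # wav: integer PCM array
y = pyworld.synthesize(f0.ravel(), sp, ap, sample_rate,
                       pyworld.default_frame_period)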
Example #9
def entropy(filename):
    y, sr = librosa.core.load(filename)
    y = y.astype(np.float64)
    f0, sp, ap = pw.wav2world(y, sr)
    # entropy of the voiced (non-zero) part of the f0 contour
    f0_entropy = scipy.stats.entropy(np.trim_zeros(f0))
    return f0_entropy
Example #10
def world_spectrogram_default(wav, sr=_sr):
    """Convert speech to WORLD vocoder features using default options."""
    # f0 : ndarray -- F0 contour
    # sp : ndarray -- spectral envelope
    # ap : ndarray -- aperiodicity
    f0, sp, ap = pw.wav2world(wav.astype(np.double), sr)  # use default options
    return f0, sp, ap
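The inverse direction is symmetric; a sketch for resynthesizing from these default-option features, assuming the same module-level _sr default:

def world_synthesis_default(f0, sp, ap, sr=_sr):
    # One-call synthesis matching wav2world's default frame period.
    return pw.synthesize(f0, sp, ap, sr, pw.default_frame_period)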
Example #11
 def changeFreq(self, data, freq_target):
     data = data.astype(np.float64)  # np.float was removed from recent numpy
     f0, sp, ap = pw.wav2world(data, self.fs)
     f0_positive = f0[f0 > 0]  # voiced frames only
     if len(f0_positive) == 0:
         return data
     f0_mean = np.mean(f0_positive)
     f0_new = f0 * freq_target / f0_mean  # scale so the mean f0 hits freq_target
     synthesized = pw.synthesize(f0_new, sp, ap, self.fs,
                                 pw.default_frame_period)
     return synthesized
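A usage sketch (names hypothetical; `shifter` stands for an instance of the enclosing class, with shifter.fs matching the file's sample rate):

import soundfile as sf

data, fs = sf.read('voice.wav')            # hypothetical input file
shifted = shifter.changeFreq(data, 220.0)  # move the mean f0 to 220 Hz
sf.write('voice_220hz.wav', shifted, fs)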
Example #12
def extract_feats(file, feats_dir):
    fname = os.path.basename(file).split('.wav')[0]
    x, fs = sf.read(file)
    f0, sp, ap = pw.wav2world(x, fs, frame_period=1)
    _f0, t = pw.dio(x, fs, frame_period=1)
    t_samples = t[1:-1] * fs  # frame times in seconds -> sample indices (was hard-coded 16000)
    x_segments = np.split(x, t_samples.astype(int))

    np.savetxt(feats_dir + '/' + fname + '.f0_ascii', f0)
    np.savetxt(feats_dir + '/' + fname + '.sp_ascii', sp)
    np.savetxt(feats_dir + '/' + fname + '.ap_ascii', ap)
Example #13
def get_data():
    import pyworld as pw
    import os
    import soundfile as sf
    cwd = os.getcwd()
    raw_folder = os.path.join(cwd, 'data', 'raw')
    processed_folder = os.path.join(raw_folder, 'processed')

    #create processed items folder
    if not os.path.exists(processed_folder):
        os.makedirs(processed_folder)

    for i, filename in enumerate(os.listdir(raw_folder)):
        if filename.endswith(".raw"):
            # print(os.path.join(directory, filename))

            print("hree", i)
            data, samplerate = sf.read(os.path.join(raw_folder, filename),
                                       channels=1,
                                       endian='LITTLE',
                                       dtype='float',
                                       subtype='PCM_16',
                                       samplerate=48000)
            print("here2, data", data, "rate", samplerate)
            f0, sp, ap = pw.wav2world(data, fs=48000)

            print("passed through vocoder successfully. f0", f0)

            print("")
            print("")
            print("")
            print("sp", sp)
            print("")
            print("")
            print("")
            print("ap", ap)
            new_file_folder_path = os.path.join(processed_folder, str(i))
            if not os.path.exists(new_file_folder_path):
                os.makedirs(new_file_folder_path)

            new_proccesed_file_path = os.path.join(new_file_folder_path, 'f0')
            f = open(new_proccesed_file_path, 'w')
            f.write(f0)
            f.close()

            new_proccesed_file_path = os.path.join(new_file_folder_path, 'sp')
            f = open(new_proccesed_file_path, 'w')
            f.write(sp)
            f.close()

            new_proccesed_file_path = os.path.join(new_file_folder_path, 'ap')
            f = open(new_proccesed_file_path, 'w')
            f.write(ap)
            f.close()
Example #14
def spectral_entropy(filename):
    y, sr = librosa.core.load(filename)
    y = y.astype(np.float64)
    f0, sp, ap = pw.wav2world(y, sr)
    # power spectral density of the voiced f0 contour
    freq, psd = scipy.signal.periodogram(np.trim_zeros(f0))
    # scipy.stats.entropy normalizes psd if it does not sum to 1
    f0_spectral_entropy = scipy.stats.entropy(psd)
    return f0_spectral_entropy
Example #15
def _process_utterance(out_dir, index, wav_path, text, phone):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, encoded_filename, n_frames, text, phone) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.vocoder=="world":
        spectrogram = audio.spectrogram(wav).astype(np.float32)

        f0, sp, ap = pw.wav2world(wav.astype(np.double), hparams.sample_rate)
        ap_coded = pw.code_aperiodicity(ap, hparams.sample_rate)
        sp_coded = pw.code_spectral_envelope(sp,hparams.sample_rate, hparams.coded_env_dim)
        
        world_spec = np.hstack([f0[:,np.newaxis],sp_coded,ap_coded])
        n_frames = world_spec.shape[0]
        spectrogram_filename = 'synpaflex-spec-%05d.npy' % index
        encoded_filename = 'synpaflex-world-%05d.npy' % index
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, encoded_filename), world_spec, allow_pickle=False)

    else:
        # Compute the linear-scale spectrogram from the wav:
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        n_frames = spectrogram.shape[1]

        # Compute a mel-scale spectrogram from the wav:
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

        # Write the spectrograms to disk:
        spectrogram_filename = 'synpaflex-spec-%05d.npy' % index
        encoded_filename = 'synpaflex-mel-%05d.npy' % index
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, encoded_filename), mel_spectrogram.T, allow_pickle=False)


    # Return a tuple describing this training example:
    return (spectrogram_filename, encoded_filename, n_frames, text, phone)
Example #16
def main():
    _x, _fs = sf.read(oripath)  # source audio samples & sample rate
    _f0, _sp, _ap = pw.wav2world(_x, _fs)  # source f0/sp/ap for synthesis; this one call is the simplest route
    ori_f0, ori_timeaxis = pw.harvest(_x, _fs)  # harvest also returns the time axis (per-frame timestamps)

    x, fs = sf.read(aimpath)  # target audio samples & sample rate (fixed: read aimpath, not oripath)
    f0, sp, ap = pw.wav2world(x, fs)  # target f0/sp/ap
    aim_f0, aim_timeaxis = pw.harvest(x, fs)

    aim_mean_f0 = get_mean_f0(aimpath)  # target mean f0, on a log scale (used below)
    ori_mean_f0 = get_mean_f0(oripath)

    # Convert the source speaker's f0 (voiced frames only, f0 > 0): take the
    # log, then shift by the difference between the two log-f0 means.
    for i in range(len(ori_timeaxis)):  # frame by frame over the source f0
        if ori_f0[i] > 0:
            tmp_log_f0 = np.log(ori_f0[i])
            tmp_log_f0 = tmp_log_f0 - ori_mean_f0 + aim_mean_f0
            tmp_exp_f0 = np.exp(tmp_log_f0)  # back from the log scale
            ori_f0[i] = tmp_exp_f0

    # An earlier attempt drew a new target f0 directly; it does not work:
    # aim_new_f0 = np.random.normal(aim_mean_f0, 1.0, sp.shape[0])

    print('source _x: waveform shape _x.shape = ' +
          str(_x.shape))  # 54852 samples; the frame count is len(timeaxis)

    print('target: sp.shape[0] = ' + str(sp.shape[0]) + ' sp.shape = ' +
          str(sp.shape))
    print('target ap.shape = ' + str(ap.shape))
    print('f0.shape = ' + str(f0.shape) + ' _f0.shape = ' + str(_f0.shape))

    # print('aim_new_f0.shape' + str(aim_new_f0.shape))

    print('target f0.shape = ' + str(f0.shape))
    print('source _sp.shape = ' + str(_sp.shape) + ' source _ap.shape = ' +
          str(_ap.shape))

    synthesized = pw.synthesize(ori_f0, _sp, _ap, _fs, pw.default_frame_period)
    sf.write('./synthesized.wav', synthesized, _fs)
Example #17
def stft_to_feats(vocals, fs, mode=config.comp_mode):
    feats = pw.wav2world(vocals, fs, frame_period=5.80498866)

    ap = feats[2].reshape([feats[1].shape[0],
                           feats[1].shape[1]]).astype(np.float32)
    ap = 10. * np.log10(ap**2)
    harm = 10 * np.log10(feats[1].reshape(
        [feats[2].shape[0], feats[2].shape[1]]))

    f0 = feats[0]
    # f0 = pitch.extract_f0_sac(vocals, fs, 0.00580498866)

    y = 69 + 12 * np.log2(f0 / 440)
    # y = hertz_to_new_base(f0)
    nans, x = nan_helper(y)
    naners = np.isinf(y)
    y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    # y=[float(x-(min_note-1))/float(max_note-(min_note-1)) for x in y]
    y = np.array(y).reshape([len(y), 1])
    guy = np.array(naners).reshape([len(y), 1])
    y = np.concatenate((y, guy), axis=-1)

    if mode == 'mfsc':
        harmy = sp_to_mfsc(harm, 60, 0.45)
        apy = sp_to_mfsc(ap, 4, 0.45)
    elif mode == 'mgc':
        harmy = sp_to_mgc(harm, 60, 0.45)
        apy = sp_to_mgc(ap, 4, 0.45)
    else:
        raise ValueError('unknown comp_mode: {}'.format(mode))

    out_feats = np.concatenate((harmy, apy, y.reshape((-1, 2))), axis=1)

    # harm_in=mgc_to_sp(harmy, 1025, 0.45)
    # ap_in=mgc_to_sp(apy, 1025, 0.45)

    return out_feats
Example #18
    def main(args):
        if os.path.isdir('test'):
            rmtree('test')
        os.mkdir('test')

        x, fs = sf.read("./{}.wav".format(edited_files["Thinking_Out_Loud"]))
        # x, fs = librosa.load('utterance/vaiueo2d.wav', dtype=np.float64)

        # 1. A convenient way
        f0, sp, ap = pw.wav2world(x, fs)  # use default options
        y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

        # 2. Step by step
        # 2-1 Without F0 refinement
        _f0, t = pw.dio(x,
                        fs,
                        f0_floor=50.0,
                        f0_ceil=600.0,
                        channels_in_octave=2,
                        frame_period=args.frame_period,
                        speed=args.speed)
        _sp = pw.cheaptrick(x, _f0, t, fs)
        _ap = pw.d4c(x, _f0, t, fs)
        _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
        # librosa.output.write_wav('test/y_without_f0_refinement.wav', _y, fs)
        sf.write('test/y_without_f0_refinement.wav', _y, fs)

        # 2-2 DIO with F0 refinement (using Stonemask)
        f0 = pw.stonemask(x, _f0, t, fs)
        sp = pw.cheaptrick(x, f0, t, fs)
        ap = pw.d4c(x, f0, t, fs)
        y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
        # librosa.output.write_wav('test/y_with_f0_refinement.wav', y, fs)
        sf.write('test/y_with_f0_refinement.wav', y, fs)

        # 2-3 Harvest with F0 refinement (using Stonemask)
        _f0_h, t_h = pw.harvest(x, fs)
        f0_h = pw.stonemask(x, _f0_h, t_h, fs)
        sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
        ap_h = pw.d4c(x, f0_h, t_h, fs)
        y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
        # librosa.output.write_wav('test/y_harvest_with_f0_refinement.wav', y_h, fs)
        sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

        # Comparison
        savefig('test/wavform.png', [x, _y, y])
        savefig('test/sp.png', [_sp, sp])
        savefig('test/ap.png', [_ap, ap], log=False)
        savefig('test/f0.png', [_f0, f0])

        print('Please check "test" directory for output files')
Example #19
def world(y, sample_rate, fft_size, hop_size):
    if isinstance(y, torch.Tensor):
        y = y.numpy()
    if y.ndim == 2:
        y = y.squeeze(0)

    y = y.astype('float64')
    frame_period = 1000*hop_size/sample_rate
    f0, sp, ap = pw.wav2world(y, sample_rate, fft_size=fft_size, frame_period=frame_period)
    f0 = torch.from_numpy(f0).float()
    sp = torch.from_numpy(sp).float()
    ap = torch.from_numpy(ap).float()

    return f0, sp, ap
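A usage sketch (shapes and parameters assumed: one second of mono audio at 22050 Hz, a 1024-point FFT, and a 256-sample hop):

import torch

y = torch.randn(1, 22050)  # placeholder waveform, batch dimension of one
f0, sp, ap = world(y, sample_rate=22050, fft_size=1024, hop_size=256)
# f0: (n_frames,); sp and ap: (n_frames, fft_size // 2 + 1)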
Example #20
def compute_features_from_path(path):
    from tqdm import tqdm
    d = {}
    d['sp_list'] = []
    d['f0_list'] = []
    d['ap_list'] = []
    for id in tqdm(transcript[transcript.index.str.contains(hp.validpatt)].index):
        file = [s for s in os.listdir(path) if id in s][0]
        wav, fs = sf.read(path + file)
        f0, sp, ap = pw.wav2world(wav, fs)
        # mgc, lf0, vuv = mgc_lf0_vuv(f0, sp, ap, fs=fs)
        d['sp_list'].append(sp)
        d['f0_list'].append(f0)
        d['ap_list'].append(ap)
    return d
Example #21
def main(args):
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read('utterance/vaiueo2d.wav')
    # x, fs = librosa.load('utterance/vaiueo2d.wav', dtype=np.float64)

    # 1. A convenient way
    f0, sp, ap = pw.wav2world(x, fs)    # use default options
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                    channels_in_octave=2,
                    frame_period=args.frame_period,
                    speed=args.speed)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_without_f0_refinement.wav', _y, fs)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_with_f0_refinement.wav', y, fs)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    # librosa.output.write_wav('test/y_harvest_with_f0_refinement.wav', y_h, fs)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # Comparison
    savefig('test/wavform.png', [x, _y, y])
    savefig('test/sp.png', [_sp, sp])
    savefig('test/ap.png', [_ap, ap], log=False)
    savefig('test/f0.png', [_f0, f0])

    print('Please check "test" directory for output files')
Example #22
    def world(self):
        """Extracts vocoder features using WORLD.

        Note: unvoiced frames are marked with 0.0 in the F0 contour.

        Returns:
            (np.ndarray[n_frames]): fundamental frequency,
            (np.ndarray[n_frames, sp_dim]): smoothed spectrogram,
            (np.ndarray[n_frames, ap_dim]): aperiodicity.
        """
        nbits = self.data.itemsize * 8
        int_ceiling = 2**(nbits - 1)
        float_data = self.data.astype(np.float64) / int_ceiling
        f0, smoothed_spectrogram, aperiodicity = pyworld.wav2world(
            float_data, self.sample_rate)
        return f0, smoothed_spectrogram, aperiodicity
Example #23
 def actuar(self, text):
     # Spanish constants: VOZ = voice, ARCHIVO = audio file, GRAVEDAD = pitch
     # divisor, ATENUACION_DEL_VOLUMEN = volume attenuation,
     # VELOCIDAD_DEL_DISCURSO = speech-rate divisor.
     command = 'espeak -v ' + VOZ + ' "' + text + '" -w ' + ARCHIVO
     os.system(command)
     x, fs = sf.read(ARCHIVO)
     f0, sp, ap = pw.wav2world(x, fs)
     # Lowering the synthesis sample rate slows the speech; fs must stay an int.
     yy = pw.synthesize(f0 / GRAVEDAD, sp / ATENUACION_DEL_VOLUMEN, ap,
                        int(fs / VELOCIDAD_DEL_DISCURSO),
                        pw.default_frame_period)
     sf.write(ARCHIVO, yy, fs)
     mixer.init()
     mixer.music.load(ARCHIVO)
     mixer.music.play()
     while mixer.music.get_busy():
         pygame.time.Clock().tick(10)
     mixer.quit()
     print("robot:$ " + text)
Example #24
def file_to_sac(input_file):
    audio, fs = sf.read(input_file)
    vocals = np.array(audio[:, 1])  # second channel of a stereo recording
    feats = pw.wav2world(vocals, fs, frame_period=5.80498866)

    f0 = feats[0]
    # f0 = pitch.extract_f0_sac(vocals, config.fs, 0.00580498866)
    y = 69 + 12 * np.log2(f0 / 440)
    # y = hertz_to_new_base(f0)
    nans, x = nan_helper(y)
    naners = np.isinf(y)
    y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    # y=[float(x-(min_note-1))/float(max_note-(min_note-1)) for x in y]
    y = np.array(y).reshape([len(y), 1])
    guy = np.array(naners).reshape([len(y), 1])
    y = np.concatenate((y, guy), axis=-1)
    return y
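The expression 69 + 12 * np.log2(f0 / 440) is the standard Hz-to-MIDI mapping; its inverse, needed to turn these contours back into f0 for synthesis, is a one-liner (sketch):

import numpy as np

def midi_to_hertz(note):
    # Inverts y = 69 + 12 * log2(f0 / 440).
    return 440.0 * 2.0 ** ((np.asarray(note) - 69.0) / 12.0)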
Example #25
def main(args):
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    # Read speech sample
    x, fs = sf.read(args.input)

    # 1. A convenient way
    f0, sp, ap = pw.wav2world(x, fs)  # use default options
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x,
                    fs,
                    f0_floor=50.0,
                    f0_ceil=600.0,
                    channels_in_octave=2,
                    frame_period=args.frame_period,
                    speed=args.speed)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # Comparison
    save_image('test/wavform.png', [x, _y, y])
    save_image('test/sp.png', [_sp, sp])
    save_image('test/ap.png', [_ap, ap], log=False)
    save_image('test/f0.png', [_f0, f0])
Example #26
    def load(cls, wavfile: str) -> Frq:
        path = pathlib.Path(wavfile).with_suffix(EXTENSION)

        if path.is_file():
            data = np.load(path)

        else:
            # NOTE: WORLD analysis only works on mono-channel float64 samples
            f0, sp, ap = pyworld.wav2world(
                *soundfile.read(wavfile, dtype="float64"))

            if not f0.nonzero()[0].size:
                raise RuntimeError(f"f0 estimation failed for {wavfile}!!!")

            data = {"f0": f0, "sp": sp, "ap": ap}

            np.savez(path, **data)

        return cls(**data)
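A usage sketch (path hypothetical; the first call analyzes and caches, later calls read the saved .npz):

frq = Frq.load('voice.wav')   # computes WORLD features and writes the cache
frq2 = Frq.load('voice.wav')  # second call loads the cached arrays instead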
Example #27
File: test.py  Project: yossy11/Vocoder
def main():
    # read
    x, fs = sf.read('Datas/vaiueo2d.wav')

    # extract features
    f0, sp, ap = pw.wav2world(x, fs)    # use default options

    # synthesize features
    y_default = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # write
    sf.write('test/default.wav', y_default, fs)

    y_f0_x2 = pw.synthesize(f0*2, sp, ap, fs, pw.default_frame_period)
    sf.write('test/f0_x2.wav', y_f0_x2, fs)
    y_sp_x2 = pw.synthesize(f0, sp*2, ap, fs, pw.default_frame_period)
    sf.write('test/sp_x2.wav', y_sp_x2, fs)
    y_ap_x2 = pw.synthesize(f0, sp, ap*2, fs, pw.default_frame_period)  # fix: double ap as the filename implies
    sf.write('test/ap_x2.wav', y_ap_x2, fs)
Example #28
def main(args):
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read('utterance/vaiueo2d.wav')
    # x, fs = librosa.load('utterance/vaiueo2d.wav', dtype=np.float64)

    # 1. A convenient way
    # NOTE: this snippet targets a legacy pyworld API; current releases return
    # only (f0, sp, ap) from wav2world and provide no pyDioOption.
    f0, sp, ap, pyDioOpt = pw.wav2world(x, fs)  # use default options
    y = pw.synthesize(f0, sp, ap, fs, pyDioOpt.option['frame_period'])

    # 2. Step by step
    pyDioOpt = pw.pyDioOption(f0_floor=50,
                              f0_ceil=600,
                              channels_in_octave=2,
                              frame_period=args.frame_rate,
                              speed=args.speed)

    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x, fs, pyDioOpt)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, pyDioOpt.option['frame_period'])
    # librosa.output.write_wav('test/y_without_f0_refinement.wav', _y, fs)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 With F0 refinement (using stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, pyDioOpt.option['frame_period'])
    # librosa.output.write_wav('test/y_with_f0_refinement.wav', y, fs)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # Comparison
    savefig('test/wavform.png', [x, _y, y])
    savefig('test/sp.png', [_sp, sp])
    savefig('test/ap.png', [_ap, ap], log=False)
    savefig('test/f0.png', [_f0, f0])

    print('Please check "test" directory for output files')
Example #29
def save_wav_ceps(fake_B, input_path, sample_path):
    length = 14000
    bps, wav_data = wav.read(input_path)
    datas = [
        wav_data[i:i + length, 0] for i in range(0, len(wav_data), length)
    ]
    chunks = []
    for (b, d) in zip(fake_B, datas):
        # WORLD analysis needs mono float64 input; assumes 16-bit PCM
        f0, _, ap = pw.wav2world(d.astype(np.float64) / 32768.0, bps)
        for cep in b:
            for i, Scep in enumerate(cep):
                # undo the feature scaling applied during training
                if i == 0:
                    Scep = (Scep * 28) - 20
                else:
                    Scep = (Scep * 7) - 3
                cep[i] = Scep
        sp = pysptk.mc2sp(b, 0.48, 2048)
        w = pw.synthesize(f0, sp, ap, bps)
        chunks.append(w)  # np.append on a fixed-size array would discard w
    wave = (np.concatenate(chunks) * 32768.0).astype('int16')
    wav.write(sample_path + '_fake.wav', bps, wave)
Example #30
def main(args):
    x, fs = sf.read('voice.wav')
    f0, sp, ap = pw.wav2world(x, fs)

    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)
    sf.write('test_f0/y_10_semplice.wav', y, fs)
    sf.write('test_f0+sp/y_10_semplice.wav', y, fs)

    for i in range(1, 20):
        if i != 10:
            _f0 = (i / 10) * np.array(f0)
            _y = pw.synthesize(_f0, sp, ap, fs, args.frame_period)
            sf.write('test_f0/y_' + str(i) + '.wav', _y, fs)

    for i in range(1, 20):
        if i != 10:
            _f0 = (i / 10) * np.array(f0)
            _sp = (i / 10) * np.array(sp)
            _y = pw.synthesize(_f0, _sp, ap, fs, args.frame_period)
            sf.write('test_f0+sp/y_' + str(i) + '.wav', _y, fs)

    print('Please check "test" directory for output files')
Example #31
def save_mcg_np(path):
    # Check if recording ID list (and thereby numpy representations)
    # have already been created
    if os.path.isfile(os.path.join(path, 'rec_ids.txt')):
        print('Recording ID list already exists. Assuming numpy arrays'
              ' exist as well. Skipping this folder.')
        return
    files = os.listdir(path)
    # Create a list of file endings to save as text file for later use
    rec_id_list = []
    # Iterate through all .wav files and save as mcep feature arrays
    for filename in files:
        if filename.endswith('.wav'):
            # str.rstrip strips a character set, not a suffix; slice instead
            rec_id_list.append(filename[:-len('.wav')][-3:])
            wav_path = os.path.join(path, filename)
            loaded_wav, _ = librosa.load(wav_path, sr=SAMPLING_RATE)
            # Use WORLD vocoder for spectral envelope
            _, sp, _ = pyworld.wav2world(loaded_wav.astype(np.double),
                                         fs=SAMPLING_RATE,
                                         frame_period=FRAME_PERIOD,
                                         fft_size=fft_size)
            # Extract MCEP features
            mgc = pysptk.sptk.mcep(sp,
                                   order=mcep_size,
                                   alpha=alpha,
                                   maxiter=0,
                                   etype=1,
                                   eps=1.0E-8,
                                   min_det=0.0,
                                   itype=3)
            # Save as numpy
            np.save(os.path.join(path,
                                 filename[:-len('.wav')] + '.npy'),
                    mgc,
                    allow_pickle=False)
    # Save list of file endings
    with open(os.path.join(path, 'rec_ids.txt'), 'w') as rec_id_file:
        for rec_id in sorted(rec_id_list):
            rec_id_file.write(rec_id + '\n')