示例#1
0
def test_world_array_order():
    """Check that pyworld rejects strided (non C-contiguous) input arrays.

    The full analysis/synthesis chain is first run on the loaded waveform.
    Each pyworld entry point is then called with a ``[::2]`` view and must
    raise ``ValueError`` carrying the expected message.
    """
    wav = kwiiyatta.load_wav(dataset.CLB_WAV)

    # Baseline pass on the unmodified data; its outputs feed the calls below.
    f0, timeaxis = pyworld.dio(wav.data, wav.fs)
    f0 = pyworld.stonemask(wav.data, f0, timeaxis, wav.fs)
    spec = pyworld.cheaptrick(wav.data, f0, timeaxis, wav.fs)
    ape = pyworld.d4c(wav.data, f0, timeaxis, wav.fs)
    pyworld.synthesize(f0, spec, ape, wav.fs)

    strided = wav.data[::2]
    expected_msg = 'ndarray is not C-contiguous'

    # One thunk per entry point, evaluated lazily inside pytest.raises.
    rejected_calls = (
        lambda: pyworld.dio(strided, wav.fs),
        lambda: pyworld.stonemask(strided, f0, timeaxis, wav.fs),
        lambda: pyworld.cheaptrick(strided, f0, timeaxis, wav.fs),
        lambda: pyworld.d4c(strided, f0, timeaxis, wav.fs),
        lambda: pyworld.synthesize(f0[::2], spec[::2], ape[::2], wav.fs),
    )
    for call in rejected_calls:
        with pytest.raises(ValueError) as e:
            call()
        assert expected_msg == str(e.value)
示例#2
0
    def main(args):
        """Run the WORLD analysis/synthesis demo and write its outputs to ``test``.

        The ``test`` directory is recreated from scratch.  The input recording
        is analysed with DIO (raw and StoneMask-refined) and with Harvest,
        each variant is resynthesised to a WAV file, and comparison plots of
        the two DIO variants are saved.

        Args:
            args: Object providing ``frame_period`` and ``speed``; both are
                forwarded to ``pw.dio`` and ``frame_period`` also to
                ``pw.synthesize``.
        """
        # Start from an empty output directory.
        if os.path.isdir('test'):
            rmtree('test')
        os.mkdir('test')

        wav_path = "./{}.wav".format(edited_files["Thinking_Out_Loud"])
        x, fs = sf.read(wav_path)

        # 1. One-call analysis/synthesis with default options.  The results
        #    are not used by the steps below.
        quick_f0, quick_sp, quick_ap = pw.wav2world(x, fs)
        pw.synthesize(quick_f0, quick_sp, quick_ap, fs, pw.default_frame_period)

        # 2. Step by step
        # 2-1 DIO without F0 refinement
        frame_period = args.frame_period
        _f0, t = pw.dio(
            x,
            fs,
            f0_floor=50.0,
            f0_ceil=600.0,
            channels_in_octave=2,
            frame_period=frame_period,
            speed=args.speed,
        )
        _sp = pw.cheaptrick(x, _f0, t, fs)
        _ap = pw.d4c(x, _f0, t, fs)
        _y = pw.synthesize(_f0, _sp, _ap, fs, frame_period)
        sf.write('test/y_without_f0_refinement.wav', _y, fs)

        # 2-2 DIO with F0 refinement (StoneMask)
        f0 = pw.stonemask(x, _f0, t, fs)
        sp = pw.cheaptrick(x, f0, t, fs)
        ap = pw.d4c(x, f0, t, fs)
        y = pw.synthesize(f0, sp, ap, fs, frame_period)
        sf.write('test/y_with_f0_refinement.wav', y, fs)

        # 2-3 Harvest with F0 refinement (StoneMask)
        harvest_raw_f0, harvest_t = pw.harvest(x, fs)
        harvest_f0 = pw.stonemask(x, harvest_raw_f0, harvest_t, fs)
        harvest_sp = pw.cheaptrick(x, harvest_f0, harvest_t, fs)
        harvest_ap = pw.d4c(x, harvest_f0, harvest_t, fs)
        harvest_y = pw.synthesize(harvest_f0, harvest_sp, harvest_ap, fs,
                                  pw.default_frame_period)
        sf.write('test/y_harvest_with_f0_refinement.wav', harvest_y, fs)

        # Comparison plots (raw vs. refined DIO)
        savefig('test/wavform.png', [x, _y, y])
        savefig('test/sp.png', [_sp, sp])
        savefig('test/ap.png', [_ap, ap], log=False)
        savefig('test/f0.png', [_f0, f0])

        print('Please check "test" directory for output files')
def main(args):
    """Analyse ``utterance/vaiueo2d.wav`` with WORLD and write results to ``test``.

    The ``test`` directory is deleted if present and recreated.  Three F0
    tracks (raw DIO, StoneMask-refined DIO, StoneMask-refined Harvest) are
    each turned into spectral envelope, aperiodicity and a resynthesised WAV
    file; plots comparing the two DIO variants are saved at the end.

    Args:
        args: Object providing ``frame_period`` and ``speed`` for ``pw.dio``.
    """
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read('utterance/vaiueo2d.wav')

    def resynthesize(f0_track, times, period, wav_name):
        """Estimate sp/ap for ``f0_track``, synthesise and write the WAV file."""
        envelope = pw.cheaptrick(x, f0_track, times, fs)
        aperiodicity = pw.d4c(x, f0_track, times, fs)
        audio = pw.synthesize(f0_track, envelope, aperiodicity, fs, period)
        sf.write(wav_name, audio, fs)
        return envelope, aperiodicity, audio

    # 1. The convenient way: one call with default options.  Its results are
    #    not used afterwards.
    default_f0, default_sp, default_ap = pw.wav2world(x, fs)
    pw.synthesize(default_f0, default_sp, default_ap, fs,
                  pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    dio_options = dict(f0_floor=50.0, f0_ceil=600.0, channels_in_octave=2,
                       frame_period=args.frame_period, speed=args.speed)
    _f0, t = pw.dio(x, fs, **dio_options)
    _sp, _ap, _y = resynthesize(
        _f0, t, args.frame_period, 'test/y_without_f0_refinement.wav')

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp, ap, y = resynthesize(
        f0, t, args.frame_period, 'test/y_with_f0_refinement.wav')

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    resynthesize(f0_h, t_h, pw.default_frame_period,
                 'test/y_harvest_with_f0_refinement.wav')

    # Comparison
    for png_name, series, extra in (
            ('test/wavform.png', [x, _y, y], {}),
            ('test/sp.png', [_sp, sp], {}),
            ('test/ap.png', [_ap, ap], {'log': False}),
            ('test/f0.png', [_f0, f0], {}),
    ):
        savefig(png_name, series, **extra)

    print('Please check "test" directory for output files')
示例#4
0
def collect_features(x, fs):
    """Return mel-cepstral coefficients for waveform ``x`` sampled at ``fs``.

    F0 is estimated with DIO and refined with StoneMask, the spectral
    envelope comes from CheapTrick, and pysptk converts it to mel-cepstrum.
    ``frame_period``, ``order`` and ``alpha`` are read from the enclosing
    module.
    """
    samples = x.astype(np.float64)
    raw_f0, frame_times = pyworld.dio(samples, fs, frame_period=frame_period)
    refined_f0 = pyworld.stonemask(samples, raw_f0, frame_times, fs)
    envelope = pyworld.cheaptrick(samples, refined_f0, frame_times, fs)
    return pysptk.sp2mc(envelope, order=order, alpha=alpha)
def analysis_resynthesis(signal):
    """Pitch-shift and formant-shift ``signal`` via WORLD analysis/resynthesis.

    ``sample_rate``, ``f0_rate`` and ``sp_rate`` are read from the enclosing
    module.  F0 is scaled by ``f0_rate``; the spectral envelope is remapped
    along the frequency axis according to ``sp_rate``; aperiodicity is reused
    unchanged.

    Returns:
        The waveform returned by ``pw.synthesize``.
    """
    # Acoustic feature extraction
    raw_f0, frame_times = pw.dio(signal, sample_rate)  # F0 estimation
    refined_f0 = pw.stonemask(signal, raw_f0, frame_times, sample_rate)
    envelope = pw.cheaptrick(signal, refined_f0, frame_times, sample_rate)
    aperiodicity = pw.d4c(signal, refined_f0, frame_times, sample_rate)

    # Pitch shift
    shifted_f0 = f0_rate * refined_f0

    # Formant shift (uniform remapping of the frequency axis).  Bins at or
    # above ``warp_limit`` are copied through unchanged.
    warped = np.zeros_like(envelope)
    n_bins = warped.shape[1]
    warp_limit = int(n_bins * sp_rate)
    for dst in range(n_bins):
        if dst >= warp_limit:
            src = dst
        elif sp_rate >= 1.0:
            src = int(dst / sp_rate)
        else:
            src = int(sp_rate * dst)
        warped[:, dst] = envelope[:, src]

    # Resynthesis
    return pw.synthesize(shifted_f0, warped, aperiodicity, sample_rate)
def analyze(wav,
            fs=FS,
            minf0=MINF0,
            maxf0=MAXF0,
            fperiod=SHIFTMS,
            fftl=FFTL,
            f0=None,
            time_axis=None):
    """
    WORLD analysis with a fixed 60 Hz F0 floor and no explicit F0 ceiling.

    Args:
        wav: Waveform handed to every WORLD call.
        fs: Sampling rate.
        minf0: Accepted for signature compatibility; never used.
        maxf0: Accepted for signature compatibility; never used.
        fperiod: Frame period forwarded to ``pw.harvest``.
        fftl: FFT size forwarded to ``pw.cheaptrick`` and ``pw.d4c``.
        f0: Optional precomputed F0 contour.
        time_axis: Optional time axis matching ``f0``.
    Returns:
        (time_axis, fundamental frequency, spectral envelope, aperiodicity)
    """
    # F0 is (re)estimated unless both f0 and its time axis were supplied.
    have_f0 = f0 is not None and time_axis is not None
    if not have_f0:
        raw_f0, time_axis = pw.harvest(
            wav, fs, f0_floor=60.0, frame_period=fperiod)
        f0 = pw.stonemask(wav, raw_f0, time_axis, fs)

    envelope = pw.cheaptrick(wav, f0, time_axis, fs, fft_size=fftl)
    aperiodicity = pw.d4c(wav, f0, time_axis, fs, fft_size=fftl)
    return time_axis, f0, envelope, aperiodicity
def analyze_range(wav,
                  fs=FS,
                  minf0=MINF0,
                  maxf0=MAXF0,
                  fperiod=SHIFTMS,
                  fftl=FFTL,
                  f0=None,
                  time_axis=None):
    """
    WORLD analysis with F0 search bounded by ``minf0`` and ``maxf0``.

    Args:
        wav: Waveform handed to every WORLD call.
        fs: Sampling rate.
        minf0: F0 floor forwarded to ``pw.harvest``.
        maxf0: F0 ceiling forwarded to ``pw.harvest``.
        fperiod: Frame period forwarded to ``pw.harvest``.
        fftl: FFT size forwarded to ``pw.cheaptrick`` and ``pw.d4c``.
        f0: Given f0. Estimated with Harvest + StoneMask when it or
            ``time_axis`` is missing.
        time_axis: Time axis matching ``f0``.
    Returns:
        (time_axis, fundamental frequency, spectral envelope, aperiodicity)
    """
    if None in (f0, time_axis) if False else (f0 is None or time_axis is None):
        harvest_options = dict(f0_floor=minf0, f0_ceil=maxf0,
                               frame_period=fperiod)
        # Harvest gives the coarse contour, StoneMask refines it.
        coarse_f0, time_axis = pw.harvest(wav, fs, **harvest_options)
        f0 = pw.stonemask(wav, coarse_f0, time_axis, fs)

    # Spectral envelope (CheapTrick) and aperiodicity (D4C) share the F0 track.
    sp, ap = (
        estimator(wav, f0, time_axis, fs, fft_size=fftl)
        for estimator in (pw.cheaptrick, pw.d4c)
    )
    return time_axis, f0, sp, ap
def anonymization(fs, waveNDArray, f0Value = 0, sp_strechRatio = np.random.uniform(0.6, 2, size=1), gaussian_s = 3):
    """
    Resynthesise a waveform with its F0 contour replaced by a constant value.

    Used to derive "input" audio from "label" audio with speaker information
    reduced.

    :param fs: sampling rate passed to every WORLD call
    :param waveNDArray: input waveform; converted to float64 before analysis
    :param f0Value: constant F0 assigned to every frame at synthesis time
    :param sp_strechRatio: frequency-axis stretch ratio (scalar or size-1 array)
        for the intermediate envelope.  NOTE(review): the default is sampled
        once, when the function is defined, not on every call.
    :param gaussian_s: sigma of the Gaussian filter applied to the noised features
    :return: waveform synthesised from the constant F0 and the ORIGINAL sp/ap
    """
    # np.float was removed in NumPy 1.24; float64 is the type it aliased.
    waveNDArray = waveNDArray.astype(np.float64)
    _f0, t = pw.dio(waveNDArray, fs)  # F0 estimation
    f0 = pw.stonemask(waveNDArray, _f0, t, fs)  # F0 refinement
    sp = pw.cheaptrick(waveNDArray, f0, t, fs)  # spectral envelope
    ap = pw.d4c(waveNDArray, f0, t, fs)  # aperiodicity
    f0_fixed0 = np.ones(f0.shape) * f0Value
    sp_median = np.median(sp)
    ap_median = np.median(ap)

    # Stretch the envelope along the frequency axis.  The ratio is unwrapped
    # to a plain scalar first: int() on a 1-element array is deprecated.
    ratio = np.asarray(sp_strechRatio).item()
    n_bins = sp.shape[1]
    sp2 = np.ones_like(sp)*np.min(sp)
    for f in range(sp2.shape[1]):
        src = int(f / ratio)
        if src >= n_bins:
            break
        sp2[:, f] = sp[:, src]

    # Add normally distributed noise to SP/AP
    sp_noised = sp2 + np.random.normal(sp_median,sp_median/10,sp2.shape)
    ap_noised = ap + np.random.normal(ap_median,ap_median/10,ap.shape)

    # Gaussian smoothing (scipy.ndimage.filters is a deprecated namespace)
    sp_gaussian = scipy.ndimage.gaussian_filter(sp_noised,gaussian_s)
    ap_gaussian = scipy.ndimage.gaussian_filter(ap_noised,gaussian_s)

    # NOTE(review): sp_gaussian / ap_gaussian are computed but synthesis below
    # uses the untouched sp / ap, exactly as the original did.  Confirm whether
    # the processed features were meant to be used before changing this.
    synthesized = pw.synthesize(f0_fixed0, sp, ap, fs)
    return synthesized
示例#9
0
def extract_f0(wav_dir, speaker_id_pos=-4):
    """Extract log2-F0 for every accepted file under ``wav_dir`` and save as ``.npy``.

    Per-speaker F0 search bounds are read from ``jvs_speaker_info.json``.
    Each output path is derived from the input path by replacing ``wav24``
    with ``f0_24k`` and dropping ``.wav``.

    Args:
        wav_dir: Directory handed to ``get_list_of_files``.
        speaker_id_pos: Index of the speaker name among the ``/``-separated
            path components; also determines the output directory depth.
    """
    candidates = get_list_of_files(wav_dir)
    wav_file_list = list(filter(file_filters, candidates))
    with open("jvs_speaker_info.json", "r") as info_file:
        speaker_info = json.load(info_file)

    for wav_path in progressbar(wav_file_list, redirect_stdout=True):
        print(wav_path)
        speaker_name = wav_path.split("/")[speaker_id_pos]
        samples, fs = librosa.load(wav_path, sr=None)
        samples = samples.astype(np.float64)

        raw_f0, times = pyworld.dio(
            samples,
            fs,
            f0_floor=speaker_info[speaker_name]["f0_min"],
            f0_ceil=speaker_info[speaker_name]["f0_max"],
            frame_period=12.5)
        f0 = pyworld.stonemask(samples, raw_f0, times, fs)

        # Clamp values below 1 Hz to 1 Hz so that log2 maps them to 0.
        below_one = f0 < 1.0
        f0[below_one] = 1.0
        f0 = np.log2(f0).astype(np.float32)

        out_path = wav_path.replace("wav24", "f0_24k").replace(".wav", "")
        out_tokens = out_path.split('/')
        output_dir = "/".join(out_tokens[:speaker_id_pos + 1])
        if not exists(output_dir):
            os.makedirs(output_dir)
        np.save(join(output_dir, out_tokens[-1]), f0)
    print("Finished!")
示例#10
0
def data_extraction(np_data, rate):
    """Extract WORLD features (f0, spectral envelope, aperiodicity).

    Args:
        np_data: Waveform array; converted to float64 before analysis.
        rate: Sampling rate passed to every WORLD call.

    Returns:
        Tuple ``(f0, sp, ap)`` where ``f0`` is the StoneMask-refined Harvest
        contour.
    """
    # np.float was removed in NumPy 1.24; float64 is the type it aliased.
    np_data = np_data.astype(np.float64)
    _f0, t = pw.harvest(np_data, rate)
    f0 = pw.stonemask(np_data, _f0, t, rate)
    sp = pw.cheaptrick(np_data, f0, t, rate)
    ap = pw.d4c(np_data, f0, t, rate)
    return f0, sp, ap
示例#11
0
 def collect_features(self, wav_path):
     """Extract a stacked per-frame feature matrix from one audio file.

     The file is loaded with librosa at ``self.target_sr`` (mono, float64)
     and analysed with WORLD: DIO + StoneMask for F0, CheapTrick for the
     spectral envelope, D4C for aperiodicity.  pysptk converts the envelope
     to mel-cepstrum of order 59.

     Args:
         wav_path: Path of the audio file to analyse.

     Returns:
         float32 array stacking, column-wise, ``f0``, interpolated ``lf0``,
         ``vuv``, ``bap``, ``mgc`` and ``spec``.
     """
     # x: raw audio samples; fs: the sampling rate librosa actually returned
     x, fs = librosa.load(wav_path, sr=self.target_sr, mono=True, dtype=np.float64)

     # Use the returned fs everywhere so DIO and the later stages agree.
     f0, timeaxis = pyworld.dio(x, fs, frame_period=self.hop_sz_in_ms)
     f0 = pyworld.stonemask(x, f0, timeaxis, fs)

     # lf0: log(f0) on non-zero frames, zero elsewhere
     lf0 = f0.copy()
     voiced = np.nonzero(f0)
     lf0[voiced] = np.log(f0[voiced])
     # The voiced/unvoiced flag must be taken BEFORE interpolation:
     # interp1d fills the zero gaps, after which ``lf0 != 0`` no longer
     # identifies the unvoiced frames.
     vuv = (lf0 != 0).astype(np.float32)
     lf0 = interp1d(lf0, kind="slinear")

     # spec: spectral envelope, bap: coded aperiodicity, mgc: mel-cepstrum
     spec = pyworld.cheaptrick(x, f0, timeaxis, fs)
     aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
     bap = pyworld.code_aperiodicity(aperiodicity, fs)
     mgc = pysptk.sp2mc(spec, order=59, alpha=pysptk.util.mcepalpha(fs))

     # Stack all streams along the feature axis.
     features = np.hstack((f0[:,None], lf0[:,None], vuv[:,None], bap, mgc, spec))
     return features.astype(np.float32)
示例#12
0
def wav2world(wavfile, frame_period):
    """Load ``wavfile`` and return stacked WORLD features ``[mgc, lf0, vuv, bap]``.

    The F0 estimator (Harvest, or DIO + StoneMask) is selected by
    ``hp.use_harvest``.  As a side effect ``hp.num_bap`` is set to the number
    of coded-aperiodicity columns.

    Args:
        wavfile: Path of the audio file, loaded at ``hp.sample_rate``.
        frame_period: Frame period forwarded to the F0 estimator.

    Returns:
        float32 feature matrix with one row per frame.
    """
    wav, fs = librosa.load(wavfile, sr=hp.sample_rate, dtype=np.float64)

    if not hp.use_harvest:
        raw_f0, timeaxis = pyworld.dio(wav, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(wav, raw_f0, timeaxis, fs)
    else:
        f0, timeaxis = pyworld.harvest(wav, fs, frame_period=frame_period)

    spectrogram = pyworld.cheaptrick(wav, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(wav, f0, timeaxis, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    hp.num_bap = bap.shape[1]
    mgc = pysptk.sp2mc(spectrogram, order=hp.num_mgc - 1,
                       alpha=pysptk.util.mcepalpha(fs))

    # Column vectors: f0 and its log, with zero frames left at zero.
    f0 = f0[:, None]
    lf0 = f0.copy()
    voiced = np.nonzero(f0)
    lf0[voiced] = np.log(f0[voiced])

    if not hp.use_harvest:
        vuv = (lf0 != 0).astype(np.float32)
    else:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]

    return np.hstack((mgc, lf0, vuv, bap)).astype(np.float32)
示例#13
0
def convert(signal, f0_rate=2.4, sp_rate=0.78, sample_rate=16000):
    """Pitch-shift and formant-shift ``signal`` via WORLD analysis/resynthesis.

    The three tuning values used to be hard-coded; they are now keyword
    parameters whose defaults reproduce the previous behaviour.

    Args:
        signal: Input waveform handed to pyworld.
        f0_rate: Multiplier applied to the refined F0 contour.
        sp_rate: Ratio controlling how spectral-envelope bins are remapped
            along the frequency axis.
        sample_rate: Sampling rate of ``signal``.

    Returns:
        The waveform returned by ``pyworld.synthesize``.
    """
    f0, t = pyworld.dio(signal, sample_rate)
    f0 = pyworld.stonemask(signal, f0, t, sample_rate)
    sp = pyworld.cheaptrick(signal, f0, t, sample_rate)
    ap = pyworld.d4c(signal, f0, t, sample_rate)

    modified_f0 = f0_rate * f0

    # Formant shift (uniform remapping of the frequency axis).  The result is
    # filled column by column into a fresh array shaped like ``sp``; bins at
    # or above ``sp_range`` are copied through unchanged.
    modified_sp = np.zeros_like(sp)
    n_bins = modified_sp.shape[1]
    sp_range = int(n_bins * sp_rate)
    for f in range(n_bins):
        if f >= sp_range:
            src = f
        elif sp_rate >= 1.0:
            src = int(f / sp_rate)
        else:
            src = int(sp_rate * f)
        modified_sp[:, f] = sp[:, src]

    y = pyworld.synthesize(modified_f0, modified_sp, ap, sample_rate)

    return y
示例#14
0
 def extract_spectrum(self, x, sample_rate):
     """Return ``(sp, ap, f0)`` from a WORLD analysis with a 12.5 ms frame period.

     ``x`` is converted with ``np.asarray`` first.  F0 is estimated with DIO
     and refined with StoneMask before the envelope (CheapTrick) and
     aperiodicity (D4C) are computed.
     """
     samples = np.asarray(x)
     raw_f0, times = pw.dio(samples, sample_rate, frame_period=12.5)
     refined_f0 = pw.stonemask(samples, raw_f0, times, sample_rate)
     envelope = pw.cheaptrick(samples, refined_f0, times, sample_rate)
     aperiodicity = pw.d4c(samples, refined_f0, times, sample_rate)
     return envelope, aperiodicity, refined_f0
示例#15
0
 def extract_f0(self, **kwargs):
     """Return the refined F0 contour, computing and caching it on first use.

     Args:
         **kwargs: Extra options forwarded to ``pyworld.dio``.  They only
             take effect on the call that actually computes F0; once a
             contour is cached it is returned as-is.

     Returns:
         The StoneMask-refined F0 stored in ``self._f0``.
     """
     if self._f0 is None:
         raw_f0, timeaxis = pyworld.dio(
             self.data, self.fs, frame_period=self.frame_period, **kwargs)
         # Refine before touching the cache: previously the raw DIO track was
         # stored in self._f0 first, so a StoneMask failure left an unrefined
         # contour cached and later calls silently returned it.
         refined_f0 = pyworld.stonemask(self.data, raw_f0, timeaxis, self.fs)
         self._timeaxis = timeaxis
         self._f0 = refined_f0
     return self._f0
示例#16
0
def get_conversion_data(audiodata, fs, refine_f0):
    """
    Extract conversion features (sp, ap, f0) for each waveform, without warping.

    :param audiodata: iterable of waveforms, each handed to pyworld
    :param fs: sampling rate shared by all waveforms
    :param refine_f0: when truthy, refine the DIO contour with StoneMask
    :return: list with one dict per waveform holding 'sp', 'ap', 'f0', 'fs'
        and 'sr' (the last two both carry ``fs``)
    """
    logging.info("Start building speaker A dictionary: Extracting feature for conversion (sp, ap, f0)")

    features = []
    for audio in tqdm(audiodata):
        raw_f0, times = pw.dio(audio, fs)  # raw pitch extractor
        # Optional pitch refinement
        f0 = pw.stonemask(audio, raw_f0, times, fs) if refine_f0 else raw_f0

        entry = {
            'sp': pw.cheaptrick(audio, f0, times, fs),  # smoothed spectrogram
            'ap': pw.d4c(audio, f0, times, fs),  # aperiodicity
            'f0': f0,
            'fs': fs,
            'sr': fs
        }
        features.append(entry)

    return features
示例#17
0
def wav2pw(wavfile, sr=SR, fft_size=FFT_SIZE, frame_period=FRAME_PERIOD):
    """Load ``wavfile`` (mono, float64, resampled to ``sr``) and return ``(f0, sp, ap)``.

    F0 comes from Harvest refined by StoneMask; ``fft_size`` is forwarded to
    CheapTrick and D4C.
    """
    samples, _ = librosa.load(wavfile, sr=sr, mono=True, dtype=np.float64)
    coarse_f0, times = pw.harvest(samples, sr, frame_period=frame_period)
    f0 = pw.stonemask(samples, coarse_f0, times, sr)
    envelope, aperiodicity = (
        estimate(samples, f0, times, sr, fft_size=fft_size)
        for estimate in (pw.cheaptrick, pw.d4c)
    )
    return f0, envelope, aperiodicity
示例#18
0
def get_features(filename, *, winlen, winstep, n_mcep, mcep_alpha, minf0,
                 maxf0, type):
    """Compute cepstral features stacked with F0 for one audio file.

    Args:
        filename: Path of the audio file, loaded at its native rate.
        winlen: Window length in seconds (used for the MFCC FFT size).
        winstep: Hop in seconds; also sets the WORLD frame period.
        n_mcep: Number of cepstral coefficients.
        mcep_alpha: Warping coefficient passed to ``sptk.sp2mc``.
        minf0: F0 floor for Harvest and CheapTrick.
        maxf0: F0 ceiling for Harvest.
        type: ``'mcc'`` selects WORLD mel-cepstrum; anything else MFCC.

    Returns:
        Tuple ``(id, x, h)``: file stem, float waveform, padded feature matrix.
    """
    wav, sr = load(filename, sr=None)
    x = wav.astype(float)

    # F0: Harvest followed by StoneMask refinement
    coarse_f0, t = world.harvest(x,
                                 sr,
                                 f0_floor=minf0,
                                 f0_ceil=maxf0,
                                 frame_period=winstep * 1000)
    f0 = world.stonemask(x, coarse_f0, t, sr)

    window_size = int(sr * winlen)
    hop_size = int(sr * winstep)

    # Cepstral stream
    if type != 'mcc':
        cep = mfcc(x, sr, n_mfcc=n_mcep, n_fft=window_size, hop_length=hop_size)
    else:
        spec = world.cheaptrick(x, f0, t, sr, f0_floor=minf0)
        cep = sptk.sp2mc(spec, n_mcep - 1, mcep_alpha).T

    # Append F0 as the last row, then pad to a fixed frame count.
    stacked = np.vstack((cep, f0))
    maxlen = len(x) // hop_size + 2
    stacked = repeat_last_padding(stacked, maxlen)

    utt_id = os.path.basename(filename).replace(".wav", "")
    return (utt_id, x, stacked)
示例#19
0
    def compute_f0(self, x: np.ndarray) -> np.ndarray:
        """Estimate the pitch (f0) contour of a waveform with WORLD.

        The frame period is derived from ``hop_length`` and ``sample_rate``
        so that the contour lines up with the spectrogram frames.

        Args:
            x (np.ndarray): Waveform.

        Returns:
            np.ndarray: StoneMask-refined DIO pitch contour.

        Examples:
            >>> WAV_FILE = filename = librosa.util.example_audio_file()
            >>> from TTS.config import BaseAudioConfig
            >>> from TTS.utils.audio import AudioProcessor
            >>> conf = BaseAudioConfig(pitch_fmax=8000)
            >>> ap = AudioProcessor(**conf)
            >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
            >>> pitch = ap.compute_f0(wav)
        """
        assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before caling `compute_f0`."
        hop = self.hop_length
        # Pad by half a hop when the length is an exact multiple of the hop,
        # to align the F0 length with the spectrogram length.
        if len(x) % hop == 0:
            x = np.pad(x, (0, hop // 2), mode="reflect")

        samples = x.astype(np.double)
        raw_f0, times = pw.dio(
            samples,
            fs=self.sample_rate,
            f0_ceil=self.pitch_fmax,
            frame_period=1000 * hop / self.sample_rate,
        )
        return pw.stonemask(samples, raw_f0, times, self.sample_rate)
def raw2WORLDfeatures(signal, fs=16000, fft_size=1024):
    """Return ``(f0, spectra, aperiodicity)`` for ``signal`` using WORLD.

    F0 is searched by DIO with a 500 Hz ceiling and refined by StoneMask;
    ``fft_size`` is forwarded to CheapTrick and D4C.
    """
    raw_contour, times = pw.dio(signal, fs, f0_ceil=500)
    f0 = pw.stonemask(signal, raw_contour, times, fs)
    analysis_args = (signal, f0, times, fs)
    spectra = pw.cheaptrick(*analysis_args, fft_size=fft_size)
    aperiodicity = pw.d4c(*analysis_args, fft_size=fft_size)
    return f0, spectra, aperiodicity
示例#21
0
def save_features_to_array(path = Data_Directory):
	"""Extract features for every label directory under ``path`` and save them.

	For each label returned by ``get_labels`` the WAV files in that label's
	directory are analysed with WORLD and ``wav2mfcc``; the MFCC and F0
	lists are written to ``features/mfcc_<label>.npy`` and
	``features/fundfreq_<label>.npy``.

	Args:
		path: Root data directory, with or without a trailing separator.
	"""
	labels, _, _ = get_labels(path)
	print(labels)
	for label in labels:
		fundfreq_vectors = []
		ap_vectors = []
		mfcc_vectors = []
		sp_vectors = []

		# Build the directory once and derive file paths from it.  The
		# previous code listed ``path + '/' + label`` but opened
		# ``path + label + ...``, which only agreed when ``path`` ended
		# with a separator.
		label_dir = os.path.join(path, label)
		wavfiles = [os.path.join(label_dir, wavfile) for wavfile in sorted(os.listdir(label_dir))]
		for wavfile in wavfiles:
			print(wavfile)
			x, fs = sf.read(wavfile)

			_f0, t = pw.dio(x, fs)
			f0 = pw.stonemask(x, _f0, t, fs)
			fundfreq_vectors.append(f0)

			sp = pw.cheaptrick(x, f0, t, fs)
			sp_vectors.append(sp)

			ap = pw.d4c(x, f0, t, fs)
			ap_vectors.append(ap)

			mfcc = wav2mfcc(wavfile, max_pad_len=120)
			mfcc_vectors.append(mfcc)
		# NOTE(review): sp_vectors and ap_vectors are collected but never
		# saved — confirm whether they should be written out as well.
		np.save('features/mfcc_' + label + '.npy', mfcc_vectors)
		np.save('features/fundfreq_' + label + '.npy', fundfreq_vectors)
示例#22
0
def get_para(data, fs):
    """Return ``(fo, sp, ap)`` for ``data``; a step-by-step equivalent of wav2world."""
    raw_fo, times = pw.dio(data, fs)  # fundamental frequency estimation
    refined_fo = pw.stonemask(data, raw_fo, times, fs)  # F0 refinement
    envelope = pw.cheaptrick(data, refined_fo, times, fs)  # spectral envelope
    aperiodicity = pw.d4c(data, refined_fo, times, fs)  # aperiodicity
    return refined_fo, envelope, aperiodicity
示例#23
0
    def collect_features(self, wav_path, label_path):
        """Build acoustic features for one utterance, with silence frames removed.

        WORLD analysis (DIO + StoneMask, CheapTrick, D4C) yields mgc, lf0, vuv
        and bap.  Delta windows are applied to mgc, lf0 and bap, the streams
        are stacked, and frames marked silent by the HTS label file are
        dropped.  ``frame_period``, ``order`` and ``windows`` are read from
        the enclosing module.

        Args:
            wav_path: Path of the WAV file.
            label_path: Path of the HTS label file used for trimming.

        Returns:
            float32 feature matrix.
        """
        fs, samples = wavfile.read(wav_path)
        samples = samples.astype(np.float64)

        raw_f0, timeaxis = pyworld.dio(samples, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(samples, raw_f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(samples, f0, timeaxis, fs)
        aperiodicity = pyworld.d4c(samples, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        mcep_alpha = pysptk.util.mcepalpha(fs)
        mgc = pysptk.sp2mc(spectrogram, order=order, alpha=mcep_alpha)

        # Log-F0 on non-zero frames; the V/UV flag is taken before the
        # zero gaps are interpolated away.
        f0 = f0[:, None]
        lf0 = f0.copy()
        voiced = np.nonzero(f0)
        lf0[voiced] = np.log(f0[voiced])
        vuv = (lf0 != 0).astype(np.float32)
        lf0 = interp1d(lf0, kind="slinear")

        mgc, lf0, bap = (
            apply_delta_windows(stream, windows) for stream in (mgc, lf0, bap)
        )
        features = np.hstack((mgc, lf0, vuv, bap))

        # Cut silence frames by HTS alignment
        labels = hts.load(label_path)
        features = features[:labels.num_frames()]
        silence = labels.silence_frame_indices()
        features = np.delete(features, silence, axis=0)

        return features.astype(np.float32)
示例#24
0
def generate_changed_voice(model, input_path):
    """Convert the voice in ``input_path`` with ``model`` and return the waveform.

    The WAV file is read, down-mixed to mono when it has more than one
    axis, and analysed with WORLD.  The mel-cepstrum (without its 0th
    coefficient) is smoothed, extended with delta features and fed to
    ``model.predict``; the prediction is recombined with the original 0th
    coefficient, converted back to a spectrum and synthesised with the
    original F0 and aperiodicity.
    """
    fs, samples = wavfile.read(input_path)
    samples = samples.astype(np.float64)
    if len(samples.shape) > 1:
        samples = samples.mean(axis=1)

    raw_f0, timeaxis = pyworld.dio(samples, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(samples, raw_f0, timeaxis, fs)
    source_spec = pyworld.cheaptrick(samples, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(samples, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mcep = pysptk.sp2mc(source_spec, order=hp.order, alpha=alpha)
    power = mcep[:, 0]
    model_in = mcep[:, 1:]

    model_in = P.modspec_smoothing(model_in, FS / HOP_LENGHT, cutoff=50)
    model_in = P.delta_features(model_in, hp.windows).astype(np.float32)

    predicted = model.predict(model_in)
    # Put the untouched 0th coefficient back in front of the prediction.
    predicted = np.hstack([power.reshape((-1, 1)), predicted])

    converted_spec = pysptk.mc2sp(
        predicted.astype(np.float64),
        alpha=alpha,
        fftlen=pyworld.get_cheaptrick_fft_size(fs))
    return pyworld.synthesize(
        f0, converted_spec, aperiodicity, fs, hp.frame_period)
示例#25
0
def wave2world(data):
    """
    Analyse ``data`` with WORLD at a fixed 44100 Hz rate and 10 ms frame period.

    Parameters
    ----------
    data : float64
        Waveform handed to pyworld.  NOTE(review): presumably sampled at
        44100 Hz with values in [-1.0, 1.0] — confirm against the caller.
    Returns
    -------
    _f0 : float64
        StoneMask-refined DIO contour.
    _cepstrum : float32
        CheapTrick output mapped through ``(log(v) + 7) / 9`` and clipped
        to [-1.0, 1.0].
    _aperiodicity : float64
        D4C output.
    """
    sampling_rate = 44100
    raw_f0, times = pw.dio(data, sampling_rate, frame_period=10)
    f0 = pw.stonemask(data, raw_f0, times, sampling_rate)

    envelope = pw.cheaptrick(data, f0, times, sampling_rate)
    scaled = np.clip((np.log(envelope) + 7) / 9, -1.0, 1.0)

    aperiodicity = pw.d4c(data, f0, times, sampling_rate)
    return f0, scaled.astype(np.float32), aperiodicity
示例#26
0
    def extract(wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order,
                alpha, dtype):
        """Build a validated ``AcousticFeature`` from ``wave`` using WORLD.

        Harvest + StoneMask provide F0; CheapTrick and D4C (both with
        ``fft_size=fft_length``) provide the envelope and aperiodicity, from
        which mel-cepstrum and coded aperiodicity are derived.  Float fields
        are cast to ``dtype`` before validation.
        """
        samples = wave.wave.astype(numpy.float64)
        fs = wave.sampling_rate

        coarse_f0, times = pyworld.harvest(
            samples,
            fs,
            frame_period=frame_period,
            f0_floor=f0_floor,
            f0_ceil=f0_ceil,
        )
        f0 = pyworld.stonemask(samples, coarse_f0, times, fs)
        sp = pyworld.cheaptrick(samples, f0, times, fs, fft_size=fft_length)
        ap = pyworld.d4c(samples, f0, times, fs, fft_size=fft_length)

        # A frame counts as voiced when its F0 is not zero.
        voiced = ~(f0 == 0)  # type: numpy.ndarray

        feature = AcousticFeature(
            f0=f0[:, None],
            sp=sp,
            ap=ap,
            coded_ap=pyworld.code_aperiodicity(ap, fs),
            mc=pysptk.sp2mc(sp, order=order, alpha=alpha),
            voiced=voiced[:, None],
        ).astype_only_float(dtype)
        feature.validate()
        return feature
示例#27
0
def get_target(x,fs,n_ap_channels,n_sp_channels):
    """Return aperiodicity and log-envelope targets resampled along frequency.

    WORLD analysis uses DIO (120-750 Hz, 8 ms frames) with StoneMask.
    Aperiodicity is rescaled as ``ap * 20 - 18`` and the envelope is
    log-transformed; each frame is then linearly interpolated from a
    1025-point axis onto ``n_ap_channels`` / ``n_sp_channels`` points.

    Returns:
        Tuple ``(_ap, _sp)`` of the resampled arrays.
    """
    raw_f0, t = pw.dio(x, fs, f0_floor=120.0, f0_ceil=750.0,
                       frame_period=8.0)
    f0_herz = pw.stonemask(x, raw_f0, t, fs)
    sp = pw.cheaptrick(x, f0_herz, t, fs)
    ap = pw.d4c(x, f0_herz, t, fs)

    n_frames = sp.shape[0]
    source_axis = np.arange(1025)

    def resample_frames(frames, n_channels):
        """Interpolate the first ``n_frames`` rows onto ``n_channels`` points."""
        query = np.linspace(0, 1025, n_channels)
        rows = [
            np.interp(query, source_axis, frames[i])[np.newaxis, :]
            for i in range(n_frames)
        ]
        return np.concatenate(rows, axis=0)

    _ap = resample_frames(ap * 20 - 18, n_ap_channels)
    _sp = resample_frames(np.log(sp), n_sp_channels)

    return _ap,_sp
示例#28
0
def pre_process(file_name, training_dir):
    """Prepare spectral, aperiodic, label and F0 data for one training file.

    Args:
        file_name: Base name (no extension) of the recording; the audio is
            read from ``<training_dir><file_name>.wav`` and the lyrics from
            ``<training_dir>Transcripts/<file_name>.txt``.
        training_dir: Directory prefix, concatenated as-is.

    Returns:
        List ``[spectral_data, aperiodic_data, label_data, frequency]``.
    """
    audio_file_name = training_dir + file_name + '.wav'
    lyrics_file_name = training_dir + 'Transcripts/' + file_name + '.txt'

    audio_data, sample_rate = soundfile.read(audio_file_name)
    # Pass the rates by keyword: librosa >= 0.10 made them keyword-only, so
    # the former positional call raises TypeError there.
    audio_data = librosa.resample(
        audio_data, orig_sr=sample_rate, target_sr=params.sample_rate)
    sample_rate = params.sample_rate

    # F0: Harvest estimate refined by StoneMask
    harvest_frequency, timing = pyworld.harvest(
        audio_data,
        sample_rate,
        f0_floor=params.min_freq,
        f0_ceil=params.max_freq,
        frame_period=params.frame_period)
    frequency = pyworld.stonemask(audio_data, harvest_frequency, timing,
                                  sample_rate)
    # The number of F0 frames is handed to the phoneme extraction step.
    audio_length = len(frequency)

    phoneme_data = extract_phoneme_data(
        [audio_file_name, lyrics_file_name, audio_length])

    frequency_data = process_frequency(frequency)

    # Phoneme and frequency columns side by side form the label data.
    label_data = pd.concat([phoneme_data, frequency_data], axis=1)

    spectral_data, aperiodic_data = extract_timbre_data(
        [audio_data, frequency, timing, sample_rate])

    return [spectral_data, aperiodic_data, label_data, frequency]
示例#29
0
def wavfile2pw(filename, f0_ceil=F0_CEIL, fs=FS, fft_size=FFT_SIZE):
    """Analyse a WAV file with PyWorld and return energy-normalised features.

    Follows the practice in https://github.com/JeremyCCHsu/vae-npvc.

    The spectral envelope is divided by the per-frame energy and converted
    to log10 scale before being returned.

    Args:
        filename: Path of the WAV file; librosa loads it as mono float64 and
            resamples it to ``fs``.
        f0_ceil: Upper F0 bound forwarded to ``pw.dio``.
        fs: Target sampling frequency.
        fft_size: FFT size forwarded to CheapTrick and D4C.

    Returns:
        f0: StoneMask-refined fundamental frequency.
        sp: log10 of the energy-normalised spectral envelope.
        ap: Aperiodicity.
        en: Per-frame energy, the sum over frequency of ``sp + EPSILON``
            with the frequency axis kept.
    """
    samples, _ = librosa.load(filename, sr=fs, mono=True, dtype=np.float64)

    raw_f0, times = pw.dio(samples, fs, f0_ceil=f0_ceil)
    f0 = pw.stonemask(samples, raw_f0, times, fs)
    envelope = pw.cheaptrick(samples, f0, times, fs, fft_size=fft_size)
    ap = pw.d4c(samples, f0, times, fs, fft_size=fft_size)

    en = np.sum(envelope + EPSILON, axis=1, keepdims=True)
    log_sp = np.log10(envelope / en)
    return f0, log_sp, ap, en
示例#30
0
def process_wav(wav_path):
    """Extract raw and coded WORLD features from one audio file at 32 kHz.

    Args:
        wav_path: Path of the audio file.

    Returns:
        Tuple ``(_f0, _sp, code_sp, _ap, code_ap)``: refined F0, spectral
        envelope, its ``code_harmonic`` coding, aperiodicity and its coded
        form.
    """
    # NOTE(review): soundfile documents samplerate/channels/subtype/endian as
    # arguments for headerless RAW input — confirm the expected file type.
    y, osr = sf.read(wav_path, subtype='PCM_16', channels=1, samplerate=48000,
                    endian='LITTLE') #, start=56640, stop=262560)

    sr = 32000
    if osr != sr:
        # Pass the rates by keyword: librosa >= 0.10 made them keyword-only,
        # so the former positional call raises TypeError there.
        y = librosa.resample(y, orig_sr=osr, target_sr=sr)

    # Estimate F0 with the Harvest algorithm, then refine with StoneMask
    _f0, t = pw.harvest(y, sr, f0_floor=f0_min, f0_ceil=f0_max, frame_period=pw.default_frame_period)
    _f0 = pw.stonemask(y, _f0, t, sr)
    print(_f0.shape)

    # Estimate the spectral envelope with the CheapTrick algorithm
    _sp = pw.cheaptrick(y, _f0, t, sr)

    code_sp = code_harmonic(_sp, 60)
    print(_sp.shape, code_sp.shape)
    # Estimate the aperiodicity
    _ap = pw.d4c(y, _f0, t, sr)

    code_ap = pw.code_aperiodicity(_ap, sr)
    print(_ap.shape, code_ap.shape)

    return _f0, _sp, code_sp, _ap, code_ap
示例#31
0
def pyworld_featurize(audiofile):
    """Compute summary statistics of WORLD features for one WAV file.

    Args:
        audiofile: Path of the WAV file to read.

    Returns:
        Tuple ``(features, labels)``: the concatenated outputs of ``stats``
        for raw pitch, refined pitch, smoothed spectrogram and aperiodicity,
        in that order.
    """
    fs, x = wav.read(audiofile)
    print(x)
    print(fs)
    # Multi-channel audio: keep the first channel only.  An explicit rank
    # check replaces the former bare ``except``, which hid unrelated errors.
    if x.ndim > 1:
        x = x[:, 0]
    x = np.ascontiguousarray(x, dtype=np.double)
    print(fs)
    print(x)

    _f0, t = pw.dio(x, fs)  # raw pitch extractor
    f0 = pw.stonemask(x, _f0, t, fs)  # pitch refinement
    sp = pw.cheaptrick(x, f0, t, fs)  # extract smoothed spectrogram
    ap = pw.d4c(x, f0, t, fs)  # extract aperiodicity

    features_0, labels_0 = stats(_f0, 'pitch')
    # Use the refined contour here; the raw ``_f0`` was passed before, which
    # made these statistics a duplicate of the 'pitch' ones.
    features_1, labels_1 = stats(f0, 'pitch_refinement')
    features_2, labels_2 = stats(sp, 'smoothed_spectrogram')
    features_3, labels_3 = stats(ap, 'aperiodicity')

    features = (list(features_0) + list(features_1)
                + list(features_2) + list(features_3))
    labels = labels_0 + labels_1 + labels_2 + labels_3

    return features, labels
示例#32
0
def wav2pw(x, fs=16000, fft_size=FFT_SIZE):
    """Extract WORLD features from waveform ``x``.

    The F0 ceiling comes from the module-level ``args.f0_ceil``.  Returns a
    dict with keys 'f0', 'sp' and 'ap'.
    """
    raw_f0, times = pw.dio(x, fs, f0_ceil=args.f0_ceil)  # raw pitch extractor
    refined_f0 = pw.stonemask(x, raw_f0, times, fs)  # pitch refinement
    envelope = pw.cheaptrick(x, refined_f0, times, fs, fft_size=fft_size)
    aperiodicity = pw.d4c(x, refined_f0, times, fs, fft_size=fft_size)
    return dict(f0=refined_f0, sp=envelope, ap=aperiodicity)
示例#33
0
    def collect_features(self, wav_path, label_path):
        """Build acoustic features for one utterance, with silence frames removed.

        The F0 estimator is chosen by ``hp_acoustic.use_harvest`` (Harvest, or
        DIO followed by StoneMask).  mgc, lf0 and bap receive delta features,
        are stacked with vuv, and frames marked silent by the HTS label file
        are dropped.  ``self.alpha`` is filled in from ``fs`` on first use.

        Args:
            wav_path: Path of the WAV file.
            label_path: Path of the HTS label file used for trimming.

        Returns:
            float32 feature matrix.
        """
        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)

        use_harvest = hp_acoustic.use_harvest
        f0_options = dict(frame_period=hp_acoustic.frame_period,
                          f0_floor=hp_acoustic.f0_floor,
                          f0_ceil=hp_acoustic.f0_ceil)
        if use_harvest:
            f0, timeaxis = pyworld.harvest(x, fs, **f0_options)
        else:
            f0, timeaxis = pyworld.dio(x, fs, **f0_options)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        if self.alpha is None:
            self.alpha = pysptk.util.mcepalpha(fs)
        mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order, alpha=self.alpha)

        # Log-F0 on non-zero frames only
        f0 = f0[:, None]
        lf0 = f0.copy()
        voiced = np.nonzero(f0)
        lf0[voiced] = np.log(f0[voiced])

        # V/UV is decided before lf0 is interpolated.
        if use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)
        lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

        # Parameter trajectory smoothing
        if hp_acoustic.mod_spec_smoothing:
            hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
            mgc = P.modspec_smoothing(
                mgc, fs / hop_length,
                cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

        mgc, lf0, bap = (
            P.delta_features(stream, hp_acoustic.windows)
            for stream in (mgc, lf0, bap)
        )
        features = np.hstack((mgc, lf0, vuv, bap))

        # Cut silence frames by HTS alignment
        labels = hts.load(label_path)
        features = features[:labels.num_frames()]
        silence = labels.silence_frame_indices()
        features = np.delete(features, silence, axis=0)

        return features.astype(np.float32)
示例#34
0
 def collect_features(self, wav_path):
     """Return smoothed mel-cepstrum with delta features for one WAV file.

     The envelope from WORLD (DIO + StoneMask, CheapTrick) has its all-zero
     frames trimmed, is converted to mel-cepstrum, loses its 0th
     coefficient, is smoothed with a 50 Hz modulation-spectrum cut-off and
     extended with delta features.  ``self.alpha`` is filled in from ``fs``
     on first use.
     """
     fs, samples = wavfile.read(wav_path)
     samples = samples.astype(np.float64)

     raw_f0, timeaxis = pyworld.dio(samples, fs, frame_period=hp.frame_period)
     f0 = pyworld.stonemask(samples, raw_f0, timeaxis, fs)
     envelope = P.trim_zeros_frames(
         pyworld.cheaptrick(samples, f0, timeaxis, fs))

     if self.alpha is None:
         self.alpha = pysptk.util.mcepalpha(fs)
     # Mel-cepstrum without the 0-th coefficient
     mgc = pysptk.sp2mc(envelope, order=hp.order, alpha=self.alpha)[:, 1:]

     # 50Hz cut-off MS smoothing
     hop_length = int(fs * (hp.frame_period * 0.001))
     mgc = P.modspec_smoothing(mgc, fs / hop_length, cutoff=50)

     # Add delta
     return P.delta_features(mgc, hp.windows).astype(np.float32)
示例#35
0
    def __call__(self, data: Wave, test=None):
        """Convert ``data`` into a validated ``AcousticFeature``.

        F0 is estimated with ``pyworld.dio`` when the configured method is
        ``'dio'`` and with ``world4py``'s Harvest otherwise, then refined by
        StoneMask.  Float fields are cast to ``self._dtype``.  ``test`` is
        accepted but not used.
        """
        x = data.wave.astype(numpy.float64)
        fs = data.sampling_rate

        # Both estimators take the same keyword options.
        f0_options = dict(
            frame_period=self._frame_period,
            f0_floor=self._f0_floor,
            f0_ceil=self._f0_ceil,
        )
        if self._f0_estimating_method == 'dio':
            estimate_f0 = pyworld.dio
        else:
            from world4py.np import apis
            estimate_f0 = apis.harvest
        raw_f0, t = estimate_f0(x, fs, **f0_options)

        f0 = pyworld.stonemask(x, raw_f0, t, fs)
        spectrogram = pyworld.cheaptrick(x, f0, t, fs)
        aperiodicity = pyworld.d4c(x, f0, t, fs)

        mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha)
        # A frame counts as voiced when its F0 is not zero.
        voiced = ~(f0 == 0)  # type: numpy.ndarray

        out_dtype = self._dtype
        feature = AcousticFeature(
            f0=f0[:, None].astype(out_dtype),
            spectrogram=spectrogram.astype(out_dtype),
            aperiodicity=aperiodicity.astype(out_dtype),
            mfcc=mfcc.astype(out_dtype),
            voiced=voiced[:, None],
        )
        feature.validate()
        return feature
示例#36
0
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Convert waveform ``x`` with ``model`` and resynthesise the result.

    Steps, as implemented below: pyworld analysis of ``x``, mel-cepstrum
    extraction with pysptk, modulation-spectrum smoothing plus delta
    features, scaling with ``data_mean``/``data_std``, a forward pass
    through ``model``, inverse scaling of the static prediction, and
    finally waveform synthesis.

    Args:
        model: Network to apply. It is switched to eval mode and must
            provide ``include_parameter_generation()``.
        x: Input waveform; cast to float64 before analysis.
        fs: Sampling rate of ``x``.
        data_mean: Mean passed to ``P.scale``; its first ``static_dim``
            entries are reused for the inverse scaling.
        data_std: Scale passed to ``P.scale``; sliced the same way as
            ``data_mean`` for the inverse scaling.
        diffvc: If true, synthesise by filtering ``x`` with the difference
            between predicted and source mel-cepstrum. Otherwise synthesise
            with ``pyworld.synthesize`` from the predicted mel-cepstrum and
            the source F0 / aperiodicity.

    Returns:
        Tuple ``(waveform, inputs, outputs)`` where ``inputs`` is a copy of
        the source static features (before scaling) and ``outputs`` is a
        copy of the denormalised static prediction.
    """
    model.eval()

    frame_period = hp.frame_period
    hop_length = int(fs * (hp.frame_period * 0.001))

    # --- Analysis of the source waveform ---
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mcep = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    # Keep the 0-th coefficient aside; the model only sees the rest.
    c0 = mcep[:, 0]
    static = mcep[:, 1:]
    static_dim = static.shape[-1]

    # 50Hz cut-off modulation-spectrum smoothing, then append deltas.
    smoothed = P.modspec_smoothing(static, fs / hop_length, cutoff=50)
    features = P.delta_features(smoothed, hp.windows).astype(np.float32)
    n_frames = features.shape[0]

    inputs = features[:, :static_dim].copy()

    # --- Model input: normalise, wrap, and add a batch axis ---
    scaled = Variable(torch.from_numpy(P.scale(features, data_mean, data_std)))
    lengths = [len(scaled)]
    batch = scaled.view(1, -1, scaled.size(-1))

    # Matrix used for MLPG
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, n_frames))

    # --- Forward pass ---
    if model.include_parameter_generation():
        # The model performs parameter generation itself;
        # multi-stream features cannot be used in this case.
        y_hat, y_hat_static = model(batch, R, lengths=lengths)
    else:
        # Generic model (may be a sequence model): run MLPG afterwards.
        assert hp.has_dynamic_features is not None
        y_hat = model(batch, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    # Back to numpy and to the original feature scale.
    predicted = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)
    predicted = P.inv_scale(
        predicted, data_mean[:static_dim], data_std[:static_dim])

    outputs = predicted.copy()

    # --- Synthesis ---
    if diffvc:
        # Filter the source waveform with the predicted-minus-source
        # mel-cepstrum.
        differential = predicted - features[:, :static_dim]
        mc = np.hstack((c0[:, None], differential))
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        # Rebuild a spectral envelope from the predicted mel-cepstrum and
        # synthesise with the source F0 and aperiodicity.
        mc = np.hstack((c0[:, None], predicted))
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, frame_period)

    return waveform, inputs, outputs