Exemplo n.º 1
0
def get_feature(wav_path, preprocessing=False, getsize=False):
    """Extract stacked WORLD acoustic features from a wav file.

    The waveform is analysed with pyworld (F0, spectral envelope,
    aperiodicity), converted to mel-generalized cepstrum (mgc), log-F0 (lf0),
    a voiced/unvoiced flag (vuv) and coded band aperiodicity (bap), and the
    streams are stacked column-wise as ``[mgc, lf0, vuv, bap]``.

    Args:
        wav_path: Path of the wav file to analyse.
        preprocessing: If true, save the feature matrix with ``np.save``
            instead of returning it. Takes precedence over ``getsize``.
        getsize: If true (and ``preprocessing`` is false), also return the
            first-axis sizes of the mgc, lf0 and bap streams.

    Returns:
        ``None`` when ``preprocessing`` is true; the tuple
        ``(features, mgc.shape[0], lf0.shape[0], bap.shape[0])`` when
        ``getsize`` is true; otherwise the feature matrix alone.
    """
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    if audio_world_config.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x,
            fs,
            frame_period=audio_world_config.frame_period,
            f0_floor=audio_world_config.f0_floor,
            f0_ceil=audio_world_config.f0_ceil)
    else:
        f0, timeaxis = pyworld.dio(
            x,
            fs,
            frame_period=audio_world_config.frame_period,
            f0_floor=audio_world_config.f0_floor,
            f0_ceil=audio_world_config.f0_ceil)
        # The DIO estimate is refined with StoneMask; Harvest output is
        # used as-is.
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram,
                       order=audio_world_config.mgc_dim,
                       alpha=alpha)

    # Log-F0 as a column vector; frames with f0 == 0 stay at 0.
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if audio_world_config.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    # The V/UV flag must be computed before the zero frames are interpolated.
    lf0 = P.interp1d(lf0, kind=audio_world_config.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if audio_world_config.mod_spec_smoothing:
        hop_length = int(fs * (audio_world_config.frame_period * 0.001))
        modfs = fs / hop_length
        mgc = P.modspec_smoothing(
            mgc, modfs, cutoff=audio_world_config.mod_spec_smoothing_cutoff)

    mgc = P.delta_features(mgc, audio_world_config.windows)
    lf0 = P.delta_features(lf0, audio_world_config.windows)
    bap = P.delta_features(bap, audio_world_config.windows)

    features = np.hstack((mgc, lf0, vuv, bap))
    if preprocessing:
        # np.save appends ".npy" to out_path when it has no such suffix.
        out_path = wav_path.replace(".wav", "").replace("wav", "world")
        np.save(out_path, features)
        return None
    if getsize:
        # Previously a bare expression on the undefined name `feature`
        # (NameError, nothing returned); now the tuple is actually returned.
        # NOTE(review): shape[0] is kept as originally written; if the
        # per-stream feature dimensions were intended, shape[1] would be the
        # axis to use -- confirm with the caller.
        return features, mgc.shape[0], lf0.shape[0], bap.shape[0]
    return features
Exemplo n.º 2
0
    def collect_features(self, wav_path, label_path):
        """Build the float32 acoustic feature matrix for one utterance.

        The wav file is analysed with pyworld, converted into mgc / lf0 /
        vuv / bap streams (stacked column-wise in that order), and frames
        marked as silence by the HTS label file are dropped.

        Args:
            wav_path: Path of the wav file to analyse.
            label_path: Path of the HTS label file aligned with the wav.

        Returns:
            The stacked features as a ``np.float32`` array with silence
            frames removed.
        """
        sample_rate, waveform = wavfile.read(wav_path)
        waveform = waveform.astype(np.float64)

        # Pick the F0 estimator; only the DIO estimate is refined by StoneMask.
        use_harvest = hp_acoustic.use_harvest
        estimate_f0 = pyworld.harvest if use_harvest else pyworld.dio
        f0, timeaxis = estimate_f0(
            waveform,
            sample_rate,
            frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor,
            f0_ceil=hp_acoustic.f0_ceil)
        if not use_harvest:
            f0 = pyworld.stonemask(waveform, f0, timeaxis, sample_rate)

        spectrogram = pyworld.cheaptrick(waveform, f0, timeaxis, sample_rate)
        aperiodicity = pyworld.d4c(waveform, f0, timeaxis, sample_rate)
        bap = pyworld.code_aperiodicity(aperiodicity, sample_rate)

        # The warping coefficient is computed once and cached on the instance.
        if self.alpha is None:
            self.alpha = pysptk.util.mcepalpha(sample_rate)
        mgc = pysptk.sp2mc(
            spectrogram, order=hp_acoustic.order, alpha=self.alpha)

        # Log-F0 as a column vector; frames with f0 == 0 stay at 0.
        f0 = f0[:, None]
        lf0 = f0.copy()
        voiced = f0 != 0
        lf0[voiced] = np.log(f0[voiced])

        if hp_acoustic.use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)
        # The V/UV flag is taken before the zero frames are interpolated.
        lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

        # Optional modulation-spectrum smoothing of the mgc trajectory.
        if hp_acoustic.mod_spec_smoothing:
            hop_length = int(sample_rate * (hp_acoustic.frame_period * 0.001))
            mgc = P.modspec_smoothing(
                mgc,
                sample_rate / hop_length,
                cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

        # Append delta features to each continuous stream.
        mgc, lf0, bap = [
            P.delta_features(stream, hp_acoustic.windows)
            for stream in (mgc, lf0, bap)
        ]

        stacked = np.hstack((mgc, lf0, vuv, bap))

        # Truncate to the label length, then drop the silence frames.
        labels = hts.load(label_path)
        stacked = stacked[:labels.num_frames()]
        silence = labels.silence_frame_indices()
        voiced_only = np.delete(stacked, silence, axis=0)

        return voiced_only.astype(np.float32)
Exemplo n.º 3
0
    def collect_features(self, wav_path, label_path):
        """Build the float32 acoustic feature matrix for one utterance.

        F0 is estimated with pyworld DIO + StoneMask, the mgc trajectory is
        smoothed with a fixed cutoff of 50, and the mgc / lf0 / vuv / bap
        streams are stacked column-wise in that order. Frames marked as
        silence by the HTS label file are dropped.

        Args:
            wav_path: Path of the wav file to analyse.
            label_path: Path of the HTS label file aligned with the wav.

        Returns:
            The stacked features as a ``np.float32`` array with silence
            frames removed.
        """
        sample_rate, waveform = wavfile.read(wav_path)
        waveform = waveform.astype(np.float64)

        # Coarse F0 from DIO, refined by StoneMask.
        coarse_f0, timeaxis = pyworld.dio(
            waveform, sample_rate, frame_period=hp_acoustic.frame_period)
        f0 = pyworld.stonemask(waveform, coarse_f0, timeaxis, sample_rate)

        spectrogram = pyworld.cheaptrick(waveform, f0, timeaxis, sample_rate)
        aperiodicity = pyworld.d4c(waveform, f0, timeaxis, sample_rate)
        bap = pyworld.code_aperiodicity(aperiodicity, sample_rate)

        # The warping coefficient is computed once and cached on the instance.
        if self.alpha is None:
            self.alpha = pysptk.util.mcepalpha(sample_rate)
        mgc = pysptk.sp2mc(
            spectrogram, order=hp_acoustic.order, alpha=self.alpha)

        # Log-F0 as a column vector; frames with f0 == 0 stay at 0.
        f0 = f0[:, None]
        lf0 = f0.copy()
        voiced = f0 != 0
        lf0[voiced] = np.log(f0[voiced])

        # The V/UV flag is taken before the zero frames are interpolated.
        vuv = (lf0 != 0).astype(np.float32)
        lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

        # 50hz parameter trajectory smoothing
        hop_length = int(sample_rate * (hp_acoustic.frame_period * 0.001))
        mgc = P.modspec_smoothing(mgc, sample_rate / hop_length, cutoff=50)

        # Append delta features to each continuous stream.
        mgc, lf0, bap = [
            P.delta_features(stream, hp_acoustic.windows)
            for stream in (mgc, lf0, bap)
        ]

        stacked = np.hstack((mgc, lf0, vuv, bap))

        # Truncate to the label length, then drop the silence frames.
        labels = hts.load(label_path)
        stacked = stacked[:labels.num_frames()]
        silence = labels.silence_frame_indices()
        voiced_only = np.delete(stacked, silence, axis=0)

        return voiced_only.astype(np.float32)
Exemplo n.º 4
0
def proc_wav(wav_path, out_dir, index, spkid, sr=16000):
    """Compute MFCC, V/UV and continuous log-F0 for a wav and print shapes.

    The signal is zero-padded so that it covers a whole number of analysis
    frames, using the module-level ``hop_size``, ``frame_length`` and
    ``frame_period`` settings. Nothing is returned or written; the shapes of
    the interpolated log-F0, the V/UV flags, the trimmed waveform and the
    MFCC matrix are printed.

    Args:
        wav_path: Path of the wav file to read.
        out_dir: Unused by this function.
        index: Unused by this function.
        spkid: Unused by this function.
        sr: Unused by this function; the rate read from the file is used.
    """
    fs, signal = wav.read(wav_path)

    # Pad so the last frame of length `frame_length` fits completely.
    n_frames = int(len(signal) / hop_size)
    target_len = (n_frames - 1) * hop_size + frame_length
    tail = target_len - len(signal)
    padded = np.pad(signal, (0, tail), mode="constant", constant_values=0)

    # log-energy, mfcc[1:13]
    mfcc = python_speech_features.mfcc(
        padded, fs, winlen=0.025, winstep=0.01, nfilt=40, numcep=13)

    # Waveform trimmed to an exact multiple of the hop size.
    out = padded[:n_frames * hop_size]
    assert len(out) % mfcc.shape[0] == 0

    # Harvest expects float64 samples.
    f0, _ = pw.harvest(padded.astype(np.float64), fs,
                       frame_period=frame_period)
    f0 = f0[:n_frames]

    voiced = f0 > 0
    vuv = voiced.astype(np.float64)
    logf0 = np.zeros(len(f0))
    logf0[voiced] = np.log(f0[voiced])
    continuous_lf0 = interp1d(logf0, kind="slinear")

    print(continuous_lf0.shape, vuv.shape, out.shape, mfcc.shape)
Exemplo n.º 5
0
def _process_utterance(lf0_dir, mgc_dir, bap_dir, cmp_dir, linear_dir,
                       basename, wav_path, text, hparams):
    """Preprocess a single utterance wav/text pair.

    Runs external command-line tools (``analysis``, ``x2x``, ``sopr``,
    ``mcep``, ``merge``) through the shell to produce F0 / lf0, mgc and bap
    files, merges them into a ``.cmp`` file, computes the linear-scale
    spectrogram of the wav, and saves the cmp matrix and the spectrogram as
    ``.npy`` files.

    Args:
        lf0_dir: directory receiving the ``.f0``, ``.f0c``, ``.f0a`` and
            ``.lf0`` files
        mgc_dir: directory receiving the ``.sp`` and ``.mgc`` files
        bap_dir: directory receiving the ``.bapd`` and ``.bap`` files
        cmp_dir: directory receiving the ``.ml`` / ``.cmp`` files and the
            ``cmp-<basename>.npy`` matrix
        linear_dir: directory receiving ``linear-<basename>.npy``
        basename: stem used for every generated file name
        wav_path: path to the audio file containing the speech input
        text: text spoken in the input audio file (returned unchanged)
        hparams: hyper parameters; this function reads ``trim_silence``,
            ``sample_rate``, ``num_mgc`` and ``num_mels``

    Returns:
        A tuple ``(cmp_filename, linear_filename, cmp_frames, text)``.
        ``cmp_frames`` is computed with true division, so under Python 3 it
        is a float.

    Raises:
        AssertionError: if the per-stream frame counts disagree or if the
            cmp dimension differs from ``hparams.num_mels``.
    """

    # Optionally trim silence and continue with the trimmed copy, which is
    # written next to the original as "<wav_path minus 4 chars>_trim.wav".
    if hparams.trim_silence:
        tar_wavfile = wav_path[:-4] + "_trim.wav"
        print("raw wav path:%s" % wav_path)
        wav_raw, fs = sf.read(wav_path)
        wav_trim = audio.trim_silence(wav_raw, hparams)
        sf.write(tar_wavfile, wav_trim, fs)

        wav_path = tar_wavfile

    nFFTHalf, alpha, bap_dim = audio.get_config(hparams.sample_rate)

    # Passed to `mcep -m`; the mgc stream is treated as (mcsize + 1) wide below.
    mcsize = hparams.num_mgc - 1

    filename = basename  # was: os.path.basename(wav_path).split(".")[0]

    print('extract feats for %s' % wav_path)

    # extract f0,sp,ap
    # NOTE(review): paths are interpolated into shell strings unquoted and
    # the exit status of os.system is ignored -- paths containing spaces or
    # shell metacharacters will break these commands.
    os.system("analysis %s %s/%s.f0 %s/%s.sp %s/%s.bapd" %
              (wav_path, lf0_dir, filename, mgc_dir, filename, bap_dir,
               filename))  # get float64??? (the .f0 file is read as float64 below)

    # interpolate f0
    f0 = np.fromfile("%s/%s.f0" % (lf0_dir, filename), dtype=np.float64)
    continuous_f0 = interp1d(f0, kind="slinear")
    continuous_f0.tofile("%s/%s.f0c" % (lf0_dir, filename))

    # convert f0 to lf0
    os.system("x2x +da %s/%s.f0c > %s/%s.f0a" %
              (lf0_dir, filename, lf0_dir, filename))
    os.system(
        "x2x +af %s/%s.f0a | sopr -magic 0.0 -LN -MAGIC -1.0E+10 > %s/%s.lf0" %
        (lf0_dir, filename, lf0_dir, filename))

    # convert sp to mgc
    os.system("x2x +df %s/%s.sp | sopr -R -m 32768.0 | "
              "mcep -a %f -m %d -l %d -e 1.0E-8 -j 0 -f 0.0 -q 3 "
              "> %s/%s.mgc" %
              (mgc_dir, filename, alpha, mcsize, nFFTHalf, mgc_dir, filename))

    # convert ap to bap
    os.system("x2x +df %s/%s.bapd > %s/%s.bap" %
              (bap_dir, filename, bap_dir, filename))

    # merge mgc,lf0 and bap to cmp
    os.system("merge +f -s 0 -l 1 -L %d %s/%s.mgc < %s/%s.lf0 > %s/%s.ml" % (
        (mcsize + 1), mgc_dir, filename, lf0_dir, filename, cmp_dir, filename))
    os.system("merge +f -s 0 -l %d -L %d %s/%s.ml < %s/%s.bap > %s/%s.cmp" %
              (bap_dim, (mcsize + 2), cmp_dir, filename, bap_dir, filename,
               cmp_dir, filename))

    #if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
    #	return None

    # Compute the linear scale spectrogram from the wav
    wav = audio.load_wav(wav_path, hparams.sample_rate)
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    # NOTE(review): linear_frames is not used below.
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    #assert linear_frames == mel_frames

    # Read the generated streams back as flat float32 arrays.
    lf0 = np.fromfile("%s/%s.lf0" % (lf0_dir, filename), dtype=np.float32)
    mgc = np.fromfile("%s/%s.mgc" % (mgc_dir, filename), dtype=np.float32)
    bap = np.fromfile("%s/%s.bap" % (bap_dir, filename), dtype=np.float32)
    cmp = np.fromfile("%s/%s.cmp" % (cmp_dir, filename), dtype=np.float32)

    # Per-frame width of the merged stream: mgc (mcsize + 1) + lf0 (1) + bap.
    cmp_dim = mcsize + 1 + 1 + bap_dim
    # True division: this is a float under Python 3.
    cmp_frames = cmp.shape[0] / cmp_dim
    #print(f0[:100])
    #print(continuous_f0[:100])
    # Debug output: shapes and dtypes of the intermediate streams.
    print(lf0.shape)
    print(continuous_f0.shape)
    print(mgc.shape)
    print(bap.shape)
    print(cmp_frames)
    print(continuous_f0.dtype)
    print(mgc.dtype)
    print(bap.dtype)
    # Every stream must describe the same number of frames.
    assert (mgc.shape[0] /
            (mcsize + 1)) == (continuous_f0.shape[0] /
                              1) == (bap.shape[0] / bap_dim) == cmp_frames
    assert cmp_dim == hparams.num_mels
    #assert len(out) >= cmp_frames * audio.get_hop_size(hparams)

    #time resolution adjustement
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    #out = out[:mel_frames * audio.get_hop_size(hparams)]
    #assert len(out) % audio.get_hop_size(hparams) == 0
    #time_steps = len(out)

    # Write the cmp features and the linear spectrogram to disk
    #audio_filename = 'audio-{}.npy'.format(index)
    cmp_mat = cmp.reshape(-1, cmp_dim)
    cmp_filename = 'cmp-{}.npy'.format(basename)
    linear_filename = 'linear-{}.npy'.format(basename)
    #np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(cmp_dir, cmp_filename), cmp_mat, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)
    # Return a tuple describing this training example
    return (cmp_filename, linear_filename, cmp_frames, text)