Example #1
 def x2mcep(self, x):
     x = x.astype(np.float64)
     # [TODO] to avoid ValueError: ndarray is not C-contiguous
     if not x.flags['C_CONTIGUOUS']:
         x = x.copy(order='C')
     # if self.itype == 3:
     #     etype = 2
     #     eps = 1e-10
     # else:
     #     etype = 0
     #     eps = 0.0
     etype = 0
     eps = 0.0
     if self.isMatrix:
         return np.asarray([
             mcep(xi,
                  order=self.order,
                  itype=self.itype,
                  etype=etype,
                  eps=eps) for xi in x
         ])
     else:
         return mcep(x,
                     order=self.order,
                     itype=self.itype,
                     etype=etype,
                     eps=eps)
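The C-contiguity check above guards against a common NumPy pitfall: transposes and strided slices are views, not C-ordered buffers, and pysptk's wrappers reject them. A standalone sketch of the behaviour (plain NumPy, nothing from the class above):

import numpy as np

a = np.zeros((4, 8), dtype=np.float64)
view = a.T                              # a transposed view, Fortran-ordered
print(view.flags['C_CONTIGUOUS'])       # False: pysptk would raise ValueError
fixed = view.copy(order='C')            # materialize a C-contiguous buffer
print(fixed.flags['C_CONTIGUOUS'])      # True: safe to pass to mcep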
Example #2
def sptk_mcep(x,
              order,
              winsz,
              hopsz,
              fftsz,
              fs,
              window_norm=False,
              noise_floor=1e-8):
    alpha = hz2alpha(fs)
    windowed = sptk_window(x,
                           winsz,
                           hopsz,
                           fftsz,
                           windowing='blackman',
                           normalize=window_norm)
    cep = pysptk.mcep(windowed,
                      order=order,
                      alpha=alpha,
                      miniter=2,
                      maxiter=30,
                      threshold=0.001,
                      etype=1,
                      eps=noise_floor,
                      min_det=1.0e-6,
                      itype=0)
    return cep, alpha
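A hypothetical call for the function above; hz2alpha and sptk_window are that project's own helpers and must be in scope, and the window/hop/FFT sizes here are illustrative 16 kHz values:

import numpy as np

x = np.random.randn(16000).astype(np.float64)   # 1 s of noise at 16 kHz
cep, alpha = sptk_mcep(x, order=24, winsz=400, hopsz=160, fftsz=512, fs=16000)
print(cep.shape)                                # (n_frames, order + 1)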
Example #3
    def pysptk_mfcc(self):
        self.frame_length = 1024
        self.hop_length = 80
        self.pitch = pysptk.swipe(self.audio.astype(np.float64),
                                  fs=self.sr,
                                  hopsize=self.hop_length,
                                  min=60,
                                  max=240,
                                  otype="pitch")
        self.source_excitation = pysptk.excite(self.pitch, self.hop_length)

        # Note that almost all pysptk functions assume the input array is C-contiguous with np.float64 elements
        frames = librosa.util.frame(self.audio,
                                    frame_length=self.frame_length,
                                    hop_length=self.hop_length).astype(
                                        np.float64).T

        # Windowing
        frames *= pysptk.blackman(self.frame_length)

        assert frames.shape[1] == self.frame_length

        # Order of mel-cepstrum
        self.order = 25
        self.alpha = 0.41

        self.mc = pysptk.mcep(frames, self.order, self.alpha)
        logH = pysptk.mgc2sp(self.mc, self.alpha, 0.0, self.frame_length).real
        librosa.display.specshow(logH.T,
                                 sr=self.sr,
                                 hop_length=self.hop_length,
                                 x_axis="time",
                                 y_axis="linear")
Example #4
 def extract_mcep(amp_sp: np.ndarray, num_coded_sps: int, mgc_alpha: float) \
         -> np.ndarray:
     """Extract MCep from the amplitude spectrum with SPTK."""
     mcep = pysptk.mcep(amp_sp,
                        order=num_coded_sps - 1,
                        alpha=mgc_alpha,
                        eps=1.0e-8,
                        min_det=0.0,
                        etype=1,
                        itype=3)
     return mcep.astype(np.float32, copy=False)
Example #5
def sptk_extract(
    x: np.ndarray,
    fs: int,
    n_fft: int = 512,
    n_shift: int = 256,
    mcep_dim: int = 25,
    mcep_alpha: float = 0.41,
    is_padding: bool = False,
) -> np.ndarray:
    """Extract SPTK-based mel-cepstrum.

    Args:
        x (ndarray): 1D waveform array.
        fs (int): Sampling rate
        n_fft (int): FFT length in point (default=512).
        n_shift (int): Shift length in point (default=256).
        mcep_dim (int): Dimension of mel-cepstrum (default=25).
        mcep_alpha (float): All pass filter coefficient (default=0.41).
        is_padding (bool): Whether to pad the end of signal (default=False).

    Returns:
        ndarray: Mel-cepstrum with the size (N, mcep_dim + 1).

    """
    # perform padding
    if is_padding:
        n_pad = n_fft - (len(x) - n_fft) % n_shift
        x = np.pad(x, (0, n_pad), "reflect")

    # get number of frames
    n_frame = (len(x) - n_fft) // n_shift + 1

    # get window function
    win = pysptk.sptk.hamming(n_fft)

    # check mcep and alpha
    if mcep_dim is None or mcep_alpha is None:
        mcep_dim, mcep_alpha = _get_best_mcep_params(fs)

    # calculate spectrogram
    mcep = [
        pysptk.mcep(
            x[n_shift * i:n_shift * i + n_fft] * win,
            mcep_dim,
            mcep_alpha,
            eps=1e-6,
            etype=1,
        ) for i in range(n_frame)
    ]

    return np.stack(mcep)
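A minimal smoke test for sptk_extract, assuming only numpy and pysptk are installed; passing explicit mcep_dim and mcep_alpha avoids the project-specific _get_best_mcep_params fallback:

import numpy as np

x = np.random.randn(8000)                       # synthetic 0.5 s signal at 16 kHz
mc = sptk_extract(x, fs=16000, n_fft=512, n_shift=256, mcep_dim=25, mcep_alpha=0.41)
print(mc.shape)                                 # (30, 26): (8000 - 512) // 256 + 1 frames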
Example #6
    def __test_synthesis(filt):
        # dummy source excitation
        source = __dummy_source()

        hopsize = 80

        # dummy filter coef.
        windowed = __dummy_windowed_frames(source,
                                           frame_len=512,
                                           hopsize=hopsize)
        b = pysptk.mcep(windowed, filt.order, 0.0)

        # synthesis
        synthesizer = Synthesizer(filt, hopsize)
        y = synthesizer.synthesis(source, b)
        assert np.all(np.isfinite(y))
Example #7
def freq_to_mcep(mag_spec, sample_rate, dims=60):
    r"""Convert from magnitude frequency space to mel-cepstral space.

    We use mel-cepstrum (i.e. mel-generalised with :math:`\gamma = 0`) as we do not make assumptions about the SNR.
    """
    mag_spec = mag_spec.astype(np.float64)

    # Convert float to signed-int16 domain.
    data_16bit = mag_spec * 2.**15

    # maxiter=0, etype=1, eps=1e-8, min_det=0.
    mcep = pysptk.mcep(data_16bit,
                       order=dims - 1,
                       alpha=utils.ALPHA[sample_rate],
                       itype=3)
    return mcep
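Since itype=3 makes pysptk.mcep expect a linear amplitude spectrum (one half-spectrum of length fftlen // 2 + 1 per row), the same call can be reproduced without the project's utils.ALPHA table; alpha=0.41 below is an assumed 16 kHz value, and etype=1 with a small eps guards against near-zero bins:

import numpy as np
import pysptk

frames = np.random.randn(100, 512) * np.hanning(512)       # fake windowed frames
mag = np.abs(np.fft.rfft(frames, axis=-1)) * 2.0 ** 15      # amplitude, int16 domain
mcep = pysptk.mcep(mag, order=59, alpha=0.41, eps=1e-8, etype=1, itype=3)
print(mcep.shape)                                           # (100, 60)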
Example #8
def _process_wav(file_list, outfile, winlen, winstep, n_mcep, mcep_alpha,
                 minf0, maxf0, q_channels, type):
    data_dict = {}
    enc = encoder(q_channels)
    for f in tqdm(file_list):
        wav, sr = load(f, sr=None)

        x = wav.astype(float)
        _f0, t = world.harvest(x,
                               sr,
                               f0_floor=minf0,
                               f0_ceil=maxf0,
                               frame_period=winstep *
                               1000)  # can't adjust window size
        f0 = world.stonemask(x, _f0, t, sr)

        window_size = int(sr * winlen)
        hop_size = int(sr * winstep)
        # get mel
        if type == 'mcc':
            nfft = 2**(window_size - 1).bit_length()
            spec = np.abs(
                stft(x,
                     n_fft=nfft,
                     hop_length=hop_size,
                     win_length=window_size,
                     window='blackman'))**2
            h = sptk.mcep(spec,
                          n_mcep - 1,
                          mcep_alpha,
                          eps=-60,
                          etype=2,
                          itype=4).T
        else:
            h = mfcc(x,
                     sr,
                     n_mfcc=n_mcep,
                     n_fft=int(sr * winlen),
                     hop_length=int(sr * winstep))
        h = np.vstack((h, f0))
        # mulaw encode
        wav = enc(x).astype(np.uint8)

        id = os.path.basename(f).replace(".wav", "")
        data_dict[id] = wav
        data_dict[id + "_h"] = h
    np.savez(outfile, **data_dict)
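The encoder(q_channels) used above is external to this snippet; presumably it performs mu-law companding followed by quantization. A minimal sketch of that assumption (not the project's actual implementation):

import numpy as np

def mulaw_encode(x, q_channels=256):
    # compand to [-1, 1] with the mu-law curve, then quantize to q_channels bins
    mu = q_channels - 1
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    return ((y + 1) / 2 * mu + 0.5).astype(np.uint8)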
Example #9
def pysptk_features(x):
    import pysptk

    wav_max = 2**15-1
    x = (x * wav_max).astype(np.float64)

    frame_length = 512
    hop_length = 160
    frames = librosa.util.frame(x, frame_length=frame_length, hop_length=hop_length).T
    frames *= pysptk.blackman(frame_length)
    order = 25 # seems to be pretty standard, results in 26 values
    alpha = 0.42 # this value is best for 16kHz sampling according to docs http://ftp.jaist.ac.jp/pub/pkgsrc/distfiles/SPTKref-3.9.pdf
    mcep = pysptk.mcep(frames, order, alpha)

    f0 = pysptk.swipe(x, fs=16000, hopsize=hop_length, min=60, max=240, otype="f0")
    f0 = f0[1:1+mcep.shape[0]] # cut off ends to match mcep lengths

    return np.concatenate([f0[:,np.newaxis], mcep], 1).astype(np.float32)
Example #10
def stft_mcep(x,
              fftl=512,
              shiftl=256,
              dim=25,
              alpha=0.41,
              window="hamming",
              is_padding=False):
    """EXTRACT STFT-BASED MEL-CEPSTRUM.

    Args:
        x (ndarray): Numpy double array with the size (T,).
        fftl (int): FFT length in point (default=512).
        shiftl (int): Shift length in point (default=256).
        dim (int): Dimension of mel-cepstrum (default=25).
        alpha (float): All pass filter coefficient (default=0.41).
        window (str): Analysis window type (default="hamming").
        is_padding (bool): Whether to pad the end of signal (default=False).

    Returns:
        ndarray: Mel-cepstrum with the size (N, dim + 1).

    """
    # perform padding
    if is_padding:
        n_pad = fftl - (len(x) - fftl) % shiftl
        x = np.pad(x, (0, n_pad), 'reflect')

    # get number of frames
    n_frame = (len(x) - fftl) // shiftl + 1

    # get window function
    win = get_window(window, fftl)

    # calculate spectrogram
    mcep = [
        pysptk.mcep(x[shiftl * i:shiftl * i + fftl] * win,
                    dim,
                    alpha,
                    eps=EPS,
                    etype=1) for i in range(n_frame)
    ]

    return np.stack(mcep)
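The padding branch sizes n_pad so that the final frame ends exactly at the padded signal length; a quick check of the arithmetic with the defaults, assuming the module-level EPS and scipy.signal's get_window that the function relies on are in scope:

import numpy as np

# fftl=512, shiftl=256, len(x)=1000:
#   n_pad   = 512 - (1000 - 512) % 256 = 512 - 232 = 280  -> padded length 1280
#   n_frame = (1280 - 512) // 256 + 1 = 4, last frame spans samples [768:1280]
mc = stft_mcep(np.random.randn(1000), is_padding=True)
print(mc.shape)                                  # (4, 26)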
Example #11
    def __test_synthesis_levdur(filt):
        # dummy source excitation
        source = __dummy_source()

        hopsize = 80

        # dummy filter coef.
        windowed = __dummy_windowed_frames(source,
                                           frame_len=512,
                                           hopsize=hopsize)
        c = pysptk.mcep(windowed, filt.order)
        lpc = pysptk.levdur(pysptk.c2acr(c))

        # make sure lpc has loggain
        lpc[:, 0] = np.log(lpc[:, 0])

        # synthesis
        synthesizer = Synthesizer(filt, hopsize)
        y = synthesizer.synthesis(source, lpc)
        assert np.all(np.isfinite(y))
Example #12
def stft_mcep(x,
              fftl=512,
              shiftl=256,
              dim=25,
              alpha=0.41,
              window="hamming",
              is_padding=False):
    """FUNCTION TO EXTRACT STFT-BASED MEL-CEPSTRUM

    Args:
        x (ndarray): numpy double array with the size [T]
        fftl (int): fft length in point (default=512)
        shiftl (int): shift length in point (default=256)
        dim (int): dimension of mel-cepstrum (default=25)
        alpha (float): all pass filter coefficient (default=0.41)
        window (str): analysis window type (default="hamming")
        is_padding (bool): whether to pad the end of signal (default=False)

    Return:
        (ndarray): mel-cepstrum with the size [N, dim + 1]
    """
    # perform padding
    if is_padding:
        n_pad = fftl - (len(x) - fftl) % shiftl
        x = np.pad(x, (0, n_pad), 'reflect')

    # get number of frames
    n_frame = (len(x) - fftl) // shiftl + 1

    # get window function
    win = get_window(window, fftl)

    # calculate spectrogram
    mcep = [
        pysptk.mcep(x[shiftl * i:shiftl * i + fftl] * win, dim, alpha)
        for i in range(n_frame)
    ]

    return np.stack(mcep)
Example #13
 def get_MCEP(self, utterance):
     utterance = librosa.util.normalize(utterance)
     utterance = utterance + np.random.normal(
         loc=0, scale=0.0000001, size=utterance.shape[0])
     utterance = librosa.util.normalize(utterance)
     utterance = utterance.astype(np.float64)  # necessary for synthesizer
     frames = librosa.util.frame(utterance,
                                 frame_length=self.frame_length,
                                 hop_length=self.hop_length).astype(
                                     np.float64).T
     # Windowing
     frames *= pysptk.blackman(self.frame_length)
     assert frames.shape[1] == self.frame_length
     # Pitch
     pitch = pysptk.swipe(utterance.astype(np.float64),
                          fs=self.sr,
                          hopsize=self.hop_length,
                          min=60,
                          max=240,
                          otype="pitch")
     mcep = pysptk.mcep(frames, self.order, self.alpha)
     return mcep, pitch
Example #14
def pitch_shift_on_lpc_residual(
    wav,
    sr,
    shift_in_cent,
    frame_length=4096,
    hop_length=240,
    mgc_order=59,
):
    assert wav.dtype == np.int16
    frames = (librosa.util.frame(wav,
                                 frame_length=frame_length,
                                 hop_length=hop_length).astype(np.float64).T)
    frames *= pysptk.blackman(frame_length)
    alpha = pysptk.util.mcepalpha(sr)

    mgc = pysptk.mcep(frames, mgc_order, alpha, eps=1e-5, etype=1)
    c = pysptk.freqt(mgc, mgc_order, -alpha)

    lpc = pysptk.levdur(pysptk.c2acr(c, mgc_order, frame_length))
    # remove gain
    lpc[:, 0] = 0

    # Compute LPC residual
    synth = Synthesizer(AllZeroDF(mgc_order), hop_length)
    wav_lpc = synth.synthesis(wav.astype(np.float64), -lpc)
    residual = wav - wav_lpc

    # Pitch-shift on LPC residual
    residual_shifted = librosa.effects.pitch_shift(residual,
                                                   sr=sr,
                                                   n_steps=shift_in_cent,
                                                   bins_per_octave=1200)

    # Filtering by LPC
    synth = Synthesizer(AllPoleDF(mgc_order), hop_length)
    wav_shifted = synth.synthesis(residual_shifted, lpc)

    return wav_shifted.astype(np.int16)
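A hedged driver for the function above, assuming a 16-bit PCM mono file read with scipy.io.wavfile (the assert requires int16 input); shift_in_cent is in cents, so 200 raises the pitch by two semitones:

from scipy.io import wavfile

sr, wav = wavfile.read("input.wav")              # hypothetical 16-bit PCM mono file
shifted = pitch_shift_on_lpc_residual(wav, sr, shift_in_cent=200)
wavfile.write("shifted.wav", sr, shifted)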
Example #15
 def __test_broadcast(dtype):
     frames = windowed_dummy_frames(100, 512, dtype=dtype)
     mc = pysptk.mcep(frames, order, alpha)
     assert np.all(np.isfinite(mc))
     assert frames.shape[0] == mc.shape[0]
Example #16
 def __test(order, alpha):
     mc = pysptk.mcep(x, order, alpha)
     assert np.all(np.isfinite(mc))
Example #17
    def gen_data(self,
                 dir_in,
                 dir_out=None,
                 file_id_list=None,
                 id_list=None,
                 add_deltas=False,
                 return_dict=False):
        """
        Prepare WORLD features from audio files. If add_deltas is False, labels have the dimension
        num_frames x (num_coded_sps + 3) [mgc(num_coded_sps), lf0, vuv, bap(1)]; otherwise
        deltas and double deltas are appended to each feature, resulting in
        num_frames x (3*num_coded_sps + 7) [mgc(3*num_coded_sps), lf0(3*1), vuv, bap(3*1)].

        :param dir_in:         Directory where the .wav files are stored for each utterance to process.
        :param dir_out:        Main directory where the labels and normalisation parameters are saved to subdirectories.
                               If None, labels are not saved.
        :param file_id_list:   Name of the file containing the ids. Normalisation parameters are saved using
                               this name to differentiate parameters between subsets.
        :param id_list:        The list of utterances to process.
                               Should have the form uttId1 \\n uttId2 \\n ...\\n uttIdN.
                               If None, all files in dir_in are used.
        :param add_deltas:     Add deltas and double deltas to all features except vuv.
        :param return_dict:    If true, returns an OrderedDict of all samples as first output.
        :return:               Returns two normalisation parameters as tuple. If return_dict is True it returns
                               all processed labels in an OrderedDict followed by the two normalisation parameters.
        """

        # Fill file_id_list by .wav files in dir_in if not given and set an appropriate file_id_list_name.
        if id_list is None:
            id_list = list()
            filenames = glob.glob(os.path.join(dir_in, "*.wav"))
            for filename in filenames:
                id_list.append(os.path.splitext(os.path.basename(filename))[0])
            file_id_list_name = "all"
        else:
            file_id_list_name = os.path.splitext(
                os.path.basename(file_id_list))[0]

        # Create directories in dir_out if it is given.
        if dir_out is not None:
            if add_deltas:
                makedirs_safe(os.path.join(dir_out, self.dir_deltas))
            else:
                makedirs_safe(os.path.join(dir_out, self.dir_lf0))
                makedirs_safe(os.path.join(dir_out, self.dir_vuv))
                makedirs_safe(os.path.join(dir_out, self.dir_coded_sps))
                makedirs_safe(os.path.join(dir_out, self.dir_bap))

        # Create the return dictionary if required.
        if return_dict:
            label_dict = OrderedDict()

        if add_deltas:
            # Create normalisation computation units.
            norm_params_ext_coded_sp = MeanCovarianceExtractor()
            norm_params_ext_lf0 = MeanCovarianceExtractor()
            norm_params_ext_bap = MeanCovarianceExtractor()
        else:
            # Create normalisation computation units.
            norm_params_ext_coded_sp = MeanStdDevExtractor()
            norm_params_ext_lf0 = MeanStdDevExtractor()
            # norm_params_ext_vuv = MeanStdDevExtractor()
            norm_params_ext_bap = MeanStdDevExtractor()

        logging.info("Extract WORLD{} features for".format(
            "" if not add_deltas else " deltas") +
                     "[{0}]".format(", ".join(str(i) for i in id_list)))
        for file_name in id_list:

            # Load audio file and extract features.
            audio_name = os.path.join(dir_in, file_name + ".wav")
            raw, fs = soundfile.read(audio_name)
            logging.debug("Extract WORLD{} features from {} at {}Hz.".format(
                "" if not add_deltas else " deltas", file_name, fs))
            f0, sp, ap = pyworld.wav2world(raw, fs)

            file_name = os.path.basename(file_name)  # Remove speaker.

            # Compute lf0 and vuv information.
            lf0 = np.log(f0.clip(min=1E-10), dtype=np.float32)
            lf0[lf0 <= math.log(self.f0_silence_threshold)] = self.lf0_zero
            lf0, vuv = interpolate_lin(lf0)
            lf0 = lf0.astype(dtype=np.float32)
            vuv = vuv.astype(dtype=np.float32)
            # Warn when less than 5% of all frames are unvoiced.
            if vuv.sum() / len(vuv) < 0.05:
                self.logger.warning(
                    "Detected only {:.0f}% [{}/{}] unvoiced frames in {}.".
                    format(vuv.sum() / len(vuv) * 100.0, int(vuv.sum()),
                           len(vuv), file_name))

            # Decode spectrum to a lower dimension and aperiodicity to one band aperiodicity.
            # coded_sp = pyworld.code_spectral_envelope(sp, fs, WorldFeatLabelGen.num_coded_sps)  # Cepstral version.
            coded_sp = np.sqrt(sp) * 32768.0
            coded_sp = np.array(pysptk.mcep(coded_sp,
                                            order=self.num_coded_sps - 1,
                                            alpha=self.mgc_alpha,
                                            eps=1.0e-8,
                                            min_det=0.0,
                                            etype=1,
                                            itype=3),
                                dtype=np.float32)
            bap = np.array(pyworld.code_aperiodicity(ap, fs), dtype=np.float32)

            if add_deltas:
                # Compute the deltas and double deltas for all features.
                lf0_deltas, lf0_double_deltas = compute_deltas(lf0)
                coded_sp_deltas, coded_sp_double_deltas = compute_deltas(
                    coded_sp)
                bap_deltas, bap_double_deltas = compute_deltas(bap)

                coded_sp = np.concatenate(
                    (coded_sp, coded_sp_deltas, coded_sp_double_deltas),
                    axis=1)
                lf0 = np.concatenate((lf0, lf0_deltas, lf0_double_deltas),
                                     axis=1)
                bap = np.concatenate((bap, bap_deltas, bap_double_deltas),
                                     axis=1)

                # Combine them to a single feature sample.
                labels = np.concatenate((coded_sp, lf0, vuv, bap), axis=1)

                # Save into return dictionary and/or file.
                if return_dict:
                    label_dict[file_name] = labels
                if dir_out is not None:
                    labels.tofile(
                        os.path.join(dir_out, self.dir_deltas,
                                     file_name + self.ext_deltas))

            else:
                # Save into return dictionary and/or file.
                if return_dict:
                    label_dict[file_name] = np.concatenate(
                        (coded_sp, lf0, vuv, bap), axis=1)
                if dir_out is not None:
                    coded_sp.tofile(
                        os.path.join(dir_out, self.dir_coded_sps,
                                     file_name + self.ext_coded_sp))
                    lf0.tofile(
                        os.path.join(dir_out, self.dir_lf0,
                                     file_name + self.ext_lf0))
                    vuv.astype(np.float32).tofile(
                        os.path.join(dir_out, self.dir_vuv,
                                     file_name + self.ext_vuv))
                    bap.tofile(
                        os.path.join(dir_out, self.dir_bap,
                                     file_name + self.ext_bap))

            # Add sample to normalisation computation unit.
            norm_params_ext_coded_sp.add_sample(coded_sp)
            norm_params_ext_lf0.add_sample(lf0)
            # norm_params_ext_vuv.add_sample(vuv)
            norm_params_ext_bap.add_sample(bap)

        # Save mean and std dev of all features.
        if not add_deltas:
            norm_params_ext_coded_sp.save(
                os.path.join(dir_out, self.dir_coded_sps, file_id_list_name))
            norm_params_ext_lf0.save(
                os.path.join(dir_out, self.dir_lf0, file_id_list_name))
            # norm_params_ext_vuv.save(os.path.join(dir_out, WorldFeatLabelGen.dir_vuv, file_id_list_name))
            norm_params_ext_bap.save(
                os.path.join(dir_out, self.dir_bap, file_id_list_name))
        else:
            self.logger.info("Write norm_prams to{}".format(
                os.path.join(dir_out, self.dir_deltas, "_".join(
                    (file_id_list_name, self.dir_coded_sps)))))
            norm_params_ext_coded_sp.save(
                os.path.join(dir_out, self.dir_deltas, "_".join(
                    (file_id_list_name, self.dir_coded_sps))))
            norm_params_ext_lf0.save(
                os.path.join(dir_out, self.dir_deltas, "_".join(
                    (file_id_list_name, self.dir_lf0))))
            norm_params_ext_bap.save(
                os.path.join(dir_out, self.dir_deltas, "_".join(
                    (file_id_list_name, self.dir_bap))))

        # Get normalisation parameters.
        if not add_deltas:
            norm_coded_sp = norm_params_ext_coded_sp.get_params()
            norm_lf0 = norm_params_ext_lf0.get_params()
            # norm_vuv = norm_params_ext_vuv.get_params()
            norm_bap = norm_params_ext_bap.get_params()

            norm_first = np.concatenate(
                (norm_coded_sp[0], norm_lf0[0], (0.0, ), norm_bap[0]), axis=0)
            norm_second = np.concatenate(
                (norm_coded_sp[1], norm_lf0[1], (1.0, ), norm_bap[1]), axis=0)

        else:
            norm_coded_sp = norm_params_ext_coded_sp.get_params()
            norm_lf0 = norm_params_ext_lf0.get_params()
            # norm_vuv = norm_params_ext_vuv.get_params()
            norm_bap = norm_params_ext_bap.get_params()

            norm_first = (norm_coded_sp[0], norm_lf0[0], (0.0, ), norm_bap[0])
            norm_second = (norm_coded_sp[1], norm_lf0[1], (1.0, ), norm_bap[1])

        if return_dict:
            # Return dict of labels for all utterances.
            return label_dict, norm_first, norm_second
        else:
            return norm_first, norm_second
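The spectrum-coding step buried in this method stands on its own: WORLD's power spectral envelope is turned into an amplitude spectrum, scaled into the signed-int16 domain, and passed to pysptk.mcep with itype=3. A minimal sketch of just that step, with pysptk.util.mcepalpha standing in for the class's mgc_alpha attribute:

import numpy as np
import pysptk
import pyworld
import soundfile

raw, fs = soundfile.read("utt.wav")              # hypothetical utterance
f0, sp, ap = pyworld.wav2world(raw, fs)          # sp: power spectral envelope
amp_sp = np.sqrt(sp) * 32768.0                   # amplitude in the int16 domain
mgc = pysptk.mcep(amp_sp, order=59, alpha=pysptk.util.mcepalpha(fs),
                  eps=1.0e-8, min_det=0.0, etype=1, itype=3)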
Example #18
 def __test_itype(itype=0):
     pysptk.mcep(x, itype=itype)
Example #19
def get_random_peseudo_mcep(order=24, alpha=0.42):
    T, N = 100, 513
    frames = np.random.rand(T, N) * pysptk.blackman(N)
    mc = pysptk.mcep(frames, order=order, alpha=alpha)
    return mc
Example #20
 def __test_min_det(min_det):
     pysptk.mcep(x, min_det=min_det)
Example #21
def sp2mgc(sp, dim, sr):
    return pysptk.mcep(sp, order=dim-1, alpha=get_world_alpha(sr), \
                maxiter=0, etype=1, eps=0.0, min_det=1e-06, itype=4)
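For the reverse direction pysptk provides mgc2sp, which maps mel-cepstra back to a complex log spectrum. A hedged round trip, assuming get_world_alpha(16000) is roughly the usual 0.42 for 16 kHz:

import numpy as np
import pysptk

sp = np.abs(np.fft.rfft(np.random.randn(10, 1024) * np.hanning(1024))) ** 2
mgc = sp2mgc(sp, dim=25, sr=16000)                       # itype=4: periodogram input
logH = pysptk.mgc2sp(mgc, 0.42, 0.0, 1024).real          # log spectral envelope
print(logH.shape)                                        # (10, 513)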
Example #22
def load_mfcc_mceps_VCTK(list_train_data, data_folder, speaker,
                         config_mfcc_mceps):
    '''Extract normalized MFCCs and MCEPs for one speaker's audio files.
  input:
    list_train_data: path to a file listing the audio track names for the target
    data_folder: path to the data folder
    speaker: code of the target speaker
    config_mfcc_mceps: MFCC/MCEP settings dictionary
  return:
    dictionary:
      key: speaker code + _ + audio name
      value: tuple (normalized mfcc, normalized mceps)
    target scaler:
      holds the mcep mean and variance of the target speaker, used to scale mcep results back
  '''
    root = data_folder
    _data_x = {}
    target_scaler = {}
    with open(list_train_data, 'r') as ft:
        count_errors = 0
        lines = ft.readlines()
        total_mceps = np.empty(
            (0, config_mfcc_mceps['order_mcep'] + 1),
            float)  # accumulates all mceps; their mean and std are used to denormalize results
        for l in lines:
            l = l.strip()
            speaker_f, _ = l.split('_')
            if speaker_f != speaker:
                continue
            wav_path = root + speaker + '/' + l + '.wav'
            try:
                x, _ = librosa.load(wav_path,
                                    sr=config_mfcc_mceps["sampling_frequency"])
                mfccs = librosa.feature.mfcc(
                    y=x,
                    sr=config_mfcc_mceps["sampling_frequency"],
                    n_mfcc=config_mfcc_mceps["order_mfcc"],
                    n_fft=config_mfcc_mceps["n_fft"],
                    hop_length=config_mfcc_mceps["hop_length"])
                mfccs = normalize_mfcc(
                    mfccs.T
                ).T  #transpose twice in order to normalize on right axis

                ## pad the extracted x in order to frame it to have same number of mceps and mfccs
                mfcc_l = math.ceil(x.shape[0] / config_mfcc_mceps["hop_length"]
                                   )  #number of 10ms frames expected
                mcep_l = math.ceil((x.shape[0] - config_mfcc_mceps["n_fft"]) /
                                   config_mfcc_mceps["hop_length"]
                                   )  #number of 10ms frames without 0 padding
                final_shape = x.shape[0] + config_mfcc_mceps["hop_length"] * (
                    mfcc_l - mcep_l
                )  #compute new shape in order to get same number of mceps and mfcc frames
                x.resize((final_shape, ))

                frames = librosa.util.frame(
                    x,
                    frame_length=config_mfcc_mceps["n_fft"],
                    hop_length=config_mfcc_mceps["hop_length"]).astype(
                        np.float64).T
                # mceps = pysptk.mcep(frames, config_mfcc_mceps['order_mcep'], etype=1, eps=1e-5)
                mceps = pysptk.mcep(frames, config_mfcc_mceps['order_mcep'])

                total_mceps = np.vstack((total_mceps, mceps))

                id_ = "_" + l
                _data_x[id_] = (
                    mfccs.T, mceps
                )  #Don't forget mfcc.T -> now both have shape (#frames, #mfcc/mceps)
            except:
                #print(f"Error file: {wav_path}")
                count_errors += 1

        #print(f"\nTotal errors: {count_errors}\n")
    # compute mean and std for all mceps
    target_scaler["mean"] = np.mean(total_mceps, 0)
    target_scaler["std"] = np.std(total_mceps, 0)
    #apply normalization
    for k, v in _data_x.items():
        mcep = v[1]
        mcep = (mcep - target_scaler["mean"]) / target_scaler["std"]
        _data_x[k] = (v[0], mcep)

    #convert to list to save to file
    target_scaler["mean"] = list(target_scaler["mean"])
    target_scaler["std"] = list(target_scaler["std"])

    print(f"Total Seconds of audio: {total_mceps.shape[0]/100}")

    return _data_x, target_scaler
Example #23
frames = librosa.util.frame(x,
                            frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# pitch extraction
pitch = pysptk.swipe(x,
                     fs=fs,
                     hopsize=HOP_LENGTH,
                     min=MIN_F0,
                     max=MAX_F0,
                     otype="pitch")

# generate the excitation source signal (glottal source)
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# mel-cepstral analysis (= spectral envelope extraction)
mc = pysptk.mcep(frames, ORDER, ALPHA)  # extract mel-cepstral coefficients

# convert mel-cepstral coefficients to MLSA digital filter coefficients
mlsa_coef = pysptk.mc2b(mc, ALPHA)

# build the MLSA filter
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# drive the filter with the excitation signal to synthesize speech
y = synthesizer.synthesis(source_excitation, mlsa_coef)

# write out the audio
y = y.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)
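Example #23 above is an excerpt from a longer script and relies on names defined earlier. A plausible preamble with illustrative 16 kHz values; the constants and the input path are assumptions, not the original script's:

import numpy as np
import librosa
import pysptk
from scipy.io import wavfile
from pysptk.synthesis import MLSADF, Synthesizer

FRAME_LENGTH = 1024
HOP_LENGTH = 80
MIN_F0, MAX_F0 = 60, 240
ORDER, ALPHA = 25, 0.41
OUT_WAVE_FILE = "synthesized.wav"

fs, x = wavfile.read("input.wav")        # hypothetical input file
x = x.astype(np.float64)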
Example #24
def test_mcep_failure():
    pysptk.mcep(np.ones(256), 40, 0.41)
Example #25
 def __test_min_det(min_det):
     pysptk.mcep(x, min_det=min_det)
Example #26
 def __test_eps(etype=0, eps=0.0):
     pysptk.mcep(x, etype=etype, eps=eps)
Example #27
 def __test_itype(itype=0):
     pysptk.mcep(x, itype=itype)
Example #28
 def __test_eps(etype=0, eps=0.0):
     pysptk.mcep(x, etype=etype, eps=eps)
Example #29
def load_mfcc_mceps(path_to_data, config_mfcc_mceps):
    '''Extract normalized MFCCs and MCEPs from all audio files in a directory.
  input:
    path_to_data: directory containing the .wav files
    config_mfcc_mceps: MFCC/MCEP settings dictionary
  return:
    dictionary:
      key: speaker code + _ + audio name
      value: tuple (normalized mfcc, normalized mceps)
    target scaler:
      holds the mcep mean and variance of the target speaker, used to scale mcep results back
  '''
    _data_x = {}
    path_audios = os.listdir(path_to_data)
    total_mceps = np.empty(
        (0, config_mfcc_mceps['order_mcep'] + 1),
        float)  # accumulates all mceps; their mean and std are used to denormalize results
    target_scaler = {}
    for p in path_audios:
        if p.split(".")[-1] != "wav":
            continue
        x, _ = librosa.load(path_to_data + '/' + p,
                            sr=config_mfcc_mceps["sampling_frequency"])

        mfcc_l = math.ceil(x.shape[0] / config_mfcc_mceps["hop_length"])
        mcep_l = math.ceil((x.shape[0] - config_mfcc_mceps["n_fft"]) /
                           config_mfcc_mceps["hop_length"])
        final_shape = x.shape[0] + config_mfcc_mceps["hop_length"] * (mfcc_l -
                                                                      mcep_l)
        x.resize((final_shape, ))
        frames = librosa.util.frame(
            x,
            frame_length=config_mfcc_mceps["n_fft"],
            hop_length=config_mfcc_mceps["hop_length"]).astype(np.float64).T
        # Windowing
        frames *= pysptk.blackman(config_mfcc_mceps["n_fft"], normalize=1)
        mceps = pysptk.mcep(frames, config_mfcc_mceps['order_mcep'])  #,alpha)
        total_mceps = np.vstack((total_mceps, mceps))
        mfccs = librosa.feature.mfcc(
            y=x,
            sr=config_mfcc_mceps["sampling_frequency"],
            n_mfcc=config_mfcc_mceps["order_mfcc"],
            n_fft=config_mfcc_mceps["n_fft"],
            hop_length=config_mfcc_mceps["hop_length"])
        mfccs = normalize_mfcc(
            mfccs.T).T  #transpose twice in order to normalize on right axis
        id_ = "_" + p
        _data_x[id_] = (
            mfccs.T, mceps
        )  #Don't forget mfcc.T -> now both have shape (#frames, #mfcc/mceps)

    target_scaler["mean"] = list(np.mean(total_mceps, 0))
    target_scaler["std"] = list(np.std(total_mceps, 0))

    #apply normalization
    for k, v in _data_x.items():
        mcep = v[1]
        mcep = (mcep - target_scaler["mean"]) / target_scaler["std"]
        _data_x[k] = (v[0], mcep)

    return _data_x, target_scaler
Example #30
def test_mcep_failure():
    pysptk.mcep(np.ones(256), 40, 0.41)
Example #31
def test_mc2b():
    x = windowed_dummy_data(1024)
    mc = pysptk.mcep(x)
    assert pysptk.mc2e(mc) > 0
Example #32
    # Windowing
    frames *= pysptk.blackman(frame_length)

    assert frames.shape[1] == frame_length

    pitch = pysptk.swipe(x.astype(np.float64),
                         fs=sr,
                         hopsize=hop_length,
                         min=60,
                         max=240,
                         otype="pitch")
    source_excitation = pysptk.excite(pitch, hop_length)

    # Order of mel-cepstrum

    mc = pysptk.mcep(frames, order, alpha)
    logH = pysptk.mgc2sp(mc, alpha, 0.0, frame_length).real
    print(mc.shape)
    #plt.plot(mc)
    #plotname="x_syn_coefs_" + str(order) + ".png"
    #plt.savefig(plotname)

    # Convert mel-cesptrum to MLSADF coefficients
    b = pysptk.mc2b(mc, alpha)

    synthesizer = Synthesizer(MLSADF(order=order, alpha=alpha), hop_length)

    x_synthesized = synthesizer.synthesis(source_excitation, b)

    filenam = "synthesized_sounds/" + "x_syn" + str(order + 1) + ".wav"
    #wavfile.write("x.wav", sr, x)
Example #33
 def __test(order, alpha):
     mc = pysptk.mcep(x, order, alpha)
     assert np.all(np.isfinite(mc))
Example #34
frames = librosa.util.frame(x, frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# pitch extraction
pitch = pysptk.swipe(x,
                     fs=fs,
                     hopsize=HOP_LENGTH,
                     min=MIN_F0,
                     max=MAX_F0,
                     otype="pitch")

# generate the excitation source signal (glottal source)
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# mel-cepstral analysis (= spectral envelope extraction)
mc = pysptk.mcep(frames, ORDER, ALPHA)

# convert mel-cepstral coefficients to MLSA digital filter coefficients
mlsa_coef = pysptk.mc2b(mc, ALPHA)

# build the MLSA filter
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# #### From here on, synthesize various sounds by varying the synthesis filter parameters

# ### Pitch shift (raising the pitch) ###
OUT_WAVE_FILE = "pitchshift_high.wav"
PITCH_SHIFT = 0.5  # use a factor smaller than 1 to raise the pitch
excitation_pitchhigh = pysptk.excite(pitch * PITCH_SHIFT, HOP_LENGTH)
y = synthesizer.synthesis(excitation_pitchhigh, mlsa_coef)  # synthesize speech
y = y.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)  # write the pitch-shifted audio (assumed final step)
Example #35
def mel_cepstrum(frames):
    mc = ps.mcep(frames, ORDER, ALPHA, eps=0, etype=1)
    # logH = ps.mgc2sp(mc, ALPHA, 0.0, FRAME_LENGTH).real
    return mc