def mel_transform(signal, sample_rate=8000, pre_emphasis=0.97,
                  frame_size=0.025, frame_stride=0.01, window_func=np.hamming,
                  N_FFT=512, nfilt=40, mean_normalised=True):
    """Return the natural log of the filterbank features of ``signal``.

    Every framing and filterbank setting is forwarded to ``fbank``.

    :param signal: audio samples handed to ``fbank``.
    :param sample_rate: forwarded as ``samplerate``.
    :param pre_emphasis: forwarded as ``preemph``.
    :param frame_size: forwarded as ``winlen``.
    :param frame_stride: forwarded as ``winstep``.
    :param window_func: forwarded as ``winfunc``.
    :param N_FFT: forwarded as ``nfft``.
    :param nfilt: forwarded as ``nfilt``.
    :param mean_normalised: accepted for interface compatibility; not used.
    :returns: ``np.log`` of the feature array returned by ``fbank``.
    """
    # Honour the caller's window instead of always using np.hamming.
    feat, _ = fbank(signal, samplerate=sample_rate, winlen=frame_size,
                    winstep=frame_stride, nfilt=nfilt, nfft=N_FFT,
                    preemph=pre_emphasis, winfunc=window_func)
    return np.log(feat)
Пример #2
0
 def computeLogMelFilterBank(self, file_name):
     '''
     Build the per-frame filterbank feature matrix of a wav file.

     The natural log of the 40 filterbank outputs of ``fbank`` is joined
     with the energy column returned by ``fbank`` (taken as is, no log).
     The result of ``self.computeDeltas`` on that matrix, and on the
     deltas themselves, is then appended column-wise.

     Returns an array with one row per frame; the row width is asserted
     to be 123.
     '''
     (rate, sig) = wav.read(file_name)
     filterbank, frame_energy = fbank(sig, rate, winlen=0.025, winstep=0.01, nfilt=40)
     # Static block: log filterbank columns followed by the energy column.
     static = np.vstack((np.log(filterbank).transpose(),
                         frame_energy.transpose())).transpose()
     first_order = self.computeDeltas(static)
     assert first_order.shape == static.shape, \
         "Shapes not equal {0} and      {1}".format(first_order.shape, static.shape)
     second_order = self.computeDeltas(first_order)
     # Stack feature-major, then flip back to one row per frame.
     feat_vec = np.vstack((static.transpose(),
                           first_order.transpose(),
                           second_order.transpose())).transpose()
     assert len(feat_vec[0]) == 123, "Something wrong with feature vector dimensions..."
     return feat_vec
def mfcc_without_dct(signal,
                     samplerate=16000,
                     winlen=0.025,
                     winstep=0.01,
                     numcep=13,
                     nfilt=26,
                     nfft=512,
                     lowfreq=0,
                     highfreq=None,
                     preemph=0.97,
                     ceplifter=22,
                     appendEnergy=True,
                     winfunc=lambda x: numpy.ones((x, ))):
    """Compute log filterbank features (the MFCC pipeline minus DCT and lifter).

    :param signal: the audio signal from which to compute features.
    :param samplerate: forwarded to ``fbank``.
    :param winlen: forwarded to ``fbank`` (analysis window length).
    :param winstep: forwarded to ``fbank`` (step between windows).
    :param numcep: not used, because the DCT truncation step is disabled.
    :param nfilt: forwarded to ``fbank`` (number of filters).
    :param nfft: forwarded to ``fbank`` (FFT size).
    :param lowfreq: forwarded to ``fbank`` (lowest band edge).
    :param highfreq: forwarded to ``fbank`` (highest band edge).
    :param preemph: forwarded to ``fbank`` (pre-emphasis coefficient).
    :param ceplifter: not used, because the lifter step is disabled.
    :param appendEnergy: if true, column 0 of the result is overwritten with
        the log of the frame energy returned by ``fbank``.
    :param winfunc: forwarded to ``fbank``; the default builds an all-ones window.
    :returns: the log of the ``fbank`` feature array, one row per frame.
    """
    filterbank, frame_energy = fbank(signal, samplerate, winlen, winstep,
                                     nfilt, nfft, lowfreq, highfreq, preemph,
                                     winfunc)
    log_filterbank = numpy.log(filterbank)
    if not appendEnergy:
        return log_filterbank
    # Column 0 is replaced, not extended, so the width stays unchanged.
    log_filterbank[:, 0] = numpy.log(frame_energy)
    return log_filterbank
Пример #4
0
    def generator():
        """Yield one ``(feat, label)`` pair per wav file.

        Files are shuffled together with their labels when ``is_training``
        is set. Each file is read with ``read_audio``, converted according
        to ``input_feature`` ('fbank', 'logfbank', 'mfcc' or 'raw'), cut by
        ``_random_select`` and, when ``normalize_frames`` is set, passed
        through ``_normalize_frames``.

        Raises:
            ValueError: if ``input_feature`` is not one of the supported
                names (previously this surfaced as an unbound-variable error).
        """
        if is_training:
            _wav_files, _labels = _shuffle(wav_files, labels)
        else:
            _wav_files, _labels = wav_files, labels

        # These do not depend on the individual file.
        winlen = window_size_ms / 1000
        winstep = window_stride_ms / 1000
        num_frames = ceil(desired_ms / window_stride_ms)

        for wav_file, label in zip(_wav_files, _labels):
            signal, sample_rate, _ = read_audio(wav_file)
            num_samples = from_ms_to_samples(desired_ms, sample_rate)
            # Frame-based features are cut to num_frames, raw audio to num_samples.
            target_len = num_frames
            if input_feature == 'fbank':
                feat, _ = fbank(signal,
                                sample_rate,
                                winlen=winlen,
                                winstep=winstep,
                                nfilt=input_feature_dim)
            elif input_feature == 'logfbank':
                feat = logfbank(signal,
                                sample_rate,
                                winlen=winlen,
                                winstep=winstep,
                                nfilt=input_feature_dim)
            elif input_feature == 'mfcc':
                feat = mfcc(signal,
                            sample_rate,
                            winlen=winlen,
                            winstep=winstep,
                            nfilt=input_feature_dim,
                            numcep=input_feature_dim)
            elif input_feature == 'raw':
                feat = np.expand_dims(signal, 1)
                target_len = num_samples
            else:
                raise ValueError(
                    'unsupported input_feature: {!r}'.format(input_feature))
            feat = _random_select(feat, target_len)

            # norm per dimension across all frames
            if normalize_frames:
                feat = _normalize_frames(feat)
            yield (feat, label)
Пример #5
0
def run_main():
    """Compare the project's mel filterbank envelopes with ``psf.fbank``.

    The wav path is taken from ``sys.argv[1]``. The signal is padded to a
    multiple of ``fft_size`` samples, then filterbank envelopes are computed
    twice: with the ``utils_sig``/``utils_sp`` helpers, and with
    ``psf.fbank`` using a window length equal to the window step. Shape and
    dtype of both results are printed and both arrays are written to
    ``./tmp/my_fbank.bin`` and ``./tmp/py_fbank.bin``.

    Raises:
        Exception: if no path was given or the path does not exist.
    """
    argv = sys.argv
    if len(argv) <= 1:
        raise Exception("Need to specify input wav-file to process")

    wavname = argv[1]
    if not os.path.exists(wavname):
        raise Exception("Specified wavfile {0} does not seem to exist!".format(wavname))

    print("Will process file {0}".format(wavname))

    fft_size = 256
    nfilters = 15

    samplerate, signal = wav.read(wavname)
    sampleperiod = 1.0 / samplerate  # only needed by the disabled plotting calls
    signal = signal.reshape((-1, 1))

    # Project implementation.
    signal = utils_sig.pad_to_multiple_of(signal, fft_size, 0.0)
    sigchunks = utils_sig.cut_sig_into_chunks(signal.T, fft_size)
    spec_envs = utils_sp.get_spec_envelopes(sigchunks)
    fbank_envs = utils_sp.get_mel_fb_curves(spec_envs, samplerate, nfilters)

    # Reference implementation on the padded signal: one window per chunk.
    timestep = float(fft_size) / float(samplerate)
    reference = psf.fbank(signal, samplerate=samplerate, winlen=timestep,
                          winstep=timestep, nfilt=nfilters, nfft=fft_size,
                          lowfreq=0, highfreq=None, preemph=0)
    (fbank_envs_py, _) = reference

    for envs in (fbank_envs, fbank_envs_py):
        print(envs.shape)
    for envs in (fbank_envs, fbank_envs_py):
        print(envs.dtype)

    fbank_envs.tofile('./tmp/my_fbank.bin')
    fbank_envs_py.tofile('./tmp/py_fbank.bin')
def lift(signal,
         samplerate=16000,
         winlen=0.08,
         winstep=0.04,
         numcep=39,
         nfilt=39,
         nfft=2048,
         lowfreq=12.5,
         highfreq=None,
         preemph=0.97,
         ceplifter=39,
         winfunc=lambda x: numpy.ones((x, ))):
    """Return liftered cepstral coefficients of ``signal``.

    The log of the ``fbank`` features goes through an orthonormal type-2 DCT
    along axis 1, the first ``numcep`` columns are kept and ``lifter`` is
    applied with ``ceplifter``.

    All parameters except ``numcep`` and ``ceplifter`` are forwarded
    positionally to ``fbank``.
    """
    filterbank, _ = fbank(signal, samplerate, winlen, winstep, nfilt, nfft,
                          lowfreq, highfreq, preemph, winfunc)
    log_filterbank = numpy.log(filterbank)
    # The transform length is never shorter than the filterbank width.
    dct_len = max(numcep, log_filterbank.shape[1])
    cepstra = dct(log_filterbank, n=dct_len, type=2, axis=1, norm='ortho')
    return lifter(cepstra[:, :numcep], ceplifter)
Пример #7
0
 def __extractFeatures(stackedWav, numSteps, numFilt, samplerate, winlen,
                       winstep):
     '''
     Compute log filterbank features for every row of ``stackedWav``.

     stackedWav: 2-D array, [number of waves, len(wave)]
     returns: array built from one [numSteps, numFilt] matrix per wave
     Each wave must produce exactly ``numSteps`` frames.
     '''
     assert stackedWav.ndim == 2, 'Should be [number of waves, len(wav)]'
     log_floor = 1e-10  # added before the log so a zero energy stays finite
     per_wave = []
     for wave in stackedWav:
         banks, _ = fbank(wave,
                          samplerate=samplerate,
                          winlen=winlen,
                          winstep=winstep,
                          nfilt=numFilt,
                          winfunc=np.hamming)
         log_banks = np.log(banks + log_floor)
         assert log_banks.ndim == 2, 'Should be [numSteps, numFilt]'
         assert log_banks.shape[0] == numSteps, 'Should be [numSteps, numFilt]'
         per_wave.append(log_banks)
     return np.array(per_wave)
def extract_mfb(filename, feat_dir, mode, count):
    """Extract filterbank features from one audio file and pickle them.

    The audio is loaded at ``c.SR``, converted with ``fbank`` using
    ``c.FILTER_BANK`` filters, optionally mapped to ``20 * log10`` (floored
    at 1e-5) when ``c.USE_LOGSCALE`` is set, and normalised with
    ``normalize_frame``. The output location comes from
    ``convert_wav_to_feature``; an already existing output file is left
    untouched.

    ``mode`` and ``count`` are only used for the output path and the
    progress message.
    """
    audio, sr = librosa.load(filename, sr=c.SR, mono=True)
    features, energies = fbank(signal=audio, samplerate=c.SR,
                               nfilt=c.FILTER_BANK, winlen=0.025)

    if c.USE_LOGSCALE:
        floored = np.maximum(features, 1e-5)
        features = 20 * np.log10(floored)

    features = normalize_frame(features, scale=c.USE_SCALE)
    print(features.shape)  # (number of frames, nfilt)

    output_folder_name, output_file_name = convert_wav_to_feature(
        filename, feat_dir, mode=mode)

    if not os.path.exists(output_folder_name):
        os.makedirs(output_folder_name)

    if os.path.isfile(output_file_name):
        short_out = '/'.join(output_file_name.split('/')[-3:])
        print("'" + short_out + "'" + 'file already extracted!')
        return

    with open(output_file_name, 'wb') as fp:
        pickle.dump(features, fp)
        short_in = '/'.join(filename.split('/')[-3:])
        print('[%s]feature extraction (%s DB). step : %d, file : \'%s\''
              % ('MFB', mode, count, short_in))
Пример #9
0
def get_features(filename, numcep, numfilt, winlen, winstep, grad):
    """Read a sound file and return cepstral features with optional gradients.

    The first ``numcep`` liftered DCT coefficients of the log filterbank are
    centred per column and divided by half the column standard deviation;
    the log frame energy is appended as the last column. With ``grad >= 1``
    the gradient of that matrix along axis 0 is appended, and with
    ``grad == 2`` the gradient of the gradient as well.

    Returns:
        (mat, frames, samplerate): the feature matrix, the number of frames
        reported by the sound file and its sample rate.
    """
    f = Sndfile(filename, 'r')
    frames = f.nframes
    samplerate = f.samplerate
    data = np.asarray(f.read_frames(frames))

    # Cepstral coefficients.
    feat_raw, energy = sf.fbank(data,
                                samplerate,
                                winlen,
                                winstep,
                                nfilt=numfilt)
    cepstra = sf.dct(np.log(feat_raw), type=2, axis=1, norm='ortho')[:, :numcep]
    cepstra = np.asarray(sf.lifter(cepstra, L=22))

    # Log energy as a column vector.
    log_energy = np.log(energy)
    log_energy = log_energy.reshape([log_energy.shape[0], 1])

    centred = cepstra - np.mean(cepstra, axis=0)
    scaled = centred / (0.5 * np.std(cepstra, axis=0))
    mat = np.concatenate((scaled, log_energy), axis=1)

    if grad >= 1:
        # np.gradient returns one array per axis; [0] is the axis-0 gradient.
        gradf = np.gradient(mat)[0]
        mat = np.concatenate((mat, gradf), axis=1)

        if grad == 2:
            grad2f = np.gradient(gradf)[0]
            mat = np.concatenate((mat, grad2f), axis=1)

    return mat, frames, samplerate
Пример #10
0
def get_kaldi_features(wav_, y_, X_):
    '''
    Build frame-level features for a collection of utterances.

    :param wav_: iterable of audio signals; empty signals are skipped
    :param y_: accent label of each signal
    :param X_: per-signal rows; element 0 is used as the filename key and
               the remaining elements are repeated on every frame
    :return: (features, target, df) where features stacks, per frame,
             [mfcc, filter banks, delta 1, delta 2, X_ row tail], target is
             the accent repeated once per frame, and df is a DataFrame with
             the columns filename, frame_len and accent
    '''
    n_mfcc = 13
    n_filt = 32
    features = []
    target = []
    f_len = defaultdict(list)
    for wav, accent, x_arr in zip(wav_, y_, np.array(X_)):
        if len(wav) == 0:
            continue

        mfcc_ = mfcc(wav, samplerate=16000, winlen=0.025, winstep=0.01, numcep=n_mfcc)
        filter_banks, energies = fbank(wav, samplerate=16000, nfilt=n_filt)
        filter_banks = 20 * np.log10(np.maximum(filter_banks, 1e-5))
        # Deltas are taken before the filter banks are normalised.
        delta_1 = delta(filter_banks, N=1)
        delta_2 = delta(delta_1, N=1)

        filter_banks = normalize_frames(filter_banks, Scale=True)
        delta_1 = normalize_frames(delta_1, Scale=True)
        delta_2 = normalize_frames(delta_2, Scale=True)

        n_frames = len(mfcc_)
        accent_ = [accent] * n_frames
        dummies = [x_arr[1:]] * n_frames
        features.append(
            np.hstack([mfcc_, filter_banks, delta_1, delta_2, dummies]))
        target.append(accent_)
        f_len[x_arr[0]] = [n_frames, accent]  # num of frames

    features = np.vstack(features)
    target = np.hstack(target)
    df = pd.DataFrame.from_dict(f_len, orient='index').reset_index()
    df.columns = ['filename', 'frame_len', 'accent']
    return features, target, df

#def get_kaldi_features(wav, accent, x_arr):
    '''
Пример #11
0
def extract_mfcc(signal,
                 samplerate=16000,
                 winlen=0.025,
                 winstep=0.01,
                 numcep=13,
                 nfilt=26,
                 nfft=512,
                 lowfreq=0,
                 highfreq=None,
                 preemph=0.97,
                 ceplifter=22,
                 appendEnergy=True,
                 winfunc=lambda x: numpy.ones((x, ))):
    """Return liftered cepstral coefficients and the log frame energy.

    The log of the ``fbank`` features goes through an orthonormal type-2 DCT
    along axis 1, is cut to ``numcep`` columns and liftered with
    ``ceplifter``. When ``appendEnergy`` is true the log energy is added as
    an extra last column (the existing columns are kept).

    :returns: ``(feat, log_energy)``.
    """
    filterbank, energy = fbank(signal, samplerate, winlen, winstep, nfilt,
                               nfft, lowfreq, highfreq, preemph, winfunc)
    cepstra = dct(numpy.log(filterbank), type=2, axis=1, norm='ortho')
    cepstra = lifter(cepstra[:, :numcep], ceplifter)
    if appendEnergy:
        # Appended as a new column rather than overwriting coefficient 0.
        cepstra = numpy.c_[cepstra, numpy.log(energy)]
    return cepstra, numpy.log(energy)
Пример #12
0
def mk_MFB(filename, sample_rate=c.SAMPLE_RATE, use_delta=c.USE_DELTA):
    """Compute filterbank features for ``filename`` and save them as ``.npy``.

    The audio is loaded as mono at ``sample_rate`` and converted with
    ``fbank`` using ``c.FILTER_BANK`` filters. The filter banks and their
    first and second deltas are each passed through ``normalize_frames``;
    with ``use_delta`` all three are stacked column-wise, otherwise only the
    filter banks are kept. The result is written next to the input, with
    '.wav' replaced by '.npy' in the path. Returns ``None``.
    """
    audio, sr = librosa.load(filename, sr=sample_rate, mono=True)
    audio = audio.flatten()

    filter_banks, energies = fbank(audio,
                                   samplerate=sample_rate,
                                   nfilt=c.FILTER_BANK,
                                   winlen=0.025)
    # Deltas are derived from the raw filter banks, before normalisation.
    delta_1 = delta(filter_banks, N=1)
    delta_2 = delta(delta_1, N=1)

    normalised = [normalize_frames(block)
                  for block in (filter_banks, delta_1, delta_2)]

    frames_features = np.hstack(normalised) if use_delta else normalised[0]

    np.save(filename.replace('.wav', '.npy'), frames_features)
def feature_extract(filename, wavpath, tgpath):
    """Return the feature matrices and one-hot labels of one recording.

    ``filename`` is stripped of its extension and looked up as
    ``<wavpath>/<name>.wav`` and ``<tgpath>/<name>.TextGrid``. The 40
    filterbank features (Hamming window, 0.02 s frames) are passed through
    ``normalize`` and ``get_martix(…, 30, 10)``; the labels are read with
    ``read_textgrid`` for as many entries as there are feature items and
    converted with ``to_one_hot``.
    """
    stem = os.path.splitext(filename)[0]
    wav_filename = wavpath + '/' + stem + '.wav'
    tg_filename = tgpath + '/' + stem + '.TextGrid'

    y, sr = read_wav(wav_filename)
    # Element 0 of the fbank result is the feature matrix; the energy is dropped.
    _mfccs = fbank(signal=y,
                   samplerate=sr,
                   winfunc=np.hamming,
                   winlen=0.02,
                   nfilt=40)[0]
    print(_mfccs.shape)
    _mfccs = get_martix(normalize(_mfccs), 30, 10)

    _labels = to_one_hot(read_textgrid(tg_filename, len(_mfccs)))
    return _mfccs, _labels
Пример #14
0
def filter(samplerate,
           signal,
           winlen=0.02,
           winstep=0.01,
           nfilt=40,
           nfft=512,
           lowfreq=100,
           highfreq=5000,
           preemph=0.97):
    """Extract mel filterbank energies from a signal.

    Note that this function shares its name with the ``filter`` builtin.

    Args:
        samplerate (int): samples taken per second
        signal: sample values; converted with ``np.array`` before use
        winlen (float): sliding window size in seconds
        winstep (float): step between successive windows in seconds
        nfilt (int): number of mel filters to apply
        nfft (int): size of the discrete fourier transform to use
        lowfreq (int): lowest frequency to collect
        highfreq (int): highest frequency to collect
        preemph (float): pre-emphasis factor

    Returns:
        2d numpy array: the filterbank energies with axes 0 and 1 swapped
        relative to what ``fbank`` returns.
    """
    options = dict(winlen=winlen,
                   winstep=winstep,
                   nfilt=nfilt,
                   nfft=nfft,
                   lowfreq=lowfreq,
                   highfreq=highfreq,
                   preemph=preemph)
    banks, _energy = speechfeatures.fbank(np.array(signal), samplerate,
                                          **options)
    return np.swapaxes(banks, 0, 1)
Пример #15
0
def logfbank_features(signal,
                      samplerate=44100,
                      fps=24,
                      num_filt=40,
                      num_cepstra=40,
                      nfft=8192,
                      **kwargs):
    """Return normalised cepstral features plus log energy for ``signal``.

    The window step is ``2 / fps`` seconds and the window length twice that.
    The log filterbank goes through an orthonormal type-2 DCT, is cut to
    ``num_cepstra`` columns and liftered with ``L=22``. When there is more
    than one frame the columns are centred and divided by half their
    standard deviation. The log energy is appended as the last column.

    Extra keyword arguments are accepted and ignored.
    """
    winstep = 2 / fps
    winlen = 2 * winstep
    banks, energy = psf.fbank(signal=signal,
                              samplerate=samplerate,
                              winlen=winlen,
                              winstep=winstep,
                              nfilt=num_filt,
                              nfft=nfft)
    cepstra = psf.dct(np.log(banks), type=2, axis=1, norm='ortho')
    cepstra = np.asarray(psf.lifter(cepstra[:, :num_cepstra], L=22))

    log_energy = np.log(energy)
    log_energy = log_energy.reshape([log_energy.shape[0], 1])

    mat = cepstra
    if cepstra.shape[0] > 1:
        # A single frame has zero spread, so it is left unnormalised.
        half_std = 0.5 * np.std(cepstra, axis=0)
        mat = (cepstra - np.mean(cepstra, axis=0)) / half_std

    mat = np.concatenate((mat, log_energy), axis=1)

    # NOTE(review): this check is one-sided (too few frames never fails) and
    # expected_frames uses fps while the step above is 2 / fps -- confirm.
    duration = signal.shape[0] / samplerate
    expected_frames = fps * duration
    assert mat.shape[0] - expected_frames <= 1, \
        "Producted feature number does not match framerate"
    return mat
Пример #16
0
def mk_MFB(filename,
           sample_rate=c.SAMPLE_RATE,
           use_delta=c.USE_DELTA,
           use_scale=c.USE_SCALE,
           use_logscale=c.USE_LOGSCALE):
    """Return normalised filterbank features of an audio file.

    The audio is loaded as mono at ``sample_rate`` and converted with
    ``fbank`` using ``c.FILTER_BANK`` filters. With ``use_logscale`` the
    values are mapped to ``20 * log10`` after flooring at 1e-5. The result
    is passed through ``normalize_frames`` with ``Scale=use_scale``.

    ``use_delta`` is accepted but has no effect: the delta branch is
    disabled in this version.
    """
    audio, sr = librosa.load(filename, sr=sample_rate, mono=True)

    banks, energies = fbank(audio,
                            samplerate=sample_rate,
                            nfilt=c.FILTER_BANK,
                            winlen=0.025)

    if use_logscale:
        floored = np.maximum(banks, 1e-5)
        banks = 20 * np.log10(floored)

    return normalize_frames(banks, Scale=use_scale)
Пример #17
0
def get_fbanks(audio_file):
    """Return per-frame normalised 64-filter features shaped (frames, 64, 1).

    The file must be sampled at 16 kHz (asserted). Recordings shorter than
    one second yield ``None``. A quarter of a second is cut from both ends
    before the filterbank is computed; every frame is then centred and
    divided by its standard deviation (floored at 1e-12).
    """
    y, sr = librosa.load(audio_file, sr=None)
    assert sr == 16000

    if y.shape[0] < sr:
        return None

    margin = int(0.25 * sr)
    y = y[margin:-margin]

    banks, energies = psf.fbank(y,
                                samplerate=sr,
                                nfilt=64,
                                winlen=0.025,
                                winstep=0.01)

    # Zero mean / unit variance per frame; the floor avoids division by zero.
    epsilon = 1e-12
    banks = np.array([(frame - np.mean(frame)) / max(np.std(frame), epsilon)
                      for frame in banks])

    return banks.reshape((banks.shape[0], 64, 1))
Пример #18
0
    def test_one(self, file_path):
        """Run the model on one 16 kHz wav file.

        Log filterbank features (Hamming window,
        ``self.config_file['feat_dim']`` filters) of the whole file are fed
        to ``self.model`` as a batch of one. The elapsed time of the forward
        pass is printed.

        Returns:
            (softmax, id_to_phone): the row-wise softmax over the model
            outputs with the last (blank) column removed, and a mapping from
            the first element of each ``phone_to_id`` value to its key.
        """
        (rate, sig) = wav.read(file_path)
        assert rate == 16000
        # sig ranges from -32768 to +32768 AND NOT -1 to +1
        feat, energy = fbank(sig,
                             samplerate=rate,
                             nfilt=self.config_file['feat_dim'],
                             winfunc=np.hamming)
        tsteps, hidden_dim = feat.shape
        # Log mel filterbank energies for the complete file, batch size 1.
        batch = np.log(feat).reshape((1, tsteps, hidden_dim))
        lengths = np.array([tsteps])
        inputs = torch.from_numpy(np.array(batch)).float()
        lens = torch.from_numpy(np.array(lengths)).long()
        id_to_phone = {v[0]: k for k, v in self.model.phone_to_id.items()}

        self.model.eval()

        with torch.no_grad():
            if self.cuda:
                inputs = inputs.cuda()
                lens = lens.cuda()

            # Time the forward pass.
            started = time.time()

            outputs = self.model(inputs, lens).cpu().numpy()
            print(time.time() - started)
            # Only one example per batch; drop the blank token column.
            outputs = outputs[0, :, :-1]
            exps = np.exp(outputs)
            softmax = exps / np.sum(np.exp(outputs), axis=1)[:, None]

        return softmax, id_to_phone
Пример #19
0
    def graves_2013(self, wav_path):
        """Compute the 123-dimensional FBANK features of Graves et al. (2013).

        Alex Graves, Abdel-rahman Mohamed, Geoffrey E. Hinton:
        Speech recognition with deep recurrent neural networks.
        ICASSP 2013: 6645-6649

        FBANK features : (40 fbank, 1 energy * 3)
        The audio data was encoded using a Fourier-transform-based filter-bank
        with 40 coefficients (plus energy) distributed on a mel-scale,
        together with their first and second temporal derivatives. Each input
        vector was therefore size 123.

        For CMVN
        The data were normalised so that every element of the input vectors
        had zero mean and unit variance over the training set. (Not done
        here.)

        The paper does not describe the window; a Hanning window is used.
        Options not mentioned in the paper (nfft, lowfreq, highfreq, ...) are
        left at the ``fbank`` defaults.

        The second derivative is the delta of the first-order delta.
        Previously it was ``delta(features, 2)``, which is another
        first-order delta over a wider window.

        :param wav_path: wav file path, resolved against ``self.basepath``
        :return: a feature sequence: [energy, fbank, deltas, delta-deltas]
            per frame
        """
        (rate, sig) = wav.read(Util.get_file_path(self.basepath, wav_path))
        # computing features
        fbank_feat, _ = \
          fbank(signal=sig, samplerate=rate, nfilt=40, winfunc=np.hanning)

        # adding energy: sum of the squared filterbank values of each frame
        energy = np.expand_dims(np.sum(np.power(fbank_feat, 2), axis=-1), 1)
        fbank_e_feat = np.concatenate((energy, fbank_feat), axis=-1)
        # concatenating delta vectors; the second argument of delta() is the
        # window half-width, not the derivative order
        delta_feat = delta(fbank_e_feat, 1)
        delta_delta_feat = delta(delta_feat, 1)
        return np.concatenate((fbank_e_feat, delta_feat, delta_delta_feat),
                              axis=1)
 def computeLogMelFilterBank(self, file_name):
     '''
     Return the filterbank features of a wav file with deltas and double
     deltas, one row per frame.

     Columns: log of the 40 ``fbank`` outputs, the ``fbank`` energy (no log
     applied), then ``self.computeDeltas`` of those 41 columns and
     ``self.computeDeltas`` of the deltas. The width is asserted to be 123.
     '''
     (rate, sig) = wav.read(file_name)
     banks, energy = fbank(sig, rate, winlen=0.025, winstep=0.01, nfilt=40)
     log_banks = np.log(banks)
     # Work feature-major for stacking, then transpose back.
     base = np.vstack((log_banks.transpose(), energy.transpose())).transpose()
     deltas = self.computeDeltas(base)
     assert deltas.shape == base.shape, \
         "Shapes not equal {0} and      {1}".format(deltas.shape, base.shape)
     double_deltas = self.computeDeltas(deltas)
     rows = (base.transpose(), deltas.transpose(), double_deltas.transpose())
     feat_vec = np.vstack(rows).transpose()
     assert len(feat_vec[0]) == 123, \
         "Something wrong with feature vector dimensions..."
     return feat_vec
def read_wav(wav_path, feature_type='logmelfbank', batch_size=1):
    """Read a wav file and convert it to MFCC or log mel filterbank features.

    Args:
        wav_path: path to a wav file
        feature_type: 'logmelfbank' or 'mfcc'
        batch_size: number of copies of the utterance placed in the batch
    Returns:
        inputs: `[batch_size, max_time, feature_dim]`, globally normalised
            to zero mean and unit standard deviation
        inputs_seq_len: list of length `batch_size`, each entry the number
            of frames
    Raises:
        ValueError: if `feature_type` is not supported.
    """
    # Fail early with a clear message instead of an unbound-variable error.
    if feature_type not in ('mfcc', 'logmelfbank'):
        raise ValueError('feature_type must be "mfcc" or "logmelfbank".')

    # Load wav file
    fs, audio = scipy.io.wavfile.read(wav_path)

    if feature_type == 'mfcc':
        features = mfcc(audio, samplerate=fs)  # `[291, 13]`
    else:
        # Use the file's own sample rate, as the mfcc branch does, rather
        # than fbank's default.
        fbank_features, energy = fbank(audio, samplerate=fs, nfilt=40)
        log_fbank = np.log(fbank_features)
        logenergy = np.log(energy)
        # NOTE(review): hz2mel is applied to log energies here, not to
        # frequencies -- kept as in the original, confirm this is intended.
        logmelfbank = hz2mel(log_fbank)
        features = np.c_[logmelfbank, logenergy]  # `[291, 41]`

    delta1 = delta(features, N=2)
    delta2 = delta(delta1, N=2)
    input_data = np.c_[features, delta1, delta2]  # `[291, 123]`

    # Transform to 3D array: the same utterance repeated batch_size times
    # `[1, 291, 39]` or `[1, 291, 123]`
    inputs = np.zeros((batch_size, input_data.shape[0], input_data.shape[1]))
    inputs[:] = input_data
    inputs_seq_len = [inputs.shape[1]] * batch_size  # `[291]`

    # Normalization
    inputs = (inputs - np.mean(inputs)) / np.std(inputs)

    return inputs, inputs_seq_len
Пример #22
0
def audio_feature(signal,
                  samplerate=16000,
                  winlen=0.025,
                  winstep=0.01,
                  numcep=13,
                  nfilt=40,
                  nfft=512,
                  lowfreq=0,
                  highfreq=None,
                  preemph=0.97,
                  ceplifter=22,
                  appendEnergy=True,
                  winfunc=np.hamming):
    """Return MFCCs, their deltas and delta-deltas, and the log energy.

    The 0-th DCT coefficient is discarded, so ``numcep - 1`` liftered
    coefficients are kept per frame. The columns of the result are
    [mfcc, delta, delta-delta, log energy]. ``appendEnergy`` is not
    consulted: the log energy column is always present.
    """
    filterbank, energy = fbank(signal, samplerate, winlen, winstep, nfilt,
                               nfft, lowfreq, highfreq, preemph, winfunc)
    cepstra = dct(np.log(filterbank), type=2, axis=1, norm='ortho')
    # Columns 1..numcep-1: coefficient 0 is dropped.
    coeffs = lifter(cepstra[:, 1:numcep], ceplifter)
    first_delta = delta(coeffs, 1)
    second_delta = delta(first_delta, 1)
    log_energy = np.reshape(np.log(energy), (energy.shape[0], 1))
    return np.concatenate((coeffs, first_delta, second_delta, log_energy),
                          axis=1)
Пример #23
0
            num += 1
            prev_sum = list(map(add, prev_sum, vals))
        else:
            final_lattice.append(list(zip(previous_phones, [x / num for x in prev_sum])))
            previous_phones = ids
            prev_sum = vals
            num = 1

    final_lattice.append(list(zip(previous_phones, [x / num for x in prev_sum])))

    return final_lattice


if __name__=="__main__":
    rate, sig = wavfile.read('./SA1.WAV.wav')
    feat, energy = fbank(sig, samplerate=rate, nfilt=38, winfunc=np.hamming)
    #feat = np.log(feat)
    tsteps, hidden_dim = feat.shape
    feat_log_full = np.reshape(np.log(feat), (1, tsteps, hidden_dim))
    lens = np.array([tsteps])
    inputs, lens = torch.from_numpy(np.array(feat_log_full)).float(), torch.from_numpy(np.array(lens)).long()
    dl_model = dl_model("test_one")
    id_to_phone = {v[0]: k for k, v in dl_model.model.phone_to_id.items()}

    dl_model.model.eval()
    with torch.no_grad():
        #if cuda:
        #    inputs = inputs.cuda()
        #    lens = lens.cuda()

        # Pass through model
Пример #24
0
from python_speech_features import fbank
from scipy.signal import stft
import scipy.io.wavfile as wav

# Get data: every .wav file in the "sounds" directory under the current
# working directory.
place = os.getcwd()
sound_path = place + "/sounds/"

data = glob.glob(os.path.join(sound_path, "*.wav"))

# One feature matrix per sound file, in glob order.
patterns = []

for path in data:
    rate, sig = wav.read(path)
    # The feature type is chosen by a bare word anywhere in the command-line
    # arguments; the first matching branch wins.
    if "mfcc" in sys.argv:
        feat = mfcc(sig, rate)
    elif "fbank" in sys.argv:
        # fbank returns (features, energy); only the features are kept.
        feat = fbank(sig, rate)[0]
    elif "logfbank" in sys.argv:
        feat = logfbank(sig, rate)
    elif "powspec" in sys.argv:
        # Element 2 of the stft result, transposed, then multiplied by its
        # conjugate: the squared magnitude.
        feat = stft(sig)[2].transpose()
        feat = np.real(feat * np.conj(feat))
    else:
        # No recognised feature type was given on the command line.
        raise IndexError("Ge mig ett jävla kommandoradsargument för fan!")
    patterns.append(feat)

# NOTE(review): the per-file matrices may differ in length; confirm that
# np.array builds the intended container in that case.
patterns = np.array(patterns)

np.save("numpy_features", patterns)
Пример #25
0
# Walk audio_path/<session>/sentences/wav/<dialog>/<audio>.wav, compute the
# filterbank features of every wav file and save them under
# acoustic_features_path/<session>/<dialog>/<audio>.npy.
cnt = 0  # number of wav files processed so far
for session in os.listdir(audio_path):
    for dialog in os.listdir(os.path.join(audio_path, session,
                                          'sentences/wav')):
        # Only directory entries containing 'Ses' are treated as dialogs.
        if 'Ses' in dialog:
            for audio in os.listdir(
                    os.path.join(audio_path, session, 'sentences/wav',
                                 dialog)):
                if audio[-4:] == '.wav':
                    input_path = os.path.join(audio_path, session,
                                              'sentences/wav', dialog, audio)
                    (rate, sig) = wav.read(input_path)
                    # The energy output is computed but not saved.
                    feat, energy = fbank(sig,
                                         samplerate=rate,
                                         winlen=0.025,
                                         winstep=0.01,
                                         nfilt=40,
                                         nfft=2048,
                                         winfunc=np.hamming)
                    # Despite its name, output_file is the output directory.
                    output_file = os.path.join(acoustic_features_path, session,
                                               dialog)
                    os.makedirs(output_file, exist_ok=True)
                    # audio[:-4] strips '.wav'; np.save appends '.npy'.
                    np.save(os.path.join(output_file, audio[:-4]), feat)

                    cnt += 1
                    # Progress report every 200 files.
                    if cnt % 200 == 0:
                        print(cnt)

print(cnt)  # 10039
def wav2feature(wav_paths, feature_type='logfbank', feature_dim=40,
                energy=True, delta1=True, delta2=True):
    """Read wav files and convert them to a zero-padded feature batch.

    Every file in ``wav_paths`` is read and converted on its own.
    (Previously the features of the last file were reused for every batch
    entry.) Utterances shorter than the longest one are zero-padded at the
    end.

    Args:
        wav_paths (list): paths to wav files
        feature_type (string, optional): logfbank or fbank or mfcc
            ('logmelfbank' is also accepted)
        feature_dim (int, optional): the dimension of each feature
        energy (bool, optional): if True, add energy
        delta1 (bool, optional): if True, add delta features
        delta2 (bool, optional): if True, add delta delta features
            (implies delta1)
    Returns:
        inputs: An array of size `[B, T, input_size]`, or None when
            `wav_paths` is empty
        inputs_seq_len: An int32 array of size `[B]` with the frame count of
            every utterance
    Raises:
        ValueError: if `feature_type` is unknown or `wav_paths` is not a list.
    """
    if feature_type not in ['logmelfbank', 'logfbank', 'fbank', 'mfcc']:
        raise ValueError(
            'feature_type is "logmelfbank" or "logfbank" or "fbank" or "mfcc".')
    if not isinstance(wav_paths, list):
        raise ValueError('wav_paths must be a list.')
    if delta2 and not delta1:
        delta1 = True

    feats = []
    for wav_path in wav_paths:
        # Read wav file
        fs, audio = scipy.io.wavfile.read(wav_path)

        if feature_type == 'mfcc':
            feat = mfcc(audio, samplerate=fs, numcep=feature_dim)
            if energy:
                energy_feat = fbank(audio, samplerate=fs, nfilt=feature_dim)[1]
                feat = np.c_[feat, energy_feat]
        else:
            fbank_feat, energy_feat = fbank(
                audio, samplerate=fs, nfilt=feature_dim)
            # NOTE(review): 'logmelfbank' passes validation but, as before,
            # only 'logfbank' takes the log -- confirm whether it should too.
            if feature_type == 'logfbank':
                fbank_feat = np.log(fbank_feat)
            feat = fbank_feat
            if energy:
                feat = np.c_[feat, energy_feat]

        if delta2:
            delta1_feat = _delta(feat, N=2)
            delta2_feat = _delta(delta1_feat, N=2)
            feat = np.c_[feat, delta1_feat, delta2_feat]
        elif delta1:
            delta1_feat = _delta(feat, N=2)
            feat = np.c_[feat, delta1_feat]

        # Normalize per wav
        feat = (feat - np.mean(feat)) / np.std(feat)
        feats.append(feat)

    inputs_seq_len = np.array([len(feat) for feat in feats], dtype=np.int32)
    if not feats:
        return None, inputs_seq_len

    # Size the batch from the longest utterance so every file fits.
    max_time = int(inputs_seq_len.max())
    input_size = feats[0].shape[-1]
    inputs = np.zeros((len(feats), max_time, input_size))
    for i, feat in enumerate(feats):
        inputs[i, :len(feat)] = feat

    return inputs, inputs_seq_len
Пример #27
0
    return (s[-1][0], s[-2], s[-1][1]
            )  # BH/1A_endpt.wav: sort by '1', 'BH', 'A'


# Process the files in the order defined by keyfunc.
filelist.sort(key=keyfunc)

for i, file in enumerate(filelist):

    rate, sig = wav.read(file)
    duration = sig.size / rate
    # Window length and step are chosen per file so that n_frames windows
    # with the given fractional overlap span the whole recording:
    # winlen + (n_frames - 1) * winstep == duration.
    winlen = duration / (n_frames * (1 - overlap) + overlap)
    winstep = winlen * (1 - overlap)
    feat, energy = fbank(sig,
                         rate,
                         winlen,
                         winstep,
                         nfilt=n_bands,
                         nfft=4096,
                         winfunc=np.hamming)
    feat = np.log(feat)
    # Transposed so that the filterbank bands run along axis 0 of the image.
    feat = feat.transpose()

    # plt.subplot(131)
    plt.imshow(feat)
    plt.axis('off')

    # feat2 = feat.copy()
    # feat2[feat2 < 4] = 0
    # plt.subplot(132)
    # plt.imshow(feat2)
Пример #28
0
def get_features(filename,
                 numcep,
                 numfilt,
                 winlen,
                 winstep,
                 method=1,
                 quaternion=False):
    """Build a cepstral feature matrix with stacked time derivatives.

    The wav file is read, filterbank energies are computed with `sf.fbank`,
    and cepstra are obtained by taking the log, applying an orthonormal
    type-2 DCT, keeping the first `numcep` columns and liftering with L=22.
    The cepstra are normalised per column as
    ``(x - mean) / (0.5 * std)`` and the log frame energy is appended as
    one extra column.

    :param filename: path of the wav file to read.
    :param numcep: number of DCT coefficients to keep.
    :param numfilt: number of filters passed to `sf.fbank` as `nfilt`.
    :param winlen: window length forwarded to `sf.fbank`.
    :param winstep: window step forwarded to `sf.fbank`.
    :param method: if truthy, the output columns are
        ``[base | d1 | d2 | d3]`` (gradients along axis 0); otherwise the
        base matrix is first padded with an equally sized block of zeros
        and the first and second gradients of that padded matrix are
        appended.
    :param quaternion: if True, reshape the result to
        ``(rows, 4, columns // 4)``.
    :returns: tuple ``(mat, data, samplerate)`` with the feature matrix,
        the raw samples and the sample rate returned by `wav.read`.
    """
    samplerate, data = wav.read(filename)

    # Filterbank energies plus the per-frame energy.
    # NOTE(review): `sf` is presumably python_speech_features' base module
    # (it must expose fbank, dct and lifter) -- confirm against the imports.
    fbank_energies, frame_energy = sf.fbank(data,
                                            samplerate,
                                            winlen,
                                            winstep,
                                            nfilt=numfilt)

    # Cepstra: log -> DCT-II -> truncate -> lifter.
    cepstra = sf.dct(np.log(fbank_energies), type=2, axis=1, norm='ortho')
    cepstra = cepstra[:, :numcep]
    cepstra = np.asarray(sf.lifter(cepstra, L=22))

    # Log energy as a single column so it can be concatenated.
    log_energy = np.log(frame_energy)
    log_energy = log_energy.reshape([log_energy.shape[0], 1])

    col_mean = np.mean(cepstra, axis=0)
    col_std = np.std(cepstra, axis=0)
    base = np.concatenate(((cepstra - col_mean) / (0.5 * col_std), log_energy),
                          axis=1)

    if method:
        # First, second and third order derivatives along axis 0.
        d1 = np.gradient(base)[0]
        d2 = np.gradient(d1)[0]
        d3 = np.gradient(d2)[0]
        mat = np.concatenate((base, d1, d2, d3), axis=1)
    else:
        # NOTE(review): the gradients here are taken over the zero-padded
        # matrix, so this branch yields 6x the base width rather than 4x --
        # confirm this is intended.
        padded = np.concatenate((base, np.zeros(shape=base.shape)), axis=1)
        d1 = np.gradient(padded)[0]
        d2 = np.gradient(d1)[0]
        mat = np.concatenate((padded, d1, d2), axis=1)

    if quaternion:
        mat = np.reshape(mat, (mat.shape[0], 4, mat.shape[1] // 4))

    return mat, data, samplerate
Пример #29
0
#coding=utf8
from python_speech_features import fbank
from python_speech_features import logfbank
import scipy.io.wavfile as wav

path = '/home/sw/Shin/Codes/DL4SS_Keras/Data_with_dev/male_test.wav'
(rate, sig) = wav.read(path)
print(rate, sig)
# print() with a single argument behaves the same on Python 2 and Python 3,
# unlike the bare `print x` statement, which is a SyntaxError on Python 3.
print(sig.shape)  #43520
# fbank returns a (features, energy) pair, so index 0 is the feature matrix.
fbank_feat = fbank(sig, rate, winstep=0.01, nfilt=40)
print(fbank_feat[0].shape)  # the 271 comes from 43520/(0.01s*16000)
    #aa,bb,cc,dd, plt = get_spectrogram(new_file_name_path)

    fd = 2048
    fs = 1024

    f_size = fd * fs

    (rate, sig) = wav.read(new_file_name_path)
    x_brahms, sr_brahms = librosa.load(file, duration=30, offset=30)

    mfcc_feat = mfcc(sig, samplerate=rate)  #(2992, 13)

    ipdb.set_trace()

    #mfcc_one_line = mfcc_feat.reshape(38896, 1)
    fbank_feat = fbank(sig, samplerate=rate)
    logfbank_feat = logfbank(sig, samplerate=rate)
    d_mfcc_feat = delta(mfcc_feat, 2)
    #gammatone.gtgram.gtgram(wave, fs, window_time, hop_time, channels, f_min)

    gtgram_function = gtgram.gtgram(sig, rate, .250, .125, 1, 20)

    print("mfcc_feat.shape:", mfcc_feat.shape)
    print("mfcc_one_line.shape", mfcc_one_line.shape)
    print("logfbank_feat.shape", logfbank_feat.shape)
    print("d_mfcc_feat.shape", d_mfcc_feat.shape)
    print("gtgram_function.shape", gtgram_function.shape)
    print("gtgram_function.shape.T", gtgram_function.T.shape)
    #ssc = ssc(sig,samplerate=rate)

    #print(logfbank_feat[1:3,:])
def wav2feature(wav_path, feature_type='fbank', feature_dim=40,
                use_energy=True, use_delta1=True, use_delta2=True,
                window=0.025, slide=0.01, dtype=np.float32):
    """Read wav file & convert to MFCC or log mel filterbank features.
    Args:
        wav_path (string): the path to a wav file
        feature_type (string, optional): fbank or mfcc
        feature_dim (int, optional): number of filters (fbank) or requested
            cepstral coefficients (mfcc), before energy and deltas are added
        use_energy (bool, optional): if True, append the frame energy
            returned by `fbank` as one extra column (it is not log-scaled)
        use_delta1 (bool, optional): if True, add delta features
        use_delta2 (bool, optional): if True, add delta and delta delta
            features (implies use_delta1)
        window (float, optional): window width to extract features
        slide (float, optional): extract features per 'slide'
        dtype (optional): default is np.float32. Currently unused; kept so
            that existing callers passing it keep working.
    Returns:
        feat (np.ndarray): A 2-D array of size `[T, D]`, where D is the base
            feature width (plus 1 with energy), multiplied by 2 when deltas
            are added or by 3 when delta deltas are added as well
    Raises:
        ValueError: if feature_type is unknown, or if the file cannot be
            read as wav and converting it with sox fails
    """
    # Local imports: only needed for the conversion fallback below.
    import os
    import tempfile

    if feature_type not in ['fbank', 'mfcc']:
        raise ValueError('feature_type is or "fbank" or "mfcc".')

    # Read wav file
    try:
        fs, audio = scipy.io.wavfile.read(wav_path)
    except ValueError:
        # Not readable as a plain wav (e.g. a NIST file): convert with sox
        # into a private temporary file so concurrent calls cannot clobber
        # each other, and always clean it up afterwards.
        fd, wav_path_tmp = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
        try:
            result = subprocess.call(
                ['sox', wav_path, '-t', 'wav', wav_path_tmp])
            if result != 0:
                raise ValueError

            # Try again
            fs, audio = scipy.io.wavfile.read(wav_path_tmp)
        finally:
            os.remove(wav_path_tmp)

    if feature_type == 'mfcc':
        # Forward window/slide here as well, so both feature types honour
        # the caller's framing (the defaults match the library defaults).
        feat = mfcc(audio,
                    samplerate=fs,
                    winlen=window,
                    winstep=slide,
                    numcep=feature_dim)
        if use_energy:
            # NOTE: only fbank function returns energy
            energy_feat = fbank(audio,
                                samplerate=fs,
                                winlen=window,
                                winstep=slide,
                                nfilt=feature_dim)[1]
    else:
        fbank_feat, energy_feat = fbank(audio,
                                        samplerate=fs,
                                        winlen=window,
                                        winstep=slide,
                                        nfilt=feature_dim,
                                        nfft=512,
                                        lowfreq=0,
                                        highfreq=None,
                                        preemph=0.97,
                                        winfunc=np.hamming)
        feat = np.log(fbank_feat)

    if use_energy:
        # NOTE: energy_feat may be not log-scale.
        energy_feat = energy_feat.reshape(-1, 1)
        feat = np.concatenate((feat, energy_feat), axis=1)

    if use_delta2:
        delta1_feat = _delta(feat, N=2)
        delta2_feat = _delta(delta1_feat, N=2)
        feat = np.concatenate((feat, delta1_feat, delta2_feat), axis=1)
    elif use_delta1:
        # Previously tested an undefined local `delta1`, which raised
        # NameError whenever use_delta2 was False.
        delta1_feat = _delta(feat, N=2)
        feat = np.concatenate((feat, delta1_feat), axis=1)

    return feat
Пример #32
0
def logfbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
             nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97):
    """Return log filterbank features together with the frame energy.

    All arguments are forwarded positionally to `fbank`. The natural log is
    applied to the filterbank features only; the energy is returned exactly
    as `fbank` produced it.

    :returns: tuple ``(log_features, energy)``.
    """
    filterbank, frame_energy = fbank(signal, samplerate, winlen, winstep,
                                     nfilt, nfft, lowfreq, highfreq, preemph)
    log_filterbank = np.log(filterbank)
    return log_filterbank, frame_energy
Пример #33
0
def _fbank(*args, **kwargs) -> np.ndarray:
    """Call `fbank` with the given arguments and keep only the features.

    `fbank` returns a pair; the second element is discarded.
    """
    features, _unused = fbank(*args, **kwargs)
    return features
 def make_features(file_path: str, **kwargs) -> np.ndarray:
     """ Read an audio file and return its log filterbank features.

     The samples and sample rate come from `wav.read`; extra keyword
     arguments are forwarded to `python_speech_features.fbank`. The energy
     returned by `fbank` is discarded.
     """
     sample_rate, samples = wav.read(file_path)
     filterbank, _ = python_speech_features.fbank(samples,
                                                  samplerate=sample_rate,
                                                  **kwargs)
     return np.log(filterbank)
Пример #35
0
def get_fbank(signal, target_sample_rate):
    """Return normalised 40-filter filterbank features for `signal`.

    The FFT size is the number of samples in 0.025 s at
    `target_sample_rate`. The energies returned by `fbank` are discarded,
    and the features are passed through `normalize_frames` before being
    returned as a new array.
    """
    fft_size = int(target_sample_rate * 0.025)
    features, _ = fbank(signal,
                        samplerate=target_sample_rate,
                        nfilt=40,
                        nfft=fft_size)
    return np.array(normalize_frames(features))