Example #1
File: data_util.py Project: moomou/mlab
def _encode_data(args):
    filename, mode, sr = args

    data, sr = librosa.core.load(filename, sr=sr)
    data, _ = librosa.effects.trim(data, top_db=15)
    duration = librosa.get_duration(y=data, sr=sr)

    if mode.name.startswith('spec'):
        data = np.log(abs(librosa.core.stft(y=data, n_fft=2**11))**2)
        data = data[..., np.newaxis]
        glog.debug('spec:: %s, %s', data.shape, data)
    elif mode.name.startswith('ssc'):
        data = ssc(data, sr, **SSC_CONFIG)
        data = data[..., np.newaxis]
        glog.debug('ssc:: %s, %s', data.shape, data)
    elif mode.name.startswith('mfcc'):
        raw = data
        data = mfcc(raw, sr, **MFCC_CONFIG)

        if mode == DataMode.mfcc_delta:
            data_delta = delta(data, 1)
            data = np.append(data, data_delta, axis=-1)
        elif mode == DataMode.mfcc_ssc:
            # stack subband centroids (computed from the raw signal) onto
            # the MFCCs; assumes MFCC_CONFIG and SSC_CONFIG share framing
            data_ssc = ssc(raw, sr, **SSC_CONFIG)
            data = np.append(data, data_ssc, axis=-1)

        data = data[..., np.newaxis]
        glog.debug('mfcc:: %s, %s', data.shape, data)
    elif mode == DataMode.fbank:
        data = logfbank(data, sr, **FBANK_CONFIG)
        data = data[..., np.newaxis]
        glog.debug('fbank:: %s, %s', data.shape, data)
    else:
        assert False, 'Invalid option:: %s' % mode

    return data, duration
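The snippet depends on a DataMode enum and the SSC_CONFIG/MFCC_CONFIG/FBANK_CONFIG parameter dicts defined elsewhere in data_util.py. A minimal sketch of what those definitions might look like (names taken from the snippet, all values assumed):

from enum import Enum

class DataMode(Enum):
    spec = 'spec'
    ssc = 'ssc'
    mfcc = 'mfcc'
    mfcc_delta = 'mfcc_delta'
    mfcc_ssc = 'mfcc_ssc'
    fbank = 'fbank'

# assumed parameter sets; the real project may use different values
SSC_CONFIG = dict(winlen=0.025, winstep=0.01, nfilt=26)
MFCC_CONFIG = dict(winlen=0.025, winstep=0.01, numcep=13)
FBANK_CONFIG = dict(winlen=0.025, winstep=0.01, nfilt=26)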
Example #2
def mfcc_loop(n_first, n_last, grade):
    ssc_feats = []
    fbank_feats = []
    for i in range(n_first, n_last):
        # raw string so the Windows path backslashes are not treated as escapes
        (rate, sig) = wav.read(
            r"C:\Work\speech_recognition\{}\sample_{}.wav".format(grade, i))
        ssc_feats.append(list(ssc(sig, rate, nfft=1103)))
        fbank_feats.append(list(logfbank(sig, rate, nfft=1103)))
    return ssc_feats, fbank_feats
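The nfft=1103 above suggests 44.1 kHz source audio: python_speech_features uses a 25 ms window by default and needs nfft to be at least the frame length in samples. A quick check of the arithmetic (sample rate assumed):

import math

rate = 44100                  # assumed sample rate of the samples
frame_len = 0.025 * rate      # default 25 ms window -> 1102.5 samples
print(math.ceil(frame_len))   # 1103, matching the nfft used above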
Example #3
def extract_from_signal(fs, signal, nfft):
    mfcc = psf.mfcc(signal, fs, nfft=nfft)
    fbank = psf.fbank(signal, fs, nfft=nfft)[0]
    logfbank = psf.logfbank(signal, fs, nfft=nfft)
    ssc = psf.ssc(signal, fs, nfft=nfft)

    mfcc_mean = [mfcc[:, i].mean() for i in range(mfcc.shape[1])]
    mfcc_std = [mfcc[:, i].std() for i in range(mfcc.shape[1])]
    fbank_mean = [fbank[:, i].mean() for i in range(fbank.shape[1])]
    fbank_std = [fbank[:, i].std() for i in range(fbank.shape[1])]
    logfbank_mean = [logfbank[:, i].mean() for i in range(logfbank.shape[1])]
    logfbank_std = [logfbank[:, i].std() for i in range(logfbank.shape[1])]
    ssc_mean = [ssc[:, i].mean() for i in range(ssc.shape[1])]
    ssc_std = [ssc[:, i].std() for i in range(ssc.shape[1])]

    return mfcc_mean + mfcc_std + fbank_mean + fbank_std + logfbank_mean + logfbank_std + ssc_mean + ssc_std
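    # NOTE: everything below this return is unreachable; it looks like an
    # alternative speechpy-based implementation kept for reference.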

    #signal_preemphasized = speechpy.processing.preemphasis(signal, cof=0.98)
    frames = speechpy.processing.stack_frames(signal,
                                              sampling_frequency=fs,
                                              frame_length=0.020,
                                              frame_stride=0.01,
                                              filter=lambda x: np.ones((x, )),
                                              zero_padding=True)

    power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
    mfcc = speechpy.feature.mfcc(signal,
                                 sampling_frequency=fs,
                                 frame_length=0.020,
                                 frame_stride=0.01,
                                 num_filters=40,
                                 fft_length=512,
                                 low_frequency=0,
                                 high_frequency=None)
    logenergy = speechpy.feature.lmfe(signal,
                                      sampling_frequency=fs,
                                      frame_length=0.020,
                                      frame_stride=0.01,
                                      num_filters=40,
                                      fft_length=512,
                                      low_frequency=0,
                                      high_frequency=None)

    # power_spectrum_mean = [power_spectrum[:, i].mean() for i in range(power_spectrum.shape[1])]
    # power_spectrum_std = [power_spectrum[:, i].std() for i in range(power_spectrum.shape[1])]
    mfcc_mean = [mfcc[:, i].mean() for i in range(mfcc.shape[1])]
    mfcc_std = [mfcc[:, i].std() for i in range(mfcc.shape[1])]
    # logenergy_mean = [logenergy[:, i].mean() for i in range(logenergy.shape[1])]
    # logenergy_std = [logenergy[:, i].std() for i in range(logenergy.shape[1])]
    return mfcc_mean + mfcc_std
Example #4
def extract_features(audio, rate):
    """Extract 26-dim MFCC features from an audio signal and stack their
    deltas, log-filterbank energies, and subband centroids alongside them."""

    mfcc_feature = mfcc.mfcc(audio,
                             rate,
                             0.025,
                             0.01,
                             26,
                             nfft=1200,
                             preemph=0.97,
                             appendEnergy=True)
    # mfcc_feature = preprocessing.scale(mfcc_feature)
    mfcc_feature1 = mfcc.logfbank(audio, rate, 0.025, 0.01, 26, nfft=1200)
    mfcc_feature2 = mfcc.ssc(audio, rate, 0.025, 0.01, 26, nfft=1200)
    delta = mfcc.delta(mfcc_feature, 26)
    combined = np.hstack((mfcc_feature, delta, mfcc_feature1, mfcc_feature2))
    return combined
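With the parameters above, each frame stacks 26 MFCCs, 26 deltas, 26 log-filterbank energies, and 26 subband centroids. A usage sketch (hypothetical file path):

import scipy.io.wavfile as wav

rate, audio = wav.read('speaker.wav')   # hypothetical input
feats = extract_features(audio, rate)
print(feats.shape)                      # (num_frames, 104)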
Example #5
def get_feature_from_python_speech_features(wave_name):
    from python_speech_features import logfbank
    from python_speech_features import mfcc
    from python_speech_features import delta
    from python_speech_features import fbank
    from python_speech_features import ssc
    import scipy.io.wavfile as wav
    import numpy
    (rate, sig) = wav.read(wave_name)
    mfcc_feat = mfcc(sig, rate)
    d_mfcc_feat = delta(mfcc_feat, 2)
    d_d_mfcc_feat = delta(d_mfcc_feat, 2)
    fbank_feat, energy = fbank(sig, rate)
    logfbank_feat = logfbank(sig, rate)
    centroids = ssc(sig, rate)
    feat = numpy.hstack(
        (mfcc_feat, d_mfcc_feat, d_d_mfcc_feat, logfbank_feat, centroids))
    return feat.T  # transposed: each column holds one frame's feature vector
Example #6
def makeMFCC(name, sampleSize):
    rate = [[] for i in range(sampleSize)]
    sig = [[] for i in range(sampleSize)]
    fbank_feat = [[] for i in range(sampleSize)]
    for i in range(sampleSize):
        word = name + str(i) + ".wav"
        (rate[i], sig[i]) = wav.read(word)
        # mfcc/logfbank are computed but never used; only the per-frame
        # averages of the SSC features are collected, despite the
        # fbank_feat name
        mfcc_featI = mfcc(sig[i], rate[i], nfft=1103)
        fbank_featI = logfbank(sig[i], rate[i], nfft=1103)
        ssc_featI = ssc(sig[i], rate[i], nfft=1103)
        for j in ssc_featI:
            fbank_feat[i].append(np.average(j))
    plt.figure()
    plt.plot(fbank_feat)
    plt.savefig("Result/MFCCaverage.png", dpi=300)
    return fbank_feat
Example #7
def get_ssc_feat(
        file_path, samplerate=16000, winlen=0.025,
        winstep=0.01, nfilt=26, nfft=2048,
        lowfreq=0, highfreq=None, preemph=0.97):
    """Get Spectral Subband Centroid features given a signal path.
    @param: file_path – file path of the signal.
    @param: samplerate – the samplerate of the signal we are working with.
    @param: winlen – the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    @param: winstep – the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    @param: nfilt – the number of filters in the filterbank, default 26.
    @param: nfft – the FFT size. Default is 512.
    @param: lowfreq – lowest band edge of mel filters. In Hz, default is 0.
    @param: highfreq – highest band edge of mel filters. In Hz, default is samplerate/2
    @param: preemph – apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    @return: tuple of feature vector (999, 26) (num_frames, nfilt)
    """
    # note: the rate read from the file is what gets passed to ssc(),
    # not the samplerate argument
    sample_rate, signal = wf.read(file_path)
    feat_vec = ssc(signal, sample_rate, winlen, winstep, nfilt,
                   nfft, lowfreq, highfreq, preemph)
    return feat_vec
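A usage sketch (hypothetical path); with the defaults above the result has nfilt=26 columns:

feats = get_ssc_feat('utterance.wav')   # hypothetical file
print(feats.shape)                      # (num_frames, 26)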
Example #8
def featuresExtraction_temp(request):
    rate, data = wavfile.read("file.wav")
    F_vectors, f_names = audioFeatureExtraction.stFeatureExtraction(
        data, rate, 0.050 * rate, 0.025 * rate)
    f_vectors1 = logfbank(data, rate)

    f_vectors3 = ssc(data, rate)
    F_vectors = np.transpose(F_vectors)
    F_vectors = np.array((F_vectors))
    length = F_vectors.shape[0]

    F_vectors = list(F_vectors)
    for i in range(length):

        F_vectors[i] = list(F_vectors[i])
        f_vectors1[i] = list(f_vectors1[i])
        F_vectors[i].extend(f_vectors1[i])
        f_vectors3[i] = list(f_vectors3[i])
        F_vectors[i].extend(f_vectors3[i])
    print(len(F_vectors))
    print(len(F_vectors[0]))
    print(length)
    username = request.user.username
    name = user_list.objects.get(user_name=str(username))
    userid = name.id
    for j in range(length):
        print("j>>", j)
        # create() returns the new row, so it can be updated directly
        features = voiceFeatures_temp.objects.create(user_name=str(username),
                                                     user_id=userid,
                                                     frame_index=j)
        for i in range(86):
            setattr(features, 'f%d' % i, F_vectors[j][i])
        features.save()
    file = 'test_v.csv'
    table = 'mainpage_voiceFeatures_temp'
    convert_modeltocsv_voice(userid, userid, file, table)
Example #9
def extract_features():

    # Some variable and path initializations
    df = pd.DataFrame(columns=['mfcc_feat', 'fbank_feat', 'ssc'],
                      index=range(0, 1000))
    y_df = pd.DataFrame(columns=['classification'], index=range(0, 1000))

    # Getting all the genres (the top-level directory names)
    genres = next(os.walk(os.getcwd()))[1]

    song_no = 0
    # Extracting Features of the songs
    for i in genres:
        #for genre in genres1
        for x in range(100):

            # for song in batch of songs

            # Extracting the features
            (rate, sig) = wav.read(i + "/" + i + ".000" + "%02d" % x + '.wav')
            mfcc_feat = mfcc(sig, rate, nfft=551)
            fbank_feat = logfbank(sig, rate, nfft=551)
            # python_speech_features expects a 1-D signal
            ssc_var = ssc(sig, rate, nfft=551)

            # Adding features to the pandas dataframe -- all 2985 frames
            df.iloc[song_no][0] = mfcc_feat[0:2985, :]
            df.iloc[song_no][1] = fbank_feat[0:2985, :]
            df.iloc[song_no][2] = ssc_var[0:2985, :]
            y_df.iloc[song_no][0] = i

            # Incrementing the song number
            song_no += 1

    return df, y_df
Example #10
def featuresExtraction(username1):
    print("sucessssssssssssssss")
    rate, data = wavfile.read("file.wav")
    F_vectors, f_names = audioFeatureExtraction.stFeatureExtraction(
        data, rate, 0.050 * rate, 0.025 * rate)
    f_vectors1 = logfbank(data, rate)

    f_vectors3 = ssc(data, rate)
    F_vectors = np.transpose(F_vectors)
    F_vectors = np.array((F_vectors))
    length = F_vectors.shape[0]

    F_vectors = list(F_vectors)
    for i in range(length):

        F_vectors[i] = list(F_vectors[i])
        f_vectors1[i] = list(f_vectors1[i])
        F_vectors[i].extend(f_vectors1[i])
        f_vectors3[i] = list(f_vectors3[i])
        F_vectors[i].extend(f_vectors3[i])
    print(len(F_vectors))
    print(len(F_vectors[0]))
    print(length)
    username = username1['username1']
    name = user_list.objects.get(user_name=str(username))
    userid = name.id
    for j in range(length):
        print("j>>", j)
        # create() returns the new row, so it can be updated directly
        features = voiceFeatures.objects.create(user_name=str(username),
                                                user_id=userid,
                                                frame_index=j)
        for i in range(86):
            setattr(features, 'f%d' % i, F_vectors[j][i])
        features.save()
Example #11
def extract_ssc(self, y, sr, cmn=False):
    feat = ssc(y, sr, winfunc=np.hamming, **self.ssc_kwargs)
    if cmn:
        feat -= np.mean(feat, axis=0, keepdims=True)
    return feat.astype('float32')
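extract_ssc is a method lifted out of a class; a minimal class it might live in, with the contents of ssc_kwargs assumed:

import numpy as np
from python_speech_features import ssc

class SSCExtractor:
    def __init__(self, winlen=0.025, winstep=0.01, nfilt=26, nfft=512):
        # assumed keyword set, forwarded verbatim to python_speech_features.ssc
        self.ssc_kwargs = dict(winlen=winlen, winstep=winstep,
                               nfilt=nfilt, nfft=nfft)

    def extract_ssc(self, y, sr, cmn=False):
        feat = ssc(y, sr, winfunc=np.hamming, **self.ssc_kwargs)
        if cmn:
            # mean-normalize each centroid track across frames
            feat -= np.mean(feat, axis=0, keepdims=True)
        return feat.astype('float32')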
Example #12
def get_ssc(signal, rate):
    return ssc(signal, rate)
Example #13
def _column_stats(feat):
    # seven summary statistics for every column of a feature matrix
    stats = []
    for i in range(feat.shape[1]):
        col = feat[:, i]
        stats.append(np.array([
            np.mean(col),
            np.var(col),
            np.amax(col),
            np.amin(col),
            scipy.stats.kurtosis(col),
            scipy.stats.skew(col),
            scipy.stats.iqr(col)
        ]))
    return np.array(stats).flatten()


def featurex(filepath):
    # print(filepath)
    (rate, X) = wav.read(filepath)
    ceps = mfcc(X, rate)    # MFCCs
    delt = delta(ceps, 2)   # delta-MFCCs
    sscz = ssc(X, rate)     # spectral subband centroids
    filt = delta(delt, 2)   # delta-delta-MFCCs

    # all four feature matrices get the same per-column statistics
    source = np.concatenate(
        [_column_stats(f) for f in (ceps, delt, sscz, filt)])
    return source
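A usage sketch (hypothetical path): with the library defaults the vector length is (13 MFCC + 13 delta + 26 SSC + 13 delta-delta) coefficients x 7 statistics = 455.

vec = featurex('clip.wav')   # hypothetical file
print(vec.shape)             # (455,)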
Example #14
def generateStage2FFT(frameSize,
                      modFrameSize,
                      modWindowSize,
                      nModFrames,
                      p,
                      fft1matrix,
                      modWin,
                      form="magnitude",
                      nFilts=30,
                      nMFCCs=15):
    """
    This function takes an input spectrogram and returns a tensor modulation spectrum.
    Note form can be "magnitude", "complex" or "real", which describes what is done to the data resulting from each FFT.
    
    Inputs:
        - frameSize: the step size of the acoustic windows in seconds, form "0.001"
        - modFrameSiza: the step size of the modulation windows in seconds, form "0.1"
        - modWindowSize: the modulation window size in seconds, form "1"
        - nModFrames: the number of modulation frames that the speech signal is broken up into
        - p: the number of FFT points based on the acoustic window, form "48"
        - fft1matrix: a 2D matrix size [nFrames x p] that contains the FFT of each acoustic window
        - modWin: the modulation window as a numpy array
        - form: a choice of "magnitude", "complex" or "real" which determines the content of fft2matrix
    Outputs:
        - q: the number of FFT points based on the modulation window, form "1000"
        - fft2matrix: a 2D matrix size [nModFrames x (p*q)] that has the flattened modulation spectrum
          per row
        - logfft2matrix: the decibel magnitude of fft2matrix
    """
    import math
    from scipy.fftpack import fft
    #from scipy.signal.windows import hamming, hann
    import numpy as np
    from python_speech_features import fbank, mfcc, ssc
    import warnings

    if form == "fbank":
        q = nFilts
    elif form == "mfcc":
        q = nMFCCs
    else:
        q = round(modWindowSize / (2 * frameSize) +
                  1)  # After applying Nyquist
    fft2matrix = np.zeros(
        (nModFrames, p, q)) if form != "complex" else np.zeros(
            (nModFrames, p, q), dtype=np.complex128)
    logfft2matrix = np.zeros((nModFrames, p, q))

    scale2 = modWin.sum()

    for i in range(nModFrames):
        if i % 100 == 0:
            print("{:,}".format(i), end="\r")
        for j in range(p):
            sigExtract2 = fft1matrix[i*int(modFrameSize/frameSize):i*int(modFrameSize/frameSize)\
                            +round(modWindowSize/frameSize), j]
            sigWin2 = fft(sigExtract2 * modWin) / scale2
            nfft = 2**math.ceil(np.log2(round(modWindowSize / frameSize)))
            if form == "magnitude":
                fft2matrix[i, j, :] = np.absolute(sigWin2)[:q]
            elif form == "complex":
                fft2matrix[i, j, :] = sigWin2[:q]
            elif form == "real":
                fft2matrix[i, j, :] = np.real(sigWin2)[:q]
            elif form == "fbank":
                fft2matrix[i, j, :] = fbank(sigExtract2*modWin, samplerate=round(1/frameSize), winlen=modWindowSize, winstep=modFrameSize, nfilt=nFilts, \
                                     nfft=nfft, lowfreq=0, highfreq=round(1/(2*frameSize)), preemph=0)[0][:q]
            elif form == "mfcc":
                fft2matrix[i, j, :] = mfcc(sigExtract2 * modWin,
                                           samplerate=round(1 / frameSize),
                                           winlen=modWindowSize,
                                           winstep=modFrameSize,
                                           numcep=nMFCCs,
                                           nfilt=nFilts,
                                           nfft=nfft,
                                           lowfreq=0,
                                           highfreq=round(1 / (2 * frameSize)),
                                           preemph=0,
                                           ceplifter=22,
                                           appendEnergy=False)[0][:q]
            else:
                raise ValueError(
                    'form must be "magnitude", "complex", "real", "fbank" or "mfcc".'
                )

            if form != "complex":
                logfft2matrix[i, j, :] = 20 * np.log10(fft2matrix[i, j, :])[:q]
            else:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    logfft2matrix[i, j, :] = 10 * np.log10(
                        fft2matrix[i, j, :] * np.conj(fft2matrix[i, j, :]))[:q]

    print("fft2matrix shape is {}".format(fft2matrix.shape))

    if form == "fbank":
        freqs2 = ssc(np.array(fft1matrix[0, :round(modWindowSize/frameSize)]), samplerate=(1/frameSize), winlen=modWindowSize, winstep=modFrameSize, nfilt=nFilts, nfft=2048, \
                            lowfreq=0, highfreq=round(1/(2*frameSize)), preemph=0)[0][:q]
        return q, fft2matrix, logfft2matrix, freqs2
    else:
        return q, fft2matrix, logfft2matrix
Example #15
error2d(psf_feat, csf_feat)
print('Energy')
error1d(psf_energy, csf_energy)

print('')
print('logfbank')
print('========')
psf_feat = psf.logfbank(audio)
csf_feat = csf.logfbank(audio)
assert np.shape(psf_feat) == np.shape(csf_feat)
error2d(psf_feat, csf_feat)

print('')
print('ssc')
print('===')
psf_ssc = psf.ssc(audio)
csf_ssc = csf.ssc(audio)
assert np.shape(psf_ssc) == np.shape(csf_ssc)
error2d(psf_ssc, csf_ssc)

print('')
print('hz2mel')
print('======')
assert get_error(psf.hz2mel(8000), csf.hz2mel(8000)) <= acceptable_error
assert get_error(psf.hz2mel(16000), csf.hz2mel(16000)) <= acceptable_error
assert get_error(csf.mel2hz(csf.hz2mel(8000)), 8000) <= acceptable_error
print(' ✓')

print('')
print('mel2hz')
print('======')
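The comparison script uses helpers defined earlier in the file; plausible stand-ins (tolerance value and error metric assumed):

import numpy as np

acceptable_error = 1e-6   # assumed tolerance

def get_error(a, b):
    # maximum relative error between two scalars or arrays
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return np.max(np.abs(a - b) / (np.abs(b) + 1e-12))

def error1d(a, b):
    print(' max relative error: {:g}'.format(get_error(a, b)))

def error2d(a, b):
    print(' max relative error: {:g}'.format(get_error(a, b)))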
Example #16
points_data0 = floor(data0.shape[0] / fs / t)
points_data1 = floor(data1.shape[0] / fs / t)
data0 = data0[:int(points_data0 * fs * t)]
data1 = data1[:int(points_data1 * fs * t)]

mfcc_0 = mfcc(data0, fs, winlen=t, nfft=int(t * fs), winstep=t)
mfcc_1 = mfcc(data1, fs, winlen=t, nfft=int(t * fs), winstep=t)
mfcc_feat = np.concatenate((mfcc_0, mfcc_1))

# =============================================================================
# fbank_0 = logfbank(data0,fs,winlen=t,nfft=t*fs,winstep=t)
# fbank_1 = logfbank(data1,fs,winlen=t,nfft=t*fs,winstep=t)
# fbank_feat = np.concatenate((fbank_0,fbank_1))
# =============================================================================

hop = 0.5
sc_feat_0 = ssc(data0, fs, winlen=t, nfft=int((t * fs) / hop), winstep=t)
sc_feat_1 = ssc(data1, fs, winlen=t, nfft=int((t * fs) / hop), winstep=t)
sc_feat = np.concatenate((sc_feat_0, sc_feat_1))

# =============================================================================
# rms_feat = np.array([])
# points = fs*t
# data_ampl = np.abs(np.fft.fft(data0))
# data_ampl = data_ampl[1:]
# data_energy = data_ampl ** 2
# energy = np.append(data_energy,data_energy[-1])
# energy = energy.reshape((floor(points),-1))
# rms = librosa.feature.rmse(S=energy)
# rms = rms.T
# rms_feat = np.append(rms_feat,rms)
# data_ampl = np.abs(np.fft.fft(data1))
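The snippet above starts mid-script; a plausible preamble for the names it uses, with the file paths and the window length t as assumptions:

from math import floor
import numpy as np
import scipy.io.wavfile as wav
from python_speech_features import mfcc, ssc

t = 0.05                            # analysis window length in seconds (assumed)
fs, data0 = wav.read('clip0.wav')   # hypothetical input files sharing one rate
_, data1 = wav.read('clip1.wav')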
Example #17
def create_dataset_csv(csv_dir, test_audio_name='test_audio.wav'):
    loaded_data = dict()
    loaded_data['wav'] = []
    loaded_data['phoneme'] = []
    loaded_data['landmark'] = []
    loaded_data['maya_pos'] = []
    loaded_data['maya_param'] = []
    loaded_data['face_close'] = []
    loaded_data['face_open'] = []
    loaded_data['pose'] = []
    loaded_data['file_len'] = {'train': 0, 'test': 0}
    loaded_data['clip_len'] = {'train': [], 'test': []}
    loaded_data['file_dir'] = {'train': [], 'test': []}
    dataset_type_order = ['test']

    csv_dir += test_audio_name[:-4] + '/'
    try_mkdir(csv_dir)
    try_mkdir(csv_dir + 'test/')
    errf = open(csv_dir + 'err.txt', 'w')

    for dataset_type_i in range(0, 1):  # all from train file list
        dataset_type = dataset_type_order[dataset_type_i]

        file_list = {'n': 1, 'wav': [lpw_dir + test_audio_name]}

        for nClip in range(0, file_list['n']):

            print(
                '\n==================== Processing file {:} ===================='
                .format(file_list["wav"][nClip]))
            if not os.path.isfile(file_list["wav"][nClip]):
                print('# ' + str(nClip) + ' Non-existent file: ' +
                      file_list["wav"][nClip])
                errf.write('# ' + str(nClip) + ' Non-existent file: ' +
                           file_list["wav"][nClip] + '\n')
                continue

            # WAV
            (rate, sig) = wav.read(file_list["wav"][nClip])
            if (sig.ndim > 1):
                sig = sig[:, 0]  # pick mono-acoustic track
            else:
                print('Notice: ' + file_list["wav"][nClip] + ' is mono-track')

            # fps = (nLandmark + 1) / (sig.shape[0] / rate)
            fps = 25
            errf.write(file_list["wav"][nClip] + ' FPS: {:}\n'.format(fps))
            print('FPS: {:}'.format(fps))
            winstep = 1.0 / fps / mfcc_win_step_per_frame / up_sample_rate
            mfcc_feat = mfcc(sig,
                             samplerate=rate,
                             winlen=0.025,
                             winstep=winstep,
                             numcep=13)
            logfbank_feat = logfbank(sig,
                                     samplerate=rate,
                                     winlen=0.025,
                                     winstep=winstep,
                                     nfilt=26)
            ssc_feat = ssc(sig,
                           samplerate=rate,
                           winlen=0.025,
                           winstep=winstep,
                           nfilt=26)
            full_feat = np.concatenate([mfcc_feat, logfbank_feat, ssc_feat],
                                       axis=1)
            # full_feat = logfbank_feat

            nFrames_represented_by_wav = math.floor(
                full_feat.shape[0] / mfcc_win_step_per_frame / up_sample_rate)
            mfcc_lines = full_feat[0:nFrames_represented_by_wav *
                                   mfcc_win_step_per_frame *
                                   up_sample_rate, :].reshape(
                                       int(nFrames_represented_by_wav *
                                           up_sample_rate),
                                       int(full_feat.shape[1] *
                                           mfcc_win_step_per_frame))
            '''
            # ==================== cut the tail of lpw to make sure they are in same length ==================== #
            '''
            # print("Original length of lpw + maya_param/pos: " + str(nFrames_represented_by_wav))
            aligned_length_wav = mfcc_lines
            '''
            # ==================== process each lpw file ==================== #
            '''

            npWav = np.array(aligned_length_wav)
            print("Load #Clip {:d}/{:}, wav {:}".format(
                nClip, file_list['n'], npWav.shape))
            loaded_data['wav'].append(npWav)

            # length of each dataset_type
            loaded_data['file_len'][dataset_type] += npWav.shape[0]
            loaded_data['clip_len'][dataset_type].append(npWav.shape[0])
            loaded_data['file_dir'][dataset_type].append(
                file_list["wav"][nClip][28:-4] + ' ' +
                str(loaded_data['file_len'][dataset_type] - npWav.shape[0]) +
                ' ' + str(npWav.shape[0]))
            # end for nClip loop
            # break

        # end for dataset_type loop
        # break
    '''
    # ==================== save file ==================== #
    '''
    key_order = ['wav']
    for key_i in range(0, 1):
        key = key_order[key_i]
        #  print(key)
        # ==================== wav normalize file ==================== #
        npKey = loaded_data[key][0]
        for i in range(1, len(loaded_data[key])):
            npKey = np.concatenate((npKey, loaded_data[key][i]), axis=0)

        # Use saved std & mean
        mean_std = np.loadtxt(lpw_dir + 'saved_param/wav_mean_std.csv')
        npKey_mean = mean_std[0:65]
        npKey_std = mean_std[65:130]

        def normal_data(loaded_data, mean, std):
            normed = (loaded_data - mean) / std
            return normed

        npKey = normal_data(npKey, npKey_mean, npKey_std)
        np.savetxt(csv_dir + key + '_mean_std.csv',
                   np.append(npKey_mean, npKey_std),
                   fmt='%.5f',
                   delimiter=' ')
        np.savetxt(csv_dir + key + '_raw.csv',
                   npKey,
                   fmt='%.5f',
                   delimiter=' ')
        del npKey

        def reshape_based_on_win_size(loaded_data, i, win_size, start_idx):
            npWav = (loaded_data[i] - npKey_mean) / npKey_std
            listWav = list(range(start_idx, start_idx + npWav.shape[0]))
            half_win_size = int(win_size / 2)
            pad_head = [start_idx for _ in range(half_win_size)]
            pad_tail = [listWav[-1] for _ in range(half_win_size)]
            pad_npWav = np.array(pad_head + listWav + pad_tail)
            npKey = np.zeros(shape=(npWav.shape[0], win_size))
            for np_i in range(0, npWav.shape[0]):
                npKey[np_i] = pad_npWav[np_i:np_i + win_size].reshape(
                    1, win_size)
            return npKey

        npKey = reshape_based_on_win_size(loaded_data['wav'], 0, win_size, 0)

        for i in range(1, len(loaded_data[key])):
            npKeytmp = reshape_based_on_win_size(loaded_data['wav'], i,
                                                 win_size, npKey.shape[0])
            npKey = np.concatenate((npKey, npKeytmp), axis=0)

        idx = 0
        for dataset_type_i in range(0, 1):
            dataset_type = dataset_type_order[dataset_type_i]
            dataset_type_data_len = loaded_data['file_len'][dataset_type]
            cur_npKey = npKey[idx:idx + dataset_type_data_len]
            print('Save {:} - {:} file as shape of {:}'.format(
                dataset_type, key, cur_npKey.shape))
            np.savetxt(csv_dir + dataset_type + '/' + key + '.csv',
                       cur_npKey,
                       fmt='%d',
                       delimiter=' ')
            idx += dataset_type_data_len

    for dataset_type in {'test'}:
        npLen = np.array(loaded_data['clip_len'][dataset_type])
        np.savetxt(csv_dir + dataset_type + '/clip_len.csv',
                   npLen,
                   fmt='%d',
                   delimiter=' ')
        # print("Saved clip length file to " + dataset_type + '/clip_len.csv')
        npLen = np.array(loaded_data['file_dir'][dataset_type])
        np.savetxt(csv_dir + dataset_type + '/file_dir.csv',
                   npLen,
                   fmt='%s',
                   delimiter=' ')
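create_dataset_csv leans on several module-level names from the surrounding script (lpw_dir, mfcc_win_step_per_frame, up_sample_rate, win_size, try_mkdir). Plausible stand-ins, all assumed:

import os

lpw_dir = './lpw/'              # hypothetical data directory
mfcc_win_step_per_frame = 4     # acoustic windows per video frame (assumed)
up_sample_rate = 1              # temporal up-sampling factor (assumed)
win_size = 64                   # context window width in frames (assumed)

def try_mkdir(path):
    # create the directory if it does not already exist
    os.makedirs(path, exist_ok=True)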
Example #18
with open(sFList, 'r') as fList:
    lWavFiles = fList.read().splitlines()
    for sLine in lWavFiles:
        sWavFile, sFeatureFile = sLine.split()
        print(sWavFile)
        iRate, lSamples = wav.read(sWavFile)
        print(sWavFile, end='\r')
        # Creating features
        if sFeatureType == 'mfcc':
            aFeatures = mfcc(lSamples, iRate)
        elif sFeatureType == 'fbank':
            # fbank() returns (features, energy); keep only the features
            aFeatures = fbank(lSamples, iRate)[0]
        elif sFeatureType == 'lfbank':
            aFeatures = logfbank(lSamples, iRate, nfilt=iSize)
        elif sFeatureType == 'ssc':
            aFeatures = ssc(lSamples, iRate)
        else:
            print('Error: Unknown feature type ' + sFeatureType)
            sys.exit(1)

        # Computing time derivatives
        if sDrivatives == 'D':
            aDFeatures = delta(aFeatures, iDeltaWindow)
            aFeatures = np.c_[aFeatures, aDFeatures]
        elif sDrivatives == 'A':
            aDFeatures = delta(aFeatures, iDeltaWindow)
            aAFeatures = delta(aDFeatures, iAccWindow)
            aFeatures = np.c_[aFeatures, aDFeatures, aAFeatures]
        elif sDrivatives == 'T':
            aDFeatures = delta(aFeatures, iDeltaWindow)
            aAFeatures = delta(aDFeatures, iAccWindow)
Example #19
def pspeech_featurize(file):
    # convert if .mp3 to .wav or it will fail
    convert = False
    if file[-4:] == '.mp3':
        convert = True
        os.system('ffmpeg -i %s %s' % (file, file[0:-4] + '.wav'))
        file = file[0:-4] + '.wav'

    (rate, sig) = wav.read(file)
    mfcc_feat = mfcc(sig, rate)
    fbank_feat = logfbank(sig, rate)
    ssc_feat = ssc(sig, rate)

    one_ = np.mean(mfcc_feat, axis=0)
    one = get_labels(one_, 'mfcc_', 'means')
    two_ = np.std(mfcc_feat, axis=0)
    two = get_labels(two_, 'mfcc_', 'stds')
    three_ = np.amax(mfcc_feat, axis=0)
    three = get_labels(three_, 'mfcc_', 'max')
    four_ = np.amin(mfcc_feat, axis=0)
    four = get_labels(four_, 'mfcc_', 'min')
    five_ = np.median(mfcc_feat, axis=0)
    five = get_labels(five_, 'mfcc_', 'medians')

    six_ = np.mean(fbank_feat, axis=0)
    six = get_labels(six_, 'fbank_', 'means')
    seven_ = np.std(fbank_feat, axis=0)
    seven = get_labels(seven_, 'fbank_', 'stds')
    eight_ = np.amax(fbank_feat, axis=0)
    eight = get_labels(eight_, 'fbank_', 'max')
    nine_ = np.amin(fbank_feat, axis=0)
    nine = get_labels(nine_, 'fbank_', 'min')
    ten_ = np.median(fbank_feat, axis=0)
    ten = get_labels(ten_, 'fbank_', 'medians')

    eleven_ = np.mean(ssc_feat, axis=0)
    eleven = get_labels(eleven_, 'spectral_centroid_', 'means')
    twelve_ = np.std(ssc_feat, axis=0)
    twelve = get_labels(twelve_, 'spectral_centroid_', 'stds')
    thirteen_ = np.amax(ssc_feat, axis=0)
    thirteen = get_labels(thirteen_, 'spectral_centroid_', 'max')
    fourteen_ = np.amin(ssc_feat, axis=0)
    fourteen = get_labels(fourteen_, 'spectral_centroid_', 'min')
    fifteen_ = np.median(ssc_feat, axis=0)
    fifteen = get_labels(fifteen_, 'spectral_centroid_', 'medians')

    labels = one + two + three + four + five + six + seven + eight + nine + ten + eleven + twelve + thirteen + fourteen + fifteen
    features = np.append(one_, two_)
    features = np.append(features, three_)
    features = np.append(features, four_)
    features = np.append(features, five_)
    features = np.append(features, six_)
    features = np.append(features, seven_)
    features = np.append(features, eight_)
    features = np.append(features, nine_)
    features = np.append(features, ten_)
    features = np.append(features, eleven_)
    features = np.append(features, twelve_)
    features = np.append(features, thirteen_)
    features = np.append(features, fourteen_)
    features = np.append(features, fifteen_)

    if convert:
        os.remove(file)

    print(features.shape)
    print(len(labels))

    return features, labels
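The snippet relies on a get_labels helper that is not shown; a minimal sketch consistent with how it is called (one label per coefficient, naming assumed):

def get_labels(values, prefix, suffix):
    # e.g. get_labels(one_, 'mfcc_', 'means') -> ['mfcc_0_means', ...]
    return ['%s%d_%s' % (prefix, i, suffix) for i in range(len(values))]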
Example #20
    def predict(self, rate, sig, group_list):
        #---------------------------------------------------------------------------#
        #test
        #print len(sig)
        fps = 25
        # print('FPS: {:}'.format(fps))
        winstep = 1.0 / fps / up_sample_rate
        mfcc_feat = mfcc(sig,
                         samplerate=rate,
                         winlen=0.025,
                         winstep=winstep,
                         numcep=13)
        logfbank_feat = logfbank(sig,
                                 samplerate=rate,
                                 winlen=0.025,
                                 winstep=winstep,
                                 nfilt=26)
        ssc_feat = ssc(sig,
                       samplerate=rate,
                       winlen=0.025,
                       winstep=winstep,
                       nfilt=26)
        full_feat = np.concatenate([mfcc_feat, logfbank_feat, ssc_feat],
                                   axis=1)
        # full_feat = logfbank_feat

        aligned_length_wav = full_feat
        npWav = np.array(aligned_length_wav)
        n_samples = len(npWav)

        # normalize wav-raw
        mean, std = np.loadtxt('utl/mean_std.txt')
        wav_raw = npWav
        wav_raw = (wav_raw - mean) / std
        #wav_raw = wav_raw[:, sel_id]

        # grouping
        x = list()
        x.append(wav_raw)
        n_batch = 1
        n_sample_needed = n_batch * batch_size - len(x)
        x += [x[-1] for _ in range(n_sample_needed)]
        x = np.array(x)
        #print x.shape

        state_test = self.sess.run(self.initial_state)
        batch_x = x
        seq_len = np.array([n_steps] + [0] * (batch_size - 1))
        feed = {
            self.x: batch_x,
            self.phase: False,
            self.dropout: 0,
            #self.batch_size: batch_size,
            self.initial_state: state_test,
            self.seq_len: seq_len
        }
        batch_y, _ = self.sess.run([self.pred, self.final_state],
                                   feed_dict=feed)

        y = batch_y[0]

        y = np.array(y)
        # np.savetxt("prediction/{}.txt".format(step),y)
        pred_id = np.argmax(y)  # avoid shadowing the builtin id()

        phonemes = group_list[pred_id]
        #print phonemes
        #consider the up sample rate
        return phonemes
Example #21
def featurex(filepath):
    # librosa.load returns (samples, rate); resampled here to 48 kHz
    (X, rate) = librosa.load(filepath, sr=48000)
    ceps = mfcc(X, rate, nfft=2048)    # MFCCs
    delt = delta(ceps, 2)              # delta-MFCCs
    sscz = ssc(X, rate, nfft=2048)     # spectral subband centroids
    filt = delta(delt, 2)              # delta-delta-MFCCs
    #zeroo=delta(sscz,2)
    #librosa.feature.zero_crossing_rate(X,rate)

    def column_stats(feat):
        # seven summary statistics per feature column
        stats = []
        for i in range(feat.shape[1]):
            col = feat[:, i]
            stats.append(np.array([
                np.mean(col),
                np.var(col),
                np.amax(col),
                np.amin(col),
                scipy.stats.kurtosis(col),
                scipy.stats.skew(col),
                scipy.stats.iqr(col)
            ]))
        return np.array(stats).flatten()

    # all four feature matrices get the same per-column statistics
    source = np.concatenate(
        [column_stats(f) for f in (ceps, delt, sscz, filt)])
    return source
Example #22
def generateStage1FFT(fs,
                      sig,
                      frameSize,
                      windowSize,
                      nFrames,
                      acWin,
                      form="magnitude",
                      phases=False,
                      nFilts=40,
                      nMFCCs=19):
    """
    This function takes an input speech signal and returns the STFT.
    Specify form="magnitude" to generate Hilbert transform later - the Hilbert transform only works on real signals.
    Note the scaling has been added as that is in SciPy STFT and seems necessary for signal reconstruction.
    
    Inputs:
        - fs: the sampling frequency of the speech files, form "16000"
        - sig: the speech signal to analyse
        - frameSize: the step size of the acoustic windows in seconds, form "0.001"
        - windowSize: the acoustic window size in seconds, form "0.03"
        - nFrames: the number of acoustic frames that the speech signal is broken up into
        - acWin: the acoustic window as a numpy array
        - form: a choice of "magnitude", "complex", "real", "fbank" or "mfcc" which determines the content of fft1matrix
        - phases: whether to return the phase information as a separate matrix, form "True"
        - nFilts: the number of mel filters used when form="fbank"
        - nMFCCs: the number of cepstral coefficients used when form="mfcc"
    Outputs:
        - p: the number of FFT points based on the acoustic window, form "48"
        - fft1matrix: a 2D matrix size [nFrames x p] that contains the FFT of each acoustic window
        - fft1matrixphases: the phase of each element, only returned if phases = True
    """
    import math
    from scipy.fftpack import fft
    #from scipy.signal.windows import hamming, hann
    import numpy as np
    from python_speech_features import fbank, mfcc, ssc

    if form == "fbank":
        p = nFilts
    elif form == "mfcc":
        p = nMFCCs
    else:
        p = round(
            windowSize * fs / 2 + 1
        )  # p is the number of acoustic frequency bins after limiting with Nyquist

    fft1matrix = np.zeros((nFrames, p)) if form != "complex" else np.zeros(
        (nFrames, p), dtype=np.complex128)

    scale1 = acWin.sum()

    if phases:
        fft1matrixphases = np.zeros((nFrames, p))

    for i in range(nFrames):
        sigExtract = np.array(
            sig[i * round(frameSize * fs):i * round(frameSize * fs) +
                round(windowSize * fs)])  # Ignoring pre-emphasis filter here
        sigWin = fft(sigExtract * acWin) / scale1
        nfft = 2**math.ceil(np.log2(round(windowSize * fs)))
        if len(sigExtract) == round(windowSize * fs):
            if form == "magnitude":
                fft1matrix[i, :] = np.absolute(sigWin)[:p]
            elif form == "complex":
                fft1matrix[i, :] = sigWin[:p]
            elif form == "real":
                fft1matrix[i, :] = np.real(sigWin)[:p]
            elif form == "fbank":
                fft1matrix[i, :] = fbank(sigExtract*acWin, samplerate=fs, winlen=windowSize, winstep=frameSize, nfilt=nFilts, \
                                    nfft=nfft, lowfreq=0, highfreq=int(fs/2), preemph=0)[0][:p]
            elif form == "mfcc":
                fft1matrix[i, :] = mfcc(sigExtract * acWin,
                                        samplerate=fs,
                                        winlen=windowSize,
                                        winstep=frameSize,
                                        numcep=nMFCCs,
                                        nfilt=nFilts,
                                        nfft=nfft,
                                        lowfreq=0,
                                        highfreq=int(fs / 2),
                                        preemph=0,
                                        ceplifter=22,
                                        appendEnergy=False)[0][:p]
            else:
                raise ValueError(
                    'form must be "magnitude", "complex", "real", "fbank" or "mfcc".'
                )
            if phases:
                fft1matrixphases[i, :] = np.angle(fft(sigExtract * acWin))[:p]
    print("fft1matrix shape is {}".format(fft1matrix.shape))
    if phases:
        return p, fft1matrix, fft1matrixphases
    elif form == "fbank":
        freqs1 = ssc(np.array(sig[:round(windowSize*fs)]), samplerate=fs, winlen=windowSize, winstep=frameSize, nfilt=40, nfft=512, \
                                lowfreq=0, highfreq=round(fs/2), preemph=0)[0][:p]
        return p, fft1matrix, freqs1
    else:
        return p, fft1matrix
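generateStage1FFT (Example #22) produces the fft1matrix that generateStage2FFT (Example #14) consumes. An end-to-end sketch with every window parameter assumed:

import numpy as np
import scipy.io.wavfile as wav

fs, sig = wav.read('speech.wav')          # hypothetical 16 kHz file
frameSize, windowSize = 0.001, 0.03       # assumed acoustic framing
modFrameSize, modWindowSize = 0.1, 1.0    # assumed modulation framing

acWin = np.hamming(round(windowSize * fs))
modWin = np.hamming(round(modWindowSize / frameSize))

# number of full windows that fit in the signal (assumed bookkeeping)
nFrames = int((len(sig) / fs - windowSize) / frameSize) + 1
p, fft1matrix = generateStage1FFT(fs, sig, frameSize, windowSize,
                                  nFrames, acWin, form="magnitude")

nModFrames = int((nFrames * frameSize - modWindowSize) / modFrameSize) + 1
q, fft2matrix, logfft2matrix = generateStage2FFT(
    frameSize, modFrameSize, modWindowSize, nModFrames, p,
    fft1matrix, modWin, form="magnitude")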