示例#1
0
def _key_fnc(
    sample: NDArray[Float32],
    frequency_rate: int,
    windowfnc: Window,
    key_type: KeyFunction,
):
    """
    Compute the key of ``sample`` with the requested key function.

    The returned value is used as the key of the sample in the samples map.
    For the spectral centroid, ``frequency_rate`` should be equal to half of
    the sample rate.

    Raises:
        ValueError: if ``key_type`` is not one of the handled key functions.
    """

    def make_window():
        # Built lazily so that nothing is constructed for an unknown key type.
        return estd.Windowing(type=windowfnc.value)

    if key_type == KeyFunction.CENTROID:
        centroid = estd.Centroid(range=frequency_rate)
        return _get_centroid(sample, centroid, estd.Spectrum(), make_window())

    if key_type == KeyFunction.MAX:
        return _get_max(sample, estd.Spectrum(), make_window())

    if key_type == KeyFunction.MFCC:
        return _get_mfcc(sample, estd.MFCC(), estd.Spectrum(), make_window())

    if key_type == KeyFunction.MELBANDS:
        return _get_melbands(
            sample, estd.MFCC(), estd.Spectrum(), make_window()
        )

    if key_type == KeyFunction.MELBANDS_LOG:
        # Same mel bands as above, passed through an element-wise log operator.
        to_log = estd.UnaryOperator(type="log")
        bands = _get_melbands(
            sample, estd.MFCC(), estd.Spectrum(), make_window()
        )
        return to_log(bands)

    raise ValueError("Keyfunction is not defined!")
示例#2
0
def extract(fname, outpath, fs=22050, fsize=1024, hsize=512):
    """
    Compute the MFCCs of the audio file fname, frame by frame.

    Inputs:
        fname   -- name of the audio file.
        outpath -- output path of processed files (accepted for interface
                   compatibility; this function does not write anything).
        fs      -- sampling frequency (Hz) the file is loaded at.
        fsize   -- size of each analysis frame, in samples.
        hsize   -- hop size between frames, in samples.
    Outputs:
        numpy array with one row per frame, each row holding the
        20 MFCC coefficients of that frame.
    """
    audio = es.MonoLoader(filename=fname, sampleRate=fs)()

    window = es.Windowing(type='hann')
    spectrum = es.Spectrum()
    # The magnitude spectrum of an fsize-sample frame has fsize // 2 + 1 bins
    # (513 for the default frame size of 1024).
    # NOTE(review): MFCC keeps its default sample rate while the audio is
    # loaded at `fs` -- confirm whether sampleRate=fs should be passed here.
    mfcc = es.MFCC(inputSize=fsize // 2 + 1, numberCoefficients=20)

    mfccs = []
    for frame in es.FrameGenerator(audio, frameSize=fsize, hopSize=hsize):
        _mfcc_bands, mfcc_coeffs = mfcc(spectrum(window(frame)))
        mfccs.append(mfcc_coeffs)

    # Return the computed coefficients, not the MFCC algorithm object.
    return np.array(mfccs)
def getMBE(audio):
    '''
    Mel band energy feature.

    Relies on the module-level settings fs, highFrequencyBound, framesize
    and hopsize.

    :param audio: audio samples to analyse
    :return: numpy array with the mel band energies of every frame
             (one row per frame)
    '''
    # MFCC used for pattern classification; numberBands is left at its default
    mel_extractor = ess.MFCC(sampleRate=fs,
                             highFrequencyBound=highFrequencyBound,
                             inputSize=framesize + 1)

    # every frame is zero-padded to twice its length before the FFT
    fft_size = 2 * framesize
    to_spectrum = ess.Spectrum(size=fft_size)
    apply_window = ess.Windowing(type='hann',
                                 zeroPadding=fft_size - framesize)

    frames = ess.FrameGenerator(audio, frameSize=framesize, hopSize=hopsize)
    # element [0] of the MFCC output is the mel band energies
    band_energies = [
        mel_extractor(to_spectrum(apply_window(frame)))[0]
        for frame in frames
    ]
    return np.array(band_energies)
示例#4
0
def extractor(filename):
    """Compute HTK-style MFCCs for an audio file and show them as an image."""
    sample_rate = 44100
    signal = ess.MonoLoader(filename=filename, sampleRate=sample_rate)()
    # dynamic range expansion as done in HTK implementation
    signal = signal * 2**15

    frame_len = 1102  # corresponds to htk default WINDOWSIZE = 250000.0
    hop_len = 441  # corresponds to htk default TARGETRATE = 100000.0
    fft_len = 2048

    # htk default USEHAMMING = T; frames are zero-padded up to the FFT size
    apply_window = ess.Windowing(type='hamming',
                                 size=frame_len,
                                 zeroPadding=fft_len - frame_len,
                                 normalized=False,
                                 zeroPhase=False)

    to_spectrum = ess.Spectrum(size=fft_len)

    htk_mfcc = ess.MFCC(
        inputSize=fft_len // 2 + 1,  # number of spectrum bins
        type='magnitude',  # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',  # filter weights computed in the Hz domain
        highFrequencyBound=8000,  # htk default
        lowFrequencyBound=0,  # htk default
        numberBands=26,  # htk default NUMCHANS = 26
        numberCoefficients=13,
        normalize='unit_max',  # htk filters have constant height = 1
        dctType=3,  # htk uses DCT type III
        logType='log',
        liftering=22)  # htk default CEPLIFTER = 22

    # startFromZero = True, validFrameThresholdRatio = 1 : the way htk
    # computes windows
    frames = ess.FrameGenerator(signal,
                                frameSize=frame_len,
                                hopSize=hop_len,
                                startFromZero=True,
                                validFrameThresholdRatio=1)
    coeffs_per_frame = [
        htk_mfcc(to_spectrum(apply_window(frame)))[1] for frame in frames
    ]

    # convert the list to an essentia.array and transpose it so that each
    # column is one frame
    coeff_matrix = essentia.array(coeffs_per_frame).T

    # plot everything except the first coefficient (energy)
    plt.imshow(coeff_matrix[1:, :], aspect='auto', interpolation='none')
    plt.show()  # unnecessary if you started "ipython --pylab"
示例#5
0
    def essentiaObjectInit(self):
        """Create the Essentia MFCC, spectrum and windowing objects.

        Reads self.fs, self.highFrequencyBound, self.frameSize and
        self.numberBands; stores the algorithms as self.MFCC80,
        self.SPECTRUM and self.WINDOW.
        """
        # frames are zero-padded to twice the frame size before the FFT
        fft_size = 2 * self.frameSize

        self.MFCC80 = ess.MFCC(
            sampleRate=self.fs,
            highFrequencyBound=self.highFrequencyBound,
            inputSize=self.frameSize + 1,
            numberBands=self.numberBands,
        )
        self.SPECTRUM = ess.Spectrum(size=fft_size)
        self.WINDOW = ess.Windowing(
            type='hann',
            zeroPadding=fft_size - self.frameSize,
        )
示例#6
0
文件: features.py 项目: ctralie/acoss
    def mfcc_htk(self, window_length=22050, nmfcc=13, n_mels=26, fmax=8000, lifterexp=22):
        """
        Get MFCCs 'the HTK way' with the help of Essentia
        https://github.com/MTG/essentia/blob/master/src/examples/tutorial/example_mfcc_the_htk_way.py
        Uses the default parameters from that example except for the hop
        length (taken from self.hop_length) and a much longer window length
        (which has been found to work better for covers).

        Parameters
        ----------
        window_length: int
            Length of the window to use for the STFT
        nmfcc: int
            Number of MFCC coefficients to compute
        n_mels: int
            Number of frequency bands to use
        fmax: int
            Maximum frequency
        lifterexp: int
            Liftering coefficient handed to the MFCC algorithm

        Returns
        -------
        ndarray(nmfcc, nframes)
            An array of all of the MFCC frames
        """
        # FFT length: smallest power of two that is >= window_length
        exponent = np.ceil(np.log(window_length) / np.log(2))
        fft_size = int(2 ** exponent)

        # htk default USEHAMMING = T; zero-pad each window up to the FFT size
        apply_window = estd.Windowing(type='hamming',
                                      size=window_length,
                                      zeroPadding=fft_size - window_length,
                                      normalized=False,
                                      zeroPhase=False)

        to_spectrum = estd.Spectrum(size=fft_size)
        htk_mfcc = estd.MFCC(inputSize=fft_size // 2 + 1,  # spectrum bins
                             type='magnitude',  # htk uses mel filterbank magnitude
                             warpingFormula='htkMel',  # htk's mel warping formula
                             weighting='linear',  # filter weights computed in Hz domain
                             highFrequencyBound=fmax,  # 8000 is htk default
                             lowFrequencyBound=0,  # htk default
                             numberBands=n_mels,  # htk default NUMCHANS = 26
                             numberCoefficients=nmfcc,
                             normalize='unit_max',  # htk filters have constant height = 1
                             dctType=3,  # htk uses DCT type III
                             logType='log',
                             liftering=lifterexp)  # htk default CEPLIFTER = 22

        # startFromZero = True, validFrameThresholdRatio = 1 : the way htk
        # computes windows
        frames = estd.FrameGenerator(self.audio_vector,
                                     frameSize=window_length,
                                     hopSize=self.hop_length,
                                     startFromZero=True,
                                     validFrameThresholdRatio=1)
        coeffs_per_frame = [
            htk_mfcc(to_spectrum(apply_window(frame)))[1] for frame in frames
        ]

        return np.array(coeffs_per_frame, dtype=np.float32).T
def audio_features(audio_win):
    """
    Return the audio features of one window as a flat list.

    The list holds the MFCC coefficients of the whole window, followed by
    element 2 and elements 5..10 of the RhythmDescriptors output.
    """
    # the spectrum is taken over the whole window, whose length must be even
    if audio_win.shape[0] % 2 == 1:
        audio_win = audio_win[:-1]

    to_spectrum = esst.Spectrum(size=audio_win.shape[0])
    magnitudes = to_spectrum(audio_win)

    mfcc_algo = esst.MFCC(inputSize=magnitudes.shape[0], sampleRate=SR)
    _bands, coeffs = mfcc_algo(magnitudes)

    rhythm = esst.RhythmDescriptors()(audio_win)

    features = coeffs.tolist()
    features.append(rhythm[2])
    features.extend(rhythm[5:11])
    return features
def getFeature(audio, d=True, nbf=False):
    '''
    MFCC features of the given audio.

    Relies on the module-level settings fs, highFrequencyBound, framesize
    and hopsize.

    :param audio: audio samples to analyse
    :param d: if True, stack the MFCCs with their first and second deltas
    :param nbf: only used when d is False; if True, stack every frame with
                its neighbours shifted by 1 to 5 frames in both directions
    :return: feature array with one row per frame
    '''
    # MFCC used for pattern classification; numberBands is left at its default
    mfcc_algo = ess.MFCC(sampleRate=fs,
                         highFrequencyBound=highFrequencyBound,
                         inputSize=framesize + 1)

    # frames are zero-padded to twice the frame size before the FFT
    fft_size = 2 * framesize
    to_spectrum = ess.Spectrum(size=fft_size)
    apply_window = ess.Windowing(type='hann',
                                 zeroPadding=fft_size - framesize)

    frames = ess.FrameGenerator(audio, frameSize=framesize, hopSize=hopsize)
    coeffs_per_frame = [
        mfcc_algo(to_spectrum(apply_window(frame)))[1] for frame in frames
    ]

    if d:
        # static coefficients plus delta and delta-delta, stacked per frame
        static = np.array(coeffs_per_frame).transpose()
        delta = Fdeltas(static, w=5)
        delta_delta = Fdeltas(delta, w=5)
        return np.transpose(np.vstack((static, delta, delta_delta)))

    if not nbf:
        return np.array(coeffs_per_frame)

    # neighbouring-frame context: append left/right shifted copies
    static = np.array(coeffs_per_frame).transpose()
    stacked = np.array(static, copy=True)
    for shift in range(1, 6):
        shifted_right = Fprev_sub(static, w=shift)
        shifted_left = Fprev_sub(static, w=-shift)
        stacked = np.vstack((stacked, shifted_left, shifted_right))
    return np.array(np.transpose(stacked), dtype='float32')
示例#9
0
def extractor(filename):
    """Return the HTK-style MFCCs of an audio file as a list, one entry per frame."""
    sample_rate = 44100
    signal = ess.MonoLoader(filename=filename, sampleRate=sample_rate)()
    # dynamic range expansion as done in HTK implementation
    signal = signal * 2**15

    frame_len = 1102  # corresponds to htk default WINDOWSIZE = 250000.0
    hop_len = 441  # corresponds to htk default TARGETRATE = 100000.0
    fft_len = 2048

    # htk default USEHAMMING = T; frames are zero-padded up to the FFT size
    apply_window = ess.Windowing(type='hamming',
                                 size=frame_len,
                                 zeroPadding=fft_len - frame_len,
                                 normalized=False,
                                 zeroPhase=False)

    to_spectrum = ess.Spectrum(size=fft_len)

    htk_mfcc = ess.MFCC(
        inputSize=fft_len // 2 + 1,  # number of spectrum bins
        type='magnitude',  # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',  # filter weights computed in the Hz domain
        highFrequencyBound=8000,  # htk default
        lowFrequencyBound=0,  # htk default
        numberBands=26,  # htk default NUMCHANS = 26
        numberCoefficients=13,
        normalize='unit_max',  # htk filters have constant height = 1
        dctType=3,  # htk uses DCT type III
        logType='log',
        liftering=22)  # htk default CEPLIFTER = 22

    # startFromZero = True, validFrameThresholdRatio = 1 : the way htk
    # computes windows
    frames = ess.FrameGenerator(signal,
                                frameSize=frame_len,
                                hopSize=hop_len,
                                startFromZero=True,
                                validFrameThresholdRatio=1)

    # keep only the cepstral coefficients; the mel bands are discarded
    return [htk_mfcc(to_spectrum(apply_window(frame)))[1] for frame in frames]
示例#10
0
def extractor(filename):
    """Plot the mel spectrum of an audio file as smoothed by an MFCC -> IDCT round trip."""
    frame_len = 1024
    hop_len = 512
    sample_rate = 44100

    # these settings must be the same for the MFCC and the IDCT computation
    n_bands = 26
    dct_type = 2
    lifter = 0
    n_coeffs = 13

    signal = ess.MonoLoader(filename=filename, sampleRate=sample_rate)()
    apply_window = ess.Windowing(type='hamming', normalized=False)
    to_spectrum = ess.Spectrum()

    mfcc_algo = ess.MFCC(
        numberBands=n_bands,
        # only the first N coefficients are kept: the fewer, the more lossy
        # (blurry) the smoothed mel spectrum will be
        numberCoefficients=n_coeffs,
        weighting='linear',  # filter weights computed in Hz domain (optional)
        normalize='unit_max',  # htk-style filters of constant height = 1 (optional)
        dctType=dct_type,
        logType='log',
        liftering=lifter)

    inverse_dct = ess.IDCT(inputSize=n_coeffs,
                           outputSize=n_bands,
                           dctType=dct_type,
                           liftering=lifter)

    frames = ess.FrameGenerator(signal, frameSize=frame_len, hopSize=hop_len)
    smoothed_frames = []
    for frame in frames:
        _bands, coeffs = mfcc_algo(to_spectrum(apply_window(frame)))
        # np.exp inverts the log taken in the MFCC computation
        smoothed_frames.append(np.exp(inverse_dct(coeffs)))

    # convert the list to an essentia.array and transpose it so that each
    # column is one frame
    smoothed = essentia.array(smoothed_frames).T

    plt.imshow(smoothed, aspect='auto', interpolation='none')
    plt.show()  # unnecessary if you started "ipython --pylab"
def feature_extractor_standard(audio_in, frameSize, hopSize, aggLen):
    """
    Extract frame-wise spectral features and aggregate them over blocks.

    For every frame the MFCCs (low frequency bound 40), spectral centroid,
    spectral flatness and PitchYinFFT pitch confidence are computed. The
    frames are then grouped into consecutive blocks of aggLen frames, and
    the mean and variance of every feature are taken per block.

    :param audio_in: audio samples to analyse
    :param frameSize: analysis frame size handed to FrameGenerator
    :param hopSize: hop size handed to FrameGenerator
    :param aggLen: number of frames aggregated into one output row
    :return: 2-D array with one row per block; the columns are
             [mean MFCC, var MFCC, mean centroid, var centroid,
              mean flatness, var flatness, mean pitch confidence,
              var pitch confidence]
    """
    # algorithm objects
    win = es.Windowing()
    spec = es.Spectrum()
    centroid = es.Centroid()
    flatness = es.Flatness()
    mfcc = es.MFCC(lowFrequencyBound=40)
    pitchYin = es.PitchYinFFT()

    # frame-by-frame features
    mfcc_ftrsArray = []
    sCentroidArray = []
    sFlatnessArray = []
    pConfArray = []

    for frame in es.FrameGenerator(audio_in, frameSize=frameSize, hopSize=hopSize):
        spectrum = spec(win(frame))
        _band_energies, mfcc_ftrs = mfcc(spectrum)
        sCentroid = centroid(spectrum)
        sFlatness = flatness(spectrum)
        _pitch, pitchConf = pitchYin(spectrum)

        mfcc_ftrsArray.append(mfcc_ftrs)
        sCentroidArray.append(sCentroid)
        sFlatnessArray.append(sFlatness)
        pConfArray.append(pitchConf)

    meanMFCC = []
    varMFCC = []
    meanCent = []
    varCent = []
    meanFlat = []
    varFlat = []
    meanPConf = []
    varPConf = []
    # `range` instead of the Python-2-only `xrange`, so this runs on both
    # Python 2 and Python 3.
    # NOTE(review): with this stop bound the final block is skipped when the
    # number of frames is an exact multiple of aggLen -- confirm whether
    # that is intended before changing it (it alters the number of rows).
    for ii in range(0, len(mfcc_ftrsArray) - aggLen, aggLen):
        block = slice(ii, ii + aggLen)
        meanMFCC.append(np.mean(mfcc_ftrsArray[block], axis=0))
        varMFCC.append(np.var(mfcc_ftrsArray[block], axis=0))
        meanCent.append(np.mean(sCentroidArray[block]))
        varCent.append(np.var(sCentroidArray[block]))
        meanFlat.append(np.mean(sFlatnessArray[block]))
        varFlat.append(np.var(sFlatnessArray[block]))
        meanPConf.append(np.mean(pConfArray[block]))
        varPConf.append(np.var(pConfArray[block]))

    def as_column(values):
        # scalar-per-block statistics become a single column
        return np.transpose(np.array(values, ndmin=2))

    return np.concatenate(
        (np.array(meanMFCC), np.array(varMFCC),
         as_column(meanCent), as_column(varCent),
         as_column(meanFlat), as_column(varFlat),
         as_column(meanPConf), as_column(varPConf)),
        axis=1)
示例#12
0
def compute_beatsync_features(ticks, audio):
    """Compute beat-synchronous MFCC, HPCP and Tonnetz features.

    ``ticks`` is the set of beats the features are synchronised to. The
    three feature sets are returned as plain lists, in the order
    (mfcc, hpcp, tonnetz).
    """
    mfcc_feature = STFTFeature(FRAME_SIZE, HOP_SIZE, WINDOW_TYPE,
                               ES.MFCC(numberCoefficients=14),
                               ticks, SAMPLE_RATE)
    hpcp_feature = STFTFeature(FRAME_SIZE, HOP_SIZE, WINDOW_TYPE,
                               ES.HPCP(),
                               ticks, SAMPLE_RATE)

    logging.info("Computing Beat-synchronous MFCCs...")
    mfcc = mfcc_feature.compute_features(audio)

    logging.info("Computing Beat-synchronous HPCPs...")
    hpcp = hpcp_feature.compute_features(audio)

    # the Tonnetz features are derived from the HPCP (chroma) features
    logging.info("Computing Beat-synchronous Tonnetz...")
    tonnetz = utils.chroma_to_tonnetz(hpcp)

    return mfcc.tolist(), hpcp.tolist(), tonnetz.tolist()
def getMFCCBands1D(audio, nbf=False):
    '''
    Mel bands feature, used for pdnn acoustic model training.
    The feature of every frame is a 1d vector, returned in float32 format.

    Relies on the module-level settings fs, highFrequencyBound, framesize
    and hopsize.

    :param audio: audio samples to analyse
    :param nbf: bool, whether to append the neighbouring frames
                (shifted by 1 and 2 frames in both directions)
    :return: float32 array with one row per frame
    '''
    mel_extractor = ess.MFCC(sampleRate=fs,
                             highFrequencyBound=highFrequencyBound,
                             inputSize=framesize + 1,
                             numberBands=80)

    # frames are zero-padded to twice the frame size before the FFT
    fft_size = 2 * framesize
    to_spectrum = ess.Spectrum(size=fft_size)
    apply_window = ess.Windowing(type='hann',
                                 zeroPadding=fft_size - framesize)

    frames = ess.FrameGenerator(audio, frameSize=framesize, hopSize=hopsize)
    # element [0] of the MFCC output is the mel bands
    bands_per_frame = [
        mel_extractor(to_spectrum(apply_window(frame)))[0]
        for frame in frames
    ]

    if not nbf:
        return np.array(bands_per_frame, dtype='float32')

    bands = np.array(bands_per_frame).transpose()
    right_1 = Fprev_sub(bands, w=1)
    left_1 = Fprev_sub(bands, w=-1)
    right_2 = Fprev_sub(bands, w=2)
    left_2 = Fprev_sub(bands, w=-2)
    with_context = np.vstack((bands, right_1, left_1, right_2, left_2))

    return np.array(np.transpose(with_context), dtype='float32')
def getMFCCBands2D(audio, framesize, nbf=False, nlen=10):
    '''
    Mel bands feature, returned in float32 format.
    With neighbouring frames enabled, the feature of each time stamp holds
    the frame together with nlen shifted copies on either side.

    Relies on the module-level settings fs, highFrequencyBound and hopsize.

    :param audio: audio samples to analyse
    :param framesize: analysis frame size handed to FrameGenerator
    :param nbf: bool, whether to stack the neighbouring frames
    :param nlen: number of neighbouring frames stacked on each side
    :return: float32 array with one row per frame
    '''
    mel_extractor = ess.MFCC(sampleRate=fs,
                             highFrequencyBound=highFrequencyBound,
                             inputSize=framesize + 1,
                             numberBands=80)

    # frames are zero-padded to twice the frame size before the FFT
    fft_size = 2 * framesize
    to_spectrum = ess.Spectrum(size=fft_size)
    apply_window = ess.Windowing(type='hann',
                                 zeroPadding=fft_size - framesize)

    frames = ess.FrameGenerator(audio, frameSize=framesize, hopSize=hopsize)
    # element [0] of the MFCC output is the mel bands
    bands_per_frame = [
        mel_extractor(to_spectrum(apply_window(frame)))[0]
        for frame in frames
    ]

    if not nbf:
        return np.array(bands_per_frame, dtype='float32')

    bands = np.array(bands_per_frame).transpose()
    stacked = np.array(bands, copy=True)
    for shift in range(1, nlen + 1):
        shifted_right = Fprev_sub(bands, w=shift)
        shifted_left = Fprev_sub(bands, w=-shift)
        # grow outwards: right-shifted copy on top, left-shifted at the bottom
        stacked = np.vstack((shifted_right, stacked, shifted_left))

    return np.array(stacked.transpose(), dtype='float32')
示例#15
0
    def _calculate_features_for_audio(self, audio):
        """Summarise an audio signal as a single feature vector.

        Spectral contrast, spectral valley and MFCC coefficients are computed
        per frame (band limited to 100-7000 Hz). For each of the three, the
        mean, standard deviation and skewness over the frames are appended,
        followed by the mean and standard deviation of the frame-to-frame
        differences.

        :param audio: audio samples to analyse
        :return: 1-D single-precision array holding all statistics
        """
        frame_size, hop_size = 2048, 1024
        low_f, high_f = 100, 7000

        apply_window = ess.Windowing(type='hann')
        to_spectrum = ess.Spectrum(size=frame_size)
        mfcc_algo = ess.MFCC(lowFrequencyBound=low_f,
                             highFrequencyBound=high_f)
        contrast_algo = ess.SpectralContrast(lowFrequencyBound=low_f,
                                             highFrequencyBound=high_f)
        pool = essentia.Pool()

        frames = ess.FrameGenerator(audio,
                                    frameSize=frame_size,
                                    hopSize=hop_size)
        for frame in frames:
            magnitudes = to_spectrum(apply_window(frame))
            contrast, valley = contrast_algo(magnitudes)
            _bands, coeffs = mfcc_algo(magnitudes)
            pool.add('spec_contrast', contrast)
            pool.add('spec_valley', valley)
            pool.add('mfcc_coeff', coeffs)

        features = []
        for descriptor in ('spec_contrast', 'spec_valley', 'mfcc_coeff'):
            values = pool[descriptor]
            # differences between consecutive frames
            deltas = values[1:, :] - values[:-1, :]
            statistics = (
                np.average(values, axis=0),
                np.std(values, axis=0),
                scipy.stats.skew(values, axis=0),
                np.average(deltas, axis=0),
                np.std(deltas, axis=0),
            )
            for statistic in statistics:
                features.extend(statistic)

        return np.array(features, dtype='single')
示例#16
0
def mfcc(x,
         M=WINDOW_SIZE_MFCC,
         N=FFT_SIZE_MFCC,
         H=HOP_SIZE_MFCC,
         fs=SR,
         window_type=WINDOW_TYPE_MFCC,
         n_mfcc=N_MFCC):
    '''
    Extract the MFCC coefficients of every STFT frame of a signal.

    :param x: audio samples
    :param M: analysis window (frame) size
    :param N: FFT size
    :param H: hop size
    :param fs: sample rate
    :param window_type: window type handed to the Windowing algorithm
    :param n_mfcc: number of MFCC coefficients per frame
    :return: essentia array with one row of coefficients per frame
    '''
    signal = essentia.array(x)

    # essentia algorithm instances
    to_spectrum = ess.Spectrum(size=N)
    apply_window = ess.Windowing(size=M, type=window_type)
    mfcc_algo = ess.MFCC(numberCoefficients=n_mfcc,
                         inputSize=int(N / 2 + 1),
                         sampleRate=fs,
                         highFrequencyBound=int(fs / 2 - 1))

    # window each frame, take its magnitude spectrum and keep the cepstral
    # coefficients (element [1] of the MFCC output)
    frames = ess.FrameGenerator(signal,
                                frameSize=M,
                                hopSize=H,
                                startFromZero=True)
    coeffs_per_frame = [
        mfcc_algo(to_spectrum(apply_window(frame)))[1] for frame in frames
    ]

    return essentia.array(coeffs_per_frame)
示例#17
0
文件: mfcc.py 项目: szetumer/tabla
def _get_features(audio_path):
    """Return the per-coefficient mean MFCC values of an audio file.

    Relies on the module-level settings N (FFT size), M (frame size),
    H (hop size) and fs (sample rate).

    :param audio_path: path of the audio file to load
    :return: (headers, features) where headers are the names
             'mean_mfcc_0' .. 'mean_mfcc_11' and features the matching means
             over all frames
    """
    to_spectrum = ess.Spectrum(size=N)
    apply_window = ess.Windowing(size=M, type='hann')
    mfcc_algo = ess.MFCC(numberCoefficients=12)
    signal = ess.MonoLoader(filename=audio_path, sampleRate=fs)()

    frames = ess.FrameGenerator(signal, frameSize=M, hopSize=H,
                                startFromZero=True)
    # one row of 12 coefficients per frame
    coeff_matrix = np.array([
        mfcc_algo(to_spectrum(apply_window(frame)))[1] for frame in frames
    ])

    headers = ['mean_mfcc_%d' % index for index in range(0, 12)]
    features = [np.mean(coeff_matrix[:, index]) for index in range(0, 12)]

    return headers, features
def get_mfcc(frames,
             sample_rate=16000,
             num_bands=64,
             num_coeffs=32,
             window_type='hann'):
    '''
    Calculates amplitude spectrum, mel-frequency spectrum and mel-frequency cepstral coefficients.

    Parameters:
    frames          : overlapping signal frames for short-time analysis
                      (all of the length of the first frame)
    sample_rate     : audio sampling rate,
    num_bands       : number of mel-frequency bands
    num_coeffs      : number of mel-freq cepstrum coefficients that are
                      returned; one extra coefficient is computed and the
                      first one is dropped
    window_type     : type of windowing function to apply

    Returns three 2D numpy arrays: amplitude spectra, mel-freq spectra and MFCCs
    (each transposed so that every column belongs to one frame)
    '''

    frame_size = len(frames[0])
    spectra = []
    melbands = []
    mfccs = []

    spectrum_estimator = es.Spectrum(size=frame_size)
    # Honour the requested window type (previously always 'hann').
    windowing = es.Windowing(type=window_type, size=frame_size)
    mfcc_estimator = es.MFCC(numberBands=num_bands,
                             numberCoefficients=num_coeffs + 1,
                             # MFCC consumes the magnitude spectrum, which has
                             # frame_size // 2 + 1 bins, not frame_size
                             inputSize=frame_size // 2 + 1,
                             sampleRate=sample_rate,
                             highFrequencyBound=8000)

    for frame in frames:
        spectrum = spectrum_estimator(windowing(frame))
        mfcc_bands, mfcc_coeffs = mfcc_estimator(spectrum)
        spectra.append(spectrum)
        mfccs.append(mfcc_coeffs[1:])  # drop the first coefficient
        melbands.append(mfcc_bands)

    return np.array(spectra).T, np.array(melbands).T, np.array(mfccs).T
示例#19
0
def compute_features(audio, beats=None):
    """Compute the MFCC, HPCP and Tonnetz features of an audio signal.

    When ``beats`` is given it is handed to the feature extractors and the
    log messages announce the features as beat-synchronous. Analysis
    settings are read from ``msaf.Anal``.

    Returns the tuple (mfcc, hpcp, tonnetz).
    """
    beatsync_str = "Beat-synchronous " if beats is not None else ""

    mfcc_feature = STFTFeature(
        msaf.Anal.frame_size,
        msaf.Anal.hop_size,
        msaf.Anal.window_type,
        ES.MFCC(numberCoefficients=msaf.Anal.mfcc_coeff),
        msaf.Anal.sample_rate,
        beats)
    hpcp_feature = STFTFeature(
        msaf.Anal.frame_size,
        msaf.Anal.hop_size,
        msaf.Anal.window_type,
        ES.HPCP(),
        msaf.Anal.sample_rate,
        beats)

    logging.info("Computing %sMFCCs..." % beatsync_str)
    mfcc = mfcc_feature.compute_features(audio)

    logging.info("Computing %sHPCPs..." % beatsync_str)
    hpcp = hpcp_feature.compute_features(audio)

    # the Tonnetz features are derived from the HPCP (chroma) features
    logging.info("Computing %sTonnetz..." % beatsync_str)
    tonnetz = utils.chroma_to_tonnetz(hpcp)

    return mfcc, hpcp, tonnetz
示例#20
0
    def __init__(self, input_filename, fft_size, numMelBands):
        """Set up the HTK-like MFCC analysis chain and its inverse DCT.

        :param input_filename: audio file handed to the AudioProcessor base
        :param fft_size: FFT size used by the spectrum; frames are
                         zero-padded up to this size
        :param numMelBands: number of mel bands of the MFCC, which is also
                            the output size of the IDCT
        """
        # The base class is initialised with placeholder FFT settings
        # (1024, np.hanning); the fft_size argument is only used below.
        AudioProcessor.__init__(self, input_filename, 1024, np.hanning)

        # a frame size of 1102 would be the htk default at a rate of 44100
        self.framesize = 2048

        # `normalized` is not passed, so the Windowing default applies
        self.w = ess.Windowing(type='hamming',
                               size=self.framesize,
                               zeroPadding=fft_size - self.framesize,
                               zeroPhase=False)

        self.spectrum = ess.Spectrum(size=fft_size)

        # htk-like mfccs
        self.mfcc = ess.MFCC(
            inputSize=fft_size // 2 + 1,  # number of spectrum bins
            type='magnitude',
            warpingFormula='htkMel',
            weighting='linear',
            highFrequencyBound=8000,
            lowFrequencyBound=0,
            numberBands=numMelBands,
            numberCoefficients=InvMFCCAudioProcessor.NUM_MFCC_COEFFS,
            normalize='unit_max',
            dctType=3,
            logType='log',
            liftering=22)

        # inverse transform configured with the same DCT type and liftering
        self.idct = ess.IDCT(
            inputSize=InvMFCCAudioProcessor.NUM_MFCC_COEFFS,
            outputSize=numMelBands,
            dctType=3,
            liftering=22)
示例#21
0
def load_audio_excerpts(path=AUDIO_PATH, num_features=9):
    """
    Extract `num_features+1` MFCC coefficients from each audio file found in
    `path` and discard the first coefficient (tied to energy).

    Only files ending with `excerpt_search.FORMAT` are processed. The
    coefficients of every file are handed to `_fill_out_targets`, which
    fills the `out` / `targets` arrays. The result is `out` minus the
    matching `targets`, broadcast over the third axis of `out`.
    """
    targets = np.zeros((3, 5, num_features))
    out = np.zeros((3, 5, 4, num_features))

    for fname in tqdm(os.listdir(path)):
        if not fname.endswith(excerpt_search.FORMAT):
            continue

        loader = esst.EasyLoader(filename=os.path.join(path, fname),
                                 sampleRate=SR)
        audio = loader()
        # the spectrum is taken over the whole excerpt, whose length must
        # be even
        if audio.shape[0] % 2 == 1:
            audio = audio[:-1]

        magnitudes = esst.Spectrum(size=audio.shape[0])(audio)
        mfcc_algo = esst.MFCC(inputSize=magnitudes.shape[0],
                              sampleRate=SR,
                              numberCoefficients=num_features + 1)
        _bands, coeffs = mfcc_algo(magnitudes)

        # the file name is split on underscores; the second character of
        # its first part is the question index
        parts = fname.replace('.flac', '').split('_')
        question = int(parts[0][1])
        _fill_out_targets(out[question], targets[question], coeffs[1:],
                          parts, 'target')

    return out - targets[..., np.newaxis, :]
    print 'Labels and label indices', all_labels

    # This processing (top freq peaks) only works for single speaker case... need better features for multispeaker!
    # MFCC (or deep NN/automatic feature extraction) could be interesting

    inputSize = (data.shape[1] - 1) * 2

    M = 1024
    N = 1024
    H = 256
    fs = 8000
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type='hann')

    mfcc = ess.MFCC(numberCoefficients=7, inputSize=inputSize / 2 + 1)
    sc = ess.SpectralContrast(frameSize=inputSize)
    cent = ess.Centroid()
    """n_dim = 6
    all_obs = np.zeros((data.shape[0], n_dim))
    for r in range(data.shape[0]):
        #obs = np.zeros((n_dim, 1))
        _, t = peakfind(data[r, :], n_peaks=n_dim)
        all_obs[r, :] = t.copy()
    
    #all_obs = np.atleast_3d(all_obs)"""

    n_dim = 13
    all_obs = np.zeros((data.shape[0], n_dim))
    for r in range(data.shape[0]):
        mX = essentia.array(data[r, :])
示例#23
0

# Demo script: print the mel band energies of every frame of a speech file.
M = 1024  # analysis frame size (also the window size)
N = 1024  # FFT size
H = 512  # hop size
fs = 44100  # sample rate the file is loaded at

help(ess.MFCC)

spectrum = ess.Spectrum(size=N)

window = ess.Windowing(size=M, type='hann')

# Integer division keeps inputSize an int on Python 3 as well
# (N / 2 + 1 would be the float 513.0 there).
mfcc = ess.MFCC(numberCoefficients=12, inputSize=N // 2 + 1)

x = ess.MonoLoader(filename='../../sounds/speech-female.wav', sampleRate=fs)()
frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)

# print() with a single argument behaves the same on Python 2 and Python 3
print('-' * 70)

mfccs = []
frameIndex = 0
for frame in frames:
    mX = spectrum(window(frame))
    mfcc_bands, mfcc_coeffs = mfcc(mX)

    print(mfcc_bands)
    print('-' * 70)
示例#24
0
from general.parameters import *
from general.filePathHsmm import kerasScaler_path
from general.Fprev_sub import Fprev_sub
from audio_preprocessing import feature_reshape
import essentia.standard as ess
import pickle
import numpy as np

# Shared, module-level Essentia algorithm instances.
# NOTE(review): `framesize`, `fs` and `highFrequencyBound` are not defined in
# this file; presumably they come from `from general.parameters import *` --
# confirm.
winAnalysis = 'hann'
N = 2 * framesize  # padding 1 time framesize
SPECTRUM = ess.Spectrum(size=N)
# inputSize is framesize + 1, which equals N / 2 + 1 for the spectrum size above.
MFCC = ess.MFCC(sampleRate=fs,
                highFrequencyBound=highFrequencyBound,
                inputSize=framesize + 1,
                numberBands=80)
# Zero-pad each windowed frame by N - framesize samples (i.e. up to N samples).
WINDOW = ess.Windowing(type=winAnalysis, zeroPadding=N - framesize)


def getMFCCBands2D(audio, framesize, hopsize, nbf=False, nlen=10):
    """
    mel bands feature [p[0],p[1]]
    output feature for each time stamp is a 2D matrix
    it needs the array format float32
    :param audio:
    :param p:
    :param nbf: bool, if we need to neighbor frames
    :return:
    """

    mfcc = []
    # audio_p = audio[p[0]*fs:p[1]*fs]
示例#25
0
def main_simple(args):
    """main_simple

    Compute short time spectral feature map

    Loads the audio via ``loadaudio(args)``, computes the magnitude spectrum,
    mel-band energies and MFCCs for every frame, plots the three maps, then
    fits a ``KernelPCA`` on the scaled mel bands and plots the projection.

    :param args: options object; ``args.frame_size_low_level`` is used as both
        the frame size and the hop size (frames do not overlap). ``args`` is
        also passed unchanged to ``loadaudio``.
    :return: None. The function only prints and draws figures; the final
        ``plt.show()`` is called with interactive mode switched off.
    """

    plt.ion()
    
    audio = loadaudio(args)
    
    w = estd.Windowing(type = 'hamming')
    spectrum = estd.Spectrum()  # FFT() would return the complex FFT, here we just want the magnitude spectrum
    mfcc = estd.MFCC()
    
    specgram = []
    mfccs = []
    melbands = []

    frame_size = args.frame_size_low_level
    for frame in estd.FrameGenerator(audio, frameSize = frame_size, hopSize = frame_size, startFromZero=True):
        # compute the spectrum once per frame and reuse it for both the
        # MFCC computation and the spectrogram
        spec = spectrum(w(frame))
        mfcc_bands, mfcc_coeffs = mfcc(spec)
        mfccs.append(mfcc_coeffs)
        melbands.append(mfcc_bands)
        specgram.append(spec)

    # convert the per-frame lists to arrays and transpose them so that
    # rows are features and columns are frames
    mfccs = np.array(mfccs).T
    melbands = np.array(melbands).T
    specgram = np.array(specgram).T

    fig, gs = makefig(rows = 3, cols = 1, add_subplots = False)
    fig.show()

    print(("specgram.shape", specgram.shape))
    print(("melbands.shape", melbands.shape))
    
    # each map is plotted without its first row (index 0)
    ax1 = fig.add_subplot(gs[0,0])
    ax1.imshow(np.log(specgram[1:,:]), aspect = 'auto', origin='lower', interpolation='none')

    ax2 = fig.add_subplot(gs[1,0])
    ax2.imshow(mfccs[1:,:], aspect='auto', origin='lower', interpolation='none')

    ax3 = fig.add_subplot(gs[2,0])
    ax3.imshow(np.log(melbands[1:,:]), aspect = 'auto', origin='lower', interpolation='none')
    
    plt.draw()
    plt.pause(1e-9)

    # process
    numcomps = 3
    # NOTE(review): `scale` is presumably sklearn.preprocessing.scale applied
    # per band -- confirm against the file's imports
    melbands_ = scale(melbands.T).T
    # wt = PCA(n_components = melbands.shape[0], whiten = True)
    # melbands_ = wt.fit_transform(melbands.T).T # scale(melbands.T).T
    # melbands_ = np.log(melbands + 1)  * 10

    print(("melbands", melbands.shape, "melbands_", melbands_.shape))
    print(("means", np.mean(melbands_, axis = 1)))
    
    sfa_in = melbands_[1:,:]
    sfa_cov = np.cov(sfa_in)
    print(("sfa_cov", sfa_cov.shape))
    # rbfcs = np.random.uniform(-5, 5, (numcomps, sfa_in.shape[0]))
    # sfa = SFA(numcomps = numcomps, numexps = 2) # , rbfc = rbfcs)
    sfa = KernelPCA(kernel="rbf", degree=5, fit_inverse_transform=True, gamma=10, n_components = numcomps)
    

    fig3, gs3 = makefig(rows = 1, cols = 2)
    
    fig3.axes[0].plot(sfa_in.T)
    
    fig3.axes[1].imshow(sfa_cov, aspect = 'auto', origin='upper', interpolation='none')
    fig3.axes[1].set_aspect(1)
    plt.draw()
    plt.pause(1e-9)

    # best effort: a failure in the decomposition or its plots is reported
    # and the function still reaches the final plt.show()
    try:
        # sfa_in += np.random.uniform(-1e-3, 1e-3, sfa_in.shape)
        melbands_sfa = sfa.fit_transform(sfa_in.T)
        # melbands_sfa = sfa.fit_transform(specgram[1:,:].T)
        print(("melbands_sfa.shape", melbands_sfa.shape))

        fig2, gs2 = makefig(rows = 1, cols = 2, add_subplots = False)
        fig2.show()

        ax = fig2.add_subplot(gs2[0,0])
        # ax.plot(melbands_sfa)
        # ax.imshow(np.log(melbands_sfa.T), aspect = 'auto', origin='lower', interpolation='none')
        # ax.imshow(np.log(np.abs(melbands_sfa.T)), aspect = 'auto', origin='lower', interpolation='none')
        ax.imshow(np.abs(melbands_sfa.T), aspect = 'auto', origin='lower', interpolation='none')
        
        # per frame, index of the component with the largest magnitude
        ax = fig2.add_subplot(gs2[0,1])
        maxs = []
        for fr_ in melbands_sfa:
            print(("fr_", fr_.shape))
            maxs.append(np.argmax(np.abs(fr_)))
        ax.plot(np.array(maxs), "bo")

        plt.draw()
        plt.pause(1e-9)
        
    except Exception as e:
        print(("SFA failed", e))

    
    plt.ioff()
    plt.show()
示例#26
0
def main_mfcc(args):
    """main_mfcc

    Compute short time windowed MFCC features for input waveform and
    plot them over time (mfcc-spectrogram)

    The first figure shows a single 1024-sample frame starting at 0.2 s
    (waveform excerpt, spectrum, mel bands, MFCCs). The second figure shows
    the mel bands and MFCCs of all frames, once read back from an essentia
    Pool and once from plain numpy arrays.

    :param args: options object; ``args.samplerate`` is used to convert the
        0.2 s / 0.4 s offsets to sample indices. ``args`` is also passed
        unchanged to ``loadaudio``.
    :return: None. The function only prints and draws figures.
    """
    plt.ion()

    audio = loadaudio(args)
    
    print(("audio", type(audio), audio.shape))

    # pylab contains the plot() function, as well as figure, etc... (same names as Matlab)
    plt.rcParams['figure.figsize'] = (15, 6) # set plot sizes to something larger than default

    fig, gs = makefig(rows = 2, cols = 2)

    w = estd.Windowing(type = 'hann')
    spectrum = estd.Spectrum()  # FFT() would return the complex FFT, here we just want the magnitude spectrum
    mfcc = estd.MFCC()

    # print "w", repr(w)
    # print "spectrum", repr(spectrum)
    # print "mfcc", repr(mfcc)

    # single analysis frame: 1024 samples starting at 0.2 s
    start = int(0.2*args.samplerate)
    frame = audio[start : start + 1024]
    print(("frame.shape", frame.shape))
    spec = spectrum(w(frame))
    mfcc_bands, mfcc_coeffs = mfcc(spec)

    print(("type(spec)", type(spec)))
    print(("spec.shape", spec.shape))

    # NOTE(review): the plotted slice is 0.2 s .. 0.4 s, not the 2nd second
    # as the title says; the title string is left unchanged here
    fig.axes[0].plot(audio[start:int(0.4*args.samplerate)])
    fig.axes[0].set_title("This is how the 2nd second of this audio looks like:")
    # plt.show() # unnecessary if you started "ipython --pylab"

    fig.axes[1].plot(spec)
    fig.axes[1].set_title("The spectrum of a frame:")

    fig.axes[2].plot(mfcc_bands)
    fig.axes[2].set_title("Mel band spectral energies of a frame:")

    fig.axes[3].plot(mfcc_coeffs)
    fig.axes[3].set_title("First 13 MFCCs of a frame:")

    fig.show()

    # plt.show() # unnecessary if you started "ipython --pylab"
    ################################################################################
    fig2, gs2 = makefig(rows = 2, cols = 2, add_subplots = False)

    mfccs = []
    melbands = []
    pool = e.Pool()

    # single pass over the frames: every frame's result is stored both in
    # the plain lists and in the pool, so the MFCCs are computed only once
    for frame in estd.FrameGenerator(audio, frameSize=1024, hopSize=512, startFromZero=True):
        mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
        mfccs.append(mfcc_coeffs)
        melbands.append(mfcc_bands)
        pool.add('lowlevel.mfcc', mfcc_coeffs)
        pool.add('lowlevel.mfcc_bands', mfcc_bands)

    # convert the per-frame lists to arrays and transpose them so that
    # rows are features and columns are frames
    mfccs = np.array(mfccs).T
    melbands = np.array(melbands).T

    # top row: data read back from the pool
    ax1 = fig2.add_subplot(gs2[0,0])
    ax1.imshow(pool['lowlevel.mfcc_bands'].T, aspect = 'auto', origin='lower', interpolation='none')
    ax1.set_title("Mel band spectral energies in frames")

    ax2 = fig2.add_subplot(gs2[0,1])
    ax2.imshow(pool['lowlevel.mfcc'].T[1:,:], aspect='auto', origin='lower', interpolation='none')
    ax2.set_title("MFCCs in frames")

    # bottom row: the same data from the numpy arrays
    ax3 = fig2.add_subplot(gs2[1,0])
    ax3.imshow(melbands[:,:], aspect = 'auto', origin='lower', interpolation='none')
    ax3.set_title("Mel band spectral energies in frames")
    # show() # unnecessary if you started "ipython --pylab"

    ax4 = fig2.add_subplot(gs2[1,1])
    ax4.imshow(mfccs[1:,:], aspect='auto', origin='lower', interpolation='none')
    ax4.set_title("MFCCs in frames")

    fig2.show()


    plt.ioff()
    plt.show() # unnecessary if you started "ipython --pylab"
示例#27
0
import essentia as es
import essentia.standard as ess
import numpy as np
import pickle
import glob
import utilFunctions as UF
import scipy.spatial.distance as DS

import parameters as params
import csv

# Module-level Essentia algorithm instances, configured from `params`.
rms=ess.RMS()
window = ess.Windowing(type = "hamming")
spec = ess.Spectrum(size=params.Nfft)
# float32 buffer of params.zeropadLen zeros
zz = np.zeros((params.zeropadLen,), dtype = 'float32')
# inputSize is derived from the FFT size used for `spec` above
genmfcc = ess.MFCC(highFrequencyBound = 22000.0, inputSize = params.Nfft/2+1, sampleRate = params.Fs)
hps = ess.HighPass(cutoffFrequency = 240.0)
onsets = ess.Onsets()

# Names of the stroke classes.
strokeLabels = ['dha', 'dhen', 'dhi', 'dun', 'ge', 'kat', 'ke', 'na', 'ne', 're', 'tak', 'te', 'tit', 'tun']

# Per-taal description: "nmatra" is a count and "accents" an integer array
# with exactly nmatra entries (16, 12, 10 and 7 respectively).
taals = {"teen": {"nmatra": 16, "accents": np.array([4, 1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 1])}, 
         "ek": {"nmatra": 12, "accents": np.array([4, 1, 1, 2, 1, 1, 3, 1, 1, 2, 1, 1])},
         "jhap": {"nmatra": 10, "accents": np.array([4, 1, 2, 1, 1, 3, 1, 2, 1, 1])},
         "rupak": {"nmatra": 7, "accents": np.array([2, 1, 1, 3, 1, 3, 1])}
         }

rolls = [{"bol": ['dha/dha_02', 'te/te_05', 're/re_04', 'dha/dha_02'], "dur": np.array([1.0, 1.0, 1, 1]), "amp": np.array([1.0, 1.0, 1.0, 1.0])},
         {"bol": ['te/te_02', 're/re_05', 'ke/ke_04', 'te/te_02'], "dur": np.array([1.0, 1.0, 1, 1]), "amp": np.array([1.0, 1.0, 1.0, 1.0])},
         {"bol": ['ge/ge_02', 'ge/ge_05', 'te/te_04', 'te/te_02'], "dur": np.array([1.0, 1.0, 1, 1]), "amp": np.array([1.0, 1.0, 1.0, 1.0])},
         {"bol": ['ge/ge_02', 'ge/ge_05', 'dhi/dhi_04', 'na/na_02'], "dur": np.array([1.0, 1.0, 1, 1]), "amp": np.array([1.0, 1.0, 1.0, 1.0])},
示例#28
0
# Module-level algorithm instances, created once at import time.
# NOTE(review): `es` is not imported in the visible part of this file;
# presumably it is an alias for essentia.standard -- confirm.

# Spectral descriptors
peak_freq = es.MaxMagFreq()
roll_off = es.RollOff()
flux = es.Flux()
flatness = es.Flatness()

# Harmonic descriptors
pitch = es.PitchYin(frameSize=1024)
spectral_peaks = es.SpectralPeaks(minFrequency=1e-5)
harmonic_peaks = es.HarmonicPeaks()
inharmonicity = es.Inharmonicity()
oer = es.OddToEvenHarmonicEnergyRatio()
tristimulus = es.Tristimulus()

# MFCC
# NOTE(review): inputSize=513 presumably matches the spectrum of a 1024-sample
# frame (1024 / 2 + 1) -- confirm against the frame size used by the callers.
mfcc = es.MFCC(inputSize=513)


class Audio:
    def __init__(self, path):
        self.audio = es.MonoLoader(filename=str(path))()
        self.name = path.name
        self.pool = essentia.Pool()

        self._build_temporal_features()
        self._build_spectral_features()
        self._build_harmonic_features()
        self._build_mfcc()

        self._features = {
            'audio_correlation': 'AC',
示例#29
0
def compute(audio, pool, options):
    """Compute frame-wise low-level descriptors of ``audio`` and add them to ``pool``.

    The signal is cut into frames; for every frame a set of temporal,
    spectral, bark-band, MFCC, spectral-contrast, dissonance and pitch
    descriptors is computed and appended to ``pool`` under
    ``namespace + '.' + <descriptor name>``. After the frame loop, a
    normalised 128-bin MIDI pitch histogram and its spread are added.

    This function relies on names defined elsewhere in the module:
    ``namespace``, ``INFO``, ``Progress``, ``is_silent_threshold``,
    ``descriptorNames``, ``es`` and ``ess``.

    :param audio: audio signal handed to ``ess.FrameGenerator``.
    :param pool: pool object receiving the descriptors through ``pool.add``.
    :param options: mapping with the keys ``'sampleRate'``, ``'frameSize'``,
        ``'hopSize'``, ``'windowType'`` and ``'skipSilence'``.
    :raises ess.EssentiaError: if no ``zerocrossingrate`` descriptor ended up
        in the pool (e.g. every frame was skipped as silent).
    :return: None; results are stored in ``pool``.
    """
    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # temporal descriptors
    lpc = ess.LPC(order=10, type='warped', sampleRate=sampleRate)
    zerocrossingrate = ess.ZeroCrossingRate()

    # frame algorithms
    frames = ess.FrameGenerator(audio=audio, frameSize=frameSize, hopSize=hopSize)
    window = ess.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = ess.Spectrum(size=frameSize)

    # spectral algorithms
    barkbands = ess.BarkBands(sampleRate=sampleRate)
    centralmoments = ess.CentralMoments()
    crest = ess.Crest()
    centroid = ess.Centroid()
    decrease = ess.Decrease()
    spectral_contrast = ess.SpectralContrast(frameSize=frameSize,
                                             sampleRate=sampleRate,
                                             numberBands=6,
                                             lowFrequencyBound=20,
                                             highFrequencyBound=11000,
                                             neighbourRatio=0.4,
                                             staticDistribution=0.15)
    distributionshape = ess.DistributionShape()
    energy = ess.Energy()
    # energyband_bass, energyband_middle and energyband_high parameters come from "standard" hi-fi equalizers
    energyband_bass = ess.EnergyBand(startCutoffFrequency=20.0, stopCutoffFrequency=150.0, sampleRate=sampleRate)
    energyband_middle_low = ess.EnergyBand(startCutoffFrequency=150.0, stopCutoffFrequency=800.0, sampleRate=sampleRate)
    energyband_middle_high = ess.EnergyBand(startCutoffFrequency=800.0, stopCutoffFrequency=4000.0,
                                            sampleRate=sampleRate)
    energyband_high = ess.EnergyBand(startCutoffFrequency=4000.0, stopCutoffFrequency=20000.0, sampleRate=sampleRate)
    flatnessdb = ess.FlatnessDB()
    flux = ess.Flux()
    harmonic_peaks = ess.HarmonicPeaks()
    hfc = ess.HFC()
    mfcc = ess.MFCC()
    rolloff = ess.RollOff()
    rms = ess.RMS()
    strongpeak = ess.StrongPeak()

    # pitch algorithms
    pitch_detection = ess.PitchYinFFT(frameSize=frameSize, sampleRate=sampleRate)
    pitch_salience = ess.PitchSalience()

    # dissonance
    spectral_peaks = ess.SpectralPeaks(sampleRate=sampleRate, orderBy='frequency')
    dissonance = ess.Dissonance()

    # spectral complexity
    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)

    # central moments over the bark bands; the range depends on the number
    # of bands, so the algorithm is built once, on the first processed frame
    barkbands_centralmoments = None

    INFO('Computing Low-Level descriptors...')

    # used for a nice progress display
    total_frames = frames.num_frames()
    n_frames = 0
    start_of_frame = -frameSize * 0.5

    pitches, pitch_confidences = [], []

    progress = Progress(total=total_frames)

    #scPool = es.Pool()  # pool for spectral contrast

    for frame in frames:

        frameScope = [start_of_frame / sampleRate, (start_of_frame + frameSize) / sampleRate]
        # pool.setCurrentScope(frameScope)

        # silence rate
        # pool.add(namespace + '.' + 'silence_rate_60dB', es.isSilent(frame))
        pool.add(namespace + '.' + 'silence_rate_60dB', is_silent_threshold(frame, -60))
        pool.add(namespace + '.' + 'silence_rate_30dB', is_silent_threshold(frame, -30))
        pool.add(namespace + '.' + 'silence_rate_20dB', is_silent_threshold(frame, -20))

        # silent frames still contribute to the silence rates above, but to
        # none of the descriptors below
        if options['skipSilence'] and es.isSilent(frame):
            total_frames -= 1
            start_of_frame += hopSize
            continue

        # temporal descriptors
        pool.add(namespace + '.' + 'zerocrossingrate', zerocrossingrate(frame))
        (frame_lpc, frame_lpc_reflection) = lpc(frame)
        pool.add(namespace + '.' + 'temporal_lpc', frame_lpc)

        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)

        # spectrum-based descriptors
        power_spectrum = frame_spectrum ** 2
        pool.add(namespace + '.' + 'spectral_centroid', centroid(power_spectrum))
        pool.add(namespace + '.' + 'spectral_decrease', decrease(power_spectrum))
        pool.add(namespace + '.' + 'spectral_energy', energy(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_low', energyband_bass(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_low', energyband_middle_low(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_high', energyband_middle_high(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_high', energyband_high(frame_spectrum))
        pool.add(namespace + '.' + 'hfc', hfc(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rms', rms(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_flux', flux(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rolloff', rolloff(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_strongpeak', strongpeak(frame_spectrum))

        # central moments descriptors
        frame_centralmoments = centralmoments(power_spectrum)
        (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
        pool.add(namespace + '.' + 'spectral_kurtosis', frame_kurtosis)
        pool.add(namespace + '.' + 'spectral_spread', frame_spread)
        pool.add(namespace + '.' + 'spectral_skewness', frame_skewness)

        # dissonance
        (frame_frequencies, frame_magnitudes) = spectral_peaks(frame_spectrum)
        frame_dissonance = dissonance(frame_frequencies, frame_magnitudes)
        pool.add(namespace + '.' + 'dissonance', frame_dissonance)

        # mfcc
        (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
        pool.add(namespace + '.' + 'mfcc', frame_mfcc)

        # spectral contrast
        (sc_coeffs, sc_valleys) = spectral_contrast(frame_spectrum)
        #scPool.add(namespace + '.' + 'sccoeffs', sc_coeffs)
        #scPool.add(namespace + '.' + 'scvalleys', sc_valleys)
        pool.add(namespace + '.' + 'spectral_contrast', sc_coeffs)


        # barkbands-based descriptors
        frame_barkbands = barkbands(frame_spectrum)
        pool.add(namespace + '.' + 'barkbands', frame_barkbands)
        pool.add(namespace + '.' + 'spectral_crest', crest(frame_barkbands))
        pool.add(namespace + '.' + 'spectral_flatness_db', flatnessdb(frame_barkbands))
        if barkbands_centralmoments is None:
            barkbands_centralmoments = ess.CentralMoments(range=len(frame_barkbands) - 1)
        (barkbands_spread, barkbands_skewness, barkbands_kurtosis) = distributionshape(
            barkbands_centralmoments(frame_barkbands))
        pool.add(namespace + '.' + 'barkbands_spread', barkbands_spread)
        pool.add(namespace + '.' + 'barkbands_skewness', barkbands_skewness)
        pool.add(namespace + '.' + 'barkbands_kurtosis', barkbands_kurtosis)

        # pitch descriptors
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)
        if frame_pitch > 0 and frame_pitch <= 20000.:
            pool.add(namespace + '.' + 'pitch', frame_pitch)
        pitches.append(frame_pitch)
        pitch_confidences.append(frame_pitch_confidence)
        pool.add(namespace + '.' + 'pitch_instantaneous_confidence', frame_pitch_confidence)

        frame_pitch_salience = pitch_salience(frame_spectrum[:-1])
        pool.add(namespace + '.' + 'pitch_salience', frame_pitch_salience)

        # spectral complexity
        pool.add(namespace + '.' + 'spectral_complexity', spectral_complexity(frame_spectrum))

        # display of progress report
        progress.update(n_frames)

        n_frames += 1
        start_of_frame += hopSize

    # if no 'temporal_zerocrossingrate' it means that this is a silent file
    if 'zerocrossingrate' not in descriptorNames(pool.descriptorNames(), namespace):
        raise ess.EssentiaError('This is a silent file!')

    #spectralContrastPCA(scPool, pool)

    # build pitch value histogram
    from math import log
    from numpy import bincount
    # convert from Hz to midi notes
    midipitches = []
    unknown = 0
    for freq in pitches:
        if freq > 0. and freq <= 12600:
            # bincount only accepts non-negative integers, so the fractional
            # midi value is truncated to an integer note number here
            note = int(12 * (log(freq / 6.875) / 0.69314718055995) - 3.)
            if note >= 0:
                midipitches.append(note)
            else:
                # frequencies below midi note 0 cannot be binned
                unknown += 1
        else:
            unknown += 1

    if len(midipitches) > 0:
        # compute histogram
        midipitchhist = bincount(midipitches)
        # set 0 midi pitch to be the number of pruned value
        midipitchhist[0] = unknown
        # normalise (the total is computed once, not once per bin)
        total = float(sum(midipitchhist))
        midipitchhist = [val / total for val in midipitchhist]
        # zero pad
        for i in range(128 - len(midipitchhist)): midipitchhist.append(0.0)
    else:
        midipitchhist = [0.] * 128
        midipitchhist[0] = 1.

    # pitchhist = ess.array(zip(range(len(midipitchhist)), midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram', midipitchhist)  # , pool.GlobalScope)

    # the code below is the same as the one above:
    # for note in midipitchhist:
    #    pool.add(namespace + '.' + 'spectral_pitch_histogram_values', note)
    #    print "midi note:", note

    pitch_centralmoments = ess.CentralMoments(range=len(midipitchhist) - 1)
    (pitch_histogram_spread, pitch_histogram_skewness, pitch_histogram_kurtosis) = distributionshape(
        pitch_centralmoments(midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram_spread', pitch_histogram_spread)  # , pool.GlobalScope)

    progress.finish()
def mainFunction(filename,fs,framesize,hopsize,h2,alpha,p_lambda):

    '''
    main procedure of algorithm
    :param filename:
    :param fs:
    :param framesize:
    :param hopsize:
    :return:
    '''

    # load audio
    audio           = ess.MonoLoader(filename = filename, sampleRate = fs)()

    # spectrogram init
    winAnalysis     = 'hann'
    N               = 2 * framesize                     # padding 1 time framesize
    SPECTRUM        = ess.Spectrum(size=N)
    WINDOW          = ess.Windowing(type=winAnalysis, zeroPadding=N-framesize)
    highFrequencyBound = fs/2 if fs/2<11000 else 11000
    MFCC            = ess.MFCC(sampleRate=fs,highFrequencyBound=highFrequencyBound)
    PEAK            = ess.PeakDetection(interpolate=False,maxPeaks=99999)
    mfcc            = []
    mX              = []

    print 'calculating MFCC ... ...'

    for frame in ess.FrameGenerator(audio, frameSize=framesize, hopSize=hopsize):

        frame           = WINDOW(frame)
        mXFrame         = SPECTRUM(frame)
        mX.append(mXFrame)
        bands,mfccFrame = MFCC(mXFrame)
        mfccFrame       = mfccFrame[1:]

        mfcc.append(mfccFrame)

    mX              = np.array(mX)
    mX              = np.transpose(mX)
    mfcc            = np.array(mfcc)
    T               = mfcc.shape[0]                         # time
    D               = mfcc.shape[1]                         # feature dimension

    print 'calculating delta mfcc ... ...'

    d_mfcc          = Fdeltas(mfcc.transpose(), w=9)
    d_mfcc          = np.transpose(d_mfcc)

    # Spectral variation function
    SVF             = np.sqrt(np.sum(d_mfcc**2.0,axis=1))
    SVF             = (SVF - np.min(SVF))/(np.max(SVF)-np.min(SVF))

    # peaks and valleys
    p_SVF,a_SVF     = PEAK(np.array(SVF,dtype=np.float32))
    p_SVF           = np.array(np.round(p_SVF*(T-1)),dtype=np.int)

    p_v_SVF,a_v_SVF = PEAK(np.array(1-SVF,dtype=np.float32))
    p_v_SVF         = np.array(np.round(p_v_SVF*(T-1)),dtype=np.int)

    # heuristics
    p_SVF,a_SVF,p_v_SVF,a_v_SVF = heuristics(p_SVF,a_SVF,p_v_SVF,a_v_SVF,SVF,fs,hopsize,h2,alpha)

    index2Delete    = []
    if len(p_SVF) > 3:
        # BIC
        ii              = 1
        jj              = 1
        # dynamic windowing BIC
        while ii < len(p_SVF)-1:
            p_0             = p_SVF[ii-jj]
            p_1             = p_SVF[ii]
            p_2             = p_SVF[ii+1]

            delta_ABF2   = ABF2(d_mfcc[p_0:p_1,:],d_mfcc[p_1:p_2,:],d_mfcc[p_0:p_2,:],p_lambda)
            if  delta_ABF2 > 0:
                jj              = 1

            else:
                jj              += 1
                index2Delete.append(ii)
            ii              += 1

            if ii >= len(p_SVF)-1: break

            # print delta_BIC, p_0, p_1, p_2,

    p_ABF2          = np.delete(p_SVF,index2Delete)
    a_ABF2          = np.delete(a_SVF,index2Delete)