def MRCG(x, fs=44100, framesize1=0.02, framesize2=0.2, hopsize=0.01):
    """Compute multi-resolution cochleagram features and their deltas.

    Two log ERB-band cochleagrams are computed with different frame lengths
    but the same hop, two smoothed copies of the first one are added, and
    all four are stacked side by side.

    :param x: audio samples passed to ``ess.FrameGenerator``
    :param fs: sample rate in Hz
    :param framesize1: frame length of the first cochleagram, in seconds
    :param framesize2: frame length of the second cochleagram, in seconds
    :param hopsize: hop between frames, in seconds
    :return: tuple ``(all_cochleas, d_all_cochleas, dd_all_cochleas)``: the
        stacked cochleagrams, the output of ``Fdeltas`` on them, and the
        output of ``Fdeltas`` applied twice with window argument 5
    """
    hop = int(hopsize * fs)

    cochlea1 = _erb_cochleagram(x, fs, int(framesize1 * fs), hop)
    cochlea2 = _erb_cochleagram(x, fs, int(framesize2 * fs), hop)

    # smoothed versions, both derived from the short-frame cochleagram
    cochlea3 = get_avg(cochlea1, 5, 5)
    cochlea4 = get_avg(cochlea1, 11, 11)

    all_cochleas = np.hstack((cochlea1, cochlea2, cochlea3, cochlea4))

    # Fdeltas is applied to the transposed matrix; transpose back afterwards
    d_all_cochleas = Fdeltas(all_cochleas.T).T
    dd_all_cochleas = Fdeltas(Fdeltas(all_cochleas.T, 5), 5).T

    return all_cochleas, d_all_cochleas, dd_all_cochleas


def _erb_cochleagram(x, fs, framesize, hopsize, window_type='hann'):
    """Return the log10 ERB-band energies of ``x``, one row per frame.

    ``framesize`` and ``hopsize`` are in samples.
    """
    fft_size = 2 * framesize  # zero-pad every frame to twice its length
    spectrum = ess.Spectrum(size=fft_size)
    window = ess.Windowing(type=window_type,
                           zeroPadding=fft_size - framesize)
    high_bound = fs / 2 if fs / 2 < 11000 else 11000
    erb_bands = ess.ERBBands(sampleRate=fs,
                             highFrequencyBound=high_bound,
                             inputSize=framesize + 1)
    # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` is the
    # type it used to alias, so eps is unchanged.
    eps = np.finfo(float).eps

    frames = ess.FrameGenerator(x, frameSize=framesize, hopSize=hopsize)
    return np.array([
        np.log10(erb_bands(spectrum(window(frame))) + eps)
        for frame in frames
    ])
示例#2
0
def _key_fnc(
    sample: NDArray[Float32],
    frequency_rate: int,
    windowfnc: Window,
    key_type: KeyFunction,
):
    """
    Compute the key value of ``sample`` for the requested key function.

    ``key_type`` selects which ``_get_*`` helper is run; each helper is
    handed freshly constructed Essentia algorithms. For the spectral
    centroid, ``frequency_rate`` should be half of the sample rate.

    Raises ``ValueError`` when ``key_type`` is not one of the handled
    ``KeyFunction`` members.
    """

    def spectral_chain():
        # Spectrum and Windowing algorithms shared by every key function.
        return estd.Spectrum(), estd.Windowing(type=windowfnc.value)

    if key_type == KeyFunction.CENTROID:
        centroid = estd.Centroid(range=frequency_rate)
        return _get_centroid(sample, centroid, *spectral_chain())

    if key_type == KeyFunction.MAX:
        return _get_max(sample, *spectral_chain())

    if key_type == KeyFunction.MFCC:
        return _get_mfcc(sample, estd.MFCC(), *spectral_chain())

    if key_type == KeyFunction.MELBANDS:
        return _get_melbands(sample, estd.MFCC(), *spectral_chain())

    if key_type == KeyFunction.MELBANDS_LOG:
        to_log = estd.UnaryOperator(type="log")
        bands = _get_melbands(sample, estd.MFCC(), *spectral_chain())
        return to_log(bands)

    raise ValueError("Keyfunction is not defined!")
示例#3
0
def extract(fname, outpath, fs=22050, fsize=1024, hsize=512):
    """
    Compute the MFCCs of the audio file ``fname``.

    Inputs:
        fname   -- name of the audio file.
        outpath -- output path of processed files (currently unused; nothing
                   is written to disk by this function).
        fs      -- sampling frequency (Hz) the audio is resampled to.
        fsize   -- size of each frame, in samples.
        hsize   -- hop size between frames, in samples.
    Outputs:
        numpy array with one row per frame and 20 MFCC coefficients per row.
    """
    audio = es.MonoLoader(filename=fname, sampleRate=fs)()

    w = es.Windowing(type='hann')
    spectrum = es.Spectrum()
    # The magnitude spectrum of an fsize-sample frame has fsize // 2 + 1 bins.
    # The mel filterbank must know the real sample rate, and its upper bound
    # (Essentia default 11000 Hz) may not exceed the Nyquist frequency.
    mfcc = es.MFCC(inputSize=fsize // 2 + 1,
                   numberCoefficients=20,
                   sampleRate=fs,
                   highFrequencyBound=min(11000, fs / 2))

    mfccs = []
    for frame in es.FrameGenerator(audio, frameSize=fsize, hopSize=hsize):
        _bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
        mfccs.append(mfcc_coeffs)

    # Return the coefficients, not the MFCC algorithm object.
    return np.array(mfccs)
示例#4
0
文件: utilFunc.py 项目: MTG/smc-2016
def vibFreq(pitchtrack, sp, hopsize):
    '''
    Estimate the dominant modulation frequency of a pitch track.

    :param pitchtrack: numpy array of pitch values, one per analysis frame
    :param sp: samplerate of wave audio
    :param hopsize: hop size (in samples) used to compute the pitch track
    :return: frequency of the first of up to 3 detected spectral peaks
        (peaks are ordered by amplitude)
    '''
    track = pitchtrack
    if track.dtype != np.float32:
        track = track.astype(np.float32)

    # the pitch track is itself a signal sampled once per hop
    frame_rate = sp / hopsize
    n_frames = len(track)

    # next power of two of the pitch track length
    fft_size = int(pow(2, ceil(log(n_frames) / log(2))))

    padded = track
    if n_frames < fft_size:
        tail = np.zeros(fft_size - n_frames, dtype=np.float32)
        padded = np.append(track, tail)

    magnitude = ess.Spectrum(size=fft_size)(padded)
    find_peaks = ess.PeakDetection(maxPeaks=3, orderBy='amplitude')
    locs, _amps = find_peaks(magnitude)

    # NOTE(review): peak positions are scaled by (fft_size / 2 + 1) bins;
    # confirm against PeakDetection's `range` semantics that this should
    # not be fft_size / 2.
    freqs = locs * (fft_size / 2 + 1) * frame_rate / fft_size

    return freqs[0]
示例#5
0
def segment(audio, hopSize, frameSize, rms_onset_threshold,
            mel_onset_threshold, flux_onset_threshold, onset_threshold):
    """Find the frame index of the first onset and of the following sustain.

    The audio is processed frame by frame. The first frame that satisfies
    the onset condition is recorded as ``onset``; afterwards the first frame
    that satisfies the sustain condition is recorded as ``sustain``.

    :param audio: audio samples passed to ``estd.FrameGenerator``
    :param hopSize: hop size in samples
    :param frameSize: frame size in samples
    :param rms_onset_threshold: lower bound the 'rms' detection value must exceed
    :param mel_onset_threshold: threshold for the 'melflux' detection value
    :param flux_onset_threshold: threshold for the 'flux' detection value
    :param onset_threshold: lower bound the frame loudness must exceed
    :return: tuple ``(onset, sustain)`` of frame indices; either is ``None``
        when the corresponding condition was never met
    """

    # init algorithms
    o_mel = estd.OnsetDetection(method='melflux')
    o_rms = estd.OnsetDetection(method='rms')
    o_hfc = estd.OnsetDetection(method='hfc')
    o_flux = estd.OnsetDetection(method='flux')
    fft = estd.FFT()
    c2p = estd.CartesianToPolar()
    pool = essentia.Pool()
    frame_generator = estd.FrameGenerator(audio,
                                          frameSize=frameSize,
                                          hopSize=hopSize)
    w = estd.Windowing(type='hann')
    yin = estd.PitchYinFFT(frameSize=frameSize,
                           minFrequency=40,
                           maxFrequency=2500,
                           interpolate=True)
    spectrum = estd.Spectrum()
    loudness = estd.Loudness()

    # control parameters
    attack = False  # True between the detected onset and the sustain start
    detection = True  # True until the first onset has been found
    mel_onset_value = 0  # detection values stored at the onset frame
    rms_onset_value = 0

    # output variables
    onset = None
    sustain = None

    for index, frame in enumerate(frame_generator):
        mag, phase = c2p(fft(w(frame)))
        # only the pitch confidence is used; the pitch itself is discarded
        _, conf = yin(spectrum(w(frame)))
        # loudness is measured on the raw (unwindowed) frame
        loud = loudness(frame)
        mel_onset = o_mel(mag, phase)
        rms_onset = o_rms(mag, phase)
        hfc_onset = o_hfc(mag, phase)
        flux_onset = o_flux(mag, phase)
        # NOTE: the pool is filled here but never read or returned by this function
        pool.add('onsets_mel', mel_onset)
        pool.add('onsets_rms', rms_onset)
        pool.add('onsets_hfc', hfc_onset)
        pool.add('onsets_flux', flux_onset)
        pool.add('conf', conf)
        pool.add('loudness', loud)

        # condition for onset: flux OR melflux above threshold, AND rms and
        # loudness above theirs; only the first such frame is kept
        if detection and (flux_onset > flux_onset_threshold or mel_onset > mel_onset_threshold) \
                and rms_onset > rms_onset_threshold and loud > onset_threshold:
            onset = index
            attack = True
            detection = False
            mel_onset_value = mel_onset
            rms_onset_value = rms_onset
        # condition for beginning of sustain: confident pitch, and rms/melflux
        # values dropped below 5% / 30% of their values at the onset frame
        if attack and conf > 0.5 and rms_onset < rms_onset_value * .05 and mel_onset < mel_onset_value * .3:
            attack = False
            sustain = index
    return onset, sustain
示例#6
0
def compute_description(x,
                        M=WINDOW_SIZE,
                        N=FFT_SIZE,
                        H=HOP_SIZE,
                        fs=SR,
                        window_type=WINDOW_TYPE):
    '''
    Extract frame-wise descriptors from an audio signal.

    Per frame (frame size M, hop H, FFT size N):
        HFC      -- computed on the magnitude spectrum
        CENTROID -- SpectralCentroidTime of the windowed frame
        ENERGY   -- Energy of the magnitude spectrum
    On the whole signal:
        F0, SALIENCE -- the two outputs of PredominantPitchMelodia,
                        truncated to the number of analysed frames

    Returns (HFC, CENTROID, ENERGY, F0, SALIENCE) as essentia arrays.
    '''
    signal = essentia.array(x)

    # essentia algorithm instances
    to_spectrum = ess.Spectrum(size=N)
    apply_window = ess.Windowing(size=M, type=window_type)
    hfc_algo = ess.HFC(sampleRate=fs)
    centroid_algo = ess.SpectralCentroidTime(sampleRate=fs)
    energy_algo = ess.Energy()
    melodia = ess.PredominantPitchMelodia(frameSize=M,
                                          hopSize=H,
                                          maxFrequency=1200)

    hfc_track, centroid_track, energy_track = [], [], []
    frames = ess.FrameGenerator(signal,
                                frameSize=M,
                                hopSize=H,
                                startFromZero=True)
    for frame in frames:
        windowed = apply_window(frame)
        magnitude = to_spectrum(windowed)
        hfc_track.append(hfc_algo(magnitude))
        centroid_track.append(centroid_algo(windowed))  # time-domain input
        energy_track.append(energy_algo(magnitude))

    pitch, salience = melodia(signal)  # runs on the full signal

    # the pitch tracks are cut to the number of analysed frames
    n_frames = len(centroid_track)
    return (essentia.array(hfc_track),
            essentia.array(centroid_track),
            essentia.array(energy_track),
            essentia.array(pitch)[:n_frames],
            essentia.array(salience)[:n_frames])
示例#7
0
def extract_features(x,
                     M=WINDOW_SIZE,
                     N=FFT_SIZE,
                     H=HOP_SIZE,
                     fs=SR,
                     window_type=WINDOW_TYPE):
    '''
    Return the power-law compressed magnitude spectrogram of x.

    Frames of size M are taken every H samples, windowed with window_type
    and transformed with an N-point Spectrum; magnitudes are raised to the
    power 2/3. The result has one row per frame. fs is not used.
    '''
    signal = essentia.array(x)
    to_magnitude = ess.Spectrum(size=N)
    apply_window = ess.Windowing(size=M, type=window_type)

    frames = ess.FrameGenerator(signal,
                                frameSize=M,
                                hopSize=H,
                                startFromZero=True)
    magnitudes = [to_magnitude(apply_window(frame)) for frame in frames]

    # power-law compression
    return np.power(essentia.array(magnitudes), 2. / 3.)
示例#8
0
def get_f0(audio, minf0=20, maxf0=22050, cf=0.9, ws=2048, hs=256):
    '''
    Estimate a frame-wise f0 track with PitchYinFFT.

    Args:
        audio (array): audio signal (output from MonoLoader)
        minf0 (int): minimum allowed frequency
        maxf0 (int): maximum allowed frequency
        cf (float): confidence threshold (0 - 1)
        ws (int): window size
        hs (int): hop size

    Returns:
        f0 (array): one pitch value per frame; frames whose pitch
            confidence is below cf are set to 0
    '''
    # Essentia algorithms; each frame is zero-padded by ws samples
    windowing = es.Windowing(type='hann', zeroPadding=ws)
    to_spectrum = es.Spectrum()
    pitch_detector = es.PitchYinFFT(minFrequency=minf0,
                                    maxFrequency=maxf0,
                                    frameSize=ws)

    # (pitch, confidence) pair for every frame
    estimates = [
        pitch_detector(to_spectrum(windowing(frame)))
        for frame in es.FrameGenerator(audio, frameSize=ws, hopSize=hs)
    ]
    f0 = np.array([pitch for pitch, _ in estimates])
    conf = np.array([confidence for _, confidence in estimates])

    # zero out the low-confidence frames
    f0[conf < cf] = 0
    return f0
示例#9
0
def getHPCPEssentia(XAudio, Fs, winSize, hopSize, squareRoot=False, NChromaBins=36, NHarmonics = 0):
    """
    Compute HPCP (chroma) features with the essentia library
    :param XAudio: A flat array of raw audio samples
    :param Fs: Sample rate (not forwarded to any essentia algorithm here)
    :param winSize: Window size of each STFT window
    :param hopSize: Hop size between STFT windows
    :param squareRoot: If True, pass the result through sqrtCompress
    :param NChromaBins: How many chroma bins (default 36)
    :param NHarmonics: Value of the `harmonics` parameter of essentia's HPCP
    :returns H: An (NChromaBins x NWindows) matrix of all \
        chroma windows
    """
    from essentia import array
    import essentia.standard as ess

    to_spectrum = ess.Spectrum()
    apply_window = ess.Windowing(size=winSize, type='hann')
    find_peaks = ess.SpectralPeaks()
    to_hpcp = ess.HPCP(size=NChromaBins, harmonics=NHarmonics)

    frames = ess.FrameGenerator(array(XAudio), frameSize=winSize,
                                hopSize=hopSize, startFromZero=True)
    chroma_frames = []
    for frame in frames:
        peak_freqs, peak_mags = find_peaks(to_spectrum(apply_window(frame)))
        chroma_frames.append(to_hpcp(peak_freqs, peak_mags))

    # one column per window
    H = np.array(chroma_frames).T
    return sqrtCompress(H) if squareRoot else H
示例#10
0
    def calc_chromagram(self):
        """Compute a 12-bin HPCP chromagram of ``self.audio``.

        Stores the chromagram (one row per frame) in ``self.chromagram``
        and the start time of every frame, in seconds, in
        ``self.timeAxSec``.
        """
        self.chromagram = []

        hpcp_params = dict(
            size=12,  # one bin per semitone
            referenceFrequency=440,  # assume a tuning frequency of 440 Hz
            bandPreset=False,
            weightType='cosine',
            nonLinear=False,
            windowSize=1.,
            sampleRate=self.sample_rate)
        to_hpcp = es.HPCP(**hpcp_params)
        to_spectrum = es.Spectrum(size=self.fft_size)
        find_peaks = es.SpectralPeaks(sampleRate=self.sample_rate)

        frames = es.FrameGenerator(self.audio,
                                   frameSize=self.frame_size,
                                   hopSize=self.hop_size,
                                   startFromZero=True)
        for frame in frames:
            # the analysis window is applied by plain multiplication
            windowed = array(frame * self.window)
            peak_freqs, peak_mags = find_peaks(to_spectrum(windowed))
            self.chromagram.append(to_hpcp(peak_freqs, peak_mags))

        self.chromagram = array(self.chromagram)

        frame_indices = np.arange(len(self.chromagram))
        self.timeAxSec = frame_indices * self.hop_size / float(
            self.sample_rate)
示例#11
0
def extract_features(x,
                     M=Config.WINDOW_SIZE,
                     N=Config.FFT_SIZE,
                     H=Config.HOP_SIZE,
                     fs=Config.FS,
                     window_type=Config.WINDOW_TYPE):
    '''
    Extract a compressed, band-limited spectrogram from an audio signal
    -----------------------
    Input: Samples, window size (int), FFT size (int), Hop size (int),
    Sampling rate (unused), Window type (e.g. Hanning)

    Output: Spectrogram with one row per frame; magnitudes are raised to
    the power 2/3 and only the first Config.FFT_SIZE / 4 + 1 bins are kept
    -----------------------
    '''
    signal = essentia.array(x)
    to_magnitude = ess.Spectrum(size=N)
    apply_window = ess.Windowing(size=M, type=window_type)

    # STFT magnitudes, frame by frame
    frames = ess.FrameGenerator(signal,
                                frameSize=M,
                                hopSize=H,
                                startFromZero=True)
    magnitudes = [to_magnitude(apply_window(frame)) for frame in frames]

    compressed = np.power(essentia.array(magnitudes), 2. / 3.)

    # NOTE(review): the cut-off uses Config.FFT_SIZE rather than the N
    # argument; confirm this is intended when N is overridden.
    n_bins = int(Config.FFT_SIZE / 4 + 1)
    return compressed[:, :n_bins]
def get_beat_chunks(filename, bpm_restrict=None):
    """Compute TIVs of an audio file per beat, per frame and for the whole file.

    The file is cut into segments of one beat each (beat length derived from
    ``get_tempo(filename)``, assuming a 44100 Hz sample rate). An HPCP vector
    is computed for every segment as a whole, for every frame inside each
    segment, and for the complete file.

    :param filename: path of the audio file
    :param bpm_restrict: if not None, the tempo the file is required to have
    :return: tuple ``(beat_tivs, whole_tiv, frame_tivs)`` built with
        ``mt.TIVCollection.from_pcp`` / ``mt.TIV.from_pcp``
    :raises ValueError: if ``bpm_restrict`` is given and differs from the
        estimated tempo
    """
    audio = std.MonoLoader(filename=filename)()
    hpcp = std.HPCP()
    spectrum = std.Spectrum()
    speaks = std.SpectralPeaks()
    large_speaks = std.SpectralPeaks(maxPeaks=2000)
    sr = 44100

    bpm = get_tempo(filename)
    if bpm_restrict is not None and bpm_restrict != bpm:
        raise ValueError(
            "estimated tempo {} BPM does not match required {} BPM".format(
                bpm, bpm_restrict))

    # beat boundaries in seconds, closed with the end of the file
    sec_beat = 60 / bpm
    duration = len(audio) / sr
    beats = np.append(np.arange(0, duration, sec_beat), duration)

    tivs = []
    tivs_framewise = []
    for start, end in zip(beats[:-1], beats[1:]):
        segmented_audio = audio[int(start * sr):int(end * sr)]

        # frame-wise chroma inside the beat
        for sec in std.FrameGenerator(segmented_audio):
            freq, mag = speaks(spectrum(sec))
            tivs_framewise.append(hpcp(freq, mag))

        # one chroma vector for the whole beat
        np2_seg_audio = zeropad_next_power_2(segmented_audio)
        freq, mag = speaks(spectrum(np2_seg_audio))
        tivs.append(hpcp(freq, mag))

    # Calculate the whole TIV
    np2_whole = zeropad_next_power_2(audio)
    freq, mag = large_speaks(spectrum(np2_whole))
    chroma_whole = hpcp(freq, mag)

    return (mt.TIVCollection.from_pcp(np.array(tivs).T),
            mt.TIV.from_pcp(chroma_whole),
            mt.TIVCollection.from_pcp(np.array(tivs_framewise).T))
def getMBE(audio):
    '''
    Mel band energy feature.

    Relies on fs, highFrequencyBound, framesize and hopsize being defined
    outside this function.

    :param audio: audio samples passed to ess.FrameGenerator
    :return: numpy array of mel band energies, one row per frame
    '''
    # MFCC with the default number of bands; only its band output is used
    mel_extractor = ess.MFCC(sampleRate=fs,
                             highFrequencyBound=highFrequencyBound,
                             inputSize=framesize + 1)

    fft_size = 2 * framesize  # each frame is zero-padded to twice its size
    to_spectrum = ess.Spectrum(size=fft_size)
    apply_window = ess.Windowing(type='hann',
                                 zeroPadding=fft_size - framesize)

    frames = ess.FrameGenerator(audio,
                                frameSize=framesize,
                                hopSize=hopsize)
    band_frames = []
    for frame in frames:
        bands, _coeffs = mel_extractor(to_spectrum(apply_window(frame)))
        band_frames.append(bands)

    return np.array(band_frames)
def get_constantq(frames, sample_rate=16000, num_bands=64):
    """Compute constant-Q magnitude spectra for a sequence of frames.

    The transform covers 125 Hz to 8000 Hz. Every frame is Hann-windowed
    and zero-padded up to a length that depends on ``num_bands`` (512 for
    16 bands, 2048 for 32 bands, 1024 otherwise) before the transform.

    :param frames: non-empty sequence of equally sized audio frames
    :param sample_rate: sample rate in Hz
    :param num_bands: number of constant-Q bins
    :return: numpy array of magnitudes with one column per frame
    """
    max_freq = 8000
    min_freq = 125
    num_octaves = np.log2(max_freq / min_freq)
    bins_per_octave = int(np.ceil(num_bands / num_octaves))

    frame_size = len(frames[0])

    # target length after zero padding, chosen by the number of bands
    padded_length = {16: 512, 32: 2048}.get(num_bands, 1024)
    padding_size = max(0, padded_length - frame_size)

    windowing = es.Windowing(type='hann',
                             size=frame_size,
                             zeroPadding=padding_size)
    constantq_estimator = es.ConstantQ(binsPerOctave=bins_per_octave,
                                       minFrequency=min_freq,
                                       numberBins=num_bands,
                                       sampleRate=sample_rate)

    # ConstantQ returns complex values; keep the magnitudes only
    const_q_spectra = [
        np.abs(constantq_estimator(windowing(frame))) for frame in frames
    ]
    return np.array(const_q_spectra).T
示例#15
0
def extract_features(x,
                     M=WINDOW_SIZE,
                     N=FFT_SIZE,
                     H=HOP_SIZE,
                     fs=SR,
                     window_type=WINDOW_TYPE):
    '''
    Extract magnitude spectra from the input vector and apply power-law
    compression.

    x           -- audio samples
    M           -- window (frame) size in samples
    N           -- FFT size
    H           -- hop size in samples
    fs          -- sample rate (unused)
    window_type -- window type passed to essentia's Windowing

    Returns an array with one compressed magnitude spectrum per frame.
    '''
    #init functions and vectors
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    # honour the window_type argument instead of always using WINDOW_TYPE
    window = ess.Windowing(size=M, type=window_type)

    #compute STFT
    frames = ess.FrameGenerator(x,
                                frameSize=M,
                                hopSize=H,
                                startFromZero=True)
    SP = essentia.array([spectrum(window(frame)) for frame in frames])

    return np.power(SP, 2. / 3.)  #power law compression
示例#16
0
def file_to_hpcp(loop):
    """Return the mean HPCP vector of an audio loop, scaled to a maximum of 1.

    The loop is analysed in frames of 1024 samples with a hop of 512. For
    every frame the spectral peaks (20 Hz - 8000 Hz, at most 20) are turned
    into an HPCP vector; the vectors are averaged over time.

    :param loop: audio samples
    :return: mean HPCP vector divided by its maximum; if the maximum is 0
        (e.g. silent input) the all-zero vector is returned unscaled
    """
    loop = e.array(loop)

    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    spectral_peaks = es.SpectralPeaks(orderBy='magnitude',
                                      magnitudeThreshold=0.001,
                                      maxPeaks=20,
                                      minFrequency=20,
                                      maxFrequency=8000)
    hpcp = es.HPCP(maxFrequency=8000)

    hpcp_group = []
    for frame in es.FrameGenerator(loop, frameSize=1024, hopSize=512):
        frequencies, magnitudes = spectral_peaks(spectrum(windowing(frame)))
        hpcp_group.append(hpcp(frequencies, magnitudes))

    mean_hpcp = np.mean(np.array(hpcp_group).T, axis=1)

    # normalize to 1, guarding against division by zero on silent input
    peak = mean_hpcp.max()
    if peak > 0:
        mean_hpcp = mean_hpcp / peak

    return mean_hpcp
示例#17
0
    def _extract_pitch_contours(self, audio):
        """Track pitch contours in ``audio`` with Essentia.

        Every frame goes through windowing, spectrum, spectral peaks, the
        pitch salience function and its peak picking; the per-frame
        salience peaks are then joined into contours by ``PitchContours``.

        Returns the tuple (contour bins, contour start times, contour
        saliences, duration) as produced by ``PitchContours``.
        """
        # pylint: disable-msg=E1101
        # windowing with x4 zero padding (frame + 3 * frame zeros)
        windowing = estd.Windowing(zeroPadding=3 * self.frame_size)
        to_spectrum = estd.Spectrum(size=self.frame_size * 4)
        find_spectral_peaks = estd.SpectralPeaks(
            minFrequency=self.min_frequency,
            maxFrequency=self.max_frequency,
            magnitudeThreshold=self.magnitude_threshold,
            sampleRate=self.sample_rate,
            orderBy='magnitude')

        # salience is computed on a cent scale; PitchSalienceFunction takes
        # 55 Hz as the default reference
        to_salience = estd.PitchSalienceFunction(
            binResolution=self.bin_resolution)
        find_salience_peaks = estd.PitchSalienceFunctionPeaks(
            binResolution=self.bin_resolution,
            minFrequency=self.min_frequency,
            maxFrequency=self.max_frequency)
        track_contours = estd.PitchContours(
            hopSize=self.hop_size,
            binResolution=self.bin_resolution,
            peakDistributionThreshold=self.peak_distribution_threshold)

        bins_key = 'allframes_salience_peaks_bins'
        saliences_key = 'allframes_salience_peaks_contourSaliences'

        # frame-by-frame analysis
        pool = Pool()
        frames = estd.FrameGenerator(audio,
                                     frameSize=self.frame_size,
                                     hopSize=self.hop_size)
        for frame in frames:
            magnitudes = to_spectrum(windowing(frame))
            peak_freqs, peak_mags = find_spectral_peaks(magnitudes)
            peak_bins, peak_saliences = find_salience_peaks(
                to_salience(peak_freqs, peak_mags))

            # frames without any peak are stored as a single zero
            if not np.size(peak_bins):
                peak_bins = np.array([0])
            if not np.size(peak_saliences):
                peak_saliences = np.array([0])

            pool.add(bins_key, peak_bins)
            pool.add(saliences_key, peak_saliences)

        # post-processing: contour tracking on plain Python lists
        all_bins = [f.tolist() for f in pool[bins_key]]
        all_saliences = [f.tolist() for f in pool[saliences_key]]
        contours_bins, contour_saliences, contours_start_times, duration = \
            track_contours(all_bins, all_saliences)

        return contours_bins, contours_start_times, contour_saliences, duration
示例#18
0
def extractor(filename):
    """Compute HTK-style MFCCs of an audio file and show them as an image.

    The file is loaded as mono at 44100 Hz. Nothing is returned; the
    coefficient matrix (without the first coefficient) is plotted.
    """
    sample_rate = 44100
    frame_len = 1102  # corresponds to htk default WINDOWSIZE = 250000.0
    hop_len = 441  # corresponds to htk default TARGETRATE = 100000.0
    fft_len = 2048

    # dynamic range expansion as done in HTK implementation
    loader = ess.MonoLoader(filename=filename, sampleRate=sample_rate)
    samples = loader() * 2**15

    windowing = ess.Windowing(
        type='hamming',  # corresponds to htk default USEHAMMING = T
        size=frame_len,
        zeroPadding=fft_len - frame_len,
        normalized=False,
        zeroPhase=False)

    to_spectrum = ess.Spectrum(size=fft_len)

    htk_mfcc = ess.MFCC(
        inputSize=fft_len // 2 + 1,
        type='magnitude',  # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',  # filter weights computed in the Hz domain
        highFrequencyBound=8000,  # corresponds to htk default
        lowFrequencyBound=0,  # corresponds to htk default
        numberBands=26,  # corresponds to htk default NUMCHANS = 26
        numberCoefficients=13,
        normalize='unit_max',  # htk filters have constant height = 1
        dctType=3,  # htk uses DCT type III
        logType='log',
        liftering=22)  # corresponds to htk default CEPLIFTER = 22

    # startFromZero = True, validFrameThresholdRatio = 1: the way htk
    # computes windows
    frames = ess.FrameGenerator(samples,
                                frameSize=frame_len,
                                hopSize=hop_len,
                                startFromZero=True,
                                validFrameThresholdRatio=1)
    coefficients = [
        htk_mfcc(to_spectrum(windowing(frame)))[1] for frame in frames
    ]

    # one column per frame; the list must go through essentia.array first
    cepstra = essentia.array(coefficients).T

    # plot everything except the first (energy) coefficient
    plt.imshow(cepstra[1:, :], aspect='auto', interpolation='none')
    plt.show()  # unnecessary if you started "ipython --pylab"
示例#19
0
    def essentiaObjectInit(self):
        """Create the Essentia MFCC, Spectrum and Windowing algorithms.

        The FFT size is twice ``self.frameSize``; the window zero-pads each
        frame by one frame length to reach it.
        """
        self.MFCC80 = ess.MFCC(sampleRate=self.fs,
                               highFrequencyBound=self.highFrequencyBound,
                               inputSize=self.frameSize + 1,
                               numberBands=self.numberBands)

        fft_size = 2 * self.frameSize
        padding = fft_size - self.frameSize
        self.SPECTRUM = ess.Spectrum(size=fft_size)
        self.WINDOW = ess.Windowing(type='hann', zeroPadding=padding)
示例#20
0
    def __onset_candidate_detection__(self):
        """Fill ``self.frames`` and ``self.onset_candidates`` from ``self.signal``.

        The signal is cut with ``e.FrameGenerator(self.signal, 1024, 512)``.
        For each frame the 'flux' onset detection value of its spectrum is
        appended to ``self.onset_candidates``; the phase input is all zeros.
        ``self.frame_count`` is set to the number of stored frames.
        """
        to_spectrum = e.Spectrum()
        flux_detector = e.OnsetDetection(method="flux")

        for frame in e.FrameGenerator(self.signal, 1024, 512):
            self.frames.append(frame)
            magnitude = to_spectrum(frame)
            zero_phase = [0] * len(magnitude)
            self.onset_candidates.append(flux_detector(magnitude, zero_phase))

        self.frame_count = len(self.frames)
示例#21
0
 def __init__(self, frame_size, hop_size, window_type, feature,
         beats, sample_rate):
     """Store the STFT settings and create the Essentia algorithms.

     ``feature`` is the Essentia feature object to apply; a Windowing
     algorithm of type ``window_type`` and a Spectrum algorithm are
     created here.
     """
     # analysis settings
     self.frame_size, self.hop_size = frame_size, hop_size
     self.window_type = window_type
     self.sample_rate = sample_rate
     self.beats = beats

     # Essentia algorithms
     self.w = ES.Windowing(type=window_type)
     self.spectrum = ES.Spectrum()
     self.feature = feature
示例#22
0
    def exp_env(audio, step):
        """Fit an exponential decay to the amplitude envelope and plot it.

        The envelope is taken from the position of the maximum sample
        onwards, as the maximum absolute value of every ``step``-sample
        chunk. A curve ``a * exp(-c * x)`` is fitted to it, the fit and the
        original signal are plotted, and the spectrum of their difference
        is plotted in a second figure.

        :param audio: numpy array of audio samples
        :param step: chunk length in samples (converted with ``int``)
        :return: ``stop1 - start``, the distance in samples between the
            envelope maximum and the first later envelope point below 10 %
            of that maximum. ``start`` is indexed with an index array, so
            the result is an array, not a scalar.
        """
        def func(x, a, c):
            # exponential decay model fitted to the envelope
            return a * np.exp(-c * x)

        # position of the largest (signed) sample; everything before it is dropped
        max_pos = np.argmax(audio)
        audio1 = audio  # keep the untouched signal for plotting and the residual
        audio = audio[np.argmax(audio):]
        step = int(step)
        audio = np.abs(audio)
        envelope = []
        env_x = []
        # per chunk: position and value of the largest absolute sample
        for i in range(0, len(audio), step):
            env_x += [i + np.argmax(audio[i:i + step])]
            envelope += [np.max(audio[i:i + step])]
        env_x = np.array(env_x)
        envelope = np.array(envelope)
        try:
            popt, pcov = curve_fit(func, env_x, envelope, p0=(1, 1e-3))
        except RuntimeError:
            # fit did not converge: fall back to a constant at the first envelope value
            popt = [envelope[0], 0]
            pcov = []
        xx = np.arange(0, len(audio), 1)
        yy = func(xx, *popt)
        # shift the fitted curve back to the original time axis and
        # prepend zeros for the part before the maximum
        xx = xx + max_pos
        xx = np.append(np.arange(0, max_pos), xx)
        yy = np.append(np.zeros(max_pos), yy)
        plt.plot(xx, yy)
        plt.plot(xx, audio1, color='green')
        # index array: all envelope positions where the maximum is reached
        start = env_x[np.where(envelope == envelope.max())[0]]
        nf1 = envelope[0:5].mean()  # not used afterwards
        # first envelope point after the maximum that is below 10 % of it
        locs = np.where(envelope < 0.1 * envelope.max())[0]
        if len(locs) < 1:
            stop1 = env_x[-1]
        else:
            stop1 = env_x[locs[np.where(
                locs > np.where(envelope == envelope.max())[0])][0]]
        # same for 1 % of the maximum; stop2 is not used afterwards
        locs = np.where(envelope < 0.01 * envelope.max())[0]
        if len(locs) < 1:
            stop2 = env_x[-1]
        else:
            stop2 = env_x[locs[np.where(
                locs > np.where(envelope == envelope.max())[0])][0]]
        plt.xlabel('Samples')
        plt.ylabel('Absolute Amplitude')
        plt.axis([0, 140000, 0, 0.20])  # hard-coded axis limits
        plt.figure()
        # residual between the signal and the fitted envelope
        en_mod = np.array(audio1 - yy, dtype='float32')
        # drop one sample if needed so the length is even
        # (presumably required by estd.Spectrum -- TODO confirm)
        if len(en_mod) % 2 > 0:
            en_mod = en_mod[:-1]
        spectrum = estd.Spectrum()(en_mod)
        plt.plot(spectrum)
        plt.show()
        return stop1 - start
示例#23
0
文件: features.py 项目: ctralie/acoss
    def mfcc_htk(self, window_length=22050, nmfcc=13, n_mels=26, fmax=8000, lifterexp=22):
        """
        Get MFCCs 'the HTK way' with the help of Essentia
        https://github.com/MTG/essentia/blob/master/src/examples/tutorial/example_mfcc_the_htk_way.py
        Using all of the default parameters from there except the hop length (which shouldn't matter), and a much longer window length (which has been found to work better for covers)
        Parameters
        ----------
        window_length: int
            Length of the window to use for the STFT
        nmfcc: int
            Number of MFCC coefficients to compute
        n_mels: int
            Number of frequency bands to use
        fmax: int
            Maximum frequency
        lifterexp: int
            Liftering coefficient passed to Essentia's MFCC
        Returns
        -------
        ndarray(nmfcc, nframes)
            An array of all of the MFCC frames
        """
        # Next power of two >= window_length, computed with integer
        # arithmetic: ceil(log(n) / log(2)) can land one power too high for
        # exact powers of two because of floating-point rounding.
        fftlen = 1 << (int(window_length) - 1).bit_length()
        spectrumSize = fftlen // 2 + 1
        zeroPadding = fftlen - window_length

        w = estd.Windowing(type='hamming',  # corresponds to htk default USEHAMMING = T
                           size=window_length,
                           zeroPadding=zeroPadding,
                           normalized=False,
                           zeroPhase=False)

        spectrum = estd.Spectrum(size=fftlen)
        mfcc_htk = estd.MFCC(inputSize=spectrumSize,
                             type='magnitude',  # htk uses mel filterbank magnitude
                             warpingFormula='htkMel',  # htk's mel warping formula
                             weighting='linear',  # computation of filter weights done in Hz domain
                             highFrequencyBound=fmax,  # 8000 is htk default
                             lowFrequencyBound=0,  # corresponds to htk default
                             numberBands=n_mels,  # corresponds to htk default NUMCHANS = 26
                             numberCoefficients=nmfcc,
                             normalize='unit_max',  # htk filter normalization to have constant height = 1
                             dctType=3,  # htk uses DCT type III
                             logType='log',
                             liftering=lifterexp)  # corresponds to htk default CEPLIFTER = 22

        # startFromZero = True, validFrameThresholdRatio = 1 : the way htk computes windows
        frames = estd.FrameGenerator(self.audio_vector,
                                     frameSize=window_length,
                                     hopSize=self.hop_length,
                                     startFromZero=True,
                                     validFrameThresholdRatio=1)
        mfccs = []
        for frame in frames:
            _mel_bands, mfcc_coeffs = mfcc_htk(spectrum(w(frame)))
            mfccs.append(mfcc_coeffs)

        return np.array(mfccs, dtype=np.float32).T
def audio_features(audio_win):
    """
    Build the feature list for a single audio window.

    The window is shortened by one sample when its length is odd. Its
    spectrum is then passed to an MFCC extractor configured with the
    module-level ``SR``, and rhythm descriptors are computed on the
    (possibly shortened) window.

    Returns a list made of the MFCC coefficients followed by element 2 and
    elements 5..10 of the ``RhythmDescriptors`` output.
    """
    if audio_win.shape[0] % 2 != 0:
        # drop the last sample so that the window length is even
        audio_win = audio_win[:-1]

    spectrum_algo = esst.Spectrum(size=audio_win.shape[0])
    magnitudes = spectrum_algo(audio_win)

    mfcc_algo = esst.MFCC(inputSize=magnitudes.shape[0], sampleRate=SR)
    _bands, coeffs = mfcc_algo(magnitudes)

    rhythm_out = esst.RhythmDescriptors()(audio_win)

    features = coeffs.tolist()
    features.append(rhythm_out[2])
    features.extend(rhythm_out[5:11])
    return features
示例#25
0
def melspectrogram(audio,
                   sampleRate=44100,
                   frameSize=2048,
                   hopSize=1024,
                   window='blackmanharris62',
                   zeroPadding=0,
                   center=True,
                   numberBands=[128, 96, 48, 32, 24, 16, 8],
                   lowFrequencyBound=0,
                   highFrequencyBound=None,
                   weighting='linear',
                   warpingFormula='slaneyMel',
                   normalize='unit_tri'):
    """
    Compute mel spectrograms of ``audio`` at several band resolutions.

    Each frame is windowed, turned into a spectrum and passed through one
    ``MelBands`` filterbank (``type='power'``) per entry of ``numberBands``.
    Two compressed versions of every mel frame are appended to the result
    pool:

    * ``'mel_<n>_db'``        -- output of ``UnaryOperator(type='lin2db',
      scale=2)``
    * ``'mel_<n>_log1+10kx'`` -- ``log10`` applied after
      ``UnaryOperator(type='identity', shift=1, scale=10000)``

    ``highFrequencyBound`` defaults to ``sampleRate / 2`` when ``None``.
    ``center=True`` makes the frame generator run with
    ``startFromZero=False``.

    Returns the ``essentia.Pool`` holding all descriptors.
    """
    if highFrequencyBound is None:
        highFrequencyBound = sampleRate / 2

    window_algo = es.Windowing(type=window,
                               normalized=False,
                               zeroPadding=zeroPadding)
    spectrum_algo = es.Spectrum()

    # one filterbank per requested number of bands
    mel_filters = {
        n_bands: es.MelBands(
            numberBands=n_bands,
            sampleRate=sampleRate,
            lowFrequencyBound=lowFrequencyBound,
            highFrequencyBound=highFrequencyBound,
            inputSize=(frameSize + zeroPadding) // 2 + 1,
            weighting=weighting,
            normalize=normalize,
            warpingFormula=warpingFormula,
            type='power')
        for n_bands in numberBands
    }

    shift_and_scale = es.UnaryOperator(type='identity', shift=1, scale=10000)
    to_log10 = es.UnaryOperator(type='log10')
    to_db = es.UnaryOperator(type='lin2db', scale=2)

    pool = essentia.Pool()

    frames = es.FrameGenerator(audio,
                               frameSize=frameSize,
                               hopSize=hopSize,
                               startFromZero=not center)
    for frame in frames:
        frame_spectrum = spectrum_algo(window_algo(frame))

        for n_bands in numberBands:
            key_prefix = 'mel_' + str(n_bands)
            mel_frame = mel_filters[n_bands](frame_spectrum)
            pool.add(key_prefix + '_db', to_db(mel_frame))
            pool.add(key_prefix + '_log1+10kx',
                     to_log10(shift_and_scale(mel_frame)))

    return pool
def FeatureExtraction_Recording(recording, params):
    """
    Compute frame-based HPCP (chroma) features for a recording and
    summarise them.

    Parameters
    ----------
    recording : object
        Read: ``path`` (audio file to load) and ``tonic`` (converted to
        float and used as the HPCP reference frequency).
        Written: ``chroma_framebased`` (array of per-frame HPCP vectors),
        ``chroma_mean`` and ``chroma_std`` (lists with one value per bin).
    params : object
        Read: ``numbins`` (HPCP size), ``fs`` (sample rate used for
        loading), ``windowSize`` and ``hopSize`` (milliseconds) and
        ``windowFunction`` (window type name).

    The results are stored on ``recording``; nothing is returned.
    """
    numBins = params.numbins
    fs = params.fs

    # LOAD audio file and pre-process it
    Audio = ess.MonoLoader(filename=recording.path, sampleRate=fs)()
    Audio = ess.DCRemoval()(Audio)  # DC removal
    # NOTE(review): EqualLoudness is built with its default sample rate,
    # not with ``fs`` -- confirm this is intended when fs != default.
    Audio = ess.EqualLoudness()(Audio)  # equal-loudness filter

    # Windowing parameters: convert from msec to samples and force both
    # sizes to be even
    windowSize = round(fs * params.windowSize / 1000)
    windowSize = int(windowSize / 2) * 2
    hopSize = round(fs * params.hopSize / 1000)
    hopSize = int(hopSize / 2) * 2

    tonic = float(recording.tonic)

    # The algorithm configuration does not depend on the frame, so every
    # algorithm is instantiated once here and reused inside the loop.
    window = ess.Windowing(size=windowSize, type=params.windowFunction)
    spectrum = ess.Spectrum(size=windowSize)
    spectral_peaks = ess.SpectralPeaks()
    hpcp_algo = ess.HPCP(normalized='unitSum',
                         referenceFrequency=tonic,
                         size=numBins,
                         windowSize=12 / numBins)
    eps = np.finfo(float).eps

    # FRAME-BASED spectral analysis
    hpcp = []
    for frame in ess.FrameGenerator(Audio,
                                    frameSize=windowSize,
                                    hopSize=hopSize,
                                    startFromZero=True):
        mX = spectrum(window(frame))
        mX[mX < eps] = eps  # floor the magnitudes at machine epsilon
        # frequency and magnitude of the spectral peaks
        freq, mag = spectral_peaks(mX)
        # harmonic pitch-class profile of this frame
        hpcp.append(hpcp_algo(freq, mag))
    recording.chroma_framebased = np.array(hpcp)

    # FEATURE SUMMARIZATION: global mean and standard deviation per bin
    mean_chroma = []
    std_chroma = []
    for j in range(numBins):
        bin_values = [frame_hpcp[j]
                      for frame_hpcp in recording.chroma_framebased]
        mean_chroma.append(np.mean(bin_values))
        std_chroma.append(np.std(bin_values))
    recording.chroma_mean = mean_chroma
    recording.chroma_std = std_chroma
示例#27
0
def extractor(filename):
    """
    Compute MFCCs for ``filename`` with an HTK-like configuration.

    The file is loaded as mono at 44100 Hz and scaled by 2**15. Frames of
    1102 samples (hop 441) are Hamming-windowed, zero-padded to 2048
    samples and converted to MFCCs (26 bands, 13 coefficients).

    Returns a list with one coefficient vector per frame.
    """
    sample_rate = 44100
    # dynamic range expansion as done in the HTK implementation
    samples = ess.MonoLoader(filename=filename,
                             sampleRate=sample_rate)() * 2**15

    frame_len = 1102  # corresponds to htk default WINDOWSIZE = 250000.0
    hop_len = 441  # corresponds to htk default TARGETRATE = 100000.0
    fft_len = 2048

    window = ess.Windowing(
        type='hamming',  # corresponds to htk default USEHAMMING = T
        size=frame_len,
        zeroPadding=fft_len - frame_len,
        normalized=False,
        zeroPhase=False)

    to_spectrum = ess.Spectrum(size=fft_len)

    htk_mfcc = ess.MFCC(
        inputSize=fft_len // 2 + 1,
        type='magnitude',  # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',  # filter weights computed in the Hz domain
        highFrequencyBound=8000,  # corresponds to htk default
        lowFrequencyBound=0,  # corresponds to htk default
        numberBands=26,  # corresponds to htk default NUMCHANS = 26
        numberCoefficients=13,
        normalize='unit_max',  # htk filters have constant height = 1
        dctType=3,  # htk uses DCT type III
        logType='log',
        liftering=22)  # corresponds to htk default CEPLIFTER = 22

    # startFromZero = True, validFrameThresholdRatio = 1: the way htk
    # computes windows
    frames = ess.FrameGenerator(samples,
                                frameSize=frame_len,
                                hopSize=hop_len,
                                startFromZero=True,
                                validFrameThresholdRatio=1)

    # MFCC yields (mel bands, coefficients); only the coefficients are kept
    return [htk_mfcc(to_spectrum(window(frame)))[1] for frame in frames]
def getFeature(audio, d=True, nbf=False):
    '''
    Frame-wise MFCC features of ``audio``.

    Uses the module-level ``fs``, ``highFrequencyBound``, ``framesize``
    and ``hopsize`` settings.

    :param audio: signal handed to the frame generator
    :param d: when true, the MFCCs are stacked with the result of
        ``Fdeltas`` (w=5) and of ``Fdeltas`` applied a second time
    :param nbf: only considered when ``d`` is false; when true, the MFCCs
        are stacked with the output of ``Fprev_sub`` for w = -1..-5 and
        w = 1..5 and the result is cast to float32
    :return: feature array (one row per frame, assuming ``Fdeltas`` /
        ``Fprev_sub`` keep the shape of their input)
    '''
    # this MFCC is for pattern classification; numberBands stays at its
    # default value
    mfcc_algo = ess.MFCC(sampleRate=fs,
                         highFrequencyBound=highFrequencyBound,
                         inputSize=framesize + 1)

    fft_size = 2 * framesize  # zero-pad by one extra frame length
    spectrum_algo = ess.Spectrum(size=fft_size)
    window_algo = ess.Windowing(type='hann',
                                zeroPadding=fft_size - framesize)

    coeffs = []
    for frame in ess.FrameGenerator(audio,
                                    frameSize=framesize,
                                    hopSize=hopsize):
        _bands, frame_coeffs = mfcc_algo(spectrum_algo(window_algo(frame)))
        coeffs.append(frame_coeffs)

    if d:
        # static coefficients plus first and second order deltas
        static = np.array(coeffs).transpose()
        delta = Fdeltas(static, w=5)
        delta_delta = Fdeltas(delta, w=5)
        return np.transpose(np.vstack((static, delta, delta_delta)))

    if not nbf:
        return np.array(coeffs)

    # static coefficients plus their shifted versions
    static = np.array(coeffs).transpose()
    stacked = np.array(static, copy=True)
    for shift in range(1, 6):
        shifted_right = Fprev_sub(static, w=shift)
        shifted_left = Fprev_sub(static, w=-shift)
        stacked = np.vstack((stacked, shifted_left, shifted_right))
    return np.array(np.transpose(stacked), dtype='float32')
示例#29
0
def analyze_misc(filename, segment_duration=20):
    """
    Compute miscellaneous descriptors on the time-centred segment of a file.

    Replay gain and duration are measured on the entire file; the segment
    of ``segment_duration`` seconds centred in time is then reloaded with
    that replay gain applied and analysed frame by frame.

    Parameters
    ----------
    filename : path of the audio file to analyse.
    segment_duration : length in seconds of the analysed segment.

    Returns
    -------
    essentia.Pool
        Per-frame ``'rms'``, ``'rms_spectrum'``, ``'hfc'``,
        ``'spectral_centroid'`` and ``'zcr'`` values, plus
        ``'ebu_momentary'`` computed on the stereo version of the segment.

    Raises
    ------
    ValueError
        If the requested segment does not fit inside the audio.
    """
    # Sample rate assumed throughout: the loaders are used with their
    # default output rate and the stereo signal is resampled to it.
    sample_rate = 44100

    # Compute replay gain and duration on the entire file
    audio = es.MonoLoader(filename=filename)()
    replaygain = es.ReplayGain()(audio)

    duration = len(audio) / sample_rate
    segment_start = (duration - segment_duration) / 2
    segment_end = segment_start + segment_duration

    if segment_start < 0 or segment_end > duration:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    # Reload only the centred segment, with replay gain applied
    loader = es.EasyLoader(filename=filename,
                           replayGain=replaygain,
                           startTime=segment_start,
                           endTime=segment_end)

    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    centroid = es.Centroid()
    zcr = es.ZeroCrossingRate()
    rms = es.RMS()
    hfc = es.HFC()
    pool = essentia.Pool()

    audio = loader()
    for frame in es.FrameGenerator(audio, frameSize=2048, hopSize=1024):
        frame_spectrum = spectrum(windowing(frame))
        pool.add('rms', rms(frame))
        pool.add('rms_spectrum', rms(frame_spectrum))
        pool.add('hfc', hfc(frame_spectrum))
        pool.add('spectral_centroid', centroid(frame_spectrum))
        pool.add('zcr', zcr(frame))

    audio_st, sr, _, _, _, _ = es.AudioLoader(filename=filename)()
    # Ugly hack because we don't have a StereoResample: resample each
    # channel separately and mux them back together
    left, right = es.StereoDemuxer()(audio_st)
    resampler = es.Resample(inputSampleRate=sr, outputSampleRate=sample_rate)
    left = resampler(left)
    right = resampler(right)
    audio_st = es.StereoMuxer()(left, right)
    audio_st = es.StereoTrimmer(startTime=segment_start,
                                endTime=segment_end)(audio_st)
    # hop expressed in seconds, matching the 1024-sample hop used above
    ebu_momentary, _, _, _ = es.LoudnessEBUR128(hopSize=1024 / sample_rate,
                                                startAtZero=True)(audio_st)
    pool.set('ebu_momentary', ebu_momentary)

    return pool
示例#30
0
def extractor(filename):
    """
    Plot a smoothed mel-band spectrogram rebuilt from truncated MFCCs.

    The file is loaded as mono at 44100 Hz. For every frame (1024 samples,
    hop 512) the Hamming-windowed spectrum is converted to MFCCs, the first
    ``NUM_MFCCs`` coefficients are passed through an inverse DCT and
    ``np.exp`` to get back to ``NUM_BANDS`` mel-band values, and the
    resulting matrix is displayed with matplotlib.
    """
    frameSize = 1024
    hopSize = 512
    fs = 44100
    audio = ess.MonoLoader(filename=filename, sampleRate=fs)()
    w = ess.Windowing(type='hamming', normalized=False)
    # these settings are shared by the MFCC and the IDCT below and must be
    # the same for both
    NUM_BANDS = 26
    DCT_TYPE = 2
    LIFTERING = 0
    NUM_MFCCs = 13

    spectrum = ess.Spectrum()
    mfcc = ess.MFCC(
        numberBands=NUM_BANDS,
        numberCoefficients=
        NUM_MFCCs,  # keep only the first N MFCCs: the fewer, the more lossy (blurry) the smoothed mel spectrum will be
        weighting=
        'linear',  # computation of filter weights done in Hz domain (optional)
        normalize=
        'unit_max',  # htk filter normalization to have constant height = 1 (optional)
        dctType=DCT_TYPE,
        logType='log',
        liftering=LIFTERING)  # LIFTERING is 0 here; the same value is given to the IDCT

    # inverse DCT configured with the same band count, DCT type and liftering
    idct = ess.IDCT(inputSize=NUM_MFCCs,
                    outputSize=NUM_BANDS,
                    dctType=DCT_TYPE,
                    liftering=LIFTERING)
    all_melbands_smoothed = []

    for frame in ess.FrameGenerator(audio,
                                    frameSize=frameSize,
                                    hopSize=hopSize):
        spect = spectrum(w(frame))
        melbands, mfcc_coeffs = mfcc(spect)
        melbands_smoothed = np.exp(
            idct(mfcc_coeffs))  # invert the log taken in the MFCC computation
        all_melbands_smoothed.append(melbands_smoothed)

    # transpose to have it in a better shape for plotting
    # the list has to be converted to an essentia.array first (== numpy.array of floats)
    all_melbands_smoothed = essentia.array(all_melbands_smoothed).T

    # and plot
    # NOTE(review): the original comment here said "ignore energy", but no
    # row is dropped before plotting -- confirm whether that was intended
    plt.imshow(all_melbands_smoothed, aspect='auto',
               interpolation='none')
    plt.show()  # unnecessary if you started "ipython --pylab"