def _log_erb_cochleagram(x, fs, frame_seconds, hop_samples, window_type='hann'):
    """Return log10 ERB-band values of ``x``, one row per analysis frame.

    Each frame of ``int(frame_seconds * fs)`` samples is windowed, zero-padded
    to twice its length, transformed with ``ess.Spectrum`` and passed through
    ``ess.ERBBands``.
    """
    framesize = int(frame_seconds * fs)
    fft_size = 2 * framesize  # padding 1 time framesize
    spectrum = ess.Spectrum(size=fft_size)
    window = ess.Windowing(type=window_type,
                           zeroPadding=fft_size - framesize)
    erb_bands = ess.ERBBands(sampleRate=fs,
                             highFrequencyBound=min(fs / 2, 11000),
                             inputSize=framesize + 1)
    # eps keeps log10 finite for empty bands. ``np.float`` was removed from
    # NumPy, so the builtin ``float`` is used (same dtype, same eps).
    eps = np.finfo(float).eps
    return np.array([
        np.log10(erb_bands(spectrum(window(frame))) + eps)
        for frame in ess.FrameGenerator(x, frameSize=framesize,
                                        hopSize=hop_samples)
    ])


def MRCG(x, fs=44100, framesize1=0.02, framesize2=0.2, hopsize=0.01):
    """Compute multi-resolution cochleagram features and their deltas.

    Parameters:
        x : audio samples handed to ``ess.FrameGenerator``
        fs : sampling rate in Hz
        framesize1 : frame length in seconds of the first cochleagram
        framesize2 : frame length in seconds of the second cochleagram
        hopsize : hop length in seconds, shared by both cochleagrams

    Returns a tuple ``(all_cochleas, d_all_cochleas, dd_all_cochleas)``:
    the four cochleagrams stacked horizontally, ``Fdeltas`` of that stack,
    and ``Fdeltas`` applied twice (window 5) to that stack.
    """
    hop_samples = int(hopsize * fs)

    # Two resolutions of the same signal.
    cochlea1 = _log_erb_cochleagram(x, fs, framesize1, hop_samples)
    cochlea2 = _log_erb_cochleagram(x, fs, framesize2, hop_samples)

    # Smoothed versions of the first cochleagram.
    cochlea3 = get_avg(cochlea1, 5, 5)
    cochlea4 = get_avg(cochlea1, 11, 11)

    all_cochleas = np.hstack((cochlea1, cochlea2, cochlea3, cochlea4))

    # Fdeltas is applied to the transposed stack; results are transposed back.
    d_all_cochleas = Fdeltas(all_cochleas.T).T
    dd_all_cochleas = Fdeltas(Fdeltas(all_cochleas.T, 5), 5).T

    return all_cochleas, d_all_cochleas, dd_all_cochleas
def main_danceability(args):
    """main_danceability

    Compute the danceability feature over input waveform and plot it.

    The audio returned by ``loadaudio(args)`` is cut into frames of 10 x
    ``args.samplerate`` samples with a hop of 5 x ``args.samplerate``; each
    windowed frame is passed to ``estd.Danceability`` and the first output is
    printed, stored in a pool and finally plotted.

    The original also ran a spectral-centroid pass whose results were never
    read; that dead computation has been removed.
    """
    audio = loadaudio(args)

    # create the pool and the necessary algorithms
    pool = e.Pool()
    w = estd.Windowing()
    danceability = estd.Danceability(maxTau=10000, minTau=300,
                                     sampleRate=args.samplerate)

    # compute the danceability for all frames and add it to the pool
    for frame in estd.FrameGenerator(audio,
                                     frameSize=10 * args.samplerate,
                                     hopSize=5 * args.samplerate):
        dreal, _dfa = danceability(w(frame))
        print(("d", dreal))
        pool.add('rhythm.danceability', dreal)

    print((type(pool['rhythm.danceability'])))

    fig, _ = makefig(rows=2, cols=2)
    ax = fig.axes
    ax[0].plot(pool['rhythm.danceability'])
    plt.show()
def _key_fnc(
    sample: NDArray[Float32],
    frequency_rate: int,
    windowfnc: Window,
    key_type: KeyFunction,
):
    """
    Compute the key of ``sample`` using the strategy selected by ``key_type``.

    Each supported ``KeyFunction`` builds the Essentia algorithms it needs and
    delegates to the matching ``_get_*`` helper. For the spectral centroid,
    ``frequency_rate`` should be half of the sample rate.

    Raises ValueError when ``key_type`` is not one of the handled members.
    """
    if key_type == KeyFunction.CENTROID:
        centroid = estd.Centroid(range=frequency_rate)
        spectrum = estd.Spectrum()
        window = estd.Windowing(type=windowfnc.value)
        return _get_centroid(sample, centroid, spectrum, window)

    if key_type == KeyFunction.MAX:
        spectrum = estd.Spectrum()
        window = estd.Windowing(type=windowfnc.value)
        return _get_max(sample, spectrum, window)

    if key_type == KeyFunction.MFCC:
        mfcc = estd.MFCC()
        spectrum = estd.Spectrum()
        window = estd.Windowing(type=windowfnc.value)
        return _get_mfcc(sample, mfcc, spectrum, window)

    if key_type == KeyFunction.MELBANDS:
        mfcc = estd.MFCC()
        spectrum = estd.Spectrum()
        window = estd.Windowing(type=windowfnc.value)
        return _get_melbands(sample, mfcc, spectrum, window)

    if key_type == KeyFunction.MELBANDS_LOG:
        # Same as MELBANDS, followed by an element-wise "log" operator.
        to_log = estd.UnaryOperator(type="log")
        mfcc = estd.MFCC()
        spectrum = estd.Spectrum()
        window = estd.Windowing(type=windowfnc.value)
        return to_log(_get_melbands(sample, mfcc, spectrum, window))

    raise ValueError("Keyfunction is not defined!")
def get_onsets(self, _audio=None):
    """Detect onsets in ``_audio`` (or in ``self.audio`` when none is given).

    Parameters:
        _audio : optional audio samples; ``None`` or an empty sequence means
                 "use ``self.audio``". The previous ``_audio != []`` test was
                 ambiguous (or raised) when a NumPy array was passed.

    Returns the output of ``es.Onsets`` computed from the per-frame onset
    detection function (frame size 1024, hop size 512).
    """
    if _audio is None or len(_audio) == 0:
        audio = self.audio
    else:
        audio = _audio

    W = es.Windowing(type=self.winType)
    c2p = es.CartesianToPolar()
    fft = es.FFT()
    onsetDetection = es.OnsetDetection(method=self.onsetMethod,
                                       sampleRate=44100)
    onsets = es.Onsets(alpha=.2)

    pool = Pool()
    for frame in es.FrameGenerator(audio, frameSize=1024, hopSize=512):
        mag, phase = c2p(fft(W(frame)))
        # NOTE(review): re-configuring on every frame is kept from the
        # original; it may reset detector state between frames - confirm
        # whether that is intended before hoisting it out of the loop.
        onsetDetection.configure(method=self.onsetMethod)
        pool.add("onsetFunction", onsetDetection(mag, phase))

    # A single detection function with weight 1.
    return onsets([pool["onsetFunction"]], [1])
def extract(fname, outpath, fs=22050, fsize=1024, hsize=512):
    """
    extract(fname, outpath, fs, fsize, hsize) will compute the mfcc of
    audio file fname.

    Inputs:
        fname -- is the name of audio file.
        outpath -- is the output path of processed files (currently unused;
                   nothing is written to disk by this function).
        fs -- is the sampling frequency (Hz) the file is loaded at.
        fsize -- is the size of each frame.
        hsize -- is the hop size between frames.
    Outputs:
        numpy array with one row of 20 MFCC coefficients per frame.

    Fixes over the previous version: the computed coefficients are returned
    (the MFCC algorithm object used to be returned by mistake), and
    ``fsize``/``hsize`` are honoured instead of hard-coded 1024/512.
    """
    loader = es.MonoLoader(filename=fname, sampleRate=fs)
    w = es.Windowing(type='hann')
    spectrum = es.Spectrum()
    # A frame of fsize samples gives fsize // 2 + 1 spectrum bins.
    # NOTE(review): MFCC is left at its default sample rate as before;
    # confirm whether ``sampleRate=fs`` should be forwarded.
    mfcc = es.MFCC(inputSize=fsize // 2 + 1, numberCoefficients=20)

    audio = loader()
    mfccs = []
    for frame in es.FrameGenerator(audio, frameSize=fsize, hopSize=hsize):
        _bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
        mfccs.append(mfcc_coeffs)

    return np.array(mfccs)
def segment(audio, hopSize, frameSize, rms_onset_threshold,
            mel_onset_threshold, flux_onset_threshold, onset_threshold):
    """Locate the onset frame and the start-of-sustain frame of ``audio``.

    Parameters:
        audio : samples handed to ``estd.FrameGenerator``
        hopSize, frameSize : framing parameters in samples
        rms_onset_threshold, mel_onset_threshold, flux_onset_threshold :
            thresholds on the respective onset detection functions
        onset_threshold : threshold on the frame loudness

    Returns ``(onset, sustain)`` as frame indices; either is ``None`` when
    the corresponding condition was never met.

    Compared with the previous version, each frame is windowed once, the
    pool and HFC detector (filled but never read) are gone, and the loop
    stops once the sustain is found, since nothing can change afterwards.
    """
    # init algorithms
    o_mel = estd.OnsetDetection(method='melflux')
    o_rms = estd.OnsetDetection(method='rms')
    o_flux = estd.OnsetDetection(method='flux')
    fft = estd.FFT()
    c2p = estd.CartesianToPolar()
    w = estd.Windowing(type='hann')
    yin = estd.PitchYinFFT(frameSize=frameSize, minFrequency=40,
                           maxFrequency=2500, interpolate=True)
    spectrum = estd.Spectrum()
    loudness = estd.Loudness()

    # control parameters
    attack = False
    detection = True
    mel_onset_value = 0
    rms_onset_value = 0

    # output variables
    onset = None
    sustain = None

    frame_generator = estd.FrameGenerator(audio, frameSize=frameSize,
                                          hopSize=hopSize)
    for index, frame in enumerate(frame_generator):
        windowed = w(frame)
        mag, phase = c2p(fft(windowed))
        _, conf = yin(spectrum(windowed))
        loud = loudness(frame)

        mel_onset = o_mel(mag, phase)
        rms_onset = o_rms(mag, phase)
        flux_onset = o_flux(mag, phase)

        # condition for onset: only the first qualifying frame is taken
        if detection \
                and (flux_onset > flux_onset_threshold
                     or mel_onset > mel_onset_threshold) \
                and rms_onset > rms_onset_threshold \
                and loud > onset_threshold:
            onset = index
            attack = True
            detection = False
            mel_onset_value = mel_onset
            rms_onset_value = rms_onset

        # condition for beginning of sustain: confident pitch and both
        # detection functions well below their value at the onset
        if attack and conf > 0.5 \
                and rms_onset < rms_onset_value * .05 \
                and mel_onset < mel_onset_value * .3:
            attack = False
            sustain = index
            break  # both outputs are final from here on

    return onset, sustain
def compute_description(x, M=WINDOW_SIZE, N=FFT_SIZE, H=HOP_SIZE, fs=SR,
                        window_type=WINDOW_TYPE):
    '''
    Extract frame-wise descriptors from an audio vector.

    Parameters:
        x : audio samples (converted with essentia.array)
        M : window / frame size in samples
        N : size hint given to ess.Spectrum
        H : hop size in samples
        fs : sampling rate, forwarded to HFC, SpectralCentroidTime and the
             pitch extractor (the latter previously ignored it)
        window_type : window name for ess.Windowing

    Returns (HFC, CENTROID, ENERGY, F0, SALIENCE):
        HFC      - ess.HFC of each magnitude spectrum
        CENTROID - ess.SpectralCentroidTime of each windowed frame
        ENERGY   - ess.Energy of each magnitude spectrum
        F0, SALIENCE - the two outputs of ess.PredominantPitchMelodia on
                   the whole signal, cut to the number of frames
    '''
    # create essentia instances
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type=window_type)
    hfc = ess.HFC(sampleRate=fs)
    spectralCentroid = ess.SpectralCentroidTime(sampleRate=fs)
    energy = ess.Energy()
    pitch_extractor = ess.PredominantPitchMelodia(frameSize=M, hopSize=H,
                                                  maxFrequency=1200,
                                                  sampleRate=fs)

    # init vectors
    HFC = []
    CENTROID = []
    ENERGY = []

    # compute features for every stft frame
    for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H,
                                    startFromZero=True):
        wX = window(frame)   # windowed frame (time domain)
        mX = spectrum(wX)    # magnitude spectrum
        HFC.append(hfc(mX))
        CENTROID.append(spectralCentroid(wX))  # centroid from time domain
        ENERGY.append(energy(mX))              # energy of the spectrum

    # pitch is estimated once on the whole signal, not per frame
    F0, SALIENCE = pitch_extractor(x)

    # convert into essentia arrays
    HFC = essentia.array(HFC)
    CENTROID = essentia.array(CENTROID)
    ENERGY = essentia.array(ENERGY)
    # cut the pitch track to the number of analysed frames
    F0 = essentia.array(F0)[:len(CENTROID)]
    SALIENCE = essentia.array(SALIENCE)[:len(CENTROID)]

    return HFC, CENTROID, ENERGY, F0, SALIENCE
def extract_features(x, M=WINDOW_SIZE, N=FFT_SIZE, H=HOP_SIZE, fs=SR,
                     window_type=WINDOW_TYPE):
    '''
    Compute the magnitude spectrogram of the input vector and apply
    power-law compression (exponent 2/3).

    x is framed with frame size M and hop H (starting at sample zero),
    each frame is windowed with window_type and passed to ess.Spectrum.
    fs is accepted but not used here.
    '''
    samples = essentia.array(x)
    to_spectrum = ess.Spectrum(size=N)
    apply_window = ess.Windowing(size=M, type=window_type)

    # one magnitude spectrum per frame
    spectra = [
        to_spectrum(apply_window(frame))
        for frame in ess.FrameGenerator(samples, frameSize=M, hopSize=H,
                                        startFromZero=True)
    ]

    # power law compression
    return np.power(essentia.array(spectra), 2. / 3.)
def get_constantq(frames, sample_rate=16000, num_bands=64):
    '''
    Calculates constant-Q magnitudes for a sequence of frames

    Parameters:
    frames : signal frames for short-time analysis (all of the length of
             the first one)
    sample_rate : audio sampling rate
    num_bands : number of constant-Q bins between 125 Hz and 8000 Hz

    Returns numpy 2D array with one row per bin and one column per frame.

    The previous version also built an es.Spectrum instance that was never
    used; it has been removed.
    '''
    max_freq = 8000
    min_freq = 125
    num_octaves = np.log2(max_freq / min_freq)
    bins_per_octave = int(np.ceil(num_bands / num_octaves))
    frame_size = len(frames[0])

    # Frames are zero-padded up to a length that depends on the band count.
    padded_length = {16: 512, 32: 2048}.get(num_bands, 1024)
    padding_size = max(0, padded_length - frame_size)

    windowing = es.Windowing(type='hann', size=frame_size,
                             zeroPadding=padding_size)
    constantq_estimator = es.ConstantQ(binsPerOctave=bins_per_octave,
                                       minFrequency=min_freq,
                                       numberBins=num_bands,
                                       sampleRate=sample_rate)

    const_q_spectra = [
        np.abs(constantq_estimator(windowing(frame))) for frame in frames
    ]
    return np.array(const_q_spectra).T
def getHPCPEssentia(XAudio, Fs, winSize, hopSize, squareRoot=False,
                    NChromaBins=36, NHarmonics = 0):
    """
    Compute HPCP (chroma) features with the essentia library

    :param XAudio: A flat array of raw audio samples
    :param Fs: Sample rate (accepted but not forwarded to any
        essentia algorithm in this function)
    :param winSize: Window size of each STFT window
    :param hopSize: Hop size between STFT windows
    :param squareRoot: Do square root compression?
    :param NChromaBins: How many chroma bins (default 36)
    :param NHarmonics: Passed to HPCP as ``harmonics``
    :returns H: An (NChromaBins x NWindows) matrix of all \
        chroma windows
    """
    import essentia
    from essentia import Pool, array
    import essentia.standard as ess

    to_spectrum = ess.Spectrum()
    apply_window = ess.Windowing(size=winSize, type='hann')
    find_peaks = ess.SpectralPeaks()
    to_hpcp = ess.HPCP(size=NChromaBins, harmonics=NHarmonics)

    frames = ess.FrameGenerator(array(XAudio), frameSize=winSize,
                                hopSize=hopSize, startFromZero=True)
    # SpectralPeaks yields (frequencies, magnitudes), fed straight to HPCP
    chroma = [to_hpcp(*find_peaks(to_spectrum(apply_window(frame))))
              for frame in frames]

    H = np.array(chroma).T
    return sqrtCompress(H) if squareRoot else H
def extract_features(x, M=Config.WINDOW_SIZE, N=Config.FFT_SIZE,
                     H=Config.HOP_SIZE, fs=Config.FS,
                     window_type=Config.WINDOW_TYPE):
    '''
    Function that extracts spectrogram from an audio signal
    -----------------------
    Input: Samples, window size (int), FFT size (int), Hop size (int),
    Sampling rate (unused here), Window type (e.g. Hanning)
    Output: Power-law compressed magnitude spectrogram, one row per frame,
    limited to the first N / 4 + 1 bins
    -----------------------
    The bin cut now follows the N argument; it used to read
    Config.FFT_SIZE directly, ignoring a caller-supplied N. With the
    default N the result is unchanged.
    '''
    # init functions and vectors
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type=window_type)

    # compute STFT: one magnitude spectrum per windowed frame
    SP = [
        spectrum(window(frame))
        for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H,
                                        startFromZero=True)
    ]

    SP = essentia.array(SP)
    SP = np.power(SP, 2. / 3.)  # power law compression
    return SP[:, :int(N / 4 + 1)]  # keep only the lowest bins
def analysisSynthesis(params, signal):
    """Run ``signal`` through an STFT analysis / synthesis chain.

    framecutter > windowing > FFT > IFFT > OverlapAdd

    ``params`` must provide ``'frameSize'`` and ``'hopSize'``. Frames come
    from ``cutFrames(params, signal)``. The first
    ``frameSize / (2 * hopSize)`` overlap-add outputs are skipped.

    Returns the concatenation of ``array(0)`` and the kept output frames
    (the leading element is preserved from the original implementation);
    if no frame is kept, ``array(0)`` itself is returned.

    The output is assembled with one concatenate call instead of calling
    ``numpy.append`` per frame, which copied the whole signal every time.
    """
    outsignal = array(0)

    frames = cutFrames(params, signal)
    w = std.Windowing(type="hann")
    fft = std.FFT(size=params['frameSize'])
    ifft = std.IFFT(size=params['frameSize'])
    overl = std.OverlapAdd(frameSize=params['frameSize'],
                           hopSize=params['hopSize'])
    skip = params['frameSize'] / (2 * params['hopSize'])

    pieces = [numpy.ravel(outsignal)]
    for counter, f in enumerate(frames):
        # STFT analysis
        infft = fft(w(f))
        # here we could apply spectral transformations
        outfft = infft
        # STFT synthesis; OverlapAdd is fed every frame, including skipped
        outframe = overl(ifft(outfft))
        if counter >= skip:
            pieces.append(numpy.ravel(outframe))

    if len(pieces) == 1:
        return outsignal
    return numpy.concatenate(pieces)
def file_to_hpcp(loop):
    """Return the mean HPCP vector of ``loop``, scaled so its maximum is 1.

    ``loop`` is converted with ``e.array`` and analysed in frames of 1024
    samples (hop 512): Blackman-Harris window, magnitude spectrum, spectral
    peaks, HPCP. The per-frame HPCP vectors are averaged.

    If the averaged vector has no positive maximum (e.g. silent input) it
    is returned without scaling instead of dividing by zero. Spectra are no
    longer collected in an unused list.
    """
    loop = e.array(loop)

    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    spectral_peaks = es.SpectralPeaks(orderBy='magnitude',
                                      magnitudeThreshold=0.001,
                                      maxPeaks=20,
                                      minFrequency=20,
                                      maxFrequency=8000)
    hpcp = es.HPCP(maxFrequency=8000)

    hpcp_group = []
    for frame in es.FrameGenerator(loop, frameSize=1024, hopSize=512):
        frequencies, magnitudes = spectral_peaks(spectrum(windowing(frame)))
        hpcp_group.append(hpcp(frequencies, magnitudes))

    mean_hpcp = np.mean(np.array(hpcp_group).T, axis=1)

    # normalize to 1
    peak = mean_hpcp.max()
    if peak > 0:
        mean_hpcp = mean_hpcp / peak

    return mean_hpcp
def analysisSynthesis(params, signal):
    """Run ``signal`` through an STFT analysis / synthesis chain.

    ``params`` must provide ``'frameSize'`` and ``'hopSize'``. The signal is
    extended with half a frame of zeros, cut with ``cutFrames``, windowed
    (hann), transformed with FFT / IFFT and overlap-added with a gain of
    ``1 / frameSize``. The first ``frameSize / (2 * hopSize)`` overlap-add
    outputs are skipped.

    Returns the concatenation of ``array(0)`` and the kept output frames
    (the leading element is preserved from the original implementation);
    if no frame is kept, ``array(0)`` itself is returned.

    Fixes: the zero padding length uses integer division (a float length is
    rejected under Python 3), and the output is assembled with a single
    concatenate instead of a per-frame ``numpy.append``.
    """
    outsignal = array(0)

    signal = numpy.append(signal, zeros(params['frameSize'] // 2))

    frames = cutFrames(params, signal)
    w = std.Windowing(type="hann")
    fft = std.FFT(size=params['frameSize'])
    ifft = std.IFFT(size=params['frameSize'])
    overl = std.OverlapAdd(frameSize=params['frameSize'],
                           hopSize=params['hopSize'],
                           gain=1. / params['frameSize'])
    skip = params['frameSize'] / (2 * params['hopSize'])

    pieces = [numpy.ravel(outsignal)]
    for counter, f in enumerate(frames):
        # STFT analysis
        infft = fft(w(f))
        # here we could apply spectral transformations
        outfft = infft
        # STFT synthesis; OverlapAdd is fed every frame, including skipped
        outframe = overl(ifft(outfft))
        if counter >= skip:
            pieces.append(numpy.ravel(outframe))

    if len(pieces) == 1:
        return outsignal
    return numpy.concatenate(pieces)
def getMBE(audio):
    '''
    Mel band energy feature.

    Frames `audio` using the module-level `framesize` and `hopsize`,
    windows each frame (hann, zero-padded to twice its length), takes the
    magnitude spectrum and keeps the band output of ess.MFCC.

    :param audio: samples handed to ess.FrameGenerator
    :return: numpy array with one row of band values per frame
    '''
    fft_size = 2 * framesize  # padding 1 time framesize

    # this MFCC is for pattern classification, numberBands left at default
    mfcc_algo = ess.MFCC(sampleRate=fs,
                         highFrequencyBound=highFrequencyBound,
                         inputSize=framesize + 1)
    to_spectrum = ess.Spectrum(size=fft_size)
    apply_window = ess.Windowing(type='hann',
                                 zeroPadding=fft_size - framesize)

    # MFCC yields (bands, coefficients); only the bands are kept
    band_frames = [
        mfcc_algo(to_spectrum(apply_window(frame)))[0]
        for frame in ess.FrameGenerator(audio, frameSize=framesize,
                                        hopSize=hopsize)
    ]
    return np.array(band_frames)
def get_lpc(frames, sample_rate=16000, num_coeffs=32, window_type='hann'):
    '''
    Calculates linear prediction coefficients

    Parameters:
    frames : overlapping signal frames for short-time analysis
    sample_rate : audio sampling rate,
    num_coeffs : number of linear prediction coefficients
                 (the LPC order is num_coeffs - 1)
    window_type : type of windowing function to apply (previously ignored
                  in favour of a hard-coded 'hann')

    Returns two numpy 2D arrays: LPCs and reflection coefficients,
    each with one column per frame
    '''
    frame_size = len(frames[0])
    lpc_coeffs = []
    reflection_coeffs = []
    lpc_estimator = es.LPC(sampleRate=sample_rate, order=num_coeffs - 1)
    windowing = es.Windowing(type=window_type, size=frame_size)

    for frame in frames:
        # the windowed frame is scaled by 1000 before estimation
        lpc, reflection = lpc_estimator(windowing(frame) * 1000)
        lpc_coeffs.append(lpc)
        reflection_coeffs.append(reflection)

    return np.array(lpc_coeffs).T, np.array(reflection_coeffs).T
def get_wavelet_envelopes(frames, level=5, window_type='hann'):
    '''
    Decomposes input audio with wavelet packets and calculates
    energy envelopes of their components

    Parameters:
    frames : overlapping signal frames for short-time analysis
    level : number of levels of wavelet decomposition,
            2**level gives the final number of wavelet components
    window_type : type of windowing function to apply (previously ignored
                  in favour of a hard-coded 'hann')

    Returns numpy 2D array with one row per wavelet component and one
    column per frame; each value is the standard deviation of that
    component's coefficients in that frame
    '''
    frame_size = len(frames[0])
    num_bands = 2**level

    # Node path of every component: the binary digits of its index,
    # written with 'a' for 0 and 'd' for 1. Computed once, not per frame.
    band_keys = [
        bin(i).replace('0b', '').zfill(level).replace('0', 'a').replace(
            '1', 'd')
        for i in range(num_bands)
    ]
    envelopes = [[] for _ in band_keys]
    windowing = es.Windowing(type=window_type, size=frame_size)

    for frame in frames:
        wp = pywt.WaveletPacket(data=windowing(frame),
                                wavelet='db1',
                                mode='zero',
                                maxlevel=level)
        for envelope, band_key in zip(envelopes, band_keys):
            envelope.append(np.std(wp[band_key].data))

    return np.array(envelopes)
def get_f0(audio, minf0=20, maxf0=22050, cf=0.9, ws=2048, hs=256):
    '''
    Frame-wise pitch track with low-confidence frames set to zero.

    Args:
        audio (array): audio signal (output from MonoLoader)
        minf0 (int): minimum allowed frequency
        maxf0 (int): maximum allowed frequency
        cf (float): confidence threshold (0 - 1)
        ws (int): window size
        hs (int): hop size
    Returns:
        f0 (array): one pitch value per frame, 0 where the confidence
            reported by PitchYinFFT is below cf
    '''
    # Essentia algorithms: hann window zero-padded by ws samples,
    # magnitude spectrum, YIN-FFT pitch estimator
    apply_window = es.Windowing(type='hann', zeroPadding=ws)
    to_spectrum = es.Spectrum()
    estimate_pitch = es.PitchYinFFT(minFrequency=minf0, maxFrequency=maxf0,
                                    frameSize=ws)

    # (pitch, confidence) for every frame
    estimates = [
        estimate_pitch(to_spectrum(apply_window(frame)))
        for frame in es.FrameGenerator(audio, frameSize=ws, hopSize=hs)
    ]

    f0 = np.array([pitch for pitch, _ in estimates])
    conf = np.array([confidence for _, confidence in estimates])

    # keep f0 only where the confidence reaches the threshold
    f0[conf < cf] = 0
    return f0
def extract_features(x, M=WINDOW_SIZE, N=FFT_SIZE, H=HOP_SIZE, fs=SR,
                     window_type=WINDOW_TYPE):
    '''
    Extract magnitude spectra from the input vector and apply power-law
    compression (exponent 2/3).

    x is framed with frame size M and hop H (starting at sample zero),
    windowed with window_type and passed to ess.Spectrum. fs is accepted
    but not used here.

    The window_type argument is now honoured; the module-level
    WINDOW_TYPE used to be read directly, so a caller-supplied value had
    no effect. With the default argument the result is unchanged.
    '''
    # init functions and vectors
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type=window_type)

    # compute STFT: one magnitude spectrum per windowed frame
    SP = [
        spectrum(window(frame))
        for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H,
                                        startFromZero=True)
    ]

    SP = essentia.array(SP)
    SP = np.power(SP, 2. / 3.)  # power law compression

    return SP
def _extract_pitch_contours(self, audio):
    """Track pitch contours in ``audio``.

    Per frame: windowing (zero-padded to four times the frame size),
    spectrum, spectral peaks, pitch salience function and its peaks. The
    per-frame salience peaks are then handed to ``estd.PitchContours``.

    Returns ``(contours_bins, contours_start_times, contour_saliences,
    duration)``.
    """
    # Hann window with x4 zero padding
    windowing = estd.Windowing(  # pylint: disable-msg=E1101
        zeroPadding=3 * self.frame_size)
    to_spectrum = estd.Spectrum(  # pylint: disable-msg=E1101
        size=self.frame_size * 4)
    find_peaks = estd.SpectralPeaks(  # pylint: disable-msg=E1101
        minFrequency=self.min_frequency,
        maxFrequency=self.max_frequency,
        magnitudeThreshold=self.magnitude_threshold,
        sampleRate=self.sample_rate,
        orderBy='magnitude')

    # convert unit to cents, PitchSalienceFunction takes 55 Hz as the
    # default reference
    salience_function = \
        estd.PitchSalienceFunction(  # pylint: disable-msg=E1101
            binResolution=self.bin_resolution)
    salience_function_peaks = \
        estd.PitchSalienceFunctionPeaks(  # pylint: disable-msg=E1101
            binResolution=self.bin_resolution,
            minFrequency=self.min_frequency,
            maxFrequency=self.max_frequency)
    track_contours = estd.PitchContours(  # pylint: disable-msg=E1101
        hopSize=self.hop_size,
        binResolution=self.bin_resolution,
        peakDistributionThreshold=self.peak_distribution_threshold)

    bins_key = 'allframes_salience_peaks_bins'
    saliences_key = 'allframes_salience_peaks_contourSaliences'

    # compute frame by frame
    pool = Pool()
    frames = estd.FrameGenerator(  # pylint: disable-msg=E1101
        audio, frameSize=self.frame_size, hopSize=self.hop_size)
    for frame in frames:
        magnitudes = to_spectrum(windowing(frame))
        peak_frequencies, peak_magnitudes = find_peaks(magnitudes)
        salience = salience_function(peak_frequencies, peak_magnitudes)
        peak_bins, peak_saliences = salience_function_peaks(salience)

        # frames without any salience peak get a single zero entry
        if np.size(peak_bins) == 0:
            peak_bins = np.array([0])
        if np.size(peak_saliences) == 0:
            peak_saliences = np.array([0])

        pool.add(bins_key, peak_bins)
        pool.add(saliences_key, peak_saliences)

    # post-processing: contour tracking
    all_bins = [f.tolist() for f in pool[bins_key]]
    all_saliences = [f.tolist() for f in pool[saliences_key]]
    contours_bins, contour_saliences, contours_start_times, duration = \
        track_contours(all_bins, all_saliences)

    return contours_bins, contours_start_times, contour_saliences, duration
def extractor(filename):
    """Compute HTK-style MFCCs of ``filename`` and display them.

    The file is loaded as mono at 44100 Hz and scaled by 2**15. The MFCC
    matrix (coefficients x frames) is shown with ``plt.imshow`` without
    its first row; nothing is returned.
    """
    sample_rate = 44100
    samples = ess.MonoLoader(filename=filename, sampleRate=sample_rate)()
    # dynamic range expansion as done in HTK implementation
    samples = samples * 2**15

    frame_len = 1102  # corresponds to htk default WINDOWSIZE = 250000.0
    hop_len = 441  # corresponds to htk default TARGETRATE = 100000.0
    fft_len = 2048
    n_bins = fft_len // 2 + 1

    window = ess.Windowing(
        type='hamming',  # corresponds to htk default USEHAMMING = T
        size=frame_len,
        zeroPadding=fft_len - frame_len,
        normalized=False,
        zeroPhase=False)

    to_spectrum = ess.Spectrum(size=fft_len)

    htk_mfcc = ess.MFCC(
        inputSize=n_bins,
        type='magnitude',  # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',  # filter weights computed in Hz domain
        highFrequencyBound=8000,  # corresponds to htk default
        lowFrequencyBound=0,  # corresponds to htk default
        numberBands=26,  # corresponds to htk default NUMCHANS = 26
        numberCoefficients=13,
        normalize='unit_max',  # htk filters have constant height = 1
        dctType=3,  # htk uses DCT type III
        logType='log',
        liftering=22)  # corresponds to htk default CEPLIFTER = 22

    # startFromZero = True, validFrameThresholdRatio = 1 :
    # the way htk computes windows
    frames = ess.FrameGenerator(samples,
                                frameSize=frame_len,
                                hopSize=hop_len,
                                startFromZero=True,
                                validFrameThresholdRatio=1)
    # MFCC yields (mel bands, coefficients); only coefficients are kept
    coeffs = [htk_mfcc(to_spectrum(window(frame)))[1] for frame in frames]

    # list -> essentia.array, transposed to coefficients x frames
    mfccs = essentia.array(coeffs).T

    # and plot, ignoring the first coefficient (energy)
    plt.imshow(mfccs[1:, :], aspect='auto', interpolation='none')
    plt.show()  # unnecessary if you started "ipython --pylab"
def essentiaObjectInit(self):
    """Create the essentia algorithm instances used by this object.

    Sets ``self.MFCC80``, ``self.SPECTRUM`` and ``self.WINDOW`` from
    ``self.fs``, ``self.highFrequencyBound``, ``self.frameSize`` and
    ``self.numberBands``.
    """
    frame_size = self.frameSize
    fft_size = 2 * frame_size  # padding 1 time framesize

    self.MFCC80 = ess.MFCC(
        sampleRate=self.fs,
        highFrequencyBound=self.highFrequencyBound,
        inputSize=frame_size + 1,
        numberBands=self.numberBands,
    )
    self.SPECTRUM = ess.Spectrum(size=fft_size)
    self.WINDOW = ess.Windowing(type='hann',
                                zeroPadding=fft_size - frame_size)
def gen_frames(filepath):
    """Cut the audio file at ``filepath`` into windowed frames.

    Uses the module-level ``sample_rate``, ``samples_per_frame``,
    ``hop_length`` and ``window_type`` settings.

    Returns a numpy array with one windowed frame per row. The windowing
    algorithm is now created once instead of once per frame.
    """
    # Convert file to mono raw audio
    audio = es.MonoLoader(filename=filepath, sampleRate=sample_rate)()

    # Cut audio into frames and window each of them
    windowing = es.Windowing(size=samples_per_frame, type=window_type)
    frame_gen = es.FrameGenerator(audio, frameSize=samples_per_frame,
                                  hopSize=hop_length)
    return np.array([windowing(frame) for frame in frame_gen])
def __init__(self, frame_size, hop_size, window_type, feature, beats,
             sample_rate):
    """STFTFeature constructor.

    Stores the framing settings, the feature object, the beats and the
    sample rate, and creates the Windowing and Spectrum algorithms.
    """
    # framing configuration
    self.frame_size, self.hop_size = frame_size, hop_size
    self.window_type = window_type

    # essentia algorithms shared by all frames
    self.w = ES.Windowing(type=window_type)
    self.spectrum = ES.Spectrum()

    self.feature = feature  # Essentia feature object
    self.beats = beats
    self.sample_rate = sample_rate
def mfcc_htk(self, window_length=22050, nmfcc=13, n_mels=26, fmax=8000,
             lifterexp=22):
    """
    Get MFCCs 'the HTK way' with the help of Essentia
    https://github.com/MTG/essentia/blob/master/src/examples/tutorial/example_mfcc_the_htk_way.py
    Using all of the default parameters from there except the hop length
    (which shouldn't matter), and a much longer window length (which has
    been found to work better for covers)

    Parameters
    ----------
    window_length: int
        Length of the window to use for the STFT
    nmfcc: int
        Number of MFCC coefficients to compute
    n_mels: int
        Number of frequency bands to use
    fmax: int
        Maximum frequency
    lifterexp: int
        Liftering coefficient passed to the MFCC algorithm

    Returns
    -------
    ndarray(nmfcc, nframes)
        An array of all of the MFCC frames
    """
    # Smallest power of two >= window_length, computed on integers. The
    # former ceil(log(n) / log(2)) form can round up one step too far for
    # some exact powers of two because of floating-point error.
    fftlen = 1 << (int(window_length) - 1).bit_length()
    spectrumSize = fftlen // 2 + 1
    zeroPadding = fftlen - window_length

    w = estd.Windowing(
        type='hamming',  # corresponds to htk default USEHAMMING = T
        size=window_length,
        zeroPadding=zeroPadding,
        normalized=False,
        zeroPhase=False)

    spectrum = estd.Spectrum(size=fftlen)

    mfcc_htk = estd.MFCC(
        inputSize=spectrumSize,
        type='magnitude',  # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',  # filter weights computed in Hz domain
        highFrequencyBound=fmax,  # 8000 is htk default
        lowFrequencyBound=0,  # corresponds to htk default
        numberBands=n_mels,  # corresponds to htk default NUMCHANS = 26
        numberCoefficients=nmfcc,
        normalize='unit_max',  # htk filters have constant height = 1
        dctType=3,  # htk uses DCT type III
        logType='log',
        liftering=lifterexp)  # corresponds to htk default CEPLIFTER = 22

    mfccs = []
    # startFromZero = True, validFrameThresholdRatio = 1 :
    # the way htk computes windows
    for frame in estd.FrameGenerator(self.audio_vector,
                                     frameSize=window_length,
                                     hopSize=self.hop_length,
                                     startFromZero=True,
                                     validFrameThresholdRatio=1):
        _mel_bands, mfcc_coeffs = mfcc_htk(spectrum(w(frame)))
        mfccs.append(mfcc_coeffs)

    return np.array(mfccs, dtype=np.float32).T
def melspectrogram(audio, sampleRate=44100, frameSize=2048, hopSize=1024,
                   window='blackmanharris62', zeroPadding=0, center=True,
                   numberBands=[128, 96, 48, 32, 24, 16, 8],
                   lowFrequencyBound=0, highFrequencyBound=None,
                   weighting='linear', warpingFormula='slaneyMel',
                   normalize='unit_tri'):
    """Compute mel spectrograms of ``audio`` at several band resolutions.

    For every entry ``n`` of ``numberBands`` the returned pool holds two
    descriptors, each with one mel frame per analysis frame:
    ``mel_<n>_db`` (``lin2db`` operator with scale 2) and
    ``mel_<n>_log1+10kx`` (log10 of ``1 + 10000 * x``).

    ``highFrequencyBound`` defaults to ``sampleRate / 2``; ``center=True``
    means frames do not start from zero.
    """
    upper_bound = highFrequencyBound
    if upper_bound is None:
        upper_bound = sampleRate / 2

    windowing = es.Windowing(type=window, normalized=False,
                             zeroPadding=zeroPadding)
    spectrum = es.Spectrum()

    # One mel filterbank per requested resolution.
    spectrum_bins = (frameSize + zeroPadding) // 2 + 1
    melbands = {
        nBands: es.MelBands(numberBands=nBands,
                            sampleRate=sampleRate,
                            lowFrequencyBound=lowFrequencyBound,
                            highFrequencyBound=upper_bound,
                            inputSize=spectrum_bins,
                            weighting=weighting,
                            normalize=normalize,
                            warpingFormula=warpingFormula,
                            type='power')
        for nBands in numberBands
    }

    norm10k = es.UnaryOperator(type='identity', shift=1, scale=10000)
    log10 = es.UnaryOperator(type='log10')
    amp2db = es.UnaryOperator(type='lin2db', scale=2)

    results = essentia.Pool()
    frames = es.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize,
                               startFromZero=not center)
    for frame in frames:
        spectrumFrame = spectrum(windowing(frame))
        for nBands in numberBands:
            melFrame = melbands[nBands](spectrumFrame)
            prefix = 'mel_' + str(nBands)
            results.add(prefix + '_db', amp2db(melFrame))
            results.add(prefix + '_log1+10kx', log10(norm10k(melFrame)))

    return results
def extractor(filename):
    """Compute HTK-style MFCCs of ``filename``.

    The file is loaded as mono at 44100 Hz and scaled by 2**15.

    Returns a list with one vector of 13 MFCC coefficients per frame.
    """
    sample_rate = 44100
    samples = ess.MonoLoader(filename=filename, sampleRate=sample_rate)()
    # dynamic range expansion as done in HTK implementation
    samples = samples * 2**15

    frame_len = 1102  # corresponds to htk default WINDOWSIZE = 250000.0
    hop_len = 441  # corresponds to htk default TARGETRATE = 100000.0
    fft_len = 2048
    n_bins = fft_len // 2 + 1

    window = ess.Windowing(
        type='hamming',  # corresponds to htk default USEHAMMING = T
        size=frame_len,
        zeroPadding=fft_len - frame_len,
        normalized=False,
        zeroPhase=False)

    to_spectrum = ess.Spectrum(size=fft_len)

    htk_mfcc = ess.MFCC(
        inputSize=n_bins,
        type='magnitude',  # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',  # filter weights computed in Hz domain
        highFrequencyBound=8000,  # corresponds to htk default
        lowFrequencyBound=0,  # corresponds to htk default
        numberBands=26,  # corresponds to htk default NUMCHANS = 26
        numberCoefficients=13,
        normalize='unit_max',  # htk filters have constant height = 1
        dctType=3,  # htk uses DCT type III
        logType='log',
        liftering=22)  # corresponds to htk default CEPLIFTER = 22

    # startFromZero = True, validFrameThresholdRatio = 1 :
    # the way htk computes windows
    frames = ess.FrameGenerator(samples,
                                frameSize=frame_len,
                                hopSize=hop_len,
                                startFromZero=True,
                                validFrameThresholdRatio=1)

    # MFCC yields (mel bands, coefficients); only coefficients are kept
    return [htk_mfcc(to_spectrum(window(frame)))[1] for frame in frames]
def getFeature(audio, d=True, nbf=False):
    '''
    Frame-wise MFCC features of the given audio.

    Uses the module-level `fs`, `highFrequencyBound`, `framesize` and
    `hopsize` settings.

    :param audio: samples handed to ess.FrameGenerator
    :param d: if true, append Fdeltas (w=5) and delta-of-deltas to the MFCC
    :param nbf: only used when d is false; if true, append Fprev_sub
        outputs for w = -1..-5 and 1..5 and return float32
    :return: feature matrix with one row per frame
    '''
    fft_size = 2 * framesize  # padding 1 time framesize

    # this MFCC is for pattern classification, numberBands left at default
    mfcc_algo = ess.MFCC(sampleRate=fs,
                         highFrequencyBound=highFrequencyBound,
                         inputSize=framesize + 1)
    to_spectrum = ess.Spectrum(size=fft_size)
    apply_window = ess.Windowing(type='hann',
                                 zeroPadding=fft_size - framesize)

    # MFCC yields (bands, coefficients); only coefficients are kept
    coeff_frames = [
        mfcc_algo(to_spectrum(apply_window(frame)))[1]
        for frame in ess.FrameGenerator(audio, frameSize=framesize,
                                        hopSize=hopsize)
    ]

    if d:
        # coefficients x frames, stacked with first and second deltas
        static = np.array(coeff_frames).transpose()
        delta = Fdeltas(static, w=5)
        delta_delta = Fdeltas(delta, w=5)
        return np.transpose(np.vstack((static, delta, delta_delta)))

    if nbf:
        static = np.array(coeff_frames).transpose()
        stacked = np.array(static, copy=True)
        for shift in range(1, 6):
            right_shifted = Fprev_sub(static, w=shift)
            left_shifted = Fprev_sub(static, w=-shift)
            stacked = np.vstack((stacked, left_shifted, right_shifted))
        return np.array(np.transpose(stacked), dtype='float32')

    return np.array(coeff_frames)
def FeatureExtraction_Recording(recording, params):
    """Compute frame-based HPCP chroma of a recording and summarise it.

    Reads ``recording.path`` and ``recording.tonic`` and the settings
    ``params.numbins``, ``params.fs``, ``params.windowSize`` and
    ``params.hopSize`` (both in msec) and ``params.windowFunction``.

    Sets on ``recording``:
        chroma_framebased : numpy array, one HPCP vector per frame
        chroma_mean : list with the mean of every chroma bin over frames
        chroma_std : list with the standard deviation of every chroma bin

    The essentia algorithms are created once before the frame loop; the
    previous version rebuilt all four of them for every frame.
    """
    numBins = params.numbins
    fs = params.fs

    # LOAD Audio file
    Audio = ess.MonoLoader(filename=recording.path, sampleRate=fs)()
    Audio = ess.DCRemoval()(Audio)  # PREPROCESSING / DC removal
    Audio = ess.EqualLoudness()(Audio)  # PREPROCESSING - Equal Loudness

    # Windowing Parameters (first converting from msec to number of
    # samples), assuring windowSize and hopSize are even
    windowSize = round(fs * params.windowSize / 1000)
    windowSize = int(windowSize / 2) * 2
    hopSize = round(fs * params.hopSize / 1000)
    hopSize = int(hopSize / 2) * 2

    tonic = float(recording.tonic)

    windowing = ess.Windowing(size=windowSize, type=params.windowFunction)
    spectrum = ess.Spectrum(size=windowSize)
    spectral_peaks = ess.SpectralPeaks()
    hpcp_algo = ess.HPCP(normalized='unitSum',
                         referenceFrequency=tonic,
                         size=numBins,
                         windowSize=12 / numBins)
    eps = np.finfo(float).eps

    # FRAME-BASED Spectral Analysis
    hpcp = []
    for frame in ess.FrameGenerator(Audio,
                                    frameSize=windowSize,
                                    hopSize=hopSize,
                                    startFromZero=True):
        mX = spectrum(windowing(frame))
        mX[mX < eps] = eps  # floor the magnitudes at machine epsilon
        # frequency and magnitude of the spectral peaks
        freq, mag = spectral_peaks(mX)
        # harmonic pitch-class profiles
        hpcp.append(hpcp_algo(freq, mag))
    recording.chroma_framebased = np.array(hpcp)

    # FEATURE SUMMARIZATION: per-bin statistics over all frames
    mean_chroma = []  # global Mean of HPCP vectors
    std_chroma = []  # global standard deviation of HPCP vectors
    for j in range(numBins):
        column = [frame_hpcp[j] for frame_hpcp in recording.chroma_framebased]
        mean_chroma.append(np.mean(column))
        std_chroma.append(np.std(column))
    recording.chroma_mean = mean_chroma
    recording.chroma_std = std_chroma
def analyze_misc(filename, segment_duration=20):
    """Compute assorted frame-wise descriptors on a centred segment.

    Replay gain is computed on the entire file; the analysed segment is
    the ``segment_duration`` seconds centred in time, loaded with that
    replay gain applied.

    Returns an essentia Pool with per-frame ``rms``, ``rms_spectrum``,
    ``hfc``, ``spectral_centroid`` and ``zcr`` (frame size 2048, hop 1024)
    plus ``ebu_momentary`` computed on the stereo segment.

    Raises ValueError when the file is shorter than ``segment_duration``.

    The unused PowerSpectrum instance of the previous version was removed
    and the repeated 44100 literal is named once.
    """
    # Rate assumed for the mono loaders (created without an explicit
    # sampleRate) and used as the stereo resampling target.
    sample_rate = 44100

    audio = es.MonoLoader(filename=filename)()
    replaygain = es.ReplayGain()(audio)

    duration = len(audio) / sample_rate
    segment_start = (duration - segment_duration) / 2
    segment_end = segment_start + segment_duration

    if segment_start < 0 or segment_end > duration:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    loader = es.EasyLoader(filename=filename,
                           replayGain=replaygain,
                           startTime=segment_start,
                           endTime=segment_end)

    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    centroid = es.Centroid()
    zcr = es.ZeroCrossingRate()
    rms = es.RMS()
    hfc = es.HFC()
    pool = essentia.Pool()

    audio = loader()
    for frame in es.FrameGenerator(audio, frameSize=2048, hopSize=1024):
        frame_spectrum = spectrum(windowing(frame))
        pool.add('rms', rms(frame))
        pool.add('rms_spectrum', rms(frame_spectrum))
        pool.add('hfc', hfc(frame_spectrum))
        pool.add('spectral_centroid', centroid(frame_spectrum))
        pool.add('zcr', zcr(frame))

    audio_st, sr, _, _, _, _ = es.AudioLoader(filename=filename)()
    # Ugly hack because we don't have a StereoResample:
    # split the channels, resample each, and join them again
    left, right = es.StereoDemuxer()(audio_st)
    resampler = es.Resample(inputSampleRate=sr,
                            outputSampleRate=sample_rate)
    left = resampler(left)
    right = resampler(right)
    audio_st = es.StereoMuxer()(left, right)
    audio_st = es.StereoTrimmer(startTime=segment_start,
                                endTime=segment_end)(audio_st)
    ebu_momentary, _, _, _ = es.LoudnessEBUR128(
        hopSize=1024 / sample_rate, startAtZero=True)(audio_st)
    pool.set('ebu_momentary', ebu_momentary)

    return pool