def _log_erb_cochleagram(x, fs, frame_seconds, hop, window_type):
    """Return the log10 ERB-band energies of ``x``, one row per frame.

    Each frame of ``int(frame_seconds * fs)`` samples is windowed,
    zero-padded to twice its length, and passed through Spectrum and
    ERBBands.
    """
    framesize = int(frame_seconds * fs)
    fft_size = 2 * framesize  # pad by one extra frame length
    spectrum = ess.Spectrum(size=fft_size)
    window = ess.Windowing(type=window_type, zeroPadding=fft_size - framesize)
    # Upper band edge: Nyquist, capped at 11 kHz.
    high_bound = fs / 2 if fs / 2 < 11000 else 11000
    erb_bands = ess.ERBBands(sampleRate=fs,
                             highFrequencyBound=high_bound,
                             inputSize=framesize + 1)
    # np.float was removed from NumPy; the builtin float is the same type.
    eps = np.finfo(float).eps  # keeps log10 finite when a band is zero
    rows = [
        np.log10(erb_bands(spectrum(window(frame))) + eps)
        for frame in ess.FrameGenerator(x, frameSize=framesize, hopSize=hop)
    ]
    return np.array(rows)


def MRCG(x, fs=44100, framesize1=0.02, framesize2=0.2, hopsize=0.01):
    """Compute multi-resolution cochleagram features and their deltas.

    Four cochleagrams are stacked side by side: log ERB-band energies with
    a short frame (``framesize1``), with a long frame (``framesize2``), and
    two versions of the short-frame cochleagram passed through ``get_avg``
    with arguments (5, 5) and (11, 11).

    :param x: audio signal accepted by ``ess.FrameGenerator``
    :param fs: sample rate in Hz
    :param framesize1: short frame length in seconds
    :param framesize2: long frame length in seconds
    :param hopsize: hop between frames in seconds (shared by both passes)
    :return: tuple ``(all_cochleas, d_all_cochleas, dd_all_cochleas)``;
        the last two are produced by ``Fdeltas`` and have the same
        orientation as ``all_cochleas``
    """
    hop = int(hopsize * fs)
    win_analysis = 'hann'

    cochlea1 = _log_erb_cochleagram(x, fs, framesize1, hop, win_analysis)
    cochlea2 = _log_erb_cochleagram(x, fs, framesize2, hop, win_analysis)

    # Smoothed versions of the short-frame cochleagram.
    cochlea3 = get_avg(cochlea1, 5, 5)
    cochlea4 = get_avg(cochlea1, 11, 11)

    all_cochleas = np.hstack((cochlea1, cochlea2, cochlea3, cochlea4))

    # Fdeltas is applied to the transposed matrix; results are transposed
    # back so all three outputs share one layout.
    d_all_cochleas = Fdeltas(all_cochleas.T)
    dd_all_cochleas = Fdeltas(Fdeltas(all_cochleas.T, 5), 5)

    return all_cochleas, d_all_cochleas.T, dd_all_cochleas.T
def _key_fnc(
    sample: NDArray[Float32],
    frequency_rate: int,
    windowfnc: Window,
    key_type: KeyFunction,
):
    """
    Compute the key value of ``sample`` for the requested key function.

    The result is used as the key for the [this.samples] map. For the
    spectral centroid, ``frequency_rate`` should be half of the samplerate.

    Raises ValueError when ``key_type`` is not one of the handled
    KeyFunction members.
    """

    def _mel_bands():
        # Shared by MELBANDS and MELBANDS_LOG; algorithms are built lazily
        # so nothing is constructed for an unknown key type.
        mfcc = estd.MFCC()
        spectrum = estd.Spectrum()
        window = estd.Windowing(type=windowfnc.value)
        return _get_melbands(sample, mfcc, spectrum, window)

    if key_type == KeyFunction.CENTROID:
        centroid = estd.Centroid(range=frequency_rate)
        spectrum = estd.Spectrum()
        window = estd.Windowing(type=windowfnc.value)
        return _get_centroid(sample, centroid, spectrum, window)

    if key_type == KeyFunction.MAX:
        spectrum = estd.Spectrum()
        window = estd.Windowing(type=windowfnc.value)
        return _get_max(sample, spectrum, window)

    if key_type == KeyFunction.MFCC:
        mfcc = estd.MFCC()
        spectrum = estd.Spectrum()
        window = estd.Windowing(type=windowfnc.value)
        return _get_mfcc(sample, mfcc, spectrum, window)

    if key_type == KeyFunction.MELBANDS:
        return _mel_bands()

    if key_type == KeyFunction.MELBANDS_LOG:
        log_operator = estd.UnaryOperator(type="log")
        return log_operator(_mel_bands())

    raise ValueError("Keyfunction is not defined!")
def extract(fname, outpath, fs=22050, fsize=1024, hsize=512):
    """Compute the MFCCs of an audio file.

    Args:
        fname: path of the audio file.
        outpath: output path for processed files.
            NOTE(review): currently unused -- nothing is written to disk.
        fs: sampling frequency (Hz) the file is loaded at.
        fsize: frame size in samples.
        hsize: hop size in samples between frames.

    Returns:
        numpy array with one row per frame, each holding the 20 MFCC
        coefficients of that frame.
    """
    audio = es.MonoLoader(filename=fname, sampleRate=fs)()

    window = es.Windowing(type='hann')
    spectrum = es.Spectrum(size=fsize)
    # The magnitude spectrum of an fsize-sample frame has fsize // 2 + 1
    # bins. The filterbank is told the real sample rate, and its upper edge
    # is kept at or below Nyquist.
    mfcc = es.MFCC(inputSize=fsize // 2 + 1,
                   numberCoefficients=20,
                   sampleRate=fs,
                   highFrequencyBound=min(11000, fs / 2))

    mfccs = []
    for frame in es.FrameGenerator(audio, frameSize=fsize, hopSize=hsize):
        _bands, coeffs = mfcc(spectrum(window(frame)))
        mfccs.append(coeffs)

    # Return the computed coefficients, not the MFCC algorithm object.
    return np.array(mfccs)
def vibFreq(pitchtrack, sp, hopsize):
    '''
    Estimate the frequency of the strongest peak in the spectrum of a
    pitch track (a potential vibrato rate).

    :param pitchtrack: 1-D numpy array of pitch values, one per hop
    :param sp: samplerate of wave audio
    :param hopsize: hop size in audio samples between pitch values
    :return: frequency of the first peak returned by PeakDetection
        (up to 3 peaks are detected, ordered by amplitude)
    :raises ValueError: if pitchtrack is empty
    :raises IndexError: if no peak is detected
    '''
    if pitchtrack.dtype != np.float32:
        pitchtrack = pitchtrack.astype(np.float32)

    ptlen = len(pitchtrack)
    if ptlen == 0:
        raise ValueError('pitchtrack must contain at least one value')

    # The pitch track has one value per hop, so its own sample rate is
    # the audio rate divided by the hop size.
    sampleRate = sp / hopsize

    # Next power of two >= ptlen, computed with integers so exact powers
    # of two are never rounded up by floating-point log error. Kept at
    # 2 or more so the FFT size is always even.
    fftSize = max(2, 1 << (ptlen - 1).bit_length())

    pitchtrackPad = pitchtrack
    if ptlen < fftSize:
        pitchtrackPad = np.append(
            pitchtrack, np.zeros(fftSize - ptlen, dtype=np.float32))

    S = ess.Spectrum(size=fftSize)(pitchtrackPad)
    locs, amps = ess.PeakDetection(maxPeaks=3, orderBy='amplitude')(S)

    # NOTE(review): locs are normalised peak positions; the scale factor
    # (fftSize / 2 + 1) is kept from the original -- confirm against the
    # PeakDetection `range` semantics whether it should be fftSize / 2.
    freqs = locs * (fftSize / 2 + 1) * sampleRate / fftSize
    return freqs[0]
def segment(audio, hopSize, frameSize, rms_onset_threshold,
            mel_onset_threshold, flux_onset_threshold, onset_threshold):
    """Find the onset frame and the start-of-sustain frame of a sound.

    An onset is the first frame where the flux or mel-flux onset function
    exceeds its threshold, the RMS onset function exceeds
    ``rms_onset_threshold`` and the loudness exceeds ``onset_threshold``.
    The sustain starts at the first later-or-equal frame where the pitch
    confidence is above 0.5 and the RMS / mel-flux onset functions have
    dropped below 5 % / 30 % of their values at the onset.

    :param audio: mono audio signal
    :param hopSize: hop size in samples
    :param frameSize: frame size in samples
    :return: ``(onset, sustain)`` frame indices; either is ``None`` when
        the corresponding event was not found
    """
    o_mel = estd.OnsetDetection(method='melflux')
    o_rms = estd.OnsetDetection(method='rms')
    o_flux = estd.OnsetDetection(method='flux')
    fft = estd.FFT()
    c2p = estd.CartesianToPolar()
    w = estd.Windowing(type='hann')
    yin = estd.PitchYinFFT(frameSize=frameSize, minFrequency=40,
                           maxFrequency=2500, interpolate=True)
    spectrum = estd.Spectrum()
    loudness = estd.Loudness()

    # State machine: detection -> attack -> done.
    attack = False
    detection = True
    mel_onset_value = 0
    rms_onset_value = 0

    onset = None
    sustain = None

    frames = estd.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize)
    for index, frame in enumerate(frames):
        windowed = w(frame)  # window once, reuse for both analyses
        mag, phase = c2p(fft(windowed))
        _, conf = yin(spectrum(windowed))
        loud = loudness(frame)

        mel_onset = o_mel(mag, phase)
        rms_onset = o_rms(mag, phase)
        flux_onset = o_flux(mag, phase)

        # condition for onset
        if detection \
                and (flux_onset > flux_onset_threshold
                     or mel_onset > mel_onset_threshold) \
                and rms_onset > rms_onset_threshold \
                and loud > onset_threshold:
            onset = index
            attack = True
            detection = False
            mel_onset_value = mel_onset
            rms_onset_value = rms_onset

        # condition for beginning of sustain
        if attack and conf > 0.5 \
                and rms_onset < rms_onset_value * .05 \
                and mel_onset < mel_onset_value * .3:
            sustain = index
            # Both flags are now off, so no later frame can change the
            # result; stop instead of analysing the rest of the signal.
            break

    return onset, sustain
def compute_description(x, M=WINDOW_SIZE, N=FFT_SIZE, H=HOP_SIZE, fs=SR, window_type=WINDOW_TYPE):
    '''
    Extract frame-wise descriptors from an audio signal.

    Per frame (frame size M, hop H, starting at sample zero):
        HFC       high-frequency content of the magnitude spectrum
        CENTROID  SpectralCentroidTime of the windowed frame
        ENERGY    Energy of the magnitude spectrum
    On the whole signal:
        F0, SALIENCE  outputs of PredominantPitchMelodia

    F0 and SALIENCE are truncated to the number of analysed frames.

    :return: (HFC, CENTROID, ENERGY, F0, SALIENCE) as essentia arrays
    '''
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type=window_type)
    hfc = ess.HFC(sampleRate=fs)
    spectralCentroid = ess.SpectralCentroidTime(sampleRate=fs)
    energy = ess.Energy()
    # Pass the sample rate here too, so the pitch extractor agrees with
    # the other algorithms when fs differs from its own default.
    pitch_extractor = ess.PredominantPitchMelodia(frameSize=M,
                                                  hopSize=H,
                                                  maxFrequency=1200,
                                                  sampleRate=fs)

    HFC = []
    CENTROID = []
    ENERGY = []

    for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True):
        wX = window(frame)   # windowed time-domain frame
        mX = spectrum(wX)    # magnitude spectrum

        HFC.append(hfc(mX))
        CENTROID.append(spectralCentroid(wX))  # computed in the time domain
        ENERGY.append(energy(mX))              # energy of the spectrum

    F0, SALIENCE = pitch_extractor(x)  # runs on the whole signal

    HFC = essentia.array(HFC)
    CENTROID = essentia.array(CENTROID)
    ENERGY = essentia.array(ENERGY)
    F0 = essentia.array(F0)
    SALIENCE = essentia.array(SALIENCE)

    # Align the pitch track with the frame-wise descriptors.
    n_frames = len(CENTROID)
    F0 = F0[:n_frames]
    SALIENCE = SALIENCE[:n_frames]

    return HFC, CENTROID, ENERGY, F0, SALIENCE
def extract_features(x, M=WINDOW_SIZE, N=FFT_SIZE, H=HOP_SIZE, fs=SR, window_type=WINDOW_TYPE):
    '''
    Return the power-law compressed magnitude spectrogram of x.

    x is cut into frames of M samples every H samples (starting at sample
    zero), each frame is windowed and transformed to a magnitude spectrum,
    and every magnitude is raised to the power 2/3. `fs` is accepted but
    not used.
    '''
    samples = essentia.array(x)
    to_spectrum = ess.Spectrum(size=N)
    apply_window = ess.Windowing(size=M, type=window_type)

    frames = ess.FrameGenerator(samples,
                                frameSize=M,
                                hopSize=H,
                                startFromZero=True)
    magnitudes = essentia.array(
        [to_spectrum(apply_window(frame)) for frame in frames])

    # power law compression
    return np.power(magnitudes, 2. / 3.)
def get_f0(audio, minf0=20, maxf0=22050, cf=0.9, ws=2048, hs=256):
    '''
    Frame-wise pitch estimation with PitchYinFFT.

    Args:
        audio (array): audio signal (output from MonoLoader)
        minf0 (int): minimum allowed frequency
        maxf0 (int): maximum allowed frequency
        cf (float): confidence threshold (0 - 1)
        ws (int): window size
        hs (int): hop size
    Returns:
        f0 (array): one pitch value per frame; frames whose pitch
        confidence is below cf are set to 0
    '''
    # Hann window; each frame is zero-padded by another ws samples.
    window = es.Windowing(type='hann', zeroPadding=ws)
    to_spectrum = es.Spectrum()
    pitch_tracker = es.PitchYinFFT(minFrequency=minf0,
                                   maxFrequency=maxf0,
                                   frameSize=ws)

    estimates = [
        pitch_tracker(to_spectrum(window(frame)))
        for frame in es.FrameGenerator(audio, frameSize=ws, hopSize=hs)
    ]
    f0 = np.array([pitch for pitch, _ in estimates])
    conf = np.array([confidence for _, confidence in estimates])

    # zero out unreliable estimates
    f0[conf < cf] = 0
    return f0
def getHPCPEssentia(XAudio, Fs, winSize, hopSize, squareRoot=False, NChromaBins=36, NHarmonics = 0):
    """
    Wrap around the essentia library to compute HPCP features
    :param XAudio: A flat array of raw audio samples
    :param Fs: Sample rate
    :param winSize: Window size of each STFT window
    :param hopSize: Hop size between STFT windows
    :param squareRoot: Do square root compression?
    :param NChromaBins: How many chroma bins (default 36)
    :param NHarmonics: Value passed to HPCP's `harmonics` parameter
    :returns H: An (NChromaBins x NWindows) matrix of all \
        chroma windows
    """
    import essentia.standard as ess
    from essentia import array

    spectrum = ess.Spectrum()
    window = ess.Windowing(size=winSize, type='hann')
    # Fs used to be ignored, so peak frequencies were always computed as
    # if the audio were sampled at the Essentia default rate. The peak
    # search ceiling keeps the library default (5000 Hz) but is clamped to
    # Nyquist so low sample rates stay valid.
    spectralPeaks = ess.SpectralPeaks(sampleRate=Fs,
                                      maxFrequency=min(5000.0, Fs / 2.0))
    hpcp = ess.HPCP(size=NChromaBins, harmonics=NHarmonics, sampleRate=Fs)

    H = []
    for frame in ess.FrameGenerator(array(XAudio), frameSize=winSize,
                                    hopSize=hopSize, startFromZero=True):
        S = spectrum(window(frame))
        freqs, mags = spectralPeaks(S)
        H.append(hpcp(freqs, mags))

    # One column per analysis window.
    H = np.array(H).T
    if squareRoot:
        H = sqrtCompress(H)
    return H
def calc_chromagram(self):
    """Compute a 12-bin HPCP chromagram of ``self.audio``.

    Fills ``self.chromagram`` with one HPCP vector per frame and
    ``self.timeAxSec`` with ``frame_index * hop_size / sample_rate``.
    """
    self.chromagram = []
    collect = self.chromagram.append

    chroma_algo = es.HPCP(
        size=12,  # we will need higher resolution for Key estimation
        referenceFrequency=440,  # assumed tuning frequency in Hz
        bandPreset=False,
        weightType='cosine',
        nonLinear=False,
        windowSize=1.,
        sampleRate=self.sample_rate)
    to_spectrum = es.Spectrum(size=self.fft_size)
    find_peaks = es.SpectralPeaks(sampleRate=self.sample_rate)

    frames = es.FrameGenerator(self.audio,
                               frameSize=self.frame_size,
                               hopSize=self.hop_size,
                               startFromZero=True)
    for raw_frame in frames:
        # Windowing is done by multiplying with the stored window.
        windowed = array(raw_frame * self.window)
        peak_freqs, peak_mags = find_peaks(to_spectrum(windowed))
        collect(chroma_algo(peak_freqs, peak_mags))

    self.chromagram = array(self.chromagram)
    frame_indices = np.arange(len(self.chromagram))
    self.timeAxSec = frame_indices * self.hop_size / float(self.sample_rate)
def extract_features(x, M=Config.WINDOW_SIZE, N=Config.FFT_SIZE, H=Config.HOP_SIZE, fs=Config.FS,
                     window_type=Config.WINDOW_TYPE):
    '''
    Extract a power-law compressed, band-limited spectrogram from an
    audio signal.
    -----------------------
    Input: Samples, window size (int), FFT size (int), Hop size (int),
    Sampling rate (unused), Window type (e.g. Hanning)
    Output: Spectrogram, one row per frame, keeping the lowest
    N / 4 + 1 frequency bins
    -----------------------
    '''
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type=window_type)

    SP = []
    for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True):
        wX = window(frame)   # window frame
        mX = spectrum(wX)    # magnitude spectrum
        SP.append(mX)

    SP = essentia.array(SP)
    SP = np.power(SP, 2. / 3.)  # power law compression

    # Cut the upper part of the spectrum. The cut follows the FFT size
    # actually used (N) rather than the global Config.FFT_SIZE, so a
    # caller-supplied N is cut consistently.
    SP = SP[:, :int(N / 4 + 1)]
    return SP
def get_beat_chunks(filename, bpm_restrict=None):
    """Compute HPCP-based TIVs per beat, per frame and for the whole file.

    The audio is split into equal beat-length segments derived from
    ``get_tempo(filename)``. For each segment an HPCP is computed from the
    zero-padded segment as a whole and from each of its frames.

    :param filename: path of the audio file
    :param bpm_restrict: if given, the detected tempo must equal it
    :raises ValueError: if ``bpm_restrict`` is given and differs from the
        detected tempo
    :return: ``(per-beat TIVCollection, whole-file TIV,
        per-frame TIVCollection)``
    """
    audio = std.MonoLoader(filename=filename)()
    hpcp = std.HPCP()
    spectrum = std.Spectrum()
    speaks = std.SpectralPeaks()
    large_speaks = std.SpectralPeaks(maxPeaks=2000)
    sr = 44100  # presumably MonoLoader's default output rate -- confirm

    bpm = get_tempo(filename)
    if bpm_restrict is not None and bpm_restrict != bpm:
        raise ValueError(
            "detected tempo %s does not match bpm_restrict %s"
            % (bpm, bpm_restrict))

    def _chroma(signal, peak_picker):
        # spectrum -> spectral peaks -> HPCP
        freq, mag = peak_picker(spectrum(signal))
        return hpcp(freq, mag)

    sec_beat = (60 / bpm)
    beats = np.arange(0, len(audio) / sr, sec_beat)
    beats = np.append(beats, len(audio) / sr)  # close the last segment

    tivs = []
    tivs_framewise = []
    for start, end in zip(beats[:-1], beats[1:]):
        segmented_audio = audio[int(start * sr):int(end * sr)]
        for sec in std.FrameGenerator(segmented_audio):
            tivs_framewise.append(_chroma(sec, speaks))
        tivs.append(_chroma(zeropad_next_power_2(segmented_audio), speaks))

    # Whole-file chroma uses the peak picker that allows more peaks.
    chroma_whole = _chroma(zeropad_next_power_2(audio), large_speaks)

    return (mt.TIVCollection.from_pcp(np.array(tivs).T),
            mt.TIV.from_pcp(chroma_whole),
            mt.TIVCollection.from_pcp(np.array(tivs_framewise).T))
def getMBE(audio):
    '''
    Mel band energy feature.

    Uses the module-level fs, highFrequencyBound, framesize and hopsize.

    :param audio: audio signal
    :return: array with one row of mel band values per frame
    '''
    # MFCC with its default number of bands; only the band output is used.
    mfcc_algo = ess.MFCC(sampleRate=fs,
                         highFrequencyBound=highFrequencyBound,
                         inputSize=framesize + 1)
    fft_size = 2 * framesize  # zero-pad every frame to twice its length
    to_spectrum = ess.Spectrum(size=fft_size)
    apply_window = ess.Windowing(type='hann',
                                 zeroPadding=fft_size - framesize)

    bands_per_frame = [
        mfcc_algo(to_spectrum(apply_window(frame)))[0]
        for frame in ess.FrameGenerator(audio,
                                        frameSize=framesize,
                                        hopSize=hopsize)
    ]
    return np.array(bands_per_frame)
def get_constantq(frames, sample_rate=16000, num_bands=64):
    """Compute the magnitude constant-Q transform of each frame.

    :param frames: sequence of equal-length audio frames; the length of
        the first frame is taken as the frame size
    :param sample_rate: sample rate in Hz
    :param num_bands: number of constant-Q bins
    :return: array with one column per frame (the per-frame results are
        stacked and transposed)
    """
    max_freq = 8000
    min_freq = 125
    num_octaves = np.log2(max_freq / min_freq)
    bins_per_octave = int(np.ceil(num_bands / num_octaves))

    frame_size = len(frames[0])

    # Frames shorter than a band-count dependent target length are
    # zero-padded up to that length.
    target_size = {16: 512, 32: 2048}.get(num_bands, 1024)
    padding_size = max(0, target_size - frame_size)

    windowing = es.Windowing(type='hann',
                             size=frame_size,
                             zeroPadding=padding_size)
    constantq_estimator = es.ConstantQ(binsPerOctave=bins_per_octave,
                                       minFrequency=min_freq,
                                       numberBins=num_bands,
                                       sampleRate=sample_rate)

    const_q_spectra = [
        np.abs(constantq_estimator(windowing(frame))) for frame in frames
    ]
    return np.array(const_q_spectra).T
def extract_features(x, M=WINDOW_SIZE, N=FFT_SIZE, H=HOP_SIZE, fs=SR, window_type=WINDOW_TYPE):
    '''
    Extract magnitude spectra from the input vector and apply power-law
    compression.

    x is cut into frames of M samples every H samples (starting at sample
    zero); each frame is windowed with `window_type` and transformed to a
    magnitude spectrum, then raised to the power 2/3. `fs` is accepted
    but not used.

    :return: array with one compressed magnitude spectrum per frame
    '''
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    # Honour the window_type argument; the module-level WINDOW_TYPE was
    # used here before, so the parameter had no effect.
    window = ess.Windowing(size=M, type=window_type)

    SP = []
    for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True):
        wX = window(frame)   # window frame
        mX = spectrum(wX)    # magnitude spectrum
        SP.append(mX)

    SP = essentia.array(SP)
    SP = np.power(SP, 2. / 3.)  # power law compression
    return SP
def file_to_hpcp(loop):
    """Return the mean HPCP of an audio signal, scaled to a maximum of 1.

    The signal is analysed in frames of 1024 samples with a hop of 512,
    using a Blackman-Harris window and up to 20 spectral peaks between
    20 Hz and 8 kHz per frame.

    :param loop: audio samples
    :return: mean HPCP vector divided by its largest value; when that
        value is zero (e.g. no spectral peaks found) the vector is
        returned unscaled instead of producing NaNs
    """
    loop = e.array(loop)

    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    spectral_peaks = es.SpectralPeaks(orderBy='magnitude',
                                      magnitudeThreshold=0.001,
                                      maxPeaks=20,
                                      minFrequency=20,
                                      maxFrequency=8000)
    hpcp = es.HPCP(maxFrequency=8000)

    hpcp_group = []
    for frame in es.FrameGenerator(loop, frameSize=1024, hopSize=512):
        frequencies, magnitudes = spectral_peaks(spectrum(windowing(frame)))
        hpcp_group.append(hpcp(frequencies, magnitudes))

    mean_hpcp = np.mean(np.array(hpcp_group).T, axis=1)

    # normalize to 1, guarding against division by zero
    peak = mean_hpcp.max()
    if peak > 0:
        mean_hpcp = mean_hpcp / peak
    return mean_hpcp
def _extract_pitch_contours(self, audio):
    """Track pitch contours in ``audio``.

    Each frame goes through windowing, spectrum, spectral peaks, pitch
    salience function and salience-function peaks; the per-frame peaks
    are then passed to PitchContours.

    Returns ``(contours_bins, contours_start_times, contour_saliences,
    duration)`` -- note this differs from the order in which
    PitchContours produces them.
    """
    # Default window with zero padding to four times the frame size.
    windowing = estd.Windowing(zeroPadding=3 * self.frame_size)
    to_spectrum = estd.Spectrum(size=self.frame_size * 4)
    find_peaks = estd.SpectralPeaks(
        minFrequency=self.min_frequency,
        maxFrequency=self.max_frequency,
        magnitudeThreshold=self.magnitude_threshold,
        sampleRate=self.sample_rate,
        orderBy='magnitude')

    # Reference frequency is left at the PitchSalienceFunction default.
    salience_function = estd.PitchSalienceFunction(
        binResolution=self.bin_resolution)
    salience_peaks = estd.PitchSalienceFunctionPeaks(
        binResolution=self.bin_resolution,
        minFrequency=self.min_frequency,
        maxFrequency=self.max_frequency)
    track_contours = estd.PitchContours(
        hopSize=self.hop_size,
        binResolution=self.bin_resolution,
        peakDistributionThreshold=self.peak_distribution_threshold)

    bins_key = 'allframes_salience_peaks_bins'
    saliences_key = 'allframes_salience_peaks_contourSaliences'

    pool = Pool()
    frames = estd.FrameGenerator(audio,
                                 frameSize=self.frame_size,
                                 hopSize=self.hop_size)
    for raw_frame in frames:
        magnitudes = to_spectrum(windowing(raw_frame))
        peak_frequencies, peak_magnitudes = find_peaks(magnitudes)
        salience = salience_function(peak_frequencies, peak_magnitudes)
        peak_bins, peak_saliences = salience_peaks(salience)

        # Frames without any salience peak get a single zero entry.
        if not np.size(peak_bins):
            peak_bins = np.array([0])
        if not np.size(peak_saliences):
            peak_saliences = np.array([0])

        pool.add(bins_key, peak_bins)
        pool.add(saliences_key, peak_saliences)

    # post-processing: contour tracking
    all_bins = [frame_bins.tolist() for frame_bins in pool[bins_key]]
    all_saliences = [frame_sal.tolist() for frame_sal in pool[saliences_key]]
    contours_bins, contour_saliences, contours_start_times, duration = \
        track_contours(all_bins, all_saliences)

    return contours_bins, contours_start_times, contour_saliences, duration
def extractor(filename):
    """Compute HTK-style MFCCs of an audio file and show them as an image.

    The first coefficient (energy) is left out of the plot. Nothing is
    returned.
    """
    fs = 44100
    frameSize = 1102  # corresponds to htk default WINDOWSIZE = 250000.0
    hopSize = 441  # corresponds to htk default TARGETRATE = 100000.0
    fftSize = 2048

    # dynamic range expansion as done in HTK implementation
    audio = ess.MonoLoader(filename=filename, sampleRate=fs)() * 2**15

    window = ess.Windowing(
        type='hamming',  # corresponds to htk default USEHAMMING = T
        size=frameSize,
        zeroPadding=fftSize - frameSize,
        normalized=False,
        zeroPhase=False)
    to_spectrum = ess.Spectrum(size=fftSize)
    htk_mfcc = ess.MFCC(
        inputSize=fftSize // 2 + 1,
        type='magnitude',  # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',  # filter weights computed in the Hz domain
        highFrequencyBound=8000,  # corresponds to htk default
        lowFrequencyBound=0,  # corresponds to htk default
        numberBands=26,  # corresponds to htk default NUMCHANS = 26
        numberCoefficients=13,
        normalize='unit_max',  # htk filters have constant height = 1
        dctType=3,  # htk uses DCT type III
        logType='log',
        liftering=22)  # corresponds to htk default CEPLIFTER = 22

    # startFromZero = True, validFrameThresholdRatio = 1 : the way htk
    # computes windows
    frames = ess.FrameGenerator(audio,
                                frameSize=frameSize,
                                hopSize=hopSize,
                                startFromZero=True,
                                validFrameThresholdRatio=1)
    coefficients = [htk_mfcc(to_spectrum(window(frame)))[1]
                    for frame in frames]

    # one column per frame
    mfccs = essentia.array(coefficients).T

    # plot everything but the first (energy) coefficient
    plt.imshow(mfccs[1:, :], aspect='auto', interpolation='none')
    plt.show()
def essentiaObjectInit(self):
    """Create the Essentia MFCC, Spectrum and Windowing algorithms.

    Frames of ``self.frameSize`` samples are Hann-windowed and zero-padded
    to twice their length, so the spectrum fed to the MFCC has
    ``self.frameSize + 1`` bins.
    """
    frame_len = self.frameSize
    fft_len = 2 * frame_len  # padding 1 time framesize

    self.MFCC80 = ess.MFCC(sampleRate=self.fs,
                           highFrequencyBound=self.highFrequencyBound,
                           inputSize=frame_len + 1,
                           numberBands=self.numberBands)
    self.SPECTRUM = ess.Spectrum(size=fft_len)
    self.WINDOW = ess.Windowing(type='hann',
                                zeroPadding=fft_len - frame_len)
def __onset_candidate_detection__(self):
    """Compute a flux onset-detection value for every frame of the signal.

    ``self.signal`` is cut by FrameGenerator with arguments (1024, 512).
    Each frame is appended to ``self.frames`` and its onset value to
    ``self.onset_candidates``; ``self.frame_count`` is then set to the
    length of ``self.frames``.
    """
    to_spectrum = e.Spectrum()
    flux_detector = e.OnsetDetection(method="flux")

    for frame in e.FrameGenerator(self.signal, 1024, 512):
        self.frames.append(frame)
        magnitudes = to_spectrum(frame)
        # The detector is called with an all-zero phase of matching length.
        zero_phase = [0] * len(magnitudes)
        self.onset_candidates.append(flux_detector(magnitudes, zero_phase))

    self.frame_count = len(self.frames)
def __init__(self, frame_size, hop_size, window_type, feature, beats, sample_rate):
    """STFTFeature constructor.

    Stores the analysis settings and creates the Essentia Windowing
    (of the given ``window_type``) and Spectrum algorithms.
    """
    self.frame_size, self.hop_size = frame_size, hop_size
    self.window_type = window_type

    # Essentia algorithms
    self.w = ES.Windowing(type=window_type)
    self.spectrum = ES.Spectrum()

    self.feature = feature  # Essentia feature object
    self.beats, self.sample_rate = beats, sample_rate
def exp_env(audio, step):
    """Fit an exponential decay to the envelope of ``audio`` and plot it.

    The envelope is the maximum absolute value in consecutive chunks of
    ``step`` samples, measured from the position of the signal's maximum.
    Two matplotlib figures are drawn and shown: the fitted curve over the
    signal, and the spectrum of the signal minus the fitted curve.

    :param audio: 1-D numpy array of audio samples
    :param step: envelope chunk length in samples (converted to int)
    :return: ``stop1 - start``, where ``start`` is the envelope position(s)
        of the envelope maximum and ``stop1`` the position of the first
        envelope point after it below 10 % of the maximum (or the last
        envelope position when no point is below 10 %).
        NOTE(review): ``start`` is an array, so the result is an array.
    """

    def func(x, a, c):
        # exponential decay model a * exp(-c * x)
        return a * np.exp(-c * x)

    max_pos = np.argmax(audio)
    audio1 = audio  # keep the untouched signal for plotting / residual
    audio = audio[np.argmax(audio):]  # analyse from the maximum onwards
    step = int(step)
    audio = np.abs(audio)
    envelope = []
    env_x = []
    # Peak value and peak position of every chunk of `step` samples.
    for i in range(0, len(audio), step):
        env_x += [i + np.argmax(audio[i:i + step])]
        envelope += [np.max(audio[i:i + step])]
    env_x = np.array(env_x)
    envelope = np.array(envelope)
    try:
        popt, pcov = curve_fit(func, env_x, envelope, p0=(1, 1e-3))
    except RuntimeError:
        # Fit failed: fall back to a constant at the first envelope value.
        popt = [envelope[0], 0]
        pcov = []
    xx = np.arange(0, len(audio), 1)
    yy = func(xx, *popt)
    # Shift back to original sample positions and prepend zeros for the
    # part before the maximum.
    xx = xx + max_pos
    xx = np.append(np.arange(0, max_pos), xx)
    yy = np.append(np.zeros(max_pos), yy)
    plt.plot(xx, yy)
    plt.plot(xx, audio1, color='green')
    start = env_x[np.where(envelope == envelope.max())[0]]
    nf1 = envelope[0:5].mean()  # not used below
    # First envelope point after the maximum that is below 10 % of it.
    locs = np.where(envelope < 0.1 * envelope.max())[0]
    if len(locs) < 1:
        stop1 = env_x[-1]
    else:
        stop1 = env_x[locs[np.where(
            locs > np.where(envelope == envelope.max())[0])][0]]
    # Same for 1 % of the maximum; stop2 is not used below.
    locs = np.where(envelope < 0.01 * envelope.max())[0]
    if len(locs) < 1:
        stop2 = env_x[-1]
    else:
        stop2 = env_x[locs[np.where(
            locs > np.where(envelope == envelope.max())[0])][0]]
    plt.xlabel('Samples')
    plt.ylabel('Absolute Amplitude')
    plt.axis([0, 140000, 0, 0.20])  # fixed axis limits
    plt.figure()
    # Residual between the signal and the fitted envelope.
    en_mod = np.array(audio1 - yy, dtype='float32')
    # Drop one sample when the length is odd before taking the spectrum.
    if len(en_mod) % 2 > 0:
        en_mod = en_mod[:-1]
    spectrum = estd.Spectrum()(en_mod)
    plt.plot(spectrum)
    plt.show()
    return stop1 - start
def mfcc_htk(self, window_length=22050, nmfcc=13, n_mels=26, fmax=8000, lifterexp=22):
    """
    Get MFCCs 'the HTK way' with the help of Essentia
    https://github.com/MTG/essentia/blob/master/src/examples/tutorial/example_mfcc_the_htk_way.py
    Uses the parameters from that example except for the hop length
    (taken from self.hop_length) and a much longer default window.

    Parameters
    ----------
    window_length: int
        Length of the window to use for the STFT
    nmfcc: int
        Number of MFCC coefficients to compute
    n_mels: int
        Number of frequency bands to use
    fmax: int
        Maximum frequency
    lifterexp: int
        Liftering coefficient passed to the MFCC algorithm

    Returns
    -------
    ndarray(nmfcc, nframes)
        An array of all of the MFCC frames
    """
    # FFT length: next power of two at or above the window length.
    fftlen = int(2**(np.ceil(np.log(window_length)/np.log(2))))

    window = estd.Windowing(
        type='hamming',  # corresponds to htk default USEHAMMING = T
        size=window_length,
        zeroPadding=fftlen - window_length,
        normalized=False,
        zeroPhase=False)
    to_spectrum = estd.Spectrum(size=fftlen)
    htk_mfcc = estd.MFCC(
        inputSize=fftlen // 2 + 1,
        type='magnitude',  # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',  # filter weights computed in the Hz domain
        highFrequencyBound=fmax,  # 8000 is htk default
        lowFrequencyBound=0,  # corresponds to htk default
        numberBands=n_mels,  # htk default NUMCHANS = 26
        numberCoefficients=nmfcc,
        normalize='unit_max',  # htk filters have constant height = 1
        dctType=3,  # htk uses DCT type III
        logType='log',
        liftering=lifterexp)  # htk default CEPLIFTER = 22

    # startFromZero = True, validFrameThresholdRatio = 1 : the way htk
    # computes windows
    frames = estd.FrameGenerator(self.audio_vector,
                                 frameSize=window_length,
                                 hopSize=self.hop_length,
                                 startFromZero=True,
                                 validFrameThresholdRatio=1)
    coefficients = [htk_mfcc(to_spectrum(window(frame)))[1]
                    for frame in frames]
    return np.array(coefficients, dtype=np.float32).T
def audio_features(audio_win):
    """Return the audio features of one window as a flat list.

    The list holds the MFCC coefficients of the window's spectrum,
    followed by element 2 and elements 5..10 of the RhythmDescriptors
    output. A window with an odd number of samples loses its last sample.
    """
    if audio_win.shape[0] % 2 == 1:
        audio_win = audio_win[:-1]

    magnitudes = esst.Spectrum(size=audio_win.shape[0])(audio_win)
    mfcc_algo = esst.MFCC(inputSize=magnitudes.shape[0], sampleRate=SR)
    _bands, coefficients = mfcc_algo(magnitudes)

    rhythm = esst.RhythmDescriptors()(audio_win)

    features = coefficients.tolist()
    features.append(rhythm[2])
    features.extend(rhythm[5:11])
    return features
def melspectrogram(audio, sampleRate=44100, frameSize=2048, hopSize=1024, window='blackmanharris62',
                   zeroPadding=0, center=True, numberBands=[128, 96, 48, 32, 24, 16, 8],
                   lowFrequencyBound=0, highFrequencyBound=None, weighting='linear',
                   warpingFormula='slaneyMel', normalize='unit_tri'):
    """Compute mel spectrograms at several band resolutions.

    For every entry n of ``numberBands`` and every frame, two values are
    added to the returned pool: ``mel_<n>_db`` (lin2db with scale 2) and
    ``mel_<n>_log1+10kx`` (log10 of 1 + 10000 * x) of the power mel bands.

    ``highFrequencyBound`` defaults to half the sample rate. Frames start
    at sample zero only when ``center`` is False.

    :return: essentia.Pool with the descriptors above
    """
    if highFrequencyBound is None:
        highFrequencyBound = sampleRate / 2

    windowing = es.Windowing(type=window,
                             normalized=False,
                             zeroPadding=zeroPadding)
    spectrum = es.Spectrum()

    # One MelBands instance per requested resolution.
    spectrum_bins = (frameSize + zeroPadding) // 2 + 1
    melbands = {
        nBands: es.MelBands(numberBands=nBands,
                            sampleRate=sampleRate,
                            lowFrequencyBound=lowFrequencyBound,
                            highFrequencyBound=highFrequencyBound,
                            inputSize=spectrum_bins,
                            weighting=weighting,
                            normalize=normalize,
                            warpingFormula=warpingFormula,
                            type='power')
        for nBands in numberBands
    }

    norm10k = es.UnaryOperator(type='identity', shift=1, scale=10000)
    log10 = es.UnaryOperator(type='log10')
    amp2db = es.UnaryOperator(type='lin2db', scale=2)

    results = essentia.Pool()
    frames = es.FrameGenerator(audio,
                               frameSize=frameSize,
                               hopSize=hopSize,
                               startFromZero=not center)
    for frame in frames:
        spectrumFrame = spectrum(windowing(frame))
        for nBands in numberBands:
            melFrame = melbands[nBands](spectrumFrame)
            prefix = 'mel_' + str(nBands)
            results.add(prefix + '_db', amp2db(melFrame))
            results.add(prefix + '_log1+10kx', log10(norm10k(melFrame)))

    return results
def FeatureExtraction_Recording(recording, params):
    """Compute frame-based HPCP (chroma) features and their statistics.

    Loads ``recording.path`` at ``params.fs``, applies DC removal and an
    equal-loudness filter, and computes one HPCP vector per frame with
    ``recording.tonic`` as reference frequency.

    Sets on ``recording``:
        chroma_framebased -- array with one HPCP vector per frame
        chroma_mean       -- list with the mean of each HPCP bin
        chroma_std        -- list with the standard deviation of each bin

    :param recording: object with ``path`` and ``tonic`` attributes
    :param params: object with ``numbins``, ``fs``, ``windowSize`` (ms),
        ``hopSize`` (ms) and ``windowFunction`` attributes
    """
    numBins = params.numbins
    fs = params.fs

    # LOAD audio file and preprocess
    # NOTE(review): DCRemoval, EqualLoudness, SpectralPeaks and HPCP are
    # created without a sampleRate argument -- confirm this is intended
    # when params.fs differs from the Essentia default.
    Audio = ess.MonoLoader(filename=recording.path, sampleRate=fs)()
    Audio = ess.DCRemoval()(Audio)
    Audio = ess.EqualLoudness()(Audio)

    # Windowing parameters: msec -> samples, forced to be even
    windowSize = round(fs * params.windowSize / 1000)
    windowSize = int(windowSize / 2) * 2
    hopSize = round(fs * params.hopSize / 1000)
    hopSize = int(hopSize / 2) * 2

    tonic = float(recording.tonic)

    # The algorithms do not depend on the frame, so they are created once
    # instead of once per frame.
    window = ess.Windowing(size=windowSize, type=params.windowFunction)
    spectrum = ess.Spectrum(size=windowSize)
    spectral_peaks = ess.SpectralPeaks()
    hpcp_algo = ess.HPCP(normalized='unitSum',
                         referenceFrequency=tonic,
                         size=numBins,
                         windowSize=12 / numBins)
    eps = np.finfo(float).eps

    # FRAME-BASED spectral analysis
    hpcp = []
    for frame in ess.FrameGenerator(Audio, frameSize=windowSize,
                                    hopSize=hopSize, startFromZero=True):
        mX = spectrum(window(frame))
        mX[mX < eps] = eps  # floor the magnitudes at machine epsilon
        freq, mag = spectral_peaks(mX)
        hpcp.append(hpcp_algo(freq, mag))
    recording.chroma_framebased = np.array(hpcp)

    # FEATURE SUMMARIZATION: per-bin mean and standard deviation
    mean_chroma = []
    std_chroma = []
    for j in range(numBins):
        column = [frame_chroma[j]
                  for frame_chroma in recording.chroma_framebased]
        mean_chroma.append(np.mean(column))
        std_chroma.append(np.std(column))
    recording.chroma_mean = mean_chroma
    recording.chroma_std = std_chroma
def extractor(filename):
    """Return the HTK-style MFCCs of an audio file.

    :param filename: path of the audio file (loaded at 44100 Hz)
    :return: list with one vector of 13 MFCC coefficients per frame
    """
    fs = 44100
    frameSize = 1102  # corresponds to htk default WINDOWSIZE = 250000.0
    hopSize = 441  # corresponds to htk default TARGETRATE = 100000.0
    fftSize = 2048

    # dynamic range expansion as done in HTK implementation
    audio = ess.MonoLoader(filename=filename, sampleRate=fs)() * 2**15

    window_params = dict(
        type='hamming',  # corresponds to htk default USEHAMMING = T
        size=frameSize,
        zeroPadding=fftSize - frameSize,
        normalized=False,
        zeroPhase=False,
    )
    mfcc_params = dict(
        inputSize=fftSize // 2 + 1,
        type='magnitude',  # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',  # filter weights computed in the Hz domain
        highFrequencyBound=8000,  # corresponds to htk default
        lowFrequencyBound=0,  # corresponds to htk default
        numberBands=26,  # corresponds to htk default NUMCHANS = 26
        numberCoefficients=13,
        normalize='unit_max',  # htk filters have constant height = 1
        dctType=3,  # htk uses DCT type III
        logType='log',
        liftering=22,  # corresponds to htk default CEPLIFTER = 22
    )
    window = ess.Windowing(**window_params)
    to_spectrum = ess.Spectrum(size=fftSize)
    htk_mfcc = ess.MFCC(**mfcc_params)

    # startFromZero = True, validFrameThresholdRatio = 1 : the way htk
    # computes windows
    frames = ess.FrameGenerator(audio,
                                frameSize=frameSize,
                                hopSize=hopSize,
                                startFromZero=True,
                                validFrameThresholdRatio=1)
    return [htk_mfcc(to_spectrum(window(frame)))[1] for frame in frames]
def getFeature(audio, d=True, nbf=False):
    '''
    MFCC-based feature of the given audio.

    Uses the module-level fs, highFrequencyBound, framesize and hopsize.

    :param audio: audio signal
    :param d: if True, return MFCCs stacked with the results of applying
        Fdeltas once and twice (w=5)
    :param nbf: only used when d is False; if True, return MFCCs stacked
        with Fprev_sub outputs for shifts -5..-1 and 1..5, as float32
    :return: feature matrix with one row per frame
    '''
    # MFCC with its default number of bands.
    mfcc_algo = ess.MFCC(sampleRate=fs,
                         highFrequencyBound=highFrequencyBound,
                         inputSize=framesize + 1)
    fft_size = 2 * framesize  # zero-pad every frame to twice its length
    to_spectrum = ess.Spectrum(size=fft_size)
    apply_window = ess.Windowing(type='hann',
                                 zeroPadding=fft_size - framesize)

    coefficients = [
        mfcc_algo(to_spectrum(apply_window(frame)))[1]
        for frame in ess.FrameGenerator(audio,
                                        frameSize=framesize,
                                        hopSize=hopsize)
    ]

    if d:
        static = np.array(coefficients).transpose()
        delta = Fdeltas(static, w=5)
        delta_delta = Fdeltas(delta, w=5)
        return np.transpose(np.vstack((static, delta, delta_delta)))

    if not nbf:
        return np.array(coefficients)

    # Neighbouring-frame variant.
    static = np.array(coefficients).transpose()
    stacked = np.array(static, copy=True)
    for shift in range(1, 6):
        shifted_right = Fprev_sub(static, w=shift)
        shifted_left = Fprev_sub(static, w=-shift)
        stacked = np.vstack((stacked, shifted_left, shifted_right))
    return np.array(np.transpose(stacked), dtype='float32')
def analyze_misc(filename, segment_duration=20):
    """Compute frame-wise descriptors on the centre segment of a file.

    Replay gain is computed on the entire file; the segment of
    ``segment_duration`` seconds centred in time is then loaded with that
    gain applied and analysed in frames of 2048 samples (hop 1024).

    Pool contents: ``rms``, ``rms_spectrum``, ``hfc``,
    ``spectral_centroid``, ``zcr`` (one value per frame) and
    ``ebu_momentary`` (momentary loudness of the stereo segment).

    :param filename: path of the audio file
    :param segment_duration: segment length in seconds
    :raises ValueError: if the file is shorter than the segment
    :return: essentia.Pool with the descriptors above
    """
    # Rate used for all duration arithmetic and as the resampling target.
    # NOTE(review): assumes MonoLoader/EasyLoader output at 44100 Hz
    # (no sampleRate is passed to them) -- confirm.
    sample_rate = 44100

    audio = es.MonoLoader(filename=filename)()
    replaygain = es.ReplayGain()(audio)

    total_duration = len(audio) / sample_rate
    segment_start = (total_duration - segment_duration) / 2
    segment_end = segment_start + segment_duration
    if segment_start < 0 or segment_end > total_duration:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    loader = es.EasyLoader(filename=filename,
                           replayGain=replaygain,
                           startTime=segment_start,
                           endTime=segment_end)

    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    centroid = es.Centroid()
    zcr = es.ZeroCrossingRate()
    rms = es.RMS()
    hfc = es.HFC()

    pool = essentia.Pool()

    audio = loader()
    for frame in es.FrameGenerator(audio, frameSize=2048, hopSize=1024):
        frame_spectrum = spectrum(windowing(frame))
        pool.add('rms', rms(frame))
        pool.add('rms_spectrum', rms(frame_spectrum))
        pool.add('hfc', hfc(frame_spectrum))
        pool.add('spectral_centroid', centroid(frame_spectrum))
        pool.add('zcr', zcr(frame))

    audio_st, sr, _, _, _, _ = es.AudioLoader(filename=filename)()
    # Ugly hack because we don't have a StereoResample: resample each
    # channel separately and mux them back together.
    left, right = es.StereoDemuxer()(audio_st)
    resampler = es.Resample(inputSampleRate=sr, outputSampleRate=sample_rate)
    left = resampler(left)
    right = resampler(right)
    audio_st = es.StereoMuxer()(left, right)
    audio_st = es.StereoTrimmer(startTime=segment_start,
                                endTime=segment_end)(audio_st)
    # Same hop as the frame analysis above, expressed in seconds.
    ebu_momentary, _, _, _ = es.LoudnessEBUR128(hopSize=1024 / sample_rate,
                                                startAtZero=True)(audio_st)
    pool.set('ebu_momentary', ebu_momentary)

    return pool
def extractor(filename):
    """Plot a mel spectrum smoothed by truncating the MFCC representation.

    For every frame the MFCCs are computed, transformed back with an IDCT
    and exponentiated (undoing the log taken in the MFCC computation).
    The result is shown as an image; nothing is returned.
    """
    frameSize = 1024
    hopSize = 512
    fs = 44100

    # make sure these are same for MFCC and IDCT computation
    NUM_BANDS = 26
    DCT_TYPE = 2
    LIFTERING = 0
    NUM_MFCCs = 13

    audio = ess.MonoLoader(filename=filename, sampleRate=fs)()
    window = ess.Windowing(type='hamming', normalized=False)
    to_spectrum = ess.Spectrum()
    mfcc = ess.MFCC(
        numberBands=NUM_BANDS,
        # first N mfcc: the less, the more lossy (blurry) the smoothed
        # mel spectrum will be
        numberCoefficients=NUM_MFCCs,
        weighting='linear',  # filter weights in the Hz domain (optional)
        normalize='unit_max',  # filters have constant height = 1 (optional)
        dctType=DCT_TYPE,
        logType='log',
        liftering=LIFTERING)
    idct = ess.IDCT(inputSize=NUM_MFCCs,
                    outputSize=NUM_BANDS,
                    dctType=DCT_TYPE,
                    liftering=LIFTERING)

    def _smoothed_bands(frame):
        _melbands, coeffs = mfcc(to_spectrum(window(frame)))
        # inverse the log taken in MFCC computation
        return np.exp(idct(coeffs))

    frames = ess.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize)
    smoothed = [_smoothed_bands(frame) for frame in frames]

    # one column per frame
    all_melbands_smoothed = essentia.array(smoothed).T

    plt.imshow(all_melbands_smoothed, aspect='auto', interpolation='none')
    plt.show()