def get_sines_per_frame(audio, sr=44100, onlyfrecuencies=False, nsines=20):
    """
    Perform a framewise sinusoidal model analysis of an audio signal.

    :param audio: Audio, either mono or stereo. Stereo is downmixed to mono.
    :param sr: Sample rate of the audio.
    :return: N x 2 x nsines array. N is the number of resulting frames; the
        two inner rows are the frequencies and magnitudes respectively.
    """
    if audio.ndim > 1:
        audio = std.MonoMixer()(audio, audio.shape[1])
    len_arrays = 0
    for i, _ in enumerate(
            std.FrameGenerator(audio, frameSize=4096, hopSize=2048)):
        len_arrays = i
    fft_algo = std.FFT()
    sine_anal = std.SineModelAnal(maxnSines=nsines,
                                  orderBy='frequency',
                                  minFrequency=1)
    sines = np.zeros([len_arrays + 1, 2, nsines], dtype=np.float32) + eps
    for i, frame in enumerate(
            std.FrameGenerator(audio, frameSize=4096, hopSize=2048)):
        fft = fft_algo(frame)
        freqs, mags, _ = sine_anal(fft)
        sorting_indexes = np.argsort(freqs)
        freqs = freqs[sorting_indexes]
        mags = mags[sorting_indexes]
        sines[i, :] = [freqs, mags]
    if onlyfrecuencies:
        return sines[:, 0, :]
    else:
        return sines[:, 0, :], sines[:, 1, :]
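# A minimal usage sketch for get_sines_per_frame. It assumes `std` is
# essentia.standard, `np` is numpy, and that `eps` is defined at module level
# (e.g. eps = np.finfo(np.float32).eps); 'loop.wav' is a hypothetical path.
def _demo_get_sines_per_frame():
    audio = std.MonoLoader(filename='loop.wav', sampleRate=44100)()
    freqs, mags = get_sines_per_frame(audio, sr=44100, nsines=20)
    # both arrays have shape (n_frames, 20): one row of sine tracks per frame
    print(freqs.shape, mags.shape)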
def MRCG(x, fs=44100, framesize1=0.02, framesize2=0.2, hopsize=0.01):
    hopsize = int(hopsize * fs)

    # spectrogram init
    winAnalysis = 'hann'

    # ---- cochleagram 1
    framesize = int(framesize1 * fs)
    N = 2 * framesize  # pad by one framesize
    SPECTRUM = ess.Spectrum(size=N)
    WINDOW = ess.Windowing(type=winAnalysis, zeroPadding=N - framesize)
    highFrequencyBound = fs / 2 if fs / 2 < 11000 else 11000
    ERBBANDS = ess.ERBBands(sampleRate=fs,
                            highFrequencyBound=highFrequencyBound,
                            inputSize=framesize + 1)
    cochlea1 = []
    for frame in ess.FrameGenerator(x, frameSize=framesize, hopSize=hopsize):
        frame = WINDOW(frame)
        mXFrame = SPECTRUM(frame)
        # np.float was removed in recent numpy; use the builtin float
        erbFrame = np.log10(ERBBANDS(mXFrame) + np.finfo(float).eps)
        cochlea1.append(erbFrame)
    cochlea1 = np.array(cochlea1)

    # ---- cochleagram 2
    framesize = int(framesize2 * fs)
    N = 2 * framesize  # pad by one framesize
    SPECTRUM = ess.Spectrum(size=N)
    WINDOW = ess.Windowing(type=winAnalysis, zeroPadding=N - framesize)
    highFrequencyBound = fs / 2 if fs / 2 < 11000 else 11000
    ERBBANDS = ess.ERBBands(sampleRate=fs,
                            highFrequencyBound=highFrequencyBound,
                            inputSize=framesize + 1)
    cochlea2 = []
    for frame in ess.FrameGenerator(x, frameSize=framesize, hopSize=hopsize):
        frame = WINDOW(frame)
        mXFrame = SPECTRUM(frame)
        erbFrame = np.log10(ERBBANDS(mXFrame) + np.finfo(float).eps)
        cochlea2.append(erbFrame)
    cochlea2 = np.array(cochlea2)

    # ---- smoothed versions
    cochlea3 = get_avg(cochlea1, 5, 5)
    cochlea4 = get_avg(cochlea1, 11, 11)

    all_cochleas = np.hstack((cochlea1, cochlea2, cochlea3, cochlea4))

    # ---- deltas
    d_all_cochleas = Fdeltas(all_cochleas.T)
    dd_all_cochleas = Fdeltas(Fdeltas(all_cochleas.T, 5), 5)
    d_all_cochleas = d_all_cochleas.T
    dd_all_cochleas = dd_all_cochleas.T

    return all_cochleas, d_all_cochleas, dd_all_cochleas
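# A short sketch of how MRCG might be called, assuming `ess` is
# essentia.standard, the get_avg/Fdeltas helpers are available as in the
# original code, and the cochleagram matrices line up frame-wise so the
# hstack succeeds; 'voice.wav' is a hypothetical path.
def _demo_mrcg():
    x = ess.MonoLoader(filename='voice.wav', sampleRate=44100)()
    cochleas, d_cochleas, dd_cochleas = MRCG(x, fs=44100)
    # static features plus first- and second-order deltas
    print(cochleas.shape, d_cochleas.shape, dd_cochleas.shape)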
def main_danceability(args):
    """main_danceability

    Compute the danceability feature over the input waveform and plot it.
    """
    audio = loadaudio(args)

    # create the pool and the necessary algorithms
    pool = e.Pool()
    w = estd.Windowing()
    spec = estd.Spectrum()
    centroid = estd.SpectralCentroidTime()

    # compute the centroid for all frames in our audio and add it to the pool
    for frame in estd.FrameGenerator(audio, frameSize=1024, hopSize=512):
        c = centroid(spec(w(frame)))
        pool.add('lowlevel.centroid', c)

    # aggregate the results
    aggrpool = estd.PoolAggregator(defaultStats=['mean', 'var'])(pool)

    # create the pool and the necessary algorithms
    pool = e.Pool()
    w = estd.Windowing()
    # spec = estd.Spectrum()
    # centroid = estd.SpectralCentroidTime()
    danceability = estd.Danceability(maxTau=10000, minTau=300,
                                     sampleRate=args.samplerate)

    # compute the danceability for all frames in our audio and add it to the pool
    for frame in estd.FrameGenerator(audio, frameSize=10 * args.samplerate,
                                     hopSize=5 * args.samplerate):
        dreal, ddfa = danceability(w(frame))
        print(("d", dreal))  # , "frame", frame
        pool.add('rhythm.danceability', dreal)

    print((type(pool['rhythm.danceability'])))

    # aggregate the results
    # aggrpool = estd.PoolAggregator(defaultStats=['mean', 'var'])(pool)

    # write result to file
    # estd.YamlOutput(filename=args.file + '.features.yaml')(aggrpool)

    fig, gs = makefig(rows=2, cols=2)
    ax = fig.axes
    ax[0].plot(pool['rhythm.danceability'])
    plt.show()
def get_beat_chunks(filename, bpm_restrict=None):
    audio = std.MonoLoader(filename=filename)()
    hpcp = std.HPCP()
    spectrum = std.Spectrum()
    speaks = std.SpectralPeaks()
    large_speaks = std.SpectralPeaks(maxPeaks=2000)
    tivs = []
    sr = 44100
    bpm = get_tempo(filename)
    tivs_framewise = []
    if bpm_restrict is not None and bpm_restrict != bpm:
        raise ValueError("tempo %s does not match bpm_restrict" % bpm)
    sec_beat = 60 / bpm
    beats = np.arange(0, len(audio) / sr, sec_beat)
    beats = np.append(beats, len(audio) / sr)
    for i in range(1, len(beats)):
        segmented_audio = audio[int(beats[i - 1] * sr):int(beats[i] * sr)]
        cutter = std.FrameGenerator(segmented_audio)
        for sec in cutter:
            spec = spectrum(sec)
            freq, mag = speaks(spec)
            chroma = hpcp(freq, mag)
            tivs_framewise.append(chroma)
        np2_seg_audio = zeropad_next_power_2(segmented_audio)
        spec = spectrum(np2_seg_audio)
        freq, mag = speaks(spec)
        chroma = hpcp(freq, mag)
        tivs.append(chroma)
    # Calculate the TIV of the whole file
    np2_whole = zeropad_next_power_2(audio)
    spec = spectrum(np2_whole)
    freq, mag = large_speaks(spec)
    chroma_whole = hpcp(freq, mag)
    return (mt.TIVCollection.from_pcp(np.array(tivs).T),
            mt.TIV.from_pcp(chroma_whole),
            mt.TIVCollection.from_pcp(np.array(tivs_framewise).T))
def getMBE(audio):
    '''
    Mel band energy feature.
    :param audio:
    :return:
    '''
    winAnalysis = 'hann'

    # this MFCC is for pattern classification; numberBands is left at its
    # default value
    MFCC40 = ess.MFCC(sampleRate=fs,
                      highFrequencyBound=highFrequencyBound,
                      inputSize=framesize + 1)

    N = 2 * framesize  # pad by one framesize
    SPECTRUM = ess.Spectrum(size=N)
    WINDOW = ess.Windowing(type=winAnalysis, zeroPadding=N - framesize)

    mfccBands = []
    for frame in ess.FrameGenerator(audio, frameSize=framesize,
                                    hopSize=hopsize):
        frame = WINDOW(frame)
        mXFrame = SPECTRUM(frame)
        bands, mfccFrame = MFCC40(mXFrame)
        mfccBands.append(bands)
    feature = np.array(mfccBands)
    return feature
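# getMBE reads fs, framesize, hopsize and highFrequencyBound from module
# scope. A hypothetical setup (the values below are illustrative assumptions,
# not taken from the original source):
def _demo_getMBE():
    global fs, framesize, hopsize, highFrequencyBound
    fs = 44100
    framesize = 2048
    hopsize = 1024
    highFrequencyBound = fs / 2 if fs / 2 < 11000 else 11000
    audio = ess.MonoLoader(filename='take.wav', sampleRate=fs)()
    return getMBE(audio)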
def get_informative_frames(input_data, markers, parameters, frame_size,
                           hop_size):
    '''
    Takes as input audio data with its markers and parameters, and generates
    informative and noise frames according to the markers, frame and hop
    sizes. Returns the framed audio, the duration of the informative region,
    and the standard deviations of both the informative and non-informative
    parts.
    '''
    first_informative_sample = markers[0][1]
    last_informative_sample = markers[1][1]
    noise_signal = np.append(input_data[0:first_informative_sample],
                             input_data[last_informative_sample:])
    informative_signal = input_data[
        first_informative_sample:last_informative_sample]
    noise_rms = np.std(noise_signal)
    informative_rms = np.std(informative_signal)
    informative_duration = (last_informative_sample -
                            first_informative_sample) / parameters.framerate

    first_informative_frame = int(np.floor(markers[0][1] / hop_size))
    last_informative_frame = int(np.ceil(markers[1][1] / hop_size))

    informative_frames = []
    noise_frames = []
    for frame_idx, frame in enumerate(
            es.FrameGenerator(input_data,
                              frameSize=frame_size,
                              hopSize=hop_size,
                              startFromZero=True)):
        if first_informative_frame <= frame_idx <= last_informative_frame:
            informative_frames.append(frame)
        else:
            noise_frames.append(frame)

    return (np.array(informative_frames), np.array(noise_frames),
            informative_duration, informative_rms, noise_rms)
def energyThresholdAudio(soundfilesList):
    for sound in soundfilesList:
        RMS = esst.RMS()
        audioLoader = esst.MonoLoader(filename=sound)
        audio = audioLoader()
        start = 0
        end = 0
        thresh = 0.05
        rms_vals = []
        for frame in esst.FrameGenerator(audio, frameSize=2048, hopSize=1024,
                                         startFromZero=True):
            rms = RMS(frame)
            rms_vals.append(float(rms))
        rms_vals = np.array(rms_vals)
        higher = np.where(rms_vals >= thresh)[0]
        if len(higher) > 1:
            start = higher[0]
            end = higher[-1]
        else:
            continue
        newAudio = audio[start * 1024:end * 1024]
        writer = esst.MonoWriter(filename=sound, format="mp3")
        writer(newAudio)
        print(sound)
def computeEnergyHistogram(inputAudioFile, outputJsonFile, threshold,
                           histograms):
    M = 2048
    H = 1024
    fs = 44100
    energy = ess.Energy()

    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)

    E = []
    numFrames = 0
    for frame in frames:
        numFrames += 1
        E_frame = energy(frame)
        E.append(E_frame)
    E = np.array(E)
    E_norm = E / np.max(E)

    for i in range(len(threshold)):
        t = threshold[i]
        histograms[i] = np.append(histograms[i],
                                  [0] * (numFrames - len(histograms[i])))
        idx_threshold = np.where(E_norm > t)
        histograms[i][idx_threshold[0]] += 1
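# A sketch of how computeEnergyHistogram might be driven: one histogram per
# threshold, grown in place across several files. The paths and threshold
# values are hypothetical; note the function mutates `histograms` rather than
# returning a new structure.
def _demo_energy_histograms():
    thresholds = [0.1, 0.2, 0.5]
    histograms = [np.array([]) for _ in thresholds]
    for f in ['a.wav', 'b.wav']:
        computeEnergyHistogram(f, 'energy.json', thresholds, histograms)
    return histograms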
def extract_features(x, M=WINDOW_SIZE, N=FFT_SIZE, H=HOP_SIZE, fs=SR,
                     window_type=WINDOW_TYPE):
    '''
    Extract magnitude spectra from the input vector and apply power-law
    compression.
    '''
    # init functions and vectors
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type=window_type)
    SP = []

    # compute STFT
    for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H,
                                    startFromZero=True):
        wX = window(frame)  # window frame
        mX = spectrum(wX)  # compute magnitude spectrum
        SP.append(mX)

    SP = essentia.array(SP)
    SP = np.power(SP, 2. / 3.)  # power-law compression

    return SP
def get_onsets(self, _audio=[]):
    if len(_audio) > 0:
        audio = _audio
    else:
        audio = self.audio
    W = es.Windowing(type=self.winType)
    c2p = es.CartesianToPolar()
    fft = es.FFT()
    onsetDetection = es.OnsetDetection(method=self.onsetMethod,
                                       sampleRate=44100)
    onsets = es.Onsets(alpha=.2)
    pool = Pool()
    for frame in es.FrameGenerator(audio, frameSize=1024, hopSize=512):
        mag, phase = c2p(fft(W(frame)))
        # the method is already set at construction; reconfiguring here every
        # frame would reset the detector's internal state
        onsetFunction = onsetDetection(mag, phase)
        pool.add("onsetFunction", onsetFunction)
    DetectedOnsetsArray = onsets([pool["onsetFunction"]], [1])
    return DetectedOnsetsArray
def extract(fname, outpath, fs=22050, fsize=1024, hsize=512):
    """
    extract(fname, outpath, fs, fsize, hsize) computes the MFCCs of the
    audio file fname.

    Inputs:
        fname -- name of the audio file.
        outpath -- output path for processed files.
        fs -- sampling frequency (Hz).
        fsize -- size of each frame.
        hsize -- hop size between frames.
    Outputs:
        numpy array of MFCC coefficients, one row per frame.
    """
    # gate(fname)
    loader = es.MonoLoader(filename=fname, sampleRate=fs)
    w = es.Windowing(type='hann')
    spectrum = es.Spectrum()
    mfcc = es.MFCC(inputSize=fsize // 2 + 1, numberCoefficients=20)
    mfccs = []
    audio = loader()
    for frame in es.FrameGenerator(audio, frameSize=fsize, hopSize=hsize):
        mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
        mfccs.append(mfcc_coeffs)
    mfccs = np.array(mfccs)
    return mfccs
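# Usage sketch for extract (the paths are hypothetical):
def _demo_extract():
    coeffs = extract('song.wav', '/tmp/out', fs=22050, fsize=1024, hsize=512)
    print(coeffs.shape)  # (n_frames, 20)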
def segment(audio, hopSize, frameSize, rms_onset_threshold,
            mel_onset_threshold, flux_onset_threshold, onset_threshold):
    # init algorithms
    o_mel = estd.OnsetDetection(method='melflux')
    o_rms = estd.OnsetDetection(method='rms')
    o_hfc = estd.OnsetDetection(method='hfc')
    o_flux = estd.OnsetDetection(method='flux')
    fft = estd.FFT()
    c2p = estd.CartesianToPolar()
    pool = essentia.Pool()
    frame_generator = estd.FrameGenerator(audio, frameSize=frameSize,
                                          hopSize=hopSize)
    w = estd.Windowing(type='hann')
    yin = estd.PitchYinFFT(frameSize=frameSize, minFrequency=40,
                           maxFrequency=2500, interpolate=True)
    spectrum = estd.Spectrum()
    loudness = estd.Loudness()

    # control parameters
    attack = False
    detection = True
    mel_onset_value = 0
    rms_onset_value = 0

    # output variables
    onset = None
    sustain = None

    for index, frame in enumerate(frame_generator):
        mag, phase = c2p(fft(w(frame)))
        _, conf = yin(spectrum(w(frame)))
        loud = loudness(frame)
        mel_onset = o_mel(mag, phase)
        rms_onset = o_rms(mag, phase)
        hfc_onset = o_hfc(mag, phase)
        flux_onset = o_flux(mag, phase)
        pool.add('onsets_mel', mel_onset)
        pool.add('onsets_rms', rms_onset)
        pool.add('onsets_hfc', hfc_onset)
        pool.add('onsets_flux', flux_onset)
        pool.add('conf', conf)
        pool.add('loudness', loud)

        # condition for onset
        if detection and (flux_onset > flux_onset_threshold or
                          mel_onset > mel_onset_threshold) \
                and rms_onset > rms_onset_threshold \
                and loud > onset_threshold:
            onset = index
            attack = True
            detection = False
            mel_onset_value = mel_onset
            rms_onset_value = rms_onset

        # condition for beginning of sustain
        if attack and conf > 0.5 and rms_onset < rms_onset_value * .05 \
                and mel_onset < mel_onset_value * .3:
            attack = False
            sustain = index

    return onset, sustain
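# A sketch of calling segment on a single-note recording; the threshold
# values here are illustrative guesses, not values from the original source.
def _demo_segment():
    audio = estd.MonoLoader(filename='note.wav', sampleRate=44100)()
    onset, sustain = segment(audio, hopSize=512, frameSize=1024,
                             rms_onset_threshold=0.1,
                             mel_onset_threshold=50,
                             flux_onset_threshold=0.1,
                             onset_threshold=0.05)
    # frame indices of the detected onset and the start of the sustain part
    print(onset, sustain)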
def compute_description(x, M=WINDOW_SIZE, N=FFT_SIZE, H=HOP_SIZE, fs=SR,
                        window_type=WINDOW_TYPE):
    '''
    Extract features from an audio signal.
    Features:
        HFC
        SPECTRAL CENTROID
        SPECTRAL ENERGY
        F0
        PITCH CONFIDENCE
    loud_factor = energy * (spectral_centroid - F0), i.e. how many
    harmonics = how much the speaker is yelling
    '''
    # create essentia instances
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type=window_type)
    hfc = ess.HFC(sampleRate=fs)
    spectralCentroid = ess.SpectralCentroidTime(sampleRate=fs)
    energy = ess.Energy()
    pitch_extractor = ess.PredominantPitchMelodia(frameSize=M, hopSize=H,
                                                  maxFrequency=1200)

    # init vectors
    HFC = []
    CENTROID = []
    ENERGY = []

    # compute features for every stft frame
    for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H,
                                    startFromZero=True):
        wX = window(frame)  # window frame
        mX = spectrum(wX)  # compute magnitude spectrum

        frame_hfc = hfc(mX)
        HFC.append(frame_hfc)

        # spectral centroid computed on the time-domain frame
        frame_centroid = spectralCentroid(wX)
        CENTROID.append(frame_centroid)

        frame_energy = energy(mX)  # spectral energy
        ENERGY.append(frame_energy)

    F0, SALIENCE = pitch_extractor(x)  # estimate predominant pitch

    # convert into numpy arrays
    HFC = essentia.array(HFC)
    CENTROID = essentia.array(CENTROID)
    ENERGY = essentia.array(ENERGY)
    F0 = essentia.array(F0)
    SALIENCE = essentia.array(SALIENCE)
    F0 = F0[:len(CENTROID)]
    SALIENCE = SALIENCE[:len(CENTROID)]

    return HFC, CENTROID, ENERGY, F0, SALIENCE
def get_melspecs(audio_file: Path,
                 algorithms: dict) -> Optional[dict[str, np.ndarray]]:
    # loading file
    audio = ess.MonoLoader(filename=str(audio_file), sampleRate=SAMPLE_RATE)()

    # precompute melspecs
    melspecs_all = {}
    for algorithm_name in algorithms:
        parameters = algorithms[algorithm_name]
        melspec_extractor = getattr(ess, parameters['melspec-algorithm'])()
        melspecs = []
        for frame in ess.FrameGenerator(audio,
                                        frameSize=parameters['frame-size'],
                                        hopSize=parameters['hop-size']):
            melspecs.append(melspec_extractor(frame))
        melspecs = np.array(melspecs)

        # reshape melspecs into tensor batches and discard the remainder
        discard = melspecs.shape[0] % parameters['patch-size']
        if discard != 0:
            melspecs = melspecs[:-discard, :]
        melspecs = np.reshape(melspecs, [-1, parameters['patch-size'],
                                         parameters['number-bands']])
        batch = np.expand_dims(melspecs, 2)
        melspecs_all[algorithm_name] = batch
    return melspecs_all
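# get_melspecs expects one parameter dict per algorithm. A hypothetical
# configuration for TensorflowInputMusiCNN-style patches (the exact values
# below are assumptions, not taken from the original source; Path and the
# algorithm name must exist in essentia.standard):
def _demo_get_melspecs():
    algorithms = {
        'musicnn': {
            'melspec-algorithm': 'TensorflowInputMusiCNN',
            'frame-size': 512,
            'hop-size': 256,
            'patch-size': 187,
            'number-bands': 96,
        }
    }
    return get_melspecs(Path('track.mp3'), algorithms)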
def stft(self):
    # save the results in the stft pool
    self.mX = []
    for frame in es.FrameGenerator(self.audio, frameSize=self.frame_size,
                                   hopSize=self.hop_size, startFromZero=True):
        frame = frame * self.window
        X = fft(frame, self.fft_size)  # computing fft
        # take the first half of the spectrum and its magnitude
        absX = np.abs(X[:int(self.fft_size / 2)])
        # get rid of zeros before taking the log
        absX[absX < np.finfo(float).eps] = np.finfo(float).eps
        mX = 20 * np.log10(absX)
        if self.threshold:
            mX[mX < self.threshold] = -1000
        self.mX.append(mX)
    self.mX = array(self.mX)
    self.freqAxHz = float(self.sample_rate) * np.arange(len(
        self.mX[0])) / float(self.fft_size)
    self.freqAxMidi = pitch2midi(self.freqAxHz, quantizePitch=False)
    self.timeAxSec = np.arange(len(self.mX)) * self.hop_size / float(
        self.sample_rate)
def lowSNR_detector(audio: np.ndarray, frame_size=1024, hop_size=512,
                    nrg_th=0.1, ac_th=0.6, snr_th=5):
    if audio.ndim > 1 and audio.shape[1] > 1:
        # stack the channels one after the other into a single 1-D signal
        audio = np.reshape(audio, audio.shape[0] * audio.shape[1], order='F')
    audio = audio.astype("float32") / max(audio.astype("float32"))
    # esarr stores float32; the float16 cast only quantizes the samples
    audio = esarr(audio.astype("float16"))

    ac_arr = []
    nrg_arr = []
    sig_pwr = 0
    noise_pwr = 0
    sig_cnt = 0
    noise_cnt = 0
    for frame in estd.FrameGenerator(audio, frameSize=frame_size,
                                     hopSize=hop_size, startFromZero=True):
        ac = abs(autocorr(frame, mode="half"))
        nrg = sum(frame**2)
        ac = ac[0] / sum(ac) if sum(ac) > 0 else 0
        ac_arr.append(ac)
        nrg_arr.append(nrg)

    ac_arr = np.array(ac_arr) / max(ac_arr)
    nrg_arr = np.array(nrg_arr) / max(nrg_arr)

    for nrg, ac in zip(nrg_arr, ac_arr):
        if nrg < nrg_th:
            noise_pwr += nrg**2
            noise_cnt += 1
        else:
            if ac < ac_th:
                sig_pwr += nrg**2
                sig_cnt += 1
            else:
                noise_pwr += nrg**2
                noise_cnt += 1

    if noise_cnt == 0:
        snr = np.inf
    elif sig_cnt == 0:
        snr = 10 * np.log10(eps)
    else:
        sig_pwr /= sig_cnt
        noise_pwr /= noise_cnt
        snr = 10 * np.log10(sig_pwr / noise_pwr)

    # conf = 1 - abs(noise_cnt - sig_cnt) / (sig_cnt + noise_cnt)
    # if conf > 0.7 and snr < snr_th:
    #     return snr, conf, True
    # return snr, conf, False
    return snr, snr < snr_th
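# Usage sketch for lowSNR_detector, assuming `eps` and `autocorr` are defined
# at module level as in the original code; 'noisy.wav' is a hypothetical path.
def _demo_lowSNR_detector():
    audio = estd.MonoLoader(filename='noisy.wav', sampleRate=44100)()
    snr, is_low_snr = lowSNR_detector(audio, snr_th=5)
    print('estimated SNR: %.1f dB, below threshold: %s' % (snr, is_low_snr))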
def extract_predominant_vocal_melody(audio_filename, hopSize, frameSize,
                                     pYinInst, end_ts=None):
    '''
    Extract the predominant vocal pitch contour.
    As a workaround, intersect the extracted pitch with the vocal annotation.

    Parameters
    -----------------------
    end_ts: extract until this timestamp, disregard the rest of the audio

    Returns
    -------------------
    list of estimated pitch values in Hz; at non-vocal frames a value <= 0
    is returned
    '''
    if WITH_MELODIA:
        if WITH_MAKAM:  # use predominant melody extraction tailored to makam
            path_Alignment_duration = os.path.join(parentDir,
                                                   'AlignmentDuration')
            if path_Alignment_duration not in sys.path:
                sys.path.append(path_Alignment_duration)
            from src.align.FeatureExtractor import extractPredominantMelodyMakam
            estimatedPitch_andTs = extractPredominantMelodyMakam(
                audio_filename[:-4],
                frameSize,
                hopSize,
                jointAnalysis=False,
                musicbrainzid=None,
                preload=True)  # jointAnalysis=False, because no
        else:  # use melodia
            estimatedPitch_andTs = extractPredominantMelody(
                audio_filename, frameSize, hopSize)
    else:  # pYIN
        audio = ess.MonoLoader(filename=audio_filename, sampleRate=fs)()
        for frame in ess.FrameGenerator(audio, frameSize=frameSize,
                                        hopSize=hopSize):
            featureSet = pYinInst.process(frame)
        estimatedPitch = pYinInst.decodePitchTrack()  # pitch extraction
        ts = []  # generated timestamps
        for onset_frame_number, frame in enumerate(estimatedPitch):
            ts.append(frame_to_ts(onset_frame_number, float(hopSize / fs)))
        estimatedPitch_andTs = np.vstack((np.array(ts), estimatedPitch)).T

    if end_ts is not None:
        # keep only the frames up to end_ts
        idx_end_ts = np.searchsorted(estimatedPitch_andTs[:, 0], end_ts)
        estimatedPitch_andTs = estimatedPitch_andTs[:min(
            idx_end_ts + 1, estimatedPitch_andTs.shape[0]), :]

    if MonoNoteParameters.WITH_VOCAL_SEGMENTS:  # vocal segments given
        estimatedPitch_andTs = intersect_vocal_segments(
            audio_filename, estimatedPitch_andTs)

    return estimatedPitch_andTs[:, 1]
def getFeatSequence(inputFile, pulsePos):
    audio = ess.MonoLoader(filename=inputFile, sampleRate=params.Fs)()
    frameCounter = 0
    pool = es.Pool()
    pool.add('samples', audio)
    for frame in ess.FrameGenerator(audio, frameSize=params.frmSize,
                                    hopSize=params.hop):
        ts = params.hop / params.Fs * frameCounter + \
            params.frmSize / float(2 * params.Fs)
        zpFrame = np.hstack((frame, zz))
        mag = spec(window(zpFrame))
        mfccBands, mfccSeq = genmfcc(mag)
        pool.add('rms', rms(mag))
        pool.add('mfcc', mfccSeq)
        pool.add('time', ts)
        frameCounter += 1
    if pulsePos is not None:
        pulsePos = np.append(pulsePos, len(audio) / params.Fs)
        for tp in range(len(pulsePos) - 1):
            pool.add('pst', pulsePos[tp])
            pool.add('pet', pulsePos[tp + 1])
            temp1 = np.where(pool['time'] >= pulsePos[tp])[0]
            temp2 = np.where(pool['time'] < pulsePos[tp + 1])[0]
            binIndices = np.intersect1d(temp1, temp2)
            pool.add('pmfcc', np.mean(pool['mfcc'][binIndices, :], axis=0))
            pool.add('prms', np.mean(pool['rms'][binIndices]))
    else:
        pool.add('pst', 0.0)
        pool.add('pet', len(audio) / params.Fs)
        pool.add('pmfcc', np.mean(pool['mfcc'], axis=0))
        pool.add('prms', np.mean(pool['rms'], axis=0))
    return pool
def file_to_hpcp(loop):
    loop = e.array(loop)
    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    spectral_peaks = es.SpectralPeaks(orderBy='magnitude',
                                      magnitudeThreshold=0.001,
                                      maxPeaks=20,
                                      minFrequency=20,
                                      maxFrequency=8000)
    hpcp = es.HPCP(maxFrequency=8000)
    spec_group = []
    hpcp_group = []
    for frame in es.FrameGenerator(loop, frameSize=1024, hopSize=512):
        windowed = windowing(frame)
        fft = spectrum(windowed)  # magnitude spectrum
        frequencies, magnitudes = spectral_peaks(fft)
        final_hpcp = hpcp(frequencies, magnitudes)
        spec_group.append(fft)
        hpcp_group.append(final_hpcp)
    mean_hpcp = np.mean(np.array(hpcp_group).T, axis=1)
    # normalize to 1
    mean_hpcp = mean_hpcp / mean_hpcp.max()
    return mean_hpcp
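# file_to_hpcp reduces a loop to a single normalized HPCP vector (12 bins by
# default). A minimal sketch with a hypothetical file:
def _demo_file_to_hpcp():
    loop = es.MonoLoader(filename='loop.wav', sampleRate=44100)()
    mean_hpcp = file_to_hpcp(loop)
    print(mean_hpcp.shape)  # (12,), maximum normalized to 1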
def getFeature(audio):
    '''
    MFCCs of the given audio.
    :param audio:
    :return: feature matrix, one row per frame (mfcc + delta + delta-delta)
    '''
    mfcc = []
    for frame in ess.FrameGenerator(audio,
                                    frameSize=framesize_phoneticSimilarity,
                                    hopSize=hopsize_phoneticSimilarity):
        frame = WINDOW(frame)
        mXFrame = SPECTRUM(frame)
        bands, mfccFrame = MFCC(mXFrame)
        mfccFrame = mfccFrame[1:]  # drop the 0th coefficient
        mfcc.append(mfccFrame)

    mfcc = np.array(mfcc).transpose()
    dmfcc = Fdeltas(mfcc, w=5)
    ddmfcc = Fdeltas(dmfcc, w=5)
    feature = np.transpose(np.vstack((mfcc, dmfcc, ddmfcc)))
    return feature
def extract_features(x, M=WINDOW_SIZE, N=FFT_SIZE, H=HOP_SIZE, fs=SR,
                     window_type=WINDOW_TYPE):
    '''
    Extract magnitude spectra from the input vector, apply power-law
    compression and cut the upper spectrum.
    '''
    # init functions and vectors
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type=window_type)
    SP = []

    # compute STFT
    for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H,
                                    startFromZero=True):
        wX = window(frame)  # window frame
        mX = spectrum(wX)  # compute magnitude spectrum
        SP.append(mX)

    SP = essentia.array(SP)
    SP = np.power(SP, 2. / 3.)  # power-law compression
    # SP = SP[:, :int(FFT_SIZE / 2 + 1)]  # cut upper spectrum (above 4 kHz)

    return SP
def get_f0(audio, minf0=20, maxf0=22050, cf=0.9, ws=2048, hs=256):
    '''
    Args:
        audio (array): audio signal (output from MonoLoader)
        minf0 (int): minimum allowed frequency
        maxf0 (int): maximum allowed frequency
        cf (float): confidence threshold (0 - 1)
        ws (int): window size
        hs (int): hop size
    Returns:
        f0 (array): frame-wise pitch estimates, zeroed below the
            confidence threshold
    '''
    # instantiate Essentia functions
    w = es.Windowing(type='hann', zeroPadding=ws)
    spec = es.Spectrum()
    yin = es.PitchYinFFT(minFrequency=minf0, maxFrequency=maxf0, frameSize=ws)

    # empty lists for f0 and confidence
    f0 = []
    conf = []

    # iterate over frames
    for frame in es.FrameGenerator(audio, frameSize=ws, hopSize=hs):
        p, pc = yin(spec(w(frame)))
        f0.append(p)
        conf.append(pc)

    # convert lists to np.arrays
    f0 = np.array(f0)
    conf = np.array(conf)

    # return f0 over the given confidence only
    f0[conf < cf] = 0
    return f0
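# Usage sketch for get_f0: pitch values with confidence below cf are zeroed,
# so voiced regions can be selected afterwards ('melody.wav' is a
# hypothetical path).
def _demo_get_f0():
    audio = es.MonoLoader(filename='melody.wav', sampleRate=44100)()
    f0 = get_f0(audio, minf0=50, maxf0=1000, cf=0.9)
    voiced_ratio = np.mean(f0 > 0)
    print('voiced frames: %.0f%%' % (100 * voiced_ratio))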
def calc_chromagram(self):
    # save the results in the chromagram pool
    self.chromagram = []
    hpcp = es.HPCP(
        size=12,  # 12 bins; Key estimation would need higher resolution
        referenceFrequency=440,  # assume tuning frequency is A4 = 440 Hz
        bandPreset=False,
        weightType='cosine',
        nonLinear=False,
        windowSize=1.,
        sampleRate=self.sample_rate)
    spectrum = es.Spectrum(size=self.fft_size)
    spectral_peaks = es.SpectralPeaks(sampleRate=self.sample_rate)
    for frame in es.FrameGenerator(self.audio, frameSize=self.frame_size,
                                   hopSize=self.hop_size, startFromZero=True):
        frame = array(frame * self.window)
        freqs, mags = spectral_peaks(spectrum(frame))
        chroma = hpcp(freqs, mags)
        self.chromagram.append(chroma)
    self.chromagram = array(self.chromagram)
    self.timeAxSec = np.arange(len(
        self.chromagram)) * self.hop_size / float(self.sample_rate)
def getMFCCBands2D(audio, framesize, hopsize, nbf=False, nlen=10):
    """
    Mel bands feature.
    The output feature for each time stamp is a 2D matrix;
    it needs the array format float32.
    :param audio:
    :param nbf: bool, whether to stack neighboring frames
    :return:
    """
    mfcc = []
    for frame in ess.FrameGenerator(audio, frameSize=framesize,
                                    hopSize=hopsize):
        frame = WINDOW(frame)
        mXFrame = SPECTRUM(frame)
        bands, mfccFrame = MFCC(mXFrame)
        mfcc.append(bands)

    if nbf:
        mfcc = np.array(mfcc).transpose()
        mfcc_out = np.array(mfcc, copy=True)
        for ii in range(1, nlen + 1):
            mfcc_right_shift = Fprev_sub(mfcc, w=ii)
            mfcc_left_shift = Fprev_sub(mfcc, w=-ii)
            mfcc_out = np.vstack((mfcc_right_shift, mfcc_out,
                                  mfcc_left_shift))
        feature = mfcc_out.transpose()
    else:
        feature = mfcc

    # the mel bands features
    feature = np.array(feature, dtype='float32')

    return feature
def get_hpeaks_per_frame(audio, sr=44100, onlyfrecuencies=False, nsines=20):
    """
    Get harmonic peaks of an audio signal.

    :param audio: Audio, either mono or stereo. Stereo is downmixed to mono.
    :param sr: Sample rate of the audio.
    :return: N x 2 x nsines array. N is the number of resulting frames; the
        two inner rows are the frequencies and magnitudes respectively.
    """
    if audio.ndim > 1:
        audio = std.MonoMixer()(audio, audio.shape[1])
    fft_algo = std.FFT()
    pyin = std.PitchYin()
    hpeaks = std.HarmonicPeaks()
    sine_anal = std.SineModelAnal(maxnSines=nsines,
                                  orderBy='frequency',
                                  minFrequency=1)
    sines = []
    for i, frame in enumerate(
            std.FrameGenerator(audio, frameSize=4096, hopSize=2048)):
        pitch, _ = pyin(frame)
        fft = fft_algo(frame)
        freqs, mags, _ = sine_anal(fft)
        sorting_indexes = np.argsort(freqs)
        freqs = freqs[sorting_indexes]
        mags = mags[sorting_indexes]
        non_zero_freqs = np.where(freqs != 0)
        freqs = freqs[non_zero_freqs]
        mags = mags[non_zero_freqs]
        freqs, mags = hpeaks(freqs, mags, pitch)
        sines.append([freqs, mags])
    sines = np.array(sines)
    if onlyfrecuencies:
        return sines[:, 0, :]
    else:
        return sines[:, 0, :], sines[:, 1, :]
def extract_features(path):
    loader = essentia.standard.MonoLoader(filename=path)
    audio = loader()
    mfcc = MFCC(numberCoefficients=13)
    loudness = Loudness()
    # FFT() would return the complex FFT; here we just want the magnitude
    # spectrum
    spectrum = Spectrum()
    w = Windowing(type='hann')
    pool = essentia.Pool()

    for frame in ess.FrameGenerator(audio, frameSize=1024, hopSize=512,
                                    startFromZero=True):
        spec_frame = spectrum(w(frame))
        mfcc_bands, mfcc_coeffs = mfcc(spec_frame)
        average_loudness = loudness(spec_frame)
        pool.add('lowlevel.mfcc', mfcc_coeffs)
        pool.add('lowlevel.loudness', average_loudness)
        # pool.add('lowlevel.mfcc_bands', mfcc_bands)
        # pool.add('lowlevel.mfcc_bands_log', logNorm(mfcc_bands))

    # YamlOutput(filename='mfcc.sig', format='yaml', writeVersion=False)(pool)

    # compute the mean of the frames
    # aggrPool = PoolAggregator(defaultStats=['mean', 'stdev'])(pool)
    aggrPool = PoolAggregator(defaultStats=['mean'])(pool)

    # and output those results in a file
    YamlOutput(filename='features.json', format='json',
               writeVersion=False)(aggrPool)
    save_descriptors_as_strings()
def getHPCPEssentia(XAudio, Fs, winSize, hopSize, squareRoot=False,
                    NChromaBins=36, NHarmonics=0):
    """
    Wrap around the essentia library to compute HPCP features
    :param XAudio: A flat array of raw audio samples
    :param Fs: Sample rate
    :param winSize: Window size of each STFT window
    :param hopSize: Hop size between STFT windows
    :param squareRoot: Do square root compression?
    :param NChromaBins: How many chroma bins (default 36)
    :returns H: An (NChromaBins x NWindows) matrix of all \
        chroma windows
    """
    import essentia
    from essentia import Pool, array
    import essentia.standard as ess
    spectrum = ess.Spectrum()
    window = ess.Windowing(size=winSize, type='hann')
    spectralPeaks = ess.SpectralPeaks()
    hpcp = ess.HPCP(size=NChromaBins, harmonics=NHarmonics)
    H = []
    for frame in ess.FrameGenerator(array(XAudio), frameSize=winSize,
                                    hopSize=hopSize, startFromZero=True):
        S = spectrum(window(frame))
        freqs, mags = spectralPeaks(S)
        H.append(hpcp(freqs, mags))
    H = np.array(H)
    H = H.T
    if squareRoot:
        H = sqrtCompress(H)
    return H
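# Usage sketch for getHPCPEssentia, with a hypothetical file; squareRoot is
# left off so the sqrtCompress helper from the original code is not needed.
def _demo_getHPCPEssentia():
    import essentia.standard as ess
    XAudio = ess.MonoLoader(filename='song.wav', sampleRate=44100)()
    H = getHPCPEssentia(XAudio, 44100, winSize=4096, hopSize=2048,
                        NChromaBins=36)
    print(H.shape)  # (36, n_windows)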
def extract_features(x, M=Config.WINDOW_SIZE, N=Config.FFT_SIZE,
                     H=Config.HOP_SIZE, fs=Config.FS,
                     window_type=Config.WINDOW_TYPE):
    '''
    Function that extracts a spectrogram from an audio signal
    -----------------------
    Input: samples, window size (int), FFT size (int), hop size (int),
        sampling rate, window type (e.g. Hanning)
    Output: spectrogram
    -----------------------
    '''
    # init functions and vectors
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type=window_type)
    SP = []

    # compute STFT
    for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H,
                                    startFromZero=True):
        wX = window(frame)  # window frame
        mX = spectrum(wX)  # compute magnitude spectrum
        SP.append(mX)

    SP = essentia.array(SP)
    SP = np.power(SP, 2. / 3.)  # power-law compression
    # keep the lower quarter of the spectrum
    SP = SP[:, :int(Config.FFT_SIZE / 4 + 1)]
    return SP
def compute(self, *args):
    self.algo.reset()
    for frame in es.FrameGenerator(args[1], frameSize=frameSize,
                                   hopSize=hopSize, startFromZero=True):
        snr, _, _ = self.algo(frame)
    return esarr([snr])
def _build_mfcc(self):
    # relies on module-level `spectrum`, `w` and `mfcc` algorithm instances
    for frame in es.FrameGenerator(self.audio, frameSize=1024, hopSize=512,
                                   startFromZero=True):
        spec = spectrum(w(frame))
        _, mfcc_coeffs = mfcc(spec)
        self.pool.add('MFCC', mfcc_coeffs)