def feature_allframes(audio, beats, frame_indexer=None):
    """Compute scaled Mel-band difference features for beat-delimited frames.

    Frame ``k`` is the slice of ``audio`` between ``beats[k]`` and
    ``beats[k + 1]``; beat values are multiplied by 44100 to obtain sample
    offsets.  For every index ``i`` in ``frame_indexer`` the feature row is the
    concatenation of four blocks of ``NUMBER_BANDS`` values:

    * ``bands[i + 1] - bands[i]``
    * ``bands[i + 2] - bands[i]``
    * ``bands[i + 3] - bands[i]``
    * ``bands[i] - bands[i - 1]``

    Args:
        audio: 1-D sample array that can be sliced by sample offset.
        beats: Sequence of beat positions (scaled by 44100 to index ``audio``).
        frame_indexer: Indices of the frames to produce features for.  Each
            index ``i`` needs frames ``i - 1`` .. ``i + 3``, and frame ``k``
            needs ``beats[k + 1]``.  The default therefore covers
            ``4 .. len(beats) - 5``.  (The previous default,
            ``range(4, len(beats) - 1)``, always ran past the end of ``beats``.)

    Returns:
        ``preprocessing.scale`` applied to the selected feature rows
        (one row per entry of ``frame_indexer``, ``4 * NUMBER_BANDS`` columns).

    Raises:
        IndexError: If an explicitly supplied index requires a frame beyond
            the last beat.
    """
    SAMPLE_RATE = 44100

    # Initialise the algorithms.  Spectrum() yields magnitudes only.
    w = Windowing(type='hann')
    spectrum = Spectrum()
    melbands = MelBands(numberBands=NUMBER_BANDS)

    if frame_indexer is None:
        frame_indexer = list(range(4, len(beats) - 4))

    band_energies = np.zeros((len(beats), NUMBER_BANDS))
    band_diffs = np.zeros((len(beats), NUMBER_BANDS * 4))

    # Step 1: Mel band energies for every frame that step 2 will read, i.e.
    # the indexed frame, its predecessor and its three successors.  A set
    # keeps the membership test O(1) per beat.
    needed = set()
    for f in frame_indexer:
        needed.update(range(int(f) - 1, int(f) + 4))

    for i in range(len(beats)):
        if i not in needed:
            continue
        start_sample = int(beats[i] * SAMPLE_RATE)
        end_sample = int(beats[i + 1] * SAMPLE_RATE)
        # Drop one sample when necessary so the frame has an even length.
        if (end_sample - start_sample) % 2 != 0:
            end_sample -= 1
        band_energies[i] = melbands(spectrum(w(audio[start_sample:end_sample])))

    # Step 2: differences between the band energies of neighbouring frames.
    for i in frame_indexer:
        current = band_energies[i]
        row = band_diffs[i]
        row[0 * NUMBER_BANDS:1 * NUMBER_BANDS] = band_energies[i + 1] - current
        row[1 * NUMBER_BANDS:2 * NUMBER_BANDS] = band_energies[i + 2] - current
        row[2 * NUMBER_BANDS:3 * NUMBER_BANDS] = band_energies[i + 3] - current
        row[3 * NUMBER_BANDS:4 * NUMBER_BANDS] = current - band_energies[i - 1]

    result = band_diffs[frame_indexer]
    return preprocessing.scale(result)
def rms(audio, params):
    """Return the per-frame RMS of the windowed magnitude spectrum.

    Args:
        audio: Signal handed to ``ess.FrameGenerator``.
        params: ``(hop size, frame size, window type)``.

    Returns:
        Tuple ``(values, hop size)`` where ``values`` is a NumPy array with one
        RMS value per generated frame.
    """
    hop, frame_len, window_kind = params

    window = Windowing(type=window_kind)
    to_spectrum = Spectrum()
    rms_of = ess.RMS()

    frames = ess.FrameGenerator(audio, frameSize=frame_len, hopSize=hop)
    values = [rms_of(to_spectrum(window(frame))) for frame in frames]
    return np.asarray(values), hop
def spectralCentroid(audio, params):
    """Return the per-frame centroid of the windowed magnitude spectrum.

    Args:
        audio: Signal handed to ``ess.FrameGenerator``.
        params: ``(hop size, frame size, window type)``.

    Returns:
        Tuple ``(values, hop size)`` where ``values`` is a NumPy array with one
        centroid value per generated frame.
    """
    hop, frame_len, window_kind = params

    window = Windowing(type=window_kind)
    to_spectrum = Spectrum()
    # The centroid range is fixed to half of 44100.
    centroid_of = ess.Centroid(range=int(44100 / 2))

    frames = ess.FrameGenerator(audio, frameSize=frame_len, hopSize=hop)
    values = [centroid_of(to_spectrum(window(frame))) for frame in frames]
    return np.asarray(values), hop
def setup(self, channels=None, samplerate=None, blocksize=None, totalframes=None):
    """Run the parent ``setup`` and create the spectrum / spectral-peak algorithms.

    The arguments are forwarded unchanged to the parent class.  Afterwards
    ``self.spec_alg`` and ``self.spec_peaks_alg`` are configured from
    ``self.input_blocksize`` and ``self.input_samplerate``.
    """
    super(Essentia_Dissonance, self).setup(
        channels, samplerate, blocksize, totalframes)

    self.spec_alg = Spectrum(size=self.input_blocksize)

    # Peaks are searched from 0 up to half the input sample rate and are
    # returned ordered by frequency.
    peak_options = dict(
        sampleRate=self.input_samplerate,
        maxFrequency=self.input_samplerate / 2,
        minFrequency=0,
        orderBy='frequency',
    )
    self.spec_peaks_alg = SpectralPeaks(**peak_options)
def calculateDownbeats(self, audio, bpm, phase):
    """Pick the downbeat offset (0-3) with the strongest onset novelty.

    A complex-domain onset detection function is computed over ``audio``,
    normalised with ``self.adaptive_mean`` and half-wave rectified.  For each of
    the four possible beat offsets the novelty is averaged on a grid that
    advances four beats at a time; the offset with the highest average wins.

    Args:
        audio: Signal handed to ``FrameGenerator``.
        bpm: Tempo passed to ``self.numFramesPerBeat``.
        phase: Offset of the first beat; converted to frames as
            ``phase * 44100 / 512`` (i.e. treated as seconds at 44.1 kHz with a
            512-sample hop).

    Returns:
        ``1.0 * self.beats[downbeatIndex::4]`` -- every fourth entry of
        ``self.beats`` starting at the selected offset.

    Side effects:
        Prints the four averages and shows a matplotlib figure with the
        novelty curve and candidate grids (experimental diagnostics;
        ``plt.show()`` is called before returning).
    """
    # Step 0: complex spectral difference onset detection function.
    # NOTE: no low-pass filter is applied to the audio here.
    w = Windowing(type='hann')
    fft = FFT()
    c2p = CartesianToPolar()
    od_csd = OnsetDetection(method='complex')
    pool = Pool()

    # TODO test faster (numpy) way
    for frame in FrameGenerator(audio, frameSize=self.FRAME_SIZE,
                                hopSize=self.HOP_SIZE):
        mag, ph = c2p(fft(w(frame)))
        pool.add('onsets.complex', od_csd(mag, ph))

    # Step 1: normalise the data using an adaptive mean threshold.
    novelty_mean = self.adaptive_mean(pool['onsets.complex'], 16.0)
    # Step 2: half-wave rectify the result.
    novelty_hwr = (pool['onsets.complex'] - novelty_mean).clip(min=0)

    # Step 7 (experimental): the downbeat offset is the sub-sequence of beats
    # with the highest average novelty.  Both values below are loop-invariant.
    phase_frames = (phase * 44100.0) / (512.0)
    frames_per_beat = self.numFramesPerBeat(bpm)
    for i in range(4):
        grid = np.arange(phase_frames + i * frames_per_beat,
                         np.size(novelty_hwr),
                         4 * frames_per_beat)
        # Discard the last value so rounding up cannot read past the array.
        frames = np.round(grid).astype('int')[:-1]
        pool.add('output.downbeat',
                 np.sum(novelty_hwr[frames]) / np.size(frames))

        plt.subplot(4, 1, i + 1)
        plt.plot(novelty_hwr)
        for f in frames:
            plt.axvline(x=f)

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(pool['output.downbeat'])
    downbeatIndex = np.argmax(pool['output.downbeat'])
    plt.show()  # experimental
    return 1.0 * self.beats[downbeatIndex::4]
def create_analyzers(fs=44100.0,
                     nhop=512,
                     nffts=(1024, 2048, 4096),
                     mel_nband=80,
                     mel_freqlo=27.5,
                     mel_freqhi=16000.0):
    """Build one (window, spectrum, mel) algorithm triple per FFT size.

    Args:
        fs: Sample rate passed to ``MelBands``.
        nhop: Hop size.  Accepted for interface compatibility; not used here.
        nffts: Iterable of FFT sizes.  The default is a tuple so that the
            default value cannot be mutated between calls.
        mel_nband: Number of Mel bands.
        mel_freqlo: Lower frequency bound of the Mel filter bank.
        mel_freqhi: Upper frequency bound of the Mel filter bank.

    Returns:
        List of ``(Windowing, Spectrum, MelBands)`` tuples, in the order of
        ``nffts``.
    """
    analyzers = []
    for nfft in nffts:
        window = Windowing(size=nfft, type='blackmanharris62')
        spectrum = Spectrum(size=nfft)
        mel = MelBands(inputSize=(nfft // 2) + 1,
                       numberBands=mel_nband,
                       lowFrequencyBound=mel_freqlo,
                       highFrequencyBound=mel_freqhi,
                       sampleRate=fs)
        analyzers.append((window, spectrum, mel))
    return analyzers
def rms_centroids(filename, frameSize=1024, hopSize=512, sampleRate=44100):
    """Load an audio file and return per-frame RMS and spectral centroid.

    Args:
        filename: Path handed to ``MonoLoader``.
        frameSize: Frame size for ``FrameGenerator``.
        hopSize: Hop size for ``FrameGenerator``.
        sampleRate: Sample rate used both for loading the audio and for the
            centroid range (half of it).  Previously the loader always used
            44100 regardless of this argument, which made the centroid range
            inconsistent with the loaded audio for any other value.

    Returns:
        Tuple ``(rms_values, centroid_values)`` of NumPy arrays with one entry
        per frame.
    """
    # Load the audio at the requested sample rate.
    audio = MonoLoader(filename=filename, sampleRate=sampleRate)()

    # Create the necessary algorithms.
    w = Windowing()
    spec = Spectrum()
    rms = RMS()
    centroid = Centroid(range=int(sampleRate / 2))

    cs = []
    rmss = []
    # Both descriptors are computed on the windowed magnitude spectrum.
    for frame in FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize):
        sf = spec(w(frame))
        cs.append(centroid(sf))
        rmss.append(rms(sf))

    return np.array(rmss), np.array(cs)
def mel40_analyzer():
    """Create a closure that computes 40-band log-Mel features.

    The algorithms are configured once: a 256-point ``blackmanharris62``
    window and spectrum, and a 40-band Mel filter bank between 27.5 and
    8000 Hz for a 16000 Hz sample rate.

    Returns:
        ``analyzer(samples)``, which frames ``samples`` with a frame size of
        256 and a hop of 160 and returns a NumPy array holding one log-Mel
        vector per frame.
    """
    window = Windowing(size=256, type='blackmanharris62')
    spectrum = Spectrum(size=256)
    mel = MelBands(inputSize=129,
                   numberBands=40,
                   lowFrequencyBound=27.5,
                   highFrequencyBound=8000.0,
                   sampleRate=16000.0)

    def analyzer(samples):
        # The 1e-16 offset keeps the logarithm finite for empty bands.
        return np.array([
            np.log(mel(spectrum(window(frame))) + 1e-16)
            for frame in FrameGenerator(samples, 256, 160)
        ])

    return analyzer
def shared_main(source, dest, display_result):
    """Load two recordings and compare them frame by frame.

    Args:
        source: Passed to ``_loader`` to obtain the source audio.
        dest: Passed to ``_loader`` to obtain the destination audio.
        display_result: Forwarded unchanged to ``compare``.

    Returns:
        Tuple ``(min_cost, match_result)`` as returned by ``compare``.
    """
    source_audio = _loader(source)
    destination_audio = _loader(dest)

    # Both signals are framed identically (2048-sample frames, 512 hop).
    source_frame = FrameGenerator(source_audio, frameSize=2048, hopSize=512)
    destination_frame = FrameGenerator(destination_audio, frameSize=2048,
                                       hopSize=512)

    window = Windowing(type='hann')  # window function
    spectrum = Spectrum()  # spectrum function
    pitch_yin_fft = PitchYinFFT()  # pitch extractor
    loudness = Loudness()

    # NOTE(review): the meaning of the positional constants 5, 1, 1 is defined
    # by compare(); confirm against its signature.
    min_cost, match_result = compare(source_frame, destination_frame, window,
                                     spectrum, pitch_yin_fft, 5, 1, 1,
                                     display_result, loudness)
    return min_cost, match_result
def feature_allframes(audio, beats, frame_indexer=None):
    """Compute scaled chroma (HPCP) features for beat-delimited frames.

    Frame ``k`` is the slice of ``audio`` between ``beats[k]`` and
    ``beats[k + 1]``; beat values are multiplied by 44100 to obtain sample
    offsets.  For every index ``i`` in ``frame_indexer`` the feature row holds
    the 12 chroma values of frame ``i`` followed by three Euclidean distances:
    ``|c[i] - c[i-1]|``, ``|c[i] - c[i+1]|`` and ``|c[i-1] - c[i+1]|``.

    Args:
        audio: 1-D sample array that can be sliced by sample offset.
        beats: Sequence of beat positions (scaled by 44100 to index ``audio``).
        frame_indexer: Indices of the frames to produce features for.
            Defaults to ``1 .. len(beats) - 2`` (the first frame has no
            predecessor).

    Returns:
        ``preprocessing.scale`` applied to the selected rows (15 columns).

    Raises:
        IndexError: If an index is ``>= len(beats) - 1`` so that its successor
            row does not exist.

    Note:
        The frame starting at the last beat has no end beat and is never
        computed; its chroma row stays all-zero.  With the default indexer
        the successor of the last indexed frame is that zero row.
    """
    SAMPLE_RATE = 44100

    # Initialise the algorithms.
    w = Windowing(type='blackmanharris92')
    spectrum = Spectrum()
    specPeaks = SpectralPeaks()
    hpcp = HPCP()

    if frame_indexer is None:
        frame_indexer = list(range(1, len(beats) - 1))

    # 12 chromagram values by default.
    chroma_values = np.zeros((len(beats), 12))
    # Distances between neighbouring chroma vectors.
    chroma_differences = np.zeros((len(beats), 3))

    # Step 1: chroma for every frame read in step 2: the indexed frame, its
    # predecessor AND its successor.  (The original tested the predecessor
    # condition twice and never requested the successor.)
    needed = set()
    for f in frame_indexer:
        needed.update((int(f) - 1, int(f), int(f) + 1))

    # Only frames that have an end beat can be cut from the audio.
    for i in range(len(beats) - 1):
        if i not in needed:
            continue
        start_sample = int(beats[i] * SAMPLE_RATE)
        end_sample = int(beats[i + 1] * SAMPLE_RATE)
        # Drop one sample when necessary so the frame has an even length.
        if (end_sample - start_sample) % 2 != 0:
            end_sample -= 1
        freq, mag = specPeaks(spectrum(w(audio[start_sample:end_sample])))
        chroma_values[i] = hpcp(freq, mag)

    # Step 2: Euclidean distances between neighbouring chroma vectors.
    for i in frame_indexer:
        previous, current, following = (
            chroma_values[i - 1], chroma_values[i], chroma_values[i + 1])
        chroma_differences[i][0] = np.linalg.norm(current - previous)
        chroma_differences[i][1] = np.linalg.norm(current - following)
        chroma_differences[i][2] = np.linalg.norm(previous - following)

    # Include the raw chroma values as absolute features.
    result = np.append(chroma_values[frame_indexer],
                       chroma_differences[frame_indexer], axis=1)
    return preprocessing.scale(result)
beatTracker.run(audio) beats = beatTracker.getBeats() bpm = beatTracker.getBpm() phase = beatTracker.getPhase() beats = beats - phase print 'Bpm: ', bpm print 'Frame size in samples: ', 44100 * (60.0 / bpm) # Followed approach from Foote # Adjust the frame size to the length of a beat, to extract beat-aligned information (zelf-uitgevonden) FRAME_SIZE = int(44100 * (60.0 / bpm)) HOP_SIZE = FRAME_SIZE / 2 frames_per_second = (44100.0 / FRAME_SIZE) * (FRAME_SIZE / HOP_SIZE) beats = beats * frames_per_second spec = Spectrum(size=FRAME_SIZE - FRAME_SIZE % 2) w = Windowing(type='hann') spectrum = Spectrum() # FFT would return complex FFT, we only want magnitude mfcc = MFCC() pool = Pool() # Step 0: align audio with phase beats = beats - 0.5 start_sample = int((phase) * (44100.0 * 60 / bpm)) # Step 1: Calculate framewise MFCC for frame in FrameGenerator(audio[start_sample:], frameSize=FRAME_SIZE, hopSize=HOP_SIZE):
def run(self, audio):
    """Estimate tempo, phase and beat positions of ``audio``.

    An onset detection function is computed per frame, normalised with a
    moving average and half-wave rectified.  The tempo is the candidate BPM
    whose beat grid collects the highest mean autocorrelation; the phase is
    the offset (in steps of 1 ms) whose beat grid collects the highest mean
    novelty.

    Args:
        audio: 1-D sample array handed to ``FrameGenerator``.

    Side effects:
        Stores the results in ``self.bpm``, ``self.phase`` (seconds) and
        ``self.beats`` (beat times as a single-precision array).

    Note:
        Time/frame conversions use ``self.SAMPLE_RATE`` and ``self.HOP_SIZE``
        throughout, matching ``numFramesPerBeat``; the original hard-coded
        44100 and 512 in some places.
    """
    def numFramesPerBeat(bpm):
        return (60.0 * self.SAMPLE_RATE) / (self.HOP_SIZE * bpm)

    def autocorr(x):
        result = np.correlate(x, x, mode='full')
        # Keep the non-negative lags; floor division keeps the index an int.
        return result[result.size // 2:]

    def adaptive_mean(x, N):
        return np.convolve(x, [1.0] * int(N), mode='same') / N

    def mean_on_beat_grid(curve, first_frame, bpm):
        # Average of `curve` sampled once per beat, starting at `first_frame`.
        grid = np.arange(first_frame, np.size(curve), numFramesPerBeat(bpm))
        # Discard the last value so rounding up cannot read past the array.
        frames = np.round(grid).astype('int')[:-1]
        return np.sum(curve[frames]) / np.size(frames)

    # Step 0: onset detection function.
    # NOTE(review): the pool key says 'complex' but the detector is
    # configured with method='melflux' -- confirm which one is intended.
    w = Windowing(type='hann')
    onset_detector = OnsetDetection(method='melflux')
    pool = Pool()

    for frame in FrameGenerator(audio, frameSize=self.FRAME_SIZE,
                                hopSize=self.HOP_SIZE):
        pool.add('audio.windowed_frames', w(frame))

    # One FFT call over all windowed frames at once.
    fft_result = np.fft.fft(pool['audio.windowed_frames']).astype('complex64')
    magnitudes = np.absolute(fft_result)
    angles = np.angle(fft_result)
    for mag, ang in zip(magnitudes, angles):
        pool.add('onsets.complex', onset_detector(mag, ang))

    # Step 1: normalise the data using an adaptive mean threshold.
    novelty_mean = adaptive_mean(pool['onsets.complex'], 16.0)
    # Step 2: half-wave rectify the result.
    novelty_hwr = (pool['onsets.complex'] - novelty_mean).clip(min=0)
    # Step 3: autocorrelation of the rectified novelty curve.
    novelty_autocorr = autocorr(novelty_hwr)

    # Step 4: sum over constant intervals to detect the most likely BPM.
    valid_bpms = np.arange(self.minBpm, self.maxBpm, self.stepBpm)
    for candidate_bpm in valid_bpms:
        pool.add('output.bpm',
                 mean_on_beat_grid(novelty_autocorr, 0, candidate_bpm))
    bpm = valid_bpms[np.argmax(pool['output.bpm'])]

    # Step 5: phase information.  Candidate phases are in SECONDS.
    valid_phases = np.arange(0.0, 60.0 / bpm, 0.001)
    for candidate_phase in valid_phases:
        # Convert the phase from seconds to frames.
        phase_frames = (candidate_phase * self.SAMPLE_RATE) / float(self.HOP_SIZE)
        pool.add('output.phase',
                 mean_on_beat_grid(novelty_hwr, phase_frames, bpm))
    phase = valid_phases[np.argmax(pool['output.phase'])]

    # Step 6: beat locations.  The duration is floored to whole seconds,
    # which is what the Python 2 integer division did.
    spb = 60. / bpm  # seconds per beat
    duration = np.size(audio) // self.SAMPLE_RATE
    beats = np.arange(phase, duration - spb + phase, spb).astype('single')

    # Store all the results.
    self.bpm = bpm
    self.phase = phase
    self.beats = beats
def feature_allframes(audio, beats, frame_indexer=None):
    """Compute scaled onset-novelty features for beat-delimited frames.

    A spectral-flux onset detection curve is computed over ``audio`` (frame
    size 1024, hop 512), normalised with a 16-point moving average, half-wave
    rectified and divided by its mean.  Beat ``k`` is mapped to curve index
    ``int(beats[k] * 44100 / 512)``; frame ``k`` is the part of the curve
    between beats ``k`` and ``k + 1``.

    For every index ``i`` in ``frame_indexer`` the 21 features are:

    * 0-3: maximum of the 'valid' cross-correlation of frame ``i - 1`` with
      frame ``i``, and of frame ``i`` with frames ``i + 1``, ``i + 2``,
      ``i + 3``;
    * 4-5: differences between the half-frame novelty sums around frame ``i``;
    * 6-20: the sums of the 15 following half-frames minus the sum of the
      first half of frame ``i``.

    Args:
        audio: 1-D sample array handed to ``FrameGenerator``.
        beats: Sequence of beat positions (scaled by ``44100 / 512``).
        frame_indexer: Indices of the frames to produce features for.  Each
            index ``i`` needs frames up to ``i + 7``, and frame ``k`` needs
            beat ``k + 1``.  The default therefore covers
            ``4 .. len(beats) - 9``.  (The previous default,
            ``range(4, len(beats) - 1)``, always ran past the end of ``beats``.)

    Returns:
        ``preprocessing.scale`` applied to the selected rows (21 columns).

    Raises:
        IndexError: If an explicitly supplied index requires a frame beyond
            the last beat.
    """
    FRAME_SIZE = 1024
    HOP_SIZE = 512

    # Initialise the algorithms.
    w = Windowing(type='hann')
    od_flux = OnsetDetection(method='flux')
    pool = Pool()

    # Onset detection curve: one FFT call over all windowed frames at once.
    for frame in FrameGenerator(audio, frameSize=FRAME_SIZE, hopSize=HOP_SIZE):
        pool.add('windowed_frames', w(frame))
    fft_result = np.fft.fft(pool['windowed_frames']).astype('complex64')
    fft_result_mag = np.absolute(fft_result)
    fft_result_ang = np.angle(fft_result)
    for mag, phase in zip(fft_result_mag, fft_result_ang):
        pool.add('onsets.flux', od_flux(mag, phase))

    # Normalise and half-wave rectify the onset detection curve.
    def adaptive_mean(x, N):
        return np.convolve(x, [1.0] * int(N), mode='same') / N

    novelty_mean = adaptive_mean(pool['onsets.flux'], 16.0)
    novelty_hwr = (pool['onsets.flux'] - novelty_mean).clip(min=0)
    novelty_hwr = novelty_hwr / np.average(novelty_hwr)

    if frame_indexer is None:
        frame_indexer = list(range(4, len(beats) - 8))

    # Two novelty sums per frame (first half, second half).  Kept 1-D so that
    # each element is a scalar when copied into the feature matrix.
    onset_integrals = np.zeros(2 * len(beats))
    frame_i = (np.array(beats) * 44100.0 / HOP_SIZE).astype('int')
    onset_correlations = np.zeros((len(beats), 21))

    # Step 1: half-frame sums for every frame read in step 2, i.e. the
    # indexed frame, its predecessor and its seven successors.
    needed = set()
    for f in frame_indexer:
        needed.update(range(int(f) - 1, int(f) + 8))

    for i in range(len(beats)):
        if i not in needed:
            continue
        half_i = int((frame_i[i] + frame_i[i + 1]) / 2)
        onset_integrals[2 * i] = np.sum(novelty_hwr[frame_i[i]:half_i])
        onset_integrals[2 * i + 1] = np.sum(novelty_hwr[half_i:frame_i[i + 1]])

    def segment(k):
        # Part of the novelty curve belonging to frame k.
        return novelty_hwr[frame_i[k]:frame_i[k + 1]]

    # Step 2: correlation and integral-difference features.
    for i in frame_indexer:
        row = onset_correlations[i]
        current = segment(i)
        row[0] = max(np.correlate(segment(i - 1), current, mode='valid'))
        row[1] = max(np.correlate(current, segment(i + 1), mode='valid'))
        row[2] = max(np.correlate(current, segment(i + 2), mode='valid'))
        row[3] = max(np.correlate(current, segment(i + 3), mode='valid'))

        # Differences in the integrals of the novelty curve between frames:
        # quantifies the change in number and prominence of onsets.
        row[4] = onset_integrals[2 * i] - onset_integrals[2 * i - 1]
        row[5] = (onset_integrals[2 * i + 2] + onset_integrals[2 * i + 3]
                  - onset_integrals[2 * i - 1] - onset_integrals[2 * i - 2])
        for j in range(1, 16):
            row[5 + j] = onset_integrals[2 * i + j] - onset_integrals[2 * i]

    result = onset_correlations[frame_indexer]
    return preprocessing.scale(result)
import errno
import time

import essentia
from essentia.standard import (
    Extractor,
    MonoLoader,
    Trimmer,
    Mean,
    FrameGenerator,
    Spectrum,
    SpectralPeaks,
    Dissonance,
    BarkBands,
    Windowing,
    ZeroCrossingRate,
    OddToEvenHarmonicEnergyRatio,
    EnergyBand,
    MetadataReader,
    OnsetDetection,
    Onsets,
    CartesianToPolar,
    FFT,
    MFCC,
    SingleGaussian,
)

from build_map import build_map

# Analysis parameters shared by the module-level algorithms below.
sampleRate = 44100
frameSize = 2048
hopSize = 1024
windowType = "hann"

# General-purpose algorithm instances, created once at import time.
mean = Mean()
keyDetector = essentia.standard.Key(pcpSize=12)
spectrum = Spectrum()
window = Windowing(size=frameSize, zeroPadding=0, type=windowType)
mfcc = MFCC()
gaussian = SingleGaussian()

# Onset detection chain: complex FFT -> (magnitude, phase) -> HFC detector.
od = OnsetDetection(method='hfc')
fft = FFT()
c2p = CartesianToPolar()
onsets = Onsets(alpha=1)

# Dissonance: spectral peaks ordered by frequency feed the dissonance measure.
spectralPeaks = SpectralPeaks(sampleRate=sampleRate, orderBy='frequency')
dissonance = Dissonance()

# Bark bands.
barkbands = BarkBands(sampleRate=sampleRate)
def run(self, audio):
    """Estimate tempo, phase, beats and downbeats of ``audio``.

    A complex-domain onset detection function is computed per frame,
    normalised with ``self.adaptive_mean`` and half-wave rectified.  The tempo
    is the candidate BPM whose beat grid collects the highest mean
    autocorrelation; the phase is the offset (in steps of 1 ms) whose beat
    grid collects the highest mean novelty.

    Args:
        audio: 1-D sample array handed to ``FrameGenerator``.

    Side effects:
        Prints the selected phase and stores the results in ``self.bpm``,
        ``self.phase``, ``self.beats`` and ``self.downbeats`` (the latter via
        ``self.calculateDownbeats``).
    """
    # TODO put this in some util class
    # Step 0: calculate the CSD (Complex Spectral Difference) features
    # and the associated onset detection function.
    w = Windowing(type='hann')
    fft = FFT()
    c2p = CartesianToPolar()
    od_csd = OnsetDetection(method='complex')
    pool = Pool()

    # TODO test faster (numpy) way
    for frame in FrameGenerator(audio, frameSize=self.FRAME_SIZE,
                                hopSize=self.HOP_SIZE):
        mag, ang = c2p(fft(w(frame)))
        pool.add('onsets.complex', od_csd(mag, ang))

    # Step 1: normalise the data using an adaptive mean threshold.
    novelty_mean = self.adaptive_mean(pool['onsets.complex'], 16.0)
    # Step 2: half-wave rectify the result.
    novelty_hwr = (pool['onsets.complex'] - novelty_mean).clip(min=0)
    # Step 3: autocorrelation of the rectified novelty curve.
    novelty_autocorr = self.autocorr(novelty_hwr)

    # Step 4: sum over constant intervals to detect the most likely BPM.
    valid_bpms = np.arange(self.minBpm, self.maxBpm, self.stepBpm)
    for candidate_bpm in valid_bpms:
        grid = np.arange(0, np.size(novelty_autocorr),
                         self.numFramesPerBeat(candidate_bpm))
        # Discard the last value so rounding up cannot read past the array.
        frames = np.round(grid).astype('int')[:-1]
        pool.add('output.bpm',
                 np.sum(novelty_autocorr[frames]) / np.size(frames))
    bpm = valid_bpms[np.argmax(pool['output.bpm'])]

    # Step 5: phase information.  Candidate phases are in SECONDS.
    valid_phases = np.arange(0.0, 60.0 / bpm, 0.001)
    frames_per_beat = self.numFramesPerBeat(bpm)  # invariant for this loop
    for candidate_phase in valid_phases:
        # Convert the phase from seconds to frames (44100 Hz, hop of 512).
        phase_frames = (candidate_phase * 44100.0) / (512.0)
        grid = np.arange(phase_frames, np.size(novelty_hwr), frames_per_beat)
        # Discard the last value so rounding up cannot read past the array.
        frames = np.round(grid).astype('int')[:-1]
        pool.add('output.phase',
                 np.sum(novelty_hwr[frames]) / np.size(frames))
    phase = valid_phases[np.argmax(pool['output.phase'])]
    # Same output as the old print statement, valid on Python 2 and 3.
    print('PHASE %s' % phase)

    # Step 6: beat locations.  The duration is floored to whole seconds,
    # which is what the Python 2 integer division did.
    spb = 60. / bpm  # seconds per beat
    duration = np.size(audio) // 44100
    beats = np.arange(phase, duration - spb + phase, spb).astype('single')

    # Store all the results.
    self.bpm = bpm
    self.phase = phase
    self.beats = beats
    self.downbeats = self.calculateDownbeats(audio, bpm, phase)