def testNumChordsEqualsHpcpSize(self):
    # this test has been introduced since it was reported that
    # chordsdetection may reveal errors in the scheduling, yielding more
    # chords than hpcps are computed
    from essentia.streaming import MonoLoader, DCRemoval, FrameCutter,\
        EqualLoudness, Windowing, Spectrum, SpectralPeaks, SpectralWhitening,\
        HPCP, ChordsDetection

    audiofile = 'musicbox.wav'
    filename = join(testdata.audio_dir, 'recorded', audiofile)

    p = Pool()

    loader = MonoLoader(filename=filename)
    dc = DCRemoval()
    eqloud = EqualLoudness()
    fc = FrameCutter(frameSize=2048, hopSize=1024, silentFrames="noise")
    win = Windowing(size=2048)
    spec = Spectrum()
    specPeaks = SpectralPeaks()
    specWhite = SpectralWhitening()
    hpcp = HPCP()
    chords = ChordsDetection(hopSize=1024)

    loader.audio >> dc.signal
    dc.signal >> eqloud.signal
    eqloud.signal >> fc.signal
    fc.frame >> win.frame
    win.frame >> spec.frame
    spec.spectrum >> specPeaks.spectrum
    spec.spectrum >> specWhite.spectrum
    specPeaks.frequencies >> specWhite.frequencies
    specPeaks.magnitudes >> specWhite.magnitudes
    specWhite.magnitudes >> hpcp.magnitudes
    specPeaks.frequencies >> hpcp.frequencies
    hpcp.hpcp >> chords.pcp
    chords.chords >> (p, 'chords')
    chords.strength >> None
    hpcp.hpcp >> (p, 'hpcp')

    run(loader)

    self.assertEqual(len(p['chords']), len(p['hpcp']))
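# A minimal standard-mode sketch of the same pipeline, useful for checking the
# invariant tested above outside the streaming scheduler. Assumptions: this is
# not part of the test suite, 'musicbox.wav' stands in for any audio file, and
# the parameters simply mirror the streaming network above (DC removal and
# equal-loudness filtering are omitted for brevity).
import numpy as np
import essentia.standard as es

audio = es.MonoLoader(filename='musicbox.wav')()
window = es.Windowing(size=2048)
spectrum = es.Spectrum()
peaks = es.SpectralPeaks()
whitening = es.SpectralWhitening()
hpcp_algo = es.HPCP()

hpcps = []
for frame in es.FrameGenerator(audio, frameSize=2048, hopSize=1024):
    spec_frame = spectrum(window(frame))
    freqs, mags = peaks(spec_frame)
    hpcps.append(hpcp_algo(freqs, whitening(spec_frame, freqs, mags)))
hpcps = np.array(hpcps)

# in standard mode ChordsDetection consumes the whole HPCP matrix at once
# and should return exactly one chord label per HPCP frame
chord_labels, strengths = es.ChordsDetection(hopSize=1024)(hpcps)
assert len(chord_labels) == len(hpcps)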
def featureExtractFile(curFile):
    import numpy
    import essentia
    from essentia.streaming import MonoLoader
    from essentia.streaming import LowLevelSpectralExtractor
    from essentia.streaming import FrameCutter
    from essentia.streaming import AutoCorrelation
    from essentia.standard import YamlOutput
    from essentia.standard import PoolAggregator

    # the frame size is read from a precomputed .npz file
    filename = '/home/user/Desktop/soundsDB2/classifier/featureExtractionEssentia/frameSize.npz'
    npz = numpy.load(filename)
    frameSize = int(npz['frameSize'])

    # instantiate our algorithms
    loader = MonoLoader(filename=curFile, sampleRate=8000)
    framecutter = FrameCutter(frameSize=frameSize, hopSize=frameSize // 4)
    autoCorrelator = AutoCorrelation()
    lowLevelExtractor = LowLevelSpectralExtractor(frameSize=frameSize,
                                                  hopSize=frameSize // 4,
                                                  sampleRate=8000)

    pool = essentia.Pool()

    # connect the network; note that framecutter and autoCorrelator are
    # instantiated above but not used in this network
    loader.audio >> lowLevelExtractor.signal

    lowLevelExtractor.barkbands >> (pool, curFile[:-4] + '.barkbands')
    lowLevelExtractor.barkbands_kurtosis >> (pool, curFile[:-4] + '.barkbands_kurtosis')
    lowLevelExtractor.barkbands_skewness >> (pool, curFile[:-4] + '.barkbands_skewness')
    lowLevelExtractor.barkbands_spread >> (pool, curFile[:-4] + '.barkbands_spread')
    lowLevelExtractor.hfc >> (pool, curFile[:-4] + '.hfc')
    lowLevelExtractor.mfcc >> (pool, curFile[:-4] + '.mfcc')
    lowLevelExtractor.pitch >> (pool, curFile[:-4] + '.pitch')
    lowLevelExtractor.pitch_instantaneous_confidence >> (pool, curFile[:-4] + '.pitch_instantaneous_confidence')
    lowLevelExtractor.pitch_salience >> (pool, curFile[:-4] + '.pitch_salience')
    lowLevelExtractor.silence_rate_20dB >> (pool, curFile[:-4] + '.silence_rate_20dB')
    lowLevelExtractor.silence_rate_30dB >> (pool, curFile[:-4] + '.silence_rate_30dB')
    lowLevelExtractor.silence_rate_60dB >> (pool, curFile[:-4] + '.silence_rate_60dB')
    lowLevelExtractor.spectral_complexity >> (pool, curFile[:-4] + '.spectral_complexity')
    lowLevelExtractor.spectral_crest >> (pool, curFile[:-4] + '.spectral_crest')
    lowLevelExtractor.spectral_decrease >> (pool, curFile[:-4] + '.spectral_decrease')
    lowLevelExtractor.spectral_energy >> (pool, curFile[:-4] + '.spectral_energy')
    lowLevelExtractor.spectral_energyband_low >> (pool, curFile[:-4] + '.spectral_energyband_low')
    lowLevelExtractor.spectral_energyband_middle_low >> (pool, curFile[:-4] + '.spectral_energyband_middle_low')
    lowLevelExtractor.spectral_energyband_middle_high >> (pool, curFile[:-4] + '.spectral_energyband_middle_high')
    lowLevelExtractor.spectral_energyband_high >> None
    lowLevelExtractor.spectral_flatness_db >> (pool, curFile[:-4] + '.spectral_flatness_db')
    lowLevelExtractor.spectral_flux >> (pool, curFile[:-4] + '.spectral_flux')
    lowLevelExtractor.spectral_rms >> (pool, curFile[:-4] + '.spectral_rms')
    lowLevelExtractor.spectral_rolloff >> (pool, curFile[:-4] + '.spectral_rolloff')
    lowLevelExtractor.spectral_strongpeak >> (pool, curFile[:-4] + '.spectral_strongpeak')
    lowLevelExtractor.zerocrossingrate >> (pool, curFile[:-4] + '.zerocrossingrate')
    lowLevelExtractor.inharmonicity >> (pool, curFile[:-4] + '.inharmonicity')
    lowLevelExtractor.tristimulus >> (pool, curFile[:-4] + '.tristimulus')
    lowLevelExtractor.oddtoevenharmonicenergyratio >> (pool, curFile[:-4] + '.oddtoevenharmonicenergyratio')
    #mfcc.bands >> (pool, curFile[:-4]+'.mfccBands')
    #mfcc.mfcc >> (pool, curFile[:-4]+'.mfcc')

    essentia.run(loader)

    # aggregate the framewise descriptors into file-level statistics
    aggrPool = PoolAggregator(defaultStats=['min', 'max', 'median', 'mean',
                                            'var', 'skew', 'kurt', 'dmean',
                                            'dvar'])(pool)
    #aggrPool = PoolAggregator(defaultStats = ['min', 'max', 'mean', 'var'])(pool)

    YamlOutput(filename=curFile[:-4] + 'trainingFeatures.yaml',
               format="yaml")(aggrPool)

    essentia.reset(loader)
    return
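# A small usage sketch for the function above. Assumptions: 'example.wav' is a
# placeholder filename, and the frameSize.npz file referenced inside
# featureExtractFile exists at its hard-coded path.
from essentia.standard import YamlInput

featureExtractFile('example.wav')

# the aggregated features are written next to the audio file as
# '<basename>trainingFeatures.yaml' and can be loaded back into a Pool
aggrPool = YamlInput(filename='exampletrainingFeatures.yaml', format='yaml')()
print(aggrPool.descriptorNames())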
def __init__(self, filename, frameSize=2048, hopSize=1024, window='hann',
             stats=['mean', 'var', 'dmean', 'dvar'], sampleRate=44100):
    """
    Initialize a feature extractor object for a given audio file.

    Feature extraction is performed automatically on instantiation. The
    extracted features can be accessed via the attribute ``features``, or,
    for the non-aggregated feature trajectories, via the attribute ``_pool``.

    Parameters
    ----------
    filename (str):
        the filename of the audio file for which features should be
        extracted
    frameSize (optional(int)):
        the size of the frames for the frame-based features in samples,
        default=2048; note that the fast Fourier transform is most
        efficient for a frame size which is a power of two
    hopSize (optional(int)):
        the hop size between two consecutive frames, default=1024
    window (optional(str)):
        before computing the spectrum of a given frame it is necessary to
        window the signal with a given windowing function; possible
        options are: ['hamming', 'hann', 'triangular', 'square',
        'blackmanharris62', 'blackmanharris70', 'blackmanharris74',
        'blackmanharris92'], default='hann'
    stats (optional(list[str])):
        the statistics to be computed for the aggregation of frame-based
        features; possible statistics are: ['min', 'max', 'median',
        'mean', 'var', 'skew', 'kurt', 'dmean', 'dvar', 'dmean2',
        'dvar2'], with e.g. dmean and dmean2 being the first and second
        derivative of the mean, default=['mean', 'var', 'dmean', 'dvar']
    sampleRate (optional(int)):
        the desired output sampling rate; audio files with a different
        sample rate will be resampled

    Returns
    -------
    None

    Examples
    --------
    >>> audiofile = 'Testfiles/sine300.wav'
    >>> Extractor = FeatureExtractor(audiofile)
    >>> Extractor.features # doctest: +ELLIPSIS
    <essentia.common.Pool instance at 0x...>
    >>> Extractor.features['duration']
    0.30000001192092896
    >>> Extractor._pool['pitch'] # doctest: +NORMALIZE_WHITESPACE
    array([ 304.22268677, 301.05880737, 301.05871582, 301.05877686,
            301.05889893, 301.05886841, 301.05889893, 301.05880737,
            301.05880737, 301.05871582, 301.0586853 , 301.05877686,
            301.05947876, 304.97198486], dtype=float32)
    >>> Extractor.features['pitch.mean']
    301.5643615722656
    >>> Extractor.features['pitchConfidence.mean']
    0.94275963306427
    """
    # instantiate as a feature extractor in streaming mode
    # (this is done internally by essentia)
    super(FeatureExtractor, self).__init__()

    # ------------------------------------------------------------------- #
    # -------- instantiate necessary algorithms and connect them --------- #

    # ------------------------ preliminaries ------------------------------ #
    # the loader outputs the raw signal data from a given audio file
    loader = MonoLoader(filename=filename, sampleRate=sampleRate)

    # pool where the feature values will be stored
    pool = Pool()

    # needed by logattacktime
    envelope = Envelope()
    accu = RealAccumulator()  # needed between logattacktime and envelope
    loader.audio >> envelope.signal
    envelope.signal >> accu.data

    # needed for frame-based processing
    fc = FrameCutter(frameSize=frameSize, hopSize=hopSize)
    loader.audio >> fc.signal

    # windowing
    w = Windowing(type=window)
    fc.frame >> w.frame

    # spectrum
    spec = Spectrum()
    w.frame >> spec.frame

    # ------------------------- audio features ---------------------------- #

    # ------------------------ global features ---------------------------- #
    # dynamic complexity and loudness
    dynamicComplexity = DynamicComplexity()
    loader.audio >> dynamicComplexity.signal
    dynamicComplexity.dynamicComplexity >> (pool, 'dynamicComplexity')
    dynamicComplexity.loudness >> (pool, 'loudness')

    # duration
    duration = Duration()
    loader.audio >> duration.signal
    duration.duration >> (pool, 'duration')

    # effective duration
    effectiveDuration = EffectiveDuration()
    accu.array >> effectiveDuration.signal
    effectiveDuration.effectiveDuration >> (pool, 'effectiveDuration')

    # logattacktime
    log = LogAttackTime()
    accu.array >> log.signal
    log.logAttackTime >> (pool, 'logattacktime')

    # ---------------------- frame-based features ------------------------- #
    # spectral centroid
    sc = Centroid()
    spec.spectrum >> sc.array
    sc.centroid >> (pool, 'spectralcentroid')

    # mfcc
    mfcc = MFCC(numberCoefficients=13)
    spec.spectrum >> mfcc.spectrum
    mfcc.bands >> None  # not included in feature vector
    mfcc.mfcc >> (pool, 'mfcc')

    # pitchYinFFT
    pitch = PitchYinFFT()
    spec.spectrum >> pitch.spectrum
    pitch.pitchConfidence >> (pool, 'pitchConfidence')
    pitch.pitch >> (pool, 'pitch')

    # ------------------ finished network connection ---------------------- #
    # ---------------------------------------------------------------------- #

    # start feature extraction
    essentia.run(loader)

    # aggregate results
    # logattacktime and effective duration are global features, but they are
    # automatically aggregated in streaming mode; to handle this, 'copy' is used
    aggrPool = PoolAggregator(defaultStats=stats,
                              exceptions={'logattacktime': ['copy'],
                                          'effectiveDuration': ['copy']})(pool)

    self._pool = pool
    self.features = aggrPool
    self.feature_names = aggrPool.descriptorNames()
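# A hedged batch-processing sketch built on the FeatureExtractor class above.
# Assumptions: 'Testfiles' is an illustrative directory of .wav files, and
# keeping only scalar descriptors is just one possible way to flatten the
# aggregated pool into a fixed-length feature vector per file.
import os
import numpy as np

audio_dir = 'Testfiles'
rows = []
for name in sorted(os.listdir(audio_dir)):
    if not name.endswith('.wav'):
        continue
    extractor = FeatureExtractor(os.path.join(audio_dir, name))
    # skip vector descriptors such as 'mfcc.mean' so every file
    # contributes a row of equal length
    rows.append([extractor.features[key] for key in extractor.feature_names
                 if np.isscalar(extractor.features[key])])

feature_matrix = np.array(rows)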
def melspectrogram(filename, npy_file=None, force=False, verbose=False,
                   sample_rate=SAMPLE_RATE, frame_size=FRAME_SIZE,
                   hop_size=HOP_SIZE, window_type=WINDOW_TYPE,
                   zero_padding=ZERO_PADDING,
                   low_frequency_bound=LOW_FREQUENCY_BOUND,
                   high_frequency_bound=HIGH_FREQUENCY_BOUND,
                   number_bands=NUMBER_BANDS,
                   warping_formula=WARPING_FORMULA,
                   weighting=WEIGHTING, normalize=NORMALIZE,
                   bands_type=BANDS_TYPE,
                   compression_type=COMPRESSION_TYPE):
    """Computes the mel spectrogram given the audio filename.

    When the parameter `npy_file` is specified, the data is saved to disk as a
    numpy array (.npy). Use the parameter `force` to overwrite the numpy array
    in case it already exists. The rest of the parameters are mapped directly
    to Essentia algorithms, as explained below.

    Note: this functionality is also available as a command line script.

    Parameters:
        sample_rate: real ∈ (0,inf) (default = 44100)
            the desired output sampling rate [Hz]

        frame_size: integer ∈ [1,inf) (default = 1024)
            the output frame size

        hop_size: integer ∈ [1,inf) (default = 512)
            the hop size between frames

        window_type: string ∈ {hamming,hann,hannnsgcq,triangular,square,blackmanharris62,blackmanharris70,blackmanharris74,blackmanharris92} (default = "hann")
            the window type, which can be 'hamming', 'hann', 'triangular',
            'square' or 'blackmanharrisXX'

        zero_padding: integer ∈ [0,inf) (default = 0)
            the size of the zero-padding

        low_frequency_bound: real ∈ [0,inf) (default = 0)
            a lower-bound limit for the frequencies to be included in the bands

        high_frequency_bound: real ∈ [0,inf) (default = 22050)
            an upper-bound limit for the frequencies to be included in the bands

        number_bands: integer ∈ (1,inf) (default = 24)
            the number of output bands

        warping_formula: string ∈ {slaneyMel,htkMel} (default = "htkMel")
            the scale implementation type: 'htkMel' scale from the HTK
            toolkit [2, 3] (default) or 'slaneyMel' scale from the Auditory
            toolbox [4]

        weighting: string ∈ {warping,linear} (default = "warping")
            type of weighting function for determining triangle area

        normalize: string ∈ {unit_sum,unit_tri,unit_max} (default = "unit_sum")
            spectrum bin weights to use for each mel band: 'unit_max' to make
            each mel band vertex equal to 1, 'unit_sum' to make each mel band
            area equal to 1 summing the actual weights of spectrum bins,
            'unit_tri' to make each triangular mel band area equal to 1
            normalizing the weights of each triangle by its bandwidth

        bands_type: string ∈ {magnitude,power} (default = "power")
            'power' to output squared units, 'magnitude' to keep it as the
            input

        compression_type: string ∈ {dB,shift_scale_log,none} (default = "shift_scale_log")
            the compression type to use: 'shift_scale_log' is
            log10(10000 * x + 1), 'dB' is 10 * log10(x)

    Returns:
        (2D array): The mel spectrogram.
    """
    padded_size = frame_size + zero_padding
    spectrum_size = padded_size // 2 + 1

    # In case we want to save the mel bands to a file,
    # check if the file already exists
    if npy_file:
        if not npy_file.endswith('.npy'):
            npy_file += '.npy'

        if not force and os.path.exists(npy_file):
            if verbose:
                print('Skipping "{}"'.format(npy_file))
            return

    pool = Pool()

    loader = MonoLoader(filename=filename, sampleRate=sample_rate)
    frameCutter = FrameCutter(frameSize=frame_size, hopSize=hop_size)
    w = Windowing(zeroPadding=zero_padding, type=window_type, normalized=False)
    # None of the mel bands extraction methods we have seen
    # requires window-level normalization.
    spec = Spectrum(size=padded_size)
    mels = MelBands(inputSize=spectrum_size,
                    numberBands=number_bands,
                    sampleRate=sample_rate,
                    lowFrequencyBound=low_frequency_bound,
                    highFrequencyBound=high_frequency_bound,
                    warpingFormula=warping_formula,
                    weighting=weighting,
                    normalize=normalize,
                    type=bands_type,
                    log=False)  # Do not compute any compression here.
                                # Use the `UnaryOperator`s below in case
                                # a new compression type is required.

    if compression_type.lower() == 'db':
        shift = UnaryOperator(type='identity')
        compressor = UnaryOperator(type='lin2db')
    elif compression_type.lower() == 'shift_scale_log':
        shift = UnaryOperator(type='identity', scale=1e4, shift=1)
        compressor = UnaryOperator(type='log10')
    elif compression_type.lower() == 'none':
        shift = UnaryOperator(type='identity')
        compressor = UnaryOperator(type='identity')
    else:
        raise ValueError('Unknown `compression_type`: {}'.format(compression_type))

    loader.audio >> frameCutter.signal
    frameCutter.frame >> w.frame >> spec.frame
    spec.spectrum >> mels.spectrum
    mels.bands >> shift.array >> compressor.array >> (pool, 'mel_bands')

    run(loader)

    mel_bands = np.array(pool['mel_bands'])

    if npy_file:
        np.save(npy_file, mel_bands)
        if verbose:
            print('Done for "{}"'.format(npy_file))

    return mel_bands
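# A short usage sketch for melspectrogram(). Assumptions: 'track.wav' is an
# illustrative filename and matplotlib is an optional extra, not required by
# the function itself.
import matplotlib.pyplot as plt

mel = melspectrogram('track.wav', npy_file='track_mel', force=True, verbose=True)

# mel has shape (n_frames, number_bands); transpose it for the usual
# time-on-x-axis orientation
plt.imshow(mel.T, aspect='auto', origin='lower', interpolation='none')
plt.xlabel('frame index')
plt.ylabel('mel band')
plt.show()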