def testNumChordsEqualsHpcpSize(self):
        # this test was introduced after a report that ChordsDetection's
        # scheduling could yield more chords than HPCP frames are computed
        from essentia.streaming import MonoLoader, DCRemoval, FrameCutter,\
        EqualLoudness, Windowing, Spectrum, SpectralPeaks, SpectralWhitening,\
        HPCP, ChordsDetection

        audiofile = 'musicbox.wav'
        filename = join(testdata.audio_dir, 'recorded', audiofile)

        p = Pool()
        loader = MonoLoader(filename=filename)
        dc = DCRemoval()
        eqloud = EqualLoudness()
        fc = FrameCutter(frameSize=2048, hopSize=1024, silentFrames="noise")
        win = Windowing(size=2048)
        spec = Spectrum()
        specPeaks = SpectralPeaks()
        specWhite = SpectralWhitening()
        hpcp = HPCP()
        chords = ChordsDetection(hopSize=1024)

        loader.audio >> dc.signal
        dc.signal >> eqloud.signal
        eqloud.signal >> fc.signal
        fc.frame >> win.frame
        win.frame >> spec.frame
        spec.spectrum >> specPeaks.spectrum
        spec.spectrum >> specWhite.spectrum
        specPeaks.frequencies >> specWhite.frequencies
        specPeaks.magnitudes >> specWhite.magnitudes
        specWhite.magnitudes >> hpcp.magnitudes
        specPeaks.frequencies >> hpcp.frequencies
        hpcp.hpcp >> chords.pcp
        chords.chords >> (p, 'chords')
        chords.strength >> None
        hpcp.hpcp >> (p, 'hpcp')

        run(loader)
        self.assertEqual(len(p['chords']), len(p['hpcp']))
Example #2
def featureExtractFile(curFile):
    import sys
    import numpy
    import essentia
    from essentia.streaming import MonoLoader
    from essentia.streaming import LowLevelSpectralExtractor
    from essentia.standard import YamlOutput
    from essentia.standard import YamlInput
    from essentia.standard import PoolAggregator
    from essentia.streaming import FrameCutter
    from essentia.streaming import AutoCorrelation
    import pickle
    filename = '/home/user/Desktop/soundsDB2/classifier/featureExtractionEssentia/frameSize.npz'
    npz = numpy.load(filename)
    frameSize = int(npz['frameSize'])
    # and instantiate our algorithms
    loader = MonoLoader(filename=curFile, sampleRate=8000)
    framecutter = FrameCutter(frameSize=frameSize, hopSize=frameSize // 4)
    autoCorrelator = AutoCorrelation()

    lowLevelExtractor = LowLevelSpectralExtractor(frameSize=frameSize,
                                                  hopSize=frameSize // 4,
                                                  sampleRate=8000)

    pool = essentia.Pool()
    loader.audio >> lowLevelExtractor.signal
    lowLevelExtractor.barkbands >> (pool, curFile[:-4] + '.barkbands')
    lowLevelExtractor.barkbands_kurtosis >> (pool, curFile[:-4] +
                                             '.barkbands_kurtosis')
    lowLevelExtractor.barkbands_skewness >> (pool, curFile[:-4] +
                                             '.barkbands_skewness')
    lowLevelExtractor.barkbands_spread >> (pool,
                                           curFile[:-4] + '.barkbands_spread')
    lowLevelExtractor.hfc >> (pool, curFile[:-4] + '.hfc')
    lowLevelExtractor.mfcc >> (pool, curFile[:-4] + '.mfcc')
    lowLevelExtractor.pitch >> (pool, curFile[:-4] + '.pitch')
    lowLevelExtractor.pitch_instantaneous_confidence >> (
        pool, curFile[:-4] + '.pitch_instantaneous_confidence')
    lowLevelExtractor.pitch_salience >> (pool,
                                         curFile[:-4] + '.pitch_salience')
    lowLevelExtractor.silence_rate_20dB >> (pool, curFile[:-4] +
                                            '.silence_rate_20dB')
    lowLevelExtractor.silence_rate_30dB >> (pool, curFile[:-4] +
                                            '.silence_rate_30dB')
    lowLevelExtractor.silence_rate_60dB >> (pool, curFile[:-4] +
                                            '.silence_rate_60dB')
    lowLevelExtractor.spectral_complexity >> (pool, curFile[:-4] +
                                              '.spectral_complexity')
    lowLevelExtractor.spectral_crest >> (pool,
                                         curFile[:-4] + '.spectral_crest')
    lowLevelExtractor.spectral_decrease >> (pool, curFile[:-4] +
                                            '.spectral_decrease')
    lowLevelExtractor.spectral_energy >> (pool,
                                          curFile[:-4] + '.spectral_energy')
    lowLevelExtractor.spectral_energyband_low >> (pool, curFile[:-4] +
                                                  '.spectral_energyband_low')
    lowLevelExtractor.spectral_energyband_middle_low >> (
        pool, curFile[:-4] + '.spectral_energyband_middle_low')
    lowLevelExtractor.spectral_energyband_middle_high >> (
        pool, curFile[:-4] + '.spectral_energyband_middle_high')
    lowLevelExtractor.spectral_energyband_high >> None
    lowLevelExtractor.spectral_flatness_db >> (pool, curFile[:-4] +
                                               '.spectral_flatness_db')
    lowLevelExtractor.spectral_flux >> (pool, curFile[:-4] + '.spectral_flux')
    lowLevelExtractor.spectral_rms >> (pool, curFile[:-4] + '.spectral_rms')
    lowLevelExtractor.spectral_rolloff >> (pool,
                                           curFile[:-4] + '.spectral_rolloff')
    lowLevelExtractor.spectral_strongpeak >> (pool, curFile[:-4] +
                                              '.spectral_strongpeak')
    lowLevelExtractor.zerocrossingrate >> (pool,
                                           curFile[:-4] + '.zerocrossingrate')
    lowLevelExtractor.inharmonicity >> (pool, curFile[:-4] + '.inharmonicity')
    lowLevelExtractor.tristimulus >> (pool, curFile[:-4] + '.tristimulus')
    lowLevelExtractor.oddtoevenharmonicenergyratio >> (
        pool, curFile[:-4] + '.oddtoevenharmonicenergyratio')

    #mfcc.bands >> (pool, curFile[:-4]+'.mfccBands')
    #mfcc.mfcc >> (pool, curFile[:-4]+'.mfcc')

    essentia.run(loader)
    aggrPool = PoolAggregator(defaultStats=[
        'min', 'max', 'median', 'mean', 'var', 'skew', 'kurt', 'dmean', 'dvar'
    ])(pool)
    #aggrPool = PoolAggregator(defaultStats = ['min', 'max', 'mean', 'var'])(pool)
    YamlOutput(filename=curFile[:-4] + 'trainingFeatures.yaml',
               format="yaml")(aggrPool)
    essentia.reset(loader)
    return
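
A minimal usage sketch for the extractor above. The sounds directory and the .wav glob pattern are assumptions for illustration only; featureExtractFile still expects the hard-coded frameSize.npz file to be present.

import glob
import os

# hypothetical batch run: extract features for every .wav file in a folder,
# writing one '<name>trainingFeatures.yaml' next to each audio file
soundsDir = '/home/user/Desktop/soundsDB2/sounds'  # assumed location
for wavPath in sorted(glob.glob(os.path.join(soundsDir, '*.wav'))):
    featureExtractFile(wavPath)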
Example #3
    def __init__(self,
                 filename,
                 frameSize=2048,
                 hopSize=1024,
                 window='hann',
                 stats=['mean', 'var', 'dmean', 'dvar'],
                 sampleRate=44100):
        """ Initialize a feature extractor object for a given audiofile.

        By instantiation feature extraction will be automatically performed.
        The extracted features can be accessed via the attribute features
        or for the non aggreagted feature trajectories via the attribute pool.

        Parameters
        ----------
            filename (str): the filename of the audiofile for which features
                should be extracted
            frameSize (optional(int)): the size of the frames for the
                framebased features in samples, default=2048; note that the
                fast Fourier transform is most efficient for a frame size
                that is a power of two
            hopSize (optional(int)): the hop size between two consecutive
                frames, default=1024
            window (optional(str)): before computing the spectrum on a
                given frame it is necessary to window the signal with a given
                windowing function, possible options are: ['hamming', 'hann',
                'triangular', 'square', 'blackmanharris62', 'blackmanharris70',
                'blackmanharris74', 'blackmanharris92'], default='hann'
            stats (optional(list[str])): the statistics to be computed for the
                aggregation of framebased features, possible statistics are:
                ['min', 'max', 'median', 'mean', 'var', 'skew', 'kurt',
                'dmean', 'dvar', 'dmean2', 'dvar2'], where e.g. dmean and
                dmean2 are the means of the first and second derivatives,
                default=['mean', 'var', 'dmean', 'dvar']
            sampleRate (optional(int)): the desired output sampling rate,
                audiofiles with a different samplerate will be resampled


        Returns
        -------
            None


        Examples
        --------
        >>> audiofile = 'Testfiles/sine300.wav'
        >>> Extractor = FeatureExtractor(audiofile)
        >>> Extractor.features # doctest: +ELLIPSIS
        <essentia.common.Pool instance at 0x...>
        >>> Extractor.features['duration']
        0.30000001192092896
        >>> Extractor._pool['pitch'] # doctest: +NORMALIZE_WHITESPACE
        array([ 304.22268677,  301.05880737,  301.05871582,  301.05877686,
                301.05889893,  301.05886841,  301.05889893,  301.05880737,
                301.05880737,  301.05871582,  301.0586853 ,  301.05877686,
                301.05947876,  304.97198486], dtype=float32)
        >>> Extractor.features['pitch.mean']
        301.5643615722656
        >>> Extractor.features['pitchConfidence.mean']
        0.94275963306427

        """

        # instantiate as a feature extractor in streaming mode
        # this is done internally by essentia
        super(FeatureExtractor, self).__init__()

        # ------------------------------------------------------------------- #
        # -------- instantiate necessary algorithms and connect them -------- #

        # ------------------------ preliminaries ---------------------------- #

        # the loader outputs the raw signal data from a given audiofile
        loader = MonoLoader(filename=filename, sampleRate=sampleRate)

        # pool where the feature values will be stored
        pool = Pool()

        # needed by logattacktime
        envelope = Envelope()
        accu = RealAccumulator()  # needed between logattacktime and envelope
        loader.audio >> envelope.signal
        envelope.signal >> accu.data

        # needed for framebased processing
        fc = FrameCutter(frameSize=frameSize, hopSize=hopSize)
        loader.audio >> fc.signal
        # windowing
        w = Windowing(type=window)
        fc.frame >> w.frame
        # spectrum
        spec = Spectrum()
        w.frame >> spec.frame

        # ------------------------- audio features -------------------------- #

        # ------------------------ global features -------------------------- #

        # dynamic complexity and loudness
        dynamicComplexity = DynamicComplexity()
        loader.audio >> dynamicComplexity.signal
        dynamicComplexity.dynamicComplexity >> (pool, 'dynamicComplexity')
        dynamicComplexity.loudness >> (pool, 'loudness')
        # duration
        duration = Duration()
        loader.audio >> duration.signal
        duration.duration >> (pool, 'duration')
        # effective duration
        effectiveDuration = EffectiveDuration()
        accu.array >> effectiveDuration.signal
        effectiveDuration.effectiveDuration >> (pool, 'effectiveDuration')
        # logattacktime
        log = LogAttackTime()
        accu.array >> log.signal
        log.logAttackTime >> (pool, 'logattacktime')

        # ---------------------- framebased features ------------------------ #

        # spectral centroid
        sc = Centroid()
        spec.spectrum >> sc.array
        sc.centroid >> (pool, 'spectralcentroid')
        # mfcc
        mfcc = MFCC(numberCoefficients=13)
        spec.spectrum >> mfcc.spectrum
        mfcc.bands >> None  # not included in feature vector
        mfcc.mfcc >> (pool, 'mfcc')
        # pitchYinFFT
        pitch = PitchYinFFT()
        spec.spectrum >> pitch.spectrum
        pitch.pitchConfidence >> (pool, 'pitchConfidence')
        pitch.pitch >> (pool, 'pitch')

        # ------------------ finished network connection -------------------- #
        # ------------------------------------------------------------------- #

        # start feature extraction
        essentia.run(loader)

        # aggregate results
        # logattacktime and effectiveDuration are global features, but in
        # streaming mode they would be aggregated like framebased features;
        # the 'copy' statistic keeps their single values as-is
        # (see the standalone sketch after this example)
        aggrPool = PoolAggregator(defaultStats=stats,
                                  exceptions={
                                      'logattacktime': ['copy'],
                                      'effectiveDuration': ['copy']
                                  })(pool)
        self._pool = pool
        self.features = aggrPool
        self.feature_names = aggrPool.descriptorNames()
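
A standalone sketch of the 'copy' aggregation used above, in standard mode; the pool keys and values here are made up purely for illustration.

import essentia
from essentia.standard import PoolAggregator

pool = essentia.Pool()
# framebased descriptor: several values accumulate over the frames
for value in [301.1, 301.0, 304.9]:
    pool.add('pitch', value)
# global descriptor: a single value, added once
pool.add('logattacktime', -1.7)

aggrPool = PoolAggregator(defaultStats=['mean', 'var'],
                          exceptions={'logattacktime': ['copy']})(pool)
print(sorted(aggrPool.descriptorNames()))
# expected keys: e.g. ['logattacktime', 'pitch.mean', 'pitch.var'] -- the
# 'copy' exception passes logattacktime through instead of computing statistics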
Example #4
def melspectrogram(filename,
                   npy_file=None,
                   force=False,
                   verbose=False,
                   sample_rate=SAMPLE_RATE,
                   frame_size=FRAME_SIZE,
                   hop_size=HOP_SIZE,
                   window_type=WINDOW_TYPE,
                   zero_padding=ZERO_PADDING,
                   low_frequency_bound=LOW_FREQUENCY_BOUND,
                   high_frequency_bound=HIGH_FREQUENCY_BOUND,
                   number_bands=NUMBER_BANDS,
                   warping_formula=WARPING_FORMULA,
                   weighting=WEIGHTING,
                   normalize=NORMALIZE,
                   bands_type=BANDS_TYPE,
                   compression_type=COMPRESSION_TYPE):
    """Computes the mel spectrogram given the audio filename.
    When the parameter `npy_file` is specified, the data is saved to disk as a numpy array (.npy).
    Use the parameter `force` to overwrite the numpy array in case it already exists.
    The rest of parameters are directly mapped to Essentia algorithms as explained below.

    Note: this functionality is also available as a command line script.

    Parameters:
        sample_rate:
        real ∈ (0,inf) (default = 44100)
        the desired output sampling rate [Hz]

        frame_size:
        integer ∈ [1,inf) (default = 1024)
        the output frame size

        hop_size:
        integer ∈ [1,inf) (default = 512)
        the hop size between frames

        window_type:
        string ∈ {hamming,hann,hannnsgcq,triangular,square,blackmanharris62,blackmanharris70,blackmanharris74,blackmanharris92} (default = "hann")
        the window type, which can be 'hamming', 'hann', 'triangular', 'square' or 'blackmanharrisXX'

        zero_padding:
        integer ∈ [0,inf) (default = 0)
        the size of the zero-padding

        low_frequency_bound:
        real ∈ [0,inf) (default = 0)
        a lower-bound limit for the frequencies to be included in the bands

        high_frequency_bound:
        real ∈ [0,inf) (default = 22050)
        an upper-bound limit for the frequencies to be included in the bands

        number_bands:
        integer ∈ (1,inf) (default = 24)
        the number of output bands

        warping_formula:
        string ∈ {slaneyMel,htkMel} (default = "htkMel")
        The scale implementation type: 'htkMel' scale from the HTK toolkit [2, 3]
        (default) or 'slaneyMel' scale from the Auditory toolbox [4]

        weighting:
        string ∈ {warping,linear} (default = "warping")
        type of weighting function for determining triangle area

        normalize:
        string ∈ {unit_sum,unit_tri,unit_max} (default = "unit_sum")
        spectrum bin weights to use for each mel band: 'unit_max' to make each mel
        band vertex equal to 1, 'unit_sum' to make each mel band area equal to 1
        summing the actual weights of spectrum bins, 'unit_tri' to make each
        triangle mel band area equal to 1 normalizing the weights of each triangle
        by its bandwidth

        bands_type:
        string ∈ {magnitude,power} (default = "power")
        'power' to output squared units, 'magnitude' to keep it as the input

        compression_type:
        string ∈ {dB,shift_scale_log,none} (default = "shift_scale_log")
        the compression type to use.
        'shift_scale_log' is log10(10000 * x + 1)
        'dB' is 10 * log10(x)

    Returns:
        (2D array): The mel-spectrogram.
    """

    padded_size = frame_size + zero_padding
    spectrum_size = padded_size // 2 + 1

    # In case we want to save the melbands to a file
    # check if the file already exists
    if npy_file:
        if not npy_file.endswith('.npy'):
            npy_file += '.npy'

        if not force:
            if os.path.exists(npy_file):
                if verbose:
                    print('Skipping "{}"'.format(npy_file))
                return

    pool = Pool()

    loader = MonoLoader(filename=filename, sampleRate=sample_rate)
    frameCutter = FrameCutter(frameSize=frame_size, hopSize=hop_size)
    # None of the mel bands extraction methods we have seen requires
    # window-level normalization.
    w = Windowing(zeroPadding=zero_padding, type=window_type, normalized=False)
    spec = Spectrum(size=padded_size)
    mels = MelBands(inputSize=spectrum_size,
                    numberBands=number_bands,
                    sampleRate=sample_rate,
                    lowFrequencyBound=low_frequency_bound,
                    highFrequencyBound=high_frequency_bound,
                    warpingFormula=warping_formula,
                    weighting=weighting,
                    normalize=normalize,
                    type=bands_type,
                    log=False)  # Do not compute any compression here.
    # Compression is applied afterwards via the UnaryOperator stages below,
    # so a new compression type can be added there if required.

    if compression_type.lower() == 'db':
        shift = UnaryOperator(type='identity')
        compressor = UnaryOperator(type='lin2db')

    elif compression_type.lower() == 'shift_scale_log':
        shift = UnaryOperator(type='identity', scale=1e4, shift=1)
        compressor = UnaryOperator(type='log10')

    elif compression_type.lower() == 'none':
        shift = UnaryOperator(type='identity')
        compressor = UnaryOperator(type='identity')

    else:
        raise ValueError(
            'Unknown compression_type "{}"'.format(compression_type))

    loader.audio >> frameCutter.signal
    frameCutter.frame >> w.frame >> spec.frame
    spec.spectrum >> mels.spectrum
    mels.bands >> shift.array >> compressor.array >> (pool, 'mel_bands')

    run(loader)

    mel_bands = np.array(pool['mel_bands'])

    if npy_file:
        np.save(npy_file, mel_bands)

    if verbose:
        print('Done for "{}"'.format(npy_file))

    return mel_bands
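
A short usage sketch for the function above; 'audio.wav' is a placeholder path, and the numpy line simply restates the 'shift_scale_log' formula from the docstring.

import numpy as np

mel = melspectrogram('audio.wav')   # module-level default parameters
print(mel.shape)                    # (num_frames, number_bands)

# 'shift_scale_log' is log10(10000 * x + 1) applied to the uncompressed band
# energies, so this should match `mel` up to float32 rounding:
raw = melspectrogram('audio.wav', compression_type='none')
mel_again = np.log10(1e4 * raw + 1)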