Example #1
        self.outputs['mfcc'] = mfcc.mfcc
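The class definition above arrives truncated in this listing. For context, here is a minimal sketch of the composite it plausibly came from, following Essentia's documented CompositeBase pattern (the frame/window/spectrum/MFCC chain and the parameter values are assumptions):

import essentia
import essentia.streaming

class ExtractorMfcc(essentia.streaming.CompositeBase):

    def __init__(self, frameSize=2048, hopSize=1024, windowType='blackmanharris62'):
        super(ExtractorMfcc, self).__init__()
        # declare the inner algorithms of the composite
        fc = essentia.streaming.FrameCutter(frameSize=frameSize, hopSize=hopSize)
        w = essentia.streaming.Windowing(type=windowType)
        spec = essentia.streaming.Spectrum()
        mfcc = essentia.streaming.MFCC()
        # wire the inner network
        fc.frame >> w.frame
        w.frame >> spec.frame
        spec.spectrum >> mfcc.spectrum
        mfcc.bands >> None  # discard the mel-bands output
        # expose the endpoints under the names used below
        self.inputs['audio'] = fc.signal
        self.outputs['mfcc'] = mfcc.mfcc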


if __name__ == '__main__':
    # Make sure the command was well-formed.
    if len(sys.argv) < 3:
        print('Usage: extractor_mfcc.py <input audio filename> <output yaml filename>')
        sys.exit(1)

    # Loaders must be specified outside your composite algorithm.
    loader = essentia.streaming.MonoLoader(filename=sys.argv[1])

    # We are using the default values of our parameters so we don't specify any keyword arguments.
    mfccex = ExtractorMfcc()

    p = essentia.Pool()

    # When connecting to/from your composite algorithm, use the names you declared in the
    # self.inputs and self.outputs dictionaries, respectively.
    loader.audio >> mfccex.audio
    mfccex.mfcc >> (p, 'mfcc')

    essentia.run(loader)

    # CompositeBase algorithms can be translated into C++ code, and dot graphs
    # can also be generated:
    essentia.translate(ExtractorMfcc,     # algorithm to be translated
                       'myExtractorMfcc', # output name for the c++ and dot generated files
                       dot_graph=True)    # whether dot file should be generated
    essentia.standard.YamlOutput(filename=sys.argv[2])(p)
Example #2
def featureExtractFile(curFile):
    import sys
    import numpy
    import essentia
    from essentia.streaming import MonoLoader
    from essentia.streaming import LowLevelSpectralExtractor
    from essentia.standard import YamlOutput
    from essentia.standard import YamlInput
    from essentia.standard import PoolAggregator
    from essentia.streaming import FrameCutter
    from essentia.streaming import AutoCorrelation
    import pickle
    filename = '/home/user/Desktop/soundsDB2/classifier/featureExtractionEssentia/frameSize.npz'
    npz = numpy.load(filename)
    frameSize = int(npz['frameSize'])
    # and instantiate our algorithms
    loader = MonoLoader(filename=curFile, sampleRate=8000)
    framecutter = FrameCutter(frameSize=frameSize, hopSize=frameSize // 4)
    autoCorrelator = AutoCorrelation()

    lowLevelExtractor = LowLevelSpectralExtractor(frameSize=frameSize,
                                                  hopSize=frameSize // 4,
                                                  sampleRate=8000)

    pool = essentia.Pool()
    loader.audio >> lowLevelExtractor.signal
    lowLevelExtractor.barkbands >> (pool, curFile[:-4] + '.barkbands')
    lowLevelExtractor.barkbands_kurtosis >> (pool, curFile[:-4] +
                                             '.barkbands_kurtosis')
    lowLevelExtractor.barkbands_skewness >> (pool, curFile[:-4] +
                                             '.barkbands_skewness')
    lowLevelExtractor.barkbands_spread >> (pool,
                                           curFile[:-4] + '.barkbands_spread')
    lowLevelExtractor.hfc >> (pool, curFile[:-4] + '.hfc')
    lowLevelExtractor.mfcc >> (pool, curFile[:-4] + '.mfcc')
    lowLevelExtractor.pitch >> (pool, curFile[:-4] + '.pitch')
    lowLevelExtractor.pitch_instantaneous_confidence >> (
        pool, curFile[:-4] + '.pitch_instantaneous_confidence')
    lowLevelExtractor.pitch_salience >> (pool,
                                         curFile[:-4] + '.pitch_salience')
    lowLevelExtractor.silence_rate_20dB >> (pool, curFile[:-4] +
                                            '.silence_rate_20dB')
    lowLevelExtractor.silence_rate_30dB >> (pool, curFile[:-4] +
                                            '.silence_rate_30dB')
    lowLevelExtractor.silence_rate_60dB >> (pool, curFile[:-4] +
                                            '.silence_rate_60dB')
    lowLevelExtractor.spectral_complexity >> (pool, curFile[:-4] +
                                              '.spectral_complexity')
    lowLevelExtractor.spectral_crest >> (pool,
                                         curFile[:-4] + '.spectral_crest')
    lowLevelExtractor.spectral_decrease >> (pool, curFile[:-4] +
                                            '.spectral_decrease')
    lowLevelExtractor.spectral_energy >> (pool,
                                          curFile[:-4] + '.spectral_energy')
    lowLevelExtractor.spectral_energyband_low >> (pool, curFile[:-4] +
                                                  '.spectral_energyband_low')
    lowLevelExtractor.spectral_energyband_middle_low >> (
        pool, curFile[:-4] + '.spectral_energyband_middle_low')
    lowLevelExtractor.spectral_energyband_middle_high >> (
        pool, curFile[:-4] + '.spectral_energyband_middle_high')
    lowLevelExtractor.spectral_energyband_high >> None
    lowLevelExtractor.spectral_flatness_db >> (pool, curFile[:-4] +
                                               '.spectral_flatness_db')
    lowLevelExtractor.spectral_flux >> (pool, curFile[:-4] + '.spectral_flux')
    lowLevelExtractor.spectral_rms >> (pool, curFile[:-4] + '.spectral_rms')
    lowLevelExtractor.spectral_rolloff >> (pool,
                                           curFile[:-4] + '.spectral_rolloff')
    lowLevelExtractor.spectral_strongpeak >> (pool, curFile[:-4] +
                                              '.spectral_strongpeak')
    lowLevelExtractor.zerocrossingrate >> (pool,
                                           curFile[:-4] + '.zerocrossingrate')
    lowLevelExtractor.inharmonicity >> (pool, curFile[:-4] + '.inharmonicity')
    lowLevelExtractor.tristimulus >> (pool, curFile[:-4] + '.tristimulus')
    lowLevelExtractor.oddtoevenharmonicenergyratio >> (
        pool, curFile[:-4] + '.oddtoevenharmonicenergyratio')

    #mfcc.bands >> (pool, curFile[:-4]+'.mfccBands')
    #mfcc.mfcc >> (pool, curFile[:-4]+'.mfcc')

    essentia.run(loader)
    aggrPool = PoolAggregator(defaultStats=[
        'min', 'max', 'median', 'mean', 'var', 'skew', 'kurt', 'dmean', 'dvar'
    ])(pool)
    #aggrPool = PoolAggregator(defaultStats = ['min', 'max', 'mean', 'var'])(pool)
    YamlOutput(filename=curFile[:-4] + 'trainingFeatures.yaml',
               format="yaml")(aggrPool)
    essentia.reset(loader)
    return
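A hypothetical driver for the function above (the directory path is an assumption), looping over a folder of .wav files:

import os

soundsDir = '/home/user/Desktop/soundsDB2'  # hypothetical folder
for name in os.listdir(soundsDir):
    if name.endswith('.wav'):
        featureExtractFile(os.path.join(soundsDir, name))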
Example #3
    if options.input_file is None:
        print(usage)
        sys.exit(1)
    return options, args


if __name__ == '__main__':
    import sys, os.path, essentia
    options, args = parse_args()
    input_file = options.input_file

    # load audio file
    audio_file = essentia.AudioFileInput(filename=input_file)
    audio = audio_file()
    sampleRate = 44100.
    pool = essentia.Pool(input_file)

    if options.ground_truth_file is not None:
        import yaml
        if 'CLoader' in dir(yaml):
            load = lambda x: yaml.load(x, yaml.CLoader)
            load_all = lambda x: yaml.load_all(x, yaml.CLoader)
        else:
            load = yaml.load
            load_all = yaml.load_all
        if 'CDumper' in dir(yaml):
            dump = lambda x: yaml.dump(x, Dumper=yaml.CDumper)
        else:
            dump = yaml.dump
        with open(options.ground_truth_file) as f:
            metadata = load(f)
        # add ground truth to pool
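The listing cuts off here. A hedged sketch of the missing step, assuming metadata is a flat mapping from descriptor names to values:

        # hypothetical continuation: copy each ground-truth entry into the pool
        for name, value in metadata.items():
            pool.add('ground_truth.' + name, value)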
Example #4
def compute(audio, pool, options):

    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # temporal descriptors
    lpc = essentia.LPC(order=10, type='warped', sampleRate=sampleRate)
    zerocrossingrate = essentia.ZeroCrossingRate()

    # frame algorithms
    frames = essentia.FrameGenerator(audio=audio,
                                     frameSize=frameSize,
                                     hopSize=hopSize)
    window = essentia.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = essentia.Spectrum(size=frameSize)

    # spectral algorithms
    barkbands = essentia.BarkBands(sampleRate=sampleRate)
    centralmoments = essentia.SpectralCentralMoments()
    crest = essentia.Crest()
    centroid = essentia.SpectralCentroid()
    decrease = essentia.SpectralDecrease()
    spectral_contrast = essentia.SpectralContrast(frameSize=frameSize,
                                                  sampleRate=sampleRate,
                                                  numberBands=6,
                                                  lowFrequencyBound=20,
                                                  highFrequencyBound=11000,
                                                  neighbourRatio=0.4,
                                                  staticDistribution=0.15)
    distributionshape = essentia.DistributionShape()
    energy = essentia.Energy()
    # energyband_bass, energyband_middle and energyband_high parameters come from "standard" hi-fi equalizers
    energyband_bass = essentia.EnergyBand(startCutoffFrequency=20.0,
                                          stopCutoffFrequency=150.0,
                                          sampleRate=sampleRate)
    energyband_middle_low = essentia.EnergyBand(startCutoffFrequency=150.0,
                                                stopCutoffFrequency=800.0,
                                                sampleRate=sampleRate)
    energyband_middle_high = essentia.EnergyBand(startCutoffFrequency=800.0,
                                                 stopCutoffFrequency=4000.0,
                                                 sampleRate=sampleRate)
    energyband_high = essentia.EnergyBand(startCutoffFrequency=4000.0,
                                          stopCutoffFrequency=20000.0,
                                          sampleRate=sampleRate)
    flatnessdb = essentia.FlatnessDB()
    flux = essentia.Flux()
    harmonic_peaks = essentia.HarmonicPeaks()
    hfc = essentia.HFC()
    mfcc = essentia.MFCC()
    rolloff = essentia.RollOff()
    rms = essentia.RMS()
    strongpeak = essentia.StrongPeak()

    # pitch algorithms
    pitch_detection = essentia.PitchDetection(frameSize=frameSize,
                                              sampleRate=sampleRate)
    pitch_salience = essentia.PitchSalience()

    # dissonance
    spectral_peaks = essentia.SpectralPeaks(sampleRate=sampleRate,
                                            orderBy='frequency')
    dissonance = essentia.Dissonance()

    # spectral complexity
    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame
    spectral_complexity = essentia.SpectralComplexity(magnitudeThreshold=0.005)

    INFO('Computing Low-Level descriptors...')

    # used for a nice progress display
    total_frames = frames.num_frames()
    n_frames = 0
    start_of_frame = -frameSize * 0.5

    pitches, pitch_confidences = [], []

    progress = Progress(total=total_frames)

    scPool = essentia.Pool()  # pool for spectral contrast

    for frame in frames:

        frameScope = [
            start_of_frame / sampleRate,
            (start_of_frame + frameSize) / sampleRate
        ]
        #pool.setCurrentScope(frameScope)

        # silence rate
        pool.add(namespace + '.' + 'silence_rate_60dB',
                 essentia.isSilent(frame))
        pool.add(namespace + '.' + 'silence_rate_30dB',
                 is_silent_threshold(frame, -30))
        pool.add(namespace + '.' + 'silence_rate_20dB',
                 is_silent_threshold(frame, -20))

        if options['skipSilence'] and essentia.isSilent(frame):
            total_frames -= 1
            start_of_frame += hopSize
            continue

        # temporal descriptors
        pool.add(namespace + '.' + 'zerocrossingrate', zerocrossingrate(frame))
        (frame_lpc, frame_lpc_reflection) = lpc(frame)
        pool.add(namespace + '.' + 'temporal_lpc', frame_lpc)

        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)

        # spectrum-based descriptors
        power_spectrum = frame_spectrum**2
        pool.add(namespace + '.' + 'spectral_centroid',
                 centroid(power_spectrum))
        pool.add(namespace + '.' + 'spectral_decrease',
                 decrease(power_spectrum))
        pool.add(namespace + '.' + 'spectral_energy', energy(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_low',
                 energyband_bass(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_low',
                 energyband_middle_low(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_high',
                 energyband_middle_high(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_high',
                 energyband_high(frame_spectrum))
        pool.add(namespace + '.' + 'hfc', hfc(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rms', rms(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_flux', flux(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rolloff', rolloff(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_strongpeak',
                 strongpeak(frame_spectrum))

        # central moments descriptors
        frame_centralmoments = centralmoments(power_spectrum)
        (frame_spread, frame_skewness,
         frame_kurtosis) = distributionshape(frame_centralmoments)
        pool.add(namespace + '.' + 'spectral_kurtosis', frame_kurtosis)
        pool.add(namespace + '.' + 'spectral_spread', frame_spread)
        pool.add(namespace + '.' + 'spectral_skewness', frame_skewness)

        # dissonance
        (frame_frequencies, frame_magnitudes) = spectral_peaks(frame_spectrum)
        frame_dissonance = dissonance(frame_frequencies, frame_magnitudes)
        pool.add(namespace + '.' + 'dissonance', frame_dissonance)

        # mfcc
        (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
        pool.add(namespace + '.' + 'mfcc', frame_mfcc)

        # spectral contrast
        (sc_coeffs, sc_valleys) = spectral_contrast(frame_spectrum)
        scPool.add(namespace + '.' + 'sccoeffs', sc_coeffs)
        scPool.add(namespace + '.' + 'scvalleys', sc_valleys)

        # barkbands-based descriptors
        frame_barkbands = barkbands(frame_spectrum)
        pool.add(namespace + '.' + 'barkbands', frame_barkbands)
        pool.add(namespace + '.' + 'spectral_crest', crest(frame_barkbands))
        pool.add(namespace + '.' + 'spectral_flatness_db',
                 flatnessdb(frame_barkbands))
        barkbands_centralmoments = essentia.CentralMoments(
            range=len(frame_barkbands) - 1)
        (barkbands_spread, barkbands_skewness,
         barkbands_kurtosis) = distributionshape(
             barkbands_centralmoments(frame_barkbands))
        pool.add(namespace + '.' + 'barkbands_spread', barkbands_spread)
        pool.add(namespace + '.' + 'barkbands_skewness', barkbands_skewness)
        pool.add(namespace + '.' + 'barkbands_kurtosis', barkbands_kurtosis)

        # pitch descriptors
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)
        if frame_pitch > 0 and frame_pitch <= 20000.:
            pool.add(namespace + '.' + 'pitch', frame_pitch)
        pitches.append(frame_pitch)
        pitch_confidences.append(frame_pitch_confidence)
        pool.add(namespace + '.' + 'pitch_instantaneous_confidence',
                 frame_pitch_confidence)

        frame_pitch_salience = pitch_salience(frame_spectrum[:-1])
        pool.add(namespace + '.' + 'pitch_salience', frame_pitch_salience)

        # spectral complexity
        pool.add(namespace + '.' + 'spectral_complexity',
                 spectral_complexity(frame_spectrum))

        # display of progress report
        progress.update(n_frames)

        n_frames += 1
        start_of_frame += hopSize

    # if there is no 'zerocrossingrate' descriptor, the whole file was silent
    if 'zerocrossingrate' not in descriptorNames(pool.descriptorNames(),
                                                 namespace):
        raise essentia.EssentiaError('This is a silent file!')

    spectralContrastPCA(scPool, pool)

    # build pitch value histogram
    from math import log
    from numpy import bincount
    # convert from Hz to MIDI note numbers
    midipitches = []
    unknown = 0
    for freq in pitches:
        if freq > 0. and freq <= 12600:
            # 0.69314718055995 is ln(2), so this is 12*log2(freq/6.875) - 3,
            # which maps 440 Hz to MIDI note 69; round to integer note numbers
            # so that bincount() below can build a histogram
            midipitches.append(int(round(
                12 * (log(freq / 6.875) / 0.69314718055995) - 3.)))
        else:
            unknown += 1

    if len(midipitches) > 0:
        # compute histogram (bincount needs integer note numbers)
        midipitchhist = bincount(midipitches)
        # set the 0 midi pitch bin to the number of pruned values
        midipitchhist[0] = unknown
        # normalise
        midipitchhist = [
            val / float(sum(midipitchhist)) for val in midipitchhist
        ]
        # zero pad
        for i in range(128 - len(midipitchhist)):
            midipitchhist.append(0.0)
    else:
        midipitchhist = [0.] * 128
        midipitchhist[0] = 1.

    # pitchhist = essentia.array(zip(range(len(midipitchhist)), midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram',
             midipitchhist)  #, pool.GlobalScope)

    # the code below is the same as the one above:
    #for note in midipitchhist:
    #    pool.add(namespace + '.' + 'spectral_pitch_histogram_values', note)
    #    print "midi note:", note

    pitch_centralmoments = essentia.CentralMoments(range=len(midipitchhist) -
                                                   1)
    (pitch_histogram_spread, pitch_histogram_skewness,
     pitch_histogram_kurtosis) = distributionshape(
         pitch_centralmoments(midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram_spread',
             pitch_histogram_spread)  #, pool.GlobalScope)

    progress.finish()
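The helper is_silent_threshold() used in the silence-rate block above is not shown in this listing. A sketch in the style of Essentia's legacy extractor scripts (the instantPower-based definition is an assumption):

def is_silent_threshold(frame, silence_threshold_dB):
    # a frame counts as silent when its instant power is below the dB threshold
    p = essentia.instantPower(frame)
    silence_threshold = pow(10.0, silence_threshold_dB / 10.0)
    if p < silence_threshold:
        return 1.0
    else:
        return 0.0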
def getStereoPanningSpectrum(_audio):

    w_l = Windowing(type='hann')
    spectrum_l = FFT(size=kN)

    w_r = Windowing(type='hann')
    spectrum_r = FFT(size=kN)

    stereoDemuxer = StereoDemuxer()

    pool = essentia.Pool()

    rms = RMS()

    freq_1 = int(np.round(250 * (kN + 2) / kSampleRate))
    freq_2 = int(np.round(2500 * (kN + 2) / kSampleRate))

    left, right = stereoDemuxer(_audio)

    if not np.any(right):
        right = left

    frame_l = FrameGenerator(left, frameSize=kN, hopSize=kN // 2)
    frame_r = FrameGenerator(right, frameSize=kN, hopSize=kN // 2)

    for _frame_l, _frame_r in zip(frame_l, frame_r):

        # Calculates Stereo Panning Spectrum

        l = spectrum_l(w_l(_frame_l))
        r = spectrum_r(w_r(_frame_r))

        phi_l = np.abs(l * np.conj(r)) / (np.abs(l)**2)

        phi_r = np.abs(r * np.conj(l)) / (np.abs(r)**2)

        phi = 2 * np.abs(l * np.conj(r)) / (np.abs(l)**2 + np.abs(r)**2)

        delta = phi_l - phi_r

        # per-bin sign of the panning difference: +1, -1 or 0
        delta_ = np.sign(delta)

        SPS = (1 - phi) * delta_
        SPS = essentia.array(SPS)
        pool.add('panning.SPS', SPS)

        P_total = rms(SPS)
        P_low = rms(SPS[0:freq_1])
        P_medium = rms(SPS[freq_1:freq_2])
        P_high = rms(SPS[freq_2::])

        pool.add('panning.P_total', P_total)
        pool.add('panning.P_low', P_low)
        pool.add('panning.P_medium', P_medium)
        pool.add('panning.P_high', P_high)

        #Calculates Stereo Phase Spread:

        frequencies = np.linspace(1, kN // 2 + 1,
                                  kN // 2 + 1) * kSampleRate / (kN + 2)

        erb = erbScale(30, 11025, 40)

        phase_l = np.angle(l)
        phase_r = np.angle(r)
        mag_l = np.abs(l)
        mag_r = np.abs(r)
        pool2 = essentia.Pool()

        for erb_f0 in erb:

            freqs = np.asarray([])

            for f in frequencies:

                if find_nearest(erb, f) == erb_f0:
                    freqs = np.append(freqs, f)
                elif freqs.size != 0:
                    break

            freq1 = int(np.round(freqs[0] * (kN + 2) / kSampleRate))
            freq2 = int(np.round(freqs[-1] * (kN + 2) / kSampleRate))

            if freq2 == kN / 2:
                freq2 = freq2 + 1

            S_l = np.cos(2 * np.pi * (freqs / kSampleRate) +
                         phase_l[freq1 - 1:freq2])
            S_r = np.cos(2 * np.pi * (freqs / kSampleRate) +
                         phase_r[freq1 - 1:freq2])

            a_weight = np.mean(mag_l[freq1 - 1:freq2] + mag_r[freq1 - 1:freq2])

            delta_lr = a_weight * np.std(S_l - S_r) / np.std(S_l + S_r)

            if freq2 - freq1 == 0:

                #delta_lr = a_weight * np.mean(S_l - S_r) / np.mean(S_l + S_r)
                delta_lr = 0

            pool2.add('a', delta_lr)

        pool.add('panning.SSPS', pool2['a'])

    return pool
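getStereoPanningSpectrum() relies on two helpers not shown here, erbScale() and find_nearest(). A plausible definition of the latter, a standard nearest-value lookup (implementation assumed):

import numpy as np

def find_nearest(array, value):
    # return the element of array closest to value
    array = np.asarray(array)
    return array[np.abs(array - value).argmin()]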
Example #6
def analyze_hp(filename, segment_duration=20):

    lowlevelFrameSize = 2048
    lowlevelHopSize = 1024
    tonalFrameSize = 4096
    tonalHopSize = 1024

    # Compute replay gain and duration on the entire file, then load the
    # segment that is centered in time with replaygain applied
    audio = es.MonoLoader(filename=filename)()
    replaygain = es.ReplayGain()(audio)

    segment_start = (len(audio) / 44100 - segment_duration) / 2
    segment_end = segment_start + segment_duration

    if segment_start < 0 or segment_end > len(audio) / 44100:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    loader = es.EasyLoader(filename=filename,
                           replayGain=replaygain,
                           startTime=segment_start,
                           endTime=segment_end)
    window = es.Windowing(type='blackmanharris62')
    fft = es.FFT()

    stft = []

    audio = loader()
    for frame in es.FrameGenerator(audio,
                                   frameSize=lowlevelFrameSize,
                                   hopSize=lowlevelHopSize):
        stft.append(fft(window(frame)))

    # Librosa requires bins x frames format
    stft = np.array(stft).T

    D_harmonic, D_percussive = librosa.decompose.hpss(stft, margin=8)
    D_percussive_magnitude, _ = librosa.magphase(D_percussive)
    D_harmonic_magnitude, _ = librosa.magphase(D_harmonic)

    # Convert back to Essentia format (frames x bins)
    spectrum_harmonic = D_harmonic_magnitude.T
    spectrum_percussive = D_percussive_magnitude.T

    # Processing for Mel bands
    melbands = es.MelBands(numberBands=96,
                           lowFrequencyBound=0,
                           highFrequencyBound=11025)

    # Normalize Mel bands: log10(1+x*10000)
    norm = es.UnaryOperator(type='identity', shift=1, scale=10000)
    log10 = es.UnaryOperator(type='log10')

    p = essentia.Pool()

    for spectrum_frame in spectrum_harmonic:
        p.add('melbands_harmonic', log10(norm(melbands(spectrum_frame))))

    for spectrum_frame in spectrum_percussive:
        p.add('melbands_percussive', log10(norm(melbands(spectrum_frame))))

    return p
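A hypothetical call to analyze_hp() (the filename is an assumption); each pool key accumulates one 96-band mel vector per frame:

p = analyze_hp('/path/to/track.mp3')
print(p['melbands_harmonic'].shape)    # (n_frames, 96)
print(p['melbands_percussive'].shape)  # (n_frames, 96)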
Example #7
def essentia_midi(file):
    pool = essentia.Pool()

    # Compute all features, aggregate only 'mean' and 'stdev' statistics for all low-level, rhythm and tonal frame features
    features, features_frames = es.MusicExtractor(
        lowlevelStats=['mean', 'stdev'],
        rhythmStats=['mean', 'stdev'],
        tonalStats=['mean', 'stdev'])(file)

    # You can then access particular values in the pools:
    print("Filename:", features['metadata.tags.file_name'])
    print("-" * 80)
    print("Replay gain:", features['metadata.audio_properties.replay_gain'])
    print("EBU128 integrated loudness:",
          features['lowlevel.loudness_ebu128.integrated'])
    print("EBU128 loudness range:",
          features['lowlevel.loudness_ebu128.loudness_range'])
    print("-" * 80)
    print("MFCC mean:", features['lowlevel.mfcc.mean'])
    print("-" * 80)
    print("BPM:", features['rhythm.bpm'])
    print("Beat positions (sec.)", features['rhythm.beats_position'])
    print("-" * 80)
    print(
        "Key/scale estimation (using a profile specifically suited for electronic music):",
        features['tonal.key_edma.key'], features['tonal.key_edma.scale'])

    # BPM Detection

    # Loading audio file
    audio = MonoLoader(filename=file)()

    # # Compute beat positions and BPM
    rhythm_extractor = RhythmExtractor2013(method="multifeature")
    bpm, beats, beats_confidence, _, beats_intervals = rhythm_extractor(audio)

    beat_volume_extractor = BeatsLoudness(beats=beats)
    beats_loudness, beats_loudness_band_ratio = beat_volume_extractor(audio)

    # Danceability Detection
    danceability_extractor = Danceability()
    danceability, dfa = danceability_extractor(audio)

    # Melody Detection
    # Load audio file; it is recommended to apply equal-loudness filter for PredominantPitchMelodia
    loader = EqloudLoader(filename=file, sampleRate=44100)
    audio = loader()
    print("Duration of the audio sample [sec]:")
    print(len(audio) / 44100.0)

    pitch_extractor = PredominantPitchMelodia(frameSize=2048, hopSize=1024)
    pitch_values, pitch_confidence = pitch_extractor(audio)

    midi_extractor = PitchContourSegmentation(hopSize=1024)
    onset, duration, midi_pitch = midi_extractor(pitch_values, audio)

    # Pitch is estimated on frames. Compute frame time positions
    pitch_times = numpy.linspace(0.0, len(audio) / 44100.0, len(pitch_values))

    #Storing in Pool
    pool.add('MIDIonset', onset)
    pool.add('MIDIduration', duration)
    pool.add('MIDIpitch', midi_pitch)
    pool.add('pitch', pitch_values)
    pool.add('danceability', danceability)
    pool.add('beat-loudness', beats_loudness)
    pool.add('beats', beats)
    pool.add('bpm', bpm)

    output = YamlOutput(
        filename='./analyzer/output.json',
        format='json',  # use format='json' for JSON instead of the default YAML
        indent=4,
        writeVersion=False)
    output(pool)
Example #8
File: main.py, Project: cepko33/AlgoRhythm
def pickleToPool(nparr):
    pool = essentia.Pool()
    for tup in nparr:
        pool.add(tup[0], tup[1])
    return pool
    def cleaningSineTracks(self, pool, minFrames):
        """
        Cleans the sine tracks identified based on the minimum number of frames identified
        reference: https://github.com/MTG/essentia/blob/b5b46f80d80058603a525af36cbf7069c17c3df9/
        test/src/unittests/synthesis/test_sinemodel_streaming.py

        :param pool: must contain pool["magnitudes"], pool["frequencies"] and pool["phases"]
        :param minFrames: minimum number of frames required for a sine track to be valid
        :return: cleaned up pool
        """

        freqsTotal = pool["frequencies"]
        nFrames = freqsTotal.shape[0]
        begTrack = 0
        freqsClean = freqsTotal.copy()

        if (nFrames > 0):

            f = 0
            nTracks = freqsTotal.shape[
                1]  # we assume all frames have a fixed number of tracks

            for t in range(nTracks):

                f = 0
                begTrack = f

                while (f < nFrames - 1):

                    # check if f is the beginning of a track
                    if (freqsClean[f][t] <= 0 and freqsClean[f + 1][t] > 0):
                        begTrack = f + 1

                    # clean track if shorter than min duration
                    if ((freqsClean[f][t] > 0 and freqsClean[f + 1][t] <= 0)
                            and ((f - begTrack) < minFrames)):
                        for i in range(begTrack, f + 1):
                            freqsClean[i][t] = 0

                    f += 1

        cleaned_pool = essentia.Pool()

        for frame_ix, originalTracks in enumerate(freqsTotal):
            freqs = []
            mags = []
            phases = []
            for track_ix, freqTrack in enumerate(originalTracks):
                if freqTrack in freqsClean[frame_ix]:
                    freqs.append(pool["frequencies"][frame_ix][track_ix])
                    mags.append(pool["magnitudes"][frame_ix][track_ix])
                    phases.append(pool["phases"][frame_ix][track_ix])
                else:
                    freqs.append(0)
                    mags.append(0)
                    phases.append(0)
            cleaned_pool.add("frequencies", essentia.array(freqs))
            cleaned_pool.add("magnitudes", essentia.array(mags))
            cleaned_pool.add("phases", essentia.array(phases))

        return cleaned_pool
Example #10
def compute_features(complete_path):
    result = []
    meta_result = []
    file_count = 0
    # for loop over files
    for file in os.listdir(complete_path):
        if file.endswith(".wav"):
            file_count+=1
            # print(file +' : ' + str(file_count))

            # load our audio into an array
            audio = es.MonoLoader(filename=complete_path + file, sampleRate=44100)()

            # create the pool and the necessary algorithms
            pool = essentia.Pool()
            window = es.Windowing()
            energy = es.Energy()
            spectrum = es.Spectrum()
            centroid = es.Centroid(range=22050)
            rolloff = es.RollOff()
            crest = es.Crest()
            speak = es.StrongPeak()
            rmse = es.RMS()
            mfcc = es.MFCC()
            flux = es.Flux()
            barkbands = es.BarkBands(sampleRate=44100)
            zerocrossingrate = es.ZeroCrossingRate()

            meta = es.MetadataReader(filename=complete_path + file, failOnError=True)()
            pool_meta, duration, bitrate, samplerate, channels = meta[7:]
            
            # centralmoments = es.SpectralCentralMoments()
            # distributionshape = es.DistributionShape()

            # compute the centroid for all frames in our audio and add it to the pool
            for frame in es.FrameGenerator(audio, frameSize = 1024, hopSize = 512):
                frame_windowed = window(frame)
                frame_spectrum = spectrum(frame_windowed)
                
                c = centroid(frame_spectrum)
                pool.add('spectral.centroid', c)

                cr = crest(frame_spectrum)
                pool.add('spectral crest', cr)

                r = rolloff(frame_spectrum)
                pool.add('spectral rolloff', r)

                sp = speak(frame_spectrum)
                pool.add('strong peak', sp)

                rms = rmse(frame_spectrum)
                pool.add('RMS', rms)

                pool.add('spectral_energy', energy(frame_spectrum))
                # (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
                # pool.add('frame_MFCC', frame_mfcc)

                fl = flux(frame_spectrum)
                pool.add('spectral flux', fl)

                # bbands = barkbands(frame_spectrum)
                # pool.add('bark bands', bbands)

                # zero crossing rate is a time-domain descriptor, so use the raw frame
                zcr = zerocrossingrate(frame)
                pool.add('zero crossing rate', zcr)

                # frame_centralmoments = centralmoments(power_spectrum)
                # (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
                # pool.add('spectral_kurtosis', frame_kurtosis)
                # pool.add('spectral_spread', frame_spread)
                # pool.add('spectral_skewness', frame_skewness)

            # aggregate the results (add 'stdev' to defaultStats if needed)
            aggrpool = es.PoolAggregator(defaultStats=['mean'])(pool)
            
            pool_meta.set("duration", duration)
            pool_meta.set("filename", os.path.relpath(file))

            # write pools to lists
            pool_arr = pool_to_array(aggrpool)
            result.append(pool_arr)

            meta_arr = pool_to_array(pool_meta)
            meta_result.append(meta_arr)
         
    features_df = pd.DataFrame.from_records(result)
    features_df.columns = ['centroid', 'crest','roll off','strong peak','rms','energy','flux','zcr']
    
    meta_df = pd.DataFrame.from_records(meta_result)
    meta_df.columns = ['duration','filename','metadata.tags.comment']
    del meta_df['metadata.tags.comment']

    return features_df,meta_df
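The helper pool_to_array() used above is not part of this listing. A minimal sketch, assuming it flattens the pool's single-value descriptors in descriptor-name order:

def pool_to_array(pool):
    # collect each descriptor's value in the pool's descriptor-name order
    return [pool[name] for name in pool.descriptorNames()]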
    def analysis_synthesis_spr_model_standard(self, params, signal):

        pool = essentia.Pool()
        #   Standard-mode algos for sine model analysis
        w = es.Windowing(type="hann")
        fft = es.FFT(size=params['fftSize'])
        smanal = es.SineModelAnal(
            sampleRate=params['sampleRate'],
            maxnSines=params['maxnSines'],
            magnitudeThreshold=params['magnitudeThreshold'],
            freqDevOffset=params['freqDevOffset'],
            freqDevSlope=params['freqDevSlope'])

        #   Standard-mode algos for sine model synthesis
        smsyn = es.SineModelSynth(sampleRate=params['sampleRate'],
                                  fftSize=params['frameSize'],
                                  hopSize=params['hopSize'])
        ifft = es.IFFT(size=params['frameSize'])
        overlSine = es.OverlapAdd(frameSize=params['frameSize'],
                                  hopSize=params['hopSize'],
                                  gain=1. / params['frameSize'])
        overlres = es.OverlapAdd(frameSize=params['frameSize'],
                                 hopSize=params['hopSize'],
                                 gain=1. / params['frameSize'])

        fft_original = []

        # analysis
        for frame in es.FrameGenerator(signal,
                                       frameSize=params["frameSize"],
                                       hopSize=params["hopSize"]):
            frame_fft = fft(w(frame))
            fft_original.append(frame_fft)
            freqs, mags, phases = smanal(frame_fft)
            pool.add("frequencies", freqs)
            pool.add("magnitudes", mags)
            pool.add("phases", phases)

        # remove short tracks
        minFrames = int(params['minSineDur'] * params['sampleRate'] /
                        params['hopSize'])
        pool = self.cleaningSineTracks(pool, minFrames)

        # synthesis
        sineTracksAudio = np.array([])
        resTracksAudio = np.array([])
        for frame_ix, _ in enumerate(pool["frequencies"]):
            sine_frame_fft = smsyn(pool["magnitudes"][frame_ix],
                                   pool["frequencies"][frame_ix],
                                   pool["phases"][frame_ix])
            res_frame_fft = fft_original[frame_ix] - sine_frame_fft
            sine_outframe = overlSine(ifft(sine_frame_fft))
            sineTracksAudio = np.append(sineTracksAudio, sine_outframe)
            res_outframe = overlres(ifft(res_frame_fft))
            resTracksAudio = np.append(resTracksAudio, res_outframe)

        sineTracksAudio = sineTracksAudio.flatten()[-len(signal):]
        resTracksAudio = resTracksAudio.flatten()[-len(signal):]

        #print("len signal", len(signal), "len res", len(resTracksAudio))
        return essentia.array(signal), essentia.array(
            sineTracksAudio), essentia.array(resTracksAudio)
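A hypothetical parameter dictionary for analysis_synthesis_spr_model_standard(); every value is an assumption, loosely following common sine-model settings:

params = {
    'sampleRate': 44100,
    'fftSize': 2048,
    'frameSize': 2048,
    'hopSize': 512,
    'maxnSines': 100,
    'magnitudeThreshold': -74,  # dB
    'freqDevOffset': 10,        # Hz
    'freqDevSlope': 0.001,
    'minSineDur': 0.02,         # seconds
}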
Example #12
def extractFeatures(audio_data):
    """
  Recebe um vetor de reais representando um sinal de áudio, calcula suas 
  features, agrega-as em uma Pool() de essentia e retorna esta Pool
  """
    from numpy import ndarray
    assert (type(audio_data) is ndarray)
    assert ("float" in str(audio_data.dtype))

    # Initialize the Pool()
    output_pool = es.Pool()

    # Compute the signal spectrum
    output_pool.set(pk_spectrum, es_mode.Spectrum()(audio_data))

    # Compute EnergyBandRatio
    energy_band_ratio = es_mode.EnergyBandRatio()(output_pool[pk_spectrum])
    output_pool.set(pk_energy_band_ratio, energy_band_ratio)

    # Compute MaxMagFreq
    max_mag_freq = es_mode.MaxMagFreq()(output_pool[pk_spectrum])
    output_pool.set(pk_max_mag_freq, max_mag_freq)

    # Compute SpectralCentroidTime
    spectral_centroid_time = es_mode.SpectralCentroidTime()(audio_data)
    output_pool.set(pk_spectral_centroid_time, spectral_centroid_time)

    # Compute SpectralComplexity
    spectral_complexity = es_mode.SpectralComplexity()(
        output_pool[pk_spectrum])
    output_pool.set(pk_spectral_complexity, spectral_complexity)

    # Compute StrongPeak
    strong_peak = es_mode.StrongPeak()(output_pool[pk_spectrum])
    output_pool.set(pk_strong_peak, strong_peak)

    # Compute SpectralPeaks
    sp_freq, sp_mag = es_mode.SpectralPeaks()(output_pool[pk_spectrum])
    # drop the DC peak, if present, as required by HarmonicPeaks
    if sp_freq[0] == 0:
        sp_freq = sp_freq[1:]
        sp_mag = sp_mag[1:]
    output_pool.set(pk_spectral_peaks_freq, sp_freq)
    output_pool.set(pk_spectral_peaks_mag, sp_mag)

    ######################################
    #        For Inharmonicity           #
    ######################################
    # Compute PitchYinFFT
    pitch_yin_fft, pitch_prob_yin_fft = es_mode.PitchYinFFT()(
        output_pool[pk_spectrum])
    output_pool.set(pk_pitch, pitch_yin_fft)
    output_pool.set(pk_pitch_prob, pitch_prob_yin_fft)

    # Compute HarmonicPeaks
    hp_freq, hp_mag = es_mode.HarmonicPeaks()(output_pool[pk_spectral_peaks_freq],\
                                              output_pool[pk_spectral_peaks_mag],\
                                              output_pool[pk_pitch] )
    output_pool.set(pk_harmonic_peaks_freq, hp_freq)
    output_pool.set(pk_harmonic_peaks_mag, hp_mag)

    # Compute Inharmonicity
    inharmonicity = es_mode.Inharmonicity()(output_pool[pk_harmonic_peaks_freq],\
                                            output_pool[pk_harmonic_peaks_mag])
    output_pool.set(pk_inharmonicity, inharmonicity)

    # End of Inharmonicity ##################################

    # Compute SpectralContrast
    frame_size = 2 * (output_pool[pk_spectrum].size - 1)
    spectral_contrast, spectral_valley = \
        es_mode.SpectralContrast(frameSize=frame_size)(output_pool[pk_spectrum])
    output_pool.set(pk_spectral_contrast, spectral_contrast)
    output_pool.set(pk_spectral_valley, spectral_valley)

    # Compute SpectralWhitening
    spectral_whitening = \
                es_mode.SpectralWhitening()(output_pool[pk_spectrum],\
                                            output_pool[pk_spectral_peaks_freq],\
                                            output_pool[pk_spectral_peaks_mag])
    output_pool.set(pk_spectral_whitening, spectral_whitening)

    return output_pool
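The pk_* pool keys used throughout extractFeatures() are assumed to be module-level string constants defined elsewhere; a plausible set (names hypothetical):

pk_spectrum = 'spectrum'
pk_energy_band_ratio = 'energy_band_ratio'
pk_max_mag_freq = 'max_mag_freq'
pk_spectral_centroid_time = 'spectral_centroid_time'
pk_spectral_complexity = 'spectral_complexity'
pk_strong_peak = 'strong_peak'
pk_spectral_peaks_freq = 'spectral_peaks_freq'
pk_spectral_peaks_mag = 'spectral_peaks_mag'
pk_pitch = 'pitch'
pk_pitch_prob = 'pitch_prob'
pk_harmonic_peaks_freq = 'harmonic_peaks_freq'
pk_harmonic_peaks_mag = 'harmonic_peaks_mag'
pk_inharmonicity = 'inharmonicity'
pk_spectral_contrast = 'spectral_contrast'
pk_spectral_valley = 'spectral_valley'
pk_spectral_whitening = 'spectral_whitening'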
Example #13
def main():

    aparser = argparse.ArgumentParser()
    aparser.add_argument(
        '-c',
        action='store',
        dest='config',
        help=
        '-c type of the dataset. For ex: _1s_h100 for 1s with full length hop')
    aparser.add_argument('-t',
                         action='store',
                         dest='data_type',
                         help='-t type of data original/harmonic/residual')

    args = aparser.parse_args()
    if not args.config:
        aparser.error('Please specify the data config!')

    conf = args.config
    if args.data_type == 'original':
        path_to_dataset = PATH_TO_ORIGINAL_WAV_FILES + conf
        path_to_features = PATH_TO_ORIGINAL_FEATURES + conf
    elif args.data_type == 'residual':
        path_to_dataset = PATH_TO_RESIDUAL_WAV_FILES + conf
        path_to_features = PATH_TO_RESIDUAL_FEATURES + conf
    elif args.data_type == 'harmonic':
        path_to_dataset = PATH_TO_HARMONIC_WAV_FILES + conf
        path_to_features = PATH_TO_HARMONIC_FEATURES + conf
    else:
        aparser.error('Please specify a valid data type (original/harmonic/residual)!')

    datasets = sorted(os.listdir(path_to_dataset))
    for dataset in datasets:
        empty_files = 0
        print("[Dataset] : " + dataset)
        folder_path = os.path.join(path_to_dataset, dataset)
        lrms = sorted(os.listdir(folder_path))
        for channel in lrms:
            channel_path = os.path.join(folder_path, channel)
            sub_folders = sorted(os.listdir(channel_path))
            for sub_folder in sub_folders:
                sub_folder_path = os.path.join(channel_path, sub_folder)
                files = sorted(os.listdir(sub_folder_path))
                for filename in files:
                    filepath = os.path.join(sub_folder_path, filename)
                    features = essentia.Pool()
                    try:
                        # Compute all features, aggregate only 'mean' and 'stdev' statistics for all low-level, rhythm and tonal frame features
                        features, features_frames = es.MusicExtractor(
                            lowlevelSilentFrames='drop',
                            lowlevelFrameSize=2048,
                            lowlevelHopSize=1024,
                            lowlevelStats=['mean', 'stdev'])(filepath)
                        features_frames = []
                    except RuntimeError as e:
                        print(filepath + " is almost silent")
                        empty_files += 1
                    dump_path = os.path.join(path_to_features, dataset,
                                             channel, sub_folder)
                    create_folder(dump_path)
                    es.YamlOutput(filename=os.path.join(
                        dump_path, filename.replace('.wav', '.json')),
                                  format='json')(features)
                    features = []
                    filename = []
        print("Feature Extraction Completed Successfully for " + dataset)
        print("Total number of empty file in " + dataset + " is " +
              str(empty_files))
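The create_folder() helper called above is not shown; a minimal sketch (implementation assumed):

import os

def create_folder(path):
    # create the directory (and any parents) if it does not exist yet
    if not os.path.exists(path):
        os.makedirs(path)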
def compute(profile, inputFilename, outputFilename, userOptions = {}):

    # load profile
    profileDirectory = __file__.split(os.path.sep)[:-1]
    profileDirectory.append('profiles')
    profileDirectory.append('%s_config.yaml' % profile)

    try:
        # try to load the predefined profile, if it exists
        config = open(os.path.sep.join(profileDirectory), 'r').read()
    except:
        # otherwise, just load the file that was specified
        config = open(profile, 'r').read()

    options = yaml.load(config)
    mergeRecursiveDict(options, userOptions)

    # which format for the output?
    format = options['outputFormat']
    if format not in ['xml', 'yaml']:
        raise essentia.EssentiaError("output format should be either 'xml' or 'yaml'")
    xmlOutput = (format == 'xml')

    # we need this for dependencies checking
    options['computed'] = []
    options['generatedBy'] = {}

    # get list of extractors to compute
    extractors = options['extractors']

    # create pool & megalopool
    pool = essentia.Pool()

    # load audio file into memory
    audio = loadAudioFile(inputFilename, pool, options)

    # preprocess audio by applying a DC filter, normalization, etc...
    # preprocessing is a special step because it modifies the audio, hence it
    # must be executed before all the other extractors
    audio = preProcess(audio, pool, options, 'metadata')
    options['globalPreprocessing'] = options['preprocessing']
    del options['preprocessing']

    # process all extractors
    computeAllExtractors(extractors, audio, pool, options)

    # process segmentation if asked
    if options['segmentation']['doSegmentation']:
        segments = segmentation.compute(inputFilename, audio, pool, options)

    # remove unwanted descriptors
    wantedStats = cleanStats(pool, options)

    # add to megalopool
    #megalopool = essentia.Pool()
    scope = [ 0.0, len(audio)/options['sampleRate'] ]
    #megalopool.add('global', pool.aggregate_descriptors(wantedStats))#, scope)
    megalopool = essentia.PoolAggregator(exceptions=wantedStats)(pool)
    # special case for spectral contrast, which is only 1 matrix, therefore no
    # stats are computed:
    spectral_contrast_stats(megalopool, 'lowlevel.spectral_contrast', wantedStats)

    # plotting descriptors evolution
    try:
        if options['plots']:
            import plotting
            plotting.compute(inputFilename, audio, pool, options)
    except KeyError: pass

    # compute extractors on segments
    if options['segmentation']['doSegmentation']:
        if options['segmentation']['computeSegments']:
            if len(segments) == 0:
                megalopool.add('void', [0])
            else:
                computeSegments(audio, segments, extractors, megalopool, options)

    # save to output file
    essentia.YamlOutput(filename=outputFilename)(megalopool)
Example #15
plt.pcolormesh(np.array(mfccs))
plt.show()
"""
"""
# and let's do it in a more essentia-like way:
mfccs = []
for frame in ess.FrameGenerator(audio, frameSize = 1024, hopSize = 512):
    mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
    mfccs.append(mfcc_coeffs)

# transpose to have it in a better shape
mfccs = ess.array(mfccs).T
"""

# So let's redo the previous using a Pool
pool = es.Pool()
for frame in ess.FrameGenerator(audio, frameSize=1024, hopSize=512):
    mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
    pool.add('lowlevel.mfcc', mfcc_coeffs)
    pool.add('lowlevel.mfcc_bands', mfcc_bands)
"""
plotMfcc = pool['lowlevel.mfcc'].T[1:,:]
plt.pcolormesh(plotMfcc)
plt.show()
"""

#output = es.YamlOutput(filename = 'mfcc.sig')
output = ess.YamlOutput(filename='joeTestOut/mfcc.json', format='json')
output(pool)

# Say we're not interested in all the MFCC frames, but just their mean & variance.
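The listing truncates here; the natural continuation (as in the Essentia tutorial this snippet mirrors) aggregates the pool, e.g.:

# hypothetical continuation: keep only the mean and variance of each descriptor
aggrPool = ess.PoolAggregator(defaultStats=['mean', 'var'])(pool)
print(aggrPool['lowlevel.mfcc.mean'])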
Example #16
import os
import essentia as e
import essentia.streaming as estr

# CONFIGURATION
# ================================================================================

# Default parameters
sample_rate = 44100
window_size = 16384
hop_size = 8192
tuning_frequency = 440

# retrieve filenames from folder (audio_folder is assumed to be defined earlier):
soundfiles = os.listdir(audio_folder)
if '.DS_Store' in soundfiles:
    soundfiles.remove('.DS_Store')
    
# ANALYSIS
# ================================================================================
print "\nANALYSIS..."
for item in soundfiles:
    loader = estr.MonoLoader(filename=audio_folder + '/' + item, sampleRate=sample_rate)
    tuningExtractor = estr.TuningFrequencyExtractor(frameSize=window_size, hopSize=hop_size)
    pool = e.Pool()
    loader.audio >> tuningExtractor.signal
    tuningExtractor.tuningFrequency >> (pool, 'tuning_reference')
    # run and print the results.
    e.run(loader)
    result = pool['tuning_reference']
    print(item[:20] + '...     ', result)
Example #17
    def extractFeatures(
            self,
            audio,
            scale="onsets",
            listOfFeatures=['Loudness', 'Centroid', 'Flatness', 'BFCC']):
        """Extract features from an audio vector.
        
        This tends to be pretty slow for onset based segmentation and retrieval
        
        :param audio: the audio to extract features from
        
        :param scale: the temporal scale we wish to use
        
        :return: 
        
            features: the list of audio features
        
            units: If FFT scale, then the fft frames also  
        """

        pool = essentia.Pool()
        medianPool = essentia.Pool()

        centroid = flatness = loudness = pitchYinFFT = None
        mfcc = bfcc = gfcc = spectralPeaks = hpcp = None

        if 'Centroid' in listOfFeatures:
            centroid = essentia.standard.Centroid(range=self.sampleRate / 2)
        if 'Flatness' in listOfFeatures:
            flatness = essentia.standard.Flatness()
        if 'Loudness' in listOfFeatures:
            loudness = essentia.standard.Loudness()
        if 'Pitch' in listOfFeatures:
            pitchYinFFT = essentia.standard.PitchYinFFT()

        if 'MFCC' in listOfFeatures:
            mfcc = essentia.standard.MFCC(inputSize=int(self.frameSize / 2 +
                                                        1))
        if 'BFCC' in listOfFeatures:
            bfcc = essentia.standard.BFCC(inputSize=int(self.frameSize / 2 +
                                                        1))
        if 'GFCC' in listOfFeatures:
            gfcc = essentia.standard.GFCC(inputSize=int(self.frameSize / 2 +
                                                        1))
        if 'HPCP' in listOfFeatures:
            spectralPeaks = essentia.standard.SpectralPeaks(
                orderBy="magnitude",
                magnitudeThreshold=1e-05,
                minFrequency=40,
                maxFrequency=5000,
                maxPeaks=10000)
            hpcp = essentia.standard.HPCP()

        fft = essentia.standard.FFT()
        magnitude = essentia.standard.Magnitude()
        w = essentia.standard.Windowing(type='blackmanharris62')

        features = []
        units = []

        f = []

        # #Manual framecutting is faster than Essentia in Python
        # for fstart in range(0, len(audio) - self.frameSize, self.hopSize):
        #     #Get the frame
        #     frame = audio[fstart:fstart + self.frameSize]

        for frame in essentia.standard.FrameGenerator(audio,
                                                      frameSize=self.frameSize,
                                                      hopSize=self.hopSize):

            #FFT and Magnitude Spectrum
            fft_frame = fft(w(frame))
            mag = magnitude(fft_frame)

            if centroid is not None:
                centroidScalar = centroid(mag)
                pool.add("Centroid", centroidScalar)
            if flatness is not None:
                flatnessScalar = flatness(mag)
                pool.add("Flatness", flatnessScalar)
            if loudness is not None:
                loudnessScalar = loudness(frame)
                pool.add("Loudness", loudnessScalar)
            if pitchYinFFT is not None:
                pitchScalar, pitchConfidenceScalar = pitchYinFFT(mag)
                # pool.add("pitch", pitchScalar)
                medianPool.add("Pitch", pitchScalar)

            import time

            startTime = time.time()

            if mfcc is not None:
                mfcc_bands, mfccVector = mfcc(mag)
                pool.add("MFCC", mfccVector[1:])
            if bfcc is not None:
                bfcc_bands, bfccVector = bfcc(mag)
                pool.add("BFCC", bfccVector[1:])
            if gfcc is not None:
                gfcc_bands, gfccVector = gfcc(mag)
                pool.add("GFCC", gfccVector[1:])
            if hpcp is not None:
                frequencies, magnitudes = spectralPeaks(mag)
                hpcpVector = hpcp(frequencies, magnitudes)

                pool.add("HPCP", hpcpVector)

                f.append(hpcpVector)

            elapsedTime = time.time() - startTime

            x = pool.descriptorNames()

            #If we are spectral-based we need to return the fft frames as units and the framewise features
            if scale == "spectral":
                units.append(fft_frame)

                frameFeatures = []
                """
                We do it this roundabout way to retain the order that user wants in listOfFeatures
                """
                for feature in listOfFeatures:
                    for descriptor in pool.descriptorNames():
                        if feature in descriptor:
                            frameFeatures = np.append(frameFeatures,
                                                      (pool[descriptor]))

                    for descriptor in medianPool.descriptorNames():
                        if feature in descriptor:
                            frameFeatures = np.append(frameFeatures,
                                                      (medianPool[descriptor]))

                features.append(frameFeatures)
                pool.clear()
                medianPool.clear()

        #Now we get all the stuff out of the pool
        if scale != "spectral":
            # aggrPool = essentia.standard.PoolAggregator(defaultStats=['mean', 'var'])(pool)
            aggrPool = essentia.standard.PoolAggregator(
                defaultStats=['mean'])(pool)
            medianAggrPool = essentia.standard.PoolAggregator(
                defaultStats=['median'])(medianPool)
            """
            We do it this roundabout way to retain the order that user wants in listOfFeatures
            """
            for feature in listOfFeatures:
                for aggrFeature in aggrPool.descriptorNames():
                    if feature in aggrFeature:
                        if "mean" in aggrFeature or "var" in aggrFeature:
                            features = np.append(features,
                                                 aggrPool[aggrFeature])
                        else:
                            features += aggrPool[aggrFeature][0]

                #Median based features (i.e. pitch)
                for medianFeature in medianAggrPool.descriptorNames():
                    if feature in medianFeature:
                        if "median" in medianFeature:
                            features = np.append(features,
                                                 medianAggrPool[medianFeature])
                        else:
                            features += medianAggrPool[medianFeature][0]

            aggrPool.merge(medianAggrPool)

        #Return features, and if it's spectral return the frames as units
        return features, units, pool
def load_audio(type='mono'):

    raw_audio = OrderedDict()
    stem_audio = OrderedDict()

    if 'mono' in type:
        # loads raw audio
        loader = MonoLoader()
        for name in gNameTracks:
            path = gRawPath[name]
            loader.configure(filename=path)
            pool = essentia.Pool()
            loader.audio >> (pool, 'loader.audio')
            essentia.run(loader)

            print('Raw track contains %d samples of Audio' %
                  len(pool['loader.audio']))

            raw_audio[name] = pool['loader.audio']

            essentia.reset(loader)

        # loads stem audio
        for name in gNameTracks:
            path = gStemPath[name]
            loader.configure(filename=path)
            pool = essentia.Pool()
            loader.audio >> (pool, 'loader.audio')
            essentia.run(loader)

            print('Stem track contains %d samples of Audio' %
                  len(pool['loader.audio']))

            stem_audio[name] = pool['loader.audio']

            essentia.reset(loader)

    elif 'stereo' in type:

        # loads raw audio Stereo:
        for name in gNameTracks:
            path = gRawPath[name]
            loader = AudioLoader(filename=path)
            pool = essentia.Pool()
            loader.audio >> (pool, 'loader.audio')
            loader.sampleRate >> None
            loader.numberChannels >> None
            loader.md5 >> None
            loader.bit_rate >> None
            loader.codec >> None
            essentia.run(loader)

            print('Raw Stereo track contains %d samples of Audio' %
                  len(pool['loader.audio']))

            raw_audio[name] = pool['loader.audio']

            essentia.reset(loader)

        # loads stem stereo
        for name in gNameTracks:
            path = gStemStereoPath[name]
            loader = AudioLoader(filename=path)
            pool = essentia.Pool()
            loader.audio >> (pool, 'loader.audio')
            loader.sampleRate >> None
            loader.numberChannels >> None
            loader.md5 >> None
            loader.bit_rate >> None
            loader.codec >> None
            essentia.run(loader)

            print('Stem Stereo track contains %d samples of Audio' %
                  len(pool['loader.audio']))

            stem_audio[name] = pool['loader.audio']

            essentia.reset(loader)

    return raw_audio, stem_audio
Example #19
    def analyseFile(
            self,
            file,
            writeOnsets,
            scale="onsets",
            yamlOutputFile="",
            onsetDetection="",
            listOfFeatures=['Loudness', 'Centroid', 'Flatness', 'MFCC']):
        """Extract onsets from a single file then extract features from all those onsets
        
        :param file: the file to analyse
        
        :param writeOnsets: whether you want to write the audio onsets to the filesystem
        
        :param scale: the temporal scale: None, spectral, onsets, beats
         
        :return:
        
            features : lists of lists of features
            
            units : list of audio signals corresponding to units
            
            unitTimes: the list of transient times from the audio signals
            
        """

        onsetTimes = []
        onsets = []
        fileName = file

        filePool = essentia.Pool()

        print("Processing file: " + file)

        if enableDebug:
            self.debugFile.write(file + "\n")

        #Extract onsets or add the audio as a single onset
        print("    Onset Detection and Segmentation...")
        if scale == "beats":
            onsetTimes, onsets, fileName = self.extractBeats(file)
        elif scale == "onsets":
            onsetTimes, onsets, fileName = self.extractAndSliceOnsets(
                file, method=onsetDetection)
        else:
            onsetTimes.append(0.0)
            audio = self.loadAudio(file)
            onsets.append(audio)

        #Optionally write these onsets out
        if writeOnsets:
            fileNames = self.writeOnsets(onsets, file)

        features = []
        units = []

        print("    Feature Extraction...")

        for onsetTime, onset in zip(onsetTimes, onsets):
            onsetFeatures, onsetFFTs, onsetPool = self.extractFeatures(
                onset, scale, listOfFeatures=listOfFeatures)

            # If the scale is spectral, the spectra themselves are the units
            if scale == "spectral":
                units += onsetFFTs
                features += onsetFeatures
            else:
                features.append(onsetFeatures)

            onsetPool.add("onsetTimes", onsetTime)
            filePool.merge(onsetPool, "append")

        if scale != "spectral":
            units = onsets

        if yamlOutputFile != "":
            essentia.standard.YamlOutput(filename=yamlOutputFile)(filePool)

        return features, units, onsetTimes
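
A hypothetical usage sketch for the method above; the class name `Analyser` and the file paths are placeholders, only the `analyseFile` signature itself comes from the example:

# Hypothetical driver code: 'Analyser' is an assumed class name.
analyser = Analyser()
features, units, onsetTimes = analyser.analyseFile(
    'drums.wav',
    writeOnsets=False,
    scale='onsets',
    yamlOutputFile='drums_features.yaml')
print('Extracted %d units' % len(units))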
Example #20
def analyze(filename, segment_duration=20):

    lowlevelFrameSize = 2048
    lowlevelHopSize = 1024
    tonalFrameSize = 4096
    tonalHopSize = 1024

    # Compute replay gain and duration on the entire file, then load the
    # segment that is centered in time with replaygain applied
    audio = es.MonoLoader(filename=filename)()
    replaygain = es.ReplayGain()(audio)

    # MonoLoader resamples to 44100 Hz by default, hence the hardcoded rate
    segment_start = (len(audio) / 44100 - segment_duration) / 2
    segment_end = segment_start + segment_duration

    if segment_start < 0 or segment_end > len(audio) / 44100:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    # TODO
    # There's a bug in streaming mode Python wrapper: running both Mel and HPCP
    # in the same network with the same loader will result in a memory error.
    # This does not happen in C++. As a workaround, compute Mel and HPCP in
    # two separate networks with two separate loaders.

    loader_mel = EasyLoader(filename=filename,
                            replayGain=replaygain,
                            startTime=segment_start,
                            endTime=segment_end)
    loader_hpcp = EasyLoader(filename=filename,
                             replayGain=replaygain,
                             startTime=segment_start,
                             endTime=segment_end)

    # Processing for Mel bands
    framecutter_mel = FrameCutter(frameSize=lowlevelFrameSize,
                                  hopSize=lowlevelHopSize)
    window_mel = Windowing(type='blackmanharris62')
    spectrum_mel = Spectrum()
    melbands = MelBands(numberBands=96,
                        lowFrequencyBound=0,
                        highFrequencyBound=11025)

    # Processing for HPCPs
    framecutter_hpcp = FrameCutter(frameSize=tonalFrameSize,
                                   hopSize=tonalHopSize)
    window_hpcp = Windowing(type='blackmanharris62')
    spectrum_hpcp = Spectrum()
    speaks = SpectralPeaks(maxPeaks=60,
                           magnitudeThreshold=0.00001,
                           minFrequency=20.0,
                           maxFrequency=3500.0,
                           orderBy='magnitude')

    # Normalize Mel bands: log10(1 + 10000 * x)
    # (UnaryOperator applies the scale first, then the shift)
    norm = UnaryOperator(type='identity', shift=1, scale=10000)
    log10 = UnaryOperator(type='log10')

    hpcp = HPCP(size=12,
                bandPreset=False,
                minFrequency=20.0,
                maxFrequency=3500.0,
                weightType='cosine',
                windowSize=1.)

    p = essentia.Pool()

    loader_mel.audio >> framecutter_mel.signal
    framecutter_mel.frame >> window_mel.frame >> spectrum_mel.frame
    spectrum_mel.spectrum >> melbands.spectrum
    melbands.bands >> norm.array >> log10.array >> (p, 'melbands')
    essentia.run(loader_mel)

    loader_hpcp.audio >> framecutter_hpcp.signal
    framecutter_hpcp.frame >> window_hpcp.frame >> spectrum_hpcp.frame
    spectrum_hpcp.spectrum >> speaks.spectrum
    speaks.frequencies >> hpcp.frequencies
    speaks.magnitudes >> hpcp.magnitudes
    hpcp.hpcp >> (p, 'hpcp')
    essentia.run(loader_hpcp)

    return p
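
A minimal usage sketch; the path is a placeholder, and the input is assumed to be a 44.1 kHz file at least `segment_duration` seconds long:

# 'track.wav' is a placeholder path.
pool = analyze('track.wav', segment_duration=20)
mel = pool['melbands']  # frames x 96 log-compressed Mel bands
chroma = pool['hpcp']   # frames x 12 HPCP vectors
print(mel.shape, chroma.shape)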
Example #21
def analsynthHarmonicModelStreaming(params, signal):

    out = array([0.])

    pool = essentia.Pool()
    # windowing and FFT
    fcut = es.FrameCutter(frameSize=params['frameSize'],
                          hopSize=params['hopSize'],
                          startFromZero=False)
    w = es.Windowing(type="blackmanharris92")
    fft = es.FFT(size=params['frameSize'])
    spec = es.Spectrum(size=params['frameSize'])

    # pitch detection
    pitchDetect = es.PitchYinFFT(frameSize=params['frameSize'],
                                 sampleRate=params['sampleRate'])

    smanal = es.HarmonicModelAnal(
        sampleRate=params['sampleRate'],
        maxnSines=params['maxnSines'],
        magnitudeThreshold=params['magnitudeThreshold'],
        freqDevOffset=params['freqDevOffset'],
        freqDevSlope=params['freqDevSlope'],
        minFrequency=params['minFrequency'],
        maxFrequency=params['maxFrequency'])
    smsyn = es.SineModelSynth(sampleRate=params['sampleRate'],
                              fftSize=params['frameSize'],
                              hopSize=params['hopSize'])
    ifft = es.IFFT(size=params['frameSize'])
    overl = es.OverlapAdd(frameSize=params['frameSize'],
                          hopSize=params['hopSize'])

    # pad the input with half a window of zeros so the output has the same length
    signal = numpy.append(signal, zeros(params['frameSize'] // 2))
    insignal = VectorInput(signal)

    # analysis
    insignal.data >> fcut.signal
    fcut.frame >> w.frame
    w.frame >> spec.frame
    w.frame >> fft.frame
    spec.spectrum >> pitchDetect.spectrum

    fft.fft >> smanal.fft
    pitchDetect.pitch >> smanal.pitch
    pitchDetect.pitchConfidence >> (pool, 'pitchConfidence')
    smanal.magnitudes >> (pool, 'magnitudes')
    smanal.frequencies >> (pool, 'frequencies')
    smanal.phases >> (pool, 'phases')

    # synthesis
    smanal.magnitudes >> smsyn.magnitudes
    smanal.frequencies >> smsyn.frequencies
    smanal.phases >> smsyn.phases
    smsyn.fft >> ifft.fft

    ifft.frame >> overl.frame
    overl.signal >> (pool, 'audio')

    essentia.run(insignal)

    # remove short tracks
    freqs = pool['frequencies']

    minFrames = int(params['minSineDur'] * params['sampleRate'] /
                    params['hopSize'])
    freqsClean = cleaningSineTracks(freqs, minFrames)
    pool['frequencies'].data = freqsClean

    # remove first half window frames
    outaudio = pool['audio']
    outaudio = outaudio[2 * params['hopSize']:]

    return outaudio, pool
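
A sketch of the `params` dictionary this function expects, inferred from the keys accessed above; the values shown are illustrative only, and `signal` is assumed to be a mono float32 array:

# Illustrative values; only the keys are dictated by the function.
params = {
    'sampleRate': 44100,
    'frameSize': 2048,
    'hopSize': 512,
    'maxnSines': 100,
    'magnitudeThreshold': -74,
    'freqDevOffset': 10,
    'freqDevSlope': 0.001,
    'minFrequency': 20,
    'maxFrequency': 22050,
    'minSineDur': 0.02,
}
outaudio, pool = analsynthHarmonicModelStreaming(params, signal)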
Example #22
def analyze_mel(filename,
                segment_duration=None,
                maxFrequency=11025,
                replaygain=True):
    lowlevelFrameSize = 2048
    lowlevelHopSize = 1024

    # Compute replay gain and duration on the entire file, then load the
    # segment that is centered in time with replaygain applied
    audio = es.MonoLoader(filename=filename)()

    if replaygain:
        replaygain = es.ReplayGain()(audio)
    else:
        replaygain = -6  # Default replaygain value in EasyLoader

    if segment_duration:
        segment_start = (len(audio) / 44100 - segment_duration) / 2
        segment_end = segment_start + segment_duration
    else:
        segment_start = 0
        segment_end = len(audio) / 44100

    if segment_start < 0 or segment_end > len(audio) / 44100:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    loader_mel = EasyLoader(filename=filename,
                            replayGain=replaygain,
                            startTime=segment_start,
                            endTime=segment_end)

    # Processing for Mel bands
    framecutter_mel = FrameCutter(frameSize=lowlevelFrameSize,
                                  hopSize=lowlevelHopSize)
    window_mel = Windowing(type='blackmanharris62',
                           zeroPadding=lowlevelFrameSize)

    spectrum_mel = Spectrum()

    # Build one MelBands + normalization chain per band resolution
    numbersOfBands = [128, 96, 48, 32, 24, 16, 8]

    melbands = {}
    norm = {}
    log10 = {}
    for n in numbersOfBands:
        melbands[n] = MelBands(numberBands=n,
                               lowFrequencyBound=0,
                               highFrequencyBound=maxFrequency,
                               inputSize=lowlevelFrameSize + 1)
        # Normalize Mel bands: log10(1 + 10000 * x)
        # (UnaryOperator applies the scale first, then the shift)
        norm[n] = UnaryOperator(type='identity', shift=1, scale=10000)
        log10[n] = UnaryOperator(type='log10')

    p = essentia.Pool()

    loader_mel.audio >> framecutter_mel.signal
    framecutter_mel.frame >> window_mel.frame >> spectrum_mel.frame

    for n in numbersOfBands:
        spectrum_mel.spectrum >> melbands[n].spectrum
        melbands[n].bands >> norm[n].array >> log10[n].array >> (p,
                                                                 'mel%d' % n)

    essentia.run(loader_mel)

    return p
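
A usage sketch; the path is a placeholder. Each pool key holds the same Mel spectrogram at a different band resolution:

pool = analyze_mel('track.wav', segment_duration=None, maxFrequency=11025)
for key in ('mel128', 'mel96', 'mel48', 'mel32', 'mel24', 'mel16', 'mel8'):
    print(key, pool[key].shape)  # (numFrames, numberBands)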
Example #23
def reComputeDescriptors(inputAudioFile, outputJsonFile):
    """
    :param inputAudioFile:
    :param outputJsonFile:
    :return:
    """

    # Reference parameter sets:
    #   sms-tools original: M = N = 1024, H = 512, fs = 44100, W = 'hann'
    #   Freesound extractor (C++): sampleRate = 44100, frameSize = 2048,
    #     hopSize = 1024, zeroPadding = 0, silentFrames = "noise",
    #     windowType = "blackmanharris62", silence-rate thresholds of
    #     -20, -30 and -60 dB, converted with db2lin(threshold / 2.0)

    M = 2048
    N = 2048
    H = 1024
    fs = 44100

    W = 'blackmanharris62'
    #silentFrames = "noise"
    #thresholds_dB = np.array([ -20, -30, -60 ])
    #thresholds = np.power (10.0, thresholds_dB / 20)

    #spectrum = ess.Spectrum(size=N)
    spectrum = ess.Spectrum()
    #window = ess.Windowing(size=M, type=W)
    window = ess.Windowing(type=W)
    #mfcc = ess.MFCC(numberCoefficients=12, inputSize=N/2+1)
    mfcc = ess.MFCC()

    spectral_peaks = ess.SpectralPeaks(minFrequency=1,
                                       maxFrequency=20000,
                                       maxPeaks=100,
                                       sampleRate=fs,
                                       magnitudeThreshold=0,
                                       orderBy="magnitude")

    dissonance = ess.Dissonance()

    #pitch_detection = ess.PitchYinFFT(frameSize=M, sampleRate=fs)
    pitch_detection = ess.PitchYinFFT()

    harmonic_peaks = ess.HarmonicPeaks()

    inharmonicity = ess.Inharmonicity()

    #spectral_contrast = ess.SpectralContrast(sampleRate=fs)
    spectral_contrast = ess.SpectralContrast()

    centroid = ess.Centroid()

    log_attack_time = ess.LogAttackTime()

    hfc = ess.HFC()

    energy = ess.Energy()

    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)
    pool = es.Pool()
    for frame in frames:
        mX = spectrum(window(frame))
        mfcc_bands, mfcc_coeffs = mfcc(mX)

        pool.add('lowlevel.mfcc', mfcc_coeffs)
        pool.add('lowlevel.mfcc_bands', mfcc_bands)

        pfreq, pmag = spectral_peaks(mX)

        inds = pfreq.argsort()
        pfreq_sorted = pfreq[inds]
        pmag_sorted = pmag[inds]

        diss = dissonance(pfreq_sorted, pmag_sorted)
        pool.add('lowlevel.dissonance', diss)

        pitch, pitch_confidence = pitch_detection(mX)

        phfreq, phmag = harmonic_peaks(pfreq_sorted, pmag_sorted, pitch)
        if len(phfreq) > 1:
            inharm = inharmonicity(phfreq, phmag)
            pool.add('sfx.inharmonicity', inharm)

        sc_coeffs, sc_valleys = spectral_contrast(mX)
        pool.add('lowlevel.spectral_contrast', sc_coeffs)

        c = centroid(mX)
        pool.add('lowlevel.spectral_centroid', c)

        lat = log_attack_time(frame)
        pool.add('sfx.logattacktime', lat)

        h = hfc(mX)
        pool.add('lowlevel.hfc', h)

    calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean', 'var'])
    aggrPool = calc_Mean_Var(pool)

    features = makeFeatures(aggrPool)
    json.dump(features, open(outputJsonFile, 'w'))
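
Usage is a single call; both paths are placeholders, and `makeFeatures` is assumed to be defined elsewhere in the module:

# After aggregation the pool exposes keys such as 'lowlevel.mfcc.mean'
# and 'lowlevel.mfcc.var' (makeFeatures is assumed to turn these into a
# JSON-serializable dict).
reComputeDescriptors('input.wav', 'descriptors.json')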
Example #24
def analsynthHpsModelStreaming(params, signal):

    out = array([0.])

    pool = essentia.Pool()
    # windowing and FFT
    fcut = es.FrameCutter(frameSize=params['frameSize'],
                          hopSize=params['hopSize'],
                          startFromZero=False)
    w = es.Windowing(type="blackmanharris92")
    spec = es.Spectrum(size=params['frameSize'])

    # pitch detection
    pitchDetect = es.PitchYinFFT(frameSize=params['frameSize'],
                                 sampleRate=params['sampleRate'])

    smanal = es.HpsModelAnal(sampleRate=params['sampleRate'],
                             hopSize=params['hopSize'],
                             maxnSines=params['maxnSines'],
                             magnitudeThreshold=params['magnitudeThreshold'],
                             freqDevOffset=params['freqDevOffset'],
                             freqDevSlope=params['freqDevSlope'],
                             minFrequency=params['minFrequency'],
                             maxFrequency=params['maxFrequency'],
                             stocf=params['stocf'])
    # make sure the FFT size is an appropriate integer for the synthesis hop size
    synFFTSize = min(params['frameSize'] // 4, 4 * params['hopSize'])
    smsyn = es.SpsModelSynth(sampleRate=params['sampleRate'],
                             fftSize=synFFTSize,
                             hopSize=params['hopSize'],
                             stocf=params['stocf'])

    # pad the input with half a window of zeros so the output has the same length
    signal = numpy.append(signal, zeros(params['frameSize'] // 2))
    insignal = VectorInput(signal)

    # analysis
    insignal.data >> fcut.signal
    fcut.frame >> w.frame
    w.frame >> spec.frame
    spec.spectrum >> pitchDetect.spectrum

    fcut.frame >> smanal.frame
    pitchDetect.pitch >> smanal.pitch
    pitchDetect.pitchConfidence >> (pool, 'pitchConfidence')
    pitchDetect.pitch >> (pool, 'pitch')

    # synthesis
    smanal.magnitudes >> smsyn.magnitudes
    smanal.frequencies >> smsyn.frequencies
    smanal.phases >> smsyn.phases
    smanal.stocenv >> smsyn.stocenv

    smsyn.frame >> (pool, 'frames')
    smsyn.sineframe >> (pool, 'sineframes')
    smsyn.stocframe >> (pool, 'stocframes')

    essentia.run(insignal)

    outaudio = framesToAudio(pool['frames'])
    outaudio = outaudio[2 * params['hopSize']:]

    return outaudio, pool
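
As with the harmonic model in example #21, a sketch of the expected `params`; the values are illustrative, `signal` is assumed to be a mono float32 array, and `stocf` (the stochastic envelope factor) replaces `minSineDur` here:

# Illustrative values; only the keys are dictated by the function.
params = {
    'sampleRate': 44100,
    'frameSize': 2048,
    'hopSize': 512,
    'maxnSines': 100,
    'magnitudeThreshold': -74,
    'freqDevOffset': 10,
    'freqDevSlope': 0.001,
    'minFrequency': 20,
    'maxFrequency': 22050,
    'stocf': 0.2,
}
outaudio, pool = analsynthHpsModelStreaming(params, signal)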
Example #25
def reComputeDescriptors(inputAudioFile, outputJsonFile):

    """
    :param inputAudioFile:
    :param outputJsonFile:
    :return:
    """

    M = 2048
    N = 2048
    H = 1024
    fs = 44100

    W = 'blackmanharris62'


    #spectrum = ess.Spectrum(size=N)
    spectrum = ess.Spectrum()
    #window = ess.Windowing(size=M, type=W)
    window = ess.Windowing(type=W)
    #mfcc = ess.MFCC(numberCoefficients=12, inputSize=N/2+1)
    mfcc = ess.MFCC()

    spectral_peaks = ess.SpectralPeaks(minFrequency=1,
                                       maxFrequency=20000,
                                       maxPeaks=100,
                                       sampleRate=fs,
                                       magnitudeThreshold=0,
                                       orderBy="magnitude")

    dissonance = ess.Dissonance()

    #pitch_detection = ess.PitchYinFFT(frameSize=M, sampleRate=fs)
    pitch_detection = ess.PitchYinFFT()

    harmonic_peaks = ess.HarmonicPeaks()

    inharmonicity = ess.Inharmonicity()

    #spectral_contrast = ess.SpectralContrast(sampleRate=fs)
    spectral_contrast = ess.SpectralContrast()

    centroid = ess.Centroid()

    log_attack_time = ess.LogAttackTime()

    hfc = ess.HFC()

    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame, see lowlevel.py
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)


    energy = ess.Energy()

    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)

    E = []
    numFrames = 0
    for frame in frames:
        numFrames += 1
        E_frame = energy(frame)
        E.append(E_frame)

    E_max = np.max(E)

    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)

    pools = [(t, es.Pool()) for t in dscr.threshold]
    for frame in frames:

        eNorm = energy(frame) / E_max

        threshPools = []
        for t, pool in pools:
            if eNorm >= t:
                threshPools.append(pool)

        mX = spectrum(window(frame))
        mfcc_bands, mfcc_coeffs = mfcc(mX)

        for pool in threshPools:
            pool.add('lowlevel.mfcc', mfcc_coeffs)
            #pool.add('lowlevel.mfcc_bands', mfcc_bands)

        pfreq, pmag = spectral_peaks(mX)

        inds = pfreq.argsort()
        pfreq_sorted = pfreq[inds]
        pmag_sorted = pmag[inds]

        diss = dissonance(pfreq_sorted, pmag_sorted)
        for pool in threshPools:
            pool.add('lowlevel.dissonance', diss)

        pitch, pitch_confidence = pitch_detection(mX)

        phfreq, phmag = harmonic_peaks(pfreq_sorted, pmag_sorted, pitch)
        if len(phfreq) > 1:
            inharm = inharmonicity(phfreq, phmag)
            for pool in threshPools:
                pool.add('sfx.inharmonicity', inharm)

        sc_coeffs, sc_valleys = spectral_contrast(mX)
        for pool in threshPools:
            pool.add('lowlevel.spectral_contrast', sc_coeffs)

        c = centroid(mX)
        for pool in threshPools:
            pool.add('lowlevel.spectral_centroid', c)

        lat = log_attack_time(frame)
        for pool in threshPools:
            pool.add('sfx.logattacktime', lat)

        h = hfc(mX)
        for pool in threshPools:
            pool.add('lowlevel.hfc', h)

        spec_complx = spectral_complexity(mX)
        for pool in threshPools:
            pool.add('lowlevel.spectral_complexity', spec_complx)


    #calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean', 'var'])
    calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean'])
    aggrPools = [calc_Mean_Var(pool) for t, pool in pools]

    features = {}
    for i, aggrPool in enumerate(aggrPools):
        appendFeatures(features, aggrPool,
                       "ethc" + str(dscr.thresholdSelect[i]))
    json.dump(features, open(outputJsonFile, 'w'))
Example #26
show()

# <demo> --- stop ---

# Introducing the Pool: a good-for-all container
#
# A Pool can contain any type of values (easy in Python, not as much in C++ :-) )
# They need to be given a name, which represents the full path to these values;
# dot '.' characters are used as separators. You can think of it as a directory
# tree, or as namespace(s) + local name.
#
# Examples of valid names are: bpm, lowlevel.mfcc, highlevel.genre.rock.probability, etc...

# So let's redo the previous using a Pool

pool = essentia.Pool()

for frame in FrameGenerator(audio, frameSize=1024, hopSize=512):
    mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
    pool.add('lowlevel.mfcc', mfcc_coeffs)
    pool.add('lowlevel.mfcc_bands', mfcc_bands)

imshow(pool['lowlevel.mfcc'].T[1:, :], aspect='auto')
figure()
# Let's plot mfcc bands on a log-scale so that the energy values will be better
# differentiated by color
from matplotlib.colors import LogNorm
imshow(pool['lowlevel.mfcc_bands'].T,
       aspect='auto',
       interpolation='nearest',
       norm=LogNorm())
Example #27
def compute_features(path, f_mfcc_kl, f_mfcc_euclid, f_notes, f_chroma, f_bh):
    gc.enable()
    # Loading audio file
    #will resample if sampleRate is different!
    try:
        audio = es.MonoLoader(filename=path, sampleRate=fs)()
    except Exception:
        print("Erroneous file detected by essentia standard: skipping!")
        #return bpm, histogram, key, scale, notes, chroma_matrix, mean, cov, var, cov_kl
        return 0, [], 0, 0, [], [], [], [], [], []
    #will resample if sampleRate is different!
    try:
        loader = ess.MonoLoader(filename=path, sampleRate=44100)
    except Exception:
        print("Erroneous file detected by essentia streaming: skipping!")
        #return bpm, histogram, key, scale, notes, chroma_matrix, mean, cov, var, cov_kl
        return 0, [], 0, 0, [], [], [], [], [], []
    #Initialize algorithms we will use
    frameSize = 4096  #512
    hopSize = 2048  #256
    #######################################
    # DO FILTERING ONLY FOR MFCC - not with essentia standard
    # below is just an example
    #HP = es.HighPass(cutoffFrequency=128)
    #LP = es.LowPass(cutoffFrequency=4096)
    #lp_f = LP(audio)
    #hp_f = HP(lp_f)
    #audio = hp_f
    #MonoWriter(filename='music/filtered.wav')(filtered_audio)
    HP = ess.HighPass(cutoffFrequency=128)
    LP = ess.LowPass(cutoffFrequency=4096)
    #loader = ess.MonoLoader(filename=path, sampleRate=44100)
    #writer = ess.MonoWriter(filename='music/filtered.wav')
    #frameCutter = FrameCutter(frameSize = 1024, hopSize = 512)
    #pool = essentia.Pool()
    # Connect streaming algorithms
    #loader.audio >> HP.signal
    #HP.signal >> LP.signal
    #LP.signal >> writer.audio
    # Run streaming network
    #essentia.run(loader)
    bpm = 0
    histogram = 0
    key = 0
    scale = 0
    notes = 0
    chroma_matrix = 0
    mean = 0
    cov = 0
    var = 0
    cov_kl = 0
    #####################################
    # extract mfcc
    #####################################
    if f_mfcc_kl == 1 or f_mfcc_euclid == 1:
        #features, features_frames = es.MusicExtractor(analysisSampleRate=44100, mfccStats=['mean', 'cov'])(path)
        #m, n = features['lowlevel.mfcc.cov'].shape
        #print m
        #iu1 = np.triu_indices(m)
        #cov = features['lowlevel.mfcc.cov'][iu1]
        #mean = features['lowlevel.mfcc.mean']
        #print(features['lowlevel.mfcc.cov'])
        hamming_window = es.Windowing(type='hamming')
        spectrum = es.Spectrum()  # we just want the magnitude spectrum
        mfcc = es.MFCC(numberCoefficients=13)
        frame_sz = 2048  #512
        hop_sz = 1024  #256
        mfccs = np.array([
            mfcc(spectrum(hamming_window(frame)))[1] for frame in
            es.FrameGenerator(audio, frameSize=frame_sz, hopSize=hop_sz)
        ])
        #Let's scale the MFCCs such that each coefficient dimension has zero mean and unit variance:
        #mfccs = sklearn.preprocessing.scale(mfccs)
        #print mfccs.shape
        mean = np.mean(mfccs.T, axis=1)
        #print(mean)
        var = np.var(mfccs.T, axis=1)
        #print(var)
        cov = np.cov(mfccs.T)
        cov_kl = cov  #.flatten()
        #get only upper triangular matrix values to shorten length
        iu1 = np.triu_indices(13)
        cov = cov[iu1]
        #plt.imshow(mfccs.T, origin='lower', aspect='auto', interpolation='nearest')
        #plt.ylabel('MFCC Coefficient Index')
        #plt.xlabel('Frame Index')
        #plt.colorbar()
    #####################################
    # extract beat features and histogram
    #####################################
    if f_bh == 1 or f_chroma == 1 or f_notes == 1:
        # Compute beat positions and BPM
        rhythm_extractor = es.RhythmExtractor2013(method="multifeature")
        bpm, beats, beats_confidence, _, beats_intervals = rhythm_extractor(
            audio)
        if f_bh == 1:
            peak1_bpm, peak1_weight, peak1_spread, peak2_bpm, peak2_weight, peak2_spread, histogram = es.BpmHistogramDescriptors(
            )(beats_intervals)
        tempo = bpm
        times = beats
        beats_frames = (beats * fs) / hopSize
        beats_frames = beats_frames.astype(int)

        #fig, ax = plt.subplots()
        #ax.bar(range(len(histogram)), histogram, width=1)
        #ax.set_xlabel('BPM')
        #ax.set_ylabel('Frequency')
        #plt.title("BPM histogram")
        #ax.set_xticks([20 * x + 0.5 for x in range(int(len(histogram) / 20))])
        #ax.set_xticklabels([str(20 * x) for x in range(int(len(histogram) / 20))])
        #plt.show()

    #####################################
    # extract full beat aligned chroma
    #####################################

    framecutter = ess.FrameCutter(frameSize=frameSize,
                                  hopSize=hopSize,
                                  silentFrames='noise')
    windowing = ess.Windowing(type='blackmanharris62')
    spectrum = ess.Spectrum()
    spectralpeaks = ess.SpectralPeaks(orderBy='magnitude',
                                      magnitudeThreshold=0.00001,
                                      minFrequency=20,
                                      maxFrequency=3500,
                                      maxPeaks=60)
    # Use default HPCP parameters for plots, however we will need higher resolution
    # and custom parameters for better Key estimation
    hpcp = ess.HPCP()
    hpcp_key = ess.HPCP(
        size=36,  # we will need higher resolution for Key estimation
        referenceFrequency=440,  # assume the tuning frequency is 440 Hz
        bandPreset=False,
        minFrequency=20,
        maxFrequency=3500,
        weightType='cosine',
        nonLinear=False,
        windowSize=1.)
    key = ess.Key(
        profileType='edma',  # Use profile for electronic music
        numHarmonics=4,
        pcpSize=36,
        slope=0.6,
        usePolyphony=True,
        useThreeChords=True)
    # Use pool to store data
    pool = essentia.Pool()
    # Connect streaming algorithms
    ###################################
    # USE FILTER - comment next lines in
    loader.audio >> HP.signal
    HP.signal >> LP.signal
    LP.signal >> framecutter.signal
    ###################################
    ###################################
    # NO FILTER - comment next line in
    #loader.audio >> framecutter.signal
    ###################################
    framecutter.frame >> windowing.frame >> spectrum.frame
    spectrum.spectrum >> spectralpeaks.spectrum
    spectralpeaks.magnitudes >> hpcp.magnitudes
    spectralpeaks.frequencies >> hpcp.frequencies
    spectralpeaks.magnitudes >> hpcp_key.magnitudes
    spectralpeaks.frequencies >> hpcp_key.frequencies
    hpcp_key.hpcp >> key.pcp
    hpcp.hpcp >> (pool, 'tonal.hpcp')
    key.key >> (pool, 'tonal.key_key')
    key.scale >> (pool, 'tonal.key_scale')
    key.strength >> (pool, 'tonal.key_strength')
    # Run streaming network
    essentia.run(loader)
    #print("Estimated key and scale:", pool['tonal.key_key'] + " " + pool['tonal.key_scale'])
    #print(pool['tonal.hpcp'].T)
    chroma = pool['tonal.hpcp'].T
    key = pool['tonal.key_key']
    scale = pool['tonal.key_scale']
    if f_chroma == 1:
        # Plot HPCP
        #imshow(pool['tonal.hpcp'].T, aspect='auto', origin='lower', interpolation='none')
        #plt.title("HPCPs in frames (the 0-th HPCP coefficient corresponds to A)")
        #show()
        #print beats_frames.shape[0]
        chroma_matrix = np.zeros((beats_frames.shape[0], 12))
        prev_beat = 0
        act_beat = 0
        sum_key = np.zeros(12)
        chroma_align = chroma
        chroma_align = chroma_align.transpose()
        mat_index = 0
        for i in beats_frames:
            act_beat = i
            value = sum(
                chroma_align[prev_beat:act_beat]) / (act_beat - prev_beat)
            chroma_align[prev_beat:act_beat] = value
            prev_beat = i
            if np.linalg.norm(value, ord=1) != 0:
                value = value / np.linalg.norm(value, ord=1)
            chroma_matrix[mat_index] = value
            mat_index = mat_index + 1

        #chroma_align = chroma_align.transpose()
        #plt.figure(figsize=(10, 4))
        #librosa.display.specshow(chroma_align, y_axis='chroma', x_axis='time')
        #plt.vlines(times, 0, 12, alpha=0.5, color='r', linestyle='--', label='Beats')
        #plt.colorbar()
        #plt.title('Chromagram')
        #plt.tight_layout()
        #chroma_align = chroma_align.transpose()
    #print(chroma_align[24:28])
    #####################################
    # extract full chroma text
    #####################################
    if f_notes == 1:
        chroma = chroma.transpose()
        m, n = chroma.shape
        avg = 0
        for j in chroma:
            avg = avg + np.sum(j)
        avg = avg / m
        threshold = avg / 2
        # keep the strongest bin (forced to 1) plus any bin >= 0.8;
        # zero everything else, and silence frames below the threshold
        for i in chroma:
            if np.sum(i) > threshold:
                ind = np.where(i == np.max(i))
                i[ind] = 1
                i[i < 0.8] = 0
            else:
                i.fill(0)
        chroma = chroma.transpose()
        # Compute beat positions and BPM
        prev_beat = 0
        act_beat = 0
        sum_key = np.zeros(12)
        chroma = chroma.transpose()
        for i in beats_frames:
            act_beat = i
            sum_key = sum(chroma[prev_beat:act_beat])
            #print(sum_key)
            #print(chroma[prev_beat:act_beat])

            ind = np.where(sum_key == np.max(sum_key))[0]
            fill = np.zeros(chroma.shape[1])  # one slot per chroma bin
            # only mark the dominant pitch class if the segment is not silent
            if not np.all(chroma[prev_beat:act_beat] == 0):
                fill[ind] = 1
            chroma[prev_beat:act_beat] = fill
            prev_beat = i
        # second pass: read off the dominant pitch class for each beat segment
        notes = []
        prev_beat = 0
        for i in beats_frames:
            act_beat = i
            sum_key = sum(chroma[prev_beat:act_beat])
            ind = np.where(sum_key == np.max(sum_key))
            notes.append(ind[0][0])
            prev_beat = i
        #chroma = chroma.transpose()
        #plt.figure(figsize=(10, 4))
        #librosa.display.specshow(chroma, y_axis='chroma', x_axis='time')
        #plt.vlines(times, 0, 12, alpha=0.5, color='r', linestyle='--', label='Beats')
        #plt.colorbar()
        #plt.title('Chromagram')
        #plt.tight_layout()
        #chroma = chroma.transpose()
    gc.collect()
    return bpm, histogram, key, scale, notes, chroma_matrix, mean, cov, var, cov_kl
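
A usage sketch; the path is a placeholder, `fs` must be defined at module level (the loaders above assume 44100), and the integer flags select which feature groups to compute:

# Compute everything for one file; the 0/1 flags toggle feature groups.
(bpm, histogram, key, scale, notes, chroma_matrix,
 mean, cov, var, cov_kl) = compute_features(
    'track.mp3', f_mfcc_kl=1, f_mfcc_euclid=1,
    f_notes=1, f_chroma=1, f_bh=1)
print('BPM: %s, key: %s %s' % (bpm, key, scale))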