Example #1
def get_sines_per_frame(audio, sr=44100, onlyfrecuencies=False, nsines=20):
    """
    Perform a framewise sinusoidal-model analysis of an audio signal
    :param audio: Audio, either mono or stereo. Stereo is downmixed to mono
    :param sr: Sample rate of the audio
    :param onlyfrecuencies: if True, return only the frequencies
    :param nsines: maximum number of sinusoids per frame
    :return: an (N, nsines) array of per-frame frequencies, or a tuple of (N, nsines)
        frequency and magnitude arrays, where N is the number of frames
    """
    if audio.ndim > 1:
        audio = std.MonoMixer()(audio, audio.shape[1])

    len_arrays = 0
    for i, _ in enumerate(
            std.FrameGenerator(audio, frameSize=4096, hopSize=2048)):
        len_arrays = i

    fft_algo = std.FFT()
    sine_anal = std.SineModelAnal(maxnSines=nsines,
                                  orderBy='frequency',
                                  minFrequency=1)
    sines = np.zeros([len_arrays + 1, 2, nsines], dtype=np.float32) + eps
    for i, frame in enumerate(
            std.FrameGenerator(audio, frameSize=4096, hopSize=2048)):
        fft = fft_algo(frame)
        freqs, mags, _ = sine_anal(fft)
        sorting_indexes = np.argsort(freqs)
        freqs = freqs[sorting_indexes]
        mags = mags[sorting_indexes]
        sines[i, :] = [freqs, mags]
    if onlyfrecuencies:
        return sines[:, 0, :]
    else:
        return sines[:, 0, :], sines[:, 1, :]
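A minimal usage sketch for the function above (not part of the original snippet): it assumes essentia.standard is imported as std, NumPy as np, a module-level eps as used inside the function, and a hypothetical file path.

import numpy as np
import essentia.standard as std

eps = np.finfo(np.float32).eps  # module-level constant assumed by get_sines_per_frame

audio = std.MonoLoader(filename='track.wav', sampleRate=44100)()  # hypothetical path
freqs, mags = get_sines_per_frame(audio, sr=44100, nsines=20)
print(freqs.shape, mags.shape)  # (num_frames, 20) each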
Example #2
def MRCG(x, fs=44100, framesize1=0.02, framesize2=0.2, hopsize=0.01):

    hopsize = int(hopsize * fs)
    # spectrogram init
    winAnalysis = 'hann'

    ####---- cochleagram 1
    framesize = int(framesize1 * fs)
    N = 2 * framesize  # zero-pad by one extra frame length
    SPECTRUM = ess.Spectrum(size=N)
    WINDOW = ess.Windowing(type=winAnalysis, zeroPadding=N - framesize)
    highFrequencyBound = fs / 2 if fs / 2 < 11000 else 11000
    ERBBANDS = ess.ERBBands(sampleRate=fs,
                            highFrequencyBound=highFrequencyBound,
                            inputSize=framesize + 1)

    cochlea1 = []
    for frame in ess.FrameGenerator(x, frameSize=framesize, hopSize=hopsize):
        frame = WINDOW(frame)
        mXFrame = SPECTRUM(frame)
        erbFrame = np.log10(ERBBANDS(mXFrame) + np.finfo(float).eps)
        cochlea1.append(erbFrame)
    cochlea1 = np.array(cochlea1)

    ####---- cochleagram 2
    framesize = int(framesize2 * fs)
    N = 2 * framesize  # zero-pad by one extra frame length
    SPECTRUM = ess.Spectrum(size=N)
    WINDOW = ess.Windowing(type=winAnalysis, zeroPadding=N - framesize)
    highFrequencyBound = fs / 2 if fs / 2 < 11000 else 11000
    ERBBANDS = ess.ERBBands(sampleRate=fs,
                            highFrequencyBound=highFrequencyBound,
                            inputSize=framesize + 1)

    cochlea2 = []
    for frame in ess.FrameGenerator(x, frameSize=framesize, hopSize=hopsize):
        frame = WINDOW(frame)
        mXFrame = SPECTRUM(frame)
        erbFrame = np.log10(ERBBANDS(mXFrame) + np.finfo(float).eps)
        cochlea2.append(erbFrame)
    cochlea2 = np.array(cochlea2)

    ####---- smoothed version
    cochlea3 = get_avg(cochlea1, 5, 5)
    cochlea4 = get_avg(cochlea1, 11, 11)

    all_cochleas = np.hstack((cochlea1, cochlea2, cochlea3, cochlea4))

    ####---- delta
    d_all_cochleas = Fdeltas(all_cochleas.T)
    dd_all_cochleas = Fdeltas(Fdeltas(all_cochleas.T, 5), 5)

    d_all_cochleas = d_all_cochleas.T
    dd_all_cochleas = dd_all_cochleas.T

    return all_cochleas, d_all_cochleas, dd_all_cochleas
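A hedged sketch of calling MRCG: it assumes essentia.standard is imported as ess and that the helpers get_avg and Fdeltas used above are defined elsewhere in the module, as in the original project. The file path is hypothetical.

import essentia.standard as ess

x = ess.MonoLoader(filename='speech.wav', sampleRate=44100)()  # hypothetical path
feats, d_feats, dd_feats = MRCG(x, fs=44100)
# feats stacks the four cochleagrams along the feature axis;
# d_feats and dd_feats are its first- and second-order deltas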
Example #3
def main_danceability(args):
    """main_danceability

    Compute the danceability feature over the input waveform and plot it
    """
    audio = loadaudio(args)
    
    # create the pool and the necessary algorithms
    pool = e.Pool()
    w = estd.Windowing()
    spec = estd.Spectrum()
    centroid = estd.SpectralCentroidTime()

    # compute the centroid for all frames in our audio and add it to the pool
    for frame in estd.FrameGenerator(audio, frameSize = 1024, hopSize = 512):
        c = centroid(spec(w(frame)))
        pool.add('lowlevel.centroid', c)

    # aggregate the results
    aggrpool = estd.PoolAggregator(defaultStats = [ 'mean', 'var' ])(pool)


    # create the pool and the necessary algorithms
    pool = e.Pool()
    w = estd.Windowing()
    # spec = estd.Spectrum()
    # centroid = estd.SpectralCentroidTime()
    danceability = estd.Danceability(maxTau = 10000, minTau = 300, sampleRate = args.samplerate)
    
    # compute the centroid for all frames in our audio and add it to the pool
    for frame in estd.FrameGenerator(audio, frameSize = 10 * args.samplerate, hopSize = 5 * args.samplerate):
        dreal, ddfa = danceability(w(frame))
        print(("d", dreal)) # , "frame", frame
        pool.add('rhythm.danceability', dreal)

    print(type(pool['rhythm.danceability']))
        
    # aggregate the results
    # aggrpool = estd.PoolAggregator(defaultStats = [ 'mean', 'var' ])(pool)
    
    # write result to file
    # estd.YamlOutput(filename = args.file + '.features.yaml')(aggrpool)

    fig, gs = makefig(rows = 2, cols = 2)
    ax = fig.axes

    ax[0].plot(pool['rhythm.danceability'])

    plt.show()
Example #4
def get_beat_chunks(filename, bpm_restrict=None):
    audio = std.MonoLoader(filename=filename)()
    hpcp = std.HPCP()
    spectrum = std.Spectrum()
    speaks = std.SpectralPeaks()
    large_speaks = std.SpectralPeaks(maxPeaks=2000)
    tivs = []
    sr = 44100
    bpm = get_tempo(filename)
    tivs_framewise = []
    if bpm_restrict is not None and bpm_restrict != bpm:
        raise ValueError
    sec_beat = (60 / bpm)
    beats = np.arange(0, len(audio) / sr, sec_beat)
    beats = np.append(beats, len(audio) / sr)
    for i in range(1, len(beats)):
        segmented_audio = audio[int(beats[i - 1] * sr):int(beats[i] * sr)]
        cutter = std.FrameGenerator(segmented_audio)
        for sec in cutter:
            spec = spectrum(sec)
            freq, mag = speaks(spec)
            chroma = hpcp(freq, mag)
            tivs_framewise.append(chroma)
        np2_seg_audio = zeropad_next_power_2(segmented_audio)
        spec = spectrum(np2_seg_audio)
        freq, mag = speaks(spec)
        chroma = hpcp(freq, mag)
        tivs.append(chroma)

    # Calculate the whole TIV
    np2_whole = zeropad_next_power_2(audio)
    spec = spectrum(np2_whole)
    freq, mag = large_speaks(spec)
    chroma_whole = hpcp(freq, mag)
    return mt.TIVCollection.from_pcp(np.array(tivs).T), mt.TIV.from_pcp(chroma_whole), mt.TIVCollection.from_pcp(np.array(tivs_framewise).T)
Example #5
def getMBE(audio):
    '''
    mel band energy feature
    :param audio:
    :return:
    '''

    winAnalysis = 'hann'

    # this MFCC is for pattern classification; numberBands is left at its default
    MFCC40 = ess.MFCC(sampleRate=fs,
                      highFrequencyBound=highFrequencyBound,
                      inputSize=framesize + 1)

    N = 2 * framesize  # zero-pad by one extra frame length
    SPECTRUM = ess.Spectrum(size=N)
    WINDOW = ess.Windowing(type=winAnalysis, zeroPadding=N - framesize)

    mfccBands = []
    for frame in ess.FrameGenerator(audio,
                                    frameSize=framesize,
                                    hopSize=hopsize):

        frame = WINDOW(frame)
        mXFrame = SPECTRUM(frame)
        bands, mfccFrame = MFCC40(mXFrame)
        mfccBands.append(bands)
    feature = np.array(mfccBands)
    return feature
Example #6
def get_informative_frames(input_data, markers, parameters, frame_size,
                           hop_size):
    '''
    Takes audio data with its markers and parameters and splits it into
    informative and noise frames according to the markers, frame size and hop size.
    Returns the framed audio, the duration of the informative region, and the
    standard deviations of both the informative and non-informative parts.
    '''
    first_informative_sample = markers[0][1]
    last_informative_sample = markers[1][1]
    noise_signal = np.append(input_data[0:first_informative_sample],
                             input_data[last_informative_sample:])
    informative_signal = input_data[
        first_informative_sample:last_informative_sample]
    noise_rms = np.std(noise_signal)
    informative_rms = np.std(informative_signal)
    informative_duration = (last_informative_sample -
                            first_informative_sample) / parameters.framerate
    first_informative_frame = int(np.floor(markers[0][1] / hop_size))
    last_informative_frame = int(np.ceil(markers[1][1] / hop_size))
    informative_frames = []
    noise_frames = []
    for frame_idx, frame in enumerate(
            es.FrameGenerator(input_data,
                              frameSize=frame_size,
                              hopSize=hop_size,
                              startFromZero=True)):
        if first_informative_frame <= frame_idx <= last_informative_frame:
            informative_frames.append(frame)
        else:
            noise_frames.append(frame)
    return np.array(informative_frames), np.array(
        noise_frames), informative_duration, informative_rms, noise_rms
Example #7
def energyThresholdAudio(soundfilesList):


    for sound in soundfilesList:
        RMS = esst.RMS()
        audioLoader = esst.MonoLoader(filename=sound)
        audio = audioLoader()


        start=0
        end=0
        thresh=0.05
        rms_vals=[]
        for frame in esst.FrameGenerator(audio, frameSize=2048, hopSize=1024, startFromZero=True):
            rms = RMS(frame)
            rms_vals.append(float(rms))
        rms_vals  = np.array(rms_vals)

        higher=np.where(rms_vals >= thresh)[0]
        if len(higher) > 1:
            start=higher[0]
            end=higher[-1]

        else:
            continue

        newAudio = audio[start*1024:end*1024]
        writer = esst.MonoWriter(filename=sound, format="mp3")
        writer(newAudio)
        print(sound)
Example #8
def computeEnergyHistogram(inputAudioFile, outputJsonFile, threshold,
                           histograms):

    M = 2048
    H = 1024
    fs = 44100

    energy = ess.Energy()
    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)

    E = []
    numFrames = 0
    for frame in frames:
        numFrames += 1
        E_frame = energy(frame)
        E.append(E_frame)

    E = np.array(E)
    E_norm = E / np.max(E)

    for i in range(len(threshold)):
        t = threshold[i]
        histograms[i] = np.append(histograms[i],
                                  [0] * (numFrames - len(histograms[i])))
        idx_threshold = np.where(E_norm > t)
        histograms[i][idx_threshold[0]] += 1
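A usage sketch (assumptions, not code from the original project): the histograms argument is a list with one counter array per threshold, and it is updated in place; file names are hypothetical.

import numpy as np

thresholds = [0.1, 0.3, 0.5]
histograms = [np.zeros(0) for _ in thresholds]  # one counter array per threshold
computeEnergyHistogram('track.wav', 'track.json', thresholds, histograms)
# histograms[i][k] now counts how often frame k exceeded thresholds[i]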
Example #9
def extract_features(x,
                     M=WINDOW_SIZE,
                     N=FFT_SIZE,
                     H=HOP_SIZE,
                     fs=SR,
                     window_type=WINDOW_TYPE):
    '''
    extract magnitude spectra from the input vector and apply power-law compression
    '''
    #init functions and vectors
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type=window_type)
    SP = []

    #compute STFT
    for frame in ess.FrameGenerator(x,
                                    frameSize=M,
                                    hopSize=H,
                                    startFromZero=True):  #generate frames
        wX = window(frame)  #window frame
        mX = spectrum(wX)  #compute fft
        SP.append(mX)

    SP = essentia.array(SP)
    SP = np.power(SP, 2. / 3.)  #power law compression

    return SP
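A usage sketch under the assumption that the module-level constants WINDOW_SIZE, FFT_SIZE, HOP_SIZE, SR and WINDOW_TYPE exist (the values below are illustrative, not taken from the original project); the file path is hypothetical.

import essentia.standard as ess

audio = ess.MonoLoader(filename='voice.wav', sampleRate=44100)()  # hypothetical path
SP = extract_features(audio, M=2048, N=2048, H=512, fs=44100, window_type='hann')
print(SP.shape)  # (num_frames, N / 2 + 1) power-law compressed magnitude spectra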
Example #10
    def get_onsets(self, _audio=None):

        if _audio is not None and len(_audio) > 0:
            audio = _audio
        else:
            audio = self.audio

        W = es.Windowing(type=self.winType)
        c2p = es.CartesianToPolar()
        fft = es.FFT()
        onsetDetection = es.OnsetDetection(method=self.onsetMethod,
                                           sampleRate=44100)
        onsets = es.Onsets(alpha=.2)
        # onsetIndex = []
        pool = Pool()

        for frame in es.FrameGenerator(audio, frameSize=1024, hopSize=512):
            mag, phase = c2p(fft(W(frame)))
            onsetDetection.configure(method=self.onsetMethod)
            onsetFunction = onsetDetection(mag, phase)
            pool.add("onsetFunction", onsetFunction)

        DetectedOnsetsArray = onsets([pool["onsetFunction"]], [1])

        return DetectedOnsetsArray
Example #11
def extract(fname, outpath, fs=22050, fsize=1024, hsize=512):
    """
    extract(fname, outpath, fs, fsize, hsize) will compute the mfcc of Audio file fname.

    Inputs:
        fname   -- the name of the audio file.
        outpath -- the output path for processed files.
        fs      -- the sampling frequency (Hz).
        fsize   -- the size of each frame.
        hsize   -- the hop size between frames.
    Outputs:
        an array of MFCC coefficients, one row per frame.
    """
    #    gate(fname)
    loader = es.MonoLoader(filename=fname, sampleRate=fs)
    #    length = len(loader)
    #    maxim = max(loader)
    #    for sample in loader:
    #        if abs(sample) < maxim/20:
    #            sample = 0 ;

    w = es.Windowing(type='hann')
    spectrum = es.Spectrum()
    mfcc = es.MFCC(inputSize=513, numberCoefficients=20)

    mfccs = []
    audio = loader()
    for frame in es.FrameGenerator(audio, frameSize=fsize, hopSize=hsize):
        mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
        mfccs.append(mfcc_coeffs)

    mfccs = np.array(mfccs)
    return mfccs
Example #12
def segment(audio, hopSize, frameSize, rms_onset_threshold,
            mel_onset_threshold, flux_onset_threshold, onset_threshold):

    # init algorithms
    o_mel = estd.OnsetDetection(method='melflux')
    o_rms = estd.OnsetDetection(method='rms')
    o_hfc = estd.OnsetDetection(method='hfc')
    o_flux = estd.OnsetDetection(method='flux')
    fft = estd.FFT()
    c2p = estd.CartesianToPolar()
    pool = essentia.Pool()
    frame_generator = estd.FrameGenerator(audio,
                                          frameSize=frameSize,
                                          hopSize=hopSize)
    w = estd.Windowing(type='hann')
    yin = estd.PitchYinFFT(frameSize=frameSize,
                           minFrequency=40,
                           maxFrequency=2500,
                           interpolate=True)
    spectrum = estd.Spectrum()
    loudness = estd.Loudness()

    # control parameters
    attack = False
    detection = True
    mel_onset_value = 0
    rms_onset_value = 0

    # output variables
    onset = None
    sustain = None

    for index, frame in enumerate(frame_generator):
        mag, phase = c2p(fft(w(frame)))
        _, conf = yin(spectrum(w(frame)))
        loud = loudness(frame)
        mel_onset = o_mel(mag, phase)
        rms_onset = o_rms(mag, phase)
        hfc_onset = o_hfc(mag, phase)
        flux_onset = o_flux(mag, phase)
        pool.add('onsets_mel', mel_onset)
        pool.add('onsets_rms', rms_onset)
        pool.add('onsets_hfc', hfc_onset)
        pool.add('onsets_flux', flux_onset)
        pool.add('conf', conf)
        pool.add('loudness', loud)

        # condition for onset
        if detection and (flux_onset > flux_onset_threshold or mel_onset > mel_onset_threshold) \
                and rms_onset > rms_onset_threshold and loud > onset_threshold:
            onset = index
            attack = True
            detection = False
            mel_onset_value = mel_onset
            rms_onset_value = rms_onset
        # condition for beginning of sustain
        if attack and conf > 0.5 and rms_onset < rms_onset_value * .05 and mel_onset < mel_onset_value * .3:
            attack = False
            sustain = index
    return onset, sustain
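An illustrative call to the segmenter (the threshold values below are assumptions, not the project's tuned values); the returned onset and sustain are frame indices, so multiply by hopSize / sample rate to get seconds.

import essentia.standard as estd

audio = estd.MonoLoader(filename='note.wav', sampleRate=44100)()  # hypothetical path
onset, sustain = segment(audio, hopSize=512, frameSize=1024,
                         rms_onset_threshold=0.1, mel_onset_threshold=20.0,
                         flux_onset_threshold=0.05, onset_threshold=0.01)
if onset is not None:
    print('onset at %.3f s' % (onset * 512 / 44100.0))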
Example #13
def compute_description(x,
                        M=WINDOW_SIZE,
                        N=FFT_SIZE,
                        H=HOP_SIZE,
                        fs=SR,
                        window_type=WINDOW_TYPE):
    '''
    -extract features from audio file
    -Features:
        HFC
        SPECTRAL CENTROID
        SPECTRAL ENERGY
        F0
        loud_factor = energy * (spectral_centroid - F0)  # how many harmonics = how much the speaker is yelling
        PITCH CONFIDENCE
    '''
    #audioLoader = ess.EasyLoader(filename=file_name, sampleRate=fs)
    #create essentia instances
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type=window_type)
    hfc = ess.HFC(sampleRate=fs)
    spectralCentroid = ess.SpectralCentroidTime(sampleRate=fs)
    energy = ess.Energy()
    pitch_extractor = ess.PredominantPitchMelodia(frameSize=M,
                                                  hopSize=H,
                                                  maxFrequency=1200)
    #init vectors
    CONTRAST = []
    HFC = []
    CENTROID = []
    ENERGY = []

    #compute features for every stft frame
    for frame in ess.FrameGenerator(x,
                                    frameSize=M,
                                    hopSize=H,
                                    startFromZero=True):  #generate frames
        wX = window(frame)  #window frame
        mX = spectrum(wX)  #compute fft
        frame_hfc = hfc(mX)
        HFC.append(frame_hfc)
        frame_centroid = spectralCentroid(
            wX)  #compute spectral centroid in time domain
        CENTROID.append(frame_centroid)
        frame_energy = energy(mX)  # compute energy from the magnitude spectrum
        ENERGY.append(frame_energy)

    F0, SALIENCE = pitch_extractor(x)  #estimate pitch in time domain

    #convert into numpy matrices
    HFC = essentia.array(HFC)
    CENTROID = essentia.array(CENTROID)
    ENERGY = essentia.array(ENERGY)
    F0 = essentia.array(F0)
    SALIENCE = essentia.array(SALIENCE)
    F0 = F0[:len(CENTROID)]
    SALIENCE = SALIENCE[:len(CENTROID)]

    return HFC, CENTROID, ENERGY, F0, SALIENCE
Example #14
def get_melspecs(audio_file: Path, algorithms: dict) -> Optional[dict[str, np.ndarray]]:
    # loading file
    audio = ess.MonoLoader(filename=str(audio_file), sampleRate=SAMPLE_RATE)()

    # precompute melspecs
    melspecs_all = {}
    for algorithm_name in algorithms:
        parameters = algorithms[algorithm_name]

        melspec_extractor = getattr(ess, parameters['melspec-algorithm'])()
        melspecs = []
        for frame in ess.FrameGenerator(audio, frameSize=parameters['frame-size'], hopSize=parameters['hop-size']):
            melspecs.append(melspec_extractor(frame))

        melspecs = np.array(melspecs)

        # reshape melspecs into tensor batches and discard the remainder
        discard = melspecs.shape[0] % parameters['patch-size']
        if discard != 0:
            melspecs = melspecs[:-discard, :]
        melspecs = np.reshape(melspecs, [-1, parameters['patch-size'], parameters['number-bands']])
        batch = np.expand_dims(melspecs, 2)

        melspecs_all[algorithm_name] = batch

    return melspecs_all
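A sketch of the algorithms dictionary get_melspecs expects. The keys follow the code above; the concrete values mirror the usual MusiCNN input settings and assume an Essentia build that provides TensorflowInputMusiCNN, so treat them as assumptions rather than the original project's configuration. SAMPLE_RATE is a module-level constant the function relies on.

from pathlib import Path

algorithms = {
    'musicnn': {
        'melspec-algorithm': 'TensorflowInputMusiCNN',  # assumed algorithm name (essentia-tensorflow build)
        'frame-size': 512,
        'hop-size': 256,
        'patch-size': 187,
        'number-bands': 96,
    },
}
batches = get_melspecs(Path('track.mp3'), algorithms)  # hypothetical path
# each entry has shape (num_patches, patch-size, 1, number-bands)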
Example #15
    def stft(self):

        # save the results in the stft_pool
        self.mX = []
        for frame in es.FrameGenerator(self.audio,
                                       frameSize=self.frame_size,
                                       hopSize=self.hop_size,
                                       startFromZero=True):

            frame = frame * self.window
            X = fft(frame, self.fft_size)  # computing fft
            absX = np.abs(
                X[:int(self.fft_size / 2)]
            )  # taking first half of the spectrum and its magnitude
            absX[absX < np.finfo(float).eps] = np.finfo(
                float).eps  # getting rid of zeros before the next step
            mX = 20 * np.log10(absX)
            if self.threshold:
                mX[mX < self.threshold] = -1000
            self.mX.append(mX)

        self.mX = array(self.mX)

        self.freqAxHz = float(self.sample_rate) * np.arange(len(
            self.mX[0])) / float(self.fft_size)
        self.freqAxMidi = pitch2midi(self.freqAxHz, quantizePitch=False)

        self.timeAxSec = np.arange(len(self.mX)) * self.hop_size / float(
            self.sample_rate)
Example #16
def lowSNR_detector(audio: np.ndarray,
                    frame_size=1024,
                    hop_size=512,
                    nrg_th=0.1,
                    ac_th=0.6,
                    snr_th=5):

    if audio.shape[1] > 1:
        audio = np.reshape(audio, audio.shape[0] * audio.shape[1], order='F')

    audio = audio.astype("float32") / max(audio.astype("float32"))
    audio = esarr(audio.astype("float16"))
    ac_arr = []
    nrg_arr = []
    sig_pwr = 0
    noise_pwr = 0
    sig_cnt = 0
    noise_cnt = 0

    for frame in estd.FrameGenerator(audio,
                                     frameSize=frame_size,
                                     hopSize=hop_size,
                                     startFromZero=True):
        ac = abs(autocorr(frame, mode="half"))
        nrg = sum(frame**2)
        ac = ac[0] / sum(ac) if sum(ac) > 0 else 0
        ac_arr.append(ac)
        nrg_arr.append(nrg)

    ac_arr = np.array(ac_arr) / max(ac_arr)
    nrg_arr = np.array(nrg_arr) / max(nrg_arr)

    for nrg, ac in zip(nrg_arr, ac_arr):
        if nrg < nrg_th:
            noise_pwr += nrg**2
            noise_cnt += 1
        else:
            if ac < ac_th:
                sig_pwr += nrg**2
                sig_cnt += 1
            else:
                noise_pwr += nrg**2
                noise_cnt += 1

    if noise_cnt == 0:
        snr = np.inf
    elif sig_cnt == 0:
        snr = 10 * np.log10(eps)
    else:
        sig_pwr /= sig_cnt
        noise_pwr /= noise_cnt
        snr = 10 * np.log10(sig_pwr / noise_pwr)

#	conf = 1-abs(noise_cnt-sig_cnt)/(sig_cnt + noise_cnt)
#	if conf > 0.7 and snr < snr_th:
#		return snr, conf, True
#	return snr, conf, False

    return snr, snr < snr_th
Example #17
File: demo.py  Project: NemoCpp/pypYIN
def extract_predominant_vocal_melody(audio_filename,
                                     hopSize,
                                     frameSize,
                                     pYinInst,
                                     end_ts=None):
    '''
    extract predominant vocal pitch contour
    as workaround, intersect extracted pitch with vocal annotation
    
    Parameters
    -----------------------
    end_ts: extract until this ts, disregard the rest of the audio  
    
    Returns
    -------------------
    list of estimated pitch values in Hz; for non-vocal frames the value is <= 0
    '''
    if WITH_MELODIA:

        if WITH_MAKAM:  #### use predominant melody tailored to makam
            path_Alignment_duration = os.path.join(parentDir,
                                                   'AlignmentDuration')
            if path_Alignment_duration not in sys.path:
                sys.path.append(path_Alignment_duration)
            from src.align.FeatureExtractor import extractPredominantMelodyMakam
            estimatedPitch_andTs = extractPredominantMelodyMakam(
                audio_filename[:-4],
                frameSize,
                hopSize,
                jointAnalysis=False,
                musicbrainzid=None,
                preload=True)  # jointAnalysis=False, because no
        else:  # use melodia
            estimatedPitch_andTs = extractPredominantMelody(
                audio_filename, frameSize, hopSize)

    else:  ######### pYIN
        audio = ess.MonoLoader(filename=audio_filename, sampleRate=fs)()
        for frame in ess.FrameGenerator(audio,
                                        frameSize=frameSize,
                                        hopSize=hopSize):
            featureSet = pYinInst.process(frame)

        estimatedPitch = pYinInst.decodePitchTrack()  # pitch extraction
        ts = []  ### generated timestamps
        for onset_frame_number, frame in enumerate(estimatedPitch):
            ts.append(frame_to_ts(onset_frame_number, float(hopSize / fs)))
        estimatedPitch_andTs = np.vstack((np.array(ts), estimatedPitch)).T

    if end_ts is not None:
        idx_end_ts = np.searchsorted(estimatedPitch_andTs[:, 0],
                                     end_ts)  #  until end_ts
        estimatedPitch_andTs = estimatedPitch_andTs[:min(
            idx_end_ts + 1, estimatedPitch_andTs.shape[0]), :]

    if MonoNoteParameters.WITH_VOCAL_SEGMENTS:  # vocal segments given
        estimatedPitch_andTs = intersect_vocal_segments(
            audio_filename, estimatedPitch_andTs)

    return estimatedPitch_andTs[:, 1]
Example #18
def getFeatSequence(inputFile,pulsePos):
    audio = ess.MonoLoader(filename = inputFile, sampleRate = params.Fs)()
    frameCounter = 0
    pool = es.Pool()
    pool.add('samples',audio)
    for frame in ess.FrameGenerator(audio, frameSize = params.frmSize, hopSize = params.hop):
        ts = params.hop/params.Fs*frameCounter + params.frmSize/float(2*params.Fs)
        zpFrame = np.hstack((frame,zz))
        mag = spec(window(zpFrame))
        mfccBands,mfccSeq = genmfcc(mag)
        pool.add('rms',rms(mag))
        pool.add('mfcc',mfccSeq)
        pool.add('time',ts)
        frameCounter += 1
    if pulsePos is not None:
        pulsePos = np.append(pulsePos,len(audio)/params.Fs)
        for tp in range(len(pulsePos)-1):
            pool.add('pst', pulsePos[tp])
            pool.add('pet', pulsePos[tp+1])
            temp1 = np.where(pool['time'] >= pulsePos[tp])[0]
            temp2 = np.where(pool['time'] < pulsePos[tp+1])[0]
            binIndices = np.intersect1d(temp1, temp2)
            pool.add('pmfcc', np.mean(pool['mfcc'][binIndices,:], axis = 0))
            pool.add('prms', np.mean(pool['rms'][binIndices]))
    else:
        pool.add('pst', 0.0)
        pool.add('pet', len(audio)/params.Fs)
        pool.add('pmfcc', np.mean(pool['mfcc'], axis = 0))
        pool.add('prms', np.mean(pool['rms'], axis = 0))
    return pool
Example #19
def file_to_hpcp(loop):
    loop = e.array(loop)

    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    spectral_peaks = es.SpectralPeaks(orderBy='magnitude',
                                      magnitudeThreshold=0.001,
                                      maxPeaks=20,
                                      minFrequency=20,
                                      maxFrequency=8000)
    hpcp = es.HPCP(maxFrequency=8000)
    spec_group = []
    hpcp_group = []
    for frame in es.FrameGenerator(loop, frameSize=1024, hopSize=512):
        windowed = windowing(frame)
        fft = spectrum(windowed)
        frequencies, magnitudes = spectral_peaks(fft)
        final_hpcp = hpcp(frequencies, magnitudes)
        spec_group.append(fft)
        hpcp_group.append(final_hpcp)

    mean_hpcp = np.mean(np.array(hpcp_group).T, axis=1)
    #normalize to 1
    mean_hpcp = mean_hpcp / mean_hpcp.max()

    return mean_hpcp
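A short usage sketch, assuming essentia is imported as e and essentia.standard as es exactly as the snippet does; the loop path is hypothetical.

import numpy as np
import essentia.standard as es

loop_audio = es.MonoLoader(filename='loop.wav', sampleRate=44100)()  # hypothetical path
mean_hpcp = file_to_hpcp(loop_audio)
print(mean_hpcp.shape)       # (12,) by default, normalised so the strongest bin is 1
print(np.argmax(mean_hpcp))  # index of the most prominent HPCP bin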
Example #20
def getFeature(audio):
    '''
    MFCC of the given audio
    :param audio:
    :return:
    '''

    mfcc = []
    # audio_p = audio[p[0]*fs:p[1]*fs]
    for frame in ess.FrameGenerator(audio,
                                    frameSize=framesize_phoneticSimilarity,
                                    hopSize=hopsize_phoneticSimilarity):
        frame = WINDOW(frame)
        mXFrame = SPECTRUM(frame)
        bands, mfccFrame = MFCC(mXFrame)
        mfccFrame = mfccFrame[1:]
        mfcc.append(mfccFrame)

    mfcc = np.array(mfcc).transpose()
    dmfcc = Fdeltas(mfcc, w=5)
    ddmfcc = Fdeltas(dmfcc, w=5)
    feature = np.transpose(np.vstack((mfcc, dmfcc, ddmfcc)))

    return feature
Example #21
def extract_features(x,
                     M=WINDOW_SIZE,
                     N=FFT_SIZE,
                     H=HOP_SIZE,
                     fs=SR,
                     window_type=WINDOW_TYPE):
    '''
    extract magnitude spectra from the input vector
    apply power-law compression
    cut the upper spectrum
    '''
    #init functions and vectors
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type=window_type)
    SP = []

    #compute STFT
    for frame in ess.FrameGenerator(x,
                                    frameSize=M,
                                    hopSize=H,
                                    startFromZero=True):  #generate frames
        wX = window(frame)  #window frame
        mX = spectrum(wX)  #compute fft
        SP.append(mX)

    SP = essentia.array(SP)
    SP = np.power(SP, 2. / 3.)  #power law compression
    #SP = SP[:,:int(FFT_SIZE/2+1)]  #cut upper spectrum (above 4 khz)

    return SP
Example #22
def get_f0(audio, minf0=20, maxf0=22050, cf=0.9, ws=2048, hs=256):
        '''
        Args:
            audio (array): audio signal (output from MonoLoader)
            minf0 (int): minimum allowed frequency
            maxf0 (int): maximum allowed frequency
            cf (float): confidence threshold (0 - 1)
            ws (int): window size
            hs (int): hop size

        Returns:
            f0 (array):
        '''
        # instantiate Essentia functions
        w = es.Windowing(type='hann', zeroPadding=ws)
        spec = es.Spectrum()
        yin = es.PitchYinFFT(minFrequency=minf0, maxFrequency=maxf0, frameSize=ws)

        # empty lists for f0 and confidence
        f0 = []
        conf = []

        # iterate over frames
        for frame in es.FrameGenerator(audio, frameSize=ws, hopSize=hs):
            p, pc = yin(spec(w(frame)))
            f0.append(p)
            conf.append(pc)

        # convert lists to np.arrays
        f0 = np.array(f0)
        conf = np.array(conf)

        # return f0 over given confidence
        f0[conf < cf] = 0
        return f0
Example #23
    def calc_chromagram(self):

        # save the results in the stft_pool
        self.chromagram = []
        hpcp = es.HPCP(
            size=12,  # 12 pitch classes; increase for higher resolution (e.g. for key estimation)
            referenceFrequency=440,  # assume tuning frequency is 440 Hz
            bandPreset=False,
            weightType='cosine',
            nonLinear=False,
            windowSize=1.,
            sampleRate=self.sample_rate)

        spectrum = es.Spectrum(size=self.fft_size)
        spectral_peaks = es.SpectralPeaks(sampleRate=self.sample_rate)

        for frame in es.FrameGenerator(self.audio,
                                       frameSize=self.frame_size,
                                       hopSize=self.hop_size,
                                       startFromZero=True):
            frame = array(frame * self.window)
            freqs, mags = spectral_peaks(spectrum(frame))
            chroma = hpcp(freqs, mags)
            self.chromagram.append(chroma)

        self.chromagram = array(self.chromagram)

        self.timeAxSec = np.arange(len(
            self.chromagram)) * self.hop_size / float(self.sample_rate)
Example #24
def getMFCCBands2D(audio, framesize, hopsize, nbf=False, nlen=10):
    """
    mel bands feature
    the output feature for each time stamp is a 2D matrix
    the input audio needs to be a float32 array
    :param audio:
    :param framesize:
    :param hopsize:
    :param nbf: bool, whether to stack neighbouring frames
    :param nlen: number of neighbouring frames on each side
    :return:
    """

    mfcc = []
    # audio_p = audio[p[0]*fs:p[1]*fs]
    for frame in ess.FrameGenerator(audio,
                                    frameSize=framesize,
                                    hopSize=hopsize):
        frame = WINDOW(frame)
        mXFrame = SPECTRUM(frame)
        bands, mfccFrame = MFCC(mXFrame)
        mfcc.append(bands)

    if nbf:
        mfcc = np.array(mfcc).transpose()
        mfcc_out = np.array(mfcc, copy=True)
        for ii in range(1, nlen + 1):
            mfcc_right_shift = Fprev_sub(mfcc, w=ii)
            mfcc_left_shift = Fprev_sub(mfcc, w=-ii)
            mfcc_out = np.vstack((mfcc_right_shift, mfcc_out, mfcc_left_shift))
        feature = mfcc_out.transpose()
    else:
        feature = mfcc
    # the mel bands features
    feature = np.array(feature, dtype='float32')

    return feature
Example #25
def get_hpeaks_per_frame(audio, sr=44100, onlyfrecuencies=False, nsines=20):
    """
    Get the harmonic peaks of an audio signal
    :param audio: Audio, either mono or stereo. Stereo is downmixed to mono
    :param sr: Sample rate of the audio
    :return: per-frame harmonic peak frequencies and magnitudes (frequencies only if onlyfrecuencies is True)
    """
    if audio.ndim > 1:
        audio = std.MonoMixer()(audio, audio.shape[1])

    fft_algo = std.FFT()
    pyin = std.PitchYin()
    hpeaks = std.HarmonicPeaks()
    sine_anal = std.SineModelAnal(maxnSines=nsines,
                                  orderBy='frequency',
                                  minFrequency=1)
    sines = []
    for i, frame in enumerate(
            std.FrameGenerator(audio, frameSize=4096, hopSize=2048)):
        pitch, _ = pyin(frame)
        fft = fft_algo(frame)
        freqs, mags, _ = sine_anal(fft)
        sorting_indexes = np.argsort(freqs)
        freqs = freqs[sorting_indexes]
        mags = mags[sorting_indexes]
        non_zero_freqs = np.where(freqs != 0)
        freqs = freqs[non_zero_freqs]
        mags = mags[non_zero_freqs]
        freqs, mags = hpeaks(freqs, mags, pitch)
        sines.append([freqs, mags])
    sines = np.array(sines)
    if onlyfrecuencies:
        return sines[:, 0, :]
    else:
        return sines[:, 0, :], sines[:, 1, :]
Example #26
def extract_features(path):
    loader = essentia.standard.MonoLoader(filename=path)
    audio = loader()
    mfcc = MFCC(numberCoefficients=13)
    loudness = Loudness()
    spectrum = Spectrum()  # FFT() would return the complex FFT, here we just want the magnitude spectrum
    w = Windowing(type='hann')

    pool = essentia.Pool()
    for frame in ess.FrameGenerator(audio,
                                    frameSize=1024,
                                    hopSize=512,
                                    startFromZero=True):
        mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
        average_loudness = loudness(spectrum(w(frame)))
        pool.add('lowlevel.mfcc', mfcc_coeffs)
        pool.add('lowlevel.loudness', average_loudness)
        #pool.add('lowlevel.mfcc_bands', mfcc_bands)
        #pool.add('lowlevel.mfcc_bands_log', logNorm(mfcc_bands))

    #YamlOutput(filename = 'mfcc.sig', format='yaml', writeVersion=False)(pool)

    # compute mean and variance of the frames
    #aggrPool = PoolAggregator(defaultStats = [ 'mean', 'stdev' ])(pool)
    aggrPool = PoolAggregator(defaultStats=['mean'])(pool)

    # and output those results in a file
    YamlOutput(filename='features.json', format='json',
               writeVersion=False)(aggrPool)
    save_descriptors_as_strings()
Example #27
def getHPCPEssentia(XAudio, Fs, winSize, hopSize, squareRoot=False, NChromaBins=36, NHarmonics = 0):
    """
    Wrap around the essentia library to compute HPCP features
    :param XAudio: A flat array of raw audio samples
    :param Fs: Sample rate
    :param winSize: Window size of each STFT window
    :param hopSize: Hop size between STFT windows
    :param squareRoot: Do square root compression?
    :param NChromaBins: How many chroma bins (default 36)
    :param NHarmonics: How many harmonics to use for HPCP weighting (default 0)
    :returns H: An (NChromaBins x NWindows) matrix of all \
        chroma windows
    """
    import essentia
    from essentia import Pool, array
    import essentia.standard as ess
    spectrum = ess.Spectrum()
    window = ess.Windowing(size=winSize, type='hann')
    spectralPeaks = ess.SpectralPeaks()
    hpcp = ess.HPCP(size=NChromaBins, harmonics=NHarmonics)
    H = []
    for frame in ess.FrameGenerator(array(XAudio), frameSize=winSize, hopSize=hopSize, startFromZero=True):
        S = spectrum(window(frame))
        freqs, mags = spectralPeaks(S)
        H.append(hpcp(freqs, mags))
    H = np.array(H)
    H = H.T
    if squareRoot:
        H = sqrtCompress(H)
    return H
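A minimal usage sketch for the wrapper above; the song path is hypothetical and squareRoot is left off so the sqrtCompress helper is not needed.

import essentia.standard as ess

XAudio = ess.MonoLoader(filename='song.wav', sampleRate=44100)()  # hypothetical path
H = getHPCPEssentia(XAudio, Fs=44100, winSize=4096, hopSize=2048)
print(H.shape)  # (NChromaBins, num_windows) = (36, num_windows)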
Example #28
def extract_features(x,
                     M=Config.WINDOW_SIZE,
                     N=Config.FFT_SIZE,
                     H=Config.HOP_SIZE,
                     fs=Config.FS,
                     window_type=Config.WINDOW_TYPE):
    '''
    Function that extracts spectrogram from an audio signal
    -----------------------
    Input: Samples, window size (int), FFT size (int), Hop size (int),
    Sampling rate, Window type (e.g. Hanning)

    Output: Spectrogram
    -----------------------
    '''
    # init functions and vectors
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type=window_type)
    SP = []
    # compute STFT
    for frame in ess.FrameGenerator(x,
                                    frameSize=M,
                                    hopSize=H,
                                    startFromZero=True):  # generate frames
        wX = window(frame)  # window frame
        mX = spectrum(wX)  # compute fft

        SP.append(mX)
    SP = essentia.array(SP)
    SP = np.power(SP, 2. / 3.)  # power law compression
    SP = SP[:, :int(Config.FFT_SIZE / 4 + 1)]

    return SP
Example #29
    def compute(self, *args):
        self.algo.reset()
        for frame in es.FrameGenerator(args[1], frameSize=frameSize,
                                       hopSize=hopSize,
                                       startFromZero=True):
            snr, _, _  = self.algo(frame)

        return esarr([snr])
Example #30
    def _build_mfcc(self):
        for frame in es.FrameGenerator(self.audio,
                                       frameSize=1024,
                                       hopSize=512,
                                       startFromZero=True):
            spec = spectrum(w(frame))
            _, mfcc_coeffs = mfcc(spec)
            self.pool.add('MFCC', mfcc_coeffs)