Example No. 1
def create_dissonances_examples(audio_target_file,
                                file_compatibilities,
                                audios_folder,
                                n_examples=3,
                                sr=44100):
    """
    Given an audio and a file with its compatibilities create the mixes
    :param audio_target_file: The audio itself
    :param file_compatibilities: The compatibility file for the target audio
    :param audios_folder: The folder where are located all the audios
    :param n_examples: Number of examples to generate
    :param sr: sample rate of the final mix
    :return: A list where each element is a mix
    """
    df = pd.read_csv(file_compatibilities)
    listoreturn = []

    df_sorted = df.sort_values(by=['compatibility_framewise'],
                               ascending=True).iloc[:n_examples, :]
    for idx, candidate in df_sorted.iterrows():
        cand_f = candidate['filename'].split('/')[-1]
        pshift = candidate['pitch_shift_framewise']
        audio_target = std.MonoLoader(filename=os.path.join(
            audios_folder, audio_target_file),
                                      sampleRate=sr)()
        audio_candidate = std.MonoLoader(filename=os.path.join(
            audios_folder, cand_f),
                                         sampleRate=sr)()
        audio_candidate = pitch_shift(audio_candidate, sr,
                                      pshift).astype(np.float32)
        audio = mix(audio_target, audio_candidate, sr=sr)
        listoreturn.append(audio)

    return listoreturn
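A hypothetical usage sketch: the CSV path, loop names, and the soundfile import are assumptions; pitch_shift and mix are the helper functions used inside create_dissonances_examples.

import soundfile as sf

mixes = create_dissonances_examples('target_loop.wav',
                                    'compatibilities/target_loop.csv',
                                    'audios/',
                                    n_examples=3,
                                    sr=44100)
for i, mix_audio in enumerate(mixes):
    # Write each generated mix to disk for listening tests.
    sf.write('dissonance_example_{}.wav'.format(i), mix_audio, 44100)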
Example No. 2
    def mix(self, filename, synthesized_voice):
        # Get instrument lineup
        filename_violin = filename.replace("vocal.wav", "violin.wav")
        filename_mridangam_right = filename.replace("vocal.wav", "mridangam_right.wav")
        filename_mridangam_left = filename.replace("vocal.wav", "mridangam_left.wav")
        filename_tanpura = filename.replace("vocal.wav", "tanpura.wav")

        # Load audios and trim to synthesized voice length
        violin_mono = estd.MonoLoader(filename=filename_violin)()
        violin_mono_processed = np.array(violin_mono[:len(synthesized_voice) + 1], dtype='float64')
        violin_mono_processed_filt = self.filter_audio(audio=violin_mono_processed, coef=0.00075)
        mridangam_right_mono = estd.MonoLoader(filename=filename_mridangam_right)()
        mridangam_right_mono_processed = np.array(mridangam_right_mono[:len(synthesized_voice) + 1], dtype='float64')
        mridangam_right_mono_processed_filt = self.filter_audio(audio=mridangam_right_mono_processed, coef=0.001)
        mridangam_left_mono = estd.MonoLoader(filename=filename_mridangam_left)()
        mridangam_left_mono_processed = np.array(mridangam_left_mono[:len(synthesized_voice) + 1], dtype='float64')
        mridangam_left_mono_processed_filt = self.filter_audio(audio=mridangam_left_mono_processed, coef=0.001)
        tanpura_mono = estd.MonoLoader(filename=filename_tanpura)()
        tanpura_mono_processed = np.array(tanpura_mono[:len(synthesized_voice) + 1], dtype='float64')
        
        # Assign weights
        if self.mixing_weights:
            weight_voice = self.mixing_weights['voice']
            weight_violin = self.mixing_weights['violin']
            weight_mridangam_right = self.mixing_weights['mridangam_right']
            weight_mridangam_left = self.mixing_weights['mridangam_left']
            weight_tanpura = self.mixing_weights['tanpura']
        else:
            # Predefined weights in case no weight dict is provided
            weight_voice = 5.25
            weight_violin = 4
            weight_mridangam_right = 1
            weight_mridangam_left = 1
            weight_tanpura = 33.5

        # Get mix
        synthesized_audio_mix = [
            x*weight_voice +
            y*weight_violin +
            z*weight_mridangam_right +
            w*weight_mridangam_left +
            t*weight_tanpura for x, y, z, w, t in zip(
                synthesized_voice,
                violin_mono_processed_filt,
                mridangam_right_mono_processed_filt,
                mridangam_left_mono_processed_filt,
                tanpura_mono_processed
            )
        ]
        
        return synthesized_audio_mix
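For reference, the same weighted sum can be expressed with NumPy broadcasting. This is only a sketch, assuming all stems are trimmed to a common length; it is not part of the class above.

import numpy as np

def weighted_mix(stems, weights):
    # Trim every stem to the shortest one, stack them, and take the weighted per-sample sum.
    length = min(len(s) for s in stems)
    stacked = np.stack([np.asarray(s[:length], dtype='float64') for s in stems])
    return np.asarray(weights, dtype='float64') @ stacked

# e.g. weighted_mix([synthesized_voice, violin, mridangam_r, mridangam_l, tanpura],
#                   [5.25, 4, 1, 1, 33.5])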
Example No. 3
def get_melspecs(audio_file: Path, algorithms: dict) -> Optional[dict[str, np.ndarray]]:
    # loading file
    audio = ess.MonoLoader(filename=str(audio_file), sampleRate=SAMPLE_RATE)()

    # precompute melspecs
    melspecs_all = {}
    for algorithm_name in algorithms:
        parameters = algorithms[algorithm_name]

        melspec_extractor = getattr(ess, parameters['melspec-algorithm'])()
        melspecs = []
        for frame in ess.FrameGenerator(audio, frameSize=parameters['frame-size'], hopSize=parameters['hop-size']):
            melspecs.append(melspec_extractor(frame))

        melspecs = np.array(melspecs)

        # reshape melspecs into tensor batches and discard the remainder
        discard = melspecs.shape[0] % parameters['patch-size']
        if discard != 0:
            melspecs = melspecs[:-discard, :]
        melspecs = np.reshape(melspecs, [-1, parameters['patch-size'], parameters['number-bands']])
        batch = np.expand_dims(melspecs, 2)

        melspecs_all[algorithm_name] = batch

    return melspecs_all
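A hypothetical configuration for the algorithms dict: the keys mirror the ones read inside get_melspecs, while the MusiCNN-style values below are only an illustration.

from pathlib import Path

algorithms = {
    'musicnn': {
        'melspec-algorithm': 'TensorflowInputMusiCNN',  # name of an essentia.standard algorithm
        'frame-size': 512,
        'hop-size': 256,
        'patch-size': 187,
        'number-bands': 96,
    }
}
batches = get_melspecs(Path('track.mp3'), algorithms)  # dict of (n_patches, patch-size, 1, number-bands) arrays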
def chop_folder(folder):
    count = 0
    for loop in os.listdir(folder):
        try:
            loop_path = os.path.join(folder, loop)
            audio_file = es.MonoLoader(filename=loop_path,
                                       sampleRate=sampleRate)
            audio = audio_file.compute()
            count += 1

            if count % 50 == 0:
                print(count)

            chops = range(math.ceil(len(audio) / four_bar_length))
            for chop in chops:
                if chop == chops[-1]:
                    audio_to_save = np.zeros(four_bar_length)
                    audio_to_save[:len(audio[chop * four_bar_length:]
                                       )] = audio[chop * four_bar_length:]
                    sf.write(FLSD_CHOPPED_AUDIO_DIR + str(chop + 1) + loop,
                             audio_to_save, sampleRate)
                else:
                    audio_to_save = audio[chop * four_bar_length:(chop + 1) *
                                          four_bar_length]
                    sf.write(FLSD_CHOPPED_AUDIO_DIR + str(chop + 1) + loop,
                             audio_to_save, sampleRate)
        except Exception:
            print(loop + " failed!")
Example No. 5
def energyThresholdAudio(soundfilesList):

    for sound in soundfilesList:
        RMS = esst.RMS()
        audioLoader = esst.MonoLoader(filename=sound)
        audio = audioLoader()

        start = 0
        end = 0
        thresh = 0.05
        rms_vals = []
        for frame in esst.FrameGenerator(audio, frameSize=2048, hopSize=1024, startFromZero=True):
            rms = RMS(frame)
            rms_vals.append(float(rms))
        rms_vals = np.array(rms_vals)

        higher = np.where(rms_vals >= thresh)[0]
        if len(higher) > 1:
            start = higher[0]
            end = higher[-1]
        else:
            continue

        newAudio = audio[start * 1024:end * 1024]
        writer = esst.MonoWriter(filename=sound, format="mp3")
        writer(newAudio)
        print(sound)
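A hedged usage sketch; note that the function re-encodes each trimmed file back to its original path (MonoWriter overwrites the input in place).

energyThresholdAudio(['/tmp/clip1.wav', '/tmp/clip2.wav'])  # placeholder paths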
Example No. 6
def extract_predominant_vocal_melody(audio_filename,
                                     hopSize,
                                     frameSize,
                                     pYinInst,
                                     end_ts=None):
    '''
    Extract the predominant vocal pitch contour.
    As a workaround, intersect the extracted pitch with the vocal annotation.

    Parameters
    -----------------------
    end_ts: extract until this timestamp; disregard the rest of the audio

    Returns
    -------------------
    list of estimated pitch values in Hz; at non-vocal frames the value is <= 0
    '''
    if WITH_MELODIA:

        if WITH_MAKAM:  #### use predominant melody tailored to makam
            path_Alignment_duration = os.path.join(parentDir,
                                                   'AlignmentDuration')
            if path_Alignment_duration not in sys.path:
                sys.path.append(path_Alignment_duration)
            from src.align.FeatureExtractor import extractPredominantMelodyMakam
            estimatedPitch_andTs = extractPredominantMelodyMakam(
                audio_filename[:-4],
                frameSize,
                hopSize,
                jointAnalysis=False,
                musicbrainzid=None,
                preload=True)  # jointAnalysis=False, because no
        else:  # use melodia
            estimatedPitch_andTs = extractPredominantMelody(
                audio_filename, frameSize, hopSize)

    else:  ######### pYIN
        audio = ess.MonoLoader(filename=audio_filename, sampleRate=fs)()
        for frame in ess.FrameGenerator(audio,
                                        frameSize=frameSize,
                                        hopSize=hopSize):
            featureSet = pYinInst.process(frame)

        estimatedPitch = pYinInst.decodePitchTrack()  # pitch extraction
        ts = []  ### generated timestamps
        for onset_frame_number, frame in enumerate(estimatedPitch):
            ts.append(frame_to_ts(onset_frame_number, float(hopSize / fs)))
        estimatedPitch_andTs = np.vstack((np.array(ts), estimatedPitch)).T

    if end_ts is not None:
        idx_end_ts = np.searchsorted(estimatedPitch_andTs[:, 0],
                                     end_ts)  #  until end_ts
        estimatedPitch_andTs = estimatedPitch_andTs[:min(
            idx_end_ts + 1, estimatedPitch_andTs.shape[0]), :]

    if MonoNoteParameters.WITH_VOCAL_SEGMENTS:  # vocal segments given
        estimatedPitch_andTs = intersect_vocal_segments(
            audio_filename, estimatedPitch_andTs)

    return estimatedPitch_andTs[:, 1]
Example No. 7
def loadaudio(args):
    """util.loadaudio

    Load data from an audio file in any format supported by MonoLoader
    """
    loader = estd.MonoLoader(filename=args.file, sampleRate=args.samplerate)
    return loader()
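A minimal wiring sketch for loadaudio, assuming it receives an argparse namespace with file and samplerate attributes (as the function body suggests):

import argparse

parser = argparse.ArgumentParser(description='Load an audio file with essentia MonoLoader')
parser.add_argument('file', help='path to the audio file')
parser.add_argument('--samplerate', type=int, default=44100)
args = parser.parse_args()
audio = loadaudio(args)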
def BatchProcess_TonicIdentification(RootDir,
                                     tonicExt='.tonic',
                                     FileExt2Proc=".wav",
                                     overwrite=0):

    audiofilenames = GetFileNamesInDir(RootDir, FileExt2Proc)

    for audiofilename in audiofilenames:
        print "processing %s" % audiofilename
        path, fileN = os.path.split(audiofilename)
        fname, ext = os.path.splitext(audiofilename)
        tonic_filename = fname + tonicExt
        if overwrite == 0 and os.path.isfile(tonic_filename):
            continue

        tonic_file = open(tonic_filename, 'w')

        audio = ES.MonoLoader(filename=audiofilename)()

        tonic = ES.TonicIndianArtMusic()(audio)

        MBID = audiofilename.split("/")[-1].strip()
        print(MBID)
        tonic_file.write(str(tonic) + "\n")
        tonic_file.close()
Example No. 9
def featureExtraction(soundfiles):
    # extractor = esst.LowLevelSpectralExtractor()
    extractor = esst.Extractor(dynamics=False,
                               dynamicsFrameSize=88200,
                               dynamicsHopSize=44100,
                               highLevel=False,
                               lowLevel=True,
                               lowLevelFrameSize=2048,
                               lowLevelHopSize=1024,
                               midLevel=True,
                               namespace="",
                               relativeIoi=False,
                               rhythm=False,
                               sampleRate=44100,
                               tonalFrameSize=4096,
                               tonalHopSize=2048,
                               tuning=True)

    # soundfiles = listdir(inputPath)
    for file in soundfiles:

        path1 = '/Users/helena/Desktop/SMC/ASP/sms-tools/workspace/A10/code/downloaded/'
        name = file[70:-4] + '_features.json'
        outPath = path1 + 'features/' + name
        print(file)
        audioLoader = esst.MonoLoader(filename=file)
        audio = audioLoader()
        pool = essentia.Pool()
        pool = extractor(audio)
        aggPool = esst.PoolAggregator()(pool)
        output = esst.YamlOutput(filename=outPath, format='json')
        output(aggPool)
        print(outPath + ' exported')
def extract_for_one(wavDataDir, lineList, filename, FILE_EXT_WAV):
    filename_wav                 = os.path.join(wavDataDir, filename + FILE_EXT_WAV)
    filename_wav_silence_removed = os.path.join(wavDataDir + '_silence_removed', 'temp' + FILE_EXT_WAV)

    ##-- remove the silence from audio
    sr = 44100
    audio = ess.MonoLoader(filename=filename_wav, downmix='left', sampleRate=sr)()
    audio_remove_silence = removeSilence(audio, sr, lineList)
    wavfile.write(filename_wav_silence_removed,sr,audio_remove_silence)

    ##-- process the silence removed audio
    loader = essentia.streaming.EqloudLoader(filename=filename_wav_silence_removed)
    fEx = FeatureExtractor(frameSize=2048, hopSize=1024, sampleRate=loader.paramValue('sampleRate'))
    p = essentia.Pool()

    loader.audio >> fEx.signal

    for desc, output in fEx.outputs.items():
        output >> (p, desc)

    essentia.run(loader)

    # convert pitch from hz to cents
    for i in range(len(p['pitch_instantaneous_pitch'])):
        p['pitch_instantaneous_pitch'][i] = hz2cents(p['pitch_instantaneous_pitch'][i])

    stats = ['mean', 'var', 'dmean', 'dvar']
    statsPool = essentia.standard.PoolAggregator(defaultStats=stats)(p)

    return statsPool
Example No. 11
def getFeatSequence(inputFile, pulsePos):
    audio = ess.MonoLoader(filename=inputFile, sampleRate=params.Fs)()
    frameCounter = 0
    pool = es.Pool()
    pool.add('samples', audio)
    for frame in ess.FrameGenerator(audio, frameSize=params.frmSize, hopSize=params.hop):
        ts = params.hop / params.Fs * frameCounter + params.frmSize / float(2 * params.Fs)
        zpFrame = np.hstack((frame, zz))
        mag = spec(window(zpFrame))
        mfccBands, mfccSeq = genmfcc(mag)
        pool.add('rms', rms(mag))
        pool.add('mfcc', mfccSeq)
        pool.add('time', ts)
        frameCounter += 1
    if pulsePos is not None:
        pulsePos = np.append(pulsePos, len(audio) / params.Fs)
        for tp in range(len(pulsePos) - 1):
            pool.add('pst', pulsePos[tp])
            pool.add('pet', pulsePos[tp + 1])
            temp1 = np.where(pool['time'] >= pulsePos[tp])[0]
            temp2 = np.where(pool['time'] < pulsePos[tp + 1])[0]
            binIndices = np.intersect1d(temp1, temp2)
            pool.add('pmfcc', np.mean(pool['mfcc'][binIndices, :], axis=0))
            pool.add('prms', np.mean(pool['rms'][binIndices]))
    else:
        pool.add('pst', 0.0)
        pool.add('pet', len(audio) / params.Fs)
        pool.add('pmfcc', np.mean(pool['mfcc'], axis=0))
        pool.add('prms', np.mean(pool['rms'], axis=0))
    return pool
Example No. 12
def extract(fname, outpath, fs=22050, fsize=1024, hsize=512):
    """
    extract(fname, outpath, fs, fsize, hsize) will compute the mfcc of Audio file fname.

    Inputs:
        fname   -- is the name of audio file.
        outpath -- is the output path of processed files.
        fs      -- is the sampling frequency (Hz).
        fsize   -- is the size of each frame.
        hsize   -- is the hop size betwean frames.
    Outputs:
        the file contains the mfcc coefficents of audio file.
        in what format???
    """
    #    gate(fname)
    loader = es.MonoLoader(filename=fname, sampleRate=fs)
    #    length = len(loader)
    #    maxim = max(loader)
    #    for sample in loader:
    #        if abs(sample) < maxim/20:
    #            sample = 0 ;

    w = es.Windowing(type='hann')
    spectrum = es.Spectrum()
    mfcc = es.MFCC(inputSize=fsize // 2 + 1, numberCoefficients=20)

    mfccs = []
    audio = loader()
    for frame in es.FrameGenerator(audio, frameSize=fsize, hopSize=hsize):
        mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
        mfccs.append(mfcc_coeffs)

    mfccs = np.array(mfccs)
    return mfccs
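A hedged usage sketch with placeholder paths; since extract returns the coefficient matrix rather than writing a file, saving is left to the caller.

import os
import numpy as np

coeffs = extract('song.wav', 'out/', fs=22050, fsize=1024, hsize=512)
np.save(os.path.join('out', 'song_mfcc.npy'), coeffs)  # shape: (num_frames, 20)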
Example No. 13
 def __init__(self,
              audio_file,
              mono=True,
              hop_length=512,
              sample_rate=44100,
              normalize_gain=False,
              verbose=False):
     """[summary]
     
     Arguments:
         audio_file {[type]} -- [description]
     
     Keyword Arguments:
         mono {bool} -- [description] (default: {True})
         hop_length {int} -- [description] (default: {512})
         sample_rate {int} -- [description] (default: {44100})
         normalize_gain {bool} -- [description] (default: {False})
         verbose {bool} -- [description] (default: {False})
     """
     self.hop_length = hop_length
     self.fs = sample_rate
     self.audio_file = audio_file
     if normalize_gain:
         self.audio_vector = estd.EasyLoader(filename=audio_file,
                                             sampleRate=self.fs,
                                             replayGain=-9)()
     elif mono:
         self.audio_vector = estd.MonoLoader(filename=audio_file,
                                             sampleRate=self.fs)()
     if verbose:
         print(
             "== Audio vector of %s loaded with shape %s and sample rate %s =="
             % (audio_file, self.audio_vector.shape, self.fs))
Example No. 14
    def __init__(self, path):
        self.audio = es.MonoLoader(filename=str(path))()
        self.name = path.name
        self.pool = essentia.Pool()

        self._build_temporal_features()
        self._build_spectral_features()
        self._build_harmonic_features()
        self._build_mfcc()

        self._features = {
            'audio_correlation': 'AC',
            'audio_power': 'AP',
            'audio_waveform': 'AWF',
            'bandwidth': 'SB',
            'effective_duration': 'ED',
            'fundamental_freq': 'F0',
            'inharmonicity': 'INH',
            'log_attack_time': 'LAT',
            'max_freq': 'FMax',
            'mfcc': 'MFCC',
            'min_freq': 'FMin',
            'oer': 'OER',
            'peak_ampl': 'PA',
            'peak_freq': 'PF',
            'spectral_centroid': 'SC',
            'spectral_flatness': 'SF',
            'spectral_flux': 'SFX',
            'spectral_roll_off': 'SRO',
            'spectral_spread': 'SS',
            'temporal_centroid': 'TC',
            'tristimulus': 'T',
            'zcr': 'ZCR'
        }
Example No. 15
def get_beat_chunks(filename, bpm_restrict=None):
    audio = std.MonoLoader(filename=filename)()
    hpcp = std.HPCP()
    spectrum = std.Spectrum()
    speaks = std.SpectralPeaks()
    large_speaks = std.SpectralPeaks(maxPeaks=2000)
    tivs = []
    sr = 44100
    bpm = get_tempo(filename)
    tivs_framewise = []
    if bpm_restrict is not None and bpm_restrict != bpm:
        raise ValueError("detected BPM %s does not match bpm_restrict %s" % (bpm, bpm_restrict))
    sec_beat = (60 / bpm)
    beats = np.arange(0, len(audio) / sr, sec_beat)
    beats = np.append(beats, len(audio) / sr)
    for i in range(1, len(beats)):
        segmented_audio = audio[int(beats[i - 1] * sr):int(beats[i] * sr)]
        cutter = std.FrameGenerator(segmented_audio)
        for sec in cutter:
            spec = spectrum(sec)
            freq, mag = speaks(spec)
            chroma = hpcp(freq, mag)
            tivs_framewise.append(chroma)
        np2_seg_audio = zeropad_next_power_2(segmented_audio)
        spec = spectrum(np2_seg_audio)
        freq, mag = speaks(spec)
        chroma = hpcp(freq, mag)
        tivs.append(chroma)

    # Calculate the whole TIV
    np2_whole = zeropad_next_power_2(audio)
    spec = spectrum(np2_whole)
    freq, mag = large_speaks(spec)
    chroma_whole = hpcp(freq, mag)
    return mt.TIVCollection.from_pcp(np.array(tivs).T), mt.TIV.from_pcp(chroma_whole), mt.TIVCollection.from_pcp(np.array(tivs_framewise).T)
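A hedged usage sketch with a placeholder filename; the three return values are the per-beat TIVs, the whole-track TIV, and the frame-wise TIVs, in that order.

beat_tivs, whole_tiv, frame_tivs = get_beat_chunks('loop.wav', bpm_restrict=None)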
Example No. 16
 def __get_signal__(self):
     """
     :rtype: returns the audio signal by reading the file
     """
     e_monoloader = e.MonoLoader(filename=self.fpath)
     self.signal = e_monoloader()
     self.signal_length = len(self.signal)
Example No. 17
def computeEnergyHistogram(inputAudioFile, outputJsonFile, threshold,
                           histograms):

    M = 2048
    H = 1024
    fs = 44100

    energy = ess.Energy()
    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)

    E = []
    numFrames = 0
    for frame in frames:
        numFrames += 1
        E_frame = energy(frame)
        E.append(E_frame)

    E = np.array(E)
    E_norm = E / np.max(E)

    for i in range(len(threshold)):
        t = threshold[i]
        histograms[i] = np.append(histograms[i],
                                  [0] * (numFrames - len(histograms[i])))
        idx_threshold = np.where(E_norm > t)
        histograms[i][idx_threshold[0]] += 1
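A hedged usage sketch showing how the threshold list and the mutable histograms list might be prepared before calling the function (file names and threshold values are placeholders):

import numpy as np

threshold = [0.1, 0.3, 0.5]
histograms = [np.array([]) for _ in threshold]
for wav in ['take1.wav', 'take2.wav']:
    computeEnergyHistogram(wav, 'energy.json', threshold, histograms)
# histograms[i][k] now counts how many files exceeded threshold[i] at frame k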
Example No. 18
def featureExtraction(soundfiles):

    #extractor = esst.LowLevelSpectralExtractor()
    extractor = esst.Extractor(dynamics=True,
                               dynamicsFrameSize=88200,
                               dynamicsHopSize=44100,
                               highLevel=True,
                               lowLevel=True,
                               lowLevelFrameSize=2048,
                               lowLevelHopSize=1024,
                               midLevel=True,
                               namespace="",
                               relativeIoi=False,
                               rhythm=True,
                               sampleRate=44100,
                               tonalFrameSize=4096,
                               tonalHopSize=2048,
                               tuning=True)

    #soundfiles = listdir(inputPath)

    for file, outPath in soundfiles:

        audioLoader = esst.MonoLoader(filename=file)
        audio = audioLoader()
        pool = essentia.Pool()
        pool = extractor(audio)
        aggPool = esst.PoolAggregator()(pool)
        esst.YamlOutput(filename=outPath + 'features.json',
                        format='json')(aggPool)
        print(file + ' exported')
def mfccFeature_audio(filename_wav, index_keep, feature_type='mfcc'):
    audio = ess.MonoLoader(downmix='left',
                           filename=filename_wav,
                           sampleRate=fs)()
    if feature_type == 'mfcc':
        feature = getFeature(audio)
    elif feature_type == 'mfccBands1D':
        feature = getMFCCBands1D(audio)
    elif feature_type == 'mfccBands2D':
        feature = getMFCCBands2D(audio, nbf=True)

    if feature_type == 'mfccBands1D' or feature_type == 'mfccBands2D':
        feature = np.log(100000 * feature + 1)
        scaler = pickle.load(open(kerasScaler_path, 'rb'))
        feature = scaler.transform(feature)

    # feature             = preprocessing.StandardScaler().fit_transform(feature)
    # index_keep          = pitchProcessing_audio(filename_wav)
    feature_out = feature[index_keep[0], :]

    for index in index_keep[1:]:
        feature_out = np.vstack((feature_out, feature[index, :]))

    if feature_type == 'mfccBands2D':
        feature_out = featureReshape(feature_out)

    return feature_out
 def load_audio(self):
     # loads the audio
     # note: an equal-loudness filter should be applied before PredominantPitchMelodia
     loader = es.MonoLoader(filename=self.filename,
                            sampleRate=self.sample_rate)
     self.audio = loader()
     xvals = np.arange(len(self.audio)) / float(self.sample_rate)
     self.xlim = [0, max(xvals)]
Example No. 21
def duration(infile):
    """
    Returns the duration of a song in seconds.
    """
    dur = standard.Duration()
    audio = standard.MonoLoader(filename=infile)()
    duration = dur(audio)
    return duration
Example No. 22
def get_number_beats(filename):
    audio = std.MonoLoader(filename=filename)()
    sr = 44100
    bpm = get_tempo(filename)
    sec_beat = (60 / bpm)
    beats = np.arange(0, len(audio) / sr, sec_beat)
    beats = np.append(beats, len(audio) / sr)
    return len(beats)
Example No. 23
def estimate_beats(infile):
    """
    Return the estimated beat onsets in seconds for an audio file.
    """
    audio = standard.MonoLoader(filename=infile)()
    bt = standard.BeatTrackerMultiFeature()
    beats, confidence = bt(audio)
    return beats
Example No. 24
def features(filename):

    wav, fs = soundfile.read(filename)
    audio = ess.MonoLoader(downmix='left', filename=filename, sampleRate=fs)()
    features, d_MRCG, dd_MRCG = MRCG(audio, fs=fs)

    print(features)
    return
Example No. 25
def extractor(filename):

    fs = 44100
    audio = ess.MonoLoader(filename=filename, sampleRate=fs)()
    # dynamic range expansion as done in HTK implementation
    audio = audio * 2**15

    frameSize = 1102  # corresponds to htk default WINDOWSIZE = 250000.0
    hopSize = 441  # corresponds to htk default TARGETRATE = 100000.0
    fftSize = 2048
    spectrumSize = fftSize // 2 + 1
    zeroPadding = fftSize - frameSize

    w = ess.Windowing(
        type='hamming',  #  corresponds to htk default  USEHAMMING = T
        size=frameSize,
        zeroPadding=zeroPadding,
        normalized=False,
        zeroPhase=False)

    spectrum = ess.Spectrum(size=fftSize)

    mfcc_htk = ess.MFCC(
        inputSize=spectrumSize,
        type='magnitude',  # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',  # computation of filter weights done in Hz domain
        highFrequencyBound=8000,  # corresponds to htk default
        lowFrequencyBound=0,  # corresponds to htk default
        numberBands=26,  # corresponds to htk default  NUMCHANS = 26
        numberCoefficients=13,
        normalize='unit_max',  # htk filter normalization to have constant height = 1
        dctType=3,  # htk uses DCT type III
        logType='log',
        liftering=22)  # corresponds to htk default CEPLIFTER = 22

    mfccs = []
    # startFromZero = True, validFrameThresholdRatio = 1 : the way htk computes windows
    for frame in ess.FrameGenerator(audio,
                                    frameSize=frameSize,
                                    hopSize=hopSize,
                                    startFromZero=True,
                                    validFrameThresholdRatio=1):
        spect = spectrum(w(frame))
        mel_bands, mfcc_coeffs = mfcc_htk(spect)
        mfccs.append(mfcc_coeffs)

    # transpose to have it in a better shape
    # we need to convert the list to an essentia.array first (== numpy.array of floats)
    # mfccs = essentia.array(pool['MFCC']).T
    mfccs = essentia.array(mfccs).T

    # and plot
    plt.imshow(mfccs[1:, :], aspect='auto',
               interpolation='none')  # ignore energy
    # plt.imshow(mfccs, aspect = 'auto', interpolation='none')
    plt.show()  # unnecessary if you started "ipython --pylab"
    def __init__(self, parent=None, **options):
        #
        #   Inputs:
        #       parent                  :   parent QtCanvas
        #
        #   Options:
        #       width (int)             :   width of plot
        #       height(int)             :   height of plot
        #       dpi (int)               :   resolution of plot
        #       xlim (tuple)            :   (x0,x1)
        #       ylim (tuple)            :   (x0,x1)

        self._width = 5
        self._height = 5
        self._dpi = 100

        if "width" in options:
            self._width = options.get("width")
        if "height" in options:
            self._height = options.get("height")
        if "dpi" in options:
            self._dpi = options.get("dpi")
        if "filename" in options:
            self.filename = options.get("filename")
            self.audio = es.MonoLoader(filename=self.filename,
                                       sampleRate=44100)()
        else:
            self.filename = None
            self.audio = None

        self.is_playing = False

        self.play_rate = 44100  # use keys 0 to 9 to reduce speed from 100 to 90%
        # Set the x and y limits of the window
        self.xlim = (0, 20)
        self.ylim = (0, 20)

        # create figure and axes for plotting
        self.fig = Figure(figsize=(self._width, self._height), dpi=self._dpi)
        self.ax = self.fig.add_subplot(111)

        # Figure Canvas initialization
        FigureCanvas.__init__(self, self.fig)
        self.setParent(parent)
        self.setFocusPolicy(Qt.ClickFocus)
        self.setFocus()

        # initialize variables used for calculation of spectrogram

        #initialize figure
        FigureCanvas.setSizePolicy(self, QtWidgets.QSizePolicy.Expanding,
                                   QtWidgets.QSizePolicy.Expanding)
        FigureCanvas.updateGeometry(self)

        self.key_pressed_cid = self.fig.canvas.mpl_connect(
            'key_press_event', self.on_key_press)

        self.show()
Example No. 27
def pYINPtNote(filename1, fs=44100, frameSize=2048, hopSize=256):

    '''
    Given filename, return pitchtrack and note transcription track
    :param filename1:
    :param fs:
    :param frameSize:
    :param hopSize:
    :return:
    '''
    # initialise
    pYinInst = pYINmain.PyinMain()
    pYinInst.initialise(channels=1, inputSampleRate=fs, stepSize=hopSize, blockSize=frameSize,
                        lowAmp=0.25, onsetSensitivity=0.7, pruneThresh=0.1)

    # frame-wise calculation
    audio = ess.MonoLoader(filename=filename1, sampleRate=fs)()

    # rms mean
    # rms = []
    # for frame in ess.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize):
    #     rms.append(RMS(frame, frameSize))
    # rmsMean = np.mean(rms)
    # print 'rmsMean', rmsMean

    for frame in ess.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize):
        featureSet = pYinInst.process(frame)

    # calculate smoothed pitch and mono note
    monoPitch = pYinInst.getSmoothedPitchTrack()

    # output smoothed pitch track
    print('pitch track')
    for ii in featureSet.m_oSmoothedPitchTrack:
        print(ii.values)
    print('\n')

    featureSet = pYinInst.getRemainingFeatures(monoPitch)

    # output of mono notes,
    # column 0: frame number,
    # column 1: pitch in MIDI number, this is the decoded pitch
    # column 2: attack 1, stable 2, silence 3
    print('mono note decoded pitch')
    for ii in featureSet.m_oMonoNoteOut:
        print(ii.frameNumber, ii.pitch, ii.noteState)
    print('\n')

    print('note pitch tracks')
    for ii in featureSet.m_oNotePitchTracks:
        print(ii)
    print('\n')

    # median pitch in Hz of the notes
    print('median note pitch')
    for ii in featureSet.m_oNotes:
        print(ii.values)
    print('\n')
Example No. 28
def compute_all_features(audio_file, audio_beats=False):
    """Computes all the features for a specific audio file and its respective
        human annotations.

    Returns
    -------
    features : dict
        Dictionary with the following features:
            mfcc : np.array
                Mel Frequency Cepstral Coefficients representation
            hpcp : np.array
                Harmonic Pitch Class Profiles
            tonnetz : np.array
                Tonal Centroids (or Tonnetz)
    """

    # Makes sure the output features folder exists
    utils.ensure_dir(OUTPUT_FEATURES)
    features_file = os.path.join(OUTPUT_FEATURES,
                                 os.path.basename(audio_file) + ".json")

    # If already precomputed, read and return
    if os.path.exists(features_file):
        with open(features_file, "r") as f:
            features = json.load(f)
        return list_to_array(features)

    # Load Audio
    logging.info("Loading audio file %s" % os.path.basename(audio_file))
    audio = ES.MonoLoader(filename=audio_file, sampleRate=SAMPLE_RATE)()
    duration = len(audio) / float(SAMPLE_RATE)

    # Estimate Beats
    features = {}
    ticks, conf = compute_beats(audio)
    ticks = np.concatenate(([0], ticks, [duration]))  # Add first and last time
    ticks = essentia.array(np.unique(ticks))
    features["beats"] = ticks.tolist()

    # Compute Beat-sync features
    features["mfcc"], features["hpcp"], features["tonnetz"] = \
        compute_beatsync_features(ticks, audio)

    # Save output as audio file
    if audio_beats:
        logging.info("Saving Beats as an audio file")
        marker = ES.AudioOnsetsMarker(onsets=ticks, type='beep',
                                      sampleRate=SAMPLE_RATE)
        marked_audio = marker(audio)
        ES.MonoWriter(filename='beats.wav',
                      sampleRate=SAMPLE_RATE)(marked_audio)

    # Save features
    with open(features_file, "w") as f:
        json.dump(features, f)

    return list_to_array(features)
Example No. 29
 def read_audio(self, audio_file):
     self.set_audio_file(audio_file)
     if self.normalize_gain:
         self.audio_vector = estd.EasyLoader(filename=audio_file,
                                             sampleRate=self.fs,
                                             replayGain=-9)()
     elif self.mono:
         self.audio_vector = estd.MonoLoader(filename=audio_file,
                                             sampleRate=self.fs)()
Example No. 30
def piano_timing_features(anno_file,
                          audio_file,
                          latency,
                          bpm,
                          max_spectral_centroid=3500,
                          onset_threshold=2,
                          series_delta=0.22,
                          sample_rate=44100):
    bars, beats, events, chords = symbolic_analysis.rhythm_for_file(anno_file)
    beats = np.array(beats)
    events = np.array(events)

    is_defined = [x[0] != 'N' for x in chords]
    chords = chords[is_defined]
    events = events[is_defined]

    # LOAD AUDIO
    audio = ess.MonoLoader(filename=audio_file)()
    duration = float(len(audio)) / sample_rate
    half_ibi = (beats[1:] - beats[:-1]).mean() / 2
    start = max(events[0] - half_ibi, 0)
    end = min(events[-1] + half_ibi, duration)

    # LOAD BEATS FROM AUDIO
    onset_func = ess.OnsetDetectionGlobal()(audio)

    # CHANGE SILENCE THRESHOLD DEPENDING ON THE BPM
    silence_th = 0.2
    if 40 <= bpm < 50:
        silence_th = 0.2
    elif 50 <= bpm < 60:
        silence_th = 0.15
    elif 60 <= bpm < 70:
        silence_th = 0.1
    elif 70 <= bpm < 80:
        silence_th = 0.05
    elif bpm >= 80:
        silence_th = 0.02

    # COMPUTE ONSETS FROM AUDIO
    onsets = np.array(
        list(
            ess.Onsets(alpha=1, silenceThreshold=silence_th)([onset_func],
                                                             [1])))

    # COMPUTE DEVIATIONS BETWEEN ANNOTATION AND COMPUTED ONSETS
    devs = feature_extraction.attack_deviations(events, onsets, start, end)
    f, p, r = onset_measures(events, onsets, f_measure_threshold=0.25)

    features = {
        'onsets': onsets,
        'devs': devs,
        'f_measure': f,
        'precision': p,
        'recall': r
    }
    return features