def computeDutationSongs(root_dir, FileExt2Proc='.mp3'):

    audiofilenames = GetFileNamesInDir(root_dir, FileExt2Proc)

    totalLen = 0
    length = []
    mbid_dur = {}
    for audiofile in audiofilenames:
        if FileExt2Proc == '.mp3':
            audio = MP3(audiofile)
            totalLen += audio.info.length
            length.append(audio.info.length)
            mbid = fetchMBID(audiofile)
            if mbid not in mbid_dur:
                mbid_dur[mbid] = audio.info.length
        elif FileExt2Proc == '.wav':
            dur = ES.MetadataReader(filename=audiofile)()[7]
            totalLen += dur
            length.append(dur)
    print "total files %d\n" % len(audiofilenames)
    print "Total length %d\n" % totalLen
    print "Max length %d\n" % np.max(length)
    print "Min length %d\n" % np.min(length)
    print "Mean length %d\n" % np.mean(length)
    print "median length %d\n" % np.median(length)

    return mbid_dur
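
# A minimal usage sketch for computeDutationSongs. The directory path below is purely
# illustrative, and it assumes the same module-level imports the function above relies on
# (mutagen's MP3, numpy as np, and the GetFileNamesInDir / fetchMBID helpers).
def _exampleComputeDurations(root_dir='/path/to/mp3/collection'):
    #returns a dict mapping MBID -> track duration in seconds and prints summary statistics
    mbid_dur = computeDutationSongs(root_dir, FileExt2Proc='.mp3')
    return mbid_dur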
def generateBinaryAggMFCCARFF(class1Folder, class2Folder, class1, class2, arffFile, frameDur, hopDur, aggDur):
    """
    This function generates an arff file of MFCC features for two classes class1 and class2 for which MFCCs are extracted from the audio files kept in appropriate folders (first two args)
    mappFile store audiofile names and number of features extracted from that file
    """
    fname,ext = os.path.splitext(arffFile)
    mappFile = fname + '.mappFileFeat'
    
    #features extracted and features to use
    features = featuresExtracted()
    features2Use = featuresUsed()
    
    #array of class labels
    classes = [class1, class2]
    
    #finding index of features to be used in classification
    ind_features = []
    for feat in features2Use:
        ind_features.append(features.index(feat))

    #selecting those features
    features = np.array(features)
    features = features[ind_features]
    features = features.tolist()
    
    #writing header for arff file
    fid = open(arffFile,'w')
    
    fid.write("@relation 'ToWeka_sectionSegmentation'\n")
    for feature in features:
        fid.write("@attribute %s numeric\n"%feature)
    fid.write("@attribute class  {")
    for clas in classes:
        fid.write("%s,\t"%clas)
    fid.write("}\n")
    fid.write("@data\n")
    fid.close()
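    # For illustration, with hypothetical class labels class1='vocal', class2='percussion' and
    # MFCC attribute names, the header written above would look roughly like:
    #
    #   @relation 'ToWeka_sectionSegmentation'
    #   @attribute MFCC1 numeric
    #   ...
    #   @attribute class {vocal,percussion}
    #   @data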

    #create (or empty) the mapp file
    fidMapp = open(mappFile,'w')
    fidMapp.close()
    
    
    #loading the audio file into an array
    ML = es.MonoLoader()
    
    
    #start extracting features and writing them out (same procedure for both classes)
    for classFolder, classLabel in [(class1Folder, classes[0]), (class2Folder, classes[1])]:
        audiofiles = BP.GetFileNamesInDir(classFolder, 'wav')
        for audiofile in audiofiles:
            fid = open(arffFile, 'a')
            fidMapp = open(mappFile, 'a')
            print audiofile
            #computing fs, framesize, hopsize and aggLen dynamically from the provided durations
            fs = float(es.MetadataReader(filename=audiofile)()[9])
            framesize = int(np.round(fs*frameDur))
            if framesize % 2 == 1:
                framesize = framesize + 1
            hopsize = int(np.round(fs*hopDur))
            aggLen = int(np.round(aggDur*fs/hopsize))

            ML.configure(filename=audiofile)
            audio_in = ML()
            featuresAll = feature_extractor_standard(audio_in, framesize, hopsize, aggLen)
            featuresAll = featuresAll[:, ind_features]
            for ftr in featuresAll:
                fid.write(("%f," * len(features)) % tuple(ftr))
                fid.write("%s\n" % classLabel)
            fidMapp.write("%s\t%d\n" % (audiofile, featuresAll.shape[0]))
            del featuresAll
            fid.close()
            fidMapp.close()
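
# A minimal usage sketch for generateBinaryAggMFCCARFF. All paths, class labels and the
# frame/hop/aggregation durations (in seconds) below are hypothetical values, not ones
# prescribed by the code above.
def _exampleGenerateARFF():
    generateBinaryAggMFCCARFF('/path/to/vocalFolder', '/path/to/percussionFolder',
                              'vocal', 'percussion', '/path/to/train.arff',
                              frameDur=0.0464, hopDur=0.01, aggDur=1.0)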
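
# compute_features below relies on a pool_to_array helper that is not included in this file.
# A minimal sketch of what such a helper could look like (an assumption, not necessarily the
# original implementation): flatten an essentia Pool into a plain list of descriptor values.
# Note that the DataFrame column labels assigned at the end of compute_features presuppose a
# specific value ordering, which the original helper presumably guarantees (e.g. by listing
# the descriptor names explicitly).
def pool_to_array(pool, descriptor_names=None):
    #descriptor_names can be passed explicitly to pin the output order; otherwise fall back
    #to whatever order the Pool reports via descriptorNames()
    if descriptor_names is None:
        descriptor_names = pool.descriptorNames()
    return [pool[name] for name in descriptor_names]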
def compute_features(complete_path):
    result = []
    meta_result = []
    file_count = 0
    # for loop over files
    for file in os.listdir(complete_path):
        if file.endswith(".wav"):
            file_count+=1
            # print(file +' : ' + str(file_count))

            # load our audio into an array
            audio = es.MonoLoader(filename=os.path.join(complete_path, file), sampleRate=44100)()

            # create the pool and the necessary algorithms
            pool = essentia.Pool()
            window = es.Windowing()
            energy = es.Energy()
            spectrum = es.Spectrum()
            centroid = es.Centroid(range=22050)
            rolloff = es.RollOff()
            crest = es.Crest()
            speak = es.StrongPeak()
            rmse = es.RMS()
            mfcc = es.MFCC()
            flux = es.Flux()
            barkbands = es.BarkBands( sampleRate = 44100)
            zerocrossingrate = es.ZeroCrossingRate()

            meta = es.MetadataReader(filename=os.path.join(complete_path, file), failOnError=True)()
            pool_meta, duration, bitrate, samplerate, channels = meta[7:]
            
            # centralmoments = es.SpectralCentralMoments()
            # distributionshape = es.DistributionShape()

            # compute the frame-wise spectral descriptors for all frames in our audio and add them to the pool
            for frame in es.FrameGenerator(audio, frameSize = 1024, hopSize = 512):
                frame_windowed = window(frame)
                frame_spectrum = spectrum(frame_windowed)
                
                c = centroid(frame_spectrum)
                pool.add('spectral.centroid', c)

                cr = crest(frame_spectrum)
                pool.add('spectral crest', cr)

                r = rolloff(frame_spectrum)
                pool.add('spectral rolloff', r)

                sp = speak(frame_spectrum)
                pool.add('strong peak', sp)

                rms = rmse(frame_spectrum)
                pool.add('RMS', rms)

                pool.add('spectral_energy', energy(frame_spectrum))
                # (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
                # pool.add('frame_MFCC', frame_mfcc)

                fl = flux(frame_spectrum)
                pool.add('spectral flux', fl)

                # bbands = barkbands(frame_spectrum)
                # pool.add('bark bands', bbands)

                zcr = zerocrossingrate(frame_spectrum)
                pool.add('zero crossing rate', zcr)

                # frame_centralmoments = centralmoments(power_spectrum)
                # (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
                # pool.add('spectral_kurtosis', frame_kurtosis)
                # pool.add('spectral_spread', frame_spread)
                # pool.add('spectral_skewness', frame_skewness)

            # aggregate the results (mean over all frames; add 'stdev' to defaultStats if needed)
            aggrpool = es.PoolAggregator(defaultStats = ['mean'])(pool)
            
            pool_meta.set("duration", duration)
            pool_meta.set("filename", os.path.relpath(file))

            # write pools to lists
            pool_arr = pool_to_array(aggrpool)
            result.append(pool_arr)

            meta_arr = pool_to_array(pool_meta)
            meta_result.append(meta_arr)
         
    features_df = pd.DataFrame.from_records(result)
    features_df.columns = ['centroid', 'crest','roll off','strong peak','rms','energy','flux','zcr']
    
    meta_df = pd.DataFrame.from_records(meta_result)
    meta_df.columns = ['duration','filename','metadata.tags.comment']
    del meta_df['metadata.tags.comment']

    return features_df,meta_df
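
# A minimal usage sketch for compute_features (the folder path is hypothetical and should
# point to a directory containing .wav files).
def _exampleComputeFeatures(wav_dir='/path/to/wav/folder/'):
    features_df, meta_df = compute_features(wav_dir)
    return features_df, meta_df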
def extractSoloPercussion(audiofile, segFile, modelFile, normFile, frameDur, hopDur, aggDur, medianDur=20):
    
    #exactly the same set of features that were used while training the model
    features = featuresExtracted()
    features2Use = featuresUsed()
    
    # indexes of the chosen features
    ind_features = []
    for feat in features2Use:
        ind_features.append(features.index(feat))
    
    
    #computing fs, framesize, hopsize and aggLen dynamically from the provided durations
    fs = float(es.MetadataReader(filename=audiofile)()[10])
    print "Sampling rate is %d" % fs
    if fs != 44100:
        print "Warning: %s does not have a sampling rate of 44100 Hz" % audiofile
    framesize = int(np.round(fs*frameDur))
    if framesize % 2 == 1:
        framesize = framesize + 1
    hopsize = int(np.round(fs*hopDur))
    aggLen = int(np.round(aggDur*fs/hopsize))
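    # For example, with fs = 44100 Hz and illustrative durations frameDur = 0.0464 s,
    # hopDur = 0.01 s and aggDur = 1 s, this gives framesize = 2046 samples, hopsize = 441
    # samples and aggLen = 100 hops per aggregated feature vector.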
    
    #loading the audio file into an array
    ML = es.MonoLoader()
    ML.configure(filename = audiofile)
    audio_in = ML()
    #computing features
    featuresAll = feature_extractor_standard(audio_in, framesize, hopsize, aggLen)
    features = featuresAll[:,ind_features]
    
    #normalization step, read the values used to normalize features while building the model
    fid = open(normFile, 'r')
    normVals = yaml.load(fid)
    fid.close()
    for ii in np.arange(features.shape[1]):
            features[:,ii] = features[:,ii]-normVals[ii]['mean']
            features[:,ii] = features[:,ii]/normVals[ii]['var']
            
    #initializing object to predict classes using the built model
    perc = 0 #= solo percussion
    nperc = 1 #= all except solo percussion
    objML = mlw.experimenter()
    prediction = objML.predicByModel(modelFile, features)
    
    ### CRUCIAL STEP
    # A few frames within vocal sections get labelled as tani (because of a strong (or solo) mridangam part,
    # misclassification or some other factor); these are typically only 1 or 2 aggLen long.
    # As we don't want to lose any vocal segment, we apply a median filter of roughly 20 seconds here,
    # which means only runs of the same label longer than about 10 seconds keep that label.
    median_length = int(np.round(medianDur/aggDur))
    prediction = filters.median_filter(prediction,size= median_length)
    prediction = filters.median_filter(prediction,size= median_length)
    

    #array in which we store start and ending of every solo percussion segment
    perc_sec = []
    
    #flag: set to 1 while we are inside a solo percussion segment
    perStr = 0
    for ii,val in enumerate(prediction):
        if val == perc and perStr == 0:
            perStr = 1
            strInd=ii
        if val == nperc and perStr ==1:
            perStr = 0
            perc_sec.append([strInd, ii])
    if perStr == 1:
        perc_sec.append([strInd, prediction.shape[0]])
    
    #converting the array to float values
    perc_sec = np.array(perc_sec).astype(float)
    #converting from indices to time stamps (seconds)
    perc_sec = perc_sec*float(aggLen)*float(hopsize)/float(fs)
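    # For example, with fs = 44100 Hz, hopsize = 441 (hopDur = 0.01 s) and aggLen = 100
    # (aggDur = 1 s), an index of 50 maps to 50*100*441/44100 = 50 seconds (these parameter
    # values are illustrative only).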
    
    for sec in perc_sec:
        if (sec[1]-sec[0]) < 120 or sec[1] < 0.5*float(prediction.shape[0])*float(aggLen)*float(hopsize)/float(fs):
            print "WEIRD AUDIO FILE IS HERE: %s" % audiofile
            
    np.savetxt(segFile, perc_sec,  fmt='%.3f',)
    
    return 1
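
# A minimal usage sketch for extractSoloPercussion. All paths and the frame/hop/aggregation
# durations below are hypothetical; modelFile and normFile must follow the formats expected
# by mlw.experimenter and the normalization step above.
def _exampleExtractSoloPercussion():
    return extractSoloPercussion('/path/to/concert.wav', '/path/to/concert.tani.txt',
                                 '/path/to/model', '/path/to/normVals.yaml',
                                 frameDur=0.0464, hopDur=0.01, aggDur=1.0, medianDur=20)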
def plotPitch(audio_file, pitch_file, output_file, time_start, time_end):
    """
	Example:
	PredominantPitchExample: plt.plotPitch('/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/a99e07d5-20a0-467b-8dcd-aa5a095177fd/Rashid_Khan/Evergreen/Raga_Lalit_783aa4b0-26f3-4e18-844c-b787be6d9849.mp3', '/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/a99e07d5-20a0-467b-8dcd-aa5a095177fd/Rashid_Khan/Evergreen/Raga_Lalit_783aa4b0-26f3-4e18-844c-b787be6d9849.pitch', '/home/sankalp/Work/Work_PhD/publications/2016_PhDThesis/plotUtils/ch05_preProcessing/predominantMelodyExample.png', 22*60 + 15,  22*60 + 45)
	octaveErrorIllustration1: plt.plotPitch('/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/a99e07d5-20a0-467b-8dcd-aa5a095177fd/Rashid_Khan/Evergreen/Raga_Lalit_783aa4b0-26f3-4e18-844c-b787be6d9849.mp3', '/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/a99e07d5-20a0-467b-8dcd-aa5a095177fd/Rashid_Khan/Evergreen/Raga_Lalit_783aa4b0-26f3-4e18-844c-b787be6d9849.pitch', '/home/sankalp/Work/Work_PhD/publications/2016_PhDThesis/plotUtils/ch05_preProcessing/octaveErrorIllustration.png', 22*60 + 20,  22*60 + 25)
	octaveErrorIllustration2:plt.plotPitch('/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/46997b02-f09c-4969-8138-4e1861f61967/Kaustuv_Kanti_Ganguli/Raag_Shree/Raag_Shree_928a430e-813e-48b0-8a23-566e74aa8dc9.mp3', '/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/46997b02-f09c-4969-8138-4e1861f61967/Kaustuv_Kanti_Ganguli/Raag_Shree/Raag_Shree_928a430e-813e-48b0-8a23-566e74aa8dc9.pitch', '/home/sankalp/Work/Work_PhD/publications/2016_PhDThesis/plotUtils/ch05_preProcessing/octaveErrorIllustration.png', 59*60 + 47,  59*60 + 50)
	octaveErrorIllustration3: plt.plotPitch('/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/64e5fb9e-5569-4e80-8e6c-f543af9469c7/Prabha_Atre/Maalkauns/Jaako_Mana_Raam_980b4a00-6e7c-41c1-81ee-6b021d237343.mp3', '/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/64e5fb9e-5569-4e80-8e6c-f543af9469c7/Prabha_Atre/Maalkauns/Jaako_Mana_Raam_980b4a00-6e7c-41c1-81ee-6b021d237343.pitch', '/home/sankalp/Work/Work_PhD/publications/2016_PhDThesis/plotUtils/ch05_preProcessing/octaveErrorIllustration.png', 25*60 + 9,  25*60 + 11)
    """
    frameSize = 4096
    hopSize = 512
    NFFT = 4096
    w = np.hamming(frameSize)

    audio = ess.MonoLoader(filename=audio_file)()
    sampleRate = float(ess.MetadataReader(filename=audio_file)()[10])

    time_pitch = np.loadtxt(pitch_file)

    sample_start = int(sampleRate * time_start)
    sample_end = int(sampleRate * time_end)

    audio = audio[sample_start:sample_end]

    ind_start = np.argmin(abs(time_pitch[:, 0] - time_start))
    ind_end = np.argmin(abs(time_pitch[:, 0] - time_end))

    pitch = copy.deepcopy(time_pitch[ind_start:ind_end, 1])
    time = copy.deepcopy(time_pitch[ind_start:ind_end,
                                    0]) - time_pitch[ind_start, 0]

    mX, pX = STFT.stftAnal(audio, w, NFFT, hopSize)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    fsize = 14
    fsize2 = 14
    #font="Times New Roman"

    maxplotfreq = 1000.0
    numFrames = int(mX[:, 0].size)
    maxplotbin = int(NFFT * maxplotfreq / sampleRate)
    frmTime = hopSize * np.arange(numFrames) / float(sampleRate)
    binFreq = sampleRate * np.arange(maxplotbin + 1) / NFFT
    plt.pcolormesh(frmTime, binFreq,
                   np.transpose(mX[:, :maxplotbin + 1]))
    p, = plt.plot(time, pitch, color='k')

    xLim = ax.get_xlim()
    yLim = ax.get_ylim()
    ax.set_aspect((xLim[1] - xLim[0]) / (2 * float(yLim[1] - yLim[0])))
    plt.autoscale(tight=True)

    plt.legend([p], ['Predominant pitch'])
    plt.xlabel("Time (s)", fontsize=fsize)
    plt.ylabel("Frequency (Hz)", fontsize=fsize)

    plt.tight_layout()
    plt.savefig(output_file, dpi=600)