def computeDutationSongs(root_dir, FileExt2Proc='.mp3'):
    """
    Computes duration statistics (total, max, min, mean, median) for all audio files with extension
    FileExt2Proc found under root_dir and returns a dictionary mapping MBID -> duration
    (MBIDs are only fetched for mp3 files).
    """
    audiofilenames = GetFileNamesInDir(root_dir, FileExt2Proc)
    totalLen = 0
    length = []
    mbid_dur = {}
    for audiofile in audiofilenames:
        if FileExt2Proc == '.mp3':
            audio = MP3(audiofile)
            totalLen += audio.info.length
            length.append(audio.info.length)
            mbid = fetchMBID(audiofile)
            if mbid not in mbid_dur:
                mbid_dur[mbid] = audio.info.length
        elif FileExt2Proc == '.wav':
            # duration read from essentia's MetadataReader (the index assumes its output ordering)
            dur = ES.MetadataReader(filename=audiofile)()[7]
            totalLen += dur
            length.append(dur)
    print "total files %d\n" % len(audiofilenames)
    print "Total length %d\n" % totalLen
    print "Max length %d\n" % np.max(length)
    print "Min length %d\n" % np.min(length)
    print "Mean length %d\n" % np.mean(length)
    print "Median length %d\n" % np.median(length)
    return mbid_dur
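
# Hypothetical usage sketch (the path below is an assumption, not a value from this repository);
# GetFileNamesInDir, MP3 (mutagen) and fetchMBID must be importable at module level:
#
#   mbid_dur = computeDutationSongs('/path/to/audio_collection', FileExt2Proc='.mp3')
#   print "unique MBIDs: %d" % len(mbid_dur)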
def generateBinaryAggMFCCARFF(class1Folder, class2Folder, class1, class2, arffFile, frameDur, hopDur, aggDur):
    """
    Generates an arff file of MFCC features for two classes, class1 and class2, whose features are
    extracted from the audio files kept in the corresponding folders (first two arguments).
    mappFile stores, for each audio file, its name and the number of feature vectors extracted from it.
    """
    fname, ext = os.path.splitext(arffFile)
    mappFile = fname + '.mappFileFeat'

    # features extracted and features to use
    features = featuresExtracted()
    features2Use = featuresUsed()

    # array of class labels
    classes = [class1, class2]

    # finding the indexes of the features to be used in classification
    ind_features = []
    for feat in features2Use:
        ind_features.append(features.index(feat))

    # selecting those features
    features = np.array(features)[ind_features].tolist()

    # writing the header of the arff file
    fid = open(arffFile, 'w')
    fid.write("@relation 'ToWeka_sectionSegmentation'\n")
    for feature in features:
        fid.write("@attribute %s numeric\n" % feature)
    fid.write("@attribute class {%s}\n" % ", ".join(classes))
    fid.write("@data\n")
    fid.close()

    # truncate/create the mapping file
    fidMapp = open(mappFile, 'w')
    fidMapp.close()

    # loader used to read each audio file into an array
    ML = es.MonoLoader()

    # extract features for both classes and append them to the arff file
    for classFolder, classLabel in zip([class1Folder, class2Folder], classes):
        audiofiles = BP.GetFileNamesInDir(classFolder, 'wav')
        for audiofile in audiofiles:
            fid = open(arffFile, 'a')
            fidMapp = open(mappFile, 'a')
            print audiofile

            # computing fs, framesize, hopsize and aggLen dynamically from the provided durations
            # (the MetadataReader output index assumes a fixed output ordering of essentia's MetadataReader)
            fs = float(es.MetadataReader(filename=audiofile)()[9])
            framesize = int(np.round(fs * frameDur))
            if framesize % 2 == 1:
                framesize = framesize + 1
            hopsize = int(np.round(fs * hopDur))
            aggLen = int(np.round(aggDur * fs / hopsize))

            ML.configure(filename=audiofile)
            audio_in = ML()
            featuresAll = feature_extractor_standard(audio_in, framesize, hopsize, aggLen)
            featuresAll = featuresAll[:, ind_features]

            for ftr in featuresAll:
                fid.write("%f," * len(features) % tuple(ftr))
                fid.write("%s\n" % classLabel)
            fidMapp.write("%s\t%d\n" % (audiofile, featuresAll.shape[0]))
            del featuresAll
            fid.close()
            fidMapp.close()
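
# Hypothetical usage sketch (folder paths, class labels and durations below are assumptions;
# featuresExtracted, featuresUsed, feature_extractor_standard and BP.GetFileNamesInDir are
# expected to come from this codebase):
#
#   generateBinaryAggMFCCARFF('/path/to/percussion_wavs', '/path/to/vocal_wavs',
#                             'perc', 'nperc', 'percVsRest.arff',
#                             frameDur=0.046, hopDur=0.01, aggDur=1.0)
#
# This writes 'percVsRest.arff' (Weka format) plus 'percVsRest.mappFileFeat' containing one
# "<audiofile>\t<number of aggregated feature vectors>" line per processed file.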
def compute_features(complete_path):
    """
    Computes frame-wise spectral features (aggregated by their mean) and basic metadata for every
    .wav file in complete_path and returns two dataframes: one with features, one with metadata.
    """
    result = []
    meta_result = []
    file_count = 0

    # loop over all the .wav files in the given directory
    for file in os.listdir(complete_path):
        if file.endswith(".wav"):
            file_count += 1
            # print(file + ' : ' + str(file_count))

            # load our audio into an array
            audio = es.MonoLoader(filename=os.path.join(complete_path, file), sampleRate=44100)()

            # create the pool and the necessary algorithms
            pool = essentia.Pool()
            window = es.Windowing()
            energy = es.Energy()
            spectrum = es.Spectrum()
            centroid = es.Centroid(range=22050)
            rolloff = es.RollOff()
            crest = es.Crest()
            speak = es.StrongPeak()
            rmse = es.RMS()
            mfcc = es.MFCC()
            flux = es.Flux()
            barkbands = es.BarkBands(sampleRate=44100)
            zerocrossingrate = es.ZeroCrossingRate()

            meta = es.MetadataReader(filename=os.path.join(complete_path, file), failOnError=True)()
            pool_meta, duration, bitrate, samplerate, channels = meta[7:]

            # centralmoments = es.SpectralCentralMoments()
            # distributionshape = es.DistributionShape()

            # compute the features for all frames in our audio and add them to the pool
            for frame in es.FrameGenerator(audio, frameSize=1024, hopSize=512):
                frame_windowed = window(frame)
                frame_spectrum = spectrum(frame_windowed)

                pool.add('spectral.centroid', centroid(frame_spectrum))
                pool.add('spectral crest', crest(frame_spectrum))
                pool.add('spectral rolloff', rolloff(frame_spectrum))
                pool.add('strong peak', speak(frame_spectrum))
                pool.add('RMS', rmse(frame_spectrum))
                pool.add('spectral_energy', energy(frame_spectrum))
                # (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
                # pool.add('frame_MFCC', frame_mfcc)
                pool.add('spectral flux', flux(frame_spectrum))
                # bbands = barkbands(frame_spectrum)
                # pool.add('bark bands', bbands)
                pool.add('zero crossing rate', zerocrossingrate(frame_spectrum))  # computed on the magnitude spectrum

                # frame_centralmoments = centralmoments(power_spectrum)
                # (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
                # pool.add('spectral_kurtosis', frame_kurtosis)
                # pool.add('spectral_spread', frame_spread)
                # pool.add('spectral_skewness', frame_skewness)

            # aggregate the results (mean over all frames)
            aggrpool = es.PoolAggregator(defaultStats=['mean'])(pool)  # ,'stdev' ])(pool)

            pool_meta.set("duration", duration)
            pool_meta.set("filename", os.path.relpath(file))

            # write pools to lists
            result.append(pool_to_array(aggrpool))
            meta_result.append(pool_to_array(pool_meta))

    features_df = pd.DataFrame.from_records(result)
    features_df.columns = ['centroid', 'crest', 'roll off', 'strong peak', 'rms', 'energy', 'flux', 'zcr']

    meta_df = pd.DataFrame.from_records(meta_result)
    meta_df.columns = ['duration', 'filename', 'metadata.tags.comment']
    # the comment tag is not needed downstream
    del meta_df['metadata.tags.comment']

    return features_df, meta_df
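
# Hypothetical usage sketch (the directory path is an assumption; pool_to_array is expected to be
# defined elsewhere in this codebase):
#
#   features_df, meta_df = compute_features('/path/to/wav_folder')
#   print(features_df.describe())
#   features_df.to_csv('features.csv', index=False)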
def extractSoloPercussion(audiofile, segFile, modelFile, normFile, frameDur, hopDur, aggDur, medianDur=20):
    """
    Detects solo percussion (tani) segments in audiofile using a pre-trained model (modelFile) and
    writes the start/end time stamps (seconds) of those segments to segFile.
    """
    # exactly the same set of features used while training the model
    features = featuresExtracted()
    features2Use = featuresUsed()

    # indexes of the chosen features
    ind_features = []
    for feat in features2Use:
        ind_features.append(features.index(feat))

    # computing fs, framesize, hopsize and aggLen dynamically from the provided durations
    # (the MetadataReader output index assumes a fixed output ordering of essentia's MetadataReader)
    fs = float(es.MetadataReader(filename=audiofile)()[10])
    print "Sampling rate is %d\n" % fs
    if fs != 44100:
        print "Hey, here is a file which doesn't have 44100 as fs"
    framesize = int(np.round(fs * frameDur))
    if framesize % 2 == 1:
        framesize = framesize + 1
    hopsize = int(np.round(fs * hopDur))
    aggLen = int(np.round(aggDur * fs / hopsize))

    # loading the audio file into an array
    ML = es.MonoLoader()
    ML.configure(filename=audiofile)
    audio_in = ML()

    # computing features
    featuresAll = feature_extractor_standard(audio_in, framesize, hopsize, aggLen)
    features = featuresAll[:, ind_features]

    # normalization step: read the values that were used to normalize the features while building the model
    fid = open(normFile, 'r')
    normVals = yaml.load(fid)
    fid.close()
    for ii in np.arange(features.shape[1]):
        features[:, ii] = features[:, ii] - normVals[ii]['mean']
        features[:, ii] = features[:, ii] / normVals[ii]['var']

    # predict classes using the built model
    perc = 0   # solo percussion
    nperc = 1  # everything except solo percussion
    objML = mlw.experimenter()
    prediction = objML.predicByModel(modelFile, features)

    ### CRUCIAL STEP
    # A few frames inside vocal sections get labelled as tani (because of a strong or solo mridangam
    # part, misclassification or some other factor); these are generally only 1 or 2 aggLen frames long.
    # As we don't want to lose any vocal segment, we median filter the predictions with a window of
    # roughly medianDur seconds (20 s by default), so only contiguous segments longer than about half
    # that duration keep their label.
    median_length = int(np.round(medianDur / aggDur))
    prediction = filters.median_filter(prediction, size=median_length)
    prediction = filters.median_filter(prediction, size=median_length)

    # array in which we store the start and end index of every solo percussion segment
    perc_sec = []
    perStr = 0  # flag: currently inside a percussion segment
    for ii, val in enumerate(prediction):
        if val == perc and perStr == 0:
            perStr = 1
            strInd = ii
        if val == nperc and perStr == 1:
            perStr = 0
            perc_sec.append([strInd, ii])
    if perStr == 1:
        perc_sec.append([strInd, prediction.shape[0]])

    # converting indexes to time stamps (seconds)
    perc_sec = np.array(perc_sec).astype(float)
    perc_sec = perc_sec * float(aggLen) * float(hopsize) / float(fs)

    # sanity check: a tani segment shorter than 2 minutes, or one that ends before the midpoint of
    # the recording, is flagged as suspicious
    for sec in perc_sec:
        if (sec[1] - sec[0]) < 120 or sec[1] < 0.5 * float(prediction.shape[0]) * float(aggLen) * float(hopsize) / float(fs):
            print "WEIRD AUDIO FILE IS HERE: %s" % audiofile

    np.savetxt(segFile, perc_sec, fmt='%.3f')
    return 1
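
# A minimal, self-contained sketch (not part of the original pipeline) illustrating the median
# filtering / label-smoothing idea used in extractSoloPercussion on a synthetic 0/1 prediction
# array; the aggDur and medianDur values below are assumptions chosen only for illustration, and
# scipy.ndimage.median_filter corresponds to the filters.median_filter call used above.
def _medianFilterSketch():
    import numpy as np
    from scipy.ndimage import median_filter

    aggDur, medianDur = 1.0, 20
    # 100 'vocal' frames, a 3-frame spurious 'tani' blip, 100 more 'vocal' frames, a real 200-frame tani
    prediction = np.array([1] * 100 + [0] * 3 + [1] * 100 + [0] * 200)
    median_length = int(np.round(medianDur / aggDur))
    smoothed = median_filter(prediction, size=median_length)
    # the 3-frame blip is removed while the long tani segment survives (its boundaries shift slightly)
    return np.where(smoothed == 0)[0][[0, -1]]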
def plotPitch(audio_file, pitch_file, output_file, time_start, time_end):
    """
    Plots the predominant pitch contour stored in pitch_file on top of the spectrogram of
    audio_file, between time_start and time_end (seconds), and saves the figure to output_file.

    Examples:

    PredominantPitchExample:
        plt.plotPitch('/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/a99e07d5-20a0-467b-8dcd-aa5a095177fd/Rashid_Khan/Evergreen/Raga_Lalit_783aa4b0-26f3-4e18-844c-b787be6d9849.mp3',
                      '/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/a99e07d5-20a0-467b-8dcd-aa5a095177fd/Rashid_Khan/Evergreen/Raga_Lalit_783aa4b0-26f3-4e18-844c-b787be6d9849.pitch',
                      '/home/sankalp/Work/Work_PhD/publications/2016_PhDThesis/plotUtils/ch05_preProcessing/predominantMelodyExample.png',
                      22*60 + 15, 22*60 + 45)

    octaveErrorIllustration1:
        plt.plotPitch('/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/a99e07d5-20a0-467b-8dcd-aa5a095177fd/Rashid_Khan/Evergreen/Raga_Lalit_783aa4b0-26f3-4e18-844c-b787be6d9849.mp3',
                      '/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/a99e07d5-20a0-467b-8dcd-aa5a095177fd/Rashid_Khan/Evergreen/Raga_Lalit_783aa4b0-26f3-4e18-844c-b787be6d9849.pitch',
                      '/home/sankalp/Work/Work_PhD/publications/2016_PhDThesis/plotUtils/ch05_preProcessing/octaveErrorIllustration.png',
                      22*60 + 20, 22*60 + 25)

    octaveErrorIllustration2:
        plt.plotPitch('/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/46997b02-f09c-4969-8138-4e1861f61967/Kaustuv_Kanti_Ganguli/Raag_Shree/Raag_Shree_928a430e-813e-48b0-8a23-566e74aa8dc9.mp3',
                      '/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/46997b02-f09c-4969-8138-4e1861f61967/Kaustuv_Kanti_Ganguli/Raag_Shree/Raag_Shree_928a430e-813e-48b0-8a23-566e74aa8dc9.pitch',
                      '/home/sankalp/Work/Work_PhD/publications/2016_PhDThesis/plotUtils/ch05_preProcessing/octaveErrorIllustration.png',
                      59*60 + 47, 59*60 + 50)

    octaveErrorIllustration3:
        plt.plotPitch('/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/64e5fb9e-5569-4e80-8e6c-f543af9469c7/Prabha_Atre/Maalkauns/Jaako_Mana_Raam_980b4a00-6e7c-41c1-81ee-6b021d237343.mp3',
                      '/media/Data/Datasets/PatternProcessing_DB/unsupervisedDBs/hindustaniDB/Hindustani30Ragas/audio/64e5fb9e-5569-4e80-8e6c-f543af9469c7/Prabha_Atre/Maalkauns/Jaako_Mana_Raam_980b4a00-6e7c-41c1-81ee-6b021d237343.pitch',
                      '/home/sankalp/Work/Work_PhD/publications/2016_PhDThesis/plotUtils/ch05_preProcessing/octaveErrorIllustration.png',
                      25*60 + 9, 25*60 + 11)
    """
    frameSize = 4096
    hopSize = 512
    NFFT = 4096
    w = np.hamming(frameSize)

    audio = ess.MonoLoader(filename=audio_file)()
    # sample rate read from essentia's MetadataReader (the index assumes its output ordering)
    sampleRate = float(ess.MetadataReader(filename=audio_file)()[10])
    time_pitch = np.loadtxt(pitch_file)

    # selecting only the chosen excerpt of the audio and of the pitch track
    sample_start = int(sampleRate * time_start)
    sample_end = int(sampleRate * time_end)
    audio = audio[sample_start:sample_end]

    ind_start = np.argmin(abs(time_pitch[:, 0] - time_start))
    ind_end = np.argmin(abs(time_pitch[:, 0] - time_end))
    pitch = copy.deepcopy(time_pitch[ind_start:ind_end, 1])
    time = copy.deepcopy(time_pitch[ind_start:ind_end, 0]) - time_pitch[ind_start, 0]

    # magnitude spectrogram used as the background of the plot
    mX, pX = STFT.stftAnal(audio, w, NFFT, hopSize)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.hold(True)
    fsize = 14
    fsize2 = 14
    # font = "Times New Roman"

    maxplotfreq = 1000.0
    numFrames = int(mX[:, 0].size)
    frmTime = hopSize * np.arange(numFrames) / float(sampleRate)
    binFreq = sampleRate * np.arange(NFFT * maxplotfreq / sampleRate) / NFFT
    plt.pcolormesh(frmTime, binFreq, np.transpose(mX[:, :int(NFFT * maxplotfreq / sampleRate) + 1]))
    plt.hold(True)

    p, = plt.plot(time, pitch, color='k')

    xLim = ax.get_xlim()
    yLim = ax.get_ylim()
    ax.set_aspect((xLim[1] - xLim[0]) / (2 * float(yLim[1] - yLim[0])))
    plt.autoscale(tight=True)
    plt.legend([p], ['Predominant pitch'])
    plt.xlabel("Time (s)", fontsize=fsize)
    plt.ylabel("Frequency (Hz)", fontsize=fsize)
    plt.tight_layout()
    plt.savefig(output_file, dpi=600)
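
# A rough, numpy-only sketch (not the sms-tools implementation) of the kind of dB magnitude
# spectrogram that STFT.stftAnal supplies to plotPitch above; the real stftAnal also returns the
# phase spectrogram and handles windowing/zero-phase details differently, so this is only an
# illustration of the idea, not a drop-in replacement.
def _magSpectrogramSketch(x, w, NFFT, hopSize):
    # assumes len(x) >= len(w); one dB magnitude frame per hop, positive frequencies only
    nFrames = (len(x) - len(w)) // hopSize + 1
    mX = np.zeros((nFrames, NFFT // 2 + 1))
    for i in range(nFrames):
        frame = x[i * hopSize:i * hopSize + len(w)] * w
        mX[i] = 20 * np.log10(np.abs(np.fft.rfft(frame, NFFT)) + np.finfo(float).eps)
    return mX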