def speakerDiarizationWrapper(inputFile, numSpeakers, useLDA):
    if useLDA:
        cls = aS.speakerDiarization(inputFile, numSpeakers, plot_res=False)
        sound = AudioSegment.from_file(inputFile)
        speaker_0 = AudioSegment.silent(1)
        speaker_1 = AudioSegment.silent(1)
        segs, flags = aS.flags2segs(cls, 0.2)
        for s in range(segs.shape[0]):
            # flags2segs returns times in seconds; pydub slices in milliseconds
            start = round(segs[s, 0] * 1000)
            end = round(segs[s, 1] * 1000 + 1)
            if flags[s] == 0.0:
                speaker_0 = speaker_0 + sound[start:end]
            elif flags[s] == 1.0:
                speaker_1 = speaker_1 + sound[start:end]
            print("{} {} {}\n".format(segs[s, 0], segs[s, 1], flags[s]))
        speaker_0.export("./ExportedData/Speaker_0.wav", format="wav")
        speaker_1.export("./ExportedData/Speaker_1.wav", format="wav")
    else:
        sound = AudioSegment.from_file(inputFile)
        speaker_0 = AudioSegment.silent(100)
        speaker_1 = AudioSegment.silent(100)
        cls = aS.speakerDiarization(inputFile, numSpeakers, lda_dim=0, plot_res=False)
        segs, flags = aS.flags2segs(cls, 0.2)
        for s in range(segs.shape[0]):
            start = round(segs[s, 0] * 1000)
            end = round(segs[s, 1] * 1000 + 1)
            if flags[s] == 0.0:
                speaker_0 = speaker_0.append(sound[start:end], crossfade=100)
            elif flags[s] == 1.0:
                speaker_1 = speaker_1.append(sound[start:end], crossfade=100)
            # print the start/end timings of each speaker's segments:
            #print("{} {} {}\n".format(segs[s, 0], segs[s, 1], flags[s]))
        # file name without directory or extension
        name = os.path.splitext(os.path.basename(inputFile))[0]
        speaker_0.export("./ExportedData/" + name + "_0.wav", format="wav")
        speaker_1.export("./ExportedData/" + name + "_1.wav", format="wav")
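# A minimal sketch of the flags2segs contract that the examples in this
# collection rely on (assumed: pyAudioAnalysis is installed and exposes
# flags2segs in audioSegmentation; newer releases rename it to
# labels_to_segments). Given one class label per mid-term window and the
# window step in seconds, it returns an (N, 2) array of segment start/end
# times plus one class label per segment.
import numpy as np
from pyAudioAnalysis import audioSegmentation as aS

flags = np.array([0, 0, 0, 1, 1, 0, 0])   # per-window labels (e.g. speaker ids)
segs, classes = aS.flags2segs(flags, 0.2)  # 0.2 s mid-term window step
print(segs)     # roughly [[0.0, 0.6], [0.6, 1.0], [1.0, 1.2]]
print(classes)  # labels per segment: 0, 1, 0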
def split_call_into_speakers(call_file, out_loc):
    '''
    Attempts to split a call file into different segments each time the
    speaker changes, using speaker diarization. This method assumes there are
    two speakers in the file (sales and customer) and cuts out dial tones and
    any receptionists before the two speakers' conversation.
    '''
    # set output directories
    no_rings_out_dir = os.path.join(out_loc, 'calls_no_ringtones')
    if not os.path.exists(no_rings_out_dir):
        os.makedirs(no_rings_out_dir)
    diarized_out_dir = os.path.join(out_loc, 'calls_split_by_speaker')
    if not os.path.exists(diarized_out_dir):
        os.makedirs(diarized_out_dir)

    # load in raw audio file
    print(call_file)
    raw_audio = AudioSegment.from_file(call_file, 'wav')
    file_name = os.path.splitext(os.path.basename(call_file))[0]

    # use a trained HMM to locate the ringtones, keep only the audio after the
    # last detected ring, and export an intermediate file
    curr_path = os.path.dirname(os.path.realpath(__file__))
    ring_labels = aS.hmmSegmentation(call_file,
                                     os.path.join(curr_path, 'hmmRingDetect'),
                                     False)
    segs, flags = aS.flags2segs(ring_labels[0], 1.0)  # 1.0 is the mid-term window step of the model
    no_rings_audio = raw_audio[segs[-1, 0] * 1000:segs[-1, 1] * 1000]
    temp_out_loc = os.path.join(no_rings_out_dir, file_name) + '.wav'
    no_rings_audio.export(temp_out_loc, format='wav')

    # split on speakers, setting the number of speakers to 2
    diarized = aS.speakerDiarization(temp_out_loc, 2, mtSize=0.5, mtStep=0.1)

    # determine which label was given to the customer (the first label after
    # the ringtones are removed)
    cust = diarized[0]

    # output the segments
    no_rings_audio = AudioSegment.from_file(temp_out_loc, format='wav')  # reload so indexing is right
    segs, flags = aS.flags2segs(diarized, 0.1)  # mtStep from above
    curr_call_out_base = os.path.join(diarized_out_dir, file_name)
    if not os.path.exists(curr_call_out_base):
        os.makedirs(curr_call_out_base)
    for seg in range(segs.shape[0]):
        # skip segments shorter than 1 s (usually an 'um' or similar)
        if segs[seg, 1] - segs[seg, 0] < 1:
            continue
        out_seg = no_rings_audio[segs[seg, 0] * 1000:segs[seg, 1] * 1000]
        if flags[seg] == cust:
            out_seg.export(os.path.join(curr_call_out_base, str(seg) + '_cust.wav'),
                           format='wav')
        else:
            out_seg.export(os.path.join(curr_call_out_base, str(seg) + '_sales.wav'),
                           format='wav')
def split_call_into_speakers(in_loc, out_loc):
    # Splits every audio file in a directory into segments at speaker turns,
    # using the pyAudioAnalysis library.
    #
    # in_loc:  directory that contains the audio files
    # out_loc: directory (with trailing separator) that receives the diarized segments
    for audio in os.listdir(in_loc):
        if audio == '.DS_Store':
            continue
        p = os.path.join(in_loc, audio)
        no_rings_audio = AudioSegment.from_file(p, format='wav')
        basename = os.path.splitext(os.path.basename(audio))[0]

        # split on speakers, setting the number of speakers to 2
        diarized = aS.speakerDiarization(p, 2, mtSize=0.5, mtStep=0.1)

        # determine which label was given to the customer (the first speaker)
        cust = diarized[0]

        # output the segments
        segs, flags = aS.flags2segs(diarized, 0.1)  # mtStep from above
        for seg in range(segs.shape[0]):
            # skip segments shorter than 1 s (usually an 'um' or similar)
            if segs[seg, 1] - segs[seg, 0] < 1:
                continue
            out_seg = no_rings_audio[segs[seg, 0] * 1000:segs[seg, 1] * 1000]
            if flags[seg] == cust:
                out_seg.export(out_loc + basename + '.' + str(seg) + '_cust.wav',
                               format='wav')
            else:
                out_seg.export(out_loc + basename + '.' + str(seg) + '_sales.wav',
                               format='wav')
def find_music(audio_file):
    modelName = "pyAA/data/svmSM"
    [Fs, x] = aIO.readAudioFile(audio_file)
    duration = x.shape[0] / float(Fs)
    t1 = time.time()  # time.clock() was removed in Python 3.8
    flagsInd, classNames, acc, CMt = aS.mtFileClassification(
        audio_file, modelName, "svm", False, '')
    # reload the model only to recover its mid-term step (mtStep)
    [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
     computeBEAT] = aT.loadSVModel(modelName)
    t2 = time.time()
    perTime1 = duration / (t2 - t1)
    flags = [classNames[int(f)] for f in flagsInd]
    (segs, classes) = aS.flags2segs(flags, mtStep)
    file_parts = []
    cbn = sox.Combiner()
    if len(classes) > 1:
        for i, c in enumerate(classes):
            if c == 'music':
                # pad each music segment a little on both sides
                start = segs[i][0]
                if i != 0:
                    start -= 0.5
                end = segs[i][1]
                if i != len(classes) - 1:
                    end += 2.5
                file_parts.append((int(start * 1000), int(end * 1000)))
    return file_parts
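# A hypothetical follow-up sketch (not part of the original): cut the
# (start_ms, end_ms) pairs returned by find_music into separate files with
# pysox. Transformer.trim takes seconds, so the milliseconds are converted
# back; the helper name and output naming are assumptions.
import sox

def export_music_parts(audio_file, file_parts, out_prefix="music_part"):
    for n, (start_ms, end_ms) in enumerate(file_parts):
        tfm = sox.Transformer()
        tfm.trim(start_ms / 1000.0, end_ms / 1000.0)
        tfm.build(audio_file, "{}_{}.wav".format(out_prefix, n))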
def getMusicSegmentsFromFile(inputFile, minDuration=1.0):
    # minDuration (seconds) was a module-level global in the original code;
    # it is exposed here as a parameter so the function is self-contained
    modelType = "svm"
    modelName = "data/svmMovies8classes"
    dirOutput = inputFile[0:-4] + "_musicSegments"

    if os.path.exists(dirOutput) and dirOutput != ".":
        shutil.rmtree(dirOutput)
    os.makedirs(dirOutput)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)

    # load the model only to recover its mid-term step (mtStep)
    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         compute_beat] = aT.load_model(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         compute_beat] = aT.load_model_knn(modelName)

    flagsInd, classNames, acc, CM = aS.mtFileClassification(
        inputFile, modelName, modelType, plotResults=False, gtFile="")
    segs, classes = aS.flags2segs(flagsInd, mtStep)
    for i, s in enumerate(segs):
        if (classNames[int(classes[i])] == "Music") and (s[1] - s[0] >= minDuration):
            strOut = "{0:s}{1:.3f}-{2:.3f}.wav".format(dirOutput + os.sep, s[0], s[1])
            wavfile.write(strOut, Fs, x[int(Fs * s[0]):int(Fs * s[1])])
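# Hypothetical usage of the extractor above (the path and threshold are
# assumptions): writes every "Music" segment of at least five seconds into
# movie_musicSegments/ next to the input file.
if __name__ == '__main__':
    getMusicSegmentsFromFile("movie.wav", minDuration=5.0)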
def main():
    wavFilePath = "/home/valia/Desktop/megan.wav"
    mt_size, mt_step, st_win = 1, 0.1, 0.5

    newPath = StereoToMono(wavFilePath)
    print(newPath)

    mt_feats_norm = ExtractFeatures(newPath)
    print(mt_feats_norm.shape)

    # cluster the mid-term feature vectors into two speakers
    n_clusters = 2
    k_means = KMeans(n_clusters=n_clusters)
    k_means.fit(mt_feats_norm.T)
    cls = k_means.labels_

    segs, c = flags2segs(cls, mt_step)  # convert flags to segment limits
    for sp in range(n_clusters):  # play each cluster's segments
        for i in range(len(c)):
            if c[i] == sp and segs[i, 1] - segs[i, 0] > 1:
                # play long segments of the current speaker
                print(c[i], segs[i, 0], segs[i, 1])
                cmd = "ffmpeg -i {} -ss {} -t {} temp.wav " \
                      "-loglevel panic -y".format(newPath, segs[i, 0] + 1,
                                                  segs[i, 1] - segs[i, 0] - 1)
                os.system(cmd)
                playsound('temp.wav')
                readchar.readchar()  # wait for a keypress between segments
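# A minimal alternative sketch (assumptions: pydub is installed and a playback
# backend such as simpleaudio or ffplay is available): slice each long segment
# in-process instead of shelling out to ffmpeg and re-reading temp.wav. The
# helper name is hypothetical.
from pydub import AudioSegment
from pydub.playback import play

def play_cluster_segments(path, segs, c, cluster, min_len=1.0):
    # play every segment of `cluster` longer than min_len seconds
    sound = AudioSegment.from_file(path)
    for i in range(len(c)):
        if c[i] == cluster and segs[i, 1] - segs[i, 0] > min_len:
            play(sound[int(segs[i, 0] * 1000):int(segs[i, 1] * 1000)])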
# detect mic configuration by analyzing the input wav file
modelName = get_model_path(args['inputWavFile'])
if args['debug']:
    print('\tusing: {}'.format(modelName))
model_time = time.time() - start_time

modelType = "svm"
gtFile = ""
returnVal = aS.mtFileClassification(args['inputWavFile'], modelName,
                                    modelType, False, gtFile)
flagsInd = returnVal[0]
classNames = returnVal[1]
flags = [classNames[int(f)] for f in flagsInd]
(segs, classes) = aS.flags2segs(flags, 1)

for s in range(len(segs)):
    sg = segs[s]
    diff = int(sg[1]) - int(sg[0])
    if args['debug']:
        print('{:>6} - {:>6} ({:>6}) : {}'.format(sg[0], sg[1], diff, classes[s]))
    my_segments.append(Segment(int(sg[0]), int(sg[1]), str(classes[s])))

# Speech and non-speech lists
final_list = []
detected_list = []
segments_non_speech = filter(
    lambda x: (x.diff >= int(args['threshold'])) and
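# A hypothetical sketch of the pieces the snippet above assumes but does not
# show (the real Segment class and the rest of the truncated filter are not in
# the source): a segment record exposing the .diff attribute the filter uses.
class Segment:
    def __init__(self, start, end, label):
        self.start = start
        self.end = end
        self.label = label
        self.diff = end - start  # segment length in seconds

# e.g. one plausible completion of the non-speech filter:
# segments_non_speech = [s for s in my_segments
#                        if s.diff >= int(args['threshold']) and s.label != 'speech']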
def run(wavFileName2, bagFile2):
    global wavFileName
    global bagFile
    global xStart
    global xEnd
    global annotationFlag, annotations, shadesAndSpeaker, greenIndex
    global spf, duration, signal

    time = 0
    segmentDuration = 0
    segments = []

    # >> Open WAV file
    #----------------------
    wavFileName = wavFileName2
    bagFile = bagFile2
    spf = wave.open(wavFileName, 'r')

    # extract raw audio from the wav file
    signal = spf.readframes(-1)
    signal = np.frombuffer(signal, dtype=np.int16)  # np.fromstring is deprecated

    # get wav file duration
    frames = spf.getnframes()
    rate = spf.getframerate()
    duration = frames / float(rate)

    # >> Open CSV file
    #----------------------
    # check if the .csv exists
    csvFileName = bagFile.replace(".bag", "_audio.csv")
    if os.path.isfile(csvFileName):
        annotationFile = open(csvFileName, 'r')
        read = csv.reader(annotationFile)
        for row in read:
            row[0] = float(row[0])
            row[1] = float(row[1])
            annotations.append([row[0], row[1], row[2]])

        # assign each speaker a unique color for the annotation plot and Gantt chart
        for shadeIndex in range(len(annotations)):
            if annotations[shadeIndex][2][:8] == 'Speech::':
                shadesAndSpeaker.append(
                    [annotations[shadeIndex][2], GreenShades[greenIndex]])
                # wrap around before running past the last shade
                if greenIndex >= len(GreenShades) - 1:
                    greenIndex = 0
                else:
                    greenIndex = greenIndex + 1

    # >> Call the classifier in case the CSV file does not exist
    #----------------------
    else:
        [flagsInd, classesAll, acc] = aS.mtFileClassification(
            wavFileName, 'svmModelTest', 'svm', False)
        # declare classes
        [segs, classes] = aS.flags2segs(flagsInd, 1)
        lengthClass = len(classesAll)
        className = np.arange(lengthClass, dtype=float)
        for j in range(len(segs)):
            # no annotation for Silence segments
            for i in range(len(classesAll)):
                if classes[j] == className[i] and classesAll[i] != 'Silence':
                    annotations.append(
                        [segs[j][0] * 1000, segs[j][1] * 1000, classesAll[i]])

    # >> Initialize GUI
    #----------------------
    qApp = QtWidgets.QApplication(sys.argv)
    aw = ApplicationWindow()
    aw.setWindowTitle("Audio")
    aw.show()

    # >> Terminate GUI
    #----------------------
    sys.exit(qApp.exec_())
def run(wavFileName2, bagFile2):
    time = 0
    segmentDuration = 0
    segments = []

    # >> Open WAV file
    #----------------------
    audioGlobals.wavFileName = wavFileName2
    audioGlobals.bagFile = bagFile2
    audioGlobals.spf = wave.open(audioGlobals.wavFileName, 'r')

    # extract raw audio from the wav file
    audioGlobals.signal = audioGlobals.spf.readframes(-1)
    audioGlobals.signal = np.frombuffer(audioGlobals.signal, dtype=np.int16)

    # get wav file duration
    frames = audioGlobals.spf.getnframes()
    rate = audioGlobals.spf.getframerate()
    audioGlobals.duration = frames / float(rate)

    # >> Open CSV file
    #----------------------
    # check if the .csv exists
    csvFileName = audioGlobals.bagFile.replace(".bag", "_audio.csv")
    if os.path.isfile(csvFileName):
        annotationFile = open(csvFileName, 'r')
        read = csv.reader(annotationFile)
        for row in read:
            row[0] = float(row[0])
            row[1] = float(row[1])
            audioGlobals.annotations.append([row[0], row[1], row[2]])

        # assign each speaker a unique color for the annotation plot and Gantt chart
        for shadeIndex in range(len(audioGlobals.annotations)):
            if audioGlobals.annotations[shadeIndex][2][:8] == 'Speech::':
                if audioGlobals.greenIndex >= (len(audioGlobals.GreenShades) - 1):
                    audioGlobals.greenIndex = 0
                else:
                    audioGlobals.greenIndex = audioGlobals.greenIndex + 1
                audioGlobals.shadesAndSpeaker.append([
                    audioGlobals.annotations[shadeIndex][2],
                    audioGlobals.GreenShades[audioGlobals.greenIndex]
                ])

    # >> Call the classifier in case the CSV file does not exist
    #----------------------
    else:
        [flagsInd, classesAll, acc, CM] = aS.mtFileClassification(
            audioGlobals.wavFileName, 'svmModelTest', 'svm', False)
        # declare classes
        [segs, classes] = aS.flags2segs(flagsInd, 1)
        lengthClass = len(classesAll)
        className = np.arange(lengthClass, dtype=float)
        for j in range(len(segs)):
            # no annotation for Silence segments
            for i in range(len(classesAll)):
                if classes[j] == className[i] and classesAll[i] != 'Silence':
                    audioGlobals.annotations.append(
                        [segs[j][0] * 1000, segs[j][1] * 1000, classesAll[i]])

        # >> Write the annotations to the csv file
        csvFileName = audioGlobals.bagFile.replace(".bag", "_audio.csv")
        annotationFile = open(csvFileName, 'w')
        write = csv.writer(annotationFile)
        write.writerows(audioGlobals.annotations)
        annotationFile.close()
if __name__ == '__main__':
    # read signal and get normalized segment features:
    input_file = "../data/song1.mp3"
    fs, x = readAudioFile(input_file)
    x = stereo2mono(x)
    mt_size, mt_step, st_win = 5, 0.5, 0.05
    [mt_feats, st_feats, _] = mT(x, fs, mt_size * fs, mt_step * fs,
                                 round(fs * st_win), round(fs * st_win * 0.5))
    (mt_feats_norm, MEAN, STD) = normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T

    # perform clustering (k = 4)
    n_clusters = 4
    k_means = sklearn.cluster.KMeans(n_clusters=n_clusters)
    k_means.fit(mt_feats_norm.T)
    cls = k_means.labels_

    segs, c = flags2segs(cls, mt_step)  # convert flags to segment limits
    for sp in range(n_clusters):  # play each cluster's segments
        for i in range(len(c)):
            if c[i] == sp and segs[i, 1] - segs[i, 0] > 5:
                # play long segments of the current cluster
                # (only win_to_play seconds around the segment center)
                d = segs[i, 1] - segs[i, 0]
                win_to_play = 10
                if win_to_play > d:
                    win_to_play = d
                print(" * * * * CLUSTER {0:d} * * * * * {1:.1f} - {2:.1f}, "
                      "playing {3:.1f}-{4:.1f}".format(
                          c[i], segs[i, 0], segs[i, 1],
                          segs[i, 0] + d / 2 - win_to_play / 2,
                          segs[i, 0] + d / 2 + win_to_play / 2))
                cmd = "avconv -i {} -ss {} -t {} temp.wav " \
                      "-loglevel panic -y".format(
                          input_file,
                          segs[i, 0] + d / 2 - win_to_play / 2,
                          win_to_play)
                os.system(cmd)
def run(wavFileName2, bagFile2):
    time = 0
    segmentDuration = 0
    segments = []

    # >> Open WAV file
    #----------------------
    audioGlobals.wavFileName = wavFileName2
    audioGlobals.bagFile = bagFile2
    audioGlobals.spf = wave.open(audioGlobals.wavFileName, 'r')

    # extract raw audio from the wav file
    audioGlobals.signal = audioGlobals.spf.readframes(-1)
    audioGlobals.signal = np.frombuffer(audioGlobals.signal, dtype=np.int16)

    # get wav file duration
    frames = audioGlobals.spf.getnframes()
    rate = audioGlobals.spf.getframerate()
    audioGlobals.duration = frames / float(rate)

    # >> Open CSV file
    #----------------------
    # check if the .csv exists
    csvFileName = audioGlobals.bagFile.replace(".bag", "_audio.csv")
    if os.path.isfile(csvFileName):
        annotationFile = open(csvFileName, 'r')
        read = csv.reader(annotationFile)
        for row in read:
            row[0] = float(row[0])
            row[1] = float(row[1])
            audioGlobals.annotations.append([row[0], row[1], row[2]])

        # assign each speaker a unique color for the annotation plot and Gantt chart
        for shadeIndex in range(len(audioGlobals.annotations)):
            if audioGlobals.annotations[shadeIndex][2][:8] == 'Speech::':
                if audioGlobals.greenIndex >= (len(audioGlobals.GreenShades) - 1):
                    audioGlobals.greenIndex = 0
                else:
                    audioGlobals.greenIndex = audioGlobals.greenIndex + 1
                audioGlobals.shadesAndSpeaker.append([
                    audioGlobals.annotations[shadeIndex][2],
                    audioGlobals.GreenShades[audioGlobals.greenIndex]
                ])

    # >> Call the classifier in case the CSV file does not exist
    #----------------------
    else:
        [flagsInd, classesAll, acc, CM] = aS.mtFileClassification(
            audioGlobals.wavFileName,
            os.path.abspath('audio/ClassifierMethods/svmModelTest'),
            'svm', False)
        # declare classes
        [segs, classes] = aS.flags2segs(flagsInd, 1)
        lengthClass = len(classesAll)
        className = np.arange(lengthClass, dtype=float)
        for j in range(len(segs)):
            # no annotation for Silence segments
            for i in range(len(classesAll)):
                if classes[j] == className[i] and classesAll[i] != 'Silence':
                    audioGlobals.annotations.append(
                        [segs[j][0] * 1000, segs[j][1] * 1000, classesAll[i]])

        # >> Write the annotations to the csv file
        csvFileName = audioGlobals.bagFile.replace(".bag", "_audio.csv")
        annotationFile = open(csvFileName, 'w')
        write = csv.writer(annotationFile)
        write.writerows(audioGlobals.annotations)
        annotationFile.close()