def speakerDiarizationWrapper(inputFile, numSpeakers, useLDA):
    if useLDA:
        #cls = aS.speakerDiarization(inputFile, numSpeakers, plot_res=True)
        cls = aS.speakerDiarization(inputFile, numSpeakers, plot_res=False)

        sound = AudioSegment.from_file(inputFile)
        print("type = ", type(sound))
        speaker_0 = AudioSegment.silent(1)
        speaker_1 = AudioSegment.silent(1)

        segs,flags = aS.flags2segs(cls, 0.2)    

        for s in range(segs.shape[0]):
            # flags2segs returns times in seconds; pydub slices in milliseconds
            start = round(segs[s, 0] * 1000)
            end = round(segs[s, 1] * 1000)
            if(flags[s] == 0.0):
                speaker_0 = speaker_0 + sound[start:end]

            elif(flags[s] == 1.0):
                speaker_1 = speaker_1 + sound[start:end]
            print("{} {} {}\n".format(segs[s,0], segs[s,1], flags[s]))

        speaker_0.export("./ExportedData/Speaker_0.wav", format="wav")
        speaker_1.export("./ExportedData/Speaker_1.wav", format="wav")

    else:
        sound = AudioSegment.from_file(inputFile)
        speaker_0 = AudioSegment.silent(100)
        speaker_1 = AudioSegment.silent(100)
        #cls = aS.speakerDiarization(inputFile, numSpeakers, lda_dim=0, plot_res=True)
        cls = aS.speakerDiarization(inputFile, numSpeakers, lda_dim=0, plot_res=False)

        #print("type = ", type(sound))

        segs,flags = aS.flags2segs(cls, 0.2)    

        for s in range(segs.shape[0]):

            if(flags[s] == 0.0):
                #print("Inside 0")
                start = round(segs[s,0]*1000)
                end = round(segs[s,1]*1000 + 1)
                speaker_0 = speaker_0.append(sound[start : end] , crossfade = 100)

            elif(flags[s] == 1.0):
                #print("Inside 1")
                start = round(segs[s,0]*1000)
                end = round((segs[s,1])*1000 + 1)
                speaker_1 = speaker_1.append(sound[start : end],crossfade = 100)

# Line to print the start and end timings of the different speakers' segments
            #print("{} {} {}\n".format(segs[s,0], segs[s,1], flags[s]))
        
        arr = inputFile.split('/')      # split the path by "/"
        name = arr[-1].split('.')[0]    # the file name is the last element in "arr";
                                        # strip its extension using split('.')
        #print(name)

        speaker_0.export("./ExportedData/"+name+"_0.wav", format="wav")
        speaker_1.export("./ExportedData/"+name+"_1.wav", format="wav")
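
A minimal driver sketch for the wrapper above, assuming the older camelCase pyAudioAnalysis API it relies on (audioSegmentation as aS) plus pydub's AudioSegment; "meeting.wav" is a placeholder two-speaker recording and ./ExportedData is the folder the wrapper writes into.

import os
from pydub import AudioSegment
from pyAudioAnalysis import audioSegmentation as aS

if __name__ == '__main__':
    os.makedirs("./ExportedData", exist_ok=True)   # output folder expected by the wrapper
    # split a two-speaker recording, once with LDA enabled and once without
    speakerDiarizationWrapper("meeting.wav", 2, useLDA=True)
    speakerDiarizationWrapper("meeting.wav", 2, useLDA=False)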
Example No. 2
def split_call_into_speakers(call_file, out_loc):
    '''
    Attempts to split a call file into different segments each time the speaker changes using
    speaker diarization. This method assumes there are two speakers in the file (sales and customer)
    and will cut out dial tones and any receptionists before the two speakers' conversation.
    '''
    # set output directories
    no_rings_out_dir = os.path.join(out_loc, 'calls_no_ringtones')
    if not os.path.exists(no_rings_out_dir):
        os.makedirs(no_rings_out_dir)
    diarized_out_dir = os.path.join(out_loc, 'calls_split_by_speaker')
    if not os.path.exists(diarized_out_dir):
        os.makedirs(diarized_out_dir)

    # load in raw audio file
    print(call_file)
    raw_audio = AudioSegment.from_file(call_file, 'wav')
    file_name = os.path.splitext(os.path.basename(call_file))[0]

    # uses trained HMM to determine where the ringtones are and only use audio from after
    # last detected ring and exports intermediate file
    curr_path = os.path.dirname(os.path.realpath(__file__))
    ring_labels = aS.hmmSegmentation(call_file,
                                     os.path.join(curr_path, 'hmmRingDetect'),
                                     False)
    segs, flags = aS.flags2segs(
        ring_labels[0],
        1.0)  # 1.0 is the mid-term window step from above model
    no_rings_audio = raw_audio[segs[-1, 0] * 1000:segs[-1, 1] * 1000]
    temp_out_loc = os.path.join(no_rings_out_dir, file_name) + '.wav'
    no_rings_audio.export(temp_out_loc, format='wav')

    # split on speakers now setting num speakers to 2
    diarized = aS.speakerDiarization(temp_out_loc, 2, mtSize=0.5, mtStep=0.1)

    # assume the first diarization label (the first speaker after the ringtones) is the customer
    cust = diarized[0]

    # output the segments
    no_rings_audio = AudioSegment.from_file(
        temp_out_loc, format='wav')  # update segment so indexing is right
    segs, flags = aS.flags2segs(diarized, 0.1)  #mtstep from above
    curr_call_out_base = os.path.join(diarized_out_dir, file_name)
    if not os.path.exists(curr_call_out_base):
        os.makedirs(curr_call_out_base)
    for seg in range(segs.shape[0]):
        # skip segments shorter than 1s (usually 'um' or something)
        if segs[seg, 1] - segs[seg, 0] < 1:
            continue
        out_seg = no_rings_audio[segs[seg, 0] * 1000:segs[seg, 1] * 1000]
        if flags[seg] == cust:
            out_seg.export(os.path.join(curr_call_out_base,
                                        str(seg) + '_cust.wav'),
                           format='wav')
        else:
            out_seg.export(os.path.join(curr_call_out_base,
                                        str(seg) + '_sales.wav'),
                           format='wav')
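
A hedged usage sketch for the function above: the 'raw_calls' folder and output root are placeholders, and the trained 'hmmRingDetect' model is assumed to sit next to the script, as the function expects.

import os

if __name__ == '__main__':
    out_root = 'diarization_output'            # hypothetical output root
    for f in os.listdir('raw_calls'):          # hypothetical folder of call recordings
        if f.endswith('.wav'):
            split_call_into_speakers(os.path.join('raw_calls', f), out_root)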
Example No. 3
def split_call_into_speakers(in_loc, out_loc):
    #   This function splits every audio file in a directory into segments at speaker turns
    #   using the pyAudioAnalysis library.
    #
    #   in_loc: directory that contains all audio files
    #   out_loc: directory that stores all diarized segments

    for audio in os.listdir(in_loc):
        if audio != '.DS_Store':
            p = os.path.join(in_loc, audio)
            no_rings_audio = AudioSegment.from_file(p, format='wav')
            basename = os.path.splitext(os.path.basename(audio))[0]
            # split on speakers now setting num speakers to 2
            diarized = aS.speakerDiarization(p, 2, mtSize=0.5, mtStep=0.1)
            # assume the first diarization label (the first speaker in the file) is the customer
            cust = diarized[0]
            # output the segments
            segs, flags = aS.flags2segs(diarized, 0.1)  #mtstep from above
            for seg in range(segs.shape[0]):
                # skip segments shorter than 1s (usually 'um' or something)
                if segs[seg, 1] - segs[seg, 0] < 1:
                    continue
                out_seg = no_rings_audio[segs[seg, 0] * 1000:segs[seg, 1] *
                                         1000]
                if flags[seg] == cust:
                    out_seg.export(out_loc + basename + '.' + str(seg) +
                                   '_cust.wav',
                                   format='wav')
                else:
                    out_seg.export(out_loc + basename + '.' + str(seg) +
                                   '_sales.wav',
                                   format='wav')
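
Note that this variant builds its export paths by plain string concatenation (out_loc + basename + ...), so out_loc needs a trailing separator. A minimal call sketch with placeholder directory names:

import os

if __name__ == '__main__':
    split_call_into_speakers('raw_calls', 'calls_split_by_speaker' + os.sep)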
Example No. 4
def find_music(audio_file):
    modelName = "pyAA/data/svmSM"

    [Fs, x] = aIO.readAudioFile(audio_file)
    duration = x.shape[0] / float(Fs)
    t1 = time.perf_counter()  # time.clock() was removed in Python 3.8
    flagsInd, classNames, acc, CMt = aS.mtFileClassification(
        audio_file, modelName, "svm", False, '')
    [
        Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
        computeBEAT
    ] = aT.loadSVModel(modelName)
    t2 = time.perf_counter()
    perTime1 = duration / (t2 - t1)
    flags = [classNames[int(f)] for f in flagsInd]
    (segs, classes) = aS.flags2segs(flags, mtStep)

    i = 0  #len(classes)-1
    file_parts = []

    cbn = sox.Combiner()
    if len(classes) > 1:
        for c in classes:
            if c == 'music':
                start = segs[i][0]
                if i != 0:
                    start -= 0.5
                end = segs[i][1]
                if i != len(classes) - 1:
                    end += 2.5

                file_parts.append((int(start * 1000), int(end * 1000)))
            i += 1

    return file_parts
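
find_music returns (start_ms, end_ms) pairs; the sox.Combiner created above is never actually used. A hedged sketch of extracting the detected music with pydub instead (the file name is a placeholder):

from pydub import AudioSegment

music_parts = find_music("movie_audio.wav")            # hypothetical input file
audio = AudioSegment.from_file("movie_audio.wav")
for k, (start_ms, end_ms) in enumerate(music_parts):
    # cut each detected music span out of the original recording
    audio[start_ms:end_ms].export("music_{}.wav".format(k), format="wav")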
Example No. 5
def getMusicSegmentsFromFile(inputFile):	
	modelType = "svm"
	modelName = "data/svmMovies8classes"
	
	dirOutput = inputFile[0:-4] + "_musicSegments"
	
	if os.path.exists(dirOutput) and dirOutput!=".":
		shutil.rmtree(dirOutput)	
	os.makedirs(dirOutput)	
	
	[Fs, x] = audioBasicIO.readAudioFile(inputFile)	

	if modelType=='svm':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, compute_beat] = aT.load_model(modelName)
	elif modelType=='knn':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, compute_beat] = aT.load_model_knn(modelName)

	flagsInd, classNames, acc, CM = aS.mtFileClassification(inputFile, modelName, modelType, plotResults = False, gtFile = "")
	segs, classes = aS.flags2segs(flagsInd, mtStep)

	for i, s in enumerate(segs):
		if (classNames[int(classes[i])] == "Music") and (s[1] - s[0] >= minDuration):
			strOut = "{0:s}{1:.3f}-{2:.3f}.wav".format(dirOutput+os.sep, s[0], s[1])	
			wavfile.write( strOut, Fs, x[int(Fs*s[0]):int(Fs*s[1])])
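
The function above relies on a module-level minDuration (in seconds) and on the pyAudioAnalysis/scipy imports used earlier (audioBasicIO, aT, aS, wavfile, shutil). A hedged driver sketch with a placeholder threshold and file name:

minDuration = 10.0                       # assumed module-level constant used above (seconds)
getMusicSegmentsFromFile("movie.wav")    # writes detected "Music" segments to movie_musicSegments/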
Example No. 6
def main():
    wavFilePath = "/home/valia/Desktop/megan.wav"
    mt_size, mt_step, st_win = 1, 0.1, 0.5
    newPath = StereoToMono(wavFilePath)
    print(newPath)
    mt_feats_norm = ExtractFeatures(newPath)
    print(mt_feats_norm.shape)
    #arr = np.asarray(F)
    k_means = KMeans(n_clusters=2)
    k_means.fit(mt_feats_norm.T)
    cls = k_means.labels_
    segs, c = flags2segs(cls, mt_step)  # convert flags to segment limits
    for sp in range(2):  # play each cluster's segments (k_means was fit with 2 clusters)
        for i in range(len(c)):
            if c[i] == sp and segs[i, 1] - segs[i, 0] > 1:
                # play long segments of current speaker
                print(c[i], segs[i, 0], segs[i, 1])
                cmd = "ffmpeg -i {} -ss {} -t {} temp.wav " \
                          "-loglevel panic -y".format(newPath, segs[i, 0]+1,
                                                      segs[i, 1]-segs[i, 0]-1)
                os.system(cmd)
                #os.system("play temp.wav -q")
                playsound('temp.wav')
                readchar.readchar()
        # detect mic configuration by analyzing input wav file
        modelName = get_model_path(args['inputWavFile'])

    if (args['debug']):
        print('\tusing: {}'.format(modelName))

    model_time = time.time() - start_time
    modelType = "svm"
    gtFile = ""
    returnVal = aS.mtFileClassification(args['inputWavFile'], modelName,
                                        modelType, False, gtFile)
    flagsInd = returnVal[0]
    classNames = returnVal[1]

    flags = [classNames[int(f)] for f in flagsInd]
    (segs, classes) = aS.flags2segs(flags, 1)

    for s in range(len(segs)):
        sg = segs[s]
        diff = int(sg[1]) - int(sg[0])
        if (args['debug']):
            print('{:>6} - {:>6} ({:>6}) : {}'.format(sg[0], sg[1], diff,
                                                       classes[s]))
        my_segments.append(Segment(int(sg[0]), int(sg[1]), str(classes[s])))

    # Speech and non speech lists
    final_list = []
    detected_list = []

    segments_non_speech = filter(
        lambda x: (x.diff >= int(args['threshold'])) and
Example No. 8
def run(wavFileName2,bagFile2):
    global wavFileName
    global bagFile
    global xStart
    global xEnd
    global annotationFlag, annotations, shadesAndSpeaker, greenIndex
    global spf, duration, signal

    time = 0
    segmentDuration = 0
    segments = []

    # >> Open WAVfile 
    #----------------------
    #wavFileName -> global variable 
    wavFileName = wavFileName2
    bagFile = bagFile2

    spf = wave.open(wavFileName,'r')
    #Extract Raw Audio from Wav File
    signal = spf.readframes(-1)
    signal = np.frombuffer(signal, dtype=np.int16)  # np.fromstring is deprecated for binary data
    #self.axes.clear()

    #Get wavFile duration
    frames = spf.getnframes()
    rate = spf.getframerate()
    duration = frames / float(rate)

    # >> Open CSVfile 
    #----------------------
    # check if .csv exists
    csvFileName = bagFile.replace(".bag","_audio.csv")
    if os.path.isfile(csvFileName):
        # print '.csv Found !'
        annotationFile = open(csvFileName, 'rb')

        read = csv.reader(annotationFile)
        for row in read:
            row[0] = float(row[0])
            row[1] = float(row[1])
            annotations.append([row[0], row[1], row[2]])

        # get each speaker's unique color for the annotation plot and Gantt chart
        for shadeIndex in range(len(annotations)):
            if annotations[shadeIndex][2][:8] == 'Speech::':
                shadesAndSpeaker.append([annotations[shadeIndex][2], GreenShades[greenIndex]])
                # wrap around before the index runs past the available shades
                if greenIndex >= len(GreenShades) - 1:
                    greenIndex = 0
                else:
                    greenIndex = greenIndex + 1

    # >> Call Classifier in case CSVFile not exists 
    #---------------------- 
    else:
        # print 'classifier...'
        [flagsInd, classesAll, acc] = aS.mtFileClassification(wavFileName, 'svmModelTest', 'svm', False)
        # declare classes
        [segs, classes] = aS.flags2segs(flagsInd, 1)
        lengthClass = len(classesAll)
        className = np.arange(lengthClass, dtype=float)  # the np.float alias was removed from NumPy


        for j in range(len(segs)):
            # no Annotation for Silence segments
            for i in range(len(classesAll)):
                if classes[j] == className[i] and classesAll[i] != 'Silence':
                    annotations.append([segs[j][0]*1000, segs[j][1]*1000, classesAll[i]])




    # >> Initialize GUI 
    #----------------------
    qApp = QtWidgets.QApplication(sys.argv)
    aw = ApplicationWindow()
    aw.setWindowTitle("Audio")
    aw.show()

    # >> Terminate GUI 
    #---------------------- 
    sys.exit(qApp.exec_())
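
A minimal invocation sketch for run() above; the wav/bag names are placeholders. The matching <bag>_audio.csv of start,end,label rows is read if present, otherwise the SVM classifier branch is used.

if __name__ == '__main__':
    run("session1.wav", "session1.bag")   # looks for session1_audio.csv next to the bag file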
def run(wavFileName2, bagFile2):
    time = 0
    segmentDuration = 0
    segments = []

    # >> Open WAVfile
    #----------------------
    #audioGlobals.wavFileName -> global variable
    audioGlobals.wavFileName = wavFileName2
    audioGlobals.bagFile = bagFile2

    audioGlobals.spf = wave.open(audioGlobals.wavFileName, 'r')
    #Extract Raw Audio from Wav File
    audioGlobals.signal = audioGlobals.spf.readframes(-1)
    audioGlobals.signal = np.frombuffer(audioGlobals.signal, dtype=np.int16)  # np.fromstring is deprecated for binary data
    #self.axes.clear()

    #Get wavFile duration (audioGlobals.duration)
    frames = audioGlobals.spf.getnframes()
    rate = audioGlobals.spf.getframerate()
    audioGlobals.duration = frames / float(rate)

    # >> Open CSVfile
    #----------------------
    # check if .csv exists
    csvFileName = audioGlobals.bagFile.replace(".bag", "_audio.csv")
    if os.path.isfile(csvFileName):
        annotationFile = open(csvFileName, 'rb')

        read = csv.reader(annotationFile)
        for row in read:
            row[0] = float(row[0])
            row[1] = float(row[1])
            audioGlobals.annotations.append([row[0], row[1], row[2]])

        # get each speaker's unique color for the annotation plot and Gantt chart
        #print len(audioGlobals.GreenShades)
        for shadeIndex in range(len(audioGlobals.annotations)):
            if audioGlobals.annotations[shadeIndex][2][:8] == 'Speech::':
                #print audioGlobals.greenIndex, len(audioGlobals.GreenShades)-1
                if audioGlobals.greenIndex >= (len(audioGlobals.GreenShades) -
                                               1):
                    audioGlobals.greenIndex = 0
                else:
                    audioGlobals.greenIndex = audioGlobals.greenIndex + 1
                #print audioGlobals.greenIndex, shadeIndex
                audioGlobals.shadesAndSpeaker.append([
                    audioGlobals.annotations[shadeIndex][2],
                    audioGlobals.GreenShades[audioGlobals.greenIndex]
                ])

    # >> Call Classifier in case CSVFile not exists
    #----------------------
    else:
        [flagsInd, classesAll, acc,
         CM] = aS.mtFileClassification(audioGlobals.wavFileName,
                                       'svmModelTest', 'svm', False)
        # declare classes
        [segs, classes] = aS.flags2segs(flagsInd, 1)
        lengthClass = len(classesAll)
        className = np.arange(lengthClass, dtype=float)  # the np.float alias was removed from NumPy

        for j in range(len(segs)):
            # no Annotation for Silence segments
            for i in range(len(classesAll)):
                if classes[j] == className[i] and classesAll[i] != 'Silence':
                    audioGlobals.annotations.append(
                        [segs[j][0] * 1000, segs[j][1] * 1000, classesAll[i]])

        # >> Write annotations in csv file
        csvFileName = audioGlobals.bagFile.replace(".bag", "_audio.csv")
        annotationFile = open(csvFileName, 'w')
        write = csv.writer(annotationFile)
        write.writerows(audioGlobals.annotations)
        annotationFile.close()
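
When the CSV is missing, this variant writes the classifier's annotations back to <bag>_audio.csv. A hedged sketch of reading those rows back (the file name is a placeholder; values are milliseconds plus the class name):

import csv

with open("session1_audio.csv") as f:            # hypothetical file written by run()
    for start_ms, end_ms, label in csv.reader(f):
        print(float(start_ms), float(end_ms), label)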
Example No. 10
if __name__ == '__main__':
    # read signal and get normalized segment features:
    input_file = "../data/song1.mp3"
    fs, x = readAudioFile(input_file)
    x = stereo2mono(x)
    mt_size, mt_step, st_win = 5, 0.5, 0.05
    [mt_feats, st_feats, _] = mT(x, fs, mt_size * fs, mt_step * fs,
                                 round(fs * st_win), round(fs * st_win * 0.5))
    (mt_feats_norm, MEAN, STD) = normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    # perform clustering (k = 4)
    n_clusters = 4
    k_means = sklearn.cluster.KMeans(n_clusters=n_clusters)
    k_means.fit(mt_feats_norm.T)
    cls = k_means.labels_
    segs, c = flags2segs(cls, mt_step)  # convert flags to segment limits
    for sp in range(n_clusters):  # play each cluster's segment
        for i in range(len(c)):
            if c[i] == sp and segs[i, 1] - segs[i, 0] > 5:
                # play long segments of current cluster (only win_to_play seconds)
                d = segs[i, 1] - segs[i, 0]
                win_to_play = 10
                if win_to_play > d:
                    win_to_play = d
                print(" * * * * CLUSTER {0:d} * * * * * {1:.1f} - {2:.1f}, "
                      "playing {3:.1f}-{4:.1f}".format(
                          c[i], segs[i, 0], segs[i, 1],
                          segs[i, 0] + d / 2 - win_to_play / 2,
                          segs[i, 0] + d / 2 + win_to_play / 2))
                cmd = "avconv -i {} -ss {} -t {} temp.wav " \
                          "-loglevel panic -y".format(input_file,
Example No. 11
def run(wavFileName2, bagFile2):
    global wavFileName
    global bagFile
    global xStart
    global xEnd
    global annotationFlag, annotations, shadesAndSpeaker, greenIndex
    global spf, duration, signal

    time = 0
    segmentDuration = 0
    segments = []

    # >> Open WAVfile
    #----------------------
    #wavFileName -> global variable
    wavFileName = wavFileName2
    bagFile = bagFile2

    spf = wave.open(wavFileName, 'r')
    #Extract Raw Audio from Wav File
    signal = spf.readframes(-1)
    signal = np.frombuffer(signal, dtype=np.int16)  # np.fromstring is deprecated for binary data
    #self.axes.clear()

    #Get wavFile duration
    frames = spf.getnframes()
    rate = spf.getframerate()
    duration = frames / float(rate)

    # >> Open CSVfile
    #----------------------
    # check if .csv exists
    csvFileName = bagFile.replace(".bag", "_audio.csv")
    if os.path.isfile(csvFileName):
        # print '.csv Found !'
        annotationFile = open(csvFileName, 'rb')

        read = csv.reader(annotationFile)
        for row in read:
            row[0] = float(row[0])
            row[1] = float(row[1])
            annotations.append([row[0], row[1], row[2]])

        # get each speaker's unique color for the annotation plot and Gantt chart
        for shadeIndex in range(len(annotations)):
            if annotations[shadeIndex][2][:8] == 'Speech::':
                shadesAndSpeaker.append(
                    [annotations[shadeIndex][2], GreenShades[greenIndex]])
                # wrap around before the index runs past the available shades
                if greenIndex >= len(GreenShades) - 1:
                    greenIndex = 0
                else:
                    greenIndex = greenIndex + 1

    # >> Call Classifier in case CSVFile not exists
    #----------------------
    else:
        # print 'classifier...'
        [flagsInd, classesAll,
         acc] = aS.mtFileClassification(wavFileName, 'svmModelTest', 'svm',
                                        False)
        # declare classes
        [segs, classes] = aS.flags2segs(flagsInd, 1)
        lengthClass = len(classesAll)
        className = np.arange(lengthClass, dtype=float)  # the np.float alias was removed from NumPy

        for j in range(len(segs)):
            # no Annotation for Silence segments
            for i in range(len(classesAll)):
                if classes[j] == className[i] and classesAll[i] != 'Silence':
                    annotations.append(
                        [segs[j][0] * 1000, segs[j][1] * 1000, classesAll[i]])

    # >> Initialize GUI
    #----------------------
    qApp = QtWidgets.QApplication(sys.argv)
    aw = ApplicationWindow()
    aw.setWindowTitle("Audio")
    aw.show()

    # >> Terminate GUI
    #----------------------
    sys.exit(qApp.exec_())
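
An alternative, hedged way to do the speaker-shade bookkeeping seen in these run() variants: assign each distinct 'Speech::' speaker one shade with itertools.cycle. The shade values and the annotations list below are placeholders standing in for the globals used above.

from itertools import cycle

GreenShades = ["#004d00", "#007a00", "#00a800"]                          # placeholder shades
annotations = [[0.0, 2.5, "Speech::Alice"], [2.5, 4.0, "Speech::Bob"]]   # placeholder rows

shades = cycle(GreenShades)
shadesAndSpeaker = []
seen = set()
for _, _, label in annotations:
    # give every new speaker the next shade, cycling back when the shades run out
    if label.startswith('Speech::') and label not in seen:
        seen.add(label)
        shadesAndSpeaker.append([label, next(shades)])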
Example No. 12
def run(wavFileName2,bagFile2):
    time = 0
    segmentDuration = 0
    segments = []

    # >> Open WAVfile 
    #----------------------
    #audioGlobals.wavFileName -> global variable 
    audioGlobals.wavFileName = wavFileName2
    audioGlobals.bagFile = bagFile2

    audioGlobals.spf = wave.open(audioGlobals.wavFileName,'r')
    #Extract Raw Audio from Wav File
    audioGlobals.signal = audioGlobals.spf.readframes(-1)
    audioGlobals.signal = np.frombuffer(audioGlobals.signal, dtype=np.int16)  # np.fromstring is deprecated for binary data
    #self.axes.clear()

    #Get wavFile duration (audioGlobals.duration)
    frames = audioGlobals.spf.getnframes()
    rate = audioGlobals.spf.getframerate()
    audioGlobals.duration = frames / float(rate)

    # >> Open CSVfile 
    #----------------------
    # check if .csv exists
    csvFileName = audioGlobals.bagFile.replace(".bag","_audio.csv")
    if os.path.isfile(csvFileName):
        annotationFile = open(csvFileName, 'rb')

        read = csv.reader(annotationFile)
        for row in read:
            row[0] = float(row[0])
            row[1] = float(row[1])
            audioGlobals.annotations.append([row[0], row[1], row[2]])

        # get each speaker's unique color for the annotation plot and Gantt chart
        #print len(audioGlobals.GreenShades)
        for shadeIndex in range(len(audioGlobals.annotations)):
            if audioGlobals.annotations[shadeIndex][2][:8] == 'Speech::':
                #print audioGlobals.greenIndex, len(audioGlobals.GreenShades)-1
                if audioGlobals.greenIndex >= (len(audioGlobals.GreenShades)-1):
                    audioGlobals.greenIndex = 0
                else:
                    audioGlobals.greenIndex = audioGlobals.greenIndex + 1
                #print audioGlobals.greenIndex, shadeIndex
                audioGlobals.shadesAndSpeaker.append([audioGlobals.annotations[shadeIndex][2], audioGlobals.GreenShades[audioGlobals.greenIndex]])

    # >> Call Classifier in case CSVFile not exists 
    #---------------------- 
    else:
        [flagsInd, classesAll, acc,CM] = aS.mtFileClassification(audioGlobals.wavFileName, os.path.abspath('audio/ClassifierMethods/svmModelTest'), 'svm', False)
        # declare classes
        [segs, classes] = aS.flags2segs(flagsInd, 1)
        lengthClass = len(classesAll)
        className = np.arange(lengthClass, dtype=float)  # the np.float alias was removed from NumPy


        for j in range(len(segs)):
            # no Annotation for Silence segments
            for i in range(len(classesAll)):
                if classes[j] == className[i] and classesAll[i] != 'Silence':
                    audioGlobals.annotations.append([segs[j][0]*1000, segs[j][1]*1000, classesAll[i]])

        # >> Write annotations in csv file
        csvFileName = audioGlobals.bagFile.replace(".bag","_audio.csv")
        annotationFile = open(csvFileName, 'w')
        write = csv.writer(annotationFile)
        write.writerows(audioGlobals.annotations)
        annotationFile.close()
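
run() only invokes the classifier when no "_audio.csv" sibling exists. A small hedged helper (file names are placeholders) to remove a stale CSV and force re-classification:

import os

def force_reclassify(bag_file):
    # drop any previously written annotation CSV so run() takes the classifier branch
    csv_file = bag_file.replace(".bag", "_audio.csv")
    if os.path.isfile(csv_file):
        os.remove(csv_file)

force_reclassify("session1.bag")
run("session1.wav", "session1.bag")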