def testUnBiasing():
    """Visually check Audio.unBias() by plotting each real-time evaluation
    file before and after un-biasing, both as loaded and after a mono mix.
    """
    realTimeDirectory = "../media/Jamison_Evaluations/Real_Time_Evaluation/Audio/"

    for filePath in sorted(glob.iglob(realTimeDirectory + "*.wav")):
        name = os.path.basename(filePath)
        print(name)

        # Read in the file
        audio = audioModule.Audio(filePath)

        # Before/after plots on the original channel layout.
        plotAudio(audio=audio, name=name, samples=80000)
        audio.unBias()
        plotAudio(audio=audio, name=name, samples=80000)

        # Re-read the file and repeat the comparison after down-mixing to mono.
        audio = audioModule.Audio(filePath)
        if audio.numberOfChannels > 1:
            audio.makeMono()

        plotAudio(audio=audio, name=name, samples=80000)
        audio.unBias()
        plotAudio(audio=audio, name=name, samples=80000)
def showVoiceActivityForParticipantAudio():
    """Overlay detected voice activity on the raw waveform for every
    participant recording and show each plot interactively.
    """
    analyzer = speechAnalysis.SpeechAnalyzer()

    for path in sorted(glob.iglob("../media/Participant_Audio/*.wav")):
        title = os.path.basename(path)[:-4]

        recording = audioModule.Audio(filePath=path)
        if recording.numberOfChannels != 1:
            recording.makeMono()

        activity = analyzer.getVoiceActivityFromAudio(recording)
        activity[activity == 0] = np.nan  # hide inactive frames in the plot

        duration = len(recording.data) / recording.sampleRate
        frameTimes = np.arange(0, duration, analyzer.featureStepSize / 1000)
        sampleTimes = np.arange(0, duration, 1 / recording.sampleRate)

        plt.figure(figsize=[16, 8])
        plt.plot(sampleTimes, recording.data, frameTimes, activity)
        plt.title(title)
        plt.show()
def graphSantaBarbara():
    """Plot energy and detected filled pauses for every Santa Barbara corpus
    recording, saving one PNG per file to ../media/SBCSAE/graphs/.
    """
    audioDirectory = "../media/SBCSAE/audio/*.wav"
    speechAnalyzer = speechAnalysis.SpeechAnalyzer()

    for filePath in sorted(glob.iglob(audioDirectory)):
        fileName = os.path.basename(filePath)[:-4]
        print(fileName)

        audio = audioModule.Audio(filePath=filePath)
        if audio.numberOfChannels != 1:
            audio.makeMono()

        intensity = speechAnalyzer.getEnergyFromAudio(audio)
        filledPauses, timeStamps = speechAnalyzer.getFilledPausesFromAudio(
            audio)

        print(convertArrayToTimeStamps(timeStamps))

        # Zero-height markers so detections sit on the x-axis.
        filledPausesMarkers = np.full(len(timeStamps), 0)
        # Energy frame index -> seconds (featureStepSize is in milliseconds).
        energyTimes = np.array(range(
            0, len(intensity))) / (1000 / speechAnalyzer.featureStepSize)

        plt.figure(figsize=(20, 10))
        plt.plot(timeStamps, filledPausesMarkers, '^')
        plt.plot(energyTimes, intensity)
        plt.savefig("../media/SBCSAE/graphs/" + fileName + ".png")
        plt.close()
def validateWithSVCCorpus():
    """Score filled-pause detection against the SVC vocalization corpus.

    A detection counts as correct when it lies within 0.5 s of a labelled
    'filler' utterance start; running totals of true pauses, correct
    detections, and false alarms are printed per processed file.
    """
    speechAnalyzer = speechAnalysis.SpeechAnalyzer()

    corpusPath = "../media/vocalizationcorpus"
    labelsPath = corpusPath + "/labels.txt"

    transcript = []

    totalNumberOfFilledPauses = 0
    totalNumberOfCorrectlyDetectedPauses = 0
    totalNumberOfFalseAlarms = 0

    with open(labelsPath) as transcriptFile:
        lines = transcriptFile.readlines()
        for row in lines:
            transcript.append(row.strip().split(','))

    # Remove header line
    transcript.pop(0)

    for row in transcript:
        fileName = row[0]
        # Columns 5+ hold flat (label, start, end) triples per utterance.
        utterances = row[4:]

        # print(fileName, utterances)

        utterances = np.array(utterances)
        utterances = utterances.reshape((int(utterances.shape[0] / 3)), 3)

        # Only analyze files that contain at least one labelled filler.
        if 'filler' in utterances:
            filePath = corpusPath + "/data/" + fileName + ".wav"

            audio = audioModule.Audio(filePath=filePath)
            if audio.numberOfChannels != 1:
                audio.makeMono()

            filledPauses, timeStamps = speechAnalyzer.getFilledPausesFromAudio(
                audio)

            for utterance in utterances:
                if utterance[0] == "filler":
                    totalNumberOfFilledPauses += 1

            # Match each detection against labelled filler start times.
            for filledPauseDetectedTime in timeStamps:
                correctDetection = False
                for utterance in utterances:
                    if utterance[0] == "filler" and abs(
                            float(utterance[1]) -
                            filledPauseDetectedTime) < 0.5:
                        correctDetection = True

                if correctDetection:
                    totalNumberOfCorrectlyDetectedPauses += 1
                else:
                    totalNumberOfFalseAlarms += 1

            print(fileName, totalNumberOfFilledPauses,
                  totalNumberOfCorrectlyDetectedPauses,
                  totalNumberOfFalseAlarms)
def compareLibrosaAndRMS():
    """Plot featureModule's librosa-based energy against its hand-rolled RMS
    intensity for one participant file, on the same time axis.
    """
    filePath = "../media/Participant_Audio/p10_ol.wav"
    name = os.path.basename(filePath)[:-4]

    stepSize = 10  # In milliseconds
    windowSize = 10  # In milliseconds

    audio = audioModule.Audio(filePath=filePath)
    if audio.numberOfChannels != 1:
        audio.makeMono()

    librosaRMS = featureModule.getEnergy(data=audio.data,
                                         sampleRate=audio.sampleRate,
                                         windowSize=windowSize,
                                         stepSize=stepSize)

    rms = featureModule.getRMSIntensity(data=audio.data,
                                        sampleRate=audio.sampleRate,
                                        windowSize=windowSize,
                                        stepSize=stepSize)

    times = np.arange(0, len(audio.data) / audio.sampleRate, stepSize / 1000)

    plt.figure(figsize=[16, 8])
    plt.plot(times, librosaRMS)
    plt.plot(times, rms)
    plt.title(name)
    plt.show()
def compareAlgorithmToParticipants():
    """Score filled-pause detection against hand-counted transcripts of the
    30-second participant chunks.

    Reads `name, count` pairs from the transcript file, runs the detector on
    the matching audio chunk, and prints per-file counts plus overall
    precision, recall, and F1.
    """
    audioDirectory = "../media/Participant_Audio_30_Sec_Chunks/*.wav"
    speechAnalyzer = speechAnalysis.SpeechAnalyzer()
    printParameters(speechAnalyzer)

    transcript = []

    totalNumberOfFilledPauses = 0
    totalNumberOfCorrectlyDetectedPauses = 0
    totalNumberOfFalseAlarms = 0

    with open(
            "../media/Participant_Audio_30_Sec_Chunks_Transcripts/filled_pauses.txt"
    ) as transcriptFile:
        for row in transcriptFile.readlines():
            transcript.append(row.strip().split(', '))

    # Index the audio files once instead of re-globbing the directory for
    # every transcript line (the original rescan was O(files x lines)).
    audioPaths = {
        os.path.basename(path)[:-4]: path
        for path in sorted(glob.iglob(audioDirectory))
    }

    for line in transcript:
        name = line[0]
        actualFilledPausesCount = int(line[1])

        filePath = audioPaths.get(name)
        if filePath is None:
            continue  # no matching audio chunk for this transcript entry

        audio = audioModule.Audio(filePath=filePath)
        if audio.numberOfChannels != 1:
            audio.makeMono()

        filledPauses, timeStamps = speechAnalyzer.getFilledPausesFromAudio(
            audio)
        filledPausesCount = len(timeStamps)

        print(name, actualFilledPausesCount, filledPausesCount)

        totalNumberOfFilledPauses += actualFilledPausesCount

        # No per-detection time alignment is available: detections up to the
        # true count are taken as correct, any excess as false alarms.
        if filledPausesCount > actualFilledPausesCount:
            totalNumberOfFalseAlarms += filledPausesCount - actualFilledPausesCount
            totalNumberOfCorrectlyDetectedPauses += actualFilledPausesCount
        else:
            totalNumberOfCorrectlyDetectedPauses += filledPausesCount

    # Guard degenerate runs (no detections / no labelled pauses) so the
    # summary reports zeros instead of raising ZeroDivisionError.
    detections = totalNumberOfCorrectlyDetectedPauses + totalNumberOfFalseAlarms
    precision = (totalNumberOfCorrectlyDetectedPauses /
                 detections if detections else 0.0)
    recall = (totalNumberOfCorrectlyDetectedPauses /
              totalNumberOfFilledPauses if totalNumberOfFilledPauses else 0.0)
    f1 = (2 * precision * recall / (precision + recall)
          if (precision + recall) else 0.0)

    print(" Total | Filled pauses:", totalNumberOfFilledPauses)
    print(" New | Correct filled pauses:",
          totalNumberOfCorrectlyDetectedPauses, "False alarms:",
          totalNumberOfFalseAlarms, "Precision:", precision, "Recall:",
          recall, "F1", f1)
def validateWithCCHP():
    """Score filled-pause detection against the CCHP English transcripts.

    For each participant directory, pairs every .wav with its same-named .xml
    transcript, counts transcribed "uh"/"um"/"mm" tokens as ground truth, and
    prints per-file counts plus per-participant totals.
    """
    corpusTopLevelPath = "../media/cchp_english/"
    speechAnalyzer = speechAnalysis.SpeechAnalyzer()

    # Iterate through sub directories with participants.
    for participantPath in sorted(glob.iglob(corpusTopLevelPath + '*/')):
        totalNumberOfFilledPauses = 0
        totalNumberOfCorrectlyDetectedPauses = 0
        totalNumberOfFalseAlarms = 0

        # Find the audio files for each condition.
        for filePath in sorted(glob.iglob(participantPath + "*.wav")):
            fileName = os.path.basename(filePath)[:-4]

            # Find the matching transcript
            for transcriptPath in sorted(glob.iglob(participantPath +
                                                    "*.xml")):
                transcriptName = os.path.basename(transcriptPath)[:-4]

                if fileName == transcriptName:
                    # Grab the number of filled pauses. Use a context manager
                    # so the handle is closed — the original leaked one open
                    # file per transcript via open(...).read().
                    with open(transcriptPath, 'r') as transcriptFile:
                        transcriptText = transcriptFile.read()
                    actualFilledPausesCount = transcriptText.count(
                        "uh</T>") + transcriptText.count(
                            "um</T>") + transcriptText.count("mm</T>")

                    audio = audioModule.Audio(filePath=filePath)
                    if audio.numberOfChannels == 2:
                        audio.makeMono()

                    _, timeStamps = speechAnalyzer.getFilledPausesFromAudio(
                        audio)
                    algorithmFilledPauseCount = len(timeStamps)

                    totalNumberOfFilledPauses += actualFilledPausesCount

                    # Count detections up to the true count as correct and
                    # any excess as false alarms (no time alignment).
                    if algorithmFilledPauseCount > actualFilledPausesCount:
                        totalNumberOfFalseAlarms += algorithmFilledPauseCount - actualFilledPausesCount
                        totalNumberOfCorrectlyDetectedPauses += actualFilledPausesCount
                    else:
                        totalNumberOfCorrectlyDetectedPauses += algorithmFilledPauseCount

                    print(fileName, actualFilledPausesCount,
                          algorithmFilledPauseCount)

        # precision = totalNumberOfCorrectlyDetectedPauses / (totalNumberOfCorrectlyDetectedPauses + totalNumberOfFalseAlarms)
        # recall = totalNumberOfCorrectlyDetectedPauses / totalNumberOfFilledPauses
        #
        # f1 = 2 * precision * recall / (precision + recall)

        print(" Total | Filled pauses:", totalNumberOfFilledPauses)
        print(" New | Correct filled pauses:",
              totalNumberOfCorrectlyDetectedPauses, "False alarms:",
              totalNumberOfFalseAlarms)
def testingCCHP():
    """Run filled-pause detection on a single CCHP recording and print the
    detected time stamps.
    """
    analyzer = speechAnalysis.SpeechAnalyzer()

    # audio = audioModule.Audio(filePath="../media/SBC001.wav")
    recording = audioModule.Audio(
        filePath="../media/cchp_english/p102/p102_en_pd.wav")
    if recording.numberOfChannels != 1:
        recording.makeMono()

    _, detectedTimes = analyzer.getFilledPausesFromAudio(recording)
    print(detectedTimes)
def getFeaturesFromFile():
    """Run filled-pause detection on one CCHP recording and print how many
    pauses were found.
    """
    path = "../media/cchp_english/p102/p102_en_pd.wav"

    recording = audioModule.Audio(filePath=path)
    recording.makeMono()

    print(path)

    analyzer = speechAnalysis.SpeechAnalyzer()
    _, detectedTimes = analyzer.getFilledPausesFromAudio(recording)
    print(len(detectedTimes))
def createSlicesFromPausesWithParticipants():
    """Cut short .wav slices around every detected filled pause in the
    first-five participant set.

    Writes two clips per detection: one starting 100 ms before the pause,
    and an "[extra]" clip starting a full second before for more context.
    """
    audioDirectory = "../media/Participant_Audio_First_five/*.wav"
    outputDir = "./filledPauses/"

    for filePath in sorted(glob.iglob(audioDirectory)):
        # Audio file i/o
        name = os.path.basename(filePath)[:-4]
        participant = name.split("_")[0]
        condition = name.split("_")[1]

        # # Make fresh directories
        # os.mkdir(outputDir + name)

        print(participant, condition)

        audio = audioModule.Audio(filePath=filePath)
        audio.makeMono()

        # NOTE(review): utteranceWindowSize and the other utterance*
        # parameters are not defined in this function — presumably
        # module-level constants; verify they exist before running.
        filledPauses = featureModule.getFilledPauses(
            audio.data, audio.sampleRate, utteranceWindowSize,
            utteranceStepSize, utteranceMinimumLength,
            utteranceF1MaximumVariance, utteranceF2MaximumVariance,
            utteranceEnergyThreshold)

        # Re-load via pydub for millisecond-indexed slicing/export.
        audio = AudioSegment.from_wav(filePath)

        for time in filledPauses:
            ### Output files - pydub is in ms
            outputPath = outputDir + name + "/" + str(round(time, 2))

            # move back 100 ms
            start = (time - 0.1) * 1000
            # grab a second
            end = (time + 1) * 1000
            segment = audio[start:end]

            # write to disk
            segment.export(outputPath + ".wav", format="wav")

            # move back a whole second for more context
            start = (time - 1) * 1000
            segment = audio[start:end]

            # write to disk
            segment.export(outputPath + "[extra].wav", format="wav")

        # --
        print("Done with ", name)
        print(len(filledPauses))
def getFeaturesFromSlices():
    """Re-run filled-pause detection on the exported "[extra]" slices for
    participant p3_ol and print the detected time stamps per slice.
    """
    analyzer = speechAnalysis.SpeechAnalyzer()

    for slicePath in sorted(glob.iglob("./filledPauses/p3_ol/*extra].wav")):
        print(slicePath)

        recording = audioModule.Audio(filePath=slicePath)
        recording.makeMono()

        _, detectedTimes = analyzer.getFilledPausesFromAudio(recording)
        print(detectedTimes)
def createSlicesFromPausesOnCCHP():
    """Export a 1.1-second .wav slice around every filled pause detected in
    the CCHP corpus, grouped by participant and condition.
    """
    audioDirectory = "../media/cchp_english/*/*.wav"
    outputDir = "../validation/results/filledPausesSlices/"

    speechAnalyzer = speechAnalysis.SpeechAnalyzer()

    # # Make fresh directories
    # for filePath in sorted(glob.iglob("../media/cchp_english/*/")):
    #     participantDir = filePath.split("/")[-2]
    #     # os.mkdir(outputDir + participantDir + "/")
    #     for condition in ["pd", "ra", "tn"]:
    #         # os.mkdir(outputDir + participantDir + "/" + condition + "/")

    for filePath in sorted(glob.iglob(audioDirectory)):
        participantDir = filePath.split("/")[-2]

        # Audio file i/o
        name = os.path.basename(filePath)[:-4]
        # File names look like p102_en_pd -> third token is the condition.
        condition = name.split("_")[2]

        audio = audioModule.Audio(filePath=filePath)
        if audio.numberOfChannels != 1:
            audio.makeMono()

        filledPauses, timeStamps = speechAnalyzer.getFilledPausesFromAudio(
            audio)

        print(name)

        # Re-load via pydub for millisecond-indexed slicing/export.
        audioSegment = AudioSegment.from_wav(filePath)

        for time in timeStamps:
            ### Output files - pydub is in ms
            outputPath = outputDir + participantDir + "/" + condition + "/" + name + "-" + str(
                round(time, 2))
            print(outputPath)

            # move back 100 ms
            start = (time - 0.1) * 1000
            # grab a second
            end = (time + 1) * 1000
            segment = audioSegment[start:end]

            # write to disk
            segment.export(outputPath + ".wav", format="wav")

        # --
        print("Done with ", name)
def parameterSweepP103CCHP():
    """Sweep filledPauseMinimumLength from 50 to 250 ms (step 10) over
    participant p103's CCHP recordings, exporting one audio slice per
    detection into a per-threshold directory.
    """
    audioDirectory = "../media/cchp_english/p103/*.wav"
    outputDir = "../validation/results/p103sweep/"

    speechAnalyzer = speechAnalysis.SpeechAnalyzer()

    for minimumLength in list(range(50, 260, 10)):
        print("threshold:", minimumLength)
        speechAnalyzer.filledPauseMinimumLength = minimumLength

        # # Make fresh directories
        # os.mkdir(outputDir + str(minimumLength) + "/")
        # for condition in ["pd", "ra", "tn"]:
        #     os.mkdir(outputDir + str(minimumLength) + "/" + condition + "/")

        for filePath in sorted(glob.iglob(audioDirectory)):
            participantDir = filePath.split("/")[-2]

            # Audio file i/o
            name = os.path.basename(filePath)[:-4]
            # File names look like p103_en_pd -> third token is the condition.
            condition = name.split("_")[2]

            audio = audioModule.Audio(filePath=filePath)
            if audio.numberOfChannels != 1:
                audio.makeMono()

            filledPauses, timeStamps = speechAnalyzer.getFilledPausesFromAudio(
                audio)

            print(name, len(timeStamps))

            # Re-load via pydub for millisecond-indexed slicing/export.
            audioSegment = AudioSegment.from_wav(filePath)

            for time in timeStamps:
                ### Output files - pydub is in ms
                outputPath = outputDir + str(
                    minimumLength) + "/" + condition + "/" + name + "-" + str(
                        round(time, 2))

                # move back 100 ms
                start = (time - 0.1) * 1000
                # grab a second
                end = (time + 1) * 1000
                segment = audioSegment[start:end]

                # write to disk
                segment.export(outputPath + ".wav", format="wav")
def compareAlgorithmToDataset():
    """Evaluate filled-pause detection on Dr. Smart's labelled dataset and
    print detection counts, precision, recall, and a combined score.
    """
    print("Running on Dr. Smart's Dataset")
    speechAnalyzer = speechAnalysis.SpeechAnalyzer()
    printParameters(speechAnalyzer)

    directory = '../media/drSmartAudio'
    dataset = []

    numberOfAccurateDetections = 0
    numberOfDetections = 0
    trueNumberOfFilledPauses = 0

    # Load the dataset info for training
    with open(directory + '/metadata.csv', 'r') as csvfile:
        reader = csv.reader(csvfile)
        dataset.extend(reader)

    # Remove header
    dataset.pop(0)

    startTime = time.time()

    for audioFile in dataset:
        # Column 0 is a path relative to `directory`, column 1 the labelled
        # filled-pause count.
        audio = audioModule.Audio(filePath=directory + audioFile[0])

        filledPauses, timeStamps = speechAnalyzer.getFilledPausesFromAudio(
            audio)

        # NOTE(review): a file's full labelled count is credited as accurate
        # whenever the detector found at least that many pauses — no time
        # alignment is checked. Confirm this is the intended scoring.
        if int(audioFile[1]) <= len(timeStamps):
            numberOfAccurateDetections += int(audioFile[1])

        trueNumberOfFilledPauses += int(audioFile[1])
        numberOfDetections += len(timeStamps)

    print()
    print(" Time to run:", time.time() - startTime)
    print(" Detections:", numberOfDetections, "Accurate detections:",
          numberOfAccurateDetections, "Total filled pauses:",
          trueNumberOfFilledPauses)
    print(" Precision:", numberOfAccurateDetections / numberOfDetections)
    print(" Recall:", numberOfAccurateDetections / trueNumberOfFilledPauses)
    print(" Score: ", (numberOfAccurateDetections / numberOfDetections) *
          (numberOfAccurateDetections / trueNumberOfFilledPauses))
    print()
def runAlgorithmOnSlices():
    """Run filled-pause detection over every exported "[extra]" slice found
    under outputDir and print the detected time stamps for each.
    """
    analyzer = speechAnalysis.SpeechAnalyzer()

    # NOTE(review): outputDir is not defined in this function — presumably a
    # module-level constant (other functions in this file use
    # "./filledPauses/"); verify before running.
    for subdir, dirs, files in os.walk(outputDir):
        for file in files:
            filePath = os.path.join(subdir, file)

            # Only the longer "[extra]" context slices are analyzed.
            if "[extra].wav" in filePath:
                print(filePath)

                name = os.path.basename(filePath)[:-4]

                audio = audioModule.Audio(filePath=filePath)
                audio.makeMono()

                filledPauses, timeStamps = analyzer.getFilledPausesFromAudio(
                    audio)
                print(timeStamps)
def runAlgorithmOnParticipants():
    """Count filled pauses per participant and load condition (ul/nl/ol) and
    write the per-participant counts to a CSV file.
    """
    underLoadFilledPauses = 0
    normalLoadFilledPauses = 0
    overLoadFilledPauses = 0
    participantCount = 30

    directory = "../media/Participant_Audio/"

    # First row is the CSV header.
    filledPausesForParticipant = [["participant", "ul", "nl", "ol"]]

    for participantNumber in range(1, participantCount + 1):
        participantData = [participantNumber]

        for condition in ["ul", "nl", "ol"]:
            filePath = directory + "p" + str(
                participantNumber) + "_" + condition + ".wav"

            # p8_nl is skipped — presumably a known-bad or missing recording;
            # confirm.
            if filePath != "../media/Participant_Audio/p8_nl.wav":
                print(filePath)

                audio = audioModule.Audio(filePath=filePath)
                audio.makeMono()

                # NOTE(review): utteranceWindowSize and the other utterance*
                # parameters are not defined in this function — presumably
                # module-level constants; verify they exist.
                filledPauses = featureModule.getFilledPauses(
                    audio.data, audio.sampleRate, utteranceWindowSize,
                    utteranceStepSize, utteranceMinimumLength,
                    utteranceF1MaximumVariance, utteranceF2MaximumVariance,
                    utteranceEnergyThreshold)

                participantData.append(len(filledPauses))
                print(" ", len(filledPauses))

        print(participantData)
        filledPausesForParticipant.append(participantData)
        print(filledPausesForParticipant)

    with open('./filledPauses/filledPausesForParticipant.csv',
              'w') as outputFile:
        writer = csv.writer(outputFile)
        for row in filledPausesForParticipant:
            writer.writerow(row)
def runAlgorithmOnDataset():
    """Detect filled pauses in every file listed in the Dr. Smart dataset
    metadata and print the per-file detection count.
    """
    directory = '../media/drSmartAudio'
    analyzer = speechAnalysis.SpeechAnalyzer()

    # Load the dataset rows, then drop the CSV header.
    with open(directory + '/metadata.csv', 'r') as csvfile:
        entries = list(csv.reader(csvfile))
    entries.pop(0)

    for entry in entries:
        relativePath = entry[0]
        print(relativePath)

        recording = audioModule.Audio(filePath=directory + relativePath)
        _, detectedTimes = analyzer.getFilledPausesFromAudio(recording)
        print(len(detectedTimes))
def showVoiceActivityAndSyllablesForParticipantAudio():
    """Plot voice activity, syllables, energy, zero-crossing rate, and pitch
    together for every participant recording.
    """
    audioDirectory = "../media/Participant_Audio/*.wav"
    speechAnalyzer = speechAnalysis.SpeechAnalyzer()

    for filePath in sorted(glob.iglob(audioDirectory)):
        name = os.path.basename(filePath)[:-4]

        audio = audioModule.Audio(filePath=filePath)
        if audio.numberOfChannels != 1:
            audio.makeMono()

        print("Getting voice activity...")
        voiceActivity = speechAnalyzer.getVoiceActivityFromAudio(audio)
        voiceActivity[voiceActivity == 0] = np.nan

        # Widen each active region by ~100 ms worth of frames on both sides.
        voiceActivityBufferSize = int(100 / speechAnalyzer.featureStepSize)
        voiceActivityBuffered = featureModule.createBufferedBinaryArrayFromArray(
            voiceActivity == 1,
            voiceActivityBufferSize).astype(int).astype(float)
        voiceActivityBuffered[voiceActivityBuffered == 0] = np.nan

        print("Getting syllables...")
        syllables, _ = speechAnalyzer.getSyllablesFromAudio(audio)
        syllableMarkers = np.full(len(syllables), 0)

        print("Getting other features...")
        energy = featureModule.getEnergy(audio.data, audio.sampleRate,
                                         speechAnalyzer.syllableWindowSize,
                                         speechAnalyzer.featureStepSize)
        energyMinThreshold = featureModule.getEnergyMinimumThreshold(energy)
        fractionEnergyMinThreshold = energyMinThreshold / max(energy)

        zcr = librosa.feature.zero_crossing_rate(
            audio.data,
            frame_length=int(audio.sampleRate / 1000 *
                             speechAnalyzer.featureStepSize),
            hop_length=int(audio.sampleRate / 1000 *
                           speechAnalyzer.featureStepSize))[0]
        zcrTimes = np.arange(0, len(audio.data) / audio.sampleRate + 1,
                             speechAnalyzer.featureStepSize / 1000)[:len(zcr)]

        pitch = speechAnalyzer.getPitchFromAudio(audio)
        pitch[pitch == 0] = np.nan
        pitchTimes = np.arange(0, len(audio.data) / audio.sampleRate,
                               speechAnalyzer.featureStepSize /
                               1000)[:len(pitch)]

        times = np.arange(0, len(audio.data) / audio.sampleRate,
                          speechAnalyzer.featureStepSize / 1000)
        energyTimes = np.arange(0, len(audio.data) / audio.sampleRate,
                                speechAnalyzer.featureStepSize /
                                1000)[:len(energy)]

        print("Graphing!")
        # Features are scaled ad hoc (zcr*1000, energy/5, buffered VA*-5)
        # purely so they share one axis.
        plt.figure(figsize=[16, 8])
        plt.plot(zcrTimes, zcr * 1000, 'gold')
        plt.plot(times, energy / 5)
        plt.plot(pitchTimes, pitch, 'red')
        plt.plot(syllables, syllableMarkers, 'go')
        plt.plot(times, voiceActivityBuffered * -5, 'darkorange')
        plt.plot(times, voiceActivity, 'purple')
        plt.title(name)
        plt.show()
def getFeaturesFromFileUsingWindowing():
    """Slide a 5-second look-back window over one recording and plot its
    features whenever a detected syllable falls outside voice activity.
    """
    filePath = "../media/Participant_Audio/p3_ol.wav"
    name = os.path.basename(filePath)[:-4]

    speechAnalyzer = speechAnalysis.SpeechAnalyzer()
    speechAnalyzer.lookBackSize = 5  # seconds of history per analysis window

    # Read in the file, extract data and metadata
    audio = audioModule.Audio(filePath)
    if audio.numberOfChannels > 1:
        audio.makeMono()

    # Set up time tracker
    seconds = np.zeros(shape=0)

    step = 0
    sampleStepSize = int(speechAnalyzer.stepSize * audio.sampleRate)
    sampleLookBackSize = int(speechAnalyzer.lookBackSize * audio.sampleRate)

    while step < audio.length:
        # Keep track of what second we're in
        print("Second:", step / audio.sampleRate)

        # Look backward to calculate features over long term
        if step + sampleStepSize - sampleLookBackSize > 0:
            currentWindow = audioModule.Audio(
                data=audio.data[step + sampleStepSize -
                                sampleLookBackSize:step + sampleStepSize])
            currentWindow.sampleRate = audio.sampleRate

            ### WORDS PER MINUTE
            syllables = speechAnalyzer.getSyllablesFromAudio(currentWindow)[0]
            syllableMarkers = np.full(len(syllables), 0)

            ### VAD
            voiceActivity = speechAnalyzer.getVoiceActivityFromAudio(
                currentWindow)

            ### INTENSITY
            energy = featureModule.getEnergy(currentWindow.data,
                                             currentWindow.sampleRate,
                                             speechAnalyzer.syllableWindowSize,
                                             speechAnalyzer.featureStepSize)
            energyMinThreshold = featureModule.getEnergyMinimumThreshold(
                energy)
            fractionEnergyMinThreshold = energyMinThreshold / max(energy)

            ### PITCH
            pitch = featureModule.getPitch(currentWindow.data,
                                           currentWindow.sampleRate,
                                           speechAnalyzer.featureStepSize,
                                           fractionEnergyMinThreshold)

            # Mark each syllable in a per-frame binary array. The index
            # expression algebraically reduces to
            # timeStamp * 1000 / featureStepSize (seconds -> frame index):
            # the sampleRate factors cancel.
            syllableBinaryArray = np.full(len(voiceActivity), 0)
            for timeStamp in syllables:
                syllableBinaryArray[int(
                    timeStamp / (currentWindow.sampleRate / 1000 *
                                 speechAnalyzer.featureStepSize) *
                    currentWindow.sampleRate)] = 1

            # Mask out all filled pauses that coincide with voice acitivty
            syllableBinaryArray[voiceActivity.astype(bool)] = 0

            # Only graph windows with at least one syllable outside VAD.
            if max(syllableBinaryArray) >= 1:
                # Clean up va for graphing
                voiceActivity[voiceActivity == 0] = np.nan
                pitch[pitch == 0] = np.nan

                pitchTimes = np.arange(
                    0,
                    len(currentWindow.data) / currentWindow.sampleRate,
                    speechAnalyzer.featureStepSize / 1000)[:len(pitch)]
                energyTimes = np.arange(
                    0,
                    len(currentWindow.data) / currentWindow.sampleRate,
                    speechAnalyzer.featureStepSize / 1000)[:len(energy)]
                times = np.arange(
                    0,
                    len(currentWindow.data) / currentWindow.sampleRate,
                    speechAnalyzer.featureStepSize / 1000)

                plt.figure(figsize=[16, 8])
                plt.plot(times, energy / 10, pitchTimes, pitch)
                plt.plot(times, voiceActivity)
                plt.plot(syllables, syllableMarkers, 'r^')
                plt.title(name + " from " +
                          str(step / audio.sampleRate -
                              speechAnalyzer.lookBackSize) + " to " +
                          str(step / audio.sampleRate) + " seconds")
                # plt.savefig("./syllablesVersusVAD/" + name + "_" + str(step/audio.sampleRate - speechAnalyzer.lookBackSize) + "-" + str(step/audio.sampleRate) + "_seconds.png")
                plt.show()

        # Increment to next step
        step += sampleStepSize
def showFeatures():
    """Plot energy, pitch, voice activity, syllables, and filled pauses for
    one recording, with non-energy features masked to (buffered) voiced
    regions.
    """
    filePath = "../media/Participant_Audio/p3_ol.wav"
    name = os.path.basename(filePath)[:-4]

    speechAnalyzer = speechAnalysis.SpeechAnalyzer()

    # Read in the file, extract data and metadata
    audio = audioModule.Audio(filePath)
    if audio.numberOfChannels > 1:
        audio.makeMono()

    # NOTE(review): this second SpeechAnalyzer shadows the one created above
    # — appears redundant.
    speechAnalyzer = speechAnalysis.SpeechAnalyzer()

    ### AMPLITUDE
    energy = speechAnalyzer.getEnergyFromAudio(audio)

    ### PITCH
    pitches = speechAnalyzer.getPitchFromAudio(audio, energy)

    ### VAD
    voiceActivity = speechAnalyzer.getVoiceActivityFromAudio(audio, pitches)

    ### SYLLABLES
    syllables = speechAnalyzer.getSyllablesFromAudio(audio,
                                                     pitches)[0].astype(float)

    # ### FILLED PAUSES
    filledPauses = speechAnalyzer.getFilledPausesFromAudio(audio)[0].astype(
        float)

    # Mask features with voice activity
    bufferFrames = int(speechAnalyzer.voiceActivityMaskBufferSize /
                       speechAnalyzer.featureStepSize)
    mask = np.invert(
        featureModule.createBufferedBinaryArrayFromArray(
            voiceActivity.astype(bool), bufferFrames))

    # energy[mask[:len(energy)]] = 0
    pitches[mask[:len(pitches)]] = 0
    syllables[mask[:len(syllables)]] = 0
    filledPauses[mask[:len(filledPauses)]] = 0

    # Graphing — zeros become NaN so matplotlib leaves gaps instead of
    # drawing baseline segments.
    pitches[pitches == 0] = np.nan
    voiceActivity[voiceActivity == 0] = np.nan
    syllables[syllables == 0] = np.nan
    filledPauses[filledPauses == 0] = np.nan

    pitchTimes = np.arange(0, len(audio.data) / audio.sampleRate,
                           speechAnalyzer.featureStepSize /
                           1000)[:len(pitches)]
    times = np.arange(0, len(audio.data) / audio.sampleRate,
                      speechAnalyzer.featureStepSize / 1000)

    plt.figure(figsize=[16, 8])
    plt.plot(times, energy / 10, pitchTimes, pitches)
    plt.plot(times, voiceActivity, 'orchid')
    plt.plot(times[:len(syllables)], syllables, color='c', marker='^')
    plt.plot(times[:len(filledPauses)], filledPauses, 'ro')
    plt.title(name)
    # plt.savefig("./syllablesVersusVAD/" + name + "_" + str(step/audio.sampleRate - speechAnalyzer.lookBackSize) + "-" + str(step/audio.sampleRate) + "_seconds.png")
    plt.show()
def compareAlgorithmToSlices():
    """Evaluate the detector against manually rated filled-pause slices and
    print precision, recall, and a combined score.
    """
    print("Running on slices")
    speechAnalyzer = speechAnalysis.SpeechAnalyzer()
    printParameters(speechAnalyzer)

    # control* = manual ratings; yeses/nos = detector hits on rated slices.
    controlYeses = 0
    controlNos = 0
    yeses = 0
    nos = 0

    startTime = time.time()

    # Compare with file of all existing
    with open('./filledPauses/filledPausesAllParticipantsRatings.csv'
              ) as csvfile:
        reader = csv.DictReader(csvfile)

        # Go through each existing filled pause
        for row in reader:
            participant = row['participant']
            condition = row['condition']
            timeStamp = row['time']
            judgement = row['judgement']

            # Work around one truncated time stamp in the ratings file.
            if timeStamp == "862":
                timeStamp = "862.0"

            # Keep track of manual classification
            if judgement == "1":
                controlYeses += 1
            elif judgement == "-1":
                controlNos += 1

            filePath = "./filledPauses/" + participant + "_" + condition[
                1:] + "/" + timeStamp + "[extra].wav"

            # print(filePath)

            audio = audioModule.Audio(filePath=filePath)
            audio.makeMono()

            # Run algorithm
            filledPauses, timeStamps = speechAnalyzer.getFilledPausesFromAudio(
                audio)

            # The "[extra]" slice was cut 1 s before the rated event, so a
            # detection near t = 1.0 s corresponds to the original pause.
            found = False
            for timeDetected in timeStamps:
                if abs(timeDetected - 1.0) < 0.2 and not found:
                    found = True

                    if judgement == "1":
                        yeses += 1
                    elif judgement == "-1":
                        nos += 1

    print()
    print(" Time to run:", time.time() - startTime)
    print(" Detections:", (yeses + nos), "Accurate detections:", yeses,
          "Total filled pauses:", controlYeses)
    print(" Precision:", yeses / (yeses + nos))
    print(" Recall:", yeses / controlYeses)
    print(" Score: ", (yeses / controlYeses) * (yeses / (yeses + nos)))
    print()
def showSyllables():
    """Plot detected syllables and candidates together with energy, pitch,
    zero-crossing rate, and voice activity for one 30-second chunk.
    """
    # filePath = "../media/cchp_english/p102/p102_en_pd.wav"
    filePath = "../media/Participant_Audio_30_Sec_Chunks/p14_ol_chunk18.wav"
    name = os.path.basename(filePath)[:-4]

    speechAnalyzer = speechAnalysis.SpeechAnalyzer()

    audio = audioModule.Audio(filePath=filePath)
    if audio.numberOfChannels != 1:
        audio.makeMono()
    audio.description()

    syllables, candidates = speechAnalyzer.getSyllablesFromAudio(audio)
    print(len(syllables))

    # Zero-height markers so events sit on the x-axis.
    syllableMarkers = np.full(len(syllables), 0)
    candidateMarkers = np.full(len(candidates), 0)

    ### Energy
    # NOTE(review): librosa.feature.rmse was renamed to `rms` in later
    # librosa releases — this call pins an older librosa version.
    energy = librosa.feature.rmse(
        audio.data,
        frame_length=int(audio.sampleRate / 1000 *
                         speechAnalyzer.featureStepSize),
        hop_length=int(audio.sampleRate / 1000 *
                       speechAnalyzer.featureStepSize))[0]
    energyTimes = np.arange(0, len(audio.data) / audio.sampleRate,
                            speechAnalyzer.featureStepSize /
                            1000)[:len(energy)]

    energyMinThreshold = featureModule.getEnergyMinimumThreshold(energy)
    fractionEnergyMinThreshold = energyMinThreshold / max(energy)

    pitch = featureModule.getPitch(audio.data, audio.sampleRate,
                                   speechAnalyzer.featureStepSize,
                                   fractionEnergyMinThreshold)
    pitchTimes = np.arange(0, len(audio.data) / audio.sampleRate,
                           speechAnalyzer.featureStepSize / 1000)[:len(pitch)]

    zcr = librosa.feature.zero_crossing_rate(
        audio.data,
        frame_length=int(audio.sampleRate / 1000 *
                         speechAnalyzer.featureStepSize * 4),
        hop_length=int(audio.sampleRate / 1000 *
                       speechAnalyzer.featureStepSize))[0]
    zcrTimes = np.arange(0, len(audio.data) / audio.sampleRate + 1,
                         speechAnalyzer.featureStepSize / 1000)[:len(zcr)]

    voiceActivity = speechAnalyzer.getVoiceActivityFromAudio(audio)
    voiceActivity[voiceActivity == 0] = np.nan
    voiceActivityTimes = np.arange(0, len(audio.data) / audio.sampleRate,
                                   speechAnalyzer.featureStepSize /
                                   1000)[:len(voiceActivity)]
    print(len(voiceActivity), len(voiceActivityTimes))

    times = np.arange(0, len(audio.data) / audio.sampleRate,
                      speechAnalyzer.featureStepSize / 1000)
    signalTimes = np.arange(0, len(audio.data) / audio.sampleRate,
                            1 / audio.sampleRate)

    # Features are scaled ad hoc (energy/10, zcr*100) to share one axis.
    plt.figure(figsize=[16, 8])
    plt.plot(energyTimes, energy / 10, pitchTimes, pitch, zcrTimes,
             zcr * 100, candidates, candidateMarkers, 'ro')
    plt.plot(syllables, syllableMarkers, 'go')
    plt.plot(voiceActivityTimes, voiceActivity)
    plt.title(name)
    plt.show()
def validateWithTranscript():
    """Score filled-pause detection against a hand-made transcript of counts,
    suppressing detections outside buffered voice activity, and print
    precision, recall, and F1.
    """
    speechAnalyzer = speechAnalysis.SpeechAnalyzer()

    transcript = []

    totalNumberOfFilledPauses = 0
    totalNumberOfCorrectlyDetectedPauses = 0
    totalNumberOfFalseAlarms = 0

    with open("../media/filled_pauses_validation_participant_audio" +
              "/filled_pauses.txt") as transcriptFile:
        lines = transcriptFile.readlines()
        for row in lines:
            transcript.append(row.strip().split(', '))

    for line in transcript:
        name = line[0]

        # Lines starting with '#' are comments in the transcript file.
        if name[0] != "#":
            actualFilledPausesCount = int(line[1])

            path = None

            # for filePath in sorted(glob.iglob("../media/filled_pauses_validation_participant_audio/" + "*.wav")):
            #     fileName = os.path.basename(filePath)[:-4]
            #
            #     if fileName == name:
            #         path = filePath

            # NOTE(review): audioDirectory is not defined in this function —
            # presumably a module-level constant; the commented block above
            # suggests the intended directory. Verify before running.
            for filePath in sorted(glob.iglob(audioDirectory + "*.wav")):
                fileName = os.path.basename(filePath)[:-4]

                if fileName == name:
                    path = filePath

            if path:
                audio = audioModule.Audio(filePath=path)
                if audio.numberOfChannels != 1:
                    audio.makeMono()

                filledPauses, timeStamps = speechAnalyzer.getFilledPausesFromAudio(
                    audio)

                if True:
                    # Suppress detections outside (buffered) voice activity.
                    voiceActivity = speechAnalyzer.getVoiceActivityFromAudio(
                        audio)
                    bufferFrames = int(
                        speechAnalyzer.voiceActivityMaskBufferSize /
                        speechAnalyzer.featureStepSize)
                    mask = np.invert(
                        featureModule.createBufferedBinaryArrayFromArray(
                            voiceActivity.astype(bool), bufferFrames))
                    filledPauses[mask] = 0

                filledPausesMarkers = np.full(int(sum(filledPauses)), 0)
                filledPausesCount = int(sum(filledPauses))

                print(name, "\t", actualFilledPausesCount, filledPausesCount,
                      timeStamps)

                totalNumberOfFilledPauses += actualFilledPausesCount

                # Count detections up to the true count as correct and any
                # excess as false alarms (no time alignment).
                if filledPausesCount > actualFilledPausesCount:
                    totalNumberOfFalseAlarms += filledPausesCount - actualFilledPausesCount
                    totalNumberOfCorrectlyDetectedPauses += actualFilledPausesCount
                else:
                    totalNumberOfCorrectlyDetectedPauses += filledPausesCount

    precision = totalNumberOfCorrectlyDetectedPauses / (
        totalNumberOfCorrectlyDetectedPauses + totalNumberOfFalseAlarms)
    recall = totalNumberOfCorrectlyDetectedPauses / totalNumberOfFilledPauses

    fMeasure = 2 * precision * recall / (precision + recall)

    print(" Total | Filled pauses:", totalNumberOfFilledPauses)
    print(" New | Correct filled pauses:",
          totalNumberOfCorrectlyDetectedPauses, "False alarms:",
          totalNumberOfFalseAlarms, "Precision:", precision, "Recall:",
          recall, "F1", fMeasure)
def compareEnergyAndIntensity():
    """Compare three loudness measures for one participant file on a shared
    time axis: Parselmouth intensity, hand-rolled short-term RMS energy, and
    librosa RMS.
    """
    filePath = "../media/Participant_Audio/p10_ol.wav"
    name = os.path.basename(filePath)[:-4]

    stepSize = 10  # In milliseconds
    windowSize = 10  # In milliseconds

    audio = audioModule.Audio(filePath=filePath)
    if audio.numberOfChannels != 1:
        audio.makeMono()

    stepSizeInSamples = int(audio.sampleRate / 1000 * stepSize)
    windowSizeInSamples = int(audio.sampleRate / 1000 * windowSize)

    # Parselmouth intensity
    parselSound = parselmouth.Sound(values=audio.data,
                                    sampling_frequency=audio.sampleRate)
    intensityObject = parselSound.to_intensity(minimum_pitch=50.0,
                                               time_step=stepSize / 1000)
    intensity = intensityObject.values.T

    # Windowed root-mean-square energy, computed by hand.
    shortTermEnergy = np.array([
        math.sqrt(
            sum(audio.data[step:step + windowSizeInSamples]**2) /
            windowSizeInSamples)
        for step in range(0, len(audio.data), stepSizeInSamples)
    ])

    # Librosa rms. (The original also built a hand-rolled sum-of-squares
    # array named `rms` here and immediately overwrote it with this call —
    # dead O(n) work, removed.)
    rms = librosa.feature.rms(audio.data,
                              frame_length=windowSizeInSamples,
                              hop_length=stepSizeInSamples)[0]

    # Current intensity measure
    amplitude = np.absolute(audio.data)

    # Each series is clipped to its own length so the time axes align.
    intensityTimes = np.arange(0, len(audio.data) / audio.sampleRate,
                               stepSize / 1000)[:len(intensity)]
    shortTermEnergyTimes = np.arange(0, len(audio.data) / audio.sampleRate,
                                     stepSize / 1000)[:len(shortTermEnergy)]
    rmsTimes = np.arange(0, len(audio.data) / audio.sampleRate,
                         stepSize / 1000)[:len(rms)]
    signalTimes = np.arange(0, len(audio.data) / audio.sampleRate,
                            1 / audio.sampleRate)

    plt.figure(figsize=[16, 8])
    # plt.plot(signalTimes, amplitude / 2)
    plt.plot(shortTermEnergyTimes, shortTermEnergy)
    plt.plot(rmsTimes, rms)
    plt.plot(intensityTimes, intensity * 100)
    plt.title(name)
    plt.show()