import io
import os
from os.path import join

# NOTE: the import paths below are assumptions about the surrounding project
# layout (pyacoustics-style helpers and an older praatio); adjust them to
# wherever utils, uwe_sr, and tgio actually live.
from pyacoustics.utilities import utils
from pyacoustics.speech_rate import uwe_sr
from praatio import tgio


def toAbsoluteTime(namePrefix, matlabOutputPath, startTimeList):
    '''
    Converts the sampled times from relative to absolute time

    The input may be split across a number of files.  This function assumes
    that files of the pattern <<namePrefix>><<nameSuffix>>.txt correspond to
    different parts of the same source file.

    namePrefix - name of the original wav file with no suffix
    matlabOutputPath - the path where the output of the matlab script is
                       placed
    startTimeList - one start time for each file in matlabOutputPath that
                    matches namePrefix

    Returns a list of lists, where each sublist corresponds to the output of
    one file matching <<namePrefix>>
    '''
    # Load subset speech rate
    speechRateFNList = utils.findFiles(matlabOutputPath, filterExt=".txt",
                                       filterPattern=namePrefix)

    returnList = []
    for start, speechRateFN in utils.safeZip([startTimeList,
                                              speechRateFNList],
                                             enforceLength=True):
        speechRateList = utils.openCSV(matlabOutputPath, speechRateFN,
                                       valueIndex=0)
        speechRateList = [value for value in speechRateList if value != '']
        speechRateList = [str(float(start) + float(sampNum))
                          for sampNum in speechRateList]
        returnList.append(speechRateList)

    return returnList
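
# A minimal usage sketch for toAbsoluteTime(); illustrative only.  The name
# prefix, folder name, and start times below are hypothetical.
def _exampleToAbsoluteTime():
    # Suppose "session1_0.txt" begins 0.0 s and "session1_1.txt" begins
    # 62.5 s into the original wav file.
    startTimeList = ["0.0", "62.5"]
    absoluteTimeLists = toAbsoluteTime("session1", "matlab_output",
                                       startTimeList)

    # Each sublist is one chunk's times, shifted by that chunk's start time
    for timeList in absoluteTimeLists:
        print(",".join(timeList))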

def aggregateSpeechRate(tgInfoPath, speechRatePath, outputPath, samplingRate):
    utils.makeDir(outputPath)

    finishedList = utils.findFiles(outputPath, filterExt=".txt")
    for fn in utils.findFiles(tgInfoPath, filterExt=".txt",
                              skipIfNameInList=finishedList):

        # Load subset speech rate
        name = os.path.splitext(fn)[0]
        speechRateFNList = utils.findFiles(speechRatePath, filterExt=".txt",
                                           filterPattern=name)

        subSplitList = utils.openCSV(tgInfoPath, fn)

        # Convert the sample numbers to seconds.  They are relative to the
        # beginning of the subset they are in but need to be relative to the
        # start of the larger file that the subset originated from.
        outputList = []
        for splitInfo, speechRateFN in utils.safeZip([subSplitList,
                                                      speechRateFNList],
                                                     enforceLength=True):
            start, stop, label = splitInfo

            speechRateList = utils.openCSV(speechRatePath, speechRateFN,
                                           valueIndex=0)
            speechRateList = [value for value in speechRateList
                              if value != '']
            speechRateList = [str(float(start) +
                                  float(sampleNum) / float(samplingRate))
                              for sampleNum in speechRateList]

            outputList.append(",".join(speechRateList))

        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(outputList) + "\n")
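
# Worked example of the sample-to-seconds conversion used above; the chunk
# start time (12.5 s) and sampling rate (44100 Hz) are hypothetical.
def _exampleSampleToSeconds():
    samplingRate = 44100.0
    chunkStart = 12.5       # where this chunk begins in the full recording
    sampleNum = 22050.0     # nucleus position within the chunk, in samples
    absoluteTime = chunkStart + sampleNum / samplingRate
    print(absoluteTime)     # 13.0 -> seconds from the start of the recording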

def aggregateFeatures(featurePath, featureList, headerStr=None):
    outputDir = join(featurePath, "aggr")
    utils.makeDir(outputDir)

    fnList = []

    # Find the files that exist in all features
    for feature in featureList:
        fnSubList = utils.findFiles(join(featurePath, feature),
                                    filterExt=".txt")
        fnList.append(fnSubList)

    actualFNList = []
    for featureFN in fnList[0]:
        if all([featureFN in subList for subList in fnList]):
            actualFNList.append(featureFN)

    for featureFN in actualFNList:
        dataList = []
        for feature in featureList:
            featureDataList = utils.openCSV(join(featurePath, feature),
                                            featureFN, encoding="utf-8")
            dataList.append([",".join(row) for row in featureDataList])

        name = os.path.splitext(featureFN)[0]

        dataList.insert(0, [name for _ in range(len(dataList[0]))])
        tDataList = utils.safeZip(dataList, enforceLength=True)
        outputList = [",".join(row) for row in tDataList]
        outputTxt = "\n".join(outputList)

        outputFN = join(outputDir, name + ".csv")
        with io.open(outputFN, "w", encoding="utf-8") as fd:
            fd.write(outputTxt)

    # Cat all files together
    aggrOutput = []

    if headerStr is not None:
        aggrOutput.append(headerStr)

    for fn in utils.findFiles(outputDir, filterExt=".csv"):
        if fn == "all.csv":
            continue
        with io.open(join(outputDir, fn), "r", encoding='utf-8') as fd:
            aggrOutput.append(fd.read())

    with io.open(join(outputDir, "all.csv"), "w", encoding='utf-8') as fd:
        fd.write("\n".join(aggrOutput))
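
# Illustration of the transpose-by-zipping step above, using plain zip() in
# place of utils.safeZip() and hypothetical values: dataList holds one column
# per feature (preceded by the name column), and zipping the columns yields
# one csv row per data point.
def _exampleAggregateRows():
    dataList = [["file1", "file1"],        # name column
                ["0.5,0.6", "0.7,0.8"],    # feature 1 (two values per row)
                ["10", "12"]]              # feature 2
    for row in zip(*dataList):
        print(",".join(row))               # e.g. "file1,0.5,0.6,10"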

def manualPhoneCountForEpochs(manualCountsPath, tgInfoPath, epochPath,
                              outputPath):
    utils.makeDir(outputPath)

    skipList = utils.findFiles(outputPath, filterExt=".txt")
    for fn in utils.findFiles(tgInfoPath, filterExt=".txt",
                              skipIfNameInList=skipList):

        epochList = utils.openCSV(epochPath, fn)
        tgInfo = utils.openCSV(tgInfoPath, fn)
        manualCounts = utils.openCSV(manualCountsPath, fn)

        epochOutputList = []
        for epochTuple in epochList:
            # Epoch num, start, stop
            epochStart, epochStop = float(epochTuple[1]), float(epochTuple[2])

            # Find all of the intervals that are at least partially
            # contained within the current epoch
            epochSyllableCount = 0
            epochPhoneCount = 0
            speechDuration = 0
            for info, counts in utils.safeZip([tgInfo, manualCounts],
                                              enforceLength=True):
                start, stop = float(info[0]), float(info[1])
                syllableCount, phoneCount = float(counts[0]), float(counts[1])

                # Accounts for intervals that straddle an epoch boundary
                multiplicationFactor = percentInside(start, stop,
                                                     epochStart, epochStop)

                speechDuration += (stop - start) * multiplicationFactor
                epochSyllableCount += syllableCount * multiplicationFactor
                epochPhoneCount += phoneCount * multiplicationFactor

            epochOutputList.append("%f,%f,%f" % (epochSyllableCount,
                                                 epochPhoneCount,
                                                 speechDuration))

        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(epochOutputList))
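
# Neither percentInside() (used above) nor _percentInside() (used further
# below) is defined in this file.  The sketch below is an assumption about
# the intended behaviour, inferred from how the return value is used -- as a
# weighting factor for counts and durations that straddle an epoch boundary.
# Replace it with the project's own implementation if one exists.
def percentInside(start, stop, epochStart, epochStop):
    '''
    Fraction of the interval [start, stop] that falls within
    [epochStart, epochStop]; returns 0.0 if the two do not overlap
    '''
    overlap = min(float(stop), float(epochStop)) - max(float(start),
                                                       float(epochStart))
    duration = float(stop) - float(start)
    if overlap <= 0 or duration <= 0:
        return 0.0

    return overlap / duration


# Alias for the underscore-prefixed name used by uwePhoneCountForEpochs()
_percentInside = percentInside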

def _calculateSyllablesPerSecondForIntervals(wavPath, tgPath, tierName,
                                             syllableNucleiPath):
    # Add syllable nuclei to textgrids
    for name in utils.findFiles(wavPath, filterExt=".wav", stripExt=True):

        tg = tgio.openTextGrid(join(tgPath, name + ".TextGrid"))
        entryList = tg.tierDict[tierName].entryList
        startTimeList = [entry[0] for entry in entryList]
        nucleusSyllableList = uwe_sr.toAbsoluteTime(name, syllableNucleiPath,
                                                    startTimeList)

        durationList = []
        for intervalList, entry in utils.safeZip([nucleusSyllableList,
                                                  entryList],
                                                 enforceLength=True):
            start, stop = entry[0], entry[1]
            duration = len(intervalList) / (stop - start)
            durationList.append(str(duration))

        print("%s - %s (syllables/second for each interval)"
              % (name, ",".join(durationList)))
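
# Worked example of the rate computation above (hypothetical numbers): an
# interval spanning 5.0 s to 9.0 s that contains 10 detected syllable nuclei
# is reported as 10 / (9.0 - 5.0) = 2.5 syllables per second.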

def analyzeLaughter(textgridPath, outputPath):
    utils.makeDir(outputPath)

    speechTierName = "Mother"
    laughterTierName = "Mother's Backchannel"

    speechCode = "MS"
    laughterCode = "LA"
    pauseCode = "FP"

    # How much did each event occur?
    allCodeSummaryList = []
    for tierName, code, outputName in [
            [speechTierName, speechCode, "speech_occurrences"],
            [laughterTierName, laughterCode, "laughter_occurrences"],
            [speechTierName, pauseCode, "pause_code"],
            ]:
        entryList = []
        summaryList = []
        for fn in utils.findFiles(textgridPath, filterExt=".TextGrid"):
            tg = tgio.openTextGrid(join(textgridPath, fn))
            tier = tg.tierDict[tierName]

            matchEntryList = tier.find(code)
            durationList = [float(stop) - float(start)
                            for start, stop, label in matchEntryList]
            matchEntryList = [[fn, str(start), str(stop), label]
                              for start, stop, label in matchEntryList]

            entryList.extend(matchEntryList)
            summaryList.append((fn, str(sum(durationList))))

        entryList = [",".join(row) for row in entryList]
        with open(join(outputPath, outputName + ".csv"), "w") as fd:
            fd.write("\n".join(entryList))

        allCodeSummaryList.append(summaryList)

    outputList = ["Filename,Speech,Laughter,Pause"]
    for speech, laugh, pause in utils.safeZip(allCodeSummaryList,
                                              enforceLength=True):
        outputList.append(",".join([speech[0], speech[1],
                                    laugh[1], pause[1]]))

    with open(join(outputPath, "event_cumulative_lengths.csv"), "w") as fd:
        fd.write("\n".join(outputList) + "\n")
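
# Shape of the resulting event_cumulative_lengths.csv (values hypothetical):
#
#     Filename,Speech,Laughter,Pause
#     session1.TextGrid,182.4,12.7,9.3
#     session2.TextGrid,201.0,8.1,14.6
#
# Each duration column is the summed length, in seconds, of the intervals in
# that file whose labels matched the corresponding code.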

def uwePhoneCountForEpochs(epochPath, tgInfoPath, manualCountsPath,
                           outputPath):
    utils.makeDir(outputPath)

    for fn in utils.findFiles(tgInfoPath, filterExt=".txt"):
        print(fn)
        epochList = utils.openCSV(epochPath, fn)
        tgInfo = utils.openCSV(tgInfoPath, fn)
        manualCounts = utils.openCSV(manualCountsPath, fn)

        epochOutputList = []
        for epochNumber, epochStart, epochStop in epochList:
            epochStart, epochStop = float(epochStart), float(epochStop)

            # Find all of the intervals that are at least partially
            # contained within the current epoch
            epochSyllableCount = 0
            unadjustedEpochSyllableCount = 0
            epochArticulationRate = 0
            epochAverageSyllableDuration = 0
            for info, nucleusList in utils.safeZip([tgInfo, manualCounts],
                                                   enforceLength=True):
                start, stop, wordList = info
                start, stop = float(start), float(stop)

                syllableCount = len(nucleusList)
                unadjustedEpochSyllableCount += syllableCount

                # Accounts for intervals that straddle an epoch boundary
                multiplicationFactor = _percentInside(start, stop,
                                                      epochStart, epochStop)
                epochSyllableCount += syllableCount * multiplicationFactor

            # epochOutputList.append("%f,%f" % (unadjustedEpochSyllableCount,
            #                                   epochSyllableCount))
            epochOutputList.append("%f" % (epochSyllableCount))

        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(epochOutputList) + "\n")
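
# Worked example of the epoch proration above (hypothetical numbers): an
# interval from 10.0 s to 20.0 s containing 12 syllable nuclei overlaps an
# epoch spanning 15.0 s to 30.0 s for half of its duration, so
# _percentInside() returns 0.5 and the epoch is credited with
# 12 * 0.5 = 6.0 syllables.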