def extractPraatPitchForEpochs(pitchPath, epochPath, tgInfoPath, outputPath):
    '''
    Extracts pitch measures over the mother's speech for each epoch

    pitchPath - folder of praat pitch-track files (one .txt per recording)
    epochPath - folder of epoch CSVs: epochNum, start, stop
    tgInfoPath - folder of interval CSVs: start, stop, label
    outputPath - one output .txt per input file, one row per epoch
    '''
    utils.makeDir(outputPath)
    for fn in utils.findFiles(pitchPath, filterExt=".txt"):
        name = os.path.splitext(fn)[0]
        print(name)  # progress indicator; was a py2 print statement
        epochList = utils.openCSV(epochPath, fn)
        epochList = [(epochNum, float(start), float(stop))
                     for epochNum, start, stop in epochList]
        entryList = utils.openCSV(tgInfoPath, fn)
        entryList = [(float(start), float(stop), label)
                     for start, stop, label in entryList]

        dataList = praat_pi.loadPitchAndTime(pitchPath, fn)

        # Get F0 values for the intervals when the mother was speaking
        speechDataList = []
        for start, stop, label in entryList:
            speechDataList.extend(praat_pi.getAllValuesInTime(start, stop,
                                                              dataList))

        # Get F0 values for the times the mother is speaking for each epoch
        pitchData = []
        for epochNum, start, stop in epochList:
            # start/stop were converted to float above; no re-conversion needed
            epochValueList = praat_pi.getAllValuesInTime(start, stop,
                                                         speechDataList)
            f0List = [f0Val for time, f0Val, intVal in epochValueList]
            pitchData.append(praat_pi.extractPitchMeasuresForSegment(
                f0List, name, epochNum, medianFilterWindowSize=None,
                filterZeroFlag=True))

        # 'with' guarantees the file handle is closed (was a bare open().write())
        with open(join(outputPath, "%s.txt" % name), "w") as fd:
            fd.write("\n".join(pitchData) + "\n")
def aggregateSpeechRate(tgInfoPath, speechRatePath, outputPath, samplingRate):
    '''
    Converts per-subset nucleus sample times into file-absolute seconds

    tgInfoPath - folder of subset CSVs: start, stop, label (seconds, in
                 terms of the original, unsplit file)
    speechRatePath - matlab output; one file of sample numbers per subset
    outputPath - one output file per input; one comma-joined row per subset
    samplingRate - samples per second, used to convert sample nums to seconds
    '''
    utils.makeDir(outputPath)

    finishedList = utils.findFiles(outputPath, filterExt=".txt")
    for fn in utils.findFiles(tgInfoPath, filterExt=".txt",
                              skipIfNameInList=finishedList):

        # Load subset speech rate
        name = os.path.splitext(fn)[0]
        speechRateFNList = utils.findFiles(speechRatePath, filterExt=".txt",
                                           filterPattern=name)

        subSplitList = utils.openCSV(tgInfoPath, fn)

        # Convert the sample numbers to seconds
        # They are in terms of the beginning of the subset they are in but
        # need to be in terms of the start of the file the larger file the
        # subset originated from
        outputList = []
        for splitInfo, speechRateFN in utils.safeZip([subSplitList,
                                                      speechRateFNList],
                                                     enforceLength=True):
            start, stop, label = splitInfo
            speechRateList = utils.openCSV(speechRatePath, speechRateFN,
                                           valueIndex=0)
            speechRateList = [value for value in speechRateList
                              if value != '']
            speechRateList = [str(float(start) +
                                  float(sampleNum) / float(samplingRate))
                              for sampleNum in speechRateList]
            outputList.append(",".join(speechRateList))

        # 'with' guarantees the file handle is closed (was a bare open().write())
        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(outputList) + "\n")
def manualPhoneCount(tgInfoPath, isleFN, outputPath, skipList=None):
    '''
    Counts syllables and phones for each labeled interval via the ISLE dict

    tgInfoPath - folder of CSVs: start, stop, label
    isleFN - path to the ISLEdict pronunciation dictionary
    outputPath - one output file per input: "syllableCount,phoneCount" rows
    skipList - labels to skip (counted as 0,0); defaults to no skips
    '''
    if skipList is None:
        skipList = []

    utils.makeDir(outputPath)

    isleDict = isletool.LexicalTool(isleFN)

    # BUGFIX: was filterPaths=".txt" -- every other findFiles call in this
    # module uses filterExt, and with the wrong keyword the skip-list was
    # not being populated with already-finished files
    existFNList = utils.findFiles(outputPath, filterExt=".txt")
    for fn in utils.findFiles(tgInfoPath, filterExt=".txt",
                              skipIfNameInList=existFNList):

        # Belt-and-braces guard kept from the original in case the skip
        # list missed a finished file
        if os.path.exists(join(outputPath, fn)):
            continue
        print(fn)

        dataList = utils.openCSV(tgInfoPath, fn)
        dataList = [row[2] for row in dataList]  # start, stop, tmpLabel

        outputList = []
        for tmpLabel in dataList:
            if tmpLabel not in skipList:
                syllableCount, phoneCount = isletool.getNumPhones(isleDict,
                                                                  tmpLabel,
                                                                  maxFlag=True)
            else:
                syllableCount, phoneCount = 0, 0
            outputList.append("%d,%d" % (syllableCount, phoneCount))

        outputTxt = "\n".join(outputList)
        with open(join(outputPath, fn), "w") as fd:
            fd.write(outputTxt)
def toAbsoluteTime(namePrefix, matlabOutputPath, startTimeList):
    '''
    Converts the sampled times from relative to absolute time

    The input may be split across a number of files. This script assumes
    that files of the pattern <<namePrefix>><<nameSuffix>>.txt correspond
    to different parts of the same source file.

    namePrefix - name of the original wav file with no suffix
    matlabOutputPath - the path where the output of the matlab script is placed
    startTimeList - there needs to be one file here for each file in
                    matlabOutputPath with the pattern namePrefix

    Returns a list of lists where each sublist corresponds to the output of
    one file matching <<namePrefix>>
    '''
    # Every matlab output file belonging to this source recording
    matchedFNList = utils.findFiles(matlabOutputPath, filterExt=".txt",
                                    filterPattern=namePrefix)

    absoluteTimeLists = []
    for offset, matchedFN in utils.safeZip([startTimeList, matchedFNList],
                                           enforceLength=True):
        relativeTimes = utils.openCSV(matlabOutputPath, matchedFN,
                                      valueIndex=0)
        # Drop blank rows, then shift each sampled time by the file's offset
        shifted = [str(float(offset) + float(value))
                   for value in relativeTimes if value != '']
        absoluteTimeLists.append(shifted)

    return absoluteTimeLists
# NOTE(review): this is an exact duplicate of the toAbsoluteTime defined
# earlier in this file; this second definition shadows the first.  Consider
# deleting one of the two.
def toAbsoluteTime(namePrefix, matlabOutputPath, startTimeList):
    '''
    Converts the sampled times from relative to absolute time

    The input may be split across a number of files. This script assumes
    that files of the pattern <<namePrefix>><<nameSuffix>>.txt correspond
    to different parts of the same source file.

    namePrefix - name of the original wav file with no suffix
    matlabOutputPath - the path where the output of the matlab script is placed
    startTimeList - there needs to be one file here for each file in
                    matlabOutputPath with the pattern namePrefix

    Returns a list of lists where each sublist corresponds to the output of
    one file matching <<namePrefix>>
    '''
    # Gather the per-part matlab outputs for this recording
    partFNList = utils.findFiles(matlabOutputPath, filterExt=".txt",
                                 filterPattern=namePrefix)

    returnList = []
    for partStart, partFN in utils.safeZip([startTimeList, partFNList],
                                           enforceLength=True):
        sampleTimes = utils.openCSV(matlabOutputPath, partFN, valueIndex=0)
        # Ignore empty rows; add the part's start offset to each time
        converted = []
        for sampNum in sampleTimes:
            if sampNum == '':
                continue
            converted.append(str(float(partStart) + float(sampNum)))
        returnList.append(converted)

    return returnList
def removeFilledPauses(inputPath, outputPath):
    '''
    Keeps only the intervals labeled "MS" (mother speech), dropping the rest

    inputPath - folder of CSVs: start, stop, label
    outputPath - same file names; only the rows whose label is "MS"
    '''
    utils.makeDir(outputPath)
    for fn in utils.findFiles(inputPath, filterExt=".txt"):
        dataList = utils.openCSV(inputPath, fn)
        dataList = [[start, stop, label]
                    for start, stop, label in dataList
                    if label == "MS"]
        dataList = [",".join(row) for row in dataList]

        # 'with' guarantees the file handle is closed (was a bare open().write())
        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(dataList) + "\n")
def adjustEpochNumbers(inputPath, outputPath):
    '''
    Renumbers epochs from 0-based to 1-based, zero-padded to two digits

    inputPath - folder of CSVs: epochNum, start, stop
    outputPath - same rows with epochNum incremented by one ("%02d")
    '''
    utils.makeDir(outputPath)
    for fn in utils.findFiles(inputPath, filterExt=".txt"):
        dataList = utils.openCSV(inputPath, fn)
        # 'epochID' avoids shadowing the builtin id()
        dataList = ["%02d,%s,%s" % (int(epochID) + 1, start, stop)
                    for epochID, start, stop in dataList]

        # 'with' guarantees the file handle is closed (was a bare open().write())
        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(dataList) + "\n")
def manualPhoneCountForEpochs(manualCountsPath, tgInfoPath, epochPath,
                              outputPath):
    '''
    Aggregates per-interval syllable/phone counts into per-epoch totals

    manualCountsPath - CSVs of "syllableCount,phoneCount" rows, one row per
                       interval in the matching tgInfo file
    tgInfoPath - CSVs of intervals: start, stop, (label)
    epochPath - CSVs of epochs: epochNum, start, stop
    outputPath - one file per input; rows of
                 "epochSyllableCount,epochPhoneCount,speechDuration"
    '''
    utils.makeDir(outputPath)

    skipList = utils.findFiles(outputPath, filterExt=".txt")
    for fn in utils.findFiles(tgInfoPath, filterExt=".txt",
                              skipIfNameInList=skipList):
        epochList = utils.openCSV(epochPath, fn)
        tgInfo = utils.openCSV(tgInfoPath, fn)
        manualCounts = utils.openCSV(manualCountsPath, fn)

        epochOutputList = []
        for epochTuple in epochList:  # Epoch num, start, stop
            epochStart, epochStop = float(epochTuple[1]), float(epochTuple[2])

            # Find all of the intervals that are at least partially
            # contained within the current epoch
            epochSyllableCount = 0
            epochPhoneCount = 0
            speechDuration = 0
            for info, counts in utils.safeZip([tgInfo, manualCounts],
                                              enforceLength=True):
                start, stop = float(info[0]), float(info[1])
                syllableCount, phoneCount = float(counts[0]), float(counts[1])

                # Accounts for intervals that straddle an epoch boundary.
                # NOTE(review): the sibling uwePhoneCountForEpochs calls
                # _percentInside (leading underscore) -- confirm which
                # spelling actually exists in this module
                multiplicationFactor = percentInside(start, stop, epochStart,
                                                     epochStop)

                speechDuration += (stop - start) * multiplicationFactor
                epochSyllableCount += syllableCount * multiplicationFactor
                epochPhoneCount += phoneCount * multiplicationFactor

            epochOutputList.append("%f,%f,%f" % (epochSyllableCount,
                                                 epochPhoneCount,
                                                 speechDuration))

        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(epochOutputList))
def generateEpochRowHeader(epochPath, outputPath, sessionCode):
    '''
    Prepends identifying columns (id, session, duration) to each epoch row

    epochPath - folder of CSVs: epochNum, start, stop
    outputPath - rows of "id,sessionCode,epoch,start,end,duration"
    sessionCode - session identifier written into every row
    '''
    utils.makeDir(outputPath)
    for fn in utils.findFiles(epochPath, filterExt=".txt"):
        epochList = utils.openCSV(epochPath, fn)
        # Third underscore-delimited token of the file name; 'participantID'
        # avoids shadowing the builtin id() -- presumably this token is a
        # participant/subject id (TODO confirm against the file naming scheme)
        participantID = fn.split("_")[2]

        outputList = [",".join([participantID, sessionCode, epoch, epochStart,
                                epochEnd,
                                str(float(epochEnd) - float(epochStart))])
                      for epoch, epochStart, epochEnd in epochList]

        # 'with' guarantees the file handle is closed (was a bare open().write())
        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(outputList) + "\n")
def aggregateFeatures(featurePath, featureList, headerStr=None):
    '''
    Joins per-feature CSVs column-wise, then concatenates all results

    featurePath - root folder; each feature lives in a subfolder named after it
    featureList - feature subfolder names to aggregate
    headerStr - optional header line prepended to the combined "all.csv"

    Writes one <name>.csv per input file into <featurePath>/aggr, plus an
    all.csv that concatenates every per-file csv.
    '''
    outputDir = join(featurePath, "aggr")
    utils.makeDir(outputDir)

    # Find the files that exist in all features
    fnList = []
    for feature in featureList:
        fnSubList = utils.findFiles(join(featurePath, feature),
                                    filterExt=".txt")
        fnList.append(fnSubList)

    # Keep only the files present in every feature folder
    # (generator avoids building a throwaway list; was all([...]))
    actualFNList = [featureFN for featureFN in fnList[0]
                    if all(featureFN in subList for subList in fnList)]

    for featureFN in actualFNList:
        dataList = []
        for feature in featureList:
            featureDataList = utils.openCSV(join(featurePath, feature),
                                            featureFN, encoding="utf-8")
            dataList.append([",".join(row) for row in featureDataList])

        # First column of every row is the file's base name
        name = os.path.splitext(featureFN)[0]
        dataList.insert(0, [name for _ in range(len(dataList[0]))])

        # Transpose: one output row per input row, columns joined with commas
        tDataList = utils.safeZip(dataList, enforceLength=True)
        outputList = [",".join(row) for row in tDataList]
        outputTxt = "\n".join(outputList)

        outputFN = join(outputDir, name + ".csv")
        with io.open(outputFN, "w", encoding="utf-8") as fd:
            fd.write(outputTxt)

    # Cat all files together
    aggrOutput = []
    if headerStr is not None:
        aggrOutput.append(headerStr)

    for fn in utils.findFiles(outputDir, filterExt=".csv"):
        if fn == "all.csv":
            continue  # don't fold the combined file back into itself
        with io.open(join(outputDir, fn), "r", encoding='utf-8') as fd:
            aggrOutput.append(fd.read())

    with io.open(join(outputDir, "all.csv"), "w", encoding='utf-8') as fd:
        fd.write("\n".join(aggrOutput))
def uwePhoneCountForEpochs(epochPath, tgInfoPath, manualCountsPath,
                           outputPath):
    '''
    Computes a boundary-adjusted syllable count for each epoch

    epochPath - CSVs of epochs: epochNum, start, stop
    tgInfoPath - CSVs of intervals: start, stop, wordList
    manualCountsPath - one row of syllable nuclei per interval; the row
                       length is the interval's syllable count
    outputPath - one file per input; one "%f" syllable count per epoch
    '''
    utils.makeDir(outputPath)

    for fn in utils.findFiles(tgInfoPath, filterExt=".txt"):
        print(fn)  # progress indicator; was a py2 print statement
        epochList = utils.openCSV(epochPath, fn)
        tgInfo = utils.openCSV(tgInfoPath, fn)
        manualCounts = utils.openCSV(manualCountsPath, fn)

        epochOutputList = []
        for epochNumber, epochStart, epochStop in epochList:
            epochStart, epochStop = float(epochStart), float(epochStop)

            # Find all of the intervals that are at least partially
            # contained within the current epoch
            epochSyllableCount = 0
            for info, nucleusList in utils.safeZip([tgInfo, manualCounts],
                                                   enforceLength=True):
                start, stop = float(info[0]), float(info[1])
                syllableCount = len(nucleusList)

                # Accounts for intervals that straddle an epoch boundary
                multiplicationFactor = _percentInside(start, stop,
                                                      epochStart, epochStop)

                epochSyllableCount += syllableCount * multiplicationFactor

            epochOutputList.append("%f" % (epochSyllableCount))

        # 'with' guarantees the file handle is closed (was a bare open().write())
        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(epochOutputList) + "\n")
def addEpochsToTextgrids(tgPath, epochPath, outputPath):
    '''
    Adds an "epochs" interval tier to each textgrid from its epoch CSV

    tgPath - folder of .TextGrid files
    epochPath - folder of CSVs: label, start, end (matching file names)
    outputPath - the augmented textgrids are saved here
    '''
    utils.makeDir(outputPath)
    for name in utils.findFiles(tgPath, filterExt=".TextGrid", stripExt=True):
        print(name)  # progress indicator; was a py2 print statement
        tg = tgio.openTextGrid(join(tgPath, name + ".TextGrid"))

        entryList = utils.openCSV(epochPath, name + ".txt")
        # CSV column order is label,start,end; tier entries are (start, end, label)
        entryList = [(float(start), float(end), label)
                     for label, start, end in entryList]

        tier = tgio.IntervalTier("epochs", entryList, minT=0,
                                 maxT=tg.maxTimestamp)
        tg.addTier(tier)
        tg.save(join(outputPath, name + ".TextGrid"))
def findFrequenciesForWordLists(featurePath, countObj, frequencyNormFunc):
    '''
    Looks up corpus frequency measures for every word list under featurePath

    featurePath - root folder; reads from <featurePath>/words, writes
                  "count,frequency,logFrequency" rows to <featurePath>/frequency
    countObj - provides getFrequency(word, normFunc, outOfDictionaryValue)
    frequencyNormFunc - normalization function forwarded to countObj
    '''
    frequencyPath = join(featurePath, "frequency")
    utils.makeDir(frequencyPath)

    wordsPath = join(featurePath, "words")
    for fn in utils.findFiles(wordsPath):
        wordList = utils.openCSV(wordsPath, fn, valueIndex=0,
                                 encoding="utf-8")

        # Unknown words get a frequency count of 1 rather than failing
        rowList = []
        for word in wordList:
            count, freq, logFreq = countObj.getFrequency(
                word, frequencyNormFunc, outOfDictionaryValue=1)
            rowList.append("%f,%f,%f" % (count, freq, logFreq))

        with open(join(frequencyPath, fn), "w") as fd:
            fd.write("\n".join(rowList))
def medianFilter(f0Path, outputPath, windowSize):
    '''
    Median-filters the F0 column of praat pitch files

    f0Path - folder of CSVs: time, f0Val, intensityVal ("--undefined--"
             f0 values are treated as 0 before filtering)
    outputPath - same rows with the f0 column replaced by the filtered value
    windowSize - median filter window; must be odd

    Raises ValueError if windowSize is even.
    '''
    # Explicit raise instead of assert: asserts are stripped under -O
    if windowSize % 2 == 0:
        raise ValueError("windowSize must be odd, got %r" % windowSize)

    utils.makeDir(outputPath)
    for fn in utils.findFiles(f0Path, filterExt=".txt"):
        valueList = utils.openCSV(f0Path, fn)
        f0List = [float(row[1]) if row[1] != "--undefined--" else 0
                  for row in valueList]  # time, f0Val, intensityVal
        f0Filtered = filters.medianFilter(f0List, windowSize,
                                          useEdgePadding=True)

        outputList = ["%s,%0.3f,%s" % (row[0], f0Val, row[2])
                      for row, f0Val in zip(valueList, f0Filtered)]

        # 'with' guarantees the file handle is closed (was a bare open().write())
        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(outputList) + "\n")
def eventStructurePerEpoch(epochPath, fullyFilteredTGPath, childFilteredTGPath,
                           noiseFilteredTGPath, unfilteredTGPath, outputPath,
                           speechTierName, laughterTierName):
    '''
    How frequent and with what duration did laughter, pauses, and speech occur

    For each epoch, tallies durations/counts of filled pauses ("FP"),
    mother speech ("MS"), and laughter ("LA") across four textgrid variants
    (fully filtered, child-filtered, noise-filtered, unfiltered) and writes
    one CSV row per epoch.
    '''
    def _getCountsAndDurations(tier, searchLabel):
        # Total duration and count of intervals matching searchLabel
        entryList = tier.find(searchLabel)
        durationList = [float(stop) - float(start)
                        for start, stop, label in entryList]
        count = len(entryList)
        return sum(durationList), count

    def _cropTier(sourceTG, tierName, start, stop):
        # Crop a textgrid to the epoch and return the named tier
        subTG = sourceTG.crop(strictFlag=False, softFlag=False,
                              startTime=start, endTime=stop)
        return subTG.tierDict[tierName]

    utils.makeDir(outputPath)

    for name in utils.findFiles(epochPath, filterExt=".txt", stripExt=True):
        epochList = utils.openCSV(epochPath, name + ".txt")
        epochList = [(epochNum, float(start), float(stop))
                     for epochNum, start, stop in epochList]

        tg = tgio.openTextGrid(join(fullyFilteredTGPath,
                                    name + ".TextGrid"))
        childFilteredTG = tgio.openTextGrid(join(childFilteredTGPath,
                                                 name + ".TextGrid"))
        noiseFilteredTG = tgio.openTextGrid(join(noiseFilteredTGPath,
                                                 name + ".TextGrid"))
        origTG = tgio.openTextGrid(join(unfilteredTGPath,
                                        name + ".TextGrid"))

        outputList = []
        for epochNum, start, stop in epochList:
            # Fully filtered: pauses, speech, and laughter
            speechTier = _cropTier(tg, speechTierName, start, stop)
            laughterTier = _cropTier(tg, laughterTierName, start, stop)
            pauseDur, numPauses = _getCountsAndDurations(speechTier, "FP")
            speechDur, numSpeech = _getCountsAndDurations(speechTier, "MS")
            laughDur, numLaughter = _getCountsAndDurations(laughterTier, "LA")

            # Child-filtered and noise-filtered speech (counts unused)
            csFilteredTier = _cropTier(childFilteredTG, speechTierName,
                                       start, stop)
            csFiltSpeech, _ = _getCountsAndDurations(csFilteredTier, "MS")

            nsFilteredTier = _cropTier(noiseFilteredTG, speechTierName,
                                       start, stop)
            nsFiltSpeech, _ = _getCountsAndDurations(nsFilteredTier, "MS")

            # Unfiltered speech, for the amount removed by filtering
            origSpeechTier = _cropTier(origTG, speechTierName, start, stop)
            fullSpeechDur, fullNumSpeech = _getCountsAndDurations(
                origSpeechTier, "MS")

            epochTuple = (speechDur, numSpeech, csFiltSpeech, nsFiltSpeech,
                          fullSpeechDur, fullSpeechDur - speechDur,
                          pauseDur, numPauses, laughDur, numLaughter)
            outputList.append("%.02f, %d, %.02f, %.02f, %.02f, %.02f, "
                              "%.02f, %d, %.02f, %d" % epochTuple)

        # 'with' guarantees the file handle is closed (was a bare open().write())
        with open(join(outputPath, name + ".txt"), "w") as fd:
            fd.write("\n".join(outputList) + "\n")