def splitAudio(path):
    """Split every stereo wav in *path* into per-channel files under split_audio/."""
    outputPath = join(path, "split_audio")
    utils.makeDir(outputPath)

    wavFNList = utils.findFiles(path, filterExt=".wav")
    for wavFN in wavFNList:
        audioScripts.splitStereoAudio(path, wavFN, outputPath)
def extractSubwav(fn, outputFN, startT, endT, singleChannelFlag, soxPath=None):
    """Extract the audio between startT and endT (seconds) from fn into outputFN.

    If the source sampling rate differs from DEFAULT_FRAMERATE, the file is
    first resampled with sox into a sibling "resampledAudio" directory (the
    resampled copy is cached and reused on later calls).

    Args:
        fn: path to the source wav file
        outputFN: path the extracted wav is written to
        startT, endT: extraction span in seconds
        singleChannelFlag: if True, downmix multi-channel audio to mono
        soxPath: path to the sox binary; defaults to "sox" on the user's PATH

    Raises:
        IncompatibleSampleFrequencyError: if resampling with sox failed.
    """
    if soxPath is None:
        soxPath = "sox"  # Assumes it is in the user's path

    path, fnNoPath = os.path.split(fn)
    resampledAudioPath = join(path, "resampledAudio")
    resampledFN = join(resampledAudioPath, fnNoPath)

    audiofile = wave.open(fn, "r")
    try:
        (nchannels, sampwidth, framerate,
         _nframes, comptype, compname) = audiofile.getparams()

        # If you are not using the default Julius training model, you might
        # need to change the value here.  Results will fail if the sampling
        # frequency is different, so resample with sox when needed.
        if framerate != DEFAULT_FRAMERATE:
            if not os.path.exists(resampledFN):
                utils.makeDir(resampledAudioPath)
                sr = str(DEFAULT_FRAMERATE)
                # BUGFIX: quote the paths so filenames containing spaces
                # survive the shell invocation
                soxCmd = '%s "%s" -r %s "%s" rate -v 96k' % (soxPath, fn, sr,
                                                             resampledFN)
                os.system(soxCmd)

            if not os.path.exists(resampledFN):
                raise IncompatibleSampleFrequencyError(framerate,
                                                       DEFAULT_FRAMERATE)

            # BUGFIX: close the original handle before switching to the
            # resampled file (the original leaked it)
            audiofile.close()
            audiofile = wave.open(resampledFN, "r")
            (nchannels, sampwidth, framerate,
             _nframes, comptype, compname) = audiofile.getparams()

        # Extract the audio frames for the requested span
        audiofile.setpos(int(framerate * startT))
        audioFrames = audiofile.readframes(int(framerate * (endT - startT)))
    finally:
        audiofile.close()

    # Convert to single channel if needed
    if singleChannelFlag is True and nchannels > 1:
        audioFrames = audioop.tomono(audioFrames, sampwidth, 1, 0)
        nchannels = 1

    outParams = [nchannels, sampwidth, framerate, len(audioFrames),
                 comptype, compname]

    outWave = wave.open(outputFN, "w")
    try:
        outWave.setparams(outParams)
        outWave.writeframes(audioFrames)
    finally:
        # BUGFIX: the output file was never closed in the original
        outWave.close()
def renameMP3Files(path):
    """Move every mp3 whose name ends in 'x' into renamed/, dropping the 'x'."""
    outputPath = join(path, "renamed")
    utils.makeDir(outputPath)

    for name in utils.findFiles(path, filterExt=".mp3", stripExt=True):
        if name[-1] != "x":
            continue
        srcFN = join(path, name + ".mp3")
        dstFN = join(outputPath, name[:-1] + ".mp3")
        shutil.move(srcFN, dstFN)
def extractSubwav(fn, outputFN, startT, endT, singleChannelFlag, soxPath=None):
    """Extract the audio between startT and endT (seconds) from fn into outputFN.

    If the source sampling rate differs from DEFAULT_FRAMERATE, the file is
    first resampled with sox into a sibling "resampledAudio" directory (the
    resampled copy is cached and reused on later calls).

    Args:
        fn: path to the source wav file
        outputFN: path the extracted wav is written to
        startT, endT: extraction span in seconds
        singleChannelFlag: if True, downmix multi-channel audio to mono
        soxPath: path to the sox binary; defaults to "sox" on the user's PATH

    Raises:
        IncompatibleSampleFrequencyError: if resampling with sox failed.
    """
    if soxPath is None:
        soxPath = "sox"  # Assumes it is in the user's path

    path, fnNoPath = os.path.split(fn)
    resampledAudioPath = join(path, "resampledAudio")
    resampledFN = join(resampledAudioPath, fnNoPath)

    audiofile = wave.open(fn, "r")
    try:
        (nchannels, sampwidth, framerate,
         _nframes, comptype, compname) = audiofile.getparams()

        # If you are not using the default Julius training model, you might
        # need to change the value here.  Results will fail if the sampling
        # frequency is different, so resample with sox when needed.
        if framerate != DEFAULT_FRAMERATE:
            if not os.path.exists(resampledFN):
                utils.makeDir(resampledAudioPath)
                sr = str(DEFAULT_FRAMERATE)
                # BUGFIX: quote the paths so filenames containing spaces
                # survive the shell invocation
                soxCmd = '%s "%s" -r %s "%s" rate -v 96k' % (soxPath, fn, sr,
                                                             resampledFN)
                os.system(soxCmd)

            if not os.path.exists(resampledFN):
                raise IncompatibleSampleFrequencyError(framerate,
                                                       DEFAULT_FRAMERATE)

            # BUGFIX: close the original handle before switching to the
            # resampled file (the original leaked it)
            audiofile.close()
            audiofile = wave.open(resampledFN, "r")
            (nchannels, sampwidth, framerate,
             _nframes, comptype, compname) = audiofile.getparams()

        # Extract the audio frames for the requested span
        audiofile.setpos(int(framerate * startT))
        audioFrames = audiofile.readframes(int(framerate * (endT - startT)))
    finally:
        audiofile.close()

    # Convert to single channel if needed
    if singleChannelFlag is True and nchannels > 1:
        audioFrames = audioop.tomono(audioFrames, sampwidth, 1, 0)
        nchannels = 1

    outParams = [nchannels, sampwidth, framerate, len(audioFrames),
                 comptype, compname]

    outWave = wave.open(outputFN, "w")
    try:
        outWave.setparams(outParams)
        outWave.writeframes(audioFrames)
    finally:
        # BUGFIX: the output file was never closed in the original
        outWave.close()
def convertCorpusToUTF8(path):
    """Re-encode every .txt file in *path* from cp932 (Japanese) to utf-8.

    Output files keep their names and are written under <path>/output.
    """
    outputDir = join(path, "output")
    utils.makeDir(outputDir)

    for fn in utils.findFiles(path, filterExt=".txt"):
        # BUGFIX: the "U" (universal newline) mode flag was removed in
        # Python 3.11; plain "r" already performs newline translation
        # for text-mode files.
        with io.open(join(path, fn), "r", encoding="cp932") as fd:
            text = fd.read()
        with io.open(join(outputDir, fn), "w", encoding='utf-8') as fd:
            fd.write(text)
def forceAlignCorpus(wavPath, txtPath, outputPath, juliusScriptPath=None,
                     soxPath=None, perlPath=None):
    '''Force aligns every file and prints out summary statistics.

    Iterates over every .txt transcript in txtPath, aligning it against the
    identically-named .wav in wavPath via forceAlignFile(), then prints
    per-file and corpus-wide success percentages.
    '''
    totalNumPhonesFailed = 0
    totalNumPhones = 0
    totalNumIntervalsFailed = 0
    totalNumIntervals = 0
    utils.makeDir(outputPath)
    for name in utils.findFiles(txtPath, filterExt=".txt", stripExt=True):
        wavNameDict = {name: "%s.wav" % name}
        output = forceAlignFile([name, ], wavPath, wavNameDict, txtPath,
                                name + ".txt", outputPath, name,
                                juliusScriptPath, soxPath, perlPath)
        (numPhonesFailedAlignment, numPhones,
         numFailedIntervals, numIntervals) = output

        # utils.divide() guards against a zero denominator (falls back to 0)
        percentFailed = utils.divide(numPhonesFailedAlignment,
                                     numPhones, 0) * 100
        percentFailedIntervals = utils.divide(numFailedIntervals,
                                              numIntervals, 0) * 100
        # BUGFIX: these values are already percentages (0-100), so the
        # success rate is (100 - percent); the original computed
        # 100 * (1 - percent), which produced values like -4900%.
        print("%d intervals of %d total intervals (%0.2f%%) and %d phones "
              "of %d total phones (%0.2f%%) successfully aligned for %s"
              % (numIntervals - numFailedIntervals, numIntervals,
                 100 - percentFailedIntervals,
                 numPhones - numPhonesFailedAlignment, numPhones,
                 100 - percentFailed, name))

        totalNumPhonesFailed += numPhonesFailedAlignment
        totalNumPhones += numPhones
        totalNumIntervalsFailed += numFailedIntervals
        totalNumIntervals += numIntervals

    totalPercentFailed = utils.divide(totalNumPhonesFailed,
                                      totalNumPhones, 0) * 100
    totalPercentFailedIntervals = utils.divide(totalNumIntervalsFailed,
                                               totalNumIntervals, 0) * 100
    print("====Summary====")
    print("%d intervals of %d total intervals (%0.2f%%) and %d phones of "
          "%d total phones (%0.2f%%) successfully aligned"
          % (totalNumIntervals - totalNumIntervalsFailed, totalNumIntervals,
             100 - totalPercentFailedIntervals,
             totalNumPhones - totalNumPhonesFailed, totalNumPhones,
             100 - totalPercentFailed))
def getSelectedTxtFiles(txtPath, wavPath):
    """Copy each transcript whose base name matches a wav file into selected_txt/.

    Wav names are truncated at the first underscore before matching (e.g.
    "foo_L.wav" selects "foo.txt").
    """
    outputPath = join(txtPath, "selected_txt")
    utils.makeDir(outputPath)

    wavNameList = utils.findFiles(wavPath, filterExt=".wav", stripExt=True)
    selectedNames = list(set(wavName.split("_")[0] for wavName in wavNameList))

    for txtName in utils.findFiles(txtPath, filterExt=".txt", stripExt=True):
        if txtName in selectedNames:
            shutil.copy(join(txtPath, txtName + ".txt"),
                        join(outputPath, txtName + ".txt"))
def textgridToCSV(inputPath, outputPath, outputExt='.csv'):
    """Dump the "utterances" tier of each TextGrid as start,stop,label rows."""
    utils.makeDir(outputPath)
    for fn in utils.findFiles(inputPath, filterExt=".TextGrid"):
        tg = tgio.openTextgrid(join(inputPath, fn))
        tier = tg.tierDict["utterances"]

        rowList = ["%s,%s,%s" % (start, stop, label)
                   for start, stop, label in tier.entryList]

        name = os.path.splitext(fn)[0]
        outputFN = join(outputPath, "%s%s" % (name, outputExt))
        with io.open(outputFN, "w", encoding="utf-8") as fd:
            fd.write("\n".join(rowList))
def forceAlignCorpus(wavPath, txtPath, outputPath, juliusScriptPath=None,
                     soxPath=None, perlPath=None):
    '''Force aligns every file and prints out summary statistics.

    Iterates over every .txt transcript in txtPath, aligning it against the
    identically-named .wav in wavPath via forceAlignFile(), then prints
    per-file and corpus-wide success percentages.
    '''
    totalNumPhonesFailed = 0
    totalNumPhones = 0
    totalNumIntervalsFailed = 0
    totalNumIntervals = 0
    utils.makeDir(outputPath)
    for name in utils.findFiles(txtPath, filterExt=".txt", stripExt=True):
        wavNameDict = {name: "%s.wav" % name}
        output = forceAlignFile([name, ], wavPath, wavNameDict, txtPath,
                                name + ".txt", outputPath, name,
                                juliusScriptPath, soxPath, perlPath)
        (numPhonesFailedAlignment, numPhones,
         numFailedIntervals, numIntervals) = output

        # utils.divide() guards against a zero denominator (falls back to 0)
        percentFailed = utils.divide(numPhonesFailedAlignment,
                                     numPhones, 0) * 100
        percentFailedIntervals = utils.divide(numFailedIntervals,
                                              numIntervals, 0) * 100
        # BUGFIX: these values are already percentages (0-100), so the
        # success rate is (100 - percent); the original computed
        # 100 * (1 - percent), which produced values like -4900%.
        print("%d intervals of %d total intervals (%0.2f%%) and %d phones "
              "of %d total phones (%0.2f%%) successfully aligned for %s"
              % (numIntervals - numFailedIntervals, numIntervals,
                 100 - percentFailedIntervals,
                 numPhones - numPhonesFailedAlignment, numPhones,
                 100 - percentFailed, name))

        totalNumPhonesFailed += numPhonesFailedAlignment
        totalNumPhones += numPhones
        totalNumIntervalsFailed += numFailedIntervals
        totalNumIntervals += numIntervals

    totalPercentFailed = utils.divide(totalNumPhonesFailed,
                                      totalNumPhones, 0) * 100
    totalPercentFailedIntervals = utils.divide(totalNumIntervalsFailed,
                                               totalNumIntervals, 0) * 100
    print("====Summary====")
    print("%d intervals of %d total intervals (%0.2f%%) and %d phones of "
          "%d total phones (%0.2f%%) successfully aligned"
          % (totalNumIntervals - totalNumIntervalsFailed, totalNumIntervals,
             100 - totalPercentFailedIntervals,
             totalNumPhones - totalNumPhonesFailed, totalNumPhones,
             100 - totalPercentFailed))
def convertCRESTToKanaAndRomaji(inputPath, outputPath, cabochaEncoding,
                                cabochaPath, encoding="cp932"):
    """Convert CREST-style transcripts to kana and romaji via cabocha.

    Input lines are expected as "speakerCode startTime stopTime text"
    (space-separated, text may contain further spaces).  For each input file
    a same-named file is written under
    <outputPath>/speaker_info_and_utterance_timing with one
    semicolon-separated record per utterance:
    "speakerCode,startTime,stopTime;origLine;words;kana;romaji"
    (the word/kana/romaji fields are comma-joined lists).

    Files already present in the output directory are skipped, so the
    conversion can be resumed.  Counts of unnamed entities and unidentified
    utterances are printed per file and in total.
    """
    timeInfoPath = join(outputPath, "speaker_info_and_utterance_timing")
    for path in [timeInfoPath]:
        utils.makeDir(path)

    numUnnamedEntities = 0
    numUnidentifiedUtterances = 0

    # Skip files that were already converted on a previous run
    finishedList = utils.findFiles(timeInfoPath, filterExt=".txt")
    for fn in utils.findFiles(inputPath, filterExt=".txt",
                              skipIfNameInList=finishedList):
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            text = fd.read()
        textList = text.split("\n")

        numUnnamedEntitiesForFN = 0
        numUnidentifiedUtterancesForFN = 0

        speakerInfoList = []
        for line in textList:
            line = line.strip()

            # Lines that don't carry speaker/timing metadata are skipped
            try:
                speakerCode, startTime, stopTime, line = line.split(" ", 3)
            except ValueError:
                continue

            origLine = line

            # Clean up the line before it gets processed
            # Not sure what "・" is but cabocha doesn't like it
            for char in [u"(", u")", u" ", u".", u"?", u"「", u"」",
                         u"[", u"]", u"@W", u"@S", u"<", u">", u" ", u"。"]:
                line = line.replace(char, "")

            # Used to split names?
            for char in [u"・", u"·"]:
                line = line.replace(char, " ")

            line = line.strip()

            # Chunk the utterance into word/kana/romaji lists; on any known
            # failure, record an empty entry so the timing row is preserved
            try:
                tmp = jProcessingSnippet.getChunkedKana(line, cabochaEncoding,
                                                        cabochaPath)
                tmpWordList, tmpKanaList, tmpromajiList = tmp
            except (jProcessingSnippet.ChunkingError,
                    jProcessingSnippet.NonKatakanaError) as e:
                print(u"%s, %s" % (str(e), origLine))
                tmpWordList = [""]
                tmpKanaList = [""]
                tmpromajiList = [""]
                numUnidentifiedUtterancesForFN += 1
            except jProcessingSnippet.UnidentifiedJapaneseText as e:
                # A word of all "X"s marks an anonymized (unnamed) entity
                if all([char == u"X" for char in e.word]):
                    numUnnamedEntitiesForFN += 1
                else:
                    print(u"%s" % str(e))
                    numUnidentifiedUtterancesForFN += 1
                tmpWordList = [""]
                tmpKanaList = [""]
                tmpromajiList = [""]
            except jProcessingSnippet.EmptyStrError as e:
                tmpWordList = [""]
                tmpKanaList = [""]
                tmpromajiList = [""]
            except Exception:
                # Unexpected failure: show the offending line, then re-raise
                print(line)
                raise
            line = line.replace(u",", u"")

            # Commas delimit fields within a record; semicolons delimit records
            outputList = [u"%s,%s,%s" % (speakerCode, startTime, stopTime),
                          origLine,
                          ','.join(tmpWordList),
                          ",".join(tmpKanaList),
                          ",".join(tmpromajiList)]
            outputStr = ";".join(outputList)
            speakerInfoList.append(outputStr)

        print(fn)
        print("Number of unnamed entities for fn: %d"
              % numUnnamedEntitiesForFN)
        print("Number of unidentified utterances for fn: %d"
              % numUnidentifiedUtterancesForFN)

        numUnnamedEntities += numUnnamedEntitiesForFN
        numUnidentifiedUtterances += numUnidentifiedUtterancesForFN

        outputFN = join(timeInfoPath, fn)
        with io.open(outputFN, "w", encoding="utf-8") as fd:
            fd.write("\n".join(speakerInfoList))

    print("\n")
    print("Number of unnamed entities: %d" % numUnnamedEntities)
    print("Number of unidentified utterances: %d" % numUnidentifiedUtterances)
def forceAlignFile(wavPath, wavName, txtPath, txtFN, outputPath,
                   juliusScriptPath, soxPath):
    '''Force-align one stereo recording (<wavName>_L.wav / <wavName>_R.wav).

    Reads the cabocha-formatted transcript txtFN (records separated by ";",
    fields by ","; first field is "speaker,start,end"), aligns each speaker
    channel with Julius, and saves a TextGrid with utterance/word/phone
    tiers per speaker.

    Returns:
        (numPhonesFailedAlignment, numPhones, numFailedIntervals,
         numIntervals)
    '''
    utils.makeDir(outputPath)

    wavFNDict = {"L": wavName + "_L.wav",
                 "R": wavName + "_R.wav"}

    # Formatted output of cabocha
    # BUGFIX: "rU" mode was removed in Python 3.11, and the file handle was
    # never closed; use a context manager with plain "r".
    with open(join(txtPath, txtFN), "r") as fd:
        data = fd.read()
    dataList = data.split("\n")
    dataList = [[subRow.split(",") for subRow in row.split(";")]
                for row in dataList if row != ""]

    dataDict = {"L": [], "R": []}
    for timingInfo, line, wordList, kanaList, romajiList in dataList:
        # Undoing the unnecessary split that just happened
        line = ",".join(line)
        speaker, startTimeStr, endTimeStr = timingInfo
        speaker, startTime, endTime = (speaker.strip(), float(startTimeStr),
                                       float(endTimeStr))
        dataDict[speaker].append([startTime, endTime, line, wordList,
                                  kanaList, romajiList])

    speakerEntryDict = {}
    numPhonesFailedAlignment = 0
    numPhones = 0
    numFailedIntervals = 0
    numIntervals = 0
    for speaker in ["L", "R"]:
        # BUGFIX: juliusAlignCabocha() takes perlPath before the three
        # boolean flags; the original omitted it, shifting the flags into
        # the wrong parameters and dropping one argument entirely
        # (TypeError).  Pass None so perl is resolved from the PATH.
        tmp = juliusAlignment.juliusAlignCabocha(dataDict[speaker], wavPath,
                                                 wavFNDict[speaker],
                                                 juliusScriptPath, soxPath,
                                                 None, False, True, True)
        speakerEntryDict[speaker], statList = tmp

        numPhonesFailedAlignment += statList[0]
        numPhones += statList[1]
        numFailedIntervals += statList[2]
        numIntervals += statList[3]

    # Create tiers and textgrids; both channels share one duration
    tg = tgio.Textgrid()
    maxDuration = audioScripts.getSoundFileDuration(
        join(wavPath, wavName + "_L.wav"))
    for speaker in ["L", "R"]:
        for aspect in [juliusAlignment.UTTERANCE, juliusAlignment.WORD,
                       juliusAlignment.PHONE]:
            tierName = "%s_%s" % (aspect, speaker)
            tier = tgio.IntervalTier(tierName,
                                     speakerEntryDict[speaker][aspect],
                                     minT=0, maxT=maxDuration)
            tg.addTier(tier)
    tg.save(join(outputPath, wavName + ".TextGrid"))

    return (numPhonesFailedAlignment, numPhones, numFailedIntervals,
            numIntervals)
def juliusAlignCabocha(dataList, wavPath, wavFN, juliusScriptPath, soxPath,
                       perlPath, silenceFlag, forceEndTimeFlag,
                       forceMonophoneAlignFlag):
    '''
    Given utterance-level timing and a wav file, phone-align the audio

    dataList is the formatted output of cabocha of the form
    [startTime, endTime, line, wordList, kanaList, romajiList]

    Returns (entryDict, statList) where entryDict maps UTTERANCE/WORD/PHONE
    to lists of (start, end, label) entries and statList is
    [numPhonesFailedToAlign, numTotalPhones, numFailedIntervals,
     numIntervals].
    '''
    tmpOutputPath = join(wavPath, "align_tmp")
    utils.makeDir(tmpOutputPath)  # (the original called makeDir twice)

    logFn = join(tmpOutputPath,
                 'align_log_' + str(datetime.datetime.now()) + '.txt')
    loggerFd = open(logFn, "w")

    tmpTxtFN = join(tmpOutputPath, "tmp.txt")
    tmpWavFN = join(tmpOutputPath, "tmp.wav")
    tmpOutputFN = join(tmpOutputPath, "tmp.lab")

    entryDict = {}
    for aspect in [UTTERANCE, WORD, PHONE]:
        entryDict[aspect] = []

    # one speech interval at a time
    numTotalPhones = 0
    numPhonesFailedToAlign = 0
    numIntervals = 0
    numFailedIntervals = 0

    try:
        # intervalStart, intervalEnd, line, wordList, kanaList, romajiList
        for rowTuple in dataList:
            intervalStart = rowTuple[0]
            intervalEnd = rowTuple[1]
            line = rowTuple[2]
            wordList = rowTuple[3]
            romajiList = rowTuple[5]

            if line.strip() != "":
                entryDict[UTTERANCE].append((str(intervalStart),
                                             str(intervalEnd), line))

            # Nothing to phone-align if there are no words
            if len([word for word in wordList if word != '']) == 0:
                continue

            assert intervalStart < intervalEnd

            # Phones broken up by word (tmpRomajiList) plus a flat list for
            # silence-model detection below
            tmpRomajiList = []
            tmpFlattenedRomajiList = []
            for row in romajiList:
                rowList = row.split(" ")
                tmpRomajiList.append(rowList)
                tmpFlattenedRomajiList.extend(rowList)

            numWords = len(wordList)
            romajiTxt = " ".join(romajiList)

            # No forced-alignment if there is no romaji
            if romajiTxt.strip() == "":
                continue

            # Encapsulate each phone string in boundary silence
            # - in my limited experience, this messes up the output even more
            if silenceFlag:
                romajiTxt = "silB " + romajiTxt + " silE"

            # Save temporary transcript and wav files for interval
            with open(tmpTxtFN, "w") as fd:
                fd.write(romajiTxt)
            audioScripts.extractSubwav(join(wavPath, wavFN), tmpWavFN,
                                       intervalStart, intervalEnd,
                                       singleChannelFlag=False,
                                       soxPath=soxPath)

            # Run forced alignment
            runJuliusAlignment(tmpOutputPath, juliusScriptPath, perlPath,
                               loggerFd)

            # Get the output (timestamps for each phone)
            numIntervals += 1
            try:
                matchList = parseJuliusOutput(tmpOutputFN)
            except JuliusAlignmentError:
                if forceMonophoneAlignFlag is True and numWords == 1:
                    # One phone occupies the whole interval
                    # NOTE(review): the *100 suggests these times are in
                    # centiseconds like parseJuliusOutput's -- confirm
                    matchList = [(0.0, (intervalEnd - intervalStart) * 100)]
                else:
                    numPhonesFailedToAlign += numWords
                    numFailedIntervals += 1
                    print("Failed to align: %s - %f - %f"
                          % ("".join(romajiList), intervalStart, intervalEnd))
                    continue

            adjustedPhonList = [[intervalStart + start,
                                 intervalStart + stop,
                                 label]
                                for start, stop, label in matchList]

            # Julius is conservative in estimating the final vowel.  Stretch
            # it to be the length of the utterance
            if forceEndTimeFlag:
                adjustedPhonList[-1][1] = intervalEnd

            entryDict[PHONE].extend(adjustedPhonList)

            # Get the bounding indicies for the phones in each word
            phoneToWordIndexList = []
            phonesSoFar = 0
            for i in range(len(wordList)):
                numPhones = len(tmpRomajiList[i])
                phoneToWordIndexList.append((phonesSoFar,
                                             phonesSoFar + numPhones - 1))
                phonesSoFar += numPhones

            # If julius uses a silence model and we don't, then adjust our
            # timings
            phoneListFromJulius = [label for _, _, label in adjustedPhonList]
            if ("silB" in phoneListFromJulius
                    and "silB" not in tmpFlattenedRomajiList):
                phoneToWordIndexList = [(startI + 1, endI + 1)
                                        for startI, endI
                                        in phoneToWordIndexList]
                lastI = phoneToWordIndexList[-1][1]
                phoneToWordIndexList = ([(0, 0)] + phoneToWordIndexList
                                        + [(lastI + 1, lastI + 1)])
                wordList = [""] + wordList + [""]

            # Store the words
            for i in range(len(wordList)):
                startI, stopI = phoneToWordIndexList[i]
                entryDict[WORD].append((adjustedPhonList[startI][0],
                                        adjustedPhonList[stopI][1],
                                        wordList[i]))

            numTotalPhones += numWords
    finally:
        # BUGFIX: the log file descriptor was never closed in the original
        loggerFd.close()

    statList = [numPhonesFailedToAlign, numTotalPhones, numFailedIntervals,
                numIntervals]

    return entryDict, statList
def forceAlignFile(speakerList, wavPath, wavNameDict, txtPath, txtFN,
                   outputPath, outputWavName, juliusScriptPath, soxPath,
                   perlPath):
    '''
    Force-align one transcript against one or more (synced) wav files.

    Normally: speakerList = [name] and wavNameDict = {name:"name.wav"}

    But, if you have multiple speakers for each file (assuming audio is
    synced) e.g. in a stereo audio situation:
    speakerList=["L","R"] and
    wavNameDict={"L":"%s_%s.wav" % (name, "L"),
                 "R":"%s_%s.wav" % (name, "R")}

    Writes <outputWavName>.TextGrid into outputPath with utterance/word/
    phone tiers per speaker, and returns
    (numPhonesFailedAlignment, numPhones, numFailedIntervals, numIntervals).
    '''
    utils.makeDir(outputPath)

    # Formatted output of cabocha: records separated by ";", fields by ","
    with io.open(join(txtPath, txtFN), "r", encoding="utf-8") as fd:
        data = fd.read()
    dataList = data.split("\n")
    dataList = [[subRow.split(",") for subRow in row.split(";")]
                for row in dataList if row != ""]

    dataDict = {speaker: [] for speaker in speakerList}
    # Undoing the unnecessary split that just happened (the utterance text
    # itself may legitimately contain commas)
    for timingInfo, line, wordList, kanaList, romajiList in dataList:
        line = ",".join(line)
        speaker, startTimeStr, endTimeStr = timingInfo
        speaker, startTime, endTime = (speaker.strip(), float(startTimeStr),
                                       float(endTimeStr))
        dataDict[speaker].append([startTime, endTime, line, wordList,
                                  kanaList, romajiList])

    # Do the forced alignment, accumulating stats across speakers
    speakerEntryDict = {}
    numPhonesFailedAlignment = 0
    numPhones = 0
    numFailedIntervals = 0
    numIntervals = 0
    for speaker in speakerList:
        output = juliusAlignment.juliusAlignCabocha(dataDict[speaker],
                                                    wavPath,
                                                    wavNameDict[speaker],
                                                    juliusScriptPath,
                                                    soxPath, perlPath,
                                                    False, True, True)
        speakerEntryDict[speaker], statList = output

        numPhonesFailedAlignment += statList[0]
        numPhones += statList[1]
        numFailedIntervals += statList[2]
        numIntervals += statList[3]

    # All durations should be the same, so any one wav determines maxT
    inputWavFN = next(iter(wavNameDict.values()))
    maxDuration = audioScripts.getSoundFileDuration(join(wavPath, inputWavFN))

    # Create tiers and textgrids from the output of the alignment
    tg = tgio.Textgrid()
    for speaker in speakerList:
        for aspect in [juliusAlignment.UTTERANCE, juliusAlignment.WORD,
                       juliusAlignment.PHONE]:
            tierName = "%s_%s" % (aspect, speaker)
            tier = tgio.IntervalTier(tierName,
                                     speakerEntryDict[speaker][aspect],
                                     minT=0, maxT=maxDuration)
            tg.addTier(tier)
    tg.save(join(outputPath, outputWavName + ".TextGrid"))

    return (numPhonesFailedAlignment, numPhones, numFailedIntervals,
            numIntervals)
def convertCRESTToKanaAndRomaji(inputPath, outputPath, cabochaEncoding,
                                cabochaPath, encoding="cp932"):
    """Convert CREST-style transcripts to kana and romaji via cabocha.

    Input lines are expected as "speakerCode startTime stopTime text"
    (space-separated, text may contain further spaces).  For each input file
    a same-named file is written under
    <outputPath>/speaker_info_and_utterance_timing with one
    semicolon-separated record per utterance:
    "speakerCode,startTime,stopTime;origLine;words;kana;romaji"
    (the word/kana/romaji fields are comma-joined lists).

    Files already present in the output directory are skipped, so the
    conversion can be resumed.  Counts of unnamed entities and unidentified
    utterances are printed per file and in total.
    """
    timeInfoPath = join(outputPath, "speaker_info_and_utterance_timing")
    for path in [timeInfoPath]:
        utils.makeDir(path)

    numUnnamedEntities = 0
    numUnidentifiedUtterances = 0

    # Skip files that were already converted on a previous run
    finishedList = utils.findFiles(timeInfoPath, filterExt=".txt")
    for fn in utils.findFiles(inputPath, filterExt=".txt",
                              skipIfNameInList=finishedList):
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            text = fd.read()
        textList = text.split("\n")

        numUnnamedEntitiesForFN = 0
        numUnidentifiedUtterancesForFN = 0

        speakerInfoList = []
        for line in textList:
            line = line.strip()

            # Lines that don't carry speaker/timing metadata are skipped
            try:
                speakerCode, startTime, stopTime, line = line.split(" ", 3)
            except ValueError:
                continue

            origLine = line

            # Clean up the line before it gets processed
            # Not sure what "・" is but cabocha doesn't like it
            for char in [u"(", u")", u" ", u".", u"?", u"「", u"」",
                         u"[", u"]", u"@W", u"@S", u"<", u">", u" ", u"。"]:
                line = line.replace(char, "")

            # Used to split names?
            for char in [u"・", u"·"]:
                line = line.replace(char, " ")

            line = line.strip()

            # Chunk the utterance into word/kana/romaji lists; on any known
            # failure, record an empty entry so the timing row is preserved
            try:
                tmp = jProcessingSnippet.getChunkedKana(
                    line, cabochaEncoding, cabochaPath)
                tmpWordList, tmpKanaList, tmpromajiList = tmp
            except (jProcessingSnippet.ChunkingError,
                    jProcessingSnippet.NonKatakanaError) as e:
                print(u"%s, %s" % (str(e), origLine))
                tmpWordList = [""]
                tmpKanaList = [""]
                tmpromajiList = [""]
                numUnidentifiedUtterancesForFN += 1
            except jProcessingSnippet.UnidentifiedJapaneseText as e:
                # A word of all "X"s marks an anonymized (unnamed) entity
                if all([char == u"X" for char in e.word]):
                    numUnnamedEntitiesForFN += 1
                else:
                    print(u"%s" % str(e))
                    numUnidentifiedUtterancesForFN += 1
                tmpWordList = [""]
                tmpKanaList = [""]
                tmpromajiList = [""]
            except jProcessingSnippet.EmptyStrError as e:
                tmpWordList = [""]
                tmpKanaList = [""]
                tmpromajiList = [""]
            except Exception:
                # Unexpected failure: show the offending line, then re-raise
                print(line)
                raise
            line = line.replace(u",", u"")

            # Commas delimit fields within a record; semicolons delimit records
            outputList = [u"%s,%s,%s" % (speakerCode, startTime, stopTime),
                          origLine,
                          ','.join(tmpWordList),
                          ",".join(tmpKanaList),
                          ",".join(tmpromajiList)]
            outputStr = ";".join(outputList)
            speakerInfoList.append(outputStr)

        print(fn)
        print("Number of unnamed entities for fn: %d"
              % numUnnamedEntitiesForFN)
        print("Number of unidentified utterances for fn: %d"
              % numUnidentifiedUtterancesForFN)

        numUnnamedEntities += numUnnamedEntitiesForFN
        numUnidentifiedUtterances += numUnidentifiedUtterancesForFN

        outputFN = join(timeInfoPath, fn)
        with io.open(outputFN, "w", encoding="utf-8") as fd:
            fd.write("\n".join(speakerInfoList))

    print("\n")
    print("Number of unnamed entities: %d" % numUnnamedEntities)
    print("Number of unidentified utterances: %d" % numUnidentifiedUtterances)
def forceAlignFile(wavPath, wavName, txtPath, txtFN, outputPath,
                   juliusScriptPath, soxPath):
    '''Force-align one stereo recording (<wavName>_L.wav / <wavName>_R.wav).

    Reads the cabocha-formatted transcript txtFN (records separated by ";",
    fields by ","; first field is "speaker,start,end"), aligns each speaker
    channel with Julius, and saves a TextGrid with utterance/word/phone
    tiers per speaker.

    Returns:
        (numPhonesFailedAlignment, numPhones, numFailedIntervals,
         numIntervals)
    '''
    utils.makeDir(outputPath)

    wavFNDict = {"L": wavName + "_L.wav",
                 "R": wavName + "_R.wav"}

    # Formatted output of cabocha
    # BUGFIX: "rU" mode was removed in Python 3.11, and the file handle was
    # never closed; use a context manager with plain "r".
    with open(join(txtPath, txtFN), "r") as fd:
        data = fd.read()
    dataList = data.split("\n")
    dataList = [[subRow.split(",") for subRow in row.split(";")]
                for row in dataList if row != ""]

    dataDict = {"L": [], "R": []}
    for timingInfo, line, wordList, kanaList, romajiList in dataList:
        # Undoing the unnecessary split that just happened
        line = ",".join(line)
        speaker, startTimeStr, endTimeStr = timingInfo
        speaker, startTime, endTime = (speaker.strip(), float(startTimeStr),
                                       float(endTimeStr))
        dataDict[speaker].append([startTime, endTime, line, wordList,
                                  kanaList, romajiList])

    speakerEntryDict = {}
    numPhonesFailedAlignment = 0
    numPhones = 0
    numFailedIntervals = 0
    numIntervals = 0
    for speaker in ["L", "R"]:
        # BUGFIX: juliusAlignCabocha() takes perlPath before the three
        # boolean flags; the original omitted it, shifting the flags into
        # the wrong parameters and dropping one argument entirely
        # (TypeError).  Pass None so perl is resolved from the PATH.
        tmp = juliusAlignment.juliusAlignCabocha(dataDict[speaker], wavPath,
                                                 wavFNDict[speaker],
                                                 juliusScriptPath, soxPath,
                                                 None, False, True, True)
        speakerEntryDict[speaker], statList = tmp

        numPhonesFailedAlignment += statList[0]
        numPhones += statList[1]
        numFailedIntervals += statList[2]
        numIntervals += statList[3]

    # Create tiers and textgrids; both channels share one duration
    tg = tgio.Textgrid()
    maxDuration = audioScripts.getSoundFileDuration(
        join(wavPath, wavName + "_L.wav"))
    for speaker in ["L", "R"]:
        for aspect in [juliusAlignment.UTTERANCE, juliusAlignment.WORD,
                       juliusAlignment.PHONE]:
            tierName = "%s_%s" % (aspect, speaker)
            tier = tgio.IntervalTier(tierName,
                                     speakerEntryDict[speaker][aspect],
                                     minT=0, maxT=maxDuration)
            tg.addTier(tier)
    tg.save(join(outputPath, wavName + ".TextGrid"))

    return (numPhonesFailedAlignment, numPhones, numFailedIntervals,
            numIntervals)
def convertCorpusToKanaAndRomaji(inputPath, outputPath, cabochaEncoding,
                                 cabochaPath=None, encoding="cp932"):
    '''
    Reduces a corpus of typical Japanese text to both kana and romaji

    Each line of input should be of the form:
    startTime, stopTime, Japanese text
    '''
    utils.makeDir(outputPath)

    numUnnamedEntities = 0
    numUnidentifiedUtterances = 0
    numWordsProcessedWithNoError = 0

    fnList = utils.findFiles(inputPath, filterExt=".txt")
    for fn in fnList:
        # BUGFIX: "rU" mode was removed in Python 3.11; plain "r" already
        # performs universal newline translation
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            text = fd.read()
        textList = text.split("\n")

        numUnnamedEntitiesForFN = 0
        numUnidentifiedUtterancesForFN = 0

        # fn is constant for the inner loop, so compute the name once
        name = os.path.splitext(fn)[0]

        speakerInfoList = []
        for line in textList:
            line = line.strip()

            try:
                startTime, stopTime, line = line.split(",", 2)
            except ValueError:
                print("error")
                continue

            origLine = line
            dataPrepTuple = juliusAlignment.formatTextForJulius(
                line, cabochaEncoding, cabochaPath)
            (line, tmpWordList, tmpKanaList, tmpRomajiList,
             unidentifiedUtterance, unnamedEntity,
             tmpWordCount) = dataPrepTuple

            # BUGFIX: accumulate into the per-file counters here; the
            # original added straight to the totals, leaving the per-file
            # counters permanently at 0 (so the per-file report always
            # printed zeros) while the totals below then re-added the
            # (zero) per-file values.  Totals are now built from the
            # per-file counters after the loop, preserving their meaning.
            numUnnamedEntitiesForFN += unnamedEntity
            numUnidentifiedUtterancesForFN += unidentifiedUtterance
            numWordsProcessedWithNoError += tmpWordCount

            outputList = [u"%s,%s,%s" % (name, startTime, stopTime),
                          origLine, tmpWordList, tmpKanaList, tmpRomajiList]
            outputStr = ";".join(outputList)
            speakerInfoList.append(outputStr)

        if (numUnnamedEntitiesForFN > 0
                or numUnidentifiedUtterancesForFN > 0):
            print(fn)
            print("Number of unnamed entities for fn: %d"
                  % numUnnamedEntitiesForFN)
            print("Number of unidentified utterances for fn: %d"
                  % numUnidentifiedUtterancesForFN)
        numUnnamedEntities += numUnnamedEntitiesForFN
        numUnidentifiedUtterances += numUnidentifiedUtterancesForFN

        with io.open(join(outputPath, fn), "w", encoding="utf-8") as fd:
            fd.write("\n".join(speakerInfoList))

    print("\n")
    print("Number of transcripts converted: %d" % len(fnList))
    print("Number of unnamed entities: %d" % numUnnamedEntities)
    print("Number of unidentified utterances: %d" % numUnidentifiedUtterances)
    print("Number of words processed without error: %d"
          % numWordsProcessedWithNoError)
def juliusAlignCabocha(dataList, wavPath, wavFN, juliusScriptPath, soxPath,
                       perlPath, silenceFlag, forceEndTimeFlag,
                       forceMonophoneAlignFlag):
    '''
    Given utterance-level timing and a wav file, phone-align the audio

    dataList is the formatted output of cabocha of the form
    [startTime, endTime, line, wordList, kanaList, romajiList]

    Returns (entryDict, statList) where entryDict maps UTTERANCE/WORD/PHONE
    to lists of (start, end, label) entries and statList is
    [numPhonesFailedToAlign, numTotalPhones, numFailedIntervals,
     numIntervals].
    '''
    tmpOutputPath = join(wavPath, "align_tmp")
    utils.makeDir(tmpOutputPath)  # (the original called makeDir twice)

    logFn = join(tmpOutputPath,
                 'align_log_' + str(datetime.datetime.now()) + '.txt')
    loggerFd = open(logFn, "w")

    tmpTxtFN = join(tmpOutputPath, "tmp.txt")
    tmpWavFN = join(tmpOutputPath, "tmp.wav")
    tmpOutputFN = join(tmpOutputPath, "tmp.lab")

    entryDict = {}
    for aspect in [UTTERANCE, WORD, PHONE]:
        entryDict[aspect] = []

    # one speech interval at a time
    numTotalPhones = 0
    numPhonesFailedToAlign = 0
    numIntervals = 0
    numFailedIntervals = 0

    try:
        # intervalStart, intervalEnd, line, wordList, kanaList, romajiList
        for rowTuple in dataList:
            intervalStart = rowTuple[0]
            intervalEnd = rowTuple[1]
            line = rowTuple[2]
            wordList = rowTuple[3]
            romajiList = rowTuple[5]

            if line.strip() != "":
                entryDict[UTTERANCE].append((str(intervalStart),
                                             str(intervalEnd), line))

            # Nothing to phone-align if there are no words
            if len([word for word in wordList if word != '']) == 0:
                continue

            assert intervalStart < intervalEnd

            # Phones broken up by word (tmpRomajiList) plus a flat list for
            # silence-model detection below
            tmpRomajiList = []
            tmpFlattenedRomajiList = []
            for row in romajiList:
                rowList = row.split(" ")
                tmpRomajiList.append(rowList)
                tmpFlattenedRomajiList.extend(rowList)

            numWords = len(wordList)
            romajiTxt = " ".join(romajiList)

            # No forced-alignment if there is no romaji
            if romajiTxt.strip() == "":
                continue

            # Encapsulate each phone string in boundary silence
            # - in my limited experience, this messes up the output even more
            if silenceFlag:
                romajiTxt = "silB " + romajiTxt + " silE"

            # Save temporary transcript and wav files for interval
            with open(tmpTxtFN, "w") as fd:
                fd.write(romajiTxt)
            audioScripts.extractSubwav(join(wavPath, wavFN), tmpWavFN,
                                       intervalStart, intervalEnd,
                                       singleChannelFlag=False,
                                       soxPath=soxPath)

            # Run forced alignment
            runJuliusAlignment(tmpOutputPath, juliusScriptPath, perlPath,
                               loggerFd)

            # Get the output (timestamps for each phone)
            numIntervals += 1
            try:
                matchList = parseJuliusOutput(tmpOutputFN)
            except JuliusAlignmentError:
                if forceMonophoneAlignFlag is True and numWords == 1:
                    # One phone occupies the whole interval
                    # NOTE(review): the *100 suggests these times are in
                    # centiseconds like parseJuliusOutput's -- confirm
                    matchList = [(0.0, (intervalEnd - intervalStart) * 100)]
                else:
                    numPhonesFailedToAlign += numWords
                    numFailedIntervals += 1
                    print("Failed to align: %s - %f - %f"
                          % ("".join(romajiList), intervalStart, intervalEnd))
                    continue

            adjustedPhonList = [[intervalStart + start,
                                 intervalStart + stop,
                                 label]
                                for start, stop, label in matchList]

            # Julius is conservative in estimating the final vowel.  Stretch
            # it to be the length of the utterance
            if forceEndTimeFlag:
                adjustedPhonList[-1][1] = intervalEnd

            entryDict[PHONE].extend(adjustedPhonList)

            # Get the bounding indicies for the phones in each word
            phoneToWordIndexList = []
            phonesSoFar = 0
            for i in range(len(wordList)):
                numPhones = len(tmpRomajiList[i])
                phoneToWordIndexList.append((phonesSoFar,
                                             phonesSoFar + numPhones - 1))
                phonesSoFar += numPhones

            # If julius uses a silence model and we don't, then adjust our
            # timings
            phoneListFromJulius = [label for _, _, label in adjustedPhonList]
            if ("silB" in phoneListFromJulius
                    and "silB" not in tmpFlattenedRomajiList):
                phoneToWordIndexList = [(startI + 1, endI + 1)
                                        for startI, endI
                                        in phoneToWordIndexList]
                lastI = phoneToWordIndexList[-1][1]
                phoneToWordIndexList = ([(0, 0)] + phoneToWordIndexList
                                        + [(lastI + 1, lastI + 1)])
                wordList = [""] + wordList + [""]

            # Store the words
            for i in range(len(wordList)):
                startI, stopI = phoneToWordIndexList[i]
                entryDict[WORD].append((adjustedPhonList[startI][0],
                                        adjustedPhonList[stopI][1],
                                        wordList[i]))

            numTotalPhones += numWords
    finally:
        # BUGFIX: the log file descriptor was never closed in the original
        loggerFd.close()

    statList = [numPhonesFailedToAlign, numTotalPhones, numFailedIntervals,
                numIntervals]

    return entryDict, statList
def forceAlignFile(speakerList, wavPath, wavNameDict, txtPath, txtFN,
                   outputPath, outputWavName, juliusScriptPath, soxPath,
                   perlPath):
    '''
    Force-align one transcript against one or more (synced) wav files.

    Normally: speakerList = [name] and wavNameDict = {name:"name.wav"}

    But, if you have multiple speakers for each file (assuming audio is
    synced) e.g. in a stereo audio situation:
    speakerList=["L","R"] and
    wavNameDict={"L":"%s_%s.wav" % (name, "L"),
                 "R":"%s_%s.wav" % (name, "R")}

    Writes <outputWavName>.TextGrid into outputPath with utterance/word/
    phone tiers per speaker, and returns
    (numPhonesFailedAlignment, numPhones, numFailedIntervals, numIntervals).
    '''
    utils.makeDir(outputPath)

    # Formatted output of cabocha: records separated by ";", fields by ","
    with io.open(join(txtPath, txtFN), "r", encoding="utf-8") as fd:
        data = fd.read()
    dataList = data.split("\n")
    dataList = [[subRow.split(",") for subRow in row.split(";")]
                for row in dataList if row != ""]

    dataDict = {speaker: [] for speaker in speakerList}
    # Undoing the unnecessary split that just happened (the utterance text
    # itself may legitimately contain commas)
    for timingInfo, line, wordList, kanaList, romajiList in dataList:
        line = ",".join(line)
        speaker, startTimeStr, endTimeStr = timingInfo
        speaker, startTime, endTime = (speaker.strip(), float(startTimeStr),
                                       float(endTimeStr))
        dataDict[speaker].append([startTime, endTime, line, wordList,
                                  kanaList, romajiList])

    # Do the forced alignment, accumulating stats across speakers
    speakerEntryDict = {}
    numPhonesFailedAlignment = 0
    numPhones = 0
    numFailedIntervals = 0
    numIntervals = 0
    for speaker in speakerList:
        output = juliusAlignment.juliusAlignCabocha(dataDict[speaker],
                                                    wavPath,
                                                    wavNameDict[speaker],
                                                    juliusScriptPath,
                                                    soxPath, perlPath,
                                                    False, True, True)
        speakerEntryDict[speaker], statList = output

        numPhonesFailedAlignment += statList[0]
        numPhones += statList[1]
        numFailedIntervals += statList[2]
        numIntervals += statList[3]

    # All durations should be the same, so any one wav determines maxT
    inputWavFN = next(iter(wavNameDict.values()))
    maxDuration = audioScripts.getSoundFileDuration(join(wavPath, inputWavFN))

    # Create tiers and textgrids from the output of the alignment
    tg = tgio.Textgrid()
    for speaker in speakerList:
        for aspect in [juliusAlignment.UTTERANCE, juliusAlignment.WORD,
                       juliusAlignment.PHONE]:
            tierName = "%s_%s" % (aspect, speaker)
            tier = tgio.IntervalTier(tierName,
                                     speakerEntryDict[speaker][aspect],
                                     minT=0, maxT=maxDuration)
            tg.addTier(tier)
    tg.save(join(outputPath, outputWavName + ".TextGrid"))

    return (numPhonesFailedAlignment, numPhones, numFailedIntervals,
            numIntervals)
def convertCorpusToKanaAndRomaji(inputPath, outputPath, cabochaEncoding,
                                 cabochaPath=None, encoding="cp932"):
    '''
    Reduces a corpus of typical Japanese text to both kana and romaji

    Each line of input should be of the form:
    startTime, stopTime, Japanese text
    '''
    utils.makeDir(outputPath)

    numUnnamedEntities = 0
    numUnidentifiedUtterances = 0
    numWordsProcessedWithNoError = 0

    fnList = utils.findFiles(inputPath, filterExt=".txt")
    for fn in fnList:
        # BUGFIX: "rU" mode was removed in Python 3.11; plain "r" already
        # performs universal newline translation
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            text = fd.read()
        textList = text.split("\n")

        numUnnamedEntitiesForFN = 0
        numUnidentifiedUtterancesForFN = 0

        # fn is constant for the inner loop, so compute the name once
        name = os.path.splitext(fn)[0]

        speakerInfoList = []
        for line in textList:
            line = line.strip()

            try:
                startTime, stopTime, line = line.split(",", 2)
            except ValueError:
                print("error")
                continue

            origLine = line
            dataPrepTuple = juliusAlignment.formatTextForJulius(
                line, cabochaEncoding, cabochaPath)
            (line, tmpWordList, tmpKanaList, tmpRomajiList,
             unidentifiedUtterance, unnamedEntity,
             tmpWordCount) = dataPrepTuple

            # BUGFIX: accumulate into the per-file counters here; the
            # original added straight to the totals, leaving the per-file
            # counters permanently at 0 (so the per-file report always
            # printed zeros) while the totals below then re-added the
            # (zero) per-file values.  Totals are now built from the
            # per-file counters after the loop, preserving their meaning.
            numUnnamedEntitiesForFN += unnamedEntity
            numUnidentifiedUtterancesForFN += unidentifiedUtterance
            numWordsProcessedWithNoError += tmpWordCount

            outputList = [u"%s,%s,%s" % (name, startTime, stopTime),
                          origLine, tmpWordList, tmpKanaList, tmpRomajiList]
            outputStr = ";".join(outputList)
            speakerInfoList.append(outputStr)

        if (numUnnamedEntitiesForFN > 0
                or numUnidentifiedUtterancesForFN > 0):
            print(fn)
            print("Number of unnamed entities for fn: %d"
                  % numUnnamedEntitiesForFN)
            print("Number of unidentified utterances for fn: %d"
                  % numUnidentifiedUtterancesForFN)
        numUnnamedEntities += numUnnamedEntitiesForFN
        numUnidentifiedUtterances += numUnidentifiedUtterancesForFN

        with io.open(join(outputPath, fn), "w", encoding="utf-8") as fd:
            fd.write("\n".join(speakerInfoList))

    print("\n")
    print("Number of transcripts converted: %d" % len(fnList))
    print("Number of unnamed entities: %d" % numUnnamedEntities)
    print("Number of unidentified utterances: %d" % numUnidentifiedUtterances)
    print("Number of words processed without error: %d"
          % numWordsProcessedWithNoError)