def getSelectedTxtFiles(txtPath, wavPath):
    '''
    Copies the transcripts that have matching audio into a subfolder

    A transcript <name>.txt in txtPath is copied to txtPath/selected_txt
    when at least one .wav file in wavPath has a name whose portion
    before the first "_" equals <name>.
    '''
    outputPath = join(txtPath, "selected_txt")
    utils.makeDir(outputPath)

    # Keep the wav prefixes in a set: duplicates collapse automatically and
    # the membership test in the loop below is constant time
    wavNameList = utils.findFiles(wavPath, filterExt=".wav", stripExt=True)
    wavPrefixSet = set(wavName.split("_")[0] for wavName in wavNameList)

    for name in utils.findFiles(txtPath, filterExt=".txt", stripExt=True):
        if name not in wavPrefixSet:
            continue
        txtFN = name + ".txt"
        shutil.copy(join(txtPath, txtFN), join(outputPath, txtFN))
def splitAudio(path):
    '''
    Runs audioScripts.splitStereoAudio on every .wav file in path

    The folder <path>/split_audio is created and passed as the output
    location for each file.
    '''
    splitDir = join(path, "split_audio")
    utils.makeDir(splitDir)

    wavFNList = utils.findFiles(path, filterExt=".wav")
    for wavFN in wavFNList:
        audioScripts.splitStereoAudio(path, wavFN, splitDir)
def forceAlignCrest(wavPath, txtPath, outputPath, juliusScriptPath, soxPath):
    '''
    Force aligns the transcripts in txtPath and prints failure statistics

    The names of the .TextGrid files already in outputPath are handed to
    utils.findFiles as skipIfNameInList (presumably so that finished
    transcripts are not aligned again).  A failure report is printed for
    each file, followed by a summary over all processed files.
    '''
    perFileMsg = ("%d intervals of %d total intervals (%0.2f%%) "
                  "and %d phones of %d total phones (%0.2f%%) "
                  "failed to align for %s")
    summaryMsg = ("%d intervals of %d total intervals (%0.2f%%) "
                  "and %d phones of %d total phones (%0.2f%%) "
                  "failed to align")

    # Running totals over every file processed in this call
    phonesFailedSum = 0
    phonesSum = 0
    intervalsFailedSum = 0
    intervalsSum = 0

    alreadyAligned = utils.findFiles(outputPath, filterExt=".TextGrid",
                                     stripExt=True)
    for name in utils.findFiles(txtPath, filterExt=".txt",
                                skipIfNameInList=alreadyAligned,
                                stripExt=True):
        # NOTE(review): forceAlignFile is defined elsewhere; the 4-tuple
        # layout below is taken from how the result is unpacked here
        result = forceAlignFile(wavPath, name, txtPath, name + ".txt",
                                outputPath, juliusScriptPath, soxPath)
        phonesFailed, phones, intervalsFailed, intervals = result

        phonePct = utils.divide(phonesFailed, phones, 0) * 100
        intervalPct = utils.divide(intervalsFailed, intervals, 0) * 100
        print(perFileMsg % (intervalsFailed, intervals, intervalPct,
                            phonesFailed, phones, phonePct, name))

        phonesFailedSum += phonesFailed
        phonesSum += phones
        intervalsFailedSum += intervalsFailed
        intervalsSum += intervals

    phonePctSum = utils.divide(phonesFailedSum, phonesSum, 0) * 100
    intervalPctSum = utils.divide(intervalsFailedSum, intervalsSum, 0) * 100

    print("====Summary====")
    print(summaryMsg % (intervalsFailedSum, intervalsSum, intervalPctSum,
                        phonesFailedSum, phonesSum, phonePctSum))
def renameMP3Files(path):
    '''
    Moves mp3 files whose name ends in "x" into <path>/renamed

    The trailing "x" is dropped from the file name during the move
    (e.g. "abcx.mp3" becomes "renamed/abc.mp3").  Files whose name does
    not end in "x" are left where they are.
    '''
    outputPath = join(path, "renamed")
    utils.makeDir(outputPath)

    for name in utils.findFiles(path, filterExt=".mp3", stripExt=True):
        # endswith() copes with an empty name, where name[-1] would raise.
        # Skipping here also guarantees newName is always bound (and never
        # left over from a previous file) when the move happens.
        if not name.endswith("x"):
            continue

        newName = name[:-1]
        shutil.move(join(path, name + ".mp3"),
                    join(outputPath, newName + ".mp3"))
def convertCorpusToUTF8(path):
    '''
    Re-encodes every .txt file in path from cp932 to utf-8

    The converted files are written, under the same file names, to
    <path>/output.  The source files are not modified.
    '''
    outputDir = join(path, "output")
    utils.makeDir(outputDir)

    for fn in utils.findFiles(path, filterExt=".txt"):
        # cp932 = Japanese
        # Plain "r" is used instead of "rU": io.open already applies
        # universal-newline handling in text mode, and the "U" flag is
        # rejected by Python 3.11+
        with io.open(join(path, fn), "r", encoding="cp932") as fd:
            text = fd.read()

        with io.open(join(outputDir, fn), "w", encoding='utf-8') as fd:
            fd.write(text)
def forceAlignCorpus(wavPath, txtPath, outputPath, juliusScriptPath=None,
                     soxPath=None, perlPath=None):
    '''
    Force aligns every file and prints out summary statistics

    Each <name>.txt in txtPath is passed to forceAlignFile together with
    <name>.wav.  outputPath is created if needed and handed to
    forceAlignFile.  After each file, and once more at the end, the count
    and percentage of successfully aligned intervals and phones is
    printed.
    '''
    totalNumPhonesFailed = 0
    totalNumPhones = 0
    totalNumIntervalsFailed = 0
    totalNumIntervals = 0

    utils.makeDir(outputPath)

    for name in utils.findFiles(txtPath, filterExt=".txt", stripExt=True):
        wavNameDict = {name: "%s.wav" % name}
        output = forceAlignFile([name, ], wavPath, wavNameDict, txtPath,
                                name + ".txt", outputPath, name,
                                juliusScriptPath, soxPath, perlPath)

        (numPhonesFailedAlignment, numPhones,
         numFailedIntervals, numIntervals) = output

        # Both values are already on a 0-100 scale, so the success rate is
        # simply the complement to 100 (not 100 * (1 - value))
        percentFailed = utils.divide(numPhonesFailedAlignment,
                                     numPhones, 0) * 100
        percentFailedIntervals = utils.divide(numFailedIntervals,
                                              numIntervals, 0) * 100
        print("%d intervals of %d total intervals (%0.2f%%) and %d phones "
              "of %d total phones (%0.2f%%) successfully aligned for %s" %
              (numIntervals - numFailedIntervals, numIntervals,
               100 - percentFailedIntervals,
               numPhones - numPhonesFailedAlignment, numPhones,
               100 - percentFailed, name))

        totalNumPhonesFailed += numPhonesFailedAlignment
        totalNumPhones += numPhones
        totalNumIntervalsFailed += numFailedIntervals
        totalNumIntervals += numIntervals

    totalPercentFailed = utils.divide(totalNumPhonesFailed,
                                      totalNumPhones, 0) * 100
    totalPercentFailedIntervals = utils.divide(totalNumIntervalsFailed,
                                               totalNumIntervals, 0) * 100
    print("====Summary====")
    print("%d intervals of %d total intervals (%0.2f%%) and %d phones of "
          "%d total phones (%0.2f%%) successfully aligned" %
          (totalNumIntervals - totalNumIntervalsFailed, totalNumIntervals,
           100 - totalPercentFailedIntervals,
           totalNumPhones - totalNumPhonesFailed, totalNumPhones,
           100 - totalPercentFailed))
def textgridToCSV(inputPath, outputPath, outputExt='.csv'):
    '''
    Dumps the "utterances" tier of every TextGrid in inputPath to text

    One output file per TextGrid is written to outputPath (utf-8), named
    after the TextGrid with outputExt as the extension.  Each entry of
    the tier becomes one line of the form:
    start,stop,label
    '''
    utils.makeDir(outputPath)

    for tgFN in utils.findFiles(inputPath, filterExt=".TextGrid"):
        textgrid = tgio.openTextgrid(join(inputPath, tgFN))
        entryList = textgrid.tierDict["utterances"].entryList

        rowList = ["%s,%s,%s" % (start, stop, label)
                   for start, stop, label in entryList]

        baseName = os.path.splitext(tgFN)[0]
        csvFN = join(outputPath, "%s%s" % (baseName, outputExt))
        with io.open(csvFN, "w", encoding="utf-8") as fd:
            fd.write("\n".join(rowList))
def forceAlignCorpus(wavPath, txtPath, outputPath, juliusScriptPath=None,
                     soxPath=None, perlPath=None):
    '''
    Force aligns every file and prints out summary statistics

    Each <name>.txt in txtPath is passed to forceAlignFile together with
    <name>.wav.  outputPath is created if needed and handed to
    forceAlignFile.  After each file, and once more at the end, the count
    and percentage of successfully aligned intervals and phones is
    printed.
    '''
    totalNumPhonesFailed = 0
    totalNumPhones = 0
    totalNumIntervalsFailed = 0
    totalNumIntervals = 0

    utils.makeDir(outputPath)

    for name in utils.findFiles(txtPath, filterExt=".txt", stripExt=True):
        wavNameDict = {name: "%s.wav" % name}
        output = forceAlignFile([name, ], wavPath, wavNameDict, txtPath,
                                name + ".txt", outputPath, name,
                                juliusScriptPath, soxPath, perlPath)

        (numPhonesFailedAlignment, numPhones,
         numFailedIntervals, numIntervals) = output

        # Both values are already on a 0-100 scale, so the success rate is
        # simply the complement to 100 (not 100 * (1 - value))
        percentFailed = utils.divide(numPhonesFailedAlignment,
                                     numPhones, 0) * 100
        percentFailedIntervals = utils.divide(numFailedIntervals,
                                              numIntervals, 0) * 100
        print("%d intervals of %d total intervals (%0.2f%%) and %d phones "
              "of %d total phones (%0.2f%%) successfully aligned for %s" %
              (numIntervals - numFailedIntervals, numIntervals,
               100 - percentFailedIntervals,
               numPhones - numPhonesFailedAlignment, numPhones,
               100 - percentFailed, name))

        totalNumPhonesFailed += numPhonesFailedAlignment
        totalNumPhones += numPhones
        totalNumIntervalsFailed += numFailedIntervals
        totalNumIntervals += numIntervals

    totalPercentFailed = utils.divide(totalNumPhonesFailed,
                                      totalNumPhones, 0) * 100
    totalPercentFailedIntervals = utils.divide(totalNumIntervalsFailed,
                                               totalNumIntervals, 0) * 100
    print("====Summary====")
    print("%d intervals of %d total intervals (%0.2f%%) and %d phones of "
          "%d total phones (%0.2f%%) successfully aligned" %
          (totalNumIntervals - totalNumIntervalsFailed, totalNumIntervals,
           100 - totalPercentFailedIntervals,
           totalNumPhones - totalNumPhonesFailed, totalNumPhones,
           100 - totalPercentFailed))
def convertCorpusToKanaAndRomaji(inputPath, outputPath, cabochaEncoding,
                                 cabochaPath=None, encoding="cp932"):
    '''
    Reduces a corpus of typical Japanese text to both kana and romaji

    Each line of input should be of the form:
    startTime, stopTime, Japanese text

    For every .txt file in inputPath a file of the same name is written
    to outputPath (utf-8).  Each output line joins, with ";", the
    "name,startTime,stopTime" header, the original text, and the word,
    kana, and romaji strings returned by
    juliusAlignment.formatTextForJulius.  Input lines that do not split
    into three comma-separated fields are reported with "error" and
    skipped.
    '''
    utils.makeDir(outputPath)

    numUnnamedEntities = 0
    numUnidentifiedUtterances = 0
    numWordsProcessedWithNoError = 0

    fnList = utils.findFiles(inputPath, filterExt=".txt")
    for fn in fnList:
        # Plain "r" instead of "rU": io.open already applies universal
        # newlines in text mode and Python 3.11+ rejects the "U" flag
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            text = fd.read()
        textList = text.split("\n")

        name = os.path.splitext(fn)[0]

        numUnnamedEntitiesForFN = 0
        numUnidentifiedUtterancesForFN = 0
        speakerInfoList = []
        for line in textList:
            line = line.strip()
            try:
                startTime, stopTime, line = line.split(",", 2)
            except ValueError:
                print("error")
                continue
            origLine = line

            dataPrepTuple = juliusAlignment.formatTextForJulius(
                line, cabochaEncoding, cabochaPath)

            (line, tmpWordList, tmpKanaList, tmpRomajiList,
             unidentifiedUtterance, unnamedEntity,
             tmpWordCount) = dataPrepTuple

            # Problems are tallied per file so that the report below
            # reflects this file only; the tallies are folded into the
            # corpus-wide totals once the file is done
            numUnnamedEntitiesForFN += unnamedEntity
            numUnidentifiedUtterancesForFN += unidentifiedUtterance
            numWordsProcessedWithNoError += tmpWordCount

            outputList = [u"%s,%s,%s" % (name, startTime, stopTime),
                          origLine, tmpWordList, tmpKanaList, tmpRomajiList]
            outputStr = ";".join(outputList)

            speakerInfoList.append(outputStr)

        if (numUnnamedEntitiesForFN > 0 or
                numUnidentifiedUtterancesForFN > 0):
            print(fn)
            print("Number of unnamed entities for fn: %d" %
                  numUnnamedEntitiesForFN)
            print("Number of unidentified utterances for fn: %d" %
                  numUnidentifiedUtterancesForFN)

        numUnnamedEntities += numUnnamedEntitiesForFN
        numUnidentifiedUtterances += numUnidentifiedUtterancesForFN

        with io.open(join(outputPath, fn), "w", encoding="utf-8") as fd:
            fd.write("\n".join(speakerInfoList))

    print("\n")
    print("Number of transcripts converted: %d" % len(fnList))
    print("Number of unnamed entities: %d" % numUnnamedEntities)
    print("Number of unidentified utterances: %d" %
          numUnidentifiedUtterances)
    print("Number of words processed without error: %d" %
          numWordsProcessedWithNoError)
def convertCRESTToKanaAndRomaji(inputPath, outputPath, cabochaEncoding,
                                cabochaPath, encoding="cp932"):
    '''
    Converts CREST transcripts into word, kana, and romaji listings

    Each usable line of input has the form:
    speakerCode startTime stopTime Japanese text
    Lines that do not split into those four space-separated fields are
    skipped.

    One file per transcript is written (utf-8) to
    <outputPath>/speaker_info_and_utterance_timing.  Each output line
    joins, with ";", the "speakerCode,startTime,stopTime" header, the
    original text, and the comma-joined word, kana, and romaji lists.
    Utterances that could not be chunked get empty lists.  The file names
    already in the output folder are passed to utils.findFiles as
    skipIfNameInList.
    '''
    timeInfoPath = join(outputPath, "speaker_info_and_utterance_timing")
    utils.makeDir(timeInfoPath)

    # Removed from the text before chunking
    removeList = [u"(", u")", u" ", u".", u"?", u"「", u"」",
                  u"[", u"]", u"@W", u"@S", u"<", u">", u" ", u"。"]
    # Not sure what "・" is but cabocha doesn't like it; it is swapped for
    # a space (used to split names?)
    toSpaceList = [u"・", u"·"]

    unnamedTotal = 0
    unidentifiedTotal = 0

    finishedList = utils.findFiles(timeInfoPath, filterExt=".txt")
    for fn in utils.findFiles(inputPath, filterExt=".txt",
                              skipIfNameInList=finishedList):
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            rawLineList = fd.read().split("\n")

        unnamedForFN = 0
        unidentifiedForFN = 0
        rowList = []
        for rawLine in rawLineList:
            fieldList = rawLine.strip().split(" ", 3)
            if len(fieldList) != 4:
                continue
            speakerCode, startTime, stopTime, origLine = fieldList

            # Clean up the line before it gets processed
            cleanLine = origLine
            for token in removeList:
                cleanLine = cleanLine.replace(token, "")
            for token in toSpaceList:
                cleanLine = cleanLine.replace(token, " ")
            cleanLine = cleanLine.strip()

            # Fallback values, kept whenever one of the handled errors
            # below is raised
            wordList, kanaList, romajiList = [""], [""], [""]
            try:
                chunked = jProcessingSnippet.getChunkedKana(
                    cleanLine, cabochaEncoding, cabochaPath)
                wordList, kanaList, romajiList = chunked
            except (jProcessingSnippet.ChunkingError,
                    jProcessingSnippet.NonKatakanaError) as err:
                print(u"%s, %s" % (str(err), origLine))
                unidentifiedForFN += 1
            except jProcessingSnippet.UnidentifiedJapaneseText as err:
                # A word made up only of "X" is counted as an unnamed
                # entity rather than as an unidentified utterance
                if all(ch == u"X" for ch in err.word):
                    unnamedForFN += 1
                else:
                    print(u"%s" % str(err))
                    unidentifiedForFN += 1
            except jProcessingSnippet.EmptyStrError:
                # Nothing to report; the empty fallback lists are used
                pass
            except Exception:
                print(cleanLine)
                raise

            header = u"%s,%s,%s" % (speakerCode, startTime, stopTime)
            rowList.append(";".join([header,
                                     origLine,
                                     ",".join(wordList),
                                     ",".join(kanaList),
                                     ",".join(romajiList)]))

        print(fn)
        print("Number of unnamed entities for fn: %d" % unnamedForFN)
        print("Number of unidentified utterances for fn: %d" %
              unidentifiedForFN)
        unnamedTotal += unnamedForFN
        unidentifiedTotal += unidentifiedForFN

        with io.open(join(timeInfoPath, fn), "w", encoding="utf-8") as fd:
            fd.write("\n".join(rowList))

    print("\n")
    print("Number of unnamed entities: %d" % unnamedTotal)
    print("Number of unidentified utterances: %d" % unidentifiedTotal)
def convertCRESTToKanaAndRomaji(inputPath, outputPath, cabochaEncoding,
                                cabochaPath, encoding="cp932"):
    '''
    Converts CREST transcripts into word, kana, and romaji listings

    Each usable line of input has the form:
    speakerCode startTime stopTime Japanese text
    Lines that do not split into those four space-separated fields are
    skipped.

    One file per transcript is written (utf-8) to
    <outputPath>/speaker_info_and_utterance_timing.  Each output line
    joins, with ";", the "speakerCode,startTime,stopTime" header, the
    original text, and the comma-joined word, kana, and romaji lists.
    Utterances that could not be chunked get empty lists.  The file names
    already in the output folder are passed to utils.findFiles as
    skipIfNameInList.
    '''
    timeInfoPath = join(outputPath, "speaker_info_and_utterance_timing")
    utils.makeDir(timeInfoPath)

    # Removed from the text before chunking
    removeList = [u"(", u")", u" ", u".", u"?", u"「", u"」",
                  u"[", u"]", u"@W", u"@S", u"<", u">", u" ", u"。"]
    # Not sure what "・" is but cabocha doesn't like it; it is swapped for
    # a space (used to split names?)
    toSpaceList = [u"・", u"·"]

    unnamedTotal = 0
    unidentifiedTotal = 0

    finishedList = utils.findFiles(timeInfoPath, filterExt=".txt")
    for fn in utils.findFiles(inputPath, filterExt=".txt",
                              skipIfNameInList=finishedList):
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            rawLineList = fd.read().split("\n")

        unnamedForFN = 0
        unidentifiedForFN = 0
        rowList = []
        for rawLine in rawLineList:
            fieldList = rawLine.strip().split(" ", 3)
            if len(fieldList) != 4:
                continue
            speakerCode, startTime, stopTime, origLine = fieldList

            # Clean up the line before it gets processed
            cleanLine = origLine
            for token in removeList:
                cleanLine = cleanLine.replace(token, "")
            for token in toSpaceList:
                cleanLine = cleanLine.replace(token, " ")
            cleanLine = cleanLine.strip()

            # Fallback values, kept whenever one of the handled errors
            # below is raised
            wordList, kanaList, romajiList = [""], [""], [""]
            try:
                chunked = jProcessingSnippet.getChunkedKana(
                    cleanLine, cabochaEncoding, cabochaPath)
                wordList, kanaList, romajiList = chunked
            except (jProcessingSnippet.ChunkingError,
                    jProcessingSnippet.NonKatakanaError) as err:
                print(u"%s, %s" % (str(err), origLine))
                unidentifiedForFN += 1
            except jProcessingSnippet.UnidentifiedJapaneseText as err:
                # A word made up only of "X" is counted as an unnamed
                # entity rather than as an unidentified utterance
                if all(ch == u"X" for ch in err.word):
                    unnamedForFN += 1
                else:
                    print(u"%s" % str(err))
                    unidentifiedForFN += 1
            except jProcessingSnippet.EmptyStrError:
                # Nothing to report; the empty fallback lists are used
                pass
            except Exception:
                print(cleanLine)
                raise

            header = u"%s,%s,%s" % (speakerCode, startTime, stopTime)
            rowList.append(";".join([header,
                                     origLine,
                                     ",".join(wordList),
                                     ",".join(kanaList),
                                     ",".join(romajiList)]))

        print(fn)
        print("Number of unnamed entities for fn: %d" % unnamedForFN)
        print("Number of unidentified utterances for fn: %d" %
              unidentifiedForFN)
        unnamedTotal += unnamedForFN
        unidentifiedTotal += unidentifiedForFN

        with io.open(join(timeInfoPath, fn), "w", encoding="utf-8") as fd:
            fd.write("\n".join(rowList))

    print("\n")
    print("Number of unnamed entities: %d" % unnamedTotal)
    print("Number of unidentified utterances: %d" % unidentifiedTotal)
def convertCorpusToKanaAndRomaji(inputPath, outputPath, cabochaEncoding,
                                 cabochaPath=None, encoding="cp932"):
    '''
    Reduces a corpus of typical Japanese text to both kana and romaji

    Each line of input should be of the form:
    startTime, stopTime, Japanese text

    For every .txt file in inputPath a file of the same name is written
    to outputPath (utf-8).  Each output line joins, with ";", the
    "name,startTime,stopTime" header, the original text, and the word,
    kana, and romaji strings returned by
    juliusAlignment.formatTextForJulius.  Input lines that do not split
    into three comma-separated fields are reported with "error" and
    skipped.
    '''
    utils.makeDir(outputPath)

    numUnnamedEntities = 0
    numUnidentifiedUtterances = 0
    numWordsProcessedWithNoError = 0

    fnList = utils.findFiles(inputPath, filterExt=".txt")
    for fn in fnList:
        # Plain "r" instead of "rU": io.open already applies universal
        # newlines in text mode and Python 3.11+ rejects the "U" flag
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            text = fd.read()
        textList = text.split("\n")

        name = os.path.splitext(fn)[0]

        numUnnamedEntitiesForFN = 0
        numUnidentifiedUtterancesForFN = 0
        speakerInfoList = []
        for line in textList:
            line = line.strip()
            try:
                startTime, stopTime, line = line.split(",", 2)
            except ValueError:
                print("error")
                continue
            origLine = line

            dataPrepTuple = juliusAlignment.formatTextForJulius(
                line, cabochaEncoding, cabochaPath)

            (line, tmpWordList, tmpKanaList, tmpRomajiList,
             unidentifiedUtterance, unnamedEntity,
             tmpWordCount) = dataPrepTuple

            # Problems are tallied per file so that the report below
            # reflects this file only; the tallies are folded into the
            # corpus-wide totals once the file is done
            numUnnamedEntitiesForFN += unnamedEntity
            numUnidentifiedUtterancesForFN += unidentifiedUtterance
            numWordsProcessedWithNoError += tmpWordCount

            outputList = [u"%s,%s,%s" % (name, startTime, stopTime),
                          origLine, tmpWordList, tmpKanaList, tmpRomajiList]
            outputStr = ";".join(outputList)

            speakerInfoList.append(outputStr)

        if (numUnnamedEntitiesForFN > 0 or
                numUnidentifiedUtterancesForFN > 0):
            print(fn)
            print("Number of unnamed entities for fn: %d" %
                  numUnnamedEntitiesForFN)
            print("Number of unidentified utterances for fn: %d" %
                  numUnidentifiedUtterancesForFN)

        numUnnamedEntities += numUnnamedEntitiesForFN
        numUnidentifiedUtterances += numUnidentifiedUtterancesForFN

        with io.open(join(outputPath, fn), "w", encoding="utf-8") as fd:
            fd.write("\n".join(speakerInfoList))

    print("\n")
    print("Number of transcripts converted: %d" % len(fnList))
    print("Number of unnamed entities: %d" % numUnnamedEntities)
    print("Number of unidentified utterances: %d" %
          numUnidentifiedUtterances)
    print("Number of words processed without error: %d" %
          numWordsProcessedWithNoError)