Пример #1
0
def splitAudio(path):
    """Split each stereo wav file in *path* into per-channel files.

    Output files are written to a "split_audio" subdirectory of *path*.
    """
    splitDir = join(path, "split_audio")
    utils.makeDir(splitDir)

    wavList = utils.findFiles(path, filterExt=".wav")
    for wavFN in wavList:
        audioScripts.splitStereoAudio(path, wavFN, splitDir)
Пример #2
0
def splitAudio(path):
    """Run stereo-channel splitting over every wav file found in *path*.

    Results land in join(path, "split_audio"), which is created if needed.
    """
    destPath = join(path, "split_audio")
    utils.makeDir(destPath)

    for audioFN in utils.findFiles(path, filterExt=".wav"):
        audioScripts.splitStereoAudio(path, audioFN, destPath)
Пример #3
0
def extractSubwav(fn, outputFN, startT, endT, singleChannelFlag, soxPath=None):
    """Extract the audio between *startT* and *endT* (seconds) from wav *fn*.

    The extracted segment is written to *outputFN*.  If the source file's
    sample rate differs from DEFAULT_FRAMERATE, the file is first resampled
    with sox into a sibling "resampledAudio" folder; the resampled copy is
    cached there and reused on later calls.  If *singleChannelFlag* is True
    and the audio has multiple channels, the output is reduced to the first
    channel only.

    Raises IncompatibleSampleFrequencyError when resampling with sox failed
    to produce an output file.
    """
    if soxPath is None:
        soxPath = "sox"  # Assumes it is in the user's path

    path, fnNoPath = os.path.split(fn)
    resampledAudioPath = join(path, "resampledAudio")
    resampledFN = join(resampledAudioPath, fnNoPath)

    audiofile = wave.open(fn, "r")
    try:
        params = audiofile.getparams()
        nchannels = params[0]
        sampwidth = params[1]
        framerate = params[2]
        comptype = params[4]
        compname = params[5]

        # If you are not using the default Julius training model, you might
        # need to change the value here.  Results will fail if the sampling
        # frequency is different.
        if framerate != DEFAULT_FRAMERATE:
            if not os.path.exists(resampledFN):
                utils.makeDir(resampledAudioPath)
                sr = str(DEFAULT_FRAMERATE)
                # Quote the file paths so names containing spaces survive
                # the shell invocation
                soxCmd = '%s "%s" -r %s "%s" rate -v 96k' % (soxPath, fn, sr,
                                                             resampledFN)
                os.system(soxCmd)

            if not os.path.exists(resampledFN):
                raise IncompatibleSampleFrequencyError(framerate,
                                                       DEFAULT_FRAMERATE)

            # Switch to the resampled copy; close the original handle first
            # so it does not leak
            audiofile.close()
            audiofile = wave.open(resampledFN, "r")
            params = audiofile.getparams()
            nchannels = params[0]
            sampwidth = params[1]
            framerate = params[2]
            comptype = params[4]
            compname = params[5]

        # Extract the audio frames
        audiofile.setpos(int(framerate * startT))
        audioFrames = audiofile.readframes(int(framerate * (endT - startT)))
    finally:
        audiofile.close()

    # Convert to single channel if needed
    if singleChannelFlag is True and nchannels > 1:
        audioFrames = audioop.tomono(audioFrames, sampwidth, 1, 0)
        nchannels = 1

    outParams = [
        nchannels, sampwidth, framerate,
        len(audioFrames), comptype, compname
    ]

    outWave = wave.open(outputFN, "w")
    try:
        outWave.setparams(outParams)
        outWave.writeframes(audioFrames)
    finally:
        # close() patches the true frame count into the header; without it
        # the output wav can be left with an inconsistent header
        outWave.close()
Пример #4
0
def renameMP3Files(path):
    """Move mp3 files whose base name ends in "x" into a "renamed" folder.

    The trailing "x" is stripped from the moved file's name; other mp3
    files are left in place.
    """
    outputPath = join(path, "renamed")
    utils.makeDir(outputPath)

    for name in utils.findFiles(path, filterExt=".mp3", stripExt=True):
        # endswith() also safely handles a degenerate empty base name,
        # where name[-1] would raise IndexError
        if name.endswith("x"):
            newName = name[:-1]
            shutil.move(join(path, name + ".mp3"),
                        join(outputPath, newName + ".mp3"))
Пример #5
0
def renameMP3Files(path):
    """Relocate mp3 files whose base name ends in "x" to .../renamed.

    The moved copy drops the trailing "x" from its name.
    """
    renamedDir = join(path, "renamed")
    utils.makeDir(renamedDir)

    for baseName in utils.findFiles(path, filterExt=".mp3", stripExt=True):
        if baseName[-1] != "x":
            continue
        srcFN = join(path, baseName + ".mp3")
        dstFN = join(renamedDir, baseName[:-1] + ".mp3")
        shutil.move(srcFN, dstFN)
Пример #6
0
def extractSubwav(fn, outputFN, startT, endT, singleChannelFlag, soxPath=None):
    """Write the [startT, endT] slice (in seconds) of wav *fn* to *outputFN*.

    If the input's sample rate differs from DEFAULT_FRAMERATE, a sox-resampled
    copy is created (and cached) under a sibling "resampledAudio" folder and
    read instead.  With singleChannelFlag=True, multi-channel audio is reduced
    to its first channel.

    Raises IncompatibleSampleFrequencyError when sox fails to produce the
    resampled file.
    """
    if soxPath is None:
        soxPath = "sox"  # Assumes it is in the user's path

    path, fnNoPath = os.path.split(fn)
    resampledAudioPath = join(path, "resampledAudio")
    resampledFN = join(resampledAudioPath, fnNoPath)

    audiofile = wave.open(fn, "r")
    try:
        params = audiofile.getparams()
        nchannels = params[0]
        sampwidth = params[1]
        framerate = params[2]
        comptype = params[4]
        compname = params[5]

        # If you are not using the default Julius training model, you might
        # need to change the value here.  Results will fail if the sampling
        # frequency is different.
        if framerate != DEFAULT_FRAMERATE:
            if not os.path.exists(resampledFN):
                utils.makeDir(resampledAudioPath)
                sr = str(DEFAULT_FRAMERATE)
                # Quote paths so filenames with spaces survive the shell
                soxCmd = '%s "%s" -r %s "%s" rate -v 96k' % (soxPath, fn, sr,
                                                             resampledFN)
                os.system(soxCmd)

            if not os.path.exists(resampledFN):
                raise IncompatibleSampleFrequencyError(framerate,
                                                       DEFAULT_FRAMERATE)

            # Re-open on the resampled copy, closing the original handle
            # (previously it leaked)
            audiofile.close()
            audiofile = wave.open(resampledFN, "r")
            params = audiofile.getparams()
            nchannels = params[0]
            sampwidth = params[1]
            framerate = params[2]
            comptype = params[4]
            compname = params[5]

        # Extract the audio frames
        audiofile.setpos(int(framerate * startT))
        audioFrames = audiofile.readframes(int(framerate * (endT - startT)))
    finally:
        audiofile.close()

    # Convert to single channel if needed
    if singleChannelFlag is True and nchannels > 1:
        audioFrames = audioop.tomono(audioFrames, sampwidth, 1, 0)
        nchannels = 1

    outParams = [nchannels, sampwidth, framerate, len(audioFrames),
                 comptype, compname]

    outWave = wave.open(outputFN, "w")
    try:
        outWave.setparams(outParams)
        outWave.writeframes(audioFrames)
    finally:
        # Closing fixes up the frame count in the wav header
        outWave.close()
Пример #7
0
def convertCorpusToUTF8(path):
    """Re-encode every .txt file in *path* from cp932 (Japanese) to utf-8.

    Converted files are written, same-named, to an "output" subdirectory
    of *path*.
    """
    outputDir = join(path, "output")
    utils.makeDir(outputDir)

    for fn in utils.findFiles(path, filterExt=".txt"):
        # cp932 = Japanese
        # "r" already gives universal-newline handling in Python 3; the
        # old "rU" mode was deprecated and removed in Python 3.11
        with io.open(join(path, fn), "r", encoding="cp932") as fd:
            text = fd.read()
        with io.open(join(outputDir, fn), "w", encoding='utf-8') as fd:
            fd.write(text)
Пример #8
0
def convertCorpusToUTF8(path):
    """Convert all .txt files in *path* from cp932 to utf-8 encoding.

    Output goes to join(path, "output"), keeping the original file names.
    """
    outputDir = join(path, "output")
    utils.makeDir(outputDir)

    for fn in utils.findFiles(path, filterExt=".txt"):
        # cp932 = Japanese
        # Mode "rU" was removed in Python 3.11; plain "r" text mode already
        # performs universal-newline translation
        with io.open(join(path, fn), "r", encoding="cp932") as fd:
            text = fd.read()
        with io.open(join(outputDir, fn), "w", encoding='utf-8') as fd:
            fd.write(text)
Пример #9
0
def forceAlignCorpus(wavPath,
                     txtPath,
                     outputPath,
                     juliusScriptPath=None,
                     soxPath=None,
                     perlPath=None):
    '''
    Force aligns every file and prints out summary statistics

    For each transcript in *txtPath* the same-named wav in *wavPath* is
    aligned via forceAlignFile(); per-file and corpus-wide success rates
    are printed to stdout.
    '''
    totalNumPhonesFailed = 0
    totalNumPhones = 0

    totalNumIntervalsFailed = 0
    totalNumIntervals = 0

    utils.makeDir(outputPath)

    for name in utils.findFiles(txtPath, filterExt=".txt", stripExt=True):
        wavNameDict = {name: "%s.wav" % name}
        output = forceAlignFile([
            name,
        ], wavPath, wavNameDict, txtPath, name + ".txt", outputPath, name,
                                juliusScriptPath, soxPath, perlPath)

        (numPhonesFailedAlignment, numPhones, numFailedIntervals,
         numIntervals) = output

        # utils.divide() guards against a zero denominator (fallback 0);
        # the results below are already percentages in [0, 100]
        percentFailed = utils.divide(numPhonesFailedAlignment, numPhones,
                                     0) * 100
        percentFailedIntervals = utils.divide(numFailedIntervals, numIntervals,
                                              0) * 100
        # BUGFIX: success rate is (100 - percentFailed); the previous
        # "100 * (1 - percentFailed)" treated the percentage as a fraction
        # and printed wildly wrong (often negative) numbers
        print("%d intervals of %d total intervals (%0.2f%%) and %d phones "
              "of %d total phones (%0.2f%%) successfully aligned for %s" %
              (numIntervals - numFailedIntervals, numIntervals,
               100 - percentFailedIntervals,
               numPhones - numPhonesFailedAlignment, numPhones,
               100 - percentFailed, name))

        totalNumPhonesFailed += numPhonesFailedAlignment
        totalNumPhones += numPhones

        totalNumIntervalsFailed += numFailedIntervals
        totalNumIntervals += numIntervals

    totalPercentFailed = utils.divide(totalNumPhonesFailed, totalNumPhones,
                                      0) * 100
    totalPercentFailedIntervals = utils.divide(totalNumIntervalsFailed,
                                               totalNumIntervals, 0) * 100
    print("====Summary====")
    # Same fix as above for the corpus-wide totals
    print(
        "%d intervals of %d total intervals (%0.2f%%) and %d phones of "
        "%d total phones (%0.2f%%) successfully aligned" %
        (totalNumIntervals - totalNumIntervalsFailed, totalNumIntervals,
         100 - totalPercentFailedIntervals,
         totalNumPhones - totalNumPhonesFailed, totalNumPhones,
         100 - totalPercentFailed))
Пример #10
0
def getSelectedTxtFiles(txtPath, wavPath):
    """Copy into .../selected_txt the txt files that have a matching wav.

    A txt file matches when its base name equals the portion of some wav
    file's name before the first underscore.
    """
    selectedDir = join(txtPath, "selected_txt")
    utils.makeDir(selectedDir)

    wavPrefixes = {
        wavName.split("_")[0]
        for wavName in utils.findFiles(wavPath, filterExt=".wav",
                                       stripExt=True)
    }

    for txtName in utils.findFiles(txtPath, filterExt=".txt", stripExt=True):
        if txtName in wavPrefixes:
            shutil.copy(join(txtPath, txtName + ".txt"),
                        join(selectedDir, txtName + ".txt"))
Пример #11
0
def getSelectedTxtFiles(txtPath, wavPath):
    """Select transcripts in *txtPath* whose names correspond to a wav file.

    Wav names are truncated at the first underscore; any txt file whose
    base name appears in that set is copied to join(txtPath, "selected_txt").
    """
    outputDir = join(txtPath, "selected_txt")
    utils.makeDir(outputDir)

    wavBaseNames = utils.findFiles(wavPath, filterExt=".wav", stripExt=True)
    keepSet = set(fullName.split("_")[0] for fullName in wavBaseNames)

    txtNames = utils.findFiles(txtPath, filterExt=".txt", stripExt=True)
    for txtName in txtNames:
        if txtName not in keepSet:
            continue
        srcFN = join(txtPath, txtName + ".txt")
        shutil.copy(srcFN, join(outputDir, txtName + ".txt"))
Пример #12
0
def textgridToCSV(inputPath, outputPath, outputExt='.csv'):
    """Dump the "utterances" tier of every TextGrid in *inputPath* to CSV.

    One output file per TextGrid, named after the input file with
    *outputExt*; each row is "start,stop,label".
    """
    utils.makeDir(outputPath)

    for fn in utils.findFiles(inputPath, filterExt=".TextGrid"):
        tg = tgio.openTextgrid(join(inputPath, fn))
        utteranceTier = tg.tierDict["utterances"]
        rowList = ["%s,%s,%s" % (begin, end, label)
                   for begin, end, label in utteranceTier.entryList]

        baseName = os.path.splitext(fn)[0]
        csvFN = join(outputPath, "%s%s" % (baseName, outputExt))
        with io.open(csvFN, "w", encoding="utf-8") as fd:
            fd.write("\n".join(rowList))
Пример #13
0
def forceAlignCorpus(wavPath, txtPath, outputPath, juliusScriptPath=None,
                     soxPath=None, perlPath=None):
    '''Force aligns every file and prints out summary statistics'''
    totalNumPhonesFailed = 0
    totalNumPhones = 0

    totalNumIntervalsFailed = 0
    totalNumIntervals = 0

    utils.makeDir(outputPath)

    for name in utils.findFiles(txtPath, filterExt=".txt", stripExt=True):
        wavNameDict = {name: "%s.wav" % name}
        output = forceAlignFile([name, ], wavPath, wavNameDict, txtPath,
                                name + ".txt", outputPath, name,
                                juliusScriptPath, soxPath, perlPath)

        (numPhonesFailedAlignment, numPhones,
         numFailedIntervals, numIntervals) = output

        # These are already percentages (0-100), so success is
        # 100 - percentFailed, NOT 100 * (1 - percentFailed) -- the old
        # expression treated a percentage as a fraction and printed
        # nonsense (often negative) numbers
        percentFailed = utils.divide(numPhonesFailedAlignment,
                                     numPhones, 0) * 100
        percentFailedIntervals = utils.divide(numFailedIntervals,
                                              numIntervals, 0) * 100
        print("%d intervals of %d total intervals (%0.2f%%) and %d phones "
              "of %d total phones (%0.2f%%) successfully aligned for %s" %
              (numIntervals - numFailedIntervals, numIntervals,
               100 - percentFailedIntervals,
               numPhones - numPhonesFailedAlignment, numPhones,
               100 - percentFailed, name))

        totalNumPhonesFailed += numPhonesFailedAlignment
        totalNumPhones += numPhones

        totalNumIntervalsFailed += numFailedIntervals
        totalNumIntervals += numIntervals

    totalPercentFailed = utils.divide(totalNumPhonesFailed,
                                      totalNumPhones, 0) * 100
    totalPercentFailedIntervals = utils.divide(totalNumIntervalsFailed,
                                               totalNumIntervals, 0) * 100
    print("====Summary====")
    print("%d intervals of %d total intervals (%0.2f%%) and %d phones of "
          "%d total phones (%0.2f%%) successfully aligned" %
          (totalNumIntervals - totalNumIntervalsFailed, totalNumIntervals,
           100 - totalPercentFailedIntervals,
           totalNumPhones - totalNumPhonesFailed, totalNumPhones,
           100 - totalPercentFailed))
Пример #14
0
def convertCRESTToKanaAndRomaji(inputPath, outputPath, cabochaEncoding,
                                cabochaPath, encoding="cp932"):
    """Convert CREST corpus transcripts into kana and romaji renditions.

    Each input line is expected to look like
    "<speakerCode> <startTime> <stopTime> <utterance>"; lines that do not
    split into those four fields are skipped.  For every input .txt file a
    same-named file is written under
    "<outputPath>/speaker_info_and_utterance_timing" where each row is
    "speakerCode,startTime,stopTime;origLine;words;kana;romaji"
    (fields joined by ";", elements within a field by ",").

    Files already present in the output folder are skipped, so an
    interrupted run can be resumed.  Counts of unnamed entities and
    unidentified utterances are printed per file and in total.
    """

    timeInfoPath = join(outputPath, "speaker_info_and_utterance_timing")

    for path in [timeInfoPath]:
        utils.makeDir(path)

    numUnnamedEntities = 0
    numUnidentifiedUtterances = 0
    # Already-generated files are excluded from the work list (resumability)
    finishedList = utils.findFiles(timeInfoPath, filterExt=".txt")
    for fn in utils.findFiles(inputPath, filterExt=".txt",
                              skipIfNameInList=finishedList):
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            text = fd.read()
        textList = text.split("\n")

        numUnnamedEntitiesForFN = 0
        numUnidentifiedUtterancesForFN = 0
        speakerInfoList = []
        for line in textList:
            line = line.strip()
            # Row header is "<speaker> <start> <stop> <text>"; skip rows
            # that do not have all four space-separated fields
            try:
                speakerCode, startTime, stopTime, line = line.split(" ", 3)
            except ValueError:
                continue

            origLine = line

            # Clean up the line before it gets processed
            # Not sure what "・" is but cabocha doesn't like it
            for char in [u"(", u")", u" ", u".", u"?", u"「", u"」",
                         u"[", u"]", u"@W", u"@S", u"<", u">", u" ", u"。"]:
                line = line.replace(char, "")

            # Used to split names?
            for char in [u"・", u"·"]:
                line = line.replace(char, " ")

            line = line.strip()

            # Chunk the utterance with cabocha; on a recognized failure,
            # fall back to empty word/kana/romaji lists so the row is
            # still emitted with its timing information
            try:
                tmp = jProcessingSnippet.getChunkedKana(line, cabochaEncoding,
                                                        cabochaPath)
                tmpWordList, tmpKanaList, tmpromajiList = tmp
            except (jProcessingSnippet.ChunkingError,
                    jProcessingSnippet.NonKatakanaError) as e:
                print(u"%s, %s" % (str(e), origLine))
                tmpWordList = [""]
                tmpKanaList = [""]
                tmpromajiList = [""]
                numUnidentifiedUtterancesForFN += 1
            except jProcessingSnippet.UnidentifiedJapaneseText as e:
                # A word consisting entirely of "X" marks an anonymized
                # (unnamed) entity rather than a parse failure
                if all([char == u"X" for char in e.word]):
                    numUnnamedEntitiesForFN += 1
                else:
                    print(u"%s" % str(e))
                    numUnidentifiedUtterancesForFN += 1
                tmpWordList = [""]
                tmpKanaList = [""]
                tmpromajiList = [""]
            except jProcessingSnippet.EmptyStrError as e:
                tmpWordList = [""]
                tmpKanaList = [""]
                tmpromajiList = [""]
            except Exception:
                # Unexpected failure: show the offending line, then re-raise
                print(line)
                raise
            # Strip commas so they cannot corrupt the comma-delimited fields
            line = line.replace(u",", u"")
            outputList = [u"%s,%s,%s" % (speakerCode, startTime, stopTime),
                          origLine, ','.join(tmpWordList),
                          ",".join(tmpKanaList), ",".join(tmpromajiList)]
            outputStr = ";".join(outputList)

            speakerInfoList.append(outputStr)

        print(fn)
        print("Number of unnamed entities for fn: %d" %
              numUnnamedEntitiesForFN)
        print("Number of unidentified utterances for fn: %d" %
              numUnidentifiedUtterancesForFN)
        numUnnamedEntities += numUnnamedEntitiesForFN
        numUnidentifiedUtterances += numUnidentifiedUtterancesForFN

        outputFN = join(timeInfoPath, fn)
        with io.open(outputFN, "w", encoding="utf-8") as fd:
            fd.write("\n".join(speakerInfoList))

    print("\n")
    print("Number of unnamed entities: %d" % numUnnamedEntities)
    print("Number of unidentified utterances: %d" % numUnidentifiedUtterances)
Пример #15
0
def forceAlignFile(wavPath, wavName, txtPath, txtFN, outputPath,
                   juliusScriptPath, soxPath):
    '''
    Force align one stereo recording (a "<wavName>_L.wav"/"_R.wav" pair).

    Reads the cabocha-formatted transcript *txtFN*, aligns each channel
    separately, and writes "<wavName>.TextGrid" to *outputPath* with
    utterance, word, and phone tiers per speaker.

    Returns (numPhonesFailedAlignment, numPhones, numFailedIntervals,
    numIntervals) summed over both channels.
    '''

    utils.makeDir(outputPath)

    wavFNDict = {"L": wavName + "_L.wav", "R": wavName + "_R.wav"}

    # Formatted output of cabocha.  Open with "r" (the "rU" mode was removed
    # in Python 3.11) and use a with-statement so the handle is not leaked.
    with open(join(txtPath, txtFN), "r") as fd:
        data = fd.read()
    dataList = data.split("\n")
    dataList = [[subRow.split(",") for subRow in row.split(";")]
                for row in dataList if row != ""]

    dataDict = {"L": [], "R": []}
    for timingInfo, line, wordList, kanaList, romajiList in dataList:
        # Undoing the unnecessary split that just happened
        line = ",".join(line)

        speaker, startTimeStr, endTimeStr = timingInfo
        speaker, startTime, endTime = (speaker.strip(), float(startTimeStr),
                                       float(endTimeStr))

        dataDict[speaker].append(
            [startTime, endTime, line, wordList, kanaList, romajiList])

    speakerEntryDict = {}
    numPhonesFailedAlignment = 0
    numPhones = 0
    numFailedIntervals = 0
    numIntervals = 0
    for speaker in ["L", "R"]:
        # NOTE(review): this call passes 8 positional args; another version
        # of juliusAlignCabocha in this codebase also takes perlPath --
        # confirm this matches the signature in use here
        tmp = juliusAlignment.juliusAlignCabocha(dataDict[speaker], wavPath,
                                                 wavFNDict[speaker],
                                                 juliusScriptPath, soxPath,
                                                 False, True, True)
        speakerEntryDict[speaker], statList = tmp
        numPhonesFailedAlignment += statList[0]
        numPhones += statList[1]
        numFailedIntervals += statList[2]
        numIntervals += statList[3]

    # Create tiers and textgrids
    tg = tgio.Textgrid()
    maxDuration = audioScripts.getSoundFileDuration(
        join(wavPath, wavName + "_L.wav"))
    for speaker in ["L", "R"]:
        for aspect in [
                juliusAlignment.UTTERANCE, juliusAlignment.WORD,
                juliusAlignment.PHONE
        ]:

            tierName = "%s_%s" % (aspect, speaker)

            tier = tgio.IntervalTier(tierName,
                                     speakerEntryDict[speaker][aspect],
                                     minT=0,
                                     maxT=maxDuration)
            tg.addTier(tier)

    tg.save(join(outputPath, wavName + ".TextGrid"))

    return (numPhonesFailedAlignment, numPhones, numFailedIntervals,
            numIntervals)
Пример #16
0
def juliusAlignCabocha(dataList, wavPath, wavFN, juliusScriptPath, soxPath,
                       perlPath, silenceFlag, forceEndTimeFlag,
                       forceMonophoneAlignFlag):
    '''
    Given utterance-level timing and a wav file, phone-align the audio

    dataList is the formatted output of cabocha; each row is of the form
    [startTime, endTime, line, wordList, kanaList, romajiList]

    silenceFlag: if True, wrap each romaji string in silB/silE markers
        before alignment
    forceEndTimeFlag: if True, stretch the final phone of each interval
        to the interval's end time
    forceMonophoneAlignFlag: if True, a failed single-word interval is
        assigned the whole interval instead of counting as a failure

    Returns (entryDict, statList): entryDict maps UTTERANCE/WORD/PHONE to
    lists of (start, stop, label) entries; statList is
    [numPhonesFailedToAlign, numTotalPhones, numFailedIntervals,
    numIntervals].
    '''
    tmpOutputPath = join(wavPath, "align_tmp")
    utils.makeDir(tmpOutputPath)

    # NOTE(review): loggerFd is never closed in this function -- confirm the
    # handle's lifetime is acceptable for long batch runs
    logFn = join(tmpOutputPath, 'align_log_' + str(datetime.datetime.now()) + '.txt')
    loggerFd = open(logFn, "w")

    utils.makeDir(tmpOutputPath)

    # Scratch files, overwritten for every interval
    tmpTxtFN = join(tmpOutputPath, "tmp.txt")
    tmpWavFN = join(tmpOutputPath, "tmp.wav")
    tmpOutputFN = join(tmpOutputPath, "tmp.lab")

    entryDict = {}
    for aspect in [UTTERANCE, WORD, PHONE]:
        entryDict[aspect] = []

    # one speech interval at a time
    numTotalPhones = 0
    numPhonesFailedToAlign = 0
    numIntervals = 0
    numFailedIntervals = 0

    # intervalStart, intervalEnd, line, wordList, kanaList, romajiList
    for rowTuple in dataList:
        intervalStart = rowTuple[0]
        intervalEnd = rowTuple[1]
        line = rowTuple[2]
        wordList = rowTuple[3]
        romajiList = rowTuple[5]

        # Record the utterance text even when it cannot be phone-aligned
        if line.strip() != "":
            entryDict[UTTERANCE].append((str(intervalStart),
                                         str(intervalEnd),
                                         line))

        # Nothing to align if there are no non-empty words
        if len([word for word in wordList if word != '']) == 0:
            continue

        assert(intervalStart < intervalEnd)

        # Create romajiTxt (for forced alignment) and
        # phoneList (for the textgrid)
        # Phones broken up by word
        tmpRomajiList = []
        tmpFlattenedRomajiList = []
        for row in romajiList:
            rowList = row.split(" ")
            tmpRomajiList.append(rowList)
            tmpFlattenedRomajiList.extend(rowList)

        numWords = len(wordList)
        wordTimeList = [[] for i in range(numWords)]

        romajiTxt = " ".join(romajiList)
        phoneList = [phone for phone in romajiTxt.split(" ")]

        # No forced-alignment if there is no romaji
        if romajiTxt.strip() == "":
            continue

        # Encapsulate each phone string in boundary silence
        #    - in my limited experience, this messes up the output even more
        if silenceFlag:
            romajiTxt = "silB " + romajiTxt + " silE"

        # Save temporary transcript and wav files for interval
        with open(tmpTxtFN, "w") as fd:
            fd.write(romajiTxt)

        audioScripts.extractSubwav(join(wavPath, wavFN), tmpWavFN,
                                   intervalStart, intervalEnd,
                                   singleChannelFlag=False,
                                   soxPath=soxPath)

        # Run forced alignment
        runJuliusAlignment(tmpOutputPath, juliusScriptPath, perlPath, loggerFd)

        # Get the output (timestamps for each phone)
        numIntervals += 1
        try:
            matchList = parseJuliusOutput(tmpOutputFN)
        except JuliusAlignmentError:
            if forceMonophoneAlignFlag is True and numWords == 1:
                # One phone occupies the whole interval
                # NOTE(review): this fallback tuple has only two elements,
                # but the unpack below expects (start, stop, label) -- confirm
                matchList = [(0.0, (intervalEnd - intervalStart) * 100)]
            else:
                numPhonesFailedToAlign += numWords
                numFailedIntervals += 1
                print("Failed to align: %s - %f - %f" %
                      ("".join(romajiList), intervalStart, intervalEnd))
                continue

        # Julius reports times relative to the interval; shift them to
        # absolute positions in the source file
        adjustedPhonList = [[intervalStart + start, intervalStart + stop, label]
                            for start, stop, label in matchList]

        # Julius is conservative in estimating the final vowel.  Stretch it
        # to be the length of the utterance
        if forceEndTimeFlag:
            adjustedPhonList[-1][1] = intervalEnd

        entryDict[PHONE].extend(adjustedPhonList)

        # Get the bounding indicies for the phones in each word
        phoneToWordIndexList = []
        phonesSoFar = 0
        for i in range(len(wordList)):
            numPhones = len(tmpRomajiList[i])
            phoneToWordIndexList.append((phonesSoFar, phonesSoFar + numPhones - 1))
            phonesSoFar += numPhones

        # If julius uses a silence model and we don't, then adjust our timings
        phoneListFromJulius = [label for _, _, label in adjustedPhonList]
        if  "silB" in phoneListFromJulius and "silB" not in tmpFlattenedRomajiList:
            phoneToWordIndexList = [(startI + 1, endI + 1) for startI, endI in phoneToWordIndexList]
            lastI = phoneToWordIndexList[-1][1]
            phoneToWordIndexList = [(0, 0)] + phoneToWordIndexList + [(lastI + 1, lastI + 1)]
            wordList = [""] + wordList + [""]

        # Store the words
        for i in range(len(wordList)):
            startI, stopI = phoneToWordIndexList[i]

            entryDict[WORD].append((adjustedPhonList[startI][0],
                                    adjustedPhonList[stopI][1],
                                    wordList[i]))

        numTotalPhones += numWords

    statList = [numPhonesFailedToAlign, numTotalPhones,
                numFailedIntervals, numIntervals]
    return entryDict, statList
Пример #17
0
def forceAlignFile(speakerList, wavPath, wavNameDict, txtPath, txtFN,
                   outputPath, outputWavName, juliusScriptPath, soxPath,
                   perlPath):
    '''
    Force align a single recording and save the result as a TextGrid.

    Normally:
    speakerList = [name]
    and
    wavNameDict = {name:"name.wav"}

    But, if you have multiple speakers for each file (assuming audio is synced)
    e.g. in a stereo audio situation:
    speakerList=["L","R"]
    and
    wavNameDict={"L":"%s_%s.wav" % (name, "L"), "R":"%s_%s.wav" % (name, "R")}

    Returns (numPhonesFailedAlignment, numPhones, numFailedIntervals,
    numIntervals) summed over all speakers.
    '''

    utils.makeDir(outputPath)

    # Formatted output of cabocha: rows separated by newlines, fields by
    # ";", and elements within a field by ","
    with io.open(join(txtPath, txtFN), "r", encoding="utf-8") as fd:
        data = fd.read()
    dataList = data.split("\n")
    dataList = [[subRow.split(",") for subRow in row.split(";")]
                for row in dataList if row != ""]

    dataDict = {speaker: [] for speaker in speakerList}

    # Undoing the unnecessary split that just happened
    for timingInfo, line, wordList, kanaList, romajiList in dataList:
        line = ",".join(line)

        speaker, startTimeStr, endTimeStr = timingInfo
        speaker, startTime, endTime = (speaker.strip(), float(startTimeStr),
                                       float(endTimeStr))

        dataDict[speaker].append([startTime, endTime, line, wordList,
                                  kanaList, romajiList])

    # Do the forced alignment
    speakerEntryDict = {}
    numPhonesFailedAlignment = 0
    numPhones = 0
    numFailedIntervals = 0
    numIntervals = 0
    for speaker in speakerList:
        output = juliusAlignment.juliusAlignCabocha(dataDict[speaker], wavPath,
                                                    wavNameDict[speaker],
                                                    juliusScriptPath, soxPath,
                                                    perlPath, False, True, True)
        speakerEntryDict[speaker], statList = output

        numPhonesFailedAlignment += statList[0]
        numPhones += statList[1]
        numFailedIntervals += statList[2]
        numIntervals += statList[3]

    # All durations should be the same
    inputWavFN = next(iter(wavNameDict.values()))
    maxDuration = audioScripts.getSoundFileDuration(join(wavPath, inputWavFN))

    # Create tiers and textgrids from the output of the alignment
    tg = tgio.Textgrid()
    for speaker in speakerList:
        for aspect in [juliusAlignment.UTTERANCE, juliusAlignment.WORD,
                       juliusAlignment.PHONE]:

            tierName = "%s_%s" % (aspect, speaker)
            tier = tgio.IntervalTier(tierName,
                                     speakerEntryDict[speaker][aspect],
                                     minT=0, maxT=maxDuration)
            tg.addTier(tier)

    tg.save(join(outputPath, outputWavName + ".TextGrid"))

    return (numPhonesFailedAlignment, numPhones,
            numFailedIntervals, numIntervals)
Пример #18
0
def convertCRESTToKanaAndRomaji(inputPath,
                                outputPath,
                                cabochaEncoding,
                                cabochaPath,
                                encoding="cp932"):
    """Convert CREST corpus transcripts into kana and romaji renditions.

    Input lines look like "<speakerCode> <startTime> <stopTime> <utterance>";
    malformed lines are skipped.  For each input .txt a same-named output is
    written under "<outputPath>/speaker_info_and_utterance_timing" with rows
    of the form "speakerCode,startTime,stopTime;origLine;words;kana;romaji".
    Already-converted files are skipped, so a run can be resumed.  Counts of
    unnamed entities and unidentified utterances are printed per file and
    in total.
    """

    timeInfoPath = join(outputPath, "speaker_info_and_utterance_timing")

    for path in [timeInfoPath]:
        utils.makeDir(path)

    numUnnamedEntities = 0
    numUnidentifiedUtterances = 0
    # Files already in the output folder are excluded (resumability)
    finishedList = utils.findFiles(timeInfoPath, filterExt=".txt")
    for fn in utils.findFiles(inputPath,
                              filterExt=".txt",
                              skipIfNameInList=finishedList):
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            text = fd.read()
        textList = text.split("\n")

        numUnnamedEntitiesForFN = 0
        numUnidentifiedUtterancesForFN = 0
        speakerInfoList = []
        for line in textList:
            line = line.strip()
            # Skip rows lacking the four space-separated header fields
            try:
                speakerCode, startTime, stopTime, line = line.split(" ", 3)
            except ValueError:
                continue

            origLine = line

            # Clean up the line before it gets processed
            # Not sure what "・" is but cabocha doesn't like it
            for char in [
                    u"(", u")", u" ", u".", u"?", u"「", u"」", u"[", u"]",
                    u"@W", u"@S", u"<", u">", u" ", u"。"
            ]:
                line = line.replace(char, "")

            # Used to split names?
            for char in [u"・", u"·"]:
                line = line.replace(char, " ")

            line = line.strip()

            # Chunk with cabocha; on recognized failures fall back to empty
            # word/kana/romaji lists so the row is still emitted
            try:
                tmp = jProcessingSnippet.getChunkedKana(
                    line, cabochaEncoding, cabochaPath)
                tmpWordList, tmpKanaList, tmpromajiList = tmp
            except (jProcessingSnippet.ChunkingError,
                    jProcessingSnippet.NonKatakanaError) as e:
                print(u"%s, %s" % (str(e), origLine))
                tmpWordList = [""]
                tmpKanaList = [""]
                tmpromajiList = [""]
                numUnidentifiedUtterancesForFN += 1
            except jProcessingSnippet.UnidentifiedJapaneseText as e:
                # An all-"X" word marks an anonymized (unnamed) entity
                if all([char == u"X" for char in e.word]):
                    numUnnamedEntitiesForFN += 1
                else:
                    print(u"%s" % str(e))
                    numUnidentifiedUtterancesForFN += 1
                tmpWordList = [""]
                tmpKanaList = [""]
                tmpromajiList = [""]
            except jProcessingSnippet.EmptyStrError as e:
                tmpWordList = [""]
                tmpKanaList = [""]
                tmpromajiList = [""]
            except Exception:
                # Unexpected failure: show the offending line, then re-raise
                print(line)
                raise
            # Remove commas so the comma-delimited fields stay parseable
            line = line.replace(u",", u"")
            outputList = [
                u"%s,%s,%s" % (speakerCode, startTime, stopTime), origLine,
                ','.join(tmpWordList), ",".join(tmpKanaList),
                ",".join(tmpromajiList)
            ]
            outputStr = ";".join(outputList)

            speakerInfoList.append(outputStr)

        print(fn)
        print("Number of unnamed entities for fn: %d" %
              numUnnamedEntitiesForFN)
        print("Number of unidentified utterances for fn: %d" %
              numUnidentifiedUtterancesForFN)
        numUnnamedEntities += numUnnamedEntitiesForFN
        numUnidentifiedUtterances += numUnidentifiedUtterancesForFN

        outputFN = join(timeInfoPath, fn)
        with io.open(outputFN, "w", encoding="utf-8") as fd:
            fd.write("\n".join(speakerInfoList))

    print("\n")
    print("Number of unnamed entities: %d" % numUnnamedEntities)
    print("Number of unidentified utterances: %d" % numUnidentifiedUtterances)
Пример #19
0
def forceAlignFile(wavPath, wavName, txtPath, txtFN, outputPath,
                   juliusScriptPath, soxPath):
    '''
    Force align one stereo recording ("<wavName>_L.wav" / "<wavName>_R.wav").

    Parses the cabocha-formatted transcript *txtFN*, runs Julius forced
    alignment per channel, and writes "<wavName>.TextGrid" with utterance,
    word, and phone tiers for each speaker.

    Returns (numPhonesFailedAlignment, numPhones, numFailedIntervals,
    numIntervals) summed over both channels.
    '''

    utils.makeDir(outputPath)

    wavFNDict = {"L": wavName + "_L.wav",
                 "R": wavName + "_R.wav"}

    # Formatted output of cabocha.  Use "r" -- the old "rU" mode was removed
    # in Python 3.11 -- and a with-statement so the handle is not leaked.
    with open(join(txtPath, txtFN), "r") as fd:
        data = fd.read()
    dataList = data.split("\n")
    dataList = [[subRow.split(",") for subRow in row.split(";")]
                for row in dataList if row != ""]

    dataDict = {"L": [], "R": []}
    for timingInfo, line, wordList, kanaList, romajiList in dataList:
        # Undoing the unnecessary split that just happened
        line = ",".join(line)

        speaker, startTimeStr, endTimeStr = timingInfo
        speaker, startTime, endTime = (speaker.strip(), float(startTimeStr),
                                       float(endTimeStr))

        dataDict[speaker].append([startTime, endTime, line, wordList,
                                  kanaList, romajiList])

    # Run the aligner once per channel, accumulating the summary stats
    speakerEntryDict = {}
    numPhonesFailedAlignment = 0
    numPhones = 0
    numFailedIntervals = 0
    numIntervals = 0
    for speaker in ["L", "R"]:
        tmp = juliusAlignment.juliusAlignCabocha(dataDict[speaker], wavPath,
                                                 wavFNDict[speaker],
                                                 juliusScriptPath,
                                                 soxPath,
                                                 False, True, True)
        speakerEntryDict[speaker], statList = tmp
        numPhonesFailedAlignment += statList[0]
        numPhones += statList[1]
        numFailedIntervals += statList[2]
        numIntervals += statList[3]

    # Create tiers and textgrids
    tg = tgio.Textgrid()
    maxDuration = audioScripts.getSoundFileDuration(join(wavPath,
                                                         wavName + "_L.wav"))
    for speaker in ["L", "R"]:
        for aspect in [juliusAlignment.UTTERANCE, juliusAlignment.WORD,
                       juliusAlignment.PHONE]:

            tierName = "%s_%s" % (aspect, speaker)

            tier = tgio.IntervalTier(tierName,
                                     speakerEntryDict[speaker][aspect],
                                     minT=0, maxT=maxDuration)
            tg.addTier(tier)

    tg.save(join(outputPath, wavName + ".TextGrid"))

    return (numPhonesFailedAlignment, numPhones,
            numFailedIntervals, numIntervals)
Пример #20
0
def convertCorpusToKanaAndRomaji(inputPath,
                                 outputPath,
                                 cabochaEncoding,
                                 cabochaPath=None,
                                 encoding="cp932"):
    '''
    Reduces a corpus of typical Japanese text to both kana and romaji

    Each line of input should be of the form:
    startTime, stopTime, Japanese text

    inputPath - folder of .txt transcripts to convert
    outputPath - folder where converted transcripts are written (utf-8)
    cabochaEncoding - encoding used when talking to cabocha
    cabochaPath - path to the cabocha executable (None -> default location)
    encoding - encoding of the input transcripts
    '''
    utils.makeDir(outputPath)

    numUnnamedEntities = 0
    numUnidentifiedUtterances = 0
    numWordsProcessedWithNoError = 0

    fnList = utils.findFiles(inputPath, filterExt=".txt")
    for fn in fnList:
        # "r" rather than the deprecated "rU"; io.open already performs
        # universal-newline translation by default
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            text = fd.read()
        textList = text.split("\n")

        # Per-file counters; folded into the corpus totals after the loop
        numUnnamedEntitiesForFN = 0
        numUnidentifiedUtterancesForFN = 0
        speakerInfoList = []
        for line in textList:
            line = line.strip()
            try:
                startTime, stopTime, line = line.split(",", 2)
            except ValueError:
                print("Skipping malformed line: %s" % line)
                continue
            origLine = line

            dataPrepTuple = juliusAlignment.formatTextForJulius(
                line, cabochaEncoding, cabochaPath)

            (line, tmpWordList, tmpKanaList, tmpRomajiList,
             unidentifiedUtterance, unnamedEntity,
             tmpWordCount) = dataPrepTuple

            # Accumulate into the per-file counters (the original code
            # incremented the corpus totals here, which left the per-file
            # counts permanently at zero)
            numUnnamedEntitiesForFN += unnamedEntity
            numUnidentifiedUtterancesForFN += unidentifiedUtterance
            numWordsProcessedWithNoError += tmpWordCount

            name = os.path.splitext(fn)[0]
            outputList = [
                u"%s,%s,%s" % (name, startTime, stopTime), origLine,
                tmpWordList, tmpKanaList, tmpRomajiList
            ]
            outputStr = ";".join(outputList)

            speakerInfoList.append(outputStr)

        # Report only the files that actually contained problems (the
        # original condition tested the corpus totals, so every file after
        # the first problematic one was reported, with zeroed counts)
        if (numUnnamedEntitiesForFN > 0 or numUnidentifiedUtterancesForFN > 0):
            print(fn)
            print("Number of unnamed entities for fn: %d" %
                  numUnnamedEntitiesForFN)
            print("Number of unidentified utterances for fn: %d" %
                  numUnidentifiedUtterancesForFN)

        numUnnamedEntities += numUnnamedEntitiesForFN
        numUnidentifiedUtterances += numUnidentifiedUtterancesForFN

        with io.open(join(outputPath, fn), "w", encoding="utf-8") as fd:
            fd.write("\n".join(speakerInfoList))

    print("\n")
    print("Number of transcripts converted: %d" % len(fnList))
    print("Number of unnamed entities: %d" % numUnnamedEntities)
    print("Number of unidentified utterances: %d" % numUnidentifiedUtterances)
    print("Number of words processed without error: %d" %
          numWordsProcessedWithNoError)
Пример #21
0
def juliusAlignCabocha(dataList, wavPath, wavFN, juliusScriptPath, soxPath,
                       perlPath, silenceFlag, forceEndTimeFlag,
                       forceMonophoneAlignFlag):
    '''
    Given utterance-level timing and a wav file, phone-align the audio

    dataList is the formatted output of cabocha of the form
    [startTime, endTime, line, wordList, kanaList, romajiList]

    Returns (entryDict, statList) where entryDict maps UTTERANCE, WORD and
    PHONE to lists of (startTime, endTime, label) entries and statList is
    [numPhonesFailedToAlign, numTotalPhones, numFailedIntervals,
     numIntervals]
    '''
    tmpOutputPath = join(wavPath, "align_tmp")
    utils.makeDir(tmpOutputPath)

    logFn = join(tmpOutputPath,
                 'align_log_' + str(datetime.datetime.now()) + '.txt')
    loggerFd = open(logFn, "w")

    tmpTxtFN = join(tmpOutputPath, "tmp.txt")
    tmpWavFN = join(tmpOutputPath, "tmp.wav")
    tmpOutputFN = join(tmpOutputPath, "tmp.lab")

    entryDict = {}
    for aspect in [UTTERANCE, WORD, PHONE]:
        entryDict[aspect] = []

    # one speech interval at a time
    numTotalPhones = 0
    numPhonesFailedToAlign = 0
    numIntervals = 0
    numFailedIntervals = 0

    try:
        # intervalStart, intervalEnd, line, wordList, kanaList, romajiList
        for rowTuple in dataList:
            intervalStart = rowTuple[0]
            intervalEnd = rowTuple[1]
            line = rowTuple[2]
            wordList = rowTuple[3]
            romajiList = rowTuple[5]

            if line.strip() != "":
                entryDict[UTTERANCE].append(
                    (str(intervalStart), str(intervalEnd), line))

            if len([word for word in wordList if word != '']) == 0:
                continue

            assert (intervalStart < intervalEnd)

            # Create romajiTxt (for forced alignment) and
            # phone lists (for the textgrid)
            # Phones broken up by word
            tmpRomajiList = []
            tmpFlattenedRomajiList = []
            for row in romajiList:
                rowList = row.split(" ")
                tmpRomajiList.append(rowList)
                tmpFlattenedRomajiList.extend(rowList)

            numWords = len(wordList)

            romajiTxt = " ".join(romajiList)

            # No forced-alignment if there is no romaji
            if romajiTxt.strip() == "":
                continue

            # Encapsulate each phone string in boundary silence
            #    - in my limited experience, this messes up the output
            #      even more
            if silenceFlag:
                romajiTxt = "silB " + romajiTxt + " silE"

            # Save temporary transcript and wav files for interval
            with open(tmpTxtFN, "w") as fd:
                fd.write(romajiTxt)

            audioScripts.extractSubwav(join(wavPath, wavFN),
                                       tmpWavFN,
                                       intervalStart,
                                       intervalEnd,
                                       singleChannelFlag=False,
                                       soxPath=soxPath)

            # Run forced alignment
            runJuliusAlignment(tmpOutputPath, juliusScriptPath, perlPath,
                               loggerFd)

            # Get the output (timestamps for each phone)
            numIntervals += 1
            try:
                matchList = parseJuliusOutput(tmpOutputFN)
            except JuliusAlignmentError:
                if forceMonophoneAlignFlag is True and numWords == 1:
                    # One entry occupies the whole interval.  A
                    # (start, stop, label) triple is required; the original
                    # code built a 2-tuple here, which crashed the unpacking
                    # below.
                    # NOTE(review): the *100 scaling is kept from the
                    # original -- presumably parseJuliusOutput() reports in
                    # centiseconds; confirm.  With forceEndTimeFlag the stop
                    # time gets overwritten anyway.
                    matchList = [(0.0, (intervalEnd - intervalStart) * 100,
                                  romajiTxt)]
                else:
                    numPhonesFailedToAlign += numWords
                    numFailedIntervals += 1
                    print("Failed to align: %s - %f - %f" %
                          ("".join(romajiList), intervalStart, intervalEnd))
                    continue

            adjustedPhonList = [[
                intervalStart + start, intervalStart + stop, label
            ] for start, stop, label in matchList]

            # Julius is conservative in estimating the final vowel.
            # Stretch it to be the length of the utterance
            if forceEndTimeFlag:
                adjustedPhonList[-1][1] = intervalEnd

            entryDict[PHONE].extend(adjustedPhonList)

            # Get the bounding indicies for the phones in each word
            phoneToWordIndexList = []
            phonesSoFar = 0
            for i in range(len(wordList)):
                numPhones = len(tmpRomajiList[i])
                phoneToWordIndexList.append(
                    (phonesSoFar, phonesSoFar + numPhones - 1))
                phonesSoFar += numPhones

            # If julius uses a silence model and we don't, then adjust our
            # timings
            phoneListFromJulius = [label for _, _, label in adjustedPhonList]
            if ("silB" in phoneListFromJulius
                    and "silB" not in tmpFlattenedRomajiList):
                phoneToWordIndexList = [(startI + 1, endI + 1)
                                        for startI, endI in
                                        phoneToWordIndexList]
                lastI = phoneToWordIndexList[-1][1]
                phoneToWordIndexList = [(0, 0)] + phoneToWordIndexList + [
                    (lastI + 1, lastI + 1)
                ]
                wordList = [""] + wordList + [""]

            # Store the words
            for i in range(len(wordList)):
                startI, stopI = phoneToWordIndexList[i]

                entryDict[WORD].append((adjustedPhonList[startI][0],
                                        adjustedPhonList[stopI][1],
                                        wordList[i]))

            numTotalPhones += numWords
    finally:
        # The log file handle was previously leaked; always close it, even
        # when alignment raises
        loggerFd.close()

    statList = [
        numPhonesFailedToAlign, numTotalPhones, numFailedIntervals,
        numIntervals
    ]
    return entryDict, statList
Пример #22
0
def forceAlignFile(speakerList, wavPath, wavNameDict, txtPath, txtFN,
                   outputPath, outputWavName, juliusScriptPath, soxPath,
                   perlPath):
    '''
    Phone-align one transcript file, one speaker at a time

    Normally:
    speakerList = [name]
    and
    wavNameDict = {name:"name.wav"}

    But, if you have multiple speakers for each file (assuming audio is synced)
    e.g. in a stereo audio situation:
    speakerList=["L","R"]
    and
    wavNameDict={"L":"%s_%s.wav" % (name, "L"), "R":"%s_%s.wav" % (name, "R")}
    '''
    utils.makeDir(outputPath)

    # Formatted output of cabocha: rows separated by newlines, fields by
    # semicolons, sub-fields by commas
    with io.open(join(txtPath, txtFN), "r", encoding="utf-8") as fd:
        rawRowList = [row for row in fd.read().split("\n") if row != ""]
    parsedRowList = [[field.split(",") for field in row.split(";")]
                     for row in rawRowList]

    dataDict = {speaker: [] for speaker in speakerList}
    for timingInfo, line, wordList, kanaList, romajiList in parsedRowList:
        # The comma-split above also broke up the utterance text; rejoin it
        line = ",".join(line)

        speaker, startTimeStr, endTimeStr = timingInfo
        dataDict[speaker.strip()].append([float(startTimeStr),
                                          float(endTimeStr), line,
                                          wordList, kanaList, romajiList])

    # Do the forced alignment, accumulating statistics across speakers:
    # [phones failed, phones total, intervals failed, intervals total]
    speakerEntryDict = {}
    totalStatList = [0, 0, 0, 0]
    for speaker in speakerList:
        entryDict, statList = juliusAlignment.juliusAlignCabocha(
            dataDict[speaker], wavPath, wavNameDict[speaker],
            juliusScriptPath, soxPath, perlPath, False, True, True)
        speakerEntryDict[speaker] = entryDict
        totalStatList = [total + stat
                         for total, stat in zip(totalStatList, statList)]

    # All durations should be the same
    inputWavFN = next(iter(wavNameDict.values()))
    maxDuration = audioScripts.getSoundFileDuration(join(wavPath, inputWavFN))

    # Create tiers and textgrids from the output of the alignment
    tg = tgio.Textgrid()
    aspectList = [juliusAlignment.UTTERANCE, juliusAlignment.WORD,
                  juliusAlignment.PHONE]
    for speaker in speakerList:
        for aspect in aspectList:
            tier = tgio.IntervalTier("%s_%s" % (aspect, speaker),
                                     speakerEntryDict[speaker][aspect],
                                     minT=0,
                                     maxT=maxDuration)
            tg.addTier(tier)

    tg.save(join(outputPath, outputWavName + ".TextGrid"))

    return tuple(totalStatList)
Пример #23
0
def convertCorpusToKanaAndRomaji(inputPath, outputPath, cabochaEncoding,
                                 cabochaPath=None, encoding="cp932"):
    '''
    Reduces a corpus of typical Japanese text to both kana and romaji

    Each line of input should be of the form:
    startTime, stopTime, Japanese text

    inputPath - folder of .txt transcripts to convert
    outputPath - folder where converted transcripts are written (utf-8)
    cabochaEncoding - encoding used when talking to cabocha
    cabochaPath - path to the cabocha executable (None -> default location)
    encoding - encoding of the input transcripts
    '''
    utils.makeDir(outputPath)

    numUnnamedEntities = 0
    numUnidentifiedUtterances = 0
    numWordsProcessedWithNoError = 0

    fnList = utils.findFiles(inputPath, filterExt=".txt")
    for fn in fnList:
        # "r" rather than the deprecated "rU"; io.open already performs
        # universal-newline translation by default
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            text = fd.read()
        textList = text.split("\n")

        # Per-file counters; folded into the corpus totals after the loop
        numUnnamedEntitiesForFN = 0
        numUnidentifiedUtterancesForFN = 0
        speakerInfoList = []
        for line in textList:
            line = line.strip()
            try:
                startTime, stopTime, line = line.split(",", 2)
            except ValueError:
                print("Skipping malformed line: %s" % line)
                continue
            origLine = line

            dataPrepTuple = juliusAlignment.formatTextForJulius(
                line, cabochaEncoding, cabochaPath)

            (line, tmpWordList, tmpKanaList, tmpRomajiList,
             unidentifiedUtterance, unnamedEntity,
             tmpWordCount) = dataPrepTuple

            # Accumulate into the per-file counters (the original code
            # incremented the corpus totals here, which left the per-file
            # counts permanently at zero)
            numUnnamedEntitiesForFN += unnamedEntity
            numUnidentifiedUtterancesForFN += unidentifiedUtterance
            numWordsProcessedWithNoError += tmpWordCount

            name = os.path.splitext(fn)[0]
            outputList = [u"%s,%s,%s" % (name, startTime, stopTime), origLine,
                          tmpWordList, tmpKanaList, tmpRomajiList]
            outputStr = ";".join(outputList)

            speakerInfoList.append(outputStr)

        # Report only the files that actually contained problems (the
        # original condition tested the corpus totals, so every file after
        # the first problematic one was reported, with zeroed counts)
        if (numUnnamedEntitiesForFN > 0 or numUnidentifiedUtterancesForFN > 0):
            print(fn)
            print("Number of unnamed entities for fn: %d" %
                  numUnnamedEntitiesForFN)
            print("Number of unidentified utterances for fn: %d" %
                  numUnidentifiedUtterancesForFN)

        numUnnamedEntities += numUnnamedEntitiesForFN
        numUnidentifiedUtterances += numUnidentifiedUtterancesForFN

        with io.open(join(outputPath, fn), "w", encoding="utf-8") as fd:
            fd.write("\n".join(speakerInfoList))

    print("\n")
    print("Number of transcripts converted: %d" % len(fnList))
    print("Number of unnamed entities: %d" % numUnnamedEntities)
    print("Number of unidentified utterances: %d" % numUnidentifiedUtterances)
    print("Number of words processed without error: %d" %
          numWordsProcessedWithNoError)