Example #1
def decompress_gz_file(filePath, overWrite=False):
    '''
	Decompress a gz file.

	Args:
		<filePath>: file path.
		<overWrite>: If True, overwrite the decompressed file when it already exists.
	Return:
		the absolute path of the decompressed file.
	'''
    assert isinstance(
        filePath,
        str), f"<filePath> must be a string but got {type_name(filePath)}."
    filePath = filePath.strip()
    if not os.path.isfile(filePath):
        raise WrongPath(f"No such file:{filePath}.")
    elif not filePath.endswith(".gz"):
        raise WrongOperation(f"{filePath}: Unknown suffix.")

    outFile = filePath[:-3]
    if overWrite is True and os.path.isfile(outFile):
        os.remove(outFile)

    cmd = f"gzip -d {filePath}"
    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if cod != 0:
        print(err.decode())
        raise ShellProcessError("Failed to decompress file.")
    else:
        return os.path.abspath(outFile)
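
A minimal usage sketch for this example (the path is hypothetical):

# "data/feat.ark.gz" is decompressed in place to "data/feat.ark"
outPath = decompress_gz_file("data/feat.ark.gz", overWrite=True)
print(outPath)  # absolute path of the decompressed file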
Example #2
def compress_gz_file(filePath, overWrite=False):
    '''
	Compress a file to gz file.

	Args:
		<filePath>: file path.
		<overWrite>: If True, overwrite the gz file when it already exists.
	Return:
		the absolute path of compressed file.
	'''
    assert isinstance(
        filePath,
        str), f"<filePath> must be a string but got {type_name(filePath)}."
    filePath = filePath.strip()
    if not os.path.isfile(filePath):
        raise WrongPath(f"No such file:{filePath}.")

    outFile = filePath + ".gz"
    if overWrite is True and os.path.isfile(outFile):
        os.remove(outFile)

    cmd = f"gzip {filePath}"
    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if cod != 0:
        print(err.decode())
        raise ShellProcessError("Failed to compress file.")
    else:
        return os.path.abspath(outFile)
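
The compress direction works the same way (hypothetical path; note that plain gzip removes the source file after compressing):

# produces "data/feat.ark.gz"; the original "data/feat.ark" is consumed by gzip
gzPath = compress_gz_file("data/feat.ark", overWrite=True)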
Example #3
def compress_gz_file(filePath, overWrite=False, keepSource=False):
    '''
	Compress a file to gz file.

	Args:
		<filePath>: file path.
		<overWrite>: If True, overwrite the gz file when it already exists.
		<keepSource>: If True, retain the source file.
	
	Return:
		the path of compressed file.
	'''
    declare.is_file("filePath", filePath)
    declare.is_bool("overWrite", overWrite)
    declare.is_bool("keepSource", keepSource)

    filePath = os.path.abspath(filePath)
    if filePath.endswith(".gz"):
        raise WrongOperation(f"Cannot compress a .gz file:{filePath}.")
    else:
        outFile = filePath + ".gz"

    if os.path.isfile(outFile):
        if overWrite is True:
            os.remove(outFile)
        else:
            raise WrongOperation(
                f"File has existed:{outFile}. If overwrite it,set option <overWrite>=True."
            )

    if keepSource:
        cmd = f"gzip -k {filePath}"
    else:
        cmd = f"gzip {filePath}"

    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if cod != 0:
        print(err.decode())
        raise ShellProcessError("Failed to compress file.")
    else:
        return outFile
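
With this variant the source can be kept; a sketch under the same hypothetical path:

# gzip -k keeps "data/feat.ark" alongside the new "data/feat.ark.gz"
gzPath = compress_gz_file("data/feat.ark", overWrite=True, keepSource=True)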
Example #4
def view_kaldi_usage(toolName):
    '''
	View the help information of the specified kaldi command.

	Args:
		<toolName>: kaldi tool name.
	'''
    declare.is_valid_string("toolName", toolName)
    cmd = toolName.strip().split()
    assert len(
        cmd
    ) == 1, f"<toolName> must only include one command name but got: {toolName}."
    cmd = cmd[0]
    cmd += " --help"

    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if cod != 0:
        print(err.decode())
        raise ShellProcessError(f"Failed to get kaldi tool info: {toolName}.")
    else:
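        # kaldi tools write their usage text to stderr, so it is read from <err> on success as well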
        print(err.decode())
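
For instance, with a real kaldi tool name (assuming kaldi is on PATH):

# print the usage message of kaldi's copy-feats tool
view_kaldi_usage("copy-feats")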
Example #5
def load_ali(target, aliType=None, name="ali", hmm=None):
    '''
	Load alignment data.

	Args:
		<target>: Python dict object, bytes object, exkaldi alignment object, kaldi alignment file or .npy file.
		<aliType>: None, or one of 'transitionID', 'phoneID', 'pdfID'. It will return different alignment object.
		<name>: a string.
		<hmm>: file path or exkaldi HMM object.
	Return:
		exkaldi alignment data objects.
	'''
    assert isinstance(
        name, str) and len(name) > 0, "Name should be a non-empty string."

    ExkaldiInfo.vertify_kaldi_existed()

    def transform(data, cmd):
        out, err, cod = run_shell_command(cmd,
                                          stdin=subprocess.PIPE,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE,
                                          inputs=data)
        if (isinstance(cod, int) and cod != 0) and out == b'':
            print(err.decode())
            raise KaldiProcessError('Failed to transform alignment.')
        else:
            result = {}
            sp = BytesIO(out)
            for line in sp.readlines():
                line = line.decode()
                line = line.strip().split()
                utt = line[0]
                matrix = np.array(line[1:], dtype=np.int32)
                result[utt] = matrix
            return result

    if isinstance(target, dict):
        if aliType is None:
            result = NumpyAlignment(target, name)
        elif aliType == "transitionID":
            result = NumpyAlignmentTrans(target, name)
        elif aliType == "phoneID":
            result = NumpyAlignmentPhone(target, name)
        elif aliType == "pdfID":
            result = NumpyAlignmentPdf(target, name)
        else:
            raise WrongOperation(
                f"<aliType> should be None, 'transitionID', 'phoneID' or 'pdfID' but got {aliType}."
            )
        result.check_format()
        return result

    elif type_name(target) in [
            "NumpyAlignment", "NumpyAlignmentTrans", "NumpyAlignmentPhone",
            "NumpyAlignmentPdf", "BytesAlignmentTrans"
    ]:
        result = copy.deepcopy(target)
        result.rename(name)
        return result

    elif isinstance(target, str):

        allFiles = list_files(target)

        results = {
            "NumpyAlignment": NumpyAlignment(),
            "NumpyAlignmentTrans": NumpyAlignmentTrans(),
            "NumpyAlignmentPhone": NumpyAlignmentPhone(),
            "NumpyAlignmentPdf": NumpyAlignmentPdf(),
            "BytesAlignmentTrans": BytesAlignmentTrans(),
        }

        for fileName in allFiles:
            fileName = os.path.abspath(fileName)

            if fileName.endswith(".npy"):
                temp = __read_data_from_file(fileName, "npy")
                if aliType is None:
                    temp = NumpyAlignment(temp.data)
                    results["NumpyAlignment"] += temp
                elif aliType == "transitionID":
                    temp = NumpyAlignmentTrans(temp.data)
                    results["NumpyAlignmentTrans"] += temp
                elif aliType == "phoneID":
                    temp = NumpyAlignmentPhone(temp.data)
                    results["NumpyAlignmentPhone"] += temp
                elif aliType == "pdfID":
                    temp = NumpyAlignmentPdf(temp.data)
                    results["NumpyAlignmentPdf"] += temp
                else:
                    raise WrongOperation(
                        f"<aliType> should be None, 'transitionID','phoneID' or 'pdfID' but got {aliType}."
                    )

            else:
                if fileName.endswith('.gz'):
                    cmd = f'gunzip -c {fileName}'
                else:
                    cmd = f'cat {fileName}'

                if aliType is None or aliType == "transitionID":
                    out, err, cod = run_shell_command(cmd,
                                                      stdout=subprocess.PIPE,
                                                      stderr=subprocess.PIPE)
                    if (isinstance(cod, int) and cod != 0) or out == b'':
                        print(err.decode())
                        raise ShellProcessError(
                            "Failed to get the alignment data from file.")
                    else:
                        temp = BytesAlignmentTrans(out)
                        results["BytesAlignmentTrans"] += temp

                else:
                    temp = tempfile.NamedTemporaryFile("wb+")
                    try:
                        if type_name(hmm) in ("HMM", "MonophoneHMM",
                                              "TriphoneHMM"):
                            hmm.save(temp)
                            hmmFileName = temp.name
                        elif isinstance(hmm, str):
                            if not os.path.isfile(hmm):
                                raise WrongPath(f"No such file:{hmm}.")
                            hmmFileName = hmm
                        else:
                            raise UnsupportedType(
                                f"<hmm> should be a filePath or exkaldi HMM and its sub-class object. but got {type_name(hmm)}."
                            )

                        if aliType == "phoneID":
                            cmd += f" | ali-to-phones --per-frame=true {hmmFileName} ark:- ark,t:-"
                            temp = transform(None, cmd)
                            temp = NumpyAlignmentPhone(temp)
                            results["NumpyAlignmentPhone"] += temp

                        elif target == "pdfID":
                            cmd = f" | ali-to-pdf {hmmFileName} ark:- ark,t:-"
                            temp = transform(None, cmd)
                            temp = NumpyAlignmentPdf(temp)
                            results["NumpyAlignmentPdf"] += temp
                        else:
                            raise WrongOperation(
                                f"<target> should be 'trainsitionID', 'phoneID' or 'pdfID' but got {target}."
                            )

                    finally:
                        temp.close()

        finalResult = []
        for obj in results.values():
            if not obj.is_void:
                obj.rename(name)
                finalResult.append(obj)

        if len(finalResult) == 0:
            raise WrongOperation(
                "<target> does not include any available data.")
        elif len(finalResult) == 1:
            finalResult = finalResult[0]

        return finalResult
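
A usage sketch (file paths are hypothetical; converting to phone IDs requires the HMM that produced the alignment):

# load transition-ID alignments directly from a gzipped kaldi alignment file
transAli = load_ali("exp/tri1/ali.1.gz", aliType="transitionID")
# convert the same file to frame-level phone IDs via ali-to-phones
phoneAli = load_ali("exp/tri1/ali.1.gz", aliType="phoneID", hmm="exp/tri1/final.mdl")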
Example #6
def train_ngrams_srilm(lexicons, order, text, outFile, config=None):
    '''
	Train an N-Grams language model with the SRILM toolkit.
	If you don't specify the discount method via the <config> option, "kndiscount" is used by default.

	Args:
		<lexicons>: an exkaldi LexiconBank object.
		<order>: the maximum order of N-Grams.
		<text>: a text corpus file or an exkaldi transcription object.
		<outFile>: output file name of arpa LM.
		<config>: extra configurations, a Python dict object.

	You can use the .check_config("train_ngrams_srilm") function to get a reference of the extra configurations.
	Also you can run the shell command "ngram-count" to see their usage.
	'''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)
    # verify the max order
    declare.less_equal("order", order, "max order", 9)
    # prepare srilm tool
    ExkaldiInfo.prepare_srilm()

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus, i.e. one split by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd,
                                              stdout="PIPE",
                                              stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(
                    f"Failed to sample from text file:{text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file:{text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat(
                        "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                    )

        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(
                    value, str
                ), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat(
                    "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                )
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        unkSymbol = lexicons("oov")

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                        if key.endswith("discount"):
                            specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f'ngram-count -text {text} -order {order} -limit-vocab -vocab {wordlistTemp.name} -unk -map-unk "{unkSymbol}" '
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "
        cmd += extraConfig  # append the extra configurations collected above

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)
        cmd += f" -lm {outFile}"

        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
        raise KaldiProcessError(
            'Failed to generate N-Grams language model.')

        return outFile
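
A sketch of a training call, assuming lexbank is an existing exkaldi LexiconBank object and the corpus path is hypothetical:

# train a 3-gram LM with the default kndiscount + interpolate settings
arpaFile = train_ngrams_srilm(lexbank, order=3, text="data/train/text.txt",
                              outFile="lm/3gram.arpa")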
Example #7
def train_ngrams_kenlm(lexicons, order, text, outFile, config=None):
    '''
	Train an N-Grams language model with the KenLM toolkit.

	Args:
		<lexicons>: an exkaldi LexiconBank object.
		<order>: the maximum order of N-Grams.
		<text>: a text corpus file or an exkaldi transcription object.
		<outFile>: output file name of arpa LM.
		<config>: extra configurations, a Python dict object.

	You can use the .check_config("train_ngrams_kenlm") function to get a reference of the extra configurations.
	Also you can run the shell command "lmplz" to see their usage.
	'''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)

    declare.less_equal("order", order, "max order", 9)

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus, i.e. one split by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd,
                                              stdout="PIPE",
                                              stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(
                    f"Failed to sample from text file:{text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file:{text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat(
                        "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                    )

        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(
                    value, str
                ), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat(
                    "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                )
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        extraConfig = " "
        if config is not None:
            if check_config(name='train_ngrams_kenlm', config=config):
                if "--temp_prefix" in config.keys() and "-T" in config.keys():
                    raise WrongOperation(
                        f'"--temp_prefix" and "-T" is the same configuration so only one of them is expected.'
                    )
                if "--memory" in config.keys() and "-S" in config.keys():
                    raise WrongOperation(
                        f'"--memory" and "-S" is the same configuration so only one of them is expected.'
                    )
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                    else:
                        extraConfig += f"{key} {value} "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words_count = math.ceil(len(words) / 10) * 10
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {text} --arpa {outFile} --limit_vocab_file {wordlistTemp.name}"
        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or (os.path.getsize(outFile)
                                                 == 0):
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KenlmProcessError("Failed to generate arpa file.")

        return outFile
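
The KenLM counterpart is called the same way; extra lmplz options go through <config>, for example (assuming check_config("train_ngrams_kenlm") accepts "--prune", a real lmplz flag):

# prune low-count n-grams above order 1 while training a 4-gram LM
arpaFile = train_ngrams_kenlm(lexbank, order=4, text="data/train/text.txt",
                              outFile="lm/4gram.arpa", config={"--prune": "0 1 1 1"})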
Example #8
File: load.py Project: ikou-austin/exkaldi
def load_ali(target, aliType="transitionID", name="ali", hmm=None):
    '''
	Load alignment data.

	Args:
		<target>: Python dict object, bytes object, exkaldi alignment object, kaldi alignment file or .npy file.
		<aliType>: None, or one of 'transitionID', 'phoneID', 'pdfID'. It will return a different alignment object accordingly.
		<name>: a string.
		<hmm>: file path or exkaldi HMM object.

	Return:
		exkaldi alignment objects.
	'''
    declare.is_valid_string("name", name)
    declare.is_instances("aliType", aliType,
                         [None, "transitionID", "phoneID", "pdfID"])
    declare.kaldi_existed()

    def transform(data, cmd):
        out, err, cod = run_shell_command(cmd,
                                          stdin="PIPE",
                                          stdout="PIPE",
                                          stderr="PIPE",
                                          inputs=data)
        if (isinstance(cod, int) and cod != 0) and out == b'':
            print(err.decode())
            raise KaldiProcessError('Failed to transform alignment.')
        else:
            result = {}
            sp = BytesIO(out)
            for line in sp.readlines():
                line = line.decode()
                line = line.strip().split()
                utt = line[0]
                matrix = np.array(line[1:], dtype=np.int32)
                result[utt] = matrix
            return result

    if isinstance(target, dict):
        if aliType is None:
            result = NumpyAlignment(target, name)
        elif aliType == "transitionID":
            result = NumpyAlignmentTrans(target, name)
        elif aliType == "phoneID":
            result = NumpyAlignmentPhone(target, name)
        elif aliType == "pdfID":
            result = NumpyAlignmentPdf(target, name)
        else:
            raise WrongOperation(
                f"<aliType> should be None,'transitionID','phoneID' or 'pdfID' but got {aliType}."
            )
        result.check_format()
        return result

    elif isinstance(
            target,
        (NumpyAlignment, NumpyAlignmentTrans, BytesAlignmentTrans)):
        result = copy.deepcopy(target)
        result.rename(name)
        return result

    elif isinstance(target, ArkIndexTable):
        result = target.fetch(arkType="ali")
        if aliType in ["phoneID", "pdfID"]:
            result = result.to_numpy(aliType, hmm)
        result.rename(name)
        return result

    elif isinstance(target, str):

        allFiles = list_files(target)

        numpyAli = {}
        bytesAli = []

        for fileName in allFiles:
            fileName = fileName.strip()
            if fileName.endswith(".npy"):
                try:
                    temp = np.load(fileName, allow_pickle=True)
                    for utt, mat in temp:
                        numpyAli[utt] = mat
                except Exception:
                    raise UnsupportedType(
                        f'This is not a valid Exkaldi npy file: {fileName}.')
            else:
                if fileName.endswith('.gz'):
                    cmd = f'gunzip -c {fileName}'
                else:
                    cmd = f'cat {fileName}'

                if aliType is None or aliType == "transitionID":
                    out, err, cod = run_shell_command(cmd,
                                                      stdout="PIPE",
                                                      stderr="PIPE")
                    if (isinstance(cod, int) and cod != 0) or out == b'':
                        print(err.decode())
                        raise ShellProcessError(
                            f"Failed to get the alignment data from file: {fileName}."
                        )
                    else:
                        bytesAli.append(out)

                else:
                    with FileHandleManager() as fhm:

                        declare.is_potential_hmm("hmm", hmm)
                        if not isinstance(hmm, str):
                            hmmTemp = fhm.create("wb+")
                            hmm.save(hmmTemp)
                            hmm = hmmTemp.name

                        if aliType == "phoneID":
                            cmd += f" | ali-to-phones --per-frame=true {hmm} ark:- ark,t:-"
                            temp = transform(None, cmd)

                        else:
                            cmd += f" | ali-to-pdf {hmm} ark:- ark,t:-"
                            temp = transform(None, cmd)

                    numpyAli.update(temp)

        bytesAli = b"".join(bytesAli)
        if aliType is None:
            if len(numpyAli) == 0:
                return BytesAlignmentTrans(bytesAli, name=name)
            elif len(bytesAli) == 0:
                return NumpyAlignment(numpyAli, name=name)
            else:
                result = NumpyAlignmentTrans(
                    numpyAli) + BytesAlignmentTrans(bytesAli)
                result.rename(name)
                return result
        elif aliType == "transitionID":
            if len(numpyAli) == 0:
                return BytesAlignmentTrans(bytesAli, name=name)
            elif len(bytesAli) == 0:
                return NumpyAlignmentTrans(numpyAli, name=name)
            else:
                result = NumpyAlignmentTrans(
                    numpyAli) + BytesAlignmentTrans(bytesAli)
                result.rename(name)
                return result
        elif aliType == "phoneID":
            return NumpyAlignmentPhone(numpyAli, name=name)
        else:
            return NumpyAlignmentPdf(numpyAli, name=name)

    else:
        raise UnsupportedType(
            f"<target> should be dict,file name or exkaldi alignment or index table object but got: {type_name(target)}."
        )
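
This version also accepts an exkaldi ArkIndexTable; a sketch (paths are hypothetical, and load_index_table is assumed to be the exkaldi loader for index files):

# fetch the alignment through an index table, then convert it to pdf IDs
indexTable = load_index_table("exp/tri1/ali.scp")
pdfAli = load_ali(indexTable, aliType="pdfID", hmm="exp/tri1/final.mdl")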
Example #9
File: lm.py Project: xujiajun6677/exkaldi
def train_ngrams_srilm(lexicons, order, textFile, outFile, config=None):
    '''
	Train an n-grams language model with the SRILM toolkit.

	Args:
		<lexicons>: an exkaldi LexiconBank object.
		<order>: the maximum order of n-grams.
		<textFile>: text corpus file.
		<outFile>: ARPA out file name.
		<config>: configurations, a Python dict object.

	You can use the .check_config("train_ngrams_srilm") function to get the configuration information that you can set.
	Also you can run the shell command "ngram-count" to see their meaning.
	'''
    assert isinstance(
        order, int
    ) and 0 < order < 10, "Expected <order> to be a positive int value smaller than 10."
    assert isinstance(textFile,
                      str), "Expected <textFile> to be a name-like string."
    assert isinstance(outFile, str), "Expected <outFile> to be a name-like string."
    assert type_name(
        lexicons
    ) == "LexiconBank", f"Expected <lexicons> to be an exkaldi LexiconBank object but got {type_name(lexicons)}."

    ExkaldiInfo.prepare_srilm()

    if not os.path.isfile(textFile):
        raise WrongPath(f"No such file:{textFile}")
    else:
        ## sample some lines to check the corpus format
        cmd = f"shuf {textFile} -n 100"
        out, err, cod = run_shell_command(cmd,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
        if (isinstance(cod, int) and cod != 0):
            print(err.decode())
            raise ShellProcessError("Failed to sample from text file.")
        elif out == b'':
            raise WrongDataFormat("Void text file.")
        else:
            out = out.decode().strip().split("\n")
            spaceCount = 0
            for line in out:
                spaceCount += line.count(" ")
            if spaceCount < len(out) // 2:
                raise WrongDataFormat(
                    "The text file doesn't seem to be separated by spaces, or the sentences are extremely short."
                )

    wordlist = tempfile.NamedTemporaryFile("w+",
                                           encoding='utf-8',
                                           suffix=".txt")
    unkSymbol = lexicons("oov")
    try:
        lexiconp = lexicons("lexiconp")
        words = [x[0] for x in lexiconp.keys()]
        wordlist.write("\n".join(words))
        wordlist.seek(0)

        #cmd2 = f"ngram-count -text {textFile} -order {order}"
        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                        if key.endswith("discount"):
                            specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f"ngram-count -text {textFile} -order {order} -limit-vocab -vocab {wordlist.name} -unk -map-unk {unkSymbol} "
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        cmd += f" -lm {outFile}"

        out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError(
                'Failed to generate n-grams language model.')
        else:
            return os.path.abspath(outFile)

    finally:
        wordlist.close()
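
This older signature takes a corpus file path; a sketch passing a different discount method through <config> ("-wbdiscount" is a real ngram-count flag, assumed here to pass check_config):

# use Witten-Bell discounting instead of the default kndiscount
arpaFile = train_ngrams_srilm(lexbank, order=3, textFile="data/train/text.txt",
                              outFile="lm/3gram.arpa", config={"-wbdiscount": True})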
Example #10
File: lm.py Project: xujiajun6677/exkaldi
def train_ngrams_kenlm(lexicons, order, textFile, outFile, config=None):
    '''
	Train an n-grams language model with the KenLM toolkit.

	Args:
		<lexicons>: words.txt file path or exkaldi LexiconBank object.
		<order>: the maximum order of n-grams.
		<textFile>: text corpus file.
		<outFile>: ARPA out file name.
		<config>: configurations, a Python dict object.

	You can use the .check_config("train_ngrams_kenlm") function to get the configuration information that you can set.
	Also you can run the shell command "lmplz" to see their meaning.
	'''
    assert isinstance(
        order, int
    ) and 0 < order <= 6, "We support maximum 6-grams LM in current version."

    if not os.path.isfile(textFile):
        raise WrongPath("No such file:{}".format(textFile))
    else:
        ## sample some lines to check the corpus format
        cmd = f"shuf {textFile} -n 100"
        out, err, cod = run_shell_command(cmd,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
        if (isinstance(cod, int) and cod != 0):
            print(err.decode())
            raise ShellProcessError("Failed to sample from text file.")
        elif out == b'':
            raise WrongDataFormat("Void text file.")
        else:
            out = out.decode().strip().split("\n")
            spaceCount = 0
            for line in out:
                spaceCount += line.count(" ")
            if spaceCount < len(out) // 2:
                raise WrongDataFormat(
                    "The text file doesn't seem to be separated by spaces, or the sentences are extremely short."
                )

    extraConfig = " "
    if config is not None:
        assert isinstance(
            config, dict
        ), f"<config> should be dict object but got: {type_name(config)}."
        if check_config(name='train_ngrams_kenlm', config=config):
            if "--temp_prefix" in config.keys() and "-T" in config.keys():
                raise WrongOperation(
                    f'"--temp_prefix" and "-T" is the same configure so only one of them is expected.'
                )
            if "--memory" in config.keys() and "-S" in config.keys():
                raise WrongOperation(
                    f'"--memory" and "-S" is the same configure so only one of them is expected.'
                )
            for key, value in config.items():
                if isinstance(value, bool):
                    if value is True:
                        extraConfig += f"{key} "
                else:
                    extraConfig += f"{key} {value} "

    assert isinstance(outFile, str), f"<outFile> should be a string."
    if not outFile.rstrip().endswith(".arpa"):
        outFile += ".arpa"
    make_dependent_dirs(outFile, pathIsFile=True)

    words = tempfile.NamedTemporaryFile("w+", suffix=".txt", encoding="utf-8")
    try:
        if type_name(lexicons) == "LexiconBank":
            ws = lexicons("words")
            words_count = math.ceil(len(ws) / 10) * 10
            ws = "\n".join(ws.keys())
        elif isinstance(lexicons, str):
            if not os.path.isfile(lexicons):
                raise WrongPath(f"No such file:{lexicons}.")
            with open(lexicons, "r", encoding="utf-8") as fr:
                lines = fr.readlines()
            ws = []
            for line in lines:
                line = line.strip().split(maxsplit=1)
                if len(line) < 1:
                    continue
                else:
                    ws.append(line[0])
            words_count = math.ceil(len(ws) / 10) * 10
            ws = "\n".join(ws)
        else:
            raise UnsupportedType(
                "<lexicons> should be LexiconBank object or file path.")

        words.write(ws)
        words.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {textFile} --arpa {outFile} --limit_vocab_file {words.name}"
        out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or (os.path.getsize(outFile)
                                                 == 0):
            print(err.decode())
            raise KenlmProcessError("Failed to generate arpa file.")
        else:
            return os.path.abspath(outFile)

    finally:
        words.close()
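
Unlike the SRILM version above, this one also accepts a plain words.txt path for <lexicons>; a sketch with hypothetical paths:

# a kaldi words.txt file works here in place of a LexiconBank object
arpaFile = train_ngrams_kenlm("data/lang/words.txt", order=4,
                              textFile="data/train/text.txt", outFile="lm/4gram.arpa")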