Example #1
def arpa_to_binary(arpaFile, outFile):
    '''
	Transform an ARPA language model into the KenLM binary format.

	Args:
		<arpaFile>: ARPA file path.
		<outFile>: output binary file path.

	Return:
		the output file name, with the suffix ".binary" appended if missing.
	'''
    declare.is_file("arpaFile", arpaFile)
    declare.is_valid_string("outFile", outFile)
    outFile = outFile.strip()
    if not outFile.endswith(".binary"):
        outFile += ".binary"

    declare.is_valid_file_name("outFile", outFile)
    make_dependent_dirs(outFile)

    cmd = os.path.join(sys.prefix, "exkaldisrc", "tools", "build_binary")
    cmd += f" -s {arpaFile} {outFile}"
    out, err, cod = run_shell_command(cmd, stderr="PIPE")

    if (cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile)
                                                       == 0):
        print(err.decode())
        if os.path.isfile(outFile):
            os.remove(outFile)
        raise KenlmProcessError("Failed to transform ARPA to binary format.")

    else:
        return outFile
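
A minimal usage sketch (the paths are hypothetical; as shown above, arpa_to_binary appends the ".binary" suffix itself when it is missing):

# Convert a hypothetical ARPA file into KenLM binary format.
binFile = arpa_to_binary("lm/3gram.arpa", "lm/3gram")
print(binFile)  # -> "lm/3gram.binary"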
Example #2
File: common.py Project: wangyu09/exkaldi
def utt2spk_to_spk2utt(utt2spk,outFile=None):
	'''
	Transform utt2spk to spk2utt.

	Args:
		<utt2spk>: file name or exkaldi ListTable object.
		<outFile>: file name or None.
	
	Return:
		the output file name if <outFile> is given, otherwise an exkaldi ListTable object.
	'''
	declare.is_potential_list_table("utt2spk",utt2spk)
	if outFile is not None:
		declare.is_valid_file_name("outFile",outFile)
	
	if isinstance(utt2spk,str):
		utt2spk = load_list_table(utt2spk)

	spk2utt = ListTable(name="spk2utt")
	for utt,spk in utt2spk.items():
		declare.is_valid_string("utterance ID",utt)
		declare.is_valid_string("speaker ID",spk)
		assert utt.count(" ") == 0,f"<utterance ID> must not contain spaces, but got: {utt}."
		assert spk.count(" ") == 0,f"<speaker ID> must not contain spaces, but got: {spk}."
		
		try:
			spk2utt[spk] += f" {utt}"
		except KeyError:
			spk2utt[spk] = utt

	if outFile is None:
		return spk2utt
	else:
		spk2utt.save(outFile)
		return outFile
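
A brief usage sketch, assuming a hypothetical Kaldi-style utt2spk file in which each line is "<utterance-ID> <speaker-ID>":

# With outFile=None the function returns an exkaldi ListTable object.
spk2utt = utt2spk_to_spk2utt("data/train/utt2spk")
# With a file name it saves the table and returns the file name.
savedFile = utt2spk_to_spk2utt("data/train/utt2spk", outFile="data/train/spk2utt")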
Example #3
	def save(self, fileName=None):
		'''
		Save arguments to a file in the specified format.

		Args:
			_fileName_: None, or a valid file name.

		Return:
			if fileName is None, a string of all contents;
			otherwise, the saved file name.
		'''
		if fileName is not None:
			declare.is_valid_file_name("fileName", fileName)
			make_dependent_dirs(fileName, True)

		contents = []
		contents.append(self.__discription)
		for name, info in self.__arguments.items():
			# option name
			m = "\n"
			m += f"name={name}\n"
			# option value
			if isinstance(info.value,(list,tuple)):
				value="|".join(map(str,info.value))
			else:
				value = info.value
			m += f"value={value}\n"
			# abbreviation and dtype
			m += f"abbr={self.__name2Abb[name]}\n"
			m += f"dtype={info.dtype.__name__}\n"
			# default
			if isinstance(info.default,(list,tuple)):
				default="|".join(map(str,info.default))
			else:
				default = info.default
			m += f"default={default}\n"
			# choices
			if isinstance(info.choices,(list,tuple)):
				choices = "|".join(map(str,info.choices))
			else:
				choices = info.choices
			m += f"choices={choices}\n"
			# boundary and description
			m += f"minV={info.minV}\n"
			m += f"maxV={info.maxV}\n"
			m += f"discription={info.discription}"
			contents.append(m)
		
		contents = "\n".join(contents) + "\n"

		if fileName is not None:
			with open(fileName, "w", encoding="utf-8") as fw:
				fw.write(contents)
			return fileName
		else:
			return contents
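
A usage sketch; `args` stands for a hypothetical instance of the arguments class that owns this save() method:

# fileName=None: the serialized contents are returned as one string.
contents = args.save()
# With a file name: the contents are written to disk and the file name is returned.
savedFile = args.save("exp/train.args")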
Example #4
File: common.py Project: wangyu09/exkaldi
def spk2utt_to_utt2spk(spk2utt,outFile=None):
	'''
	Transform spk2utt file to utt2spk file.

	Args:
		<spk2utt>: file name or exkaldi ListTable object.
		<outFile>: file name or None.

	Return:
		the output file name if <outFile> is given, otherwise an exkaldi ListTable object.
	'''
	declare.is_potential_list_table("spk2utt",spk2utt)
	if outFile is not None:
		declare.is_valid_file_name("outFile",outFile)
	
	if isinstance(spk2utt,str):
		spk2utt = load_list_table(spk2utt)

	utt2spk = ListTable(name="utt2spk")
	for spk,utts in spk2utt.items():
		declare.is_valid_string("utterance IDs",utts)
		declare.is_valid_string("speaker ID",spk)
		assert spk.count(" ") == 0,f"<speaker ID> must not contain spaces, but got: {spk}."

		for utt in utts.split():
			try:
				utt2spk[utt]
			except KeyError:
				utt2spk[utt] = spk
			else:
				raise WrongDataFormat(f"utterance ID:{utt} has existed toward multiple speakers.")

	if outFile is None:
		return utt2spk
	else:
		utt2spk.save(outFile)
		return outFile
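
A usage sketch mirroring Example #2, assuming a hypothetical spk2utt file whose lines read "<speaker-ID> <utt-1> <utt-2> ...":

# Returns an exkaldi ListTable mapping each utterance ID to its speaker.
utt2spk = spk2utt_to_utt2spk("data/train/spk2utt")
# WrongDataFormat is raised if one utterance ID appears under several speakers.
savedFile = spk2utt_to_utt2spk("data/train/spk2utt", outFile="data/train/utt2spk")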
Example #5
def train_ngrams_srilm(lexicons, order, text, outFile, config=None):
    '''
	Train an N-Grams language model with the SRILM toolkit.
	If you do not specify a discount method via the <config> option, "kndiscount" is used by default.

	Args:
		<lexicons>: an exkaldi LexiconBank object.
		<order>: the maximum order of N-Grams.
		<text>: a text corpus file or an exkaldi transcription object.
		<outFile>: output file name of the ARPA LM.
		<config>: extra configurations, a Python dict object.

	You can use the .check_config("train_ngrams_srilm") function to get a reference of the extra configurations.
	You can also run the shell command "ngram-count" to see their usage.
	'''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)
    # verify the max order
    declare.less_equal("order", order, "max order", 9)
    # prepare srilm tool
    ExkaldiInfo.prepare_srilm()

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus whose sentences are separated by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd,
                                              stdout="PIPE",
                                              stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(
                    f"Failed to sample from text file:{text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file:{text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat(
                        "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                    )

        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(
                    value, str
                ), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat(
                    "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                )
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        unkSymbol = lexicons("oov")

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                        if key.endswith("discount"):
                            specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f'ngram-count -text {text} -order {order} -limit-vocab -vocab {wordlistTemp.name} -unk -map-unk "{unkSymbol}" '
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "
        # append any user-specified extra configurations
        cmd += extraConfig

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)
        cmd += f" -lm {outFile}"

        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError(
                "Failed to generate N-Grams language model.")

        return outFile
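
A minimal call sketch; `lexicons` is assumed to be a prepared exkaldi LexiconBank, and "train_text" a hypothetical space-separated corpus file:

# Train a trigram LM with the default "kndiscount" smoothing.
arpaFile = train_ngrams_srilm(lexicons, order=3, text="train_text", outFile="lm/3gram")
# The returned path ends with ".arpa"; the suffix is appended automatically.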
Example #6
def train_ngrams_kenlm(lexicons, order, text, outFile, config=None):
    '''
	Train an N-Grams language model with the KenLM toolkit.

	Args:
		<lexicons>: an exkaldi LexiconBank object.
		<order>: the maximum order of N-Grams.
		<text>: a text corpus file or an exkaldi transcription object.
		<outFile>: output file name of the ARPA LM.
		<config>: extra configurations, a Python dict object.

	You can use the .check_config("train_ngrams_kenlm") function to get a reference of the extra configurations.
	You can also run the shell command "lmplz" to see their usage.
	'''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)

    declare.less_equal("order", order, "max order", 9)

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus whose sentences are separated by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd,
                                              stdout="PIPE",
                                              stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(
                    f"Failed to sample from text file:{text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file:{text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat(
                        "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                    )

        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(
                    value, str
                ), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat(
                    "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                )
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        extraConfig = " "
        if config is not None:
            if check_config(name='train_ngrams_kenlm', config=config):
                if "--temp_prefix" in config.keys() and "-T" in config.keys():
                    raise WrongOperation(
                        f'"--temp_prefix" and "-T" is the same configuration so only one of them is expected.'
                    )
                if "--memory" in config.keys() and "-S" in config.keys():
                    raise WrongOperation(
                        f'"--memory" and "-S" is the same configuration so only one of them is expected.'
                    )
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                    else:
                        extraConfig += f"{key} {value} "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words_count = math.ceil(len(words) / 10) * 10
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {text} --arpa {outFile} --limit_vocab_file {wordlistTemp.name}"
        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or (os.path.getsize(outFile)
                                                 == 0):
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KenlmProcessError("Failed to generate arpa file.")

        return outFile
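
A matching sketch for the KenLM variant, under the same hypothetical `lexicons` and corpus-file assumptions; "--memory" is one of the lmplz options that can be passed through <config>:

# Train a trigram LM with lmplz, capping its memory usage at 20% of RAM.
arpaFile = train_ngrams_kenlm(lexicons, order=3, text="train_text",
                              outFile="lm/3gram", config={"--memory": "20%"})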
Example #7
File: common.py Project: wangyu09/exkaldi
def check_multiple_resources(*resources,outFile=None):
	'''
	Check whether to use multiple processes, and verify the resources.

	Args:
		<resources>: objects.
		<outFile>: None, a file name, or a list of Nones/file names.
				If None, it means the standard output stream.

	Return:
		a list of resource lists, with the list of output files appended.
	'''
	# check the number of parallels
	multipleFlag = [ len(re) if isinstance(re,(list,tuple)) else 1 for re in resources ]
	multipleFlag = list(set(multipleFlag))

	if len(multipleFlag) == 0:
		raise WrongOperation("No resource has been received.")
	elif len(multipleFlag) > 2:
		raise WrongOperation(f"Resources have different sizes: {multipleFlag}. They should all have the same size unless that size is 1.")
	multipleFlag = max(multipleFlag)

	# check and modify the amount of each resource
	resources = list(resources)
	for index,target in enumerate(resources):
		if isinstance(target,(list,tuple)):
			if len(target) == 1:
				resources[index] = [ target[0] for i in range(multipleFlag) ]
			else:
				exType = None
				for t in target:
					if exType is None:
						exType = type_name(t)
					elif type_name(t) != exType:
						raise WrongDataFormat(f"Elements of one group should be the same data class,but got: {exType} != {type_name(t)}.")
		else:
			resources[index] = [ target for i in range(multipleFlag) ]

	# check output file format
	if multipleFlag > 1:
		assert outFile is not None,"When applying parallel processes, an output file name is necessary."
		declare.is_classes("outFile",outFile,[str,list,tuple])
		if isinstance(outFile,str):
			declare.is_valid_file_name("outFile",outFile)
			outFile = os.path.abspath(outFile)
			dirName = os.path.dirname(outFile)
			fileName = os.path.basename(outFile)
			namePattern = f"nj%0{len(str(multipleFlag))}d_{fileName}"
			outFiles = [ os.path.join(dirName,namePattern%i) for i in range(multipleFlag) ]
		else:
			declare.equal("the number of output files",len(outFile),"the number of parallel processes",multipleFlag)
			outFiles = []
			for f in outFile:
				declare.is_valid_file_name("outFile",f)
				outFiles.append(f)
		
		resources.append(outFiles)

	else:
		if outFile is None:
			outFile = "-"
		else:
			declare.is_valid_file_name("outFile",outFile)

		resources.append([outFile,])

	return resources
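
A sketch of the parallel case with three hypothetical feature files; scalar arguments are broadcast to the same length, and numbered output names are derived from <outFile>:

feats = ["mfcc_1.ark", "mfcc_2.ark", "mfcc_3.ark"]
featList, orderList, outFiles = check_multiple_resources(feats, 13, outFile="result.ark")
# featList keeps its 3 entries, orderList becomes [13, 13, 13], and outFiles
# holds the absolute paths of "nj0_result.ark", "nj1_result.ark", "nj2_result.ark".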