Example #1
    def open(self, filePath, mode, encoding=None, name=None):
        '''
		Open a regular file and return its handle.

		Args:
			<filePath>: file path.
			<mode>: open mode, such as "r" or "wb".
			<encoding>: text encoding. If None, use the default encoding.
			<name>: a string. After naming this handle exclusively, you can use the name to retrieve it again.
					If None, the file path is used as the default name.
					The same file can be opened multiple times as long as each handle is named differently.

		Return:
			a file handle.
		'''
        self.verify_safety()

        if name is not None:
            declare.is_valid_string("name", name)
            assert name not in self.__inventory.keys(), f"<name> already exists. It must be exclusive: {name}."
        else:
            if filePath in self.__inventory.keys():
                raise WrongOperation(
                    f"File has already been opened: {filePath}. If you want to open it again to get another handle, please give it an exclusive name."
                )
            name = filePath

        declare.is_file("filePath", filePath)

        handle = open(filePath, mode, encoding=encoding)

        self.__inventory[name] = handle

        return handle
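A minimal usage sketch (hypothetical file names; this method is assumed to belong to the FileHandleManager class used in Example #2):

with FileHandleManager() as fhm:
    # The first handle takes the file path as its default name.
    fr = fhm.open("text.txt", "r", encoding="utf-8")
    # Opening the same file again requires an exclusive name.
    fr2 = fhm.open("text.txt", "r", encoding="utf-8", name="secondHandle")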
Example #2
def load_ngrams(target, name="gram"):
    '''
	Load an N-grams language model from an ARPA or binary language model file.

	Args:
		<target>: file path with suffix ".arpa" or ".binary".
		<name>: a string, the name of the returned object.

	Return:
		a KenNGrams object.
	'''
    declare.is_file("target", target)
    target = target.strip()

    with FileHandleManager() as fhm:

        if target.endswith(".arpa"):
            modelTemp = fhm.create("wb+", suffix=".binary")
            arpa_to_binary(target, modelTemp.name)
            modelTemp.seek(0)
            model = KenNGrams(modelTemp.name, name=name)
            model._path = target

        elif target.endswith(".binary"):
            model = KenNGrams(target, name=name)

        else:
            raise UnsupportedType(
                f"Unknown suffix. A language model file should have the suffix .arpa or .binary, but got: {target}."
            )

        return model
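A hypothetical usage sketch (assuming load_ngrams is imported from wherever exkaldi exposes it):

# An ".arpa" file is converted to a temporary binary file internally;
# a ".binary" file is loaded directly.
model = load_ngrams("lm.arpa", name="trigram")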
Example #3
def transform_feat(feat, matFile, outFile=None):
	'''
	Transform features with a transform matrix, typically an LDA or MLLT matrix.
	Note that if you want to apply fMLLR transforms, use the exkaldi.use_fmllr() function.

	Share Args:
		Null

	Parallel Args:
		<feat>: exkaldi feature or index table object.
		<matFile>: transform matrix file name.
		<outFile>: output file name.

	Return:
		exkaldi feature or index table object.
	'''
	feats, matFiles, outFiles = check_multiple_resources(feat, matFile, outFile=outFile)

	names = []
	for feat, matFile in zip(feats, matFiles):
		declare.is_feature("feat", feat)
		declare.is_file("matFile", matFile)
		names.append(f"transform({feat.name})")

	cmdPattern = 'transform-feats {matFile} {feat} ark:{outFile}'
	resources = {"feat": feats, "matFile": matFiles, "outFile": outFiles}

	return run_kaldi_commands_parallel(resources, cmdPattern, analyzeResult=True, generateArchive="feat", archiveNames=names)
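A hypothetical usage sketch (feature objects and file names assumed). Because <feat>, <matFile> and <outFile> are parallel args, lists can be passed to run several transforms in one call:

# Single resource:
ldaFeat = transform_feat(feat, "exp/lda.mat")
# Parallel resources, writing each result to its own file:
ldaFeats = transform_feat([feat1, feat2], ["exp/lda.mat", "exp/lda.mat"],
                          outFile=["lda1.ark", "lda2.ark"])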
Example #4
def arpa_to_binary(arpaFile, outFile):
    '''
	Transform an ARPA language model into KenLM binary format.

	Args:
		<arpaFile>: ARPA file path.
		<outFile>: output binary file path.

	Return:
		output file name with suffix ".binary".
	'''
    declare.is_file("arpaFile", arpaFile)
    declare.is_valid_string("outFile", outFile)
    outFile = outFile.strip()
    if not outFile.endswith(".binary"):
        outFile += ".binary"

    declare.is_valid_file_name("outFile", outFile)
    make_dependent_dirs(outFile)

    cmd = os.path.join(sys.prefix, "exkaldisrc", "tools", "build_binary")
    cmd += f" -s {arpaFile} {outFile}"
    out, err, cod = run_shell_command(cmd, stderr="PIPE")

    if (cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
        print(err.decode())
        if os.path.isfile(outFile):
            os.remove(outFile)
        raise KenlmProcessError("Failed to transform ARPA to binary format.")

    else:
        return outFile
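A hypothetical usage sketch (file names assumed):

# The ".binary" suffix is appended automatically when missing,
# and the returned path reflects that.
binFile = arpa_to_binary("lm.arpa", "exp/lm")   # -> "exp/lm.binary"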
Example #5
    def __init__(self, filePath, name="ngram"):
        declare.is_file("filePath", filePath)

        with open(filePath, "rb") as fr:
            t = fr.read(50).decode().strip()
        if t != "mmap lm http://kheafield.com/code format version 5":
            raise UnsupportedType(
                "This may be not a KenLM binary model format.")

        super(KenNGrams, self).__init__(data=b"placeholder", name=name)
        self.__model = kenlm.Model(filePath)
        self._path = None
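The constructor recognizes KenLM binary files by a fixed 50-byte magic header. A standalone sketch of the same check (the magic string is copied from the constructor above):

def looks_like_kenlm_binary(filePath):
    # Read the first 50 bytes and compare with KenLM's format-version-5 header.
    with open(filePath, "rb") as fr:
        head = fr.read(50).decode(errors="ignore").strip()
    return head == "mmap lm http://kheafield.com/code format version 5"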
Example #6
def split_txt_file(filePath, chunks=2):
    '''
	Split a text file into N chunks with an even number of lines per chunk.

	Args:
		<filePath>: text file path.
		<chunks>: an int value. How many chunks to split into.

	Return:
		a list of paths of the generated chunk files.
		Each file name has a prefix such as "ck0_", where 0 is the chunk ID.
	'''
    declare.is_file("filePath", filePath)
    declare.greater_equal("chunks", chunks, "minimum chunk size", 2)

    with open(filePath, 'r', encoding='utf-8') as fr:
        data = fr.readlines()

    lines = len(data)
    chunkLines = lines // chunks

    if chunkLines == 0:
        chunkLines = 1
        chunks = lines
        t = 0
    else:
        t = lines - chunkLines * chunks

    a = len(str(chunks))

    filePath = os.path.abspath(filePath)
    dirName = os.path.dirname(filePath)
    fileName = os.path.basename(filePath)

    fileNamePattern = os.path.join(dirName, f"ck%0{a}d_" + fileName)
    newFiles = []
    start = 0
    for i in range(chunks):
        if i < t:
            end = start + chunkLines + 1
        else:
            end = start + chunkLines
        chunkData = data[start:end]
        newFileName = fileNamePattern % (i)
        with open(newFileName, 'w', encoding='utf-8') as fw:
            fw.write(''.join(chunkData))

        newFiles.append(newFileName)
        start = end

    return newFiles
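A pure-Python illustration of the line distribution used above (for the usual case of lines >= chunks): the first lines % chunks chunks receive one extra line, so every line is assigned exactly once.

lines, chunks = 10, 3
chunkLines, t = lines // chunks, lines % chunks
sizes = [chunkLines + 1 if i < t else chunkLines for i in range(chunks)]
assert sum(sizes) == lines   # sizes == [4, 3, 3]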
Example #7
def load_transcription(target, name="transcription", checkSpace=True):
    '''
	Load a transcription from file.

	Args:
		<target>: a transcription file path, or a dict, Transcription, or ListTable object.
		<name>: a string.
		<checkSpace>: a bool value. If True, we will verify that the transcriptions are separated by spaces.

	Return:
		an exkaldi Transcription object.
	'''
    declare.is_classes("target", target,
                       ["dict", "Transcription", "ListTable", "str"])
    declare.is_bool("checkSpace", checkSpace)

    if isinstance(target, str):
        declare.is_file("target", target)
        with open(target, "r", encoding="utf-8") as fr:
            lines = fr.readlines()
        result = Transcription(name=name)
        for index, line in enumerate(lines, start=1):
            t = line.strip().split(maxsplit=1)
            if len(t) < 2:
                print(f"Line Number: {index}")
                print(f"Line Content: {line}")
                raise WrongDataFormat(
                    "Missing complete key and value information on this line.")
            else:
                result[t[0]] = t[1]
    else:
        for utt, utterance in target.items():
            declare.is_valid_string("utterance ID", utt)
            declare.is_valid_string("utterance", utterance)
        result = Transcription(target, name=name)

    if checkSpace:
        sampleText = result.subset(nRandom=100)
        spaceCount = 0
        for key, value in sampleText.items():
            spaceCount += value.count(" ")
        if spaceCount < len(sampleText) // 2:
            errMes = "The transcription doesn't seem to be separated by spaces or extremely short."
            errMes += "If it actually has right format, set the <checkSpace>=False and run this function again."
            raise WrongDataFormat(errMes)

    return result
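A hypothetical usage sketch. A transcription file holds one utterance per line in the form "utt-id word1 word2 ...":

trans = load_transcription("data/train/text", name="trainTrans")
# A dict can also be loaded directly:
trans = load_transcription({"utt-001": "HELLO WORLD"}, checkSpace=False)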
Example #8
def compress_gz_file(filePath, overWrite=False, keepSource=False):
    '''
	Compress a file to a gz file.

	Args:
		<filePath>: file path.
		<overWrite>: If True, overwrite the gz file if it already exists.
		<keepSource>: If True, retain the source file.

	Return:
		the path of the compressed file.
	'''
    declare.is_file("filePath", filePath)
    declare.is_bool("overWrite", overWrite)
    declare.is_bool("keepSource", keepSource)

    filePath = os.path.abspath(filePath)
    if filePath.endswith(".gz"):
        raise WrongOperation(f"Cannot compress a .gz file:{filePath}.")
    else:
        outFile = filePath + ".gz"

    if os.path.isfile(outFile):
        if overWrite is True:
            os.remove(outFile)
        else:
            raise WrongOperation(
                f"File already exists: {outFile}. To overwrite it, set option <overWrite>=True."
            )

    if keepSource:
        cmd = f"gzip -k {filePath}"
    else:
        cmd = f"gzip {filePath}"

    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if cod != 0:
        print(err.decode())
        raise ShellProcessError("Failed to compress file.")
    else:
        return outFile
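A hypothetical usage sketch (file name assumed):

# Compress "data/text" to "data/text.gz", keeping the source file.
gzFile = compress_gz_file("data/text", overWrite=True, keepSource=True)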
Example #9
def ctc_prefix_beam_search(prob,
                           vocabs,
                           blankID=None,
                           beam=5,
                           cutoff=0.999,
                           strick=1.0,
                           lmFile=None,
                           alpha=1.0,
                           beta=0):
    '''
    Prefix beam search decoding algorithm. LM scoring is supported.

    Args:
        <prob>: an exkaldi posterior probability object. This probability should be the output of a neural network trained with the CTC loss function.
                We expect the probability has not passed through any activation function (softmax is applied internally), or it may generate wrong results.
        <vocabs>: a list of vocabulary words.
        <blankID>: the ID of the blank symbol. If None, use the last dimension of <prob>.
        <beam>: the beam size.
        <cutoff>: the cumulative probability threshold used to cut off dimensions whose probability is extremely small.
        <strick>: when the decoding results of two adjacent frames are the same, the probability of the latter is reduced.
        <lmFile>: if not None, add a language model score to the beam.
        <alpha>: the weight of the LM score.
        <beta>: the length-normalization weight of the LM score.

    Return:
        an exkaldi Transcription object of decoding results.
    '''
    declare.is_classes("vocabs", vocabs, [tuple, list])

    declare.is_probability("prob", prob)
    if type_name(prob) == "BytesProb":
        prob = prob.to_numpy()
    elif type_name(prob) == "IndexTable":
        prob = prob.read_record("prob").to_numpy()

    if lmFile is not None:
        declare.is_file("lmFile", lmFile)
    else:
        lmFile = "none"

    probDim = prob.dims
    if len(vocabs) == probDim:
        if blankID is None:
            blankID = probDim - 1
        declare.is_positive_int("blankID", blankID)
        declare.in_boundary("blankID", blankID, 0, probDim - 1)

    elif len(vocabs) == probDim - 1:
        if blankID is None:
            blankID = probDim - 1
        else:
            assert blankID == probDim - 1, f"The dimensionality of the probability is {probDim} but there are only {len(vocabs)} words. In this case, the blank ID must be {probDim-1} but got {blankID}."
    else:
        raise WrongDataFormat(
            f"The dimensionality of the probability, {probDim}, does not match the number of words, {len(vocabs)}."
        )

    for ID, word in enumerate(vocabs):
        if len(word.strip()) == 0:
            raise WrongDataFormat(f"Found a vocab {word} unavaliable.")

    num_classes = len(vocabs)
    vocabs = " ".join(vocabs)

    sources = [
        vocabs.encode(),
    ]
    uttTemp = []
    for utt, pb in prob.items():
        uttTemp.append(utt)
        declare.is_classes("prob", pb, np.ndarray)
        declare.is_classes("the rank of matrix shape", len(pb.shape),
                           "expected rank", 2)
        pb = softmax(pb, axis=1)
        sources.append(f" {pb.shape[0]} ".encode() +
                       pb.astype("float32").tobytes())

    sources = b"".join(sources)

    cmd = os.path.join(sys.prefix, "exkaldisrc", "tools",
                       "prefix_beam_search_decode")
    cmd += " --num_files {}".format(prob.lens[0])
    cmd += " --num_classes {}".format(num_classes)
    cmd += " --blank_id {}".format(blankID)
    cmd += " --lm_model {}".format(lmFile)
    cmd += " --beam_size {}".format(beam)
    cmd += " --cutoff_prob {}".format(cutoff)
    cmd += " --alpha {}".format(alpha)
    cmd += " --beta {}".format(beta)

    out, err, _ = run_shell_command(cmd,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    inputs=sources)

    if len(out) == 0:
        raise Exception("Failed to beam search decode.", err.decode())
    else:
        results = Transcription(name="beamSearchResults")
        out = out.decode().strip().split("file")
        for index, re in enumerate(out[1:]):
            re = re.strip().split("\n")
            if len(re) <= 1:
                results[uttTemp[index]] = ""
            else:
                results[uttTemp[index]] = " ".join(re[1].strip().split()[1:])

        return results
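A standalone sketch of the byte stream this function pipes to the decoder tool, as assembled above: the space-joined vocabulary, then per utterance a " numFrames " delimiter followed by the row-major float32 probability matrix. Only numpy is assumed; the softmax helper is a local stand-in for the one used above.

import numpy as np

def softmax(x, axis=1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

pb = softmax(np.random.randn(7, 4))          # 7 frames, 4 classes (blank last)
payload = b"".join([
    "a b c <blk>".encode(),                  # space-joined vocabulary
    f" {pb.shape[0]} ".encode(),             # per-utterance frame-count delimiter
    pb.astype("float32").tobytes(),          # frames x classes float32 matrix
])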
Example #10
	def load(self, filePath):
		'''
		Load arguments from file.

		Args:
			<filePath>: args file path.
		'''
		declare.is_file("filePath", filePath)
		self.reset()

		with open(filePath, "r", encoding="utf-8") as fr:
			lines = fr.read()
		lines = lines.strip()
		if len(lines) == 0:
			raise WrongOperation(f"This is a void file: {filePath}.")
		
		blocks = lines.split("\n\n")
		
		def __parse(name, value, dtype):
			if dtype in [float, int]:
				try:
					value = dtype(value)
				except ValueError:
					raise WrongOperation(f"Option <{name}> needs a {dtype.__name__} value but got: {value}.")
			elif dtype == bool:
				if value.lower() == "true":
					value = True
				elif value.lower() == "false":
					value = False
				else:
					raise WrongOperation(f"Option <{name}> needs a bool value but got: {value}.")

			return value

		self.__discription = blocks[0].strip()
		for blockNo, block in enumerate(blocks[1:], start=1):
			block = block.strip()
			if len(block) == 0:
				continue
			block = block.split("\n")
			# 1. match options
			values = {"name":None,"abbr":None,"dtype":None,"default":None,"choices":None,"minV":None,"maxV":None,"discription":None,"value":None}
			for m in block:
				m = m.strip()
				assert "=" in m, f"Augument should has format: key = value, but got: {m}."
				assert len(m.split("=")) == 2, f"Augument should has format: key = value, but got: {m}."
				m = m.split("=")
				name = m[0].strip()
				value = m[1].strip()
				declare.is_instances("Option key", name, list(values.keys()))
				values[name] = value

			for key, value in values.items():
				assert value is not None, f"Missed {key} information in block: {blockNo}."
			# 2. parse
			name = values["name"]
			# parse the dtype firstly
			declare.is_instances("dtype", values["dtype"], ["float","int","bool","str"])
			values["dtype"] = eval(values["dtype"])
			dtype = values["dtype"]	
			# then parse the choices
			choices = values["choices"]
			if choices in ["none", "None"]:
				choices = None
			else:
				choices = choices.split("|")
				for i, c in enumerate(choices):
					choices[i] = __parse(name, c, dtype)
			values["choices"] = choices
			# then parse the boundary value
			boundary = {"minV":None, "maxV":None}
			for i in boundary.keys():
				V = values[i]
				if V not in ["none", "None"]:
					assert dtype in [float, int], f"Only float and int options can set a boundary, but {name} is {dtype.__name__}."
					assert choices is None, f"{name} cannot set choices and boundary concurrently."
					
					toIntFlag = True
					toFloatFlag = True
					try:
						float(V)
					except ValueError:
						toFloatFlag= False
					try:
						int(V)
					except ValueError:
						toIntFlag= False
					
					if toIntFlag is False and toFloatFlag is False:
						raise WrongDataFormat(f"Boundary values of {name} should be an int or float value but got: {V}.")
					elif toIntFlag is False and toFloatFlag is True: # the boundary value can only be a float
						if dtype != float:
							raise WrongDataFormat(f"{name}'s dtype is int but tried to set a boundary with a float value: {V}.")
						else:
							V = float(V)
					elif toIntFlag is True and toFloatFlag is True: # the boundary value can be either a float or an int
						V = dtype(V)
					else:
						raise WrongDataFormat(f"Failed to set {name}'s boundary value: {V}.")
				
					boundary[i] = V
			values["minV"] = boundary["minV"]
			values["maxV"] = boundary["maxV"]
			# then parse the default and value
			if values["default"].lower() == "none":
				values["default"] = None
			else:
				default = values["default"].split("|")
				for i, v in enumerate(default):
					default[i] = __parse(name, v, dtype)
				values["default"] = default if len(default) > 1 else default[0]
			
			# the validity of "default" will be checked by the .parse() function, so here we only verify "value"
			if values["value"].lower() == "none":
				values["value"] = None
			else:
				value = values["value"].split("|")
				for i, v in enumerate(value):
					v = __parse(name, v, dtype)
					if values["choices"] is not None:
						declare.is_instances("Option value", v, values["choices"])
					else:
						if values["minV"] is not None:
							declare.greater_equal("Option value", v, "minimum expected value", values["minV"])
						if values["maxV"] is not None:
							declare.less_equal("Option value", v, "maximum expected value", values["maxV"])
					value[i] = v
				if len(value) == 1:
					value = value[0]
				values["value"] = value
			
			# check abbreviation
			if values["abbr"] in ["none", "None"]:
				values["abbr"] = None

			# add this option
			self.add(name=values["name"],
					 dtype=values["dtype"],
					 abbr=values["abbr"],
					 default=values["default"],
					 choices=values["choices"],
					 minV=values["minV"],
					 maxV=values["maxV"],
					 discription=values["discription"]
					)
			
			# finally, modify the "value"
			self.__arguments[values["name"]] = self.__arguments[values["name"]]._replace(value=values["value"])
			if values["value"] is not None:
				self.__setattr__(values["name"], values["value"])