示例#1
0
def make_dependent_dirs(path, pathIsFile=True):
    '''
    Make the dependent directories for a path if they do not exist yet.

    Args:
        <path>: a file path or folder path. It is stripped and converted to an absolute path.
        <pathIsFile>: a bool value. True if <path> is a file path (its parent directory is created),
                      False if <path> is a folder path (the folder itself is created).

    Raises:
        WrongPath: if <path> is declared as a file but exists as a directory, or the reverse.
        Any exception raised by os.makedirs is re-raised after a failure message is printed.
    '''
    declare.is_valid_string("path", path)
    declare.is_bool("pathIsFile", pathIsFile)

    path = os.path.abspath(path.strip())

    if pathIsFile:
        if os.path.isdir(path):
            raise WrongPath(
                f"<path> is specified as file but it has existed as directory: {path}. You can remove it then try again."
            )
        dirPath = os.path.dirname(path)
    else:
        if os.path.isfile(path):
            raise WrongPath(
                f"<path> is specified as directory but it has existed as file: {path}. You can remove it then try again."
            )
        dirPath = path

    if not os.path.isdir(dirPath):
        try:
            # exist_ok=True closes the race between the isdir() check above and
            # the creation: another process may create the directory in between.
            os.makedirs(dirPath, exist_ok=True)
        except Exception:
            print(f"Failed to make directory: {dirPath}.")
            # Bare raise keeps the original traceback intact.
            raise
示例#2
0
    def open(self, filePath, mode, encoding=None, name=None):
        '''
        Open a regular file, register the handle in the inventory and return it.

        Args:
            <filePath>: path of the file to open. It is checked with declare.is_file.
            <mode>: mode passed to the built-in open().
            <encoding>: encoding passed to the built-in open().
            <name>: a string. The exclusive name under which the handle is registered,
                    so that it can be fetched again by this name.
                    If None, <filePath> is used as the name.
                    The same file can be opened several times as long as each handle is named differently.

        Return:
            a file handle.
        '''
        self.verify_safety()

        if name is None:
            # Without an explicit name the file path is the key, so it must be unused.
            if filePath in self.__inventory.keys():
                raise WrongOperation(
                    f"File has been opened already: {filePath}. If you still want to open it to get another handle,please give it an exclusive name."
                )
            name = filePath
        else:
            declare.is_valid_string("name", name)
            assert name not in self.__inventory.keys(
            ), f"<name> has been existed. We hope it be exclusive: {name}."

        declare.is_file("filePath", filePath)

        opened = open(filePath, mode, encoding=encoding)
        self.__inventory[name] = opened
        return opened
示例#3
0
def utt_to_spk(utts,utt2spk):
	'''
	Accept utterance IDs and return their corresponding speaker IDs.

	Args:
		<utts>: a string or list or tuple of utterance IDs.
		<utt2spk>: utt2spk file or ListTable object.
	
	Return:
		a sorted list of the distinct speaker IDs.

	Raises:
		WrongOperation: if an utterance ID is missing in <utt2spk>.
		AssertionError: if a speaker ID contains a space.
	'''
	declare.is_classes("utterance IDs",utts,(str,tuple,list))
	if isinstance(utts,str):
		utts = [utts,]
	else:
		declare.members_are_valid_strings("utterance IDs",utts)

	declare.is_potential_list_table("utt2spk",utt2spk)
	if isinstance(utt2spk,str):
		utt2spk = load_list_table(utt2spk)
	
	spks = set()
	for utt in utts:
		try:
			spk = utt2spk[utt]
		except KeyError:
			raise WrongOperation(f"Miss utterance ID {utt} in utt2spk map.")
		# Validate the looked-up speaker ID; the utterance ID was already validated above.
		declare.is_valid_string("The value of utt2spk",spk)
		spktemp = spk.strip().split(maxsplit=1)
		assert len(spktemp) == 1,f"speaker ID in utt2spk has unexpected space: {spk}."
		spks.add(spktemp[0])
	
	return sorted(spks)
示例#4
0
def spk_to_utt(spks,spk2utt):
	'''
	Look up the utterance IDs that belong to the given speaker IDs.

	Args:
		<spks>: a string or list or tuple of speaker IDs.
		<spk2utt>: spk2utt file or ListTable object.
	
	Return:
		a sorted list of the distinct utterance IDs.

	Raises:
		WrongOperation: if a speaker ID is missing in <spk2utt>.
	'''
	declare.is_classes("speaker IDs",spks,(str,tuple,list))

	if isinstance(spks,str):
		spks = [spks,]
	else:
		declare.members_are_valid_strings("speaker IDs",spks)

	declare.is_potential_list_table("spk2utt",spk2utt)
	if isinstance(spk2utt,str):
		spk2utt = load_list_table(spk2utt)

	collected = set()
	for spk in spks:
		try:
			uttLine = spk2utt[spk]
		except KeyError:
			raise WrongOperation(f"Miss speaker ID {spk} in spk2utt map.")
		declare.is_valid_string("The value of spk2utt",uttLine)
		# One value holds all utterance IDs of the speaker, separated by whitespace.
		collected.update(uttLine.strip().split())

	return sorted(collected)
示例#5
0
def utt2spk_to_spk2utt(utt2spk,outFile=None):
	'''
	Transform utt2spk to spk2utt.

	Args:
		<utt2spk>: file name or exkaldi ListTable object.
		<outFile>: file name or None.
	
	Return:
		<outFile> if it is given (the table is saved there), otherwise an exkaldi ListTable object.

	Raises:
		AssertionError: if an utterance ID or a speaker ID contains a space.
	'''
	declare.is_potential_list_table("utt2spk",utt2spk)
	if outFile is not None:
		# Pass the argument name first, like every other declare.* check does.
		declare.is_valid_file_name("outFile",outFile)
	
	if isinstance(utt2spk,str):
		utt2spk = load_list_table(utt2spk)

	spk2utt = ListTable(name="spk2utt")
	for utt,spk in utt2spk.items():
		declare.is_valid_string("utterance ID",utt)
		declare.is_valid_string("speaker ID",spk)
		assert utt.count(" ") == 0,f"<utterance ID> is not a continuous string but spaces existed: {utt}."
		assert spk.count(" ") == 0,f"<speaker ID> is not a continuous string but spaces existed: {spk}."
		
		# Utterances of one speaker are accumulated in a single space-separated string.
		try:
			spk2utt[spk] += f" {utt}"
		except KeyError:
			spk2utt[spk] = utt

	if outFile is None:
		return spk2utt
	else:
		spk2utt.save(outFile)
		return outFile
示例#6
0
    def create(self, mode, suffix=None, encoding=None, name=None):
        '''
        Create a temporary file, register the handle in the inventory and return it.

        Args:
            <mode>: mode passed to tempfile.NamedTemporaryFile.
            <suffix>: None or a string used as the suffix of the temporary file name.
            <encoding>: encoding passed to tempfile.NamedTemporaryFile.
            <name>: a string. The exclusive name under which the handle is registered,
                    so that it can be fetched again by this name.
                    If None, the temporary file name is used as the name.

        Return:
            a file handle.
        '''
        self.verify_safety()

        if suffix is not None:
            declare.is_valid_string("suffix", suffix)

        if name is not None:
            declare.is_valid_string("name", name)
            assert name not in self.__inventory.keys(
            ), f"<name> has been existed. We hope it be exclusive: {name}."

        tempHandle = tempfile.NamedTemporaryFile(mode,
                                                 prefix="exkaldi_",
                                                 suffix=suffix,
                                                 encoding=encoding)

        # Fall back to the generated file name when no explicit name was given.
        key = tempHandle.name if name is None else name
        self.__inventory[key] = tempHandle

        return tempHandle
示例#7
0
def arpa_to_binary(arpaFile, outFile):
    '''
    Transform ARPA language model to KenLM binary format.

    Args:
        <arpaFile>: ARPA file path.
        <outFile>: output binary file path. The suffix ".binary" is appended if it is missing.

    Return:
        output file name with suffix ".binary".

    Raises:
        KenlmProcessError: if the build_binary tool fails or produces no (or an empty) output file.
    '''
    import shlex

    declare.is_file("arpaFile", arpaFile)
    declare.is_valid_string("outFile", outFile)
    outFile = outFile.strip()
    if not outFile.endswith(".binary"):
        outFile += ".binary"

    declare.is_valid_file_name("outFile", outFile)
    make_dependent_dirs(outFile)

    tool = os.path.join(sys.prefix, "exkaldisrc", "tools", "build_binary")
    # The command is run through a shell, so quote every path: otherwise a
    # path containing spaces or shell metacharacters breaks the command line.
    cmd = f"{shlex.quote(tool)} -s {shlex.quote(arpaFile)} {shlex.quote(outFile)}"
    out, err, cod = run_shell_command(cmd, stderr="PIPE")

    failed = (cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0)
    if failed:
        print(err.decode())
        # Do not leave a partial or empty output file behind.
        if os.path.isfile(outFile):
            os.remove(outFile)
        raise KenlmProcessError("Failed to tansform ARPA to binary format.")

    return outFile
示例#8
0
 def list_one_record(target):
     '''
     List <target> with the shell command "ls" and keep the entries that are regular files.

     Args:
         <target>: a string appended to "ls" as its argument.

     Return:
         a list of file paths, or an empty list if "ls" printed nothing.
     '''
     declare.is_valid_string("filePaths", target)
     out, _, _ = run_shell_command(f"ls {target}", stdout=subprocess.PIPE)
     if len(out) == 0:
         return []
     # "ls" prints one entry per line; drop everything that is not a regular file.
     entries = out.decode().strip().split("\n")
     return [entry for entry in entries if os.path.isfile(entry)]
示例#9
0
	def discribe(self, message):
		'''
		Set the description of the current program.

		Args:
			<message>: a string, checked with declare.is_valid_string.
		'''
		# __capture() runs before the message is validated and stored.
		self.__capture()

		declare.is_valid_string("discription message", message)

		self.__discription = message
示例#10
0
    def call(self, name):
        '''
        Fetch a registered file handle by its name.

        Args:
            <name>: a string. The name under which the handle was registered.

        Return:
            the file handle, or None if no handle is registered under <name>.
        '''
        declare.is_valid_string("name", name)
        try:
            found = self.__inventory[name]
        except KeyError:
            found = None
        return found
示例#11
0
def check_config(name, config=None):
    '''
    Check the users' configures or get the default configures of some functions.

    Args:
        <name>: function name. The module exkaldi.config.<name> is imported to get the reference configure.
        <config>: a dict object whose keys are configure names and values are their configure values.
                  If None, return the default configure.

    Return:
        If no configure module exists for <name>: None (a warning is printed).
        If <config> is None:
            a dict object of example configure of <name>.
            If a value is a tuple, it stands for multiple types of value you can set.
        else:
            True if every item of <config> is valid, otherwise an error is raised.

    Raises:
        WrongOperation: if <config> is not a dict or has an unknown configure name.
        WrongDataFormat: if a configure value has an unexpected type.
    '''
    declare.is_valid_string("name", name)

    try:
        module = importlib.import_module(f'exkaldi.config.{name}')
    except ModuleNotFoundError:
        print(f"Warning: no default configure for name '{name}'.")
        return None
    else:
        c = module.config

    if config is None:
        # Each reference entry is a flat sequence (value, type, value, type, ...):
        # the even positions are the example values.
        config = {}
        for key, value in c.items():
            value = tuple(value[i] for i in range(0, len(value), 2))
            value = value if len(value) > 1 else value[0]
            config[key] = value
        return config

    if not isinstance(config, dict):
        raise WrongOperation(
            f"<config> has a wrong format. You can use check_config('{name}') to get expected configure format."
        )

    for k in config.keys():
        if k not in c.keys():
            raise WrongOperation(
                f"No such configure name: <{k}> in {name}.")
        # The odd positions of the reference entry are the accepted types.
        protos = tuple(c[k][i] for i in range(1, len(c[k]), 2))
        if not isinstance(config[k], protos):
            if isinstance(config[k], bool):
                raise WrongDataFormat(
                    f"Configure <{k}> is bool value: {config[k]},but we expected str value like 'true' or 'false'."
                )
            else:
                raise WrongDataFormat(
                    f"Configure <{k}> should be in {protos} but got {type_name(config[k])}."
                )

    # Return only after every key has been checked (also for an empty dict).
    return True
示例#12
0
    def perplexity_sentence(self, sentence):
        '''
        Compute the perplexity of a sentence with the wrapped model.

        Args:
            <sentence>: a string without boundary symbols.
        Return:
            the value returned by the model's perplexity() method.
        '''
        declare.is_valid_string("sentence", sentence)

        result = self.__model.perplexity(sentence)
        return result
示例#13
0
def load_fmllr(target, name="prob", useSuffix=None):
    '''
    Load fmllr transform matrix data.

    Args:
        <target>: Python dict object,bytes object,exkaldi fmllr matrix or index table object,.ark file,.scp file,npy file.
        <name>: a string.
        <useSuffix>: "ark" or "scp" or "npy". We will check the file type by its suffix.
                     But if <target> is file path and not default suffix (ark or scp),you have to declare which type it is.

    Return:
        A BytesFmllrMatrix or NumpyFmllrMatrix object.

    Raises:
        UnsupportedType: if <target> is none of the supported types.
    '''
    declare.is_valid_string("name", name)

    if isinstance(target, dict):
        result = NumpyFmllrMatrix(target, name)
        result.check_format()
        return result

    elif isinstance(target, bytes):
        result = BytesFmllrMatrix(target, name)
        result.check_format()
        return result

    elif isinstance(target, (NumpyFmllrMatrix, BytesFmllrMatrix)):
        # Work on a copy so that renaming does not touch the caller's object.
        result = copy.deepcopy(target)
        result.rename(name)
        return result

    elif isinstance(target, str):
        allData_bytes, allData_numpy, dataType = __read_data_from_file(
            target, useSuffix)
        # __read_data_from_file reports the data type as "numpy" or "bytes".
        # The left operand of "+" is chosen to match the reported type.
        if dataType == "numpy":
            result = NumpyFmllrMatrix(allData_numpy) + BytesFmllrMatrix(
                allData_bytes)
        else:
            result = BytesFmllrMatrix(allData_bytes) + NumpyFmllrMatrix(
                allData_numpy)
        result.rename(name)
        return result

    elif isinstance(target, ArkIndexTable):
        return target.fetch(arkType="fmllrMat", name=name)

    else:
        raise UnsupportedType(
            f"Expected Python dict,bytes object,exkaldi fmllr matrix object,index table object or file path but got {type_name(target)}."
        )
示例#14
0
def load_transcription(target, name="transcription", checkSpace=True):
    '''
    Load a transcription from a file or from an in-memory table.

    Args:
        <target>: transcription file path, or a dict, Transcription or ListTable object.
        <name>: a string.
        <checkSpace>: a bool value. If True,we will check the validity of the number of spaces.

    Return:
        An exkaldi Transcription object.

    Raises:
        WrongDataFormat: if a line of the file has no value part, or if the space check fails.
    '''
    declare.is_classes("target", target,
                       ["dict", "Transcription", "ListTable", "str"])
    declare.is_bool("checkSpace", checkSpace)

    if not isinstance(target, str):
        for uttID, text in target.items():
            declare.is_valid_string("utterance ID", uttID)
            declare.is_valid_string("utterance", text)
        result = Transcription(target, name=name)
    else:
        declare.is_file("target", target)
        with open(target, "r", encoding="utf-8") as reader:
            rawLines = reader.readlines()
        result = Transcription(name=name)
        lineNo = 0
        for rawLine in rawLines:
            lineNo += 1
            # The first field is the utterance ID, the remainder is the text.
            fields = rawLine.strip().split(maxsplit=1)
            if len(fields) >= 2:
                result[fields[0]] = fields[1]
                continue
            print(f"Line Number: {lineNo}")
            print(f"Line Content: {rawLine}")
            raise WrongDataFormat(
                "Missing entire key and value information.")

    if checkSpace:
        # Count the spaces in a subset requested with nRandom=100.
        sampleText = result.subset(nRandom=100)
        spaceCount = sum(text.count(" ") for text in sampleText.values())
        if spaceCount < len(sampleText) // 2:
            errMes = ("The transcription doesn't seem to be separated by spaces or extremely short."
                      "If it actually has right format, set the <checkSpace>=False and run this function again.")
            raise WrongDataFormat(errMes)

    return result
示例#15
0
    def score_sentence(self, sentence, bos=True, eos=True):
        '''
        Score a sentence with the wrapped model.

        Args:
            <sentence>: a string without boundary symbols.
            <bos>: a bool value. If True, add <s> to the head.
            <eos>: a bool value. If True, add </s> to the tail.

        Return:
            the value returned by the model's score() method.
        '''
        declare.is_valid_string("sentence", sentence)
        for flagName, flag in (("bos", bos), ("eos", eos)):
            declare.is_bool(flagName, flag)

        sentenceScore = self.__model.score(sentence, bos, eos)
        return sentenceScore
示例#16
0
    def full_scores_sentence(self, sentence, bos=True, eos=True):
        '''
        Generate the full scores (prob, ngram length, oov) of a sentence with the wrapped model.

        Args:
            <sentence>: a string without boundary symbols.
            <bos>: a bool value. If True, add <s> to the head.
            <eos>: a bool value. If True, add </s> to the tail.

        Return:
            the value returned by the model's full_scores() method,
            an iterator of (prob, ngram length, oov).
        '''
        declare.is_valid_string("sentence", sentence)
        for flagName, flag in (("bos", bos), ("eos", eos)):
            declare.is_bool(flagName, flag)

        allScores = self.__model.full_scores(sentence, bos, eos)
        return allScores
示例#17
0
def run_shell_command_parallel(cmds, env=None, timeout=ExkaldiInfo.timeout):
    '''
    Run shell commands with multiple processes.
    In this mode,we don't allow the input and output streams to be PIPEs.
    If you mistakenly appoint a buffer to be the input or output stream,a timeout error is used to avoid a dead lock.
    So you can change the timeout value into a larger one to deal with a large corpus,
    as long as you rightly apply files as the input and output streams.

    Args:
        <cmds>: a list or tuple of strings. Each string should be a command and its options.
        <env>: If None,use ExkaldiInfo.ENV.
        <timeout>: a positive int value. It is the total timeout of all processes;
                   every process gets an equal share of it.

    Return:
        a list of pairs (return code, error information), in the order of <cmds>.
        A process killed on timeout is reported with return code -9.

    Raises:
        WrongOperation: if <cmds> is empty.
        AssertionError: if the share of <timeout> per process is below one.
    '''
    declare.is_classes("cmds", cmds, [tuple, list])
    declare.is_positive_int("timeout", timeout)

    if env is None:
        env = ExkaldiInfo.ENV

    # Validate everything before the first process is spawned, so that a bad
    # argument cannot leave already-started children running unattended.
    for cmd in cmds:
        declare.is_valid_string("cmd", cmd)
    if len(cmds) == 0:
        raise WrongOperation("<cmds> has not any command to run.")
    dtimeout = timeout // len(cmds)
    assert dtimeout >= 1, f"<timeout> is extremely short: {timeout}."

    processes = [
        subprocess.Popen(cmd,
                         shell=True,
                         stderr=subprocess.PIPE,
                         env=env) for cmd in cmds
    ]

    results = []
    for p in processes:
        try:
            _, err = p.communicate(timeout=dtimeout)
        except subprocess.TimeoutExpired:
            p.kill()
            # Reap the killed child and close its stderr pipe.
            p.communicate()
            errMes = b"Time Out Error: Process was killed! If you are exactly running the right program,"
            errMes += b"you can set a greater timeout value by exkaldi.info.set_timeout()."
            results.append((-9, errMes))
        else:
            results.append((p.returncode, err))

    return results
示例#18
0
    def close(self, name=None):
        '''
        Close file handles. Errors raised while closing are ignored.

        Args:
            <name>: None or a string. If None, close every registered handle.
                    Otherwise close only the handle registered under <name>, if there is one.
        '''
        if name is None:
            targets = list(self.__inventory.values())
        else:
            declare.is_valid_string("name", name)
            if name in self.__inventory.keys():
                targets = [self.__inventory[name]]
            else:
                targets = []

        for handle in targets:
            # Best effort: a handle that fails to close must not stop the others.
            try:
                handle.close()
            except Exception:
                pass
示例#19
0
def run_shell_command(cmd,
                      stdin=None,
                      stdout=None,
                      stderr=None,
                      inputs=None,
                      env=None):
    '''
    Run one shell command with Python subprocess and wait for it to finish.

    Args:
        <cmd>: a string including a shell command and its options.
        <stdin>,<stdout>,<stderr>: IO streams. The string "PIPE" is replaced by subprocess.PIPE.
        <inputs>: a string or bytes to send to the input stream. A string is encoded to bytes first.
        <env>: If None,use ExkaldiInfo.ENV.

    Return:
        out,err,returnCode
    '''
    declare.is_valid_string("cmd", cmd)

    if env is None:
        env = ExkaldiInfo.ENV

    if inputs is not None:
        declare.is_classes("inputs", inputs, [str, bytes])
        if isinstance(inputs, str):
            inputs = inputs.encode()

    # Translate the "PIPE" shorthand of each stream; anything else is passed through.
    stdin, stdout, stderr = (
        subprocess.PIPE if stream == "PIPE" else stream
        for stream in (stdin, stdout, stderr)
    )

    process = subprocess.Popen(cmd,
                               shell=True,
                               stdin=stdin,
                               stdout=stdout,
                               stderr=stderr,
                               env=env)
    out, err = process.communicate(input=inputs)

    return out, err, process.returncode
示例#20
0
def view_kaldi_usage(toolName):
    '''
    Print the help information of the specified kaldi command.

    Args:
        <toolName>: kaldi tool name. It must be a single word.

    Raises:
        ShellProcessError: if running "<toolName> --help" returns a non-zero code.
    '''
    declare.is_valid_string("toolName", toolName)
    words = toolName.strip().split()
    assert len(
        words
    ) == 1, f"<toolName> must only include one command name but got: {toolName}."

    out, err, cod = run_shell_command(f"{words[0]} --help",
                                      stderr=subprocess.PIPE)

    # The captured stderr is printed in both the success and the failure case.
    print(err.decode())
    if cod != 0:
        raise ShellProcessError(f"Failed to get kaldi tool info: {toolName}.")
示例#21
0
def spk2utt_to_utt2spk(spk2utt,outFile=None):
	'''
	Transform spk2utt to utt2spk.

	Args:
		<spk2utt>: file name or exkaldi ListTable object.
		<outFile>: file name or None.

	Return:
		<outFile> if it is given (the table is saved there), otherwise an exkaldi ListTable object.

	Raises:
		AssertionError: if a speaker ID contains a space.
		WrongDataFormat: if an utterance ID appears more than once.
	'''
	declare.is_potential_list_table("spk2utt",spk2utt)
	if outFile is not None:
		# Pass the argument name first, like every other declare.* check does.
		declare.is_valid_file_name("outFile",outFile)
	
	if isinstance(spk2utt,str):
		spk2utt = load_list_table(spk2utt)

	utt2spk = ListTable(name="utt2spk")
	for spk,utts in spk2utt.items():
		declare.is_valid_string("utterance IDs",utts)
		declare.is_valid_string("speaker ID",spk)
		assert spk.count(" ") == 0,f"<speaker ID> is not a continuous string but spaces existed: {spk}."

		# One value holds all utterance IDs of the speaker, separated by whitespace.
		for utt in utts.split():
			try:
				utt2spk[utt]
			except KeyError:
				utt2spk[utt] = spk
			else:
				# An utterance that is already present cannot be assigned again.
				raise WrongDataFormat(f"utterance ID:{utt} has existed toward multiple speakers.")

	if outFile is None:
		return utt2spk
	else:
		utt2spk.save(outFile)
		return outFile
示例#22
0
文件: score.py 项目: wangyu09/exkaldi
def edit_distance(ref, hyp, ignore=None, mode='present'):
    '''
    Compute edit-distance score.

    Args:
        <ref>,<hyp>: exkaldi Transcription objects or transcription file paths.
        <ignore>: None or a string. A symbol to ignore, passed to pure_edit_distance.
        <mode>: 'all' or 'present'. It decides what happens when an utterance of <hyp>
                is missing in <ref>: with 'present' it is skipped and counted as missed,
                with 'all' an error is raised.

    Return:
        a namedtuple object "Score" with the fields
        editDistance, words, SER, sentences, wrongSentences, missedSentences.

    Raises:
        Exception: if an utterance is missing in <ref> while <mode> is 'all',
                   or if no utterance of <hyp> exists in <ref> at all.
    '''
    declare.is_potential_transcription("ref", ref)
    declare.is_potential_transcription("hyp", hyp)
    declare.is_instances("mode", mode, ['all', 'present'])

    if ignore is not None:
        declare.is_valid_string("ignore", ignore)

    if isinstance(ref, str):
        ref = load_transcription(ref)

    if isinstance(hyp, str):
        hyp = load_transcription(hyp)

    allED = 0
    words = 0
    sentences = 0
    wrongSentences = 0
    missedSentences = 0

    ref = ref.sort()
    hyp = hyp.sort()

    for utt, hypTrans in hyp.items():
        try:
            refTrans = ref[utt]
        except KeyError:
            if mode == "all":
                # Only the 'present' mode skips utterances missing in the reference.
                raise Exception(
                    "Missing transcription in reference,set <mode> as 'present' to skip it."
                )
            missedSentences += 1
            continue

        sentences += 1
        ed, wds = pure_edit_distance(refTrans.split(),
                                     hypTrans.split(),
                                     ignore=ignore)
        allED += ed
        words += wds
        if ed > 0:
            wrongSentences += 1

    if sentences == 0:
        raise Exception(
            "Missing all transcription in reference. We don't think it's a reasonable result. Check the file please."
        )

    # SER is the fraction of scored sentences with at least one edit.
    return namedtuple("Score", [
        "editDistance", "words", "SER", "sentences", "wrongSentences",
        "missedSentences"
    ])(allED, words, wrongSentences / sentences, sentences, wrongSentences,
       missedSentences)
示例#23
0
	def add(self,name,dtype,abbr=None,default=None,choices=None,minV=None,maxV=None,discription=None):
		'''
		Add a new option.

		Each argument is validated in turn; the option is registered only after all checks passed.

		Args:
			_name_: a string which must have a format such as "--exkaldi" (but "--help" is unavailable exceptionally.).  
			_dtype_: float, int, str or bool.  
			_abbr_: None or an abbreviation of name which must have a format such as "-e" (but "-h" is unavailable exceptionally.).  
			_default_: the default value or a list/tuple of values.  
			_choices_: a list/tuple of values.  
			_minV_: set the minimum value if dtype is int or float. Enable when _choices_ is None.  
			_maxV_: set the maximum value if dtype is int or float. Enable when _choices_ is None.  
			_discription_: a string to describe this option.

		Raises:
			WrongOperation: if the option name or the abbreviation has been registered already.
			AssertionError: if the name or abbreviation has a wrong format, or if boundaries are
							set for a non-numeric dtype or together with choices.
		'''
		self.__capture()

		# check option name
		declare.is_valid_string("name",name)
		name = name.strip()
		self.__detect_special_char(name)
		# exactly two leading dashes: "--x" is accepted, "---x" is rejected
		assert name[0:2] == "--" and name[2:3] != "-", f"Option name must start with '--' but got: {name}."
		assert name != "--help", "Option name is inavaliable: --help."
		if name in self.__arguments.keys():
			raise WrongOperation(f"Option name has existed: {name}.")
		
		# check dtype
		declare.is_instances("option dtype", dtype, (float,int,bool,str))

		# check abbreviation
		if abbr is not None:
			declare.is_valid_string("abbr",abbr)
			abbr = abbr.strip()
			self.__detect_special_char(abbr)
			# exactly one leading dash: "-x" is accepted, "--x" is rejected
			assert abbr[0:1] == "-" and abbr[1:2] != "-", f"Abbreviation must start with '-' but got: {abbr}."
			assert abbr != "-h", "Abbreviation is inavaliable: -h."
			if abbr in self.__abb2Name.keys():
				raise WrongOperation(f"Abbreviation has existed: {abbr}.")

		# check default value
		if default is not None:
			if isinstance(default,(list,tuple)):
				declare.members_are_classes(f"Default value of {name}", default, dtype)
			else:
				declare.is_classes(f"Default value of {name}", default, dtype)
			if dtype == str:
				# NOTE(review): <default> may be a list/tuple here; confirm __detect_special_char accepts sequences.
				self.__detect_special_char(default)

		# check choices
		if choices is not None:
			declare.is_classes(f"Choices of {name}", choices, (list,tuple))
			declare.members_are_classes(f"Choices of {name}", choices, dtype)
			if dtype == str:
				# NOTE(review): <choices> is a list/tuple here; confirm __detect_special_char accepts sequences.
				self.__detect_special_char(choices)
			# the default value(s) must be among the choices
			if default is not None:
				if isinstance(default,(list,tuple)):
					declare.members_are_instances(f"Default value of {name}", default, choices)
				else:
					declare.is_instances(f"Default value of {name}", default, choices)
		
		# check boundary values
		if minV is not None or maxV is not None:
			assert dtype in [float,int], f"Only float and int option can set the boundary but {name} is {dtype.__name__}."
			assert choices is None, f"Cannot set choices and boundary concurrently: {name}."
			if minV is not None:
				declare.is_classes(f"Minimum value of {name}", minV, dtype)
				# the default value(s) must not be below the minimum
				if default is not None:
					if isinstance(default, (list,tuple)):
						for v in default:
							declare.greater_equal(f"Default value of {name}", v, "minimum expected value", minV)
					else:
						declare.greater_equal(f"Default of {name}", default, "minimum expected value", minV)
			if maxV is not None:
				declare.is_classes(f"Maximum value of {name}", maxV, dtype)
				# the default value(s) must not be above the maximum
				if default is not None:
					if isinstance(default,(list,tuple)):
						for v in default:					
							declare.less_equal(f"Default value of {name}", v, "maximum expected value", maxV)
					else:
						declare.less_equal(f"Default value of {name}", default, "maximum expected value", maxV)
			if minV is not None and maxV is not None:
				declare.less_equal(f"Minimum value of {name}", minV, f"maximum value", maxV)

		# check discription
		if discription is not None:
			declare.is_valid_string(f"Discription of {name}", discription)
			self.__detect_special_char(discription)

		# register the option and both directions of the name <-> abbreviation mapping
		self.__arguments[name] = self.spec(dtype,default,choices,minV,maxV,discription)
		self.__name2Abb[name] = abbr
		if abbr is not None:
			self.__abb2Name[abbr] = name
示例#24
0
def __read_data_from_file(fileName, useSuffix=None):
    '''
    Read data from file. If the file suffix is unknown,<useSuffix> is necessary.

    Args:
        <fileName>: passed to list_files to get the files to read.
        <useSuffix>: None, or "ark", "scp" or "npy". It is used for files whose own
                     suffix is none of these three.

    Return:
        a tuple (allData_bytes, allData_numpy, dataType):
        the joined bytes read from ark/scp files, a dict collected from npy files,
        and "numpy" or "bytes". The data type follows <useSuffix> if it is given,
        otherwise the suffix of the first file.

    Raises:
        UnsupportedType: if a file type cannot be decided or an npy file is not valid.
        KaldiProcessError: if reading an archive table fails.
    '''
    declare.kaldi_existed()

    if useSuffix is not None:
        declare.is_valid_string("useSuffix", useSuffix)
        useSuffix = useSuffix.strip().lower()[-3:]
        declare.is_instances("useSuffix", useSuffix, ["ark", "scp", "npy"])
    else:
        useSuffix = ""

    allFiles = list_files(fileName)

    allData_bytes = []
    allData_numpy = {}

    def loadNpyFile(npyPath):
        # Load a npy file whose records are (utterance ID, numpy array) pairs.
        try:
            # NOTE(review): allow_pickle=True unpickles the file content; only load trusted files.
            temp = np.load(npyPath, allow_pickle=True)
            data = {}
            for utt_mat in temp:
                assert isinstance(utt_mat[0], str) and isinstance(
                    utt_mat[1], np.ndarray)
                data[utt_mat[0]] = utt_mat[1]
        except Exception:
            # Any failure (including the record check above) means the file is unusable.
            raise UnsupportedType(
                f'This is not a valid Exkaldi npy file: {npyPath}.')
        else:
            return data

    def loadArkScpFile(tablePath, suffix):
        # Read an ark or scp file through the kaldi tool copy-feats and return the bytes.
        declare.kaldi_existed()

        if suffix == "ark":
            cmd = 'copy-feats ark:'
        else:
            cmd = 'copy-feats scp:'

        cmd += '{} ark:-'.format(tablePath)
        out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
        if (isinstance(cod, int) and cod != 0) or out == b'':
            raise KaldiProcessError('Failed to read archive table.',
                                    err.decode())
        else:
            return out

    for oneFile in allFiles:
        # The file's own suffix has priority; <useSuffix> is only the fallback.
        sfx = oneFile.strip()[-3:].lower()
        if sfx == "npy":
            allData_numpy.update(loadNpyFile(oneFile))
        elif sfx in ["ark", "scp"]:
            allData_bytes.append(loadArkScpFile(oneFile, sfx))
        elif useSuffix == "npy":
            allData_numpy.update(loadNpyFile(oneFile))
        elif useSuffix in ["ark", "scp"]:
            # The file's own suffix is unknown here, so the declared type decides
            # whether it is read as ark or scp.
            allData_bytes.append(loadArkScpFile(oneFile, useSuffix))
        else:
            raise UnsupportedType(
                'Unknown file suffix. You can appoint the <useSuffix> option with "scp","ark" or "npy".'
            )

    allData_bytes = b"".join(allData_bytes)

    if useSuffix == "":
        useSuffix = allFiles[0].strip()[-3:].lower()

    if useSuffix == "npy":
        dataType = "numpy"
    else:
        dataType = "bytes"

    return allData_bytes, allData_numpy, dataType
示例#25
0
def load_index_table(target, name="index", useSuffix=None):
    '''
    Build an index table from a dict, an index table, a bytes archive or archive table files.

    Args:
        <target>: dict object,.ark or .scp file,IndexTable object,bytes archive object.
        <name>: a string.
        <useSuffix>: "ark" or "scp". We will check the file type by its suffix.
                     But if <target> is file path and not default suffix (ark or scp),you have to declare which type it is.

    Return:
        an exkaldi IndexTable object.

    Raises:
        WrongDataFormat: if a value of the dict is neither a list/tuple nor an Index.
        UnsupportedType: if the type of a file cannot be decided.
    '''
    newTable = IndexTable(name=name)

    if type_name(target) == "dict":
        for key, value in target.items():
            if isinstance(value, (list, tuple)):
                assert len(value) in [
                    3, 4
                ], f"Expected (frames,start index,data size[,file path]) but {value} does not match."
                newTable[key] = newTable.spec(*value)
            elif type_name(value) == "Index":
                newTable[key] = value
            else:
                raise WrongDataFormat(
                    f"Expected list or tuple but got wrong index info format: {value}."
                )
        return newTable

    if type_name(target) == "IndexTable":
        newTable.update(target)
        return newTable

    if isinstance(target, BytesArchive):
        newTable.update(target.indexTable)
        return newTable

    # Anything else is treated as a file path.
    fileList = list_files(target)

    if useSuffix is None:
        useSuffix = ""
    else:
        declare.is_valid_string("useSuffix", useSuffix)
        useSuffix = useSuffix.strip()[-3:].lower()
        declare.is_instances("useSuffix", useSuffix, ["ark", "scp"])

    for fileName in fileList:
        # The file's own suffix has priority; <useSuffix> is only the fallback.
        trimmed = fileName.rstrip()
        if trimmed.endswith(".ark"):
            fileType = "ark"
        elif trimmed.endswith(".scp"):
            fileType = "scp"
        else:
            fileType = useSuffix

        if fileType == "ark":
            t = __read_index_table_from_ark_file(fileName)
        elif fileType == "scp":
            t = __read_index_table_from_scp_file(fileName)
        else:
            raise UnsupportedType(
                "Unknown file suffix. Specify <useSuffix> please.")

        newTable.update(t)

    return newTable
示例#26
0
文件: score.py 项目: wangyu09/exkaldi
def wer(ref, hyp, ignore=None, mode='all'):
    '''
    Compute WER (word error rate) between <ref> and <hyp> with the Kaldi tool compute-wer.

    Args:
        <ref>,<hyp>: exkaldi transcription object or file path.
        <ignore>: None,or a string. If given,every occurrence of "<ignore> " is removed
                  from both <ref> and <hyp> (through sed) before scoring.
        <mode>: "all" or "present". It is forwarded to the --mode option of compute-wer.

    Return:
        a namedtuple "Score" with the fields:
        WER,words,insErr,delErr,subErr,SER,sentences,wrongSentences,missedSentences.

    Raise:
        WrongDataFormat: removing <ignore> failed or left an empty transcription.
        KaldiProcessError: compute-wer printed nothing,or its report could not be parsed.
    '''
    # Local import: only needed here to check the size of the filtered temporary files.
    import os

    declare.is_potential_transcription("ref", ref)
    declare.is_potential_transcription("hyp", hyp)
    declare.is_instances("mode", mode, ['all', 'present'])
    declare.kaldi_existed()

    if ignore is not None:
        declare.is_valid_string("ignore", ignore)

    with FileHandleManager() as fhm:

        def dump(trans):
            # compute-wer reads file paths: an in-memory transcription is written
            # to a temporary file,a path is passed through untouched.
            if type_name(trans) != "Transcription":
                return trans
            temp = fhm.create("w+", suffix=".txt", encoding="utf-8")
            trans.save(temp)
            return temp.name

        def dump_filtered(argName, trans):
            # Write a copy of <trans> without the ignored symbol and return its path.
            if type_name(trans) == "Transcription":
                text = trans.save()
            else:
                with open(trans, "r", encoding="utf-8") as fr:
                    text = fr.read()
            temp = fhm.create("w+", suffix=".txt", encoding="utf-8")
            # NOTE(review): <ignore> is pasted verbatim into a sed expression inside a
            # shell command,so sed/shell metacharacters in it are interpreted.
            cmd = f'sed "s/{ignore} //g" > {temp.name}'
            _, err, cod = run_shell_command(cmd,
                                            stdin="PIPE",
                                            stdout="PIPE",
                                            stderr="PIPE",
                                            inputs=text)
            # The output of sed is redirected into the file,so the captured stdout is
            # always empty. Judge success by the exit code and the size of the file.
            if cod != 0 or os.path.getsize(temp.name) == 0:
                raise WrongDataFormat(
                    f"<{argName}> has wrong data formation.", err.decode())
            return temp.name

        if ignore is None:
            hypPath = dump(hyp)
            refPath = dump(ref)
        else:
            hypPath = dump_filtered("hyp", hyp)
            refPath = dump_filtered("ref", ref)

        # Score while the temporary files are still owned by the manager.
        cmd = f'compute-wer --text --mode={mode} ark:{refPath} ark,p:{hypPath}'
        scoreOut, scoreErr, _ = run_shell_command(cmd,
                                                  stdout="PIPE",
                                                  stderr="PIPE")

    if len(scoreOut) == 0:
        raise KaldiProcessError("Failed to compute WER.", scoreErr.decode())

    report = scoreOut.decode()
    out = report.split("\n")
    # Raw strings keep the regex escapes intact without invalid-escape warnings.
    pattern1 = r'%WER (.*) \[ (.*) \/ (.*),(.*) ins,(.*) del,(.*) sub \]'
    pattern2 = r"%SER (.*) \[ (.*) \/ (.*) \]"
    pattern3 = r"Scored (.*) sentences,(.*) not present in hyp."
    try:
        s1 = re.findall(pattern1, out[0])[0]
        s2 = re.findall(pattern2, out[1])[0]
        s3 = re.findall(pattern3, out[2])[0]
    except IndexError as e:
        # Fewer than three lines,or a line that does not look like a compute-wer report.
        raise KaldiProcessError("Failed to parse the output of compute-wer.",
                                report) from e

    Score = namedtuple("Score", [
        "WER", "words", "insErr", "delErr", "subErr", "SER", "sentences",
        "wrongSentences", "missedSentences"
    ])
    # compute-wer prints "%SER x [ wrong / total ]": the first count is the number of
    # sentences with errors and the second one is the number of scored sentences.
    return Score(
        float(s1[0]),  #WER
        int(s1[2]),  #words
        int(s1[3]),  #ins
        int(s1[4]),  #del
        int(s1[5]),  #sub
        float(s2[0]),  #SER
        int(s2[2]),  #sentences
        int(s2[1]),  #wrong sentences
        int(s3[1])  #missed sentences
    )
示例#27
0
def load_ali(target, aliType="transitionID", name="ali", hmm=None):
    '''
    Load alignment data.

    Args:
        <target>: Python dict object,exkaldi alignment object,exkaldi index table object,
                  or a file path (kaldi alignment file,optionally gzipped,or .npy file).
        <aliType>: None,or one of 'transitionID','phoneID','pdfID'. It decides which kind of alignment object is returned.
        <name>: a string. The name given to the returned object.
        <hmm>: file path or exkaldi HMM object. It is used when 'phoneID' or 'pdfID' alignment
               is produced from an index table or from kaldi alignment files.

    Return:
        exkaldi alignment objects.

    Raise:
        UnsupportedType: <target> has an unsupported type,or a .npy file can not be loaded.
        ShellProcessError: reading a kaldi alignment file failed or gave no data.
        KaldiProcessError: converting the alignment to phone IDs or pdf IDs failed.
    '''
    declare.is_valid_string("name", name)
    declare.is_instances("aliType", aliType,
                         [None, "transitionID", "phoneID", "pdfID"])
    declare.kaldi_existed()

    def transform(data, cmd):
        # Run <cmd> and parse its text output ("uttID id id ...") into {uttID: int32 array}.
        out, err, cod = run_shell_command(cmd,
                                          stdin="PIPE",
                                          stdout="PIPE",
                                          stderr="PIPE",
                                          inputs=data)
        if (isinstance(cod, int) and cod != 0) and out == b'':
            raise KaldiProcessError('Failed to transform alignment.',
                                    err.decode())

        result = {}
        for line in BytesIO(out).readlines():
            fields = line.decode().strip().split()
            if not fields:
                # A blank line carries no utterance.
                continue
            result[fields[0]] = np.array(fields[1:], dtype=np.int32)
        return result

    if isinstance(target, dict):
        if aliType is None:
            result = NumpyAli(target, name)
        elif aliType == "transitionID":
            result = NumpyAliTrans(target, name)
        elif aliType == "phoneID":
            result = NumpyAliPhone(target, name)
        elif aliType == "pdfID":
            result = NumpyAliPdf(target, name)
        else:
            raise WrongOperation(
                f"<aliType> should be None,'transitionID','phoneID' or 'pdfID' but got {aliType}."
            )
        result.check_format()
        return result

    elif isinstance(target, (NumpyAli, NumpyAliTrans, BytesAliTrans)):
        # Work on a copy so that renaming does not touch the caller's object.
        result = copy.deepcopy(target)
        result.rename(name)
        return result

    elif isinstance(target, IndexTable):
        result = target.fetch(arkType="ali")
        if aliType in ["phoneID", "pdfID"]:
            result = result.to_numpy(aliType, hmm)
        result.rename(name)
        return result

    elif isinstance(target, str):
        allFiles = list_files(target)
        numpyAli = {}
        bytesAli = []

        for fileName in allFiles:
            fileName = fileName.strip()
            if fileName.endswith(".npy"):
                # NOTE(review): allow_pickle=True unpickles the file content,
                # so only load .npy files from a trusted source.
                try:
                    temp = np.load(fileName, allow_pickle=True)
                    numpyAli.update(temp)
                except Exception as e:
                    raise UnsupportedType(
                        f'This is not a valid Exkaldi npy file: {fileName}.'
                    ) from e
            else:
                if fileName.endswith('.gz'):
                    cmd = f'gunzip -c {fileName}'
                else:
                    cmd = f'cat {fileName}'

                if aliType is None or aliType == "transitionID":
                    out, err, cod = run_shell_command(cmd,
                                                      stdout="PIPE",
                                                      stderr="PIPE")
                    if (isinstance(cod, int) and cod != 0) or out == b'':
                        raise ShellProcessError(
                            f"Failed to get the alignment data from file: {fileName}.",
                            err.decode())
                    else:
                        bytesAli.append(out)

                else:
                    with FileHandleManager() as fhm:
                        declare.is_potential_hmm("hmm", hmm)
                        # Keep <hmm> itself untouched. The temporary file belongs to this
                        # manager and is presumably gone once it exits,so rebinding <hmm>
                        # to its name would break the next file of the loop.
                        if isinstance(hmm, str):
                            hmmPath = hmm
                        else:
                            hmmTemp = fhm.create("wb+")
                            hmm.save(hmmTemp)
                            hmmPath = hmmTemp.name

                        if aliType == "phoneID":
                            cmd += f" | ali-to-phones --per-frame=true {hmmPath} ark:- ark,t:-"
                        else:
                            cmd += f" | ali-to-pdf {hmmPath} ark:- ark,t:-"
                        temp = transform(None, cmd)

                    numpyAli.update(temp)

        bytesAli = b"".join(bytesAli)
        if aliType is None:
            if len(numpyAli) == 0:
                return BytesAliTrans(bytesAli, name=name)
            elif len(bytesAli) == 0:
                return NumpyAli(numpyAli, name=name)
            else:
                result = NumpyAliTrans(numpyAli) + BytesAliTrans(bytesAli)
                result.rename(name)
                return result
        elif aliType == "transitionID":
            if len(numpyAli) == 0:
                return BytesAliTrans(bytesAli, name=name)
            elif len(bytesAli) == 0:
                return NumpyAliTrans(numpyAli, name=name)
            else:
                result = NumpyAliTrans(numpyAli) + BytesAliTrans(bytesAli)
                result.rename(name)
                return result
        elif aliType == "phoneID":
            return NumpyAliPhone(numpyAli, name=name)
        else:
            return NumpyAliPdf(numpyAli, name=name)

    else:
        raise UnsupportedType(
            f"<target> should be dict,file name or exkaldi alignment or index table object but got: {type_name(target)}."
        )
示例#28
0
def __compute_feature(target,kaldiTool,useSuffix=None,name="feat",outFile=None):
	'''
	The base function to compute feature.

	Args:
		<target>: a file path,exkaldi ListTable object or WavSegment object (or several of them,
				  as far as check_multiple_resources accepts it).
		<kaldiTool>: the kaldi command which computes the feature. It is inserted into the command pattern.
		<useSuffix>: None,or a string ending with "scp" or "wav". It declares how to read a file
					 whose suffix is neither .wav nor .scp.
		<name>: a string. It is used to name the generated feature archive.
		<outFile>: forwarded to check_multiple_resources and used as the output of the command.

	Return:
		what run_kaldi_commands_parallel returns for the generated feature archives.
	'''
	declare.kaldi_existed()

	if useSuffix is not None:
		declare.is_valid_string("useSuffix",useSuffix)
		useSuffix = useSuffix.strip().lower()[-3:]
		declare.is_instances("useSuffix",useSuffix,["scp","wav"])
	else:
		useSuffix = ""

	targets,kaldiTools,useSuffixs,names,outFiles = check_multiple_resources(target,kaldiTool,useSuffix,name,outFile=outFile)
	# pretreatment
	fromSegment = False
	with FileHandleManager() as fhm:

		segments = []
		for index,kaldiTool,target,useSuffix,name in zip(range(len(outFiles)),kaldiTools,targets,useSuffixs,names):

			declare.is_classes("target",target,["str","ListTable","WavSegment"])
			declare.is_valid_string("name",name)

			if isinstance(target,str):

				# Collect every wav file reachable from the path into one ListTable.
				allFiles = list_files(target)
				target = ListTable()

				for filePath in allFiles:
					filePath = filePath.strip()
					if filePath[-4:].lower() == ".wav":
						# The utterance ID is the file name without suffix and without dots.
						fileName = os.path.basename(filePath)
						uttID = fileName[0:-4].replace(".","")
						target[uttID] = filePath

					elif filePath[-4:].lower() == '.scp':
						target += load_list_table(filePath)

					elif "wav" == useSuffix:
						fileName = os.path.basename(filePath)
						uttID = fileName.replace(".","")
						target[uttID] = filePath

					elif "scp" == useSuffix:
						target += load_list_table(filePath)

					else:
						raise UnsupportedType('Unknown file suffix. You can declare whether <useSuffix> is "wav" or "scp".')

				if len(target) == 0:
					raise WrongDataFormat("There did not include any data to compute data in target.")

				targets[index] = target

			elif type_name(target) == "WavSegment":

				# The keyword is "encoding",as in the other fhm.create(...) calls of this project.
				segTemp = fhm.create("w+",suffix=".seg",encoding="utf-8")
				target.save(segTemp)
				segments.append(segTemp.name)

				targets[index] = target.detach_wav()
				fromSegment = True

		if fromSegment:
			# define the command pattern
			cmdPattern = "extract-segments scp:{wavFile} {segment} ark:- | {kaldiTool} ark:- ark:{outFile}"
			# define resources
			resources = {"wavFile":targets,"segment":segments,"kaldiTool":kaldiTools,"outFile":outFiles}
		else:
			# define the command pattern
			cmdPattern = "{kaldiTool} scp:{wavFile} ark:{outFile}"
			# define resources
			resources = {"wavFile":targets,"kaldiTool":kaldiTools,"outFile":outFiles}

		# Run inside the manager: the segment files are temporary files it owns,and they
		# are presumably released when it exits,so the commands must finish before that.
		return run_kaldi_commands_parallel(resources,cmdPattern,analyzeResult=True,generateArchive="feat",archiveNames=names)