Example #1
def __read_one_record_from_ark(fp):
    '''
	Read an utterance from an opened file pointer of an archive file.
	It is used to generate the bytes index table.
	'''
    # read utterance ID
    utt = ''
    while True:
        char = fp.read(1).decode()
        if (char == '') or (char == ' '):
            break
        utt += char
    utt = utt.strip()
    if utt == '':
        if fp.read() == b'':
            return (None, None, None)
        else:
            fp.close()
            raise WrongDataFormat(
                "Miss utterance ID before utterance. This may not be complete Kaldi archeve table file."
            )
    # read data
    binarySymbol = fp.read(2).decode()
    if binarySymbol == '\0B':
        sizeSymbol = fp.read(1).decode()
        if sizeSymbol == '\4':
            frames = int(np.frombuffer(fp.read(4), dtype='int32', count=1)[0])
            buf = fp.read(frames * 5)  # move the handle
            del buf
            dataSize = len(utt) + 8 + frames * 5
            return (utt, frames, dataSize)
        else:
            dataType = sizeSymbol + fp.read(2).decode()
            if dataType == 'CM ':
                fp.close()
                raise UnsupportedType(
                    "Unsupported to generate index table from compressed archive table. Please decompress it firstly."
                )
            elif dataType == 'FM ':
                sampleSize = 4
            elif dataType == 'DM ':
                sampleSize = 8
            else:
                fp.close()
                raise WrongDataFormat(
                    f"This may not be Kaldi archeve table file.")
            s1, rows, s2, cols = np.frombuffer(fp.read(10),
                                               dtype="int8,int32,int8,int32",
                                               count=1)[0]
            rows = int(rows)
            cols = int(cols)
            buf = fp.read(rows * cols * sampleSize)  # move the handle
            del buf
            dataSize = len(utt) + 16 + rows * cols * sampleSize
            return (utt, rows, dataSize)
    else:
        fp.close()
        raise WrongDataFormat(
            "Miss binary symbol before utterance. This may not be Kaldi binary archeve table file."
        )
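A minimal sketch of the record layout this parser assumes (not the library's own writer): it builds one float-matrix ("FM ") record with numpy and checks the dataSize formula used above.

import numpy as np

utt = "utt-001"
mat = np.random.rand(5, 13).astype("float32")          # 5 frames, 13 dims
record = (utt.encode() + b' '                          # utterance ID + space
          + b'\0B'                                     # binary symbol
          + b'FM '                                     # float-matrix data type
          + b'\4' + np.int32(mat.shape[0]).tobytes()   # size symbol + rows
          + b'\4' + np.int32(mat.shape[1]).tobytes()   # size symbol + cols
          + mat.tobytes())                             # row-major float32 data

# matches dataSize = len(utt) + 16 + rows * cols * sampleSize from the parser above
assert len(record) == len(utt) + 16 + mat.shape[0] * mat.shape[1] * 4
# __read_one_record_from_ark(io.BytesIO(record)) would return ("utt-001", 5, len(record))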
Example #2
def check_config(name, config=None):
    '''
	Check the user's configuration or get the default configuration of some functions.

	Args:
		<name>: function name.
		<config>: a dict object whose keys are configuration names and values are their configuration values. If None, return the default configuration.
	
	Return:
		if <config> is None:
			Return None, or a dict object of the example configuration of <name>.
			If the value is a tuple, it stands for multiple types of values you can set.
		else:
			Return True or raise an error.
	'''
    declare.is_valid_string("name", name)

    try:
        module = importlib.import_module(f'exkaldi.config.{name}')
    except ModuleNotFoundError:
        print(f"Warning: no default configure for name '{name}'.")
        return None
    else:
        c = module.config

    if config is None:
        config = {}
        for key, value in c.items():
            value = tuple(value[i] for i in range(0, len(value), 2))
            value = value if len(value) > 1 else value[0]
            config[key] = value
        return config

    else:
        if not isinstance(config, dict):
            raise WrongOperation(
                f"<config> has a wrong format. You can use check_config('{name}') to get expected configure format."
            )
        for k in config.keys():
            if not k in c.keys():
                raise WrongOperation(
                    f"No such configure name: <{k}> in {name}.")
            else:
                protos = tuple(c[k][i] for i in range(1, len(c[k]), 2))
                if not isinstance(config[k], protos):
                    if isinstance(config[k], bool):
                        raise WrongDataFormat(
                            f"Configure <{k}> is bool value: {config[k]},but we expected str value like 'true' or 'false'."
                        )
                    else:
                        raise WrongDataFormat(
                            f"Configure <{k}> should be in {protos} but got {type_name(config[k])}."
                        )

        return True
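A usage sketch, assuming check_config is in scope; "train_ngrams_srilm" is a configure name referenced elsewhere in these examples, and other names are looked up under exkaldi.config.<name>.

defaults = check_config("train_ngrams_srilm")   # default configure dict, or None if the module is missing
if defaults is not None:
    print(defaults)
    # Passing a dict back validates the option names and value types:
    # check_config("train_ngrams_srilm", config={...your options...})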
Example #3
def load_list_table(target, name="listTable"):
    '''
	Generate a ListTable object from a dict object or a file.

	Args:
		<target>: dict object or a file path.
	
	Return:
		a ListTable object.
	'''
    declare.is_classes("target", target, [dict, ListTable, str])

    newTable = ListTable(name=name)
    if type_name(target) in ["dict", "ListTable"]:
        newTable.update(target)
        return newTable

    else:
        files = list_files(target)
        for filePath in files:
            with open(filePath, "r", encoding="utf-8") as fr:
                lines = fr.readlines()
            for index, line in enumerate(lines, start=1):
                t = line.strip().split(maxsplit=1)
                if len(t) < 2:
                    print(f"Line Number: {index}")
                    print(f"Line Content: {line}")
                    raise WrongDataFormat(
                        f"Missing paired key and value information in file:{filePath}."
                    )
                else:
                    newTable[t[0]] = t[1]

        return newTable
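A usage sketch, assuming load_list_table and its exkaldi dependencies are in scope; the file content and paths are hypothetical.

import tempfile

with tempfile.NamedTemporaryFile("w+", suffix=".scp", encoding="utf-8") as f:
    f.write("utt-001 /data/wav/utt-001.wav\n"
            "utt-002 /data/wav/utt-002.wav\n")
    f.flush()
    wavScp = load_list_table(f.name, name="wavScp")
    # wavScp behaves like a dict: {"utt-001": "/data/wav/utt-001.wav", ...}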
Example #4
def load_transcription(target, name="transcription", checkSpace=True):
    '''
	Load a transcription from a file or dict-like object.

	Args:
		<target>: transcription file path.
		<name>: a string.
		<checkSpace>: a bool value. If True, we will check the validity of the number of spaces.

	Return:
		An exkaldi Transcription object.
	'''
    declare.is_classes("target", target,
                       ["dict", "Transcription", "ListTable", "str"])
    declare.is_bool("checkSpace", checkSpace)

    if isinstance(target, str):
        declare.is_file("target", target)
        with open(target, "r", encoding="utf-8") as fr:
            lines = fr.readlines()
        result = Transcription(name=name)
        for index, line in enumerate(lines, start=1):
            t = line.strip().split(maxsplit=1)
            if len(t) < 2:
                print(f"Line Number: {index}")
                print(f"Line Content: {line}")
                raise WrongDataFormat(
                    "Missing entire key and value information.")
            else:
                result[t[0]] = t[1]
    else:
        for utt, utterance in target.items():
            declare.is_valid_string("utterance ID", utt)
            declare.is_valid_string("utterance", utterance)
        result = Transcription(target, name=name)

    if checkSpace:
        sampleText = result.subset(nRandom=100)
        spaceCount = 0
        for key, value in sampleText.items():
            spaceCount += value.count(" ")
        if spaceCount < len(sampleText) // 2:
            errMes = "The transcription doesn't seem to be separated by spaces or extremely short."
            errMes += "If it actually has right format, set the <checkSpace>=False and run this function again."
            raise WrongDataFormat(errMes)

    return result
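A usage sketch, assuming load_transcription and its exkaldi dependencies are in scope; the dict below is hypothetical data.

trans = load_transcription(
    {"utt-001": "hello world", "utt-002": "good morning"},
    name="devTrans",
    checkSpace=False,   # skip the space-count check for this tiny sample
)
# trans is a Transcription object mapping utterance IDs to text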
Example #5
def __read_index_table_from_scp_file(fileName):
    '''
	Read an index table from an scp file.
	'''
    newTable = ArkIndexTable()

    with FileHandleManager() as fhm:

        fr = fhm.open(fileName, "r", encoding="utf-8")
        lines = fr.readlines()

        for lineID, lineTxt in enumerate(lines):
            line = lineTxt.strip().split()
            if len(line) == 0:
                continue
            elif len(line) == 1:
                print(f"line {lineID}: {lineTxt}")
                raise WrongDataFormat(
                    "Missed complete utterance-filepath information.")
            elif len(line) > 2:
                raise WrongDataFormat(
                    "We don't support reading index table from binary data generated via PIPE line. The second value should be ark file path and the shift."
                )
            else:
                uttID = line[0]
                line = line[1].split(":")
                if len(line) != 2:
                    print(f"line {lineID}: {lineTxt}")
                    raise WrongDataFormat(
                        "Missed complete file path and shift value information."
                    )
                arkFileName = line[0]
                startIndex = int(line[1]) - 1 - len(uttID)

                fr = fhm.call(arkFileName)
                if fr is None:
                    fr = fhm.open(arkFileName, "rb")

                fr.seek(startIndex)
                _, frames, dataSize = __read_one_record_from_ark(fr)
                arkFileName = os.path.abspath(arkFileName)
                newTable[uttID] = newTable.spec(frames, startIndex, dataSize,
                                                arkFileName)

    return newTable
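A small sketch of how one scp line (hypothetical path) maps to the start offset used above: the offset after ":" points just past "uttID ", so the record starts len(uttID) + 1 bytes earlier.

line = "utt-001 /data/mfcc/raw_mfcc_train.1.ark:16"
uttID, location = line.split()
arkFileName, shift = location.split(":")
startIndex = int(shift) - 1 - len(uttID)   # 16 - 1 - 7 = 8
print(uttID, arkFileName, startIndex)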
Example #6
def ctc_greedy_search(prob, vocabs, blankID=None):
    '''
    The best path decoding algorithm.

    Args:
        <prob>: an exkaldi probability object. This probability should be an output of a neural network with a CTC loss function.
        <vocabs>: a list of vocabulary.
        <blankID>: specify the ID of the blank symbol. If None, use the last dimension of <prob>.
    Return:
        An exkaldi Transcription object of decoding results.  
    '''
    assert isinstance(
        vocabs,
        list), f"<vocabs> must be a list of vocabulary but got {vocabs}."

    if type_name(prob) == "BytesProbability":
        prob = prob.to_numpy()
    elif type_name(prob) == "NumpyProbability":
        pass
    else:
        raise UnsupportedType(
            f"<prob> should be an exkaldi probability object but got {type_name(prob)}."
        )

    probDim = prob.dim
    if len(vocabs) == probDim:
        if blankID is None:
            blankID = probDim - 1
        else:
            assert isinstance(
                blankID, int
            ) and 0 <= blankID < probDim, f"blankID {blankID} is out of the valid range 0 to {probDim-1}."
    elif len(vocabs) == probDim - 1:
        if blankID is None:
            blankID = probDim - 1
        else:
            assert blankID == probDim - 1, f"The dimensionality of probability is {probDim} but there are only {len(vocabs)} words. In this case, blank ID must be {probDim-1} but got {blankID}."
    else:
        raise WrongDataFormat(
            f"The dimensibality of probability {probDim} does not match the numbers of words {len(vocabs)}."
        )

    results = Transcription(name="bestPathResult")
    for utt, pb in prob.items:
        assert isinstance(pb, np.ndarray) and len(
            pb.shape) == 2, "Unsupported probability matrix formatation."
        best_path = np.argmax(pb, 1)
        best_chars_collapsed = [
            vocabs[ID] for ID, _ in groupby(best_path) if ID != blankID
        ]
        try:
            results[utt] = " ".join(best_chars_collapsed)
        except Exception as e:
            print("<vocab> might has non-string items.")
            raise e
    return results
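The core of the best-path decoder as a self-contained sketch with plain numpy and itertools (no exkaldi objects involved); the toy probabilities are made up.

import numpy as np
from itertools import groupby

vocabs = ["a", "b", "c"]                  # blank is the extra last dimension (ID 3)
blankID = 3
prob = np.array([[0.1, 0.7, 0.1, 0.1],    # frame 1 -> "b"
                 [0.1, 0.7, 0.1, 0.1],    # frame 2 -> "b" (merged with frame 1)
                 [0.1, 0.1, 0.1, 0.7],    # frame 3 -> blank
                 [0.6, 0.2, 0.1, 0.1]])   # frame 4 -> "a"

best_path = np.argmax(prob, 1)            # [1, 1, 3, 0]
collapsed = [vocabs[ID] for ID, _ in groupby(best_path) if ID != blankID]
print(" ".join(collapsed))                # "b a"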
Example #7
def spk2utt_to_utt2spk(spk2uttFile, outFile):
    '''
	Transform spk2utt file to utt2spk file.

	Args:
		<spk2uttFile>: file name.
		<outFile>: file name.
	'''
    assert isinstance(
        spk2uttFile, str
    ), f"<spk2uttFile> should be a string but got: {type_name(spk2uttFile)}."
    assert isinstance(
        outFile,
        str), f"<outFile> should be a string but got: {type_name(outFile)}."

    if not os.path.isfile(spk2uttFile):
        raise WrongPath(f"No such file: {spk2uttFile}.")

    utt2spk = {}
    with open(spk2uttFile, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for index, line in enumerate(lines, start=1):
        line = line.strip().split()
        if len(line) == 0:
            continue
        else:
            if len(line) < 2:
                raise WrongDataFormat(
                    f"Mismatching between utt and spk: {line}.")
            spk = line[0]
            for utt in line[1:]:
                if utt in utt2spk.keys():
                    raise WrongDataFormat(
                        f"utt:{utt} is repeated in line {index}.")
                utt2spk[utt] = spk

    with open(outFile, "w") as fw:
        fw.write("\n".join(
            map(lambda utt, spk: utt + " " + spk, utt2spk.items())))

    return os.path.abspath(outFile)
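The format transform in isolation as a plain Python sketch: one spk2utt line "spk utt1 utt2 ..." becomes one "utt spk" line per utterance.

spk2utt_lines = ["spk1 utt-001 utt-002", "spk2 utt-003"]
utt2spk = {}
for line in spk2utt_lines:
    spk, *utts = line.split()
    for utt in utts:
        utt2spk[utt] = spk
print("\n".join(f"{utt} {spk}" for utt, spk in utt2spk.items()))
# utt-001 spk1
# utt-002 spk1
# utt-003 spk2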
Example #8
def ctc_greedy_search(prob, vocabs, blankID=None):
    '''
    The best path decoding algorithm.

    Args:
        <prob>: an exkaldi probability object. This probability should be an output of a neural network with a CTC loss function.
        <vocabs>: a list of vocabulary.
        <blankID>: specify the ID of the blank symbol. If None, use the last dimension of <prob>.
    Return:
        An exkaldi Transcription object of decoding results.  
    '''
    declare.is_classes("vocabs", vocabs, list)

    declare.is_probability("prob", prob)
    if type_name(prob) == "BytesProbability":
        prob = prob.to_numpy()
    elif type_name(prob) == "ArkIndexTable":
        prob = prob.read_record("prob").to_numpy()

    probDim = prob.dim
    if len(vocabs) == probDim:
        if blankID is None:
            blankID = probDim - 1
        declare.is_positive_int("blankID", blankID)
        declare.in_boundary("blankID", blankID, 0, probDim - 1)
    elif len(vocabs) == probDim - 1:
        if blankID is None:
            blankID = probDim - 1
        else:
            assert blankID == probDim - 1, f"The dimensionality of probability is {probDim} but there are only {len(vocabs)} words. In this case, blank ID must be {probDim-1} but got {blankID}."
    else:
        raise WrongDataFormat(
            f"The dimensibality of probability {probDim} does not match the numbers of words {len(vocabs)}."
        )

    results = Transcription(name="bestPathResult")
    for utt, pb in prob.items:
        declare.is_classes("prob", prob, np.ndarray)
        declare.is_classes("the rank of matrix shape", len(pb.shape),
                           "expected rank", 2)
        best_path = np.argmax(pb, 1)
        best_chars_collapsed = [
            vocabs[ID] for ID, _ in groupby(best_path) if ID != blankID
        ]
        try:
            results[utt] = " ".join(best_chars_collapsed)
        except Exception as e:
            print("<vocab> might has non-string items.")
            raise e
    return results
Example #9
def load_lat(target, name="lat"):
	'''
	Load lattice data.

	Args:
		<target>: a bytes object, file path, or exkaldi lattice object.
		<name>: a string.
	Return:
		An exkaldi lattice object.
	'''
	if isinstance(target, bytes):
		return Lattice(target, name)

	elif isinstance(target, str):
		target = list_files(target)
		allData = []
		for fileName in target:
			if fileName.endswith('.gz'):
				cmd = 'gunzip -c {}'.format(fileName)
				out, err, _ = run_shell_command(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
				if out == b'':
					print(err.decode())
					raise WrongDataFormat('Failed to load Lattice.')
				else:
					allData.append(out)
			else:
				try:
					with open(fileName, 'rb') as fr:
						out = fr.read()
				except Exception as e:
					print("Load lattice file defeated. Please make sure it is a lattice file avaliable.")
					raise e
				else:
					allData.append(out)
		try:
			allData = b"".join(allData)
		except Exception as e:
			raise WrongOperation("Only support binary format lattice file.")
		else:
			return Lattice(data=allData, name=name)

	else:
		raise UnsupportedType(f"Expected bytes object or lattice file but got: {type_name(target)}.")
Example #10
def utt2spk_to_spk2utt(utt2spkFile, outFile):
    '''
	Transform utt2spk file to spk2utt file.

	Args:
		<utt2spkFile>: file name.
		<outFile>: file name.
	'''
    assert isinstance(
        utt2spkFile, str
    ), f"<utt2spkFile> should be a string but got: {type_name(utt2spkFile)}."
    assert isinstance(
        outFile,
        str), f"<outFile> should be a string but got: {type_name(outFile)}."

    if not os.path.isfile(utt2spkFile):
        raise WrongPath(f"No such file: {utt2spkFile}.")

    spk2utt = {}
    with open(utt2spkFile, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for index, line in enumerate(lines, start=1):
        line = line.strip().split()
        if len(line) == 0:
            continue
        else:
            if len(line) != 2:
                raise WrongDataFormat(
                    f"Mismatching between utt and spk: {line}, line {index}.")
            utt, spk = line[0], line[1]
            if spk not in spk2utt.keys():
                spk2utt[spk] = f"{utt}"
            else:
                spk2utt[spk] += f" {utt}"

    with open(outFile, "w") as fw:
        fw.write("\n".join(
            map(lambda spk, utts: spk + " " + utts, spk2utt.items())))

    return os.path.abspath(outFile)
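The inverse transform in isolation as a plain Python sketch: "utt spk" lines are grouped by speaker into one "spk utt1 utt2 ..." line.

utt2spk_lines = ["utt-001 spk1", "utt-002 spk1", "utt-003 spk2"]
spk2utt = {}
for line in utt2spk_lines:
    utt, spk = line.split()
    spk2utt.setdefault(spk, []).append(utt)
print("\n".join(f"{spk} {' '.join(utts)}" for spk, utts in spk2utt.items()))
# spk1 utt-001 utt-002
# spk2 utt-003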
Example #11
def wer(ref, hyp, ignore=None, mode='all'):
    '''
	Compute WER (word error rate) between <ref> and <hyp>. 

	Args:
		<ref>, <hyp>: exkaldi transcription object or file path.
		<ignore>: ignore a symbol.
		<mode>: "all" or "present".
	Return:
		a namedtuple of score information.
	'''
    assert mode in ['all',
                    'present'], 'Expected <mode> to be "present" or "all".'
    ExkaldiInfo.vertify_kaldi_existed()

    hypTemp = tempfile.NamedTemporaryFile("w+",
                                          suffix=".txt",
                                          encoding="utf-8")
    refTemp = tempfile.NamedTemporaryFile("w+",
                                          suffix=".txt",
                                          encoding="utf-8")
    try:
        if ignore is None:
            if type_name(hyp) == "Transcription":
                hyp.save(hypTemp)
                hypTemp.seek(0)
                hypFileName = hypTemp.name
            elif isinstance(hyp, str):
                if not os.path.isfile(hyp):
                    raise WrongPath(f"No such file:{hyp}.")
                else:
                    hypFileName = hyp
            else:
                raise UnsupportedType(
                    '<hyp> should be exkaldi Transcription object or file path.'
                )

            if type_name(ref) == "Transcription":
                ref.save(refTemp)
                refTemp.seek(0)
                refFileName = refTemp.name
            elif isinstance(ref, str):
                if not os.path.isfile(ref):
                    raise WrongPath(f"No such file:{ref}.")
                else:
                    refFileName = ref
            else:
                raise UnsupportedType(
                    '<ref> should be exkaldi Transcription object or file path.'
                )

            cmd = f'compute-wer --text --mode={mode} ark:{refFileName} ark,p:{hypFileName}'
            scoreOut, scoreErr, _ = run_shell_command(cmd,
                                                      stdout=subprocess.PIPE,
                                                      stderr=subprocess.PIPE)
        else:
            if type_name(hyp) == "Transcription":
                hyp = hyp.save()
            elif isinstance(hyp, str):
                if not os.path.isfile(hyp):
                    raise WrongPath(f"No such file:{hyp}.")
                else:
                    with open(hyp, "r", encoding="utf-8") as fr:
                        hyp = fr.read()
            else:
                raise UnsupportedType(
                    '<hyp> should be exkaldi Transcription object or file path.'
                )

            cmd = f'sed "s/{ignore} //g" > {hypTemp.name}'
            hypOut, err, _ = run_shell_command(cmd,
                                               stdin=subprocess.PIPE,
                                               stdout=subprocess.PIPE,
                                               stderr=subprocess.PIPE,
                                               inputs=hyp.encode())
            if len(hypOut) == 0:
                print(err.decode())
                raise WrongDataFormat("<hyp> has wrong data formation.")

            if type_name(ref) == "Transcription":
                ref = ref.save()
            elif isinstance(ref, str):
                if not os.path.isfile(ref):
                    raise WrongPath(f"No such file:{ref}.")
                else:
                    with open(ref, "r", encoding="utf-8") as fr:
                        ref = fr.read()
            else:
                raise UnsupportedType(
                    '<ref> should be exkaldi Transcription object or file path.'
                )

            cmd = f'sed "s/{ignore} //g" > {refTemp.name}'
            refOut, err, cod = run_shell_command(cmd,
                                                 stdin=subprocess.PIPE,
                                                 stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE,
                                                 inputs=ref.encode())
            if cod != 0 or len(refOut) == 0:
                print(err.decode())
                raise WrongDataFormat("<ref> has wrong data formation.")

            cmd = f'compute-wer --text --mode={mode} ark:{refTemp.name} ark,p:{hypTemp.name}'
            scoreOut, scoreErr, _ = run_shell_command(cmd,
                                                      stdout=subprocess.PIPE,
                                                      stderr=subprocess.PIPE)

    finally:
        hypTemp.close()
        refTemp.close()

    if len(scoreOut) == 0:
        print(scoreErr.decode())
        raise KaldiProcessError("Failed to compute WER.")

    else:
        out = scoreOut.decode().split("\n")
        pattern1 = r'%WER (.*) \[ (.*) \/ (.*), (.*) ins, (.*) del, (.*) sub \]'
        pattern2 = r"%SER (.*) \[ (.*) \/ (.*) \]"
        pattern3 = "Scored (.*) sentences, (.*) not present in hyp."
        s1 = re.findall(pattern1, out[0])[0]
        s2 = re.findall(pattern2, out[1])[0]
        s3 = re.findall(pattern3, out[2])[0]

        return namedtuple("Score", [
            "WER", "words", "insErr", "delErr", "subErr", "SER", "sentences",
            "wrongSentences", "missedSentences"
        ])(
            float(s1[0]),  #WER
            int(s1[2]),  #words
            int(s1[3]),  #ins
            int(s1[4]),  #del
            int(s1[5]),  #sub
            float(s2[0]),  #SER
            int(s2[1]),  #sentences
            int(s2[2]),  #wrong sentences
            int(s3[1])  #missed sentences
        )
Example #12
def __compute_feature(target,kaldiTool,useSuffix=None,name="feat",outFile=None):
	'''
	The base function to compute features.
	'''
	declare.kaldi_existed()

	if useSuffix != None:
		declare.is_valid_string("useSuffix",useSuffix)
		useSuffix = useSuffix.strip().lower()[-3:]
		declare.is_instances("useSuffix",useSuffix,["scp","wav"])
	else:
		useSuffix = ""	

	targets,kaldiTools,useSuffixs,names,outFiles = check_multiple_resources(target,kaldiTool,useSuffix,name,outFile=outFile)
	# pretreatment
	fromSegment = False
	with FileHandleManager() as fhm:

		segments = []
		for index,kaldiTool,target,useSuffix,name in zip(range(len(outFiles)),kaldiTools,targets,useSuffixs,names):
			
			declare.is_classes("target",target,["str","ListTable","WavSegment"])
			declare.is_valid_string("name",name)

			if isinstance(target,str):		
		
				allFiles = list_files(target)
				target = ListTable()

				for filePath in allFiles:
					filePath = filePath.strip()
					if filePath[-4:].lower() == ".wav":
						fileName = os.path.basename(filePath)
						uttID = fileName[0:-4].replace(".","")
						target[uttID] = filePath
					
					elif filePath[-4:].lower() == '.scp':
						target += load_list_table(filePath)
					
					elif "wav" == useSuffix:
						fileName = os.path.basename(filePath)
						uttID = fileName.replace(".","")
						target[uttID] = filePath

					elif "scp" == useSuffix:
						target += load_list_table(filePath)

					else:
						raise UnsupportedType('Unknown file suffix. You can declare whether <useSuffix> is "wav" or "scp".')
				
				if len(target) == 0:
					raise WrongDataFormat("There did not include any data to compute data in target.")

				targets[index] = target
			
			elif type_name(target) == "WavSegment":

				segTemp = fhm.create("w+",suffix=".seg",encoding="utf-8")
				target.save(segTemp)
				segments.append(segTemp.name)

				targets[index] = target.detach_wav()
				fromSegment = True

	if fromSegment:
		# define the command pattern
		cmdPattern = "extract-segments scp:{wavFile} {segment} ark:- | {kaldiTool} ark:- ark:{outFile}"
		# define resources
		resources = {"wavFile":targets,"segment":segments,"kaldiTool":kaldiTools,"outFile":outFiles}
	else:
		# define the command pattern
		cmdPattern = "{kaldiTool} scp:{wavFile} ark:{outFile}"
		# define resources
		resources = {"wavFile":targets,"kaldiTool":kaldiTools,"outFile":outFiles}

	# Run
	return run_kaldi_commands_parallel(resources,cmdPattern,analyzeResult=True,generateArchive="feat",archiveNames=names)
Example #13
def wer(ref, hyp, ignore=None, mode='all'):
    '''
	Compute WER (word error rate) between <ref> and <hyp>. 

	Args:
		<ref>,<hyp>: exkaldi transcription object or file path.
		<ignore>: ignore a symbol.
		<mode>: "all" or "present".

	Return:
		a namedtuple of score information.
	'''
    declare.is_potential_transcription("ref", ref)
    declare.is_potential_transcription("hyp", hyp)
    declare.is_instances("mode", mode, ['all', 'present'])
    declare.kaldi_existed()

    if ignore is not None:
        declare.is_valid_string("ignore", ignore)

    with FileHandleManager() as fhm:

        if ignore is None:

            if type_name(hyp) == "Transcription":
                hypTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
                hyp.save(hypTemp)
                hyp = hypTemp.name

            if type_name(ref) == "Transcription":
                refTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
                ref.save(refTemp)
                ref = refTemp.name

            cmd = f'compute-wer --text --mode={mode} ark:{ref} ark,p:{hyp}'
            scoreOut, scoreErr, _ = run_shell_command(cmd,
                                                      stdout="PIPE",
                                                      stderr="PIPE")

        else:
            # remove the ingored symbol in hyp
            if type_name(hyp) == "Transcription":
                hyp = hyp.save()
            else:
                with open(hyp, "r", encoding="utf-8") as fr:
                    hyp = fr.read()
            hypTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
            cmd = f'sed "s/{ignore} //g" > {hypTemp.name}'
            hypOut, err, _ = run_shell_command(cmd,
                                               stdin="PIPE",
                                               stdout="PIPE",
                                               stderr="PIPE",
                                               inputs=hyp)
            if len(hypOut) == 0:
                print(err.decode())
                raise WrongDataFormat("<hyp> has wrong data formation.")
            # remove the ingored symbol in ref
            if type_name(ref) == "Transcription":
                ref = ref.save()
            else:
                with open(ref, "r", encoding="utf-8") as fr:
                    ref = fr.read()
            refTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
            cmd = f'sed "s/{ignore} //g" > {refTemp.name}'
            refOut, err, cod = run_shell_command(cmd,
                                                 stdin="PIPE",
                                                 stdout="PIPE",
                                                 stderr="PIPE",
                                                 inputs=ref)
            if cod != 0 or len(refOut) == 0:
                print(err.decode())
                raise WrongDataFormat("<ref> has wrong data formation.")
            # score
            cmd = f'compute-wer --text --mode={mode} ark:{refTemp.name} ark,p:{hypTemp.name}'
            scoreOut, scoreErr, _ = run_shell_command(cmd,
                                                      stdout="PIPE",
                                                      stderr="PIPE")

    if len(scoreOut) == 0:
        print(scoreErr.decode())
        raise KaldiProcessError("Failed to compute WER.")
    else:
        out = scoreOut.decode().split("\n")
        pattern1 = r'%WER (.*) \[ (.*) \/ (.*),(.*) ins,(.*) del,(.*) sub \]'
        pattern2 = r"%SER (.*) \[ (.*) \/ (.*) \]"
        pattern3 = "Scored (.*) sentences,(.*) not present in hyp."
        s1 = re.findall(pattern1, out[0])[0]
        s2 = re.findall(pattern2, out[1])[0]
        s3 = re.findall(pattern3, out[2])[0]

        return namedtuple("Score", [
            "WER", "words", "insErr", "delErr", "subErr", "SER", "sentences",
            "wrongSentences", "missedSentences"
        ])(
            float(s1[0]),  #WER
            int(s1[2]),  #words
            int(s1[3]),  #ins
            int(s1[4]),  #del
            int(s1[5]),  #sub
            float(s2[0]),  #SER
            int(s2[1]),  #sentences
            int(s2[2]),  #wrong sentences
            int(s3[1])  #missed sentences
        )
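A usage sketch, assuming wer and its exkaldi dependencies are in scope, Kaldi's compute-wer binary is on PATH, and the file paths and ignored symbol are hypothetical.

score = wer("exp/decode/ref.txt", "exp/decode/hyp.txt", ignore="<sil>", mode="all")
print(score.WER, score.SER, score.insErr, score.delErr, score.subErr)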
Example #14
def train_ngrams_kenlm(lexicons, order, text, outFile, config=None):
    '''
	Train an N-Grams language model with the KenLM toolkit.

	Args:
		<lexicons>: an exkaldi LexiconBank object.
		<order>: the maximum order of N-Grams.
		<text>: a text corpus file or an exkaldi transcription object.
		<outFile>: output file name of arpa LM.
		<config>: extra configurations, a Python dict object.

	You can use .check_config("train_ngrams_kenlm") function to get a reference of extra configurations.
	Also you can run shell command "lmplz" to look their usage.
	'''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)

    declare.less_equal("order", order, "max order", 9)

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus that is split by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd,
                                              stdout="PIPE",
                                              stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(
                    f"Failed to sample from text file:{text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file:{text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat(
                        "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                    )

        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(
                    value, str
                ), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat(
                    "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                )
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        extraConfig = " "
        if config is not None:
            if check_config(name='train_ngrams_kenlm', config=config):
                if "--temp_prefix" in config.keys() and "-T" in config.keys():
                    raise WrongOperation(
                        f'"--temp_prefix" and "-T" is the same configuration so only one of them is expected.'
                    )
                if "--memory" in config.keys() and "-S" in config.keys():
                    raise WrongOperation(
                        f'"--memory" and "-S" is the same configuration so only one of them is expected.'
                    )
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                    else:
                        extraConfig += f"{key} {value} "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words_count = math.ceil(len(words) / 10) * 10
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {text} --arpa {outFile} --limit_vocab_file {wordlistTemp.name}"
        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or (os.path.getsize(outFile)
                                                 == 0):
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KenlmProcessError("Failed to generate arpa file.")

        return outFile
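A usage sketch, assuming train_ngrams_kenlm is in scope, `lexicons` is an existing exkaldi LexiconBank object, the paths are hypothetical, and "--memory" is among the accepted options (the function itself checks for it above).

arpaFile = train_ngrams_kenlm(
    lexicons,
    order=3,
    text="data/train/text",
    outFile="exp/lm/3gram.arpa",
    config={"--memory": "20%"},   # lmplz memory limit; "-S" is the short form
)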
Example #15
def train_ngrams_srilm(lexicons, order, text, outFile, config=None):
    '''
	Train an N-Grams language model with the SriLM toolkit.
	If you don't specify the discount method via the <config> option, "kndiscount" is used by default.

	Args:
		<lexicons>: an exkaldi LexiconBank object.
		<order>: the maximum order of N-Grams.
		<text>: a text corpus file or an exkaldi transcription object.
		<outFile>: output file name of arpa LM.
		<config>: extra configurations, a Python dict object.

	You can use .check_config("train_ngrams_srilm") function to get a reference of extra configurations.
	Also you can run shell command "ngram-count" to look their usage.
	'''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)
    # verify the max order
    declare.less_equal("order", order, "max order", 9)
    # prepare srilm tool
    ExkaldiInfo.prepare_srilm()

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus that is split by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd,
                                              stdout="PIPE",
                                              stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(
                    f"Failed to sample from text file:{text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file:{text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat(
                        "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                    )

        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(
                    value, str
                ), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat(
                    "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                )
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        unkSymbol = lexicons("oov")

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                        if key.endswith("discount"):
                            specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f'ngram-count -text {text} -order {order} -limit-vocab -vocab {wordlistTemp.name} -unk -map-unk "{unkSymbol}" '
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)
        cmd += f" -lm {outFile}"

        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError(
                f'Failed to generate N-Grams language model.')

        return outFile
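A usage sketch, assuming train_ngrams_srilm is in scope, `lexicons` is an existing exkaldi LexiconBank object, the paths are hypothetical, and "-wbdiscount" is among the accepted options (any "*discount" key overrides the default -kndiscount above).

arpaFile = train_ngrams_srilm(
    lexicons,
    order=3,
    text="data/train/text",
    outFile="exp/lm/3gram.arpa",
    config={"-wbdiscount": True},   # use Witten-Bell instead of the default Kneser-Ney discount
)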
Example #16
def ctc_prefix_beam_search(prob,
                           vocabs,
                           blankID=None,
                           beam=5,
                           cutoff=0.999,
                           strick=1.0,
                           lmFile=None,
                           alpha=1.0,
                           beta=0):
    '''
    Prefix beam search decoding algorithm. LM score is supported.

    Args:
        <prob>: an exkaldi probability object. This probability should be an output of a neural network with a CTC loss function.
                We expect the probability has not passed through any activation function, or it may generate wrong results.
        <vocabs>: a list of vocabulary.
        <blankID>: specify the ID of the blank symbol. If None, use the last dimension of <prob>.
        <beam>: the beam size.
        <cutoff>: the sum threshold to cut off dimensions whose probability is extremely small.
        <strick>: when the decoding results of two adjacent frames are the same, the probability of the latter will be reduced.
        <lmFile>: if not None, add language model scores to the beam.
        <alpha>: the weight of the LM score.
        <beta>: the length normalization weight of the LM score.
    Return:
        An exkaldi Transcription object of decoding results.  
    '''
    declare.is_classes("vocabs", vocabs, [tuple, list])

    declare.is_probability("prob", prob)
    if type_name(prob) == "BytesProbability":
        prob = prob.to_numpy()
    elif type_name(prob) == "ArkIndexTable":
        prob = prob.read_record("prob").to_numpy()

    if lmFile is not None:
        declare.is_file("lmFile", lmFile)
    else:
        lmFile = "none"

    probDim = prob.dim
    if len(vocabs) == probDim:
        if blankID is None:
            blankID = probDim - 1
        declare.is_positive_int("blankID", blankID)
        declare.in_boundary("blankID", blankID, 0, probDim - 1)

    elif len(vocabs) == probDim - 1:
        if blankID is None:
            blankID = probDim - 1
        else:
            assert blankID == probDim - 1, f"The dimensionality of probability is {probDim} but there are only {len(vocabs)} words. In this case, blank ID must be {probDim-1} but got {blankID}."
    else:
        raise WrongDataFormat(
            f"The dimensibality of probability {probDim} does not match the numbers of words {len(vocabs)}."
        )

    for ID, word in enumerate(vocabs):
        if len(word.strip()) == 0:
            raise WrongDataFormat(f"Found a vocab {word} unavaliable.")

    num_classes = len(vocabs)
    vocabs = " ".join(vocabs)

    sources = [
        vocabs.encode(),
    ]
    uttTemp = []
    for utt, pb in prob.items:
        declare.is_classes("prob", pb, np.ndarray)
        declare.is_classes("the rank of matrix shape", len(pb.shape),
                           "expected rank", 2)
        uttTemp.append(utt)
        pb = softmax(pb, axis=1)
        sources.append(f" {pb.shape[0]} ".encode() +
                       pb.astype("float32").tobytes())

    sources = b"".join(sources)

    cmd = os.path.join(sys.prefix, "exkaldisrc", "tools",
                       "prefix_beam_search_decode")
    cmd += " --num_files {}".format(prob.lens[0])
    cmd += " --num_classes {}".format(num_classes)
    cmd += " --blank_id {}".format(blankID)
    cmd += " --lm_model {}".format(lmFile)
    cmd += " --beam_size {}".format(beam)
    cmd += " --cutoff_prob {}".format(cutoff)
    cmd += " --alpha {}".format(alpha)
    cmd += " --beta {}".format(beta)

    out, err, _ = run_shell_command(cmd,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    inputs=sources)

    if len(out) == 0:
        print(err.decode())
        raise Exception("Failed to beam search decode.")
    else:
        results = Transcription(name="beamSearchResults")
        out = out.decode().strip().split("file")
        results = []
        for index, re in enumerate(out[1:]):
            re = re.strip().split("\n")
            if len(re) <= 1:
                results.append([
                    "",
                ])
            else:
                results[uttTemp[index]] = " ".join(re[1].strip().split()[1:])

        return results
Example #17
def convert_field(prob, originVocabs, targetVocabs, retainOOV=False):
    '''
    Transform the dimensions of probability to the target field.

    Args:
        <prob>: an exkaldi probability object. This probability should be an output of a neural network.
        <originVocabs>: list of the original field vocabulary.
        <targetVocabs>: list of the target field vocabulary.
        <retainOOV>: if True, target words which are not in the original vocabulary will be retained with the minimum probability of each frame.
    Return:
        A new exkaldi probability object and a list of the new target vocabulary.
    '''
    assert isinstance(
        originVocabs, list
    ), f"<originVocabs> must be a list of vocabulary but got {originVocabs}."
    assert isinstance(
        targetVocabs, list
    ), f"<targetVocabs> must be a list of vocabulary but got {targetVocabs}."

    if type_name(prob) == "BytesProbability":
        prob = prob.to_numpy()
    elif type_name(prob) == "NumpyProbability":
        pass
    else:
        raise UnsupportedType(
            f"<prob> should be an exkaldi probability object but got {type_name(prob)}."
        )

    probDim = prob.dim
    if len(originVocabs) != probDim:
        raise WrongDataFormat(
            f"The dimensibality of probability {probDim} does not match the numbers of words {len(originVocabs)}."
        )
    assert len(targetVocabs) > 0, f"Target vocabulary is void."

    origin_w2i = dict((w, i) for i, w in enumerate(originVocabs))

    retainIDs = []
    newTargetVocabs = []
    for w in targetVocabs:
        try:
            ID = origin_w2i[w]
        except KeyError:
            if retainOOV is True:
                newTargetVocabs.append(w)
                retainIDs.append(None)
            else:
                pass
        else:
            newTargetVocabs.append(w)
            retainIDs.append(ID)

    results = {}
    for utt, pb in prob.items:
        assert isinstance(pb, np.ndarray) and len(
            pb.shape) == 2, "Unsupported probability matrix shape."
        if retainOOV is True:
            padding = np.min(pb, axis=1)
        new = np.zeros(shape=(pb.shape[0], len(retainIDs)), dtype=np.float32)
        for index, i in enumerate(retainIDs):
            if i is None:
                new[:, index] = padding
            else:
                new[:, index] = pb[:, i]

        results[utt] = new

    newName = f"convert({prob.name})"
    return NumpyProbability(data=results, name=newName), newTargetVocabs
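The column-remapping idea in isolation as a plain numpy sketch: each target vocab keeps its original probability column, and out-of-field targets get the per-frame minimum when retainOOV is True. The toy values are made up.

import numpy as np

originVocabs = ["a", "b", "c"]
targetVocabs = ["c", "a", "z"]                 # "z" is outside the original field
pb = np.array([[0.5, 0.3, 0.2],
               [0.1, 0.1, 0.8]], dtype=np.float32)

origin_w2i = {w: i for i, w in enumerate(originVocabs)}
retainIDs = [origin_w2i.get(w) for w in targetVocabs]   # [2, 0, None]
padding = pb.min(axis=1)
new = np.stack([pb[:, i] if i is not None else padding for i in retainIDs], axis=1)
print(new)   # columns: "c", "a", per-frame minimum for "z"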
Example #18
	def load(self, filePath):
		'''
		Load arguments from a file.

		Args:
			_filePath_: args file path.
		'''
		declare.is_file("filePath", filePath)
		self.reset()

		with open(filePath, "r", encoding="utf-8") as fr:
			lines = fr.read()
		lines = lines.strip()
		if len(lines) == 0:
			raise WrongOperation(f"This is a void file: {filePath}.")
		
		blocks = lines.split("\n\n")
		
		def __parse(name, value, dtype):
			if dtype in [float,int]:
				try:
					value = dtype(value)
				except ValueError:
					raise WrongOperation(f"Option <{name}> need a {dtype.__name__} value but choices got: {value}.")
			elif dtype == bool:
				if value.lower() == "true":
					value = True
				elif c.lower() == "false":
					value = False
				else:
					raise WrongOperation(f"Option <{name}> need a bool value but choices got: {value}.")

			return value  

		self.__discription = blocks[0].strip()
		for blockNo, block in enumerate(blocks[1:], start=1):
			block = block.strip()
			if len(block) == 0:
				continue
			block = block.split("\n")
			# 1. match options
			values = {"name":None,"abbr":None,"dtype":None,"default":None,"choices":None,"minV":None,"maxV":None,"discription":None,"value":None}
			for m in block:
				m = m.strip()
				assert "=" in m, f"Augument should has format: key = value, but got: {m}."
				assert len(m.split("=")) == 2, f"Augument should has format: key = value, but got: {m}."
				m = m.split("=")
				name = m[0].strip()
				value = m[1].strip()
				declare.is_instances("Option key", name, list(values.keys()))
				values[name] = value

			for key, value in values.items():
				assert value is not None, f"Missing {key} information in block {blockNo}."
			# 2. parse
			name = values["name"]
			# parse the dtype firstly
			declare.is_instances("dtype", values["dtype"], ["float","int","bool","str"])
			values["dtype"] = eval(values["dtype"])
			dtype = values["dtype"]	
			# then parse the choices
			choices = values["choices"]
			if choices in ["none", "None"]:
				choices = None
			else:
				choices = choices.split("|")
				for i, c in enumerate(choices):
					choices[i] = __parse(name, c, dtype)
			values["choices"] = choices
			# then parse the boundary value
			boundary = {"minV":None, "maxV":None}
			for i in boundary.keys():
				V = values[i]
				if V not in ["none", "None"]:
					assert dtype in [float,int], f"Only float and int options can set a boundary but {name} is {dtype.__name__}."
					assert choices is None, f"{name} cannot set choices and boundary concurrently."
					
					toIntFlag = True
					toFloatFlag = True
					try:
						float(V)
					except ValueError:
						toFloatFlag= False
					try:
						int(V)
					except ValueError:
						toIntFlag= False
					
					if toIntFlag is False and toFloatFlag is False:
						raise WrongDataFormat(f"Boundary values of {name} should be a int or float value but got: {V}.")
					elif toIntFlag is False and toFloatFlag is True: # the value is predicted to be a float value
						if dtype != float:
							raise WrongDataFormat(f"{name}'s dtype is int but try to set boundary value with a float value: {V}.")
						else:
							V = float(V)
					elif toIntFlag is True and toFloatFlag is True: # the value is predicted to be a float or an int value
						V = dtype(V)
					else:
						raise WrongDataFormat(f"Failed to set {name}'s boundary value: {V}.")
				
					boundary[i] = V
			values["minV"] = boundary["minV"]
			values["maxV"] = boundary["maxV"]
			# then parse the default and value
			if values["default"].lower() == "none":
				values["default"] = None
			else:
				default = values["default"].split("|")
				for i, v in enumerate(default):
					default[i] = __parse(name, v, dtype)
				values["default"] = default if len(default) > 1 else default[0]
			
			# the judgement of "default" will be done by .parse() function, so here we only verify "value"
			if values["value"].lower() == "none":
				values["value"] = None
			else:
				value = values["value"].split("|")
				for i, v in enumerate(value):
					v = __parse(name, v, dtype)
					if values["choices"] is not None:
						declare.is_instances("Option value", v, values["choices"])
					else:
						if values["minV"] is not None:
							declare.greater_equal("Option value", v, "minimum expected value", values["minV"])
						if values["maxV"] is not None:
							declare.less_equal("Option value", v, "maximum expected value", values["maxV"])
					value[i] = v
				if len(value) == 1:
					value = value[0]
				values["value"] = value
			
			# check abbreviation
			if values["abbr"] in ["none", "None"]:
				values["abbr"] = None

			# add this options
			self.add(name=values["name"], 
							 dtype=values["dtype"], 
							 abbr=values["abbr"], 
							 default=values["default"], 
					 		 choices=values["choices"], 
							 minV=values["minV"], 
							 maxV=values["maxV"], 
							 discription=values["discription"]
							)
			
			# finally, modify the "value"
			self.__arguments[values["name"]] = self.__arguments[values["name"]]._replace(value=values["value"])
			if values["value"] is not None:
				self.__setattr__(values["name"], values["value"])
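Based on the parsing logic above, an args file this loader would accept might look like the following (hypothetical option; the first block is the description, blocks are separated by blank lines, and "none" marks unset fields).

An example description of this set of arguments.

name = epochs
abbr = none
dtype = int
default = 10
choices = none
minV = 1
maxV = none
discription = the number of training epochs
value = 20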
Example #19
def train_ngrams_srilm(lexicons, order, textFile, outFile, config=None):
    '''
	Train an n-grams language model with the SriLM toolkit.

	Args:
		<lexicons>: words.txt file path or Exkaldi LexiconBank object.
		<order>: the maximum order of n-grams.
		<textFile>: text corpus file.
		<outFile>: ARPA out file name.
		<config>: configures, a Python dict object.

	You can use .check_config("train_ngrams_srilm") function to get configure information that you can set.
	Also you can run shell command "lmplz" to look their meaning.
	'''
    assert isinstance(
        order, int
    ) and order > 0 and order < 10, "Expected <order> to be a positive int value smaller than 10."
    assert isinstance(textFile,
                      str), "Expected <textFile> to be a name-like string."
    assert isinstance(outFile, str), "Expected <outFile> to be a name-like string."
    assert type_name(
        lexicons
    ) == "LexiconBank", f"Expected <lexicons> is exkaldi LexiconBank object but got {type_name(lexicons)}."

    ExkaldiInfo.prepare_srilm()

    if not os.path.isfile(textFile):
        raise WrongPath(f"No such file:{textFile}")
    else:
        ## Should check the numbers of lines
        cmd = f"shuf {textFile} -n 100"
        out, err, cod = run_shell_command(cmd,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
        if (isinstance(cod, int) and cod != 0):
            print(err.decode())
            raise ShellProcessError("Failed to sample from text file.")
        elif out == b'':
            raise WrongDataFormat("Void text file.")
        else:
            out = out.decode().strip().split("\n")
            spaceCount = 0
            for line in out:
                spaceCount += line.count(" ")
            if spaceCount < len(out) // 2:
                raise WrongDataFormat(
                    "The text file doesn't seem to be separated by spaces or extremely short."
                )

    wordlist = tempfile.NamedTemporaryFile("w+",
                                           encoding='utf-8',
                                           suffix=".txt")
    unkSymbol = lexicons("oov")
    try:
        lexiconp = lexicons("lexiconp")
        words = [x[0] for x in lexiconp.keys()]
        wordlist.write("\n".join(words))
        wordlist.seek(0)

        #cmd2 = f"ngram-count -text {textFile} -order {order}"
        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                        if key.endswith("discount"):
                            specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f"ngram-count -text {textFile} -order {order} -limit-vocab -vocab {wordlist.name} -unk -map-unk {unkSymbol} "
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        cmd += f" -lm {outFile}"

        out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError(
                f'Failed to generate ngrams language model.')
        else:
            return os.path.abspath(outFile)

    finally:
        wordlist.close()
Example #20
def load_index_table(target, name="index", useSuffix=None):
    '''
	Load an index table from a dict object or an archive table file.

	Args:
		<target>: a dict object, .ark or .scp file, ArkIndexTable object, or bytes archive object.
		<name>: a string.
		<useSuffix>: "ark" or "scp". We will check the file type by its suffix. 
								But if <target> is file path and not default suffix (ark or scp),you have to declare which type it is.

	Return:
		an exkaldi ArkIndexTable object.
	'''
    newTable = ArkIndexTable(name=name)

    if type_name(target) == "dict":
        for key, value in target.items():
            if isinstance(value, (list, tuple)):
                assert len(value) in [
                    3, 4
                ], f"Expected (frames,start index,data size[,file path]) but {value} does not match."
                newTable[key] = newTable.spec(*value)
            elif type_name(value) == "Index":
                newTable[key] = value
            else:
                raise WrongDataFormat(
                    f"Expected list or tuple but got wrong index info format: {value}."
                )

        return newTable

    elif type_name(target) == "ArkIndexTable":
        newTable.update(target)
        return newTable

    elif isinstance(target, BytesArchive):
        newTable.update(target.indexTable)
        return newTable

    else:
        fileList = list_files(target)

        if useSuffix is not None:
            declare.is_valid_string("useSuffix", useSuffix)
            useSuffix = useSuffix.strip()[-3:].lower()
            declare.is_instances("useSuffix", useSuffix, ["ark", "scp"])
        else:
            useSuffix = ""

        for fileName in fileList:

            if fileName.rstrip().endswith(".ark"):
                t = __read_index_table_from_ark_file(fileName)
            elif fileName.rstrip().endswith(".scp"):
                t = __read_index_table_from_scp_file(fileName)
            elif useSuffix == "ark":
                t = __read_index_table_from_ark_file(fileName)
            elif useSuffix == "scp":
                t = __read_index_table_from_scp_file(fileName)
            else:
                raise UnsupportedType(
                    "Unknown file suffix. Specify <useSuffix> please.")

            newTable.update(t)

        return newTable
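A usage sketch, assuming load_index_table and its exkaldi dependencies are in scope; the .scp path is hypothetical.

indexTable = load_index_table("exp/mfcc/raw_mfcc.scp", name="mfccIndex")
# each value holds (frames, start index, data size[, file path]) for one utterance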
Example #21
def decompress_feat(feat):
    '''
	Decompress a Kaldi compressed feature whose data type is "CM".
	
	Args:
		<feat>: an exkaldi feature object.
	Return:
		A new exkaldi feature object.

	This function is adapted from the kaldi-io-for-python tool.
	For more information, please visit https://github.com/vesis84/kaldi-io-for-python/blob/master/kaldi_io/kaldi_io.py
	'''
    assert isinstance(
        feat,
        BytesFeature), "Expected <feat> to be an exkaldi bytes feature object."

    def _read_compressed_mat(fd):

        # Format of header 'struct',
        global_header = np.dtype([('minvalue', 'float32'),
                                  ('range', 'float32'), ('num_rows', 'int32'),
                                  ('num_cols', 'int32')
                                  ])  # member '.format' is not written,
        per_col_header = np.dtype([('percentile_0', 'uint16'),
                                   ('percentile_25', 'uint16'),
                                   ('percentile_75', 'uint16'),
                                   ('percentile_100', 'uint16')])

        # Read global header,
        globmin, globrange, rows, cols = np.frombuffer(fd.read(16),
                                                       dtype=global_header,
                                                       count=1)[0]
        cols = int(cols)
        rows = int(rows)

        # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ]
        #                         {           cols           }{     size         }
        col_headers = np.frombuffer(fd.read(cols * 8),
                                    dtype=per_col_header,
                                    count=cols)
        col_headers = np.array([
            np.array([x
                      for x in y]) * globrange * 1.52590218966964e-05 + globmin
            for y in col_headers
        ],
                               dtype=np.float32)
        data = np.reshape(np.frombuffer(fd.read(cols * rows),
                                        dtype='uint8',
                                        count=cols * rows),
                          newshape=(cols, rows))  # stored as col-major,

        mat = np.zeros((cols, rows), dtype='float32')
        p0 = col_headers[:, 0].reshape(-1, 1)
        p25 = col_headers[:, 1].reshape(-1, 1)
        p75 = col_headers[:, 2].reshape(-1, 1)
        p100 = col_headers[:, 3].reshape(-1, 1)
        mask_0_64 = (data <= 64)
        mask_193_255 = (data > 192)
        mask_65_192 = (~(mask_0_64 | mask_193_255))

        mat += (p0 + (p25 - p0) / 64. * data) * mask_0_64.astype(np.float32)
        mat += (p25 + (p75 - p25) / 128. *
                (data - 64)) * mask_65_192.astype(np.float32)
        mat += (p75 + (p100 - p75) / 63. *
                (data - 192)) * mask_193_255.astype(np.float32)

        return mat.T, rows, cols

    with BytesIO(feat.data) as sp:
        newData = []
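        # Walk through the archive record by record: copy each utterance ID and
        # binary header as-is, then replace the compressed "CM " matrix body with
        # an uncompressed float32 "FM " matrix.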

        while True:
            data = b''
            utt = ''
            while True:
                char = sp.read(1)
                data += char
                char = char.decode()
                if (char == '') or (char == ' '): break
                utt += char
            utt = utt.strip()
            if utt == '': break
            binarySymbol = sp.read(2)
            data += binarySymbol
            binarySymbol = binarySymbol.decode()
            if binarySymbol == '\0B':
                dataType = sp.read(3).decode()
                if dataType == 'CM ':
                    data += 'FM '.encode()
                    matrix, rows, cols = _read_compressed_mat(sp)
                    data += '\04'.encode()
                    data += struct.pack(np.dtype('uint32').char, rows)
                    data += '\04'.encode()
                    data += struct.pack(np.dtype('uint32').char, cols)
                    data += matrix.tobytes()
                    newData.append(data)
                else:
                    raise UnsupportedType(
                        "This is not compressed binary data.")
            else:
                raise WrongDataFormat("Missing the binary symbol.")

    return BytesFeature(b''.join(newData), name=feat.name)
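
The per-column dequantization in _read_compressed_mat maps every stored byte back through the column percentiles p0, p25, p75 and p100, using a different linear segment for each of the three byte ranges. A small worked example follows; the percentile values are made up purely for illustration.

# Worked example of the dequantization above (illustrative numbers only).
p0, p25, p75, p100 = -2.0, 0.0, 1.0, 3.0   # per-column percentiles
v = 200                                     # stored uint8 value; v > 192, so it lies in the top band
decoded = p75 + (p100 - p75) / 63. * (v - 192)
print(decoded)  # 1.0 + 2.0 / 63 * 8 ≈ 1.254
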
Example #22
0
def train_ngrams_kenlm(lexicons, order, textFile, outFile, config=None):
    '''
	Train an n-grams language model with the KenLM toolkit.

	Args:
		<lexicons>: words.txt file path or exkaldi LexiconBank object.
		<order>: the maximum order of the n-grams.
		<textFile>: text corpus file.
		<outFile>: ARPA output file name.
		<config>: configures, a Python dict object.

	You can use the check_config("train_ngrams_kenlm") function to get the configures that you can set.
	You can also run the shell command "lmplz" to look up their meanings.
	'''
    assert isinstance(
        order, int
    ) and 0 < order <= 6, "We support up to 6-gram LMs in the current version."

    if not os.path.isfile(textFile):
        raise WrongPath("No such file: {}".format(textFile))
    else:
        ## Sample some lines to sanity-check the corpus format
        cmd = f"shuf {textFile} -n 100"
        out, err, cod = run_shell_command(cmd,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
        if (isinstance(cod, int) and cod != 0):
            print(err.decode())
            raise ShellProcessError("Failed to sample from text file.")
        elif out == b'':
            raise WrongDataFormat("Void text file.")
        else:
            out = out.decode().strip().split("\n")
            spaceCount = 0
            for line in out:
                spaceCount += line.count(" ")
            if spaceCount < len(out) // 2:
                raise WrongDataFormat(
                    "The text file does not seem to be space-separated, or its lines are extremely short."
                )

    extraConfig = " "
    if config is not None:
        assert isinstance(
            config, dict
        ), f"<config> should be dict object but got: {type_name(config)}."
        if check_config(name='train_ngrams_kenlm', config=config):
            if "--temp_prefix" in config.keys() and "-T" in config.keys():
                raise WrongOperation(
                    f'"--temp_prefix" and "-T" is the same configure so only one of them is expected.'
                )
            if "--memory" in config.keys() and "-S" in config.keys():
                raise WrongOperation(
                    f'"--memory" and "-S" is the same configure so only one of them is expected.'
                )
            for key, value in config.items():
                if isinstance(value, bool):
                    if value is True:
                        extraConfig += f"{key} "
                else:
                    extraConfig += f"{key} {value} "

    assert isinstance(outFile, str), "<outFile> should be a string."
    if not outFile.rstrip().endswith(".arpa"):
        outFile += ".arpa"
    make_dependent_dirs(outFile, pathIsFile=True)

    words = tempfile.NamedTemporaryFile("w+", suffix=".txt", encoding="utf-8")
    try:
        if type_name(lexicons) == "LexiconBank":
            ws = lexicons("words")
            words_count = math.ceil(len(ws) / 10) * 10
            ws = "\n".join(ws.keys())
        elif isinstance(lexicons, str):
            if not os.path.isfile(lexicons):
                raise WrongPath(f"No such file:{lexicons}.")
            with open(lexicons, "r", encoding="utf-8") as fr:
                lines = fr.readlines()
            ws = []
            for line in lines:
                line = line.strip().split(maxsplit=1)
                if len(line) < 1:
                    continue
                else:
                    ws.append(line[0])
            words_count = math.ceil(len(ws) / 10) * 10
            ws = "\n".join(ws)
        else:
            raise UnsupportedType(
                "<lexicons> should be a LexiconBank object or a file path.")

        words.write(ws)
        words.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {textFile} --arpa {outFile} --limit_vocab_file {words.name}"
        out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or (os.path.getsize(outFile)
                                                 == 0):
            print(err.decode())
            raise KenlmProcessError("Failed to generate ARPA file.")
        else:
            return os.path.abspath(outFile)

    finally:
        words.close()
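
A minimal usage sketch of train_ngrams_kenlm follows; the vocabulary, corpus and output paths are placeholders, and the module path of the import is an assumption rather than something stated in the example above.

# Hedged usage sketch; all paths are placeholders and the import path is assumed.
from exkaldi.lm import train_ngrams_kenlm

arpa = train_ngrams_kenlm("words.txt", order=3,
                          textFile="corpus.txt", outFile="trigram.arpa",
                          config={"-S": "20%"})   # "-S" caps lmplz memory use, as handled above
print(arpa)  # absolute path of the generated ARPA file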