def compute_fbank(target,rate=16000,frameWidth=25,frameShift=10,melBins=23,windowType='povey',useSuffix=None,config=None,name="fbank",outFile=None):
    '''
    Compute fbank feature.

    Share Args:
        Null

    Parallel Args:
        <target>: wave file, scp file, exkaldi ListTable object or WavSegment object.
                  If it is a wave file, its file name is used as the utterance ID.
        <rate>: sample rate.
        <frameWidth>: window width (ms).
        <frameShift>: window shift (ms).
        <melBins>: the number of mel filter banks.
        <windowType>: window type.
        <useSuffix>: if the suffix of file is not .scp or .wav, use this to specify it.
        <config>: extra optional configurations. Use exkaldi.check_config('compute_fbank')
                  to get the reference of extra configurations, or run the shell command
                  "compute-fbank-feats" to look at their usage.
        <name>: the name of the output feature.
        <outFile>: output file name.

    Return:
        exkaldi feature or index table object.
    '''
    # Broadcast each parallel argument to groups of equal length.
    stdParameters = check_multiple_resources(rate,frameWidth,frameShift,melBins,windowType,config)

    baseCmds = []
    for rate,frameWidth,frameShift,melBins,windowType,config,_ in zip(*stdParameters):
        # Validate this group of parameters before building its command.
        declare.is_positive_int("rate",rate)
        declare.is_positive_int("frameWidth",frameWidth)
        declare.is_positive_int("frameShift",frameShift)
        declare.is_positive_int("melBins",melBins)
        declare.greater_equal("frameWidth",frameWidth,"frameShift",frameShift)
        declare.is_instances("windowType",windowType,["hamming","hanning","povey","rectangular","blackmann"])

        # Assemble the base compute-fbank-feats command for this group.
        options = [
            'compute-fbank-feats --allow-downsample --allow-upsample',
            f'--sample-frequency={rate}',
            f'--frame-length={frameWidth}',
            f'--frame-shift={frameShift}',
            f'--num-mel-bins={melBins}',
            f'--window-type={windowType}',
        ]
        # Append user-supplied extra options once they pass validation.
        if config is not None and check_config(name='compute_fbank',config=config):
            for key,value in config.items():
                if isinstance(value,bool):
                    # Boolean flags are emitted bare, and only when True.
                    if value is True:
                        options.append(f"{key}")
                else:
                    options.append(f"{key}={value}")

        baseCmds.append(" ".join(options) + " ")

    # Delegate the actual extraction to the shared feature-computing routine.
    return __compute_feature(target,baseCmds,useSuffix,name,outFile)
def load_ali(target, aliType="transitionID", name="ali", hmm=None):
    '''
    Load alignment data.

    Args:
        <target>: Python dict object, bytes object, exkaldi alignment object,
                  kaldi alignment file or .npy file.
        <aliType>: None, or one of 'transitionID','phoneID','pdfID'.
                   It will return a different alignment object.
        <name>: a string.
        <hmm>: file path or exkaldi HMM object. Required when converting a
               kaldi alignment file to phone or pdf IDs.

    Return:
        exkaldi alignment objects.
    '''
    declare.is_valid_string("name", name)
    declare.is_instances("aliType", aliType, [None, "transitionID", "phoneID", "pdfID"])
    declare.kaldi_existed()

    def transform(data, cmd):
        # Run a kaldi pipeline and parse its text-format output into
        # {utt: int32 vector}. Each output line is: <utt> <id> <id> ...
        out, err, cod = run_shell_command(cmd, stdin="PIPE", stdout="PIPE", stderr="PIPE", inputs=data)
        # NOTE(review): this uses `and out == b''` while the direct file-read
        # branch below uses `or out == b''` — the looseness here looks
        # unintentional but is preserved; confirm against upstream.
        if (isinstance(cod, int) and cod != 0) and out == b'':
            raise KaldiProcessError('Failed to transform alignment.', err.decode())
        else:
            result = {}
            sp = BytesIO(out)
            for line in sp.readlines():
                line = line.decode()
                line = line.strip().split()
                utt = line[0]
                matrix = np.array(line[1:], dtype=np.int32)
                result[utt] = matrix
            return result

    if isinstance(target, dict):
        # Wrap an in-memory {utt: ids} dict in the alignment class matching <aliType>.
        if aliType is None:
            result = NumpyAli(target, name)
        elif aliType == "transitionID":
            result = NumpyAliTrans(target, name)
        elif aliType == "phoneID":
            result = NumpyAliPhone(target, name)
        elif aliType == "pdfID":
            result = NumpyAliPdf(target, name)
        else:
            raise WrongOperation(f"<aliType> should be None,'transitionID','phoneID' or 'pdfID' but got {aliType}.")
        result.check_format()
        return result

    elif isinstance(target, (NumpyAli, NumpyAliTrans, BytesAliTrans)):
        # Already an alignment object: deep-copy and rename so the original is untouched.
        result = copy.deepcopy(target)
        result.rename(name)
        return result

    elif isinstance(target, IndexTable):
        # Fetch the alignment archive behind the index table; convert to
        # phone/pdf IDs when requested (conversion needs <hmm>).
        result = target.fetch(arkType="ali")
        if aliType in ["phoneID", "pdfID"]:
            result = result.to_numpy(aliType, hmm)
        result.rename(name)
        return result

    elif isinstance(target, str):
        # One or more file paths: .npy files are merged into a numpy dict,
        # other files are read (gunzip-ing .gz) as kaldi alignment archives.
        allFiles = list_files(target)
        numpyAli = {}
        bytesAli = []
        for fileName in allFiles:
            fileName = fileName.strip()
            if fileName.endswith(".npy"):
                try:
                    temp = np.load(fileName, allow_pickle=True)
                    numpyAli.update(temp)
                except:
                    raise UnsupportedType(f'This is not a valid Exkaldi npy file: {fileName}.')
            else:
                if fileName.endswith('.gz'):
                    cmd = f'gunzip -c {fileName}'
                else:
                    cmd = f'cat {fileName}'
                if aliType is None or aliType == "transitionID":
                    # Keep the raw (transition-ID) bytes as-is.
                    out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
                    if (isinstance(cod, int) and cod != 0) or out == b'':
                        raise ShellProcessError(f"Failed to get the alignment data from file: {fileName}.", err.decode())
                    else:
                        bytesAli.append(out)
                else:
                    # Convert transition IDs to phone/pdf IDs via kaldi tools;
                    # an in-memory HMM is spilled to a temp file first.
                    with FileHandleManager() as fhm:
                        declare.is_potential_hmm("hmm", hmm)
                        if not isinstance(hmm, str):
                            hmmTemp = fhm.create("wb+")
                            hmm.save(hmmTemp)
                            # NOTE(review): <hmm> is rebound to a temp path that is
                            # deleted when this context exits; with multiple input
                            # files the later iterations reuse the stale path — confirm.
                            hmm = hmmTemp.name
                        if aliType == "phoneID":
                            cmd += f" | ali-to-phones --per-frame=true {hmm} ark:- ark,t:-"
                            temp = transform(None, cmd)
                        else:
                            cmd += f" | ali-to-pdf {hmm} ark:- ark,t:-"
                            temp = transform(None, cmd)
                        numpyAli.update(temp)

        bytesAli = b"".join(bytesAli)
        # Package whatever was collected; mixed npy + archive inputs are merged.
        if aliType is None:
            if len(numpyAli) == 0:
                return BytesAliTrans(bytesAli, name=name)
            elif len(bytesAli) == 0:
                return NumpyAli(numpyAli, name=name)
            else:
                result = NumpyAliTrans(numpyAli) + BytesAliTrans(bytesAli)
                result.rename(name)
                return result
        elif aliType == "transitionID":
            if len(numpyAli) == 0:
                return BytesAliTrans(bytesAli, name=name)
            elif len(bytesAli) == 0:
                return NumpyAliTrans(numpyAli, name=name)
            else:
                result = NumpyAliTrans(numpyAli) + BytesAliTrans(bytesAli)
                result.rename(name)
                return result
        elif aliType == "phoneID":
            return NumpyAliPhone(numpyAli, name=name)
        else:
            return NumpyAliPdf(numpyAli, name=name)

    else:
        raise UnsupportedType(f"<target> should be dict,file name or exkaldi alignment or index table object but got: {type_name(target)}.")
def __read_data_from_file(fileName, useSuffix=None):
    '''
    Read data from file. If the file suffix is unknown, <useSuffix> is necessary.

    Args:
        <fileName>: file path (or a pattern matching multiple files).
        <useSuffix>: "ark", "scp" or "npy". Only consulted for files whose real
                     suffix is not one of those three.

    Return:
        a triple: (joined bytes data, numpy data dict, "bytes" or "numpy").
    '''
    declare.kaldi_existed()

    if useSuffix != None:
        declare.is_valid_string("useSuffix", useSuffix)
        useSuffix = useSuffix.strip().lower()[-3:]
        declare.is_instances("useSuffix", useSuffix, ["ark", "scp", "npy"])
    else:
        useSuffix = ""

    allFiles = list_files(fileName)
    allData_bytes = []
    allData_numpy = {}

    def loadNpyFile(fileName):
        # Load an Exkaldi .npy file: an array of (utt, matrix) pairs.
        try:
            temp = np.load(fileName, allow_pickle=True)
            data = {}
            for utt_mat in temp:
                assert isinstance(utt_mat[0], str) and isinstance(utt_mat[1], np.ndarray)
                data[utt_mat[0]] = utt_mat[1]
        except:
            raise UnsupportedType(f'This is not a valid Exkaldi npy file: {fileName}.')
        else:
            return data

    def loadArkScpFile(fileName, suffix):
        # Read a kaldi archive (.ark) or script (.scp) file into raw bytes via copy-feats.
        declare.kaldi_existed()
        if suffix == "ark":
            cmd = 'copy-feats ark:'
        else:
            cmd = 'copy-feats scp:'
        cmd += '{} ark:-'.format(fileName)
        out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
        if (isinstance(cod, int) and cod != 0) or out == b'':
            raise KaldiProcessError('Failed to read archive table.', err.decode())
        else:
            #if sys.getsizeof(out) > 10000000000:
            #    print('Warning: Data is extramely large. We don't recommend use load_index_table to replace it.')
            return out

    for fileName in allFiles:
        sfx = fileName.strip()[-3:].lower()
        if sfx == "npy":
            allData_numpy.update(loadNpyFile(fileName))
        elif sfx in ["ark", "scp"]:
            allData_bytes.append(loadArkScpFile(fileName, sfx))
        elif useSuffix == "npy":
            allData_numpy.update(loadNpyFile(fileName))
        elif useSuffix in ["ark", "scp"]:
            # Fixed: pass the declared <useSuffix>, not the unknown real suffix
            # <sfx>. The original passed <sfx>, so loadArkScpFile's suffix check
            # fell through to the scp branch and useSuffix="ark" was ignored.
            allData_bytes.append(loadArkScpFile(fileName, useSuffix))
        else:
            raise UnsupportedType('Unknown file suffix. You can appoint the <useSuffix> option with "scp","ark" or "npy".')

    allData_bytes = b"".join(allData_bytes)

    # Decide the dominant data type from the first file when no suffix was declared.
    if useSuffix == "":
        useSuffix = allFiles[0].strip()[-3:].lower()
    if useSuffix == "npy":
        dataType = "numpy"
    else:
        dataType = "bytes"

    return allData_bytes, allData_numpy, dataType
def load_index_table(target, name="index", useSuffix=None):
    '''
    Load an index table from dict, or archive table file.

    Args:
        <target>: dict object, .ark or .scp file, IndexTable object, bytes archive object.
        <name>: a string.
        <useSuffix>: "ark" or "scp". We will check the file type by its suffix,
                     but if <target> is a file path without a default suffix
                     (ark or scp), you have to declare which type it is.

    Return:
        an exkaldi IndexTable object.
    '''
    newTable = IndexTable(name=name)
    targetType = type_name(target)

    # Case 1: a plain dict of index specs or Index records.
    if targetType == "dict":
        for key, value in target.items():
            if isinstance(value, (list, tuple)):
                assert len(value) in [3, 4], f"Expected (frames,start index,data size[,file path]) but {value} does not match."
                newTable[key] = newTable.spec(*value)
            elif type_name(value) == "Index":
                newTable[key] = value
            else:
                raise WrongDataFormat(f"Expected list or tuple but got wrong index info format: {value}.")
        return newTable

    # Case 2: already an index table — copy its entries.
    if targetType == "IndexTable":
        newTable.update(target)
        return newTable

    # Case 3: a bytes archive carries its own index table.
    if isinstance(target, BytesArchive):
        newTable.update(target.indexTable)
        return newTable

    # Case 4: one or more file paths.
    if useSuffix is not None:
        declare.is_valid_string("useSuffix", useSuffix)
        useSuffix = useSuffix.strip()[-3:].lower()
        declare.is_instances("useSuffix", useSuffix, ["ark", "scp"])
    else:
        useSuffix = ""

    for fileName in list_files(target):
        trimmedName = fileName.rstrip()
        # Prefer the real suffix; fall back to the declared one.
        if trimmedName.endswith(".ark"):
            part = __read_index_table_from_ark_file(fileName)
        elif trimmedName.endswith(".scp"):
            part = __read_index_table_from_scp_file(fileName)
        elif useSuffix == "ark":
            part = __read_index_table_from_ark_file(fileName)
        elif useSuffix == "scp":
            part = __read_index_table_from_scp_file(fileName)
        else:
            raise UnsupportedType("Unknown file suffix. Specify <useSuffix> please.")
        newTable.update(part)

    return newTable
def __compute_feature(target,kaldiTool,useSuffix=None,name="feat",outFile=None):
    '''
    The base function to compute feature.

    Maps each (target, kaldiTool) pair to a kaldi command line and runs them
    in parallel. <target> entries may be wave/scp file paths, ListTable
    objects or WavSegment objects.
    '''
    declare.kaldi_existed()

    if useSuffix != None:
        declare.is_valid_string("useSuffix",useSuffix)
        useSuffix = useSuffix.strip().lower()[-3:]
        declare.is_instances("useSuffix",useSuffix,["scp","wav"])
    else:
        useSuffix = ""

    # Broadcast all parallel arguments to equal-length lists.
    targets,kaldiTools,useSuffixs,names,outFiles = check_multiple_resources(target,kaldiTool,useSuffix,name,outFile=outFile)

    # pretreatment
    fromSegment = False
    with FileHandleManager() as fhm:

        segments = []
        for index,kaldiTool,target,useSuffix,name in zip(range(len(outFiles)),kaldiTools,targets,useSuffixs,names):

            declare.is_classes("target",target,["str","ListTable","WavSegment"])
            declare.is_valid_string("name",name)

            if isinstance(target,str):
                # Expand the path into a ListTable of {utt: wav path} entries.
                allFiles = list_files(target)
                target = ListTable()
                for filePath in allFiles:
                    filePath = filePath.strip()
                    if filePath[-4:].lower() == ".wav":
                        # Derive the utterance ID from the file name (dots stripped).
                        fileName = os.path.basename(filePath)
                        uttID = fileName[0:-4].replace(".","")
                        target[uttID] = filePath
                    elif filePath[-4:].lower() == '.scp':
                        target += load_list_table(filePath)
                    elif "wav" == useSuffix:
                        fileName = os.path.basename(filePath)
                        uttID = fileName.replace(".","")
                        target[uttID] = filePath
                    elif "scp" == useSuffix:
                        target += load_list_table(filePath)
                    else:
                        raise UnsupportedType('Unknown file suffix. You can declare whether <useSuffix> is "wav" or "scp".')

                if len(target) == 0:
                    raise WrongDataFormat("There did not include any data to compute data in target.")

                targets[index] = target

            elif type_name(target) == "WavSegment":
                # Spill the segment spec to a temp file so extract-segments can read it,
                # and keep only the wav table as the command input.
                segTemp = fhm.create("w+",suffix=".seg",encode="utf-8")
                target.save(segTemp)
                segments.append(segTemp.name)
                targets[index] = target.detach_wav()
                fromSegment = True

        if fromSegment:
            # define the command pattern: cut segments first, then pipe into the feature tool
            cmdPattern = "extract-segments scp:{wavFile} {segment} ark:- | {kaldiTool} ark:- ark:{outFile}"
            # define resources
            resources = {"wavFile":targets,"segment":segments,"kaldiTool":kaldiTools,"outFile":outFiles}
        else:
            # define the command pattern
            cmdPattern = "{kaldiTool} scp:{wavFile} ark:{outFile}"
            # define resources
            resources = {"wavFile":targets,"kaldiTool":kaldiTools,"outFile":outFiles}

        # Run
        return run_kaldi_commands_parallel(resources,cmdPattern,analyzeResult=True,generateArchive="feat",archiveNames=names)
def wer(ref, hyp, ignore=None, mode='all'):
    '''
    Compute WER (word error rate) between <ref> and <hyp>.

    Args:
        <ref>,<hyp>: exkaldi transcription object or file path.
        <ignore>: ignore a symbol (it is stripped from both texts before scoring).
        <mode>: "all" or "present".

    Return:
        a namedtuple of score information.
    '''
    declare.is_potential_transcription("ref", ref)
    declare.is_potential_transcription("hyp", hyp)
    declare.is_instances("mode", mode, ['all', 'present'])
    declare.kaldi_existed()

    if ignore is not None:
        declare.is_valid_string("ignore", ignore)

    with FileHandleManager() as fhm:

        if ignore is None:
            # No filtering needed: just make sure both sides are files on disk.
            if type_name(hyp) == "Transcription":
                hypTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
                hyp.save(hypTemp)
                hyp = hypTemp.name
            if type_name(ref) == "Transcription":
                refTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
                ref.save(refTemp)
                ref = refTemp.name

            cmd = f'compute-wer --text --mode={mode} ark:{ref} ark,p:{hyp}'
            scoreOut, scoreErr, _ = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")

        else:
            # remove the ignored symbol in hyp
            if type_name(hyp) == "Transcription":
                hyp = hyp.save()
            else:
                with open(hyp, "r", encoding="utf-8") as fr:
                    hyp = fr.read()
            hypTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
            cmd = f'sed "s/{ignore} //g" > {hypTemp.name}'
            # Fixed: the command redirects its stdout into <hypTemp>, so the
            # captured stdout is always empty; the original `len(hypOut) == 0`
            # check therefore raised unconditionally. Check the exit code instead.
            hypOut, err, cod = run_shell_command(cmd, stdin="PIPE", stdout="PIPE", stderr="PIPE", inputs=hyp)
            if isinstance(cod, int) and cod != 0:
                raise WrongDataFormat("<hyp> has wrong data formation.", err.decode())

            # remove the ignored symbol in ref
            if type_name(ref) == "Transcription":
                ref = ref.save()
            else:
                with open(ref, "r", encoding="utf-8") as fr:
                    ref = fr.read()
            refTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
            cmd = f'sed "s/{ignore} //g" > {refTemp.name}'
            # Fixed: same stdout-redirection issue as above — `len(refOut) == 0`
            # was always true; only the exit code is meaningful here.
            refOut, err, cod = run_shell_command(cmd, stdin="PIPE", stdout="PIPE", stderr="PIPE", inputs=ref)
            if isinstance(cod, int) and cod != 0:
                raise WrongDataFormat("<ref> has wrong data formation.", err.decode())

            # score
            cmd = f'compute-wer --text --mode={mode} ark:{refTemp.name} ark,p:{hypTemp.name}'
            scoreOut, scoreErr, _ = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")

        if len(scoreOut) == 0:
            raise KaldiProcessError("Failed to compute WER.", scoreErr.decode())

        # Parse compute-wer's three report lines into a flat namedtuple.
        out = scoreOut.decode().split("\n")
        pattern1 = r'%WER (.*) \[ (.*) \/ (.*),(.*) ins,(.*) del,(.*) sub \]'
        pattern2 = r"%SER (.*) \[ (.*) \/ (.*) \]"
        pattern3 = r"Scored (.*) sentences,(.*) not present in hyp."
        s1 = re.findall(pattern1, out[0])[0]
        s2 = re.findall(pattern2, out[1])[0]
        s3 = re.findall(pattern3, out[2])[0]

        return namedtuple("Score", [
            "WER", "words", "insErr", "delErr", "subErr",
            "SER", "sentences", "wrongSentences", "missedSentences"
        ])(
            float(s1[0]),  # WER
            int(s1[2]),    # words
            int(s1[3]),    # ins
            int(s1[4]),    # del
            int(s1[5]),    # sub
            float(s2[0]),  # SER
            int(s2[1]),    # sentences
            int(s2[2]),    # wrong sentences
            int(s3[1])     # missed sentences
        )
def edit_distance(ref, hyp, ignore=None, mode='present'):
    '''
    Compute edit-distance score.

    Args:
        <ref>,<hyp>: exkaldi Transcription objects (or file paths).
        <ignore>: ignoring specific symbols.
        <mode>: when both are Transcription objects, if mode is 'present',
                skip the missed utterances; if 'all', a missing reference
                utterance is an error.

    Return:
        a namedtuple object including score information.
    '''
    declare.is_potential_transcription("ref", ref)
    declare.is_potential_transcription("hyp", hyp)
    declare.is_instances("mode", mode, ['all', 'present'])

    if ignore is not None:
        declare.is_valid_string("ignore", ignore)

    # Load from file if a path was given.
    if isinstance(ref, str):
        ref = load_transcription(ref)
    if isinstance(hyp, str):
        hyp = load_transcription(hyp)

    allED = 0
    words = 0
    sentences = 0
    wrongSentences = 0
    missedSentences = 0

    ref = ref.sort()
    hyp = hyp.sort()
    for utt, hypTrans in hyp.items():
        try:
            refTrans = ref[utt]
        except KeyError:
            if mode == "all":
                # Fixed: the original message told the user to set <mode> as
                # 'all' to skip missing utterances, but 'present' is the mode
                # that skips them.
                raise Exception(
                    "Missing transcription in reference,set <mode> as 'present' to skip it."
                )
            else:
                missedSentences += 1
        else:
            sentences += 1
            refTrans = refTrans.split()
            hypTrans = hypTrans.split()
            ed, wds = pure_edit_distance(refTrans, hypTrans, ignore=ignore)
            allED += ed
            words += wds
            if ed > 0:
                wrongSentences += 1

    if sentences == 0:
        raise Exception(
            "Missing all transcription in reference. We don't think it's a reasonable result. Check the file please."
        )

    return namedtuple("Score", [
        "editDistance", "words", "SER", "sentences", "wrongSentences", "missedSentences"
    ])(allED, words, wrongSentences / sentences, sentences, wrongSentences, missedSentences)
def pad_sequence(data, dim=0, maxLength=None, dtype='float32', padding='tail', truncating='tail', value=0.0):
    '''
    Pad sequence.

    Args:
        <data>: a list of NumPy arrays.
        <dim>: which dimension to pad. All other dimensions should be the same size.
        <maxLength>: if a sequence is longer than this threshold, truncate it.
        <dtype>: target dtype.
        <padding>: padding position, "head", "tail" or "random".
        <truncating>: truncating position, "head" or "tail".
        <value>: padding value.

    Return:
        a two-tuple: (a NumPy array, a list of (start,end) padding positions).
    '''
    declare.is_classes("data", data, (list, tuple))
    declare.is_non_negative_int("dim", dim)
    declare.not_void("data", data)
    declare.is_classes("value", value, (int, float))
    declare.is_instances("padding", padding, ["head", "tail", "random"])
    # Fixed: validate <truncating> itself — the original re-checked <padding> here,
    # so an invalid <truncating> slipped through while a "random" <padding> was
    # wrongly rejected.
    declare.is_instances("truncating", truncating, ["head", "tail"])
    if maxLength is not None:
        declare.is_positive_int("maxLength", maxLength)

    lengths = []
    newData = []
    exRank = None
    exOtherDims = None
    for i in data:
        # verify every array has the same rank and (apart from <dim>) the same shape
        declare.is_classes("data", i, np.ndarray)
        shape = i.shape
        if exRank is None:
            exRank = len(shape)
            assert dim < exRank, f"<dim> is out of range: {dim}>{exRank-1}."
        else:
            assert len(shape) == exRank, f"Arrays in <data> has different rank: {exRank}!={len(shape)}."

        if dim != 0:
            # transpose so the padding dimension comes first
            rank = [r for r in range(exRank)]
            rank[0] = dim
            rank[dim] = 0
            i = i.transpose(rank)

        if exOtherDims is None:
            exOtherDims = i.shape[1:]
        else:
            assert exOtherDims == i.shape[1:], f"Expect for sequential dimmension,All arrays in <data> has same shape but got: {exOtherDims}!={i.shape[1:]}."

        length = len(i)
        if maxLength is not None and length > maxLength:
            if truncating == "head":
                # Fixed: cutting at the head means keeping the LAST <maxLength>
                # frames. The original `i[maxLength:]` kept length-maxLength
                # frames, which exceeds <maxLength> whenever length > 2*maxLength.
                i = i[length - maxLength:, ...]
            else:
                i = i[0:maxLength, ...]

        lengths.append(len(i))
        newData.append(i)

    maxLength = max(lengths)
    batchSize = len(newData)

    # Start from a batch filled with the padding value, then paste each sequence in.
    result = np.array(value, dtype=dtype) * np.ones([batchSize, maxLength, *exOtherDims], dtype=dtype)

    pos = []
    for i in range(batchSize):
        length = lengths[i]
        if padding == "tail":
            result[i][0:length] = newData[i]
            pos.append((0, length))
        elif padding == "head":
            start = maxLength - length
            result[i][start:] = newData[i]
            pos.append((start, maxLength))
        else:
            # place the sequence at a random offset
            start = random.randint(0, maxLength - length)
            end = start + length
            result[i][start:end] = newData[i]
            pos.append((start, end))

    if dim != 0:
        # move the padded dimension back to its original axis
        # (shifted by one because of the new leading batch axis)
        exRank = len(result.shape)
        rank = [r for r in range(exRank)]
        rank[1] = dim + 1
        rank[dim + 1] = 1
        result = result.transpose(rank)

    return result, pos
def add(self,name,dtype,abbr=None,default=None,choices=None,minV=None,maxV=None,discription=None):
    '''
    Add a new option.

    Args:
        _name_: a string which must have a format such as "--exkaldi" (but "--help" is inavaliable exceptionally.).
        _dtype_: float, int, str or bool.
        _abbr_: None or an abbreviation of name which must have a format such as "-e" (but "-h" is inavaliable exceptionally.).
        _default_: the default value or a list/tuple of values.
        _choices_: a list/tuple of allowed values.
        _minV_: set the minimum value if dtype is int or float. Enabled when _choices_ is None.
        _maxV_: set the maximum value if dtype is int or float. Enabled when _choices_ is None.
        _discription_: a string to describe this option.
    '''
    self.__capture()

    # check option name
    declare.is_valid_string("name",name)
    name = name.strip()
    self.__detect_special_char(name)
    assert name[0:2] == "--" and name[2:3] != "-", f"Option name must start with '--' but got: {name}."
    assert name != "--help", "Option name is inavaliable: --help."
    if name in self.__arguments.keys():
        raise WrongOperation(f"Option name has existed: {name}.")

    # check dtype
    declare.is_instances("option dtype", dtype, (float,int,bool,str))

    # check abbreviation
    if abbr is not None:
        declare.is_valid_string("abbr",abbr)
        abbr = abbr.strip()
        self.__detect_special_char(abbr)
        assert abbr[0:1] == "-" and abbr[1:2] != "-", f"Abbreviation must start with '-' but got: {abbr}."
        assert abbr != "-h", "Abbreviation is inavaliable: -h."
        if abbr in self.__abb2Name.keys():
            raise WrongOperation(f"Abbreviation has existed: {abbr}.")

    # check default value: must match dtype (element-wise for list/tuple defaults)
    if default is not None:
        if isinstance(default,(list,tuple)):
            declare.members_are_classes(f"Default value of {name}", default, dtype)
        else:
            declare.is_classes(f"Default value of {name}", default, dtype)
        if dtype == str:
            self.__detect_special_char(default)

    # check choices: dtype-checked, and the default (if any) must be among them
    if choices is not None:
        declare.is_classes(f"Choices of {name}", choices, (list,tuple))
        declare.members_are_classes(f"Choices of {name}", choices, dtype)
        if dtype == str:
            self.__detect_special_char(choices)
        if default is not None:
            if isinstance(default,(list,tuple)):
                declare.members_are_instances(f"Default value of {name}", default, choices)
            else:
                declare.is_instances(f"Default value of {name}", default, choices)

    # check boundary values: only numeric options, and mutually exclusive with choices
    if minV is not None or maxV is not None:
        assert dtype in [float,int], f"Only float and int option can set the boundary but {name} is {dtype.__name__}."
        assert choices is None, f"Cannot set choices and boundary concurrently: {name}."

        if minV is not None:
            declare.is_classes(f"Minimum value of {name}", minV, dtype)
            if default is not None:
                if isinstance(default, (list,tuple)):
                    for v in default:
                        declare.greater_equal(f"Default value of {name}", v, "minimum expected value", minV)
                else:
                    declare.greater_equal(f"Default of {name}", default, "minimum expected value", minV)

        if maxV is not None:
            declare.is_classes(f"Maximum value of {name}", maxV, dtype)
            if default is not None:
                if isinstance(default,(list,tuple)):
                    for v in default:
                        declare.less_equal(f"Default value of {name}", v, "maximum expected value", maxV)
                else:
                    declare.less_equal(f"Default value of {name}", default, "maximum expected value", maxV)

        if minV is not None and maxV is not None:
            declare.less_equal(f"Minimum value of {name}", minV, f"maximum value", maxV)

    # check discription
    if discription is not None:
        declare.is_valid_string(f"Discription of {name}", discription)
        self.__detect_special_char(discription)

    # register the option spec and its abbreviation mappings
    self.__arguments[name] = self.spec(dtype,default,choices,minV,maxV,discription)
    self.__name2Abb[name] = abbr
    if abbr is not None:
        self.__abb2Name[abbr] = name
def load(self, filePath):
    '''
    Load arguments from file.

    Args:
        _filePath_: args file path. The file is a discription block followed by
                    blank-line-separated option blocks of "key = value" lines.
    '''
    declare.is_file("filePath", filePath)
    self.reset()

    with open(filePath, "r", encoding="utf-8") as fr:
        lines = fr.read()
    lines = lines.strip()
    if len(lines) == 0:
        raise WrongOperation(f"This is a void file: {filePath}.")

    blocks = lines.split("\n\n")

    def __parse(name, value, dtype):
        # cast a raw string to <dtype>, raising a readable error on failure
        if dtype in [float, int]:
            try:
                value = dtype(value)
            except ValueError:
                raise WrongOperation(f"Option <{name}> need a {dtype.__name__} value but choices got: {value}.")
        elif dtype == bool:
            if value.lower() == "true":
                value = True
            # Fixed: the original compared an undefined name `c` here, which
            # raised NameError for any boolean value other than "true".
            elif value.lower() == "false":
                value = False
            else:
                raise WrongOperation(f"Option <{name}> need a bool value but choices got: {value}.")
        return value

    # the first block is the free-form discription
    self.__discription = blocks[0].strip()

    for blockNo, block in enumerate(blocks[1:], start=1):
        block = block.strip()
        if len(block) == 0:
            continue
        block = block.split("\n")

        # 1. match options
        values = {"name": None, "abbr": None, "dtype": None, "default": None,
                  "choices": None, "minV": None, "maxV": None, "discription": None, "value": None}
        for m in block:
            m = m.strip()
            assert "=" in m, f"Augument should has format: key = value, but got: {m}."
            assert len(m.split("=")) == 2, f"Augument should has format: key = value, but got: {m}."
            m = m.split("=")
            name = m[0].strip()
            value = m[1].strip()
            declare.is_instances("Option key", name, list(values.keys()))
            values[name] = value

        for key, value in values.items():
            # Fixed: the original formatted an undefined name `lineNo` in this
            # message (NameError); report the block number instead.
            assert value is not None, f"Missed {key} information in block: {blockNo}."

        # 2. parse
        name = values["name"]

        # parse the dtype firstly
        declare.is_instances("dtype", values["dtype"], ["float", "int", "bool", "str"])
        values["dtype"] = eval(values["dtype"])
        dtype = values["dtype"]

        # then parse the choices
        choices = values["choices"]
        if choices in ["none", "None"]:
            choices = None
        else:
            choices = choices.split("|")
            for i, c in enumerate(choices):
                choices[i] = __parse(name, c, dtype)
        values["choices"] = choices

        # then parse the boundary values
        boundary = {"minV": None, "maxV": None}
        for i in boundary.keys():
            V = values[i]
            if V not in ["none", "None"]:
                assert dtype in [float, int], f"Only float and int option can set the boundary but {name} is {dtype.__name__}:"
                assert choices is None, f"{name} cannot set choices and boundary concurrently."
                # decide whether the raw string is an int, a float, or neither
                toIntFlag = True
                toFloatFlag = True
                try:
                    float(V)
                except ValueError:
                    toFloatFlag = False
                try:
                    int(V)
                except ValueError:
                    toIntFlag = False

                if toIntFlag is False and toFloatFlag is False:
                    raise WrongDataFormat(f"Boundary values of {name} should be a int or float value but got: {V}.")
                elif toIntFlag is False and toFloatFlag is True:
                    # the value is float-only; it must match a float dtype
                    if dtype != float:
                        raise WrongDataFormat(f"{name}'s dtype is int but try to set boundary value with a float value: {V}.")
                    else:
                        V = float(V)
                elif toIntFlag is True and toFloatFlag is True:
                    # the value parses as both; follow the declared dtype
                    V = dtype(V)
                else:
                    raise WrongDataFormat(f"Failed to set {name}'s boundary value: {V}.")
                boundary[i] = V
        values["minV"] = boundary["minV"]
        values["maxV"] = boundary["maxV"]

        # then parse the default and value
        if values["default"].lower() == "none":
            values["default"] = None
        else:
            default = values["default"].split("|")
            for i, v in enumerate(default):
                default[i] = __parse(name, v, dtype)
            values["default"] = default if len(default) > 1 else default[0]

        # the judgement of "default" will be done by .parse() function, so here we only verify "value"
        if values["value"].lower() == "none":
            values["value"] = None
        else:
            value = values["value"].split("|")
            for i, v in enumerate(value):
                v = __parse(name, v, dtype)
                if values["choices"] is not None:
                    declare.is_instances("Option value", v, values["choices"])
                else:
                    if values["minV"] is not None:
                        declare.greater_equal("Option value", v, "minimum expected value", values["minV"])
                    if values["maxV"] is not None:
                        declare.less_equal("Option value", v, "maximum expected value", values["maxV"])
                value[i] = v
            if len(value) == 1:
                value = value[0]
            values["value"] = value

        # check abbreviation
        if values["abbr"] in ["none", "None"]:
            values["abbr"] = None

        # add this option
        self.add(name=values["name"],
                 dtype=values["dtype"],
                 abbr=values["abbr"],
                 default=values["default"],
                 choices=values["choices"],
                 minV=values["minV"],
                 maxV=values["maxV"],
                 discription=values["discription"])

        # finally, modify the "value"
        self.__arguments[values["name"]] = self.__arguments[values["name"]]._replace(value=values["value"])
        if values["value"] is not None:
            self.__setattr__(values["name"], values["value"])
def run_kaldi_commands_parallel(resources,cmdPattern,analyzeResult=True,timeout=ExKaldiInfo.timeout,generateArchive=None,archiveNames=None):
	'''
	Map resources to a command pattern and run the resulting Kaldi command(s), possibly in parallel.

	Each "{name}" placeholder in <cmdPattern> is substituted with the corresponding
	resource. In-memory exkaldi objects are serialized either to the subprocess's
	stdin (single process, no pipe in the pattern) or to temporary files managed by
	a FileHandleManager, which are cleaned up when this function returns.

	Args:
		<resources>: a dict whose keys are resource names and whose values are lists of resource objects.
					For example: {"feat": [BytesFeat01,BytesFeat02,... ],"outFile":{"newFeat01.ark","newFeat02.ark",...} }.
					The "outFile" resource is necessary; the number of output files decides the
					number of parallel processes.
					When there is only one process to run,"outFile" can be "-" which means the standard output stream.
		<cmdPattern>: a string with "{name}" placeholders to map the resources into.
					For example: "copy-feat {feat} ark:{outFile}".
		<analyzeResult>: if True, raise KaldiProcessError when any process exits non-zero.
					Forced to True when <generateArchive> is used.
		<timeout>: seconds passed to run_shell_command_parallel (multi-process path only).
		<generateArchive>: one of "feat","cmvn","ali","fmllr" or None. When set, outputs are
					wrapped as exkaldi archive / index-table objects instead of raw triples.
		<archiveNames>: name(s) for the generated archives: None (use <generateArchive>),
					a single string (reused for every process), or a list/tuple with one
					name per process.

	Return:
		Single process: an exkaldi bytes archive (if <generateArchive> and outFile=="-"),
		an index table (if <generateArchive> and outFile is a real file), or a triple
		(return code,error info,output file or buffer).
		Multiple processes: a list of such triples (or index tables when <generateArchive> is set).

	Raises:
		WrongDataFormat, WrongOperation, UnsupportedType, KaldiProcessError.
	'''
	declare.kaldi_existed()
	declare.is_classes("resources",resources,dict)
	declare.is_classes("cmdPattern",cmdPattern,str)
	assert "outFile" in resources.keys(),"<outFile> key and value is necessary in recources."

	declare.members_are_classes("the values of resources",resources.values(),[list,tuple])
	if generateArchive is not None:
		analyzeResult = True #forcely analyze the result

	# Check the format of the command pattern: collect the index of every brace,
	# then walk them pairwise to extract each "{name}" placeholder.
	nameIndexs = [ i for i,c in enumerate(cmdPattern) if c == "{" or c == "}" ]
	assert len(nameIndexs)%2 == 0,f"The numbers of braces do not match in command pattern: '{cmdPattern}'. "
	auxiliaryInfo = {}
	for i in range(0,len(nameIndexs),2):
		name = cmdPattern[nameIndexs[i]+1:nameIndexs[i+1]]
		if name not in resources:
			raise WrongDataFormat(f"Resource is necessary but has not been provided: {name}.")
		# Remember the character immediately before the placeholder (e.g. ':' of "ark:{x}",
		# '=' of "--opt={x}", or ' '): it is used later to validate how each resource type
		# may be referenced in the pattern.
		prefix = "" if nameIndexs[i] == 0 else cmdPattern[nameIndexs[i]-1]
		if name in auxiliaryInfo.keys():
			# auxiliaryInfo[name] is [occurrence count, concatenated set of distinct prefixes]
			auxiliaryInfo[name][0] += 1
			if not prefix in auxiliaryInfo[name][1]:
				auxiliaryInfo[name][1] += prefix
		else:
			auxiliaryInfo[name] = [1,prefix]

	assert "outFile" in auxiliaryInfo.keys(),"Key: <outFile> is necessary in command pattern."
	_outFileCountInfo = auxiliaryInfo.pop("outFile")
	assert _outFileCountInfo[0] == 1,f"Only allow <outFile> appear one time in command pattern but: {_outFileCountInfo[0]}."
	outFiles = resources.pop("outFile")

	for outFile in outFiles:
		if outFile != "-":
			# Create parent directories of each real output file in advance.
			make_dependent_dirs(outFile,pathIsFile=True)
	# One process per requested output file.
	parallel = len(outFiles)

	if generateArchive is not None:
		declare.is_instances("generateArchive",generateArchive,["feat","cmvn","ali","fmllr"])
		# Normalize <archiveNames> to a list with exactly one name per process.
		if archiveNames is None:
			archiveNames = [ generateArchive for i in range(parallel)]
		elif isinstance(archiveNames,str):
			archiveNames = [ archiveNames for i in range(parallel)]
		elif isinstance(archiveNames,(list,tuple)):
			declare.equal("the number of achieve names",len(archiveNames),"parallel",parallel)
		else:
			raise UnsupportedType(f"<archiveNames> should be string or list or tuple but got: {type_name(archiveNames)}.")

	# Regulate resources and run. All temporary files live inside this context
	# and are released when it exits.
	with FileHandleManager() as fhm:

		newResources = {}
		if parallel == 1:
			# Single-process path: one in-memory resource may be streamed to the
			# command's stdin instead of a temp file, but only when the command
			# has no shell pipe (a pipe would make stdin routing ambiguous).
			# Detect whether there is PIPE in command pattern.
			testPlaceholder = dict( (key,value[0]) if isinstance(value[0],str) else (key,"placeholder") for key,value in resources.items() )
			testPlaceholder["outFile"] = "placeholder"
			testCmd = cmdPattern.format(**testPlaceholder)
			if "|" in testCmd:
				inputsBuffer = False
			else:
				inputsBuffer = True
			del testPlaceholder

			# Regulate resources. NOTE: inputsBuffer starts as a bool sentinel
			# (True = stdin still available); the first eligible resource replaces
			# it with its serialized payload, so later resources fall through to
			# temp files. Statement order matters here.
			for key,countPrefix in auxiliaryInfo.items():
				count,prefix = countPrefix
				target = resources[key][0]

				# If target is a list-table,we can not automatically decide whether it is scp-format or ark-format.
				# So you should appoint it in the command pattern (e.g. "scp:{x}" or "--opt={x}").
				if type_name(target) in ["ListTable","Transcription"]:
					if prefix not in [":","="]:
						errMes = f"There might miss prefix such as 'ark:' or 'scp:' or '--option=' in command pattern before resource: {key}."
						errMes += "Check the command line please. If you still think there dose not need the prefix,"
						errMes += "save this ListTable or Transcription into file and instead it will this file name."
						errMes += "In that case,we will skip checking the prefix."
						raise WrongOperation(errMes)

					target = target.sort()
					# Stream via stdin only if stdin is free and this resource appears once.
					if (inputsBuffer is True) and count == 1:
						inputsBuffer = target.save()
						newResources[key] = "-"
					else:
						targetTemp = fhm.create("w+",encoding="utf-8")
						target.save(targetTemp)
						newResources[key] = f"{targetTemp.name}"

				# If target is an index-table,we automatically recognize it as scp-file,so you do not need appoint it.
				elif type_name(target) == "IndexTable":
					if prefix != " ":
						errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
						errMes += f"Because we will decide the prefix depending on its data type."
						raise WrongOperation(errMes)

					target = target.sort()
					if (inputsBuffer is True) and count == 1:
						inputsBuffer = target.save()
						newResources[key] = "scp:-"
					else:
						targetTemp = fhm.create("w+",suffix=".scp",encoding="utf-8")
						target.save(targetTemp)
						newResources[key] = f"scp:{targetTemp.name}"

				elif isinstance(target,(str,int,float)):
					# File name or other plain value parameter: substituted verbatim.
					newResources[key] = f"{target}"

				elif isinstance(target,(BytesMatrix,BytesVector)):
					# Bytes-format archive: written as an ark file (or streamed raw to stdin).
					if prefix != " ":
						errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
						errMes += f"Because we will decide the prefix depending on its data type."
						raise WrongOperation(errMes)

					target = target.sort()
					if (inputsBuffer is True) and count == 1:
						inputsBuffer = target.data
						newResources[key] = "ark:-"
					else:
						targetTemp = fhm.create("wb+",suffix=".ark")
						target.save(targetTemp)
						newResources[key] = f"ark:{targetTemp.name}"

				elif isinstance(target,(NumpyMatrix,NumpyVector)):
					# Numpy-format archive: converted to bytes form first, then handled like ark data.
					if prefix != " ":
						errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
						errMes += f"Because we will decide the prefix depending on its data type."
						raise WrongOperation(errMes)

					target = target.sort()
					if (inputsBuffer is True) and count == 1:
						inputsBuffer = target.to_bytes().data
						newResources[key] = "ark:-"
					else:
						target = target.to_bytes()
						targetTemp = fhm.create("wb+",suffix=".ark")
						target.save(targetTemp)
						newResources[key] = f"ark:{targetTemp.name}"

				elif isinstance(target,BytesArchive):
					# Generic bytes archive: no rspecifier prefix is added.
					if (inputsBuffer is True) and count == 1:
						inputsBuffer = target.data
						newResources[key] = "-"
					else:
						targetTemp = fhm.create("wb+")
						target.save(targetTemp)
						newResources[key] = f"{targetTemp.name}"

				else:
					raise UnsupportedType(f"<target> should be IndexTable,ListTable,file name,int or float value,or exkaldi achieve object but got: {type_name(target)}.")

			# Then,process output stream
			outFile = outFiles[0]
			newResources["outFile"] = outFile

			# If inputsBuffer is still a bool sentinel, nothing is streamed via stdin.
			inputsBuffer = None if isinstance(inputsBuffer,bool) else inputsBuffer
			# Then run the command.
			finalCmd = cmdPattern.format(**newResources)
			out,err,cod = run_shell_command(finalCmd,stdin="PIPE",stdout="PIPE",stderr="PIPE",inputs=inputsBuffer)

			if analyzeResult:
				if cod != 0:
					# Shorten the failing command to the tool names of each pipe stage for the error message.
					finalCmd = ",".join([cmd.strip().split(maxsplit=1)[0] for cmd in finalCmd.split("|")])
					raise KaldiProcessError(f"Failed to run Kaldi command: {finalCmd}.",err.decode())

			if outFile == "-":
				# Output captured on stdout: optionally wrap it as an exkaldi bytes archive.
				if generateArchive is not None:
					if generateArchive == "feat":
						out = BytesFeat(data=out,name=archiveNames[0])
					elif generateArchive == "ali":
						out = BytesAliTrans(data=out,name=archiveNames[0])
					elif generateArchive == "cmvn":
						out = BytesCMVN(data=out,name=archiveNames[0])
					else:
						out = BytesFmllr(data=out,name=archiveNames[0])
					return out
				else:
					return (cod,err,out)
			else:
				# Output written to a real file: optionally return its index table.
				if generateArchive is not None:
					return load_index_table(outFile,name=archiveNames[0],useSuffix="ark")
				else:
					return (cod,err,outFile)

		else:
			# Multi-process path. In this case,all input IO streams must be files
			# (stdin cannot be shared between processes), so every in-memory
			# resource is saved to a temp file.
			for key,countPrefix in auxiliaryInfo.items():
				count,prefix = countPrefix
				values = resources[key]
				newValues = []
				for target in values:

					# If target is scp resource
					if type_name(target) in ["ListTable","Transcription"]:
						if prefix not in [":","="]:
							errMes = f"There might miss prefix such as 'ark:' or 'scp:' or '--option=' in command pattern before resource: {key}."
							errMes += "Check the command line please. If you still think there dose not need the prefix,"
							errMes += "save this ListTable or Transcription into file and instead it will this file name."
							errMes += "In that case,we will skip checking the prefix."
							raise WrongOperation(errMes)

						target = target.sort()
						targetTemp = fhm.create("w+",encoding="utf-8")
						target.save(targetTemp)
						newValues.append(f"{targetTemp.name}")

					elif type_name(target) == "IndexTable":
						# Index tables are always referenced as scp files; the prefix is added automatically.
						if prefix != " ":
							errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
							errMes += f"Because we will decide the prefix depending on its data type."
							raise WrongOperation(errMes)

						target = target.sort()
						targetTemp = fhm.create("w+",suffix=".scp",encoding="utf-8")
						target.save(targetTemp)
						newValues.append(f"scp:{targetTemp.name}")

					elif isinstance(target,(str,float,int)):
						# File name or other plain value parameter: substituted verbatim.
						newValues.append(f"{target}")

					elif isinstance(target,(BytesMatrix,BytesVector)):
						if prefix != " ":
							errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
							errMes += f"Because we will decide the prefix depending on its data type."
							raise WrongOperation(errMes)

						target = target.sort()
						targetTemp = fhm.create("wb+",suffix=".ark")
						target.save(targetTemp)
						newValues.append(f"ark:{targetTemp.name}")

					elif isinstance(target,(NumpyMatrix,NumpyVector)):
						if prefix != " ":
							errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
							errMes += f"Because we will decide the prefix depending on its data type."
							raise WrongOperation(errMes)

						# Convert numpy-format data to bytes format before saving as ark.
						target = target.sort().to_bytes()
						targetTemp = fhm.create("wb+",suffix=".ark")
						target.save(targetTemp)
						newValues.append(f"ark:{targetTemp.name}")

					elif isinstance(target,BytesArchive):
						targetTemp = fhm.create("wb+")
						target.save(targetTemp)
						newValues.append(f"{targetTemp.name}")

					else:
						raise UnsupportedType(f"<target> should be IndexTable,ListTable,Transcription,file,int or float values or exkaldi achieve object but got: {type_name(target)}.")

				newResources[key] = newValues

			newResources["outFile"] = outFiles

			# Assign these resources to each process and generate one command per process:
			# process i gets the i-th item of every resource list.
			parallelResources = []
			for i in range(parallel):
				parallelResources.append({})
				for key,items in newResources.items():
					parallelResources[-1][key] = items[i]
			cmds = [ cmdPattern.format(**re) for re in parallelResources ]

			# run
			flags = run_shell_command_parallel(cmds,timeout=timeout)

			finalResult = []
			done = True
			for index,info in enumerate(flags):
				cod,err = info
				if analyzeResult and cod != 0:
					# Print each failure's stderr immediately; the summary error is raised after the loop.
					print(f"{index}/{len(flags)} error tracking")
					print(err.decode())
					done = False
				finalResult.append( (cod,err,outFiles[index]) )

			if analyzeResult and (not done):
				# Shorten the first command to its pipe-stage tool names for the error message.
				finalCmd = ",".join([cmd.strip().split(maxsplit=1)[0] for cmd in cmds[0].split("|")])
				raise KaldiProcessError(f"Failed to run Kaldi command: {finalCmd}. Look the error messages above.")
			else:
				if generateArchive is not None:
					# Replace each result triple with the index table of its output file.
					for i,fileName in enumerate(outFiles):
						finalResult[i] = load_index_table(fileName,name=archiveNames[i],useSuffix="ark")

				return finalResult