def use_cmvn_sliding(feat, windowSize=None, std=False):
    '''
    Apply sliding-window CMVN statistics.

    Args:
        <feat>: exkaldi feature object.
        <windowSize>: window size. If None, use a window size greater than or equal to the maximum number of frames in the feature.
        <std>: a bool value. If True, also normalize the variance.

    Return:
        exkaldi feature object.
    '''
    declare.is_classes("feat", feat, ["BytesFeature", "NumpyFeature"])
    declare.is_bool("std", std)

    if windowSize is None:
        featLen = feat.lens[1]
        maxLen = max([length for utt, length in featLen])
        windowSize = math.ceil(maxLen/100)*100
    else:
        declare.is_positive_int("windowSize", windowSize)

    if std:
        std = 'true'
    else:
        std = 'false'

    cmd = f'apply-cmvn-sliding --cmn-window={windowSize} --min-cmn-window=100 --norm-vars={std} ark:- ark:-'
    out, err, cod = run_shell_command(cmd, stdin="PIPE", stdout="PIPE", stderr="PIPE", inputs=feat.data)

    if cod != 0:
        print(err.decode())
        raise KaldiProcessError("Failed to compute sliding CMVN.")

    newName = f"cmvn({feat.name},{windowSize})"
    return BytesFeature(out, name=newName, indexTable=None)
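# Usage sketch (illustrative; not part of the library source). It assumes exkaldi and
# Kaldi are installed, that "wav.scp" lists real recordings, and that compute_mfcc is
# the feature-extraction entry point (an assumption, not confirmed by this file).
def _demo_use_cmvn_sliding():
    import exkaldi
    feat = exkaldi.compute_mfcc("wav.scp", name="mfcc")    # assumed MFCC extraction API
    # With windowSize=None the window is auto-sized from the longest utterance.
    normFeat = use_cmvn_sliding(feat, windowSize=None, std=False)
    return normFeat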
def make_dependent_dirs(path, pathIsFile=True):
    '''
    Make the dependent directories for a path if they do not exist.

    Args:
        <path>: a file path or folder path.
        <pathIsFile>: a bool value declaring whether <path> is a file path or a folder path.
    '''
    declare.is_valid_string("path", path)
    declare.is_bool("pathIsFile", pathIsFile)

    path = os.path.abspath(path.strip())

    if pathIsFile:
        if os.path.isdir(path):
            raise WrongPath(
                f"<path> is specified as a file but it already exists as a directory: {path}. You can remove it then try again."
            )
        else:
            dirPath = os.path.dirname(path)
    else:
        if os.path.isfile(path):
            raise WrongPath(
                f"<path> is specified as a directory but it already exists as a file: {path}. You can remove it then try again."
            )
        else:
            dirPath = path

    if not os.path.isdir(dirPath):
        try:
            os.makedirs(dirPath)
        except Exception as e:
            print(f"Failed to make directory: {dirPath}.")
            raise e
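# Usage sketch (illustrative; not part of the library source). The paths below are
# only examples.
def _demo_make_dependent_dirs():
    # Ensure "exp/decode/" exists so that "result.txt" can be written into it.
    make_dependent_dirs("exp/decode/result.txt", pathIsFile=True)
    # Ensure the folder itself exists when the path is a directory.
    make_dependent_dirs("exp/log", pathIsFile=False)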
def load_transcription(target, name="transcription", checkSpace=True):
    '''
    Load a transcription from file.

    Args:
        <target>: transcription file path.
        <name>: a string.
        <checkSpace>: a bool value. If True, check the validity of the number of spaces.

    Return:
        An exkaldi Transcription object.
    '''
    declare.is_classes("target", target, ["dict", "Transcription", "ListTable", "str"])
    declare.is_bool("checkSpace", checkSpace)

    if isinstance(target, str):
        declare.is_file("target", target)
        with open(target, "r", encoding="utf-8") as fr:
            lines = fr.readlines()
        result = Transcription(name=name)
        for index, line in enumerate(lines, start=1):
            t = line.strip().split(maxsplit=1)
            if len(t) < 2:
                print(f"Line Number: {index}")
                print(f"Line Content: {line}")
                raise WrongDataFormat("Missing the key or value information.")
            else:
                result[t[0]] = t[1]
    else:
        for utt, utterance in target.items():
            declare.is_valid_string("utterance ID", utt)
            declare.is_valid_string("utterance", utterance)
        result = Transcription(target, name=name)

    if checkSpace:
        sampleText = result.subset(nRandom=100)
        spaceCount = 0
        for key, value in sampleText.items():
            spaceCount += value.count(" ")
        if spaceCount < len(sampleText) // 2:
            errMes = "The transcription does not seem to be separated by spaces, or the utterances are extremely short. "
            errMes += "If it actually has the right format, set <checkSpace>=False and run this function again."
            raise WrongDataFormat(errMes)

    return result
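# Usage sketch (illustrative; not part of the library source). It assumes
# "data/train/text" is a Kaldi-style transcription file with lines of the form
# "<utterance-ID> <word sequence>".
def _demo_load_transcription():
    trans = load_transcription("data/train/text", name="trainTrans")
    for uttID, sentence in trans.items():    # Transcription behaves like a dict
        print(uttID, sentence)
        break
    return trans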
def full_scores_sentence(self, sentence, bos=True, eos=True):
    '''
    Generate full scores (prob, ngram length, oov).

    Args:
        <sentence>: a string without boundary symbols.
        <bos>: If True, add <s> to the head.
        <eos>: If True, add </s> to the tail.

    Return:
        an iterator of (prob, ngram length, oov).
    '''
    declare.is_valid_string("sentence", sentence)
    declare.is_bool("bos", bos)
    declare.is_bool("eos", eos)

    return self.__model.full_scores(sentence, bos, eos)
def score_sentence(self, sentence, bos=True, eos=True):
    '''
    Score a sentence.

    Args:
        <sentence>: a string without boundary symbols.
        <bos>: If True, add <s> to the head.
        <eos>: If True, add </s> to the tail.

    Return:
        a float value.
    '''
    declare.is_valid_string("sentence", sentence)
    declare.is_bool("bos", bos)
    declare.is_bool("eos", eos)

    return self.__model.score(sentence, bos, eos)
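# Usage sketch (illustrative; not part of the library source). It assumes `model` is
# an instance of the class that owns score_sentence and full_scores_sentence, i.e. a
# loaded n-gram language model object.
def _demo_score_sentence(model):
    sentence = "HELLO WORLD"
    prob = model.score_sentence(sentence, bos=True, eos=True)                  # sentence log-probability
    details = list(model.full_scores_sentence(sentence, bos=True, eos=True))   # per-word (prob, ngram length, oov)
    return prob, details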
def compress_gz_file(filePath, overWrite=False, keepSource=False):
    '''
    Compress a file to a gz file.

    Args:
        <filePath>: file path.
        <overWrite>: If True, overwrite the gz file when it already exists.
        <keepSource>: If True, retain the source file.

    Return:
        the path of the compressed file.
    '''
    declare.is_file("filePath", filePath)
    declare.is_bool("overWrite", overWrite)
    declare.is_bool("keepSource", keepSource)

    filePath = os.path.abspath(filePath)
    if filePath.endswith(".gz"):
        raise WrongOperation(f"Cannot compress a .gz file: {filePath}.")
    else:
        outFile = filePath + ".gz"

    if os.path.isfile(outFile):
        if overWrite is True:
            os.remove(outFile)
        else:
            raise WrongOperation(
                f"File already exists: {outFile}. If you want to overwrite it, set the option <overWrite>=True."
            )

    if keepSource:
        cmd = f"gzip -k {filePath}"
    else:
        cmd = f"gzip {filePath}"

    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if cod != 0:
        print(err.decode())
        raise ShellProcessError("Failed to compress file.")
    else:
        return outFile
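# Usage sketch (illustrative; not part of the library source). The ARPA file path is
# only an example; gzip must be available on the system.
def _demo_compress_gz_file():
    gzPath = compress_gz_file("lm/3gram.arpa", overWrite=True, keepSource=True)
    print(gzPath)    # "lm/3gram.arpa.gz"
    return gzPath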
def use_cmvn(feat, cmvn, utt2spk=None, std=False, outFile=None):
    '''
    Apply CMVN statistics to feature.

    Share Args:
        Null

    Parallel Args:
        <feat>: exkaldi feature or index table object.
        <cmvn>: exkaldi CMVN statistics or index table object.
        <utt2spk>: file path or ListTable object.
        <std>: If True, also apply variance normalization.
        <outFile>: output file name.

    Return:
        feature or index table object.
    '''
    feats, cmvns, utt2spks, stds, outFiles = check_multiple_resources(feat, cmvn, utt2spk, std, outFile=outFile)

    names = []
    for i, feat, cmvn, utt2spk, std in zip(range(len(outFiles)), feats, cmvns, utt2spks, stds):
        # verify feature and cmvn
        declare.is_feature("feat", feat)
        declare.is_cmvn("cmvn", cmvn)
        # verify utt2spk
        if utt2spk is not None:
            declare.is_potential_list_table("utt2spk", utt2spk)
        # verify std
        declare.is_bool("std", std)
        #stds[i] = "true" if std else "false"
        names.append(f"cmvn({feat.name},{cmvn.name})")

    if utt2spks[0] is None:
        cmdPattern = 'apply-cmvn --norm-vars={std} {cmvn} {feat} ark:{outFile}'
        resources = {"feat":feats, "cmvn":cmvns, "std":stds, "outFile":outFiles}
    else:
        cmdPattern = 'apply-cmvn --norm-vars={std} --utt2spk=ark:{utt2spk} {cmvn} {feat} ark:{outFile}'
        resources = {"feat":feats, "cmvn":cmvns, "utt2spk":utt2spks, "std":stds, "outFile":outFiles}

    return run_kaldi_commands_parallel(resources, cmdPattern, analyzeResult=True, generateArchive="feat", archiveNames=names)
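# Usage sketch (illustrative; not part of the library source). It assumes exkaldi
# exposes compute_mfcc and compute_cmvn_stats (both names are assumptions) and that
# "wav.scp", "spk2utt" and "utt2spk" exist in Kaldi format.
def _demo_use_cmvn():
    import exkaldi
    feat = exkaldi.compute_mfcc("wav.scp", name="mfcc")           # assumed API
    cmvn = exkaldi.compute_cmvn_stats(feat, spk2utt="spk2utt")    # assumed API
    # Apply per-speaker mean (and optionally variance) normalization.
    normFeat = use_cmvn(feat, cmvn, utt2spk="utt2spk", std=False)
    return normFeat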
def __init__(self, indexTable, processFunc, batchSize, chunks='auto', otherArgs=None, shuffle=False, retainData=0.0):
    declare.is_index_table("indexTable", indexTable)
    declare.is_callable("processFunc", processFunc)
    declare.is_positive_int("batchSize", batchSize)
    declare.is_bool("shuffle", shuffle)
    declare.in_boundary("retainData", retainData, minV=0.0, maxV=0.9)

    self.processFunc = processFunc
    self._batchSize = batchSize
    self.otherArgs = otherArgs
    self._shuffle = shuffle
    self._chunks = chunks

    if chunks != 'auto':
        declare.is_positive_int("chunks", chunks)

    totalDataNumber = len(indexTable)
    trainDataNumber = int(totalDataNumber * (1 - retainData))
    evalDataNumber = totalDataNumber - trainDataNumber

    scpTable = indexTable.shuffle()
    self.trainTable = scpTable.subset(nHead=trainDataNumber)
    self.evalTable = scpTable.subset(nTail=evalDataNumber)

    if chunks == 'auto':
        # Compute the number of chunks automatically
        sampleTable = self.trainTable.subset(nHead=10)
        meanSize = sum([indexInfo.dataSize for indexInfo in sampleTable.values()]) / 10
        autoChunkSize = math.ceil(104857600 / meanSize)  # 100 MB = 102400 KB = 104857600 B
        self._chunks = trainDataNumber // autoChunkSize
        if self._chunks == 0:
            self._chunks = 1

    self.make_dataset_bag(shuffle=False)
    self._epoch = 0

    self.load_dataset(0)
    self.currentDataset = self.nextDataset
    self.nextDataset = None

    self.epochSize = len(self.currentDataset)
    self.countEpochSizeFlag = True

    self.currentPosition = 0
    self.currentEpochPosition = 0
    self._isNewEpoch = False
    self._isNewChunk = False
    self.datasetIndex = 0

    if self._chunks > 1:
        self.datasetIndex = 1
        self.loadDatasetThread = threading.Thread(target=self.load_dataset, args=(1,))
        self.loadDatasetThread.start()
def __init__(self, indexTable, processFunc, batchSize, chunks='auto', otherArgs=None, shuffle=False, retainData=0.0):
    '''
    Args:
        _indexTable_: an ExKaldi IndexTable object whose <filePath> info is necessary.
        _processFunc_: a function that receives an IndexTable object and returns an iterable dataset object.
                It needs at least two arguments: (the data iterator itself, an IndexTable object of one chunk of data).
        _batchSize_: mini-batch size.
        _chunks_: how many chunks to split the data into.
        _otherArgs_: other arguments to send to <processFunc>.
        _shuffle_: If True, shuffle the batch data.
        _retainData_: a proportion value: how much data to retain for evaluation.
    '''
    declare.is_index_table("indexTable", indexTable)
    declare.is_callable("processFunc", processFunc)
    declare.is_positive_int("batchSize", batchSize)
    declare.is_bool("shuffle", shuffle)
    declare.in_boundary("retainData", retainData, minV=0.0, maxV=0.9)

    self.__processFunc = processFunc
    self.__batchSize = batchSize
    self.__otherArgs = otherArgs
    self.__shuffle = shuffle
    self.__chunks = chunks

    if chunks != 'auto':
        declare.is_positive_int("chunks", chunks)

    totalDataNumber = len(indexTable)
    trainDataNumber = int(totalDataNumber * (1 - retainData))
    evalDataNumber = totalDataNumber - trainDataNumber

    scpTable = indexTable.shuffle()
    self.__trainTable = scpTable.subset(nHead=trainDataNumber)
    if evalDataNumber > 0:
        self.__evalTable = scpTable.subset(nTail=evalDataNumber)
    else:
        self.__evalTable = None

    if chunks == 'auto':
        # Compute the number of chunks automatically
        sampleTable = self.__trainTable.subset(nHead=10)
        meanSize = sum([indexInfo.dataSize for indexInfo in sampleTable.values()]) / 10
        autoChunkSize = math.ceil(104857600 / meanSize)  # 100 MB = 102400 KB = 104857600 B
        self.__chunks = trainDataNumber // autoChunkSize
        if self.__chunks == 0:
            self.__chunks = 1

    # split the train dataset into N chunks
    self.__make_dataset_bag(shuffle=False)

    # initialize some parameters
    self.__epoch = 0
    self.__currentPosition = 0
    self.__currentEpochPosition = 0
    self.__isNewEpoch = False
    self.__isNewChunk = False
    self.__datasetIndex = 0

    # load the first chunk of data
    self.__load_dataset(0)
    self.__currentDataset = self.__nextDataset
    self.__nextDataset = None

    # accumulate counts
    self.__epochSize = len(self.__currentDataset)
    self.__countEpochSizeFlag = True

    # try to load the next chunk
    if self.__chunks > 1:
        self.__datasetIndex = 1
        self.__loadDatasetThread = threading.Thread(target=self.__load_dataset, args=(1,))
        self.__loadDatasetThread.start()
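# Usage sketch (illustrative; not part of the library source). The enclosing class
# name is not shown above, so "DataIterator" is assumed here; the process function
# receives (the iterator itself, an IndexTable of one chunk) and must return an
# iterable dataset.
def _demo_data_iterator(indexTable):
    def process_func(iterator, chunkTable):
        # Hypothetical: simply expose the chunk's (uttID, indexInfo) pairs as the dataset.
        return list(chunkTable.items())
    dataIter = DataIterator(indexTable, process_func, batchSize=64,
                            chunks='auto', shuffle=True, retainData=0.1)
    return dataIter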
def tuple_dataset(archives, frameLevel=False):
    '''
    Tuple feature or alignment archives at the "utterance" level or the "frame" level.

    Args:
        <archives>: exkaldi feature or alignment objects.
        <frameLevel>: If True, tuple data at the frame level. Otherwise, at the utterance level.

    Return:
        a list of tupled data.
    '''
    declare.is_classes("archives", archives, (tuple, list))
    assert len(archives) > 1, "<archives> should have multiple items."
    declare.is_bool("frameLevel", frameLevel)

    archives = match_utterances(archives)

    fields = {}
    for index, ark in enumerate(archives):
        if frameLevel is True:
            declare.belong_classes("archives", ark, (BytesMatrix, BytesVector, NumpyMatrix, NumpyVector))
        else:
            declare.belong_classes("archives", ark, (BytesMatrix, BytesVector, NumpyMatrix, NumpyVector, ListTable))

        if isinstance(ark, (BytesMatrix, BytesVector)):
            ark = ark.to_numpy()

        if ark.name not in fields.keys():
            fields[ark.name] = []
        fields[ark.name].append(ark)

    fieldNames = list(fields.keys())

    try:
        if frameLevel:
            templet = namedtuple(typename="TupledData", field_names=["key", "frameID"] + fieldNames)
        else:
            templet = namedtuple(typename="TupledData", field_names=["key"] + fieldNames)
    except ValueError as e:
        e.args = ('When tupling data, the "name" of each archive is used as its identity ID, so it must be a valid Python identifier. ' +
                  'You can use the ".rename()" method to rename it and try this function again.' + "\n" +
                  e.args[0],)
        raise e

    def align_tuple_data_to_frame(key, record, templet):
        if isinstance(record[0], list):
            frameSize = len(record[0][0])
        else:
            frameSize = len(record[0])

        for re in record[1:]:
            if isinstance(re, list):
                for sr in re:
                    if len(sr) != frameSize:
                        raise WrongOperation(f"Cannot tuple data with different frame lengths to frame level: {frameSize}!={len(sr)}.")
            else:
                if len(re) != frameSize:
                    raise WrongOperation(f"Cannot tuple data with different frame lengths to frame level: {frameSize}!={len(re)}.")

        result = []
        for frameIndex in range(frameSize):
            new = []
            for re in record:
                if isinstance(re, list):
                    fieldR = []
                    for sr in re:
                        fieldR.append(sr[frameIndex])
                    new.append(fieldR)
                else:
                    new.append(re[frameIndex:frameIndex+1])
            result.append(templet(key, frameIndex, *new))

        return result

    result = []
    for key in archives[0].keys():
        oneRecord = []
        for field in fieldNames:
            fieldData = []
            for ark in fields[field]:
                fieldData.append(ark.data[key])
            if len(fieldData) == 1:
                fieldData = fieldData[0]
            oneRecord.append(fieldData)

        if frameLevel:
            result.extend(align_tuple_data_to_frame(key, oneRecord, templet))
        else:
            result.append(templet(key, *oneRecord))

    return result
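# Usage sketch (illustrative; not part of the library source). It assumes `feat` and
# `ali` are exkaldi archives (e.g. features and frame-level alignments) covering the
# same utterances; each archive's .name must be a valid Python identifier because it
# becomes a namedtuple field.
def _demo_tuple_dataset(feat, ali):
    # Frame-level tupling yields one record per frame: (key, frameID, feat, ali).
    frameData = tuple_dataset([feat, ali], frameLevel=True)
    # Utterance-level tupling yields one record per utterance: (key, feat, ali).
    uttData = tuple_dataset([feat, ali], frameLevel=False)
    return frameData, uttData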