def log_softmax(data, axis=1):
    '''
    The log-softmax function.

    Args:
        <data>: a Numpy array.
        <axis>: the dimension to softmax.

    Return:
        A new array.
    '''
    declare.is_classes("data", data, np.ndarray)
    if len(data.shape) == 1:
        axis = 0
    declare.in_boundary("axis", axis, 0, len(data.shape) - 1)

    dataShape = list(data.shape)
    dataShape[axis] = 1
    maxValue = data.max(axis, keepdims=True)
    dataNor = data - maxValue

    dataExp = np.exp(dataNor)
    dataExpSum = np.sum(dataExp, axis)
    dataExpSumLog = np.log(dataExpSum) + maxValue.reshape(dataExpSum.shape)

    return data - dataExpSumLog.reshape(dataShape)
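# A minimal, standalone sanity sketch (not part of the original API; the helper
# name _check_log_softmax is hypothetical): the exponentiated rows of a correct
# log-softmax sum to 1, and subtracting the per-row maximum first keeps the
# computation numerically stable for large logits.
def _check_log_softmax():
    import numpy as np

    x = np.random.randn(4, 10).astype("float32")
    y = log_softmax(x, axis=1)
    # exp(log_softmax(x)) should be a proper probability distribution per row.
    assert np.allclose(np.exp(y).sum(axis=1), 1.0, atol=1e-5)
    return y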
def ctc_greedy_search(prob, vocabs, blankID=None):
    '''
    The best path decoding algorithm.

    Args:
        <prob>: an exkaldi probability object. This probability should be an output of a neural network trained with a CTC loss function.
        <vocabs>: a list of vocabulary.
        <blankID>: specify the ID of the blank symbol. If None, use the last dimension of <prob>.

    Return:
        An exkaldi Transcription object of decoding results.
    '''
    declare.is_classes("vocabs", vocabs, list)
    declare.is_probability("prob", prob)

    if type_name(prob) == "BytesProb":
        prob = prob.to_numpy()
    elif type_name(prob) == "IndexTable":
        prob = prob.read_record("prob").to_numpy()

    probDim = prob.dim
    if len(vocabs) == probDim:
        if blankID is None:
            blankID = probDim - 1
        declare.is_positive_int("blankID", blankID)
        declare.in_boundary("blankID", blankID, 0, probDim - 1)
    elif len(vocabs) == probDim - 1:
        if blankID is None:
            blankID = probDim - 1
        else:
            assert blankID == probDim - 1, f"The dimensionality of probability is {probDim} but there are only {len(vocabs)} words. In this case, blank ID must be {probDim-1} but got {blankID}."
    else:
        raise WrongDataFormat(f"The dimensionality of probability {probDim} does not match the number of words {len(vocabs)}.")

    results = Transcription(name="bestPathResult")
    for utt, pb in prob.items():
        declare.is_classes("prob", pb, np.ndarray)
        declare.equal("the rank of matrix shape", len(pb.shape), "expected rank", 2)

        # Pick the best label for each frame, then collapse repeats and remove blanks.
        best_path = np.argmax(pb, 1)
        best_chars_collapsed = [vocabs[ID] for ID, _ in groupby(best_path) if ID != blankID]
        try:
            results[utt] = " ".join(best_chars_collapsed)
        except Exception as e:
            e.args = ("<vocabs> might have non-string items.\n" + e.args[0],)
            raise e

    return results
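# A minimal standalone sketch of the greedy collapse rule applied above, shown
# on a plain NumPy matrix without the exkaldi Transcription/declare machinery.
# The vocabulary, frame posteriors and helper name (_greedy_collapse_demo) are
# made-up illustration values.
def _greedy_collapse_demo():
    import numpy as np
    from itertools import groupby

    vocabs = ["a", "b", "c"]            # the blank symbol takes the last ID, 3
    frames = np.array([[0.1, 0.7, 0.1, 0.1],
                       [0.1, 0.7, 0.1, 0.1],
                       [0.1, 0.1, 0.1, 0.7],
                       [0.6, 0.1, 0.2, 0.1]])
    best_path = np.argmax(frames, axis=1)                        # [1, 1, 3, 0]
    collapsed = [vocabs[i] for i, _ in groupby(best_path) if i != 3]
    return " ".join(collapsed)                                    # "b a"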
def softmax(data, axis=1):
    '''
    The softmax function.

    Args:
        <data>: a Numpy array.
        <axis>: the dimension to softmax.

    Return:
        A new array.
    '''
    declare.is_classes("data", data, np.ndarray)
    if len(data.shape) == 1:
        axis = 0
    declare.in_boundary("axis", axis, 0, len(data.shape) - 1)

    maxValue = data.max(axis, keepdims=True)
    dataNor = data - maxValue

    dataExp = np.exp(dataNor)
    dataExpSum = np.sum(dataExp, axis, keepdims=True)

    return dataExp / dataExpSum
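# A small standalone illustration of why the maximum is subtracted before
# exponentiation in softmax and log_softmax above: the naive form overflows for
# large logits, while the shifted form stays finite. The helper name
# _softmax_stability_demo and the sample values are hypothetical.
def _softmax_stability_demo():
    import numpy as np

    x = np.array([1000.0, 1001.0, 1002.0])
    with np.errstate(over="ignore", invalid="ignore"):
        naive = np.exp(x) / np.exp(x).sum()                       # overflows to [nan, nan, nan]
    shifted = np.exp(x - x.max()) / np.exp(x - x.max()).sum()     # valid probabilities
    return naive, shifted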
def ctc_prefix_beam_search(prob, vocabs, blankID=None, beam=5, cutoff=0.999, strick=1.0, lmFile=None, alpha=1.0, beta=0):
    '''
    Prefix beam search decoding algorithm. LM score is supported.

    Args:
        <prob>: an exkaldi posterior probability object. This probability should be an output of a neural network trained with a CTC loss function. We expect the probability has not passed through any activation function, otherwise it may generate wrong results.
        <vocabs>: a list of vocabulary.
        <blankID>: specify the ID of the blank symbol. If None, use the last dimension of <prob>.
        <beam>: the beam size.
        <cutoff>: the cumulative probability threshold used to cut off dimensions whose probability is extremely small.
        <strick>: when the decoding results of two adjacent frames are the same, the probability of the latter will be reduced.
        <lmFile>: if not None, add language model scores to the beam.
        <alpha>: the weight of the LM score.
        <beta>: the length normalization weight of the LM score.

    Return:
        An exkaldi Transcription object of decoding results.
    '''
    declare.is_classes("vocabs", vocabs, [tuple, list])
    declare.is_probability("prob", prob)

    if type_name(prob) == "BytesProb":
        prob = prob.to_numpy()
    elif type_name(prob) == "IndexTable":
        prob = prob.read_record("prob").to_numpy()

    if lmFile is not None:
        declare.is_file("lmFile", lmFile)
    else:
        lmFile = "none"

    probDim = prob.dim
    if len(vocabs) == probDim:
        if blankID is None:
            blankID = probDim - 1
        declare.is_positive_int("blankID", blankID)
        declare.in_boundary("blankID", blankID, 0, probDim - 1)
    elif len(vocabs) == probDim - 1:
        if blankID is None:
            blankID = probDim - 1
        else:
            assert blankID == probDim - 1, f"The dimensionality of probability is {probDim} but there are only {len(vocabs)} words. In this case, blank ID must be {probDim-1} but got {blankID}."
    else:
        raise WrongDataFormat(f"The dimensionality of probability {probDim} does not match the number of words {len(vocabs)}.")

    for ID, word in enumerate(vocabs):
        if len(word.strip()) == 0:
            raise WrongDataFormat(f"Found an unavailable vocab: {word}.")

    num_classes = len(vocabs)
    vocabs = " ".join(vocabs)

    sources = [vocabs.encode()]
    uttTemp = []
    for utt, pb in prob.items():
        declare.is_classes("prob", pb, np.ndarray)
        declare.equal("the rank of matrix shape", len(pb.shape), "expected rank", 2)
        # The raw network output is softmaxed here before being serialized.
        pb = softmax(pb, axis=1)
        uttTemp.append(utt)
        sources.append(f" {pb.shape[0]} ".encode() + pb.astype("float32").tobytes())
    sources = b"".join(sources)

    cmd = os.path.join(sys.prefix, "exkaldisrc", "tools", "prefix_beam_search_decode")
    cmd += " --num_files {}".format(prob.lens[0])
    cmd += " --num_classes {}".format(num_classes)
    cmd += " --blank_id {}".format(blankID)
    cmd += " --lm_model {}".format(lmFile)
    cmd += " --beam_size {}".format(beam)
    cmd += " --cutoff_prob {}".format(cutoff)
    cmd += " --alpha {}".format(alpha)
    cmd += " --beta {}".format(beta)

    out, err, _ = run_shell_command(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, inputs=sources)

    if len(out) == 0:
        raise Exception("Failed to beam search decode.", err.decode())
    else:
        results = Transcription(name="beamSearchResults")
        out = out.decode().strip().split("file")
        for index, re in enumerate(out[1:]):
            re = re.strip().split("\n")
            if len(re) <= 1:
                results[uttTemp[index]] = ""
            else:
                results[uttTemp[index]] = " ".join(re[1].strip().split()[1:])

        return results
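# A compact, standalone reference sketch of the core CTC prefix beam search
# recursion. It omits the LM scoring, cutoff pruning and other refinements of
# the external prefix_beam_search_decode tool invoked above, and the helper
# name _prefix_beam_search_ref and its arguments are hypothetical.
def _prefix_beam_search_ref(probs, blankID, beam=5):
    # probs: a (frames, classes) NumPy array of per-frame posteriors (already softmaxed).
    from collections import defaultdict

    beams = {(): (1.0, 0.0)}  # prefix -> (prob ending in blank, prob ending in non-blank)
    for t in range(probs.shape[0]):
        nextBeams = defaultdict(lambda: (0.0, 0.0))
        for prefix, (pBlank, pNonBlank) in beams.items():
            for s in range(probs.shape[1]):
                p = probs[t, s]
                if s == blankID:
                    # A blank never extends the prefix.
                    b, nb = nextBeams[prefix]
                    nextBeams[prefix] = (b + p * (pBlank + pNonBlank), nb)
                elif prefix and s == prefix[-1]:
                    # A repeated symbol only extends the prefix via the blank-ending mass;
                    # otherwise it collapses into the same prefix.
                    b, nb = nextBeams[prefix + (s,)]
                    nextBeams[prefix + (s,)] = (b, nb + p * pBlank)
                    b, nb = nextBeams[prefix]
                    nextBeams[prefix] = (b, nb + p * pNonBlank)
                else:
                    b, nb = nextBeams[prefix + (s,)]
                    nextBeams[prefix + (s,)] = (b, nb + p * (pBlank + pNonBlank))
        # Keep only the <beam> most probable prefixes.
        beams = dict(sorted(nextBeams.items(), key=lambda kv: sum(kv[1]), reverse=True)[:beam])

    return list(max(beams.items(), key=lambda kv: sum(kv[1]))[0])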
def __init__(self, indexTable, processFunc, batchSize, chunks='auto', otherArgs=None, shuffle=False, retainData=0.0):

    declare.is_index_table("indexTable", indexTable)
    declare.is_callable("processFunc", processFunc)
    declare.is_positive_int("batchSize", batchSize)
    declare.is_bool("shuffle", shuffle)
    declare.in_boundary("retainData", retainData, minV=0.0, maxV=0.9)

    self.processFunc = processFunc
    self._batchSize = batchSize
    self.otherArgs = otherArgs
    self._shuffle = shuffle
    self._chunks = chunks

    if chunks != 'auto':
        declare.is_positive_int("chunks", chunks)

    totalDataNumber = len(indexTable)
    trainDataNumber = int(totalDataNumber * (1 - retainData))
    evalDataNumber = totalDataNumber - trainDataNumber

    scpTable = indexTable.shuffle()
    self.trainTable = scpTable.subset(nHead=trainDataNumber)
    self.evalTable = scpTable.subset(nTail=evalDataNumber)

    if chunks == 'auto':
        # Compute the chunks automatically.
        sampleTable = self.trainTable.subset(nHead=10)
        meanSize = sum([indexInfo.dataSize for indexInfo in sampleTable.values()]) / 10
        autoChunkSize = math.ceil(104857600 / meanSize)  # 100MB = 102400KB = 104857600 B
        self._chunks = trainDataNumber // autoChunkSize
        if self._chunks == 0:
            self._chunks = 1

    self.make_dataset_bag(shuffle=False)
    self._epoch = 0

    self.load_dataset(0)
    self.currentDataset = self.nextDataset
    self.nextDataset = None

    self.epochSize = len(self.currentDataset)
    self.countEpochSizeFlag = True

    self.currentPosition = 0
    self.currentEpochPosition = 0
    self._isNewEpoch = False
    self._isNewChunk = False
    self.datasetIndex = 0

    if self._chunks > 1:
        self.datasetIndex = 1
        self.loadDatasetThread = threading.Thread(target=self.load_dataset, args=(1,))
        self.loadDatasetThread.start()
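# A standalone numeric sketch of the 'auto' chunk rule used above: sample the
# first 10 records, take their mean size in bytes, and pick the number of chunks
# so that each chunk holds roughly 100 MB of training data. All sizes below are
# made-up illustration values, as is the helper name _auto_chunks_demo.
def _auto_chunks_demo(trainDataNumber=20000):
    import math

    sampleSizes = [3_000_000, 3_200_000, 2_800_000, 3_100_000, 2_900_000,
                   3_050_000, 2_950_000, 3_000_000, 3_150_000, 2_850_000]  # bytes
    meanSize = sum(sampleSizes) / 10
    autoChunkSize = math.ceil(104857600 / meanSize)          # records per ~100 MB chunk
    return max(1, trainDataNumber // autoChunkSize)          # number of chunks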
def __init__(self, indexTable, processFunc, batchSize, chunks='auto', otherArgs=None, shuffle=False, retainData=0.0):
    '''
    Args:
        _indexTable_: an ExKaldi IndexTable object whose <filePath> info is necessary.
        _processFunc_: a function that receives an IndexTable object and returns an iterable dataset object. It needs at least two arguments: the data iterator itself, and an IndexTable object holding one chunk of data.
        _batchSize_: mini batch size.
        _chunks_: how many chunks to split the data into.
        _otherArgs_: other arguments to send to <processFunc>.
        _shuffle_: if True, shuffle the batch data.
        _retainData_: a proportion value; how much data to retain for evaluation.
    '''
    declare.is_index_table("indexTable", indexTable)
    declare.is_callable("processFunc", processFunc)
    declare.is_positive_int("batchSize", batchSize)
    declare.is_bool("shuffle", shuffle)
    declare.in_boundary("retainData", retainData, minV=0.0, maxV=0.9)

    self.__processFunc = processFunc
    self.__batchSize = batchSize
    self.__otherArgs = otherArgs
    self.__shuffle = shuffle
    self.__chunks = chunks

    if chunks != 'auto':
        declare.is_positive_int("chunks", chunks)

    totalDataNumber = len(indexTable)
    trainDataNumber = int(totalDataNumber * (1 - retainData))
    evalDataNumber = totalDataNumber - trainDataNumber

    scpTable = indexTable.shuffle()
    self.__trainTable = scpTable.subset(nHead=trainDataNumber)
    if evalDataNumber > 0:
        self.__evalTable = scpTable.subset(nTail=evalDataNumber)
    else:
        self.__evalTable = None

    if chunks == 'auto':
        # Compute the chunks automatically.
        sampleTable = self.__trainTable.subset(nHead=10)
        meanSize = sum([indexInfo.dataSize for indexInfo in sampleTable.values()]) / 10
        autoChunkSize = math.ceil(104857600 / meanSize)  # 100MB = 102400KB = 104857600 B
        self.__chunks = trainDataNumber // autoChunkSize
        if self.__chunks == 0:
            self.__chunks = 1

    # Split the train dataset into N chunks.
    self.__make_dataset_bag(shuffle=False)

    # Initialize some parameters.
    self.__epoch = 0
    self.__currentPosition = 0
    self.__currentEpochPosition = 0
    self.__isNewEpoch = False
    self.__isNewChunk = False
    self.__datasetIndex = 0

    # Load the first chunk of data.
    self.__load_dataset(0)
    self.__currentDataset = self.__nextDataset
    self.__nextDataset = None

    # Accumulate counts.
    self.__epochSize = len(self.__currentDataset)
    self.__countEpochSizeFlag = True

    # Try to load the next chunk in the background.
    if self.__chunks > 1:
        self.__datasetIndex = 1
        self.__loadDatasetThread = threading.Thread(target=self.__load_dataset, args=(1,))
        self.__loadDatasetThread.start()
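# A minimal standalone sketch of the double-buffering pattern used above:
# consume the current chunk while a background thread loads the next one.
# This is a simplified illustration, not the exkaldi iterator itself; the class
# name _ChunkPrefetcher and its fake loader are hypothetical.
class _ChunkPrefetcher:
    def __init__(self, numChunks):
        import threading

        self.numChunks = numChunks
        self.nextChunk = None
        self.currentChunk = self._load(0)        # load the first chunk synchronously
        self.loadThread = None
        if numChunks > 1:
            # Start loading the second chunk while the first one is being consumed.
            self.loadThread = threading.Thread(target=self._load_next, args=(1,))
            self.loadThread.start()

    def _load(self, index):
        # Stand-in for reading one chunk of data from disk.
        return [f"chunk{index}-sample{i}" for i in range(3)]

    def _load_next(self, index):
        self.nextChunk = self._load(index)

    def swap(self):
        # Wait for the background load to finish and promote the next chunk.
        if self.loadThread is not None:
            self.loadThread.join()
        self.currentChunk, self.nextChunk = self.nextChunk, None
        return self.currentChunk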