Example #1
def log_softmax(data, axis=1):
    '''
    The log-softmax function.

    Args:
        <data>: a NumPy array.
        <axis>: the dimension along which to apply log-softmax.
    Return:
        A new array.
    '''
    declare.is_classes("data", data, np.ndarray)
    if len(data.shape) == 1:
        axis = 0
    declare.in_boundary("axis", axis, 0, len(data.shape) - 1)

    dataShape = list(data.shape)
    dataShape[axis] = 1
    maxValue = data.max(axis, keepdims=True)
    dataNor = data - maxValue

    dataExp = np.exp(dataNor)
    dataExpSum = np.sum(dataExp, axis)
    dataExpSumLog = np.log(dataExpSum) + maxValue.reshape(dataExpSum.shape)

    return data - dataExpSumLog.reshape(dataShape)
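For reference, the function computes log_softmax(x) = x - (log(sum(exp(x - max(x)))) + max(x)) along the chosen axis. A standalone NumPy check of that identity (the exkaldi `declare` calls are left out; the numbers are made up):

import numpy as np

x = np.array([[1.0, 2.0, 3.0],
              [0.0, 0.0, 0.0]])
m = x.max(axis=1, keepdims=True)                               # shift by the row max for stability
logsum = np.log(np.exp(x - m).sum(axis=1, keepdims=True)) + m
logp = x - logsum                                              # matches log_softmax(x, axis=1)
print(np.exp(logp).sum(axis=1))                                # each row sums to 1.0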
Example #2
def ctc_greedy_search(prob, vocabs, blankID=None):
    '''
    The best path decoding algorithm.

    Args:
        <prob>: an exkaldi probability object. This probability should be the output of a neural network trained with the CTC loss function.
        <vocabs>: a list of vocabulary symbols.
        <blankID>: the ID of the blank symbol. If None, use the last dimension of <prob>.
    Return:
        An exkaldi Transcription object of decoding results.
    '''
    declare.is_classes("vocabs", vocabs, list)

    declare.is_probability("prob", prob)
    if type_name(prob) == "BytesProb":
        prob = prob.to_numpy()
    elif type_name(prob) == "IndexTable":
        prob = prob.read_record("prob").to_numpy()

    probDim = prob.dim
    if len(vocabs) == probDim:
        if blankID is None:
            blankID = probDim - 1
        declare.is_positive_int("blankID", blankID)
        declare.in_boundary("blankID", blankID, 0, probDim - 1)
    elif len(vocabs) == probDim - 1:
        if blankID is None:
            blankID = probDim - 1
        else:
            assert blankID == probDim - 1, f"The dimension of the probability is {probDim} but there are only {len(vocabs)} words. In this case, blank ID must be {probDim-1} but got {blankID}."
    else:
        raise WrongDataFormat(
            f"The dimension of the probability, {probDim}, does not match the number of words, {len(vocabs)}."
        )

    results = Transcription(name="bestPathResult")
    for utt, pb in prob.items:
        declare.is_classes("prob", prob, np.ndarray)
        declare.is_classes("the rank of matrix shape", len(pb.shape),
                           "expected rank", 2)
        best_path = np.argmax(pb, 1)
        best_chars_collapsed = [
            vocabs[ID] for ID, _ in groupby(best_path) if ID != blankID
        ]
        try:
            results[utt] = " ".join(best_chars_collapsed)
        except Exception as e:
            e.args = ("<vocabs> might have non-string items.\n" + e.args[0], )
            raise e
    return results
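The core of the best-path rule above is: take the argmax symbol of every frame, merge runs of identical symbols, then drop blanks. A self-contained toy illustration of that collapse step (the vocabulary and frame posteriors below are made up; the real function additionally runs the exkaldi `declare` checks):

import numpy as np
from itertools import groupby

# Toy vocabulary of 3 symbols; the blank takes the extra last index (3).
vocabs = ["a", "b", "c"]
pb = np.array([[0.10, 0.10, 0.10, 0.70],   # frame 0 -> blank
               [0.80, 0.10, 0.05, 0.05],   # frame 1 -> "a"
               [0.70, 0.10, 0.10, 0.10],   # frame 2 -> "a" again, merged with frame 1
               [0.10, 0.80, 0.05, 0.05]])  # frame 3 -> "b"
blankID = pb.shape[1] - 1
best_path = np.argmax(pb, 1)                                   # argmax symbol per frame
collapsed = [vocabs[ID] for ID, _ in groupby(best_path) if ID != blankID]
print(" ".join(collapsed))                                     # -> "a b"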
Example #3
def softmax(data, axis=1):
    '''
    The softmax function.

    Args:
        <data>: a NumPy array.
        <axis>: the dimension along which to apply softmax.

    Return:
        A new array.
    '''
    declare.is_classes("data", data, np.ndarray)
    if len(data.shape) == 1:
        axis = 0
    declare.in_boundary("axis", axis, 0, len(data.shape) - 1)

    maxValue = data.max(axis, keepdims=True)
    dataNor = data - maxValue

    dataExp = np.exp(dataNor)
    dataExpSum = np.sum(dataExp, axis, keepdims=True)

    return dataExp / dataExpSum
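The same max-shifted computation can be reproduced with plain NumPy to confirm that every slice along <axis> sums to one (a standalone sketch, without the `declare` checks):

import numpy as np

x = np.array([[1.0, 2.0, 3.0],
              [10.0, 10.0, 10.0]])
shifted = x - x.max(axis=1, keepdims=True)     # subtract the row max for numerical stability
expx = np.exp(shifted)
probs = expx / expx.sum(axis=1, keepdims=True)
print(probs.sum(axis=1))                       # -> [1. 1.]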
Example #4
def ctc_prefix_beam_search(prob,
                           vocabs,
                           blankID=None,
                           beam=5,
                           cutoff=0.999,
                           strick=1.0,
                           lmFile=None,
                           alpha=1.0,
                           beta=0):
    '''
    Prefix beam search decoding algorithm. Lm score is supported.

    Args:
        <prob>: an exkaldi probability object. This probability should be the output of a neural network trained with the CTC loss function.
                The probability is expected not to have passed through any activation function, otherwise the results may be wrong.
        <vocabs>: a list of vocabulary symbols.
        <blankID>: the ID of the blank symbol. If None, use the last dimension of <prob>.
        <beam>: the beam size.
        <cutoff>: the cumulative-probability threshold used to cut off dimensions whose probability is extremely small.
        <strick>: when the decoding results of two adjacent frames are the same, the probability of the latter will be reduced.
        <lmFile>: if not None, add a language model score to the beam.
        <alpha>: the weight of the LM score.
        <beta>: the length-normalization weight of the LM score.
    Return:
        An exkaldi Transcription object of decoding results.  
    '''
    declare.is_classes("vocabs", vocabs, [tuple, list])

    declare.is_probability("prob", prob)
    if type_name(prob) == "BytesProb":
        prob = prob.to_numpy()
    elif type_name(prob) == "IndexTable":
        prob = prob.read_record("prob").to_numpy()

    if lmFile is not None:
        declare.is_file("lmFile", lmFile)
    else:
        lmFile = "none"

    probDim = prob.dim
    if len(vocabs) == probDim:
        if blankID is None:
            blankID = probDim - 1
        declare.is_positive_int("blankID", blankID)
        declare.in_boundary("blankID", blankID, 0, probDim - 1)

    elif len(vocabs) == probDim - 1:
        if blankID is None:
            blankID = probDim - 1
        else:
            assert blankID == probDim - 1, f"The dimension of the probability is {probDim} but there are only {len(vocabs)} words. In this case, blank ID must be {probDim-1} but got {blankID}."
    else:
        raise WrongDataFormat(
            f"The dimension of the probability, {probDim}, does not match the number of words, {len(vocabs)}."
        )

    for ID, word in enumerate(vocabs):
        if len(word.strip()) == 0:
            raise WrongDataFormat(f"Found a vocab {word} unavaliable.")

    num_classes = len(vocabs)
    vocabs = " ".join(vocabs)

    sources = [
        vocabs.encode(),
    ]
    uttTemp = []
    for utt, pb in prob.items:
        declare.is_classes("prob", pb, np.ndarray)
        declare.is_classes("the rank of matrix shape", len(pb.shape),
                           "expected rank", 2)
        uttTemp.append(utt)
        pb = softmax(pb, axis=1)
        sources.append(f" {pb.shape[0]} ".encode() +
                       pb.astype("float32").tobytes())

    sources = b"".join(sources)

    cmd = os.path.join(sys.prefix, "exkaldisrc", "tools",
                       "prefix_beam_search_decode")
    cmd += " --num_files {}".format(prob.lens[0])
    cmd += " --num_classes {}".format(num_classes)
    cmd += " --blank_id {}".format(blankID)
    cmd += " --lm_model {}".format(lmFile)
    cmd += " --beam_size {}".format(beam)
    cmd += " --cutoff_prob {}".format(cutoff)
    cmd += " --alpha {}".format(alpha)
    cmd += " --beta {}".format(beta)

    out, err, _ = run_shell_command(cmd,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    inputs=sources)

    if len(out) == 0:
        raise Exception("Failed to beam search decode.", err.decode())
    else:
        results = Transcription(name="beamSearchResults")
        out = out.decode().strip().split("file")
        for index, re in enumerate(out[1:]):
            re = re.strip().split("\n")
            if len(re) <= 1:
                results[uttTemp[index]] = ""
            else:
                results[uttTemp[index]] = " ".join(re[1].strip().split()[1:])

        return results
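A hypothetical call site, kept in comments because it needs real network outputs: `nnProb` would be an exkaldi probability object of raw (unactivated) outputs and `wordList` a Python list of symbols; all names and values here are illustrative, not part of exkaldi:

# hyp = ctc_prefix_beam_search(nnProb, wordList,
#                              blankID=None,       # None -> use the last dimension as blank
#                              beam=10,            # wider beam: slower but usually more accurate
#                              cutoff=0.999,       # keep dimensions covering 99.9% of the mass
#                              lmFile="lm.arpa",   # optional LM, weighted by alpha and beta
#                              alpha=1.0, beta=0)
# hyp would be an exkaldi Transcription keyed by utterance ID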
Example #5
    def __init__(self,
                 indexTable,
                 processFunc,
                 batchSize,
                 chunks='auto',
                 otherArgs=None,
                 shuffle=False,
                 retainData=0.0):

        declare.is_index_table("indexTable", indexTable)
        declare.is_callable("processFunc", processFunc)
        declare.is_positive_int("batchSize", batchSize)
        declare.is_bool("shuffle", shuffle)
        declare.in_boundary("retainData", retainData, minV=0.0, maxV=0.9)

        self.processFunc = processFunc
        self._batchSize = batchSize
        self.otherArgs = otherArgs
        self._shuffle = shuffle
        self._chunks = chunks

        if chunks != 'auto':
            declare.is_positive_int("chunks", chunks)

        totalDataNumber = len(indexTable)
        trainDataNumber = int(totalDataNumber * (1 - retainData))
        evalDataNumber = totalDataNumber - trainDataNumber
        scpTable = indexTable.shuffle()

        self.trainTable = scpTable.subset(nHead=trainDataNumber)
        self.evalTable = scpTable.subset(nTail=evalDataNumber)

        if chunks == 'auto':
            #Compute the chunks automatically
            sampleTable = self.trainTable.subset(nHead=10)
            meanSize = sum(
                [indexInfo.dataSize
                 for indexInfo in sampleTable.values()]) / 10
            autoChunkSize = math.ceil(
                104857600 / meanSize)  # 100MB = 102400KB = 104857600 B
            self._chunks = trainDataNumber // autoChunkSize
            if self._chunks == 0:
                self._chunks = 1

        self.make_dataset_bag(shuffle=False)
        self._epoch = 0

        self.load_dataset(0)
        self.currentDataset = self.nextDataset
        self.nextDataset = None

        self.epochSize = len(self.currentDataset)
        self.countEpochSizeFlag = True

        self.currentPosition = 0
        self.currentEpochPosition = 0
        self._isNewEpoch = False
        self._isNewChunk = False
        self.datasetIndex = 0

        if self._chunks > 1:
            self.datasetIndex = 1
            self.loadDatasetThread = threading.Thread(target=self.load_dataset,
                                                      args=(1, ))
            self.loadDatasetThread.start()
Example #6
    def __init__(self,
                 indexTable,
                 processFunc,
                 batchSize,
                 chunks='auto',
                 otherArgs=None,
                 shuffle=False,
                 retainData=0.0):
        '''
        Args:
            _indexTable_: an ExKaldi IndexTable object whose <filePath> info is necessary.
            _processFunc_: a function that receives an IndexTable object and returns an iterable dataset object.
                           It needs at least two arguments: the data iterator itself and an IndexTable object of one chunk of data.
            _batchSize_: the mini-batch size.
            _chunks_: how many chunks to split the data into.
            _otherArgs_: other arguments to send to <processFunc>.
            _shuffle_: if True, shuffle the batch data.
            _retainData_: a proportion value specifying how much data to retain for evaluation.
        '''
        declare.is_index_table("indexTable", indexTable)
        declare.is_callable("processFunc", processFunc)
        declare.is_positive_int("batchSize", batchSize)
        declare.is_bool("shuffle", shuffle)
        declare.in_boundary("retainData", retainData, minV=0.0, maxV=0.9)

        self.__processFunc = processFunc
        self.__batchSize = batchSize
        self.__otherArgs = otherArgs
        self.__shuffle = shuffle
        self.__chunks = chunks

        if chunks != 'auto':
            declare.is_positive_int("chunks", chunks)

        totalDataNumber = len(indexTable)
        trainDataNumber = int(totalDataNumber * (1 - retainData))
        evalDataNumber = totalDataNumber - trainDataNumber
        scpTable = indexTable.shuffle()

        self.__trainTable = scpTable.subset(nHead=trainDataNumber)
        if evalDataNumber > 0:
            self.__evalTable = scpTable.subset(nTail=evalDataNumber)
        else:
            self.__evalTable = None

        if chunks == 'auto':
            #Compute the chunks automatically
            sampleTable = self.__trainTable.subset(nHead=10)
            meanSize = sum(
                [indexInfo.dataSize
                 for indexInfo in sampleTable.values()]) / 10
            autoChunkSize = math.ceil(
                104857600 / meanSize)  # 100MB = 102400KB = 104857600 B
            self.__chunks = trainDataNumber // autoChunkSize
            if self.__chunks == 0:
                self.__chunks = 1

        # split train dataset into N chunks
        self.__make_dataset_bag(shuffle=False)

        # initialize some parameters
        self.__epoch = 0
        self.__currentPosition = 0
        self.__currentEpochPosition = 0
        self.__isNewEpoch = False
        self.__isNewChunk = False
        self.__datasetIndex = 0

        # load the first chunk data
        self.__load_dataset(0)
        self.__currentDataset = self.__nextDataset
        self.__nextDataset = None

        # accumulate counts
        self.__epochSize = len(self.__currentDataset)
        self.__countEpochSizeFlag = True

        # try to load the next chunk
        if self.__chunks > 1:
            self.__datasetIndex = 1
            self.__loadDatasetThread = threading.Thread(
                target=self.__load_dataset, args=(1, ))
            self.__loadDatasetThread.start()
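A hedged sketch of how this constructor might be used, following the docstring's contract for _processFunc_; the class name `DataIterator`, the helper `load_chunk_features`, and all values are assumptions for illustration only:

def process_func(iterator, chunkTable):
    # <chunkTable> is the IndexTable of one chunk; build and return an iterable
    # dataset from it. The call below is a hypothetical feature-loading helper.
    return load_chunk_features(chunkTable)

# dataIter = DataIterator(indexTable=featIndexTable,   # an exkaldi IndexTable with <filePath> info
#                         processFunc=process_func,
#                         batchSize=64,
#                         chunks='auto',                # aim for roughly 100 MB of data per chunk
#                         shuffle=True,
#                         retainData=0.05)              # hold back 5% of utterances for evaluation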