def decode(probs, alpha=1.0, beta=0.0, beam=100, method='clm2', clm=None):
    """Decode `probs` with the decoder selected by `method`.

    Args:
        probs: handed unchanged to the selected decoder.
            NOTE(review): presumably the per-frame output distribution of
            the acoustic model -- confirm against the callers.
        alpha: forwarded as `alpha` to every decoder except 'pmax'.
            NOTE(review): looks like the language-model weight -- confirm.
        beta: forwarded as `beta` to every decoder except 'pmax'.
            NOTE(review): looks like a length/insertion bonus -- confirm.
        beam: forwarded as `beam` to every decoder except 'pmax'.
        method: name of the decoder to run:
            'pmax' -- ctc.decode_best_path (pointwise argmax),
            'bg'   -- bg_decoder.decode_bg_lm with the prefix tree and the
                      LM attached to it,
            'clm'  -- clm_decoder.decode_clm,
            'clm2' -- clm_decoder2.decode_clm (additionally receives the
                      character vocabulary unpickled from
                      CHAR_CORPUS_VOCAB_FILE),
            'fast' -- decode_lm_wrapper.
        clm: character language model passed to 'clm' / 'clm2'; ignored by
            the other methods.

    Returns:
        Tuple (hyp, hypScore, refScore, align).  hypScore stays None for
        'pmax', align stays None for every method except 'pmax', and
        refScore is always None because reference scoring is not
        implemented.

    Raises:
        ValueError: if `method` is not one of the names listed above.
    """
    hypScore = None
    align = None
    # TODO: reference scoring was never wired up (ctc.score_sentence could
    # not be found), so refScore is always returned as None.
    refScore = None

    # print(...) with a single parenthesised argument behaves the same under
    # the Python 2 print statement and the Python 3 print function.
    if method == 'pmax':
        # Pointwise argmax
        hyp, align = ctc.decode_best_path(probs)
    elif method == 'bg':
        # Imported lazily so the other methods do not pay for this module.
        import prefixTree
        # Bigram LM w/ prefix tree dictionary constraint
        print('Loading prefix tree (this can take a while)...')
        pt = prefixTree.loadPrefixTree()
        lm = pt.lm
        print('Done loading prefix tree.')
        tic = time.time()
        hyp, hypScore = bg_decoder.decode_bg_lm(probs,
                                                pt,
                                                lm,
                                                beam=beam,
                                                alpha=alpha,
                                                beta=beta)
        toc = time.time()
        print('decoding time (wall): %f' % (toc - tic))
    elif method == 'clm':
        import clm_decoder
        # Character LM
        # NOTE need to restructure decoders into classes
        hyp, hypScore = clm_decoder.decode_clm(probs,
                                               clm,
                                               beam=beam,
                                               alpha=alpha,
                                               beta=beta)
    elif method == 'clm2':
        import clm_decoder2
        # Only this decoder consumes the vocabulary, so it is loaded here
        # instead of on every call; the `with` block closes the file handle.
        # NOTE: pickle must only ever be pointed at a trusted file.
        with open(CHAR_CORPUS_VOCAB_FILE, 'rb') as vocab_file:
            char_inds = pickle.load(vocab_file)
        # Character LM
        # NOTE need to restructure decoders into classes
        hyp, hypScore = clm_decoder2.decode_clm(probs,
                                                clm,
                                                beam=beam,
                                                alpha=alpha,
                                                beta=beta,
                                                char_inds=char_inds)
    elif method == 'fast':
        hyp, hypScore = decode_lm_wrapper(probs, beam, alpha, beta)
    else:
        # A real exception instead of `assert False`: asserts are removed
        # under `python -O`, which would let execution reach the return
        # statement with `hyp` unbound.
        raise ValueError('No such decoding method: %s' % method)

    return hyp, hypScore, refScore, align
示例#2
0
def decode(probs, alpha=1.0, beta=0.0, beam=100, method='clm2', clm=None):
    """Run the decoder named by `method` on `probs`.

    Args:
        probs: passed through unchanged to the chosen decoder.
            NOTE(review): presumably the network's per-frame output
            probabilities -- confirm against the callers.
        alpha: forwarded as `alpha` to all decoders except 'pmax'.
            NOTE(review): looks like the language-model weight -- confirm.
        beta: forwarded as `beta` to all decoders except 'pmax'.
            NOTE(review): looks like a length/insertion bonus -- confirm.
        beam: forwarded as `beam` to all decoders except 'pmax'.
        method: decoder selector:
            'pmax' -- ctc.decode_best_path (pointwise argmax),
            'bg'   -- bg_decoder.decode_bg_lm using the prefix tree and the
                      LM stored on it,
            'clm'  -- clm_decoder.decode_clm,
            'clm2' -- clm_decoder2.decode_clm, which also gets the character
                      vocabulary unpickled from CHAR_CORPUS_VOCAB_FILE,
            'fast' -- decode_lm_wrapper.
        clm: character language model used by 'clm' and 'clm2' only.

    Returns:
        Tuple (hyp, hypScore, refScore, align).  hypScore is None for
        'pmax'; align is None unless method is 'pmax'; refScore is always
        None because reference scoring is not implemented.

    Raises:
        ValueError: if `method` does not name a known decoder.
    """
    hypScore = None
    align = None
    # TODO: reference scoring is not implemented (ctc.score_sentence could
    # not be found); refScore is therefore always None.
    refScore = None

    # Single-argument print(...) is valid both as a Python 2 print statement
    # and as a Python 3 function call.
    if method == 'pmax':
        # Pointwise argmax
        hyp, align = ctc.decode_best_path(probs)
    elif method == 'bg':
        # Lazy import: only this branch needs the prefix tree module.
        import prefixTree
        # Bigram LM w/ prefix tree dictionary constraint
        print('Loading prefix tree (this can take a while)...')
        pt = prefixTree.loadPrefixTree()
        lm = pt.lm
        print('Done loading prefix tree.')
        tic = time.time()
        hyp, hypScore = bg_decoder.decode_bg_lm(probs, pt, lm, beam=beam,
                                                alpha=alpha, beta=beta)
        toc = time.time()
        print('decoding time (wall): %f' % (toc - tic))
    elif method == 'clm':
        import clm_decoder
        # Character LM
        # NOTE need to restructure decoders into classes
        hyp, hypScore = clm_decoder.decode_clm(probs, clm, beam=beam,
                                               alpha=alpha, beta=beta)
    elif method == 'clm2':
        import clm_decoder2
        # The vocabulary is needed by this decoder only, so it is read here
        # rather than on every call, and the file is closed deterministically.
        # NOTE: pickle must only ever be pointed at a trusted file.
        with open(CHAR_CORPUS_VOCAB_FILE, 'rb') as vocab_file:
            char_inds = pickle.load(vocab_file)
        # Character LM
        # NOTE need to restructure decoders into classes
        hyp, hypScore = clm_decoder2.decode_clm(probs, clm, beam=beam,
                                                alpha=alpha, beta=beta,
                                                char_inds=char_inds)
    elif method == 'fast':
        hyp, hypScore = decode_lm_wrapper(probs, beam, alpha, beta)
    else:
        # Raise explicitly: `assert False` disappears under `python -O` and
        # the function would then fail later on an unbound `hyp`.
        raise ValueError('No such decoding method: %s' % method)

    return hyp, hypScore, refScore, align
示例#3
0
    def costAndGrad(self, data, labels=None):
        """Forward-propagate `data`, then either decode or compute CTC
        cost and gradients.

        Args:
            data: 2-D array; `data.shape[1]` is taken as the number of time
                steps T, and the temporal loops below index columns by t.
            labels: passed to ctc.ctc_loss together with blank=0.  Not used
                when self.train is false.

        Returns:
            When self.train is false: whatever ctc.decode_best_path returns
            for the softmax output.
            Otherwise: the tuple (cost, self.grad, skip), where cost and
            skip come from ctc.ctc_loss.  If skip is truthy the method
            returns before the backward pass, so self.grad is not written
            by this call.
        """

        T = data.shape[1]
        self.setViews(T)

        # With a temporal layer the last entry of self.stack / self.grad is
        # split off: its first element is the recurrent weight matrix (wt)
        # resp. its gradient (dwt); the second element is ignored.
        # grad / dwt are only bound in training mode -- the non-training
        # path returns before they would be used.
        if self.temporalLayer > 0:
            stack = self.stack[:-1]
            wt, _ = self.stack[-1]
            if self.train:
                grad = self.grad[:-1]
                dwt, _ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad

        # forward prop
        self.hActs[0].assign(cm.CUDAMatrix(data))

        # hActs[i] receives the output of stack[i - 1] applied to
        # hActs[i - 1].
        i = 1
        for w, b in stack:
            cm.dot(w, self.hActs[i - 1], self.hActs[i])
            self.hActs[i].add_col_vec(b)

            # forward prop through time
            if i == self.temporalLayer:
                # NOTE(review): minmax presumably clips column `col` to
                # [0, maxAct] in place, and mvdot_col_slice presumably adds
                # wt . hActs[i][:, t - 1] into column t (beta=1.0) --
                # confirm against the cudamat extension used here.
                for t in xrange(1, T):
                    self.hActs[i].minmax(0.0, self.maxAct, col=t - 1)
                    cm.mvdot_col_slice(wt,
                                       self.hActs[i],
                                       t - 1,
                                       self.hActs[i],
                                       t,
                                       beta=1.0)
                # The loop only treats columns 0..T-2; handle the last one.
                self.hActs[i].minmax(0.0, self.maxAct, col=T - 1)

            # The temporal layer already got its nonlinearity above, and the
            # layer with i > numLayers is left linear for the softmax.
            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        # (per-column max goes to rowVec; probs = hActs[-1] - rowVec)
        self.hActs[-1].max(axis=0, target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

        # Softmax
        # exp in place, then scale every column by 1 / (its column sum).
        cm.exp(self.probs)
        self.probs.sum(axis=0, target=self.rowVec)
        cm.pow(self.rowVec, -1.0, target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        # Bring the result to the host so probs.numpy_array can be read.
        self.probs.copy_to_host()
        if not self.train:
            return ctc.decode_best_path(
                self.probs.numpy_array.astype(np.float64))

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(
            np.float64),
                                          labels,
                                          blank=0)

        # ctc_loss flagged this input to be skipped: no backward pass.
        if skip:
            return cost, self.grad, skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        # deltasIn holds the error signal at the output of layer i and
        # deltasOut receives the signal for its input; the two are swapped
        # at the end of every iteration.
        i = self.numLayers
        deltasIn, deltasOut = self.deltasC, self.deltasOut
        for w, b in reversed(stack):
            # compute gradient
            cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
            deltasIn.sum(axis=1, target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T, deltasIn, target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                # NOTE(review): `within` presumably writes a 0/1 mask of the
                # entries inside the (0, maxAct) range, i.e. the derivative
                # of the clipped activation -- confirm.
                self.hActs[i].within(0.0, self.maxAct, target=self.tmpGrad)
                self.deltaTemp.assign(0.0)
                for t in xrange(T - 1, 0, -1):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wt.T,
                                       self.deltaTemp,
                                       t,
                                       deltasOut,
                                       t,
                                       beta=1.0)

                    # Push through activation fn
                    deltasOut.mult_slice(t, self.tmpGrad, t)
                    # Column t - 1 of deltaTemp stores the delta of step t,
                    # lining it up with the activation of step t - 1.
                    self.deltaTemp.set_single_col(t - 1, deltasOut, t)

                # Accumulate temporal gradient
                cm.dot(self.deltaTemp, self.hActs[i].T, target=dwt)

                # Finish the recursion for column 0, which the loop above
                # does not reach.
                cm.mvdot_col_slice(wt.T,
                                   self.deltaTemp,
                                   0,
                                   deltasOut,
                                   0,
                                   beta=1.0)
                deltasOut.mult_slice(0, self.tmpGrad, 0)

            # Derivative of the hard relu for the non-temporal layers.
            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            # After the first iteration stop using deltasC and switch to the
            # deltasIn / deltasOut buffer pair.
            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn, deltasOut = deltasOut, deltasIn
            i -= 1

        return cost, self.grad, skip
示例#4
0
    def costAndGrad(self, data, labels=None):
        """Forward-propagate `data`, then either decode or compute CTC
        cost and gradients.

        Args:
            data: 2-D array; `data.shape[1]` is taken as the number of time
                steps T, and the temporal loops below index columns by t.
            labels: passed to ctc.ctc_loss together with blank=0.  Not used
                when self.train is false.

        Returns:
            When self.train is false: whatever ctc.decode_best_path returns
            for the softmax output.
            Otherwise: the tuple (cost, self.grad, skip), where cost and
            skip come from ctc.ctc_loss.  If skip is truthy the method
            returns before the backward pass, so self.grad is not written
            by this call.
        """
        T = data.shape[1]
        self.setViews(T)

        # With a temporal layer the last entry of self.stack / self.grad is
        # split off: its first element is the recurrent weight matrix (wt)
        # resp. its gradient (dwt).  grad / dwt are only bound in training
        # mode; the non-training path returns before they would be used.
        if self.temporalLayer > 0:
            stack = self.stack[:-1]
            wt, _ = self.stack[-1]
            if self.train:
                grad = self.grad[:-1]
                dwt, _ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad

        # forward prop
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w, b in stack:
            cm.dot(w, self.hActs[i-1], self.hActs[i])
            self.hActs[i].add_col_vec(b)

            # forward prop through time
            if i == self.temporalLayer:
                # NOTE(review): minmax presumably clips column `col` to
                # [0, maxAct] in place and mvdot_col_slice presumably adds
                # wt . hActs[i][:, t-1] into column t -- confirm against the
                # cudamat extension used here.
                for t in xrange(1, T):
                    self.hActs[i].minmax(0.0, self.maxAct, col=t-1)
                    cm.mvdot_col_slice(wt, self.hActs[i], t-1,
                                       self.hActs[i], t, beta=1.0)
                # The loop only treats columns 0..T-2; handle the last one.
                self.hActs[i].minmax(0.0, self.maxAct, col=T-1)

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        # (per-column max goes to rowVec; probs = hActs[-1] - rowVec)
        self.hActs[-1].max(axis=0, target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

        # Softmax: exp in place, then scale each column by 1 / column sum.
        cm.exp(self.probs)
        self.probs.sum(axis=0, target=self.rowVec)
        cm.pow(self.rowVec, -1.0, target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        # Bring the result to the host so probs.numpy_array can be read.
        self.probs.copy_to_host()
        # These two lines used to be tab-indented inside a space-indented
        # body, which only parsed thanks to Python 2's 8-column tab
        # expansion; they are now indented with spaces like everything else.
        if not self.train:
            return ctc.decode_best_path(
                self.probs.numpy_array.astype(np.float64))

        cost, deltas, skip = ctc.ctc_loss(
            self.probs.numpy_array.astype(np.float64), labels, blank=0)

        # ctc_loss flagged this input to be skipped: no backward pass.
        if skip:
            return cost, self.grad, skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        # deltasIn holds the error signal at the output of layer i and
        # deltasOut receives the signal for its input; the two buffers are
        # swapped at the end of every iteration.
        i = self.numLayers
        deltasIn, deltasOut = self.deltasC, self.deltasOut
        for w, b in reversed(stack):
            # compute gradient
            cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
            deltasIn.sum(axis=1, target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T, deltasIn, target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                # NOTE(review): `within` presumably writes a 0/1 mask of the
                # entries inside the (0, maxAct) range -- confirm.
                self.hActs[i].within(0.0, self.maxAct, target=self.tmpGrad)
                self.deltaTemp.assign(0.0)
                for t in xrange(T-1, 0, -1):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wt.T, self.deltaTemp, t,
                                       deltasOut, t, beta=1.0)

                    # Push through activation fn
                    deltasOut.mult_slice(t, self.tmpGrad, t)
                    # Column t-1 of deltaTemp stores the delta of step t,
                    # lining it up with the activation of step t-1.
                    self.deltaTemp.set_single_col(t-1, deltasOut, t)

                # Accumulate temporal gradient
                cm.dot(self.deltaTemp, self.hActs[i].T, target=dwt)

                # Finish the recursion for column 0, which the loop above
                # does not reach.
                cm.mvdot_col_slice(wt.T, self.deltaTemp, 0,
                                   deltasOut, 0, beta=1.0)
                deltasOut.mult_slice(0, self.tmpGrad, 0)

            # Derivative of the hard relu for the non-temporal layers.
            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            # After the first iteration stop using deltasC and switch to the
            # deltasIn / deltasOut buffer pair.
            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn, deltasOut = deltasOut, deltasIn
            i -= 1

        return cost, self.grad, skip
示例#5
0
    def costAndGrad(self, data, labels=None, key=None):
        """
        Forward prop entire utterance
        Call CTC cost function
        Compute gradient

        data is a 2-D matrix where each column is a single time frame
        Number of input frames changes across iterations

        labels is a vector of symbol ids, length unknown and does not
        depend on the number of time frames

        key is accepted for interface compatibility but is not used by the
        current implementation.

        Returns the result of ctc.decode_best_path(probs, ref=labels,
        blank=0) when self.train is false.  Otherwise returns the tuple
        (cost, self.grad, skip) with cost and skip taken from ctc.ctc_loss;
        when skip is truthy the gradients in self.grad are not recomputed
        by this call.
        """

        ## forward prop
        # this is the same as minibatch forward prop
        # since we pre-compute context window features for each time
        self.hActs[0] = data
        i = 1
        for w, b in self.stack:
            self.hActs[i] = w.dot(self.hActs[i-1]) + b
            # Layers beyond len(self.layerSizes) stay linear; the softmax
            # below is applied to the last entry of hActs.
            if i <= len(self.layerSizes):
                self.hActs[i] = self.activation(self.hActs[i])
            i += 1

        # Softmax over axis 0; the maximum is subtracted before exp().
        probs = self.hActs[-1] - gp.max(self.hActs[-1], axis=0)
        probs = gp.as_numpy_array(probs)
        probs = np.exp(probs)
        probs = probs / np.sum(probs, axis=0)
        # TODO have to clamp small probabilities (e.g. at 1e-12)?

        ## pass probs and label string to ctc loss
        # TODO how much does passing to different function cost us?
        if not self.train:
            return ctc.decode_best_path(probs, ref=labels, blank=0)

        cost, self.deltas[-1], skip = ctc.ctc_loss(probs, labels, blank=0)

        # Bad utterance ?
        if skip:
            return cost, self.grad, skip

        # ctc_loss was fed a numpy array; convert the returned error signal
        # to a garray before it is combined with the weights below.
        self.deltas[-1] = gp.garray(self.deltas[-1])

        # back prop
        i = len(self.layerSizes) - 1
        for w, b in reversed(self.stack[1:]):
            # NOTE(review): the second argument (True) presumably makes
            # self.activation return the derivative -- confirm.
            actGrad = self.activation(self.hActs[i+1], True)
            self.deltas[i] = w.T.dot(self.deltas[i+1]) * actGrad
            i -= 1

        # compute gradients
        # NOTE we do not divide by utterance length.
        #    Will need to scale up weight norm penalty accordingly
        for i, layerGrad in enumerate(self.grad):
            layerGrad[0] = self.deltas[i].dot(self.hActs[i].T)
            layerGrad[1] = gp.sum(self.deltas[i], axis=1).reshape(-1, 1)

        return cost, self.grad, skip