def decode(probs, alpha=1.0, beta=0.0, beam=100, method='clm2', clm=None):
    """Decode a hypothesis from `probs` using the selected decoder.

    Args:
        probs: Network output probabilities; handed unchanged to whichever
            decoder `method` selects.
        alpha: Forwarded to the LM-based decoders ('bg', 'clm', 'clm2',
            'fast'). Presumably the language-model weight -- confirm
            against the decoders.
        beta: Forwarded to the same decoders. Presumably a length /
            insertion bonus -- confirm against the decoders.
        beam: Beam setting forwarded to the LM-based decoders.
        method: Decoder to use, one of:
            'pmax' -- ctc.decode_best_path (pointwise argmax),
            'bg'   -- bigram LM with prefix-tree dictionary constraint,
            'clm'  -- character LM via clm_decoder,
            'clm2' -- character LM via clm_decoder2 (also receives the
                      character vocabulary loaded below),
            'fast' -- decode_lm_wrapper.
        clm: Character language model; only passed on by 'clm' and 'clm2'.

    Returns:
        A tuple (hyp, hypScore, refScore, align).  hypScore stays None for
        'pmax'; align stays None for every method except 'pmax'; refScore
        is always None because reference scoring is not implemented (see
        the TODO below).

    Raises:
        AssertionError: If `method` is not one of the names listed above.
    """
    hypScore = None
    refScore = None
    # TODO Couldn't find score_sentence
    #refScore = ctc.score_sentence(probs,labels)
    #refScore += alpha*self.lm.score_tg(" ".join(sentence)) + beta*len(sentence)

    # NOTE(security): pickle.load executes arbitrary code embedded in the
    # file, so CHAR_CORPUS_VOCAB_FILE must only ever point at trusted data.
    # The context manager makes sure the handle is closed even if
    # unpickling fails.
    with open(CHAR_CORPUS_VOCAB_FILE, 'rb') as vocab_file:
        char_inds = pickle.load(vocab_file)

    # Various decoding options
    align = None
    if method == 'pmax':
        # Pointwise argmax
        hyp, align = ctc.decode_best_path(probs)
    elif method == 'bg':
        import prefixTree
        # Bigram LM w/ prefix tree dictionary constraint
        # Single-argument parenthesised prints emit the same text under
        # Python 2 and Python 3.
        print('Loading prefix tree (this can take a while)...')
        pt = prefixTree.loadPrefixTree()
        lm = pt.lm
        print('Done loading prefix tree.')
        tic = time.time()
        hyp, hypScore = bg_decoder.decode_bg_lm(
            probs, pt, lm, beam=beam, alpha=alpha, beta=beta)
        toc = time.time()
        print('decoding time (wall): %f' % (toc - tic))
    elif method == 'clm':
        import clm_decoder
        # Character LM
        # NOTE need to restructure decoders into classes
        hyp, hypScore = clm_decoder.decode_clm(
            probs, clm, beam=beam, alpha=alpha, beta=beta)
    elif method == 'clm2':
        import clm_decoder2
        # Character LM
        # NOTE need to restructure decoders into classes
        hyp, hypScore = clm_decoder2.decode_clm(
            probs, clm, beam=beam, alpha=alpha, beta=beta,
            char_inds=char_inds)
    elif method == 'fast':
        hyp, hypScore = decode_lm_wrapper(probs, beam, alpha, beta)
    else:
        # Raised explicitly (rather than `assert False`) so the check is
        # not stripped under `python -O`; the exception type is unchanged.
        raise AssertionError('No such decoding method: %s' % method)

    return hyp, hypScore, refScore, align
def costAndGrad(self, data, labels=None):
    """Forward-propagate `data`, then decode or compute CTC cost and gradients.

    Args:
        data: 2-D array; `data.shape[1]` is taken as T, the number of
            columns that the temporal layer steps through.  It is copied
            into a `cm.CUDAMatrix` before the forward pass.
        labels: Passed to `ctc.ctc_loss`; not used when `self.train` is
            falsy.

    Returns:
        When `self.train` is falsy: the result of `ctc.decode_best_path`
        on the softmax outputs.
        Otherwise: `(cost, self.grad, skip)` as produced by `ctc.ctc_loss`.
        When `skip` is truthy the method returns before back-propagation,
        so `self.grad` is not updated by this call.
    """
    T = data.shape[1]
    # presumably re-sizes the preallocated buffers to T columns -- confirm
    # against setViews
    self.setViews(T)

    # With a temporal layer, the last entry of self.stack / self.grad holds
    # the recurrent weights (wt) and their gradient (dwt); everything before
    # it is the feed-forward stack.
    if self.temporalLayer > 0:
        stack = self.stack[:-1]
        wt, _ = self.stack[-1]
        if self.train:
            grad = self.grad[:-1]
            dwt, _ = self.grad[-1]
    else:
        stack = self.stack
        if self.train:
            grad = self.grad

    # forward prop
    self.hActs[0].assign(cm.CUDAMatrix(data))

    i = 1
    for w, b in stack:
        # hActs[i] = w * hActs[i-1] + b (bias added to every column)
        cm.dot(w, self.hActs[i - 1], self.hActs[i])
        self.hActs[i].add_col_vec(b)

        # forward prop through time
        if i == self.temporalLayer:
            for t in xrange(1, T):
                # presumably clips column t-1 to [0, maxAct] and then adds
                # wt * column(t-1) into column t -- confirm against the
                # cudamat extensions minmax / mvdot_col_slice
                self.hActs[i].minmax(0.0, self.maxAct, col=t - 1)
                cm.mvdot_col_slice(wt, self.hActs[i], t - 1, self.hActs[i], t, beta=1.0)
            # The loop only clips columns 0..T-2; handle the final column.
            self.hActs[i].minmax(0.0, self.maxAct, col=T - 1)

        # The temporal layer was already clipped above, and layers with
        # i > numLayers get no nonlinearity here.
        if i <= self.numLayers and i != self.temporalLayer:
            # hard relu
            self.hActs[i].maximum(0.0)
        i += 1

    # Subtract max activation
    self.hActs[-1].max(axis=0, target=self.rowVec)
    self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

    # Softmax
    cm.exp(self.probs)
    self.probs.sum(axis=0, target=self.rowVec)
    cm.pow(self.rowVec, -1.0, target=self.rowVec)
    self.probs.mult_by_row(self.rowVec)

    # The CTC routines take the host copy (numpy_array), cast to float64.
    self.probs.copy_to_host()
    if not self.train:
        return ctc.decode_best_path(
            self.probs.numpy_array.astype(np.float64))

    cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(
        np.float64), labels, blank=0)

    # Early exit: gradients are left as they were before this call.
    if skip:
        return cost, self.grad, skip

    self.deltasC.assign(cm.CUDAMatrix(deltas))

    # back prop
    i = self.numLayers
    # deltasIn holds the deltas entering layer i's weights from above;
    # deltasOut receives the deltas for the layer below.
    deltasIn, deltasOut = self.deltasC, self.deltasOut
    for w, b in reversed(stack):
        # compute gradient
        cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
        deltasIn.sum(axis=1, target=grad[i][1])

        # compute next layer deltas
        if i > 0:
            cm.dot(w.T, deltasIn, target=deltasOut)

        # backprop through time
        if i == self.temporalLayer:
            # presumably tmpGrad becomes a 0/1 mask of activations that were
            # not clipped by minmax -- confirm against `within`
            self.hActs[i].within(0.0, self.maxAct, target=self.tmpGrad)
            self.deltaTemp.assign(0.0)
            for t in xrange(T - 1, 0, -1):
                # Add in temporal delta
                cm.mvdot_col_slice(wt.T, self.deltaTemp, t, deltasOut, t, beta=1.0)

                # Push through activation fn
                deltasOut.mult_slice(t, self.tmpGrad, t)
                # presumably copies column t of deltasOut into column t-1 of
                # deltaTemp, i.e. deltaTemp holds the deltas shifted back
                # by one step -- confirm against set_single_col
                self.deltaTemp.set_single_col(t - 1, deltasOut, t)

            # Accumulate temporal gradient
            cm.dot(self.deltaTemp, self.hActs[i].T, target=dwt)

            # Column 0 is not visited by the loop above; finish it here.
            cm.mvdot_col_slice(wt.T, self.deltaTemp, 0, deltasOut, 0, beta=1.0)
            deltasOut.mult_slice(0, self.tmpGrad, 0)

        if i > 0 and i != self.temporalLayer:
            # Derivative of the hard relu: activations are >= 0 after
            # maximum(0.0), so their sign is used as the gradient mask.
            self.hActs[i].sign(target=self.tmpGrad)
            deltasOut.mult(self.tmpGrad)

        # On the first iteration deltasIn still refers to self.deltasC.
        # Replace it with self.deltasIn so that, after the swap below, the
        # loop alternates between self.deltasOut and self.deltasIn.
        if i == self.numLayers:
            deltasIn = self.deltasIn

        deltasIn, deltasOut = deltasOut, deltasIn
        i -= 1

    return cost, self.grad, skip
def costAndGrad(self,data,labels=None):
    """Run the network forward on `data`; decode, or return CTC cost and gradients.

    Args:
        data: 2-D array whose second dimension (`data.shape[1]`) is used as
            T, the number of columns stepped through by the temporal layer.
            Copied to the GPU via `cm.CUDAMatrix`.
        labels: Forwarded to `ctc.ctc_loss`; ignored when `self.train` is
            falsy.

    Returns:
        If `self.train` is falsy, whatever `ctc.decode_best_path` returns
        for the softmax outputs.  Otherwise the tuple
        `(cost, self.grad, skip)`; when `skip` is truthy the method returns
        before back-propagation and `self.grad` is not recomputed.
    """
    T = data.shape[1]
    # presumably adjusts preallocated buffers to T columns -- confirm
    # against setViews
    self.setViews(T)

    # When a temporal layer is configured, the final (weights, bias) pair in
    # self.stack / self.grad belongs to the recurrent connection and is
    # split off from the feed-forward layers.
    if self.temporalLayer > 0:
        stack = self.stack[:-1]
        wt,_ = self.stack[-1]
        if self.train:
            grad = self.grad[:-1]
            dwt,_ = self.grad[-1]
    else:
        stack = self.stack
        if self.train:
            grad = self.grad

    # forward prop
    self.hActs[0].assign(cm.CUDAMatrix(data))

    i = 1
    for w,b in stack:
        # Affine map of the previous layer's activations into hActs[i].
        cm.dot(w,self.hActs[i-1],self.hActs[i])
        self.hActs[i].add_col_vec(b)

        # forward prop through time
        if i == self.temporalLayer:
            for t in xrange(1,T):
                # presumably: clip column t-1 to [0, maxAct], then add
                # wt * column(t-1) into column t -- confirm against the
                # cudamat extensions minmax / mvdot_col_slice
                self.hActs[i].minmax(0.0,self.maxAct,col=t-1)
                cm.mvdot_col_slice(wt,self.hActs[i],t-1,self.hActs[i],t,beta=1.0)
            # Last column is not clipped inside the loop.
            self.hActs[i].minmax(0.0,self.maxAct,col=T-1)

        # Skip the temporal layer (already clipped) and any layer past
        # numLayers.
        if i <= self.numLayers and i != self.temporalLayer:
            # hard relu
            self.hActs[i].maximum(0.0)
        i += 1

    # Subtract max activation
    self.hActs[-1].max(axis=0,target=self.rowVec)
    self.hActs[-1].add_row_mult(self.rowVec,-1.0,target=self.probs)

    # Softmax
    cm.exp(self.probs)
    self.probs.sum(axis=0,target=self.rowVec)
    cm.pow(self.rowVec,-1.0,target=self.rowVec)
    self.probs.mult_by_row(self.rowVec)

    # CTC code consumes the host-side copy as float64.
    self.probs.copy_to_host()
    if not self.train:
        return ctc.decode_best_path(self.probs.numpy_array.astype(np.float64))

    cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
            labels,blank=0)

    # Return early without touching the gradients.
    if skip:
        return cost,self.grad,skip

    self.deltasC.assign(cm.CUDAMatrix(deltas))

    # back prop
    i = self.numLayers
    # deltasIn: deltas arriving at layer i's weights; deltasOut: buffer for
    # the deltas passed to the layer below.
    deltasIn,deltasOut = self.deltasC,self.deltasOut
    for w,b in reversed(stack):
        # compute gradient
        cm.dot(deltasIn,self.hActs[i].T,target=grad[i][0])
        deltasIn.sum(axis=1,target=grad[i][1])

        # compute next layer deltas
        if i > 0:
            cm.dot(w.T,deltasIn,target=deltasOut)

        # backprop through time
        if i == self.temporalLayer:
            # presumably a 0/1 mask of activations strictly inside the
            # clipping range -- confirm against `within`
            self.hActs[i].within(0.0,self.maxAct,target=self.tmpGrad)
            self.deltaTemp.assign(0.0)
            for t in xrange(T-1,0,-1):
                # Add in temporal delta
                cm.mvdot_col_slice(wt.T,self.deltaTemp,t,deltasOut,t,beta=1.0)

                # Push through activation fn
                deltasOut.mult_slice(t,self.tmpGrad,t)
                # presumably stores column t of deltasOut in column t-1 of
                # deltaTemp (deltas shifted back one step) -- confirm
                # against set_single_col
                self.deltaTemp.set_single_col(t-1,deltasOut,t)

            # Accumulate temporal gradient
            cm.dot(self.deltaTemp,self.hActs[i].T,
                    target=dwt)

            # Finish column 0, which the loop above does not reach.
            cm.mvdot_col_slice(wt.T,self.deltaTemp,0,deltasOut,0,beta=1.0)
            deltasOut.mult_slice(0,self.tmpGrad,0)

        if i > 0 and i != self.temporalLayer:
            # Hard-relu derivative: sign of the (non-negative) activations
            # serves as the gradient mask.
            self.hActs[i].sign(target=self.tmpGrad)
            deltasOut.mult(self.tmpGrad)

        # First iteration only: deltasIn is still self.deltasC, so point it
        # at self.deltasIn before the swap; from then on the loop ping-pongs
        # between self.deltasOut and self.deltasIn.
        if i == self.numLayers:
            deltasIn = self.deltasIn

        deltasIn,deltasOut = deltasOut,deltasIn
        i -= 1

    return cost,self.grad,skip
def costAndGrad(self, data, labels=None, key=None):
    """Forward-propagate a whole utterance and evaluate the CTC objective.

    In training mode the CTC cost is computed and back-propagated; otherwise
    the best-path decoding of the softmax outputs is returned.

    Args:
        data: 2-D matrix with one time frame per column.  The number of
            frames may differ from call to call.
        labels: Vector of symbol ids; its length is independent of the
            number of time frames.
        key: Currently unused (only referenced by disabled history
            bookkeeping).

    Returns:
        When `self.train` is falsy: the result of
        `ctc.decode_best_path(probs, ref=labels, blank=0)`.
        Otherwise: `(cost, self.grad, skip)`.  If `skip` is truthy (bad
        utterance) the gradients are returned without being recomputed.
    """
    num_hidden = len(self.layerSizes)

    # Forward pass.  Identical to a minibatch forward pass because context
    # window features are pre-computed for every time step.
    self.hActs[0] = data
    for layer, (w, b) in enumerate(self.stack, 1):
        act = w.dot(self.hActs[layer - 1]) + b
        if layer <= num_hidden:
            act = self.activation(act)
        self.hActs[layer] = act

    # Softmax over axis 0, evaluated on the host after subtracting the
    # maximum activation.
    top = self.hActs[-1]
    shifted = gp.as_numpy_array(top - gp.max(top, axis=0))
    probs = np.exp(shifted)
    probs = probs / np.sum(probs, axis=0)
    # TODO have to clamp?  (probs[probs<1e-12] = 1e-12)

    # Hand probabilities and the label string to the CTC code.
    # TODO how much does passing to different function cost us?
    if not self.train:
        # Alternative decoder: ctc.decode_bp_bigrams(probs, blank=0, B=None)
        return ctc.decode_best_path(probs, ref=labels, blank=0)

    cost, out_delta, skip = ctc.ctc_loss(probs, labels, blank=0)
    self.deltas[-1] = out_delta

    # Bad utterance ?
    if skip:
        return cost, self.grad, skip

    # Disabled: store probabilities and error signal for a given key
    # (self.hist[key].append((probs, self.deltas[-1]))).

    self.deltas[-1] = gp.garray(out_delta)

    # Backward pass through every layer above the first, top-down.
    for offset, (w, _) in enumerate(reversed(self.stack[1:])):
        layer = num_hidden - 1 - offset
        slope = self.activation(self.hActs[layer + 1], True)
        self.deltas[layer] = w.T.dot(self.deltas[layer + 1]) * slope

    # Weight and bias gradients.
    # NOTE we do not divide by utterance length.
    #   Will need to scale up weight norm penalty accordingly
    for layer, _ in enumerate(self.grad):
        delta = self.deltas[layer]
        self.grad[layer][0] = delta.dot(self.hActs[layer].T)
        self.grad[layer][1] = gp.sum(delta, axis=1).reshape(-1, 1)

    return cost, self.grad, skip