def compute_seq_ppl(self, xs, ys):
    #### YOUR CODE HERE ####
    J = 0
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    cs = zeros((ns, self.cdim))
    # predicted probas
    ps = zeros((ns, self.Udim))

    #### YOUR CODE HERE ####
    L = self.sparams.L
    Lc = self.Lcluster
    cfreq = self.cfreq
    cwords = self.cwords
    direct_size = self.hsize
    U = self.params.U
    H = self.params.H
    C = zeros((self.cdim, self.hdim))
    if self.isCompression is True:
        C = self.params.C

    ##
    # Forward propagation
    for i in xrange(ns):
        hs[i+1] = sigmoid(H.dot(hs[i]) + L[xs[i]])
        #hs[i+1] = 2.0/(1 + exp(-2.0*(H.dot(hs[i]) + L[xs[i]]))) - 1

        # without maximum entropy optimization
        word_cluster = Lc[ys[i]]
        st_word = cwords[word_cluster, 0]
        ed_word = st_word + cfreq[word_cluster]
        part_cluster = zeros((self.class_size, ))
        part_word = zeros((ed_word - st_word, ))

        if self.isME is True:
            if direct_size > 0 and xs[i] != -1:
                part_cluster += self.params.cluster_direct[xs[i]]
                indexs = cwords[word_cluster, 0:int(cfreq[word_cluster])]
                if xs[i] < direct_size:
                    part_word += self.params.word_direct[xs[i], indexs]

        if self.isCompression is True:
            cs[i] = sigmoid(C.dot(hs[i+1]))
            part_cluster += U[self.vdim:].dot(cs[i])
            part_word += U[st_word:ed_word].dot(cs[i])
            ps[i, self.vdim:] = softmax(part_cluster)
            ps[i, st_word:ed_word] = softmax(part_word)
        else:
            part_cluster += U[self.vdim:].dot(hs[i+1])
            part_word += U[st_word:ed_word].dot(hs[i+1])
            ps[i, self.vdim:] = softmax(part_cluster)
            ps[i, st_word:ed_word] = softmax(part_word)
            #ps[i, self.vdim:] = softmax(U[self.vdim:,:].dot(hs[i+1]))
            #ps[i, st_word:ed_word] = softmax(U[st_word:ed_word,:].dot(hs[i+1]))

        #print maximum(ps[i, ys[st_word:ed_word]]), ps[i,ys[i]], maximum(ps[i, self.vdim:]), ps[i, self.vdim+word_cluster]
        J -= log(ps[i, ys[i]] * ps[i, self.vdim+word_cluster])

    return J
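# A minimal sketch (an assumption, not part of the class above) of turning the
# summed log-loss J returned by compute_seq_ppl into per-word perplexity;
# `model`, `xs`, and `ys` are hypothetical stand-ins.
import numpy as np

def sequence_perplexity(model, xs, ys):
    J = model.compute_seq_ppl(xs, ys)   # sum of -log p(y_t) over the sequence
    return np.exp(J / len(ys))          # exp of mean negative log-likelihood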
def forwardProp(self, node, correct, guess):
    cost = total = 0.0
    if node.isLeaf == True:
        node.fprop = True
        node.hActs1 = self.L[:, node.word]
        node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
        return cost, 1

    c1, t1 = self.forwardProp(node.left, correct, guess)
    c2, t2 = self.forwardProp(node.right, correct, guess)
    if node.left.fprop and node.right.fprop:
        node.fprop = True
        h = np.hstack([node.left.hActs1, node.right.hActs1])
        # bilinear tensor term: tmp[i] = h^T V[i] h
        tmp = np.zeros(len(node.left.hActs1))
        for i in range(len(tmp)):
            tmp[i] = h.dot(self.V[i]).dot(h)
        node.hActs1 = self.ReLU(self.W.dot(h) + self.b + tmp)
        node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
    cost += c1
    cost += c2
    total += t1
    total += t2
    return cost, total + 1
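# A side sketch of the bilinear tensor term above, computed without the
# per-slice Python loop; V is assumed to be a (d, 2d, 2d) tensor, as implied
# by tmp[i] = h.dot(self.V[i]).dot(h).
import numpy as np

def tensor_layer(V, W, b, h):
    # tmp[i] = sum_jk h[j] * V[i,j,k] * h[k], all slices at once
    tmp = np.einsum('j,ijk,k->i', h, V, h)
    return np.maximum(W.dot(h) + b + tmp, 0)  # ReLU, matching self.ReLU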
def forwardProp(self, node, correct=[], guess=[]):
    cost = total = 0.0
    # this is exactly the same setup as forwardProp in rnn.py
    if node.isLeaf == True:
        node.fprop = True
        node.hActs1 = self.L[:, node.word]
        node.hActs2 = self.ReLU(self.W2.dot(node.hActs1) + self.b2)
        node.probs = softmax(self.Ws.dot(node.hActs2 * self.mask) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
        return cost, 1

    c1, t1 = self.forwardProp(node.left, correct, guess)
    c2, t2 = self.forwardProp(node.right, correct, guess)
    if node.left.fprop and node.right.fprop:
        node.fprop = True
        h = np.hstack([node.left.hActs1, node.right.hActs1])
        node.hActs1 = self.ReLU(self.W1.dot(h) + self.b1)
        node.hActs2 = self.ReLU(self.W2.dot(node.hActs1) + self.b2)
        node.probs = softmax(self.Ws.dot(node.hActs2 * self.mask) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
    cost += c1
    cost += c2
    total += t1
    total += t2
    return cost, total + 1
def forwardProp(self, node, correct=[], guess=[]):
    cost = total = 0.0
    # this is exactly the same setup as forwardProp in rnn.py
    if node.isLeaf == True:
        node.fprop = True
        node.hActs1 = self.L[:, node.word]
        node.hActs2 = self.ReLU(self.W2.dot(node.hActs1) + self.b2)
        node.probs = softmax(self.Ws.dot(node.hActs2) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
        return cost, 1

    c1, t1 = self.forwardProp(node.left, correct, guess)
    c2, t2 = self.forwardProp(node.right, correct, guess)
    if node.left.fprop and node.right.fprop:
        node.fprop = True
        h = np.hstack([node.left.hActs1, node.right.hActs1])
        node.hActs1 = self.ReLU(self.W1.dot(h) + self.b1)
        node.hActs2 = self.ReLU(self.W2.dot(node.hActs1) + self.b2)
        node.probs = softmax(self.Ws.dot(node.hActs2) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
    cost += c1
    cost += c2
    total += t1
    total += t2
    return cost, total + 1
def f_prop(self, ys, h_in):
    """Given an initial hidden state h_in and a series of labels ys,
    runs the net forward (hidden states stored in self.hs, predictions
    in self.yhats) and returns the total cost."""
    N = len(ys)  # total num timesteps
    #L = self.params['L']
    Wh = self.params['Wh']
    #Wx = self.params['Wx']
    U = self.params['U']
    b1 = self.params['b1']
    b2 = self.params['b2']

    self.yhats = np.zeros([self.outdim, N])
    self.hs = np.zeros([self.hdim, N+1])
    # np.random.seed(2234)
    # self.hs[:,-1] = np.random.normal(0,.1,(self.hdim))
    self.hs[:,-1] = h_in
    cost = 0

    for t in xrange(N):
        h_prev = self.hs[:,t-1]
        z_1 = np.dot(Wh, h_prev) + b1  #+ np.dot(Wx, Lx)
        h1 = np.maximum(z_1, 0)
        self.hs[:,t] = h1
        yhat = softmax(np.dot(U, h1) + b2)
        self.yhats[:,t] = yhat
        cost += -np.log(yhat[ys[t]])
    return cost
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss
    for an input sequence xs and output
    sequence (labels) ys.

    You should run the RNN forward,
    compute cross-entropy loss at each timestep,
    and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    # Expect xs as list of indices
    ns = len(xs)

    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    # shape T x Dh
    hs = zeros((ns+1, self.hdim))
    # predicted probas, shape T x V
    ps = zeros((ns, self.vdim))

    for t in range(ns):
        # shape Dh
        hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[xs[t]])
        # shape V
        ps[t] = softmax(self.params.U.dot(hs[t]))
        J += -log(ps[t, ys[t]])

    #### END YOUR CODE ####
    return J
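# A self-contained toy run of the recurrence in compute_seq_loss above, with
# random parameters standing in for self.params.H, self.params.U and the
# embedding table self.sparams.L (all shapes here are assumptions).
import numpy as np

def _softmax(z):
    e = np.exp(z - np.max(z))
    return e / e.sum()

np.random.seed(0)
Dh, V, T = 4, 6, 5
H = np.random.randn(Dh, Dh) * 0.1   # hidden-to-hidden weights
U = np.random.randn(V, Dh) * 0.1    # hidden-to-output weights
L = np.random.randn(V, Dh) * 0.1    # word embeddings
xs = np.random.randint(V, size=T)   # input word indices
ys = np.random.randint(V, size=T)   # target word indices

h = np.zeros(Dh)
J = 0.0
for t in range(T):
    h = 1.0 / (1.0 + np.exp(-(H.dot(h) + L[xs[t]])))  # sigmoid
    J -= np.log(_softmax(U.dot(h))[ys[t]])
print J  # total cross-entropy over the toy sequence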
def predict_proba(self, windows):
    """
    Predict class probabilities. Should return a matrix P of
    probabilities, with each row corresponding to a row of X.

    windows = array (n x windowsize), each row is a window of indices
    """
    # handle singleton input by making sure we have
    # a list-of-lists
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]

    #### YOUR CODE HERE ####
    # TODO: Vectorize this
    P = zeros((len(windows), self.params.b2.shape[0]))
    for idx in range(0, len(windows)):
        # Forward propagation
        window = array(windows[idx])
        words = [self.sparams.L[window[0]],
                 self.sparams.L[window[1]],
                 self.sparams.L[window[2]]]
        x = reshape(words, self.sparams.L.shape[1] * 3)  # 3n row vector
        z2 = self.params.W.dot(x) + self.params.b1
        a2 = tanh(z2)
        z3 = self.params.U.dot(a2) + self.params.b2
        a3 = softmax(z3)
        P[idx, :] = a3

    return P  # rows are output for each input
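# One possible vectorized replacement for the loop above (an assumption, not
# the reference solution): gather all window embeddings at once and push the
# whole batch through W and U in two matrix multiplies.
def predict_proba_vectorized(self, windows):
    windows = array(windows, ndmin=2)                     # (n, windowsize)
    n = windows.shape[0]
    X = self.sparams.L[windows].reshape(n, -1)            # (n, windowsize*d)
    A2 = tanh(X.dot(self.params.W.T) + self.params.b1)    # (n, H)
    Z3 = A2.dot(self.params.U.T) + self.params.b2         # (n, C)
    return apply_along_axis(softmax, 1, Z3)               # row-wise softmax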
def compute_loss(self, windows, labels):
    """
    Compute the loss for a given dataset.
    windows = same as for predict_proba
    labels = list of class labels, for each row of windows
    """
    #### YOUR CODE HERE ####
    print "windows shape ", windows.shape
    x = self.sparams.L[windows[:,0]]
    for i in range(len(windows[0])-1):
        x = np.concatenate((x, self.sparams.L[windows[:,i+1]]), axis=1)
    z = self.params.W.dot(x.T) + self.params.b1.reshape((self.params.b1.shape[0], 1))
    h = tanh(z)
    p = softmax(self.params.U.dot(h) + self.params.b2.reshape((self.params.b2.shape[0], 1)))
    labelArray = np.zeros((len(labels), self.params.b2.shape[0]))
    for i in range(len(labels)):
        labelArray[i] = make_onehot(labels[i], self.params.b2.shape[0])
    batch = len(labels)
    p = p * labelArray.T
    p = np.sum(p, axis=0)
    J = np.sum(-np.log(p))
    Jreg = batch * (self.lreg/2.0) * (np.sum(self.params.W**2) + np.sum(self.params.U**2))
    J += Jreg
    #### END YOUR CODE ####
    return J
def predict_proba(self, windows):
    """
    Predict class probabilities. Should return a matrix P of
    probabilities, with each row corresponding to a row of X.

    windows = array (n x windowsize), each row is a window of indices
    """
    # handle singleton input by making sure we have
    # a list-of-lists
    #
    # hasattr(object, name)
    # The arguments are an object and a string. The result is True if the
    # string is the name of one of the object's attributes, False if not.
    # (This is implemented by calling getattr(object, name) and seeing
    # whether it raises an exception or not.)
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]

    #### YOUR CODE HERE ####
    P = []
    for window in windows:
        x = hstack(self.sparams.L[window])
        h = tanh(self.params.W.dot(x) + self.params.b1)
        p = softmax(self.params.U.dot(h) + self.params.b2)
        P.append(p)
    #### END YOUR CODE ####

    return P  # rows are output for each input
def predict_proba(self, windows):
    """
    Predict class probabilities. Should return a matrix P of
    probabilities, with each row corresponding to a row of X.

    windows = array (n x windowsize), each row is a window of indices
    """
    # handle singleton input by making sure we have
    # a list-of-lists
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]

    #### YOUR CODE HERE ####
    # x - (W) -> a - (tanh) -> h - (U) -> z - (softmax) -> p
    P = []
    for window in windows:
        # Is it possible to use a fully-vectorized method instead of a for loop?
        x = hstack(self.sparams.L[window, :])  # the same as above
        h = tanh(self.params.W.dot(x) + self.params.b1)
        p = softmax(self.params.U.dot(h) + self.params.b2)
        P.append(p)
    #### END YOUR CODE ####

    return array(P)  # rows are output for each input
def predict_proba(self, windows):
    """
    Predict class probabilities. Should return a matrix P of
    probabilities, with each row corresponding to a row of X.

    windows = array (n x windowsize), each row is a window of indices
    """
    # handle singleton input by making sure we have
    # a list-of-lists
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]

    #### YOUR CODE HERE ####
    N = len(windows)
    P = zeros((N, self.params.b2.shape[0]))
    for n in xrange(N):
        x = self.sparams.L[windows[n]]
        x = x.reshape((x.shape[0]*x.shape[1]))
        z = self.params.W.dot(x) + self.params.b1
        h = tanh(z)
        P[n, :] = softmax(self.params.U.dot(h) + self.params.b2)
    #### END YOUR CODE ####
    return P  # rows are output for each input
def _acc_grads(self, x, label):
    """
    Accumulate gradients from a training example.
    """
    #import ipdb; ipdb.set_trace()
    ##
    # Forward propagation
    z1 = self.params.W.dot(x) + self.params.b1
    h1 = tanh(z1)
    z2 = np.dot(self.params.U, h1) + self.params.b2
    h2 = tanh(z2)
    z3 = np.dot(self.params.G, h2) + self.params.b3
    y_hat = softmax(z3)

    y = make_onehot(label, self.outputsize)
    d3 = y_hat - y
    self.grads.b3 += d3
    self.grads.G += np.outer(d3, h2) + self.lreg * self.params.G
    d2 = np.dot(self.params.G.T, d3) * tanhd(z2)
    self.grads.b2 += d2
    self.grads.U += np.outer(d2, h1) + self.lreg * self.params.U
    d1 = np.dot(self.params.U.T, d2) * tanhd(z1)
    self.grads.W += np.outer(d1, x) + self.lreg * self.params.W
    self.grads.b1 += d1
def predict_proba(self, windows):
    """
    Predict class probabilities. Should return a matrix P of
    probabilities, with each row corresponding to a row of X.

    windows = array (n x windowsize), each row is a window of indices
    """
    # handle singleton input by making sure we have
    # a list-of-lists
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]

    #### YOUR CODE HERE ####
    P = zeros((len(windows), self.params.U.shape[0]))
    for i, window in enumerate(windows):
        a1 = hstack(self.sparams.L[window, :])
        a2 = tanh(self.params.W.dot(a1) + self.params.b1)  # h
        y_hat = softmax(self.params.U.dot(a2) + self.params.b2)
        P[i, :] = y_hat
    #### END YOUR CODE ####
    return P  # rows are output for each input
def predict_proba(self, windows):
    """
    Predict class probabilities. Should return a matrix P of
    probabilities, with each row corresponding to a row of X.

    windows = array (n x windowsize), each row is a window of indices
    """
    # handle singleton input by making sure we have
    # a list-of-lists
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]

    #### YOUR CODE HERE ####
    #### END YOUR CODE ####
    P = zeros((len(windows), self.params.b2.shape[0]))  # (n, 5)
    for i in range(P.shape[0]):
        # Forward propagation
        x = array([self.sparams.L[w] for w in windows[i]]).reshape(
            self.sparams.L.shape[1] * len(windows[0]))
        a1 = x                                         # 3n = 150 input vector
        z1 = dot(self.params.W, a1) + self.params.b1   # 100 vector
        a2 = tanh(z1)                                  # 100 vector
        z2 = dot(self.params.U, a2) + self.params.b2   # 5 vector
        a3 = softmax(z2)                               # 5 vector
        P[i, :] = a3
    return P  # rows are output for each input
def predict_proba(self, windows):
    """
    Predict class probabilities. Should return a matrix P of
    probabilities, with each row corresponding to a row of X.

    windows = array (n x windowsize), each row is a window of indices
    """
    # handle singleton input by making sure we have
    # a list-of-lists
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]

    #### YOUR CODE HERE ####
    # construct input matrix
    x = vstack([concatenate([self.sparams.L[idx] for idx in window])
                for window in windows])
    z1 = self.params.W.dot(x.T) + self.params.b1[:, newaxis]
    h1 = 2 * sigmoid(2 * z1) - 1  # tanh, via the identity tanh(z) = 2*sigmoid(2z) - 1
    z2 = self.params.U.dot(h1) + self.params.b2[:, newaxis]
    P = softmax(z2.T)
    #### END YOUR CODE ####

    return P  # rows are output for each input
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss
    for an input sequence xs and output
    sequence (labels) ys.

    You should run the RNN forward,
    compute cross-entropy loss at each timestep,
    and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    hs = np.zeros((ns + 1, self.hdim))
    for i in range(ns):
        hs[i + 1] = sigmoid(self.params.H.dot(hs[i])
                            + self.params.W.dot(self.sparams.L[xs[i]]))
        p = softmax(self.params.U.dot(hs[i + 1]))
        p = p * make_onehot(ys[i], self.vdim)
        J += -np.log(np.sum(p))
    #### END YOUR CODE ####
    Jreg = 0.5 * self.lreg * (np.sum(self.params.H**2)
                              + np.sum(self.params.W**2)
                              + np.sum(self.params.U**2))
    return J + Jreg
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss
    for an input sequence xs and output
    sequence (labels) ys.

    You should run the RNN forward,
    compute cross-entropy loss at each timestep,
    and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    self.xs = xs
    self.ys = ys
    hs = zeros((ns + 1, self.hdim))
    self.hs1 = hs
    # for each time step
    for t in xrange(ns):
        hs[t] = sigmoid(dot(self.params.H, hs[t - 1]) + self.sparams.L[xs[t]])
        y_hat = softmax(dot(self.params.U, hs[t]))
        J -= log(y_hat[ys[t]])
    #### END YOUR CODE ####
    return J
def ModelOutputCenterWord(clf, word_to_num, num_to_word, num_to_tag, windowsize):
    d = len(clf.sparams.L[0])
    partW = clf.params.W[:, (windowsize/2)*d:(windowsize/2+1)*d]
    z = clf.sparams.L.dot(partW.T) + clf.params.b1       # z -> (N,h)
    h = clf.tanh(z)
    p = softmax(h.dot(clf.params.U.T) + clf.params.b2)   # p -> (N,C)
    outputLayer = collections.defaultdict(list)
    for i in range(len(p)):
        for j in range(len(p[i])):
            outputLayer[j].append((p[i][j], i))
    topN = 10
    topscores = np.zeros((len(clf.params.b2), topN))
    topwords = np.zeros((len(clf.params.b2), topN))
    for i in range(len(outputLayer)):
        a = sorted(outputLayer[i], numericalCmp)
        for j in range(topN):
            topscores[i][j] = a[j][0]
            topwords[i][j] = a[j][1]
    print "topscores -->"
    print topscores
    for i in range(1, 5):
        print "Output Neuron %d: %s" % (i, num_to_tag[i])
        words = []
        for j in topwords[i]:
            words.append(num_to_word[j])
        print_scores(topscores[i], words)
def compute_loss(self, windows, labels):
    """
    Compute the loss for a given dataset.
    windows = same as for predict_proba
    labels = list of class labels, for each row of windows
    """
    #### YOUR CODE HERE ####
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]
        labels = [labels]
    N = len(windows)

    # x = self.sparams.L[windows]
    # x = x.reshape((N, x.shape[-2]*x.shape[-1]))
    # z = x.dot(self.params.W.T) + self.params.b1
    # h = tanh(z)
    # z2 = h.dot(self.params.U.T) + self.params.b2
    # p = softmax(z2)
    # J -= sum(log(p[0][labels]))
    # J += (self.lreg / 2.0) * (sum(self.params.W**2.0) + sum(self.params.U**2.0))

    J = 0
    for n in xrange(N):
        x = self.sparams.L[windows[n]]
        x = reshape(x, x.shape[0]*x.shape[1])
        h = tanh(self.params.W.dot(x) + self.params.b1)
        y_hat = softmax(self.params.U.dot(h) + self.params.b2)
        y = make_onehot(labels[n], len(y_hat))
        J -= sum(y*log(y_hat))
    J += (self.lreg / 2.0) * (sum(self.params.W**2.0) + sum(self.params.U**2.0))
    #### END YOUR CODE ####
    return J
def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point
    (window, label) of the format

    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads,
    in order for gradient_check and training to work.

    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    xf = []
    for idx in window:
        xf.extend(self.sparams.L[idx])  # extract representation
    tanhX = tanh(self.params.W.dot(xf) + self.params.b1)
    softmaxP = softmax(self.params.U.dot(tanhX) + self.params.b2)
    y = make_onehot(label, len(softmaxP))
    delta2 = softmaxP - y

    self.grads.U += outer(delta2, tanhX) + self.lreg * self.params.U
    self.grads.b2 += delta2
    delta1 = self.params.U.T.dot(delta2) * (1. - tanhX*tanhX)
    self.grads.W += outer(delta1, xf) + self.lreg * self.params.W
    self.grads.b1 += delta1

    # gradient w.r.t. the input word vectors, as the docstring requires
    dxf = self.params.W.T.dot(delta1)
    d = self.sparams.L.shape[1]
    for k, idx in enumerate(window):
        self.sgrads.L[idx] = dxf[k*d:(k+1)*d]
def predict_proba(self, windows):
    """
    Predict class probabilities. Should return a matrix P of
    probabilities, with each row corresponding to a row of X.

    windows = array (n x windowsize), each row is a window of indices
    """
    # handle singleton input by making sure we have
    # a list-of-lists
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]

    P = []
    for window in windows:
        # extract representation: concatenate window of words into a numpy column vector
        x = hstack(self.sparams.L[window, :])
        # just two layers, so simple
        h = tanh(self.params.W.dot(x) + self.params.b1)
        p = softmax(self.params.U.dot(h) + self.params.b2)
        P.append(p)
    return array(P)  # rows are output for each input
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss
    for an input sequence xs and output
    sequence (labels) ys.

    You should run the RNN forward,
    compute cross-entropy loss at each timestep,
    and return the sum of the point losses.
    """
    ns = len(xs)
    h_ant = zeros((1, self.hdim))
    J = 0
    #### YOUR CODE HERE ####
    for step in xrange(0, ns):
        # print "hs[step-1].shape %s" % (hs[step-1].shape,)
        # print "self.params.H.shape %s" % (self.params.H.shape,)
        # print "self.sparams.L.shape %s" % (self.sparams.L.shape,)
        # print "self.sparams.L[xs[step]].shape %s" % (self.sparams.L[xs[step]].shape,)
        a1 = self.params.H.dot(h_ant.T).T + self.sparams.L[xs[step]]
        h = sigmoid(a1)
        a2 = self.params.U.dot(h.T).T
        # print "h.shape %s" % (h.shape,)
        # print "a2.shape %s" % (a2.shape,)
        # print "self.params.U.shape %s" % (self.params.U.shape,)
        y_hat = softmax(a2)
        h_ant = h
        J -= log(y_hat[:, ys[step]])
    #### END YOUR CODE ####
    return J
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss
    for an input sequence xs and output
    sequence (labels) ys.

    You should run the RNN forward,
    compute cross-entropy loss at each timestep,
    and return the sum of the point losses.
    """
    #J = 0
    ns = len(xs)
    #### YOUR CODE HERE ####
    # forward propagation
    hs = zeros((ns+1, self.hdim))
    ps = zeros((ns, self.vdim))  # predicted probas
    for t in range(0, ns):
        hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t], :])
        ps[t] = softmax(dot(self.params.U, hs[t]))
    J = -sum(log(ps[arange(ns), ys]))
    #### END YOUR CODE ####
    return J
def predict_seq_proba(self, X):
    #### YOUR CODE HERE ####
    # Expect xs as list of indices
    #ns = len(xs)
    ns = X.shape[0]  # X.shape = (ns, Dw)
    #X = self.L[xs,:]

    Z = (self.params.W1.dot(X.T)).T + self.params.b1
    # A.shape = (ns, Dh)
    A = sigmoid(Z)
    assert A.shape == (ns, self.hdim)
    if self.drop_p > 0.:
        A = A * (1 - self.drop_p)

    # Max each node of A over time (max of each column over all rows)
    # use argmax for use in backprop
    mx = argmax(A, 0)  # Max pooling vector
    # this will select max elements of A:
    # h.shape == (Dh,)
    h = A[mx, list(range(len(mx)))]
    assert h.shape == (self.hdim, )

    # prediction probabilities
    ps = softmax(self.params.Ws.dot(h) + self.params.bs)
    return (ps)
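# Isolated demo of the max-pooling-over-time indexing trick used above:
# argmax per column over all timesteps, then fancy-index the column maxima.
import numpy as np

A = np.random.randn(7, 3)            # (timesteps, hidden units)
mx = np.argmax(A, 0)                 # best timestep per hidden unit
h = A[mx, list(range(A.shape[1]))]   # pull out the column maxima
assert np.allclose(h, A.max(0))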
def compute_loss(self, windows, labels):
    """
    Compute the loss for a given dataset.
    windows = same as for predict_proba
    labels = list of class labels, for each row of windows
    """
    #### YOUR CODE HERE ####
    L = self.sparams.L
    U = self.params.U
    W = self.params.W
    b1 = self.params.b1
    b2 = self.params.b2
    lambda_ = self.lreg

    J = 0
    labels_tem = None
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]
        labels_tem = [labels]
    else:
        labels_tem = labels

    for i in xrange(len(windows)):
        x = hstack(L[windows[i], :])
        h = tanh(W.dot(x) + b1)
        y_hat = softmax(U.dot(h) + b2)
        J -= log(y_hat[labels_tem[i]])
    J += (lambda_ / 2.0) * (sum(W ** 2.0) + sum(U ** 2.0))
    #### END YOUR CODE ####
    return J
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss
    for an input sequence xs and output
    sequence (labels) ys.

    You should run the RNN forward,
    compute cross-entropy loss at each timestep,
    and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    # predicted probas
    ps = zeros((ns, self.vdim))

    #### YOUR CODE HERE ####
    L = self.sparams.L
    U = self.params.U
    H = self.params.H

    ##
    # Forward propagation
    for i in xrange(ns):
        hs[i+1] = sigmoid(H.dot(hs[i]) + L[xs[i]])
        #hs[i+1] = 2.0/(1.0 + exp(-2.0*(H.dot(hs[i]) + L[xs[i]]))) - 1.0
        ps[i] = softmax(U.dot(hs[i+1]))
        J -= log(ps[i][ys[i]])
    #### END YOUR CODE ####
    return J
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss
    for an input sequence xs and output
    sequence (labels) ys.

    You should run the RNN forward,
    compute cross-entropy loss at each timestep,
    and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    ps = zeros((ns, self.vdim))
    for i in xrange(ns):
        hs[i] = sigmoid(self.params.H.dot(hs[i-1]) + self.sparams.L[xs[i]])
        ps[i] = softmax(self.params.U.dot(hs[i]))
        J -= log(ps[i][ys[i]])
    #### END YOUR CODE ####
    return J
def predict_proba(self, windows):
    """
    Predict class probabilities. Should return a matrix P of
    probabilities, with each row corresponding to a row of X.

    windows = array (n x windowsize), each row is a window of indices
    """
    # handle singleton input by making sure we have
    # a list-of-lists
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]

    n = len(windows)
    P = zeros((len(windows), self.params.b2.shape[0]))
    #### YOUR CODE HERE ####
    for idx in xrange(n):
        window = windows[idx]
        x = hstack(self.sparams.L[window])
        h = tanh(self.params.W.dot(x) + self.params.b1)
        scores = self.params.U.dot(h) + self.params.b2
        P[idx, :] = softmax(scores)
    #### END YOUR CODE ####
    return P  # rows are output for each input
def predict_proba(self, windows):
    """
    Predict class probabilities. Should return a matrix P of
    probabilities, with each row corresponding to a row of X.

    windows = array (n x windowsize), each row is a window of indices
    """
    # handle singleton input by making sure we have
    # a list-of-lists
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]

    #### YOUR CODE HERE ####
    x = self.sparams.L[windows[:,0]]
    for i in range(len(windows[0])-1):
        x = np.concatenate((x, self.sparams.L[windows[:,i+1]]), axis=1)
    z = self.params.W.dot(x.T) + self.params.b1.reshape((self.params.b1.shape[0], 1))
    h = self.tanh(z)
    p = softmax(self.params.U.dot(h) + self.params.b2.reshape((self.params.b2.shape[0], 1)))
    #### END YOUR CODE ####
    return p  # rows are output for each input
def predict_proba(self, windows):
    """
    Predict class probabilities. Should return a matrix P of
    probabilities, with each row corresponding to a row of X.

    windows = array (n x windowsize), each row is a window of indices
    """
    # handle singleton input by making sure we have
    # a list-of-lists
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]

    P = empty((len(windows), self.nclass))
    #### YOUR CODE HERE ####
    for i, row in enumerate(windows):
        x = self.sparams.L[row, :].flatten()
        #words = [self.sparams.L[row[0]], self.sparams.L[row[1]], self.sparams.L[row[2]]]
        #x = reshape(words, self.sparams.L.shape[1] * 3)  # 3n row vector
        h = tanh(self.params.W.dot(x) + self.params.b1)
        p = softmax(self.params.U.dot(h) + self.params.b2)
        P[i, :] = p
    #### END YOUR CODE ####
    return P  # rows are output for each input
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss
    for an input sequence xs and output
    sequence (labels) ys.

    You should run the RNN forward,
    compute cross-entropy loss at each timestep,
    and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    h_prev = zeros(self.hdim)
    for t in xrange(ns):
        h_t = sigmoid(dot(self.params.H, h_prev) + self.sparams.L[xs[t]])
        if t == ns - 1:
            # loss only at the final timestep (single label per sequence)
            yhat_t = softmax(dot(self.params.U, h_t))
            J = -log(yhat_t[ys])
        h_prev = h_t
    J += .5 * self.lamb * (sum(self.params.H**2) + sum(self.params.U**2))
    #### END YOUR CODE ####
    return J
def _acc_grads(self, xs, ys):
    """
    Accumulate gradients, given a pair of training sequences:
    xs = [<indices>] # input words
    ys = [<indices>] # output words (to predict)

    Your code should update self.grads and self.sgrads,
    in order for gradient_check and training to work.

    So, for example:
    self.grads.H += (your gradient dJ/dH)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # update row

    Per the handout, you should:
        - make predictions by running forward in time
          through the entire input sequence
        - for *each* output word in ys, compute the
          gradients with respect to the cross-entropy
          loss for that output word
        - run backpropagation-through-time for self.bptt
          timesteps, storing grads in self.grads (for H, U)
          and self.sgrads (for L)

    You'll want to store your predictions \hat{y}(t)
    and the hidden layer values h(t) as you run forward,
    so that you can access them during backpropagation.

    At time 0, you should initialize the hidden layer to
    be a vector of zeros.
    """
    # Expect xs as list of indices
    ns = len(xs)
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns+1, self.hdim))
    # predicted probs
    ps = zeros((ns, self.vdim))

    #### YOUR CODE HERE ####
    # forward propagation
    for t in xrange(ns):
        hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t]])
        ps[t] = softmax(dot(self.sparams.U, hs[t]))

    # backpropagation through time
    for i in xrange(ns):
        d2i = ps[i]
        d2i[ys[i]] -= 1
        d1 = dot(self.sparams.U.T, d2i) * hs[i] * (1 - hs[i])
        self.sgrads.U = dot(d2i.reshape((-1, 1)), hs[i].reshape((1, -1)))
        for t in xrange(i, i - self.bptt - 1, -1):
            if t >= 0:  # the farthest reference will thus be hs[-1]
                self.sgrads.L[xs[t]] = d1
                self.grads.H += dot(d1.reshape((-1, 1)), hs[t-1].reshape((1, -1)))
                # propagate the delta back one timestep
                d1 = dot(self.params.H.T, d1) * hs[t-1] * (1 - hs[t-1])
def compute_loss(self, windows, labels):
    """
    Compute the loss for a given dataset.
    windows = same as for predict_proba
    labels = list of class labels, for each row of windows
    """
    labels_list = None
    #### YOUR CODE HERE ####
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]
        labels_list = [labels]
    else:
        labels_list = labels

    J = 0
    for i in xrange(len(windows)):
        x = hstack(self.sparams.L[windows[i], :])  # extract representation
        h = tanh(self.params.W.dot(x) + self.params.b1)
        p = softmax(self.params.U.dot(h) + self.params.b2)
        J += -log(p[labels_list[i]])
    Jreg = (self.lreg / 2.0) * (sum(self.params.W**2.0) + sum(self.params.U**2.0))
    #### END YOUR CODE ####
    return J + Jreg
def forward_pass(self, x):
    '''
    Forward pass.

    Argument:
        x: input vector

    Returns (zs, hs): hidden and output activations (hs) and the inputs
    to the activation function (zs).

    Example: input dims [100, 30, 20, 5]
    output: hs = [x, h1, h2, h3]
            zs = [z1, z2, z3]
    '''
    hs = [x]
    zs = []
    h = x
    for i in range(1, len(self.dims)):
        W = self._get_param('W', i)
        b = self._get_param('b', i)
        z = W.dot(h) + b
        zs.append(z)
        # now the activation function: softmax on the last layer,
        # else the hidden activation self.act (tanh)
        if i == len(self.dims) - 1:  # last layer
            h = softmax(z)
        else:
            h = self.act(z)
        hs.append(h)
    return (zs, hs)
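# Hypothetical usage of forward_pass, assuming a network whose self.dims is
# [100, 30, 20, 5] as in the docstring and whose parameters are already
# initialized; `net` is a stand-in name.
x = np.random.randn(100)
zs, hs = net.forward_pass(x)
probs = hs[-1]                 # softmax output of the last layer
assert probs.shape == (5,)
assert abs(probs.sum() - 1.0) < 1e-6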
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss
    for an input sequence xs and output
    sequence (labels) ys.

    You should run the RNN forward,
    compute cross-entropy loss at each timestep,
    and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    # hs[-1] = initial hidden state (zeros)
    ns = len(ys)
    hs = zeros((ns+1, self.hdim))
    for t in range(ns):
        hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[xs[t]])
        #ps[t] = softmax(self.params.U.dot(hs[t]))
        #J -= log(ps[t][ys[t]])
    h_final = hs[ns-1]
    z = self.params.U.dot(h_final)
    y_hat = []
    # one softmax per aspect, over that aspect's slice of the output layer
    for i in range(n_aspect):
        current = z[sent_dim*i:sent_dim*(i+1)]
        y_hat.extend(softmax(current))
    J = -sum(ys.reshape(len(ys), 1) * log(array(y_hat).reshape(len(y_hat), 1)))
    #### END YOUR CODE ####
    return J
def predict_proba(self, windows):
    """
    Predict class probabilities. Should return a matrix P of
    probabilities, with each row corresponding to a row of X.

    windows = array (n x windowsize), each row is a window of indices
    """
    # handle singleton input by making sure we have
    # a list-of-lists
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]

    #### YOUR CODE HERE ####
    #print 'windows.shape', windows[0]
    P = []
    for window in windows:
        # extract representation, a (150,) vector
        x = hstack([self.sparams.L[idx] for idx in window])
        #x = reshape(x, (x.shape[0]*x.shape[1]))
        #print self.params.W.shape, ' ', x.shape, ' ', self.params.b1.shape
        a = self.params.W.dot(x) + self.params.b1  # (100,150)*(150,)+(100,) => (100,)
        h = tanh(a)  # (100,)
        p = softmax(self.params.U.dot(h) + self.params.b2)  # (5,100)*(100,)+(5,) => (5,)
        P.append(p)
    #### END YOUR CODE ####
    return P  # rows are output for each input
def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point
    (window, label) of the format

    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads,
    in order for gradient_check and training to work.

    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    x = hstack(self.sparams.L[window, :])
    h = tanh(self.params.W.dot(x) + self.params.b1)
    p = softmax(self.params.U.dot(h) + self.params.b2)

    ##
    y = make_onehot(label, 5)
    delta = p - y

    # Backpropagation (consistent with h = tanh(Wx + b1) above)
    self.grads.U += outer(delta, h) + self.lreg * self.params.U
    self.grads.b2 += delta
    gradh = dot(self.params.U.T, delta) * (1 - h**2)
    self.grads.W += outer(gradh, x) + self.lreg * self.params.W
    self.grads.b1 += gradh
    dL = self.params.W.T.dot(gradh).reshape(self.window_size, self.word_vec_size)
    for i in xrange(self.window_size):
        self.sgrads.L[window[i], :] = dL[i]
def compute_loss(self, windows, labels):
    """
    Compute the loss for a given dataset.
    windows = same as for predict_proba
    labels = list of class labels, for each row of windows
    """
    #### YOUR CODE HERE ####
    labels_lst = None
    # handle singleton input by making sure we have
    # a list-of-lists
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]
        labels_lst = [labels]
    else:
        labels_lst = labels

    J = 0.0
    for window, label in zip(windows, labels_lst):
        x = hstack(self.sparams.L[window])  # (150,) --> (X,)
        h = tanh(dot(self.params.W, x) + self.params.b1)  # (H,)
        y_hat = softmax(dot(self.params.U, h) + self.params.b2)  # (Dy,)
        J -= log(y_hat[label])
    J += (self.lreg / 2.0) * (sum(self.params.W**2.0) + sum(self.params.U**2.0))
    #### END YOUR CODE ####
    return J
def predict_proba(self, windows):
    """
    Predict class probabilities. Should return a matrix P of
    probabilities, with each row corresponding to a row of X.

    windows = array (n x windowsize), each row is a window of indices
    """
    # handle singleton input by making sure we have
    # a list-of-lists
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]

    #### YOUR CODE HERE ####
    print windows
    N = len(windows)
    windowsize = len(windows[0])
    (Dy, H) = self.params.U.shape
    P = zeros((N, Dy))
    for i, window in enumerate(windows):
        x = hstack(self.sparams.L[window])  # (150,) --> (X,)
        h = tanh(dot(self.params.W, x) + self.params.b1)  # (H,)
        y_hat = softmax(dot(self.params.U, h) + self.params.b2)  # (Dy,)
        P[i, :] = y_hat
    #### END YOUR CODE ####
    return P  # rows are output for each input
def predict_proba(self, idx):
    """
    Predict class probabilities.
    """
    x = self.sparams.L[idx]
    p = softmax(self.params.W.dot(x) + self.params.b)
    return p
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss
    for an input sequence xs and output
    sequence (labels) ys.

    You should run the RNN forward,
    compute cross-entropy loss at each timestep,
    and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    # Expect xs as list of indices
    ns = len(xs)
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns+1, self.hdim))

    # _for memory purposes_, we do not compute the loss in one fell swoop
    # forward propagation
    for t in xrange(ns):
        hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t]])
        p = softmax(dot(self.sparams.U, hs[t]))
        J -= sum(log(p[ys[t]]))
    #### END YOUR CODE ####
    return J
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss
    for an input sequence xs and output
    sequence (labels) ys.

    You should run the RNN forward,
    compute cross-entropy loss at each timestep,
    and return the sum of the point losses.
    """
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    ps = zeros((ns, self.vdim))
    for i in range(ns):
        z1 = self.params.H.dot(hs[i-1]) + self.sparams.L[xs[i]]
        hs[i] = sigmoid(z1)
        z2 = self.params.U.dot(hs[i])
        ps[i] = softmax(z2)
    J = sum(-log(ps[range(len(ps)), ys]))
    return J
def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point
    (window, label) of the format

    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads,
    in order for gradient_check and training to work.

    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    W, b1, U, b2 = self.params.W, self.params.b1, self.params.U, self.params.b2
    H, input_size = W.shape
    C, H = U.shape

    # Convert window indices to input
    X = self.sparams.L[window].reshape(input_size, 1)

    # Forward Pass (predictions)
    z = np.dot(W, X) + b1.reshape(H, 1)
    hidden = np.tanh(z)  # Tanh
    scores = np.dot(U, hidden) + b2.reshape(C, 1)
    probs = softmax(scores)
    y_hat = probs[label]

    # Cross Entropy Loss
    loss = -np.log(y_hat)

    # Backpropagate!
    dscores = probs
    dscores[label] -= 1
    self.grads.b2 += dscores.reshape(C)
    self.grads.U += np.dot(dscores, hidden.T)
    dhidden = np.dot(U.T, dscores)
    dz = (1 - hidden**2) * dhidden  # tanh derivative
    self.grads.b1 += dz.reshape(H)
    self.grads.W += np.dot(dz, X.T)

    # Push input vectors around
    dX = np.dot(W.T, dz).reshape(self.windowsize, self.D)
    self.sgrads.L[window] = dX

    # Regularization
    loss += 0.5 * self.lreg * (np.sum(W**2) + np.sum(b1**2)
                               + np.sum(U**2) + np.sum(b2**2))
    self.grads.W += self.lreg*W
    self.grads.b1 += self.lreg*b1
    self.grads.U += self.lreg*U
    self.grads.b2 += self.lreg*b2
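# A generic finite-difference checker (independent of the class above) that
# can be used to sanity-check gradients like the ones accumulated in
# _acc_grads; f maps a parameter array to a scalar loss.
import numpy as np

def numeric_grad(f, x, eps=1e-5):
    g = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        i = it.multi_index
        old = x[i]
        x[i] = old + eps
        fp = f(x)
        x[i] = old - eps
        fm = f(x)
        x[i] = old
        g[i] = (fp - fm) / (2 * eps)   # central difference
        it.iternext()
    return g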
def forward_pass(self, x):
    z1 = self.params.W.dot(x) + self.params.b1
    h1 = tanh(z1)
    z2 = np.dot(self.params.U, h1) + self.params.b2
    h2 = tanh(z2)
    z3 = np.dot(self.params.G, h2) + self.params.b3
    y_hat = softmax(z3)
    return y_hat
def predict(self, node, correct=[], guess=[]):
    cost = total = 0.0
    # this is exactly the same setup as forwardProp in rnn.py
    if node.isLeaf == True:
        node.fprop = True
        node.hActs1 = self.L[:, node.word]
        #node.hActs2 = self.ReLU(self.W2.dot(node.hActs1)+self.b2)
        tmp = node.hActs1 * self.dropoutP
        tmpMaxout = np.zeros((self.maxoutK, self.middleDim))
        for i in range(self.maxoutK):
            tmpMaxout[i] = self.W2[i].dot(tmp) + self.b2[i]
        (node.hActs2, node.idx) = self.maxout(tmpMaxout)
        # score the maxout features, with weights scaled by the dropout
        # keep-probability for inference
        node.probs = softmax((self.Ws*self.dropoutP).dot(node.hActs2) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
        return cost, 1

    # recurse with the same inference-time path
    c1, t1 = self.predict(node.left, correct, guess)
    c2, t2 = self.predict(node.right, correct, guess)
    if node.left.fprop and node.right.fprop:
        node.fprop = True
        h = np.hstack([node.left.hActs1, node.right.hActs1])
        node.hActs1 = self.ReLU(self.W1.dot(h) + self.b1)
        #node.hActs2 = self.ReLU(self.W2.dot(node.hActs1)+self.b2)
        tmp = node.hActs1 * self.dropoutP
        tmpMaxout = np.zeros((self.maxoutK, self.middleDim))
        for i in range(self.maxoutK):
            tmpMaxout[i] = self.W2[i].dot(tmp) + self.b2[i]
        (node.hActs2, node.idx) = self.maxout(tmpMaxout)
        node.probs = softmax((self.Ws*self.dropoutP).dot(node.hActs2) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
    cost += c1
    cost += c2
    total += t1
    total += t2
    return cost, total + 1
def compute_loss(self, x, label):
    """
    Compute the cost function for a single example.
    """
    #import ipdb; ipdb.set_trace()
    ##
    # Forward propagation
    p = softmax(self.params.W.dot(x) + self.params.b)
    J = -1 * np.log(p[label])  # cross-entropy loss
    Jreg = (self.lreg / 2.0) * np.sum(self.params.W**2.0)
    return J + Jreg
def compute_loss(self, idx, label):
    """
    Compute the cost function for a single example.
    """
    ##
    # Forward propagation
    x = self.sparams.L[idx]
    p = softmax(self.params.W.dot(x) + self.params.b)
    J = -1 * log(p[label])  # cross-entropy loss
    Jreg = (self.lreg / 2.0) * sum(self.params.W**2.0)
    return J + Jreg
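# For reference, a sketch (an assumption, mirroring compute_loss above) of
# the matching gradient accumulation for this softmax-regression model.
def _acc_grads_sketch(self, idx, label):
    x = self.sparams.L[idx]
    p = softmax(self.params.W.dot(x) + self.params.b)
    p[label] -= 1.0                       # dJ/dz for softmax + cross-entropy
    self.grads.W += outer(p, x) + self.lreg * self.params.W
    self.grads.b += p
    self.sgrads.L[idx] = self.params.W.T.dot(p)  # gradient into the embedding row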