def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point
    (window, label) of the format

    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads,
    in order for gradient_check and training to work.

    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####

    # Concatenate the dense word vectors of the three window words into a
    # single (1 x windowsize*wvdim) input row.
    window_vecs = expand_dims(self.sparams.L[window, :].flatten(), axis=0)

    ##
    # Forward propagation
    a1 = self.params.W.dot(window_vecs.T).T + self.params.b1
    s = sigmoid(2.0 * a1)
    h = 2.0 * s - 1.0  # tanh(a1), via tanh(z) = 2*sigmoid(2z) - 1
    a2 = self.params.U.dot(h.T).T + self.params.b2
    y_hat = softmax(a2)

    ##
    # Backpropagation
    t = zeros(y_hat.shape)
    t[:, label] = 1
    delta_out = y_hat - t  # dJ/da2 for softmax + cross-entropy

    self.grads.U += h.T.dot(delta_out).T + self.lreg * self.params.U
    self.grads.b2 += delta_out.flatten()

    # d tanh(a1)/da1 = 1 - tanh(a1)^2 = 4 * s * (1 - s)
    delta_hidden = delta_out.dot(self.params.U) * 4.0 * sigmoid_grad(s)
    self.grads.W += delta_hidden.T.dot(window_vecs) + self.lreg * self.params.W
    self.grads.b1 += delta_hidden.flatten()

    # Gradient w.r.t. the input word vectors: split the concatenated
    # (windowsize*wvdim,) gradient into one sparse update per window word.
    grad_xs = delta_hidden.dot(self.params.W).flatten()
    wvdim = self.sparams.L.shape[1]
    for j, idx in enumerate(window):
        self.sgrads.L[idx] = grad_xs[j * wvdim:(j + 1) * wvdim]
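# --------------------------------------------------------------------------
# Sanity check (illustrative, not part of the solution above): a minimal,
# self-contained numpy sketch that reproduces the same tanh-via-sigmoid
# backprop math on toy dimensions and compares the analytic dJ/dW against a
# centered finite-difference estimate. The local np_sigmoid/np_softmax
# helpers and all dimensions below are assumptions standing in for the
# nn.math utilities and model sizes the class relies on.
# --------------------------------------------------------------------------
import numpy as np

def np_sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def np_softmax(z):
    e = np.exp(z - np.max(z))
    return e / e.sum()

# Hypothetical toy dimensions: 3-word window, 4-d word vectors, 5 hidden units
wvdim, windowsize, hdim, nclass = 4, 3, 5, 5
rng = np.random.RandomState(0)
W = rng.randn(hdim, windowsize * wvdim) * 0.1
b1 = rng.randn(hdim) * 0.1
U = rng.randn(nclass, hdim) * 0.1
b2 = rng.randn(nclass) * 0.1
x = rng.randn(windowsize * wvdim)  # concatenated window vectors
label = 2

def loss(W_):
    # Forward pass only, used for the numerical gradient
    h = np.tanh(W_.dot(x) + b1)
    return -np.log(np_softmax(U.dot(h) + b2)[label])

# Analytic gradient, mirroring the deltas in _acc_grads (regularization omitted)
a1 = W.dot(x) + b1
s = np_sigmoid(2.0 * a1)
h = 2.0 * s - 1.0                      # tanh(a1)
y_hat = np_softmax(U.dot(h) + b2)
delta_out = y_hat.copy()
delta_out[label] -= 1.0
delta_hidden = delta_out.dot(U) * 4.0 * s * (1.0 - s)
gradW = np.outer(delta_hidden, x)

# Centered finite differences on a few entries of W
eps = 1e-6
for (i, j) in [(0, 0), (2, 5), (4, 11)]:
    Wp = W.copy(); Wp[i, j] += eps
    Wm = W.copy(); Wm[i, j] -= eps
    num = (loss(Wp) - loss(Wm)) / (2.0 * eps)
    print "dJ/dW[%d,%d]: analytic % .6f  numeric % .6f" % (i, j, gradW[i, j], num)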
def _acc_grads(self, xs, ys):
    """
    Accumulate gradients, given a pair of training sequences:
    xs = [<indices>] # input words
    ys = [<indices>] # output words (to predict)

    Your code should update self.grads and self.sgrads,
    in order for gradient_check and training to work.
    So, for example:
    self.grads.H += (your gradient dJ/dH)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # update row

    Per the handout, you should:
        - make predictions by running forward in time
          through the entire input sequence
        - for *each* output word in ys, compute the
          gradients with respect to the cross-entropy
          loss for that output word
        - run backpropagation-through-time for self.bptt
          timesteps, storing grads in self.grads (for H, U)
          and self.sgrads (for L)

    You'll want to store your predictions \hat{y}(t)
    and the hidden layer values h(t) as you run forward,
    so that you can access them during backpropagation.

    At time 0, you should initialize the hidden layer to
    be a vector of zeros.
    """

    # Expect xs as list of indices
    ns = len(xs)

    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns + 1, self.hdim))
    # predicted probas
    ps = zeros((ns, self.vdim))

    #### YOUR CODE HERE ####

    ##
    # Forward propagation
    for step in xrange(ns):
        # h(t) = sigmoid(H h(t-1) + L[x(t)]); hs[-1] is the zero initial state
        hs[step] = sigmoid(self.params.H.dot(hs[step - 1]) + self.sparams.L[xs[step]])
        # yhat(t) = softmax(U h(t))
        ps[step] = softmax(self.params.U.dot(hs[step]))

    ##
    # Backward propagation through time
    for step in xrange(ns - 1, -1, -1):
        # Output-layer error for the cross-entropy loss at this timestep
        t = zeros(ps[step].shape)
        t[ys[step]] = 1
        delta_out = ps[step] - t
        self.grads.U += outer(delta_out, hs[step])

        # Error at the hidden layer; sigmoid_grad(h) = h * (1 - h)
        delta_hidden = delta_out.dot(self.params.U) * sigmoid_grad(hs[step])

        # Propagate the error back for at most self.bptt timesteps
        for step_bp in xrange(step, step - self.bptt - 1, -1):
            if step_bp < 0:
                break
            self.grads.H += outer(delta_hidden, hs[step_bp - 1])
            self.sgrads.L[xs[step_bp]] = delta_hidden
            delta_hidden = delta_hidden.dot(self.params.H) * sigmoid_grad(hs[step_bp - 1])
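# --------------------------------------------------------------------------
# Sanity check (illustrative, not part of the solution above): a minimal,
# self-contained numpy sketch of the same BPTT recurrence on a toy RNNLM,
# comparing the accumulated dJ/dH (with the truncation window spanning the
# whole sequence, so the gradient is exact) against centered finite
# differences. The helper functions, dimensions, and sequences below are
# assumptions, not the assignment's defaults.
# --------------------------------------------------------------------------
import numpy as np

def np_sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def np_softmax(z):
    e = np.exp(z - np.max(z))
    return e / e.sum()

hdim, vdim = 4, 6
rng = np.random.RandomState(0)
H = rng.randn(hdim, hdim) * 0.1
U = rng.randn(vdim, hdim) * 0.1
L = rng.randn(vdim, hdim) * 0.1        # word vectors, one hdim-d row per word
xs = [1, 3, 0, 5]
ys = [3, 0, 5, 2]
ns = len(xs)

def total_loss(H_):
    # Sum of per-timestep cross-entropy losses, used for the numerical gradient
    h = np.zeros(hdim)
    J = 0.0
    for t in xrange(ns):
        h = np_sigmoid(H_.dot(h) + L[xs[t]])
        J += -np.log(np_softmax(U.dot(h))[ys[t]])
    return J

# Forward pass, storing hidden states as in _acc_grads
hs = np.zeros((ns + 1, hdim))          # hs[-1] is the zero initial state
ps = np.zeros((ns, vdim))
for t in xrange(ns):
    hs[t] = np_sigmoid(H.dot(hs[t - 1]) + L[xs[t]])
    ps[t] = np_softmax(U.dot(hs[t]))

# BPTT, unrolled all the way back to t = 0 for every output word
gradH = np.zeros_like(H)
for t in xrange(ns - 1, -1, -1):
    delta_out = ps[t].copy()
    delta_out[ys[t]] -= 1.0
    delta_hidden = delta_out.dot(U) * hs[t] * (1.0 - hs[t])
    for tb in xrange(t, -1, -1):
        gradH += np.outer(delta_hidden, hs[tb - 1])
        delta_hidden = delta_hidden.dot(H) * hs[tb - 1] * (1.0 - hs[tb - 1])

# Compare a few entries of dJ/dH against centered finite differences
eps = 1e-6
for (i, j) in [(0, 0), (1, 3), (3, 2)]:
    Hp = H.copy(); Hp[i, j] += eps
    Hm = H.copy(); Hm[i, j] -= eps
    num = (total_loss(Hp) - total_loss(Hm)) / (2.0 * eps)
    print "dJ/dH[%d,%d]: analytic % .6f  numeric % .6f" % (i, j, gradH[i, j], num)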