def train(self, X, y, learning_rate=1e-3, reg=1e-4, decay_rate=1.00,
          opt='sgd', n_iters=1000, batch_size=200, verbose=True):
    lr = learning_rate
    self.reg = reg
    for i in range(n_iters):
        # Sample a minibatch and run the forward pass:
        # dense -> relu -> dense -> sigmoid -> binary cross-entropy.
        ids = np.random.choice(X.shape[0], batch_size)
        layer1, l1cache = layers.dense_forward(X[ids], self.W1, self.b1)
        layer2, l2cache = layers.non_linearity_forward(layer1, hiddenLayer='relu')
        layer3, l3cache = layers.dense_forward(layer2, self.W2, self.b2)
        layer4, l4cache = layers.non_linearity_forward(layer3, hiddenLayer='sigmoid')
        loss, l5cache = layers.binary_cross_entropy_loss_forward(layer4, y[ids])

        # Regularization penalty on the hidden activations (not the weights),
        # normalized by the batch size to match the gradient term below.
        loss += 0.5 * self.reg * np.sum(layer2 * layer2) / batch_size

        # Backward pass, mirroring the forward pass in reverse.
        dlayer5 = 1.0
        dlayer4 = layers.binary_cross_entropy_loss_backward(dlayer5, l5cache)
        dlayer3 = layers.non_linearity_backward(dlayer4, l4cache, hiddenLayer='sigmoid')
        dlayer2, dW2, db2 = layers.dense_backward(dlayer3, l3cache)
        # Gradient of the activation penalty above.
        dlayer2 += self.reg * layer2 / batch_size
        dlayer1 = layers.non_linearity_backward(dlayer2, l2cache, hiddenLayer='relu')
        _, dW1, db1 = layers.dense_backward(dlayer1, l1cache)

        # Decay the learning rate on a fixed schedule, independent of verbosity.
        if i % 500 == 0:
            lr *= decay_rate
            if verbose:
                print("Iteration %d, loss = %g" % (i, loss))

        self.params, self.W1 = optimizers.optimize(self.params, self.W1, dW1, lr=lr, name='W1', opt=opt)
        self.params, self.b1 = optimizers.optimize(self.params, self.b1, db1, lr=lr, name='b1', opt=opt)
        self.params, self.W2 = optimizers.optimize(self.params, self.W2, dW2, lr=lr, name='W2', opt=opt)
        self.params, self.b2 = optimizers.optimize(self.params, self.b2, db2, lr=lr, name='b2', opt=opt)
        self.loss_history.append(loss)
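# The training loop above assumes a `layers` module in which every forward
# function returns an (out, cache) pair and every backward function takes
# (dout, cache). A minimal sketch of the dense pair under that convention;
# this illustrates the assumed contract, not necessarily the repo's exact
# implementation:
import numpy as np

def dense_forward(x, w, b):
    # Affine transform: out = x @ w + b for a 2-D batch x.
    out = x.dot(w) + b
    return out, (x, w)

def dense_backward(dout, cache):
    # Chain rule through the affine transform above.
    x, w = cache
    dx = dout.dot(w.T)
    dw = x.T.dot(dout)
    db = np.sum(dout, axis=0)
    return dx, dw, db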
def train(self, X, y, X_val=None, y_val=None, learning_rate=1e-2, reg=1e-4,
          decay_rate=0.95, opt='sgd', n_iters=5000, batch_size=200, verbose=1):
    lr = learning_rate
    for i in range(n_iters):
        # Sample a minibatch and run the forward pass:
        # dense -> non-linearity -> dense -> non-linearity -> dense -> softmax.
        ids = np.random.choice(X.shape[0], batch_size)
        l1out, l1cache = layers.dense_forward(X[ids], self.W1, self.b1)
        l2out, l2cache = layers.non_linearity_forward(l1out, self.hiddenLayer)
        l3out, l3cache = layers.dense_forward(l2out, self.W2, self.b2)
        l4out, l4cache = layers.non_linearity_forward(l3out, self.hiddenLayer)
        l5out, l5cache = layers.dense_forward(l4out, self.W3, self.b3)
        loss, l6cache = layers.softmax_loss_forward(l5out, y[ids])

        # L2 regularization penalty on the weights.
        loss += 0.5 * reg * (np.sum(self.W1**2) + np.sum(self.W2**2) + np.sum(self.W3**2))
        self.loss_history.append(loss)

        # Decay the learning rate on a fixed schedule; reporting is separate
        # so that verbosity does not change the training dynamics.
        if i % 500 == 0:
            lr *= decay_rate
            if verbose:
                print("Iteration %d, loss = %f" % (i, loss))
                if X_val is not None and y_val is not None:
                    print("Validation Accuracy: %f" % self.accuracy(X_val, y_val))

        # Backward pass, mirroring the forward pass in reverse.
        dlayer6 = 1.0
        dlayer5 = layers.softmax_loss_backward(dlayer6, l6cache)
        dlayer4, dW3, db3 = layers.dense_backward(dlayer5, l5cache)
        dlayer3 = layers.non_linearity_backward(dlayer4, l4cache, self.hiddenLayer)
        dlayer2, dW2, db2 = layers.dense_backward(dlayer3, l3cache)
        dlayer1 = layers.non_linearity_backward(dlayer2, l2cache, self.hiddenLayer)
        _, dW1, db1 = layers.dense_backward(dlayer1, l1cache)

        # Track mean absolute gradients flowing into each hidden layer.
        self.gradientLayer1.append(np.mean(np.abs(dlayer1)))
        self.gradientLayer2.append(np.mean(np.abs(dlayer3)))

        # Gradients due to regularization must be added to the weight
        # gradients *before* the update step, not to the weights afterwards.
        dW1 += reg * self.W1
        dW2 += reg * self.W2
        dW3 += reg * self.W3

        self.params, self.W1 = optimizers.optimize(self.params, self.W1, dW1, lr=lr, name='W1', opt=opt)
        self.params, self.b1 = optimizers.optimize(self.params, self.b1, db1, lr=lr, name='b1', opt=opt)
        self.params, self.W2 = optimizers.optimize(self.params, self.W2, dW2, lr=lr, name='W2', opt=opt)
        self.params, self.b2 = optimizers.optimize(self.params, self.b2, db2, lr=lr, name='b2', opt=opt)
        self.params, self.W3 = optimizers.optimize(self.params, self.W3, dW3, lr=lr, name='W3', opt=opt)
        self.params, self.b3 = optimizers.optimize(self.params, self.b3, db3, lr=lr, name='b3', opt=opt)
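# The softmax loss layer used above follows the same (out, cache) convention.
# A minimal sketch, assuming integer class labels y and a numerically stable
# softmax; an illustration, not necessarily the repo's exact implementation:
import numpy as np

def softmax_loss_forward(scores, y):
    # Shift scores per row before exponentiating for numerical stability.
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    probs = np.exp(shifted)
    probs /= np.sum(probs, axis=1, keepdims=True)
    N = scores.shape[0]
    loss = -np.mean(np.log(probs[np.arange(N), y]))
    return loss, (probs, y)

def softmax_loss_backward(dout, cache):
    # d(loss)/d(scores) = (probs - one_hot(y)) / N, scaled by upstream dout.
    probs, y = cache
    N = probs.shape[0]
    dscores = probs.copy()
    dscores[np.arange(N), y] -= 1.0
    return dout * dscores / N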
def test_denselayer():
    x = np.random.randn(10, 6)
    w = np.random.randn(6, 5)
    b = np.random.randn(5)
    dout = np.random.randn(10, 5)

    dx_num = eval_numerical_gradient_array(
        lambda x: layers.dense_forward(x, w, b)[0], x, dout)
    dw_num = eval_numerical_gradient_array(
        lambda w: layers.dense_forward(x, w, b)[0], w, dout)
    db_num = eval_numerical_gradient_array(
        lambda b: layers.dense_forward(x, w, b)[0], b, dout)

    _, cache = layers.dense_forward(x, w, b)
    dx, dw, db = layers.dense_backward(dout, cache)

    # The error should be around 1e-10
    print('Testing dense layers:')
    print('dx error: ', rel_error(dx_num, dx))
    print('dw error: ', rel_error(dw_num, dw))
    print('db error: ', rel_error(db_num, db))
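# The test relies on two helpers. If they are not already defined in the
# repo, they can be sketched along standard centered-difference lines (these
# are assumptions about their behavior, inferred from how the test calls them):
import numpy as np

def rel_error(x, y):
    # Maximum relative error, guarded against division by zero.
    return np.max(np.abs(x - y) / np.maximum(1e-8, np.abs(x) + np.abs(y)))

def eval_numerical_gradient_array(f, x, df, h=1e-5):
    # Numerical gradient of f at x, contracted with the upstream gradient df
    # (same shape as f(x)), using centered finite differences.
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        oldval = x[ix]
        x[ix] = oldval + h
        pos = f(x).copy()
        x[ix] = oldval - h
        neg = f(x).copy()
        x[ix] = oldval
        grad[ix] = np.sum((pos - neg) * df) / (2 * h)
        it.iternext()
    return grad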
def train(self, X, learning_rate=1e-2, batch_size=100, nb_epochs=1):
    """
    Training based on the CBOW model, using a sampled softmax over the
    batch's target words as a negative-sampling-style approximation.

    :param nb_epochs: number of passes over the corpus
    :param batch_size: number of target words trained on per iteration
    :param learning_rate: SGD step size
    :param X: list of sentences used for training
    """
    N = len(X)
    start_index = self.word_to_index[self.start_token]
    end_index = self.word_to_index[self.end_token]
    unknown_index = self.word_to_index[self.unknown_token]

    # Tokenize the corpus into one flat list of word ids, mapping
    # out-of-vocabulary words to the unknown token.
    id_x = []
    for i in range(N):
        sentence = nltk.word_tokenize(X[i])
        if len(sentence) == 0:
            continue
        id_x.append(start_index)
        for word in sentence:
            if word in self.word_to_index:
                id_x.append(self.word_to_index[word])
            else:
                id_x.append(unknown_index)
        id_x.append(end_index)

    corpus_size = len(id_x)
    print("Corpus size: %d" % corpus_size)
    n_iters = corpus_size // batch_size
    for epoch in range(nb_epochs):
        for itr in range(n_iters):
            # Sample target positions; each training example is the mean of
            # the input vectors of the words in the surrounding window.
            batch = np.random.randint(corpus_size, size=batch_size)
            trX = np.zeros([batch_size, self.size])
            trY = np.zeros([batch_size], dtype=np.int32)
            context = []
            ids_to_update = np.zeros([batch_size], dtype=np.int32)
            for j, pos in enumerate(batch):
                context_ids = (id_x[max(0, pos - self.window):pos] +
                               id_x[pos + 1:min(pos + 1 + self.window, corpus_size)])
                # context stays a plain list of lists, since windows near the
                # corpus edges have fewer words.
                context.append(context_ids)
                trX[j] = np.mean(self.W_inp[np.array(context_ids), :], axis=0)
                # The softmax is taken over the batch's own target words, so
                # the correct class for example j is position j.
                trY[j] = j
                ids_to_update[j] = id_x[pos]
            # trY is excluded from the shuffle: after permuting trX and
            # ids_to_update consistently, the correct class for row j is
            # still j, so the labels must stay as the identity.
            trX, ids_to_update, context = shuffle(
                trX, ids_to_update, context, random_state=0)

            # Forward pass through the sampled output weight rows.
            W = self.W_out[ids_to_update]
            b = np.zeros([batch_size])
            layer1, l1cache = layers.dense_forward(trX, W.T, b)
            layer2, l2cache = layers.sigmoid_forward(layer1)
            loss, l3cache = layers.softmax_loss_forward(layer2, trY)
            self.loss_history.append(loss)

            # Backward pass and manual SGD updates on both embedding matrices.
            dlayer3 = 1.0
            dlayer2 = layers.softmax_loss_backward(dlayer3, l3cache)
            dlayer1 = layers.sigmoid_backward(dlayer2, l2cache)
            dx_inp, dW_tmp, db = layers.dense_backward(dlayer1, l1cache)
            dW = dW_tmp.T
            for j in range(batch_size):
                self.W_inp[context[j], :] -= learning_rate * dx_inp[j] / len(context[j])
            self.W_out[ids_to_update, :] -= learning_rate * dW
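# The CBOW loop assumes sigmoid_forward/sigmoid_backward following the same
# (out, cache) convention as the other layers. A minimal sketch; an
# illustration of the assumed contract, not necessarily the repo's version:
import numpy as np

def sigmoid_forward(x):
    out = 1.0 / (1.0 + np.exp(-x))
    # The output itself is all the backward pass needs, so cache it.
    return out, out

def sigmoid_backward(dout, cache):
    out = cache
    # d(sigmoid)/dx = sigmoid(x) * (1 - sigmoid(x)).
    return dout * out * (1.0 - out)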
def train(self, X, y, X_val=None, y_val=None, learning_rate=1e-2, reg=1e-4,
          decay_rate=0.95, opt='sgd', n_iters=5000, batch_size=200, verbose=1):
    lr = learning_rate
    for i in range(n_iters):
        W1, b1 = self.weights['W1'], self.weights['b1']
        W2, b2 = self.weights['W2'], self.weights['b2']
        W3, b3 = self.weights['W3'], self.weights['b3']

        # Sample a minibatch and run the forward pass:
        # dense -> non-linearity -> dense -> non-linearity -> dense -> softmax.
        ids = np.random.choice(X.shape[0], batch_size)
        l1out, l1cache = layers.dense_forward(X[ids], W1, b1)
        l2out, l2cache = layers.non_linearity_forward(l1out, self.non_linearity)
        l3out, l3cache = layers.dense_forward(l2out, W2, b2)
        l4out, l4cache = layers.non_linearity_forward(l3out, self.non_linearity)
        l5out, l5cache = layers.dense_forward(l4out, W3, b3)
        loss, l6cache = layers.softmax_loss_forward(l5out, y[ids])

        # L2 regularization penalty on the weights.
        loss += 0.5 * reg * (np.sum(W1**2) + np.sum(W2**2) + np.sum(W3**2))
        self.loss_history.append(loss)

        # Decay the learning rate on a fixed schedule; reporting is separate
        # so that verbosity does not change the training dynamics.
        if i % 500 == 0:
            lr *= decay_rate
            if verbose:
                print("Iteration %d, loss = %f" % (i, loss))
                if X_val is not None and y_val is not None:
                    print("Validation Accuracy: %f" % self.accuracy(X_val, y_val))

        # Backward pass, mirroring the forward pass in reverse.
        dlayer6 = 1.0
        dlayer5 = layers.softmax_loss_backward(dlayer6, l6cache)
        dlayer4, dW3, db3 = layers.dense_backward(dlayer5, l5cache)
        dlayer3 = layers.non_linearity_backward(dlayer4, l4cache, self.non_linearity)
        dlayer2, dW2, db2 = layers.dense_backward(dlayer3, l3cache)
        dlayer1 = layers.non_linearity_backward(dlayer2, l2cache, self.non_linearity)
        _, dW1, db1 = layers.dense_backward(dlayer1, l1cache)

        # Track mean absolute gradients flowing into each hidden layer.
        self.gradientLayer1.append(np.mean(np.abs(dlayer1)))
        self.gradientLayer2.append(np.mean(np.abs(dlayer3)))

        # Gradients due to regularization.
        dW1 += reg * W1
        dW2 += reg * W2
        dW3 += reg * W3

        self.params, W1 = optimizers.optimize(self.params, W1, dW1, lr=lr, name='W1', opt=opt)
        self.params, b1 = optimizers.optimize(self.params, b1, db1, lr=lr, name='b1', opt=opt)
        self.params, W2 = optimizers.optimize(self.params, W2, dW2, lr=lr, name='W2', opt=opt)
        self.params, b2 = optimizers.optimize(self.params, b2, db2, lr=lr, name='b2', opt=opt)
        self.params, W3 = optimizers.optimize(self.params, W3, dW3, lr=lr, name='W3', opt=opt)
        self.params, b3 = optimizers.optimize(self.params, b3, db3, lr=lr, name='b3', opt=opt)

        self.weights['W1'], self.weights['b1'] = W1, b1
        self.weights['W2'], self.weights['b2'] = W2, b2
        self.weights['W3'], self.weights['b3'] = W3, b3
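# All of the update calls above assume an `optimizers.optimize` helper that
# keeps per-parameter optimizer state in `params`, keyed by `name`, and
# returns the updated (params, weight) pair. A minimal sketch supporting
# plain SGD and SGD with momentum; this illustrates the assumed contract,
# not necessarily the repo's actual optimizer set:
import numpy as np

def optimize(params, w, dw, lr=1e-2, name='W', opt='sgd', momentum=0.9):
    if opt == 'sgd':
        w = w - lr * dw
    elif opt == 'momentum':
        # The velocity buffer persists across iterations via the params dict.
        v = params.get('v_' + name, np.zeros_like(w))
        v = momentum * v - lr * dw
        w = w + v
        params['v_' + name] = v
    else:
        raise ValueError('unknown optimizer: %s' % opt)
    return params, w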
def train(self, X, y, learning_rate=1e-2, opt='sgd', n_iters=5000,
          batch_size=200, verbose=1):
    lr = learning_rate
    N, T, D = X.shape
    for i in range(n_iters):
        # Forward pass: run the RNN over the full sequence, then classify
        # from the hidden state at the final time step.
        ids = np.random.choice(X.shape[0], batch_size)
        h0 = np.zeros((batch_size, self.hidden_dim))
        layer1, l1cache = rnn_layers.rnn_forward(X[ids], h0, self.Wx, self.Wh,
                                                 self.b, self.non_liniearity)
        final_layer = layer1[:, T - 1, :]
        layer2, l2cache = layers.dense_forward(final_layer, self.W1, self.b1)
        loss, l3cache = layers.softmax_loss_forward(layer2, y[ids])
        self.loss_history.append(loss)
        if verbose == 1 and i % 500 == 0:
            print('Iteration %d: loss %g' % (i, loss))

        # Backward pass: only the last time step receives a gradient from the
        # classifier; rnn_backward propagates it through all earlier steps.
        dlayer3 = 1.0
        dlayer2 = layers.softmax_loss_backward(dlayer3, l3cache)
        dlayer1, dW1, db1 = layers.dense_backward(dlayer2, l2cache)
        dh = np.zeros((batch_size, T, self.hidden_dim))
        dh[:, T - 1, :] = dlayer1
        _, _, dWx, dWh, db = rnn_layers.rnn_backward(dh, l1cache)

        self.params, self.Wx = optimizers.optimize(self.params, self.Wx, dWx, lr=lr, name='Wx', opt=opt)
        self.params, self.Wh = optimizers.optimize(self.params, self.Wh, dWh, lr=lr, name='Wh', opt=opt)
        self.params, self.b = optimizers.optimize(self.params, self.b, db, lr=lr, name='b', opt=opt)
        self.params, self.W1 = optimizers.optimize(self.params, self.W1, dW1, lr=lr, name='W1', opt=opt)
        self.params, self.b1 = optimizers.optimize(self.params, self.b1, db1, lr=lr, name='b1', opt=opt)
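# A hypothetical companion method (not in the original) showing how the same
# forward pass yields predictions at test time; `predict` and its placement
# on the class are assumptions:
def predict(self, X):
    N, T, D = X.shape
    h0 = np.zeros((N, self.hidden_dim))
    # Run the RNN over the full sequence and classify from the last step,
    # mirroring the forward pass in train() above.
    h, _ = rnn_layers.rnn_forward(X, h0, self.Wx, self.Wh, self.b,
                                  self.non_liniearity)
    scores, _ = layers.dense_forward(h[:, T - 1, :], self.W1, self.b1)
    return np.argmax(scores, axis=1)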