def fit(self, X, Y, learning_rate=1e-6, reg=0., epochs=120000, show_fig=False):
    X, Y = shuffle(X, Y)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    X, Y = X[:-1000], Y[:-1000]

    N, D = X.shape
    self.W = np.random.randn(D) / np.sqrt(D)
    self.b = 0

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation and cost calculation
        pY = self.forward(X)

        # gradient descent step
        self.W -= learning_rate*(X.T.dot(pY - Y) + reg*self.W)
        self.b -= learning_rate*((pY - Y).sum() + reg*self.b)

        if i % 20 == 0:
            pYvalid = self.forward(Xvalid)
            c = sigmoid_cost(Yvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.round(pYvalid))
            print("i:", i, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best_validation_error:", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, Y, learning_rate=1e-7, reg=0., epochs=10000, show_fig=False): X, Y = shuffle(X, Y) Xvalid, Yvalid = X[-1000:], Y[-1000:] Tvalid = y2indicator(Yvalid) X, Y = X[:-1000], Y[:-1000] N, D = X.shape K = len(set(Y)) T = y2indicator(Y) self.W = np.random.randn(D, K) / np.sqrt(D) self.b = np.zeros(K) costs = [] best_validation_error = 1 for i in range(epochs): # forward propagation and cost calculation pY = self.forward(X) # gradient descent step self.W -= learning_rate*(X.T.dot(pY - T) + reg*self.W) self.b -= learning_rate*((pY - T).sum(axis=0) + reg*self.b) if i % 10 == 0: pYvalid = self.forward(Xvalid) c = cost(Tvalid, pYvalid) costs.append(c) e = error_rate(Yvalid, np.argmax(pYvalid, axis=1)) print("i:", i, "cost:", c, "error:", e) if e < best_validation_error: best_validation_error = e print("best_validation_error:", best_validation_error) if show_fig: plt.plot(costs) plt.show()
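# The fit() methods in this file lean on a handful of helpers (y2indicator,
# error_rate, sigmoid_cost, cost) that live in the course's util module rather
# than here. The definitions below are a minimal sketch of standard
# equivalents, not the original implementations; note the argument order of
# the cost functions varies between the scripts in this file.
import numpy as np

def y2indicator(y):
    # convert an N-vector of class labels into an N x K one-hot matrix
    N = len(y)
    K = len(set(y))
    ind = np.zeros((N, K))
    ind[np.arange(N), y.astype(np.int32)] = 1
    return ind

def error_rate(targets, predictions):
    # fraction of misclassified samples (symmetric in its arguments)
    return np.mean(targets != predictions)

def sigmoid_cost(T, Y):
    # binary cross-entropy, used by the binary classifiers
    return -(T*np.log(Y) + (1 - T)*np.log(1 - Y)).sum()

def cost(T, Y):
    # multiclass cross-entropy over one-hot targets T and softmax outputs Y
    return -(T * np.log(Y)).sum()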
def fit(self, X, Y, learning_rate=1e-6, reg=1e-6, epochs=10000, show_fig=False): X, Y = shuffle(X, Y) Xvalid, Yvalid = X[-1000:], Y[-1000:] # Tvalid = y2indicator(Yvalid) X, Y = X[:-1000], Y[:-1000] N, D = X.shape K = len(set(Y)) T = y2indicator(Y) self.W1 = np.random.randn(D, self.M) / np.sqrt(D) self.b1 = np.zeros(self.M) self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M) self.b2 = np.zeros(K) costs = [] best_validation_error = 1 for i in range(epochs): # forward propagation and cost calculation pY, Z = self.forward(X) # gradient descent step pY_T = pY - T self.W2 -= learning_rate*(Z.T.dot(pY_T) + reg*self.W2) self.b2 -= learning_rate*(pY_T.sum(axis=0) + reg*self.b2) # dZ = pY_T.dot(self.W2.T) * (Z > 0) # relu dZ = pY_T.dot(self.W2.T) * (1 - Z*Z) # tanh self.W1 -= learning_rate*(X.T.dot(dZ) + reg*self.W1) self.b1 -= learning_rate*(dZ.sum(axis=0) + reg*self.b1) if i % 10 == 0: pYvalid, _ = self.forward(Xvalid) c = cost2(Yvalid, pYvalid) costs.append(c) e = error_rate(Yvalid, np.argmax(pYvalid, axis=1)) print("i:", i, "cost:", c, "error:", e) if e < best_validation_error: best_validation_error = e print("best_validation_error:", best_validation_error) if show_fig: plt.plot(costs) plt.show()
def fit(self, X, Y, learning_rate=5e-6, reg=1.0, epochs=10000, show_fig=False):
    X, Y = shuffle(X, Y)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    X, Y = X[:-1000], Y[:-1000]

    N, D = X.shape
    self.W1 = np.random.randn(D, self.M) / np.sqrt(D + self.M)
    self.b1 = np.zeros(self.M)
    self.W2 = np.random.randn(self.M) / np.sqrt(self.M)
    self.b2 = 0

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation and cost calculation
        pY, Z = self.forward(X)

        # gradient descent step
        pY_Y = pY - Y
        self.W2 -= learning_rate*(Z.T.dot(pY_Y) + reg*self.W2)
        self.b2 -= learning_rate*(pY_Y.sum() + reg*self.b2)

        # dZ = np.outer(pY_Y, self.W2) * (Z > 0)  # relu
        dZ = np.outer(pY_Y, self.W2) * (1 - Z*Z)  # tanh
        self.W1 -= learning_rate*(X.T.dot(dZ) + reg*self.W1)
        self.b1 -= learning_rate*(np.sum(dZ, axis=0) + reg*self.b1)

        if i % 20 == 0:
            pYvalid, _ = self.forward(Xvalid)
            c = sigmoid_cost(Yvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.round(pYvalid))
            print("i:", i, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best_validation_error:", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, Y, Xtest, Ytest, pretrain=True, epochs=1, batch_sz=100): N = len(X) # greedy layer-wise training of autoencoders pretrain_epochs = 1 if not pretrain: pretrain_epochs = 0 current_input = X for ae in self.hidden_layers: ae.fit(current_input, epochs=pretrain_epochs) # create current_input for the next layer current_input = ae.transform(current_input) n_batches = N // batch_sz costs = [] print("supervised training...") for i in range(epochs): print("epoch:", i) X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j*batch_sz:(j*batch_sz + batch_sz)] Ybatch = Y[j*batch_sz:(j*batch_sz + batch_sz)] self.session.run( self.train_op, feed_dict={self.X: Xbatch, self.Y: Ybatch} ) c, p = self.session.run( (self.cost, self.prediction), feed_dict={self.X: Xtest, self.Y: Ytest }) error = error_rate(p, Ytest) if j % 10 == 0: print("j / n_batches:", j, "/", n_batches, "cost:", c, "error:", error) costs.append(c) plt.plot(costs) plt.show()
def fit(self, X, Y, learning_rate=0.01, mu=0.99, epochs=30, batch_sz=100): # cast to float32 learning_rate = np.float32(learning_rate) mu = np.float32(mu) N, D = X.shape K = len(set(Y)) self.hidden_layers = [] mi = D for mo in self.hidden_layer_sizes: h = HiddenLayer(mi, mo) self.hidden_layers.append(h) mi = mo # initialize logistic regression layer W = init_weights((mo, K)) b = np.zeros(K, dtype=np.float32) self.W = theano.shared(W) self.b = theano.shared(b) self.params = [self.W, self.b] self.allWs = [] for h in self.hidden_layers: self.params += h.params self.allWs.append(h.W) self.allWs.append(self.W) X_in = T.matrix('X_in') targets = T.ivector('Targets') pY = self.forward(X_in) cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets])) prediction = self.predict(X_in) updates = momentum_updates(cost, self.params, mu, learning_rate) train_op = theano.function( inputs=[X_in, targets], outputs=[cost, prediction], updates=updates, ) n_batches = N // batch_sz costs = [] lastWs = [W.get_value() for W in self.allWs] W_changes = [] print("supervised training...") for i in range(epochs): print("epoch:", i) X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] c, p = train_op(Xbatch, Ybatch) if j % 100 == 0: print("j / n_batches:", j, "/", n_batches, "cost:", c, "error:", error_rate(p, Ybatch)) costs.append(c) # log changes in all Ws W_change = [ np.abs(W.get_value() - lastW).mean() for W, lastW in zip(self.allWs, lastWs) ] W_changes.append(W_change) lastWs = [W.get_value() for W in self.allWs] W_changes = np.array(W_changes) plt.subplot(2, 1, 1) for i in range(W_changes.shape[1]): plt.plot(W_changes[:, i], label='layer %s' % i) plt.legend() # plt.show() plt.subplot(2, 1, 2) plt.plot(costs) plt.show()
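# momentum_updates() is called above but not defined in this file. A minimal
# sketch, assuming classical momentum applied to every shared parameter:
import theano
import theano.tensor as T

def momentum_updates(cost, params, mu, learning_rate):
    grads = T.grad(cost, params)
    updates = []
    for p, g in zip(params, grads):
        dp = theano.shared(p.get_value() * 0)  # velocity, same shape/dtype as p
        new_dp = mu*dp - learning_rate*g
        updates.append((dp, new_dp))
        updates.append((p, p + new_dp))
    return updates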
def batch_grad():
    # get data for the train and test sets
    X, Y = get_normalized_data()
    Y_ind = y2indicator(Y)

    batchSz = 500

    # initialize random weights
    N, D = X.shape
    K = len(set(Y))
    M = 300
    W1 = np.random.randn(D, M)
    b1 = np.random.randn(M)
    W2 = np.random.randn(M, K)
    b2 = np.random.randn(K)

    learning_rate = 1e-4
    reg = 0.01
    no_batches = N // batchSz
    print("No. of batches:", no_batches)

    for i in range(300):
        for n in range(no_batches):
            # get current batch
            XBatch = X[n * batchSz:(n * batchSz + batchSz), :]
            YBatch_ind = Y_ind[n * batchSz:(n * batchSz + batchSz), :]

            # forward prop
            pY, Z = forward_relu(XBatch, W1, b1, W2, b2)

            # backprop (gradient ascent on the log-likelihood,
            # so the L2 regularization term is subtracted)
            W2 += learning_rate * (derivative_w2(pY, YBatch_ind, Z) - reg * W2)
            b2 += learning_rate * (derivative_b2(pY, YBatch_ind) - reg * b2)
            W1 += learning_rate * (derivative_w1(pY, YBatch_ind, W2, Z, XBatch) - reg * W1)
            b1 += learning_rate * (derivative_b1(pY, YBatch_ind, W2, Z) - reg * b1)

            if n % 100 == 0:
                YBatch = Y[n * batchSz:n * batchSz + batchSz]
                P = np.argmax(pY, axis=1)
                er = error_rate(P, YBatch)
                c = cost(YBatch_ind, pY)
                print("Loop:", i, n, "Error rate:", er, "Cost:", c)

    pY, Z = forward_relu(X, W1, b1, W2, b2)
    p = np.argmax(pY, axis=1)
    print("Final training error rate:", error_rate(p, Y))

    XTest = get_test_data()
    pY, ZTest = forward_relu(XTest, W1, b1, W2, b2)
    YTest = np.argmax(pY, axis=1)

    # write Kaggle-style predictions to CSV
    with open("test_result.csv", "w") as f:
        f.write("ImageId,Label\n")
        for i in range(YTest.shape[0]):
            f.write(str(i + 1) + "," + str(YTest[i]) + "\n")
def main(): max_iter = 10 print_period = 10 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(M) b2_0 = np.zeros(K) W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() # 1st moment mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 # 2nd moment vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 # hyperparams lr0 = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 1e-8 # 1. Adam loss_adam = [] err_adam = [] t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 # new v vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 # bias correction correction1 = 1 - beta1**t hat_mW1 = mW1 / correction1 hat_mb1 = mb1 / correction1 hat_mW2 = mW2 / correction1 hat_mb2 = mb2 / correction1 correction2 = 1 - beta2**t hat_vW1 = vW1 / correction2 hat_vb1 = vb1 / correction2 hat_vW2 = vW2 / correction2 hat_vb2 = vb2 / correction2 # update t t += 1 # apply updates to the params W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps) b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps) W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps) b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_adam.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_adam.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. 
RMSprop with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() loss_rms = [] err_rms = [] # comparable hyperparameters for fair comparison lr0 = 0.001 mu = 0.9 decay_rate = 0.999 eps = 1e-8 # rmsprop cache cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 # momentum dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps) W2 -= dW2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps) b2 -= db2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps) W1 -= dW1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps) b1 -= db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_rms.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(loss_adam, label='adam') plt.plot(loss_rms, label='rmsprop') plt.legend() plt.show()
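# The Adam section above writes out the same moment update, bias correction and
# parameter step four times (W1, b1, W2, b2). For reference, here is a generic
# single-parameter version of that step (a hypothetical helper, not part of the
# original script; eps sits outside the sqrt as in the Adam paper, while the
# loop above folds it inside the sqrt, which behaves almost identically):
import numpy as np

def adam_step(w, g, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    m = beta1*m + (1 - beta1)*g            # first moment estimate
    v = beta2*v + (1 - beta2)*g*g          # second moment estimate
    m_hat = m / (1 - beta1**t)             # bias-corrected estimates
    v_hat = v / (1 - beta2**t)
    w = w - lr*m_hat / (np.sqrt(v_hat) + eps)
    return w, m, v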
def main(): X, Y, _, _ = get_transformed_data() X = X[:, :300] # normalize X first mu = X.mean(axis=0) std = X.std(axis=0) X = (X - mu) / std print "Performing logistic regression..." Xtrain = X[:-1000, ] Ytrain = Y[:-1000] Xtest = X[-1000:, ] Ytest = Y[-1000:] N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) # 1. full W = np.random.randn(D, 10) / 28 b = np.zeros(10) LL = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in xrange(200): p_y = forward(Xtrain, W, b) W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W) b += lr * (gradb(Ytrain_ind, p_y) - reg * b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL.append(ll) if i % 10 == 0: err = error_rate(p_y_test, Ytest) print "Cost at iteration %d: %.6f" % (i, ll) print "Error rate:", err p_y = forward(Xtest, W, b) print "Final error rate:", error_rate(p_y, Ytest) print "Elapsted time for full GD:", datetime.now() - t0 # 2. stochastic W = np.random.randn(D, 10) / 28 b = np.zeros(10) LL_stochastic = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in xrange( 1): # takes very long since we're computing cost for 41k samples tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for n in xrange(min(N, 500)): # shortcut so it won't take so long... x = tmpX[n, :].reshape(1, D) y = tmpY[n, :].reshape(1, 10) p_y = forward(x, W, b) W += lr * (gradW(y, p_y, x) - reg * W) b += lr * (gradb(y, p_y) - reg * b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_stochastic.append(ll) if n % (N / 2) == 0: err = error_rate(p_y_test, Ytest) print "Cost at iteration %d: %.6f" % (i, ll) print "Error rate:", err p_y = forward(Xtest, W, b) print "Final error rate:", error_rate(p_y, Ytest) print "Elapsted time for SGD:", datetime.now() - t0 # 3. batch W = np.random.randn(D, 10) / 28 b = np.zeros(10) LL_batch = [] lr = 0.0001 reg = 0.01 batch_sz = 500 n_batches = N / batch_sz t0 = datetime.now() for i in xrange(50): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for j in xrange(n_batches): x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :] y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :] p_y = forward(x, W, b) W += lr * (gradW(y, p_y, x) - reg * W) b += lr * (gradb(y, p_y) - reg * b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_batch.append(ll) if j % (n_batches / 2) == 0: err = error_rate(p_y_test, Ytest) print "Cost at iteration %d: %.6f" % (i, ll) print "Error rate:", err p_y = forward(Xtest, W, b) print "Final error rate:", error_rate(p_y, Ytest) print "Elapsted time for batch GD:", datetime.now() - t0 x1 = np.linspace(0, 1, len(LL)) plt.plot(x1, LL, label="full") x2 = np.linspace(0, 1, len(LL_stochastic)) plt.plot(x2, LL_stochastic, label="stochastic") x3 = np.linspace(0, 1, len(LL_batch)) plt.plot(x3, LL_batch, label="batch") plt.legend() plt.show()
            # update t
            t += 1

            # apply updates to the parameters
            W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps)
            W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps)

            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                err = error_rate(pY, Y_test)
                err_adam.append(err)
                print("Error rate:", err)

    pY, _ = forward(X_test, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Y_test))

    # 2. RMSprop with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    loss_rms = []
    err_rms = []
def fit(self, X, Y, Xtest, Ytest, pretrain=True, learning_rate=0.01, mu=0.99, reg=0.1, epochs=1, batch_sz=100):
    # greedy layer-wise training of autoencoders
    pretrain_epochs = 1
    if not pretrain:
        pretrain_epochs = 0

    current_input = X
    for ae in self.hidden_layers:
        ae.fit(current_input, epochs=pretrain_epochs)

        # create current_input for the next layer
        current_input = ae.hidden_op(current_input)

    # initialize logistic regression layer
    N = len(Y)
    K = len(set(Y))
    W0 = init_weights((self.hidden_layers[-1].M, K))
    self.W = theano.shared(W0, "W_logreg")
    self.b = theano.shared(np.zeros(K), "b_logreg")

    self.params = [self.W, self.b]
    for ae in self.hidden_layers:
        self.params += ae.forward_params

    # for momentum
    self.dW = theano.shared(np.zeros(W0.shape), "dW_logreg")
    self.db = theano.shared(np.zeros(K), "db_logreg")
    self.dparams = [self.dW, self.db]
    for ae in self.hidden_layers:
        self.dparams += ae.forward_dparams

    X_in = T.matrix('X_in')
    targets = T.ivector('Targets')
    pY = self.forward(X_in)

    # squared_magnitude = [(p*p).sum() for p in self.params]
    # reg_cost = T.sum(squared_magnitude)
    cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets]))  # + reg*reg_cost
    prediction = self.predict(X_in)
    cost_predict_op = theano.function(
        inputs=[X_in, targets],
        outputs=[cost, prediction],
    )

    # classical momentum: the parameter update uses the old velocity,
    # then the velocity itself is updated
    updates = [
        (p, p + mu*dp - learning_rate*T.grad(cost, p))
        for p, dp in zip(self.params, self.dparams)
    ] + [
        (dp, mu*dp - learning_rate*T.grad(cost, p))
        for p, dp in zip(self.params, self.dparams)
    ]
    train_op = theano.function(
        inputs=[X_in, targets],
        updates=updates,
    )

    n_batches = N // batch_sz
    costs = []
    print("supervised training...")
    for i in range(epochs):
        print("epoch:", i)
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
            Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]
            train_op(Xbatch, Ybatch)
            the_cost, the_prediction = cost_predict_op(Xtest, Ytest)
            error = error_rate(the_prediction, Ytest)
            print("j / n_batches:", j, "/", n_batches, "cost:", the_cost, "error:", error)
            costs.append(the_cost)

    plt.plot(costs)
    plt.show()
def fit(self, X, Y, learning_rate=1e-2, mu=0.99, decay=0.999, reg=1e-3, epochs=10, batch_sz=100, show_fig=False): K = len(set(Y)) # won't work later b/c we turn it into indicator # make a validation set X, Y = shuffle(X, Y) X = X.astype(np.float32) Y = y2indicator(Y).astype(np.float32) # Y = Y.astype(np.int32) Xvalid, Yvalid = X[-1000:], Y[-1000:] Yvalid_flat = np.argmax(Yvalid, axis=1) # for calculating error rate X, Y = X[:-1000], Y[:-1000] # initialize hidden layers N, D = X.shape self.hidden_layers = [] M1 = D count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 W, b = init_weight_and_bias(M1, K) self.W = tf.Variable(W.astype(np.float32)) self.b = tf.Variable(b.astype(np.float32)) # collect params for later use self.params = [self.W, self.b] for h in self.hidden_layers: self.params += h.params # set up theano functions and variables tfX = tf.placeholder(tf.float32, shape=(None, D), name='X') tfT = tf.placeholder(tf.float32, shape=(None, K), name='T') act = self.forward(tfX) rcost = reg * sum([tf.nn.l2_loss(p) for p in self.params]) cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=act, labels=tfT)) + rcost prediction = self.predict(tfX) train_op = tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=mu).minimize(cost) n_batches = N // batch_sz costs = [] init = tf.global_variables_initializer() with tf.Session() as session: session.run(init) for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] session.run(train_op, feed_dict={tfX: Xbatch, tfT: Ybatch}) if j % 20 == 0: c = session.run(cost, feed_dict={ tfX: Xvalid, tfT: Yvalid }) costs.append(c) p = session.run(prediction, feed_dict={ tfX: Xvalid, tfT: Yvalid }) e = error_rate(Yvalid_flat, p) print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e) # TODO: ask lazy programmer how to make a score function. # For this lecture: https://www.udemy.com/data-science-deep-learning-in-theano-tensorflow/learn/v4/t/lecture/5228492?start=0 if show_fig: plt.plot(costs) plt.show()
def main(): max_iter = 10 print_period = 10 X, Y = get_normalized_data() reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(M) b2_0 = np.zeros(K) W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() # 1st moment mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 # 2nd moment vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 # hyperparams lr0 = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 1e-8 # 1. Adam loss_adam = [] err_adam = [] t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 # new v vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 # bias correction correction1 = 1 - beta1 ** t hat_mW1 = mW1 / correction1 hat_mb1 = mb1 / correction1 hat_mW2 = mW2 / correction1 hat_mb2 = mb2 / correction1 correction2 = 1 - beta2 ** t hat_vW1 = vW1 / correction2 hat_vb1 = vb1 / correction2 hat_vW2 = vW2 / correction2 hat_vb2 = vb2 / correction2 # update t t += 1 # apply updates to the params W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps) b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps) W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps) b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_adam.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_adam.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. 
RMSprop with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() loss_rms = [] err_rms = [] # comparable hyperparameters for fair comparison lr0 = 0.001 mu = 0.9 decay_rate = 0.999 eps = 1e-8 # rmsprop cache cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 # momentum dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps) W2 -= dW2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps) b2 -= db2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps) W1 -= dW1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps) b1 -= db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_rms.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(loss_adam, label='adam') plt.plot(loss_rms, label='rmsprop') plt.legend() plt.show()
def batch_grad():
    # get data for the train and test sets
    X, Y = get_normalized_data()
    Y_ind = y2indicator(Y)

    batchSz = 500

    # initialize random weights
    N, D = X.shape
    K = len(set(Y))
    M = 300
    W1 = np.random.randn(D, M)
    b1 = np.random.randn(M)
    W2 = np.random.randn(M, K)
    b2 = np.random.randn(K)

    learning_rate = 1e-4
    no_batches = N // batchSz
    print("No. of batches:", no_batches)

    for i in range(300):
        for n in range(no_batches):
            # get current batch
            XBatch = X[n * batchSz:(n * batchSz + batchSz), :]
            YBatch_ind = Y_ind[n * batchSz:(n * batchSz + batchSz), :]

            # forward prop
            pY, Z = forward_relu(XBatch, W1, b1, W2, b2)

            # backprop (gradient ascent on the log-likelihood)
            W2 += learning_rate * derivative_w2(pY, YBatch_ind, Z)
            b2 += learning_rate * derivative_b2(pY, YBatch_ind)
            W1 += learning_rate * derivative_w1(pY, YBatch_ind, W2, Z, XBatch)
            b1 += learning_rate * derivative_b1(pY, YBatch_ind, W2, Z)

            if n % 100 == 0:
                YBatch = Y[n * batchSz:n * batchSz + batchSz]
                P = np.argmax(pY, axis=1)
                er = error_rate(P, YBatch)
                c = cost(YBatch_ind, pY)
                print("Loop:", i, n, "Error rate:", er, "Cost:", c)

    pY, Z = forward_relu(X, W1, b1, W2, b2)
    p = np.argmax(pY, axis=1)
    print("Final training error rate:", error_rate(p, Y))
def main(): Xtrain, Xtest, Ytrain, Ytest = get_transformed_data() print("Performing logistic regression...") N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) # 1. full W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) LL = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in range(50): p_y = forward(Xtrain, W, b) W += lr*(gradW(Ytrain_ind, p_y, Xtrain) - reg*W) b += lr*(gradb(Ytrain_ind, p_y) - reg*b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL.append(ll) if i % 1 == 0: err = error_rate(p_y_test, Ytest) if i % 10 == 0: print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsted time for full GD:", datetime.now() - t0) # 2. stochastic W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) LL_stochastic = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in range(50): # takes very long since we're computing cost for 41k samples tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for n in range(min(N, 500)): # shortcut so it won't take so long... x = tmpX[n,:].reshape(1,D) y = tmpY[n,:].reshape(1,10) p_y = forward(x, W, b) W += lr*(gradW(y, p_y, x) - reg*W) b += lr*(gradb(y, p_y) - reg*b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_stochastic.append(ll) if i % 1 == 0: err = error_rate(p_y_test, Ytest) if i % 10 == 0: print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsted time for SGD:", datetime.now() - t0) # 3. batch W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) LL_batch = [] lr = 0.0001 reg = 0.01 batch_sz = 500 n_batches = N // batch_sz t0 = datetime.now() for i in range(50): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): x = tmpX[j*batch_sz:(j*batch_sz + batch_sz),:] y = tmpY[j*batch_sz:(j*batch_sz + batch_sz),:] p_y = forward(x, W, b) W += lr*(gradW(y, p_y, x) - reg*W) b += lr*(gradb(y, p_y) - reg*b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_batch.append(ll) if i % 1 == 0: err = error_rate(p_y_test, Ytest) if i % 10 == 0: print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsted time for batch GD:", datetime.now() - t0) x1 = np.linspace(0, 1, len(LL)) plt.plot(x1, LL, label="full") x2 = np.linspace(0, 1, len(LL_stochastic)) plt.plot(x2, LL_stochastic, label="stochastic") x3 = np.linspace(0, 1, len(LL_batch)) plt.plot(x3, LL_batch, label="batch") plt.legend() plt.show()
def main():
    # compare 2 scenarios:
    # 1. batch SGD with a constant learning rate
    # 2. batch SGD with RMSprop
    max_iter = 15
    print_period = 10

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.0001
    reg = 0.001

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save the initial weights so both scenarios start from the same point
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch SGD
    losses_batch = []
    error_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                error_batch.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. RMSprop (reset the weights to the saved initial values)
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_rms = []
    errors_rms = []

    lr0 = 0.001
    cacheW2 = 0
    cacheb2 = 0
    cacheW1 = 0
    cacheb1 = 0
    decay_rate = 0.99
    eps = 1e-6

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # caches (running averages of the squared gradients)
            cacheW2 = decay_rate * cacheW2 + (1 - decay_rate) * gW2 * gW2
            cacheb2 = decay_rate * cacheb2 + (1 - decay_rate) * gb2 * gb2
            cacheW1 = decay_rate * cacheW1 + (1 - decay_rate) * gW1 * gW1
            cacheb1 = decay_rate * cacheb1 + (1 - decay_rate) * gb1 * gb1

            W2 -= lr0 * gW2 / (np.sqrt(cacheW2) + eps)
            b2 -= lr0 * gb2 / (np.sqrt(cacheb2) + eps)
            W1 -= lr0 * gW1 / (np.sqrt(cacheW1) + eps)
            b1 -= lr0 * gb1 / (np.sqrt(cacheb1) + eps)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_rms.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_rms.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(losses_batch, label='batch')
    plt.plot(losses_rms, label='rmsprop')
    plt.legend()
    plt.show()
def fit(self, X, Y, Xtest, Ytest, pretrain=True, train_head_only=False, learning_rate=0.1, mu=0.99, reg=0.0, epochs=1, batch_sz=100): # cast to float32 learning_rate = np.float32(learning_rate) mu = np.float32(mu) reg = np.float32(reg) # greedy layer-wise training of autoencoders pretrain_epochs = 2 if not pretrain: pretrain_epochs = 0 current_input = X for ae in self.hidden_layers: ae.fit(current_input, epochs=pretrain_epochs) # create current_input for the next layer current_input = ae.hidden_op(current_input) # initialize logistic regression layer N = len(Y) K = len(set(Y)) W0 = init_weights((self.hidden_layers[-1].M, K)) self.W = theano.shared(W0, "W_logreg") self.b = theano.shared(np.zeros(K, dtype=np.float32), "b_logreg") self.params = [self.W, self.b] if not train_head_only: for ae in self.hidden_layers: self.params += ae.forward_params X_in = T.matrix('X_in') targets = T.ivector('Targets') pY = self.forward(X_in) squared_magnitude = [(p * p).sum() for p in self.params] reg_cost = T.sum(squared_magnitude) cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets])) + reg * reg_cost prediction = self.predict(X_in) cost_predict_op = theano.function( inputs=[X_in, targets], outputs=[cost, prediction], ) updates = momentum_updates(cost, self.params, mu, learning_rate) train_op = theano.function( inputs=[X_in, targets], updates=updates, ) n_batches = N // batch_sz costs = [] print("supervised training...") for i in range(epochs): print("epoch:", i) X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] train_op(Xbatch, Ybatch) the_cost, the_prediction = cost_predict_op(Xtest, Ytest) error = error_rate(the_prediction, Ytest) print("j / n_batches:", j, "/", n_batches, "cost:", the_cost, "error:", error) costs.append(the_cost) plt.plot(costs) plt.show()
def fit(self, X, Y, Xvalid, Yvalid, learning_rate=1e-3, mu=0.99, decay=0.999, reg=1e-3, epochs=10, batch_sz=100, show_fig=False):
    # step 1. get data
    X, Y = shuffle(X, Y)
    X = X.astype(np.float32)
    Y = y2indicator(Y).astype(np.int32)

    Xvalid = Xvalid.astype(np.float32)
    Yvalid_vector = Yvalid.astype(np.int32)
    Yvalid = y2indicator(Yvalid).astype(np.int32)

    # step 1.1 initialize each layer and its parameters (tf.Variable) and keep them in a list
    N, D = X.shape
    M1 = D
    K = Y.shape[1]
    self.hidden_layers = []  # for saving the HiddenLayer objects
    count = 0
    for M2 in self.hidden_layer_size:
        # build a HiddenLayer object for every layer except the output layer
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        M1 = M2
        count += 1
    W, b = init_weight_and_bias(M1, K)  # weights of the final output layer
    self.W = tf.Variable(W.astype(np.float32))
    self.b = tf.Variable(b.astype(np.float32))

    # collect all the parameters we are going to update with gradient descent
    self.params = [self.W, self.b]
    for layer in self.hidden_layers:
        self.params += layer.params

    # step 1.2 tf.placeholder
    tfX = tf.placeholder(tf.float32, shape=(None, D), name="X")
    tfT = tf.placeholder(tf.float32, shape=(None, K), name="T")

    # step 2. model
    # the final layer returns logits: no softmax or other activation here,
    # because that is what the TensorFlow cost function expects
    act = self.forward(tfX)

    # step 3. cost function
    rcost = reg * sum([tf.nn.l2_loss(p) for p in self.params])
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=act, labels=tfT)) + rcost
    prediction_op = self.predict(tfX)

    # step 4. solver
    train_op = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                         momentum=mu,
                                         decay=decay).minimize(cost)

    init = tf.global_variables_initializer()
    n_batches = N // batch_sz
    costs = []
    with tf.Session() as sess:
        sess.run(init)
        for i in range(epochs):
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j + 1) * batch_sz, ]
                Ybatch = Y[j * batch_sz:(j + 1) * batch_sz, ]
                sess.run(train_op, feed_dict={tfX: Xbatch, tfT: Ybatch})

                if j % 50 == 0:
                    cost_val = sess.run(cost, feed_dict={tfX: Xvalid, tfT: Yvalid})
                    costs.append(cost_val)
                    preds = sess.run(prediction_op, feed_dict={tfX: Xvalid})
                    err = error_rate(Yvalid_vector, preds)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", cost_val, "error rate:", err)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, Y, lr=10e-5, mu=0.99, reg=10e-7, decay=0.99999, eps=10e-3, batch_sz=30, epochs=100, show_fig=True): lr = np.float32(lr) mu = np.float32(mu) reg = np.float32(reg) decay = np.float32(decay) eps = np.float32(eps) # make a validation set X, Y = shuffle(X, Y) X = X.astype(np.float32) Y = Y.astype(np.int32) Xvalid, Yvalid = X[-1000:], Y[-1000:] X, Y = X[:-1000], Y[:-1000] # initialize convpool layers N, c, d, d = X.shape mi = c outw = d outh = d self.convpool_layers = [] for mo, fw, fh in self.convpool_layer_sizes: layer = ConvPoolLayer(mi, mo, fw, fh) self.convpool_layers.append(layer) outw = (outw - fw + 1) / 2 outh = (outh - fh + 1) / 2 mi = mo # initialize mlp layers K = len(set(Y)) self.hidden_layers = [] M1 = self.convpool_layer_sizes[-1][0]*outw*outh # size must be same as output of last convpool layer count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 # logistic regression layer W, b = init_weight_and_bias(M1, K) self.W = theano.shared(W, 'W_logreg') self.b = theano.shared(b, 'b_logreg') # collect params for later use self.params = [self.W, self.b] for c in self.convpool_layers: self.params += c.params for h in self.hidden_layers: self.params += h.params # for momentum dparams = [theano.shared(np.zeros(p.get_value().shape, dtype=np.float32)) for p in self.params] # for rmsprop cache = [theano.shared(np.zeros(p.get_value().shape, dtype=np.float32)) for p in self.params] # set up theano functions and variables thX = T.tensor4('X', dtype='float32') thY = T.ivector('Y') pY = self.forward(thX) rcost = reg*T.sum([(p*p).sum() for p in self.params]) cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost prediction = self.predict(thX) cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction]) # updates = [ # (c, decay*c + (np.float32(1)-decay)*T.grad(cost, p)*T.grad(cost, p)) for p, c in zip(self.params, cache) # ] + [ # (p, p + mu*dp - lr*T.grad(cost, p)/T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams) # ] + [ # (dp, mu*dp - lr*T.grad(cost, p)/T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams) # ] # momentum only updates = [ (p, p + mu*dp - lr*T.grad(cost, p)) for p, dp in zip(self.params, dparams) ] + [ (dp, mu*dp - lr*T.grad(cost, p)) for p, dp in zip(self.params, dparams) ] train_op = theano.function( inputs=[thX, thY], updates=updates ) n_batches = N / batch_sz costs = [] for i in xrange(epochs): X, Y = shuffle(X, Y) for j in xrange(n_batches): Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)] Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)] train_op(Xbatch, Ybatch) if j % 20 == 0: c, p = cost_predict_op(Xvalid, Yvalid) costs.append(c) e = error_rate(Yvalid, p) print "i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e if show_fig: plt.plot(costs) plt.show()
def main(): max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. const # cost = -16 LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 2. RMSprop W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr0 = 0.001 # if you set this too high you'll get NaN! cache_W2 = 0 cache_b2 = 0 cache_W1 = 0 cache_b1 = 0 decay_rate = 0.999 eps = 0.0000000001 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_rms.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_rms.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label='const') plt.plot(LL_rms, label='rms') plt.legend() plt.show()
def main():
    # get train and test data
    XTrain, YTrain = get_train_data()
    YTrain_ind = y2indicator(YTrain)
    XTrain = reshape(XTrain)

    XTest, YTest = get_test_data()
    YTest_ind = y2indicator(YTest)
    XTest = reshape(XTest)

    N, K = YTrain_ind.shape
    M = 100
    lr = np.float32(0.000001)
    reg = np.float32(0.01)
    mu = np.float32(0.99)
    poolsize = (2, 2)
    batch_sz = 500
    no_batches = N // batch_sz

    # initial random weight values
    W1_shape = (20, 3, 5, 5)
    W1_init = init_filter(W1_shape, poolsize)
    b1_init = np.zeros([W1_shape[0]])

    W2_shape = (50, 20, 5, 5)
    W2_init = init_filter(W2_shape, poolsize)
    b2_init = np.zeros([W2_shape[0]])

    W3_init = np.random.randn(W2_shape[0]*5*5, M) / np.sqrt(W2_shape[0]*5*5 + M)
    b3_init = np.zeros([M])
    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    b4_init = np.zeros([K])

    # create theano variables
    X = T.tensor4('X', dtype='float32')  # inputs
    Y = T.matrix('Y')
    W1 = theano.shared(W1_init.astype(np.float32), 'W1')  # weights
    b1 = theano.shared(b1_init.astype(np.float32), 'b1')
    W2 = theano.shared(W2_init.astype(np.float32), 'W2')
    b2 = theano.shared(b2_init.astype(np.float32), 'b2')
    W3 = theano.shared(W3_init.astype(np.float32), 'W3')
    b3 = theano.shared(b3_init.astype(np.float32), 'b3')
    W4 = theano.shared(W4_init.astype(np.float32), 'W4')
    b4 = theano.shared(b4_init.astype(np.float32), 'b4')

    # momentum variables
    dW1 = theano.shared(np.zeros(W1_init.shape, dtype=np.float32))
    db1 = theano.shared(np.zeros(b1_init.shape, dtype=np.float32))
    dW2 = theano.shared(np.zeros(W2_init.shape, dtype=np.float32))
    db2 = theano.shared(np.zeros(b2_init.shape, dtype=np.float32))
    dW3 = theano.shared(np.zeros(W3_init.shape, dtype=np.float32))
    db3 = theano.shared(np.zeros(b3_init.shape, dtype=np.float32))
    dW4 = theano.shared(np.zeros(W4_init.shape, dtype=np.float32))
    db4 = theano.shared(np.zeros(b4_init.shape, dtype=np.float32))

    # forward prop equations
    Z1 = convpool(X, W1, b1)  # 2 conv-pool layers
    Z2 = convpool(Z1, W2, b2)
    Z3 = relu(Z2.flatten(ndim=2).dot(W3) + b3)  # fully connected NN
    P = T.nnet.softmax(Z3.dot(W4) + b4)

    # cost and prediction equations
    # we do gradient ascent on the log-likelihood, so the L2 penalty is subtracted
    params = (W1, b1, W2, b2, W3, b3, W4, b4)
    reg_cost = reg*np.sum([(param*param).sum() for param in params])
    cost = (Y * T.log(P)).sum() - reg_cost
    pred = T.argmax(P, axis=1)

    # weight updates (gradient ascent with momentum)
    W1_update = W1 + mu*dW1 + lr*T.grad(cost, W1)
    b1_update = b1 + mu*db1 + lr*T.grad(cost, b1)
    W2_update = W2 + mu*dW2 + lr*T.grad(cost, W2)
    b2_update = b2 + mu*db2 + lr*T.grad(cost, b2)
    W3_update = W3 + mu*dW3 + lr*T.grad(cost, W3)
    b3_update = b3 + mu*db3 + lr*T.grad(cost, b3)
    W4_update = W4 + mu*dW4 + lr*T.grad(cost, W4)
    b4_update = b4 + mu*db4 + lr*T.grad(cost, b4)

    # velocity updates for momentum
    dW1_update = mu*dW1 + lr*T.grad(cost, W1)
    db1_update = mu*db1 + lr*T.grad(cost, b1)
    dW2_update = mu*dW2 + lr*T.grad(cost, W2)
    db2_update = mu*db2 + lr*T.grad(cost, b2)
    dW3_update = mu*dW3 + lr*T.grad(cost, W3)
    db3_update = mu*db3 + lr*T.grad(cost, b3)
    dW4_update = mu*dW4 + lr*T.grad(cost, W4)
    db4_update = mu*db4 + lr*T.grad(cost, b4)

    # train function
    train = theano.function(
        inputs=[X, Y],
        updates=[
            (W1, W1_update), (b1, b1_update),
            (W2, W2_update), (b2, b2_update),
            (W3, W3_update), (b3, b3_update),
            (W4, W4_update), (b4, b4_update),
            (dW1, dW1_update), (db1, db1_update),
            (dW2, dW2_update), (db2, db2_update),
            (dW3, dW3_update), (db3, db3_update),
            (dW4, dW4_update), (db4, db4_update),
        ])

    # cost and prediction function
    get_res = theano.function(
        inputs=[X, Y],
        outputs=[cost, pred])

    # run batch gradient ascent
    costs = []
    for i in range(400):
        for n in range(no_batches):
            # get current batches
            XBatch = XTrain[n*batch_sz:(n*batch_sz + batch_sz), :]
            YBatch_ind = YTrain_ind[n*batch_sz:(n*batch_sz + batch_sz), :]

            train(XBatch, YBatch_ind)
            if n % 200 == 0:
                c, P = get_res(XTest, YTest_ind)
                er = error_rate(P, YTest)
                print("Iteration:", i, "Cost:", c, "Error rate:", er)
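# convpool(), relu() and init_filter() are assumed by the CNN scripts in this
# file but not shown. A minimal sketch of standard Theano equivalents (the
# course's own versions may differ in detail):
import numpy as np
import theano.tensor as T
from theano.tensor.nnet import conv2d
from theano.tensor.signal.pool import pool_2d

def relu(a):
    return a * (a > 0)

def convpool(X, W, b, poolsize=(2, 2)):
    conv_out = conv2d(input=X, filters=W)
    pooled_out = pool_2d(input=conv_out, ws=poolsize, ignore_border=True)
    # one bias per output feature map, broadcast over batch and spatial dims
    return relu(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))

def init_filter(shape, poolsize):
    # scale by (fan-in + fan-out), the usual heuristic for conv filters
    w = np.random.randn(*shape) / np.sqrt(
        np.prod(shape[1:]) + shape[0]*np.prod(shape[2:]) / np.prod(poolsize))
    return w.astype(np.float32)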
def score(self, X, Y):
    prediction = self.forward(X)
    return 1 - error_rate(Y, prediction)
def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. batch # cost = -16 LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 2. batch with momentum W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_momentum = [] CR_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) W2 += dW2 db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) b2 += db2 dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) W1 += dW1 db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) b1 += db1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_momentum.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_momentum.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 3. 
batch with Nesterov momentum W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_nest = [] CR_nest = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) W2 += dW2 db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) b2 += db2 dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) W1 += dW1 db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) b1 += db1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_nest.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_nest.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label="batch") plt.plot(LL_momentum, label="momentum") plt.plot(LL_nest, label="nesterov") plt.legend() plt.show()
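# The Nesterov block above uses the common approximation that folds the
# lookahead into the velocity: dW <- mu*mu*dW - (1 + mu)*lr*g. Written with an
# explicit velocity, the same update for a single parameter looks like this
# (hypothetical helper, not part of the original script):
def nesterov_step(w, g, velocity, lr, mu):
    velocity_new = mu*velocity - lr*g
    w = w + mu*velocity_new - lr*g   # equals w + mu*mu*velocity - (1 + mu)*lr*g
    return w, velocity_new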
def main(): # compare 2 scenarios: # 1. batch GD with RMSProp and momentum # 2. Adam GD max_iter = 20 print_period = 10 X, Y = get_normalized_data() reg = 0.01 Xtrain, Ytrain = X[:-1000, :], Y[:-1000] Xtest, Ytest = X[-1000:, :], Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape M = 300 # number of hidden layer units K = len(set(Ytrain)) batch_size = 500 n_batches = N // batch_size # randomly initialize weights: W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(M) b2_0 = np.zeros(K) # 1. batch GD with RMSProp and momentum: print('\nperforming batch GD with RMSProp and momentum...') W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_rm = [] CR_rm = [] # hyperparams: lr0 = 0.001 #lr0 = 0.0001 mu = 0.9 decay = 0.999 eps = 10e-9 # momentum (velocity terms): dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 # rms-prop cache (with no bias correction): cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 t0 = datetime.now() for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: # (note: we utilize a bit different version of momentum) gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2 + eps)) W2 -= dW2 #dW2 = mu*dW2 - lr0*gW2 / (np.sqrt(cache_W2) + eps) #W2 += dW2 gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2) cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2 + eps)) b2 -= db2 #db2 = mu*db2 - lr0*gb2 / (np.sqrt(cache_b2) + eps) #b2 += db2 gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1 + eps)) W1 -= dW1 #dW1 = mu*dW1 - lr0*gW1 / (np.sqrt(cache_W1) + eps) #W1 += dW1 gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1 + eps)) b1 -= db1 #db1 = mu*db1 - lr0*gb1 / (np.sqrt(cache_b1) + eps) #b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_rm.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_rm.append(error) print('error rate:', error) dt1 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) print('\nFinal error rate:', error_rate(pY, Ytest)) print('Elapsed time for batch GD with RMSProp and momentum:', dt1) # plot the cost plt.plot(LL_rm) plt.title('Cost for batch GD with RMSProp and momentum') plt.show() # 2. 
Adam optimizer print('\nperforming Adam optimizer...') W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() # hyperparams: lr = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 10e-9 # 1st moment: mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 # 2nd moment: vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 LL_adam = [] CR_adam = [] t0 = datetime.now() t = 1 # index; used instead of j, because j starts with 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates: # gradients: gW2 = derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2 gb2 = derivative_b2(Ybatch, p_Ybatch) + reg * b2 gW1 = derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1 # 1st moment: mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 # 2nd moment: vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 # bias correction: mW2_bc = mW2 / (1 - beta1**t) mb2_bc = mb2 / (1 - beta1**t) mW1_bc = mW1 / (1 - beta1**t) mb1_bc = mb1 / (1 - beta1**t) vW2_bc = vW2 / (1 - beta2**t) vb2_bc = vb2 / (1 - beta2**t) vW1_bc = vW1 / (1 - beta2**t) vb1_bc = vb1 / (1 - beta2**t) # weights and biases (parameters): W2 = W2 - lr * mW2_bc / np.sqrt(vW2_bc + eps) b2 = b2 - lr * mb2_bc / np.sqrt(vb2_bc + eps) W1 = W1 - lr * mW1_bc / np.sqrt(vW1_bc + eps) b1 = b1 - lr * mb1_bc / np.sqrt(vb1_bc + eps) t += 1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_adam.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_adam.append(error) print('error rate:', error) dt2 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) print('\nFinal error rate:', error_rate(pY, Ytest)) print('Elapsed time for Adam optimizer:', dt2) # plot the cost plt.plot(LL_adam) plt.title('Cost for Adam optimizer') plt.show() # plot costs from the two experiments together: plt.plot(LL_rm, label='RMSProp with momentum') plt.plot(LL_adam, label='Adam optimizer') plt.title('Cost') plt.legend() plt.show()
def fit(self, X, Y, learning_rate=1e-3, mu=0.9, decay=0.9, reg=0, eps=1e-10, epochs=100, batch_sz=30, show_fig=False): learning_rate = np.float32(learning_rate) mu = np.float32(mu) decay = np.float32(decay) reg = np.float32(reg) eps = np.float32(eps) # make a validation set X, Y = shuffle(X, Y) X = X.astype(np.float32) Y = Y.astype(np.int32) Xvalid, Yvalid = X[-1000:], Y[-1000:] X, Y = X[:-1000], Y[:-1000] # initialize hidden layers N, D = X.shape K = len(set(Y)) self.hidden_layers = [] M1 = D count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 W, b = init_weight_and_bias(M1, K) self.W = theano.shared(W, 'W_logreg') self.b = theano.shared(b, 'b_logreg') # collect params for later use self.params = [self.W, self.b] for h in self.hidden_layers: self.params += h.params # set up theano functions and variables thX = T.fmatrix('X') thY = T.ivector('Y') pY = self.th_forward(thX) rcost = reg*T.sum([(p*p).sum() for p in self.params]) cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost prediction = self.th_predict(thX) # actual prediction function self.predict_op = theano.function(inputs=[thX], outputs=prediction) cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction]) updates = rmsprop(cost, self.params, learning_rate, mu, decay, eps) train_op = theano.function( inputs=[thX, thY], updates=updates ) n_batches = N // batch_sz costs = [] for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)] Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)] train_op(Xbatch, Ybatch) if j % 20 == 0: c, p = cost_predict_op(Xvalid, Yvalid) costs.append(c) e = error_rate(Yvalid, p) print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e) if show_fig: plt.plot(costs) plt.show()
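# rmsprop() above returns the Theano update list but is not defined in this
# file. A minimal sketch, assuming RMSprop with momentum on each shared
# parameter (cache initialized to one, matching the other scripts here):
import theano
import theano.tensor as T

def rmsprop(cost, params, learning_rate, mu, decay, eps):
    grads = T.grad(cost, params)
    updates = []
    for p, g in zip(params, grads):
        cache = theano.shared(p.get_value()*0 + 1)   # running avg of squared grads
        velocity = theano.shared(p.get_value()*0)    # momentum term
        new_cache = decay*cache + (1 - decay)*g*g
        new_velocity = mu*velocity - learning_rate*g / T.sqrt(new_cache + eps)
        updates.append((cache, new_cache))
        updates.append((velocity, new_velocity))
        updates.append((p, p + new_velocity))
    return updates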
def main(): train, test = get_data() Xtrain = rearrange(train['X']) Ytrain = train['y'].flatten() - 1 del train Xtrain, Ytrain = shuffle(Xtrain, Ytrain) Ytrain_ind = y2indicator(Ytrain) Xtest = rearrange(test['X']) Ytest = test['y'].flatten() - 1 del test Ytest_ind = y2indicator(Ytest) max_iter = 8 print_period = 10 lr = np.float32(0.00001) reg = np.float32(0.01) mu = np.float32(0.99) N = Xtrain.shape[0] batch_sz = 500 n_batches = N // batch_sz M = 500 K = 10 poolsz = (2, 2) # after conv will be of dimension 32 - 5 + 1 = 28 # after downsample 28 / 2 = 14 W1_shape = (20, 3, 5, 5) # (num_feature_maps, num_color_channels, filter_width, filter_height) W1_init = init_filter(W1_shape, poolsz) b1_init = np.zeros(W1_shape[0], dtype=np.float32) # one bias per output feature map # after conv will be of dimension 14 - 5 + 1 = 10 # after downsample 10 / 2 = 5 W2_shape = (50, 20, 5, 5) # (num_feature_maps, old_num_feature_maps, filter_width, filter_height) W2_init = init_filter(W2_shape, poolsz) b2_init = np.zeros(W2_shape[0], dtype=np.float32) # vanilla ANN weights W3_init = np.random.randn(W2_shape[0]*5*5, M) / np.sqrt(W2_shape[0]*5*5 + M) b3_init = np.zeros(M, dtype=np.float32) W4_init = np.random.randn(M, K) / np.sqrt(M + K) b4_init = np.zeros(K, dtype=np.float32) # step 2: define theano variables and expressions X = T.tensor4('X', dtype='float32') Y = T.matrix('T') W1 = theano.shared(W1_init, 'W1') b1 = theano.shared(b1_init, 'b1') W2 = theano.shared(W2_init, 'W2') b2 = theano.shared(b2_init, 'b2') W3 = theano.shared(W3_init.astype(np.float32), 'W3') b3 = theano.shared(b3_init, 'b3') W4 = theano.shared(W4_init.astype(np.float32), 'W4') b4 = theano.shared(b4_init, 'b4') # momentum changes dW1 = theano.shared(np.zeros(W1_init.shape, dtype=np.float32), 'dW1') db1 = theano.shared(np.zeros(b1_init.shape, dtype=np.float32), 'db1') dW2 = theano.shared(np.zeros(W2_init.shape, dtype=np.float32), 'dW2') db2 = theano.shared(np.zeros(b2_init.shape, dtype=np.float32), 'db2') dW3 = theano.shared(np.zeros(W3_init.shape, dtype=np.float32), 'dW3') db3 = theano.shared(np.zeros(b3_init.shape, dtype=np.float32), 'db3') dW4 = theano.shared(np.zeros(W4_init.shape, dtype=np.float32), 'dW4') db4 = theano.shared(np.zeros(b4_init.shape, dtype=np.float32), 'db4') # forward pass Z1 = convpool(X, W1, b1) Z2 = convpool(Z1, W2, b2) Z3 = relu(Z2.flatten(ndim=2).dot(W3) + b3) pY = T.nnet.softmax( Z3.dot(W4) + b4) # define the cost function and prediction params = (W1, b1, W2, b2, W3, b3, W4, b4) reg_cost = reg*np.sum((param*param).sum() for param in params) cost = -(Y * T.log(pY)).sum() + reg_cost prediction = T.argmax(pY, axis=1) # step 3: training expressions and functions update_W1 = W1 + mu*dW1 - lr*T.grad(cost, W1) update_b1 = b1 + mu*db1 - lr*T.grad(cost, b1) update_W2 = W2 + mu*dW2 - lr*T.grad(cost, W2) update_b2 = b2 + mu*db2 - lr*T.grad(cost, b2) update_W3 = W3 + mu*dW3 - lr*T.grad(cost, W3) update_b3 = b3 + mu*db3 - lr*T.grad(cost, b3) update_W4 = W4 + mu*dW4 - lr*T.grad(cost, W4) update_b4 = b4 + mu*db4 - lr*T.grad(cost, b4) # update weight changes update_dW1 = mu*dW1 - lr*T.grad(cost, W1) update_db1 = mu*db1 - lr*T.grad(cost, b1) update_dW2 = mu*dW2 - lr*T.grad(cost, W2) update_db2 = mu*db2 - lr*T.grad(cost, b2) update_dW3 = mu*dW3 - lr*T.grad(cost, W3) update_db3 = mu*db3 - lr*T.grad(cost, b3) update_dW4 = mu*dW4 - lr*T.grad(cost, W4) update_db4 = mu*db4 - lr*T.grad(cost, b4) train = theano.function( inputs=[X, Y], updates=[ (W1, update_W1), (b1, update_b1), (W2, update_W2), (b2, update_b2), (W3, update_W3), (b3, 
update_b3), (W4, update_W4), (b4, update_b4), (dW1, update_dW1), (db1, update_db1), (dW2, update_dW2), (db2, update_db2), (dW3, update_dW3), (db3, update_db3), (dW4, update_dW4), (db4, update_db4), ], ) # create another function for this because we want it over the whole dataset get_prediction = theano.function( inputs=[X, Y], outputs=[cost, prediction], ) t0 = datetime.now() LL = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] train(Xbatch, Ybatch) if j % print_period == 0: cost_val, prediction_val = get_prediction(Xtest, Ytest_ind) err = error_rate(prediction_val, Ytest) # cost_val = 0 # err = 0 ### test print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, err)) LL.append(cost_val) print("Elapsed time:", (datetime.now() - t0)) plt.plot(LL) plt.show()
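# --- Illustrative sketch (not part of the script above): building the momentum updates in a loop ---
# The sixteen update expressions above (update_W1 ... update_db4) all follow the same momentum
# pattern. Because Theano applies an updates list simultaneously using the old values, the list
# can be built generically from (parameter, velocity) pairs; the helper name and argument order
# below are assumptions for illustration, not part of the script.
import theano.tensor as T

def momentum_updates(cost, params, dparams, lr, mu):
    updates = []
    for p, dp in zip(params, dparams):
        g = T.grad(cost, p)
        new_dp = mu * dp - lr * g         # velocity update (uses the old velocity value)
        updates.append((dp, new_dp))
        updates.append((p, p + new_dp))   # equivalent to p + mu*dp - lr*g
    return updates

# usage with the shared variables defined above would look like:
# train = theano.function(
#     inputs=[X, Y],
#     updates=momentum_updates(cost,
#                              [W1, b1, W2, b2, W3, b3, W4, b4],
#                              [dW1, db1, dW2, db2, dW3, db3, dW4, db4],
#                              lr, mu),
# )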
def main(): X, Y, _, _ = get_transformed_data() X = X[:, :300] # normalize X first mu = X.mean(axis=0) std = X.std(axis=0) X = (X - mu) / std print("Performing logistic regression...") Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) # 1. full W = np.random.randn(D, 10) / 28 b = np.zeros(10) LL = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in range(200): p_y = forward(Xtrain, W, b) W += lr*(gradW(Ytrain_ind, p_y, Xtrain) - reg*W) b += lr*(gradb(Ytrain_ind, p_y) - reg*b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL.append(ll) if i % 10 == 0: err = error_rate(p_y_test, Ytest) print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsed time for full GD:", datetime.now() - t0) # 2. stochastic W = np.random.randn(D, 10) / 28 b = np.zeros(10) LL_stochastic = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in range(1): # takes very long since we're computing cost for 41k samples tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for n in range(min(N, 500)): # shortcut so it won't take so long... x = tmpX[n,:].reshape(1,D) y = tmpY[n,:].reshape(1,10) p_y = forward(x, W, b) W += lr*(gradW(y, p_y, x) - reg*W) b += lr*(gradb(y, p_y) - reg*b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_stochastic.append(ll) if n % (N//2) == 0: err = error_rate(p_y_test, Ytest) print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsed time for SGD:", datetime.now() - t0) # 3. batch W = np.random.randn(D, 10) / 28 b = np.zeros(10) LL_batch = [] lr = 0.0001 reg = 0.01 batch_sz = 500 n_batches = N // batch_sz t0 = datetime.now() for i in range(50): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): x = tmpX[j*batch_sz:(j*batch_sz + batch_sz),:] y = tmpY[j*batch_sz:(j*batch_sz + batch_sz),:] p_y = forward(x, W, b) W += lr*(gradW(y, p_y, x) - reg*W) b += lr*(gradb(y, p_y) - reg*b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_batch.append(ll) if j % (n_batches//2) == 0: err = error_rate(p_y_test, Ytest) print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsed time for batch GD:", datetime.now() - t0) x1 = np.linspace(0, 1, len(LL)) plt.plot(x1, LL, label="full") x2 = np.linspace(0, 1, len(LL_stochastic)) plt.plot(x2, LL_stochastic, label="stochastic") x3 = np.linspace(0, 1, len(LL_batch)) plt.plot(x3, LL_batch, label="batch") plt.legend() plt.show()
def fit(self, X, Y, learning_rate=10e-7, mu=0.99, decay=0.999, reg=10e-12, eps=10e-10, epochs=400, batch_sz=100, show_fig=False): learning_rate = np.float32(learning_rate) mu = np.float32(mu) decay = np.float32(decay) reg = np.float32(reg) eps = np.float32(eps) # make a validation set X, Y = shuffle(X, Y) X = X.astype(np.float32) Y = Y.astype(np.int32) Xvalid, Yvalid = X[-1000:], Y[-1000:] X, Y = X[:-1000], Y[:-1000] # initialize hidden layers N, D = X.shape K = len(set(Y)) self.hidden_layers = [] M1 = D count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 W, b = init_weight_and_bias(M1, K) self.W = theano.shared(W, 'W_logreg') self.b = theano.shared(b, 'b_logreg') # collect params for later use self.params = [self.W, self.b] for h in self.hidden_layers: self.params += h.params # for momentum dparams = [theano.shared(np.zeros(p.get_value().shape, dtype=np.float32)) for p in self.params] # for rmsprop cache = [theano.shared(np.zeros(p.get_value().shape, dtype=np.float32)) for p in self.params] # set up theano functions and variables thX = T.fmatrix('X') thY = T.ivector('Y') pY = self.forward(thX) rcost = reg*T.sum([(p*p).sum() for p in self.params]) cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost prediction = self.predict(thX) cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction]) updates = [ (c, decay*c + (np.float32(1)-decay)*T.grad(cost, p)*T.grad(cost, p)) for p, c in zip(self.params, cache) ] + [ (p, p + mu*dp - learning_rate*T.grad(cost, p)/T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams) ] + [ (dp, mu*dp - learning_rate*T.grad(cost, p)/T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams) ] # momentum only # updates = [ # (p, p + mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, dparams) # ] + [ # (dp, mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, dparams) # ] train_op = theano.function( inputs=[thX, thY], updates=updates ) n_batches = N / batch_sz costs = [] for i in xrange(epochs): X, Y = shuffle(X, Y) for j in xrange(n_batches): Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)] Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)] train_op(Xbatch, Ybatch) if j % 20 == 0: c, p = cost_predict_op(Xvalid, Yvalid) costs.append(c) e = error_rate(Yvalid, p) print "i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e if show_fig: plt.plot(costs) plt.show()
def main(): # compare 5 scenarios: # 1. batch SGD with constant learning rate # 2. batch SGD with RMSProp # 3. batch SGD with AdaGrad # 4. batch SGD with exponential decay np.random.seed(2) max_iter = 20 print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain, Ytrain = X[:-1000, :], Y[:-1000] Xtest, Ytest = X[-1000:, :], Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape M = 300 # number of hidden layer units K = len(set(Ytrain)) batch_size = 500 n_batches = N // batch_size # randomly initialize weights: W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights: W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. batch SGD with constant learning rate: LL_batch = [] CR_batch = [] t0 = datetime.now() print('\nperforming batch SGD with constant learning rate...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: W2 -= lr * (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, p_Ybatch) + reg * b2) W1 -= lr * (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_batch.append(error) print('error rate:', error) dt1 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) final_err1 = error_rate(pY, Ytest) # plot the cost #plt.plot(LL_batch) #plt.title('Cost for batch GD with const lr') #plt.show() # 2. 
batch GD with RMSProp: W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_RMSProp = [] CR_RMSProp = [] lr0 = 0.001 # initial learning rate cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 decay = 0.999 eps = 10e-10 t0 = datetime.now() print('\nperforming batch SGD with RMSProp...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2 W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps) gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2) cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2 b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps) gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1 W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps) gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1 b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_RMSProp.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_RMSProp.append(error) print('error rate:', error) dt2 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) final_err2 = error_rate(pY, Ytest) # plot the cost #plt.plot(LL_RMSProp) #plt.title('Cost for batch SGD with RMSProp') #plt.show() # 3. batch SGD with AdaGrad: W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_AdaGrad = [] CR_AdaGrad = [] lr0 = 0.01 # initial learning rate cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 eps = 10e-10 t0 = datetime.now() print('\nperforming batch SGD with AdaGrad...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) cache_W2 = cache_W2 + gW2 * gW2 W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps) gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2) cache_b2 = cache_b2 + gb2 * gb2 b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps) gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) cache_W1 = cache_W1 + gW1 * gW1 W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps) gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) cache_b1 = cache_b1 + gb1 * gb1 b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_AdaGrad.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_AdaGrad.append(error) print('error rate:', error) dt3 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) final_err3 = error_rate(pY, Ytest) # plot the cost #plt.plot(LL_AdaGrad) #plt.title('Cost for batch SGD with AdaGrad') #plt.show() ''' # 4. 
batch SGD with exponential decay: W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_exp = [] CR_exp = [] lr0 = 0.0004 # initial learning rate k = 1e-7 t = 0 # initial log lr = lr0 t0 = datetime.now() print('\nperforming batch SGD with lr exponential decay...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_size:(j+1)*batch_size, :] Ybatch = Ytrain_ind[j*batch_size:(j+1)*batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg*W2) W2 -= lr*gW2 gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg*b2) b2 -= lr*gb2 gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg*W1) W1 -= lr*gW1 gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg*b1) b1 -= lr*gb1 # decrease the learning rate lr = lr0 * np.exp(-k*t) t += 1 if j % print_period == 0: print('current learning rate:', lr) pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_exp.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_exp.append(error) print('error rate:', error) dt4 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) print('\nFinal error rate:', error_rate(pY, Ytest)) print('Elapsed time for batch SGD with lr exponential decay:', dt4) # plot the cost #plt.plot(LL_exp) #plt.title('Cost for batch SGD with lr exponential decay') #plt.show() ''' print('\nBatch SGD with constant learning rate:') print('final error rate:', final_err1) print('elapsed time:', dt1) print('\nBatch SGD with RMSProp:') print('final error rate:', final_err2) print('elapsed time:', dt2) print('\nBatch SGD with AdaGrad:') print('final error rate:', final_err3) print('elapsed time:', dt3) # plot the costs together: plt.plot(LL_batch, label='const_lr') plt.plot(LL_RMSProp, label='RMSProp') plt.plot(LL_AdaGrad, label='AdaGrad') #plt.plot(LL_exp, label='lr_exp_decay') plt.legend() plt.show()
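# --- Illustrative sketch (not part of the script above): the learning-rate schedules being compared ---
# Scenario 4 (disabled inside the triple-quoted block above) decays the learning rate
# exponentially in the global step counter t, lr(t) = lr0 * exp(-k*t), recomputed once per
# batch. The helpers below are illustrative only; the 1/t schedule is included as the usual
# alternative, not something the script above implements.
import numpy as np

def exp_decay_lr(lr0, k, t):
    return lr0 * np.exp(-k * t)

def inverse_time_lr(lr0, k, t):
    return lr0 / (1.0 + k * t)

# with lr0=0.0004 and k=1e-7 as above, the rate barely moves over a whole run
# (assuming roughly 20 epochs * 82 batches, as in the MNIST setup used here)
t_final = 20 * 82
print(exp_decay_lr(0.0004, 1e-7, t_final))   # ~0.0003999
print(inverse_time_lr(0.0004, 1e-3, t_final))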
def fit(self, X, Y, Xtest, Ytest, pretrain=True, learning_rate=0.01, mu=.99, reg=0 * .1, epochs=1, batch_sz=100, show_fig=False): pretrain_epochs = 1 if not pretrain: pretrain_epochs = 0 current_input = X for ae in self.hidden_layers: ae.fit(current_input, epochs=pretrain_epochs) current_input = ae.hidden_op(current_input) N = len(Y) K = len(set(Y)) W0 = init_weight(self.hidden_layers[-1].M, K) self.W = theano.shared(W0, 'W_logreg') self.b = theano.shared(np.zeros(K), 'b_logreg') self.params = [self.W, self.b] for ae in self.hidden_layers: self.params += ae.forward_params self.dW = theano.shared(np.zeros(W0.shape), 'dW_logreg') self.db = theano.shared(np.zeros(K), 'db_logreg') self.dparams = [self.dW, self.db] for ae in self.hidden_layers: self.dparams += ae.forward_dparams X_in = T.matrix('X_in') targets = T.ivector('Targets') pY = self.forward(X_in) # squared_magnitude = [(p*p) for p in self.params] # reg_cost= T.sum(squared_magnitude) cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets])) #+ reg_cost prediction = self.predict(X_in) cost_predict_op = theano.function(inputs=[X_in, targets], outputs=[cost, prediction]) updates = [(p, p + mu * dp - learning_rate * T.grad(cost, p)) for p, dp in zip(self.params, self.dparams) ] + [(dp, mu * dp - learning_rate * T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)] train_op = theano.function( inputs=[X_in, targets], updates=updates, ) n_batches = N / batch_sz costs = [] print 'supervised training' for i in xrange(epochs): print "epoch:", i X, Y = shuffle(X, Y) for j in xrange(n_batches): Xbatch = X[j * batch_sz:(j + 1) * batch_sz] Ybatch = Y[j * batch_sz:(j + 1) * batch_sz] train_op(Xbatch, Ybatch) the_cost, the_prediction = cost_predict_op(Xtest, Ytest) error = error_rate(the_prediction, Ytest) print "i:%d\tj:%d\tnb:%d\tcost:%.6f\terror:%.3f\t" % ( i, j, n_batches, the_cost, error) costs.append(the_cost) if show_fig: plt.plot(costs) plt.show()
def main(): max_iter = 20 # make it 30 for sigmoid print_period = 10 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.00004 reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. const # cost = -16 LL_batch = [] CR_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_batch.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. RMSprop W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr0 = 0.001 # if you set this too high you'll get NaN! cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 decay_rate = 0.999 eps = 1e-10 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_rms.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(LL_batch, label='const') plt.plot(LL_rms, label='rms') plt.legend() plt.show()
def fit(self, X, Y, learning_rate=10e-4, mu=0.99, decay=0.999, reg=10e-3, epochs=400, batch_sz=128, show_fig=False): K = len(set(Y)) # make a validation set X, Y = shuffle(X, Y) X = X.astype(np.float32) Y = y2indicator(Y).astype(np.float32) Xvalid, Yvalid = X[-1000:], Y[-1000:] Yvalid_flat = np.argmax(Yvalid, axis=1) X, Y = X[:-1000], Y[:-1000] # intialize hidden layers N, D = X.shape self.hidden_layers = [] M1 = D count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 #output of last layer is input of next count += 1 # initaliz params of output layers W, b = init_weight_and_bias(M1, K) self.W = tf.Variable(W.astype(np.float32)) self.b = tf.Variable(b.astype(np.float32)) self.params = [self.W, self.b] for h in self.hidden_layers: self.params += h.params tfX = tf.placeholder(tf.float32, shape=(None, D), name='X') tfT = tf.placeholder(tf.float32, shape=(None, K), name='T') act = self.forward(tfX) rcost = reg * sum([tf.nn.l2_loss(p) for p in self.params]) cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=act, labels=tfT)) + rcost predction = self.predict(tfX) train_op = tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=mu).minimize(cost) n_batches = int(N / batch_sz) costs = [] init = tf.initialize_all_variables() with tf.Session() as session: session.run(init) for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] session.run(train_op, feed_dict={tfX: Xbatch, tfT: Ybatch}) if j % 20 == 0: c = session.run(cost, feed_dict={ tfX: Xvalid, tfT: Yvalid }) costs.append(c) p = session.run(predction, feed_dict={ tfX: Xvalid, tfT: Yvalid }) e = error_rate(Yvalid_flat, p) print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error_rate", e) if show_fig: plt.plot(costs) plt.show()
def fit(self, X, Y, lr=1e-3, mu=0.99, reg=1e-3, decay=0.99999, eps=1e-10, batch_sz=30, epochs=3, show_fig=True): lr = np.float32(lr) mu = np.float32(mu) reg = np.float32(reg) decay = np.float32(decay) eps = np.float32(eps) K = len(set(Y)) # make a validation set X, Y = shuffle(X, Y) X = X.astype(np.float32) Y = y2indicator(Y).astype(np.float32) Xvalid, Yvalid = X[-1000:], Y[-1000:] X, Y = X[:-1000], Y[:-1000] Yvalid_flat = np.argmax(Yvalid, axis=1) # for calculating error rate # initialize convpool layers N, width, height, c = X.shape mi = c outw = width outh = height self.convpool_layers = [] for mo, fw, fh in self.convpool_layer_sizes: layer = ConvPoolLayer(mi, mo, fw, fh) self.convpool_layers.append(layer) outw = outw // 2 outh = outh // 2 mi = mo # initialize mlp layers self.hidden_layers = [] M1 = self.convpool_layer_sizes[-1][ 0] * outw * outh # size must be same as output of last convpool layer count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 # logistic regression layer W, b = init_weight_and_bias(M1, K) self.W = tf.Variable(W, 'W_logreg') self.b = tf.Variable(b, 'b_logreg') # collect params for later use self.params = [self.W, self.b] for h in self.convpool_layers: self.params += h.params for h in self.hidden_layers: self.params += h.params # set up tensorflow functions and variables tfX = tf.placeholder(tf.float32, shape=(None, width, height, c), name='X') tfY = tf.placeholder(tf.float32, shape=(None, K), name='Y') act = self.forward(tfX) rcost = reg * sum([tf.nn.l2_loss(p) for p in self.params]) cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=act, labels=tfY)) + rcost prediction = self.predict(tfX) train_op = tf.train.RMSPropOptimizer(lr, decay=decay, momentum=mu).minimize(cost) n_batches = N // batch_sz costs = [] init = tf.global_variables_initializer() with tf.Session() as session: session.run(init) for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] session.run(train_op, feed_dict={tfX: Xbatch, tfY: Ybatch}) if j % 20 == 0: c = session.run(cost, feed_dict={ tfX: Xvalid, tfY: Yvalid }) costs.append(c) p = session.run(prediction, feed_dict={ tfX: Xvalid, tfY: Yvalid }) e = error_rate(Yvalid_flat, p) print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e) if show_fig: plt.plot(costs) plt.show()
def main(): #Get train and test data XTrain, YTrain = get_train_data() YTrain_ind = y2indicator(YTrain) XTrain = reshape(XTrain) XTest, YTest = get_test_data() YTest_ind = y2indicator(YTest) XTest = reshape(XTest) N, K = YTrain_ind.shape lr = np.float32(0.001) mu = np.float32(0.99) reg = np.float32(0.01) poolsz = (2, 2) M = 100 batch_sz = 500 no_batches = int(N / batch_sz) #Initial random weights W1_shape = (5, 5, 3, 20) W1_init = init_filter(W1_shape, poolsz) b1_init = np.zeros([W1_shape[3]]) W2_shape = (5, 5, 20, 50) W2_init = init_filter(W2_shape, poolsz) b2_init = np.zeros([W2_shape[3]]) W3_init = np.random.randn(W2_shape[3] * 8 * 8, M) / np.sqrt(W2_shape[3] * 8 * 8 + M) b3_init = np.zeros([M]) W4_init = np.random.randn(M, K) / np.sqrt(M + K) b4_init = np.zeros([K]) #Tensorflow variables X = tf.placeholder(name='X', dtype='float32', shape=(batch_sz, 32, 32, 3)) Y = tf.placeholder(name='Y', dtype='float32', shape=(batch_sz, K)) W1 = tf.Variable(W1_init.astype(np.float32), name='W1') b1 = tf.Variable(b1_init.astype(np.float32), name='b1') W2 = tf.Variable(W2_init.astype(np.float32), name='W2') b2 = tf.Variable(b2_init.astype(np.float32), name='b2') W3 = tf.Variable(W3_init.astype(np.float32), name='W3') b3 = tf.Variable(b3_init.astype(np.float32), name='b3') W4 = tf.Variable(W4_init.astype(np.float32), name='W4') b4 = tf.Variable(b4_init.astype(np.float32), name='b4') #Forward prop Z1 = convpool(X, W1, b1) Z2 = convpool(Z1, W2, b2) Z2_shape = Z2.get_shape().as_list() Z2_flat = tf.reshape(Z2, [Z2_shape[0], np.prod(Z2_shape[1:])]) Z3 = tf.nn.relu(tf.matmul(Z2_flat, W3) + b3) pY = tf.matmul(Z3, W4) + b4 #Cost and prediction cost = tf.reduce_sum( tf.nn.softmax_cross_entropy_with_logits(logits=pY, labels=Y)) #Train function train = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=mu).minimize(cost) #Get prediction pred = tf.argmax(pY, axis=1) init = tf.global_variables_initializer() with tf.Session() as session: session.run(init) for i in range(100): for n in range(no_batches): #get current batches XBatch = XTrain[n * batch_sz:(n * batch_sz + batch_sz), :] YBatch_ind = YTrain_ind[n * batch_sz:(n * batch_sz + batch_sz), :] #Train on the current batch session.run(train, feed_dict={X: XBatch, Y: YBatch_ind}) if (n % 200 == 0): YBatch = YTrain[n * batch_sz:(n * batch_sz + batch_sz)] c = session.run(cost, feed_dict={X: XBatch, Y: YBatch_ind}) P = session.run(pred, feed_dict={X: XBatch}) er = error_rate(P, YBatch) print("Iteration: ", i, "Cost: ", c, "Error rate: ", er)
def main(): max_iter = 20 print_period = 20 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() Ytrain_ind = y2indicator(Ytrain) # Target of train data Ytest_ind = y2indicator(Ytest) # Target of test data lr = 0.00004 reg = 0.01 N, D = Xtrain.shape M = 300 K = 10 np.random.seed(123) W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) batch_sz = 500 n_batches = N // batch_sz # 82 # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. learning rate = constant losses_batch = [] errors_batch = [] for i in range(max_iter): # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] # Target of each batch pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_batch.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. RMSprop W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_rms = [] errors_rms = [] ''' in RMSprop you can use a bigger lr, but if you set this too high you'll get NaN! if you use the same learning rate within RMSprop and General method, there is only slight difference between them. 
''' lr0 = 0.001 cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 decay_rate = 0.999 eps = 1e-10 for i in range(max_iter): # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] # Target of each batch pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # # update # cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*np.square(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) # W2 -= lr0 / (np.sqrt(cache_W2) + eps) *(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) # cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*np.square(derivative_b2(Ybatch, pYbatch) + reg*b2) # b2 -= lr0 / (np.sqrt(cache_b2) + eps) *(derivative_b2(Ybatch, pYbatch) + reg*b2) # cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*np.square(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2)) # W1 -= lr0 / (np.sqrt(cache_W1) + eps) *(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) # cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*np.square(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) # b1 -= lr0 / (np.sqrt(cache_b1) + eps) *(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) # updates # a smarter way is to factor out the gradient terms that are computed repeatedly above and assign each one to a variable so it is only computed once -- this speeds things up gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_rms.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_rms.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(losses_batch, label='constant') plt.plot(losses_rms, label='RMSprop') plt.legend() plt.show()
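# --- Illustrative sketch (not part of the script above): per-parameter RMSProp update ---
# As with the Adam helper earlier, the four cache/parameter updates above share one pattern,
# and the comment's point about computing each gradient only once becomes explicit when the
# caller passes the gradient in. The helper name and toy usage below are illustrative
# assumptions, not part of the script.
import numpy as np

def rmsprop_step(param, grad, cache, lr=0.001, decay=0.999, eps=1e-10):
    cache = decay * cache + (1 - decay) * grad * grad
    param = param - lr * grad / (np.sqrt(cache) + eps)
    return param, cache

# toy usage: minimize 0.5 * w**2, whose gradient is w
w, cache = 5.0, 1.0   # the script above also initializes each cache to 1
for _ in range(10000):
    w, cache = rmsprop_step(w, w, cache)
print(w)  # close to 0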
def main(): ''' RMSprop is a form adaptative learning rate which decreases over time ''' max_iter = 20 #for RelU #max_iter = 30 #for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.0004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M =300 K=10 #1. batch SGD W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:((j+1)*batch_sz), :] Ybatch = Ytrain_ind[j*batch_sz:((j+1)*batch_sz), :] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch,pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period ==0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) #2. RMSProp W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr0 = 0.001 cache_W2 = 0 cache_b2 = 0 cache_W1 = 0 cache_b1 = 0 decay_rate = 1 - 1e-5 eps = 1e-10 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:((j+1)*batch_sz), :] Ybatch = Ytrain_ind[j*batch_sz:((j+1)*batch_sz), :] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) #updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1-decay_rate)*gW2*gW2 W2 -= lr0*gW2 /(np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch,pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1-decay_rate)*gb2*gb2 b2 -= lr0*gb2 /(np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1-decay_rate)*gW1*gW1 W1 -= lr0*gW1 /(np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1-decay_rate)*gb1*gb1 b1 -= lr0*gb1 /(np.sqrt(cache_b1) + eps) if j % print_period ==0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_rms.append(ll) print "RMS Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_rms.append(err) print "RMS Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "RMS Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label='batch') plt.plot(LL_rms, label='rms') plt.legend() plt.show()
def score(self, X, Y): prediction = self.predict(X) return 1 - error_rate(Y, prediction)
def fit(self, X, Y, learning_rate=10e-7, mu=0.99, decay=0.999, reg=10e-12, eps=10e-10, epochs=400, batch_sz=100, show_fig=False): learning_rate = np.float32(learning_rate) mu = np.float32(mu) decay = np.float32(decay) reg = np.float32(reg) eps = np.float32(eps) # make a validation set X, Y = shuffle(X, Y) X = X.astype(np.float32) Y = Y.astype(np.int32) Xvalid, Yvalid = X[-1000:], Y[-1000:] X, Y = X[:-1000], Y[:-1000] # initialize hidden layers N, D = X.shape K = len(set(Y)) self.hidden_layers = [] M1 = D count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 W, b = weights_and_bias_init(M1, K) self.W = theano.shared(W, 'W_logreg') self.b = theano.shared(b, 'b_logreg') # collect params for later use self.params = [self.W, self.b] for h in self.hidden_layers: self.params += h.params # for momentum dparams = [ theano.shared(np.zeros(p.get_value().shape, dtype=np.float32)) for p in self.params ] # for rmsprop cache = [ theano.shared(np.zeros(p.get_value().shape, dtype=np.float32)) for p in self.params ] # set up theano functions and variables thX = T.fmatrix('X') thT = T.ivector('T') Y = self.th_forward(thX) rcost = reg * T.sum([(p * p).sum() for p in self.params]) cost = -T.mean(T.log(Y[T.arange(thT.shape[0]), thT])) + rcost prediction = self.th_predict(thX) # actual prediction function self.predict_op = theano.function(inputs=[thX], outputs=prediction) cost_predict_op = theano.function(inputs=[thX, thT], outputs=[cost, prediction]) updates = [ (c, decay * c + (np.float32(1) - decay) * T.grad(cost, p) * T.grad(cost, p)) for p, c in zip(self.params, cache) ] + [ (p, p + mu * dp - learning_rate * T.grad(cost, p) / T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams) ] + [(dp, mu * dp - learning_rate * T.grad(cost, p) / T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams)] # momentum only # updates = [ # (p, p + mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, dparams) # ] + [ # (dp, mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, dparams) # ] train_op = theano.function(inputs=[thX, thT], updates=updates) n_batches = N // batch_sz costs = [] for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] train_op(Xbatch, Ybatch) if j % 20 == 0: c, p = cost_predict_op(Xvalid, Yvalid) costs.append(c) e = error_rate(Yvalid, p) print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e) if show_fig: plt.plot(costs) plt.show()
def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 50 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.00004 reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. batch losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_batch.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. batch with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_momentum = [] errors_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # update velocities dW2 = mu*dW2 - lr*gW2 db2 = mu*db2 - lr*gb2 dW1 = mu*dW1 - lr*gW1 db1 = mu*db1 - lr*gb1 # updates W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_momentum.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_momentum.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 3. 
batch with Nesterov momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_nesterov = [] errors_nesterov = [] mu = 0.9 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # v update vW2 = mu*vW2 - lr*gW2 vb2 = mu*vb2 - lr*gb2 vW1 = mu*vW1 - lr*gW1 vb1 = mu*vb1 - lr*gb1 # param update W2 += mu*vW2 - lr*gW2 b2 += mu*vb2 - lr*gb2 W1 += mu*vW1 - lr*gW1 b1 += mu*vb1 - lr*gb1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_nesterov.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_nesterov.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(losses_batch, label="batch") plt.plot(losses_momentum, label="momentum") plt.plot(losses_nesterov, label="nesterov") plt.legend() plt.show()
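# --- Illustrative sketch (not part of the script above): Nesterov momentum in look-ahead form ---
# The Nesterov block above uses the common reformulation in which the parameter step is
# mu*v - lr*g with the freshly updated velocity. The textbook look-ahead form, which that
# reformulation approximates, evaluates the gradient at the predicted position w + mu*v.
# The helper and toy objective below are illustrative assumptions only.
def nesterov_step(w, v, grad_f, lr=0.01, mu=0.9):
    g = grad_f(w + mu * v)   # gradient at the look-ahead point
    v = mu * v - lr * g
    return w + v, v

# toy usage: minimize 0.5 * w**2, whose gradient is w
w, v = 5.0, 0.0
for _ in range(300):
    w, v = nesterov_step(w, v, lambda x: x)
print(w)  # close to 0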
def main(): max_iter = 10 print_period = 10 X_train, X_test, Y_train, Y_test = get_normalized_data() reg = 0.01 Y_train_ind = y2indicator(Y_train) Y_test_ind = y2indicator(Y_test) N, D = X_train.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(M) b2_0 = np.zeros(K) # 1. Adam W1 = W1_0.copy() W2 = W2_0.copy() b1 = b1_0.copy() b2 = b2_0.copy() losses_adam = [] errors_adam = [] # 1st moment mW1 = 0 mW2 = 0 mb1 = 0 mb2 = 0 # 2nd moment vW1 = 0 vW2 = 0 vb1 = 0 vb2 = 0 # Hyperparams eps = 1e-8 lr = 0.001 beta1 = 0.9 beta2 = 0.999 t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 # new v vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 # bias correction correction1 = 1 - beta1 ** t mW1_hat = mW1 / correction1 mb1_hat = mb1 / correction1 mW2_hat = mW2 / correction1 mb2_hat = mb2 / correction1 correction2 = 1 - beta2 ** t vb2_hat = vb2 / correction2 vb1_hat = vb1 / correction2 vW2_hat = vW2 / correction2 vW1_hat = vW1 / correction2 t += 1 # weights W1 = W1 - lr * mW1_hat / np.sqrt(vW1_hat + eps) b1 = b1 - lr * mb1_hat / np.sqrt(vb1_hat + eps) W2 = W2 - lr * mW2_hat / np.sqrt(vW2_hat + eps) b2 = b2 - lr * mb2_hat / np.sqrt(vb2_hat + eps) if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_adam.append(l) print(f'Adam Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_adam.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) adam_error = error_rate(pY, Y_test) # 3.
RMSProp with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_rms = [] errors_rms = [] # comparable hyper parameters for fair lr0 = 0.001 mu = 0.9 decay_rate = 0.999 eps = 1e-8 # rmsprop cache cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 # momentum dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps) W2 -= dW2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps) b2 -= db2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps) W1 -= dW1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps) b1 -= db1 if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_rms.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') err = error_rate(pY, Y_test) errors_rms.append(err) print("Error rate:", err) pY, _ = forward(X_test, W1, b1, W2, b2) rms_error = error_rate(pY, Y_test) print(f"Final RMSProp error rate: {rms_error}") print(f"Final Adam error rate: {adam_error}") plt.plot(losses_adam, label='batch cost') plt.plot(losses_rms, label='RMSProp cost') plt.legend() plt.show()
def fit(self, X, Y, lr=1e-3, mu=0.99, reg=1e-3, decay=0.99999, eps=1e-10, batch_sz=30, epochs=3, show_fig=True): lr = np.float32(lr) mu = np.float32(mu) reg = np.float32(reg) decay = np.float32(decay) eps = np.float32(eps) # A cross validation set X, Y = shuffle(X, Y) X = X.astype(np.float32) Y = Y.astype(np.int32) Xvalid, Yvalid = X[-1000:], Y[-1000:] X, Y = X[:-1000], Y[:-1000] # initialize convpool layers N, c, width, height = X.shape mi = c outw = width outh = height self.convpool_layers = [] for mo, fw, fh in self.convpool_layer_sizes: layer = ConvPoolLayer(mi, mo, fw, fh) self.convpool_layers.append(layer) outw = (outw - fw + 1) // 2 outh = (outh - fh + 1) // 2 mi = mo # initialize mlp layers K = len(set(Y)) self.hidden_layers = [] M1 = self.convpool_layer_sizes[-1][ 0] * outw * outh # size must be same as output of last convpool layer count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 # logistic regression layer W, b = init_weight_and_bias(M1, K) self.W = theano.shared(W, 'W_logreg') self.b = theano.shared(b, 'b_logreg') # collect params for later use self.params = [self.W, self.b] for c in self.convpool_layers: self.params += c.params for h in self.hidden_layers: self.params += h.params # set up theano functions and variables thX = T.tensor4('X', dtype='float32') thY = T.ivector('Y') pY = self.forward(thX) rcost = reg * T.sum([(p * p).sum() for p in self.params]) cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost prediction = self.th_predict(thX) cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction]) updates = rmsprop(cost, self.params, lr, mu, decay, eps) train_op = theano.function(inputs=[thX, thY], outputs=cost, updates=updates) n_batches = N // batch_sz costs = [] for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] train_c = train_op(Xbatch, Ybatch) if j % 20 == 0: c, p = cost_predict_op(Xvalid, Yvalid) costs.append(c) e = error_rate(Yvalid, p) print("i:", i, "j:", j, "nb:", n_batches, "train cost:", train_c, "cost:", c, "error rate:", e) if show_fig: plt.plot(costs) plt.show()
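# --- Illustrative sketch (not part of the class above): conv/pool output-size arithmetic ---
# The flattened size fed into the first dense layer (M1) has to match the conv/pool arithmetic
# exactly: a 'valid' convolution shrinks each dimension by fw-1 (or fh-1) and the 2x2 pool
# halves it, which is what the loop above computes. The helper below is an illustrative
# assumption, not part of the class.
def convpool_out_size(width, height, convpool_layer_sizes, poolsz=2):
    outw, outh = width, height
    for _, fw, fh in convpool_layer_sizes:
        outw = (outw - fw + 1) // poolsz
        outh = (outh - fh + 1) // poolsz
    return outw, outh

# example matching the comments in the Theano CNN earlier:
# 32x32 input, two 5x5 conv/pool layers -> (32-5+1)//2 = 14, then (14-5+1)//2 = 5
print(convpool_out_size(32, 32, [(20, 5, 5), (50, 5, 5)]))  # (5, 5)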
def fit(self, X, Y, Xtest, Ytest, pretrain=True, learning_rate=0.01, mu=0.99, reg=0.1, epochs=1, batch_sz=100): # greedy layer-wise training of autoencoders pretrain_epochs = 1 if not pretrain: pretrain_epochs = 0 current_input = X for ae in self.hidden_layers: ae.fit(current_input, epochs=pretrain_epochs) # create current_input for the next layer current_input = ae.hidden_op(current_input) # initialize logistic regression layer N = len(Y) K = len(set(Y)) W0 = init_weights((self.hidden_layers[-1].M, K)) self.W = theano.shared(W0, "W_logreg") self.b = theano.shared(np.zeros(K), "b_logreg") self.params = [self.W, self.b] for ae in self.hidden_layers: self.params += ae.forward_params # for momentum self.dW = theano.shared(np.zeros(W0.shape), "dW_logreg") self.db = theano.shared(np.zeros(K), "db_logreg") self.dparams = [self.dW, self.db] for ae in self.hidden_layers: self.dparams += ae.forward_dparams X_in = T.matrix('X_in') targets = T.ivector('Targets') pY = self.forward(X_in) # squared_magnitude = [(p*p).sum() for p in self.params] # reg_cost = T.sum(squared_magnitude) cost = -T.mean( T.log(pY[T.arange(pY.shape[0]), targets]) ) #+ reg*reg_cost prediction = self.predict(X_in) cost_predict_op = theano.function( inputs=[X_in, targets], outputs=[cost, prediction], ) updates = [ (p, p + mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams) ] + [ (dp, mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams) ] # updates = [(p, p - learning_rate*T.grad(cost, p)) for p in self.params] train_op = theano.function( inputs=[X_in, targets], updates=updates, ) n_batches = N // batch_sz costs = [] print("supervised training...") for i in range(epochs): print("epoch:", i) X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j*batch_sz:(j*batch_sz + batch_sz)] Ybatch = Y[j*batch_sz:(j*batch_sz + batch_sz)] train_op(Xbatch, Ybatch) the_cost, the_prediction = cost_predict_op(Xtest, Ytest) error = error_rate(the_prediction, Ytest) print("j / n_batches:", j, "/", n_batches, "cost:", the_cost, "error:", error) costs.append(the_cost) plt.plot(costs) plt.show()
def fit(self, X, Y, learning_rate=1e-4, mu=0.9, decay=0.9, epochs=15, batch_sz=100, display_cost=False, save_params=False): # set evarything to np.float32 to enable tf computation running correctly learning_rate = np.float32(learning_rate) mu = np.float32(mu) decay = np.float32(decay) # create a vailidation set: X, Y = shuffle(X, Y) Xvalid, Yvalid = X[-1000:, ], Y[-1000:] X, Y = X[:-1000, ], Y[:-1000] # initialize hidden layers: N, D = X.shape K = len(set(Y)) self.hidden_layers = [] M1 = D count = 0 # iterate the self.hidden_layer_sizes list through M1 variable: for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 # the last_hidden_layer-output_layer weights and bias: W, b = init_weight_and_bias(M1, K) self.W = tf.Variable(W, name='W%s' % count) self.b = tf.Variable(b, name='b%s' % count) # collect all the network's parameters: self.params = [self.W, self.b] for h in self.hidden_layers: self.params += h.parameters # define tensorflow placeholders: tfX = tf.placeholder(tf.float32, shape=(None, D), name='X') tfT = tf.placeholder(tf.int32, shape=(None, ), name='T') # the logits ouputs of the network: Y_logits = self.forward_train(tfX) # define the expression for cost: cost = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits(logits=Y_logits, labels=tfT)) # define the tensorflow train function: train_op = tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=mu).minimize(cost) predict_op = self.predict(tfX) # validation cost will be calculated separately since nothing will be dropped Y_logits_valid = self.forward_predict(tfX) cost_valid = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits( logits=Y_logits_valid, labels=tfT)) n_batches = N // batch_sz costs = [] init = tf.global_variables_initializer() with tf.Session() as session: # initialize all tf variables: print('\nInitializing variables...') session.run(init) print('\nPerforming batch SGD with RMSProp and momentum...') for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j + 1) * batch_sz, :] Ybatch = Y[j * batch_sz:(j + 1) * batch_sz] session.run(train_op, feed_dict={tfX: Xbatch, tfT: Ybatch}) if j % 20 == 0: c = session.run(cost_valid, feed_dict={ tfX: Xvalid, tfT: Yvalid }) costs.append(c) prediction = session.run(predict_op, feed_dict={ tfX: Xvalid, tfT: Yvalid }) #print(prediction) error = error_rate(Yvalid, prediction) print('\ni: %d, j: %d, cost: %.6f, error: %.6f' % (i, j, c, error)) # make the final prediction: prediction = session.run(predict_op, feed_dict={tfX: Xvalid}) final_error = error_rate(Yvalid, prediction) if save_params: for h in self.hidden_layers: p_type = 'W' for p in h.parameters: p = p.eval() #print(type(p)) #print(p.shape) name = p_type + str(h.id) np.save(name, p) p_type = 'b' # last hidden layer - output layer parameters: np.save('W%s' % count, self.W.eval()) np.save('b%s' % count, self.b.eval()) if display_cost: plt.plot(costs) plt.show()
def fit(self, X, Y, lr=10e-4, mu=0.99, reg=10e-4, decay=0.99999, eps=10e-3, batch_sz=30, epochs=3, show_fig=True): lr = np.float32(lr) mu = np.float32(mu) reg = np.float32(reg) decay = np.float32(decay) eps = np.float32(eps) K = len(set(Y)) # make a validation set X, Y = shuffle(X, Y) X = X.astype(np.float32) Y = y2indicator(Y).astype(np.float32) Xvalid, Yvalid = X[-1000:], Y[-1000:] X, Y = X[:-1000], Y[:-1000] Yvalid_flat = np.argmax(Yvalid, axis=1) # for calculating error rate # initialize convpool layers N, d, d, c = X.shape mi = c outw = d outh = d self.convpool_layers = [] for mo, fw, fh in self.convpool_layer_sizes: layer = ConvPoolLayer(mi, mo, fw, fh) self.convpool_layers.append(layer) outw = outw / 2 outh = outh / 2 mi = mo # initialize mlp layers self.hidden_layers = [] M1 = self.convpool_layer_sizes[-1][0]*outw*outh # size must be same as output of last convpool layer count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 # logistic regression layer W, b = init_weight_and_bias(M1, K) self.W = tf.Variable(W, 'W_logreg') self.b = tf.Variable(b, 'b_logreg') # collect params for later use self.params = [self.W, self.b] for h in self.convpool_layers: self.params += h.params for h in self.hidden_layers: self.params += h.params # set up tensorflow functions and variables tfX = tf.placeholder(tf.float32, shape=(None, d, d, c), name='X') tfY = tf.placeholder(tf.float32, shape=(None, K), name='Y') act = self.forward(tfX) rcost = reg*sum([tf.nn.l2_loss(p) for p in self.params]) cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(act, tfY)) + rcost prediction = self.predict(tfX) train_op = tf.train.RMSPropOptimizer(lr, decay=decay, momentum=mu).minimize(cost) n_batches = N / batch_sz costs = [] init = tf.initialize_all_variables() with tf.Session() as session: session.run(init) for i in xrange(epochs): X, Y = shuffle(X, Y) for j in xrange(n_batches): Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)] Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)] session.run(train_op, feed_dict={tfX: Xbatch, tfY: Ybatch}) if j % 20 == 0: c = session.run(cost, feed_dict={tfX: Xvalid, tfY: Yvalid}) costs.append(c) p = session.run(prediction, feed_dict={tfX: Xvalid, tfY: Yvalid}) e = error_rate(Yvalid_flat, p) print "i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e if show_fig: plt.plot(costs) plt.show()
def main(): Xtrain, Xtest, Ytrain, Ytest = get_transformed_data() print("Performing logistic regression...") N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) # 1. full W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) LL = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in range(50): p_y = forward(Xtrain, W, b) W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W) b += lr * (gradb(Ytrain_ind, p_y) - reg * b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL.append(ll) if i % 1 == 0: err = error_rate(p_y_test, Ytest) if i % 10 == 0: print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsed time for full GD:", datetime.now() - t0) # 2. stochastic W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) LL_stochastic = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in range(50): # takes very long since we're computing cost for 41k samples tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for n in range(min(N, 500)): # shortcut so it won't take so long... x = tmpX[n, :].reshape(1, D) # here x is a vector of shape (D,); reshape it into a 1xD matrix so the forward pass and the weight updates can be computed y = tmpY[n, :].reshape(1, 10) # y is also a vector; reshape it into a 1x10 matrix to match the indicator targets p_y = forward(x, W, b) W += lr * (gradW(y, p_y, x) - reg * W) b += lr * (gradb(y, p_y) - reg * b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_stochastic.append(ll) if i % 1 == 0: err = error_rate(p_y_test, Ytest) if i % 10 == 0: print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsed time for SGD:", datetime.now() - t0) # 3. batch W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) LL_batch = [] lr = 0.0001 reg = 0.01 batch_sz = 500 n_batches = N // batch_sz t0 = datetime.now() for i in range(50): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :] y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :] p_y = forward(x, W, b) W += lr * (gradW(y, p_y, x) - reg * W) b += lr * (gradb(y, p_y) - reg * b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_batch.append(ll) if i % 1 == 0: err = error_rate(p_y_test, Ytest) if i % 10 == 0: print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsed time for batch GD:", datetime.now() - t0) x1 = np.linspace(0, 1, len(LL)) plt.plot(x1, LL, label="full") x2 = np.linspace(0, 1, len(LL_stochastic)) plt.plot(x2, LL_stochastic, label="stochastic") x3 = np.linspace(0, 1, len(LL_batch)) plt.plot(x3, LL_batch, label="batch") plt.legend() plt.show()
def fit(self, X, Y, learning_rate=0.01, mu=0.99, epochs=30, batch_sz=100):
    N, D = X.shape
    K = len(set(Y))

    self.hidden_layers = []
    mi = D
    for mo in self.hidden_layer_sizes:
        h = HiddenLayer(mi, mo)
        self.hidden_layers.append(h)
        mi = mo

    # initialize logistic regression layer
    W = init_weights((mo, K))
    b = np.zeros(K)
    self.W = theano.shared(W)
    self.b = theano.shared(b)

    self.params = [self.W, self.b]
    self.allWs = []
    for h in self.hidden_layers:
        self.params += h.params
        self.allWs.append(h.W)
    self.allWs.append(self.W)

    X_in = T.matrix('X_in')
    targets = T.ivector('Targets')
    pY = self.forward(X_in)

    cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets]))
    prediction = self.predict(X_in)
    # cost_predict_op = theano.function(
    #     inputs=[X_in, targets],
    #     outputs=[cost, prediction],
    # )

    # classical momentum: each parameter keeps a velocity dp
    dparams = [theano.shared(p.get_value()*0) for p in self.params]
    grads = T.grad(cost, self.params)
    updates = [
        (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
    ]
    train_op = theano.function(
        inputs=[X_in, targets],
        outputs=[cost, prediction],
        updates=updates,
    )

    n_batches = N // batch_sz
    costs = []
    lastWs = [W.get_value() for W in self.allWs]
    W_changes = []
    print("supervised training...")
    for i in range(epochs):
        print("epoch:", i)
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            Xbatch = X[j*batch_sz:(j*batch_sz + batch_sz)]
            Ybatch = Y[j*batch_sz:(j*batch_sz + batch_sz)]
            c, p = train_op(Xbatch, Ybatch)

            if j % 100 == 0:
                print("j / n_batches:", j, "/", n_batches, "cost:", c, "error:", error_rate(p, Ybatch))
            costs.append(c)

            # log changes in all Ws
            W_change = [np.abs(W.get_value() - lastW).mean() for W, lastW in zip(self.allWs, lastWs)]
            W_changes.append(W_change)
            lastWs = [W.get_value() for W in self.allWs]

    W_changes = np.array(W_changes)
    plt.subplot(2, 1, 1)
    for i in range(W_changes.shape[1]):
        plt.plot(W_changes[:, i], label='layer %s' % i)
    plt.legend()
    # plt.show()

    plt.subplot(2, 1, 2)
    plt.plot(costs)
    plt.show()
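For reference, the two update lists above implement classical momentum. Written in plain numpy the same rule reads as below; this is only a sketch, and params, compute_grads, and n_steps are placeholder names, not objects from the code above.

# classical momentum, the rule encoded by the Theano `updates` list above
# (plain-numpy sketch; `params`, `compute_grads`, `n_steps` are placeholders)
velocities = [np.zeros_like(p) for p in params]
for step in range(n_steps):
    grads = compute_grads(params)  # stands in for T.grad(cost, self.params)
    for k in range(len(params)):
        velocities[k] = mu * velocities[k] - learning_rate * grads[k]
        params[k] = params[k] + velocities[k]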
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20  # make it 30 for sigmoid
    print_period = 50

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch
    losses_batch = []
    errors_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print("first batch cost:", cost(pYbatch, Ybatch))

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # updates
            W2 -= lr*gW2
            b2 -= lr*gb2
            W1 -= lr*gW1
            b1 -= lr*gb1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_momentum = []
    errors_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # update velocities
            dW2 = mu*dW2 - lr*gW2
            db2 = mu*db2 - lr*gb2
            dW1 = mu*dW1 - lr*gW1
            db1 = mu*db1 - lr*gb1

            # updates
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_momentum.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_momentum.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 3. batch with Nesterov momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_nesterov = []
    errors_nesterov = []
    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # v update
            vW2 = mu*vW2 - lr*gW2
            vb2 = mu*vb2 - lr*gb2
            vW1 = mu*vW1 - lr*gW1
            vb1 = mu*vb1 - lr*gb1

            # param update
            W2 += mu*vW2 - lr*gW2
            b2 += mu*vb2 - lr*gb2
            W1 += mu*vW1 - lr*gW1
            b1 += mu*vb1 - lr*gb1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_nesterov.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_nesterov.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(losses_batch, label="batch")
    plt.plot(losses_momentum, label="momentum")
    plt.plot(losses_nesterov, label="nesterov")
    plt.legend()
    plt.show()
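The Nesterov block above uses a common rewriting that avoids evaluating the gradient at a lookahead point. For comparison, the textbook form for a single parameter is roughly the sketch below; grad() and n_steps are placeholders, not functions from the code above.

# textbook Nesterov momentum for a single parameter W (sketch only;
# grad() and n_steps are placeholders, not part of the script above)
v = 0
for step in range(n_steps):
    g = grad(W + mu * v)   # gradient evaluated at the lookahead position
    v = mu * v - lr * g
    W = W + v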
def fit(self, X, Y, learning_rate=1e-2, mu=0.99, decay=0.999, reg=1e-3, epochs=10, batch_sz=100, show_fig=False):
    K = len(set(Y))  # must be computed now, before Y is turned into an indicator matrix

    # make a validation set
    X, Y = shuffle(X, Y)
    X = X.astype(np.float32)
    Y = y2indicator(Y).astype(np.float32)
    # Y = Y.astype(np.int32)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    Yvalid_flat = np.argmax(Yvalid, axis=1)  # for calculating error rate
    X, Y = X[:-1000], Y[:-1000]

    # initialize hidden layers
    N, D = X.shape
    self.hidden_layers = []
    M1 = D
    count = 0
    for M2 in self.hidden_layer_sizes:
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        M1 = M2
        count += 1
    W, b = init_weight_and_bias(M1, K)
    self.W = tf.Variable(W.astype(np.float32))
    self.b = tf.Variable(b.astype(np.float32))

    # collect params for later use
    self.params = [self.W, self.b]
    for h in self.hidden_layers:
        self.params += h.params

    # set up tensorflow functions and variables
    tfX = tf.placeholder(tf.float32, shape=(None, D), name='X')
    tfT = tf.placeholder(tf.float32, shape=(None, K), name='T')
    act = self.forward(tfX)

    rcost = reg*sum([tf.nn.l2_loss(p) for p in self.params])
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=act,
            labels=tfT
        )
    ) + rcost
    prediction = self.predict(tfX)
    train_op = tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=mu).minimize(cost)

    n_batches = N // batch_sz
    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)]
                Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)]

                session.run(train_op, feed_dict={tfX: Xbatch, tfT: Ybatch})

                if j % 20 == 0:
                    c = session.run(cost, feed_dict={tfX: Xvalid, tfT: Yvalid})
                    costs.append(c)

                    p = session.run(prediction, feed_dict={tfX: Xvalid, tfT: Yvalid})
                    e = error_rate(Yvalid_flat, p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()
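Assuming the enclosing class is something like ANN(hidden_layer_sizes) with the forward/predict methods referenced above, and a loader that returns (X, Y) with integer labels, a typical call might look like the following. The class name ANN, the layer sizes, and getData() are assumptions for illustration, not definitions from this file.

# hypothetical usage; ANN and getData() are assumed names, not shown above
X, Y = getData()
model = ANN([2000, 1000, 500])
model.fit(X, Y, show_fig=True)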
def fit(self, X, Y, lr=1e-3, mu=0.99, reg=1e-3, decay=0.99999, eps=1e-10, batch_sz=30, epochs=3, show_fig=True):
    lr = np.float32(lr)
    mu = np.float32(mu)
    reg = np.float32(reg)
    decay = np.float32(decay)
    eps = np.float32(eps)

    # make a validation set
    X, Y = shuffle(X, Y)
    X = X.astype(np.float32)
    Y = Y.astype(np.int32)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    X, Y = X[:-1000], Y[:-1000]

    # initialize convpool layers
    N, c, width, height = X.shape
    mi = c
    outw = width
    outh = height
    self.convpool_layers = []
    for mo, fw, fh in self.convpool_layer_sizes:
        layer = ConvPoolLayer(mi, mo, fw, fh)
        self.convpool_layers.append(layer)
        outw = (outw - fw + 1) // 2
        outh = (outh - fh + 1) // 2
        mi = mo

    # initialize mlp layers
    K = len(set(Y))
    self.hidden_layers = []
    M1 = self.convpool_layer_sizes[-1][0]*outw*outh  # size must be same as output of last convpool layer
    count = 0
    for M2 in self.hidden_layer_sizes:
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        M1 = M2
        count += 1

    # logistic regression layer
    W, b = init_weight_and_bias(M1, K)
    self.W = theano.shared(W, 'W_logreg')
    self.b = theano.shared(b, 'b_logreg')

    # collect params for later use
    self.params = [self.W, self.b]
    for cp in self.convpool_layers:
        self.params += cp.params
    for h in self.hidden_layers:
        self.params += h.params

    # set up theano functions and variables
    thX = T.tensor4('X', dtype='float32')
    thY = T.ivector('Y')
    pY = self.forward(thX)

    rcost = reg*T.sum([(p*p).sum() for p in self.params])
    cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost
    prediction = self.th_predict(thX)

    cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction])

    updates = rmsprop(cost, self.params, lr, mu, decay, eps)
    train_op = theano.function(
        inputs=[thX, thY],
        outputs=cost,
        updates=updates
    )

    n_batches = N // batch_sz
    costs = []
    for i in range(epochs):
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)]
            Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)]

            train_c = train_op(Xbatch, Ybatch)

            if j % 20 == 0:
                c, p = cost_predict_op(Xvalid, Yvalid)
                costs.append(c)
                e = error_rate(Yvalid, p)
                print(
                    "i:", i, "j:", j, "nb:", n_batches,
                    "train cost:", train_c, "cost:", c, "error rate:", e
                )

    if show_fig:
        plt.plot(costs)
        plt.show()
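The fit() above delegates the parameter updates to an rmsprop(cost, params, lr, mu, decay, eps) helper that is not shown here. One way that helper could be written in Theano, inferred purely from the call signature, is sketched below; it is an assumption, not the original utility.

# a possible rmsprop-with-momentum helper matching the call above;
# a sketch inferred from the signature, not the original utility function
def rmsprop(cost, params, lr, mu, decay, eps):
    grads = T.grad(cost, params)
    updates = []
    for p, g in zip(params, grads):
        # running cache of squared gradients and a momentum term per parameter
        ones = np.ones_like(p.get_value(), dtype=np.float32)
        cache = theano.shared(ones)
        dp = theano.shared(p.get_value() * 0)

        new_cache = decay*cache + (np.float32(1.0) - decay)*g*g
        new_dp = mu*dp - lr*g / T.sqrt(new_cache + eps)

        updates += [(cache, new_cache), (dp, new_dp), (p, p + new_dp)]
    return updates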