def main():
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std
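# ----------------------------------------------------------------------
# Every snippet in this file leans on helpers (get_transformed_data,
# forward, cost, gradW, gradb, error_rate, y2indicator) imported from a
# shared util module that is not shown, plus `from datetime import
# datetime`, `import matplotlib.pyplot as plt`, `import numpy as np` and
# `from sklearn.utils import shuffle`. Below is a minimal sketch of
# plausible helper implementations, assuming softmax logistic regression
# with one-hot targets; the real util module may differ in details
# (e.g. some snippets use descent-style derivative_W/derivative_b with
# the opposite sign convention).

import numpy as np

def softmax(a):
    # subtract the row max for numerical stability
    expA = np.exp(a - a.max(axis=1, keepdims=True))
    return expA / expA.sum(axis=1, keepdims=True)

def forward(X, W, b):
    # class probabilities of a linear model followed by softmax
    return softmax(X.dot(W) + b)

def cost(p_y, T):
    # total cross-entropy between predictions p_y and one-hot targets T
    return -(T * np.log(p_y)).sum()

def gradW(T, p_y, X):
    # log-likelihood gradient w.r.t. W (ascent direction, matching the
    # `W += lr * (gradW(...) - reg * W)` updates used in these scripts)
    return X.T.dot(T - p_y)

def gradb(T, p_y):
    return (T - p_y).sum(axis=0)

def error_rate(p_y, t):
    # fraction of misclassified samples
    return np.mean(np.argmax(p_y, axis=1) != t)

def y2indicator(y):
    # one-hot encode an integer label vector
    N, K = len(y), len(set(y))
    ind = np.zeros((N, K))
    ind[np.arange(N), y.astype(np.int32)] = 1
    return ind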
def benchmark_pca():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    print("Performing logistic regression...")

    N, D = Xtrain.shape
    Ytrain_ind = np.zeros((N, 10))
    for i in range(N):
        Ytrain_ind[i, Ytrain[i]] = 1

    Ntest = len(Ytest)
    Ytest_ind = np.zeros((Ntest, 10))
    for i in range(Ntest):
        Ytest_ind[i, Ytest[i]] = 1

    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL = []
    LLtest = []
    CRtest = []

    # D = 300 -> error = 0.07
    lr = 0.0001
    reg = 0.01
    for i in range(200):
        p_y = forward(Xtrain, W, b)
        # print("p_y:", p_y)
        ll = cost(p_y, Ytrain_ind)
        LL.append(ll)

        p_y_test = forward(Xtest, W, b)
        lltest = cost(p_y_test, Ytest_ind)
        LLtest.append(lltest)

        err = error_rate(p_y_test, Ytest)
        CRtest.append(err)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        if i % 10 == 0:
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)

    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))

    iters = range(len(LL))
    plt.plot(iters, LL, label='train loss')
    plt.plot(iters, LLtest, label='test loss')
    plt.title('Loss')
    plt.legend()
    plt.show()

    plt.plot(CRtest)
    plt.title('Error')
    plt.show()
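# ----------------------------------------------------------------------
# The "D = 300 -> error = 0.07" note in benchmark_pca() assumes that
# get_transformed_data() returns PCA-transformed features. A hedged
# sketch of that transform using sklearn (the course utility may well
# implement PCA by hand instead):

from sklearn.decomposition import PCA

def pca_transform(Xtrain, Xtest, n_components=300):
    # fit PCA on the training set only, then project both splits
    pca = PCA(n_components=n_components)
    Ztrain = pca.fit_transform(Xtrain)
    Ztest = pca.transform(Xtest)
    # standardize components to zero mean / unit variance (train stats)
    mu, std = Ztrain.mean(axis=0), Ztrain.std(axis=0)
    return (Ztrain - mu) / std, (Ztest - mu) / std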
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    print("Performing logistic regression...")

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(50):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)

        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)

    # 2. stochastic
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(50):  # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):  # shortcut so it won't take so long...
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL_stochastic.append(ll)

        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)

    # 3. batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL_batch.append(ll)

        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    np.place(std, std == 0, 1)  # guard against division by zero
    X = (X - mu) / std

    Xtrain, Ytrain = X[:-1000], Y[:-1000]
    Xtest, Ytest = X[-1000:], Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # Full
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL = []
    learning_rate = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(200):
        pY = forward(Xtrain, W, b)

        W -= learning_rate * (derivative_W(pY, Ytrain_ind, Xtrain) + reg * W)
        b -= learning_rate * (derivative_b(pY, Ytrain_ind) + reg * b)

        pYtest = forward(Xtest, W, b)
        ll = cost(pYtest, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:
            err = error_rate(pYtest, Ytest)
            print("Cost at iter %d: %.6f" % (i, ll))
            print("Error rate:", err)
    pY = forward(Xtest, W, b)
    print("Final error rate:", error_rate(pY, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)

    # SGD
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_stochastic = []
    learning_rate = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(1):  # one epoch
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)

            W -= learning_rate * (derivative_W(p_y, y, x) + reg * W)
            b -= learning_rate * (derivative_b(p_y, y) + reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)
            if n % (N // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)

    # Batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    learning_rate = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            p_y = forward(x, W, b)

            W -= learning_rate * (derivative_W(p_y, y, x) + reg * W)
            b -= learning_rate * (derivative_b(p_y, y) + reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)
            if j % (n_batches // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for Batch GD:", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label='full')
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label='stochastic')
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label='batch')
    plt.legend()
    plt.show()
def main():
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print("Performing logistic regression...")
    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # sections 1 (full) and 2 (stochastic) are commented out below, so keep
    # empty loss histories for them to stop the final plot from crashing
    LL = []
    LL_stochastic = []

    # # 1. full
    # W = np.random.randn(D, 10) / 28
    # b = np.zeros(10)
    # LL = []
    # lr = 0.0001
    # reg = 0.01
    # t0 = datetime.now()
    # for i in range(200):
    #     p_y = forward(Xtrain, W, b)
    #
    #     W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
    #     b += lr * (gradb(Ytrain_ind, p_y) - reg * b)
    #
    #     p_y_test = forward(Xtest, W, b)
    #     ll = cost(p_y_test, Ytest_ind)
    #     LL.append(ll)
    #     if i % 10 == 0:
    #         err = error_rate(p_y_test, Ytest)
    #         print("Cost at iteration %d: %.6f" % (i, ll))
    #         print("Error rate:", err)
    # p_y = forward(Xtest, W, b)
    # print("Final error rate:", error_rate(p_y, Ytest))
    # print("Elapsed time for full GD:", datetime.now() - t0)

    # # 2. stochastic
    # W = np.random.randn(D, 10) / 28
    # b = np.zeros(10)
    # LL_stochastic = []
    # lr = 0.0001
    # reg = 0.01
    # t0 = datetime.now()
    # for i in range(1):  # takes very long since we're computing cost for 41k samples
    #     tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
    #     for n in range(min(N, 500)):  # shortcut so it won't take so long...
    #         x = tmpX[n, :].reshape(1, D)
    #         y = tmpY[n, :].reshape(1, 10)
    #         p_y = forward(x, W, b)
    #
    #         W += lr * (gradW(y, p_y, x) - reg * W)
    #         b += lr * (gradb(y, p_y) - reg * b)
    #
    #         p_y_test = forward(Xtest, W, b)
    #         ll = cost(p_y_test, Ytest_ind)
    #         LL_stochastic.append(ll)
    #
    #         if n % (N // 2) == 0:
    #             err = error_rate(p_y_test, Ytest)
    #             print("Cost at iteration %d: %.6f" % (i, ll))
    #             print("Error rate:", err)
    # p_y = forward(Xtest, W, b)
    # print("Final error rate:", error_rate(p_y, Ytest))
    # print("Elapsed time for SGD:", datetime.now() - t0)

    # 3. batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz

    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j + 1) * batch_sz, :]
            y = tmpY[j * batch_sz:(j + 1) * batch_sz, :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)

            if j % (n_batches // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    print("Performing logistic regression...")

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # `epoch` was never defined in the original snippet; assume 50 passes
    # to match the other benchmarks in this file
    epoch = 50

    # 1. Full GD
    W = np.random.randn(D, 10) / 28  # scale by the square root of the dimensionality: sqrt(28 * 28) = 28
    b = np.zeros(10)
    loss_batch = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(epoch):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        temp_loss = cost(p_y_test, Ytest_ind)
        loss_batch.append(temp_loss)

        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, temp_loss))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)
    print("=======================================================")

    # 2. Stochastic GD
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    loss_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(epoch):  # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        # for n in range(min(N, 500)):  # shortcut so it won't take so long...
        for n in range(N):
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        loss = cost(p_y_test, Ytest_ind)
        loss_stochastic.append(loss)

        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, loss))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)
    print("=======================================================")

    # 3. Mini-batch GD
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    loss_mini_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(epoch):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        temp_loss = cost(p_y_test, Ytest_ind)
        loss_mini_batch.append(temp_loss)

        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, temp_loss))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for mini-batch GD:", datetime.now() - t0)

    # Plot graph
    x1 = np.linspace(0, 1, len(loss_batch))
    plt.plot(x1, loss_batch, label="full(batch) GD")
    x2 = np.linspace(0, 1, len(loss_stochastic))
    plt.plot(x2, loss_stochastic, label="stochastic GD")
    x3 = np.linspace(0, 1, len(loss_mini_batch))
    plt.plot(x3, loss_mini_batch, label="mini-batch GD")
    plt.legend()
    plt.show()
def main():
    # get PCA transformed data
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]  # the first 300 features

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print("Performing logistic regression...")
    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    # we're setting our initial weights to be pretty small, proportional
    # to the square root of the dimensionality
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(200):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        # do a forward pass on the test set so that we can calculate the
        # cost on the test set and then plot that
        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:  # calculate the error rate every 10 iterations
            err = error_rate(p_y_test, Ytest)
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)

    # 2. stochastic
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(1):  # takes very long since we're computing cost for 41k samples
        # on each pass, we typically want to shuffle through the training data and the labels
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        # we're actually only going to go through 500 samples because it's slow
        for n in range(min(N, 500)):  # shortcut so it won't take so long...
            # reshape x into a 2-dimensional matrix
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)

            # forward pass to get the output
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)

            if n % (N // 2) == 0:  # calculate the error rate once for every N/2 samples
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)

    # 3. batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            # get the current batch's inputs and targets
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            # forward pass to get the output predictions
            p_y = forward(x, W, b)

            # gradient descent
            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)

            if j % (n_batches // 2) == 0:  # print the error rate every (number of batches)/2 iterations
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():
    X_train, Y_train, X_test, Y_test, N_train, N_test = get_transformed_data()
    X_train = X_train[:32000, ]
    Y_train = Y_train[:32000]
    X_test = X_test[:12000, ]
    Y_test = Y_test[:12000]
    Y_test_for_comp = Y_test

    # compute the length indicator
    Y_train_ind = ylength2indicator(Y_train)
    Y_test_ind = ylength2indicator(Y_test)

    # compute the length only
    Y_train_Q = ytoint(Y_train)
    Y_test_Q = ytoint(Y_test)

    # iteration settings
    max_iter = 150
    print_period = 1000
    N = X_train.shape[0]

    # batch_sz, keep_prob0 and keep_prob are used throughout this script
    # but were never defined in the original snippet; plausible
    # definitions are assumed here
    batch_sz = 100
    keep_prob0 = tf.placeholder(tf.float32, name='keep_prob0')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    n_batches = N // batch_sz
    M = 2048
    K = [7, 11]
    KM = 3072
    poolsz = (2, 2)

    # placeholders and other variables
    X = tf.placeholder(tf.float32, shape=(batch_sz, 40, 40, 3), name='X')
    T0 = tf.placeholder(tf.float32, shape=(batch_sz, K[0]), name='T0')
    T1 = tf.placeholder(tf.float32, shape=(batch_sz, K[1]), name='T1')

    # optimization parameters
    W1_shape = (5, 5, 3, 16)  # (filter_width, filter_height, num_color_channels, num_feature_maps)
    W1_init = init_filter(W1_shape, poolsz)
    b1_init = np.zeros(W1_shape[-1], dtype=np.float32)

    W2_shape = (5, 5, 16, 32)  # (filter_width, filter_height, old_num_feature_maps, num_feature_maps)
    W2_init = init_filter(W2_shape, poolsz)
    b2_init = np.zeros(W2_shape[-1], dtype=np.float32)

    W3_shape = (5, 5, 32, 48)
    W3_init = init_filter(W3_shape, poolsz)
    b3_init = np.zeros(W3_shape[-1], dtype=np.float32)

    W4_shape = (3, 3, 48, 64)
    W4_init = init_filter(W4_shape, poolsz)
    b4_init = np.zeros(W4_shape[-1], dtype=np.float32)

    W5_shape = (3, 3, 64, 128)
    W5_init = init_filter(W5_shape, poolsz)
    b5_init = np.zeros(W5_shape[-1], dtype=np.float32)

    # fan-in of the first dense layer: 128 feature maps on a 2x2 grid
    W6_init = np.random.randn(W5_shape[-1] * 2 * 2, M) / np.sqrt(W5_shape[-1] * 2 * 2 + M)
    b6_init = np.zeros(M, dtype=np.float32)
    W7_init = np.random.randn(M, KM) / np.sqrt(M + KM)
    b7_init = np.zeros(KM, dtype=np.float32)
    W8_init = np.random.randn(KM, K[0]) / np.sqrt(KM + K[0])
    b8_init = np.zeros(K[0], dtype=np.float32)
    W8N_init = np.random.randn(KM, K[1]) / np.sqrt(KM + K[1])
    b8N_init = np.zeros(K[1], dtype=np.float32)

    W1_L = tf.Variable(W1_init.astype(np.float32))
    b1_L = tf.Variable(b1_init.astype(np.float32))
    W2_L = tf.Variable(W2_init.astype(np.float32))
    b2_L = tf.Variable(b2_init.astype(np.float32))
    W3_L = tf.Variable(W3_init.astype(np.float32))
    b3_L = tf.Variable(b3_init.astype(np.float32))
    W4_L = tf.Variable(W4_init.astype(np.float32))
    b4_L = tf.Variable(b4_init.astype(np.float32))
    W5_L = tf.Variable(W5_init.astype(np.float32))
    b5_L = tf.Variable(b5_init.astype(np.float32))
    W6_L = tf.Variable(W6_init.astype(np.float32))
    b6_L = tf.Variable(b6_init.astype(np.float32))
    W7_L = tf.Variable(W7_init.astype(np.float32))
    b7_L = tf.Variable(b7_init.astype(np.float32))
    W8_L = tf.Variable(W8_init.astype(np.float32))
    b8_L = tf.Variable(b8_init.astype(np.float32))

    Z1_L = conv1pool(X, W1_L, b1_L)
    Z2_L = conv2pool(Z1_L, W2_L, b2_L)
    Z3_L = conv2pool(Z2_L, W3_L, b3_L)
    Z4_L = conv2pool(Z3_L, W4_L, b4_L)
    print(Z4_L)
    Z5_L = conv2pool(Z4_L, W5_L, b5_L)
    print(Z5_L)
    Z5_shape_L = Z5_L.get_shape().as_list()
    Z5r_L = tf.reshape(Z5_L, [Z5_shape_L[0], np.prod(Z5_shape_L[1:])])
    Z5r_L = tf.nn.dropout(Z5r_L, keep_prob)
    print(Z5r_L)
    Z6_L = tf.nn.relu(tf.matmul(Z5r_L, W6_L) + b6_L)
    Z6_L = tf.nn.dropout(Z6_L, keep_prob)
    print(Z6_L)
    Z7_L = tf.nn.relu(tf.matmul(Z6_L, W7_L) + b7_L)
    Z7_L = tf.nn.dropout(Z7_L, keep_prob)
    Yish_L = tf.matmul(Z7_L, W8_L) + b8_L
    cost_L = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits(labels=T0, logits=Yish_L))
    train_op_L = tf.train.RMSPropOptimizer(
        0.0001, decay=0.98, momentum=0.9).minimize(cost_L)
    predict_op_L = tf.argmax(Yish_L, 1)

    # one set of per-digit parameters and ops for each of the 5 digit positions
    W1 = [None] * 5
    b1 = [None] * 5
    W2 = [None] * 5
    b2 = [None] * 5
    W3 = [None] * 5
    b3 = [None] * 5
    W4 = [None] * 5
    b4 = [None] * 5
    W5 = [None] * 5
    b5 = [None] * 5
    W6 = [None] * 5
    b6 = [None] * 5
    W7 = [None] * 5
    b7 = [None] * 5
    W8 = [None] * 5
    b8 = [None] * 5
    Yish = [None] * 5
    cost = [None] * 5
    train_op = [None] * 5
    predict_op = [None] * 5

    for h in range(5):
        W1[h] = tf.Variable(W1_init.astype(np.float32))
        b1[h] = tf.Variable(b1_init.astype(np.float32))
        W2[h] = tf.Variable(W2_init.astype(np.float32))
        b2[h] = tf.Variable(b2_init.astype(np.float32))
        W3[h] = tf.Variable(W3_init.astype(np.float32))
        b3[h] = tf.Variable(b3_init.astype(np.float32))
        W4[h] = tf.Variable(W4_init.astype(np.float32))
        b4[h] = tf.Variable(b4_init.astype(np.float32))
        W5[h] = tf.Variable(W5_init.astype(np.float32))
        b5[h] = tf.Variable(b5_init.astype(np.float32))
        W6[h] = tf.Variable(W6_init.astype(np.float32))
        b6[h] = tf.Variable(b6_init.astype(np.float32))
        W7[h] = tf.Variable(W7_init.astype(np.float32))
        b7[h] = tf.Variable(b7_init.astype(np.float32))
        W8[h] = tf.Variable(W8N_init.astype(np.float32))
        b8[h] = tf.Variable(b8N_init.astype(np.float32))

        Z1 = conv1pool(X, W1[h], b1[h])
        Z2 = conv2pool(Z1, W2[h], b2[h])
        Z3 = conv2pool(Z2, W3[h], b3[h])
        Z4 = conv2pool(Z3, W4[h], b4[h])
        Z5 = conv2pool(Z4, W5[h], b5[h])
        Z5_shape = Z5.get_shape().as_list()
        Z5r = tf.reshape(Z5, [Z5_shape[0], np.prod(Z5_shape[1:])])
        Z5r = tf.nn.dropout(Z5r, keep_prob)
        Z6 = tf.nn.relu(tf.matmul(Z5r, W6[h]) + b6[h])
        Z6 = tf.nn.dropout(Z6, keep_prob)
        Z7 = tf.nn.relu(tf.matmul(Z6, W7[h]) + b7[h])
        Z7 = tf.nn.dropout(Z7, keep_prob)
        Yish[h] = tf.matmul(Z7, W8[h]) + b8[h]

        cost[h] = tf.reduce_sum(
            tf.nn.softmax_cross_entropy_with_logits(labels=T1, logits=Yish[h]))
        train_op[h] = tf.train.RMSPropOptimizer(
            0.0001, decay=0.98, momentum=0.9).minimize(cost[h])
        predict_op[h] = tf.argmax(Yish[h], 1)

    # save all the variables
    saver = tf.train.Saver()

    LL = []
    Error_Testing = []
    Yish_log = []

    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        print('Computing the length of the number.')
        for i in range(max_iter):
            Yish_ = np.zeros((len(X_test), K[0]))
            for j in range(n_batches):
                Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
                Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]

                if len(Xbatch) == batch_sz:
                    session.run(train_op_L,
                                feed_dict={X: Xbatch,
                                           T0: Ybatch,
                                           keep_prob0: 0.8,
                                           keep_prob: 0.5})
                    if j % print_period == 0:
                        test_cost = 0
                        prediction_test = np.zeros(len(X_test))
                        prediction_train = np.zeros(len(X_train))
                        for k in range(len(X_test) // batch_sz):
                            Xtestbatch = X_test[k * batch_sz:(k * batch_sz + batch_sz), ]
                            Ytestbatch = Y_test_ind[k * batch_sz:(k * batch_sz + batch_sz), ]
                            test_cost += session.run(cost_L,
                                                     feed_dict={X: Xtestbatch,
                                                                T0: Ytestbatch,
                                                                keep_prob0: 1,
                                                                keep_prob: 1})
                            prediction_test[k * batch_sz:(k * batch_sz + batch_sz)] = session.run(
                                predict_op_L,
                                feed_dict={X: Xtestbatch, keep_prob0: 1, keep_prob: 1})
                            Yish_[k * batch_sz:(k * batch_sz + batch_sz)] = session.run(
                                Yish_L,
                                feed_dict={X: Xtestbatch, keep_prob0: 1, keep_prob: 1})
                        err_testing = error_rate(prediction_test, Y_test_Q)
                        print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err_testing))
                        LL.append(test_cost)
                        Error_Testing.append(err_testing)

        Yish_ = session.run(tf.nn.log_softmax(Yish_))
        Yish_log = np.array([Yish_])
        Yish_log_length = Yish_log
        Yish_log_length = np.reshape(Yish_log_length, (len(X_test), 1, K[0]))

        # evaluating the digits
        Y_train_ = np.array(digit(Y_train))
        Y_test_ = np.array(digit(Y_test))
        Yish_log_ = np.array([]).reshape(len(X_test), 2, 0)

        for h in range(5):
            Y_train = Y_train_[:, h]
            Y_test = Y_test_[:, h]
            Y_train_ind = y2indicator(Y_train)
            Y_test_ind = y2indicator(Y_test)
            t0 = datetime.now()
            LL = []
            Error_Training = []
            Error_Testing = []
            Yish_log = []
            print('Computing the %d digit of the number.' % h)
            for i in range(max_iter):
                for j in range(n_batches):
                    Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
                    Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]

                    if len(Xbatch) == batch_sz:
                        session.run(train_op[h],
                                    feed_dict={X: Xbatch,
                                               T1: Ybatch,
                                               keep_prob0: 0.8,
                                               keep_prob: 0.5})
                        if j % print_period == 0:
                            test_cost = 0
                            prediction_test = np.zeros(len(X_test))
                            prediction_train = np.zeros(len(X_train))
                            Yish_ = np.zeros((len(X_test), K[1]))
                            for k in range(len(X_test) // batch_sz):
                                Xtestbatch = X_test[k * batch_sz:(k * batch_sz + batch_sz), ]
                                Ytestbatch = Y_test_ind[k * batch_sz:(k * batch_sz + batch_sz), ]
                                test_cost += session.run(cost[h],
                                                         feed_dict={X: Xtestbatch,
                                                                    T1: Ytestbatch,
                                                                    keep_prob0: 1,
                                                                    keep_prob: 1})
                                prediction_test_batch = session.run(
                                    predict_op[h],
                                    feed_dict={X: Xtestbatch, keep_prob0: 1, keep_prob: 1})
                                # label 10 encodes the digit 0 in SVHN; map it back to 0
                                for n, item in enumerate(prediction_test_batch):
                                    if item == 10:
                                        prediction_test_batch[n] = 0
                                prediction_test[k * batch_sz:(k * batch_sz + batch_sz)] = prediction_test_batch
                                Yish_[k * batch_sz:(k * batch_sz + batch_sz)] = session.run(
                                    Yish[h],
                                    feed_dict={X: Xtestbatch, keep_prob0: 1, keep_prob: 1})

                            # copy so mutating the transformed labels doesn't alias Y_test
                            Y_test_transformed = Y_test.copy()
                            for n, item in enumerate(Y_test):
                                if item == 10:
                                    Y_test_transformed[n] = 0
                            for n, item in enumerate(Yish_):
                                if np.argmax(item, axis=0) == 10:
                                    Yish_[n] = [0.0909] * 11  # ~1/11, near-uniform over the 11 classes

                            err_testing = error_rate(prediction_test, Y_test_transformed)
                            print("Cost / err on digit h=%d at iteration i=%d, j=%d: %.3f / %.3f" % (h, i, j, test_cost, err_testing))
                            LL.append(test_cost)
                            Error_Testing.append(err_testing)
                            print(session.run(W1[h][0, 0, 0, 3]))

            Yish_ = session.run(tf.nn.log_softmax(Yish_))
            for itr in range(len(Yish_)):
                Yish_log.append([prediction_test[itr],
                                 Yish_[itr, int(prediction_test[itr])]])
            Yish_log = np.array(Yish_log)
            Yish_log_ = np.dstack((Yish_log_, Yish_log))
            # print(np.shape(Yish_log_))

        # model_path is assumed to be defined elsewhere (a checkpoint file path)
        save_path = saver.save(session, model_path)

        # pad to make an artificial form
        b = np.zeros((len(X_test), 2, 1))
        Yish_log_ = np.concatenate((b, Yish_log_), axis=2)
        Yish_log_ = np.concatenate((Yish_log_, b), axis=2)
        Yish_log_whole = np.concatenate((Yish_log_, Yish_log_length), axis=1)
        # print(np.shape(Yish_log_whole))
        # print(Yish_log_whole[1:2, ])

        # inference of the whole number
        # argmax statistics
        Inf_digit = np.zeros((len(X_test), 7))
        Inf_num = np.zeros(len(X_test))
        Inf_digit[:, 0] = Yish_log_whole[:, 1, 0] + Yish_log_whole[:, 2, 0]
        for j in range(len(X_test)):
            for i in range(1, 7):
                Inf_digit[j, i] = sum(Yish_log_whole[j, 1, 1:i]) + Yish_log_whole[j, 2, i]
        Length_digit = np.argmax(Inf_digit, 1)

        # inference
        for i in range(len(Length_digit)):
            if Length_digit[i] == 0:
                Inf_num[i] = 0
            else:
                Inf_num[i] = ''.join([str(int(x)) for x in Yish_log_whole[i, 0, 1:Length_digit[i] + 1]])
        Inf_num = [int(x) for x in Inf_num]
        print(Inf_num[0:9])
        print(Y_test_for_comp[0:9])

        # evaluation of the error rate
        err_testing = error_rate(Y_test_for_comp, Inf_num)
        print(err_testing)
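# ----------------------------------------------------------------------
# The CNN above calls init_filter, conv1pool and conv2pool, which are not
# defined in this snippet. A minimal sketch of what they plausibly do
# (conv + bias + ReLU + 2x2 max-pool), assuming TensorFlow 1.x; the real
# helpers may differ:

import numpy as np
import tensorflow as tf

def init_filter(shape, poolsz):
    # scale filters by fan-in plus fan-out, accounting for pooling
    w = np.random.randn(*shape) / np.sqrt(
        np.prod(shape[:-1]) + shape[-1] * np.prod(shape[:-2]) / np.prod(poolsz))
    return w.astype(np.float32)

def conv2pool(X, W, b):
    conv_out = tf.nn.conv2d(X, W, strides=[1, 1, 1, 1], padding='SAME')
    conv_out = tf.nn.bias_add(conv_out, b)
    return tf.nn.max_pool(tf.nn.relu(conv_out),
                          ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1],
                          padding='SAME')

# conv1pool presumably differs only in minor details (e.g. normalization
# of the raw input); without the original util module we alias it here
conv1pool = conv2pool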
def main():
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print("Performing logistic regression...")
    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(200):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:
            err = error_rate(p_y_test, Ytest)
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)

    # 2. stochastic
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(1):  # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):  # shortcut so it won't take so long...
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)
            if n % (N // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)

    # 3. batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)
            if j % (n_batches // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print("Performing logistic regression...")
    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. Full GD
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL = []  # the whole sequence of loss values over the iterations
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(50):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:
            err = error_rate(p_y_test, Ytest)
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("The loss sequence is given as:", LL)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)

    # 2. Stochastic
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(1):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)
            if n % (N // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)

    # x1 = np.linspace(0, 1, len(LL))
    # plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    plt.legend()
    plt.show()
    print(LL)

    # 3. batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)
            if j % (n_batches // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    print("Performing logistic regression...")

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(50):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)

    # 2. stochastic
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(50):  # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):  # shortcut so it won't take so long...
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL_stochastic.append(ll)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)

    # 3. batch
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL_batch.append(ll)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]

    # normalize the data:
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print('Performing logistic regression...')
    Xtrain, Ytrain = X[:-1000, :], Y[:-1000]
    Xtest, Ytest = X[-1000:, :], Y[-1000:]
    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)
    K = len(set(Y))
    np.random.seed()

    # 1. Full Gradient Descent:
    W = np.random.randn(D, K) / np.sqrt(D)
    b = np.zeros(K)
    LL = []  # a storage for costs
    lr = 0.0001  # learning rate
    reg = 0.01  # L2-regularization term
    t0 = datetime.now()
    print('utilizing full GD...')
    for i in range(200):
        p_y = forward(Xtrain, W, b)

        W += lr * (grad_W(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (grad_b(Ytrain_ind, p_y).sum(axis=0) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:
            error = error_rate(p_y_test, Ytest)
            print('i: %d, cost: %.6f, error: %.6f' % (i, ll, error))
    dt1 = datetime.now() - t0
    p_y_test = forward(Xtest, W, b)
    plt.plot(LL)
    plt.title('Cost for full GD')
    plt.savefig('Cost_full_GD.png')  # save before show(), which clears the figure
    plt.show()
    print('Final error rate:', error_rate(p_y_test, Ytest))
    print('Elapsed time for full GD:', dt1)

    # 2. Stochastic Gradient Descent
    W = np.random.randn(D, K) / np.sqrt(D)
    b = np.zeros(K)
    LLstochastic = []  # a storage for costs
    lr = 0.0001  # learning rate
    reg = 0.01  # L2-regularization term
    t0 = datetime.now()
    print('utilizing stochastic GD...')
    for i in range(25):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        # go through every sample in the training set, one at a time
        for n in range(N):
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, K)
            p_y = forward(x, W, b)

            W += lr * (grad_W(y, p_y, x) - reg * W)
            b += lr * (grad_b(y, p_y).sum(axis=0) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LLstochastic.append(ll)
            if n % (N // 2) == 0:
                error = error_rate(p_y_test, Ytest)
                print('i: %d, cost: %.6f, error: %.6f' % (i, ll, error))
    dt2 = datetime.now() - t0
    p_y_test = forward(Xtest, W, b)
    plt.plot(LLstochastic)
    plt.title('Cost for stochastic GD')
    plt.savefig('Cost_stochastic_GD.png')
    plt.show()
    print('Final error rate:', error_rate(p_y_test, Ytest))
    print('Elapsed time for stochastic GD:', dt2)

    # 3. Batch Gradient Descent:
    W = np.random.randn(D, K) / np.sqrt(D)
    b = np.zeros(K)
    LLbatch = []
    lr = 0.0001  # learning rate
    reg = 0.01  # L2-regularization term
    batch_size = 500
    n_batches = N // batch_size
    t0 = datetime.now()
    print('utilizing batch GD...')
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_size:batch_size * (j + 1), :]
            y = tmpY[j * batch_size:batch_size * (j + 1), :]
            p_y = forward(x, W, b)

            W += lr * (grad_W(y, p_y, x) - reg * W)
            b += lr * (grad_b(y, p_y).sum(axis=0) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LLbatch.append(ll)
            if j % (n_batches // 2) == 0:
                error = error_rate(p_y_test, Ytest)
                print('i: %d, cost: %.6f, error: %.6f' % (i, ll, error))
    dt3 = datetime.now() - t0
    p_y_test = forward(Xtest, W, b)
    plt.plot(LLbatch)
    plt.title('Cost for batch GD')
    plt.savefig('Cost_batch_GD.png')
    plt.show()
    print('Final error rate:', error_rate(p_y_test, Ytest))
    print('Elapsed time for batch GD', dt3)

    # plot all costs together:
    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label='full')
    x2 = np.linspace(0, 1, len(LLstochastic))
    plt.plot(x2, LLstochastic, label='stochastic')
    x3 = np.linspace(0, 1, len(LLbatch))
    plt.plot(x3, LLbatch, label='batch')
    plt.legend()
    plt.savefig('Costs_together.png')
    plt.show()
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    print('logistic regression')

    # randomly assign weights
    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)
    M = 10
    scale = 28

    # full grad descent
    W, b = initwb(D, M, scale)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(200):
        P_Y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, P_Y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, P_Y) - reg * b)

        P_Y_test = forward(Xtest, W, b)
        ll = cost(P_Y_test, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:
            err = error_rate(P_Y_test, Ytest)
            print("cost at iter: %d: %.6f" % (i, ll))
            print("error rate: ", err, "\n")
    P_Y = forward(Xtest, W, b)
    print("final error: ", error_rate(P_Y, Ytest))
    print("elapsed time for full GD: ", datetime.now() - t0)

    # 2. Stochastic
    W, b = initwb(D, M, scale)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(1):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            P_Y = forward(x, W, b)

            W += lr * (gradW(y, P_Y, x) - reg * W)
            b += lr * (gradb(y, P_Y) - reg * b)

            P_Y_test = forward(Xtest, W, b)
            ll = cost(P_Y_test, Ytest_ind)
            LL_stochastic.append(ll)
            if n % (N // 2) == 0:
                err = error_rate(P_Y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("error rate: ", err)
    P_Y = forward(Xtest, W, b)
    print("error rate: ", error_rate(P_Y, Ytest))
    print("elapsed time for SGD: ", datetime.now() - t0)

    # batch
    W, b = initwb(D, M, scale)
    LL_batch = []
    lr = 0.001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            P_Y = forward(x, W, b)

            W += lr * (gradW(y, P_Y, x) - reg * W)
            b += lr * (gradb(y, P_Y) - reg * b)

            P_Y_test = forward(Xtest, W, b)
            ll = cost(P_Y_test, Ytest_ind)
            LL_batch.append(ll)
            if j % (n_batches // 2) == 0:
                err = error_rate(P_Y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("error rate: ", err)
    P_Y = forward(Xtest, W, b)
    print("error rate: ", error_rate(P_Y, Ytest))
    print("elapsed time for batch GD: ", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():
    X, Y, _, _ = get_transformed_data()

    # first 300 factors
    X = X[:, :300]

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print("Performing logistic regression...")
    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full gradient descent
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(200):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        err = error_rate(p_y_test, Ytest)
        if i % 10 == 0:
            print("FULL Cost at iteration %d: %.6f" % (i, ll))
            print("FULL Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("FULL Final error rate:", error_rate(p_y, Ytest))
    print("FULL GD time:", (datetime.now() - t0))

    # 2. stochastic gradient descent
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(1):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)
            err = error_rate(p_y_test, Ytest)
            if n % (N // 2) == 0:
                print("STOCHASTIC Cost at iteration %d: %.6f" % (i, ll))
                print("STOCHASTIC Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("STOCHASTIC Final error rate:", error_rate(p_y, Ytest))
    print("STOCHASTIC GD time:", (datetime.now() - t0))

    # 3. batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:((j + 1) * batch_sz), :]
            y = tmpY[j * batch_sz:((j + 1) * batch_sz), :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)
            if j % (n_batches // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("BATCH Cost at iteration %d: %.6f" % (i, ll))
                print("BATCH Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("BATCH Final error rate:", error_rate(p_y, Ytest))
    print("BATCH GD time:", (datetime.now() - t0))

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label='full')
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label='stochastic')
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label='batch')
    plt.legend()
    plt.show()
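# ----------------------------------------------------------------------
# None of the scripts above include an entry-point guard; to run any one
# of them as a standalone module, something like this is assumed at the
# bottom of the file:

if __name__ == '__main__':
    main()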