def benchmark_pca():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    print("Performing logistic regression...")

    N, D = Xtrain.shape
    Ytrain_ind = np.zeros((N, 10))
    for i in range(N):
        Ytrain_ind[i, Ytrain[i]] = 1

    Ntest = len(Ytest)
    Ytest_ind = np.zeros((Ntest, 10))
    for i in range(Ntest):
        Ytest_ind[i, Ytest[i]] = 1

    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL = []
    LLtest = []
    CRtest = []

    # D = 300 -> error = 0.07
    lr = 0.0001
    reg = 0.01
    for i in range(200):
        p_y = forward(Xtrain, W, b)
        ll = cost(p_y, Ytrain_ind)
        LL.append(ll)

        p_y_test = forward(Xtest, W, b)
        lltest = cost(p_y_test, Ytest_ind)
        LLtest.append(lltest)

        err = error_rate(p_y_test, Ytest)
        CRtest.append(err)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)
        if i % 10 == 0:
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)

    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))

    iters = range(len(LL))
    plt.plot(iters, LL, label='train loss')
    plt.plot(iters, LLtest, label='test loss')
    plt.title('Loss')
    plt.legend()
    plt.show()

    plt.plot(CRtest)
    plt.title('Error')
    plt.show()
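# ---------------------------------------------------------------------------
# Note: the snippets in this section call helpers (get_transformed_data,
# get_normalized_data, y2indicator, forward, cost, gradW, gradb, error_rate)
# that are defined elsewhere in the repo. The sketch below is an assumed,
# minimal version of those helpers plus the shared imports, so the snippets
# can be read on their own; it is not the repo's exact implementation. It
# follows the sign convention of the `W += lr * ...` updates (gradW returns
# X^T (T - P)); the command-line variant further down uses `W -= ...`, which
# implies the opposite sign.
# ---------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.utils import shuffle


def y2indicator(y, K=10):
    # one-hot encode an (N,) vector of class labels into an (N, K) matrix
    N = len(y)
    ind = np.zeros((N, K))
    ind[np.arange(N), y.astype(np.int32)] = 1
    return ind


def forward(X, W, b):
    # softmax of the linear scores; rows are predicted class probabilities
    a = X.dot(W) + b
    expa = np.exp(a - a.max(axis=1, keepdims=True))  # shift for numerical stability
    return expa / expa.sum(axis=1, keepdims=True)


def cost(p_y, t):
    # total cross-entropy between predictions p_y and one-hot targets t
    return -(t * np.log(p_y)).sum()


def gradW(t, p_y, X):
    # gradient of the log-likelihood w.r.t. W, so that W += lr * gradW(...)
    # decreases the cross-entropy
    return X.T.dot(t - p_y)


def gradb(t, p_y):
    return (t - p_y).sum(axis=0)


def error_rate(p_y, targets):
    predictions = np.argmax(p_y, axis=1)
    return np.mean(predictions != targets)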
def benchmark_full():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    print("Performing logistic regression...")
    # lr = LogisticRegression(solver='lbfgs')

    # convert Ytrain and Ytest to (N x K) matrices of indicator variables
    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL = []
    LLtest = []
    CRtest = []

    # reg = 1
    # learning rate 0.0001 is too high, 0.00005 is also too high
    # 0.00003 / 2000 iterations => 0.363 error, -7630 cost
    # 0.00004 / 1000 iterations => 0.295 error, -7902 cost
    # 0.00004 / 2000 iterations => 0.321 error, -7528 cost
    # reg = 0.1, still around 0.31 error
    # reg = 0.01, still around 0.31 error
    lr = 0.00004
    reg = 0.01
    for i in range(500):
        p_y = forward(Xtrain, W, b)
        ll = cost(p_y, Ytrain_ind)
        LL.append(ll)

        p_y_test = forward(Xtest, W, b)
        lltest = cost(p_y_test, Ytest_ind)
        LLtest.append(lltest)

        err = error_rate(p_y_test, Ytest)
        CRtest.append(err)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)
        if i % 10 == 0:
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)

    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))

    iters = range(len(LL))
    plt.plot(iters, LL, iters, LLtest)
    plt.show()

    plt.plot(CRtest)
    plt.show()
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    print("Performing logistic regression...")

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(50):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)

    # 2. stochastic
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(50):  # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):  # shortcut so it won't take so long...
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL_stochastic.append(ll)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)

    # 3. batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL_batch.append(ll)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print("Performing logistic regression...")
    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # # 1. full
    # W = np.random.randn(D, 10) / 28
    # b = np.zeros(10)
    # LL = []
    # lr = 0.0001
    # reg = 0.01
    # t0 = datetime.now()
    # for i in range(200):
    #     p_y = forward(Xtrain, W, b)
    #
    #     W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
    #     b += lr * (gradb(Ytrain_ind, p_y) - reg * b)
    #
    #     p_y_test = forward(Xtest, W, b)
    #     ll = cost(p_y_test, Ytest_ind)
    #     LL.append(ll)
    #     if i % 10 == 0:
    #         err = error_rate(p_y_test, Ytest)
    #         print("Cost at iteration %d: %.6f" % (i, ll))
    #         print("Error rate:", err)
    # p_y = forward(Xtest, W, b)
    # print("Final error rate:", error_rate(p_y, Ytest))
    # print("Elapsed time for full GD:", datetime.now() - t0)

    # # 2. stochastic
    # W = np.random.randn(D, 10) / 28
    # b = np.zeros(10)
    # LL_stochastic = []
    # lr = 0.0001
    # reg = 0.01
    # t0 = datetime.now()
    # for i in range(1):  # takes very long since we're computing cost for 41k samples
    #     tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
    #     for n in range(min(N, 500)):  # shortcut so it won't take so long...
    #         x = tmpX[n, :].reshape(1, D)
    #         y = tmpY[n, :].reshape(1, 10)
    #         p_y = forward(x, W, b)
    #
    #         W += lr * (gradW(y, p_y, x) - reg * W)
    #         b += lr * (gradb(y, p_y) - reg * b)
    #
    #         p_y_test = forward(Xtest, W, b)
    #         ll = cost(p_y_test, Ytest_ind)
    #         LL_stochastic.append(ll)
    #
    #         if n % (N // 2) == 0:
    #             err = error_rate(p_y_test, Ytest)
    #             print("Cost at iteration %d: %.6f" % (i, ll))
    #             print("Error rate:", err)
    # p_y = forward(Xtest, W, b)
    # print("Final error rate:", error_rate(p_y, Ytest))
    # print("Elapsed time for SGD:", datetime.now() - t0)

    # 3. batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz  # integer division so range() below gets an int

    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j + 1) * batch_sz, :]
            y = tmpY[j * batch_sz:(j + 1) * batch_sz, :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)

            if j % (n_batches // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)

    # only the batch curve exists while sections 1 and 2 are commented out
    # x1 = np.linspace(0, 1, len(LL))
    # plt.plot(x1, LL, label="full")
    # x2 = np.linspace(0, 1, len(LL_stochastic))
    # plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    print("Performing logistic regression...")

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    W = np.random.randn(D, 10) / np.sqrt(D)
    W0 = W.copy()  # save for later so every method starts from the same weights
    b = np.zeros(10)
    test_losses_full = []
    lr = 0.9
    reg = 0.
    t0 = datetime.now()
    last_dt = 0
    intervals = []
    for i in range(50):
        p_y = forward(Xtrain, W, b)
        gW = gradW(Ytrain_ind, p_y, Xtrain) / N
        gb = gradb(Ytrain_ind, p_y) / N
        W += lr * (gW - reg * W)
        b += lr * (gb - reg * b)

        p_y_test = forward(Xtest, W, b)
        test_loss = cost(p_y_test, Ytest_ind)
        dt = (datetime.now() - t0).total_seconds()

        # save these
        dt2 = dt - last_dt
        last_dt = dt
        intervals.append(dt2)

        test_losses_full.append([dt, test_loss])
        if (i + 1) % 10 == 0:
            print("Cost at iteration %d: %.6f" % (i + 1, test_loss))
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)

    # save the max time so we don't surpass it in subsequent iterations
    max_dt = dt
    avg_interval_dt = np.mean(intervals)

    # 2. stochastic
    W = W0.copy()
    b = np.zeros(10)
    test_losses_sgd = []
    lr = 0.001
    reg = 0.
    t0 = datetime.now()
    last_dt_calculated_loss = 0
    done = False
    for i in range(50):  # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(N):
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)
            gW = gradW(y, p_y, x)
            gb = gradb(y, p_y)
            W += lr * (gW - reg * W)
            b += lr * (gb - reg * b)

            dt = (datetime.now() - t0).total_seconds()
            dt2 = dt - last_dt_calculated_loss
            if dt2 > avg_interval_dt:
                last_dt_calculated_loss = dt
                p_y_test = forward(Xtest, W, b)
                test_loss = cost(p_y_test, Ytest_ind)
                test_losses_sgd.append([dt, test_loss])

            # time to quit
            if dt > max_dt:
                done = True
                break
        if done:
            break

        if (i + 1) % 1 == 0:
            print("Cost at iteration %d: %.6f" % (i + 1, test_loss))
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)

    # 3. mini-batch
    W = W0.copy()
    b = np.zeros(10)
    test_losses_batch = []
    batch_sz = 500
    lr = 0.08
    reg = 0.
    n_batches = int(np.ceil(N / batch_sz))
    t0 = datetime.now()
    last_dt_calculated_loss = 0
    done = False
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j + 1) * batch_sz, :]
            y = tmpY[j * batch_sz:(j + 1) * batch_sz, :]
            p_y = forward(x, W, b)
            current_batch_sz = len(x)
            gW = gradW(y, p_y, x) / current_batch_sz
            gb = gradb(y, p_y) / current_batch_sz
            W += lr * (gW - reg * W)
            b += lr * (gb - reg * b)

            dt = (datetime.now() - t0).total_seconds()
            dt2 = dt - last_dt_calculated_loss
            if dt2 > avg_interval_dt:
                last_dt_calculated_loss = dt
                p_y_test = forward(Xtest, W, b)
                test_loss = cost(p_y_test, Ytest_ind)
                test_losses_batch.append([dt, test_loss])

            # time to quit
            if dt > max_dt:
                done = True
                break
        if done:
            break

        if (i + 1) % 10 == 0:
            print("Cost at iteration %d: %.6f" % (i + 1, test_loss))
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for mini-batch GD:", datetime.now() - t0)

    # convert to numpy arrays
    test_losses_full = np.array(test_losses_full)
    test_losses_sgd = np.array(test_losses_sgd)
    test_losses_batch = np.array(test_losses_batch)

    plt.plot(test_losses_full[:, 0], test_losses_full[:, 1], label="full")
    plt.plot(test_losses_sgd[:, 0], test_losses_sgd[:, 1], label="sgd")
    plt.plot(test_losses_batch[:, 0], test_losses_batch[:, 1], label="mini-batch")
    plt.legend()
    plt.show()
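# ---------------------------------------------------------------------------
# The variant above differs from the others in two ways: the gradients are
# averaged (divided by N or by the current batch size) instead of summed, and
# the test losses are logged against wall-clock time so full, stochastic, and
# mini-batch GD can be compared on the same time budget. Averaging is why the
# much larger learning rates (0.9 / 0.08) stay stable. A small illustration of
# the rescaling, using hypothetical values (not from the source):
# ---------------------------------------------------------------------------
import numpy as np

np.random.seed(0)
X = np.random.randn(500, 300)                    # hypothetical batch: 500 examples, 300 features
T = np.eye(10)[np.random.randint(10, size=500)]  # one-hot targets, 10 classes
P = np.full((500, 10), 0.1)                      # uniform predicted probabilities

g_sum = X.T.dot(T - P)        # summed gradient, as used with lr ~ 1e-4
g_avg = g_sum / len(X)        # averaged gradient, as used with lr ~ 1e-1

# the two updates coincide once the learning rate is rescaled by the batch size
assert np.allclose(1e-4 * g_sum, (1e-4 * len(X)) * g_avg)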
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    print("Performing logistic regression...")

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # number of passes through the training data; the original left `epoch`
    # undefined, so 50 is assumed here to match the other variants
    epoch = 50

    # 1. Full GD
    W = np.random.randn(D, 10) / 28  # square root of the dimensionality: sqrt(28 * 28) = sqrt(784) = 28
    b = np.zeros(10)
    loss_batch = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(epoch):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        temp_loss = cost(p_y_test, Ytest_ind)
        loss_batch.append(temp_loss)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, temp_loss))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)
    print("=======================================================")

    # 2. Stochastic GD
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    loss_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(epoch):  # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        # for n in range(min(N, 500)):  # shortcut so it won't take so long...
        for n in range(N):
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        loss = cost(p_y_test, Ytest_ind)
        loss_stochastic.append(loss)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, loss))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)
    print("=======================================================")

    # 3. Mini-batch GD
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    loss_mini_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(epoch):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        temp_loss = cost(p_y_test, Ytest_ind)
        loss_mini_batch.append(temp_loss)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, temp_loss))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for mini-batch GD:", datetime.now() - t0)

    # Plot graph
    x1 = np.linspace(0, 1, len(loss_batch))
    plt.plot(x1, loss_batch, label="full (batch) GD")
    x2 = np.linspace(0, 1, len(loss_stochastic))
    plt.plot(x2, loss_stochastic, label="stochastic GD")
    x3 = np.linspace(0, 1, len(loss_mini_batch))
    plt.plot(x3, loss_mini_batch, label="mini-batch GD")
    plt.legend()
    plt.show()
def main():
    # get PCA-transformed data
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]  # the first 300 features

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print("Performing logistic regression...")
    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    # we're setting our initial weights to be pretty small,
    # proportional to the square root of the dimensionality (sqrt(784) = 28)
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(200):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        # do a forward pass on the test set so that we can calculate
        # the cost on the test set and then plot that
        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:  # calculate the error rate every 10 iterations
            err = error_rate(p_y_test, Ytest)
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)

    # 2. stochastic
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(1):  # one pass only -- computing the cost on every step takes very long
        # on each pass, we typically want to shuffle the training data and the labels together
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        # we're only going to go through 500 samples because it's slow
        for n in range(min(N, 500)):  # shortcut so it won't take so long...
            # reshape x into a 2-dimensional matrix
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)

            # forward pass to get the output
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)

            if n % (N // 2) == 0:  # calculate the error rate once for every N/2 samples
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)

    # 3. batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            # get the current batch's inputs and targets
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            # forward pass to get the output predictions
            p_y = forward(x, W, b)

            # gradient descent
            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)

            if j % (n_batches // 2) == 0:  # print the error rate twice per pass
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():
    X_train, X_test, t_train, t_test = get_pca_normalized_data()
    print("Performing multi-class logistic regression...\n")

    N, D = X_train.shape
    K = 10
    T_train = T_indicator(t_train)
    T_test = T_indicator(t_test)

    # hyperparameters come from the command line
    lr = float(sys.argv[1])
    reg = float(sys.argv[2])
    batch_size = int(sys.argv[3])

    ######## 1. FULL GRADIENT DESCENT ########
    print('Full Gradient Descent')
    W = np.random.randn(D, K) / np.sqrt(D)
    b = np.zeros(K)
    J_test_full = []
    t0 = datetime.now()
    for epoch in range(50):
        Y_train = forward(X_train, W, b)
        W -= lr * (gradW(T_train, Y_train, X_train) - reg * W)
        b -= lr * (gradb(T_train, Y_train) - reg * b)

        Y_test = forward(X_test, W, b)
        j_test = J(T_test, Y_test)
        J_test_full.append(j_test)
        if epoch % 1 == 0:
            acc = accuracy(predict(Y_test), t_test)
            if epoch % 10 == 0:
                print("Epoch {}:\tcost: {}\taccuracy: {}".format(epoch, round(j_test, 4), acc))
    Y_test = forward(X_test, W, b)
    print("Final accuracy:", accuracy(predict(Y_test), t_test))
    print("Elapsed time for full GD: {}\n".format(datetime.now() - t0))

    ######## 2. STOCHASTIC GRADIENT DESCENT ########
    print('Stochastic Gradient Descent')
    W = np.random.randn(D, K) / np.sqrt(D)
    b = np.zeros(K)
    J_test_stochastic = []
    t0 = datetime.now()
    for epoch in range(50):  # takes very long since we're computing cost for 41k samples
        tmpX, tmpT = shuffle(X_train, T_train)
        for n in range(min(N, 500)):  # shortcut so it won't take so long...
            x = tmpX[n, :].reshape(1, D)
            t = tmpT[n, :].reshape(1, 10)
            Y_train = forward(x, W, b)
            W -= lr * (gradW(t, Y_train, x) - reg * W)
            b -= lr * (gradb(t, Y_train) - reg * b)

        Y_test = forward(X_test, W, b)
        j_test = J(T_test, Y_test)
        J_test_stochastic.append(j_test)
        if epoch % 1 == 0:
            acc = accuracy(predict(Y_test), t_test)
            if epoch % 10 == 0:
                print("Epoch {}:\tcost: {}\taccuracy: {}".format(epoch, round(j_test, 4), acc))
    Y_test_final = forward(X_test, W, b)
    print("Final accuracy:", accuracy(predict(Y_test_final), t_test))
    print("Elapsed time for SGD: {}\n".format(datetime.now() - t0))

    ######## 3. BATCH GRADIENT DESCENT ########
    print('Batch Gradient Descent')
    W = np.random.randn(D, K) / np.sqrt(D)
    b = np.zeros(K)
    J_test_batch = []
    nb_batches = N // batch_size
    t0 = datetime.now()
    for epoch in range(50):
        tmpX, tmpT = shuffle(X_train, T_train)
        for batch_index in range(nb_batches):
            x = tmpX[batch_index * batch_size:(batch_index * batch_size + batch_size), :]
            t = tmpT[batch_index * batch_size:(batch_index * batch_size + batch_size), :]
            Y_train = forward(x, W, b)
            W -= lr * (gradW(t, Y_train, x) - reg * W)
            b -= lr * (gradb(t, Y_train) - reg * b)

        Y_test = forward(X_test, W, b)
        j_test = J(T_test, Y_test)
        J_test_batch.append(j_test)
        if epoch % 1 == 0:
            acc = accuracy(predict(Y_test), t_test)
            if epoch % 10 == 0:
                print("Epoch {}\tcost: {}\taccuracy: {}".format(epoch, round(j_test, 4), acc))
    Y_test_final = forward(X_test, W, b)
    print("Final accuracy:", accuracy(predict(Y_test_final), t_test))
    print("Elapsed time for batch GD:", datetime.now() - t0)

    ######## PLOTS ########
    x1 = np.linspace(0, 1, len(J_test_full))
    plt.plot(x1, J_test_full, label="full")
    x2 = np.linspace(0, 1, len(J_test_stochastic))
    plt.plot(x2, J_test_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(J_test_batch))
    plt.plot(x3, J_test_batch, label="batch")
    plt.legend()
    # plt.savefig('full_vs_stoch_vs_batch_lr={}_reg={}_batch_size={}.png'.format(lr, reg, batch_size))
    plt.show()
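# ---------------------------------------------------------------------------
# Unlike the other variants, the version above reads its hyperparameters from
# the command line instead of hard-coding them (it also needs `import sys` at
# the top of its file). A hypothetical invocation -- the script name and the
# values are illustrative, not from the source:
#
#   python logistic_gd_comparison.py 0.0001 0.01 500
#
# which inside main() maps to:
#   lr         = float(sys.argv[1])   # 0.0001
#   reg        = float(sys.argv[2])   # 0.01
#   batch_size = int(sys.argv[3])     # 500
# ---------------------------------------------------------------------------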
def main():
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print("Performing logistic regression...")
    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(200):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:
            err = error_rate(p_y_test, Ytest)
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)

    # 2. stochastic
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(1):  # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):  # shortcut so it won't take so long...
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)

            if n % (N // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)

    # 3. batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)

            if j % (n_batches // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print("Performing logistic regression...")
    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. Full GD
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL = []  # the loss at every iteration
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(50):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:
            err = error_rate(p_y_test, Ytest)
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("The loss sequence is:", LL)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)

    # 2. Stochastic
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(1):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)

            if n % (N // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)

    # x1 = np.linspace(0, 1, len(LL))
    # plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    plt.legend()
    plt.show()
    print(LL)

    # 3. batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)

            if j % (n_batches // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    print("Performing logistic regression...")

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(50):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)

    # 2. stochastic
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(50):  # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):  # shortcut so it won't take so long...
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL_stochastic.append(ll)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)

    # 3. batch
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL_batch.append(ll)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    print('logistic regression')

    # randomly assign weights
    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)
    M = 10
    scale = 28

    # 1. full gradient descent
    W, b = initwb(D, M, scale)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(200):
        P_Y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, P_Y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, P_Y) - reg * b)

        P_Y_test = forward(Xtest, W, b)
        ll = cost(P_Y_test, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:
            err = error_rate(P_Y_test, Ytest)
            print("cost at iter %d: %.6f" % (i, ll))
            print("error rate: ", err, "\n")
    P_Y = forward(Xtest, W, b)
    print("final error: ", error_rate(P_Y, Ytest))
    print("elapsed time for full GD: ", datetime.now() - t0)

    # 2. stochastic
    W, b = initwb(D, M, scale)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(1):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            P_Y = forward(x, W, b)

            W += lr * (gradW(y, P_Y, x) - reg * W)
            b += lr * (gradb(y, P_Y) - reg * b)

            P_Y_test = forward(Xtest, W, b)
            ll = cost(P_Y_test, Ytest_ind)
            LL_stochastic.append(ll)

            if n % (N // 2) == 0:
                err = error_rate(P_Y_test, Ytest)
                print("cost at iter %d: %.6f" % (i, ll))
                print("error rate: ", err)
    P_Y = forward(Xtest, W, b)
    print("final error rate: ", error_rate(P_Y, Ytest))
    print("elapsed time for SGD: ", datetime.now() - t0)

    # 3. batch
    W, b = initwb(D, M, scale)
    LL_batch = []
    lr = 0.001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            P_Y = forward(x, W, b)

            W += lr * (gradW(y, P_Y, x) - reg * W)
            b += lr * (gradb(y, P_Y) - reg * b)

            P_Y_test = forward(Xtest, W, b)
            ll = cost(P_Y_test, Ytest_ind)
            LL_batch.append(ll)

            if j % (n_batches // 2) == 0:
                err = error_rate(P_Y_test, Ytest)
                print("cost at iter %d: %.6f" % (i, ll))
                print("error rate: ", err)
    P_Y = forward(Xtest, W, b)
    print("final error rate: ", error_rate(P_Y, Ytest))
    print("elapsed time for batch GD: ", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]  # first 300 features

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print("Performing logistic regression...")
    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full gradient descent
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(200):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        err = error_rate(p_y_test, Ytest)
        if i % 10 == 0:
            print("FULL cost at iteration %d: %.6f" % (i, ll))
            print("FULL error rate:", err)
    p_y = forward(Xtest, W, b)
    print("FULL final error rate:", error_rate(p_y, Ytest))
    print("FULL GD time:", (datetime.now() - t0))

    # 2. stochastic gradient descent
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(1):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)
            err = error_rate(p_y_test, Ytest)
            if n % (N // 2) == 0:
                print("STOCHASTIC cost at iteration %d: %.6f" % (i, ll))
                print("STOCHASTIC error rate:", err)
    p_y = forward(Xtest, W, b)
    print("STOCHASTIC final error rate:", error_rate(p_y, Ytest))
    print("STOCHASTIC GD time:", (datetime.now() - t0))

    # 3. batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:((j + 1) * batch_sz), :]
            y = tmpY[j * batch_sz:((j + 1) * batch_sz), :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)
            if j % (n_batches // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("BATCH cost at iteration %d: %.6f" % (i, ll))
                print("BATCH error rate:", err)
    p_y = forward(Xtest, W, b)
    print("BATCH final error rate:", error_rate(p_y, Ytest))
    print("BATCH GD time:", (datetime.now() - t0))

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label='full')
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label='stochastic')
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label='batch')
    plt.legend()
    plt.show()
# 1. full
W = np.random.randn(D, 10) / 28
b = np.zeros(10)
LL = []
lr = 0.0001
reg = 0.01
t0 = datetime.now()
for i in range(200):
    p_y = forward(Xtrain, W, b)

    W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
    b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

    p_y_test = forward(Xtest, W, b)
    ll = cost(p_y_test, Ytest_ind)
    LL.append(ll)
    if i % 10 == 0:
        err = error_rate(p_y_test, Ytest)
        print("Cost at iteration %d: %.6f" % (i, ll))
        print("Error rate:", err)
p_y = forward(Xtest, W, b)
print("Final error rate:", error_rate(p_y, Ytest))
print("Elapsed time for full GD:", datetime.now() - t0)