def main():
    X,Y = get_normalized_data()

    max_iter = 20
    print_period = 10

    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M1 = 300
    M2 = 100
    K = 10
    W1_init = np.random.randn(D, M1) / 28
    b1_init = np.zeros(M1)
    W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b2_init = np.zeros(M2)
    W3_init = np.random.randn(M2, K) / np.sqrt(M2)
    b3_init = np.zeros(K)
Example #2
def main():
    X_train, X_test, t_train, t_test = get_normalized_data()
    ann = ANN([500,300])
    session = tf.InteractiveSession()
    ann.set_session(session)
    ann.fit(X_train, X_test, t_train, t_test, show_fig=True)
    writer = tf.summary.FileWriter('./tensorboard_logs/demo1')
    writer.add_graph(session.graph)
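
A usage note (not part of the original snippet): once the log directory has been written, the graph can be inspected by launching TensorBoard against it and opening the printed URL in a browser:

tensorboard --logdir=./tensorboard_logs/demo1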
Example #3
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    # hidden layer sizes and dropout rates (keep rate)
    ann = ANN([500, 300], [0.8, 0.5, 0.5])
    ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True)
Example #4
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    ann = ANN([500, 300])
    session = tf.InteractiveSession()
    ann.set_session(session)
    ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True)
    print("Train accuracy:", ann.score(Xtrain, Ytrain))
    print("Test accuracy:", ann.score(Xtest, Ytest))
def main():
    X, Y = get_normalized_data()  # normalize MNIST dataset
    t0 = datetime.now()
    model = ANN([2000, 1000], [0.8, 0.5, 0.5])
    model.fit(X, Y, display_cost=True)
    dt = datetime.now() - t0

    print('Elapsed time:', dt)
def main():
    X, Y = get_normalized_data()

    t0 = datetime.now()
    model = ANN([2000, 1000, 500])
    model.fit(X, Y, display_cost=True, save_params=True)
    dt = datetime.now() - t0

    print('Elapsed time:', dt)
Example #7
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    ann = ANN([500, 300])
    ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True)

    print("Train accuracy:", ann.score(Xtrain, Ytrain))
    print("Test accuracy:", ann.score(Xtest, Ytest))
def main():
    X, Y = get_normalized_data()

    t0 = datetime.now()
    model = ANN([500, 300], [0.8, 0.8, 0.8])
    model.fit(X, Y, display_cost=True, save_params=False)
    dt = datetime.now() - t0

    print('Elapsed time:', dt)
Example #10
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    model = TFLogistic("./tf.model")
    model.fit(Xtrain, Ytrain, Xtest, Ytest)
    print("final train accuracy:", model.score(Xtrain, Ytrain))
    print("final test accuracy:", model.score(Xtest, Ytest))
    model.save("my_trained.json")
    model = TFLogistic.load("my_trained.json")
    print("final train accuracy (after reload):", model.score(Xtrain, Ytrain))
    print("final test accuracy (after reload):", model.score(Xtest, Ytest))
def main():
    X, Y = get_normalized_data()

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]

    model = TFLogistic("tf.model")
    model.fit(Xtrain, Ytrain, Xtest, Ytest)
def main():
    X, Y = get_normalized_data()  # normalized MNIST dataset
    Xtest, Ytest = X[-1000:, :], Y[-1000:]
    Xtrain, Ytrain = X[:-1000, :], Y[:-1000]

    model = ANN([1000, 500, 500])
    model.fit(Xtrain, Ytrain, display_cost=True)
    # joblib.dump(model, 'mymodel.pkl')
    # model = joblib.load('mymodel.pkl')
    print('Test set acc:',
          model.score(Xtest.astype(np.float32), Ytest.astype(np.float32)))
Example #13
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)
    
    lr = 0.00004
    reg = 0.01
    N, D = Xtrain.shape
    K = 10
    
    max_iter = 1000
    batch_sz = 500
    n_batches = N // batch_sz
    print_period = 10
    
    W_init = np.random.randn(D, K)
    b_init = np.random.randn(K)
    
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='T')
    W = tf.Variable(W_init.astype(np.float32))
    b = tf.Variable(b_init.astype(np.float32))
    
    Yish = tf.matmul(X, W) + b
    
    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))
    
    #train_op = tf.train.GradientDescentOptimizer(lr).minimize(cost)
    train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)
    
    predict_op = tf.argmax(Yish, 1)
    
    LL = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        
        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
                Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
                
                session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                if j % print_period == 0:
                    test_cost = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
                    prediction = session.run(predict_op, feed_dict={X: Xtest})
                    err = error_rate(prediction, Ytest)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
                    LL.append(test_cost)
                    
    plt.plot(LL)
    plt.show()
def benchmark_full():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    print("Performing logistic regression...")
    # lr = LogisticRegression(solver='lbfgs')

    # convert Ytrain and Ytest to (N x K) matrices of indicator variables
    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL = []
    LLtest = []
    CRtest = []

    # reg = 1
    # learning rate 0.0001 is too high, 0.00005 is also too high
    # 0.00003 / 2000 iterations => 0.363 error, -7630 cost
    # 0.00004 / 1000 iterations => 0.295 error, -7902 cost
    # 0.00004 / 2000 iterations => 0.321 error, -7528 cost

    # reg = 0.1, still around 0.31 error
    # reg = 0.01, still around 0.31 error
    lr = 0.00004
    reg = 0.01
    for i in range(500):
        p_y = forward(Xtrain, W, b)
        # print "p_y:", p_y
        ll = cost(p_y, Ytrain_ind)
        LL.append(ll)

        p_y_test = forward(Xtest, W, b)
        lltest = cost(p_y_test, Ytest_ind)
        LLtest.append(lltest)

        err = error_rate(p_y_test, Ytest)
        CRtest.append(err)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)
        if i % 10 == 0:
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)

    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    iters = range(len(LL))
    plt.plot(iters, LL, iters, LLtest)
    plt.show()
    plt.plot(CRtest)
    plt.show()
def main():

    X, Y = get_normalized_data()

    X, Y = shuffle(X, Y)
    Xtrain, Ytrain = X[:-1000], Y[:-1000]
    Xtest, Ytest = X[-1000:], Y[-1000:]

    ann = ANN([500, 300])
    ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True)

    print("Train accuracy:", ann.score(Xtrain, Ytrain))
    print("Test accuracy:", ann.score(Xtest, Ytest))
Example #16
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()
    # Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3)
    X, Y = shuffle(X, Y)
    Xtrain, Ytrain = X[:-1000], Y[:-1000]
    Xtest, Ytest = X[-1000:], Y[-1000:]

    ann = ANN([500, 300])
    ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True)

    print("Train accuracy:", ann.score(Xtrain, Ytrain))
    print("Test accuracy:", ann.score(Xtest, Ytest))
Example #17
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    ann = ANN([500, 300])

    session = tf.compat.v1.InteractiveSession()
    ann.set_session(session)

    ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True)

    print("Train accuracy:", ann.score(Xtrain, Ytrain))
    print("Test accuracy:", ann.score(Xtest, Ytest))
Example #18
def main():
    X, Y = get_normalized_data()

    t0 = datetime.now()

    model = ANN([500, 300])

    session = tf.InteractiveSession()
    model.set_session(session)

    model.fit(X, Y, display_cost=True, save_params=True)

    dt = datetime.now() - t0

    print('Elapsed time:', dt)
Example #19
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    #    Ytrain_ind = y2indicator(Ytrain)
    #    Ytest_ind = y2indicator(Ytest)

    # find the optimal hyperparameters
    M = [100, 200, 300]
    lrs = [0.00001, 0.000001, 0.0001]
    regs = [0.1, 0.01, 0.001]

    ann = ANN(Xtrain, Ytrain, Xtest, Ytest)

    lr, reg, M, LL = ann.grid_search(M, lrs, regs)
    print("Found optimal values: lr={}, reg={}, M={} at a cost of {}".format(
        lr, reg, M, LL))
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    model = TFLogistic("./tf.model")
    model.fit(Xtrain, Ytrain, Xtest, Ytest)

    # test out restoring the model via the predict function
    print("final train accuracy:", model.score(Xtrain, Ytrain))
    print("final test accuracy:", model.score(Xtest, Ytest))

    # save the model
    model.save("my_trained_model.json")

    # load and score again
    model = TFLogistic.load("my_trained_model.json")
    print("final train accuracy (after reload):", model.score(Xtrain, Ytrain))
    print("final test accuracy (after reload):", model.score(Xtest, Ytest))
def main():
    max_iter = 20 # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
Example #23
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()

    max_iter = 15
    print_period = 10

    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
def main():
    X, Y = get_normalized_data()

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    model = TFLogistic("tf.model")
    model.fit(Xtrain, Ytrain, Xtest, Ytest)

    # test out restoring the model via the predict function
    print "final train accuracy:", model.score(Xtrain, Ytrain)
    print "final test accuracy:", model.score(Xtest, Ytest)

    # save the model
    model.save("my_trained_model.json")

    # load and score again
    model = TFLogistic.load("my_trained_model.json")
    print "final train accuracy (after reload):", model.score(Xtrain, Ytrain)
    print "final test accuracy (after reload):", model.score(Xtest, Ytest)
Example #26
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20 # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
Example #27
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    ann = ANN([500, 300], [0.8, 0.5, 0.5], './tf.model')

    session = tf.InteractiveSession()
    ann.set_session(session)
    ann.fit(Xtrain, Ytrain, Xtest, Ytest)

    print("final train accuracy:", ann.score(Xtrain, Ytrain))
    print("final test accuracy:", ann.score(Xtest, Ytest))

    ann.save("my_saved_model.json")

    session.close()

    sess = tf.InteractiveSession()

    model = ANN.load(sess, "my_saved_model.json")
    model.set_session(sess)
    model.restore_model()
    print("final train accuracy (after reload):", model.score(Xtrain, Ytrain))
    print("final test accuracy (after reload):", model.score(Xtest, Ytest))
def main():
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov

    max_iter = 20
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain, Ytrain = X[:-1000], Y[:-1000]
    Xtest, Ytest = X[-1000:], Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300  # number of hidden neurons
    K = 10  # number of output classes

    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. batch SGD
    LL_batch = []
    CR_batch = []

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz, ]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz, ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    # 2. batch with momentum
    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz, ]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz, ]

            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            dW2 = mu * dW2 - lr * (derivative_w2(Z, Ybatch, pYbatch) +
                                   reg * W2)
            W2 += dW2
            db2 = mu * db2 - lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            b2 += db2
            dW1 = mu * dW1 - lr * (
                derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            W1 += dW1
            db1 = mu * db1 - lr * (derivative_b1(Z, Ybatch, pYbatch, W2) +
                                   reg * b1)
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    # 3. batch with Nesterov momentum
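    # the update used below is the lookahead-free form of Nesterov momentum:
    # starting from v(t) = mu*v(t-1) - lr*grad(f(W(t-1) + mu*v(t-1)))
    # and absorbing the lookahead into the parameters gives
    #   dW(t) = mu^2*dW(t-1) - (1 + mu)*lr*grad(f(W(t-1)))
    #   W(t) = W(t-1) + dW(t)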
    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nesterov = []
    CR_nesterov = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz, ]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz, ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            dW2 = mu * mu * dW2 - (1 + mu) * lr * (
                derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            W2 += dW2
            db2 = mu * mu * db2 - (1 + mu) * lr * (
                derivative_b2(Ybatch, pYbatch) + reg * b2)
            b2 += db2
            dW1 = mu * mu * dW1 - (1 + mu) * lr * (
                derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            W1 += dW1
            db1 = mu * mu * db1 - (1 + mu) * lr * (
                derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_nesterov.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_nesterov.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label='momentum')
    plt.plot(LL_nesterov, label='nesterov')
    plt.legend()
    plt.show()
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()

    max_iter = 15
    print_period = 10

    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    # add an extra layer just for fun
    M1 = 300
    M2 = 100
    K = 10
    W1_init = np.random.randn(D, M1) / 28
    b1_init = np.zeros(M1)
    W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b2_init = np.zeros(M2)
    W3_init = np.random.randn(M2, K) / np.sqrt(M2)
    b3_init = np.zeros(K)

    # initialize variables and expressions
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='Y')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))

    # define the model
    Z1 = tf.nn.relu(tf.matmul(X, W1) + b1)
    Z2 = tf.nn.relu(tf.matmul(Z1, W2) + b2)
    # the cost function does the softmaxing! SO NO SOFTMAXING HERE
    Yish = tf.matmul(Z2, W3) + b3

    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))

    train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)

    predict_op = tf.argmax(Yish, 1)

    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
                Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]

                session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                if j % print_period == 0:
                    test_cost = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
                    prediction = session.run(predict_op, feed_dict={X: Xtest})
                    err = error_rate(prediction, Ytest)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
                    costs.append(test_cost)

    plt.plot(costs)
    plt.show()
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    ann = ANN([500, 300], [0.8, 0.5, 0.5])
    ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True)
# Note: it is helpful to look at keras_example.py first


import numpy as np
import matplotlib.pyplot as plt
from util import get_normalized_data

import torch
from torch.autograd import Variable
from torch import optim



# get the data, same as the Theano + Tensorflow examples
X, Y = get_normalized_data()

# get shapes
_, D = X.shape
K = len(set(Y))

# split the data
Xtrain = X[:-1000,]
Ytrain = Y[:-1000]
Xtest  = X[-1000:,]
Ytest  = Y[-1000:]

# Note: no need to convert Y to indicator matrix


# the model will be a sequence of layers
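
The listing is truncated here. A minimal sketch of how the Sequential model might continue, assuming the same layer sizes as the Keras examples in this collection (the original layers are not shown):

# hypothetical continuation -- not part of the original snippet
model = torch.nn.Sequential(
    torch.nn.Linear(D, 500),
    torch.nn.ReLU(),
    torch.nn.Linear(500, 300),
    torch.nn.ReLU(),
    torch.nn.Linear(300, K),
)
loss_fn = torch.nn.CrossEntropyLoss()  # applies log-softmax internally, hence no softmax layer
optimizer = optim.Adam(model.parameters(), lr=1e-3)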
def main():
    max_iter = 20 # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. constant learning rate
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)


    # 2. RMSprop
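    # RMSprop scales each update by a running average of squared gradients:
    #   cache = decay_rate*cache + (1 - decay_rate)*g^2
    #   W -= lr0 * g / (sqrt(cache) + eps)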
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_rms = []
    CR_rms = []
    lr0 = 0.001 # if you set this too high you'll get NaN!
    cache_W2 = 0
    cache_b2 = 0
    cache_W1 = 0
    cache_b1 = 0
    decay_rate = 0.999
    eps = 1e-10
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_rms.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_rms.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    plt.plot(LL_batch, label='const')
    plt.plot(LL_rms, label='rms')
    plt.legend()
    plt.show()
Example #33
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20  # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. batch
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch with momentum
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu * dW2 - lr * (derivative_w2(Z, Ybatch, pYbatch) +
                                   reg * W2)
            W2 += dW2
            db2 = mu * db2 - lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            b2 += db2
            dW1 = mu * dW1 - lr * (
                derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            W1 += dW1
            db1 = mu * db1 - lr * (derivative_b1(Z, Ybatch, pYbatch, W2) +
                                   reg * b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 3. batch with Nesterov momentum
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nest = []
    CR_nest = []
    mu = 0.9
    # alternate version uses dW
    # dW2 = 0
    # db2 = 0
    # dW1 = 0
    # db1 = 0
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            # because we want g(t) = grad(f(W(t-1) - lr*mu*dW(t-1)))
            # dW(t) = mu*dW(t-1) + g(t)
            # W(t) = W(t-1) - lr*dW(t)
            W1_tmp = W1 - lr * mu * vW1
            b1_tmp = b1 - lr * mu * vb1
            W2_tmp = W2 - lr * mu * vW2
            b2_tmp = b2 - lr * mu * vb2

            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            # pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            pYbatch, Z = forward(Xbatch, W1_tmp, b1_tmp, W2_tmp, b2_tmp)

            # updates
            # dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            # W2 += dW2
            # db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            # b2 += db2
            # dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            # W1 += dW1
            # db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            # b1 += db1
            vW2 = mu * vW2 + derivative_w2(Z, Ybatch, pYbatch) + reg * W2_tmp
            W2 -= lr * vW2
            vb2 = mu * vb2 + derivative_b2(Ybatch, pYbatch) + reg * b2_tmp
            b2 -= lr * vb2
            vW1 = mu * vW1 + derivative_w1(Xbatch, Z, Ybatch, pYbatch,
                                           W2_tmp) + reg * W1_tmp
            W1 -= lr * vW1
            vb1 = mu * vb1 + derivative_b1(Z, Ybatch, pYbatch,
                                           W2_tmp) + reg * b1_tmp
            b1 -= lr * vb1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_nest.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_nest.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label="momentum")
    plt.plot(LL_nest, label="nesterov")
    plt.legend()
    plt.show()
def main():
    max_iter = 10
    print_period = 10

    X, Y = get_normalized_data()
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_0 = np.random.randn(D, M) / np.sqrt(D)
    b1_0 = np.zeros(M)
    W2_0 = np.random.randn(M, K) / np.sqrt(M)
    b2_0 = np.zeros(K)

    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # 1st moment
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    # 2nd moment
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    # hyperparams
    lr0 = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8

    # 1. Adam
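    # Adam = momentum on the gradient (1st moment m) + RMSprop-style scaling
    # (2nd moment v), each bias-corrected for the zero initialization:
    #   m = beta1*m + (1 - beta1)*g          v = beta2*v + (1 - beta2)*g^2
    #   m_hat = m / (1 - beta1^t)            v_hat = v / (1 - beta2^t)
    #   W -= lr0 * m_hat / sqrt(v_hat + eps)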
    loss_adam = []
    err_adam = []
    t = 1
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # new m
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2

            # new v
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

            # bias correction
            correction1 = 1 - beta1 ** t
            hat_mW1 = mW1 / correction1
            hat_mb1 = mb1 / correction1
            hat_mW2 = mW2 / correction1
            hat_mb2 = mb2 / correction1

            correction2 = 1 - beta2 ** t
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2

            # update t
            t += 1

            # apply updates to the params
            W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps)
            W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps)


            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                loss_adam.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                err = error_rate(pY, Ytest)
                err_adam.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))


    # 2. RMSprop with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    loss_rms = []
    err_rms = []

    # comparable hyperparameters for fair comparison
    lr0 = 0.001
    mu = 0.9
    decay_rate = 0.999
    eps = 1e-8

    # rmsprop cache
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1

    # momentum
    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0
    
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps)
            W2 -= dW2

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps)
            b2 -= db2

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps)
            W1 -= dW1

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps)
            b1 -= db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                loss_rms.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                err = error_rate(pY, Ytest)
                err_rms.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(loss_adam, label='adam')
    plt.plot(loss_rms, label='rmsprop')
    plt.legend()
    plt.show()
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()

    ann = ANN([500, 300], [0.8, 0.5, 0.5])
    ann.fit(X, Y)
Example #36
from keras.models import Sequential
from keras.layers import Dense, Activation
from util import get_normalized_data, y2indicator

import matplotlib.pyplot as plt

Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

N, D = Xtrain.shape
K = len(set(Ytrain))

Ytrain = y2indicator(Ytrain)
Ytest = y2indicator(Ytest)

model = Sequential()

model.add(Dense(units=500, input_dim=D))
model.add(Activation('relu'))
model.add(Dense(units=300))
model.add(Activation('relu'))
model.add(Dense(units=K))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

r = model.fit(Xtrain,
              Ytrain,
              validation_data=(Xtest, Ytest),
              epochs=15)
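
The returned History object can then be plotted with the matplotlib import above (a short sketch; 'loss' and 'val_loss' are the keys Keras records when validation_data is given):

plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.legend()
plt.show()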
Example #37
def batch_grad():

    # get the data for the train and test sets
    X, Y = get_normalized_data()
    #XTrain = X[:-1000, :]
    #YTrain = Y[:-1000]
    #YTrain_ind = y2indicator(YTrain)
    #XTest = X[-1000:, :]
    #YTest = Y[-1000:]
    # = y2indicator(YTest)
    Y_ind = y2indicator(Y)

    batchSz = 500
    #Initialize random weights
    N, D = X.shape
    K = len(set(Y))
    M = 300
    W1 = np.random.randn(D, M)
    b1 = np.random.randn(M)
    W2 = np.random.randn(M, K)
    b2 = np.random.randn(K)

    learning_rate = 0.001
    reg = 0.01
    cache_w2 = 0
    cache_b2 = 0
    cache_w1 = 0
    cache_b1 = 0
    decay_rate = 0.999
    eps = 10e-10

    no_batches = int(N / batchSz)
    print("No of bathces: ", no_batches)
    for i in range(300):
        for n in range(no_batches):
            #get current batch
            XBatch = X[n * batchSz:(n * batchSz + batchSz), :]
            YBatch_ind = Y_ind[n * batchSz:(n * batchSz + batchSz), :]
            #Forward prop
            pY, Z = forward_relu(XBatch, W1, b1, W2, b2)

            #Backprop
            gW2 = derivative_w2(pY, YBatch_ind, Z) + reg * W2
            cache_w2 = decay_rate * cache_w2 + (1 - decay_rate) * gW2 * gW2
            W2 += learning_rate * gW2 / (np.sqrt(cache_w2) + eps)

            gb2 = derivative_b2(pY, YBatch_ind) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 += learning_rate * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(pY, YBatch_ind, W2, Z, XBatch) + reg * W1
            cache_w1 = decay_rate * cache_w1 + (1 - decay_rate) * gW1 * gW1
            W1 += learning_rate * gW1 / (np.sqrt(cache_w1) + eps)

            gb1 = derivative_b1(pY, YBatch_ind, W2, Z) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 += learning_rate * gb1 / (np.sqrt(cache_b1) + eps)

            if n % 100 == 0:
                #Forward prop
                #pY, Z = forward_relu(XBatch, W1, b1, W2, b2)
                YBatch = Y[n * batchSz:n * batchSz + batchSz]
                P = np.argmax(pY, axis=1)
                er = error_rate(P, YBatch)

                c = cost(YBatch_ind, pY)
                print("Loop: ", i, n, "Error rate: ", er, "Cost: ", c)

    pY, Z = forward_relu(X, W1, b1, W2, b2)
    p = np.argmax(pY, axis=1)
    print("Final Final training error rate: ", error_rate(p, Y))

    XTest = get_test_data()
    pY, ZTest = forward_relu(XTest, W1, b1, W2, b2)
    YTest = np.argmax(pY, axis=1)

    f = open("test_rms.csv", "w")
    f.write("ImageId,Label\n")
    n = YTest.shape[0]
    for i in range(n):
        f.write(str(i + 1) + "," + str(YTest[i]) + "\n")
    f.close()
from keras.models import Sequential
from keras.layers import Dense, Activation
from util import get_normalized_data, y2indicator

import matplotlib.pyplot as plt

# NOTE: do NOT name your file keras.py because it will conflict
# with importing keras

# installation is easy! just the usual "sudo pip(3) install keras"


# get the data, same as the Theano + Tensorflow examples
# get_normalized_data() already returns the train/test split
Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

# get shapes
N, D = Xtrain.shape
K = len(set(Ytrain))

# by default Keras wants one-hot encoded labels
# there's another cost function we can use
# where we can just pass in the integer labels directly
# just like Tensorflow / Theano
Ytrain = y2indicator(Ytrain)
Ytest = y2indicator(Ytest)


# the model will be a sequence of layers
model = Sequential()
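
The listing is truncated after Sequential(). A minimal sketch of the layer stack it presumably builds, mirroring the compiled model in Example #36 above (an assumption, not the original code):

model.add(Dense(units=500, input_dim=D, activation='relu'))
model.add(Dense(units=300, activation='relu'))
model.add(Dense(units=K, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# the alternative the comment above mentions: keep the integer labels
# (skip y2indicator) and use the sparse variant of the loss instead:
# model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])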
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()

    max_iter = 15
    print_period = 10

    lr = 0.00004
    mu = 0.9

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    # add an extra layer just for fun
    M1 = 300
    M2 = 100
    K = 10
    W1_init = np.random.randn(D, M1) / 28
    b1_init = np.zeros(M1)
    W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b2_init = np.zeros(M2)
    W3_init = np.random.randn(M2, K) / np.sqrt(M2)
    b3_init = np.zeros(K)


    # define variables and expressions
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))

    # define the model
    Z1 = tf.nn.relu( tf.matmul(X, W1) + b1 )
    Z2 = tf.nn.relu( tf.matmul(Z1, W2) + b2 )
    Yish = tf.matmul(Z2, W3) + b3 # remember, the cost function does the softmaxing! weird, right?

    # softmax_cross_entropy_with_logits takes in the "logits"
    # if you wanted to know the actual output of the neural net,
    # you could pass "Yish" into tf.nn.softmax(logits)
    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))

    # we choose the optimizer but don't implement the algorithm ourselves
    # let's go with RMSprop, since we just learned about it.
    # it includes momentum!
    train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)

    # we'll use this to calculate the error rate
    predict_op = tf.argmax(Yish, 1)

    LL = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
                Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

                session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                if j % print_period == 0:
                    test_cost = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
                    prediction = session.run(predict_op, feed_dict={X: Xtest})
                    err = error_rate(prediction, Ytest)
                    print "Cost / err at iteration i=%d, j=%d: %.6f / %.3f" % (i, j, test_cost, err)
                    LL.append(test_cost)

    plt.plot(LL)
    plt.show()
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20 # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. batch
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    # 2. batch with momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print "Error rate:", err
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)


    # 3. batch with Nesterov momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nest = []
    CR_nest = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_nest.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_nest.append(err)
                print "Error rate:", err
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)



    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label="momentum")
    plt.plot(LL_nest, label="nesterov")
    plt.legend()
    plt.show()
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20 # make it 30 for sigmoid
    print_period = 50

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch
    losses_batch = []
    errors_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_momentum = []
    errors_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # update velocities
            dW2 = mu*dW2 - lr*gW2
            db2 = mu*db2 - lr*gb2
            dW1 = mu*dW1 - lr*gW1
            db1 = mu*db1 - lr*gb1

            # updates
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_momentum.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_momentum.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))


    # 3. batch with Nesterov momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_nesterov = []
    errors_nesterov = []

    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # v update
            vW2 = mu*vW2 - lr*gW2
            vb2 = mu*vb2 - lr*gb2
            vW1 = mu*vW1 - lr*gW1
            vb1 = mu*vb1 - lr*gb1

            # param update
            W2 += mu*vW2 - lr*gW2
            b2 += mu*vb2 - lr*gb2
            W1 += mu*vW1 - lr*gW1
            b1 += mu*vb1 - lr*gb1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_nesterov.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_nesterov.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))



    plt.plot(losses_batch, label="batch")
    plt.plot(losses_momentum, label="momentum")
    plt.plot(losses_nesterov, label="nesterov")
    plt.legend()
    plt.show()
def main():
    X, Y = get_normalized_data()

    max_iter = 20
    print_period = 10
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_init = np.random.randn(D, M) / 28
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')

    thZ = relu( thX.dot(W1) + b1 )
    thY = T.nnet.softmax( thZ.dot(W2) + b2 )

    cost = -(thT * T.log(thY)).sum() + reg*((W1*W1).sum() + (b1*b1).sum() + (W2*W2).sum() + (b2*b2).sum())
    prediction = T.argmax(thY, axis=1)

    update_W1 = W1 - lr*T.grad(cost, W1)
    update_b1 = b1 - lr*T.grad(cost, b1)
    update_W2 = W2 - lr*T.grad(cost, W2)
    update_b2 = b2 - lr*T.grad(cost, b2)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (b1, update_b1), (W2, update_W2), (b2, update_b2)],
    )

    get_prediction = theano.function(
        inputs=[thX, thT],
        outputs=[cost, prediction],
    )

    LL = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

            train(Xbatch, Ybatch)
            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                err = error_rate(prediction_val, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, err))
                LL.append(cost_val)

    plt.plot(LL)
    plt.show()
Example #43
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    ann = ANN([500, 300], [0.8, 0.5, 0.5])
    ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True)
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    max_iter = 20
    print_period = 10

    lr = 0.0004
    reg = 0.01

    Xtrain = Xtrain.astype(np.float32)
    Ytrain = Ytrain.astype(np.float32)
    Ytrain_ind = y2indicator(Ytrain).astype(np.float32)
    Ytest_ind = y2indicator(Ytest).astype(np.float32)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_init = np.random.randn(D, M) / 28
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    # step 2: define theano variables and expressions
    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')

    # we can use the built-in theano functions to do relu and softmax
    thZ = relu( thX.dot(W1) + b1 ) # relu is new in Theano 0.7.1; if yours is older, define relu(x) = T.maximum(x, 0)
    thY = T.nnet.softmax( thZ.dot(W2) + b2 )

    # define the cost function and prediction
    cost = -(thT * T.log(thY)).sum() + reg*((W1*W1).sum() + (b1*b1).sum() + (W2*W2).sum() + (b2*b2).sum())
    prediction = T.argmax(thY, axis=1)

    # step 3: training expressions and functions
    # we can just include regularization as part of the cost because it is also automatically differentiated!
    update_W1 = W1 - lr*T.grad(cost, W1)
    update_b1 = b1 - lr*T.grad(cost, b1)
    update_W2 = W2 - lr*T.grad(cost, W2)
    update_b2 = b2 - lr*T.grad(cost, b2)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (b1, update_b1), (W2, update_W2), (b2, update_b2)],
    )

    # create another function for this because we want it over the whole dataset
    get_prediction = theano.function(
        inputs=[thX, thT],
        outputs=[cost, prediction],
    )

    costs = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

            train(Xbatch, Ybatch)
            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                err = error_rate(prediction_val, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, err))
                costs.append(cost_val)

    plt.plot(costs)
    plt.show()