Exemplo n.º 1
0
def nesterov_momentum(learning_rate):

    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)
    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)
    mu = .9
    dv = 0
    db_1 = 0
    dw = 0
    db_0 = 0
    nesterov_cost = []
    nesterov_cr = []
    best_nesterov = 0
    best_iteration = 0
    for i in range(iterations):
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b % batches == 0:
                nesterov_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                nesterov_cr.append(cr)
                if cr > best_nesterov:
                    best_nesterov = cr
                    best_iteration = i
            dv = mu * dv - learning_rate * derivative_v('tanh', Z, Y, T)
            db_1 = mu * db_1 - learning_rate * derivative_b1('tanh', Y, T)
            dw = mu * dw - learning_rate * derivative_w('tanh', X, Y, Z, T, v)
            db_0 = mu * db_0 - learning_rate * derivative_b0(
                'tanh', Y, Z, T, v)
            v += mu * dv - learning_rate * derivative_v('tanh', Z, Y, T)
            b_1 += mu * db_1 - learning_rate * derivative_b1('tanh', Y, T)
            w += mu * dw + learning_rate * derivative_w('tanh', X, Y, Z, T, v)
            b_0 += mu * db_0 - learning_rate * derivative_b0(
                'tanh', Y, Z, T, v)
        if i % 100 == 0:
            print('Nesterov Cost: ', nesterov_cost[i],
                  'Nesterov Classification: ', nesterov_cr[i])
    return nesterov_cost, nesterov_cr, best_nesterov, best_iteration
Exemplo n.º 2
0
def exp_decay(learning_rate):

    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)
    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N
    dv = 0
    d_b1 = 0
    dw = 0
    d_b0 = 0
    mu = .9
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)
    learning_rate = learning_rate
    exp_cost = []
    exp_cr = []
    exp_lr = []
    best_exp = 0
    best_iteration = 0
    for i in range(iterations):
        learning_rate = learning_rate * np.exp(-K * i)
        exp_lr.append(learning_rate)
        for b in range(batches):
            X = Xtrain[b * batches:(b + 1) * batches, :]
            T = Ttrain[b * batches:(b + 1) * batches, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b % batches == 0:
                exp_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                exp_cr.append(cr)
                if cr > best_exp:
                    best_exp = cr
                    best_iteration = i
            dv = mu * dv - learning_rate * derivative_v('tanh', Z, Y, T)
            d_b1 = mu * d_b1 - learning_rate * derivative_b1('tanh', Y, T)
            dw = mu * dw - learning_rate * derivative_w('tanh', X, Y, Z, T, v)
            d_b0 = mu * d_b0 - learning_rate * derivative_b0(
                'tanh', Y, Z, T, v)
            v += dv
            b_1 += d_b1
            w += dw
            b_0 += d_b0
        if i % 10 == 0:
            print('Exp Cost: ', exp_cost[i], 'Exp Classification: ', exp_cr[i])
    return exp_cost, exp_cr, exp_lr, best_exp, best_iteration
Exemplo n.º 3
0
    def train(self, X, Y, activation=1, lr=10e-7, reg=10e-7, epoch=10):
        N, D = X.shape  #Diamentionality of our data
        batch_size = 500
        n_batches = int(N / batch_size)
        ind = tar2ind(
            Y
        )  # WE convert our target array into indicator matrix using one hot encoding
        _, K = ind.shape

        self.W1 = np.random.randn(D, self.M) / np.sqrt(
            D)  #Input to hidden weight
        self.W2 = np.random.randn(self.M, K) / np.sqrt(
            self.M)  #Hidden to output weights
        self.b1 = np.random.randn(self.M)
        self.b2 = np.random.randn(K)
        dW2 = 0
        db2 = 0
        dW1 = 0
        db1 = 0
        mu = 0.9  # Momentum
        decay_rate = 0.99

        cost = []
        for n in range(0, 200):
            #tempx , tempy = shuffle(X, ind)
            for i in range(0, n_batches):
                X_tr = X[i * batch_size:(i * batch_size + batch_size), :]
                Y_tr = Y[i * batch_size:(i * batch_size + batch_size), ]
                ind = tar2ind(Y_tr)
                output, hidden = forward(X_tr, activation, self.W1, self.b1,
                                         self.W2, self.b2)

                #Performing backpropagation now
                dW2 = mu * dW2 + lr * (derivative_W2(ind, output, hidden, reg,
                                                     self.W2))
                self.W2 = self.W2 + dW2
                db2 = mu * db2 + lr * (derivative_b2(ind, output, reg,
                                                     self.b2))
                self.b2 = self.b2 + db2
                dW1 = mu * dW1 + lr * (derivative_W1(
                    ind, output, hidden, self.W2, X_tr, activation, reg,
                    self.W1))
                self.W1 = self.W1 + dW1
                db1 = mu * db1 + lr * (derivative_b1(
                    ind, output, hidden, self.W2, activation, reg, self.b1))
                self.b1 = self.b1 + db1
                c = cross_entropy(ind, output)
                cost.append(c)

                if i % 10 == 0:
                    result = np.argmax(output, axis=1)
                    r = classification_rate(Y_tr, result)
                    print("iteration:- ", i, "cost:- ", c,
                          "classification rate:- ", r)
Exemplo n.º 4
0
def batch(learning_rate):

    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)
    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = len(X) // batch_N
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)
    batch_cost = []
    batch_cr = []
    best_batch = 0
    best_iteration = 0
    for i in range(iterations):
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b % batches == 0:
                batch_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                batch_cr.append(cr)
                if cr > best_batch:
                    best_batch = cr
                    best_iteration = i
            v -= learning_rate * derivative_v('tanh', Z, Y, T)
            b_1 -= learning_rate * derivative_b1('tanh', Y, T)
            w -= learning_rate * derivative_w('tanh', X, Y, Z, T, v)
            b_0 -= learning_rate * derivative_b0('tanh', Y, Z, T, v)
        if i % 100 == 0:
            print('Batch Cost: ', batch_cost[i], 'Batch Classification: ',
                  batch_cr[i])
    return batch_cost, batch_cr, best_batch, best_iteration
Exemplo n.º 5
0
def batch_grad():

    #get data and for test and train sets
    X, Y = get_normalized_data()
    #XTrain = X[:-1000, :]
    #YTrain = Y[:-1000]
    #YTrain_ind = y2indicator(YTrain)
    #XTest = X[-1000:, :]
    #YTest = Y[-1000:]
    # = y2indicator(YTest)
    Y_ind = y2indicator(Y)

    batchSz = 500
    #Initialize random weights
    N, D = X.shape
    K = len(set(Y))
    M = 300
    W1 = np.random.randn(D, M)
    b1 = np.random.randn(M)
    W2 = np.random.randn(M, K)
    b2 = np.random.randn(K)

    learning_rate = 0.001
    reg = 0.01
    cache_w2 = 0
    cache_b2 = 0
    cache_w1 = 0
    cache_b1 = 0
    decay_rate = 0.999
    eps = 10e-10

    no_batches = int(N / batchSz)
    print("No of bathces: ", no_batches)
    for i in range(300):
        for n in range(no_batches):
            #get current batch
            XBatch = X[n * batchSz:(n * batchSz + batchSz), :]
            YBatch_ind = Y_ind[n * batchSz:(n * batchSz + batchSz), :]
            #Forward prop
            pY, Z = forward_relu(XBatch, W1, b1, W2, b2)

            #Backprop
            gW2 = derivative_w2(pY, YBatch_ind, Z) + reg * W2
            cache_w2 = decay_rate * cache_w2 + (1 - decay_rate) * gW2 * gW2
            W2 += learning_rate * gW2 / (np.sqrt(cache_w2) + eps)

            gb2 = derivative_b2(pY, YBatch_ind) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 += learning_rate * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(pY, YBatch_ind, W2, Z, XBatch) + reg * W1
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 += learning_rate * gb2 / (np.sqrt(cache_b2) + eps)

            gb1 = derivative_b1(pY, YBatch_ind, W2, Z) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 += learning_rate * gb1 / (np.sqrt(cache_b1) + eps)

            if n % 100 == 0:
                #Forward prop
                #pY, Z = forward_relu(XBatch, W1, b1, W2, b2)
                YBatch = Y[n * batchSz:n * batchSz + batchSz]
                P = np.argmax(pY, axis=1)
                er = error_rate(P, YBatch)

                c = cost(YBatch_ind, pY)
                print("Loop: ", i, n, "Error rate: ", er, "Cost: ", c)

    pY, Z = forward_relu(X, W1, b1, W2, b2)
    p = np.argmax(pY, axis=1)
    print("Final Final training error rate: ", error_rate(p, Y))

    XTest = get_test_data()
    pY, ZTest = forward_relu(XTest, W1, b1, W2, b2)
    YTest = np.argmax(pY, axis=1)

    f = open("test_rms.csv", "w")
    f.write("ImageId,Label\n")
    n = YTest.shape[0]
    for i in range(n):
        f.write(str(i + 1) + "," + str(YTest[i]) + "\n")
    f.close()
Exemplo n.º 6
0
def rmsprop(learning_rate):

    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)
    Ttest = generate_T(Ytest)
    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N
    dv = 0
    d_b1 = 0
    dw = 0
    d_b0 = 0
    mu = .9
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)
    cache_v = np.ones((M, K))
    cache_b1 = np.ones(K)
    cache_w = np.ones((D, M))
    cache_b0 = np.ones(M)
    epsilon = 10e-10
    decay = .9
    rmsprop_cost = []
    rmsprop_cr = []
    best_rms = 0
    best_iteration = 0
    for i in range(iterations):
        for b in range(batches):
            X = Xtrain[b * batches:(b + 1) * batches, :]
            T = Ttrain[b * batches:(b + 1) * batches, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b % batches == 0:
                rmsprop_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                rmsprop_cr.append(cr)
                if cr > best_rms:
                    best_rms = cr
                    best_iteration = i
            cache_v = decay * cache_v + (1 - decay) * derivative_v(
                'tanh', Z, Y, T)**2
            cache_b1 = decay * cache_b1 + (1 - decay) * derivative_b1(
                'tanh', Y, T)**2
            cache_w = decay * cache_w + (1 - decay) * derivative_w(
                'tanh', X, Y, Z, T, v)**2
            cache_b0 = decay * cache_b0 + (1 - decay) * derivative_b0(
                'tanh', Y, Z, T, v)**2
            dv = mu * dv - learning_rate * derivative_v(
                'tanh', Z, Y, T) / (np.sqrt(cache_v + epsilon))
            d_b1 = mu * d_b1 - learning_rate * derivative_b1(
                'tanh', Y, T) / (np.sqrt(cache_b1 + epsilon))
            dw = mu * dw - learning_rate * derivative_w(
                'tanh', X, Y, Z, T, v) / (np.sqrt(cache_w + epsilon))
            d_b0 = mu * d_b0 - learning_rate * derivative_b0(
                'tanh', Y, Z, T, v) / (np.sqrt(cache_b0 + epsilon))
            v += dv
            b_1 += d_b1
            w += dw
            b_0 += d_b0
        if i % 10 == 0:
            print('RMSProp Cost: ', rmsprop_cost[i],
                  'RMSProp Classification: ', rmsprop_cr[i])
    return rmsprop_cost, rmsprop_cr, best_rms, best_iteration
def batch_grad():

    #get data and for test and train sets
    X, Y = get_normalized_data()
    #XTrain = X[:-1000, :]
    #YTrain = Y[:-1000]
    #YTrain_ind = y2indicator(YTrain)
    #XTest = X[-1000:, :]
    #YTest = Y[-1000:]
    # = y2indicator(YTest)
    Y_ind = y2indicator(Y)

    batchSz = 500
    #Initialize random weights
    N, D = X.shape
    K = len(set(Y))
    M = 300
    W1 = np.random.randn(D, M)
    b1 = np.random.randn(M)
    W2 = np.random.randn(M, K)
    b2 = np.random.randn(K)

    learning_rate = 10e-5

    no_batches = int(N / batchSz)
    print("No of bathces: ", no_batches)
    for i in range(300):
        for n in range(no_batches):
            #get current batch
            XBatch = X[n * batchSz:(n * batchSz + batchSz), :]
            #YBatch = Y[n*batchSz:n*batchSz + batchSz]
            YBatch_ind = Y_ind[n * batchSz:(n * batchSz + batchSz), :]
            #Forward prop
            pY, Z = forward_relu(XBatch, W1, b1, W2, b2)

            #Backprop
            W2 += learning_rate * derivative_w2(pY, YBatch_ind, Z)
            b2 += learning_rate * derivative_b2(pY, YBatch_ind)
            W1 += learning_rate * derivative_w1(pY, YBatch_ind, W2, Z, XBatch)
            b1 += learning_rate * derivative_b1(pY, YBatch_ind, W2, Z)

            if n % 100 == 0:
                #Forward prop
                #pY, Z = forward_relu(XBatch, W1, b1, W2, b2)
                YBatch = Y[n * batchSz:n * batchSz + batchSz]
                P = np.argmax(pY, axis=1)
                er = error_rate(P, YBatch)

                c = cost(YBatch_ind, pY)
                print("Loop: ", i, n, "Error rate: ", er, "Cost: ", c)

    # pY, Z = forward_prop(XTrain, W1, b1, W2, b2)
    # P = np.argmax(pY, axis=1)
    # print("Final training error rate: ", error_rate(P, YTrain))
    #
    # pY, Z = forward_prop(XTest, W1, b1, W2, b2)
    # P = np.argmax(pY, axis=1)
    # print("Final testing error rate: ", error_rate(P, YTest))

    pY, Z = forward_relu(X, W1, b1, W2, b2)
    p = np.argmax(pY, axis=1)
    print("Final Final training error rate: ", error_rate(p, Y))
Exemplo n.º 8
0
    def full(self):

        for i in range(self.iterations):
            Y_train, Z = generate_Y(self.activation, self.Xtrain, self.w,
                                    self.b_0, self.v, self.b_1)
            P_train = np.argmax(Y_train, axis=1)
            Y_test, _ = generate_Y(self.activation, self.Xtest, self.w,
                                   self.b_0, self.v, self.b_1)
            P_test = np.argmax(Y_test, axis=1)
            self.train_cost.append(cross_entropy(Y_train, self.Ttrain))
            self.test_cost.append(cross_entropy(Y_test, self.Ttest))
            train_cr = classification_rate(P_train, self.Ytrain)
            self.train_cr.append(train_cr)
            test_cr = classification_rate(P_test, self.Ytest)
            self.test_cr.append(test_cr)
            if train_cr > self.best_train:
                self.best_train = train_cr
                self.train_iteration = i
            if test_cr > self.best_test:
                self.best_test = test_cr
                self.test_iteration = i
            self.m_v = self.decay_0 * self.m_v + (
                1 - self.decay_0) * derivative_v(self.activation, Z, Y_train,
                                                 self.Ttrain)
            self.dm_v = self.m_v / (1 - self.decay_0**(i + 1))
            self.v_v = self.decay_1 * self.v_v + (
                1 - self.decay_1) * derivative_v(self.activation, Z, Y_train,
                                                 self.Ttrain)**2
            self.dv_v = self.v_v / (1 - self.decay_1**(i + 1))
            self.m_b1 = self.decay_0 * self.m_b1 + (
                1 - self.decay_0) * derivative_b1(self.activation, Y_train,
                                                  self.Ttrain)
            self.dm_b1 = self.m_b1 / (1 - self.decay_0**(i + 1))
            self.v_b1 = self.decay_1 * self.v_b1 + (
                1 - self.decay_1) * derivative_b1(self.activation, Y_train,
                                                  self.Ttrain)**2
            self.dv_b1 = self.v_b1 / (1 - self.decay_1**(i + 1))
            self.m_w = self.decay_0 * self.m_w + (
                1 - self.decay_0) * derivative_w(self.activation, self.Xtrain,
                                                 Y_train, Z, self.Ttrain,
                                                 self.v)
            self.dm_w = self.m_w / (1 - self.decay_0**(i + 1))
            self.v_w = self.decay_1 * self.v_w + (
                1 - self.decay_1) * derivative_w(self.activation, self.Xtrain,
                                                 Y_train, Z, self.Ttrain,
                                                 self.v)**2
            self.dv_w = self.v_w / (1 - self.decay_1**(i + 1))
            self.m_b0 = self.decay_0 * self.m_b0 + (
                1 - self.decay_0) * derivative_b0(self.activation, Y_train, Z,
                                                  self.Ttrain, self.v)
            self.dm_b0 = self.m_b0 / (1 - self.decay_0**(i + 1))
            self.v_b0 = self.decay_1 * self.v_b0 + (
                1 - self.decay_1) * derivative_b0(self.activation, Y_train, Z,
                                                  self.Ttrain, self.v)**2
            self.dv_b0 = self.v_b0 / (1 - self.decay_1**(i + 1))
            self.v -= self.learning_rate * self.dm_v / (np.sqrt(self.dv_v +
                                                                self.epsilon))
            self.b_1 -= self.learning_rate * self.dm_b1 / (
                np.sqrt(self.dv_b1 + self.epsilon))
            self.w -= self.learning_rate * self.dm_w / (np.sqrt(self.dv_w +
                                                                self.epsilon))
            self.b_0 -= self.learning_rate * self.dm_b0 / (
                np.sqrt(self.dv_b0 + self.epsilon))
            if i % 100 == 0:
                print(i, 'Train Cost: ', self.train_cost[i],
                      'Train Classification Rate: ', self.train_cr[i])
Exemplo n.º 9
0
    def stochastic(self, samples):

        for i in range(self.iterations):
            current_X, current_T = shuffle(self.Xtrain, self.Ttrain)
            for s in range(samples):
                X = current_X[s, :].reshape(1, current_X.shape[1])
                T = current_T[s, :].reshape(1, current_T.shape[1])
                Y, Z = generate_Y(self.activation, X, self.w, self.b_0, self.v,
                                  self.b_1)
                Y_train, _ = generate_Y(self.activation, self.Xtrain, self.w,
                                        self.b_0, self.v, self.b_1)
                P_train = np.argmax(Y_train, axis=1)
                Y_test, _ = generate_Y(self.activation, self.Xtest, self.w,
                                       self.b_0, self.v, self.b_1)
                P_test = np.argmax(Y_test, axis=1)
                self.train_cost.append(cross_entropy(Y_train, self.Ttrain))
                self.test_cost.append(cross_entropy(Y_test, self.Ttest))
                train_cr = classification_rate(P_train, self.Ytrain)
                self.train_cr.append(train_cr)
                test_cr = classification_rate(P_test, self.Ytest)
                self.test_cr.append(test_cr)
                if train_cr > self.best_train:
                    self.best_train = train_cr
                    self.train_iteration = i
                if test_cr > self.best_test:
                    self.best_test = test_cr
                    self.test_iteration = i
                self.m_v = self.decay_0 * self.m_v + (
                    1 - self.decay_0) * derivative_v(self.activation, Z, Y, T)
                self.dm_v = self.m_v / (1 - self.decay_0**(i + 1))
                self.v_v = self.decay_1 * self.v_v + (
                    1 - self.decay_1) * derivative_v(self.activation, Z, Y,
                                                     T)**2
                self.dv_v = self.v_v / (1 - self.decay_1**(i + 1))
                self.m_b1 = self.decay_0 * self.m_b1 + (
                    1 - self.decay_0) * derivative_b1(self.activation, Y, T)
                self.dm_b1 = self.m_b1 / (1 - self.decay_0**(i + 1))
                self.v_b1 = self.decay_1 * self.v_b1 + (
                    1 - self.decay_1) * derivative_b1(self.activation, Y, T)**2
                self.dv_b1 = self.v_b1 / (1 - self.decay_1**(i + 1))
                self.m_w = self.decay_0 * self.m_w + (
                    1 - self.decay_0) * derivative_w(self.activation, X, Y, Z,
                                                     T, self.v)
                self.dm_w = self.m_w / (1 - self.decay_0**(i + 1))
                self.v_w = self.decay_1 * self.v_w + (
                    1 - self.decay_1) * derivative_w(self.activation, X, Y, Z,
                                                     T, self.v)**2
                self.dv_w = self.v_w / (1 - self.decay_1**(i + 1))
                self.m_b0 = self.decay_0 * self.m_b0 + (
                    1 - self.decay_0) * derivative_b0(self.activation, Y, Z, T,
                                                      self.v)
                self.dm_b0 = self.m_b0 / (1 - self.decay_0**(i + 1))
                self.v_b0 = self.decay_1 * self.v_b0 + (
                    1 - self.decay_1) * derivative_b0(self.activation, Y, Z, T,
                                                      self.v)**2
                self.dv_b0 = self.v_b0 / (1 - self.decay_1**(i + 1))
                self.v -= self.learning_rate * self.dm_v / (
                    np.sqrt(self.dv_v + self.epsilon))
                self.b_1 -= self.learning_rate * self.dm_b1 / (
                    np.sqrt(self.dv_b1 + self.epsilon))
                self.w -= self.learning_rate * self.dm_w / (
                    np.sqrt(self.dv_w + self.epsilon))
                self.b_0 -= self.learning_rate * self.dm_b0 / (
                    np.sqrt(self.dv_b0 + self.epsilon))
            if i % 100 == 0:
                print(i, 'Train Cost: ', self.train_cost[i],
                      'Train Classification Rate: ', self.train_cr[i])