def grid():
    # Grid search over learning rates and momentum values for a softmax
    # classifier trained with momentum gradient descent.
    X, Y = transform_data()
    X, Y = shuffle(X, Y)

    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)

    N, D = Xtrain.shape
    K = len(set(Y))
    w0 = np.random.randn(D, K) / np.sqrt(D + K)
    b0 = np.random.randn(K) / np.sqrt(K)

    learning_rates = [10**i for i in range(-7, -3, 1)]
    momentums = [1 - 10**i for i in sorted(list(range(-4, 0)), reverse=True)]
    iterations = 2000

    best_lr = 0
    best_mu = 0
    best_cr = 0
    cost = {}
    cr = {}

    for lr in learning_rates:
        learning_rate = lr
        for mu in momentums:
            dw = 0
            db = 0
            cost[(lr, mu)] = list()
            cr[(lr, mu)] = list()
            for i in range(iterations):
                if i == 0:
                    A_train = relu(Xtrain.dot(w0) + b0)
                    A_test = relu(Xtest.dot(w0) + b0)
                else:
                    # use the updated weights and biases after the first step
                    A_train = relu(Xtrain.dot(w) + b)
                    A_test = relu(Xtest.dot(w) + b)
                # softmax output probabilities
                Y_train = np.exp(A_train) / np.exp(A_train).sum(axis=1, keepdims=True)
                Y_test = np.exp(A_test) / np.exp(A_test).sum(axis=1, keepdims=True)
                P_test = np.argmax(Y_test, axis=1)

                cost[(lr, mu)].append(cross_entropy(Y_test, Ttest))
                current_cr = classification_rate(P_test, Ytest)
                cr[(lr, mu)].append(current_cr)
                if current_cr > best_cr:
                    best_cr = current_cr
                    best_lr = lr
                    best_mu = mu

                # momentum updates
                dw = mu * dw - (1 - mu) * learning_rate * derivative_w(Xtrain, Y_train, Ttrain)
                db = mu * db - (1 - mu) * learning_rate * derivative_b(Y_train, Ttrain)
                if i == 0:
                    w = w0 + dw
                    b = b0 + db
                else:
                    w += dw
                    b += db

                if i % 100 == 0:
                    print('Learning Rate: ', lr, 'Momentum: ', mu,
                          'Cost: ', cost[(lr, mu)][i],
                          'Classification Rate: ', cr[(lr, mu)][i])
                if i == (iterations - 1):
                    print('')

    return cost, cr, best_lr, best_mu, best_cr
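# A minimal usage sketch for grid() above; it assumes the helper functions it
# calls (transform_data, relu, cross_entropy, classification_rate,
# derivative_w, derivative_b) are defined in the same module:
cost, cr, best_lr, best_mu, best_cr = grid()
print('best learning rate:', best_lr, 'best momentum:', best_mu,
      'best classification rate:', best_cr)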
def exp_decay(learning_rate):
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)

    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N

    dv = 0
    d_b1 = 0
    dw = 0
    d_b0 = 0
    mu = .9

    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)

    initial_lr = learning_rate
    decay = 0.1  # decay constant for the schedule (assumed value)
    exp_cost = []
    exp_cr = []
    exp_lr = []
    best_exp = 0
    best_iteration = 0

    for i in range(iterations):
        # exponential learning-rate decay: lr_i = lr_0 * exp(-decay * i)
        learning_rate = initial_lr * np.exp(-decay * i)
        exp_lr.append(learning_rate)
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)

            if b % batches == 0:
                exp_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                exp_cr.append(cr)
                if cr > best_exp:
                    best_exp = cr
                    best_iteration = i

            # momentum updates with the decayed learning rate
            dv = mu * dv - learning_rate * derivative_v('tanh', Z, Y, T)
            d_b1 = mu * d_b1 - learning_rate * derivative_b1('tanh', Y, T)
            dw = mu * dw - learning_rate * derivative_w('tanh', X, Y, Z, T, v)
            d_b0 = mu * d_b0 - learning_rate * derivative_b0('tanh', Y, Z, T, v)
            v += dv
            b_1 += d_b1
            w += dw
            b_0 += d_b0

        if i % 10 == 0:
            print('Exp Cost: ', exp_cost[i], 'Exp Classification: ', exp_cr[i])

    return exp_cost, exp_cr, exp_lr, best_exp, best_iteration
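# Sketch of the exponential schedule used by exp_decay() in isolation,
# with hypothetical values for the initial rate and decay constant:
lr_0, decay = 1e-3, 0.1
schedule = [lr_0 * np.exp(-decay * i) for i in range(50)]
print(schedule[0], schedule[10], schedule[49])  # monotonically shrinking rates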
def nesterov_momentum(learning_rate):
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)

    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N

    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)

    mu = .9
    dv = 0
    db_1 = 0
    dw = 0
    db_0 = 0

    nesterov_cost = []
    nesterov_cr = []
    best_nesterov = 0
    best_iteration = 0

    for i in range(iterations):
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)

            if b % batches == 0:
                nesterov_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                nesterov_cr.append(cr)
                if cr > best_nesterov:
                    best_nesterov = cr
                    best_iteration = i

            # gradients for the current mini-batch
            g_v = derivative_v('tanh', Z, Y, T)
            g_b1 = derivative_b1('tanh', Y, T)
            g_w = derivative_w('tanh', X, Y, Z, T, v)
            g_b0 = derivative_b0('tanh', Y, Z, T, v)

            # velocity updates
            dv = mu * dv - learning_rate * g_v
            db_1 = mu * db_1 - learning_rate * g_b1
            dw = mu * dw - learning_rate * g_w
            db_0 = mu * db_0 - learning_rate * g_b0

            # Nesterov look-ahead parameter updates
            v += mu * dv - learning_rate * g_v
            b_1 += mu * db_1 - learning_rate * g_b1
            w += mu * dw - learning_rate * g_w
            b_0 += mu * db_0 - learning_rate * g_b0

        if i % 100 == 0:
            print('Nesterov Cost: ', nesterov_cost[i],
                  'Nesterov Classification: ', nesterov_cr[i])

    return nesterov_cost, nesterov_cr, best_nesterov, best_iteration
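# Scalar sketch of the Nesterov look-ahead step used above, on the toy loss
# 0.5 * x**2 (so the gradient is just x); all values are illustrative only:
x, vel, lr, mu = 5.0, 0.0, 0.1, 0.9
for _ in range(20):
    g = x                     # gradient of 0.5 * x**2 at the current x
    vel = mu * vel - lr * g   # velocity update
    x += mu * vel - lr * g    # parameter step with the look-ahead term
print('x after 20 Nesterov steps:', x)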
def train(self, X, Y, activation=1, lr=10e-7, reg=10e-7, epoch=10):
    N, D = X.shape  # dimensionality of our data
    batch_size = 500
    n_batches = int(N / batch_size)
    ind = tar2ind(Y)  # convert the target array into an indicator matrix (one-hot encoding)
    _, K = ind.shape

    self.W1 = np.random.randn(D, self.M) / np.sqrt(D)  # input-to-hidden weights
    self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)  # hidden-to-output weights
    self.b1 = np.random.randn(self.M)
    self.b2 = np.random.randn(K)

    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    mu = 0.9  # momentum
    decay_rate = 0.99
    cost = []

    for n in range(0, 200):
        # tempx, tempy = shuffle(X, ind)
        for i in range(0, n_batches):
            X_tr = X[i * batch_size:(i * batch_size + batch_size), :]
            Y_tr = Y[i * batch_size:(i * batch_size + batch_size), ]
            ind = tar2ind(Y_tr)
            output, hidden = forward(X_tr, activation, self.W1, self.b1,
                                     self.W2, self.b2)

            # perform backpropagation with momentum
            dW2 = mu * dW2 + lr * (derivative_W2(ind, output, hidden, reg, self.W2))
            self.W2 = self.W2 + dW2
            db2 = mu * db2 + lr * (derivative_b2(ind, output, reg, self.b2))
            self.b2 = self.b2 + db2
            dW1 = mu * dW1 + lr * (derivative_W1(ind, output, hidden, self.W2,
                                                 X_tr, activation, reg, self.W1))
            self.W1 = self.W1 + dW1
            db1 = mu * db1 + lr * (derivative_b1(ind, output, hidden, self.W2,
                                                 activation, reg, self.b1))
            self.b1 = self.b1 + db1

            c = cross_entropy(ind, output)
            cost.append(c)
            if i % 10 == 0:
                result = np.argmax(output, axis=1)
                r = classification_rate(Y_tr, result)
                print("iteration:- ", i, "cost:- ", c,
                      "classification rate:- ", r)
def batch(learning_rate):
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)

    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N

    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)

    batch_cost = []
    batch_cr = []
    best_batch = 0
    best_iteration = 0

    for i in range(iterations):
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)

            if b % batches == 0:
                batch_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                batch_cr.append(cr)
                if cr > best_batch:
                    best_batch = cr
                    best_iteration = i

            # plain mini-batch gradient descent updates
            v -= learning_rate * derivative_v('tanh', Z, Y, T)
            b_1 -= learning_rate * derivative_b1('tanh', Y, T)
            w -= learning_rate * derivative_w('tanh', X, Y, Z, T, v)
            b_0 -= learning_rate * derivative_b0('tanh', Y, Z, T, v)

        if i % 100 == 0:
            print('Batch Cost: ', batch_cost[i],
                  'Batch Classification: ', batch_cr[i])

    return batch_cost, batch_cr, best_batch, best_iteration
train_costs = []
test_costs = []
learning_rate = 0.001
for i in range(10000):
    pYtrain = fwd(Xtrain, w, b)
    pYtest = fwd(Xtest, w, b)

    ctrain = xentropy(Ytrain, pYtrain)
    ctest = xentropy(Ytest, pYtest)
    train_costs.append(ctrain)
    test_costs.append(ctest)

    # gradient descent on the binary cross-entropy
    w -= learning_rate * Xtrain.T.dot(pYtrain - Ytrain)
    b -= learning_rate * (pYtrain - Ytrain).sum()
    if i % 1000 == 0:
        print(i, ctrain, ctest)

print("Final train classification_rate:",
      classification_rate(Ytrain, np.round(pYtrain)))
print("Final test classification_rate:",
      classification_rate(Ytest, np.round(pYtest)))

legend1, = plt.plot(train_costs, label='train cost')
legend2, = plt.plot(test_costs, label='test cost')
plt.legend([legend1, legend2])
plt.show()
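# The in-loop weight update above is the binary cross-entropy gradient for a
# sigmoid model; written as a standalone helper it would look like this
# sketch (hypothetical function name, same math as the update above):
def xentropy_gradients(X, targets, predictions):
    gw = X.T.dot(predictions - targets)   # dJ/dw
    gb = (predictions - targets).sum()    # dJ/db
    return gw, gb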
'''
Created on May 14, 2017

@author: Varela
'''
# https://www.udemy.com/data-science-logistic-regression-in-python/learn/v4/t/lecture/5286980?start=0

import numpy as np
import pandas as pd

from ecommerce_preprocess import get_binary_data
from util import sigmoid, fwd, classification_rate

# score the data with randomly initialized weights
X, Y = get_binary_data()
D = X.shape[1]
W = np.random.randn(D)
b = 0

P_Y_given_X = fwd(X, W, b)
predictions = np.round(P_Y_given_X)

print("Score:", classification_rate(Y, predictions))
def rmsprop(learning_rate):
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)

    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N

    dv = 0
    d_b1 = 0
    dw = 0
    d_b0 = 0
    mu = .9

    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)

    cache_v = np.ones((M, K))
    cache_b1 = np.ones(K)
    cache_w = np.ones((D, M))
    cache_b0 = np.ones(M)
    epsilon = 10e-10
    decay = .9

    rmsprop_cost = []
    rmsprop_cr = []
    best_rms = 0
    best_iteration = 0

    for i in range(iterations):
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)

            if b % batches == 0:
                rmsprop_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                rmsprop_cr.append(cr)
                if cr > best_rms:
                    best_rms = cr
                    best_iteration = i

            # gradients for the current mini-batch
            g_v = derivative_v('tanh', Z, Y, T)
            g_b1 = derivative_b1('tanh', Y, T)
            g_w = derivative_w('tanh', X, Y, Z, T, v)
            g_b0 = derivative_b0('tanh', Y, Z, T, v)

            # RMSProp caches: running averages of the squared gradients
            cache_v = decay * cache_v + (1 - decay) * g_v**2
            cache_b1 = decay * cache_b1 + (1 - decay) * g_b1**2
            cache_w = decay * cache_w + (1 - decay) * g_w**2
            cache_b0 = decay * cache_b0 + (1 - decay) * g_b0**2

            # momentum updates scaled by the cache
            dv = mu * dv - learning_rate * g_v / (np.sqrt(cache_v + epsilon))
            d_b1 = mu * d_b1 - learning_rate * g_b1 / (np.sqrt(cache_b1 + epsilon))
            dw = mu * dw - learning_rate * g_w / (np.sqrt(cache_w + epsilon))
            d_b0 = mu * d_b0 - learning_rate * g_b0 / (np.sqrt(cache_b0 + epsilon))
            v += dv
            b_1 += d_b1
            w += dw
            b_0 += d_b0

        if i % 10 == 0:
            print('RMSProp Cost: ', rmsprop_cost[i],
                  'RMSProp Classification: ', rmsprop_cr[i])

    return rmsprop_cost, rmsprop_cr, best_rms, best_iteration
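# Single-parameter sketch of the RMSProp-with-momentum update used above
# (illustrative values only; the cache smooths the squared gradient and the
# step is scaled by its square root):
cache, velocity, p = 1.0, 0.0, 5.0
lr, decay, mu, eps = 1e-2, 0.9, 0.9, 1e-9
for _ in range(100):
    g = p  # gradient of the toy loss 0.5 * p**2
    cache = decay * cache + (1 - decay) * g**2
    velocity = mu * velocity - lr * g / np.sqrt(cache + eps)
    p += velocity
print('p after 100 RMSProp steps:', p)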
    Y = expA / expA.sum(axis=1, keepdims=True)
    return Y


def classification_rate(Y, P):
    n_correct = 0
    n_total = 0
    for i in range(len(Y)):
        n_total += 1
        if Y[i] == P[i]:
            n_correct += 1
    return float(n_correct) / n_total


P_Y_given_X = forward(X, W1, b1, W2, b2)
P = np.argmax(P_Y_given_X, axis=1)
# assert(len(P) == len(Y))
print("classification rate for random weights:", classification_rate(Y, P))

Z = utl.fwd(X, W1, b1)
A = Z.dot(W2) + b2
P_Y_given_X = utl.softmax(A)
P = np.argmax(P_Y_given_X, axis=1)
print("classification rate for random weights (test):",
      utl.classification_rate(Y, P))

utl.fwdprop(X, W1, b1, W2, b2)
print("classification rate for random weights (test-2):",
      utl.classification_rate(Y, P))
def main(argv):
    # load data
    test_data = [np.load(FLAGS.data_dir + 'test_x_%d.npy' % n) for n in range(1, 5)]
    test_labels = [np.load(FLAGS.data_dir + 'test_y_%d.npy' % n) for n in range(1, 5)]
    train_data = [np.load(FLAGS.data_dir + 'train_x_%d.npy' % n) for n in range(1, 5)]
    train_labels = [np.load(FLAGS.data_dir + 'train_y_%d.npy' % n) for n in range(1, 5)]

    # count data
    test_count = [d.shape[0] for d in test_data]
    train_count = [d.shape[0] for d in train_data]

    # specify model
    input_placeholder = tf.placeholder(tf.float32, [None, 16641],
                                       name='input_placeholder')
    my_network = tf.identity(model.build_network(input_placeholder),
                             name='output2')

    # define classification loss
    # code adapted from Paul Quint's hackathon 3
    REG_COEFF = 0.0001
    labels = tf.placeholder(tf.float32, [None, 7], name='labels')
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=labels,
                                                            logits=my_network)
    confusion_matrix_op = tf.confusion_matrix(tf.argmax(labels, axis=1),
                                              tf.argmax(my_network, axis=1),
                                              num_classes=7)
    regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    total_loss = cross_entropy + REG_COEFF * sum(regularization_losses)

    # set up training and saving
    # code adapted from Paul Quint's hackathon 3
    global_step_tensor = tf.get_variable('global_step', trainable=False,
                                         shape=[],
                                         initializer=tf.zeros_initializer)
    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(total_loss, global_step=global_step_tensor)
    saver = tf.train.Saver()
    sum_cross_entropy = tf.reduce_mean(cross_entropy)
    EPOCHS_BEFORE_STOPPING = 12

    # run the actual training
    # code adapted from Paul Quint's hackathon 3
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        best_test_conf_mxs = []
        best_epoch = [0, 0, 0, 0]
        best_test_ce = [10, 10, 10, 10]
        best_train_ce = [0, 0, 0, 0]
        best_classification_rate = [0, 0, 0, 0]
        epochs_since_best = [0, 0, 0, 0]
        for k in range(0, 4):
            session.run(tf.global_variables_initializer())
            batch_size = FLAGS.batch_size
            print("\n !!!!! NEW K (" + str(k) + ") !!!!!\n")
            for epoch in range(FLAGS.max_epoch_num):
                print("################### EPOCH " + str(epoch) + " #####################")
                print("##################################################\n")

                # run gradient steps and report mean loss on train data
                ce_vals = []
                for i in range(train_count[k] // batch_size):
                    batch_data = train_data[k][i * batch_size:(i + 1) * batch_size, :]
                    batch_labels = train_labels[k][i * batch_size:(i + 1) * batch_size]
                    _, train_ce = session.run([train_op, sum_cross_entropy], {
                        input_placeholder: batch_data,
                        labels: batch_labels
                    })
                    ce_vals.append(train_ce)
                avg_train_ce = sum(ce_vals) / len(ce_vals)
                best_train_ce[k] = avg_train_ce
                print('TRAIN CROSS ENTROPY: ' + str(avg_train_ce))
                print("\n##################################################")

                # report mean loss and confusion matrix on test data
                ce_vals = []
                conf_mxs = []
                for i in range(test_count[k] // batch_size):
                    batch_data = test_data[k][i * batch_size:(i + 1) * batch_size, :]
                    batch_labels = test_labels[k][i * batch_size:(i + 1) * batch_size]
                    test_ce, conf_matrix = session.run(
                        [sum_cross_entropy, confusion_matrix_op], {
                            input_placeholder: batch_data,
                            labels: batch_labels
                        })
                    ce_vals.append(test_ce)
                    conf_mxs.append(conf_matrix)
                avg_test_ce = sum(ce_vals) / len(ce_vals)
                classification_rate = util.classification_rate(sum(conf_mxs), 7)
                print('TEST CROSS ENTROPY: ' + str(avg_test_ce))
                print('TEST CONFUSION MATRIX:')
                print(str(sum(conf_mxs)))
                print('TEST CLASSIFICATION RATE:' + str(classification_rate))

            best_test_conf_mxs.append(sum(conf_mxs))
            best_test_ce[k] = avg_test_ce
            best_classification_rate[k] = classification_rate

        print('Confusion Matrix: ')
        print(str(sum(best_test_conf_mxs)))
        print('Avg Test CE: ' + str(np.average(best_test_ce)))
        print('Avg Train CE: ' + str(np.average(best_train_ce)))
        print('Avg Classification Rate: ' + str(np.average(best_classification_rate)))

        print('Generating model now...')
        session.run(tf.global_variables_initializer())
        for j in range(0, 4):
            for epoch in range(FLAGS.max_epoch_num):
                for i in range(train_count[j] // batch_size):
                    batch_data = train_data[j][i * batch_size:(i + 1) * batch_size, :]
                    batch_labels = train_labels[j][i * batch_size:(i + 1) * batch_size]
                    _, train_ce = session.run([train_op, sum_cross_entropy], {
                        input_placeholder: batch_data,
                        labels: batch_labels
                    })
        saver.save(session, FLAGS.save_dir)
        print('Model is generated and saved')
def full(self):
    for i in range(self.iterations):
        Y_train, Z = generate_Y(self.activation, self.Xtrain, self.w,
                                self.b_0, self.v, self.b_1)
        P_train = np.argmax(Y_train, axis=1)
        Y_test, _ = generate_Y(self.activation, self.Xtest, self.w,
                               self.b_0, self.v, self.b_1)
        P_test = np.argmax(Y_test, axis=1)

        self.train_cost.append(cross_entropy(Y_train, self.Ttrain))
        self.test_cost.append(cross_entropy(Y_test, self.Ttest))
        train_cr = classification_rate(P_train, self.Ytrain)
        self.train_cr.append(train_cr)
        test_cr = classification_rate(P_test, self.Ytest)
        self.test_cr.append(test_cr)
        if train_cr > self.best_train:
            self.best_train = train_cr
            self.train_iteration = i
        if test_cr > self.best_test:
            self.best_test = test_cr
            self.test_iteration = i

        # gradients on the full training set
        g_v = derivative_v(self.activation, Z, Y_train, self.Ttrain)
        g_b1 = derivative_b1(self.activation, Y_train, self.Ttrain)
        g_w = derivative_w(self.activation, self.Xtrain, Y_train, Z,
                           self.Ttrain, self.v)
        g_b0 = derivative_b0(self.activation, Y_train, Z, self.Ttrain, self.v)

        # Adam first and second moment estimates with bias correction
        self.m_v = self.decay_0 * self.m_v + (1 - self.decay_0) * g_v
        self.dm_v = self.m_v / (1 - self.decay_0**(i + 1))
        self.v_v = self.decay_1 * self.v_v + (1 - self.decay_1) * g_v**2
        self.dv_v = self.v_v / (1 - self.decay_1**(i + 1))

        self.m_b1 = self.decay_0 * self.m_b1 + (1 - self.decay_0) * g_b1
        self.dm_b1 = self.m_b1 / (1 - self.decay_0**(i + 1))
        self.v_b1 = self.decay_1 * self.v_b1 + (1 - self.decay_1) * g_b1**2
        self.dv_b1 = self.v_b1 / (1 - self.decay_1**(i + 1))

        self.m_w = self.decay_0 * self.m_w + (1 - self.decay_0) * g_w
        self.dm_w = self.m_w / (1 - self.decay_0**(i + 1))
        self.v_w = self.decay_1 * self.v_w + (1 - self.decay_1) * g_w**2
        self.dv_w = self.v_w / (1 - self.decay_1**(i + 1))

        self.m_b0 = self.decay_0 * self.m_b0 + (1 - self.decay_0) * g_b0
        self.dm_b0 = self.m_b0 / (1 - self.decay_0**(i + 1))
        self.v_b0 = self.decay_1 * self.v_b0 + (1 - self.decay_1) * g_b0**2
        self.dv_b0 = self.v_b0 / (1 - self.decay_1**(i + 1))

        # parameter updates
        self.v -= self.learning_rate * self.dm_v / (np.sqrt(self.dv_v + self.epsilon))
        self.b_1 -= self.learning_rate * self.dm_b1 / (np.sqrt(self.dv_b1 + self.epsilon))
        self.w -= self.learning_rate * self.dm_w / (np.sqrt(self.dv_w + self.epsilon))
        self.b_0 -= self.learning_rate * self.dm_b0 / (np.sqrt(self.dv_b0 + self.epsilon))

        if i % 100 == 0:
            print(i, 'Train Cost: ', self.train_cost[i],
                  'Train Classification Rate: ', self.train_cr[i])
def stochastic(self, samples):
    for i in range(self.iterations):
        current_X, current_T = shuffle(self.Xtrain, self.Ttrain)
        for s in range(samples):
            X = current_X[s, :].reshape(1, current_X.shape[1])
            T = current_T[s, :].reshape(1, current_T.shape[1])
            Y, Z = generate_Y(self.activation, X, self.w, self.b_0,
                              self.v, self.b_1)
            Y_train, _ = generate_Y(self.activation, self.Xtrain, self.w,
                                    self.b_0, self.v, self.b_1)
            P_train = np.argmax(Y_train, axis=1)
            Y_test, _ = generate_Y(self.activation, self.Xtest, self.w,
                                   self.b_0, self.v, self.b_1)
            P_test = np.argmax(Y_test, axis=1)

            self.train_cost.append(cross_entropy(Y_train, self.Ttrain))
            self.test_cost.append(cross_entropy(Y_test, self.Ttest))
            train_cr = classification_rate(P_train, self.Ytrain)
            self.train_cr.append(train_cr)
            test_cr = classification_rate(P_test, self.Ytest)
            self.test_cr.append(test_cr)
            if train_cr > self.best_train:
                self.best_train = train_cr
                self.train_iteration = i
            if test_cr > self.best_test:
                self.best_test = test_cr
                self.test_iteration = i

            # gradients on the current single sample
            g_v = derivative_v(self.activation, Z, Y, T)
            g_b1 = derivative_b1(self.activation, Y, T)
            g_w = derivative_w(self.activation, X, Y, Z, T, self.v)
            g_b0 = derivative_b0(self.activation, Y, Z, T, self.v)

            # Adam first and second moment estimates with bias correction
            self.m_v = self.decay_0 * self.m_v + (1 - self.decay_0) * g_v
            self.dm_v = self.m_v / (1 - self.decay_0**(i + 1))
            self.v_v = self.decay_1 * self.v_v + (1 - self.decay_1) * g_v**2
            self.dv_v = self.v_v / (1 - self.decay_1**(i + 1))

            self.m_b1 = self.decay_0 * self.m_b1 + (1 - self.decay_0) * g_b1
            self.dm_b1 = self.m_b1 / (1 - self.decay_0**(i + 1))
            self.v_b1 = self.decay_1 * self.v_b1 + (1 - self.decay_1) * g_b1**2
            self.dv_b1 = self.v_b1 / (1 - self.decay_1**(i + 1))

            self.m_w = self.decay_0 * self.m_w + (1 - self.decay_0) * g_w
            self.dm_w = self.m_w / (1 - self.decay_0**(i + 1))
            self.v_w = self.decay_1 * self.v_w + (1 - self.decay_1) * g_w**2
            self.dv_w = self.v_w / (1 - self.decay_1**(i + 1))

            self.m_b0 = self.decay_0 * self.m_b0 + (1 - self.decay_0) * g_b0
            self.dm_b0 = self.m_b0 / (1 - self.decay_0**(i + 1))
            self.v_b0 = self.decay_1 * self.v_b0 + (1 - self.decay_1) * g_b0**2
            self.dv_b0 = self.v_b0 / (1 - self.decay_1**(i + 1))

            # parameter updates
            self.v -= self.learning_rate * self.dm_v / (np.sqrt(self.dv_v + self.epsilon))
            self.b_1 -= self.learning_rate * self.dm_b1 / (np.sqrt(self.dv_b1 + self.epsilon))
            self.w -= self.learning_rate * self.dm_w / (np.sqrt(self.dv_w + self.epsilon))
            self.b_0 -= self.learning_rate * self.dm_b0 / (np.sqrt(self.dv_b0 + self.epsilon))

        if i % 100 == 0:
            print(i, 'Train Cost: ', self.train_cost[i],
                  'Train Classification Rate: ', self.train_cr[i])
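# Single-parameter sketch of the Adam update used by full() and stochastic()
# above (illustrative hyperparameters; m and v are the first and second
# moment estimates, debiased before each step):
p, m, v = 5.0, 0.0, 0.0
lr, beta1, beta2, eps = 1e-2, 0.9, 0.999, 1e-8
for t in range(1, 101):
    g = p  # gradient of the toy loss 0.5 * p**2
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g**2
    m_hat = m / (1 - beta1**t)   # bias-corrected first moment
    v_hat = v / (1 - beta2**t)   # bias-corrected second moment
    p -= lr * m_hat / (np.sqrt(v_hat) + eps)
print('p after 100 Adam steps:', p)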