def fit(self, X, Y, learning_rate=5e-6, reg=1.0, epochs=10000, show_fig=False, use_tanh=True):
    self.use_tanh = use_tanh

    # shuffle and hold out the last 1000 samples for validation
    X, Y = shuffle(X, Y)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    X, Y = X[:-1000], Y[:-1000]

    N, D = X.shape
    self.W1 = np.random.randn(D, self.M) / np.sqrt(D + self.M)
    self.b1 = np.zeros(self.M)
    self.W2 = np.random.randn(self.M) / np.sqrt(self.M)
    self.b2 = 0

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation
        pY, Z = self.forward(X)

        # gradient descent step
        pY_Y = pY - Y
        self.W2 -= learning_rate * (Z.T.dot(pY_Y) + reg * self.W2)
        self.b2 -= learning_rate * (pY_Y.sum() + reg * self.b2)

        if self.use_tanh:
            dZ = np.outer(pY_Y, self.W2) * (1 - Z * Z)  # tanh derivative
        else:
            dZ = np.outer(pY_Y, self.W2) * (Z > 0)      # relu derivative
        self.W1 -= learning_rate * (X.T.dot(dZ) + reg * self.W1)
        self.b1 -= learning_rate * (dZ.sum(axis=0) + reg * self.b1)

        if i % 20 == 0:
            pYvalid, _ = self.forward(Xvalid)
            c = sigmoid_cost(Yvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.round(pYvalid))
            print("i:", i, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best validation error:", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, Y, learning_rate=1e-5, reg=1.0, epochs=10000, show_fig=False):
    X, Y = shuffle(X, Y)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    X, Y = X[:-1000], Y[:-1000]

    N, D = X.shape
    K = len(set(Y))
    T = y2indicator(Y)
    self.W1 = np.random.randn(D, self.M) / np.sqrt(D + self.M)
    self.b1 = np.zeros(self.M)
    self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M + K)
    self.b2 = np.zeros(K)

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation
        pY, Z = self.forward(X)

        # gradient descent step
        pY_T = pY - T
        self.W2 -= learning_rate * (Z.T.dot(pY_T) + reg * self.W2)
        self.b2 -= learning_rate * (pY_T.sum(axis=0) + reg * self.b2)
        # dZ = pY_T.dot(self.W2.T) * (Z > 0)   # relu hidden units
        dZ = pY_T.dot(self.W2.T) * (1 - Z * Z)  # tanh hidden units
        self.W1 -= learning_rate * (X.T.dot(dZ) + reg * self.W1)
        self.b1 -= learning_rate * (dZ.sum(axis=0) + reg * self.b1)

        if i % 10 == 0:
            pYvalid, _ = self.forward(Xvalid)
            c = cost2(Yvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
            print("i:", i, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best validation error:", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, Y, learning_rate=1e-7, reg=1e-11, epochs=10000, show_fig=False):
    X, Y = shuffle(X, Y)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    Tvalid = y2indicator(Yvalid)
    X, Y = X[:-1000], Y[:-1000]

    N, D = X.shape
    K = len(set(Y))
    T = y2indicator(Y)
    self.W = np.random.randn(D, K) / np.sqrt(D + K)
    self.b = np.zeros(K)

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation
        pY = self.forward(X)

        # gradient descent step
        self.W -= learning_rate * (X.T.dot(pY - T) + reg * self.W)
        self.b -= learning_rate * ((pY - T).sum(axis=0) + reg * self.b)

        if i % 10 == 0:
            pYvalid = self.forward(Xvalid)
            c = cost(Tvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
            print("i:", i, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best validation error:", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, Y, learning_rate=1e-6, reg=0, epochs=120000, show_fig=False):
    X, Y = shuffle(X, Y)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    X, Y = X[:-1000], Y[:-1000]

    N, D = X.shape
    self.W = np.random.randn(D) / np.sqrt(D)
    self.b = 0

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation
        pY = self.forward(X)

        # gradient descent step
        self.W -= learning_rate * (X.T.dot(pY - Y) + reg * self.W)
        self.b -= learning_rate * ((pY - Y).sum() + reg * self.b)

        if i % 20 == 0:
            pYvalid = self.forward(Xvalid)
            c = sigmoid_cost(Yvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.round(pYvalid))
            print("i:", i, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best validation error:", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
def score(self, X, Y):
    prediction = self.predict(X)
    return 1 - error_rate(Y, prediction)
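# The fit() and score() methods above call several utilities (shuffle,
# sigmoid_cost, cost, cost2, error_rate, y2indicator) that are not defined in
# this listing. The block below is a minimal, plausible sketch consistent with
# how they are called; the actual helper module may differ in detail (e.g.
# numerical-stability clipping).
import numpy as np
from sklearn.utils import shuffle  # used by every fit() above


def sigmoid(a):
    return 1 / (1 + np.exp(-a))


def softmax(a):
    expA = np.exp(a)
    return expA / expA.sum(axis=1, keepdims=True)


def sigmoid_cost(T, Y):
    # binary cross-entropy; T holds 0/1 targets, Y predicted probabilities
    return -(T * np.log(Y) + (1 - T) * np.log(1 - Y)).sum()


def cost(T, Y):
    # multiclass cross-entropy with T as an N x K indicator matrix
    return -(T * np.log(Y)).sum()


def cost2(T, Y):
    # multiclass cross-entropy with T as integer class labels
    N = len(T)
    return -np.log(Y[np.arange(N), T]).mean()


def error_rate(targets, predictions):
    return np.mean(targets != predictions)


def y2indicator(y):
    # convert integer labels 0..K-1 into an N x K one-hot indicator matrix
    N = len(y)
    K = len(set(y))
    ind = np.zeros((N, K))
    ind[np.arange(N), y] = 1
    return ind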
def main():
    # load the data, transform as needed
    train = loadmat('../large_files/train_32x32.mat')
    test = loadmat('../large_files/test_32x32.mat')

    # Need to scale! don't leave as 0..255
    # Y is an N x 1 matrix with values 1..10 (MATLAB indexes by 1)
    # So flatten it and make it 0..9
    # Also need indicator matrix for cost calculation
    Xtrain = rearrange(train['X'])
    Ytrain = train['y'].flatten() - 1
    del train
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Ytrain_ind = y2indicator(Ytrain)

    Xtest = rearrange(test['X'])
    Ytest = test['y'].flatten() - 1
    del test
    Ytest_ind = y2indicator(Ytest)

    max_iter = 8
    print_period = 10

    lr = np.float32(0.00001)
    reg = np.float32(0.01)
    mu = np.float32(0.99)

    N = Xtrain.shape[0]
    batch_sz = 500
    n_batches = N // batch_sz

    M = 500
    K = 10
    poolsz = (2, 2)

    # after conv will be of dimension 32 - 5 + 1 = 28
    # after downsample 28 / 2 = 14
    W1_shape = (20, 3, 5, 5)  # (num_feature_maps, num_color_channels, filter_width, filter_height)
    W1_init = init_filter(W1_shape, poolsz)
    b1_init = np.zeros(W1_shape[0], dtype=np.float32)  # one bias per output feature map

    # after conv will be of dimension 14 - 5 + 1 = 10
    # after downsample 10 / 2 = 5
    W2_shape = (50, 20, 5, 5)  # (num_feature_maps, old_num_feature_maps, filter_width, filter_height)
    W2_init = init_filter(W2_shape, poolsz)
    b2_init = np.zeros(W2_shape[0], dtype=np.float32)

    # vanilla ANN weights
    W3_init = np.random.randn(W2_shape[0]*5*5, M) / np.sqrt(W2_shape[0]*5*5 + M)
    b3_init = np.zeros(M, dtype=np.float32)
    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    b4_init = np.zeros(K, dtype=np.float32)

    # define theano variables and expressions
    X = T.tensor4('X', dtype='float32')
    Y = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')
    W3 = theano.shared(W3_init.astype(np.float32), 'W3')
    b3 = theano.shared(b3_init, 'b3')
    W4 = theano.shared(W4_init.astype(np.float32), 'W4')
    b4 = theano.shared(b4_init, 'b4')

    # momentum changes
    dW1 = theano.shared(np.zeros(W1_init.shape, dtype=np.float32), 'dW1')
    db1 = theano.shared(np.zeros(b1_init.shape, dtype=np.float32), 'db1')
    dW2 = theano.shared(np.zeros(W2_init.shape, dtype=np.float32), 'dW2')
    db2 = theano.shared(np.zeros(b2_init.shape, dtype=np.float32), 'db2')
    dW3 = theano.shared(np.zeros(W3_init.shape, dtype=np.float32), 'dW3')
    db3 = theano.shared(np.zeros(b3_init.shape, dtype=np.float32), 'db3')
    dW4 = theano.shared(np.zeros(W4_init.shape, dtype=np.float32), 'dW4')
    db4 = theano.shared(np.zeros(b4_init.shape, dtype=np.float32), 'db4')

    # forward pass
    Z1 = convpool(X, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    Z3 = relu(Z2.flatten(ndim=2).dot(W3) + b3)
    pY = T.nnet.softmax(Z3.dot(W4) + b4)

    # define the cost function and prediction
    params = (W1, b1, W2, b2, W3, b3, W4, b4)
    reg_cost = reg * sum((param*param).sum() for param in params)
    cost = -(Y * T.log(pY)).sum() + reg_cost
    prediction = T.argmax(pY, axis=1)

    # step 3: training expressions and functions
    update_W1 = W1 + mu*dW1 - lr*T.grad(cost, W1)
    update_b1 = b1 + mu*db1 - lr*T.grad(cost, b1)
    update_W2 = W2 + mu*dW2 - lr*T.grad(cost, W2)
    update_b2 = b2 + mu*db2 - lr*T.grad(cost, b2)
    update_W3 = W3 + mu*dW3 - lr*T.grad(cost, W3)
    update_b3 = b3 + mu*db3 - lr*T.grad(cost, b3)
    update_W4 = W4 + mu*dW4 - lr*T.grad(cost, W4)
    update_b4 = b4 + mu*db4 - lr*T.grad(cost, b4)

    # update weight changes
    update_dW1 = mu*dW1 - lr*T.grad(cost, W1)
    update_db1 = mu*db1 - lr*T.grad(cost, b1)
    update_dW2 = mu*dW2 - lr*T.grad(cost, W2)
    update_db2 = mu*db2 - lr*T.grad(cost, b2)
    update_dW3 = mu*dW3 - lr*T.grad(cost, W3)
    update_db3 = mu*db3 - lr*T.grad(cost, b3)
    update_dW4 = mu*dW4 - lr*T.grad(cost, W4)
    update_db4 = mu*db4 - lr*T.grad(cost, b4)

    train = theano.function(
        inputs=[X, Y],
        updates=[
            (W1, update_W1),
            (b1, update_b1),
            (W2, update_W2),
            (b2, update_b2),
            (W3, update_W3),
            (b3, update_b3),
            (W4, update_W4),
            (b4, update_b4),
            (dW1, update_dW1),
            (db1, update_db1),
            (dW2, update_dW2),
            (db2, update_db2),
            (dW3, update_dW3),
            (db3, update_db3),
            (dW4, update_dW4),
            (db4, update_db4),
        ],
    )

    # create another function for this because we want it over the whole dataset
    get_prediction = theano.function(
        inputs=[X, Y],
        outputs=[cost, prediction],
    )

    t0 = datetime.now()
    LL = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

            train(Xbatch, Ybatch)
            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                err = error_rate(prediction_val, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, err))
                LL.append(cost_val)
    print("Elapsed time:", (datetime.now() - t0))
    plt.plot(LL)
    plt.show()

    # visualize W1 (20, 3, 5, 5)
    W1_val = W1.get_value()
    grid = np.zeros((8*5, 8*5))
    m = 0
    n = 0
    for i in range(20):
        for j in range(3):
            filt = W1_val[i, j]
            grid[m*5:(m+1)*5, n*5:(n+1)*5] = filt
            m += 1
            if m >= 8:
                m = 0
                n += 1
    plt.imshow(grid, cmap='gray')
    plt.title("W1")
    plt.show()

    # visualize W2 (50, 20, 5, 5)
    W2_val = W2.get_value()
    grid = np.zeros((32*5, 32*5))
    m = 0
    n = 0
    for i in range(50):
        for j in range(20):
            filt = W2_val[i, j]
            grid[m*5:(m+1)*5, n*5:(n+1)*5] = filt
            m += 1
            if m >= 32:
                m = 0
                n += 1
    plt.imshow(grid, cmap='gray')
    plt.title("W2")
    plt.show()
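# The Theano SVHN script above assumes a few helpers (rearrange, init_filter,
# convpool, relu) defined elsewhere. The sketch below is one minimal, plausible
# implementation matching the (maps, channels, width, height) filter layout
# used above; the original utility module may differ in detail.
import numpy as np
import theano.tensor as T
from theano.tensor.nnet import conv2d
from theano.tensor.signal.pool import pool_2d


def relu(a):
    return a * (a > 0)


def rearrange(X):
    # MATLAB stores the images as (32, 32, 3, N); Theano wants (N, 3, 32, 32),
    # scaled to [0, 1]
    return (X.transpose(3, 2, 0, 1) / 255.0).astype(np.float32)


def init_filter(shape, poolsz):
    # shape = (num_feature_maps, num_input_maps, filter_width, filter_height)
    w = np.random.randn(*shape) / np.sqrt(
        np.prod(shape[1:]) + shape[0]*np.prod(shape[2:]) / np.prod(poolsz))
    return w.astype(np.float32)


def convpool(X, W, b, poolsize=(2, 2)):
    # valid convolution followed by max-pooling, with one bias per feature map
    conv_out = conv2d(input=X, filters=W)
    pooled_out = pool_2d(input=conv_out, ws=poolsize, ignore_border=True)
    return T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))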
def main():
    Xtrain, Ytrain, Xtest, Ytest = MNISTData().loadFlatData()
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Xtest, Ytest = shuffle(Xtest, Ytest)
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    max_iter = 20
    print_period = 10

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M1 = 1000
    M2 = 500
    K = 10
    W1_init, b1_init = init_weight_and_biases(D, M1)
    W2_init, b2_init = init_weight_and_biases(M1, M2)
    W3_init, b3_init = init_weight_and_biases(M2, K)

    # define tensorflow vars and expressions
    X = tf.placeholder(tf.float32, shape=[None, D], name='X')
    T = tf.placeholder(tf.float32, shape=[None, K], name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))

    Z1 = tf.nn.relu(tf.matmul(X, W1) + b1)
    Z2 = tf.nn.relu(tf.matmul(Z1, W2) + b2)
    Yish = tf.matmul(Z2, W3) + b3

    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))
    train_op = tf.train.RMSPropOptimizer(0.0001, decay=0.99, momentum=0.9).minimize(cost)

    # used for error rate prediction
    predict_op = tf.argmax(Yish, 1)

    LL = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
                Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

                session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                if j % print_period == 0:
                    test_cost = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
                    prediction = session.run(predict_op, feed_dict={X: Xtest, T: Ytest_ind})
                    err = error_rate(prediction, Ytest)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
                    LL.append(test_cost)

    plt.plot(LL)
    plt.show()
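# init_weight_and_biases is used above (and by the CNN fit() methods below) but
# is not shown in this listing. A minimal sketch, assuming Xavier-style scaling
# and float32 outputs; the actual helper may differ.
import numpy as np


def init_weight_and_biases(M1, M2):
    W = np.random.randn(M1, M2) / np.sqrt(M1 + M2)
    b = np.zeros(M2)
    return W.astype(np.float32), b.astype(np.float32)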
def main():
    train = loadmat('../large_files/train_32x32.mat')  # N = 73257
    test = loadmat('../large_files/test_32x32.mat')    # N = 26032

    # Need to scale! don't leave as 0..255
    # Y is an N x 1 matrix with values 1..10 (MATLAB indexes by 1)
    # So flatten it and make it 0..9
    # Also need indicator matrix for cost calculation
    Xtrain = rearrange(train['X'])
    Ytrain = train['y'].flatten() - 1
    print(len(Ytrain))
    del train
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Ytrain_ind = y2indicator(Ytrain)

    Xtest = rearrange(test['X'])
    Ytest = test['y'].flatten() - 1
    del test
    Ytest_ind = y2indicator(Ytest)

    # gradient descent params
    max_iter = 6
    print_period = 10
    N = Xtrain.shape[0]
    batch_sz = 500
    n_batches = N // batch_sz

    # limit samples since input will always have to be same size
    # you could also just do N = N // batch_sz * batch_sz
    Xtrain = Xtrain[:73000,]
    Ytrain = Ytrain[:73000]
    Xtest = Xtest[:26000,]
    Ytest = Ytest[:26000]
    Ytest_ind = Ytest_ind[:26000,]
    # print("Xtest.shape:", Xtest.shape)
    # print("Ytest.shape:", Ytest.shape)

    # initial weights
    M = 500
    K = 10
    poolsz = (2, 2)

    W1_shape = (5, 5, 3, 20)  # (filter_width, filter_height, num_color_channels, num_feature_maps)
    W1_init = init_filter(W1_shape, poolsz)
    b1_init = np.zeros(W1_shape[-1], dtype=np.float32)  # one bias per output feature map

    W2_shape = (5, 5, 20, 50)  # (filter_width, filter_height, old_num_feature_maps, num_feature_maps)
    W2_init = init_filter(W2_shape, poolsz)
    b2_init = np.zeros(W2_shape[-1], dtype=np.float32)

    # vanilla ANN weights
    W3_init = np.random.randn(W2_shape[-1]*8*8, M) / np.sqrt(W2_shape[-1]*8*8 + M)
    b3_init = np.zeros(M, dtype=np.float32)
    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    b4_init = np.zeros(K, dtype=np.float32)

    # define variables and expressions
    # using None as the first shape element takes up too much RAM unfortunately
    X = tf.placeholder(tf.float32, shape=(batch_sz, 32, 32, 3), name='X')
    T = tf.placeholder(tf.float32, shape=(batch_sz, K), name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))
    W4 = tf.Variable(W4_init.astype(np.float32))
    b4 = tf.Variable(b4_init.astype(np.float32))

    Z1 = convpool(X, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    Z2_shape = Z2.get_shape().as_list()
    Z2r = tf.reshape(Z2, [Z2_shape[0], np.prod(Z2_shape[1:])])
    Z3 = tf.nn.relu(tf.matmul(Z2r, W3) + b3)
    Yish = tf.matmul(Z3, W4) + b4

    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))

    train_op = tf.train.RMSPropOptimizer(0.0001, decay=0.99, momentum=0.9).minimize(cost)

    # we'll use this to calculate the error rate
    predict_op = tf.argmax(Yish, 1)

    t0 = datetime.now()
    LL = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
                Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

                if len(Xbatch) == batch_sz:
                    session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                    if j % print_period == 0:
                        # due to RAM limitations we need to have a fixed size input
                        # so as a result, we have this ugly total cost and prediction computation
                        test_cost = 0
                        prediction = np.zeros(len(Xtest))
                        for k in range(len(Xtest) // batch_sz):
                            Xtestbatch = Xtest[k*batch_sz:(k*batch_sz + batch_sz),]
                            Ytestbatch = Ytest_ind[k*batch_sz:(k*batch_sz + batch_sz),]
                            test_cost += session.run(cost, feed_dict={X: Xtestbatch, T: Ytestbatch})
                            prediction[k*batch_sz:(k*batch_sz + batch_sz)] = session.run(
                                predict_op, feed_dict={X: Xtestbatch})
                        err = error_rate(prediction, Ytest)
                        print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
                        LL.append(test_cost)
    print("Elapsed time:", (datetime.now() - t0))
    plt.plot(LL)
    plt.show()
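# The TensorFlow SVHN script assumes its own convpool and init_filter, using the
# (filter_width, filter_height, in_maps, out_maps) layout noted above. A minimal
# sketch, assuming 'SAME' padding and 2x2 max-pooling so each convpool halves
# 32 -> 16 -> 8, matching the W3 input size of 50*8*8; the real helpers may differ.
import numpy as np
import tensorflow as tf


def convpool(X, W, b):
    # stride-1 convolution with 'SAME' padding keeps the spatial size,
    # then 2x2 max-pooling halves it
    conv_out = tf.nn.conv2d(X, W, strides=[1, 1, 1, 1], padding='SAME')
    conv_out = tf.nn.bias_add(conv_out, b)
    pool_out = tf.nn.max_pool(conv_out, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    return tf.nn.relu(pool_out)


def init_filter(shape, poolsz):
    # shape = (filter_width, filter_height, num_input_maps, num_output_maps)
    w = np.random.randn(*shape) / np.sqrt(
        np.prod(shape[:-1]) + shape[-1]*np.prod(shape[:-2]) / np.prod(poolsz))
    return w.astype(np.float32)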
def fit(self, X, Y, lr=1e-3, mu=0.99, reg=1e-3, decay=0.99999, eps=1e-2, batch_sz=30, epochs=3, show_fig=True):
    lr = np.float32(lr)
    mu = np.float32(mu)
    reg = np.float32(reg)
    decay = np.float32(decay)
    eps = np.float32(eps)
    K = len(set(Y))

    # make a validation set
    X, Y = shuffle(X, Y)
    X = X.astype(np.float32)
    Y = y2indicator(Y).astype(np.float32)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    X, Y = X[:-1000], Y[:-1000]
    Yvalid_flat = np.argmax(Yvalid, axis=1)  # for calculating error rate

    # initialize convpool layers
    N, width, height, c = X.shape
    mi = c
    outw = width
    outh = height
    self.convpool_layers = []
    for mo, fw, fh in self.convpool_layer_sizes:
        layer = ConvPoolLayer(mi, mo, fw, fh)
        self.convpool_layers.append(layer)
        outw = outw // 2
        outh = outh // 2
        mi = mo

    # initialize mlp layers
    self.hidden_layers = []
    # size must be same as output of last convpool layer
    M1 = self.convpool_layer_sizes[-1][0] * outw * outh
    count = 0
    for M2 in self.hidden_layer_sizes:
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        M1 = M2
        count += 1

    # logistic regression layer
    W, b = init_weight_and_biases(M1, K)
    self.W = tf.Variable(W, name='W_logreg')
    self.b = tf.Variable(b, name='b_logreg')

    # collect params for later use
    self.params = [self.W, self.b]
    for h in self.convpool_layers:
        self.params += h.params
    for h in self.hidden_layers:
        self.params += h.params

    # set up tensorflow functions and variables
    tfX = tf.placeholder(tf.float32, shape=(None, width, height, c), name='X')
    tfY = tf.placeholder(tf.float32, shape=(None, K), name='Y')
    act = self.forward(tfX)

    rcost = reg * sum([tf.nn.l2_loss(p) for p in self.params])
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=act,
            labels=tfY
        )
    ) + rcost
    prediction = self.predict(tfX)

    train_op = tf.train.RMSPropOptimizer(lr, decay=decay, momentum=mu).minimize(cost)

    n_batches = N // batch_sz
    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j*batch_sz:(j*batch_sz + batch_sz)]
                Ybatch = Y[j*batch_sz:(j*batch_sz + batch_sz)]

                session.run(train_op, feed_dict={tfX: Xbatch, tfY: Ybatch})

                if j % 20 == 0:
                    c = session.run(cost, feed_dict={tfX: Xvalid, tfY: Yvalid})
                    costs.append(c)

                    p = session.run(prediction, feed_dict={tfX: Xvalid, tfY: Yvalid})
                    e = error_rate(Yvalid_flat, p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()
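# The TensorFlow fit() above relies on ConvPoolLayer and HiddenLayer classes
# that are not shown. A minimal sketch consistent with how fit() uses them
# ('SAME'-padded conv + 2x2 max-pool so each layer halves width/height, and a
# params list per layer); the internals here are assumptions, not the original.
import numpy as np
import tensorflow as tf


class ConvPoolLayer(object):
    def __init__(self, mi, mo, fw=5, fh=5, poolsz=(2, 2)):
        # filter layout: (filter_width, filter_height, input_maps, output_maps)
        sz = (fw, fh, mi, mo)
        W0 = (np.random.randn(*sz) / np.sqrt(fw*fh*mi + mo)).astype(np.float32)
        self.W = tf.Variable(W0)
        self.b = tf.Variable(np.zeros(mo, dtype=np.float32))
        self.poolsz = poolsz
        self.params = [self.W, self.b]

    def forward(self, X):
        conv_out = tf.nn.conv2d(X, self.W, strides=[1, 1, 1, 1], padding='SAME')
        conv_out = tf.nn.bias_add(conv_out, self.b)
        p1, p2 = self.poolsz
        pool_out = tf.nn.max_pool(conv_out, ksize=[1, p1, p2, 1], strides=[1, p1, p2, 1], padding='SAME')
        return tf.nn.relu(pool_out)


class HiddenLayer(object):
    def __init__(self, M1, M2, an_id):
        W0 = (np.random.randn(M1, M2) / np.sqrt(M1 + M2)).astype(np.float32)
        self.W = tf.Variable(W0, name='W%d' % an_id)
        self.b = tf.Variable(np.zeros(M2, dtype=np.float32), name='b%d' % an_id)
        self.params = [self.W, self.b]

    def forward(self, X):
        return tf.nn.relu(tf.matmul(X, self.W) + self.b)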
def fit(self, X, Y, lr=1e-4, mu=0.99, reg=1e-6, decay=0.99999, eps=1e-2, batch_sz=30, epochs=100, show_fig=True):
    lr = np.float32(lr)
    mu = np.float32(mu)
    reg = np.float32(reg)
    decay = np.float32(decay)
    eps = np.float32(eps)

    # make a validation set
    X, Y = shuffle(X, Y)
    X = X.astype(np.float32)
    Y = Y.astype(np.int32)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    X, Y = X[:-1000], Y[:-1000]

    # initialize convpool layers
    N, c, width, height = X.shape
    mi = c
    outw = width
    outh = height
    self.convpool_layers = []
    for mo, fw, fh in self.convpool_layer_sizes:
        layer = ConvPoolLayer(mi, mo, fw, fh)
        self.convpool_layers.append(layer)
        outw = (outw - fw + 1) // 2
        outh = (outh - fh + 1) // 2
        mi = mo

    # initialize mlp layers
    K = len(set(Y))
    self.hidden_layers = []
    # size must be same as output of last convpool layer
    M1 = self.convpool_layer_sizes[-1][0]*outw*outh
    count = 0
    for M2 in self.hidden_layer_sizes:
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        M1 = M2
        count += 1

    # logistic regression layer
    W, b = init_weight_and_biases(M1, K)
    self.W = theano.shared(W, 'W_logreg')
    self.b = theano.shared(b, 'b_logreg')

    # collect params for later use
    self.params = [self.W, self.b]
    for layer in self.convpool_layers:
        self.params += layer.params
    for h in self.hidden_layers:
        self.params += h.params

    # for momentum
    dparams = [theano.shared(np.zeros(p.get_value().shape, dtype=np.float32)) for p in self.params]

    # for rmsprop
    cache = [theano.shared(np.zeros(p.get_value().shape, dtype=np.float32)) for p in self.params]

    # set up theano functions and variables
    thX = T.tensor4('X', dtype='float32')
    thY = T.ivector('Y')
    pY = self.forward(thX)

    rcost = reg*T.sum([(p*p).sum() for p in self.params])
    cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost
    prediction = self.th_predict(thX)

    cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction])

    # rmsprop with momentum:
    # updates = [
    #     (c, decay*c + (np.float32(1)-decay)*T.grad(cost, p)*T.grad(cost, p)) for p, c in zip(self.params, cache)
    # ] + [
    #     (p, p + mu*dp - lr*T.grad(cost, p)/T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams)
    # ] + [
    #     (dp, mu*dp - lr*T.grad(cost, p)/T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams)
    # ]

    # momentum only
    updates = [
        (p, p + mu*dp - lr*T.grad(cost, p)) for p, dp in zip(self.params, dparams)
    ] + [
        (dp, mu*dp - lr*T.grad(cost, p)) for p, dp in zip(self.params, dparams)
    ]

    train_op = theano.function(
        inputs=[thX, thY],
        updates=updates
    )

    n_batches = N // batch_sz
    costs = []
    for i in range(epochs):
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            Xbatch = X[j*batch_sz:(j*batch_sz + batch_sz)]
            Ybatch = Y[j*batch_sz:(j*batch_sz + batch_sz)]

            train_op(Xbatch, Ybatch)

            if j % 20 == 0:
                c, p = cost_predict_op(Xvalid, Yvalid)
                costs.append(c)
                e = error_rate(Yvalid, p)
                print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()
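# The Theano fit() above likewise relies on ConvPoolLayer and HiddenLayer
# classes that are not shown. A minimal sketch consistent with its usage
# (valid convolution + 2x2 pooling, matching outw = (outw - fw + 1) // 2, and
# float32 shared variables collected in .params); the details are assumptions.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import conv2d
from theano.tensor.signal.pool import pool_2d


class ConvPoolLayer(object):
    def __init__(self, mi, mo, fw=5, fh=5, poolsz=(2, 2)):
        # filter layout: (output_maps, input_maps, filter_width, filter_height)
        sz = (mo, mi, fw, fh)
        W0 = (np.random.randn(*sz) / np.sqrt(fw*fh*mi + mo)).astype(np.float32)
        self.W = theano.shared(W0)
        self.b = theano.shared(np.zeros(mo, dtype=np.float32))
        self.poolsz = poolsz
        self.params = [self.W, self.b]

    def forward(self, X):
        conv_out = conv2d(input=X, filters=self.W)
        pooled_out = pool_2d(input=conv_out, ws=self.poolsz, ignore_border=True)
        return T.nnet.relu(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))


class HiddenLayer(object):
    def __init__(self, M1, M2, an_id):
        W0 = (np.random.randn(M1, M2) / np.sqrt(M1 + M2)).astype(np.float32)
        self.W = theano.shared(W0, 'W_%d' % an_id)
        self.b = theano.shared(np.zeros(M2, dtype=np.float32), 'b_%d' % an_id)
        self.params = [self.W, self.b]

    def forward(self, X):
        return T.nnet.relu(X.dot(self.W) + self.b)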