Example #1
    def fit(self,
            X,
            Y,
            learning_rate=5e-6,
            reg=1.0,
            epochs=10000,
            show_fig=False,
            use_tanh=True):
        self.use_tanh = use_tanh

        X, Y = shuffle(X, Y)
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        X, Y = X[:-1000], Y[:-1000]

        N, D = X.shape
        self.W1 = np.random.randn(D, self.M) / np.sqrt(D + self.M)
        self.b1 = np.zeros(self.M)
        self.W2 = np.random.randn(self.M) / np.sqrt(self.M)
        self.b2 = 0

        costs = []
        best_validation_error = 1

        for i in range(epochs):
            # forward propagation
            pY, Z = self.forward(X)

            # gradient descent
            pY_Y = pY - Y
            self.W2 -= learning_rate * (Z.T.dot(pY_Y) + reg * self.W2)
            self.b2 -= learning_rate * ((pY_Y).sum() + reg * self.b2)

            if self.use_tanh:
                dZ = np.outer(pY_Y, self.W2) * (1 - Z * Z)
            else:
                dZ = np.outer(pY_Y, self.W2) * (Z > 0)
            self.W1 -= learning_rate * (X.T.dot(dZ) + reg * self.W1)
            self.b1 -= learning_rate * (dZ.sum(axis=0) + reg * self.b1)

            if i % 20 == 0:
                pYvalid, _ = self.forward(Xvalid)
                c = sigmoid_cost(Yvalid, pYvalid)
                costs.append(c)
                e = error_rate(Yvalid, np.round(pYvalid))
                print("i:", i, "cost:", c, "error", e)
                if e < best_validation_error:
                    best_validation_error = e
        print("best validation error:", best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.show()
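This example calls `forward`, `sigmoid_cost`, and `error_rate` without defining them. A minimal sketch of what they could look like, assuming the usual binary cross-entropy conventions (the exact definitions are assumptions, not part of the snippet):

import numpy as np

def sigmoid(a):
    return 1 / (1 + np.exp(-a))

def sigmoid_cost(T, Y):
    # total binary cross-entropy; T = targets, Y = predicted probabilities
    return -(T * np.log(Y) + (1 - T) * np.log(1 - Y)).sum()

def error_rate(targets, predictions):
    return np.mean(targets != predictions)

# and, inside the class:
    def forward(self, X):
        # hidden layer uses tanh or relu depending on self.use_tanh
        if self.use_tanh:
            Z = np.tanh(X.dot(self.W1) + self.b1)
        else:
            Z = np.maximum(X.dot(self.W1) + self.b1, 0)
        return sigmoid(Z.dot(self.W2) + self.b2), Z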
Example #2
    def fit(self,
            X,
            Y,
            learning_rate=1e-5,
            reg=1.0,
            epochs=10000,
            show_fig=False):
        X, Y = shuffle(X, Y)
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        X, Y = X[:-1000], Y[:-1000]

        N, D = X.shape
        K = len(set(Y))
        T = y2indicator(Y)

        self.W1 = np.random.randn(D, self.M) / np.sqrt(D + self.M)
        self.b1 = np.zeros(self.M)
        self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M + K)
        self.b2 = np.zeros(K)

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            pY, Z = self.forward(X)

            # gradient descent step
            pY_T = pY - T
            self.W2 -= learning_rate * (Z.T.dot(pY_T) + reg * self.W2)
            self.b2 -= learning_rate * (pY_T.sum(axis=0) + reg * self.b2)

            #dZ = pY_T.dot(self.W2.T) * (Z > 0)
            dZ = pY_T.dot(self.W2.T) * (1 - Z * Z)
            self.W1 -= learning_rate * (X.T.dot(dZ) + reg * self.W1)
            self.b1 -= learning_rate * (dZ.sum(axis=0) + reg * self.b1)

            if i % 10 == 0:
                pYvalid, _ = self.forward(Xvalid)
                c = cost2(Yvalid, pYvalid)
                costs.append(c)
                e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
                print("i:", i, "cost:", c, "error:", e)

                if e < best_validation_error:
                    best_validation_error = e
        print("Best validation error: ", best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.show()
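This multiclass variant additionally relies on `y2indicator` and `cost2`. A plausible sketch, assuming `cost2` takes integer labels and an N x K probability matrix (signatures are assumptions):

import numpy as np

def y2indicator(y):
    # convert integer labels 0..K-1 into a one-hot N x K matrix
    N = len(y)
    K = len(set(y))
    ind = np.zeros((N, K))
    ind[np.arange(N), y] = 1
    return ind

def cost2(T, Y):
    # mean cross-entropy from integer labels T and predicted probabilities Y
    N = len(T)
    return -np.log(Y[np.arange(N), T]).mean()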
Example #3
    def fit(self,
            X,
            Y,
            learning_rate=1e-7,
            reg=1e-11,
            epochs=10000,
            show_fig=False):
        X, Y = shuffle(X, Y)
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        Tvalid = y2indicator(Yvalid)
        X, Y = X[:-1000], Y[:-1000]

        N, D = X.shape
        K = len(set(Y))
        T = y2indicator(Y)

        self.W = np.random.randn(D, K) / np.sqrt(D + K)
        self.b = np.zeros(K)

        costs = []
        best_validation_error = 1

        for i in range(epochs):
            # forward prop
            pY = self.forward(X)

            # gradient descent
            self.W -= learning_rate * (X.T.dot(pY - T) + reg * self.W)
            self.b -= learning_rate * ((pY - T).sum(axis=0) + reg * self.b)

            if i % 10 == 0:
                pYvalid = self.forward(Xvalid)
                c = cost(Tvalid, pYvalid)
                costs.append(c)
                e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
                print("i:", i, "cost:", c, "error:", e)

                if e < best_validation_error:
                    best_validation_error = e
        print("best validation error:", best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.show()
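For reference, a `forward` consistent with this softmax model, together with the one-hot cross-entropy `cost` it calls (a sketch; both are assumed utilities):

import numpy as np

def softmax(A):
    expA = np.exp(A - A.max(axis=1, keepdims=True))  # shift for numerical stability
    return expA / expA.sum(axis=1, keepdims=True)

def cost(T, Y):
    # total cross-entropy against a one-hot target matrix T
    return -(T * np.log(Y)).sum()

# and, inside the class:
    def forward(self, X):
        return softmax(X.dot(self.W) + self.b)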
Example #4
    def fit(self,
            X,
            Y,
            learning_rate=1e-6,
            reg=0,
            epochs=120000,
            show_fig=False):
        X, Y = shuffle(X, Y)
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        X, Y = X[:-1000], Y[:-1000]

        N, D = X.shape
        self.W = np.random.randn(D) / np.sqrt(D)
        self.b = 0

        costs = []
        best_validation_error = 1

        for i in range(epochs):
            pY = self.forward(X)

            # gradient descent
            self.W -= learning_rate * (X.T.dot(pY - Y) + reg * self.W)
            self.b -= learning_rate * ((pY - Y).sum() + reg * self.b)

            if i % 20 == 0:
                pYvalid = self.forward(Xvalid)
                c = sigmoid_cost(Yvalid, pYvalid)
                costs.append(c)
                e = error_rate(Yvalid, np.round(pYvalid))

                print("i:", i, "cost:", c, "error:", e)

                if e < best_validation_error:
                    best_validation_error = e

        print("Best validation error:", best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.show()
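The update `X.T.dot(pY - Y)` works because the sigmoid and cross-entropy derivatives cancel, leaving the gradient X^T (p - y). A quick standalone numerical check of that identity (shapes and seed are arbitrary):

import numpy as np

np.random.seed(0)
X = np.random.randn(10, 3)
Y = np.random.randint(0, 2, 10).astype(float)
W = np.random.randn(3)

def sigmoid(a):
    return 1 / (1 + np.exp(-a))

def J(W):
    p = sigmoid(X.dot(W))
    return -(Y * np.log(p) + (1 - Y) * np.log(1 - p)).sum()

analytic = X.T.dot(sigmoid(X.dot(W)) - Y)
eps = 1e-6
numeric = np.array([(J(W + eps * np.eye(3)[k]) - J(W - eps * np.eye(3)[k])) / (2 * eps)
                    for k in range(3)])
print(np.allclose(analytic, numeric))  # expect True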
Example #5
    def score(self, X, Y):
        prediction = self.predict(X)
        return 1 - error_rate(Y, prediction)
Example #6
def main():
    # load the data, transform as needed
    train = loadmat('../large_files/train_32x32.mat')
    test  = loadmat('../large_files/test_32x32.mat')

    # Need to scale! don't leave as 0..255
    # Y is a N x 1 matrix with values 1..10 (MATLAB indexes by 1)
    # So flatten it and make it 0..9
    # Also need indicator matrix for cost calculation
    Xtrain = rearrange(train['X'])
    Ytrain = train['y'].flatten() - 1
    del train
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Ytrain_ind = y2indicator(Ytrain)

    Xtest  = rearrange(test['X'])
    Ytest  = test['y'].flatten() - 1
    del test
    Ytest_ind  = y2indicator(Ytest)


    max_iter = 8
    print_period = 10

    lr = np.float32(0.00001)
    reg = np.float32(0.01)
    mu = np.float32(0.99)

    N = Xtrain.shape[0]
    batch_sz = 500
    n_batches = N // batch_sz

    M = 500
    K = 10
    poolsz = (2, 2)

    # after conv will be of dimension 32 - 5 + 1 = 28
    # after downsample 28 / 2 = 14
    W1_shape = (20, 3, 5, 5) # (num_feature_maps, num_color_channels, filter_width, filter_height)
    W1_init = init_filter(W1_shape, poolsz)
    b1_init = np.zeros(W1_shape[0], dtype=np.float32) # one bias per output feature map

    # after conv will be of dimension 14 - 5 + 1 = 10
    # after downsample 10 / 2 = 5
    W2_shape = (50, 20, 5, 5) # (num_feature_maps, old_num_feature_maps, filter_width, filter_height)
    W2_init = init_filter(W2_shape, poolsz)
    b2_init = np.zeros(W2_shape[0], dtype=np.float32)

    # vanilla ANN weights
    W3_init = np.random.randn(W2_shape[0]*5*5, M) / np.sqrt(W2_shape[0]*5*5 + M)
    b3_init = np.zeros(M, dtype=np.float32)
    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    b4_init = np.zeros(K, dtype=np.float32)


    # define theano variables and expressions
    X = T.tensor4('X', dtype='float32')
    Y = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')
    W3 = theano.shared(W3_init.astype(np.float32), 'W3')
    b3 = theano.shared(b3_init, 'b3')
    W4 = theano.shared(W4_init.astype(np.float32), 'W4')
    b4 = theano.shared(b4_init, 'b4')

    # momentum changes
    dW1 = theano.shared(np.zeros(W1_init.shape, dtype=np.float32), 'dW1')
    db1 = theano.shared(np.zeros(b1_init.shape, dtype=np.float32), 'db1')
    dW2 = theano.shared(np.zeros(W2_init.shape, dtype=np.float32), 'dW2')
    db2 = theano.shared(np.zeros(b2_init.shape, dtype=np.float32), 'db2')
    dW3 = theano.shared(np.zeros(W3_init.shape, dtype=np.float32), 'dW3')
    db3 = theano.shared(np.zeros(b3_init.shape, dtype=np.float32), 'db3')
    dW4 = theano.shared(np.zeros(W4_init.shape, dtype=np.float32), 'dW4')
    db4 = theano.shared(np.zeros(b4_init.shape, dtype=np.float32), 'db4')

    # forward pass
    Z1 = convpool(X, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    Z3 = relu(Z2.flatten(ndim=2).dot(W3) + b3)
    pY = T.nnet.softmax( Z3.dot(W4) + b4)

    # define the cost function and prediction
    params = (W1, b1, W2, b2, W3, b3, W4, b4)
    reg_cost = reg * sum((param * param).sum() for param in params)
    cost = -(Y * T.log(pY)).sum() + reg_cost
    prediction = T.argmax(pY, axis=1)

    # step 3: training expressions and functions
    update_W1 = W1 + mu*dW1 - lr*T.grad(cost, W1)
    update_b1 = b1 + mu*db1 - lr*T.grad(cost, b1)
    update_W2 = W2 + mu*dW2 - lr*T.grad(cost, W2)
    update_b2 = b2 + mu*db2 - lr*T.grad(cost, b2)
    update_W3 = W3 + mu*dW3 - lr*T.grad(cost, W3)
    update_b3 = b3 + mu*db3 - lr*T.grad(cost, b3)
    update_W4 = W4 + mu*dW4 - lr*T.grad(cost, W4)
    update_b4 = b4 + mu*db4 - lr*T.grad(cost, b4)

    # update weight changes
    update_dW1 = mu*dW1 - lr*T.grad(cost, W1)
    update_db1 = mu*db1 - lr*T.grad(cost, b1)
    update_dW2 = mu*dW2 - lr*T.grad(cost, W2)
    update_db2 = mu*db2 - lr*T.grad(cost, b2)
    update_dW3 = mu*dW3 - lr*T.grad(cost, W3)
    update_db3 = mu*db3 - lr*T.grad(cost, b3)
    update_dW4 = mu*dW4 - lr*T.grad(cost, W4)
    update_db4 = mu*db4 - lr*T.grad(cost, b4)

    train = theano.function(
        inputs=[X, Y],
        updates=[
            (W1, update_W1),
            (b1, update_b1),
            (W2, update_W2),
            (b2, update_b2),
            (W3, update_W3),
            (b3, update_b3),
            (W4, update_W4),
            (b4, update_b4),
            (dW1, update_dW1),
            (db1, update_db1),
            (dW2, update_dW2),
            (db2, update_db2),
            (dW3, update_dW3),
            (db3, update_db3),
            (dW4, update_dW4),
            (db4, update_db4),
        ],
    )

    # create another function for this because we want it over the whole dataset
    get_prediction = theano.function(
        inputs=[X, Y],
        outputs=[cost, prediction],
    )

    t0 = datetime.now()
    LL = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

            train(Xbatch, Ybatch)
            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                err = error_rate(prediction_val, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, err))
                LL.append(cost_val)
    print("Elapsed time:", (datetime.now() - t0))
    plt.plot(LL)
    plt.show()

    # visualize W1 (20, 3, 5, 5)
    W1_val = W1.get_value()
    grid = np.zeros((8*5, 8*5))
    m = 0
    n = 0
    for i in range(20):
        for j in range(3):
            filt = W1_val[i,j]
            grid[m*5:(m+1)*5,n*5:(n+1)*5] = filt
            m += 1
            if m >= 8:
                m = 0
                n += 1
    plt.imshow(grid, cmap='gray')
    plt.title("W1")
    plt.show()

    # visualize W2 (50, 20, 5, 5)
    W2_val = W2.get_value()
    grid = np.zeros((32*5, 32*5))
    m = 0
    n = 0
    for i in range(50):
        for j in range(20):
            filt = W2_val[i,j]
            grid[m*5:(m+1)*5,n*5:(n+1)*5] = filt
            m += 1
            if m >= 32:
                m = 0
                n += 1
    plt.imshow(grid, cmap='gray')
    plt.title("W2")
    plt.show()
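`rearrange`, `init_filter`, `convpool`, and `relu` come from elsewhere in the script. A sketch of plausible definitions matching the shapes used above (the initialization scale and pooling call are assumptions; older Theano versions spell the `ws` argument as `ds`):

import numpy as np
from theano.tensor.nnet import conv2d
from theano.tensor.signal.pool import pool_2d

def relu(a):
    return a * (a > 0)

def convpool(X, W, b, poolsize=(2, 2)):
    conv_out = conv2d(input=X, filters=W)  # 'valid' convolution: 32 -> 28, 14 -> 10
    pooled = pool_2d(input=conv_out, ws=poolsize, ignore_border=True)  # 28 -> 14, 10 -> 5
    return relu(pooled + b.dimshuffle('x', 0, 'x', 'x'))  # one bias per feature map

def init_filter(shape, poolsz):
    # scale by approximate fan-in + fan-out of the conv layer
    w = np.random.randn(*shape) / np.sqrt(np.prod(shape[1:]) + shape[0] * np.prod(shape[2:]) / np.prod(poolsz))
    return w.astype(np.float32)

def rearrange(X):
    # MATLAB stores images as (32, 32, 3, N); Theano wants (N, 3, 32, 32) scaled to [0, 1]
    return (X.transpose(3, 2, 0, 1) / np.float32(255)).astype(np.float32)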
Example #7
def main():
	Xtrain, Ytrain, Xtest, Ytest = MNISTData().loadFlatData()

	Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
	Xtest, Ytest = shuffle(Xtest, Ytest)

	Ytrain_ind = y2indicator(Ytrain)
	Ytest_ind = y2indicator(Ytest)


	max_iter = 20
	print_period = 10
	N, D = Xtrain.shape
	batch_sz = 500
	n_batches = N // batch_sz

	M1 = 1000
	M2 = 500
	K = 10
	W1_init, b1_init = init_weight_and_biases(D, M1)
	W2_init, b2_init = init_weight_and_biases(M1, M2)
	W3_init, b3_init = init_weight_and_biases(M2, K)

	# define tensorflow vars and expressions
	X = tf.placeholder(tf.float32, shape=[None, D], name='X')
	T = tf.placeholder(tf.float32, shape=[None, K], name='T')
	W1 = tf.Variable(W1_init.astype(np.float32))
	b1 = tf.Variable(b1_init.astype(np.float32))
	W2 = tf.Variable(W2_init.astype(np.float32))
	b2 = tf.Variable(b2_init.astype(np.float32))
	W3 = tf.Variable(W3_init.astype(np.float32))
	b3 = tf.Variable(b3_init.astype(np.float32))

	Z1 = tf.nn.relu(tf.matmul(X, W1) + b1)
	Z2 = tf.nn.relu(tf.matmul(Z1,  W2) + b2)

	Yish = tf.matmul(Z2, W3) + b3
	cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))

	train_op = tf.train.RMSPropOptimizer(0.0001, decay=0.99, momentum=0.9).minimize(cost)

	# used for error rate prediction
	predict_op = tf.argmax(Yish, 1)

	LL = []
	init = tf.global_variables_initializer()
	with tf.Session() as session:
		session.run(init)

		for i in range(max_iter):
			for j in range(n_batches):
				Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
				Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]

				session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
				if j % print_period == 0:
					test_cost = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
					prediction = session.run(predict_op, feed_dict={X: Xtest, T: Ytest_ind})
					err = error_rate(prediction, Ytest)

					print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
					LL.append(test_cost)

	plt.plot(LL)
	plt.show()
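`init_weight_and_biases` and `error_rate` are assumed helpers; minimal sketches consistent with how they are used here:

import numpy as np

def init_weight_and_biases(M1, M2):
    W = np.random.randn(M1, M2) / np.sqrt(M1 + M2)
    b = np.zeros(M2)
    return W.astype(np.float32), b.astype(np.float32)

def error_rate(predictions, targets):
    return np.mean(predictions != targets)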
Example #8
def main():
    train = loadmat('../large_files/train_32x32.mat')  # N = 73257
    test = loadmat('../large_files/test_32x32.mat')  # N = 26032

    # Need to scale! don't leave as 0..255
    # Y is a N x 1 matrix with values 1..10 (MATLAB indexes by 1)
    # So flatten it and make it 0..9
    # Also need indicator matrix for cost calculation
    Xtrain = rearrange(train['X'])
    Ytrain = train['y'].flatten() - 1
    print(len(Ytrain))
    del train
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Ytrain_ind = y2indicator(Ytrain)

    Xtest = rearrange(test['X'])
    Ytest = test['y'].flatten() - 1
    del test
    Ytest_ind = y2indicator(Ytest)

    # gradient descent params
    max_iter = 6
    print_period = 10
    N = Xtrain.shape[0]
    batch_sz = 500
    n_batches = N // batch_sz

    # limit samples since input will always have to be same size
    # you could also just do N = N / batch_sz * batch_sz
    Xtrain = Xtrain[:73000, ]
    Ytrain = Ytrain[:73000]
    Xtest = Xtest[:26000, ]
    Ytest = Ytest[:26000]
    Ytest_ind = Ytest_ind[:26000, ]
    # print "Xtest.shape:", Xtest.shape
    # print "Ytest.shape:", Ytest.shape

    # initial weights
    M = 500
    K = 10
    poolsz = (2, 2)

    W1_shape = (
        5, 5, 3, 20
    )  # (filter_width, filter_height, num_color_channels, num_feature_maps)
    W1_init = init_filter(W1_shape, poolsz)
    b1_init = np.zeros(W1_shape[-1],
                       dtype=np.float32)  # one bias per output feature map

    W2_shape = (
        5, 5, 20, 50
    )  # (filter_width, filter_height, old_num_feature_maps, num_feature_maps)
    W2_init = init_filter(W2_shape, poolsz)
    b2_init = np.zeros(W2_shape[-1], dtype=np.float32)

    # vanilla ANN weights
    W3_init = np.random.randn(W2_shape[-1] * 8 * 8,
                              M) / np.sqrt(W2_shape[-1] * 8 * 8 + M)
    b3_init = np.zeros(M, dtype=np.float32)
    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    b4_init = np.zeros(K, dtype=np.float32)

    # define variables and expressions
    # using None as the first shape element takes up too much RAM unfortunately
    X = tf.placeholder(tf.float32, shape=(batch_sz, 32, 32, 3), name='X')
    T = tf.placeholder(tf.float32, shape=(batch_sz, K), name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))
    W4 = tf.Variable(W4_init.astype(np.float32))
    b4 = tf.Variable(b4_init.astype(np.float32))

    Z1 = convpool(X, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    Z2_shape = Z2.get_shape().as_list()
    Z2r = tf.reshape(Z2, [Z2_shape[0], np.prod(Z2_shape[1:])])
    Z3 = tf.nn.relu(tf.matmul(Z2r, W3) + b3)
    Yish = tf.matmul(Z3, W4) + b4

    cost = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))

    train_op = tf.train.RMSPropOptimizer(0.0001, decay=0.99,
                                         momentum=0.9).minimize(cost)

    # we'll use this to calculate the error rate
    predict_op = tf.argmax(Yish, 1)

    t0 = datetime.now()
    LL = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
                Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]

                if len(Xbatch) == batch_sz:
                    session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                    if j % print_period == 0:
                        # due to RAM limitations we need to have a fixed size input
                        # so as a result, we have this ugly total cost and prediction computation
                        test_cost = 0
                        prediction = np.zeros(len(Xtest))
                        for k in range(len(Xtest) // batch_sz):
                            Xtestbatch = Xtest[k * batch_sz:(k * batch_sz + batch_sz), ]
                            Ytestbatch = Ytest_ind[k * batch_sz:(k * batch_sz + batch_sz), ]
                            test_cost += session.run(cost, feed_dict={X: Xtestbatch, T: Ytestbatch})
                            prediction[k * batch_sz:(k * batch_sz + batch_sz)] = session.run(
                                predict_op, feed_dict={X: Xtestbatch})
                        err = error_rate(prediction, Ytest)
                        print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
                        LL.append(test_cost)
    print("Elapsed time:", (datetime.now() - t0))
    plt.plot(LL)
    plt.show()
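In this TensorFlow version, `convpool` and `init_filter` work on NHWC tensors with 'SAME' padding, which is what makes the flattened size 50 * 8 * 8 (32 -> 16 -> 8 after two 2x2 pools). A sketch under those assumptions:

import numpy as np
import tensorflow as tf

def convpool(X, W, b):
    # stride-1 'SAME' convolution keeps spatial size; 2x2 max pool halves it
    conv_out = tf.nn.conv2d(X, W, strides=[1, 1, 1, 1], padding='SAME')
    conv_out = tf.nn.bias_add(conv_out, b)
    pool_out = tf.nn.max_pool(conv_out, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    return tf.nn.relu(pool_out)

def init_filter(shape, poolsz):
    # fan-in based scaling; the exact scheme is an assumption
    w = np.random.randn(*shape) * np.sqrt(2.0 / np.prod(shape[:-1]))
    return w.astype(np.float32)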
Example #9
	def fit(self, X, Y, lr=1e-3, mu=0.99, reg=1e-3, decay=0.99999, eps=1e-2, batch_sz=30, epochs=3, show_fig=True):
		lr = np.float32(lr)
		mu = np.float32(mu)
		reg = np.float32(reg)
		decay = np.float32(decay)
		eps = np.float32(eps)
		K = len(set(Y))

		# make a validation set
		X, Y = shuffle(X, Y)
		X = X.astype(np.float32)
		Y = y2indicator(Y).astype(np.float32)

		Xvalid, Yvalid = X[-1000:], Y[-1000:]
		X, Y = X[:-1000], Y[:-1000]
		Yvalid_flat = np.argmax(Yvalid, axis=1)  # for calculating error rate

		# initialize convpool layers
		N, width, height, c = X.shape
		mi = c
		outw = width
		outh = height
		self.convpool_layers = []
		for mo, fw, fh in self.convpool_layer_sizes:
			layer = ConvPoolLayer(mi, mo, fw, fh)
			self.convpool_layers.append(layer)
			outw = outw // 2
			outh = outh // 2
			mi = mo

		# initialize mlp layers
		self.hidden_layers = []
		M1 = self.convpool_layer_sizes[-1][0] * outw * outh  # size must be same as output of last convpool layer
		count = 0
		for M2 in self.hidden_layer_sizes:
			h = HiddenLayer(M1, M2, count)
			self.hidden_layers.append(h)
			M1 = M2
			count += 1

		# logistic regression layer
		W, b = init_weight_and_biases(M1, K)
		self.W = tf.Variable(W, name='W_logreg')
		self.b = tf.Variable(b, name='b_logreg')

		# collect params for later use
		self.params = [self.W, self.b]
		for h in self.convpool_layers:
			self.params += h.params
		for h in self.hidden_layers:
			self.params += h.params

		# set up tensorflow functions and variables
		tfX = tf.placeholder(tf.float32, shape=(None, width, height, c), name='X')
		tfY = tf.placeholder(tf.float32, shape=(None, K), name='Y')
		act = self.forward(tfX)

		rcost = reg * sum([tf.nn.l2_loss(p) for p in self.params])
		cost = tf.reduce_mean(
			tf.nn.softmax_cross_entropy_with_logits(
				logits=act,
				labels=tfY
			)
		) + rcost
		prediction = self.predict(tfX)

		train_op = tf.train.RMSPropOptimizer(lr, decay=decay, momentum=mu).minimize(cost)

		n_batches = N // batch_sz
		costs = []
		init = tf.global_variables_initializer()
		with tf.Session() as session:
			session.run(init)
			for i in range(epochs):
				X, Y = shuffle(X, Y)
				for j in range(n_batches):
					Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
					Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

					session.run(train_op, feed_dict={tfX: Xbatch, tfY: Ybatch})

					if j % 20 == 0:
						c = session.run(cost, feed_dict={tfX: Xvalid, tfY: Yvalid})
						costs.append(c)

						p = session.run(prediction, feed_dict={tfX: Xvalid, tfY: Yvalid})
						e = error_rate(Yvalid_flat, p)
						print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

		if show_fig:
			plt.plot(costs)
			plt.show()
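A hypothetical usage sketch for this class (the class name `CNN` and the layer sizes are assumptions; each `convpool_layer_sizes` entry is (feature_maps, filter_width, filter_height)):

model = CNN(
    convpool_layer_sizes=[(20, 5, 5), (50, 5, 5)],
    hidden_layer_sizes=[500, 300],
)
# X has shape (N, width, height, channels); Y holds integer class labels
model.fit(X, Y, batch_sz=30, epochs=3, show_fig=True)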
Example #10
    def fit(self, X, Y, lr=1e-4, mu=0.99, reg=1e-6, decay=0.99999, eps=1e-2, batch_sz=30, epochs=100, show_fig=True):
        lr = np.float32(lr)
        mu = np.float32(mu)
        reg = np.float32(reg)
        decay = np.float32(decay)
        eps = np.float32(eps)

        # make a validation set
        X, Y = shuffle(X, Y)
        X = X.astype(np.float32)
        Y = Y.astype(np.int32)
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        X, Y = X[:-1000], Y[:-1000]

        # initialize convpool layers
        N, c, width, height = X.shape
        mi = c
        outw = width
        outh = height
        self.convpool_layers = []
        for mo, fw, fh in self.convpool_layer_sizes:
            layer = ConvPoolLayer(mi, mo, fw, fh)
            self.convpool_layers.append(layer)
            outw = (outw - fw + 1) // 2
            outh = (outh - fh + 1) // 2
            mi = mo

        # initialize mlp layers
        K = len(set(Y))
        self.hidden_layers = []
        M1 = self.convpool_layer_sizes[-1][0]*outw*outh # size must be same as output of last convpool layer
        count = 0
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2
            count += 1

        # logistic regression layer
        W, b = init_weight_and_biases(M1, K)
        self.W = theano.shared(W, 'W_logreg')
        self.b = theano.shared(b, 'b_logreg')

        # collect params for later use
        self.params = [self.W, self.b]
        for cp in self.convpool_layers:
            self.params += cp.params
        for h in self.hidden_layers:
            self.params += h.params

        # for momentum
        dparams = [theano.shared(np.zeros(p.get_value().shape, dtype=np.float32)) for p in self.params]

        # for rmsprop
        cache = [theano.shared(np.zeros(p.get_value().shape, dtype=np.float32)) for p in self.params]

        # set up theano functions and variables
        thX = T.tensor4('X', dtype='float32')
        thY = T.ivector('Y')
        pY = self.forward(thX)

        rcost = reg*T.sum([(p*p).sum() for p in self.params])
        cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost
        prediction = self.th_predict(thX)

        cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction])

        # updates = [
        #     (c, decay*c + (np.float32(1)-decay)*T.grad(cost, p)*T.grad(cost, p)) for p, c in zip(self.params, cache)
        # ] + [
        #     (p, p + mu*dp - lr*T.grad(cost, p)/T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams)
        # ] + [
        #     (dp, mu*dp - lr*T.grad(cost, p)/T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams)
        # ]

        # momentum only
        updates = [
            (p, p + mu*dp - lr*T.grad(cost, p)) for p, dp in zip(self.params, dparams)
        ] + [
            (dp, mu*dp - lr*T.grad(cost, p)) for p, dp in zip(self.params, dparams)
        ]

        train_op = theano.function(
            inputs=[thX, thY],
            updates=updates
        )

        n_batches = N // batch_sz
        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)]
                Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)]

                train_op(Xbatch, Ybatch)

                if j % 20 == 0:
                    c, p = cost_predict_op(Xvalid, Yvalid)
                    costs.append(c)
                    e = error_rate(Yvalid, p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

        if show_fig:
            plt.plot(costs)
            plt.show()
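The commented-out RMSprop block would call `T.grad(cost, p)` several times per parameter; an equivalent sketch that computes each gradient once (this uses the freshly updated cache in the step, a common RMSprop-with-momentum variant, and would replace the momentum-only updates inside fit):

grads = T.grad(cost, self.params)
updates = []
for p, dp, c_, g in zip(self.params, dparams, cache, grads):
    new_cache = decay * c_ + (np.float32(1) - decay) * g * g
    new_dp = mu * dp - lr * g / T.sqrt(new_cache + eps)
    updates += [(c_, new_cache), (dp, new_dp), (p, p + new_dp)]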