Example #1
def grid():

	X,Y = transform_data()
	X,Y = shuffle(X,Y)
	N = len(X)//2
	Xtrain = X[:N]
	Ytrain = Y[:N]
	Ttrain = generate_T(Ytrain)
	Xtest = X[N:]
	Ytest = Y[N:]
	Ttest = generate_T(Ytest)
	N,D = Xtrain.shape
	K = len(set(Y))
	w0 = np.random.randn(D,K)/np.sqrt(D+K)
	b0 = np.random.randn(K)/np.sqrt(K)
	learning_rates = [10**i for i in range(-7,-3,1)]
	momentums = [1-10**i for i in sorted(list(range(-4,0)),reverse=True)]
	iterations = 2000
	best_lr = 0
	best_mu = 0
	best_cr = 0
	cost = {}
	cr = {}
	for lr in learning_rates:
		learning_rate = lr
		for mu in momentums:
			dw = 0
			db = 0
			cost[(lr,mu)] = list()
			cr[(lr,mu)] = list()
			for i in range(iterations):
				if i == 0:
					A_train = relu(Xtrain.dot(w0) + b0)
					A_test = relu(Xtest.dot(w0) + b0)
				else:
					A_train = relu(Xtrain.dot(w) + b)
					A_test = relu(Xtest.dot(w) + b)
				Y_train = np.exp(A_train)/np.exp(A_train).sum(axis=1,keepdims=True)
				Y_test = np.exp(A_test)/np.exp(A_test).sum(axis=1,keepdims=True)
				P_test = np.argmax(Y_test,axis=1)
				cost[(lr,mu)].append(cross_entropy(Y_test,Ttest))
				current_cr = classification_rate(P_test,Ytest)
				cr[(lr,mu)].append(current_cr)
				if current_cr > best_cr:
					best_cr = current_cr
					best_lr = lr
					best_mu = mu
				dw = mu*dw - (1-mu)*learning_rate*derivative_w(Xtrain,Y_train,Ttrain)
				db = mu*db - (1-mu)*learning_rate*derivative_b(Y_train,Ttrain)
				if i == 0:
					w = w0 + dw
					b = b0 + db
				else:
					w += dw
					b += db
				if i % 100 == 0:
					print('Learning Rate: ',lr,'Momentum: ',mu,'Cost: ',cost[(lr,mu)][i],'Classification Rate: ',cr[(lr,mu)][i])
				if i == (iterations - 1):
					print('')
	return cost,cr,best_lr,best_mu,best_cr
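
The grid search above leans on helpers (transform_data, relu, cross_entropy, derivative_w, derivative_b, classification_rate) that are defined elsewhere. For reference, here is a minimal, self-contained sketch of the same momentum update (dw = mu*dw - (1-mu)*lr*grad) on a toy softmax regression with no hidden activation; the data and every name below are illustrative, not taken from the original helpers.

import numpy as np

# Toy data: 300 points, 2 features, 3 classes (illustrative only).
rng = np.random.default_rng(0)
X = rng.normal(size=(300, 2))
y = rng.integers(0, 3, size=300)
T = np.eye(3)[y]                                   # one-hot targets

W = rng.normal(size=(2, 3)) / np.sqrt(2 + 3)
b = np.zeros(3)
dW = db = 0.0
lr, mu = 1e-2, 0.9

for i in range(200):
    A = X.dot(W) + b
    expA = np.exp(A - A.max(axis=1, keepdims=True))
    P = expA / expA.sum(axis=1, keepdims=True)     # softmax output
    gW = X.T.dot(P - T) / len(X)                   # gradient of mean cross-entropy
    gb = (P - T).mean(axis=0)
    dW = mu * dW - (1 - mu) * lr * gW              # same momentum form as the grid search
    db = mu * db - (1 - mu) * lr * gb
    W += dW
    b += db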
Example #2
def exp_decay(learning_rate):

    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)
    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N
    dv = 0
    d_b1 = 0
    dw = 0
    d_b0 = 0
    mu = .9
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)
    lr_0 = learning_rate  # keep the initial rate; the schedule below decays from it
    decay_k = 1e-3  # decay constant for the exponential schedule (assumed placeholder)
    exp_cost = []
    exp_cr = []
    exp_lr = []
    best_exp = 0
    best_iteration = 0
    for i in range(iterations):
        learning_rate = lr_0 * np.exp(-decay_k * i)
        exp_lr.append(learning_rate)
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b % batches == 0:
                exp_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                exp_cr.append(cr)
                if cr > best_exp:
                    best_exp = cr
                    best_iteration = i
            dv = mu * dv - learning_rate * derivative_v('tanh', Z, Y, T)
            d_b1 = mu * d_b1 - learning_rate * derivative_b1('tanh', Y, T)
            dw = mu * dw - learning_rate * derivative_w('tanh', X, Y, Z, T, v)
            d_b0 = mu * d_b0 - learning_rate * derivative_b0(
                'tanh', Y, Z, T, v)
            v += dv
            b_1 += d_b1
            w += dw
            b_0 += d_b0
        if i % 10 == 0:
            print('Exp Cost: ', exp_cost[i], 'Exp Classification: ', exp_cr[i])
    return exp_cost, exp_cr, exp_lr, best_exp, best_iteration
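
The schedule used above is plain exponential decay, lr_i = lr_0 * exp(-decay_k * i), where decay_k is a small constant and i is the epoch index. A tiny sketch (lr_0 and decay_k are illustrative placeholders):

import numpy as np

lr_0 = 1e-3       # initial learning rate (illustrative)
decay_k = 1e-3    # decay constant (illustrative)
schedule = [lr_0 * np.exp(-decay_k * i) for i in range(50)]
# The rate shrinks by the constant factor exp(-decay_k) each epoch; multiplying the
# already-decayed rate by exp(-k * i) every epoch instead would collapse it to ~0 immediately.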
Example #3
def nesterov_momentum(learning_rate):

    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)
    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)
    mu = .9
    dv = 0
    db_1 = 0
    dw = 0
    db_0 = 0
    nesterov_cost = []
    nesterov_cr = []
    best_nesterov = 0
    best_iteration = 0
    for i in range(iterations):
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b % batches == 0:
                nesterov_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                nesterov_cr.append(cr)
                if cr > best_nesterov:
                    best_nesterov = cr
                    best_iteration = i
            gv = derivative_v('tanh', Z, Y, T)
            g_b1 = derivative_b1('tanh', Y, T)
            gw = derivative_w('tanh', X, Y, Z, T, v)
            g_b0 = derivative_b0('tanh', Y, Z, T, v)
            # velocity updates
            dv = mu * dv - learning_rate * gv
            db_1 = mu * db_1 - learning_rate * g_b1
            dw = mu * dw - learning_rate * gw
            db_0 = mu * db_0 - learning_rate * g_b0
            # Nesterov update: step along the fresh velocity minus the gradient
            v += mu * dv - learning_rate * gv
            b_1 += mu * db_1 - learning_rate * g_b1
            w += mu * dw - learning_rate * gw
            b_0 += mu * db_0 - learning_rate * g_b0
        if i % 10 == 0:
            print('Nesterov Cost: ', nesterov_cost[i],
                  'Nesterov Classification: ', nesterov_cr[i])
    return nesterov_cost, nesterov_cr, best_nesterov, best_iteration
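
The Nesterov variant here (and in the Nesterov section of Example #15) updates the velocity first and then moves the parameter by mu*new_velocity - lr*grad, a standard rewrite of the look-ahead formulation. A self-contained sketch on a 1-D quadratic; the function name and constants are illustrative:

def nesterov_step(param, velocity, grad, lr=0.1, mu=0.9):
    """One Nesterov-momentum step: update the velocity first, then move the
    parameter along mu * new_velocity - lr * grad."""
    velocity = mu * velocity - lr * grad
    param = param + mu * velocity - lr * grad
    return param, velocity

# Minimize f(x) = x**2 (gradient 2*x); x moves toward the minimum at 0.
x, v = 5.0, 0.0
for _ in range(100):
    x, v = nesterov_step(x, v, grad=2 * x)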
Example #4
    def train(self, X, Y, activation=1, lr=10e-7, reg=10e-7, epoch=10):
        N, D = X.shape  # dimensionality of our data
        batch_size = 500
        n_batches = int(N / batch_size)
        ind = tar2ind(Y)  # convert the target array into an indicator matrix (one-hot encoding)
        _, K = ind.shape

        self.W1 = np.random.randn(D, self.M) / np.sqrt(
            D)  #Input to hidden weight
        self.W2 = np.random.randn(self.M, K) / np.sqrt(
            self.M)  #Hidden to output weights
        self.b1 = np.random.randn(self.M)
        self.b2 = np.random.randn(K)
        dW2 = 0
        db2 = 0
        dW1 = 0
        db1 = 0
        mu = 0.9  # Momentum
        decay_rate = 0.99

        cost = []
        for n in range(0, 200):
            #tempx , tempy = shuffle(X, ind)
            for i in range(0, n_batches):
                X_tr = X[i * batch_size:(i * batch_size + batch_size), :]
                Y_tr = Y[i * batch_size:(i * batch_size + batch_size), ]
                ind = tar2ind(Y_tr)
                output, hidden = forward(X_tr, activation, self.W1, self.b1,
                                         self.W2, self.b2)

                #Performing backpropagation now
                dW2 = mu * dW2 + lr * (derivative_W2(ind, output, hidden, reg,
                                                     self.W2))
                self.W2 = self.W2 + dW2
                db2 = mu * db2 + lr * (derivative_b2(ind, output, reg,
                                                     self.b2))
                self.b2 = self.b2 + db2
                dW1 = mu * dW1 + lr * (derivative_W1(
                    ind, output, hidden, self.W2, X_tr, activation, reg,
                    self.W1))
                self.W1 = self.W1 + dW1
                db1 = mu * db1 + lr * (derivative_b1(
                    ind, output, hidden, self.W2, activation, reg, self.b1))
                self.b1 = self.b1 + db1
                c = cross_entropy(ind, output)
                cost.append(c)

                if i % 10 == 0:
                    result = np.argmax(output, axis=1)
                    r = classification_rate(Y_tr, result)
                    print("iteration:- ", i, "cost:- ", c,
                          "classification rate:- ", r)
Example #5
 def compute(self, pred, label, seq_mask=None):
     label = np.expand_dims(label, axis=2)
     ce = cross_entropy(
         softmax=pred,
         label=label,
         soft_label=False,
         axis=-1,
         ignore_index=-100)
     ce = np.squeeze(ce, axis=2)
     if seq_mask is not None:
         ce = ce * seq_mask
         word_num = np.sum(seq_mask)
         return ce, word_num
     return ce
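
compute() above delegates to a cross_entropy helper whose exact signature is not shown. A minimal numpy sketch of the same idea, sequence-masked token-level cross-entropy, assuming pred holds per-token softmax probabilities; the function name and shapes below are assumptions for illustration:

import numpy as np

def masked_cross_entropy(probs, labels, seq_mask=None, eps=1e-12):
    """Per-token negative log-likelihood with an optional 0/1 padding mask.

    probs:    [batch, time, vocab] softmax probabilities
    labels:   [batch, time] integer class ids
    seq_mask: [batch, time], 1 for real tokens, 0 for padding
    """
    b, t = labels.shape
    picked = probs[np.arange(b)[:, None], np.arange(t)[None, :], labels]
    ce = -np.log(picked + eps)
    if seq_mask is not None:
        ce = ce * seq_mask
        return ce, seq_mask.sum()   # masked losses and the number of real tokens
    return ce

# Illustrative call:
probs = np.full((2, 3, 4), 0.25)                      # uniform over a 4-word vocabulary
labels = np.array([[0, 1, 2], [3, 0, 0]])
mask = np.array([[1, 1, 1], [1, 1, 0]], dtype=float)
losses, n_words = masked_cross_entropy(probs, labels, mask)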
Example #6
def batch(learning_rate):

    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)
    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)
    batch_cost = []
    batch_cr = []
    best_batch = 0
    best_iteration = 0
    for i in range(iterations):
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b % batches == 0:
                batch_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                batch_cr.append(cr)
                if cr > best_batch:
                    best_batch = cr
                    best_iteration = i
            v -= learning_rate * derivative_v('tanh', Z, Y, T)
            b_1 -= learning_rate * derivative_b1('tanh', Y, T)
            w -= learning_rate * derivative_w('tanh', X, Y, Z, T, v)
            b_0 -= learning_rate * derivative_b0('tanh', Y, Z, T, v)
        if i % 10 == 0:
            print('Batch Cost: ', batch_cost[i], 'Batch Classification: ',
                  batch_cr[i])
    return batch_cost, batch_cr, best_batch, best_iteration
Example #7
 def update(self, x, t):
     self.model.zerograd()
     y = self.model.forward(x)
     pred = np.argmax(y, axis=1)
     acc = 1.0 * np.where(pred == t)[0].size / y.shape[0]  # fraction of correct answers in the batch
     prob = util.softmax(y)  #change output to probability (normalization)
     loss = util.cross_entropy(prob, t)  #loss function
     dout = prob
     dout[np.arange(dout.shape[0]),
          t] -= 1  #differentiate loss function by y
     self.model.backward(
         dout / dout.shape[0]
     )  # compute each layer's parameter gradients, used by update() below
     self.model.update()  # update parameters using those gradients
     return loss, acc
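
The `dout[np.arange(dout.shape[0]), t] -= 1` line uses the fact that the gradient of the cross-entropy loss with respect to the logits is softmax(y) minus the one-hot target, averaged over the batch. A small numpy check, with all values illustrative:

import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

y = np.array([[2.0, 1.0, 0.1],
              [0.5, 2.5, 0.3]])            # logits for a batch of 2, 3 classes
t = np.array([0, 1])                       # integer targets
prob = softmax(y)
dout = prob.copy()
dout[np.arange(len(t)), t] -= 1.0          # subtract 1 at the target class, as in update()
dout /= len(t)                             # average over the batch before backward()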
Example #8
def train(device, model, train_mode, examples, num_steps):
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
    milestones = list(map(int, num_steps * np.array([1 / 2, 3 / 4, 7 / 8])))
    logger.info('lr milestones: {}'.format(milestones))
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones)

    model.train()

    for i, example in zip(range(num_steps), examples):
        optimizer.zero_grad()
        if train_mode == 'pair':
            im0, im1, target = example
            im0, im1, target = im0.to(device), im1.to(device), target.to(
                device)
            output = model(im0, im1)
            loss = torch.nn.functional.binary_cross_entropy_with_logits(
                output, target)
        elif train_mode == 'softmax':
            train_ims, test_ims, _ = example
            # train_ims: [b, k, n, ...]
            # test_ims: [b, k, n', ...]
            test_ims, gt = util.flatten_few_shot_examples(test_ims,
                                                          shuffle=True)
            train_ims, test_ims, gt = train_ims.to(device), test_ims.to(
                device), gt.to(device)
            # test_ims: [b, m, ...]
            # gt: [b, m]
            scores = model(train_ims, test_ims)
            # scores: [b, m, k]
            loss = util.cross_entropy(scores, gt, dim=-1)
            # Besides the loss, we can obtain the accuracy.
            _, pred = torch.max(scores, -1, keepdim=False)
            is_correct = torch.eq(pred, gt).cpu().numpy()
            acc = np.sum(is_correct) / is_correct.size
        else:
            raise ValueError('unknown train mode: "{}"'.format(train_mode))
        loss.backward()
        optimizer.step()
        scheduler.step()  # advance the LR schedule after the optimizer step
        logger.info('step %d, loss %.4f', i, loss.item())
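
With MultiStepLR, the learning rate is multiplied by gamma (0.1 by default) each time the step counter reaches a milestone, so with num_steps = 80 the milestones above would be [40, 60, 70]. A tiny sketch of the schedule, stepping the scheduler after the optimizer as recent PyTorch versions expect; the dummy parameter exists only to construct the optimizer:

import torch

dummy = [torch.nn.Parameter(torch.zeros(1))]   # placeholder parameter, only to build the optimizer
optimizer = torch.optim.SGD(dummy, lr=1e-2, momentum=0.9)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[40, 60, 70])
for step in range(80):
    optimizer.step()        # the real parameter update would happen here
    scheduler.step()        # then advance the schedule
    # optimizer.param_groups[0]['lr'] goes 1e-2 -> 1e-3 -> 1e-4 -> 1e-5 at the milestones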
Example #9
File: lobe.py  Project: PVirie/vmachine
    def build_graphs(self, input, pasts):
        """component content h, selective focus s, memory m"""
        """return generated thought u, generated content v"""
        """v -> input; only when receiving an external input"""
        """m -> h; only when receiving an external input"""
        """s -> h; only when receiving an external input"""
        """s -> m; when perform thinking"""
        s = self.selective_focus(pasts)
        G = self.generative_focus(pasts)
        h = self.forward(input, G)
        v = self.backward(h, G)

        m = self.retrieve_memory(s)
        u = self.backward(m, G)

        grads, delta = self.Mw.gradients(h)
        self.memorize_operation = util.apply_gradients(grads, delta, 1.0)
        self.improve_focus_operation = util.cross_entropy(
            s, h, self.get_selective_focus_variables())
        self.reset_memory_operation = self.Mw.get_reset_operation()
        self.reseed_memory_operation = self.Mw.get_reseed_operation()

        return u, v
Example #10
# Calculate cross-entropy error for random weights, and for the closed-form Bayes classifier solution

import numpy as np
from util import sigmoid, cross_entropy

N = 100
D = 2

means = np.array(((-2,-2), (2,2)))
covar = np.eye(2)

# Artificially create 2 classes: center the first 50 points at (-2,-2), the last 50 at (2,2)
X = np.random.randn(N, D)
X[:N//2, :] = X[:N//2, :] + means[0] * np.ones((N//2,D))
X[N//2:, :] = X[N//2:, :] + means[1] * np.ones((N//2,D))
Xb = np.concatenate((np.ones((N, 1)), X), axis=1)

# Class labels, first 50 are 0, last 50 are 1
T = np.concatenate((np.zeros((N//2,)), np.ones((N//2,))))

# Random weights
w = np.random.randn(D+1)
Y = sigmoid(Xb @ w)
print('Random weights:', cross_entropy(T, Y))

# Closed form Bayes solution
w = ((means[1, None] - means[0, None]) @ np.linalg.inv(covar)).T
w = np.concatenate(((0,), w.reshape(D))) # Add weight for bias
Y = sigmoid(Xb @ w)
print('Closed form solution:', cross_entropy(T, Y))
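
The closed-form weights follow the standard result for two Gaussian classes with a shared covariance and equal priors: w = covar^{-1}(mu1 - mu0), with bias b = -0.5*(mu1' covar^{-1} mu1 - mu0' covar^{-1} mu0). A short check of why the bias can be set to zero here; values mirror the script above:

import numpy as np

mu0, mu1 = np.array([-2.0, -2.0]), np.array([2.0, 2.0])
covar = np.eye(2)
cov_inv = np.linalg.inv(covar)
w_bayes = cov_inv @ (mu1 - mu0)                                   # Sigma^{-1} (mu1 - mu0)
b_bayes = -0.5 * (mu1 @ cov_inv @ mu1 - mu0 @ cov_inv @ mu0)      # equal-prior bias term
# b_bayes == 0 here because the means are symmetric about the origin,
# which is why the script above can simply prepend a zero bias weight.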
Example #11
def rmsprop(learning_rate):

    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)
    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N
    dv = 0
    d_b1 = 0
    dw = 0
    d_b0 = 0
    mu = .9
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)
    cache_v = np.ones((M, K))
    cache_b1 = np.ones(K)
    cache_w = np.ones((D, M))
    cache_b0 = np.ones(M)
    epsilon = 10e-10
    decay = .9
    rmsprop_cost = []
    rmsprop_cr = []
    best_rms = 0
    best_iteration = 0
    for i in range(iterations):
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b % batches == 0:
                rmsprop_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                rmsprop_cr.append(cr)
                if cr > best_rms:
                    best_rms = cr
                    best_iteration = i
            cache_v = decay * cache_v + (1 - decay) * derivative_v(
                'tanh', Z, Y, T)**2
            cache_b1 = decay * cache_b1 + (1 - decay) * derivative_b1(
                'tanh', Y, T)**2
            cache_w = decay * cache_w + (1 - decay) * derivative_w(
                'tanh', X, Y, Z, T, v)**2
            cache_b0 = decay * cache_b0 + (1 - decay) * derivative_b0(
                'tanh', Y, Z, T, v)**2
            dv = mu * dv - learning_rate * derivative_v(
                'tanh', Z, Y, T) / (np.sqrt(cache_v + epsilon))
            d_b1 = mu * d_b1 - learning_rate * derivative_b1(
                'tanh', Y, T) / (np.sqrt(cache_b1 + epsilon))
            dw = mu * dw - learning_rate * derivative_w(
                'tanh', X, Y, Z, T, v) / (np.sqrt(cache_w + epsilon))
            d_b0 = mu * d_b0 - learning_rate * derivative_b0(
                'tanh', Y, Z, T, v) / (np.sqrt(cache_b0 + epsilon))
            v += dv
            b_1 += d_b1
            w += dw
            b_0 += d_b0
        if i % 10 == 0:
            print('RMSProp Cost: ', rmsprop_cost[i],
                  'RMSProp Classification: ', rmsprop_cr[i])
    return rmsprop_cost, rmsprop_cr, best_rms, best_iteration
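
The example combines a momentum term with the RMSProp cache; the cache mechanism on its own is a running average of squared gradients used to scale each step. A minimal sketch, with the function name and constants illustrative:

import numpy as np

def rmsprop_step(param, grad, cache, lr=0.05, decay=0.9, eps=1e-9):
    """One RMSProp update: scale the step by a running RMS of the gradient."""
    cache = decay * cache + (1 - decay) * grad**2
    param = param - lr * grad / np.sqrt(cache + eps)
    return param, cache

# Illustrative use on f(x) = x**2 (gradient 2*x); the cache is seeded at 1
# just like the caches above, and x moves toward the minimum at 0.
x, cache = 5.0, 1.0
for _ in range(200):
    x, cache = rmsprop_step(x, 2 * x, cache)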
Example #12
 def get_loss(self):
     return util.cross_entropy(self.O, util.one_hot(self.labels_active))
Example #13
    def full(self):

        for i in range(self.iterations):
            Y_train, Z = generate_Y(self.activation, self.Xtrain, self.w,
                                    self.b_0, self.v, self.b_1)
            P_train = np.argmax(Y_train, axis=1)
            Y_test, _ = generate_Y(self.activation, self.Xtest, self.w,
                                   self.b_0, self.v, self.b_1)
            P_test = np.argmax(Y_test, axis=1)
            self.train_cost.append(cross_entropy(Y_train, self.Ttrain))
            self.test_cost.append(cross_entropy(Y_test, self.Ttest))
            train_cr = classification_rate(P_train, self.Ytrain)
            self.train_cr.append(train_cr)
            test_cr = classification_rate(P_test, self.Ytest)
            self.test_cr.append(test_cr)
            if train_cr > self.best_train:
                self.best_train = train_cr
                self.train_iteration = i
            if test_cr > self.best_test:
                self.best_test = test_cr
                self.test_iteration = i
            self.m_v = self.decay_0 * self.m_v + (
                1 - self.decay_0) * derivative_v(self.activation, Z, Y_train,
                                                 self.Ttrain)
            self.dm_v = self.m_v / (1 - self.decay_0**(i + 1))
            self.v_v = self.decay_1 * self.v_v + (
                1 - self.decay_1) * derivative_v(self.activation, Z, Y_train,
                                                 self.Ttrain)**2
            self.dv_v = self.v_v / (1 - self.decay_1**(i + 1))
            self.m_b1 = self.decay_0 * self.m_b1 + (
                1 - self.decay_0) * derivative_b1(self.activation, Y_train,
                                                  self.Ttrain)
            self.dm_b1 = self.m_b1 / (1 - self.decay_0**(i + 1))
            self.v_b1 = self.decay_1 * self.v_b1 + (
                1 - self.decay_1) * derivative_b1(self.activation, Y_train,
                                                  self.Ttrain)**2
            self.dv_b1 = self.v_b1 / (1 - self.decay_1**(i + 1))
            self.m_w = self.decay_0 * self.m_w + (
                1 - self.decay_0) * derivative_w(self.activation, self.Xtrain,
                                                 Y_train, Z, self.Ttrain,
                                                 self.v)
            self.dm_w = self.m_w / (1 - self.decay_0**(i + 1))
            self.v_w = self.decay_1 * self.v_w + (
                1 - self.decay_1) * derivative_w(self.activation, self.Xtrain,
                                                 Y_train, Z, self.Ttrain,
                                                 self.v)**2
            self.dv_w = self.v_w / (1 - self.decay_1**(i + 1))
            self.m_b0 = self.decay_0 * self.m_b0 + (
                1 - self.decay_0) * derivative_b0(self.activation, Y_train, Z,
                                                  self.Ttrain, self.v)
            self.dm_b0 = self.m_b0 / (1 - self.decay_0**(i + 1))
            self.v_b0 = self.decay_1 * self.v_b0 + (
                1 - self.decay_1) * derivative_b0(self.activation, Y_train, Z,
                                                  self.Ttrain, self.v)**2
            self.dv_b0 = self.v_b0 / (1 - self.decay_1**(i + 1))
            self.v -= self.learning_rate * self.dm_v / (np.sqrt(self.dv_v +
                                                                self.epsilon))
            self.b_1 -= self.learning_rate * self.dm_b1 / (
                np.sqrt(self.dv_b1 + self.epsilon))
            self.w -= self.learning_rate * self.dm_w / (np.sqrt(self.dv_w +
                                                                self.epsilon))
            self.b_0 -= self.learning_rate * self.dm_b0 / (
                np.sqrt(self.dv_b0 + self.epsilon))
            if i % 100 == 0:
                print(i, 'Train Cost: ', self.train_cost[i],
                      'Train Classification Rate: ', self.train_cr[i])
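
full() above is Adam: decay_0/decay_1 play the roles of beta1/beta2, m_* and v_* are the first and second moment estimates, and dm_*/dv_* are their bias-corrected versions (the class keeps epsilon inside the square root, which differs from the textbook form only marginally). A compact sketch of one Adam step, with names and constants illustrative:

import numpy as np

def adam_step(param, grad, m, v, t, lr=0.05, beta1=0.9, beta2=0.999, eps=1e-8):
    """One Adam step with bias-corrected first and second moments (t is 1-based)."""
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad**2
    m_hat = m / (1 - beta1**t)
    v_hat = v / (1 - beta2**t)
    param = param - lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v

# Illustrative use on f(x) = x**2 (gradient 2*x):
x, m, v = 5.0, 0.0, 0.0
for t in range(1, 501):
    x, m, v = adam_step(x, 2 * x, m, v, t)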
Example #14
    def stochastic(self, samples):

        for i in range(self.iterations):
            current_X, current_T = shuffle(self.Xtrain, self.Ttrain)
            for s in range(samples):
                X = current_X[s, :].reshape(1, current_X.shape[1])
                T = current_T[s, :].reshape(1, current_T.shape[1])
                Y, Z = generate_Y(self.activation, X, self.w, self.b_0, self.v,
                                  self.b_1)
                Y_train, _ = generate_Y(self.activation, self.Xtrain, self.w,
                                        self.b_0, self.v, self.b_1)
                P_train = np.argmax(Y_train, axis=1)
                Y_test, _ = generate_Y(self.activation, self.Xtest, self.w,
                                       self.b_0, self.v, self.b_1)
                P_test = np.argmax(Y_test, axis=1)
                self.train_cost.append(cross_entropy(Y_train, self.Ttrain))
                self.test_cost.append(cross_entropy(Y_test, self.Ttest))
                train_cr = classification_rate(P_train, self.Ytrain)
                self.train_cr.append(train_cr)
                test_cr = classification_rate(P_test, self.Ytest)
                self.test_cr.append(test_cr)
                if train_cr > self.best_train:
                    self.best_train = train_cr
                    self.train_iteration = i
                if test_cr > self.best_test:
                    self.best_test = test_cr
                    self.test_iteration = i
                self.m_v = self.decay_0 * self.m_v + (
                    1 - self.decay_0) * derivative_v(self.activation, Z, Y, T)
                self.dm_v = self.m_v / (1 - self.decay_0**(i + 1))
                self.v_v = self.decay_1 * self.v_v + (
                    1 - self.decay_1) * derivative_v(self.activation, Z, Y,
                                                     T)**2
                self.dv_v = self.v_v / (1 - self.decay_1**(i + 1))
                self.m_b1 = self.decay_0 * self.m_b1 + (
                    1 - self.decay_0) * derivative_b1(self.activation, Y, T)
                self.dm_b1 = self.m_b1 / (1 - self.decay_0**(i + 1))
                self.v_b1 = self.decay_1 * self.v_b1 + (
                    1 - self.decay_1) * derivative_b1(self.activation, Y, T)**2
                self.dv_b1 = self.v_b1 / (1 - self.decay_1**(i + 1))
                self.m_w = self.decay_0 * self.m_w + (
                    1 - self.decay_0) * derivative_w(self.activation, X, Y, Z,
                                                     T, self.v)
                self.dm_w = self.m_w / (1 - self.decay_0**(i + 1))
                self.v_w = self.decay_1 * self.v_w + (
                    1 - self.decay_1) * derivative_w(self.activation, X, Y, Z,
                                                     T, self.v)**2
                self.dv_w = self.v_w / (1 - self.decay_1**(i + 1))
                self.m_b0 = self.decay_0 * self.m_b0 + (
                    1 - self.decay_0) * derivative_b0(self.activation, Y, Z, T,
                                                      self.v)
                self.dm_b0 = self.m_b0 / (1 - self.decay_0**(i + 1))
                self.v_b0 = self.decay_1 * self.v_b0 + (
                    1 - self.decay_1) * derivative_b0(self.activation, Y, Z, T,
                                                      self.v)**2
                self.dv_b0 = self.v_b0 / (1 - self.decay_1**(i + 1))
                self.v -= self.learning_rate * self.dm_v / (
                    np.sqrt(self.dv_v + self.epsilon))
                self.b_1 -= self.learning_rate * self.dm_b1 / (
                    np.sqrt(self.dv_b1 + self.epsilon))
                self.w -= self.learning_rate * self.dm_w / (
                    np.sqrt(self.dv_w + self.epsilon))
                self.b_0 -= self.learning_rate * self.dm_b0 / (
                    np.sqrt(self.dv_b0 + self.epsilon))
            if i % 100 == 0:
                print(i, 'Train Cost: ', self.train_cost[-1],
                      'Train Classification Rate: ', self.train_cr[-1])
Example #15
def main():
	# load the data:
	(Xtrain, Ytrain), (Xtest, Ytest) = mnist.load_data()
	# print(Xtrain.shape)
	N, d, _ = Xtrain.shape
	D = d*d
	Ntest = len(Xtest)

	# normalize the data:
	Xtrain = Xtrain / 255.0
	Xtest = Xtest / 255.0

	# display:
	# n = np.random.choice(N)
	# plt.imshow(Xtrain[n], cmap='gray')
	# plt.title(str(Ytrain[n]))
	# plt.show()

	# reshape the data:
	Xtrain = Xtrain.reshape(N, D)
	Xtest = Xtest.reshape(Ntest, D)	

	# print('Xtrain.max():', Xtrain.max())
	# print('Xtrain.shape:', Xtrain.shape)

	Ytrain_ind = y2indicator(Ytrain)
	Ytest_ind = y2indicator(Ytest)


	# define hyperparameters:
	epochs = 30
	print_period = 10
	lr = 0.00004
	reg = 0.01

	batch_sz = 500
	n_batches = N // batch_sz

	M = 300 # the hidden layer size
	K = len(set(Ytrain))

	# randomly initialize the weights:
	W1_init = np.random.randn(D, M) / np.sqrt(D)
	b1_init = np.zeros(M)
	W2_init = np.random.randn(M, K) / np.sqrt(M)
	b2_init = np.zeros(K)

	
	# 1. mini-batch SGD:
	losses_batch = []
	errors_batch = []

	W1 = W1_init.copy()
	b1 = b1_init.copy()
	W2 = W2_init.copy()
	b2 = b2_init.copy()

	print('\nmini-batch SGD')

	t0 = datetime.now()
	for i in range(epochs):
		Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz]
			Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

			# update the params:
			W2 -= lr*(derivative_W2(Z, Ybatch, pYbatch) + reg*W2)
			b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
			W1 -= lr*(derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
			b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

			if j % print_period == 0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				l = cross_entropy(pY, Ytest_ind)
				losses_batch.append(l)
				e = error_rate(pY, Ytest)
				errors_batch.append(e)
				sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
				# print('\nepoch: %d, batch: %d, cost: %6f' % (i, j, l))
				# print('error_rate:', e)

	sys.stdout.flush()	
	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print('Elapsed:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20)

	
	# 2. mini-batch SGD with momentum - version 1:
	losses_momentum1 = []
	errors_momentum1 = []

	W1 = W1_init.copy()
	b1 = b1_init.copy()
	W2 = W2_init.copy()
	b2 = b2_init.copy()

	mu = 0.9 # momentum term
	# initial values for the 'velocities':
	dW2 = 0 
	db2 = 0
	dW1 = 0
	db1 = 0

	print('\nmini-batch SGD with momentum - version 1')
	t0 = datetime.now()
	for i in range(epochs):
		Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz]
			Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

			# calculate the gradients:
			gW2 = derivative_W2(Z, Ybatch, pYbatch) + reg*W2
			gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
			gW1 = derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
			gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

			# update the 'velocities':
			dW2 = mu*dW2 - lr*gW2  
			db2 = mu*db2 - lr*gb2 
			dW1 = mu*dW1 - lr*gW1 
			db1 = mu*db1 - lr*gb1 
			
			# update the params:
			W2 += dW2
			b2 += db2
			W1 += dW1
			b1 += db1

			if j % print_period == 0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				l = cross_entropy(pY, Ytest_ind)
				losses_momentum1.append(l)
				e = error_rate(pY, Ytest)
				errors_momentum1.append(e)
				sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
				# print('\nepoch: %d, batch: %d, cost: %6f' % (i, j, l))
				# print('error_rate:', e)
	
	sys.stdout.flush()
	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print('Elapsed:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20)
	
	'''
	# 3. mini-batch SGD with momentum - version 2:
	losses_momentum2 = []
	errors_momentum2 = []

	W1 = W1_init.copy()
	b1 = b1_init.copy()
	W2 = W2_init.copy()
	b2 = b2_init.copy()

	mu = 0.9 # momentum term
	# initial values for the 'velocities':
	dW2 = 0 
	db2 = 0
	dW1 = 0
	db1 = 0

	# lr = 0.0004

	print('\nmini-batch SGD with momentum - version 2')
	t0 = datetime.now()
	for i in range(epochs):
		Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz]
			Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

			# calculate the gradients:
			gW2 = derivative_W2(Z, Ybatch, pYbatch) + reg*W2
			gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
			gW1 = derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
			gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

			# # update the 'velocities':
			dW2 = mu*dW2 + (1-mu)*gW2  
			db2 = mu*db2 + (1-mu)*gb2 
			dW1 = mu*dW1 + (1-mu)*gW1 
			db1 = mu*db1 + (1-mu)*gb1 

			# update the 'velocities':
			# dW2 = mu*dW2 + gW2  
			# db2 = mu*db2 + gb2 
			# dW1 = mu*dW1 + gW1 
			# db1 = mu*db1 + gb1 
			
			# update the params:
			W2 -= lr*dW2
			b2 -= lr*db2
			W1 -= lr*dW1
			b1 -= lr*db1

			if j % print_period == 0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				l = cross_entropy(pY, Ytest)
				losses_momentum2.append(l)
				e = error_rate(pY, Ytest)
				errors_momentum2.append(e)
				sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
				sys.stdout.flush()				
				
	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print('ETA:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20)
    # best result: epochs = 25, final_error = 0.0179
	'''
	# 4. mini-batch SGD with Nesterov momentum:
	losses_nesterov_momentum = []
	errors_nesterov_momentum = []

	W1 = W1_init.copy()
	b1 = b1_init.copy()
	W2 = W2_init.copy()
	b2 = b2_init.copy()

	mu = 0.9 # momentum term
	# initial values for the 'velocities':
	dW2 = 0 
	db2 = 0
	dW1 = 0
	db1 = 0


	print('\nmini-batch SGD with Nesterov momentum')
	t0 = datetime.now()
	for i in range(epochs):
		Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz]
			Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

			# calculate the gradients:
			gW2 = derivative_W2(Z, Ybatch, pYbatch) + reg*W2
			gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
			gW1 = derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
			gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
			
			# update the 'velocities':
			dW2 = mu*dW2 - lr*gW2  
			db2 = mu*db2 - lr*gb2 
			dW1 = mu*dW1 - lr*gW1 
			db1 = mu*db1 - lr*gb1 
			
			# update the params:
			W2 += mu*dW2 - lr*gW2  
			b2 += mu*db2 - lr*gb2 
			W1 += mu*dW1 - lr*gW1 
			b1 += mu*db1 - lr*gb1 

			if j % print_period == 0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				l = cross_entropy(pY, Ytest_ind)
				losses_nesterov_momentum.append(l)
				e = error_rate(pY, Ytest)
				errors_nesterov_momentum.append(e)
				sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
				sys.stdout.flush()
				# print('\nepoch: %d, batch: %d, cost: %6f' % (i, j, l))
				# print('error_rate:', e)

	
	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print('Elapsed:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20)
	
	# plot the losses:
	plt.plot(losses_batch, label='mini-batch SGD')
	plt.plot(losses_momentum1, label='+ momentum')
	plt.plot(losses_nesterov_momentum, label='+ Nesterov momentum')
	plt.xlabel('iterations')
	plt.ylabel('loss')
	plt.legend()
	plt.show()