def sgd(self, batch_size=50, epsilon=0.01, epochs=1000):  # use epochs=10 to quickly check the performance of the model
    """ Mini-batch gradient descent on training data.

        batch_size: number of training examples between each weight update
        epsilon:    learning rate
        epochs:     the number of times to go through the entire training data
    """
    # Compute the number of training examples and number of mini-batches.
    N = min(len(self.trainX), len(self.trainY))
    num_batches = int(N / batch_size)

    # Variables to keep track of statistics
    loss_log = []
    test_acc_log = []
    train_acc_log = []

    timestamp = time.time()
    timestamp2 = time.time()

    predictions_not_shown = True

    # In each "epoch", the network is exposed to the entire training set.
    for t in range(epochs):

        # We will order the training data using a random permutation.
        permutation = np.random.permutation(N)

        # Evaluate the accuracy on 1000 samples from the training and test data.
        test_acc_log.append(self.evaluate(self.testX, self.testY, 1000))
        train_acc_log.append(self.evaluate(self.trainX, self.trainY, 1000))
        batch_loss = 0

        for k in range(num_batches):

            # Reset the buffers that accumulate the summed partial derivatives
            # over one mini-batch (slide l05-backpropagation, p. 18).
            dw_buffer = [np.zeros_like(dw) for dw in self.dw]
            db_buffer = [np.zeros_like(db) for db in self.db]

            # Mini-batch loop
            for i in range(batch_size):

                # Select the next training example (x, y).
                x = self.trainX[permutation[k * batch_size + i]]
                y = self.trainY[permutation[k * batch_size + i]]

                # Feed forward the input and compute gradients
                # (self.forward is called inside self.backward).
                self.backward(x, y)

                # Accumulate the partial derivatives for each parameter
                # (slide l05-backpropagation, p. 18).
                for l in range(self.L):
                    dw_buffer[l] += self.dw[l]
                    db_buffer[l] += self.db[l]

                # Update loss log
                batch_loss += self.loss(self.a[self.L - 1], y)

                for l in range(self.L):
                    self.batch_a[l] += self.a[l] / batch_size

            # Update the weights at the end of the mini-batch using gradient descent
            # on the batch-averaged gradients (slide l05-backpropagation, p. 18).
            for l in range(1, self.L):
                self.w[l] -= epsilon * (dw_buffer[l] / batch_size)
                self.b[l] -= epsilon * (db_buffer[l] / batch_size)

            # Update logs
            loss_log.append(batch_loss / batch_size)
            batch_loss = 0

            # Update plot of statistics every 10 seconds.
            if time.time() - timestamp > 10:
                timestamp = time.time()
                fnn_utils.plot_stats(self.batch_a, loss_log, test_acc_log, train_acc_log)

            # Display predictions every 20 seconds.
            if (time.time() - timestamp2 > 20) or predictions_not_shown:
                predictions_not_shown = False
                timestamp2 = time.time()
                fnn_utils.display_predictions(self, show_pct=True)

            # Reset batch average
            for l in range(self.L):
                self.batch_a[l].fill(0.0)

    # Save the graph automatically
    # fnn_utils.save_pic(epochs, epsilon, batch_size, self.network_shape)

    return test_acc_log, train_acc_log, loss_log
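# The variant above sums per-example gradients into dw_buffer/db_buffer, divides
# by batch_size, and takes one step of size epsilon. The helper below is a
# minimal, self-contained sketch of that same pattern on a toy one-layer
# least-squares model; every name in it (toy_w, toy_b, X, Y) is an illustrative
# assumption and not part of the class above.
def _sgd_toy_example(batch_size=50, epsilon=0.01):
    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 3))                # toy inputs
    Y = X @ np.array([1.0, -2.0, 0.5])           # toy targets

    toy_w = np.zeros(3)
    toy_b = 0.0

    for k in range(len(X) // batch_size):
        dw_buffer = np.zeros_like(toy_w)         # summed dL/dw over the batch
        db_buffer = 0.0                          # summed dL/db over the batch
        for i in range(batch_size):
            x = X[k * batch_size + i]
            y = Y[k * batch_size + i]
            err = (toy_w @ x + toy_b) - y        # forward pass + residual
            dw_buffer += err * x                 # gradient of 0.5 * err**2 w.r.t. toy_w
            db_buffer += err                     # gradient w.r.t. toy_b
        # Average over the batch and take one gradient-descent step.
        toy_w -= epsilon * (dw_buffer / batch_size)
        toy_b -= epsilon * (db_buffer / batch_size)

    return toy_w, toy_b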
def sgd(self, batch_size=50, epsilon=0.01, epochs=5):
    """ Mini-batch gradient descent on training data.

        batch_size: number of training examples between each weight update
        epsilon:    learning rate
        epochs:     the number of times to go through the entire training data
    """
    # Compute the number of training examples and number of mini-batches.
    N = min(len(self.trainX), len(self.trainY))
    num_batches = int(N / batch_size)

    # Variables to keep track of statistics
    loss_log = []
    test_acc_log = []
    train_acc_log = []

    timestamp = time.time()
    timestamp2 = time.time()

    predictions_not_shown = True

    # In each "epoch", the network is exposed to the entire training set.
    for t in range(epochs):

        # We will order the training data using a random permutation.
        permutation = np.random.permutation(N)

        # Evaluate the accuracy on 1000 samples from the training and test data.
        test_acc_log.append(self.evaluate(self.testX, self.testY, 1000))
        train_acc_log.append(self.evaluate(self.trainX, self.trainY, 1000))
        batch_loss = 0

        for k in range(num_batches):

            # Reset buffer containing updates
            batch_dw = [np.zeros(w.shape) for w in self.w]
            batch_db = [np.zeros(b.shape) for b in self.b]

            # Mini-batch loop
            for i in range(batch_size):

                # Select the next training example (x, y).
                x = self.trainX[permutation[k * batch_size + i]]
                y = self.trainY[permutation[k * batch_size + i]]

                # Feed forward inputs
                self.forward(x)

                # Compute gradients
                self.backward(x, y)

                # Update loss log
                batch_loss += self.loss(self.a[self.L - 1], y)

                # Accumulate activations and gradients over the mini-batch.
                for l in range(self.L):
                    self.batch_a[l] += self.a[l] / batch_size
                    batch_db[l] += self.db[l]
                    batch_dw[l] += self.dw[l]

            # Update the weights at the end of the mini-batch using gradient descent
            for l in range(1, self.L):
                self.w[l] = self.w[l] - epsilon * (batch_dw[l] / batch_size)
                self.b[l] = self.b[l] - epsilon * (batch_db[l] / batch_size)

            # Update logs
            loss_log.append(batch_loss / batch_size)
            batch_loss = 0

            # Update plot of statistics every 10 seconds.
            if time.time() - timestamp > 10:
                timestamp = time.time()
                fnn_utils.plot_stats(self.batch_a, loss_log, test_acc_log, train_acc_log)

            # Display predictions every 20 seconds.
            if (time.time() - timestamp2 > 20) or predictions_not_shown:
                predictions_not_shown = False
                timestamp2 = time.time()
                fnn_utils.display_predictions(self, show_pct=True)

            # Reset batch average
            for l in range(self.L):
                self.batch_a[l].fill(0.0)
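# Each epoch above calls self.evaluate(X, Y, 1000) to track accuracy on a random
# subsample of the data. A plausible shape for such a helper is sketched below
# as a standalone function taking a predict callable; this is an assumption
# about its behaviour (argmax accuracy over num_samples random examples), not
# the actual implementation used by the class.
def _evaluate_subset_sketch(predict, X, Y, num_samples):
    import numpy as np

    # Sample num_samples indices without replacement and score the predictions.
    idx = np.random.permutation(len(X))[:num_samples]
    correct = sum(int(np.argmax(predict(X[i])) == Y[i]) for i in idx)
    return correct / num_samples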
def sgd(self, batch_size=50, epsilon=0.01, epochs=1000):
    """ Mini-batch gradient descent on training data.

        batch_size: number of training examples between each weight update
        epsilon:    learning rate
        epochs:     the number of times to go through the entire training data
    """
    # Overwrite the default hyperparameters.
    batch_size = 5
    epsilon = 0.01
    epochs = 150

    # Compute the number of training examples and number of mini-batches.
    N = min(len(self.trainX), len(self.trainY))
    num_batches = int(N / batch_size)

    # Variables to keep track of statistics
    loss_log = []
    test_acc_log = []
    train_acc_log = []

    timestamp = time.time()
    timestamp2 = time.time()

    predictions_not_shown = True

    # In each "epoch", the network is exposed to the entire training set.
    for t in range(epochs):
        print("epoch ", t)

        # We will order the training data using a random permutation.
        permutation = np.random.permutation(N)

        # Evaluate the accuracy on 1000 samples from the training and test data.
        test_acc_log.append(self.evaluate(self.testX, self.testY, 1000))
        train_acc_log.append(self.evaluate(self.trainX, self.trainY, 1000))
        batch_loss = 0
        print(test_acc_log[-1])

        for k in range(num_batches):

            # Reset the buffers containing updates and intermediate values.
            self.dw = [np.zeros((m1, m0)) for (m0, m1) in self.crossings]
            self.db = [np.zeros(m) for m in self.network_shape]
            self.delta = [np.zeros(m) for m in self.network_shape]
            self.z = [np.zeros(m) for m in self.network_shape]
            self.a = [np.zeros(m) for m in self.network_shape]

            # Mini-batch loop
            for i in range(batch_size):

                # Select the next training example (x, y).
                x = self.trainX[permutation[k * batch_size + i]]
                y = self.trainY[permutation[k * batch_size + i]]

                # Feed forward inputs
                x_pred = self.predict(x)

                # Compute gradients
                self.backward(x_pred, y)

                # Update loss log
                batch_loss += self.loss(self.a[self.L - 1], y)

                for l in range(self.L):
                    self.batch_a[l] += self.a[l] / batch_size

            # Update the weights at the end of the mini-batch using gradient descent
            for l in range(1, self.L):
                self.w[l] -= epsilon * self.dw[l]
                self.b[l] -= epsilon * self.db[l]

            # Update logs
            loss_log.append(batch_loss / batch_size)
            batch_loss = 0

            # Update plot of statistics every 10 seconds and append them to stats.txt.
            if time.time() - timestamp > 10:
                timestamp = time.time()
                fnn_utils.plot_stats(self.batch_a, loss_log, test_acc_log, train_acc_log)
                with open("stats.txt", "a") as outfile:
                    outfile.write("Epoch: " + str(t) +
                                  "\nTime elapsed: " + str(time.time() - self.starttime) + " seconds" +
                                  "\nLoss: " + str(loss_log[-1]) +
                                  "\nTest Accuracy: " + str(test_acc_log[-1]) +
                                  "\nTrain Accuracy: " + str(train_acc_log[-1]) + "\n\n")

            # Display predictions every 20 seconds.
            if (time.time() - timestamp2 > 20) or predictions_not_shown:
                predictions_not_shown = False
                timestamp2 = time.time()
                fnn_utils.display_predictions(self, show_pct=True)

            # Reset batch average
            for l in range(self.L):
                self.batch_a[l].fill(0.0)
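# All three variants accumulate self.loss(self.a[self.L - 1], y) per example to
# build the loss log. A loss consistent with that call, assuming the final
# activations form a softmax probability vector and y is an integer class
# label, is sketched below; the real loss used by this codebase may differ.
def _cross_entropy_sketch(a_L, y, eps=1e-12):
    import numpy as np

    # Negative log-likelihood of the true class under the output distribution;
    # eps guards against log(0).
    return -np.log(a_L[y] + eps)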