import numpy as np
import scipy.optimize as opt

input_layer_size = 400
hidden_layer_size = 25
num_labels = 10

print nn_cost_fun(init_params, input_layer_size, hidden_layer_size,
                  num_labels, X, y, 0)[0]

# ================= Initializing Random Parameters =================
rand_theta1 = rand_initialize_weights(input_layer_size, hidden_layer_size)
rand_theta2 = rand_initialize_weights(hidden_layer_size, num_labels)
rand_params = np.r_[rand_theta1.ravel(), rand_theta2.ravel()]
# print nn_cost_fun(rand_params, input_layer_size, hidden_layer_size,
#                   num_labels, X, y, 0)[0]

# ================= compute_numerical_gradient =================
# If the backpropagation implementation is correct, the relative
# difference between the analytic and the numerical gradient should
# be small (less than 1e-9).
sample = np.random.choice(X.shape[0], 10)  # small subsample: numerical gradients are slow
XX = X[sample]
yy = y[sample]
nn_param_grad = nn_cost_fun(rand_params, input_layer_size, hidden_layer_size,
                            num_labels, XX, yy, 0)[1]
number_param_grad = compute_numerical_grad(rand_params, input_layer_size,
                                           hidden_layer_size, num_labels,
                                           XX, yy, 0)
diff = np.abs(number_param_grad - nn_param_grad) / \
    (np.abs(number_param_grad) + np.abs(nn_param_grad))
print 'number_param_grad diff is: ', diff[0:100]

print 'Training Neural Network...'
l = 1  # regularization strength lambda
result = opt.minimize(fun=nn_cost_fun, x0=rand_params,
                      args=(input_layer_size, hidden_layer_size,
                            num_labels, X, y, l),
                      method='TNC', jac=True, options={'maxiter': 150})
params_trained = result.x
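# rand_initialize_weights is defined elsewhere in this repository. As
# a reference, a minimal sketch of the symmetry-breaking initialization
# it presumably performs (the +/-0.12 bound and the extra bias column
# are the conventional choices for this 400-25-10 network, not
# confirmed by this file):

def rand_initialize_weights(l_in, l_out):
    # Uniform random weights in [-eps, eps] to break symmetry; the
    # (1 + l_in) column count accounts for the bias unit.
    eps = 0.12
    return np.random.rand(l_out, 1 + l_in) * 2 * eps - eps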
##==========================================================
# Gradient checking for the sparse autoencoder. Before any real
# training it is highly recommended to check the gradient precisely.
# The diff printed at the end is the relative difference between the
# gradient from backpropagation and the numerically computed one; it
# should be very small, otherwise the autoencoder's gradient is not
# computed correctly.
##==========================================================
import numpy as np

import sparse_autoencoder as sp
import compute_numerical_grad as co

data = np.random.rand(64, 100)
visible_size = 64
hidden_size = [16, 8, 4]
# hidden_size = [25]

theta = sp.initial_parameter(hidden_size, visible_size)
bp_cost = sp.compute_cost(theta, data, visible_size, hidden_size)
print bp_cost
bp_grad = sp.compute_grad(theta, data, visible_size, hidden_size)
print bp_grad
num_grad = co.compute_numerical_grad(sp.compute_cost, theta, data,
                                     visible_size, hidden_size)
diff = np.linalg.norm(bp_grad - num_grad) / np.linalg.norm(num_grad + bp_grad)
print str(diff) + " should be less than 1e-9! Is it?"
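# The compute_numerical_grad module is not shown here. Judging from
# the call above, it takes the cost function as its first argument and
# forwards the remaining arguments to it. A minimal central-difference
# sketch under that assumption (the 1e-4 step size is the conventional
# choice, not confirmed by this repository):

def compute_numerical_grad(cost_fun, theta, *args):
    # Approximate each partial derivative by central differences:
    # dJ/dtheta_i ~ (J(theta + eps*e_i) - J(theta - eps*e_i)) / (2*eps)
    eps = 1e-4
    grad = np.zeros_like(theta)
    for i in xrange(theta.size):
        e = np.zeros_like(theta)
        e[i] = eps
        grad[i] = (cost_fun(theta + e, *args) -
                   cost_fun(theta - e, *args)) / (2.0 * eps)
    return grad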
##==========================================================
# Randomly generate theta and data, then check the gradient of each
# layer in turn, feeding every layer's output to the next one.
##==========================================================
import numpy as np

import sparse_autoencoder as sp
import compute_numerical_grad as co

visible_size = 32
hidden_size = [16, 8, 4]
data = np.random.rand(32, 100)
layer_ind = range(1, len(hidden_size) + 1)
layer_size = [visible_size] + hidden_size

# Debugging!
for ind in layer_ind:
    theta = sp.initial_parameter(layer_size[ind], layer_size[ind - 1])
    bp_grad = sp.compute_grad(theta, data,
                              layer_size[ind - 1], layer_size[ind])
    num_grad = co.compute_numerical_grad(sp.compute_cost, theta, data,
                                         layer_size[ind - 1], layer_size[ind])
    diff = np.linalg.norm(bp_grad - num_grad) / \
        np.linalg.norm(num_grad + bp_grad)
    print str(diff) + " should be less than 1e-9! Is it?"
    # Propagate the data through this layer's weights. Only the linear
    # part of the forward pass is used; for gradient checking, any
    # input of the right shape is good enough.
    W = theta[:layer_size[ind] * layer_size[ind - 1]].\
        reshape(layer_size[ind], layer_size[ind - 1])
    data = np.dot(W, data)
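# The same relative-difference expression recurs in every snippet; a
# small helper (hypothetical, not part of the original modules) makes
# the check read more clearly:

def rel_diff(a, b):
    # Symmetric relative difference; well below 1e-9 when the
    # analytic and numerical gradients agree.
    return np.linalg.norm(a - b) / np.linalg.norm(a + b)

# Usage: print str(rel_diff(bp_grad, num_grad)) + " should be less than 1e-9!"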
##==========================================================
# The same layer-wise gradient check, run through dpark. data,
# layer_ind and layer_size are set up exactly as in the plain version
# above; compute_grad / compute_numerical_grad take additional
# arguments here (presumably cost hyper-parameters such as weight
# decay and sparsity settings) plus the dpark context.
##==========================================================
from time import clock

import numpy as np
from dpark import DparkContext

import sparse_autoencoder as sp
import compute_numerical_grad as co

dpark_ctx = DparkContext()
printdiff = []
start = clock()

# Debugging!
for ind in layer_ind:
    theta = sp.initial_parameter(layer_size[ind], layer_size[ind - 1])
    bp_grad = sp.compute_grad(theta, data, layer_size[ind - 1],
                              layer_size[ind], 0.0001, 0.01, 3, dpark_ctx)
    num_grad = co.compute_numerical_grad(sp.compute_cost, theta, data,
                                         layer_size[ind - 1], layer_size[ind],
                                         0.0001, 0.01, 3, dpark_ctx)
    diff = np.linalg.norm(bp_grad - num_grad) / \
        np.linalg.norm(num_grad + bp_grad)
    printdiff.append(diff)
    W = theta[:layer_size[ind] * layer_size[ind - 1]].\
        reshape(layer_size[ind], layer_size[ind - 1])
    data = np.dot(W, data)

print 'Elapsed time: %f s' % (clock() - start)  # start was otherwise unused
for ind in layer_ind:
    print str(printdiff[ind - 1]) + " should be less than 1e-9! Is it?"
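# The numerical check parallelizes well because each coordinate
# perturbation is independent of the others; that is what dpark
# exploits above. Purely as an illustration of the same idea with only
# the standard library (this is not the code's actual dpark path), the
# perturbations can be farmed out with multiprocessing:

from multiprocessing import Pool

def _central_diff(job):
    # One coordinate of the numerical gradient. cost_fun must be a
    # module-level function so the pool workers can pickle it.
    cost_fun, theta, i, eps, extra = job
    e = np.zeros_like(theta)
    e[i] = eps
    return (cost_fun(theta + e, *extra) -
            cost_fun(theta - e, *extra)) / (2.0 * eps)

def parallel_numerical_grad(cost_fun, theta, extra_args, eps=1e-4, n_procs=4):
    pool = Pool(n_procs)
    jobs = [(cost_fun, theta, i, eps, extra_args) for i in xrange(theta.size)]
    grad = np.array(pool.map(_central_diff, jobs))
    pool.close()
    pool.join()
    return grad

# Usage (hypothetical): parallel_numerical_grad(sp.compute_cost, theta,
#                           (data, layer_size[ind - 1], layer_size[ind]))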