def finalTest(size_training, size_test, hidden_layers, lambd, num_iterations):
    """Train a digit-recognition network and report its accuracy.

    The network has 784 input units (28x28 images), the hidden layers given
    by ``hidden_layers`` and 10 output units (one per digit).  The weights
    are initialised with ``randInitializeWeights``, optimised with
    ``fmin_l_bfgs_b`` (``costFunction`` as objective, ``backwards`` as
    gradient) and evaluated with ``predict`` on the training and test images.
    The misclassified test images are finally handed to ``displayData``.

    Args:
        size_training: forwarded to ``read_dataset``; presumably the number
            of training examples to load -- confirm against that helper.
        size_test: forwarded to ``read_dataset``; presumably the number of
            test examples to load -- confirm against that helper.
        hidden_layers: sequence with the number of units of each hidden layer.
        lambd: regularization factor passed to the cost and gradient.
        num_iterations: passed to the optimiser as ``maxfun``, i.e. it caps
            the number of function evaluations.
    """
    print("\nBeginning of the finalTest... \n")

    images_training, labels_training, images_test, labels_test = read_dataset(size_training, size_test)

    input_layer_size = 784  # 28x28 input images of digits
    num_labels = 10         # 10 labels, from 0 to 9 (one label for each digit)
    # list() lets callers pass any sequence (e.g. a tuple) of hidden sizes.
    layers = [input_layer_size] + list(hidden_layers) + [num_labels]

    # Random initial weights, unrolled into the flat vector the optimiser expects.
    Theta = randInitializeWeights(layers)
    nn_weights = unroll_params(Theta)

    res = fmin_l_bfgs_b(
        costFunction,
        nn_weights,
        fprime=backwards,
        args=(layers, images_training, labels_training, num_labels, lambd),
        maxfun=num_iterations,
        factr=1.,
        disp=True,
    )
    # res[0] is the optimised parameter vector; roll it back into matrices.
    Theta = roll_params(res[0], layers)

    print("\nTesting Neural Network... \n")

    pred_training = predict(Theta, images_training)
    print('\nAccuracy on training set: ' + str(mean(labels_training == pred_training) * 100))

    pred = predict(Theta, images_test)
    print('\nAccuracy on test set: ' + str(mean(labels_test == pred) * 100))

    # Display the test images the network got wrong.  The indexes come from
    # comparing against labels_test, so they must index images_test.
    is_correct = (labels_test == pred)
    indexes_false = [i for i, ok in enumerate(is_correct) if not ok]
    if indexes_false:
        displayData(images_test[indexes_false, :])
def backwards(nn_weights, layers, X, y, num_labels, lambd):
    """Compute the regularized gradient of the network by backpropagation.

    The examples are processed one at a time; the per-example gradients are
    accumulated, the regularization term is added and the sum is divided by
    the number of examples.

    Args:
        nn_weights: neural network parameters, unrolled into a vector.
        layers: list with the number of units per layer.
        X: matrix where every row is a training example.
        y: vector with the label of each example.  Each label is used
            directly as a row index of the one-hot matrix, so labels must be
            integers in ``0 .. num_labels - 1``.
        num_labels: number of units in the output layer.
        lambd: regularization factor.

    Returns:
        The gradient, unrolled with ``unroll_params``, divided by the number
        of examples.
    """
    m = X.shape[0]
    num_layers = len(layers)

    # The parameters arrive unrolled into a vector; convert them back into
    # one weight matrix per pair of consecutive layers.
    Theta = roll_params(nn_weights, layers)
    Theta_grad = [zeros(w.shape) for w in Theta]

    # One-hot encode the labels: column t has a single 1 in row y[t].
    yv = zeros((num_labels, m))
    for t in range(m):
        yv[y[t], t] = 1

    # Per-layer scratch storage, overwritten for every example.
    # z[k] holds the pre-activation of layer k + 1 (z[0] is "z2").
    a = [[] for _ in range(num_layers)]
    z = [[] for _ in range(num_layers)]
    delta = [[] for _ in range(num_layers)]

    for t in range(m):
        # Feedforward: prepend the bias unit, then propagate to the next layer.
        a[0] = X[t]
        for i in range(num_layers - 1):
            a[i] = insert(a[i], 0, 1)
            z[i] = Theta[i].dot(transpose(a[i]))
            a[i + 1] = sigmoid(z[i])

        # Output-layer error, then propagate it back through the hidden
        # layers.  The bias column of Theta is dropped because the bias unit
        # receives no error.  There is no delta for the input layer, so the
        # loop stops at i == 2.
        delta[-1] = a[-1] - yv[:, t]
        for i in range(num_layers - 1, 1, -1):
            delta[i - 1] = (transpose(Theta[i - 1][:, 1:]).dot(delta[i])
                            * sigmoidGradient(z[i - 2]))

        # Accumulate the outer product delta_(i+1) * a_i^T for every layer.
        for i in range(num_layers - 1):
            Theta_grad[i] += atleast_2d(delta[i + 1]).T.dot(atleast_2d(a[i]))

    # Regularization: every weight except the bias column (column 0).
    for l in range(num_layers - 1):
        Theta_grad[l][:, 1:] += lambd * Theta[l][:, 1:]

    return unroll_params(Theta_grad) / m
def checkNNGradients(lambd):
    """Compare the backpropagation gradient with a numerical gradient.

    Builds a small network (3 inputs, 5 hidden units, 3 outputs) and 5
    examples, computes the gradient with ``computeNumericalGradient`` and
    with ``backwards``, prints both side by side and prints their relative
    difference.

    Args:
        lambd: regularization factor forwarded to both gradient routines.
    """
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    layers = [input_layer_size, hidden_layer_size, num_labels]

    # Generate the weights and the data with the debug initialiser.
    Theta = [
        debugInitializeWeights(hidden_layer_size, input_layer_size),
        debugInitializeWeights(num_labels, hidden_layer_size),
    ]

    # NOTE(review): the "- 1" presumably compensates for a bias column that
    # debugInitializeWeights adds to its output -- confirm against that helper.
    X = debugInitializeWeights(m, input_layer_size - 1)
    # Labels cycle through 0 .. num_labels - 1.
    y = remainder(arange(m) + 1, num_labels)

    # Unroll parameters
    nn_params = unroll_params(Theta)

    # Compute Numerical Gradient
    numgrad = computeNumericalGradient(nn_params, layers, X, y, num_labels, lambd)

    # Compute Analytical Gradient (BackPropagation)
    truegrad = backwards(nn_params, layers, X, y, num_labels, lambd)

    print(concatenate(([numgrad], [truegrad]), axis=0).transpose())
    print("The above two columns must be very similar.\n(Left-Numerical Gradient, Right-Analytical Gradient (BackPropagation)\n")

    # Relative difference between the two gradients.
    diff = linalg.norm(numgrad - truegrad) / linalg.norm(numgrad + truegrad)
    print("\nNote: If the implementation of the backpropagation is correct, the relative different must be quite small (less that 1e-09).")
    print("Relative difference: " + str(diff) + "\n")
# Layer sizes: the input layer, one user-chosen size per hidden layer, then
# the output layer.
layers = [input_layer_size]
for i in range(num_of_hidden_layers):
    prompt = 'Please select the number nodes for the ' + str(i+1) + ' hidden layers: '
    layers.append(int(raw_input(prompt)))
layers.append(num_labels)

raw_input('\nProgram paused. Press enter to continue!!!')

print("\nInitializing Neural Network Parameters ...\n")

# Initialize the neural network weights for the chosen layer sizes.
Theta = randInitializeWeights(layers)

# Unroll parameters
nn_weights = unroll_params(Theta)

raw_input('\nProgram paused. Press enter to continue!!!')

# ================================ Step 3: Sigmoid  ================================================
#  Sanity check of the sigmoid function on a few fixed inputs before it is
#  used by the neural network.

print("\nEvaluating sigmoid function ...\n")

g = sigmoid(array([-1, -0.5, 0,  0.5, 1]))
# The label lists the actual inputs (the first one is -1, not 1).
print("Sigmoid evaluated at [-1 -0.5 0 0.5 1]:  ")
print(g)
    # NOTE(review): this indented body has no visible `def` line above it.
    # The names it reads (nn_weights, layers, X, y, num_labels, lambd) match
    # the parameters documented below, so the function header appears to be
    # missing from this file -- confirm and restore it.
    #
    # Computes the gradient of the neural network.
    # nn_weights: Neural network parameters (vector)
    # layers: a list with the number of units per layer.
    # X: a matrix where every row is a training example for a handwritten digit image
    # y: a vector with the labels of each instance
    # num_labels: the number of units in the output layer
    # lambd: regularization factor
    # Setup some useful variables
    m = X.shape[0]
    num_layers = len(layers)

    # Roll Params
    # The parameters for the neural network are "unrolled" into the vector
    # nn_weights and need to be converted back into the weight matrices.
    Theta = roll_params(nn_weights, layers)
    # Gradient accumulators, one zero matrix per weight matrix.
    Theta_grad = [zeros(w.shape) for w in Theta]

    # One-hot encode the labels: yv has one column per example and a single
    # 1 in row y[i].  Each label is used directly as a row index, so labels
    # are expected to be integers in 0 .. num_labels - 1.
    yv = zeros((num_labels, m))
    for i in range(m):
	yv[y[i],i] = 1

    # Feedforward over all examples at once: a row of ones (bias) is stacked
    # on top of X transposed, so every column of `a` is one example.
    # NOTE(review): A and Z are created empty and nothing in the visible code
    # appends to them, yet A[j], A[0] and Z[j] are indexed further down.  The
    # loop presumably should store each layer's input in A and each
    # pre-activation in Z -- confirm and restore the missing statements.
    A = []
    a = ones(X.shape[0])
    a = vstack((a,X.transpose()))
    Z = []
    # The bias row is added after every layer except the output layer.
    for i in range(num_layers-1):
	z = dot(Theta[i],a)
	a = sigmoid(z)
	if i != num_layers-2:
	    a = vstack((ones(a.shape[1]),a))  
    # h: output-layer activations, transposed to one row per example
    h = a.transpose()

    # delta for the last layer
    delta = h - yv.transpose()
    # Accumulate the gradients from the last weight matrix down to index 1;
    # index 0 is handled after the loop.
    for j in range(num_layers-2,0,-1):
	Theta_grad[j] = Theta_grad[j] + dot(delta.transpose(),A[j])
	# delta for the current layer (the first column of Theta is removed)
	tmp = dot(Theta[j][:,1:].transpose(),delta.transpose())
	tmp = tmp.transpose()
	tmp_matrix = zeros(tmp.shape)
	for i in range(m):
	    tmp_matrix[i] = sigmoidGradient(Z[j].transpose()[i])
	delta = tmp_matrix * tmp
    Theta_grad[0] = Theta_grad[0] + dot(delta.transpose(),A[0])

    # Averaging and regularization: every entry is divided by m; entries
    # outside the first column (k >= 1) also get lambd/m * Theta added.
    # NOTE(review): the rest of this file uses Python 2 print statements; under
    # Python 2, lambd/m is integer division when both are ints -- confirm that
    # lambd is always passed as a float.
    for i in range(num_layers-1):
	for j in range((Theta_grad[i].shape)[0]):
	    for k in range((Theta_grad[i].shape)[1]):
		Theta_grad[i][j,k] = Theta_grad[i][j,k]/m
		if k >=1:
			Theta_grad[i][j,k] = Theta_grad[i][j,k] + lambd/m*Theta[i][j,k]
    # Unroll Params
    Theta_grad = unroll_params(Theta_grad)

    return Theta_grad
def backwards(nn_weights, layers, X, y, num_labels, lambd):
    """Compute the regularized gradient of the network by backpropagation.

    All examples are processed at once with matrix operations (one row per
    example).

    Args:
        nn_weights: neural network parameters, unrolled into a vector.
        layers: list with the number of units per layer.
        X: matrix where every row is a training example.
        y: vector with the label of each example.  Each label is used
            directly as a column index of the one-hot matrix, so labels must
            be integers in ``0 .. num_labels - 1``.
        num_labels: number of units in the output layer.
        lambd: regularization factor.

    Returns:
        The gradient (already divided by the number of examples), unrolled
        with ``unroll_params``.
    """
    m = X.shape[0]
    num_layers = len(layers)

    # The parameters arrive unrolled into a vector; convert them back into
    # one weight matrix per pair of consecutive layers.
    Theta = roll_params(nn_weights, layers)
    Theta_grad = [zeros(w.shape) for w in Theta]

    # One-hot encode the labels: row i has a single 1 in column y[i].
    yv = zeros((m, num_labels))
    for i in range(m):
        yv[i, y[i]] = 1

    # Step 1: Feedforward, keeping every layer's state for the backward pass.
    # A[i] is the activation of layer i with the bias column prepended
    # (assumes addColumnOne prepends a column of ones -- confirm).
    # Z[i] is the pre-activation of layer i; the input layer has none, so
    # Z[0] is only a placeholder that keeps the indexes aligned with A.
    A = [addColumnOne(X)]
    Z = [None]
    for i in range(num_layers - 1):
        z = A[i].dot(Theta[i].T)
        Z.append(z)
        A.append(addColumnOne(sigmoid(z)))

    # Step 2: Backpropagation.  Start with the output-layer error and walk
    # back to the first hidden layer.  The first column is removed each time
    # because the bias unit receives no error.
    d = removeFirstColumn(A[-1]) - yv
    delta = [d]
    for i in range(num_layers - 2, 0, -1):
        d = removeFirstColumn(d.dot(Theta[i])) * sigmoidGradient(Z[i])
        delta.append(d)
    # The deltas were collected from the last layer backwards; reverse them so
    # that delta[i] is the error of layer i + 1 (no delta for the input layer).
    delta.reverse()

    # Step 3: Gradients.
    for i in range(num_layers - 1):
        Theta_grad[i] += delta[i].T.dot(A[i])
        # No regularization on the bias weights (column 0).
        Theta_grad[i][:, 1:] += lambd * Theta[i][:, 1:]
        Theta_grad[i] /= m

    # Unroll Params
    return unroll_params(Theta_grad)
def backwards(nn_weights, layers, X, y, num_labels, lambd):
    """Compute the regularized gradient of the network by backpropagation.

    The examples are processed one at a time and their contributions, each
    divided by the number of examples, are accumulated.

    Args:
        nn_weights: neural network parameters, unrolled into a vector.
        layers: list with the number of units per layer.
        X: matrix where every row is a training example.
        y: vector with the label of each example.  Each label is used
            directly as a row index of the one-hot matrix, so labels must be
            integers in ``0 .. num_labels - 1``.
        num_labels: number of units in the output layer.
        lambd: regularization factor.

    Returns:
        The gradient, unrolled with ``unroll_params``.
    """
    m = X.shape[0]
    num_layers = len(layers)

    # The parameters arrive unrolled into a vector; convert them back into
    # one weight matrix per pair of consecutive layers.
    Theta = roll_params(nn_weights, layers)
    Theta_grad = [np.zeros(w.shape) for w in Theta]

    # One-hot encode the labels: column i has a single 1 in row y[i].
    yv = np.zeros((num_labels, m))
    for i in range(m):
        yv[y[i], i] = 1

    for i in range(m):
        # a_values[k]: activation of layer k with the bias unit prepended.
        # z_values[k]: pre-activation of layer k + 1.
        a = np.append([1], X[i, :])
        a_values = [a]
        z_values = []

        # Feedforward, recording every layer for the backward pass.
        for k in range(num_layers - 1):
            z = np.dot(Theta[k], a)
            z_values.append(z)
            a = np.append([1], sigmoid(z))
            a_values.append(a)

        # Error of the output layer (the prepended bias entry is dropped).
        delta_layer = a[1:] - yv[:, i]
        # Outer product of the error with the last hidden layer's activation.
        Theta_grad[-1] += np.outer(delta_layer, a_values[-2]) / m

        # Descending loop over the hidden layers.
        for h in range(num_layers - 2):
            # Error of the (num_layers - 2 - h)-th hidden layer.  The entry
            # that corresponds to the bias unit is not taken into account.
            back_error = np.dot(Theta[-1 - h].T, delta_layer)[1:]
            delta_layer = back_error * sigmoidGradient(z_values[-2 - h])
            Theta_grad[-2 - h] += np.outer(delta_layer, a_values[-3 - h]) / m

    for h in range(num_layers - 1):
        # The terms corresponding to the bias factors are not regularized.
        Theta_grad[h][:, 1:] += lambd * Theta[h][:, 1:] / m

    # Unroll Params
    return unroll_params(Theta_grad)
