Exemplo n.º 1
def output(partId):
    """Return the formatted answer string for the part named by ``partId``.

    ``partId`` is compared against the strings '1' to '5'.  Parts '1' and
    '2' report the cost only, part '3' reports ``sigmoidGradient`` of a
    fixed matrix, and parts '4' and '5' report the cost followed by the
    gradient.  Any other value yields ``None``.
    """
    # Deterministic fixtures built from sin/cos of integer ranges.
    grad_input = reshape(3 * sin(arange(1, 31, 1)), (3, 10), order='F')
    features = reshape(sin(arange(1, 33)), (16, 2), order='F') / 5
    labels = 1 + arange(1, 17) % 4
    weights_1 = sin(reshape(arange(1, 25, 2), (4, 3), order='F'))
    weights_2 = cos(reshape(arange(1, 41, 2), (4, 5), order='F'))
    unrolled = hstack([weights_1.ravel('F'), weights_2.ravel('F')])

    if partId == '3':
        return sprintf('%0.5f ', sigmoidGradient(grad_input))

    # Parts 1/4 run without regularisation, parts 2/5 with lambda = 1.5.
    if partId == '1' or partId == '4':
        strength = 0
    elif partId == '2' or partId == '5':
        strength = 1.5
    else:
        return None

    J, grad = nnCostFunction(unrolled, 2, 4, 4, features, labels, strength)
    answer = sprintf('%0.5f ', J)
    # Only parts 4 and 5 append the unrolled gradient to the cost.
    if partId == '4' or partId == '5':
        answer = answer + sprintf('%0.5f ', grad)
    return answer
Exemplo n.º 2
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels,
                   X, y, lambda_):
    """Compute the cost and gradient of a two-layer neural network.

    The network weights arrive "unrolled" into the vector ``nn_params`` and
    are converted back into the weight matrices ``Theta1`` and ``Theta2``
    using column-major ('F') order.

    Args:
        nn_params: 1-D array holding Theta1 followed by Theta2, each
            flattened in column-major order.
        input_layer_size: number of input units (bias excluded).
        hidden_layer_size: number of hidden units (bias excluded).
        num_labels: number of output units.
        X: training examples, one per row.
        y: class label of each training example.
        lambda_: regularisation strength.

    Returns:
        Tuple ``(J, grad)`` where ``J`` is the regularised cost and ``grad``
        is the "unrolled" (column-major) vector of partial derivatives with
        respect to Theta1 and Theta2.
    """
    # Obtain Theta1 and Theta2 back from nn_params.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = nn_params[:split].reshape(
        hidden_layer_size, input_layer_size + 1, order='F')
    Theta2 = nn_params[split:].reshape(
        num_labels, hidden_layer_size + 1, order='F')

    # Setup some useful variables.
    m = len(X)
    # One-hot encode the labels.  DataFrame.as_matrix() was removed in
    # pandas 1.0, so convert through NumPy instead; the float dtype also
    # covers pandas versions whose dummies are boolean.
    # NOTE(review): get_dummies creates one column per distinct label found
    # in y, so every class must occur in y for the shapes to match a3.
    y = np.asarray(pd.get_dummies(y), dtype=float)

    # Feedforward the neural network (a bias column is prepended per layer).
    a1 = np.c_[np.ones(m), X]

    z2 = a1 @ Theta1.T
    a2 = np.c_[np.ones(len(z2)), sigmoid(z2)]

    z3 = a2 @ Theta2.T
    a3 = sigmoid(z3)

    # Cross-entropy cost averaged over the examples.
    J = -np.mean(np.sum(y * np.log(a3) + (1 - y) * np.log(1 - a3), axis=1))

    # Regularised cost; the bias column of each Theta is not penalised.
    J += lambda_ * (np.sum(np.square(Theta1[:, 1:])) +
                    np.sum(np.square(Theta2[:, 1:]))) / (2 * m)

    # Backpropagation: output error, then hidden error without its bias term.
    del3 = a3 - y
    delta2 = del3.T @ a2

    del2 = (del3 @ Theta2)[:, 1:] * sigmoidGradient(z2)
    delta1 = del2.T @ a1

    # Unregularised gradients.
    Theta1_grad = delta1 / m
    Theta2_grad = delta2 / m

    # Regularised gradients (bias column left untouched).
    Theta1_grad[:, 1:] += lambda_ * Theta1[:, 1:] / m
    Theta2_grad[:, 1:] += lambda_ * Theta2[:, 1:] / m

    # Unroll the gradients in the same column-major order used above.
    grad = np.r_[Theta1_grad.flatten(order='F'),
                 Theta2_grad.flatten(order='F')]

    return J, grad
Exemplo n.º 3
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, \
 num_labels, X, y, lambda_reg):
    """Compute the cost and gradient of a two-layer classification network.

    The parameters of the network are "unrolled" into the vector
    ``nn_params`` and are converted back into the weight matrices ``Theta1``
    and ``Theta2`` using column-major ('F') order.

    Args:
        nn_params: 1-D array holding Theta1 followed by Theta2, each
            flattened in column-major order.
        input_layer_size: number of input units (bias excluded).
        hidden_layer_size: number of hidden units (bias excluded).
        num_labels: number of output units.
        X: training examples, one per row.
        y: integer label of each example; label ``k`` sets column ``k - 1``
            of the one-hot matrix.
        lambda_reg: regularisation strength.

    Returns:
        Tuple ``(J, grad)``: the regularised cost and the "unrolled"
        (column-major) vector of partial derivatives.
    """
    # Reshape nn_params back into the parameters Theta1 and Theta2.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = np.reshape(nn_params[:split],
                        (hidden_layer_size, input_layer_size + 1), order='F')
    Theta2 = np.reshape(nn_params[split:],
                        (num_labels, hidden_layer_size + 1), order='F')

    # Setup some useful variables.
    m = len(X)

    # Forward pass for all examples at once; a bias column is prepended to
    # the input layer and to the hidden layer.
    a1 = np.column_stack((np.ones((m, 1)), X))
    z2 = np.dot(a1, Theta1.T)
    a2 = np.column_stack((np.ones((m, 1)), s.sigmoid(z2)))
    a3 = s.sigmoid(np.dot(a2, Theta2.T))

    # Recode the labels as an m x num_labels matrix of 0s and 1s.
    labels = np.asarray(y).ravel().astype(int)
    y_matrix = np.zeros((m, num_labels))
    y_matrix[np.arange(m), labels - 1] = 1

    # Cross-entropy cost summed over examples and output units.
    cost = np.sum(y_matrix * np.log(a3) + (1 - y_matrix) * np.log(1 - a3))
    J = -(1.0 / m) * cost

    # The first column of each Theta holds the bias weights, which are not
    # regularised, so it is excluded from the penalty.
    sumOfTheta1 = np.sum(Theta1[:, 1:] ** 2)
    sumOfTheta2 = np.sum(Theta2[:, 1:] ** 2)
    J = J + ((lambda_reg / (2.0 * m)) * (sumOfTheta1 + sumOfTheta2))

    # Backpropagation, vectorised over the examples.
    # NOTE(review): assumes sg.sigmoidGradient works element-wise on a 2-D
    # array, as s.sigmoid does above -- confirm against its implementation.
    delta3 = a3 - y_matrix
    delta2 = np.dot(delta3, Theta2[:, 1:]) * sg.sigmoidGradient(z2)

    # Summing the per-example outer products is a single matrix product.
    Theta1_grad = np.dot(delta2.T, a1) / float(m)
    Theta2_grad = np.dot(delta3.T, a2) / float(m)

    # Only regularise for j >= 1, so skip the first (bias) column.
    Theta1_grad[:, 1:] += (float(lambda_reg) / m) * Theta1[:, 1:]
    Theta2_grad[:, 1:] += (float(lambda_reg) / m) * Theta2[:, 1:]

    # Unroll gradients in the same column-major order used for reshaping.
    grad = np.concatenate((Theta1_grad.reshape(Theta1_grad.size, order='F'),
                           Theta2_grad.reshape(Theta2_grad.size, order='F')))

    return J, grad
Exemplo n.º 4
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, Lambda):
    """Compute the cost and gradient of a two-layer neural network.

    The parameters of the network are "unrolled" into the vector
    ``nn_params`` and are converted back into the weight matrices.

    Args:
        nn_params: 1-D array holding theta1 followed by theta2; each block
            is the row-major flattening of the transposed matrix.
        input_layer_size: number of input units (bias excluded).
        hidden_layer_size: number of hidden units (bias excluded).
        num_labels: number of output units.
        X: 2-D array of training examples, one per row.
        y: integer label of each example; label ``k`` sets row ``k - 1`` of
            the one-hot matrix.
        Lambda: regularisation strength.

    Returns:
        Tuple ``(J, grad)``: the regularised cost and the "unrolled" vector
        of partial derivatives, laid out like ``nn_params``.
    """
    # Obtain theta1 and theta2 back from nn_params.  theta1 follows the same
    # reshape-then-transpose pattern as theta2, matching the `.T.ravel()`
    # unrolling at the end of this function.
    split = hidden_layer_size * (input_layer_size + 1)
    theta1 = nn_params[0:split].reshape((input_layer_size + 1), hidden_layer_size).T
    theta2 = nn_params[split:].reshape((hidden_layer_size + 1), num_labels).T

    # Setup some useful variables.
    m, _ = X.shape

    # Construct a num_labels x m "y" matrix with a single 1 per column.
    # NOTE(review): per the original comment, the digit "0" is expected to
    # be encoded as label 10 (1 in the 10th spot) -- confirm with the data.
    labels = np.asarray(y).ravel().astype(int)
    y_matrix = np.zeros((num_labels, m))
    y_matrix[labels - 1, np.arange(m)] = 1

    # Feedforward; activations are kept column-per-example after a1.
    a1 = np.ones((X.shape[0], X.shape[1] + 1))
    a1[:, 1:] = X
    z2 = np.dot(theta1, a1.T)
    a2 = np.ones((z2.shape[0] + 1, a1.shape[0]))
    a2[1:, :] = sigmoid(z2)

    z3 = np.dot(theta2, a2)
    a3 = sigmoid(z3).T

    # Compute cost.  1.0 / m keeps the division exact even where `/` on two
    # integers truncates (Python 2).
    inner1 = (y_matrix.T * np.log(a3))
    inner2 = (1 - y_matrix.T) * np.log(1 - a3)
    J = (1.0 / m) * np.sum(-inner1 - inner2)

    # Cost regularisation (bias columns excluded).
    reg = (Lambda / (2.0 * m)) * (np.sum(np.square(theta1[:, 1:])) + np.sum(np.square(theta2[:, 1:])))
    J = J + reg

    # Gradients.
    d3 = a3 - y_matrix.T
    d2 = theta2[:, 1:].T.dot(d3.T) * sigmoidGradient(z2)

    delta1 = d2.dot(a1)
    delta2 = d3.T.dot(a2.T)

    # Gradient regularisation (bias columns excluded).
    theta1_grad = delta1 / float(m)
    theta1_grad[:, 1:] += (theta1[:, 1:] * Lambda) / float(m)

    theta2_grad = delta2 / float(m)
    theta2_grad[:, 1:] += (theta2[:, 1:] * Lambda) / float(m)

    # Unroll gradient, mirroring the layout used to read nn_params.
    grad = np.hstack((theta1_grad.T.ravel(), theta2_grad.T.ravel()))

    return J, grad
Exemplo n.º 5
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size,
                   num_labels, X, y, lamda):
    """Compute the cost and gradient of a two-layer neural network.

    NOTE(review): the tail of the signature, the construction of ``a1`` and
    ``Y`` and the final unrolling of ``grad`` were missing from the original
    text.  They are reconstructed here from how the body uses these names;
    confirm the parameter order against the callers.

    Args:
        nn_params: 1-D array holding Theta1 followed by Theta2, each
            flattened in row-major order.
        input_layer_size: number of input units (bias excluded).
        hidden_layer_size: number of hidden units (bias excluded).
        num_labels: number of output units.
        X: training examples, one per row.
        y: labels with values 1..num_labels.
        lamda: regularisation strength.

    Returns:
        List ``[J, grad]``: the regularised cost and the unrolled gradient.
    """
    import numpy as np
    import sigmoid as sg
    import sigmoidGradient as sG

    # Restructure nn_params back into Theta1 and Theta2.  Python uses
    # 0-based indexing, unlike MATLAB which is 1-based.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = np.reshape(nn_params[0:split],
                        (hidden_layer_size, input_layer_size + 1))
    Theta2 = np.reshape(nn_params[split:len(nn_params)],
                        (num_labels, hidden_layer_size + 1))

    # Setup some useful variables.
    m = np.size(X, 0)

    # 1. Feedforward.  Examples are stored one per column from here on.
    a1 = np.r_[np.ones((1, m), dtype=float), np.transpose(X)]
    z2 = np.dot(Theta1, a1)
    a2 = np.r_[np.ones((1, m), dtype=float), sg.sigmoid(z2)]
    a3 = sg.sigmoid(np.dot(Theta2, a2))

    # Explode y into num_labels rows with Y[i] := (i == y).
    labels = np.asarray(y).ravel().astype(int)
    Y = np.zeros((num_labels, m), dtype=float)
    Y[labels - 1, np.arange(m)] = 1.0

    J = (1.0 / m) * np.sum((-Y * np.log(a3)) - ((1 - Y) * np.log(1 - a3)))
    # Add regularized error.  Drop the bias terms in the 1st columns.
    J = J + (lamda / (2.0 * m)) * np.sum(Theta1[:, 1:] ** 2)
    J = J + (lamda / (2.0 * m)) * np.sum(Theta2[:, 1:] ** 2)

    # 2. Backpropagate to get gradient information.
    d3 = a3 - Y
    d2 = np.dot(Theta2[:, 1:].T, d3) * sG.sigmoidGradient(z2)

    # 1.0 / m avoids integer division, which would zero the gradients
    # wherever `/` on two integers truncates (Python 2).
    Theta2_grad = (1.0 / m) * np.dot(d3, a2.T)
    Theta1_grad = (1.0 / m) * np.dot(d2, a1.T)

    # Add gradient regularization (bias columns excluded).
    Theta2_grad[:, 1:] += (float(lamda) / m) * Theta2[:, 1:]
    Theta1_grad[:, 1:] += (float(lamda) / m) * Theta1[:, 1:]

    # Unroll in the same row-major order used to reshape nn_params.
    grad = np.concatenate((Theta1_grad.ravel(), Theta2_grad.ravel()))

    return [J, grad]
Exemplo n.º 6
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels,
                   X, y, lambda_param):
    """Compute the cost and gradient of a two-layer neural network.

    The parameters for the neural network are "unrolled" into the vector
    ``nn_params`` and are converted back into the weight matrices.

    NOTE(review): the statement accumulating the per-example cost and the
    statement assembling ``grad`` were missing from the original text and
    are reconstructed here.

    Args:
        nn_params: 1-D array holding Theta1 followed by Theta2, each
            flattened in row-major order.
        input_layer_size: number of input units (bias excluded).
        hidden_layer_size: number of hidden units (bias excluded).
        num_labels: number of output units.
        X: 2-D array of training examples, one per row.
        y: integer label of each example; label ``k`` sets position
            ``k - 1`` of the one-hot vector.
        lambda_param: regularisation strength.

    Returns:
        Tuple ``(J, grad)``: the regularised cost as a scalar and the
        "unrolled" vector of partial derivatives as a 1-D array.
    """
    import numpy as np
    from sigmoid import sigmoid
    from sigmoidGradient import sigmoidGradient

    # Reshape nn_params back into the parameters Theta1 and Theta2,
    # the weight matrices for our 2 layer neural network.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = np.reshape(nn_params[0:split],
                        (hidden_layer_size, input_layer_size + 1))
    Theta2 = np.reshape(nn_params[split:],
                        (num_labels, hidden_layer_size + 1))

    # Setup some useful variables.
    m = X.shape[0]
    # A float reciprocal avoids integer division (`-1 / m` is -1 for m > 1
    # wherever `/` on two integers floors, i.e. Python 2).
    inv_m = 1.0 / m

    J = 0.0
    Theta1_grad = np.zeros(Theta1.shape)
    Theta2_grad = np.zeros(Theta2.shape)

    for i in range(m):
        # Part 1: feedforward one example, prepending the bias unit.
        act_1 = np.append(1, X[i])
        z_2 = np.dot(Theta1, act_1)
        act_2 = np.append(1, sigmoid(z_2))
        h = sigmoid(np.dot(Theta2, act_2))

        # Binary vector of 1's and 0's for this example's label.
        y_vect = np.zeros(num_labels)
        y_vect[y[i] - 1] = 1

        J += -inv_m * (np.dot(y_vect, np.log(h)) +
                       np.dot(1 - y_vect, np.log(1 - h)))

        # Part 2: backpropagation.
        # Delta at the output layer.
        delta_3 = h - y_vect
        # Delta for the hidden layer; Theta2[:, 1:] drops the bias unit.
        delta_2 = np.dot(np.transpose(Theta2[:, 1:]),
                         delta_3) * sigmoidGradient(z_2)
        # Accumulate the gradients (DELTA).
        Theta1_grad += np.outer(delta_2, act_1)
        Theta2_grad += np.outer(delta_3, act_2)

    # Part 3: regularization of the gradients.  The first column (bias,
    # j = 0) is not regularised.
    reg_scale = float(lambda_param) / m
    capital_delta1 = inv_m * Theta1_grad
    capital_delta1[:, 1:] += reg_scale * Theta1[:, 1:]
    capital_delta2 = inv_m * Theta2_grad
    capital_delta2[:, 1:] += reg_scale * Theta2[:, 1:]

    # Regularization term of the cost, bias columns excluded.
    regul_term = float(lambda_param) / (2 * m) * (
        np.sum(np.power(Theta1[:, 1:], 2)) +
        np.sum(np.power(Theta2[:, 1:], 2)))

    J = J + regul_term

    # Unroll gradients in the same row-major order used for reshaping.
    grad = np.concatenate((capital_delta1.flatten(),
                           capital_delta2.flatten()))

    return J, grad
Exemplo n.º 7
# `raw_input` only exists on Python 2; fall back to `input` on Python 3.
try:
    _pause = raw_input
except NameError:
    _pause = input

# The printed labels now show the leading -1 that is actually evaluated.
g = sigmoid(array([-1, -0.5, 0,  0.5, 1]))
print("Sigmoid evaluated at [-1 -0.5 0 0.5 1]:  ")
print(g)

_pause('\nProgram paused. Press enter to continue!!!')

# ================================ Step 4: Sigmoid Gradient ================================
#  Before you start implementing the neural network, you will first
#  implement the gradient for the sigmoid function. You should complete the
#  code in the sigmoidGradient.m file.

print("\nEvaluating Sigmoid Gradient function ...\n")

g = sigmoidGradient(array([-1, -0.5, 0,  0.5, 1]))
print("Sigmoid gradient evaluated at [-1 -0.5 0 0.5 1]:  ")
print(g)

_pause('\nProgram paused. Press enter to continue!!!')

# ================================ Step 5: Implement Feedforward (Cost Function) ================================

print("\nChecking Cost Function without Regularization (Feedforward) ...\n")

lambd = 0.0

# NOTE(review): no cost is computed or printed between setting `lambd` and
# the expected-value message below -- the call appears to be missing here.
print('This value should be about 2.09680198349')

_pause('\nProgram paused. Press enter to continue!!!')
Exemplo n.º 8
def backwards(nn_weights, layers, X, y, num_labels, lambd):
    """Compute the regularised gradient of the neural network.

    NOTE(review): the original text indexed the lists ``A`` and ``Z``
    without ever filling them.  The bookkeeping below is reconstructed from
    the shapes the backward loop requires; confirm against the intended
    design.

    Args:
        nn_weights: neural network parameters (vector).
        layers: a list with the number of units per layer.
        X: a matrix where every row is a training example.
        y: a vector with the label of each instance; each label is used
            directly as a row index of the one-hot matrix.
        num_labels: the number of units in the output layer.
        lambd: regularization factor.

    Returns:
        The gradients of all layers, unrolled by ``unroll_params``.
    """
    # Setup some useful variables.
    m = X.shape[0]
    num_layers = len(layers)

    # The parameters for the neural network are "unrolled" into the vector
    # nn_weights and need to be converted back into the weight matrices.
    Theta = roll_params(nn_weights, layers)
    Theta_grad = [zeros(w.shape) for w in Theta]

    # Map the label vector into a binary matrix of 1's and 0's.
    yv = zeros((num_labels, m))
    for i in range(m):
        yv[y[i], i] = 1

    # Forward pass.
    # A[i]: activations fed into Theta[i], bias included, one example per row.
    # Z[i]: pre-activations that produced A[i], one example per column
    #       (Z[0] is a placeholder: the input layer has none).
    A = []
    Z = []
    a = vstack((ones(X.shape[0]), X.transpose()))
    z = None
    for i in range(num_layers - 1):
        A.append(a.transpose())
        Z.append(z)
        z = dot(Theta[i], a)
        a = sigmoid(z)
        if i != num_layers - 2:
            # Prepend the bias unit for every layer except the output.
            a = vstack((ones(a.shape[1]), a))
    h = a.transpose()

    # Delta for the last layer.
    delta = h - yv.transpose()
    # Walk the layers backwards, accumulating gradients.
    for j in range(num_layers - 2, 0, -1):
        Theta_grad[j] = Theta_grad[j] + dot(delta.transpose(), A[j])
        # Delta for the current layer (the bias column of Theta is removed).
        tmp = dot(Theta[j][:, 1:].transpose(), delta.transpose())
        tmp = tmp.transpose()
        tmp_matrix = zeros(tmp.shape)
        for i in range(m):
            tmp_matrix[i] = sigmoidGradient(Z[j].transpose()[i])
        delta = tmp_matrix * tmp
    Theta_grad[0] = Theta_grad[0] + dot(delta.transpose(), A[0])

    # Average over the examples and regularise every column but the bias.
    for i in range(num_layers - 1):
        Theta_grad[i] = Theta_grad[i] / float(m)
        Theta_grad[i][:, 1:] += (float(lambd) / m) * Theta[i][:, 1:]

    # Unroll Params.
    Theta_grad = unroll_params(Theta_grad)

    return Theta_grad
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels,
                   X, y, l):
    """Compute the cost and gradient of a two-layer classification network.

    The parameters for the neural network are "unrolled" into the vector
    ``nn_params`` and are converted back into the weight matrices.

    Args:
        nn_params: array holding Theta1 followed by Theta2, each flattened
            in row-major order.
        input_layer_size: number of input units (bias excluded).
        hidden_layer_size: number of hidden units (bias excluded).
        num_labels: number of output units.
        X: 2-D array of training examples, one per row.
        y: label of each example, with values 1..num_labels.
        l: regularisation strength.

    Returns:
        Tuple ``(J, grad)``: the regularised cost and the "unrolled"
        (row-major) vector of partial derivatives.
    """
    # Reshape nn_params back into the parameters Theta1 and Theta2, the
    # weight matrices for our 2 layer neural network.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = np.reshape(nn_params[0:split],
                        (hidden_layer_size, input_layer_size + 1))
    Theta2 = np.reshape(nn_params[split:],
                        (num_labels, hidden_layer_size + 1))

    # Part 1: feedforward the neural network and compute the cost.
    m = X.shape[0]
    a1 = np.hstack((np.ones((m, 1)), X))
    z2 = a1.dot(Theta1.T)
    a2 = np.hstack((np.ones((m, 1)), sigmoid(z2)))
    h = sigmoid(a2.dot(Theta2.T))

    # One row per training sample with a single 1 at the label position,
    # e.g. label 5 of 10 -> [0 0 0 0 1 0 0 0 0 0].  The column count follows
    # num_labels rather than a hard-coded 10.
    labels = np.asarray(y).ravel()
    yVec = (labels[:, None] == np.arange(1, num_labels + 1)).astype(int)

    # Cost function; 1.0 / m keeps the division exact where `/` on two
    # integers truncates (Python 2).
    cost = -yVec * np.log(h) - (1 - yVec) * np.log(1 - h)
    J = (1.0 / m) * np.sum(cost)

    # Regularisation of the cost; bias columns are excluded.
    theta1ExcludingBias = Theta1[:, 1:]
    theta2ExcludingBias = Theta2[:, 1:]
    reg = 1.0 * l / (2 * m) * (np.sum(np.square(theta1ExcludingBias)) +
                               np.sum(np.square(theta2ExcludingBias)))

    J = J + reg

    # Part 2: backpropagation.
    d3 = h - yVec
    D2 = d3.T.dot(a2)

    # Drop the bias column of the propagated error before applying the
    # sigmoid gradient of the hidden pre-activations.
    d2 = d3.dot(Theta2)[:, 1:] * sigmoidGradient(z2)
    D1 = d2.T.dot(a1)

    Theta_1_grad = 1.0 * D1 / m
    Theta_2_grad = 1.0 * D2 / m

    # Part 3: regularise the gradients (bias columns excluded).
    Theta_1_grad[:, 1:] = Theta_1_grad[:, 1:] + 1.0 * l / m * Theta1[:, 1:]
    Theta_2_grad[:, 1:] = Theta_2_grad[:, 1:] + 1.0 * l / m * Theta2[:, 1:]

    # Unroll gradients.
    grad = np.hstack((Theta_1_grad.ravel(), Theta_2_grad.ravel()))

    return J, grad
Exemplo n.º 10
# Compute the value of the cost function
J = computeCost(X, y, num_labels, Theta1, Theta2, lam)

# NOTE(review): the print(...) wrapper and the .format(J) call around this
# message were missing from the original text and are reconstructed here.
print(
    'Значение стоимостной функции с регуляризацией для загруженных параметров модели: {:.4f}'
    .format(J))

input('Программа остановлена. Нажмите Enter для продолжения ... \n')

# ====== Part 5. Computing the derivative of the sigmoid function ======

print('Часть 5. Вычисление производной сигмоидной функции')

z = np.array([1, -0.5, 0, 0.5, 1])
g = sigmoidGradient(z)

print('Значения производной сигмоидной функции для [1, -0.5, 0, 0.5, 1]:')
# The argument list was cut off after g[3]; the five placeholders in the
# format string require g[4] as the last argument.
print('{:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format(g[0], g[1], g[2], g[3],
                                                  g[4]))

input('Программа остановлена. Нажмите Enter для продолжения ... \n')

# ============== Part 6. Parameter initialisation ===============

print('Часть 6. Инициализация параметров')

initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)

input('Программа остановлена. Нажмите Enter для продолжения ... \n')
Exemplo n.º 11
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels,
                   X, y, lambda_value):
    """Regularised cost and gradient of a two-layer classification network.

    Parameters
    ----------
    nn_params : array
        Both weight matrices unrolled (column-major) into one vector; the
        first ``hidden_layer_size * (input_layer_size + 1)`` entries belong
        to Theta1, the remainder to Theta2.
    input_layer_size, hidden_layer_size, num_labels : int
        Number of units in the input, hidden and output layer.
    X : array, one training example per row (no bias column).
    y : array of labels; label ``k`` is matched against output unit ``k - 1``
        (labels are compared with ``1..num_labels``).  A label outside that
        range yields an all-zero target row.
    lambda_value : float
        Regularisation strength; bias columns are not regularised.

    Returns
    -------
    J : array of shape (1,)
        The cost, kept as a one-element array as in the previous
        implementation.
    grad : array of shape (n, 1)
        Partial derivatives, unrolled column-major in the same layout as
        ``nn_params``.

    NOTE(review): ``sigmoid`` and ``sigmoidGradient`` are defined outside
    this block and are assumed to work element-wise on 2-D arrays - confirm.
    """
    # Roll the flat vector back into the weight matrices.  Column-major order
    # mirrors the order='F' unrolling of the gradients at the end.
    split = hidden_layer_size * (input_layer_size + 1)
    params = np.ravel(nn_params)
    Theta1 = np.reshape(params[:split],
                        (hidden_layer_size, input_layer_size + 1),
                        order='F')
    Theta2 = np.reshape(params[split:],
                        (num_labels, hidden_layer_size + 1),
                        order='F')

    m = np.shape(X)[0]

    # Feed-forward, done once and shared by the cost and the gradient.
    a1 = np.hstack((np.ones((m, 1)), X))
    z2 = np.dot(a1, np.transpose(Theta1))
    a2 = np.hstack((np.ones((m, 1)), sigmoid(z2)))
    a3 = sigmoid(np.dot(a2, np.transpose(Theta2)))

    # One-hot targets built by comparison, so the cost and the gradient use
    # exactly the same target matrix (indexing an identity matrix with
    # ``y - 1`` would silently wrap label 0 around to the last class).
    Y = (np.reshape(y, (-1, 1)) == np.arange(1, num_labels + 1)).astype(float)

    # Logistic cost summed over all output units, averaged over examples.
    J_no_regularisation = -np.sum(
        Y * np.log(a3) + (1.0 - Y) * np.log(1.0 - a3)) / m

    # Regularisation skips the first (bias) column of each matrix.
    RegSum1 = np.sum(np.power(Theta1[:, 1:], 2))
    RegSum2 = np.sum(np.power(Theta2[:, 1:], 2))
    J = np.atleast_1d(J_no_regularisation +
                      (lambda_value / (2.0 * m)) * (RegSum1 + RegSum2))

    # Back-propagation over the whole batch.
    sigma3 = a3 - Y
    # Drop the bias column of the propagated error before applying g'(z2).
    sigma2 = np.dot(sigma3, Theta2)[:, 1:] * sigmoidGradient(z2)

    Theta1_grad = np.dot(np.transpose(sigma2), a1) / m
    Theta2_grad = np.dot(np.transpose(sigma3), a2) / m

    # Add the regularisation gradient to the non-bias columns only.
    Theta1_grad[:, 1:] += (lambda_value / float(m)) * Theta1[:, 1:]
    Theta2_grad[:, 1:] += (lambda_value / float(m)) * Theta2[:, 1:]

    # Unroll gradients into a single column vector.
    grad = np.expand_dims(
        np.hstack((np.ravel(Theta1_grad, order='F'),
                   np.ravel(Theta2_grad, order='F'))),
        axis=1)

    return J, grad
Exemplo n.º 12
# Vectorised cost / gradient of a two-layer network; `returnType` selects
# whether the cost, the gradient, or both are computed and returned.
# NOTE(review): the parameter list is cut off after `nn_params` in this copy.
# The body reads input_layer_size, hidden_layer_size, num_labels, X, y, lam
# and returnType; their order and defaults cannot be seen here - restore the
# signature from the original file before using this function.
def nnCostFunctionVec(nn_params,

    import numpy as np
    from sigmoid import sigmoid
    from sigmoidGradient import sigmoidGradient

    # Roll the flat parameter vector back into the two weight matrices
    # (column-major, matching the flatten('F') used for the gradient below).
    Theta1 = nn_params[0:hidden_layer_size * (input_layer_size + 1)].reshape(
        hidden_layer_size, input_layer_size + 1, order='F')
    Theta2 = nn_params[(hidden_layer_size * (input_layer_size + 1)):].reshape(
        num_labels, (hidden_layer_size + 1), order='F')

    (m, n) = X.shape

    # Defaults returned for whichever quantity is not requested.
    J = 0
    Theta1_grad = np.zeros((Theta1.shape))
    Theta2_grad = np.zeros((Theta2.shape))
    grad = 0

    ident = np.eye(Theta2.shape[0])

    # One identity row per label, transposed so that each column is one
    # example.  The append then moves row 0 to the bottom: label k ends up in
    # row k - 1 and label 0 (if present) in the last row.
    yNodes = ident[y.flatten()].T
    yNodes = np.append(yNodes[1:, ], yNodes[0:1, ], axis=0)

    # Prepend the bias column to the inputs.
    X = np.append(np.ones((m, 1)), X, axis=1)

    if returnType == '' or returnType == 'J':
        # Feed-forward with examples as columns; a row of ones is prepended
        # to the hidden activations as the bias unit.
        h = sigmoid(
            np.dot(Theta2, (np.append(
                np.ones((1, m)), sigmoid(np.dot(Theta1, X.T)), axis=0))))
        # Cross-entropy cost plus regularisation over the non-bias columns.
        J = np.sum(-yNodes * np.log(h) -
                   ((1 - yNodes) * np.log(1 - h))) / m + lam * (
                       np.sum(np.square(Theta2[:, 1:])) +
                       np.sum(np.square(Theta1[:, 1:]))) / (2 * m)

    if returnType == '' or returnType == 'grad':
        # NOTE(review): the parentheses of the next statement are unbalanced;
        # a line seems to be missing after `sigmoid(` (the cost branch wraps
        # the same inner expression in `np.dot(Theta2, ...)`) - confirm.
        delta3 = sigmoid(
                np.append(np.ones((1, m)),
                          sigmoid(np.dot(Theta1, X.T)),
                          axis=0))) - yNodes
        # Propagate the output error back and drop the bias row.
        delta2 = (np.dot(Theta2.T, delta3) * sigmoidGradient(
            np.append(np.ones((1, m)), np.dot(Theta1, X.T), axis=0)))[1:, ]

        Theta1_grad = np.dot(delta2, X)
        # NOTE(review): np.dot receives a single argument here; its first
        # operand appears to have been lost (presumably delta3) - confirm.
        Theta2_grad = np.dot(
            np.append(np.ones((1, m)), sigmoid(np.dot(Theta1, X.T)), axis=0).T)

        # Average and add the regularisation term; the zero column keeps the
        # bias weights unregularised.
        Theta1_grad = Theta1_grad / m + (lam * np.append(
            np.zeros((Theta1.shape[0], 1)), Theta1[:, 1:], axis=1)) / m
        Theta2_grad = Theta2_grad / m + (lam * np.append(
            np.zeros((Theta2.shape[0], 1)), Theta2[:, 1:], axis=1)) / m

        # Unroll both gradients column-major into one vector.
        grad = np.append(Theta1_grad.flatten('F'), Theta2_grad.flatten('F'))

    # Any other returnType value falls through and returns None.
    if returnType == '':
        return [J, grad]
    elif returnType == 'J':
        return J
    elif returnType == 'grad':
        return grad
def backwards(nn_weights, layers, X, y, num_labels, lambd):
    """Compute the gradient of the neural network by back-propagation.

    :param nn_weights: Neural network parameters (vector)
    :param layers: a list with the number of units per layer.
    :param X: a matrix where every row is a training example for a handwritten digit image
    :param y: a vector with the labels of each instance; each label is used
        directly as the index of its output unit
    :param num_labels: the number of units in the output layer
    :param lambd: regularization factor
    :return: the unrolled gradient of the neural network.

    NOTE(review): in the received source the docstring quotes were missing,
    the ``a``/``z`` caches were never filled and ``Delta`` was never appended
    to.  Those statements are rebuilt from the shapes the surviving code
    requires (``a[i]`` must match the columns of ``Theta[i]``, ``z[i]`` must
    match ``delta[i]``) - confirm against the original file.
    ``roll_params``, ``unroll_params``, ``sigmoid`` and ``sigmoidGradient``
    are defined outside this block.
    """
    # Setup some useful variables
    m = X.shape[0]
    num_layers = len(layers)

    # The parameters are "unrolled" into nn_weights and need to be converted
    # back into one weight matrix per layer transition.
    Theta = roll_params(nn_weights, layers)

    # Map the label vector onto a binary (m, num_labels) indicator matrix.
    labels = np.asarray(y).astype(int).ravel()
    yv = np.zeros((m, num_labels))
    yv[np.arange(labels.size), labels] = 1

    # Feed-forward.  a[i] holds the activations of layer i with a leading
    # column of ones; z[i] holds the pre-activation of layer i, with z[0]
    # being a placeholder for the input layer that is never read.
    a = []
    z = [np.copy(X)]
    x = np.copy(X)
    for i in range(num_layers - 1):
        a.append(np.hstack((np.ones((x.shape[0], 1)), x)))
        weights = Theta[i][:, 1:]
        # Bias weights (first column) are added separately.
        x = np.dot(x, np.transpose(weights)) + Theta[i][:, 0]
        z.append(x)
        x = sigmoid(x)

    # Back-propagate the output error through the hidden layers.
    delta = [None] * num_layers
    delta[num_layers - 1] = x - yv
    for i in range(num_layers - 2, 0, -1):
        weights = Theta[i][:, 1:]
        delta[i] = np.dot(delta[i + 1], weights) * sigmoidGradient(z[i])

    # Averaged gradient plus regularisation.
    # The bias column is regularised too: the original author noted that the
    # course handout asks for zeroing it, but the gradient check behaved
    # better without that step, so it was left disabled.
    Theta_grad = []
    for i in range(num_layers - 1):
        accumulated = np.dot(np.transpose(delta[i + 1]), a[i])
        Theta_grad.append(accumulated / m + lambd * Theta[i] / m)

    # Unroll Params
    Theta_grad = unroll_params(Theta_grad)

    return Theta_grad
Exemplo n.º 14
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size,
                   num_labels, X, y, lambda_reg):
    """Regularised cost and gradient of a two-layer classification network.

    nn_params holds both weight matrices unrolled column-major; X has one
    example per row (no bias column); y holds labels where label ``k``
    selects output column ``k - 1``; lambda_reg is the regularisation
    strength (bias columns are not regularised).

    Returns ``(J, grad)`` where ``grad`` is the 1-D column-major unrolling of
    both gradient matrices, in the same layout as ``nn_params``.

    NOTE(review): ``s.sigmoid`` and ``sg.sigmoidGradient`` come from modules
    imported elsewhere in the file and are assumed to be element-wise.
    """
    # Column-major order matches the order='F' unrolling of the gradient.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = np.reshape(nn_params[:split],
                        (hidden_layer_size, input_layer_size + 1), order='F')
    Theta2 = np.reshape(nn_params[split:],
                        (num_labels, hidden_layer_size + 1), order='F')

    m = len(X)

    # Feed-forward once for the whole batch; reused by cost and gradient.
    a1 = np.column_stack((np.ones((m, 1)), X))
    z2 = np.dot(a1, Theta1.T)
    a2 = np.column_stack((np.ones((m, 1)), s.sigmoid(z2)))
    a3 = s.sigmoid(np.dot(a2, Theta2.T))

    # One-hot encode the labels: label k sets column k - 1.
    labels = np.ravel(y).astype(int)
    Y = np.zeros((m, num_labels))
    Y[np.arange(m), labels - 1] = 1

    cost = np.sum(Y * np.log(a3) + (1 - Y) * np.log(1 - a3))
    J = -(1.0 / m) * cost

    # Regularise every non-bias column (slice 1:, not the single column 1).
    sumOfTheta1 = np.sum(Theta1[:, 1:] ** 2)
    sumOfTheta2 = np.sum(Theta2[:, 1:] ** 2)
    J = J + ((lambda_reg / (2.0 * m)) * (sumOfTheta1 + sumOfTheta2))

    # Back-propagation over the whole batch.
    delta3 = a3 - Y
    delta2 = np.dot(delta3, Theta2[:, 1:]) * sg.sigmoidGradient(z2)

    Theta1_grad = np.dot(delta2.T, a1) / m
    Theta2_grad = np.dot(delta3.T, a2) / m

    # Gradient of the regularisation term; the bias column stays untouched.
    Theta1_grad[:, 1:] += (float(lambda_reg) / m) * Theta1[:, 1:]
    Theta2_grad[:, 1:] += (float(lambda_reg) / m) * Theta2[:, 1:]

    grad = np.concatenate((Theta1_grad.reshape(Theta1_grad.size, order='F'),
                           Theta2_grad.reshape(Theta2_grad.size, order='F')))

    return J, grad
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels,
                   X, y, Lambda):
    """Compute the cost and gradient of the neural network.

    The parameters for the neural network are "unrolled" into the vector
    nn_params and need to be converted back into the weight matrices.

    The returned parameter grad is an "unrolled" vector of the partial
    derivatives of the neural network, laid out like nn_params.

    X holds one example per row (no bias column), y the class labels and
    Lambda the regularisation strength; bias columns are not regularised.

    NOTE(review): the indicator matrix gets one column per distinct label
    value present in y, so every class must occur at least once for it to
    line up with the ``num_labels`` output units.  ``sigmoid`` and
    ``sigmoidGradient`` are defined outside this block.
    """
    import pandas as pd

    # Reshape nn_params back into the parameters Theta1 and Theta2, the weight
    # matrices for our 2 layer neural network.  Column-major order matches the
    # transposed ravel used to unroll the gradient below.
    input_layer_size = int(input_layer_size)
    hidden_layer_size = int(hidden_layer_size)
    num_labels = int(num_labels)
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = np.reshape(nn_params[:split],
                        (hidden_layer_size, input_layer_size + 1),
                        order='F')
    Theta2 = np.reshape(nn_params[split:],
                        (num_labels, (hidden_layer_size + 1)),
                        order='F')

    # Setup some useful variables
    m, _ = X.shape

    # Binary indicator matrix of the labels.  ``.values`` replaces the
    # ``as_matrix()`` call that was removed in pandas 1.0; the float cast keeps
    # the arithmetic below independent of the dummy dtype.
    y_categorical = pd.get_dummies(np.ravel(y)).values.astype(float)

    # Part 1: feed-forward and regularised cost.
    a1 = np.column_stack((np.ones((m, 1)), X))
    z2 = a1.dot(Theta1.T)
    a2 = np.column_stack((np.ones((z2.shape[0], 1)), sigmoid(z2)))
    a3 = sigmoid(a2.dot(Theta2.T))
    J = np.sum(np.log(a3) * y_categorical + np.log(1 - a3) * (1 - y_categorical)) / float(-m) \
        + Lambda * (np.sum(np.square(Theta1[:, 1:])) + np.sum(np.square(Theta2[:, 1:]))) / (2 * m)

    # Parts 2 and 3: back-propagation with regularisation; the zero column
    # leaves the bias weights unregularised.
    a3_grad = a3 - y_categorical
    Theta2_grad = a3_grad.T.dot(a2) / m + Lambda * np.column_stack((np.zeros(
        (Theta2.shape[0], 1)), Theta2[:, 1:])) / m
    a2_grad = (a3_grad).dot(Theta2[:, 1:]) * sigmoidGradient(z2)
    Theta1_grad = a2_grad.T.dot(a1) / m + Lambda * np.column_stack((np.zeros(
        (Theta1.shape[0], 1)), Theta1[:, 1:])) / m

    # Unroll gradient (transposed ravel == column-major flattening).
    grad = np.hstack((Theta1_grad.T.ravel(), Theta2_grad.T.ravel()))

    return J, grad
Exemplo n.º 16
def backwards(nn_weights, layers, X, y, num_labels, lambd):
    # Computes the gradient of the neural network.
    # nn_weights: Neural network parameters (vector)
    # layers: a list with the number of units per layer.
    # X: a matrix where every row is a training example for a handwritten digit image
    # y: a vector with the labels of each instance; each label is used
    #    directly as the column index of its output unit
    # num_labels: the number of units in the output layer
    # lambd: regularization factor
    #
    # NOTE(review): in the received source the feed-forward loop never stored
    # its results in A / Z and nothing was ever put into `delta`, so the final
    # loop could not run.  The storing statements below are rebuilt from the
    # shapes the surviving expressions require - confirm against the original.
    # roll_params, unroll_params, addColumnOne, removeFirstColumn, sigmoid,
    # sigmoidGradient and zeros are provided elsewhere in the file.

    # Setup some useful variables
    m = X.shape[0]
    num_layers = len(layers)

    # Roll Params
    # The parameters for the neural network are "unrolled" into the vector
    # nn_params and need to be converted back into the weight matrices.
    Theta = roll_params(nn_weights, layers)

    # Map the label vector into a binary (m, num_labels) indicator matrix.
    yv = zeros((m, num_labels))
    for i in range(m):
        yv[i][y[i]] += 1

    # Step 1: Initialization of useful variables
    # A[i] holds the activations of layer i with a leading column of ones.
    # Z[i] holds the pre-activation of layer i without a bias column; Z[0] is
    # only a placeholder for the input layer and is never read.
    A = [addColumnOne(X)]
    Z = [addColumnOne(X)]

    # Step 2: Feedforward
    for i in range(num_layers - 1):
        h = A[i].dot(Theta[i].T)
        Z.append(h)
        A.append(addColumnOne(sigmoid(h)))

    # Step 3: Backpropagation
    # The deltas are collected from the last layer down to the second one and
    # reversed afterwards, so delta[i] pairs with Theta[i] and A[i].
    d = removeFirstColumn(A[-1]) - yv
    delta = [d]
    for i in range(num_layers - 2, 0, -1):
        d = removeFirstColumn(d.dot(Theta[i])) * sigmoidGradient(Z[i])
        delta.append(d)
    delta.reverse()

    # delta is of size num_layers-1 (no delta for the input layer)
    Theta_grad = []
    for i in range(num_layers - 1):
        layer_grad = delta[i].T.dot(A[i])
        # No regularization on the bias weights (first column).
        layer_grad[:, 1:] += lambd * Theta[i][:, 1:]
        Theta_grad.append(layer_grad / m)

    # Unroll Params
    Theta_grad = unroll_params(Theta_grad)

    return Theta_grad
print('Program paused. Press enter to continue.\n')

# ## Part 5: Sigmoid Gradient  ================
#   Before you start implementing the neural network, you will first
#   implement the gradient for the sigmoid function. You should complete the
#   code in the sigmoidGradient.py file.
#
# NOTE(review): the section banners were unterminated triple-quoted strings,
# which swallowed the code between them; they are plain comments now.

print('\nEvaluating sigmoid gradient...\n')

test_array = np.array([[1, -0.5, 0, 0.5, 1]])
g = sigmoidGradient(test_array)
print('Sigmoid gradient evaluated at [1 -0.5 0 0.5 1]:\n ')
# The header above announces the values, so show the computed gradient.
print(g)

print('Program paused. Press enter to continue.\n')

# ## Part 6: Initializing Parameters ================
#   In this part of the exercise, you will be starting to implement a two
#   layer neural network that classifies digits. You will start by
#   implementing a function to initialize the weights of the neural network

print('\nInitializing Neural Network Parameters ...\n')
Exemplo n.º 18
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, \
                   num_labels, X, y, lambda_val):
    # Cost and gradient of a two layer classification network.
    #   nn_params holds both weight matrices unrolled column-major; they are
    #   rolled back into Theta1 and Theta2 here.
    #   Returns J (the regularised cost) and grad, the partial derivatives
    #   unrolled column-major into a single column vector.

    # Split point between the entries of Theta1 and those of Theta2
    boundary = hidden_layer_size * (input_layer_size + 1)
    Theta1 = np.reshape(nn_params[:boundary],
                        (hidden_layer_size, input_layer_size + 1), order='F')
    Theta2 = np.reshape(nn_params[boundary:],
                        (num_labels, hidden_layer_size + 1), order='F')

    m = len(X)

    # Gradient accumulators, one per weight matrix
    Theta1_grad = np.zeros(Theta1.shape)
    Theta2_grad = np.zeros(Theta2.shape)

    # Batch forward pass (used for the cost only): prepend the bias column to
    # the inputs and to the hidden activations
    inputs = np.column_stack((np.ones((m, 1)), X))
    hidden = s.sigmoid(np.dot(inputs, Theta1.T))
    hidden = np.column_stack((np.ones((hidden.shape[0], 1)), hidden))
    outputs = s.sigmoid(np.dot(hidden, Theta2.T))

    # One row per example with a single 1 in column (label - 1)
    targets = np.zeros((m, num_labels))
    for row in range(m):
        targets[row, y[row] - 1] = 1

    # Accumulate the cross-entropy example by example
    cost = 0
    for truth, predicted in zip(targets, outputs):
        cost += np.sum(truth * np.log(predicted) +
                       (1 - truth) * np.log(1 - predicted))

    # The first column of each matrix holds the bias weights and is left out
    # of the penalty
    penalty1 = np.sum(Theta1[:, 1:]**2)
    penalty2 = np.sum(Theta2[:, 1:]**2)

    J = -(1.0 / m) * cost + ((lambda_val / (2.0 * m)) * (penalty1 + penalty2))

    # Backpropagation, one training example at a time
    for t in range(m):
        sample = inputs[t]

        # hidden layer pre-activation, activation, then the bias unit
        z2 = np.dot(sample, Theta1.T)
        act2 = np.concatenate(([1], s.sigmoid(z2)))

        # output layer
        act3 = s.sigmoid(np.dot(act2, Theta2.T))

        # output error: prediction minus the 0/1 target of each class
        err3 = act3 - targets[t]

        # hidden error: output error pushed back through the non-bias
        # weights, scaled by the sigmoid gradient
        err2 = (np.dot(Theta2[:, 1:].T, err3).T) * sg.sigmoidGradient(z2)

        Theta1_grad += np.outer(err2, sample)
        Theta2_grad += np.outer(err3, act2)

    # Average the accumulated gradients
    Theta1_grad /= m
    Theta2_grad /= m

    # Regularise every column except the bias column
    scale = float(lambda_val) / m
    Theta1_grad[:, 1:] += scale * Theta1[:, 1:]
    Theta2_grad[:, 1:] += scale * Theta2[:, 1:]

    # Unroll gradients column-major into one column vector
    unrolled = np.concatenate((Theta1_grad.ravel(order='F'),
                               Theta2_grad.ravel(order='F')))
    grad = unrolled[:, np.newaxis]

    return J, grad
Exemplo n.º 19
def nnCostFunction(thetas, X, y, struc, lambd=1.0, bias=1):
    """Regularised cost and gradient of a fully connected sigmoid network.

    Parameters
    ----------
    thetas : 1-D array
        All weight matrices unrolled one after another.  Matrix ``i`` takes
        ``struc[i][0] * struc[i][1]`` entries and is rebuilt as
        ``reshape(struc[i][1], struc[i][0]).transpose()``.
    X : 2-D array, one example per row (no bias column).
    y : array of targets with the same shape as the network output.
    struc : sequence of ``(rows, cols)`` pairs, one per weight matrix.
    lambd : float
        Regularisation strength; the first (bias) column of every matrix is
        excluded from the penalty.
    bias : int
        Only ``bias == 1`` is implemented; any other value returns
        ``(0.0, empty array)`` unchanged.

    Returns
    -------
    (j, grad_final) : cost and the gradients of all matrices, each unrolled
    with ``.T.ravel()`` and concatenated in layer order.

    Raises
    ------
    ValueError
        If ``struc`` describes no layer at all.

    The backward pass walks every layer, so the gradient is also valid for
    networks with more than two weight matrices.

    NOTE(review): ``s.sigmoid`` and ``sigg.sigmoidGradient`` come from
    modules imported elsewhere in the file.
    """
    j = 0.0
    # Float seed keeps the dtype of the unrolled gradient as before.
    grad_final = np.empty_like([])
    m, _ = X.shape

    # Roll the flat vector back into one weight matrix per layer transition.
    weights = []
    start = 0
    for spec in struc:
        rows, cols = spec[0], spec[1]
        stop = start + rows * cols
        weights.append(thetas[start:stop].reshape(cols, rows).transpose())
        start = stop

    if bias != 1:
        return (j, grad_final)
    if not weights:
        raise ValueError("struc must describe at least one weight matrix")

    # Forward pass.  inputs[k] is the bias-augmented input of weights[k],
    # pre_acts[k] the pre-activation it produces, masked[k] the same weights
    # with the bias column zeroed (used for every regularisation term).
    inputs = []
    pre_acts = []
    masked = []
    penalty = 0.0
    current = X
    for theta in weights:
        no_bias = theta.copy()
        no_bias[:, 0] = 0.0
        masked.append(no_bias)
        penalty += (no_bias ** 2).sum()

        current = np.hstack((np.ones((m, 1)), current))
        inputs.append(current)
        z = current.dot(theta.transpose())
        pre_acts.append(z)
        current = s.sigmoid(z)
    output = current

    cost = y * np.log(output) + (1 - y) * np.log(1 - output)
    j = -(1.0 / m) * cost.sum() + (lambd / (2.0 * m)) * penalty

    # Backward pass: errors[k] is the error at the output of weights[k]; each
    # one is derived from the error of the layer directly above it.
    last = len(weights) - 1
    errors = [None] * len(weights)
    errors[last] = output - y
    for k in range(last - 1, -1, -1):
        errors[k] = (errors[k + 1].dot(weights[k + 1][:, 1:]) *
                     sigg.sigmoidGradient(pre_acts[k]))

    # Averaged gradient plus regularisation, unrolled in layer order with a
    # single concatenation.
    pieces = [grad_final]
    for k in range(len(weights)):
        delta = errors[k].transpose().dot(inputs[k])
        layer_grad = (1.0 / m) * delta + (lambd / m) * masked[k]
        pieces.append(layer_grad.T.ravel())
    grad_final = np.concatenate(pieces)

    return (j, grad_final)
Exemplo n.º 20
def _pause(namespace):
    """Open an interactive console until the user presses Ctrl-D.

    namespace: dict of the caller's local variables (pass ``locals()``); it
    is layered over this module's globals and exposed inside the console.
    """
    print("Program paused. Press Ctrl-D to continue.\n")
    code.interact(local=dict(globals(), **namespace))
    print(" ... continuing\n ")


def main():
    """Run the ex4 neural-network walkthrough from data loading to accuracy.

    The steps are: load and display the digit data, load the saved weights,
    evaluate the cost with and without regularization, evaluate the sigmoid
    gradient, randomly initialize weights, train with conjugate gradient,
    visualize the hidden layer and report the training-set accuracy. After
    each step the program pauses in an interactive console.

    NOTE(review): ``hidden_layer_size``, ``num_labels`` and ``MAXITER`` are
    not assigned in this function and are assumed to be module-level names;
    likewise ``np``, ``code`` and ``sci`` (presumably ``scipy.optimize``) are
    assumed to be imported at the top of the file -- confirm.
    """

    # =========== Part 1: Loading and Visualizing Data =============
    # We start the exercise by first loading and visualizing the dataset,
    # which contains handwritten digits.

    # Read the Matlab data
    m, n, X, y = getMatlabTrainingData()

    # number of features
    input_layer_size = n

    # Select some random images from X
    print('Selecting random examples of the data to display.\n')
    sel = np.random.permutation(m)[:100]

    # Re-work the data orientation of each training example: every row is
    # viewed as an image_size x image_size image and transposed.
    image_size = 20
    XMatlab = np.copy(X)  # work on a copy so X itself is left untouched
    for i in range(m):
        image = XMatlab[i, :].reshape(image_size, image_size)
        XMatlab[i, :] = image.transpose().reshape(image_size * image_size)

    # display the sample images
    displayData(XMatlab[sel, :])

    # Print out the labels for what is being seen.
    print('These are the labels for the data ...\n')
    print(y[sel, :].reshape(10, 10))

    _pause(locals())

    # ================ Part 2: Loading Parameters ================
    # In this part of the exercise, we load some pre-initialized
    # neural network parameters.

    print('\nLoading Saved Neural Network Parameters ...\n')

    # Load the weights into variables Theta1 and Theta2
    import scipy.io as sio
    # NOTE(review): machine-specific absolute path.
    fnWeights = '/home/jennym/Kaggle/DigitRecognizer/ex4/ex4weights.mat'
    weights = sio.loadmat(fnWeights)
    Theta1 = weights['Theta1']
    Theta2 = weights['Theta2']

    # Unroll parameters (column-major, matching nnCostFunction's reshape)
    nn_params = np.hstack((Theta1.ravel(order='F'), Theta2.ravel(order='F')))

    # ================ Part 3: Compute Cost (Feedforward) ================
    # Feedforward only, without regularization, so the cost can be checked
    # against the known value for the fixed debugging parameters.
    print('\nFeedforward Using Neural Network ...\n')

    # Weight regularization parameter (we set this to 0 here).
    MLlambda = 0.0

    # Kludge: put y back to the Matlab labelling (digit 0 stored as 10),
    # then shift down by one so the labels can be used as Python indices.
    # The first statement modifies the loaded array in place.
    y[y == 0] = 10
    y = y - 1
    J, _ = nnCostFunction(nn_params, input_layer_size, hidden_layer_size,
                          num_labels, X, y, MLlambda)

    print('Cost at parameters (loaded from ex4weights): ' + str(J) +
          '\n (this value should be about 0.287629)\n')

    _pause(locals())

    # =============== Part 4: Implement Regularization ===============
    # Once the cost function is correct, check it with regularization.

    print('\nChecking Cost Function (with Regularization) ... \n')

    # Weight regularization parameter (we set this to 1 here).
    MLlambda = 1.0

    J, _ = nnCostFunction(nn_params, input_layer_size, hidden_layer_size,
                          num_labels, X, y, MLlambda)

    print('Cost at parameters (loaded from ex4weights): ' + str(J) +
          '\n(this value should be about 0.383770)\n')

    _pause(locals())

    # ================ Part 5: Sigmoid Gradient  ================
    # Evaluate the gradient of the sigmoid function at a few sample points.

    print('\nEvaluating sigmoid gradient...\n')
    g = sigmoidGradient(np.array([1, -0.5, 0, 0.5, 1]))
    print('Sigmoid gradient evaluated at [1 -0.5 0 0.5 1]:\n  ')
    # The original announced the result but never displayed it.
    print(g)

    _pause(locals())

    # ================ Part 6: Initializing Parameters ================
    # Randomly initialize the weights of the two-layer network that
    # classifies digits.

    print('\nInitializing Neural Network Parameters ...\n')

    initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
    initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)

    # Unroll parameters
    initial_nn_params = np.hstack((initial_Theta1.ravel(order='F'),
                                   initial_Theta2.ravel(order='F')))

    _pause(locals())

    # =============== Part 7: Implement Backpropagation ===============
    # NOTE(review): the exercise text says to run checkNNGradients here, but
    # no such call is present in this function.
    print('\nChecking Backpropagation... \n')

    _pause(locals())

    # =============== Part 8: Implement Regularization ===============
    # Regularized cost at the fixed debugging parameters.

    print('\nChecking Backpropagation (w/ Regularization) ... \n')

    # NOTE(review): as in Part 7, checkNNGradients is mentioned but not run.
    MLlambda = 3

    _pause(locals())

    # Also output the costFunction debugging values
    debug_J, _ = nnCostFunction(nn_params, input_layer_size,
                                hidden_layer_size, num_labels, X, y, MLlambda)

    print('\n\n Cost at (fixed) debugging parameters (w/ lambda = ' +
          '{0}): {1}'.format(MLlambda, debug_J))
    print('\n  (this value should be about 0.576051)\n\n')

    _pause(locals())

    # =================== Part 8b: Training NN ===================
    # Train the network with a conjugate-gradient optimizer (the Matlab
    # original used "fmincg"). Such optimizers train efficiently as long as
    # they are given the gradient along with the cost.
    print('\nTraining Neural Network... \n')

    # Change MAXITER to a larger value to see how more training helps
    # (jkm: changed from 50 -> 400).
    options = {'maxiter': MAXITER}

    # You should also try different values of lambda
    MLlambda = 1

    # "Short hand" for the cost function to be minimized: it takes only the
    # neural network parameters and returns (cost, gradient).
    def costFunc(p):
        return nnCostFunction(p, input_layer_size, hidden_layer_size,
                              num_labels, X, y, MLlambda)

    # NOTES on the scipy.optimize.minimize arguments used below:
    #   method='CG' : minimization of a scalar function of one or more
    #       variables using the conjugate gradient algorithm.
    #   jac=True    : the objective is assumed to return the gradient along
    #       with the objective value (jac may also be a callable, or False
    #       to have the gradient estimated numerically).
    #   callback    : called after each iteration as callback(xk), where xk
    #       is the current parameter vector.

    # Callback for displaying the cost at the end of each iteration. It
    # re-evaluates the cost function at the current parameters.
    class Callback(object):
        def __init__(self):
            self.it = 0

        def __call__(self, p):
            self.it += 1
            print("Iteration %5d | Cost: %e" % (self.it, costFunc(p)[0]))

    result = sci.minimize(costFunc, initial_nn_params, method='CG',
                          jac=True, options=options, callback=Callback())
    nn_params = result.x
    cost = result.fun
    # matlab: [nn_params, cost] = fmincg(costFunction, initial_nn_params, options);

    # Obtain Theta1 and Theta2 back from nn_params
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = np.reshape(nn_params[:split],
                        (hidden_layer_size, input_layer_size + 1),
                        order='F')
    Theta2 = np.reshape(nn_params[split:],
                        (num_labels, hidden_layer_size + 1),
                        order='F')

    _pause(locals())

    # ================= Part 9: Visualize Weights =================
    # Display the hidden units (bias column dropped) to see what features
    # they are capturing in the data.

    print('\nVisualizing Neural Network... \n')

    displayData(Theta1[:, 1:])

    _pause(locals())

    # ================= Part 10: Implement Predict =================
    # Use the trained network to predict the labels of the training set and
    # compute the training-set accuracy.

    pred = predict(Theta1, Theta2, X)

    # Stack the predictions as rows so they compare element-wise against y
    # (np.vstack is the non-deprecated name of np.row_stack).
    pp = np.vstack(pred)
    accuracy = np.mean(np.double(pp == y)) * 100

    print('\nTraining Set Accuracy: {0} \n'.format(accuracy))

    _pause(locals())

    # All Done!
Exemplo n.º 21
def backwards(nn_weights, layers, X, y, num_labels, lambd):
    """Compute the regularized gradient of the network by backpropagation.

    Args:
        nn_weights: unrolled network parameters (vector); converted back
            into per-layer weight matrices by ``roll_params``.
        layers: list with the number of units per layer.
        X: matrix where every row is one training example.
        y: integer class label of each example. Each label is used directly
            as a row index into the one-hot target matrix.
        num_labels: number of units in the output layer.
        lambd: regularization factor.

    Returns:
        The result of ``unroll_params`` applied to the list of per-layer
        gradient matrices (presumably a single flat vector -- confirm
        against ``unroll_params``).
    """
    m = X.shape[0]
    num_layers = len(layers)

    # Roll the flat parameter vector back into one weight matrix per layer.
    Theta = roll_params(nn_weights, layers)
    Theta_grad = [np.zeros(w.shape) for w in Theta]

    # One-hot targets, one column per training example. Tuple indexing
    # writes into yv itself even when y[i] is a length-1 array.
    yv = np.zeros((num_labels, m))
    for i in range(m):
        yv[y[i], i] = 1

    for i in range(m):
        # Feedforward, recording every activation (with its bias unit) and
        # every pre-activation: the backward pass below indexes into both
        # lists. The original never filled them, so it raised IndexError.
        a = np.append([1], X[i, :])
        a_values, z_values = [a], []
        for k in range(num_layers - 1):
            z = np.dot(Theta[k], a)
            z_values.append(z)
            a = np.append([1], sigmoid(z))
            a_values.append(a)

        # Error of the output layer (the bias unit appended above is dropped).
        delta_layer = a[1:] - yv[:, i]
        # a_values[-2] is the input to the last weight matrix.
        Theta_grad[-1] += np.outer(delta_layer, a_values[-2]) / m

        # Propagate the error back through the hidden layers.
        for h in range(num_layers - 2):
            # The error that corresponds to the bias unit is discarded.
            delta_layer = (np.dot(Theta[-1 - h].T, delta_layer)[1:]
                           * sigmoidGradient(z_values[-2 - h]))
            Theta_grad[-2 - h] += np.outer(delta_layer, a_values[-3 - h]) / m

    # The terms corresponding to the bias factors are not regularized.
    for h in range(num_layers - 1):
        Theta_grad[h][:, 1:] += lambd * Theta[h][:, 1:] / m

    # Unroll Params
    return unroll_params(Theta_grad)
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels,
                   X, y, lambda_value):
    """Regularized cost and gradient of a two-layer classification network.

    Args:
        nn_params: the two weight matrices "unrolled" into one vector,
            Theta1 first and then Theta2, each in row-major order.
        input_layer_size: number of input features (columns of X).
        hidden_layer_size: number of hidden units.
        num_labels: number of output units K.
        X: training examples, one per row.
        y: class label of each example, with values in 1..K; it is
            flattened, so both a 1-D vector and a column vector work.
        lambda_value: regularization strength.

    Returns:
        (J, grad): the scalar cost and the unrolled vector of partial
        derivatives, laid out like ``nn_params``.
    """
    # Reshape nn_params back into the weight matrices Theta1 and Theta2.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = nn_params[:split].reshape(hidden_layer_size,
                                       input_layer_size + 1)
    Theta2 = nn_params[split:].reshape(num_labels, hidden_layer_size + 1)

    m = X.shape[0]

    # One-hot targets: column k-1 is 1 where the label equals k. Built once
    # from the flattened labels and shared by the cost and the gradient.
    Y = (np.ravel(y)[:, None] == np.arange(1, num_labels + 1)).astype(int)

    # Feed forward, prepending the bias unit to each layer's input.
    a1 = np.column_stack([np.ones(m), X])
    z2 = np.matmul(a1, Theta1.T)
    a2 = np.column_stack([np.ones(m), sigmoid(z2)])
    z3 = np.matmul(a2, Theta2.T)
    h = sigmoid(z3)

    # Main term of the cost function (cross-entropy over all output units).
    J = np.sum(-Y * np.log(h) - (1 - Y) * np.log(1 - h)) / m

    # Regularization term of the cost function; bias columns are excluded.
    J = J + lambda_value * (np.sum(Theta1[:, 1:] ** 2) +
                            np.sum(Theta2[:, 1:] ** 2)) / (2 * m)

    # Backpropagation, one training sample at a time.
    Theta1_grad = np.zeros(Theta1.shape)
    Theta2_grad = np.zeros(Theta2.shape)
    for t in range(m):
        d3 = h[t] - Y[t]
        # A placeholder 1 stands in for the bias unit so the shapes line up;
        # that entry is dropped again right below.
        d2 = np.dot(Theta2.T, d3) * sigmoidGradient(np.r_[1, z2[t]])
        d2 = d2[1:]
        Theta1_grad += np.outer(d2, a1[t])
        Theta2_grad += np.outer(d3, a2[t])

    # Main term of the gradient
    Theta1_grad = Theta1_grad / m
    Theta2_grad = Theta2_grad / m

    # Regularization term of the gradient (bias columns are not regularized)
    Theta1_grad[:, 1:] += lambda_value * Theta1[:, 1:] / m
    Theta2_grad[:, 1:] += lambda_value * Theta2[:, 1:] / m

    # Unroll gradients
    grad = np.concatenate([Theta1_grad.ravel(), Theta2_grad.ravel()])

    return (J, grad)
        # NOTE(review): orphaned fragment -- the enclosing loop/function and
        # the definition of `tmp` are not visible in this part of the file.
        # Run the network on `tmp` and report the prediction.
        pred = predict(Theta1, Theta2, tmp)
        # `pred % 10` is shown as the digit, so a prediction of 10 prints as 0.
        print('Neural Network Prediction: ', pred, '(digit ', pred % 10, ')')

        # Block until the user presses enter.
        input('Program paused. Press enter to continue')

# ================ Part 4: Sigmoid Gradient  ================
#  Before you start implementing backpropagation, you will first
#  implement the gradient for the sigmoid function. You should complete the
#  code in the sigmoidGradient.m file.

print('Evaluating sigmoid gradient...')
example = np.array([-15, -1, -0.5, 0, 0.5, 1, 15])
g = sigmoidGradient(example)
# NOTE(review): only the header line is printed here; `g` itself is never
# displayed in the visible code.
print('Sigmoid gradient evaluated at', example, ':')

# ================ Part 5: Initializing Parameters ================
#  To learn a two layer neural network that classifies digits. You will start
#  by implementing a function to initialize the weights of the neural network
#  (randInitializeWeights.m)

print('Initializing Neural Network Parameters ...')

initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)

# Unroll parameters
# Flatten initial_Theta1 to 1-D in column-major ('F') order.
# NOTE(review): the matching statement for initial_Theta2 is not present in
# the visible code -- the snippet appears to be cut off here.
initial_Theta1 = np.reshape(initial_Theta1, initial_Theta1.size, order='F')
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels,
                   X, y, _lambda):
    """Regularized cost and gradient of a two-layer classification network.

    Prints the cost as a side effect on every call.

    Args:
        nn_params: the two weight matrices "unrolled" into one vector,
            Theta1 first and then Theta2, each in row-major order.
        input_layer_size: number of input features (columns of X).
        hidden_layer_size: number of hidden units.
        num_labels: number of output units.
        X: training examples, one per row.
        y: class label of each example (vector or column vector). Output
            unit j (0-based) is matched against label ``(j + 1) % 10``, so
            with 10 labels the last unit stands for label 0.
        _lambda: regularization strength.

    Returns:
        (J, grad): the scalar cost and the unrolled vector of partial
        derivatives, laid out like ``nn_params``.
    """
    # Reshape nn_params back into the weight matrices Theta1 and Theta2.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = nn_params[:split].reshape(hidden_layer_size,
                                       input_layer_size + 1)
    Theta2 = nn_params[split:].reshape(num_labels, hidden_layer_size + 1)

    m = len(y)  # number of training examples

    # Feed forward, prepending the bias unit to each layer's input. z2 is
    # kept so the backward pass does not recompute it for every sample.
    a1 = np.hstack((np.ones((m, 1)), X))
    z2 = np.dot(a1, Theta1.T)
    a2 = np.hstack((np.ones((m, 1)), sigmoid(z2)))
    a3 = sigmoid(np.dot(a2, Theta2.T))

    # One-hot targets via broadcasting; reshaping to a column first lets
    # both 1-D and column-vector labels work.
    labels = (np.arange(num_labels) + 1) % 10
    Y = (np.reshape(y, (m, 1)) == labels).astype(float)

    # Cross-entropy cost plus the regularization term (bias columns excluded).
    J = -np.sum(Y * np.log(a3) + (1 - Y) * np.log(1 - a3)) / m
    J += _lambda * (np.sum(Theta1[:, 1:] ** 2) +
                    np.sum(Theta2[:, 1:] ** 2)) / (2.0 * m)

    # Backpropagation, accumulating the gradient one sample at a time.
    delta1 = np.zeros(Theta1.shape)
    delta2 = np.zeros(Theta2.shape)
    for i in range(m):
        d3 = a3[i] - Y[i]
        # A placeholder 1 stands in for the bias unit so the shapes line up;
        # that entry of d2 is dropped when accumulating delta1.
        d2 = np.dot(d3, Theta2) * sigmoidGradient(np.append(1, z2[i]))
        delta1 += np.outer(d2[1:], a1[i])
        delta2 += np.outer(d3, a2[i])

    # Copies of the weights with the bias column zeroed, so the bias terms
    # are not regularized.
    regTheta1 = Theta1.copy()
    regTheta1[:, 0] = 0
    regTheta2 = Theta2.copy()
    regTheta2[:, 0] = 0
    Theta1_grad = delta1 / m + _lambda * regTheta1 / m
    Theta2_grad = delta2 / m + _lambda * regTheta2 / m

    # Unroll gradients
    grad = np.concatenate((Theta1_grad.ravel(), Theta2_grad.ravel()))
    print('cost value: %lf' % J)

    return J, grad
# NOTE(review): this snippet is Python 2 (print statements, raw_input).
# Evaluate the sigmoid at a few sample points and print the result.
# NOTE(review): the printed label lists the first point as 1, but the array
# passed in starts with -1.
g = sigmoid(array([-1, -0.5, 0, 0.5, 1]))
print "Sigmoid evaluated at [1 -0.5 0 0.5 1]:  "
print g

# Block until the user presses enter.
raw_input('\nProgram paused. Press enter to continue!!!')

# ================================ Step 4: Sigmoid Gradient ================================
#  Before you start implementing the neural network, you will first
#  implement the gradient for the sigmoid function. You should complete the
#  code in the sigmoidGradient.m file.

print "\nEvaluating Sigmoid Gradient function ...\n"

# Same sample points as above; the same label/input mismatch applies.
g = sigmoidGradient(array([-1, -0.5, 0, 0.5, 1]))
print "Sigmoid gradient evaluated at [1 -0.5 0 0.5 1]:  "
print g

raw_input('\nProgram paused. Press enter to continue!!!')

# ================================ Step 5: Implement Feedforward (Cost Function) ================================

print "\nChecking Cost Function without Regularization (Feedforward) ...\n"

# Regularization strength for the unregularized check.
lambd = 0.0

# NOTE(review): only the expected value is printed; no cost is computed or
# printed in the visible code.
print 'This value should be about 2.09680198349'

raw_input('\nProgram paused. Press enter to continue!!!')
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, _lambda):
    """Regularized cost and gradient of a two-layer classification network.

    Prints the cost as a side effect on every call.

    Args:
        nn_params: the two weight matrices "unrolled" into one vector,
            Theta1 first and then Theta2, each in row-major order.
        input_layer_size: number of input features (columns of X).
        hidden_layer_size: number of hidden units.
        num_labels: number of output units.
        X: training examples, one per row.
        y: class label of each example (vector or column vector). Output
            unit j (0-based) is matched against label ``(j + 1) % 10``, so
            with 10 labels the last unit stands for label 0.
        _lambda: regularization strength.

    Returns:
        (J, grad): the scalar cost and the unrolled vector of partial
        derivatives, laid out like ``nn_params``.
    """
    # Reshape nn_params back into the weight matrices Theta1 and Theta2.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = nn_params[:split].reshape(hidden_layer_size,
                                       input_layer_size + 1)
    Theta2 = nn_params[split:].reshape(num_labels, hidden_layer_size + 1)

    m = len(y)  # number of training examples

    # Feed forward, prepending the bias unit to each layer's input. z2 is
    # kept so the backward pass does not recompute it for every sample.
    a1 = np.hstack((np.ones((m, 1)), X))
    z2 = np.dot(a1, Theta1.T)
    a2 = np.hstack((np.ones((m, 1)), sigmoid(z2)))
    a3 = sigmoid(np.dot(a2, Theta2.T))

    # One-hot targets via broadcasting; reshaping to a column first lets
    # both 1-D and column-vector labels work.
    labels = (np.arange(num_labels) + 1) % 10
    Y = (np.reshape(y, (m, 1)) == labels).astype(float)

    # Cross-entropy cost plus the regularization term (bias columns excluded).
    J = -np.sum(Y * np.log(a3) + (1 - Y) * np.log(1 - a3)) / m
    J += _lambda * (np.sum(Theta1[:, 1:] ** 2) +
                    np.sum(Theta2[:, 1:] ** 2)) / (2.0 * m)

    # Backpropagation, accumulating the gradient one sample at a time.
    delta1 = np.zeros(Theta1.shape)
    delta2 = np.zeros(Theta2.shape)
    for i in range(m):
        d3 = a3[i] - Y[i]
        # A placeholder 1 stands in for the bias unit so the shapes line up;
        # that entry of d2 is dropped when accumulating delta1.
        d2 = np.dot(d3, Theta2) * sigmoidGradient(np.append(1, z2[i]))
        delta1 += np.outer(d2[1:], a1[i])
        delta2 += np.outer(d3, a2[i])

    # Copies of the weights with the bias column zeroed, so the bias terms
    # are not regularized.
    regTheta1 = Theta1.copy()
    regTheta1[:, 0] = 0
    regTheta2 = Theta2.copy()
    regTheta2[:, 0] = 0
    Theta1_grad = delta1 / m + _lambda * regTheta1 / m
    Theta2_grad = delta2 / m + _lambda * regTheta2 / m

    # Unroll gradients
    grad = np.concatenate((Theta1_grad.ravel(), Theta2_grad.ravel()))
    print('cost value: %lf' % J)
    return J, grad
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, _lambda):
    """Regularized cost and gradient of a two-layer classification network.

    Args:
        nn_params: the two weight matrices "unrolled" into one vector,
            theta1 first and then theta2, each in row-major order.
        input_layer_size: number of input features (columns of X).
        hidden_layer_size: number of hidden units.
        num_labels: number of output units.
        X: training examples, one per row.
        y: integer class label of each example, used directly as a 0-based
            row index into an identity matrix; a column vector or a 1-D
            vector is accepted.
        _lambda: regularization strength.

    Returns:
        (J, grad): the scalar regularized cost and the unrolled vector of
        partial derivatives, laid out like ``nn_params``.
    """
    # Reshape nn_params back into the weight matrices theta1 and theta2.
    split = hidden_layer_size * (input_layer_size + 1)
    theta1 = nn_params[:split].reshape(hidden_layer_size, input_layer_size + 1)
    theta2 = nn_params[split:].reshape(num_labels, hidden_layer_size + 1)

    m = X.shape[0]

    # Feed forward, prepending the bias unit to each layer's input.
    a1 = np.hstack((np.ones((m, 1)), X))
    z2 = a1.dot(theta1.T)
    a2 = np.hstack((np.ones((m, 1)), sigmoid(z2)))
    z3 = a2.dot(theta2.T)
    a3 = sigmoid(z3)

    # One-hot targets: row i of the identity matrix selected by label y[i].
    first_column = np.reshape(y, (m, -1))[:, 0]
    y_new = np.eye(a3.shape[1])[first_column, :]

    # Cross-entropy cost summed over every example and output unit.
    J = -np.sum(y_new * np.log(a3) + (1 - y_new) * np.log(1 - a3)) / m

    # Regularize the cost; the bias columns are excluded. The original
    # computed this term but never added it to J.
    unbias_Theta1 = theta1[:, 1:]
    unbias_Theta2 = theta2[:, 1:]
    regularizator_cost = _lambda / (2.0 * m) * (np.sum(unbias_Theta1 ** 2) +
                                                np.sum(unbias_Theta2 ** 2))
    J = J + regularizator_cost

    # Vectorized backpropagation over all examples at once.
    delta3 = a3 - y_new
    # Drop the bias column before applying the sigmoid gradient.
    delta2 = delta3.dot(theta2)[:, 1:] * sigmoidGradient(z2)

    DEL1 = delta2.T.dot(a1) / m
    DEL2 = delta3.T.dot(a2) / m

    # Regularization of the gradient, with a zero column in place of the
    # bias weights so they are not regularized.
    theta1_regul = np.hstack((np.zeros((unbias_Theta1.shape[0], 1)),
                              unbias_Theta1))
    theta2_regul = np.hstack((np.zeros((unbias_Theta2.shape[0], 1)),
                              unbias_Theta2))

    theta1_grad = DEL1 + (_lambda / m) * theta1_regul
    theta2_grad = DEL2 + (_lambda / m) * theta2_regul

    # Unroll gradients
    grad = np.hstack((theta1_grad.flatten(), theta2_grad.flatten()))
    # J is already a scalar; the original indexed it again and raised.
    return J, grad
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels,
                   X, y, Lambda):
    """Compute the regularized cost and gradient of a two-layer neural network.

    The weights arrive "unrolled" in the flat vector ``nn_params`` and are
    reshaped (row-major) back into ``Theta1`` and ``Theta2`` before use.

    Args:
        nn_params: Flat array holding Theta1 followed by Theta2.
        input_layer_size: Number of input features (without the bias unit).
        hidden_layer_size: Number of hidden units (without the bias unit).
        num_labels: Number of output classes.
        X: 2-D array of training examples, one per row, without a bias column.
        y: Integer class labels in ``1..num_labels``, one per example.
        Lambda: Regularization strength; bias columns are not regularized.

    Returns:
        Tuple ``(J, grad)``: the scalar cost and whatever ``flattenParams``
        produces for the two gradient matrices.
    """
    # Obtain Theta1 and Theta2 back from nn_params.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = nn_params[0:split].reshape(
        hidden_layer_size, (input_layer_size + 1))
    Theta2 = nn_params[split:].reshape(
        num_labels, (hidden_layer_size + 1))
    m, _ = X.shape

    # Part 1: feedforward pass over the whole training set and cost J.
    a1_ = np.column_stack((np.ones((m, 1)), X))  # (m, input_layer_size + 1)
    z2 = Theta1.dot(a1_.T)  # (hidden_layer_size, m)
    # The bias column is prepended *after* applying the sigmoid.
    a2 = np.column_stack((np.ones((m, 1)), sigmoid(z2.T)))
    z3 = Theta2.dot(a2.T)  # (num_labels, m)
    a3_ = sigmoid(z3).T  # (m, num_labels)

    # One-hot encode the labels: label k sets column k - 1. The width follows
    # num_labels instead of a hard-coded 10.
    labels = np.asarray(y).reshape(-1).astype(int)
    y_ = np.zeros((m, num_labels))
    y_[np.arange(m), labels - 1] = 1

    data_cost = np.sum(-y_ * np.log(a3_) - (1 - y_) * np.log(1 - a3_)) / float(m)
    reg_cost = Lambda / (2.0 * m) * (
        np.sum(np.square(Theta1[:, 1:])) + np.sum(np.square(Theta2[:, 1:])))
    J = data_cost + reg_cost

    # Part 2: backpropagation, accumulated one training example at a time.
    # The Delta accumulators have the same shape as the Theta matrices, i.e.
    # they include the bias column.
    Delta1 = np.zeros((hidden_layer_size, input_layer_size + 1))
    Delta2 = np.zeros((num_labels, hidden_layer_size + 1))

    for irow in range(m):
        myrow = a1_[irow]  # already contains the bias unit
        a1 = myrow.reshape((input_layer_size + 1, 1))
        # NOTE(review): propagateForward is defined elsewhere; it is indexed
        # here as [layer][0] -> z and [layer][1] -> activation, and the
        # arithmetic below presumes column vectors -- confirm.
        temp = propagateForward(myrow, [Theta1, Theta2])
        z2 = temp[0][0]
        a2 = temp[0][1]
        a3 = temp[1][1]
        delta3 = a3 - y_[irow].reshape(a3.shape[0], 1)
        # Drop the bias row of Theta2.T so delta2 lines up with z2.
        delta2 = Theta2.T[1:, :].dot(delta3) * sigmoidGradient(z2)
        a2 = np.insert(a2, 0, 1, axis=0)  # prepend the hidden bias unit
        Delta1 += delta2.dot(a1.T)  # same shape as Theta1
        Delta2 += delta3.dot(a2.T)  # same shape as Theta2 (a2 has its bias)

    D1 = Delta1 / float(m)
    D2 = Delta2 / float(m)

    # Part 3: regularize every column except the bias column.
    D1[:, 1:] = D1[:, 1:] + (float(Lambda) / m) * Theta1[:, 1:]
    D2[:, 1:] = D2[:, 1:] + (float(Lambda) / m) * Theta2[:, 1:]

    # NOTE(review): the original call was truncated after hidden_layer_size;
    # num_labels is assumed to be the final argument -- confirm against the
    # flattenParams definition.
    return J, flattenParams([D1, D2], input_layer_size, hidden_layer_size,
                            num_labels)
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels,
                   X, y, lamda):
    """Compute the regularized cost and gradient of a two-layer neural network.

    ``nn_params`` holds Theta1 followed by Theta2, each unrolled column-major;
    the returned gradient is unrolled the same way.

    Args:
        nn_params: Flat array of unrolled weights.
        input_layer_size: Number of input features (without the bias unit).
        hidden_layer_size: Number of hidden units (without the bias unit).
        num_labels: Number of output classes.
        X: 2-D array of training examples, one per row, without a bias column.
        y: Integer class labels in ``1..num_labels``; either a 1-D array or a
            single-column 2-D array.
        lamda: Regularization strength; bias columns are not regularized.

    Returns:
        Tuple ``(J, grad)``: the scalar cost and the unrolled gradient vector.
    """
    # Reshape nn_params back into the weight matrices Theta1 and Theta2.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = nn_params[:split].reshape(
        (hidden_layer_size, input_layer_size + 1), order='F')
    Theta2 = nn_params[split:].reshape(
        (num_labels, hidden_layer_size + 1), order='F')

    m = X.shape[0]

    # Map the label vector to one-hot rows: label k sets column k - 1.
    labels = np.asarray(y).reshape(-1).astype(int)
    yv = np.zeros((m, num_labels))
    yv[np.arange(m), labels - 1] = 1

    # Part 1: feedforward for all examples at once (one row per example).
    a1 = np.hstack((np.ones((m, 1)), X))
    z2 = a1.dot(Theta1.T)
    a2 = np.hstack((np.ones((m, 1)), sigmoid(z2)))
    a3 = sigmoid(a2.dot(Theta2.T))

    # Cross-entropy cost plus L2 penalty on the non-bias weights. The 2.0
    # keeps the division floating-point even when lamda is an int.
    J = (-yv * np.log(a3) - (1 - yv) * np.log(1 - a3)).sum() / float(m)
    J += ((Theta1[:, 1:]**2).sum() + (Theta2[:, 1:]**2).sum()) * lamda / (2.0 * m)

    # Part 2: backpropagation. The matrix products below sum the per-example
    # outer products, reusing the forward pass computed above.
    delta3 = a3 - yv  # (m, num_labels)
    # Skip the bias column of Theta2: the bias unit receives no error signal.
    delta2 = delta3.dot(Theta2[:, 1:]) * sigmoidGradient(z2)

    Theta1_grad = delta2.T.dot(a1) / float(m)
    Theta2_grad = delta3.T.dot(a2) / float(m)

    # Part 3: regularize every column except the bias column.
    Theta1_grad[:, 1:] += float(lamda) / m * Theta1[:, 1:]
    Theta2_grad[:, 1:] += float(lamda) / m * Theta2[:, 1:]

    # Unroll gradients column-major, matching the order='F' reshape above.
    grad = np.hstack((Theta1_grad.T.ravel(), Theta2_grad.T.ravel()))

    return J, grad
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels,
                   X, y, lamda):
    """Compute the regularized cost and gradient of a two-layer neural network.

    ``nn_params`` holds Theta1 followed by Theta2, unrolled row-major.

    Args:
        nn_params: Flat array of unrolled weights.
        input_layer_size: Number of input features (without the bias unit).
        hidden_layer_size: Number of hidden units (without the bias unit).
        num_labels: Number of output classes.
        X: 2-D array of training examples, one per row, without a bias column.
        y: Integer class labels in ``1..num_labels``; either a 1-D array or a
            single-column 2-D array.
        lamda: Regularization strength; bias columns are not regularized.

    Returns:
        Tuple ``(J, grads)``: the scalar cost and the result of
        ``unrollParams([D1, D2])`` for the two gradient matrices.
    """
    # Reshape nn_params back into the weight matrices Theta1 and Theta2.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = nn_params[:split].reshape(
        hidden_layer_size, input_layer_size + 1)
    Theta2 = nn_params[split:].reshape(
        num_labels, hidden_layer_size + 1)

    m = X.shape[0]

    # Part 1: feedforward pass.
    a1 = np.c_[np.ones(m), X]  # prepend the bias column
    z2 = a1.dot(Theta1.T)
    a2 = np.c_[np.ones(m), expit(z2)]  # prepend the bias column
    z3 = a2.dot(Theta2.T)
    h = expit(z3)  # (m, num_labels)
    a3 = h.T  # (num_labels, m)

    # Convert the label vector into one-hot columns: label k sets row k - 1.
    labels = np.asarray(y).reshape(-1).astype(int)
    y_vec = np.zeros((num_labels, m))
    y_vec[labels - 1, np.arange(m)] = 1

    # Cross-entropy cost summed over all examples and all output units.
    J = -(np.sum(np.log(h) * y_vec.T)
          + np.sum(np.log(1 - h) * (1 - y_vec.T))) / m

    # Part 2: gradients via backpropagation (examples are columns here).
    delta3 = a3 - y_vec
    # Skip the bias column of Theta2: the bias unit receives no error signal.
    delta2 = Theta2[:, 1:].T.dot(delta3) * sigmoidGradient(z2).T
    D2 = delta3.dot(a2)
    D1 = delta2.dot(a1)

    D2 /= m
    D1 /= m

    # Part 3: add the regularization terms, excluding the bias columns.
    t1 = Theta1[:, 1:]
    t2 = Theta2[:, 1:]
    J += lamda * 0.5 / m * (np.sum(np.square(t1)) + np.sum(np.square(t2)))

    D2[:, 1:] = D2[:, 1:] + lamda * t2 / m
    D1[:, 1:] = D1[:, 1:] + lamda * t1 / m

    return J, unrollParams([D1, D2])
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size,
                   num_labels, X, y, lambda_reg):
    """Compute the cost and gradient of a two-layer classification network.

    ``nn_params`` holds Theta1 followed by Theta2, each unrolled column-major;
    the returned gradient is unrolled the same way.

    Args:
        nn_params: Flat array of unrolled weights.
        input_layer_size: Number of input features (without the bias unit).
        hidden_layer_size: Number of hidden units (without the bias unit).
        num_labels: Number of output classes.
        X: 2-D array of training examples, one per row, without a bias column.
        y: Integer class labels in ``1..num_labels``, one per example.
        lambda_reg: Regularization strength; bias columns are not regularized.

    Returns:
        Tuple ``(J, grad)``: the scalar cost and the unrolled gradient vector.
    """
    # NOTE(review): `s` and `sg` are presumably modules imported elsewhere in
    # this file that provide sigmoid / sigmoidGradient -- confirm.

    # Reshape nn_params back into the weight matrices Theta1 and Theta2.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = np.reshape(nn_params[:split],
                        (hidden_layer_size, input_layer_size + 1), order='F')
    Theta2 = np.reshape(nn_params[split:],
                        (num_labels, hidden_layer_size + 1), order='F')

    m = len(X)

    # Part 1: feedforward pass, one row per training example.
    # Add a column of ones as the bias unit of the input layer.
    a1 = np.column_stack((np.ones((m, 1)), X))
    z2 = np.dot(a1, Theta1.T)
    # Add a column of ones as the bias unit of the hidden layer.
    a2 = np.column_stack((np.ones((m, 1)), s.sigmoid(z2)))
    a3 = s.sigmoid(np.dot(a2, Theta2.T))

    # Recode the labels as one-hot rows: label k sets column k - 1, so both
    # a3 and y_onehot are (m, num_labels).
    labels = np.asarray(y).reshape(-1).astype(int)
    y_onehot = np.zeros((m, num_labels))
    y_onehot[np.arange(m), labels - 1] = 1

    # The cost is a sum over every example and every output unit.
    cost = np.sum(y_onehot * np.log(a3) + (1 - y_onehot) * np.log(1 - a3))
    J = -(1.0 / m) * cost

    # The first column of each Theta multiplies the bias unit and is not
    # regularized, hence the [:, 1:] slices.
    sumOfTheta1 = np.sum(Theta1[:, 1:]**2)
    sumOfTheta2 = np.sum(Theta2[:, 1:]**2)
    J = J + (lambda_reg / (2.0 * m)) * (sumOfTheta1 + sumOfTheta2)

    # Part 2: backpropagation. delta3[t, k] = a3[t, k] - y_k, where y_k is 1
    # when example t belongs to class k and 0 otherwise.
    delta3 = a3 - y_onehot
    # delta2 = Theta2' * delta3 .* sigmoidGradient(z2), skipping the bias
    # column of Theta2 because the bias unit has no incoming error.
    delta2 = np.dot(delta3, Theta2[:, 1:]) * sg.sigmoidGradient(z2)

    # These matrix products equal the sum over examples of the per-example
    # outer products delta2 (x) a1 and delta3 (x) a2, divided by m.
    Theta1_grad = np.dot(delta2.T, a1) / float(m)
    Theta2_grad = np.dot(delta3.T, a2) / float(m)

    # Part 3: only regularize for j >= 1, so skip the first (bias) column.
    Theta1_grad[:, 1:] += (float(lambda_reg) / m) * Theta1[:, 1:]
    Theta2_grad[:, 1:] += (float(lambda_reg) / m) * Theta2[:, 1:]

    # Unroll gradients column-major, matching the order='F' reshape above.
    grad = np.concatenate((Theta1_grad.reshape(Theta1_grad.size, order='F'),
                           Theta2_grad.reshape(Theta2_grad.size, order='F')))

    return J, grad
def ex4():
    """Run the Exercise 4 (neural network learning) walkthrough.

    Loads the digit data from ``ex4data1.mat`` and pre-trained weights from
    ``ex4weights.mat``, reports the cost with and without regularization,
    evaluates the sigmoid gradient, trains the network with
    ``scipy.optimize.minimize``, displays the learned hidden-layer weights and
    prints the training-set accuracy. Returns nothing.
    """
    ## Setup the parameters you will use for this exercise
    input_layer_size = 400  # 20x20 Input Images of Digits
    hidden_layer_size = 25  # 25 hidden units
    num_labels = 10  # 10 labels, from 1 to 10
    # (note that we have mapped "0" to label 10)

    ## =========== Part 1: Loading and Visualizing Data =============
    #  We start the exercise by first loading and visualizing the dataset.
    #  You will be working with a dataset that contains handwritten digits.

    # Load Training Data
    print('Loading and Visualizing Data ...')

    mat = scipy.io.loadmat('ex4data1.mat')
    X = mat['X']
    y = mat['y'].ravel()
    m = X.shape[0]

    # Randomly select 100 data points to display
    sel = np.random.choice(m, 100, replace=False)

    displayData(X[sel, :])

    print('Program paused. Press enter to continue.')

    ## ================ Part 2: Loading Parameters ================
    # In this part of the exercise, we load some pre-initialized
    # neural network parameters.

    print('\nLoading Saved Neural Network Parameters ...')

    # Load the weights into variables Theta1 and Theta2
    mat = scipy.io.loadmat('ex4weights.mat')
    Theta1 = mat['Theta1']
    Theta2 = mat['Theta2']

    # Unroll parameters
    nn_params = np.concatenate([Theta1.ravel(), Theta2.ravel()])

    ## ================ Part 3: Compute Cost (Feedforward) ================
    #  Start with the feedforward part of the network, which returns the cost
    #  only, and verify it against the expected value for the fixed debugging
    #  parameters. Regularization is switched off here so the plain cost is
    #  easier to debug; Part 4 adds it.
    print('\nFeedforward Using Neural Network ...')

    # Weight regularization parameter (we set this to 0 here).
    lambda_value = 0

    J = nnCostFunction(nn_params, input_layer_size, hidden_layer_size,
                       num_labels, X, y, lambda_value)[0]

    print(
        'Cost at parameters (loaded from ex4weights): %f \n(this value should be about 0.287629)'
        % J)

    print('\nProgram paused. Press enter to continue.')

    ## =============== Part 4: Implement Regularization ===============
    #  Once your cost function implementation is correct, you should now
    #  continue to implement the regularization with the cost.

    print('\nChecking Cost Function (w/ Regularization) ... ')

    # Weight regularization parameter (we set this to 1 here).
    lambda_value = 1

    J = nnCostFunction(nn_params, input_layer_size, hidden_layer_size,
                       num_labels, X, y, lambda_value)[0]

    print(
        'Cost at parameters (loaded from ex4weights): %f \n(this value should be about 0.383770)'
        % J)

    print('Program paused. Press enter to continue.')

    ## ================ Part 5: Sigmoid Gradient  ================
    #  Before you start implementing the neural network, you will first
    #  implement the gradient for the sigmoid function.

    print('\nEvaluating sigmoid gradient...')

    g = sigmoidGradient(np.array([1, -0.5, 0, 0.5, 1]))
    print('Sigmoid gradient evaluated at [1 -0.5 0 0.5 1]:')
    print(formatter('%f ', g))

    print('Program paused. Press enter to continue.')

    ## ================ Part 6: Initializing Parameters ================
    #  In this part of the exercise, you will be starting to implement a two
    #  layer neural network that classifies digits. You will start by
    #  implementing a function to initialize the weights of the neural network
    #  (randInitializeWeights).

    print('\nInitializing Neural Network Parameters ...')

    initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
    initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)

    # Unroll parameters (same layout as nn_params above).
    initial_nn_params = np.concatenate(
        [initial_Theta1.ravel(), initial_Theta2.ravel()])

    ## =============== Part 7: Implement Backpropagation ===============
    #  Once your cost matches up with ours, you should proceed to implement the
    #  backpropagation algorithm for the neural network, so that
    #  nnCostFunction also returns the partial derivatives of the parameters.
    print('\nChecking Backpropagation... ')

    #  Check gradients by running checkNNGradients
    # TODO(review): the checkNNGradients call appears to be missing here; its
    # signature is not visible in this file section, so it is not restored.

    print('\nProgram paused. Press enter to continue.')

    ## =============== Part 8: Implement Regularization ===============
    #  Once your backpropagation implementation is correct, you should now
    #  continue to implement the regularization with the cost and gradient.

    print('\nChecking Backpropagation (w/ Regularization) ... ')

    #  Check gradients by running checkNNGradients
    # TODO(review): as above, the regularized checkNNGradients call appears to
    # be missing here.
    lambda_value = 3

    # Also output the costFunction debugging values
    debug_J = nnCostFunction(nn_params, input_layer_size, hidden_layer_size,
                             num_labels, X, y, lambda_value)[0]

    # The message now reports the lambda actually used above (3, not 10).
    print(
        '\n\nCost at (fixed) debugging parameters (w/ lambda = 3): %f \n(this value should be about 0.576051)\n\n'
        % debug_J)

    print('Program paused. Press enter to continue.')

    ## =================== Part 8: Training NN ===================
    #  All the pieces needed to train the network are now in place. An
    #  advanced optimizer can minimize the cost efficiently as long as it is
    #  given the gradient computation.
    print('\nTraining Neural Network... ')

    #  After you have completed the assignment, change the MaxIter to a larger
    #  value to see how more training helps.
    options = {'maxiter': 50}

    #  You should also try different values of lambda
    lambda_value = 1

    # Create "short hand" for the cost function to be minimized
    costFunction = lambda p: nnCostFunction(
        p, input_layer_size, hidden_layer_size, num_labels, X, y, lambda_value)

    # Now, costFunction is a function that takes in only one argument (the
    # neural network parameters) and returns (cost, gradient); jac=True tells
    # the optimizer to expect that pair.
    # NOTE(review): the original call was truncated after `costFunction,`.
    # The starting point, method and remaining arguments are reconstructed --
    # confirm against the intended optimizer settings.
    res = optimize.minimize(costFunction,
                            initial_nn_params,
                            jac=True,
                            method='CG',
                            options=options)
    nn_params = res.x

    # Obtain Theta1 and Theta2 back from nn_params
    Theta1 = nn_params[0:hidden_layer_size * (input_layer_size + 1)].reshape(
        hidden_layer_size, input_layer_size + 1)

    Theta2 = nn_params[hidden_layer_size * (input_layer_size + 1):].reshape(
        num_labels, hidden_layer_size + 1)

    print('Program paused. Press enter to continue.')

    ## ================= Part 9: Visualize Weights =================
    #  You can now "visualize" what the neural network is learning by
    #  displaying the hidden units to see what features they are capturing in
    #  the data.

    print('\nVisualizing Neural Network... ')

    displayData(Theta1[:, 1:])

    print('\nProgram paused. Press enter to continue.')

    ## ================= Part 10: Implement Predict =================
    #  After training the neural network, use it to predict the labels of the
    #  training set and report the training-set accuracy.

    pred = predict(Theta1, Theta2, X)

    print('\nTraining Set Accuracy: %f' % (np.mean(
        (pred == y).astype(int)) * 100))

input("Program paused. Press Enter to continue...")

## ================ Part 5: Sigmoid Gradient  ================
#  Before implementing the neural network, first implement the gradient of
#  the sigmoid function (sigmoidGradient).

print('Evaluating sigmoid gradient...')

g = sigmoidGradient(np.array([1, -0.5, 0, 0.5, 1]))
print('Sigmoid gradient evaluated at [1 -0.5 0 0.5 1]: ')
# The header above announces the values, so actually show them.
print(g)

input("Program paused. Press Enter to continue...")

## ================ Part 6: Initializing Parameters ================
#  In this part of the exercise, you will be starting to implement a two
#  layer neural network that classifies digits. You will start by
#  implementing a function to initialize the weights of the neural network
#  (randInitializeWeights).

print('Initializing Neural Network Parameters ...')

initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)
# Weight regularization parameter

lambda_param = 1

# nnCostFunction returns (cost, gradient); only the cost is reported here.
J, grad = nnCostFunction(nn_params, input_layer_size, hidden_layer_size,
                         num_labels, X, y, lambda_param)

print('Cost at parameters (loaded from ex4weights): {:.6f} '.format(float(J)))
print('\n(this value should be about 0.383770)\n')

input('Program paused. Press enter to continue.\n')

# ================ Part 5: Sigmoid Gradient  ================
print('\nEvaluating sigmoid gradient...\n')

g = sigmoidGradient([-1, -0.5, 0, 0.5, 1])

print('Sigmoid gradient evaluated at [-1 -0.5 0 0.5 1]:\n  ')
# The header above announces the values, so actually show them.
print(g)

input('Program paused. Press enter to continue.\n')

# ================ Part 6: Initializing Parameters ================
# Implement a two layer neural network that classifies digits.
# Start by implementing a function to initialize the weights of the neural network

print('\nInitializing Neural Network Parameters ...\n')

initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size,
                   num_labels, X, y, MLlambda):
    """Compute the cost and gradient of a two-layer classification network.

    ``nn_params`` holds Theta1 followed by Theta2, each unrolled column-major;
    the returned gradient is unrolled the same way.

    Args:
        nn_params: Flat array of unrolled weights.
        input_layer_size: Number of input features (without the bias unit).
        hidden_layer_size: Number of hidden units (without the bias unit).
        num_labels: Number of output classes.
        X: 2-D array of training examples, one per row, without a bias column.
        y: 2-D integer array whose first column holds the class index of each
            example; it is used directly as a row index into an identity
            matrix.
        MLlambda: Regularization strength; bias columns are not regularized.

    Returns:
        Tuple ``(J, grad)``: the scalar cost and the unrolled gradient vector.
    """
    # Make sure all further math with MLlambda is done as float; the caller
    # sometimes passes an int.
    MLlambda = float(MLlambda)

    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = np.reshape(nn_params[:split],
                        (hidden_layer_size, (input_layer_size + 1)),
                        order='F')
    Theta2 = np.reshape(nn_params[split:],
                        (num_labels, (hidden_layer_size + 1)),
                        order='F')

    # Number of examples (X must be 2-D).
    m, _ = np.shape(X)

    # ------------------------- PART 1: cost -------------------------
    # One-hot encode the labels by picking rows of an identity matrix.
    # NOTE(review): y[:, 0] is used as a zero-based row index, so labels are
    # presumably already shifted to 0..num_labels-1 by the caller -- confirm.
    y_eye = np.eye(num_labels)
    y_matrix = y_eye[y[:, 0], :]

    # Feedforward, assuming a 3 layer network. Add the ones column to X.
    a1 = np.c_[np.ones(m), X]

    # Hidden layer outputs.
    z2 = np.dot(a1, Theta1.transpose())  # (m, hidden_layer_size)
    a2 = sigmoid(z2)
    a2 = np.c_[np.ones(m), a2]  # prepend a0 = 1

    z3 = np.dot(a2, Theta2.transpose())  # (m, num_labels)
    a3 = sigmoid(z3)

    hox = a3

    Inner_J = -y_matrix * np.log(hox) - (1 - y_matrix) * np.log(1 - hox)
    J_wo_reg = np.sum(Inner_J) / m

    # Regularization portion: bias columns are excluded.
    Theta1_no_bias = Theta1[:, 1:]
    Theta2_no_bias = Theta2[:, 1:]

    Theta1_no_bias_squared = np.square(Theta1_no_bias)
    Theta2_no_bias_squared = np.square(Theta2_no_bias)

    reg = (MLlambda / (2 * m)) * (np.sum(Theta1_no_bias_squared) +
                                  np.sum(Theta2_no_bias_squared))
    J = J_wo_reg + reg

    # ----------------------- PART 2: gradients -----------------------
    # STEP 1: error at the output layer.
    d3 = a3 - y_matrix

    # STEP 2: error at the hidden layer. Theta2 is used without its bias
    # column, so d2 already has no bias entry.
    siggrad_z2 = sigmoidGradient(z2)
    d2 = np.dot(d3, Theta2_no_bias) * siggrad_z2

    # STEP 3: accumulate the Delta matrices over all examples.
    Delta1 = np.dot(d2.transpose(), a1)
    Delta2 = np.dot(d3.transpose(), a2)

    # Unregularized back prop gradients.
    Theta1_grad = (1. / m) * Delta1
    Theta2_grad = (1. / m) * Delta2

    # --------------------- PART 3: regularization ---------------------
    # Theta1 and Theta2 include the bias columns, which must not be
    # regularized. Zero those columns in a copy so the matrices keep their
    # shape and can simply be added to the gradients computed above.
    Theta1_bias_zero = np.copy(Theta1)
    Theta1_bias_zero[:, 0] = 0

    Theta2_bias_zero = np.copy(Theta2)
    Theta2_bias_zero[:, 0] = 0

    # Scale the Thetas by lambda / m.
    Theta1_reg = (MLlambda / m) * Theta1_bias_zero
    Theta2_reg = (MLlambda / m) * Theta2_bias_zero

    # Add the regularization component to the gradients.
    Theta1_grad = Theta1_grad + Theta1_reg
    Theta2_grad = Theta2_grad + Theta2_reg

    # Unroll gradients column-major, matching the order='F' reshape above.
    grad = np.hstack((Theta1_grad.ravel(order='F'),
                      Theta2_grad.ravel(order='F')))

    return J, grad
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, reg_lambda, 
                   returnOnlyGrad = None, returnOnlyCost = None, flattenResult=None):
    """Regularized cost and gradient of a two-layer classification network.

    The weights arrive "unrolled" in ``nn_params`` and are reshaped
    (row-major) into ``Theta1`` of shape
    ``(hidden_layer_size, input_layer_size + 1)`` and ``Theta2`` of shape
    ``(num_labels, hidden_layer_size + 1)``.

    Args:
        nn_params: 1-D vector holding Theta1 followed by Theta2.
        input_layer_size: number of input features (without bias).
        hidden_layer_size: number of hidden units (without bias).
        num_labels: number of output units.
        X: training examples, one per row.
        y: 2-D label array; ``y[i, 0]`` is used directly as the column
            index of the one-hot row for example ``i``.
        reg_lambda: regularization strength.
        returnOnlyGrad: if truthy, return only the unrolled gradient.
        returnOnlyCost: if truthy (and ``returnOnlyGrad`` is not), return
            only the cost.
        flattenResult: with ``returnOnlyGrad``, return ``grad.flatten()``.

    Returns:
        ``(J, grad)`` by default, or just ``grad`` / just ``J`` according
        to the flags above. ``grad`` is Theta1_grad followed by
        Theta2_grad, both unrolled row-major.
    """
    # Reshape nn_params back into the weight matrices Theta1 and Theta2.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = np.reshape(nn_params[0:split], (hidden_layer_size, input_layer_size + 1))
    Theta2 = np.reshape(nn_params[split:], (num_labels, hidden_layer_size + 1))
    m = np.shape(X)[0]

    # Recode the label vector y into a one-hot matrix Y.
    # NOTE(review): labels are used as-is as column indices, so they must
    # be integers that are valid indices into num_labels columns
    # (e.g. 0..num_labels-1) - confirm against the caller's label encoding.
    Y = np.zeros((m, num_labels))
    for i in range(m):
        Y[i, y[i, 0]] = 1

    # Part 1: feed-forward to compute the hypothesis h = a3 and the cost.
    a1 = np.c_[np.ones((m, 1)), X]
    z2 = a1.dot(Theta1.T)
    a2 = np.c_[np.ones((z2.shape[0], 1)), sigmoid(z2)]
    z3 = a2.dot(Theta2.T)
    h = sigmoid(z3)
    J = np.sum(np.sum((-Y) * np.log(h) - (1 - Y) * np.log(1 - h), 1)) / m

    # Part 2: backpropagation. sigma2 is computed with a dummy bias column
    # so its shape matches Theta2; that column is dropped before use.
    sigma3 = h - Y
    sigma2 = sigma3.dot(Theta2) * sigmoidGradient(np.c_[np.ones((np.shape(z2)[0], 1)), z2])
    delta2 = sigma3.T.dot(a2)
    delta1 = sigma2[:, 1:].T.dot(a1)

    # Part 3: regularization of cost and gradients (bias column excluded).
    J = J + (reg_lambda / (2.0 * m)) * np.sum(Theta1[:, 1:] * Theta1[:, 1:])
    J = J + (reg_lambda / (2.0 * m)) * np.sum(Theta2[:, 1:] * Theta2[:, 1:])
    p1 = (reg_lambda / m) * np.c_[np.zeros((np.shape(Theta1)[0], 1)), Theta1[:, 1:]]
    p2 = (reg_lambda / m) * np.c_[np.zeros((np.shape(Theta2)[0], 1)), Theta2[:, 1:]]
    Theta1_grad = delta1 / m + p1
    Theta2_grad = delta2 / m + p2

    # Unroll gradients (row-major, matching the reshape above).
    grad = np.r_[Theta1_grad.ravel(), Theta2_grad.ravel()]

    if returnOnlyGrad:
        if flattenResult:
            return grad.flatten()
        return grad
    if returnOnlyCost:
        return J
    return (J, grad)
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size,
                   num_labels, X, y, lamda):
    """Return the regularized cost and unrolled gradient of a 2-layer network.

    ``nn_params`` holds Theta1 followed by Theta2, both stored in
    column-major (Fortran) order; the returned gradient uses the same
    layout. ``y[i][0]`` is a 1-based class label for example ``i``.

    Returns:
        ``(J, grad)`` where ``J`` is the cost and ``grad`` the 1-D
        concatenation of the two unrolled gradient matrices.
    """
    # Split the flat parameter vector into the two weight matrices.
    boundary = hidden_layer_size * (input_layer_size + 1)
    Theta1 = nn_params[:boundary].reshape(
        (hidden_layer_size, input_layer_size + 1), order='F')
    Theta2 = nn_params[boundary:].reshape(
        (num_labels, hidden_layer_size + 1), order='F')

    m = X.shape[0]
    bias_column = np.ones((m, 1))
    X = np.hstack((bias_column, X))

    # One-hot encode the 1-based labels.
    onehot = np.zeros((m, num_labels))
    for row in range(m):
        onehot[row, y[row][0] - 1] = 1

    # Part 1: feed-forward over the whole batch, then accumulate the
    # cross-entropy cost one example at a time.
    hidden = np.hstack((bias_column, sigmoid(X.dot(Theta1.T))))
    output = sigmoid(hidden.dot(Theta2.T))

    J = 0
    for target, predicted in zip(onehot, output):
        J += (-target * np.log(predicted) -
              (1 - target) * np.log(1 - predicted)).sum()
    J /= m
    weight_penalty = ((Theta1[:, 1:] ** 2).sum() +
                      (Theta2[:, 1:] ** 2).sum())
    J += weight_penalty * lamda / 2 / m

    # Part 2: backpropagation, one training example per iteration.
    grad1 = np.zeros(Theta1.shape)
    grad2 = np.zeros(Theta2.shape)
    for row in range(m):
        x_col = X[row:row + 1, :].T
        z_hidden = Theta1.dot(x_col)
        a_hidden = np.vstack(([1], sigmoid(z_hidden)))
        a_out = sigmoid(Theta2.dot(a_hidden))

        err_out = a_out - onehot[row:row + 1, :].T
        # A dummy leading 1 keeps the shapes aligned with Theta2; the
        # corresponding bias row is discarded when accumulating grad1.
        err_hidden = Theta2.T.dot(err_out) * np.vstack(
            ([1], sigmoidGradient(z_hidden)))

        grad1 += err_hidden[1:, :].dot(x_col.T)
        grad2 += err_out.dot(a_hidden.T)

    grad1 /= m
    grad2 /= m

    # Part 3: regularize every weight except the bias column.
    grad1[:, 1:] += lamda / m * Theta1[:, 1:]
    grad2[:, 1:] += lamda / m * Theta2[:, 1:]

    # Unroll column-major, mirroring how the parameters were packed.
    grad = np.hstack((grad1.ravel(order='F'), grad2.ravel(order='F')))

    return J, grad
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels,
                   X, y, xlambda):
    """Cost and gradient of a two-layer classification network.

    Args:
        nn_params: 1-D vector holding Theta1 followed by Theta2, each
            unrolled row-major.
        input_layer_size: number of input features (without bias).
        hidden_layer_size: number of hidden units (without bias).
        num_labels: number of output units.
        X: training examples, one per row.
        y: 1-based class labels, one per example.
        xlambda: regularization strength.
            NOTE(review): this argument is accepted but not used - neither
            the cost nor the gradient below is regularized. Confirm
            whether that is intended.

    Returns:
        ``(J, grad)``: ``J`` is the cross-entropy cost as a 1x1 array and
        ``grad`` is a column vector with Theta1_grad followed by
        Theta2_grad, each unrolled row-major.
    """
    # Reshape nn_params back into the weight matrices Theta1 and Theta2.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = nn_params[:split].reshape(hidden_layer_size, input_layer_size + 1)
    Theta2 = nn_params[split:].reshape(num_labels, hidden_layer_size + 1)
    m, n = np.shape(X)

    ## ==========================  Part 1:  ===========================
    # One-hot encode y: column i of ylabel is the target for example i.
    ylabel = np.zeros([num_labels, m])
    for i in range(0, m):
        ylabel[int(y[i] - 1.0), i] = 1

    # Forward propagation.
    X = np.c_[np.ones(m).reshape(m, 1), X]
    z2 = np.dot(X, Theta1.T)
    a2 = sigmoid.sigmoid(z2)
    # Built once here; it is needed again for every example during BP.
    a2_bias = np.c_[np.ones(m).reshape(m, 1), a2]
    z3 = np.dot(a2_bias, Theta2.T)
    a3 = sigmoid.sigmoid(z3)  # a3 is h(x), the output layer.

    # Cross-entropy cost: -sum(y*log(h) + (1-y)*log(1-h)) / m.
    J = 0
    for i in range(0, m):
        predicted = a3[i, :].reshape(1, num_labels)
        target = ylabel[:, i].reshape(num_labels, 1)
        J = J - (np.dot(np.log(predicted), target) +
                 np.dot(np.log(1 - predicted), 1 - target))
    J = J / m

    ## ==================== Part 2: Compute the gradients ======================
    Delta1, Delta2 = np.zeros(np.shape(Theta1)), np.zeros(np.shape(Theta2))

    # Backpropagation, one example at a time.
    for t in range(0, m):
        # Error between the prediction and the target.
        delta3 = (a3[t, :] - ylabel[:, t]).reshape(num_labels, 1)
        # Error in the hidden layer. The sigmoid derivative is evaluated at
        # the pre-activation z2, and Theta2's bias column is left out so
        # no bias row has to be stripped afterwards.
        delta2 = np.dot(Theta2[:, 1:].T, delta3) * sigmoidGradient.sigmoidGradient(
            z2[t, :].reshape(hidden_layer_size, 1))

        Delta1 = Delta1 + np.dot(delta2, X[t, :].reshape(1, n + 1))
        Delta2 = Delta2 + np.dot(
            delta3, a2_bias[t, :].reshape(1, hidden_layer_size + 1))

    Theta1_grad = Delta1 / m
    Theta2_grad = Delta2 / m

    # Unroll both gradients into one column vector.
    grad = np.r_[Theta1_grad.reshape(-1, 1), Theta2_grad.reshape(-1, 1)]

    return J, grad
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lmbda):
    """Regularized cost and gradient of a two-layer classification network.

    Args:
        nn_params: unrolled weights (Theta1 then Theta2, row-major); may be
            shaped ``(N,)``, ``(1, N)`` or ``(N, 1)``.
        input_layer_size: number of input features (without bias).
        hidden_layer_size: number of hidden units (without bias).
        num_labels: number of output units.
        X: training examples, one per row.
        y: 1-based class labels, one per example.
        lmbda: regularization strength.

    Returns:
        ``(J, grad)`` where ``J`` is the cost and ``grad`` is the 1-D
        concatenation of Theta1_grad and Theta2_grad, unrolled row-major.
    """
    # Reshape nn_params back into the weight matrices Theta1 and Theta2.
    # Flattening first accepts row, column and 1-D parameter vectors alike.
    nn_params = np.asarray(nn_params).ravel()
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = nn_params[:split].reshape((hidden_layer_size, input_layer_size + 1))
    Theta2 = nn_params[split:].reshape((num_labels, hidden_layer_size + 1))

    # Work on plain ndarrays so the shape algebra below is the same whether
    # the caller passed an ndarray or an np.matrix.
    X = np.asarray(X)
    m = X.shape[0]

    # One-hot encode the 1-based labels.
    labels = np.asarray(y).ravel().astype(int)
    new_labels = np.zeros((m, num_labels))
    new_labels[np.arange(m), labels - 1] = 1

    # Feed-forward over the whole batch.
    a_1 = np.hstack((np.ones((m, 1)), X))
    z_2 = a_1.dot(Theta1.T)
    a_2 = np.hstack((np.ones((m, 1)), sigmoid(z_2)))
    a_3 = sigmoid(a_2.dot(Theta2.T))

    # Cross-entropy cost; nan_to_num keeps log(0) = -inf finite.
    J = np.sum(np.multiply(-new_labels, np.nan_to_num(np.log(a_3))) -
               np.multiply(1 - new_labels, np.nan_to_num(np.log(1 - a_3)))) / m

    # Regularization skips the bias column of each weight matrix.
    t1 = Theta1[:, 1:]
    t2 = Theta2[:, 1:]
    J += (lmbda * (np.sum(np.power(t1, 2)) + np.sum(np.power(t2, 2)))) / (2 * m)

    # Backpropagation over the whole batch.
    delta_3 = a_3 - new_labels
    delta_2 = np.multiply(delta_3.dot(t2), sigmoidGradient(z_2))

    Theta1_grad = delta_2.T.dot(a_1) / m
    Theta2_grad = delta_3.T.dot(a_2) / m

    Theta1_grad[:, 1:] += (lmbda * t1) / m
    Theta2_grad[:, 1:] += (lmbda * t2) / m

    # Unroll gradients
    grad = np.hstack((Theta1_grad.flatten(), Theta2_grad.flatten()))

    return J, grad
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, Lambda):
    """Compute the regularized cost and gradient of a two-layer network.

    The parameters for the neural network are "unrolled" into the vector
    nn_params (Theta1 then Theta2, column-major) and are converted back
    into the weight matrices here. The returned gradient is an "unrolled"
    vector of the partial derivatives, in the same column-major layout.

    Args:
        nn_params: 1-D vector of unrolled weights.
        input_layer_size: number of input features (without bias).
        hidden_layer_size: number of hidden units (without bias).
        num_labels: number of output units.
        X: training examples, one per row.
        y: class labels with values 1..num_labels, one per example.
        Lambda: regularization strength.

    Returns:
        ``(J, Ngrad)``: the cost and the unrolled gradient.
    """
    # Obtain Theta1 and Theta2 back from nn_params.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = np.reshape(nn_params[:split],
                        (hidden_layer_size, input_layer_size + 1), order='F')
    Theta2 = np.reshape(nn_params[split:],
                        (num_labels, hidden_layer_size + 1), order='F')

    # Setup some useful variables
    m, _ = X.shape

    # Part 1: feed-forward the network and compute the cost J.
    a1 = np.column_stack((np.ones((m, 1)), X))
    z2 = np.dot(a1, Theta1.T)
    a2 = np.column_stack((np.ones((m, 1)), sigmoid(z2)))
    z3 = np.dot(a2, Theta2.T)
    a3 = sigmoid(z3)

    # Map the label vector onto an (m, num_labels) matrix of 1's and 0's:
    # column k is 1 where the label equals k + 1.
    labels = np.asarray(y).ravel()
    MatY = (labels[:, np.newaxis] == np.arange(1, num_labels + 1)).astype(float)

    first = -np.sum(MatY * np.log(a3))
    second = -np.sum((1 - MatY) * np.log(1 - a3))
    # The bias column is not regularized.
    reg = (Theta1[:, 1:] ** 2).sum() + (Theta2[:, 1:] ** 2).sum()
    J = (first + second) / m + reg * Lambda / (2. * m)

    # Part 2: backpropagation over the whole batch.
    delta3 = a3 - MatY
    delta2 = np.dot(delta3, Theta2)[:, 1:] * sigmoidGradient(z2)
    Theta1_grad = np.dot(delta2.T, a1) / m
    Theta2_grad = np.dot(delta3.T, a2) / m

    # Part 3: regularize the gradients, skipping the bias column so the
    # gradient stays consistent with the cost above.
    Theta1_grad[:, 1:] += Lambda * Theta1[:, 1:] / m
    Theta2_grad[:, 1:] += Lambda * Theta2[:, 1:] / m

    # Unroll gradient (transpose + ravel gives column-major order).
    Ngrad = np.hstack((Theta1_grad.T.ravel(), Theta2_grad.T.ravel()))

    return J, Ngrad
# Weight regularization parameter (we set this to 1 here).
_lambda = 1

# NOTE(review): the nnCostFunction variants defined earlier in this file
# return a (cost, gradient) tuple by default. If one of those is the
# function in scope, J below is that tuple rather than the scalar cost -
# confirm which definition this script is meant to call.
J = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, _lambda)

print('Cost at parameters (loaded from ex4weights): (this value should be about 0.383770)', J)

# ================ Part 5: Sigmoid Gradient  ================
#  Before you start implementing the neural network, you will first
#  implement the gradient for the sigmoid function. You should complete the
#  code in the sigmoidGradient.m file.

g = sigmoidGradient(np.array([1, -0.5, 0, 0.5, 1]))
print('Sigmoid gradient evaluated at [1 -0.5 0 0.5 1]\n ', g)

# ================ Part 6: Initializing Parameters ================
#  In this part of the exercise, you will be starting to implement a two
#  layer neural network that classifies digits. You will start by
#  implementing a function to initialize the weights of the neural network
#  (randInitializeWeights.m)

initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)

# Unroll parameters
def backwards(nn_weights, layers, X, y, num_labels, lambd):
    """Compute the regularized gradient of the network by backpropagation.

    Args:
        nn_weights: neural network parameters, unrolled into a vector.
        layers: list with the number of units per layer.
        X: matrix where every row is a training example.
        y: vector with the label of each instance; each label is used
            directly as a column index into the one-hot target matrix.
        num_labels: number of units in the output layer.
        lambd: regularization factor.

    Returns:
        The per-layer gradient matrices after ``unroll_params``.
    """
    # Setup some useful variables
    m = X.shape[0]
    num_layers = len(layers)

    # Roll the parameter vector back into one weight matrix per layer.
    Theta = roll_params(nn_weights, layers)
    Theta_grad = [zeros(w.shape) for w in Theta]

    # Map the label vector onto a binary (m, num_labels) target matrix.
    yv = zeros((m, num_labels))
    for i in range(m):
        yv[i][y[i]] += 1

    # Step 1: Initialization.
    # A[i] holds the activations of layer i and Z[i] its pre-activations.
    # Z[0] is only a placeholder so that both lists share the same indexing.
    # NOTE(review): assumes addColumnOne prepends a bias column of ones and
    # removeFirstColumn drops it again - confirm against the helpers.
    A = [addColumnOne(X)]
    Z = [addColumnOne(X)]

    # Step 2: Feedforward, keeping every layer's state for the backward pass.
    for i in range(num_layers - 1):
        h = A[i].dot(Theta[i].T)
        Z.append(h)
        A.append(addColumnOne(sigmoid(h)))

    # Step 3: Backpropagation.
    # Output error first, then walk back to the second layer.
    d = removeFirstColumn(A[-1]) - yv
    delta = [d]
    for i in range(num_layers - 2, 0, -1):
        d = removeFirstColumn(d.dot(Theta[i])) * sigmoidGradient(Z[i])
        delta.append(d)

    # The deltas were collected from the last layer to the second one; put
    # them in forward order so delta[i] pairs with Theta[i] / A[i].
    # delta is of size num_layers-1 (no delta for the input layer).
    delta.reverse()

    for i in range(num_layers - 1):
        Theta_grad[i] += delta[i].T.dot(A[i])
        # No regularization on the bias weights (first column).
        Theta_grad[i][:, 1:] += lambd * Theta[i][:, 1:]
        Theta_grad[i] /= m

    # Unroll Params
    Theta_grad = unroll_params(Theta_grad)

    return Theta_grad