Example No. 1
  def loss(self, X, y=None):
    """
    Evaluate loss and gradient for the three-layer convolutional network.
    """
    W1 = self.params['W1']
    W2, b2 = self.params['W2'], self.params['b2']
    W3, b3 = self.params['W3'], self.params['b3']

    # pass pool_param to the forward pass for the max-pooling layer
    pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

    scores = None
    conv, cache1 = layers.conv_forward(X, W1)
    relu1, cache2 = layers.relu_forward(conv)
    maxp, cache3 = layers.max_pool_forward(relu1, pool_param)
    fc1, cache4 = layers.fc_forward(maxp, W2, b2)
    relu2, cache5 = layers.relu_forward(fc1)
    scores, cache6 = layers.fc_forward(relu2, W3, b3)

    if y is None:
      return scores

    loss, grads = 0, {}
    loss, dscores = layers.softmax_loss(scores, y)
    dx3, dW3, db3 = layers.fc_backward(dscores, cache6)
    dRelu2 = layers.relu_backward(dx3, cache5)
    dx2, dW2, db2 = layers.fc_backward(dRelu2, cache4)
    dmaxp = layers.max_pool_backward(dx2.reshape(maxp.shape), cache3)
    dRelu1 = layers.relu_backward(dmaxp, cache2)
    dx, dW1 = layers.conv_backward(dRelu1, cache1)

    grads = {'W1': dW1, 'W2': dW2, 'b2': db2, 'W3': dW3, 'b3': db3}

    return loss, grads
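A hedged usage sketch of the method above: model is assumed to be an instance of the surrounding three-layer network class, and the input/label shapes are illustrative, not taken from the source.

import numpy as np

X_batch = np.random.randn(4, 3, 8, 8)            # small batch of fake images (shape is illustrative)
y_batch = np.random.randint(0, 10, size=4)       # fake integer class labels

scores = model.loss(X_batch)                     # y omitted -> test-time forward pass, scores only
loss, grads = model.loss(X_batch, y_batch)       # y given  -> loss plus gradients for every parameter

learning_rate = 1e-3
for name, dparam in grads.items():               # plain SGD step using the returned gradients
    model.params[name] -= learning_rate * dparam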
Example No. 2
def test_relulayer():
    x = np.random.randn(10, 10)
    dout = np.random.randn(*x.shape)

    dx_num = eval_numerical_gradient_array(lambda x: layers.relu_forward(x)[0], x, dout)
    _, cache = layers.relu_forward(x)
    dx = layers.relu_backward(dout, cache)

    # The error should be around 1e-12
    print('Testing relu layers:')
    print('dx error: ', rel_error(dx_num, dx))
Example No. 3
def test_relulayer():
    x = np.random.randn(10, 10)
    dout = np.random.randn(*x.shape)

    dx_num = eval_numerical_gradient_array(lambda x: layers.relu_forward(x)[0],
                                           x, dout)
    _, cache = layers.relu_forward(x)
    dx = layers.relu_backward(dout, cache)

    # The error should be around 1e-12
    print('Testing relu layers:')
    print('dx error: ', rel_error(dx_num, dx))
Example No. 4
    def test_relu_forward(self):
        # ReLU layer: forward
        x = np.linspace(-0.5, 0.5, num=12).reshape(3, 4)

        out, _ = layers.relu_forward(x)
        correct_out = np.array([[0.,         0.,         0.,         0.        ],
                                [0.,         0.,         0.04545455, 0.13636364],
                                [0.22727273, 0.31818182, 0.40909091, 0.5       ]])

        # Compare your output with ours. The error might be around 5e-8
        # As long as your error is small enough, your implementation should pass this test.
        print('\nTesting relu_forward function:')
        print('difference: ', rel_error(out, correct_out))
        np.testing.assert_allclose(out, correct_out, atol=1e-7)
Example No. 5
    def train_loss(*args):
      X = args[0]
      y = args[1]

      res = X
      for l in range(self.num_layers):
        prev_res = res
        res = affine_forward(prev_res, args[self.w_idx(l)], args[self.b_idx(l)])

        if l < (self.num_layers - 1):
          if self.use_batchnorm:
            res = batchnorm_forward(res, args[self.bn_ga_idx(l)],
                                    args[self.bn_bt_idx(l)], self.bn_params[l])
          res = relu_forward(res)
          if self.use_dropout:
            res = dropout_forward(res, self.dropout_param)

      scores = res

      if mode == 'test':
        return scores

      #loss, _ = softmax_loss(scores, y)
      loss = svm_loss(scores, y)
      return loss
Example No. 6
        def train_loss(*args):
            X = args[0]
            y = args[1]

            res = X
            for l in range(self.num_layers):
                prev_res = res
                res = affine_forward(prev_res, args[self.w_idx(l)],
                                     args[self.b_idx(l)])

                if l < (self.num_layers - 1):
                    if self.use_batchnorm:
                        res = batchnorm_forward(res, args[self.bn_ga_idx(l)],
                                                args[self.bn_bt_idx(l)],
                                                self.bn_params[l])
                    res = relu_forward(res)
                    if self.use_dropout:
                        res = dropout_forward(res, self.dropout_param)

            scores = res

            if mode == 'test':
                return scores

            #loss, _ = softmax_loss(scores, y)
            loss = svm_loss(scores, y)
            return loss
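Because train_loss is a pure function of its argument list (X, y, then the parameters), it can be sanity-checked against a central-difference approximation. The sketch below assumes args is the tuple actually passed in and that self.w_idx(0) indexes the first weight matrix; both come from the surrounding class rather than from this excerpt.

import numpy as np

def numeric_grad(f, args, idx, h=1e-5):
    # Central-difference gradient of f(*args) with respect to the array at args[idx].
    param = args[idx]
    grad = np.zeros_like(param)
    it = np.nditer(param, flags=['multi_index'])
    while not it.finished:
        i = it.multi_index
        old = param[i]
        param[i] = old + h
        f_plus = f(*args)
        param[i] = old - h
        f_minus = f(*args)
        param[i] = old                      # restore the original value
        grad[i] = (f_plus - f_minus) / (2.0 * h)
        it.iternext()
    return grad

# Hypothetical usage: compare against the analytic gradient of the first weight matrix.
# dW1_numeric = numeric_grad(train_loss, args, self.w_idx(0))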
Example No. 7
    def test_relu_backward(self):
        # ReLU layer: backward
        np.random.seed(498)
        x = np.random.randn(10, 10)
        dout = np.random.randn(*x.shape)

        dx_num = eval_numerical_gradient_array(
            lambda x: layers.relu_forward(x)[0], x, dout)

        _, cache = layers.relu_forward(x)
        dx = layers.relu_backward(dout, cache)

        # The error should be around 3e-12
        print('\nTesting relu_backward function:')
        print('dx error: ', rel_error(dx_num, dx))
        np.testing.assert_allclose(dx, dx_num, atol=1e-9)
Example No. 8
def affine_relu_forward(x, w, b):
  """
  Convenience layer that performs an affine transform followed by a ReLU

  Inputs:
  - x: Input to the affine layer
  - w, b: Weights for the affine layer

  Returns a tuple of:
  - out: Output from the ReLU
  - cache: Object to give to the backward pass
  """
  a, fc_cache = affine_forward(x, w, b)
  out, relu_cache = relu_forward(a)
  cache = (fc_cache, relu_cache)
  return out, cache
Example No. 9
def affine_relu_forward(x, w, b):
    """
  Convenience layer that perorms an affine transform followed by a ReLU

  Inputs:
  - x: Input to the affine layer
  - w, b: Weights for the affine layer

  Returns a tuple of:
  - out: Output from the ReLU
  - cache: Object to give to the backward pass
  """
    a = affine_forward(x, w, b)
    out = relu_forward(a)
    return out
Example No. 10
def affine_relu_forward(x, w, b):
    '''
    Convenience layer that performs an affine transform followed by a ReLU.
    Input:
        x: input to the affine layer
        w: weights
        b: bias
    Return: a tuple
        out: output from the ReLU
        cache: object to give to the backward pass
    '''
    a, fc_cache = layers.affine_forward(x, w, b)  # a = wx + b, fc_cache = (x, w, b)
    out, relu_cache = layers.relu_forward(a)      # out = np.maximum(0, a), relu_cache = a
    cache = (fc_cache, relu_cache)
    return out, cache
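The (fc_cache, relu_cache) tuple built above is intended for a matching backward helper. A minimal sketch, assuming layers.relu_backward and layers.affine_backward exist with the usual (dout, cache) signatures:

def affine_relu_backward(dout, cache):
    '''
    Backward pass for the affine-relu convenience layer.
    '''
    fc_cache, relu_cache = cache
    da = layers.relu_backward(dout, relu_cache)          # gate the upstream gradient through the ReLU
    dx, dw, db = layers.affine_backward(da, fc_cache)    # then back through the affine transform
    return dx, dw, db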
Example No. 11
    def forward(self, X):
        scores, cache = None, None
        #######################################################################
        # TODO: Implement the forward pass to compute classification scores   #
        # for the input data X. Store into cache any data that will be needed #
        # during the backward pass.                                           #
        #######################################################################
        out11, cache11 = fc_forward(X, self.W1, self.b1)
        out12, cache12 = relu_forward(out11)
        scores, cache2 = fc_forward(out12, self.W2, self.b2)

        cache = (cache11, cache12, cache2)
        #######################################################################
        #                          END OF YOUR CODE                           #
        #######################################################################
        return scores, cache
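A backward method matching the forward pass above could unwind the cache in reverse order; this is a sketch under the assumption that fc_backward and relu_backward follow the (dout, cache) convention used in the other examples.

    def backward(self, dscores, cache):
        cache11, cache12, cache2 = cache
        dout12, dW2, db2 = fc_backward(dscores, cache2)   # second fully connected layer
        dout11 = relu_backward(dout12, cache12)           # ReLU nonlinearity
        dX, dW1, db1 = fc_backward(dout11, cache11)       # first fully connected layer
        grads = {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2}
        return dX, grads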
Example No. 12
def conv_relu_forward(x, w, b, conv_param):
  """
  A convenience layer that performs a convolution followed by a ReLU.

  Inputs:
  - x: Input to the convolutional layer
  - w, b, conv_param: Weights and parameters for the convolutional layer
  
  Returns a tuple of:
  - out: Output from the ReLU
  - cache: Object to give to the backward pass
  """
  a, conv_cache = conv_forward_fast(x, w, b, conv_param)
  out, relu_cache = relu_forward(a)
  cache = (conv_cache, relu_cache)
  return out, cache
Example No. 13
def conv_relu_forward(x, w, b, conv_param):
    """
  A convenience layer that performs a convolution followed by a ReLU.

  Inputs:
  - x: Input to the convolutional layer
  - w, b, conv_param: Weights and parameters for the convolutional layer
  
  Returns a tuple of:
  - out: Output from the ReLU
  - cache: Object to give to the backward pass
  """
    a, conv_cache = conv_forward_fast(x, w, b, conv_param)
    out, relu_cache = relu_forward(a)
    cache = (conv_cache, relu_cache)
    return out, cache
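The corresponding backward helper usually mirrors the forward pass; a minimal sketch, assuming relu_backward and a conv_backward_fast counterpart to conv_forward_fast are available with (dout, cache) signatures:

def conv_relu_backward(dout, cache):
    """
    Backward pass for the conv-relu convenience layer.
    """
    conv_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)                # back through the ReLU
    dx, dw, db = conv_backward_fast(da, conv_cache)     # back through the convolution
    return dx, dw, db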
Example No. 14
def conv_relu_pool_forward(x, w, b, conv_param, pool_param):
    """
  Convenience layer that performs a convolution, a ReLU, and a pool.

  Inputs:
  - x: Input to the convolutional layer
  - w, b, conv_param: Weights and parameters for the convolutional layer
  - pool_param: Parameters for the pooling layer

  Returns a tuple of:
  - out: Output from the pooling layer
  - cache: Object to give to the backward pass
  """
    a, conv_cache = conv_forward_fast(x, w, b, conv_param)
    s, relu_cache = relu_forward(a)
    out, pool_cache = max_pool_forward_fast(s, pool_param)
    cache = (conv_cache, relu_cache, pool_cache)
    return out, cache
Example No. 15
def conv_relu_pool_forward(x, w, b, conv_param, pool_param):
  """
  Convenience layer that performs a convolution, a ReLU, and a pool.

  Inputs:
  - x: Input to the convolutional layer
  - w, b, conv_param: Weights and parameters for the convolutional layer
  - pool_param: Parameters for the pooling layer

  Returns a tuple of:
  - out: Output from the pooling layer
  - cache: Object to give to the backward pass
  """
  a, conv_cache = conv_forward_fast(x, w, b, conv_param)
  s, relu_cache = relu_forward(a)
  out, pool_cache = max_pool_forward_fast(s, pool_param)
  cache = (conv_cache, relu_cache, pool_cache)
  return out, cache
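And the pooled variant reverses all three stages; again a sketch, assuming max_pool_backward_fast, relu_backward, and conv_backward_fast counterparts exist:

def conv_relu_pool_backward(dout, cache):
  """
  Backward pass for the conv-relu-pool convenience layer.
  """
  conv_cache, relu_cache, pool_cache = cache
  ds = max_pool_backward_fast(dout, pool_cache)     # undo the max pooling
  da = relu_backward(ds, relu_cache)                # back through the ReLU
  dx, dw, db = conv_backward_fast(da, conv_cache)   # back through the convolution
  return dx, dw, db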
Example No. 16
    def loss(self, X, y=None):
        """
        Compute loss and gradient for a minibatch of data.
        Inputs:
        - X: Array of input data of shape (N, d_in)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].
        Returns:
        If y is None, then run a test-time forward pass of the model and return:
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.
        If y is not None, then run a training-time forward and backward pass and
        return a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with the same keys as self.params, mapping parameter
          names to gradients of the loss with respect to those parameters.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W3, b3 = self.params['W3'], self.params['b3']
        N, d_in = X.shape

        scores = None
        f, cache1 = layers.fc_forward(X, W1, b1)  #fc
        h, cache2 = layers.relu_forward(f)  #relu
        scores, cache3 = layers.fc_forward(h, W3, b3)  #fc

        # If y is None then we are in test mode so just return scores
        if y is None:
            return scores

        loss, grads = 0, {}
        loss, dscores = layers.softmax_loss(scores, y)
        dx2, dW3, db3 = layers.fc_backward(dscores, cache3)
        dx1 = layers.relu_backward(dx2, cache2)
        dx, dW1, db1 = layers.fc_backward(dx1, cache1)

        grads = {'W1': dW1, 'b1': db1, 'W3': dW3, 'b3': db3}

        return loss, grads
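A hedged gradient check of the implementation above: eval_numerical_gradient is assumed to be the scalar-loss companion of the eval_numerical_gradient_array helper used in the ReLU tests, and the construction of model itself is not shown.

np.random.seed(0)
X_check = np.random.randn(5, 4)                 # 5 samples, 4 input features (illustrative sizes)
y_check = np.random.randint(0, 3, size=5)       # 3 classes

loss, grads = model.loss(X_check, y_check)
for name in sorted(grads):
    f = lambda _: model.loss(X_check, y_check)[0]
    grad_num = eval_numerical_gradient(f, model.params[name], verbose=False)
    print('%s relative error: %.2e' % (name, rel_error(grad_num, grads[name])))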
Example No. 17
    def loss(self, X, y=None):
        """
        Compute loss and gradient for a minibatch of data.
        Args:
        - X: Input data, numpy array of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].
        Returns:
        If y is None, then run a test-time forward pass of the model and
        return:
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.
        If y is not None, then run a training-time forward and backward pass
        and return a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with the same keys as self.params, mapping
          parameter names to gradients of the loss with respect to those
          parameters.
        """
        scores = None
        X = X.astype(self.dtype)
        linear_cache = dict()
        relu_cache = dict()
        dropout_cache = dict()
        """
        TODO: Implement the forward pass for the fully-connected neural
        network, compute the scores and store them in the scores variable.
        """
        #######################################################################
        #                           BEGIN OF YOUR CODE                        #
        #######################################################################

        VAL = X.copy()

        for i in range(1, self.num_layers):
            linear_cache['L{}'.format(i)] = linear_forward(
                VAL, self.params['W{}'.format(i)],
                self.params['b{}'.format(i)])
            relu_cache['R{}'.format(i)] = relu_forward(
                linear_cache['L{}'.format(i)])
            if self.use_dropout:
                dropout_cache['D{}'.format(i)], dropout_cache['MASK{}'.format(i)] = dropout_forward(relu_cache['R{}'.format(i)],\
                                                                 self.dropout_params['p'], self.dropout_params['train'],\
                                                                 self.dropout_params['seed'])
                VAL = dropout_cache['D{}'.format(i)]
            else:
                VAL = relu_cache['R{}'.format(i)]


        linear_cache['L{}'.format(self.num_layers)] = linear_forward(VAL, self.params['W{}'.format(self.num_layers)],\
                                                           self.params['b{}'.format(self.num_layers)])

        scores = linear_cache['L{}'.format(self.num_layers)]

        #######################################################################
        #                            END OF YOUR CODE                         #
        #######################################################################
        # If y is None then we are in test mode so just return scores
        if y is None:
            return scores
        loss, grads = 0, dict()
        """
        TODO: Implement the backward pass for the fully-connected net. Store
        the loss in the loss variable and all gradients in the grads
        dictionary. Compute the loss with softmax. grads[k] has the gradients
        for self.params[k]. Add L2 regularisation to the loss function.
        NOTE: To ensure that your implementation matches ours and you pass the
        automated tests, make sure that your L2 regularization includes a
        factor of 0.5 to simplify the expression for the gradient.
        """
        #######################################################################
        #                           BEGIN OF YOUR CODE                        #
        #######################################################################

        loss, grad = softmax(scores, y)

        if self.use_dropout:
            VAR = dropout_cache['D{}'.format(self.num_layers - 1)]
        else:
            VAR = relu_cache['R{}'.format(self.num_layers - 1)]

        dX, grads['W{}'.format(self.num_layers)], grads['b{}'.format(self.num_layers)] = linear_backward(grad, \
            VAR, self.params['W{}'.format(self.num_layers)],self.params['b{}'.format(self.num_layers)])

        grads['W{}'.format(
            self.num_layers)] += self.reg * self.params['W{}'.format(
                self.num_layers)]

        loss += 0.5 * self.reg * np.sum(self.params['W' + str(self.num_layers)]
                                        **2)

        for inx in range(self.num_layers - 1, 0, -1):
            if self.use_dropout:
                dX = dropout_backward(dX, dropout_cache['MASK{}'.format(inx)],
                                      self.dropout_params['p'])

            dX = relu_backward(dX, linear_cache['L' + str(inx)])

            if inx - 1 != 0:
                if self.use_dropout:
                    pre_layer = dropout_cache['D{}'.format(inx - 1)]
                else:
                    pre_layer = relu_cache['R{}'.format(inx - 1)]
                dX, grads['W' +
                          str(inx)], grads['b' + str(inx)] = linear_backward(
                              dX, pre_layer, self.params['W{}'.format(inx)],
                              self.params['b{}'.format(inx)])

                grads['W' + str(inx)] += self.reg * self.params['W' + str(inx)]
                loss += 0.5 * self.reg * np.sum(self.params['W' + str(inx)]**2)

            else:

                dX, grads['W' +
                          str(inx)], grads['b' + str(inx)] = linear_backward(
                              dX, X, self.params['W{}'.format(inx)],
                              self.params['b{}'.format(inx)])
                grads['W' + str(inx)] += self.reg * self.params['W' + str(inx)]
                loss += 0.5 * self.reg * np.sum(self.params['W' + str(inx)]**2)

        #######################################################################
        #                            END OF YOUR CODE                         #
        #######################################################################
        return loss, grads
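The fully connected implementations above call linear_forward, relu_forward, linear_backward, and relu_backward as if they return raw arrays rather than (value, cache) tuples, with the pre-activations and layer inputs passed back explicitly. A minimal sketch of that assumed API, inferred from how the calls are used rather than taken from the source:

def linear_forward(X, W, b):
    # Affine transform; rows of X are samples.
    return X.dot(W) + b

def linear_backward(dout, X, W, b):
    # Gradients of a linear layer with respect to its input, weights and bias.
    dX = dout.dot(W.T)
    dW = X.T.dot(dout)
    db = dout.sum(axis=0)
    return dX, dW, db

def relu_forward(x):
    return np.maximum(0, x)

def relu_backward(dout, x):
    # x is the pre-activation that was fed to relu_forward.
    return dout * (x > 0)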
Example No. 18
def affine_relu_forward(x, w, b):
    a, fc_cache = affine_forward(x, w, b)
    out, relu_cache = relu_forward(a)
    cache = (fc_cache, relu_cache)
    return out, cache
Example No. 19

# Problem 3.1.3
###################################################################################
#   ReLU layer: forward                                                           #
###################################################################################
#   In the file layers.py implement the forward pass for the ReLU activation in   #
#   the relu_forward function.                                                    #
#   Once you are done you can test your implementation using the following.       #
###################################################################################
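# For reference, a sketch of the relu_forward that the banner asks for in layers.py
# (illustrative of the expected behaviour, not the graded solution):

def relu_forward(x):
  """
  Forward pass for a ReLU: out = max(0, x), elementwise.
  Returns the output and a cache (the input) for the backward pass.
  """
  out = np.maximum(0, x)
  cache = x
  return out, cache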

# Test the relu_forward function

x = np.linspace(-0.5, 0.5, num=12).reshape(3, 4)

out, _ = layers.relu_forward(x)
correct_out = np.array([[ 0.,          0.,          0.,          0.,        ],
                        [ 0.,          0.,          0.04545455,  0.13636364,],
                        [ 0.22727273,  0.31818182,  0.40909091,  0.5,       ]])

# Compare your output with ours. The error should be around 1e-8

if out is not None:
  print('Testing relu_forward function:')
  print('difference (should be around 1e-8): ', rel_error(out, correct_out))

# Problem 3.1.4
###################################################################################
#   ReLU layer: backward                                                          #
###################################################################################
#   In the file layers.py implement the backward pass for the ReLU activation in  #
Example No. 20

# Problem 3.1.3
###################################################################################
#   ReLU layer: forward                                                           #
###################################################################################
#   In the file layers.py implement the forward pass for the ReLU activation in   #
#   the relu_forward function.                                                    #
#   Once you are done you can test your implementation using the following.       #
###################################################################################

# Test the relu_forward function

x = np.linspace(-0.5, 0.5, num=12).reshape(3, 4)

out, _ = layers.relu_forward(x)
correct_out = np.array([[0.,          0.,          0.,          0., ],
                        [0.,          0.,          0.04545455,  0.13636364, ],
                        [0.22727273,  0.31818182,  0.40909091,  0.5, ]])

# Compare your output with ours. The error should be around 1e-8

if out is not None:
    print('Testing relu_forward function:')
    print('difference (should be around 1e-8): ', rel_error(out, correct_out))

# Problem 3.1.4
###################################################################################
#   ReLU layer: backward                                                          #
###################################################################################
#   In the file layers.py implement the backward pass for the ReLU activation in  #
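# Problem 3.1.4 asks for the matching backward pass; a sketch of a relu_backward
# consistent with the cache returned by relu_forward (illustrative, not the graded solution):

def relu_backward(dout, cache):
    """
    Backward pass for a ReLU: pass the upstream gradient through only where
    the cached input was positive.
    """
    x = cache
    dx = dout * (x > 0)
    return dx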
Example No. 21
    def loss(self, X, y=None):
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'

        if self.use_dropout :
            self.dropout_param['mode'] = mode
        if self.use_batchnorm:
            for bn_param in self.bn_params:
                bn_param['mode'] = mode

        scores = None


        inputi = X
        batch_size = X.shape[0]
        X = np.reshape(X,[batch_size,-1])

        fc_cache_list = []
        relu_cache_list = []
        bn_cache_list = []
        dropout_cache_list = []


        for i in range(self.num_layers-1):
            fc_act,fc_cache= affine_forward(X,self.params['W'+str(i+1)],self.params['b'+str(i+1)])
            fc_cache_list.append(fc_cache)
            if self.use_batchnorm:
                bn_act,bn_cache = batchnorm_forward(fc_act,self.params['gamma'+str(i+1)],self.params['beta'+str(i+1)],self.bn_params[i])
                bn_cache_list.append(bn_cache)
                relu_act,relu_cache = relu_forward(bn_act)
                relu_cache_list.append(relu_cache)
            else:
                relu_act,relu_cache = relu_forward(fc_act)
                relu_cache_list.append(relu_cache)
            if self.use_dropout:
                relu_act,dropout_cache = dropout_forward(relu_act,self.dropout_param)
                dropout_cache_list.append(dropout_cache)

            X = relu_act.copy()
        ######## Last layer
        scores,final_cache = affine_forward(X,self.params['W'+str(self.num_layers)],self.params['b'+str(self.num_layers)])
        #
        # for layer in range(self.num_layers):
        #     Wi,bi = self.params['W%d'%(layer+1)],self.params['b%d'%(layer+1)]
        #     outi,fc_cachei = affine_forward(inputi,Wi,bi)
        #     fc_cache_list.append(fc_cachei)
        #
        #     if self.use_batchnorm and layer!=self.num_layers-1:
        #         gammai,betai = self.params['gamma%d'%(layer+1)],self.params['beta%d'%(layer+1)]
        #
        #         outi,bn_cachei = batchnorm_forward(outi,gammai,betai,self.bn_params[layer])
        #         bn_cache_list.append(bn_cachei)
        #     outi,relu_cachei = relu_forward(outi)
        #     relu_cache_list.append(relu_cachei)
        #
        #     if self.use_dropout:
        #         outi,dropout_cachei = dropout_forward(outi,self.dropout_param)
        #         dropout_cache_list.append(dropout_cachei)
        #
        #     inputi = outi
        #
        # scores = outi

        if mode == 'test':
            return scores

        loss,grads = 0.0,{}

        loss,dsoft = softmax_loss(scores,y)
        loss += 0.5*self.reg*(np.sum(np.square(self.params['W'+str(self.num_layers)])))
        ######### Backward pass for the last layer
        dx_last,dw_last,db_last = affine_backward(dsoft,final_cache)
        grads['W'+str(self.num_layers)] = dw_last+self.reg*self.params['W'+str(self.num_layers)]
        grads['b'+str(self.num_layers)] = db_last

        for i in range(self.num_layers-1,0,-1):

            if self.use_dropout:
                dx_last = dropout_backward(dx_last,dropout_cache_list[i-1])

            drelu = relu_backward(dx_last,relu_cache_list[i-1])
            if self.use_batchnorm:
                dbatchnorm,dgamma,dbeta = batchnorm_backward(drelu,bn_cache_list[i-1])
                dx_last,dw_last,db_last = affine_backward(dbatchnorm,fc_cache_list[i-1])
                grads['beta'+str(i)] = dbeta
                grads['gamma'+str(i)] = dgamma
            else:
                dx_last,dw_last,db_last = affine_backward(drelu,fc_cache_list[i-1])

            grads['W'+str(i)] = dw_last+self.reg*self.params['W'+str(i)]
            grads['b'+str(i)] = db_last

            loss += 0.5*self.reg*(np.sum(np.square(self.params['W'+str(i)])))

        return loss,grads
Example No. 22
# The error should be around 1e-10
print('Testing affine_backward function:')
print('dx error: ', rel_error(dx_num, dx))
print('dw error: ', rel_error(dw_num, dw))
print('db error: ', rel_error(db_num, db))
#######################################################################################

#######################################################################################
# Test the relu_forward function
#######################################################################################
from layers import relu_forward

x = np.linspace(-0.5, 0.5, num=12).reshape(3, 4)

out, _ = relu_forward(x)
correct_out = np.array([[0.,         0.,         0.,         0.        ],
                        [0.,         0.,         0.04545455, 0.13636364],
                        [0.22727273, 0.31818182, 0.40909091, 0.5       ]])
Example No. 23
    def loss(self, X, y=None):
        """
        Compute loss and gradient for a minibatch of data.
        Args:
        - X: Input data, numpy array of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].
        Returns:
        If y is None, then run a test-time forward pass of the model and
        return:
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.
        If y is not None, then run a training-time forward and backward pass
        and return a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with the same keys as self.params, mapping
          parameter names to gradients of the loss with respect to those
          parameters.
        """
        scores = None
        X = X.astype(self.dtype)
        linear_cache = dict()
        relu_cache = dict()
        dropout_cache = dict()
        """
        TODO: Implement the forward pass for the fully-connected neural
        network, compute the scores and store them in the scores variable.
        """
        #######################################################################
        #                           BEGIN OF YOUR CODE                        #
        #######################################################################

        # input -> first hidden layer
        linear_cache[1] = linear_forward(X, self.params["W1"],
                                         self.params["b1"])
        input_next = relu_forward(linear_cache[1])
        relu_cache[1] = input_next.copy()

        # hidden layer i -> hidden layer i+1
        for l in range(2, self.num_layers):
            if self.use_dropout:
                input_next, dropout_cache[l - 1] = self.apply_forward_dropout(
                    input_next)

            linear_cache[l] = linear_forward(input_next,
                                             self.params["W%d" % l],
                                             self.params["b%d" % l])
            input_next = relu_forward(linear_cache[l])
            relu_cache[l] = input_next.copy()

        # last hidden layer -> output layer
        if self.use_dropout:
            input_next, dropout_cache[
                self.num_layers - 1] = self.apply_forward_dropout(input_next)

        linear_cache[self.num_layers] = linear_forward(
            input_next, self.params["W%d" % self.num_layers],
            self.params["b%d" % self.num_layers])
        scores = linear_cache[self.num_layers].copy()
        # scores = scores / np.abs(scores).sum(axis=1)[:, None]
        #print(scores)
        # print(scores.shape)
        #######################################################################
        #                            END OF YOUR CODE                         #
        #######################################################################
        # If y is None then we are in test mode so just return scores
        if y is None:
            return scores
        loss, grads = 0, dict()
        """
        TODO: Implement the backward pass for the fully-connected net. Store
        the loss in the loss variable and all gradients in the grads
        dictionary. Compute the loss with softmax. grads[k] has the gradients
        for self.params[k]. Add L2 regularisation to the loss function.
        NOTE: To ensure that your implementation matches ours and you pass the
        automated tests, make sure that your L2 regularization includes a
        factor of 0.5 to simplify the expression for the gradient.
        """
        #######################################################################
        #                           BEGIN OF YOUR CODE                        #
        #######################################################################
        loss, dlogits = softmax(scores, y)
        # add L2 regularization
        loss += 0.5 * self.reg * np.sum([
            np.sum(self.params["W%d" % l]**2)
            for l in range(1, self.num_layers + 1)
        ])

        # last hidden layer <- output layer
        dX_, dW_, db_ = linear_backward(dlogits,
                                        relu_cache[self.num_layers - 1],
                                        self.params["W%d" % self.num_layers],
                                        self.params["b%d" % self.num_layers])
        # add regularization effect to W
        grads["W%d" %
              self.num_layers] = dW_ + self.reg * self.params["W%d" %
                                                              self.num_layers]
        grads["b%d" % self.num_layers] = db_.copy()

        # hidden layer i <- hidden layer i+1
        for l in reversed(range(2, self.num_layers)):
            if self.use_dropout:
                dX_ = self.apply_backward_dropout(dX_, dropout_cache[l])
            dX_ = relu_backward(dX_, linear_cache[l])
            dX_, dW_, db_ = linear_backward(dX_, relu_cache[l - 1],
                                            self.params["W%d" % l],
                                            self.params["b%d" % l])
            # add regularization effect to W
            grads["W%d" % l] = dW_ + self.reg * self.params["W%d" % l]
            grads["b%d" % l] = db_
        # input layer <- first hidden layer
        if self.use_dropout:
            dX_ = self.apply_backward_dropout(dX_, dropout_cache[1])
        dX_ = relu_backward(dX_, linear_cache[1])
        dX_, dW_, db_ = linear_backward(dX_, X, self.params["W1"],
                                        self.params["b1"])
        # add regularization effect to W
        grads["W1"] = dW_ + self.reg * self.params["W1"]
        grads["b1"] = db_
        #######################################################################
        #                            END OF YOUR CODE                         #
        #######################################################################
        return loss, grads