    def __init__(self, dim_list, eta=0.1):
        """
        Constructor for network.
        Params:
        dim_list: a list of the number of dimension for each layer.
        eta: learning rate for each gradient descent step
        """
        depth = len(dim_list)
        self.depth = depth
        self.dim_list = dim_list
        self.eta = eta

        # 1. Initialize each layer with its output, partial_output, weight
        #    and bias; partial_output is unused for the input layer, and
        #    weight and bias are unused for the output layer.
        #
        # 2. partial_weight is a temporary used only inside _backward and
        #    is not stored in a layer.
        #
        self.layers = [{'output': Vector.fromIterable(0 for i in xrange(dim_list[l])),
            'partial_output': Vector.fromIterable(0 for i in xrange(dim_list[l])),
            'weight': Matrix.fromRandom(dim_list[l + 1], dim_list[l]),
            'bias': Vector.fromRandom(dim_list[l + 1])}
            for l in xrange(depth - 1)]

        # output layer
        self.layers.append({'output': Vector.fromList([0] * dim_list[depth - 1]),
            'partial_output': Vector.fromList([0] * dim_list[depth - 1]),
            'weight': None, 'bias': None})
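        # For example, dim_list = [2, 3, 1] yields three layer dicts:
        # layer 0 carries a 3x2 weight matrix and a 3-vector bias,
        # layer 1 carries a 1x3 weight matrix and a 1-vector bias, and
        # layer 2 (the output layer) carries no weight or bias.
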
    def _backward(self, x, y):
        """
        One backpropagation step toward target y: update every weight and
        bias by gradient descent and return the squared error. x is unused
        in this method; the forward pass is expected to have stored the
        input as layer 0's 'output'.
        """
        # output layer
        layer_id = self.depth - 1
        output = self.layers[layer_id]['output']
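        # Assuming the squared-error loss E = \frac{1}{2} \sum_i (O_i - y_i)^2
        # (the convention implied by the gradient stored below),
        # \frac {\partial E} {\partial O_i^{(L)}} = O_i^{(L)} - y_i.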
        self.layers[layer_id]['partial_output'].assign(Vector.fromIterable(
            output[i] - y[i] for i in xrange(self.dim_list[layer_id])
            ))

        # Report the unnormalized squared error \sum_i (O_i - y_i)^2; under
        # the E = \frac{1}{2} \sum_i (O_i - y_i)^2 convention this equals 2E.
        loss = sum((output[i] - y[i]) ** 2 for i in xrange(self.dim_list[layer_id]))

        # hidden layers and the input layer, iterating backward
        for layer_id in xrange(self.depth - 2, -1, -1):
            weight = self.layers[layer_id]['weight']
            bias = self.layers[layer_id]['bias']
            partial_output = self.layers[layer_id]['partial_output']
            output = self.layers[layer_id]['output']
            # "last" refers to layer_id + 1: the layer handled in the
            # previous iteration of this backward sweep.
            last_output = self.layers[layer_id + 1]['output']
            last_partial = self.layers[layer_id + 1]['partial_output']

            """
            Partial output for every layer except the output one is:
            \frac {\partial E} {\partial O_k^{(l)}} =
                \sum_i (\frac {\partial E} {\partial O_i^{ (l+1) }}
                    * O_i^{ (l+1) } * (1 - O_i^{ (l+1) }) * w_{ik}^{ (l) } )
            
            But the partial output of the first layer is unnecessary,
            thus we don't compute it.
            """
            if layer_id > 0:
                self.layers[layer_id]['partial_output'].assign(Vector.fromIterable(
                    sum(last_partial[i] * last_output[i] * (1 - last_output[i])
                        * weight.item(i, k)
                        for i in xrange(self.dim_list[layer_id + 1]))
                    / (self.dim_list[layer_id] + 1.0)
                    for k in xrange(self.dim_list[layer_id])))

            """
            Partial weight for every layer except the output one:
            \frac {\partial E} {\partial w_{ji}^{(l)}} = 
                \frac {\partial E} {\partial O_j^{(l + 1)}}
                    * O_j^{(l + 1)} * (1 - O_j^{(l+1)}) * O_i^{(l)}
            """
            partial_weight = Matrix.fromIterable(weight.row_num, weight.col_num, (
                    last_partial[row_id]
                    * last_output[row_id] * (1 - last_output[row_id])
                    * output[col_id]
                    / (self.dim_list[layer_id] + 1.0)
                    for row_id in xrange(self.dim_list[layer_id + 1])
                    for col_id in xrange(self.dim_list[layer_id])
                    ))
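            # Equivalently (a restatement, not new behavior): with
            # \delta_j^{(l+1)} = \frac {\partial E} {\partial O_j^{(l+1)}}
            #     * O_j^{(l+1)} * (1 - O_j^{(l+1)}),
            # partial_weight is the outer product \delta^{(l+1)} (O^{(l)})^T,
            # scaled by 1 / (dim_list[layer_id] + 1.0).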

            self.layers[layer_id]['weight'] -= self.eta * partial_weight

            """
            Partial bias is almost exact as the partial weight,
            but for every item in the bias vector the last item is 1
            \frac {\partial E}{\partial b_j^{(l)}} =
                \frac {\partial E}{\partial O_j^{(l + 1)}}
                    O_j^{(l + 1)} (1 - O_j^{(l+1)}) * 1
            """
            partial_bias = Vector.fromIterable(
                    last_partial[row_id]
                    * last_output[row_id] * (1 - last_output[row_id]) * 1
                    / (self.dim_list[layer_id] + 1.0)
                    for row_id in xrange(self.dim_list[layer_id + 1])
                    )
            self.layers[layer_id]['bias'] -= self.eta * partial_bias

        return loss
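
    # A minimal usage sketch (hypothetical: the class name "Network" and a
    # forward-pass method "_forward" are assumed here; _backward expects the
    # forward pass to have filled in each layer's 'output'):
    #
    #     net = Network([2, 3, 1], eta=0.5)
    #     for x, y in training_data:
    #         net._forward(x)
    #         loss = net._backward(x, y)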