示例#1
0
    def execute_backward_pass(self, d_next_layer):
        """
        Backward pass: computes the gradient of the cost with respect to this
        layer's input and updates the filter weights by gradient descent.

        Arguments:
            - d_next_layer: gradient of the cost with respect to the output
              of the next layer (assumed to match ``self.output``'s shape —
              TODO confirm against the forward pass).

        Returns:
            - the gradient of the cost with respect to the input of this layer.
        """
        # Chain rule through the ReLU: combine the incoming gradient with the
        # local derivative. BUGFIX: the original OVERWROTE d_next_layer here,
        # discarding the upstream gradient entirely and leaving the parameter
        # unused.
        d_next_layer = d_next_layer * relu_derivative(self.output)

        doutput = np.zeros(self.inputs.shape)
        dfilters = np.zeros(self.filters.shape)

        filter_index = 0
        for input_index in range(self.inputs.shape[0]):
            for y in range(self.filter_size[1]):
                for x in range(self.filter_size[0]):
                    # Gradient w.r.t. this layer's input: scatter the filter,
                    # weighted by the upstream gradient at (y, x).
                    doutput[input_index, y:y + self.filter_size[1],
                            x:x + self.filter_size[0]] += self.filters[
                                filter_index] * d_next_layer[filter_index, y,
                                                             x]
                    # Gradient w.r.t. the filter weights.
                    # NOTE(review): the patch is taken from self.output here;
                    # the standard convolution weight gradient uses the layer
                    # INPUT — confirm against the forward pass before changing.
                    dfilters[filter_index, :, :] += self.output[
                        input_index, y:y + self.filter_size[1], x:x +
                        self.filter_size[0]] * d_next_layer[filter_index, y, x]
            # Cycle through the filters, one per input row (same wrap-around
            # behaviour as the original if/else).
            filter_index = (filter_index + 1) % self.n_filters

        # BUGFIX: gradient DESCENT steps AGAINST the gradient of the cost;
        # the original added it, which ascends the cost.
        self.filters -= self.learning_rate * dfilters
        return doutput
示例#2
0
 def backpropagate(self, x, y):
     """Return ``(nabla_b, nabla_w)``: per-layer gradients of the cost for a
     single training example ``(x, y)``, for a network with ReLU hidden
     layers and a softmax output trained with cross-entropy loss."""
     nabla_b = [np.zeros(b.shape) for b in self.biases]
     nabla_w = [np.zeros(w.shape) for w in self.weights]

     # Forward pass: cache every pre-activation z and activation a.
     activation = x
     activations = [x]
     zs = []
     for b, w in zip(self.biases[:-1], self.weights[:-1]):
         z = np.dot(w, activation) + b
         activation = utils.relu(z)
         activations.append(activation)
         zs.append(z)
     # Final layer uses softmax instead of ReLU.
     z = np.dot(self.weights[-1], activations[-1]) + self.biases[-1]
     activations.append(utils.softmax(z))
     zs.append(z)

     # Backward pass. For a softmax output with cross-entropy loss, the
     # output-layer error is simply (prediction - target).
     delta = activations[-1] - y
     nabla_b[-1] = delta
     nabla_w[-1] = np.dot(delta, activations[-2].transpose())

     # Hidden layers L-1, L-2, ...:
     #   delta(l) = (w(l+1)^T . delta(l+1)) * relu'(z(l))
     for l in range(2, self.num_layers):
         back_propagated = np.dot(self.weights[-l + 1].transpose(), delta)
         delta = back_propagated * utils.relu_derivative(zs[-l])
         nabla_b[-l] = delta
         nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())
     return (nabla_b, nabla_w)
示例#3
0
    def back_step(self, xc, grad, state, h_prev, cell_prev):
        """Backward pass through one LSTM timestep.

        Accumulates the parameter gradients (the ``*_diff`` fields on
        ``self.params``) from the incoming gradient ``grad`` and the cached
        forward activations in ``state``.

        Arguments:
            - xc: concatenated input vector for this step (presumably the
              [input; hidden] concatenation — confirm against the forward pass).
            - grad: gradient of the loss w.r.t. this step's softmax output.
            - state: per-step cache (h, cell_values, gate activations, ...).
            - h_prev: previous hidden state (not read in this method).
            - cell_prev: previous cell state; used for the forget-gate gradient.

        NOTE(review): ``dxc`` is computed at the end but never returned or
        stored, so the gradient w.r.t. the step input is discarded — verify
        whether callers expect ``back_step`` to return it.
        """
        # softmax layer: gradients of the output projection (note these are
        # assigned, not accumulated, unlike every other *_diff below)
        self.params.wk_diff = np.dot(state.h, grad.T)
        # self.params.wk_diff = np.dot(grad,self.params.wr)
        self.params.bk_diff = grad

        # Relu
        # NOTE(review): applies relu_derivative to wk @ grad rather than to a
        # cached pre-activation — confirm this matches the forward pass.
        dr_before = util.relu_derivative(np.dot(self.params.wk, grad))
        self.params.wr_diff += np.dot(
            dr_before.T,
            np.dot(state.h, self.params.wr) + self.params.br)
        self.params.br_diff += dr_before

        # h next state: gradient flowing into the hidden state
        state.diff_h_values = np.dot(dr_before, self.params.wr)
        # NOTE(review): adding the activation state.h into a gradient is
        # unusual — confirm this is intentional.
        state.diff_h_values += state.h

        # output gate: do = dh * tanh(c); db_o = do * o * (1 - o)
        do = np.multiply(state.diff_h_values,
                         util.tanh_normal(state.cell_values))
        db_o = np.multiply(
            do, np.multiply(state.output_values, (1 - state.output_values)))

        self.params.wo_diff += np.dot(db_o, xc.T)
        self.params.bo_diff += db_o

        # cell state: dc = dh * o * (1 - tanh(c)^2)
        dc = np.multiply(
            state.diff_h_values,
            np.multiply(state.output_values,
                        (1 - util.tanh_normal(state.cell_values)**2)))
        # NOTE(review): adding cell_values into dc mirrors the state.h case
        # above — confirm intent.
        dc += state.cell_values
        # candidate ("g") gate: through tanh, gated by the input gate
        dc_temp = np.multiply(dc, state.input_values)
        db_c = np.multiply(dc_temp, (1 - state.cell_temp_values**2))

        self.params.wg_diff += np.dot(db_c, xc.T)
        self.params.bg_diff += db_c

        # input gate: di = dc * g; db_i = di * i * (1 - i)
        di = np.multiply(dc, state.cell_temp_values)
        db_i = np.multiply(
            di, np.multiply(state.input_values, (1 - state.input_values)))
        self.params.wi_diff += np.dot(db_i, xc.T)
        self.params.bi_diff += db_i

        # forget gate: df = dc * c_prev; db_f = df * f * (1 - f)
        df = np.multiply(dc, cell_prev)
        db_f = np.multiply(
            df, np.multiply(state.forget_values, (1 - state.forget_values)))
        self.params.wf_diff += np.dot(db_f, xc.T)
        self.params.bf_diff += db_f

        # gradient w.r.t. the concatenated input xc (currently unused — see
        # the docstring note above)
        dxc = (np.dot(self.params.wf.T, db_f) +
               np.dot(self.params.wi.T, db_i) +
               np.dot(self.params.wg.T, db_c) + np.dot(self.params.wo.T, db_o))
示例#4
0
def BackPropagationLearner(dataset,
                           net,
                           learning_rate,
                           epochs,
                           activation=sigmoid):
    """[Figure 18.23] The back-propagation algorithm for multilayer networks.

    Arguments:
        dataset: DataSet providing .examples plus .target / .inputs indices.
        net: list of layers, each a list of nodes (.weights/.inputs/.value).
        learning_rate: gradient-descent step size.
        epochs: number of passes over the training examples.
        activation: the activation function whose derivative drives the deltas.

    Returns the trained net (modified in place).
    """
    # Initialise weights
    for layer in net:
        for node in layer:
            node.weights = random_weights(min_value=-0.5,
                                          max_value=0.5,
                                          num_weights=len(node.weights))

    examples = dataset.examples
    '''
    As of now dataset.target gives an int instead of list,
    Changing dataset class will have effect on all the learners.
    Will be taken care of later.
    '''
    o_nodes = net[-1]
    i_nodes = net[0]
    o_units = len(o_nodes)
    idx_t = dataset.target
    idx_i = dataset.inputs
    n_layers = len(net)

    inputs, targets = init_examples(examples, idx_i, idx_t, o_units)

    def _derivative(value):
        # Derivative of the configured activation, replacing the five
        # duplicated if/elif chains (unknown activations fall back to
        # leaky ReLU, preserving the original else branch).
        if activation == sigmoid:
            return sigmoid_derivative(value)
        if activation == relu:
            return relu_derivative(value)
        if activation == tanh:
            return tanh_derivative(value)
        if activation == elu:
            return elu_derivative(value)
        return leaky_relu_derivative(value)

    for epoch in range(epochs):
        # Iterate over each example
        for e in range(len(examples)):
            i_val = inputs[e]
            t_val = targets[e]

            # Activate input layer
            for v, n in zip(i_val, i_nodes):
                n.value = v

            # Forward pass
            for layer in net[1:]:
                for node in layer:
                    inc = [n.value for n in node.inputs]
                    in_val = dotproduct(inc, node.weights)
                    node.value = node.activation(in_val)

            # delta[i] holds the error gradients for layer i's nodes
            delta = [[] for _ in range(n_layers)]

            # Error for the MSE cost function
            err = [t_val[i] - o_nodes[i].value for i in range(o_units)]

            # Output-layer delta.
            # BUGFIX: branch on the `activation` parameter (as the hidden
            # layers below already did) instead of the stale forward-pass
            # loop variable `node`, which only reflected the LAST node
            # visited during the forward pass.
            delta[-1] = [
                _derivative(o_nodes[i].value) * err[i] for i in range(o_units)
            ]

            # Backward pass through the hidden layers
            h_layers = n_layers - 2
            for i in range(h_layers, 0, -1):
                layer = net[i]
                h_units = len(layer)
                nx_layer = net[i + 1]

                # weights from each ith layer node to each i + 1th layer node
                w = [[node.weights[k] for node in nx_layer]
                     for k in range(h_units)]

                delta[i] = [
                    _derivative(layer[j].value) *
                    dotproduct(w[j], delta[i + 1]) for j in range(h_units)
                ]

            #  Update weights
            for i in range(1, n_layers):
                layer = net[i]
                inc = [node.value for node in net[i - 1]]
                units = len(layer)
                for j in range(units):
                    layer[j].weights = vector_add(
                        layer[j].weights,
                        scalar_vector_product(learning_rate * delta[i][j],
                                              inc))

    return net
示例#5
0
def BackPropagationLearner(dataset, net, learning_rate, epochs, activation=sigmoid, momentum=False, beta=0.903):
    """[Figure 18.23] The back-propagation algorithm for multilayer networks.

    Arguments:
        dataset: DataSet providing .examples plus .target / .inputs indices.
        net: list of layers, each a list of nodes (.weights/.inputs/.value).
        learning_rate: gradient-descent step size.
        epochs: number of passes over the training examples.
        activation: the activation function whose derivative drives the deltas.
        momentum: when True, smooth the deltas with a bias-corrected
            exponentially weighted moving average before updating weights.
        beta: decay factor of the moving average.

    Returns the trained net (modified in place).
    """
    # Initialise weights
    for layer in net:
        for node in layer:
            node.weights = random_weights(min_value=-0.5, max_value=0.5,
                                          num_weights=len(node.weights))

    examples = dataset.examples
    '''
    As of now dataset.target gives an int instead of list,
    Changing dataset class will have effect on all the learners.
    Will be taken care of later.
    '''
    o_nodes = net[-1]
    i_nodes = net[0]
    o_units = len(o_nodes)
    idx_t = dataset.target
    idx_i = dataset.inputs
    n_layers = len(net)

    inputs, targets = init_examples(examples, idx_i, idx_t, o_units)

    def _derivative(value):
        # Derivative of the configured activation, replacing the duplicated
        # if/elif chains (unknown activations fall back to leaky ReLU,
        # preserving the original else branch).
        if activation == sigmoid:
            return sigmoid_derivative(value)
        if activation == relu:
            return relu_derivative(value)
        if activation == tanh:
            return tanh_derivative(value)
        if activation == elu:
            return elu_derivative(value)
        return leaky_relu_derivative(value)

    # Velocity (momentum) accumulator: one slot per node per layer.
    # BUGFIX: create it ONCE before training. The original re-zeroed v_dw
    # inside the per-example loop, so the beta * v_dw term was always zero
    # and momentum never actually accumulated anything.
    if momentum:
        v_dw = [[0 for _node in layer] for layer in net]

    for epoch in range(epochs):
        # Iterate over each example
        for e in range(len(examples)):
            i_val = inputs[e]
            t_val = targets[e]

            # Activate input layer
            for v, n in zip(i_val, i_nodes):
                n.value = v

            # Forward propagation to compute every node's value
            for layer in net[1:]:
                for node in layer:
                    inc = [n.value for n in node.inputs]
                    in_val = dotproduct(inc, node.weights)
                    node.value = node.activation(in_val)

            # delta[i] holds the error gradients for layer i's nodes
            delta = [[] for _ in range(n_layers)]

            # Error for the MSE cost function
            err = [t_val[i] - o_nodes[i].value for i in range(o_units)]

            # Output-layer delta.
            # BUGFIX: branch on the `activation` parameter (as the hidden
            # layers do) instead of the stale forward-pass loop variable
            # `node`, which only reflected the LAST node visited.
            delta[-1] = [_derivative(o_nodes[i].value) * err[i]
                         for i in range(o_units)]

            # Propagate backward and compute gradients for each hidden layer
            h_layers = n_layers - 2
            for i in range(h_layers, 0, -1):
                layer = net[i]
                h_units = len(layer)
                nx_layer = net[i + 1]

                # weights from each ith layer node to each i + 1th layer node
                w = [[node.weights[k] for node in nx_layer] for k in range(h_units)]

                delta[i] = [_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1])
                            for j in range(h_units)]

            if momentum:
                # Bias-corrected exponentially weighted average of the
                # deltas. With v_dw starting at zero, the original
                # epoch == 0 special case reduces to this same formula.
                # NOTE(review): the correction divides by 1 - beta**(t_ + 1);
                # the conventional form uses beta**t_ — confirm intent.
                t_ = epoch + 1
                for i in range(len(delta)):
                    for j in range(len(delta[i])):
                        v_dw[i][j] = (beta * v_dw[i][j] + (1 - beta) * delta[i][j]) / (1 - beta ** (t_ + 1))

            # Update weights: plain gradient step, or the velocity when
            # momentum is enabled
            for i in range(1, n_layers):
                layer = net[i]
                inc = [node.value for node in net[i - 1]]
                step = v_dw[i] if momentum else delta[i]
                for j in range(len(layer)):
                    layer[j].weights = vector_add(
                        layer[j].weights,
                        scalar_vector_product(learning_rate * step[j], inc))

    return net