def loss_and_gradients(x, y, params): """ params: a list of the form [W, b, U, b_tag] returns: loss,[gW, gb, gU, gb_tag] loss: scalar gW: matrix, gradients of W gb: vector, gradients of b gU: matrix, gradients of U gb_tag: vector, gradients of b_tag """ W, b, U, b_tag = params pred_vec = classifier_output(x, params) y_vec = np.zeros(len(pred_vec)) y_vec[y] = 1 gb_tag = pred_vec - y_vec sub = gb_tag.reshape(-1, 1) h = get_hidden(x, W, b) gU = (h * sub).T dh_dz1 = U.T * ut.tanh_derivative(np.dot(x, W) + b) gb = np.dot(sub.T, dh_dz1)[0] gW = np.dot(sub.T, dh_dz1).T.dot(x.reshape(-1, 1).T).T loss = 0 if pred_vec[y] > 0: loss = -np.log(pred_vec[y]) return loss, [gW, gb, gU, gb_tag]
def loss_and_gradients(x, y, params): """ params: a list as created by create_classifier(...) returns: loss,[gW1, gb1, gW2, gb2, ...] loss: scalar gW1: matrix, gradients of W1 gb1: vector, gradients of b1 gW2: matrix, gradients of W2 gb2: vector, gradients of b2 ... (of course, if we request a linear classifier (ie, params is of length 2), you should not have gW2 and gb2.) """ pred_vec, mem = classifier_output(x, params) h = mem y_vec = np.zeros(len(pred_vec)) y_vec[y] = 1 weights = [] bias = h[::-1] for i in range(len(params) - 2, -1, -2): weights.append(params[i]) gradients = [] num_weights = len(weights) index = 0 sub = (pred_vec - y_vec).reshape(-1, 1) for i in range(num_weights): if i != index: sub = sub.T.dot( (weights[index]).T * ut.tanh_derivative(bias[index])).T index += 1 gb = sub gW = np.dot(sub, bias[index].reshape(-1, 1).T) gW = gW.T gradients.append(gb) gradients.append(gW) gradients = gradients[::-1] loss = 0 if pred_vec[y] > 0: loss = -np.log(pred_vec[y]) return loss, gradients
def BackPropagationLearner(dataset, net, learning_rate, epochs, activation=sigmoid): """[Figure 18.23] The back-propagation algorithm for multilayer networks""" # Initialise weights for layer in net: for node in layer: node.weights = random_weights(min_value=-0.5, max_value=0.5, num_weights=len(node.weights)) examples = dataset.examples ''' As of now dataset.target gives an int instead of list, Changing dataset class will have effect on all the learners. Will be taken care of later. ''' o_nodes = net[-1] i_nodes = net[0] o_units = len(o_nodes) idx_t = dataset.target idx_i = dataset.inputs n_layers = len(net) inputs, targets = init_examples(examples, idx_i, idx_t, o_units) for epoch in range(epochs): # Iterate over each example for e in range(len(examples)): i_val = inputs[e] t_val = targets[e] # Activate input layer for v, n in zip(i_val, i_nodes): n.value = v # Forward pass for layer in net[1:]: for node in layer: inc = [n.value for n in node.inputs] in_val = dotproduct(inc, node.weights) node.value = node.activation(in_val) # Initialize delta delta = [[] for _ in range(n_layers)] # Compute outer layer delta # Error for the MSE cost function err = [t_val[i] - o_nodes[i].value for i in range(o_units)] # The activation function used is relu or sigmoid function if node.activation == sigmoid: delta[-1] = [ sigmoid_derivative(o_nodes[i].value) * err[i] for i in range(o_units) ] elif node.activation == relu: delta[-1] = [ relu_derivative(o_nodes[i].value) * err[i] for i in range(o_units) ] elif node.activation == tanh: delta[-1] = [ tanh_derivative(o_nodes[i].value) * err[i] for i in range(o_units) ] elif node.activation == elu: delta[-1] = [ elu_derivative(o_nodes[i].value) * err[i] for i in range(o_units) ] else: delta[-1] = [ leaky_relu_derivative(o_nodes[i].value) * err[i] for i in range(o_units) ] # Backward pass h_layers = n_layers - 2 for i in range(h_layers, 0, -1): layer = net[i] h_units = len(layer) nx_layer = net[i + 1] # weights from each ith layer node to each i + 1th layer node w = [[node.weights[k] for node in nx_layer] for k in range(h_units)] if activation == sigmoid: delta[i] = [ sigmoid_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1]) for j in range(h_units) ] elif activation == relu: delta[i] = [ relu_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1]) for j in range(h_units) ] elif activation == tanh: delta[i] = [ tanh_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1]) for j in range(h_units) ] elif activation == elu: delta[i] = [ elu_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1]) for j in range(h_units) ] else: delta[i] = [ leaky_relu_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1]) for j in range(h_units) ] # Update weights for i in range(1, n_layers): layer = net[i] inc = [node.value for node in net[i - 1]] units = len(layer) for j in range(units): layer[j].weights = vector_add( layer[j].weights, scalar_vector_product(learning_rate * delta[i][j], inc)) return net
def BackPropagationLearner(dataset, net, learning_rate, epochs, activation=sigmoid, momentum=False, beta=0.903): """[Figure 18.23] The back-propagation algorithm for multilayer networks""" # Initialise weights for layer in net: for node in layer: node.weights = random_weights(min_value=-0.5, max_value=0.5, num_weights=len(node.weights)) examples = dataset.examples ''' As of now dataset.target gives an int instead of list, Changing dataset class will have effect on all the learners. Will be taken care of later. ''' o_nodes = net[-1] i_nodes = net[0] o_units = len(o_nodes) idx_t = dataset.target idx_i = dataset.inputs n_layers = len(net) inputs, targets = init_examples(examples, idx_i, idx_t, o_units) for epoch in range(epochs): # Iterate over each example for e in range(len(examples)): i_val = inputs[e] t_val = targets[e] # Activate input layer for v, n in zip(i_val, i_nodes): n.value = v # Finding the values of the nodes through forward propogation for layer in net[1:]: for node in layer: inc = [n.value for n in node.inputs] in_val = dotproduct(inc, node.weights) node.value = node.activation(in_val) # Initialize delta which stores the values of the gradients for each activation units delta = [[] for _ in range(n_layers)] #initializing the velocity_gradient if momentum == True: v_dw = [[0 for i in range(len(_))] for _ in net] # Compute outer layer delta # Error for the MSE cost function err = [t_val[i] - o_nodes[i].value for i in range(o_units)] # The activation function used is relu or sigmoid function # First backward fast if node.activation == sigmoid: delta[-1] = [sigmoid_derivative(o_nodes[i].value) * err[i] for i in range(o_units)] elif node.activation == relu: delta[-1] = [relu_derivative(o_nodes[i].value) * err[i] for i in range(o_units)] elif node.activation == tanh: delta[-1] = [tanh_derivative(o_nodes[i].value) * err[i] for i in range(o_units)] elif node.activation == elu: delta[-1] = [elu_derivative(o_nodes[i].value) * err[i] for i in range(o_units)] else: delta[-1] = [leaky_relu_derivative(o_nodes[i].value) * err[i] for i in range(o_units)] # Propogating backward and finding gradients of nodes for each hidden layer h_layers = n_layers - 2 for i in range(h_layers, 0, -1): layer = net[i] h_units = len(layer) nx_layer = net[i+1] # weights from each ith layer node to each i + 1th layer node w = [[node.weights[k] for node in nx_layer] for k in range(h_units)] if activation == sigmoid: delta[i] = [sigmoid_derivative(layer[j].value) * dotproduct(w[j], delta[i+1]) for j in range(h_units)] elif activation == relu: delta[i] = [relu_derivative(layer[j].value) * dotproduct(w[j], delta[i+1]) for j in range(h_units)] elif activation == tanh: delta[i] = [tanh_derivative(layer[j].value) * dotproduct(w[j], delta[i+1]) for j in range(h_units)] elif activation == elu: delta[i] = [elu_derivative(layer[j].value) * dotproduct(w[j], delta[i+1]) for j in range(h_units)] else: delta[i] = [leaky_relu_derivative(layer[j].value) * dotproduct(w[j], delta[i+1]) for j in range(h_units)] #optimization with velocity gradient t_ = epoch + 1 if momentum == True: if epoch == 0: for i in range(len(delta)): for j in range(len(delta[i])): v_dw[i][j] = ((1-beta)*delta[i][j])/(1-beta**(t_+1)) else: for i in range(len(delta)): for j in range(len(delta[i])): v_dw[i][j] = (beta*v_dw[i][j]+(1-beta)*delta[i][j])/(1-beta**(t_+1)) # Update weights with normal gradient descent if momentum == False: for i in range(1, n_layers): layer = net[i] inc = [node.value for node in net[i-1]] units = len(layer) for j in range(units): layer[j].weights = vector_add(layer[j].weights, scalar_vector_product( learning_rate * delta[i][j], inc )) # Update weights with velocity gradient optimizer in gradient descent else: for i in range(1, n_layers): layer = net[i] inc = [node.value for node in net[i-1]] units = len(layer) for j in range(units): layer[j].weights = vector_add(layer[j].weights, scalar_vector_product( learning_rate * v_dw[i][j], inc )) return net