def _check_gradients(X: Tensor2D, Y: Tensor2D, parameters: Parameters,
                     gradients: Parameters, lamb: float):
    epsilon = 1e-7
    parameters_ = deepcopy(parameters)
    numerical_gradients = {}
    for param_name, param_values in parameters_.items():
        print("Calculating numeric gradients for {}".format(param_name))
        param_shape = shape(param_values)
        numerical_gradients[param_name] = zeros(*param_shape)
        for i in range(param_shape[0]):
            for j in range(param_shape[1]):
                numerical_gradients[param_name][i][j] = _single_param_numerical_gradient(
                    X, Y, parameters_, lamb, param_name, i, j, epsilon)
    gradients_vector = _params_to_single_vector(gradients)
    numerical_gradients_vector = _params_to_single_vector(numerical_gradients)
    assert shape(gradients_vector) == shape(numerical_gradients_vector)
    # Relative difference between the analytical and numerical gradient vectors;
    # a small value means backpropagation agrees with the numerical estimate.
    delta = l2_norm(minus(numerical_gradients_vector, gradients_vector)) / (
        l2_norm(numerical_gradients_vector) + l2_norm(gradients_vector))
    if delta > epsilon:
        print("Gradient check failed delta={} > {} !!!!!".format(delta, epsilon))
    else:
        print("Gradient check passed delta={}".format(delta))
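# _single_param_numerical_gradient is called above but not shown in this
# section. The usual recipe is a central difference: nudge one entry of one
# parameter matrix by +/-epsilon, recompute the cost, and take
# (cost_plus - cost_minus) / (2 * epsilon). The standalone sketch below is a
# hedged illustration of that recipe on a plain scalar function; the real
# helper presumably re-runs forward propagation and the cost instead.
def _central_difference_sketch(f, x: float, epsilon: float = 1e-7) -> float:
    # Two-sided (central) difference approximation of df/dx at x.
    return (f(x + epsilon) - f(x - epsilon)) / (2. * epsilon)

# Example: d/dx of x**2 at x=3.0 is 6.0; the estimate agrees to roughly 1e-7.
assert abs(_central_difference_sketch(lambda x: x ** 2, 3.0) - 6.0) < 1e-5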
def _update_parameters(parameters: Parameters, gradients: Parameters,
                       learning_rate: float) -> Parameters:
    updated_parameters = {}
    for param in ("W1", "B1", "W2", "B2", "W3", "B3"):
        # Gradient-descent step: param := param - learning_rate * d(param).
        updated_parameters[param] = minus(
            parameters[param],
            element_multiply([[learning_rate]], gradients["d" + param]))
    return updated_parameters
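# Hedged usage sketch for _update_parameters: one plain gradient-descent step
# on made-up 1x2 parameters (the values are hypothetical, not from the repo).
# Assumes minus/element_multiply are element-wise with [[scalar]] broadcasting,
# as they are used throughout this module.
toy_parameters = {"W1": [[0.5, -0.2]], "B1": [[0.1, 0.1]],
                  "W2": [[0.0, 0.0]], "B2": [[0.0, 0.0]],
                  "W3": [[0.0, 0.0]], "B3": [[0.0, 0.0]]}
toy_gradients = {"dW1": [[1.0, -1.0]], "dB1": [[0.0, 0.0]],
                 "dW2": [[0.0, 0.0]], "dB2": [[0.0, 0.0]],
                 "dW3": [[0.0, 0.0]], "dB3": [[0.0, 0.0]]}
toy_updated = _update_parameters(toy_parameters, toy_gradients, learning_rate=0.1)
# Expected: W1 becomes [[0.4, -0.1]] (0.5 - 0.1 * 1.0 and -0.2 - 0.1 * -1.0);
# every other entry is unchanged because its gradient is zero.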
def softmax(Z: Tensor2D, stable=True) -> Tensor2D:
    Z_shape = shape(Z)
    if stable:
        # Stable softmax via
        # https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/
        # Subtract the largest entry of Z before exponentiating; softmax is
        # shift-invariant, so the result is unchanged but exp() cannot overflow.
        Z_max = max(max(row) for row in Z)
        Z_minus_max = minus(Z, [[Z_max]])
        Z_exp = element_exp(Z_minus_max)
    else:
        Z_exp = element_exp(Z)
    # Normalise column-wise: each column of Z holds one example's class scores.
    Z_exp_col_sum = zeros(1, Z_shape[1])
    for i in range(Z_shape[0]):
        for j in range(Z_shape[1]):
            Z_exp_col_sum[0][j] += Z_exp[i][j]
    Z_softmax = zeros(*Z_shape)
    for i in range(Z_shape[0]):
        for j in range(Z_shape[1]):
            Z_softmax[i][j] = Z_exp[i][j] / Z_exp_col_sum[0][j]
    return Z_softmax
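# Hedged usage sketch for softmax: columns are examples, so each output column
# should sum to 1. The 3x2 scores below are made up for illustration.
Z_demo = [[1.0, 3.0],
          [2.0, 0.5],
          [0.5, 1.0]]  # 3 classes x 2 examples
A_demo = softmax(Z_demo)
for j in range(2):
    assert abs(sum(A_demo[i][j] for i in range(3)) - 1.0) < 1e-9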
def test_minus(A, B, expected_C):
    assert minus(A, B) == expected_C
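# Hedged illustration: test_minus above is presumably driven by a
# @pytest.mark.parametrize decorator that is not shown in this section. The
# hypothetical cases below assume minus is element-wise subtraction with
# [[scalar]] broadcasting, as it is used elsewhere in this module.
import pytest

@pytest.mark.parametrize("A, B, expected_C", [
    ([[3., 4.], [5., 6.]], [[1., 1.], [1., 1.]], [[2., 3.], [4., 5.]]),
    ([[3., 4.], [5., 6.]], [[1.]], [[2., 3.], [4., 5.]]),  # [[scalar]] broadcast
])
def test_minus_examples(A, B, expected_C):
    assert minus(A, B) == expected_C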
def _backward_propagation(X: Tensor2D, Y: Tensor2D, parameters: Parameters,
                          lamb: float, cache: Parameters) -> Parameters:
    X_shape = shape(X)
    batch_size = X_shape[1]
    W1 = parameters["W1"]
    B1 = parameters["B1"]
    W2 = parameters["W2"]
    B2 = parameters["B2"]
    W3 = parameters["W3"]
    B3 = parameters["B3"]
    A0 = X
    Z1 = cache["Z1"]
    A1 = cache["A1"]
    Z2 = cache["Z2"]
    A2 = cache["A2"]
    Z3 = cache["Z3"]
    A3 = cache["A3"]
    Y_hat = A3

    # Layer 3 (output) derivatives
    dZ3 = minus(Y_hat, Y)
    assert shape(dZ3) == shape(Z3)
    dW3 = element_multiply([[1. / batch_size]],
                           matrix_multiply(dZ3, transpose(A2)))
    if lamb != 0.:
        dW3 = add(dW3, _regularization_gradient(lamb, batch_size, W3))
    assert shape(dW3) == shape(W3)
    dB3 = element_multiply([[1. / batch_size]], sum_rows(dZ3))
    assert shape(dB3) == shape(B3)

    # Layer 2 (hidden) derivatives
    dZ2 = element_multiply(matrix_multiply(transpose(W3), dZ3),
                           relu.relu_derivative(Z2))
    assert shape(dZ2) == shape(Z2)
    dW2 = element_multiply([[1. / batch_size]],
                           matrix_multiply(dZ2, transpose(A1)))
    if lamb != 0.:
        dW2 = add(dW2, _regularization_gradient(lamb, batch_size, W2))
    assert shape(dW2) == shape(W2)
    dB2 = element_multiply([[1. / batch_size]], sum_rows(dZ2))
    assert shape(dB2) == shape(B2)

    # Layer 1 (hidden) derivatives
    dZ1 = element_multiply(matrix_multiply(transpose(W2), dZ2),
                           relu.relu_derivative(Z1))
    assert shape(dZ1) == shape(Z1)
    dW1 = element_multiply([[1. / batch_size]],
                           matrix_multiply(dZ1, transpose(A0)))
    if lamb != 0.:
        dW1 = add(dW1, _regularization_gradient(lamb, batch_size, W1))
    assert shape(dW1) == shape(W1)
    dB1 = element_multiply([[1. / batch_size]], sum_rows(dZ1))
    assert shape(dB1) == shape(B1)

    # Return gradients for weights and bias for each layer.
    gradients = {
        "dW1": dW1,
        "dB1": dB1,
        "dW2": dW2,
        "dB2": dB2,
        "dW3": dW3,
        "dB3": dB3,
    }
    return gradients
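# _regularization_gradient is called above but not defined in this section.
# A minimal sketch, assuming it computes the usual L2 penalty gradient
# (lambda / batch_size) * W that gets added to each dW; the argument order is
# taken from the call sites above, the body itself is an assumption.
def _regularization_gradient_sketch(lamb: float, batch_size: int,
                                    W: Tensor2D) -> Tensor2D:
    # d/dW of (lambda / (2 * batch_size)) * sum(W ** 2) is (lambda / batch_size) * W.
    return element_multiply([[lamb / batch_size]], W)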