def _check_gradients(X, Y: Tensor2D, parameters: Parameters,
                     gradients: Parameters, lamb: float):
    epsilon = 1e-7
    parameters_ = deepcopy(parameters)
    numerical_gradients = {}
    for param_name, param_values in parameters_.items():
        print("Calculating numeric gradients for {}".format(param_name))
        param_shape = shape(param_values)
        numerical_gradients[param_name] = zeros(*param_shape)
        for i in range(param_shape[0]):
            for j in range(param_shape[1]):
                numerical_gradients[param_name][i][j] = \
                    _single_param_numerical_gradient(
                        X, Y, parameters_, lamb, param_name, i, j, epsilon)
    gradients_vector = _params_to_single_vector(gradients)
    numerical_gradients_vector = _params_to_single_vector(numerical_gradients)
    assert shape(gradients_vector) == shape(numerical_gradients_vector)
    # relative difference between the analytic and numerical gradient vectors
    delta = l2_norm(minus(numerical_gradients_vector, gradients_vector)) / (
        l2_norm(numerical_gradients_vector) + l2_norm(gradients_vector))
    if delta > epsilon:
        print("Gradient check failed delta={} > {} !!!!!".format(
            delta, epsilon))
    else:
        print("Gradient check passed delta={}".format(delta))
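# _check_gradients above calls a _single_param_numerical_gradient helper that
# is not shown in this section. The following is only a minimal sketch of what
# it could look like, assuming the _forward_propagation and _calculate_cost
# functions used elsewhere in this module: it estimates the partial derivative
# of the cost with respect to a single parameter entry via a two-sided
# (central) difference.
def _single_param_numerical_gradient(X, Y: Tensor2D, parameters: Parameters,
                                     lamb: float, param_name: str, i: int,
                                     j: int, epsilon: float) -> float:
    original_value = parameters[param_name][i][j]

    # cost with the entry nudged up by epsilon
    parameters[param_name][i][j] = original_value + epsilon
    Y_hat_plus, _ = _forward_propagation(X, parameters)
    cost_plus = _calculate_cost(Y_hat_plus, Y, parameters, lamb)

    # cost with the entry nudged down by epsilon
    parameters[param_name][i][j] = original_value - epsilon
    Y_hat_minus, _ = _forward_propagation(X, parameters)
    cost_minus = _calculate_cost(Y_hat_minus, Y, parameters, lamb)

    # restore the original value before returning the central difference
    parameters[param_name][i][j] = original_value
    return (cost_plus - cost_minus) / (2. * epsilon)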
def shuffle_truncate_dataset(X: Tensor2D, Y: Tensor2D,
                             truncate: int = None) -> Tuple[Tensor2D, Tensor2D]:
    assert shape(X)[1] == shape(Y)[1], \
        "X and Y should have the same number of columns (training examples)"
    index = list(range(shape(X)[1]))
    shuffle(index)
    if truncate and truncate < len(index):
        index = index[:truncate]
    X_ = [[Xi[j] for j in index] for Xi in X]
    Y_ = [[Yi[j] for j in index] for Yi in Y]
    return X_, Y_
def _calculate_cost(Y_hat, Y: Tensor2D, parameters: Parameters,
                    lamb: float) -> float:
    batch_size = shape(Y)[1]
    Y_loss = multinomial_logistic.loss(Y_hat, Y)
    assert shape(Y_loss) == (1, batch_size)
    # average loss: sum the row and convert to a single scalar
    cost = (1. / batch_size) * sum_all(Y_loss)
    # L2 regularization
    if lamb != 0.:
        param_sq_sum = 0.
        for param_key, param_values in parameters.items():
            # only regularize the W (weight) parameters, not the B (bias) ones
            if param_key.startswith('W'):
                param_sq_sum += sum_all(element_sq(param_values))
        cost += (lamb / (2. * batch_size)) * param_sq_sum
    return cost
def _params_to_single_vector(parameters: Parameters) -> Tensor2D:
    size = 0
    for param_values in parameters.values():
        param_shape = shape(param_values)
        size += param_shape[0] * param_shape[1]
    vector = zeros(size, 1)
    offset = 0
    # flatten each parameter column-major, iterating keys in sorted order so
    # that two calls (e.g. analytic vs. numerical gradients) line up entry
    # for entry
    for param_name in sorted(parameters.keys()):
        param_values = parameters[param_name]
        param_shape = shape(param_values)
        for i in range(param_shape[0]):
            for j in range(param_shape[1]):
                index = offset + (j * param_shape[0]) + i
                vector[index][0] = param_values[i][j]
        offset += param_shape[0] * param_shape[1]
    return vector
def split_into_batches(A: Tensor2D, batch_size: int) -> List[Tensor2D]:
    A_shape = shape(A)
    num_batches = floor(A_shape[1] / batch_size)
    overflow = A_shape[1] - (num_batches * batch_size)
    batches = []

    def _one_batch(start, end):
        return [Ai[start:end] for Ai in A]

    for b in range(num_batches):
        batches.append(_one_batch(b * batch_size, (b + 1) * batch_size))
    if overflow != 0:
        batches.append(_one_batch(num_batches * batch_size, A_shape[1]))
    total_cols_size = 0
    for batch in batches:
        batch_shape = shape(batch)
        assert batch_shape[0] == A_shape[0]
        assert batch_shape[1] == batch_size or batch_shape[1] == overflow
        total_cols_size += batch_shape[1]
    assert total_cols_size == A_shape[1]
    return batches
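# A quick illustration of split_into_batches (not part of the original
# module): a 2 x 5 matrix split with batch_size=2 yields two full 2 x 2
# batches plus a 2 x 1 overflow batch, with columns kept in their original
# order.
def _split_into_batches_example():
    A = [[0, 1, 2, 3, 4],
         [5, 6, 7, 8, 9]]
    batches = split_into_batches(A, 2)
    assert len(batches) == 3
    assert batches[0] == [[0, 1], [5, 6]]
    assert batches[2] == [[4], [9]]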
def softmax(Z: Tensor2D, stable=True) -> Tensor2D:
    Z_shape = shape(Z)
    if stable:
        # stable softmax via https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/
        # subtract the global max so every exponent is <= 0 and element_exp
        # cannot overflow
        Z_max = max(max(Zi) for Zi in Z)
        Z_minus_max = minus(Z, [[Z_max]])
        Z_exp = element_exp(Z_minus_max)
    else:
        Z_exp = element_exp(Z)
    Z_exp_col_sum = zeros(1, Z_shape[1])
    for i in range(Z_shape[0]):
        for j in range(Z_shape[1]):
            Z_exp_col_sum[0][j] += Z_exp[i][j]
    Z_softmax = zeros(*Z_shape)
    for i in range(Z_shape[0]):
        for j in range(Z_shape[1]):
            Z_softmax[i][j] = Z_exp[i][j] / Z_exp_col_sum[0][j]
    return Z_softmax
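# A small illustration of the softmax above (not part of the original module):
# each *column* of the output is a probability distribution over the rows, so
# every column should sum to 1.0 up to floating point error.
def _softmax_example():
    Z = [[1.0, 2.0],
         [2.0, 0.5],
         [3.0, 0.1]]  # 3 classes x 2 examples
    probabilities = softmax(Z)
    for j in range(2):
        column_sum = sum(probabilities[i][j] for i in range(3))
        assert abs(column_sum - 1.0) < 1e-9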
def _train_one_epoch(X_train_batches: List[Tensor2D],
                     Y_train_batches: List[Tensor2D], parameters: Parameters,
                     learning_rate: float, lamb: float) -> Parameters:
    total_batches = len(X_train_batches)
    trained_examples = 0
    for batch_index in range(len(X_train_batches)):
        batch_start_time = time.time()
        X_train_batch = X_train_batches[batch_index]
        Y_train_batch = Y_train_batches[batch_index]
        loss, parameters, train_accuracy = _train_one_mini_batch(
            X_train_batch, Y_train_batch, learning_rate, parameters, lamb)
        batch_duration = time.time() - batch_start_time
        trained_examples += shape(X_train_batch)[1]
        print(
            " batch: {}/{} training loss: {:0.2f} train accuracy: {:0.2f}% duration: {:0.2f}s"
            .format(batch_index + 1, total_batches, loss,
                    train_accuracy * 100., batch_duration))
    return parameters
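# _train_one_mini_batch is called above but not included in this section. A
# rough sketch of the shape it could take, assuming the _forward_propagation,
# _calculate_cost, _backward_propagation and _calculate_accuracy functions
# from this module and a plain gradient-descent update; the actual
# implementation may differ.
def _train_one_mini_batch(X_batch, Y_batch: Tensor2D, learning_rate: float,
                          parameters: Parameters,
                          lamb: float) -> Tuple[float, Parameters, float]:
    # forward pass and cost on this mini batch
    Y_hat, cache = _forward_propagation(X_batch, parameters)
    loss = _calculate_cost(Y_hat, Y_batch, parameters, lamb)

    # gradients for every W/B via back propagation
    gradients = _backward_propagation(X_batch, Y_batch, parameters, lamb,
                                      cache)

    # vanilla gradient descent: W := W - learning_rate * dW (same for B)
    for param_name in parameters.keys():
        parameters[param_name] = minus(
            parameters[param_name],
            element_multiply([[learning_rate]], gradients["d" + param_name]))

    train_accuracy = _calculate_accuracy(X_batch, Y_batch, parameters)
    return loss, parameters, train_accuracy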
def test_shape(matrix, expected_shape):
    assert shape(matrix) == expected_shape
def _calculate_accuracy(X, Y: Tensor2D, parameters: Parameters) -> float:
    Y_shape = shape(Y)
    Y_hat, _ = _forward_propagation(X, parameters)
    num_examples = Y_shape[1]
    num_correct = sum_all(element_equals(argmax(Y_hat), argmax(Y)))
    return num_correct / num_examples
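# The argmax used above is assumed to reduce each column to the row index of
# its largest value, returning a 1 x batch_size row vector so that
# element_equals(argmax(Y_hat), argmax(Y)) marks correctly classified
# examples with 1. A minimal sketch under that assumption (the module's real
# helper may differ):
def _columnwise_argmax(A: Tensor2D) -> Tensor2D:
    A_shape = shape(A)
    result = zeros(1, A_shape[1])
    for j in range(A_shape[1]):
        best_i = 0
        for i in range(A_shape[0]):
            if A[i][j] > A[best_i][j]:
                best_i = i
        result[0][j] = best_i
    return result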
def _backward_propagation(X, Y: Tensor2D, parameters: Parameters, lamb: float,
                          cache: Parameters) -> Parameters:
    X_shape = shape(X)
    batch_size = X_shape[1]

    W1 = parameters["W1"]
    B1 = parameters["B1"]
    W2 = parameters["W2"]
    B2 = parameters["B2"]
    W3 = parameters["W3"]
    B3 = parameters["B3"]

    A0 = X
    Z1 = cache["Z1"]
    A1 = cache["A1"]
    Z2 = cache["Z2"]
    A2 = cache["A2"]
    Z3 = cache["Z3"]
    A3 = cache["A3"]
    Y_hat = A3

    # Layer 3 (output) derivatives
    dZ3 = minus(Y_hat, Y)
    assert shape(dZ3) == shape(Z3)
    dW3 = element_multiply([[1. / batch_size]],
                           matrix_multiply(dZ3, transpose(A2)))
    if lamb != 0.:
        dW3 = add(dW3, _regularization_gradient(lamb, batch_size, W3))
    assert shape(dW3) == shape(W3)
    dB3 = element_multiply([[1. / batch_size]], sum_rows(dZ3))
    assert shape(dB3) == shape(B3)

    # Layer 2 (hidden) derivatives
    dZ2 = element_multiply(matrix_multiply(transpose(W3), dZ3),
                           relu.relu_derivative(Z2))
    assert shape(dZ2) == shape(Z2)
    dW2 = element_multiply([[1. / batch_size]],
                           matrix_multiply(dZ2, transpose(A1)))
    if lamb != 0.:
        dW2 = add(dW2, _regularization_gradient(lamb, batch_size, W2))
    assert shape(dW2) == shape(W2)
    dB2 = element_multiply([[1. / batch_size]], sum_rows(dZ2))
    assert shape(dB2) == shape(B2)

    # Layer 1 (hidden) derivatives
    dZ1 = element_multiply(matrix_multiply(transpose(W2), dZ2),
                           relu.relu_derivative(Z1))
    assert shape(dZ1) == shape(Z1)
    dW1 = element_multiply([[1. / batch_size]],
                           matrix_multiply(dZ1, transpose(A0)))
    if lamb != 0.:
        dW1 = add(dW1, _regularization_gradient(lamb, batch_size, W1))
    assert shape(dW1) == shape(W1)
    dB1 = element_multiply([[1. / batch_size]], sum_rows(dZ1))
    assert shape(dB1) == shape(B1)

    # return gradients for weights and bias for each layer
    gradients = {
        "dW1": dW1,
        "dB1": dB1,
        "dW2": dW2,
        "dB2": dB2,
        "dW3": dW3,
        "dB3": dB3,
    }
    return gradients
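# _regularization_gradient is used above but defined elsewhere. Given the L2
# penalty (lamb / (2 * batch_size)) * sum(W^2) added in _calculate_cost, its
# derivative with respect to W is (lamb / batch_size) * W, so a minimal
# version could look like this (using the same 1x1 scalar broadcast style as
# the code above):
def _regularization_gradient(lamb: float, batch_size: int,
                             W: Tensor2D) -> Tensor2D:
    return element_multiply([[lamb / batch_size]], W)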