def compute_gradients(dL_da2, da2_dz2, dz2_dW2, dz2_db2, dz2_da1, da1_dz1, dz1_dW1, dz1_db1):
    '''
    Back-propagate the local gradients through both layers to obtain the
    gradients of the loss L w.r.t. all model parameters.

    Input: the local gradients produced by the forward/backward helper
    functions (see their individual docstrings for details).

    Output:
        dL_dW2: gradient of the loss L w.r.t. the weight matrix W2
        dL_db2: gradient of the loss L w.r.t. the biases b2
        dL_dW1: gradient of the loss L w.r.t. the weight matrix W1
        dL_db1: gradient of the loss L w.r.t. the biases b1

    Hint: the problem1 helpers (e.g. sr.compute_dL_dz(...)) can be reused here.
    '''
    #########################################
    ## INSERT YOUR CODE HERE
    # --- output (2nd) layer: chain rule through z2 ---
    dL_dz2 = sr.compute_dL_dz(dL_da2, da2_dz2)
    dL_dW2, dL_db2 = sr.compute_dL_dW(dL_dz2, dz2_dW2), sr.compute_dL_db(dL_dz2, dz2_db2)
    # --- hidden (1st) layer: propagate through a1, then element-wise through the activation ---
    dL_dz1 = np.multiply(compute_dL_da1(dL_dz2, dz2_da1), da1_dz1)
    dL_dW1, dL_db1 = sr.compute_dL_dW(dL_dz1, dz1_dW1), sr.compute_dL_db(dL_dz1, dz1_db1)
    #########################################
    return dL_dW2, dL_db2, dL_dW1, dL_db1
def compute_dL_dz1(dL_da1, da1_dz1):
    '''
    Compute local gradient of the loss function L w.r.t. the logits z1
    using chain rule. (2 points)

    Input:
        dL_da1: the gradient of the loss function L w.r.t. the activations a1
        da1_dz1: the gradient of the activations a1 w.r.t. the logits z1

    Output:
        dL_dz1: the partial gradient of the loss function w.r.t. the logits z1,
            a numpy float vector of shape h by 1. Each element represents the
            partial gradient of the loss function L w.r.t. the i-th logit
            z1[i]: d_L / d_z1[i]
    '''
    #########################################
    ## INSERT YOUR CODE HERE
    # The activation is element-wise, so the chain rule here is an
    # element-wise product (same computation as in compute_gradients),
    # NOT the weight-gradient helper sr.compute_dL_dW, which would
    # produce a matrix-shaped (outer-product) result instead of an
    # h-by-1 vector.
    dL_dz1 = np.multiply(dL_da1, da1_dz1)
    #########################################
    return dL_dz1