def backward(self, model, targets, outputs):
    self.model = model
    nn_architecture = self.model.nn_architecture
    parameters = self.model.parameters
    memory = self.model.memory

    # Gradient of the mean-squared-error loss w.r.t. the network output.
    dA_prev = 2 * (outputs - targets) / outputs.shape[1]

    for layer_idx_prev, layer in reversed(list(enumerate(nn_architecture))):
        layer_idx_curr = layer_idx_prev + 1
        dA_curr = dA_prev

        A_prev = memory[f"A{layer_idx_prev}"]
        Z_curr = memory[f"Z{layer_idx_curr}"]
        W_curr = parameters[f"W{layer_idx_curr}"]
        m = A_prev.shape[1]

        # Backpropagate through the activation, then through the affine layer.
        backward_activation_func = get_activation_function(layer['activation'], backward=True)
        dZ_curr = backward_activation_func(dA_curr, Z_curr)
        dW_curr = (1. / m) * np.matmul(dZ_curr, A_prev.T)
        db_curr = (1. / m) * np.sum(dZ_curr, axis=1, keepdims=True)
        dA_prev = np.matmul(W_curr.T, dZ_curr)

        # L2 weight decay: the penalty's gradient is wd * W, added to dW
        # (not a scalar added to the loss gradient).
        dW_curr += self.wd * W_curr

        # RMSProp accumulator: exponential moving average of squared gradients,
        # seeded with the current squared gradient on the first step.
        if isinstance(self.grads_values.get(f"dW_square{layer_idx_curr}"), np.ndarray):
            self.grads_values[f"dW_square{layer_idx_curr}"] = (
                self.momentum * self.grads_values[f"dW_square{layer_idx_curr}"]
                + (1.0 - self.momentum) * dW_curr ** 2
            )
        else:
            self.grads_values[f"dW_square{layer_idx_curr}"] = dW_curr ** 2

        if isinstance(self.grads_values.get(f"db_square{layer_idx_curr}"), np.ndarray):
            self.grads_values[f"db_square{layer_idx_curr}"] = (
                self.momentum * self.grads_values[f"db_square{layer_idx_curr}"]
                + (1.0 - self.momentum) * db_curr ** 2
            )
        else:
            self.grads_values[f"db_square{layer_idx_curr}"] = db_curr ** 2

        self.grads_values[f"dW{layer_idx_curr}"] = dW_curr
        self.grads_values[f"db{layer_idx_curr}"] = db_curr

    # Update step: each gradient is rescaled by the inverse RMS of its history;
    # the learning rate is applied exactly once.
    for idx, layer in enumerate(nn_architecture):
        layer_idx = idx + 1
        parameters[f"W{layer_idx}"] -= (
            self.lr / np.sqrt(self.grads_values[f"dW_square{layer_idx}"] + 1e-8)
            * self.grads_values[f"dW{layer_idx}"]
        )
        parameters[f"b{layer_idx}"] -= (
            self.lr / np.sqrt(self.grads_values[f"db_square{layer_idx}"] + 1e-8)
            * self.grads_values[f"db{layer_idx}"]
        )

    self.model.parameters = parameters
    return self.model
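What this implements is the RMSProp rule: keep an exponential moving average of squared gradients, v <- rho * v + (1 - rho) * g**2, and scale each step by 1 / sqrt(v + eps), with self.momentum playing the role of the decay rate rho. A minimal self-contained sketch of the same rule on a one-dimensional quadratic (all names and hyperparameters below are illustrative, not taken from the class above):

import numpy as np

# RMSProp on f(w) = w**2; lr and rho are assumed values for illustration.
lr, rho, eps = 0.1, 0.9, 1e-8
w, v = 5.0, 0.0
for step in range(100):
    g = 2.0 * w                          # gradient of w**2
    v = rho * v + (1.0 - rho) * g ** 2   # moving average of squared gradients
    w -= lr * g / np.sqrt(v + eps)       # preconditioned update
print(round(w, 4))  # ends close to the minimum at 0

The sketch seeds v at zero while the method above seeds it with the first squared gradient; both are common choices and behave almost identically after a few steps.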
def forward(self, xb):
    # Cache activations and pre-activations for the backward pass.
    self.memory = dict()
    A_curr = xb
    for idx, layer in enumerate(self.nn_architecture):
        layer_idx = idx + 1
        A_prev = A_curr
        W_curr = self.parameters[f"W{layer_idx}"]
        b_curr = self.parameters[f"b{layer_idx}"]
        # Affine transform, then the layer's activation.
        Z_curr = np.matmul(W_curr, A_prev) + b_curr
        activation_func = get_activation_function(layer['activation'])
        A_curr = activation_func(Z_curr)
        self.memory[f"A{idx}"] = A_prev   # input to layer `layer_idx`
        self.memory[f"Z{layer_idx}"] = Z_curr
    return A_curr
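The shapes follow a column-batch convention: W{l} is (n_l, n_{l-1}), activations are (features, batch), and the bias broadcasts across batch columns in Z = W @ A + b. A minimal sketch of the same loop, assuming a two-layer architecture spec and inline relu/sigmoid standing in for get_activation_function:

import numpy as np

np.random.seed(0)
nn_architecture = [
    {"input_dim": 3, "output_dim": 4, "activation": "relu"},
    {"input_dim": 4, "output_dim": 1, "activation": "sigmoid"},
]
parameters = {}
for idx, layer in enumerate(nn_architecture):
    # W{l}: (output_dim, input_dim); b{l}: (output_dim, 1)
    parameters[f"W{idx + 1}"] = 0.1 * np.random.randn(layer["output_dim"], layer["input_dim"])
    parameters[f"b{idx + 1}"] = np.zeros((layer["output_dim"], 1))

xb = np.random.randn(3, 5)  # 3 features, batch of 5 columns
A = xb
for idx, layer in enumerate(nn_architecture):
    Z = np.matmul(parameters[f"W{idx + 1}"], A) + parameters[f"b{idx + 1}"]
    A = np.maximum(0.0, Z) if layer["activation"] == "relu" else 1.0 / (1.0 + np.exp(-Z))
print(A.shape)  # (1, 5): one prediction per batch column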
def backward(self, model, targets, outputs):
    self.model = model
    nn_architecture = self.model.nn_architecture
    parameters = self.model.parameters
    memory = self.model.memory

    # Gradient of the mean-squared-error loss w.r.t. the network output.
    dA_prev = 2 * (outputs - targets) / outputs.shape[1]

    # Running powers beta1**t and beta2**t for bias correction; advanced once
    # per optimization step (advancing them per layer would over-correct).
    self.grads_values["beta1"] = self.grads_values.get("beta1", 1.0) * self.beta1
    self.grads_values["beta2"] = self.grads_values.get("beta2", 1.0) * self.beta2

    for layer_idx_prev, layer in reversed(list(enumerate(nn_architecture))):
        layer_idx_curr = layer_idx_prev + 1
        dA_curr = dA_prev

        A_prev = memory[f"A{layer_idx_prev}"]
        Z_curr = memory[f"Z{layer_idx_curr}"]
        W_curr = parameters[f"W{layer_idx_curr}"]
        m = A_prev.shape[1]

        # Backpropagate through the activation, then through the affine layer.
        backward_activation_func = get_activation_function(layer['activation'], backward=True)
        dZ_curr = backward_activation_func(dA_curr, Z_curr)
        dW_curr = (1. / m) * np.matmul(dZ_curr, A_prev.T)
        db_curr = (1. / m) * np.sum(dZ_curr, axis=1, keepdims=True)
        dA_prev = np.matmul(W_curr.T, dZ_curr)

        # L2 weight decay: the penalty's gradient is wd * W, added to dW.
        dW_curr += self.wd * W_curr

        # First moment: exponential moving average of gradients, stored back
        # into the accumulator (seeded as if it started at zero).
        if isinstance(self.grads_values.get(f"dW_sum{layer_idx_curr}"), np.ndarray):
            self.grads_values[f"dW_sum{layer_idx_curr}"] = (
                self.beta1 * self.grads_values[f"dW_sum{layer_idx_curr}"]
                + (1.0 - self.beta1) * dW_curr
            )
        else:
            self.grads_values[f"dW_sum{layer_idx_curr}"] = (1.0 - self.beta1) * dW_curr

        if isinstance(self.grads_values.get(f"db_sum{layer_idx_curr}"), np.ndarray):
            self.grads_values[f"db_sum{layer_idx_curr}"] = (
                self.beta1 * self.grads_values[f"db_sum{layer_idx_curr}"]
                + (1.0 - self.beta1) * db_curr
            )
        else:
            self.grads_values[f"db_sum{layer_idx_curr}"] = (1.0 - self.beta1) * db_curr

        # Second moment: exponential moving average of squared gradients.
        if isinstance(self.grads_values.get(f"dW_square{layer_idx_curr}"), np.ndarray):
            self.grads_values[f"dW_square{layer_idx_curr}"] = (
                self.beta2 * self.grads_values[f"dW_square{layer_idx_curr}"]
                + (1.0 - self.beta2) * dW_curr ** 2
            )
        else:
            self.grads_values[f"dW_square{layer_idx_curr}"] = (1.0 - self.beta2) * dW_curr ** 2

        if isinstance(self.grads_values.get(f"db_square{layer_idx_curr}"), np.ndarray):
            self.grads_values[f"db_square{layer_idx_curr}"] = (
                self.beta2 * self.grads_values[f"db_square{layer_idx_curr}"]
                + (1.0 - self.beta2) * db_curr ** 2
            )
        else:
            self.grads_values[f"db_square{layer_idx_curr}"] = (1.0 - self.beta2) * db_curr ** 2

        self.grads_values[f"dW{layer_idx_curr}"] = dW_curr
        self.grads_values[f"db{layer_idx_curr}"] = db_curr

    # Nadam-style update: bias-correct both moments, then combine the corrected
    # momentum with the current bias-corrected gradient, Nesterov style.
    for idx, layer in enumerate(nn_architecture):
        layer_idx = idx + 1
        vtw_curr = self.grads_values[f"dW_square{layer_idx}"] / (1.0 - self.grads_values["beta2"])
        vtb_curr = self.grads_values[f"db_square{layer_idx}"] / (1.0 - self.grads_values["beta2"])
        mtw_curr = self.grads_values[f"dW_sum{layer_idx}"] / (1.0 - self.grads_values["beta1"])
        mtb_curr = self.grads_values[f"db_sum{layer_idx}"] / (1.0 - self.grads_values["beta1"])
        parameters[f"W{layer_idx}"] -= self.lr / (np.sqrt(vtw_curr) + 1e-8) * (
            self.beta1 * mtw_curr
            + (1.0 - self.beta1) * self.grads_values[f"dW{layer_idx}"] / (1.0 - self.grads_values["beta1"])
        )
        parameters[f"b{layer_idx}"] -= self.lr / (np.sqrt(vtb_curr) + 1e-8) * (
            self.beta1 * mtb_curr
            + (1.0 - self.beta1) * self.grads_values[f"db{layer_idx}"] / (1.0 - self.grads_values["beta1"])
        )

    self.model.parameters = parameters
    return self.model
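The accumulators and the final loop together give a Nadam-style update (Adam with a Nesterov look-ahead): both moments are bias-corrected, and the step mixes the corrected momentum with the current bias-corrected gradient. A minimal sketch on a one-dimensional quadratic (names and hyperparameters assumed for illustration):

import numpy as np

lr, beta1, beta2, eps = 0.1, 0.9, 0.999, 1e-8
w, m, v = 5.0, 0.0, 0.0
b1t, b2t = 1.0, 1.0                          # running beta1**t, beta2**t
for step in range(200):
    g = 2.0 * w                              # gradient of w**2
    b1t, b2t = b1t * beta1, b2t * beta2      # advance once per step
    m = beta1 * m + (1.0 - beta1) * g        # first moment
    v = beta2 * v + (1.0 - beta2) * g ** 2   # second moment
    m_hat = m / (1.0 - b1t)                  # bias-corrected first moment
    v_hat = v / (1.0 - b2t)                  # bias-corrected second moment
    # Nesterov look-ahead: corrected momentum mixed with the current gradient.
    w -= lr / (np.sqrt(v_hat) + eps) * (beta1 * m_hat + (1.0 - beta1) * g / (1.0 - b1t))
print(round(w, 4))  # ends close to the minimum at 0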