def pass_backward(self, grad):
    """Backpropagate ``grad`` through this fully connected layer.

    When the layer is trainable, the weight and bias parameters are
    updated in place via the configured optimizer. Returns the gradient
    with respect to the layer inputs.
    """
    # Snapshot the pre-update weights: the returned input gradient must
    # be computed against the weights used during the forward pass.
    weights_before_update = self.weights
    if self.is_trainable:
        grad_wrt_weights = self.inputs.T @ grad
        grad_wrt_bias = np.sum(grad, axis=0, keepdims=True)
        self.weights = optimizer(self.weight_optimizer).update(
            weights_before_update, grad_wrt_weights)
        self.bias = optimizer(self.weight_optimizer).update(
            self.bias, grad_wrt_bias)
    return grad @ weights_before_update.T
def pass_backward(self, grad):
    """Backpropagation through time (BPTT) for this recurrent layer.

    Walks the timesteps in reverse, accumulating gradients for the
    input, recurrent, and output weight matrices (each output's inner
    unroll is truncated to ``self.bptt_truncate`` steps), then applies
    one optimizer update per parameter. Returns the gradient with
    respect to the layer inputs, one slice per timestep.

    Assumes ``grad`` has exactly three axes (presumably
    (batch, time_steps, units)) — the unpack below relies on this;
    TODO confirm against the caller.
    """
    _, time_steps, _ = grad.shape
    next_grad = np.zeros_like(grad)
    if self.is_trainable:
        # Gradient accumulators, one per trainable parameter.
        dW_input = np.zeros_like(self.W_input)
        dW_recur = np.zeros_like(self.W_recur)
        dW_output = np.zeros_like(self.W_output)
        db_input = np.zeros_like(self.b_input)
        db_output = np.zeros_like(self.b_output)
        for t in np.arange(time_steps)[::-1]:  # reversed over timesteps
            # NOTE(review): dW_output accumulates grad.T @ states here,
            # while the LSTM/GRU blocks in this file accumulate
            # states.T @ grad — confirm the orientation matches the
            # shape of W_output.
            dW_output += np.dot(grad[:, t].T, self.states[:, t])
            db_output += np.sum(grad[:, t], axis=0)
            # Gradient at the pre-activation hidden state for step t.
            dstate = np.dot(grad[:, t], self.W_output) * activate(
                self.activation).backward(self.state_inputs[:, t])
            next_grad[:, t] = np.dot(dstate, self.W_input)
            # Truncated unroll: propagate dstate back at most
            # bptt_truncate steps from t.
            for tt in np.arange(max(0, t - self.bptt_truncate),
                                t + 1)[::-1]:  # reversed
                dW_input += np.dot(dstate.T, self.inputs[:, tt])
                # NOTE(review): at tt == 0 the index tt - 1 wraps to the
                # last timestep via negative indexing — presumably the
                # initial state is stored there or is zero; verify.
                dW_recur += np.dot(dstate.T, self.states[:, tt - 1])
                db_input += np.sum(dstate, axis=0)
                # Carry the state gradient one step further back.
                dstate = np.dot(dstate, self.W_recur) * activate(
                    self.activation).backward(self.state_inputs[:, tt - 1])
        # optimize weights and bias
        self.W_input = optimizer(self.optimizer_kwargs).update(
            self.W_input, cg(dW_input))
        self.W_output = optimizer(self.optimizer_kwargs).update(
            self.W_output, cg(dW_output))
        self.W_recur = optimizer(self.optimizer_kwargs).update(
            self.W_recur, cg(dW_recur))
        self.b_input = optimizer(self.optimizer_kwargs).update(
            self.b_input, cg(db_input))
        self.b_output = optimizer(self.optimizer_kwargs).update(
            self.b_output, cg(db_output))
    # endif self.is_trainable
    return next_grad
def pass_backward(self, grad):
    """Backward pass of batch normalization.

    Updates ``gamma`` and ``beta`` in place (when trainable) and
    returns the gradient with respect to the layer inputs, using the
    cached ``input_norm`` and ``inv_stddev`` from the forward pass.
    """
    batch_size = grad.shape[0]
    # Gradient w.r.t. the normalized input. Computed before any update
    # so it uses gamma as it was during the forward pass.
    grad_norm = grad * self.gamma
    if self.is_trainable:
        grad_beta = np.sum(grad, axis=0)
        grad_gamma = np.sum(grad * self.input_norm, axis=0)
        self.gamma = optimizer(self.weight_optimizer).update(
            self.gamma, grad_gamma)
        self.beta = optimizer(self.weight_optimizer).update(
            self.beta, grad_beta)
    # Standard batch-norm input gradient, folding in the mean and
    # variance contributions.
    centered_term = (batch_size * grad_norm
                     - np.sum(grad_norm, axis=0)
                     - self.input_norm * np.sum(grad_norm * self.input_norm,
                                                axis=0))
    return np.divide(1., batch_size) * self.inv_stddev * centered_term
def pass_backward(self, grad, epoch_num, batch_num, batch_size):
    """Backpropagate ``grad`` through this layer, updating weights.

    Forwards the epoch/batch bookkeeping to the optimizer and returns
    the gradient with respect to the layer inputs, computed against the
    pre-update weights.

    NOTE(review): ``dweights`` below is derived from the weights
    themselves rather than from the layer inputs, which is unusual for
    a weight gradient — confirm against this layer's forward pass.
    """
    weights_snapshot = self.weights
    if self.is_trainable:
        dweights = np.sum(grad @ weights_snapshot.T, axis=1)
        self.weights = optimizer(self.weight_optimizer).update(
            weights_snapshot, dweights.T, epoch_num, batch_num, batch_size)
    return grad @ weights_snapshot.T
def pass_backward(self, grad):
    """Backward pass of the im2col-based convolution layer.

    Computes weight and bias gradients from the cached ``input_col``
    matrix (when trainable), applies the optimizer updates, and returns
    the gradient with respect to the layer inputs, folded back to image
    shape via ``col2im_indices``.
    """
    batch_size, in_depth, in_height, in_width = self.input_shape
    # Flatten the output gradient so each row corresponds to one filter.
    grad_flat = grad.transpose(1, 2, 3, 0).reshape(self.filter_num, -1)
    if self.is_trainable:
        grad_bias = np.sum(grad, axis=(0, 2, 3)).reshape(self.filter_num, -1)
        grad_weights = (grad_flat @ self.input_col.T).reshape(
            self.weights.shape)
        # optimize the weights and bias
        self.weights = optimizer(self.weight_optimizer).update(
            self.weights, grad_weights)
        self.bias = optimizer(self.weight_optimizer).update(
            self.bias, grad_bias)
    # NOTE(review): the input gradient below is computed from the
    # *updated* weights, whereas the dense layer in this file snapshots
    # the pre-update weights — confirm this asymmetry is intentional.
    flat_weights = self.weights.reshape(self.filter_num, -1)
    grad_col = flat_weights.T @ grad_flat
    pad_height, pad_width = get_pad(self.padding, in_height, in_width,
                                    self.strides[0], self.strides[1],
                                    self.kernel_size[0], self.kernel_size[1])
    return col2im_indices(grad_col, self.input_shape,
                          self.kernel_size[0], self.kernel_size[1],
                          padding=(pad_height, pad_width),
                          stride=self.strides[0])
def pass_backward(self, grad, epoch_num, batch_num, batch_size):
    """Backpropagation through time for this GRU layer.

    Walks the timesteps in reverse, accumulating gradients for the
    update, reset, candidate-cell and final (output) parameters, then
    applies one optimizer update per parameter, passing through the
    epoch/batch bookkeeping. Returns the gradient with respect to the
    layer inputs.

    Assumes ``grad`` has exactly three axes (presumably
    (batch, time_steps, units)) — TODO confirm against the caller.
    """
    _, time_steps, _ = grad.shape
    next_grad = np.zeros_like(grad)
    if self.is_trainable:
        # Gradient accumulators for each parameter matrix/bias.
        dW_update = np.zeros_like(self.W_update)
        dW_reset = np.zeros_like(self.W_reset)
        dW_cell = np.zeros_like(self.W_cell)
        dW_final = np.zeros_like(self.W_final)
        db_update = np.zeros_like(self.b_update)
        db_reset = np.zeros_like(self.b_reset)
        db_cell = np.zeros_like(self.b_cell)
        db_final = np.zeros_like(self.b_final)
        # Per-timestep state gradients and the three additive paths
        # (a: direct, b: via candidate cell, c: via the gates) that are
        # summed into the gradient carried to the previous timestep.
        dstates = np.zeros_like(self.states)
        dstate_a = np.zeros_like(self.states)
        dstate_b = np.zeros_like(self.states)
        dstate_c = np.zeros_like(self.states)
        dstates_next = np.zeros_like(self.states)
        dstates_prime = np.zeros_like(self.states)
        dz_cell = np.zeros_like(self.cell)
        dcell = np.zeros_like(self.cell)
        dz_reset = np.zeros_like(self.reset)
        dreset = np.zeros_like(self.reset)
        dz_update = np.zeros_like(self.update)
        dupdate = np.zeros_like(self.update)
        for t in np.arange(time_steps)[::-1]:  # reversed over timesteps
            dW_final += np.dot(self.states[:, t].T, grad[:, t])
            db_final += np.sum(grad[:, t], axis=0)
            # State gradient: output projection plus the carry from t+1.
            dstates[:, t] = np.dot(grad[:, t], self.W_final.T)
            dstates[:, t] += dstates_next[:, t]
            # NOTE(review): next_grad is recomputed from the full
            # dstates on every iteration and only the last iteration's
            # value survives — confirm this is intended.
            next_grad = np.dot(dstates, self.W_final)
            dcell[:, t] = self.update[:, t] * dstates[:, t]
            dstate_a[:, t] = (1. - self.update[:, t]) * dstates[:, t]
            # NOTE(review): indices [:, t - 1] wrap to the last timestep
            # when t == 0 via negative indexing — presumably the initial
            # state lives there or is zero; verify.
            dupdate[:, t] = self.cell[:, t] * dstates[:, t] - self.states[
                :, t - 1] * dstates[:, t]
            # Candidate-cell path.
            dcell[:, t] = activate(self.activation).backward(
                self.cell[:, t]) * dcell[:, t]
            dW_cell += np.dot(self.z_tilde[:, t - 1].T, dcell[:, t])
            db_cell += np.sum(dcell[:, t], axis=0)
            dz_cell = np.dot(dcell[:, t], self.W_cell.T)
            dstates_prime[:, t] = dz_cell[:, :self.h_units]
            dstate_b[:, t] = self.reset[:, t] * dstates_prime[:, t]
            # Reset-gate path.
            dreset[:, t] = self.states[:, t - 1] * dstates_prime[:, t]
            dreset[:, t] = activate(self.gate_activation).backward(
                self.reset[:, t]) * dreset[:, t]
            dW_reset += np.dot(self.z[:, t].T, dreset[:, t])
            db_reset += np.sum(dreset[:, t], axis=0)
            dz_reset = np.dot(dreset[:, t], self.W_reset.T)
            # Update-gate path.
            dupdate[:, t] = activate(self.gate_activation).backward(
                self.update[:, t]) * dupdate[:, t]
            dW_update += np.dot(self.z[:, t].T, dupdate[:, t])
            db_update += np.sum(dupdate[:, t], axis=0)
            dz_update = np.dot(dupdate[:, t], self.W_update.T)
            # Combine gate contributions; the first h_units columns are
            # the slice flowing back into the previous hidden state.
            dz = dz_reset + dz_update
            dstate_c[:, t] = dz[:, :self.h_units]
            dstates_next = dstate_a + dstate_b + dstate_c
        # optimize weights and bias
        self.W_final = optimizer(self.optimizer_kwargs).update(
            self.W_final, cg(dW_final), epoch_num, batch_num, batch_size)
        self.b_final = optimizer(self.optimizer_kwargs).update(
            self.b_final, cg(db_final), epoch_num, batch_num, batch_size)
        self.W_cell = optimizer(self.optimizer_kwargs).update(
            self.W_cell, cg(dW_cell), epoch_num, batch_num, batch_size)
        self.b_cell = optimizer(self.optimizer_kwargs).update(
            self.b_cell, cg(db_cell), epoch_num, batch_num, batch_size)
        self.W_reset = optimizer(self.optimizer_kwargs).update(
            self.W_reset, cg(dW_reset), epoch_num, batch_num, batch_size)
        self.b_reset = optimizer(self.optimizer_kwargs).update(
            self.b_reset, cg(db_reset), epoch_num, batch_num, batch_size)
        self.W_update = optimizer(self.optimizer_kwargs).update(
            self.W_update, cg(dW_update), epoch_num, batch_num, batch_size)
        self.b_update = optimizer(self.optimizer_kwargs).update(
            self.b_update, cg(db_update), epoch_num, batch_num, batch_size)
    # endif self.is_trainable
    return next_grad
def pass_backward(self, grad):
    """Backpropagation through time for this LSTM layer.

    Walks the timesteps in reverse, accumulating gradients for the
    forget, input, output, candidate-cell and final (projection)
    parameters, then applies one optimizer update per parameter.
    Returns the gradient with respect to the layer inputs.

    Assumes ``grad`` has exactly three axes (presumably
    (batch, time_steps, units)) — TODO confirm against the caller.
    """
    _, time_steps, _ = grad.shape
    next_grad = np.zeros_like(grad)
    if self.is_trainable:
        # Gradient accumulators for each parameter matrix/bias.
        dW_forget = np.zeros_like(self.W_forget)
        dW_input = np.zeros_like(self.W_input)
        dW_output = np.zeros_like(self.W_output)
        dW_cell = np.zeros_like(self.W_cell)
        dW_final = np.zeros_like(self.W_final)
        db_forget = np.zeros_like(self.b_forget)
        db_input = np.zeros_like(self.b_input)
        db_output = np.zeros_like(self.b_output)
        db_cell = np.zeros_like(self.b_cell)
        db_final = np.zeros_like(self.b_final)
        # Per-timestep gradients and the carries from timestep t+1.
        dstates = np.zeros_like(self.states)
        dcell = np.zeros_like(self.cell)
        dcell_tilde = np.zeros_like(self.cell_tilde)
        dforget = np.zeros_like(self.forget)
        dinput = np.zeros_like(self.input)
        doutput = np.zeros_like(self.output)
        dcell_next = np.zeros_like(self.cell)
        dstates_next = np.zeros_like(self.states)
        for t in np.arange(time_steps)[::-1]:  # reversed over timesteps
            dW_final += np.dot(self.states[:, t].T, grad[:, t])
            db_final += np.sum(grad[:, t], axis=0)
            # Hidden-state gradient: projection plus the carry from t+1.
            dstates[:, t] = np.dot(grad[:, t], self.W_final.T)
            dstates[:, t] += dstates_next[:, t]
            # NOTE(review): next_grad is recomputed from the full
            # dstates on every iteration and only the last iteration's
            # value survives — confirm this is intended.
            next_grad = np.dot(dstates, self.W_final)
            # Output-gate path.
            doutput[:, t] = activate(self.activation).forward(
                self.cell[:, t]) * dstates[:, t]
            doutput[:, t] = activate(self.gate_activation).backward(
                self.output[:, t]) * doutput[:, t]
            dW_output += np.dot(self.z[:, t].T, doutput[:, t])
            db_output += np.sum(doutput[:, t], axis=0)
            # Cell-state gradient: through the output gate plus the
            # carry from t+1.
            dcell[:, t] += self.output[:, t] * dstates[:, t] * activate(
                self.activation).backward(self.cell[:, t])
            dcell[:, t] += dcell_next[:, t]
            # NOTE(review): dcell_tilde is computed here (and its second
            # line applies the activation backward to dcell_tilde
            # itself, not to cell_tilde), but dW_cell below accumulates
            # from dcell instead — verify which was intended.
            dcell_tilde[:, t] = dcell[:, t] * self.input[:, t]
            dcell_tilde[:, t] = dcell_tilde[:, t] * activate(
                self.activation).backward(dcell_tilde[:, t])
            dW_cell += np.dot(self.z[:, t].T, dcell[:, t])
            db_cell += np.sum(dcell[:, t], axis=0)
            # Input-gate path.
            dinput[:, t] = self.cell_tilde[:, t] * dcell[:, t]
            dinput[:, t] = activate(self.gate_activation).backward(
                self.input[:, t]) * dinput[:, t]
            dW_input += np.dot(self.z[:, t].T, dinput[:, t])
            db_input += np.sum(dinput[:, t], axis=0)
            # Forget-gate path. NOTE(review): index [:, t - 1] wraps to
            # the last timestep when t == 0 via negative indexing —
            # presumably the initial cell lives there or is zero; verify.
            dforget[:, t] = self.cell[:, t - 1] * dcell[:, t]
            dforget[:, t] = activate(self.gate_activation).backward(
                self.forget[:, t]) * dforget[:, t]
            dW_forget += np.dot(self.z[:, t].T, dforget[:, t])
            db_forget += np.sum(dforget[:, t], axis=0)
            # Combine the four gate contributions; the first h_units
            # columns flow back into the previous hidden state.
            dz_forget = np.dot(dforget[:, t], self.W_forget.T)
            dz_input = np.dot(dinput[:, t], self.W_input.T)
            dz_output = np.dot(doutput[:, t], self.W_output.T)
            dz_cell = np.dot(dcell[:, t], self.W_cell.T)
            dz = dz_forget + dz_input + dz_output + dz_cell
            dstates_next[:, t] = dz[:, :self.h_units]
            dcell_next = self.forget * dcell
        # optimize weights and bias
        self.W_final = optimizer(self.optimizer_kwargs).update(
            self.W_final, cg(dW_final))
        self.b_final = optimizer(self.optimizer_kwargs).update(
            self.b_final, cg(db_final))
        self.W_forget = optimizer(self.optimizer_kwargs).update(
            self.W_forget, cg(dW_forget))
        self.b_forget = optimizer(self.optimizer_kwargs).update(
            self.b_forget, cg(db_forget))
        self.W_input = optimizer(self.optimizer_kwargs).update(
            self.W_input, cg(dW_input))
        self.b_input = optimizer(self.optimizer_kwargs).update(
            self.b_input, cg(db_input))
        self.W_output = optimizer(self.optimizer_kwargs).update(
            self.W_output, cg(dW_output))
        self.b_output = optimizer(self.optimizer_kwargs).update(
            self.b_output, cg(db_output))
        self.W_cell = optimizer(self.optimizer_kwargs).update(
            self.W_cell, cg(dW_cell))
        self.b_cell = optimizer(self.optimizer_kwargs).update(
            self.b_cell, cg(db_cell))
    # endif self.is_trainable
    return next_grad
def pass_backward(self, grad): input_num, input_depth, input_height, input_width = self.inputs.shape # initialize the gradient(s) dinputs = np.zeros(self.inputs.shape) if self.is_trainable: # initialize the gradient(s) dweights = np.zeros(self.weights.shape) dbias = np.zeros(self.bias.shape) pad_height, pad_width = get_pad(self.padding, input_height, input_width, self.strides[0], self.strides[1], self.kernel_size[0], self.kernel_size[1]) pad_size = (np.sum(pad_height) / 2).astype(int) if pad_size != 0: grad = grad[:, :, pad_size:-pad_size, pad_size:-pad_size] # dweights for f in np.arange(self.filter_num): # filter number for c in np.arange(input_depth): # input depth (channels) for h in np.arange(self.kernel_size[0]): # kernel height for w in np.arange( self.kernel_size[1]): # kernel width input_patch = self.inputs[:, c, h:input_height - self.kernel_size[0] + h + 1:self.strides[0], w:input_width - self.kernel_size[1] + w + 1:self.strides[1]] grad_patch = grad[:, f] dweights[f, c, h, w] = np.sum( input_patch * grad_patch) / input_num # dbias for f in np.arange(self.filter_num): # filter number dbias[f] = np.sum(grad[:, f]) / input_num # optimize the weights and bias self.weights = optimizer(self.weight_optimizer).update( self.weights, dweights) self.bias = optimizer(self.weight_optimizer).update( self.bias, dbias) # endif self.is_trainable # dinputs for b in np.arange(input_num): # batch number for f in np.arange(self.filter_num): # filter number for c in np.arange(input_depth): # input depth (channels) for h in np.arange(self.kernel_size[0]): # kernel height for w in np.arange( self.kernel_size[1]): # kernel width h_stride, w_stride = h * self.strides[ 0], w * self.strides[1] dinputs[b, c, h_stride:h_stride + self.kernel_size[0], w_stride:w_stride + self.kernel_size[1]] += self.weights[ f, c] * grad[b, f, h, w] return dinputs
def pass_backward(self, grad):
    """Backward pass of the embedding layer.

    Folds ``grad`` back through the cached one-hot encoded inputs to
    obtain the embedding-weight gradient, then applies the optimizer
    update. No gradient is returned (implicitly ``None``) — presumably
    this is the first layer of the network, so nothing propagates
    further back; confirm against the caller.
    """
    d_inputs = np.matmul(grad, self.one_hot_inputs)
    # Sum the per-sample contributions into one weight gradient.
    d_embeddings = np.sum(d_inputs, axis=0)
    # Fix: honour the is_trainable flag like every other layer in this
    # file; previously the weights were updated unconditionally.
    if self.is_trainable:
        self.weights = optimizer(self.weight_optimizer).update(
            self.weights, d_embeddings.T)