def init_train_updates(self):
    updates = super(LeakStepAdaptation, self).init_train_updates()

    alpha = self.alpha
    beta = self.beta
    leak_size = self.leak_size

    step = self.variables.step
    leak_average = self.variables.leak_average

    parameters = list(iter_parameters(self))
    gradients = T.grad(self.variables.error_func, wrt=parameters)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    # Leaky running average of the full gradient vector.
    leak_average_update = (
        (1 - leak_size) * leak_average + leak_size * full_gradient
    )
    # Adapt the step size based on the L2 norm of the averaged gradient.
    new_step = step + alpha * step * (
        beta * leak_average_update.norm(L=2) - step
    )

    updates.extend([
        (leak_average, leak_average_update),
        (step, new_step),
    ])

    return updates
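# A minimal NumPy sketch of the leak step adaptation rule above, with made-up
# toy values (alpha, beta, leak_size, the step, and the gradient below are
# illustrative assumptions, not values taken from the class):
#   leak_average <- (1 - leak_size) * leak_average + leak_size * gradient
#   step         <- step + alpha * step * (beta * ||leak_average||_2 - step)
import numpy as np

alpha, beta, leak_size = 0.001, 20.0, 0.01
step = 0.1
leak_average = np.zeros(3)
full_gradient = np.array([0.5, -0.2, 0.1])

leak_average = (1 - leak_size) * leak_average + leak_size * full_gradient
step = step + alpha * step * (beta * np.linalg.norm(leak_average) - step)
print(step, leak_average)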
def init_param_updates(self, layer, parameter):
    epoch = self.variables.epoch
    prev_first_moment = parameter.prev_first_moment
    prev_weighted_inf_norm = parameter.prev_weighted_inf_norm

    step = self.variables.step
    beta1 = self.beta1
    beta2 = self.beta2

    n_parameters = count_parameters(self)
    self.variables.hessian = theano.shared(
        value=asfloat(np.zeros((n_parameters, n_parameters))),
        name='hessian_inverse')
    parameters = list(iter_parameters(self))
    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters
    )

    gradient = T.grad(self.variables.error_func, wrt=parameter)

    first_moment = beta1 * prev_first_moment + (1 - beta1) * gradient
    weighted_inf_norm = T.maximum(beta2 * prev_weighted_inf_norm,
                                  T.abs_(gradient))

    parameter_delta = (
        (1 / (1 - beta1 ** epoch)) *
        (first_moment / (weighted_inf_norm + self.epsilon))
    )

    return [
        (prev_first_moment, first_moment),
        (prev_weighted_inf_norm, weighted_inf_norm),
        (parameter, parameter - step * parameter_delta),
        (self.variables.hessian, hessian_matrix),
    ]
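# A minimal NumPy sketch of the Adamax-style delta computed above (the extra
# Hessian bookkeeping is omitted); the hyperparameter values below are
# illustrative assumptions, not the class defaults:
#   m     <- beta1 * m + (1 - beta1) * g
#   u     <- max(beta2 * u, |g|)
#   delta  = m / ((1 - beta1 ** t) * (u + eps))
import numpy as np

beta1, beta2, epsilon, step, epoch = 0.9, 0.999, 1e-8, 0.002, 1
gradient = np.array([0.3, -0.1])
first_moment = np.zeros(2)
weighted_inf_norm = np.zeros(2)

first_moment = beta1 * first_moment + (1 - beta1) * gradient
weighted_inf_norm = np.maximum(beta2 * weighted_inf_norm, np.abs(gradient))
delta = (1 / (1 - beta1 ** epoch)) * (
    first_moment / (weighted_inf_norm + epsilon))
parameter = np.array([1.0, 2.0]) - step * delta
print(parameter)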
def init_train_updates(self):
    network_output = self.variables.network_output
    prediction_func = self.variables.train_prediction_func
    last_error = self.variables.last_error
    error_func = self.variables.error_func
    mu = self.variables.mu

    new_mu = ifelse(
        T.lt(last_error, error_func),
        mu * self.mu_update_factor,
        mu / self.mu_update_factor,
    )

    mse_for_each_sample = T.mean(
        (network_output - prediction_func) ** 2,
        axis=1
    )

    params = list(iter_parameters(self))
    param_vector = parameters2vector(self)

    J = compute_jaccobian(mse_for_each_sample, params)
    n_params = J.shape[1]

    updated_params = param_vector - T.nlinalg.matrix_inverse(
        J.T.dot(J) + new_mu * T.eye(n_params)
    ).dot(J.T).dot(mse_for_each_sample)

    updates = [(mu, new_mu)]
    parameter_updates = setup_parameter_updates(params, updated_params)
    updates.extend(parameter_updates)

    return updates
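# A minimal NumPy sketch of the Levenberg-Marquardt step computed above:
#   theta <- theta - (J^T J + mu * I)^-1 J^T e
# The Jacobian, errors, mu, and parameter vector are toy values for
# illustration only.
import numpy as np

J = np.array([[0.5, 1.0], [1.0, -0.3], [0.2, 0.8]])  # (samples x params)
e = np.array([0.1, -0.2, 0.05])                       # per-sample errors
mu = 0.01
theta = np.array([1.0, -1.0])

update = np.linalg.inv(J.T @ J + mu * np.eye(J.shape[1])) @ J.T @ e
theta = theta - update
print(theta)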
def init_train_updates(self):
    step = self.variables.step
    min_eigval = self.min_eigval
    parameters = list(iter_parameters(self))
    param_vector = parameters2vector(self)

    gradients = T.grad(self.variables.error_func, wrt=parameters)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    second_derivatives = []
    for parameter, gradient in zip(parameters, gradients):
        second_derivative = T.grad(gradient.sum(), wrt=parameter)
        second_derivatives.append(second_derivative.flatten())

    hessian_diag = T.concatenate(second_derivatives)
    # Clip diagonal elements that are too close to zero (keeping their sign)
    # to avoid dividing by near-zero values.
    hessian_diag = T.switch(
        T.abs_(hessian_diag) < min_eigval,
        T.switch(
            hessian_diag < 0,
            -min_eigval,
            min_eigval,
        ),
        hessian_diag
    )

    # Dividing the gradient by the Hessian diagonal element-wise is the same
    # as inverting the diagonal Hessian (taking the reciprocal of each
    # diagonal element) and multiplying it by the gradient. This formulation
    # is less obvious, but works faster.
    updated_parameters = (
        param_vector - step * full_gradient / hessian_diag
    )
    updates = setup_parameter_updates(parameters, updated_parameters)

    return updates
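# A small NumPy check of the comment above: dividing the gradient by the
# Hessian diagonal element-wise gives the same result as inverting the
# diagonal Hessian and multiplying it by the gradient. Values are toy data.
import numpy as np

gradient = np.array([0.4, -0.2, 0.1])
hessian_diag = np.array([2.0, 0.5, 4.0])

elementwise = gradient / hessian_diag
via_inverse = np.linalg.inv(np.diag(hessian_diag)) @ gradient
assert np.allclose(elementwise, via_inverse)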
def init_train_updates(self):
    step = self.variables.step
    previous_delta = self.variables.prev_delta
    previous_gradient = self.variables.prev_gradient

    n_parameters = count_parameters(self)
    parameters = list(iter_parameters(self))
    param_vector = parameters2vector(self)

    gradients = T.grad(self.variables.error_func, wrt=parameters)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    beta = self.update_function(previous_gradient, full_gradient,
                                previous_delta)
    parameter_delta = ifelse(
        T.eq(T.mod(self.variables.epoch, n_parameters), 1),
        -full_gradient,
        -full_gradient + beta * previous_delta
    )
    updated_parameters = param_vector + step * parameter_delta

    updates = [
        (previous_gradient, full_gradient),
        (previous_delta, parameter_delta),
    ]
    parameter_updates = setup_parameter_updates(parameters,
                                                updated_parameters)
    updates.extend(parameter_updates)

    return updates
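# The `update_function` above returns the conjugate gradient coefficient
# beta. A minimal NumPy sketch of one common choice (Fletcher-Reeves) and the
# resulting search direction; the exact formula depends on how the class is
# configured, so this particular choice is an illustrative assumption:
import numpy as np

previous_gradient = np.array([0.3, -0.4])
full_gradient = np.array([0.2, -0.1])
previous_delta = np.array([-0.3, 0.4])

beta = (full_gradient.dot(full_gradient) /
        previous_gradient.dot(previous_gradient))
parameter_delta = -full_gradient + beta * previous_delta
print(beta, parameter_delta)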
def init_param_updates(self, layer, parameter):
    epoch = self.variables.epoch
    prev_first_moment = parameter.prev_first_moment
    prev_second_moment = parameter.prev_second_moment

    # step = asfloat(self.variables.step)
    step = 0.001
    beta1 = asfloat(self.beta1)
    beta2 = asfloat(self.beta2)
    epsilon = asfloat(self.epsilon)

    gradient = T.grad(self.variables.error_func, wrt=parameter)

    n_parameters = count_parameters(self)
    self.variables.hessian = theano.shared(
        value=asfloat(np.zeros((n_parameters, n_parameters))),
        name='hessian_inverse')
    parameters = list(iter_parameters(self))
    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters)

    first_moment = (
        beta1 * prev_first_moment + asfloat(1. - beta1) * gradient
    )
    second_moment = (
        beta2 * prev_second_moment + asfloat(1. - beta2) * gradient ** 2
    )

    first_moment_bias_corrected = first_moment / (1. - beta1 ** epoch)
    second_moment_bias_corrected = second_moment / (1. - beta2 ** epoch)

    # Divide by the denominator (not multiply), as in the standard Adam rule.
    parameter_delta = first_moment_bias_corrected / (
        T.sqrt(second_moment_bias_corrected) + epsilon
    )

    return [
        (prev_first_moment, first_moment),
        (prev_second_moment, second_moment),
        (parameter, parameter - step * parameter_delta),
        (self.variables.hessian, hessian_matrix),
    ]
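# A minimal NumPy sketch of the bias-corrected Adam step computed above (the
# Hessian bookkeeping is omitted); the hyperparameter values are illustrative
# assumptions:
import numpy as np

beta1, beta2, epsilon, step, epoch = 0.9, 0.999, 1e-8, 0.001, 1
gradient = np.array([0.25, -0.5])
first_moment = np.zeros(2)
second_moment = np.zeros(2)

first_moment = beta1 * first_moment + (1 - beta1) * gradient
second_moment = beta2 * second_moment + (1 - beta2) * gradient ** 2
m_hat = first_moment / (1 - beta1 ** epoch)
v_hat = second_moment / (1 - beta2 ** epoch)
parameter = np.array([0.0, 0.0]) - step * m_hat / (np.sqrt(v_hat) + epsilon)
print(parameter)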
def init_train_updates(self):
    network_input = self.variables.network_input
    network_output = self.variables.network_output
    inv_hessian = self.variables.inv_hessian
    prev_params = self.variables.prev_params
    prev_full_gradient = self.variables.prev_full_gradient

    params = list(iter_parameters(self))
    param_vector = parameters2vector(self)

    gradients = T.grad(self.variables.error_func, wrt=params)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    new_inv_hessian = ifelse(
        T.eq(self.variables.epoch, 1),
        inv_hessian,
        self.update_function(inv_hessian,
                             param_vector - prev_params,
                             full_gradient - prev_full_gradient)
    )
    param_delta = -new_inv_hessian.dot(full_gradient)

    def prediction(step):
        # TODO: I need to update this ugly solution later
        updated_params = param_vector + step * param_delta

        layer_input = network_input
        start_pos = 0
        for layer in self.layers:
            for param in layer.parameters:
                end_pos = start_pos + param.size
                parameter_name, parameter_id = param.name.split('_')
                setattr(layer, parameter_name, T.reshape(
                    updated_params[start_pos:end_pos],
                    param.shape
                ))
                start_pos = end_pos
            layer_input = layer.output(layer_input)
        return layer_input

    def phi(step):
        return self.error(network_output, prediction(step))

    def derphi(step):
        error_func = self.error(network_output, prediction(step))
        return T.grad(error_func, wrt=step)

    step = asfloat(line_search(phi, derphi))
    updated_params = param_vector + step * param_delta

    updates = setup_parameter_updates(params, updated_params)
    updates.extend([
        (inv_hessian, new_inv_hessian),
        (prev_params, param_vector),
        (prev_full_gradient, full_gradient),
    ])

    return updates
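# The `update_function` above produces a new inverse Hessian approximation
# from the parameter difference and gradient difference. A minimal NumPy
# sketch of one standard choice, the BFGS inverse update, on toy vectors (the
# class may be configured with a different quasi-Newton formula, so this is
# an illustrative assumption):
import numpy as np

inv_hessian = np.eye(2)
s = np.array([0.1, -0.05])   # param_vector - prev_params
y = np.array([0.2, 0.15])    # full_gradient - prev_full_gradient

rho = 1.0 / y.dot(s)
identity = np.eye(2)
left = identity - rho * np.outer(s, y)
right = identity - rho * np.outer(y, s)
new_inv_hessian = left @ inv_hessian @ right + rho * np.outer(s, s)
print(new_inv_hessian)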
def init_train_updates(self):
    network_output = self.variables.network_output
    prediction_func = self.variables.train_prediction_func
    last_error = self.variables.last_error
    error_func = self.variables.error_func
    mu = self.variables.mu

    new_mu = ifelse(
        T.lt(last_error, error_func),
        mu * self.mu_update_factor,
        mu / self.mu_update_factor,
    )

    mse_for_each_sample = T.mean(
        (network_output - prediction_func) ** 2,
        axis=1
    )

    params = list(iter_parameters(self))
    param_vector = parameters2vector(self)

    # Compute the full Hessian and keep it in a shared variable.
    n_parameters = count_parameters(self)
    self.variables.hessian = theano.shared(
        value=asfloat(np.zeros((n_parameters, n_parameters))),
        name='hessian_inverse')
    parameters = list(iter_parameters(self))
    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters)

    J = compute_jaccobian(mse_for_each_sample, params)
    n_params = J.shape[1]

    updated_params = param_vector - T.nlinalg.matrix_inverse(
        J.T.dot(J) + new_mu * T.eye(n_params)
    ).dot(J.T).dot(mse_for_each_sample)

    updates = [
        (mu, new_mu),
        (self.variables.hessian, hessian_matrix),
    ]
    parameter_updates = setup_parameter_updates(params, updated_params)
    updates.extend(parameter_updates)

    return updates
def init_train_updates(self):
    n_parameters = count_parameters(self)
    parameters = list(iter_parameters(self))
    param_vector = parameters2vector(self)
    penalty_const = asfloat(self.penalty_const)

    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters
    )
    hessian_inverse = T.nlinalg.matrix_inverse(
        hessian_matrix + penalty_const * T.eye(n_parameters)
    )

    updated_parameters = param_vector - hessian_inverse.dot(full_gradient)
    updates = setup_parameter_updates(parameters, updated_parameters)

    return updates
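# A minimal NumPy sketch of the regularized Newton step computed above:
#   theta <- theta - (H + c * I)^-1 g
# The Hessian, gradient, penalty constant, and parameters are toy values for
# illustration only.
import numpy as np

hessian = np.array([[2.0, 0.3], [0.3, 1.5]])
gradient = np.array([0.4, -0.2])
penalty_const = 0.1
theta = np.array([1.0, 1.0])

theta = theta - np.linalg.inv(hessian + penalty_const * np.eye(2)) @ gradient
print(theta)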
def init_variables(self):
    super(QuasiNewton, self).init_variables()
    n_params = sum(p.get_value().size for p in iter_parameters(self))
    self.variables.update(
        inv_hessian=theano.shared(
            name='inv_hessian',
            value=asfloat(self.h0_scale * np.eye(int(n_params))),
        ),
        prev_params=theano.shared(
            name='prev_params',
            value=asfloat(np.zeros(n_params)),
        ),
        prev_full_gradient=theano.shared(
            name='prev_full_gradient',
            value=asfloat(np.zeros(n_params)),
        ),
    )
def init_param_updates(self, layer, parameter):
    n_parameters = count_parameters(self)
    self.variables.hessian = theano.shared(
        value=asfloat(np.zeros((n_parameters, n_parameters))),
        name='hessian_inverse')
    parameters = list(iter_parameters(self))
    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters)

    prev_mean_squred_grad = parameter.prev_mean_squred_grad
    step = self.variables.step

    gradient = T.grad(self.variables.error_func, wrt=parameter)

    mean_squred_grad = (
        self.decay * prev_mean_squred_grad +
        (1 - self.decay) * gradient ** 2
    )
    parameter_delta = gradient / T.sqrt(mean_squred_grad + self.epsilon)

    return [
        (prev_mean_squred_grad, mean_squred_grad),
        (parameter, parameter - step * parameter_delta),
    ]
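# A minimal NumPy sketch of the RMSProp-style delta computed above (the
# unused Hessian bookkeeping is omitted); decay, epsilon, and step are toy
# values for illustration:
import numpy as np

decay, epsilon, step = 0.95, 1e-7, 0.01
gradient = np.array([0.3, -0.6])
mean_squared_grad = np.zeros(2)

mean_squared_grad = decay * mean_squared_grad + (1 - decay) * gradient ** 2
delta = gradient / np.sqrt(mean_squared_grad + epsilon)
parameter = np.array([0.5, -0.5]) - step * delta
print(parameter)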