def test_setup_parameter_updates(self):
    w1 = tf.Variable(np.ones((4, 3)))
    b1 = tf.Variable(np.zeros((3,)))
    w2 = tf.Variable(np.ones((3, 2)))
    tf_utils.initialize_uninitialized_variables([w1, b1, w2])

    updates = 2 * tf_utils.make_single_vector([w1, b1, w2]) + 1
    updates = tf_utils.setup_parameter_updates([w1, b1, w2], updates)

    sess = tf_utils.tensorflow_session()
    for parameter, new_value in updates:
        sess.run(parameter.assign(new_value))

    np.testing.assert_array_almost_equal(
        self.eval(w1),
        3 * np.ones((4, 3)),
    )
    np.testing.assert_array_almost_equal(
        self.eval(b1),
        np.ones(3),
    )
    np.testing.assert_array_almost_equal(
        self.eval(w2),
        3 * np.ones((3, 2)),
    )
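# The two tf_utils helpers exercised above are not defined in this
# file. The following is a minimal sketch of how they could work,
# inferred purely from the test's expectations (the actual tf_utils
# implementations may differ): make_single_vector flattens parameters
# into a single 1-D tensor, and setup_parameter_updates slices a flat
# vector back into (parameter, new_value) pairs.
import numpy as np
import tensorflow as tf

def make_single_vector(parameters):
    # Flatten every parameter and concatenate them into one
    # 1-D tensor.
    return tf.concat(
        [tf.reshape(parameter, [-1]) for parameter in parameters],
        axis=0)

def setup_parameter_updates(parameters, parameter_update_vector):
    # Slice the flat update vector back into parameter-shaped
    # chunks, pairing each parameter with its new value.
    updates = []
    start = 0
    for parameter in parameters:
        shape = parameter.shape.as_list()
        size = int(np.prod(shape))
        new_value = tf.reshape(
            parameter_update_vector[start:start + size], shape)
        updates.append((parameter, new_value))
        start += size
    return updates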
def init_train_updates(self):
    training_outputs = self.network.training_outputs
    last_error = self.variables.last_error
    error_func = self.variables.loss
    mu = self.variables.mu

    # Adjust mu depending on whether the error improved since the
    # previous iteration.
    new_mu = tf.where(
        tf.less(last_error, error_func),
        mu * self.mu_update_factor,
        mu / self.mu_update_factor,
    )

    err_for_each_sample = flatten((self.target - training_outputs) ** 2)

    variables = self.network.variables
    params = [var for var in variables.values() if var.trainable]
    param_vector = make_single_vector(params)

    J = compute_jacobian(err_for_each_sample, params)
    J_T = tf.transpose(J)
    n_params = J.shape[1]

    # Levenberg-Marquardt step: solve (J^T J + mu * I) dx = J^T e
    # instead of explicitly inverting the matrix.
    parameter_update = tf.matrix_solve(
        tf.matmul(J_T, J) + new_mu * tf.eye(n_params.value),
        tf.matmul(J_T, tf.expand_dims(err_for_each_sample, 1)))
    updated_params = param_vector - flatten(parameter_update)

    updates = [(mu, new_mu)]
    parameter_updates = setup_parameter_updates(params, updated_params)
    updates.extend(parameter_updates)

    return updates
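# compute_jacobian is not shown here either. A possible
# implementation, inferred from its usage above, builds the matrix
# row by row: row i holds the derivatives of the i-th sample error
# with respect to every trainable parameter. This sketch assumes
# the number of samples is known statically; the actual library
# code may use a while loop instead.
def compute_jacobian(errors, parameters):
    n_samples = int(errors.shape[0])
    rows = []
    for i in range(n_samples):
        gradients = tf.gradients(errors[i], parameters)
        rows.append(tf.concat(
            [tf.reshape(gradient, [-1]) for gradient in gradients],
            axis=0))
    return tf.stack(rows)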
def init_train_updates(self):
    step = self.step
    inv_min_eigval = 1 / self.min_eigval

    variables = self.network.variables
    parameters = [var for var in variables.values() if var.trainable]
    param_vector = make_single_vector(parameters)

    gradients = tf.gradients(self.variables.loss, parameters)
    full_gradient = make_single_vector(gradients)

    # Collect the diagonal of the hessian: the second derivative
    # of the loss with respect to each parameter.
    second_derivatives = []
    for parameter, gradient in zip(parameters, gradients):
        second_derivative, = tf.gradients(gradient, parameter)
        second_derivatives.append(flatten(second_derivative))

    hessian_diag = tf.concat(second_derivatives, axis=0)

    # It's easier to clip the inverse hessian than the hessian
    # itself. The inverse of a diagonal matrix is cheap to compute
    # with an elementwise inverse operation.
    inv_hessian_diag = tf.clip_by_value(
        1 / hessian_diag,
        -inv_min_eigval,
        inv_min_eigval,
    )

    updates = setup_parameter_updates(
        parameters,
        param_vector - step * full_gradient * inv_hessian_diag)

    return updates
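# A quick numeric illustration of the clipping above, with made-up
# values: for min_eigval = 0.01, a near-zero second derivative of
# 1e-8 would produce an inverse of 1e8; clipping caps it at
# 1 / 0.01 = 100, which keeps the update step from exploding along
# nearly flat directions of the loss.
import numpy as np

min_eigval = 0.01
hessian_diag = np.array([2.0, 1e-8, -1e-8])
inv_hessian_diag = np.clip(
    1 / hessian_diag, -1 / min_eigval, 1 / min_eigval)
# -> array([   0.5,  100. , -100. ])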
def init_train_updates(self):
    penalty_const = asfloat(self.penalty_const)
    n_parameters = self.network.n_parameters

    variables = self.network.variables
    parameters = [var for var in variables.values() if var.trainable]
    param_vector = make_single_vector(parameters)

    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.loss, parameters)

    # Newton step: solve (H + c * I) dx = g rather than inverting
    # the hessian. The penalty constant keeps the matrix invertible.
    parameter_update = tf.matrix_solve(
        hessian_matrix + penalty_const * tf.eye(n_parameters),
        tf.reshape(full_gradient, [-1, 1]))

    updated_parameters = param_vector - flatten(parameter_update)
    updates = setup_parameter_updates(parameters, updated_parameters)

    return updates
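# find_hessian_and_gradient is not defined in this file. One way to
# implement it, sketched under the assumption that the hessian is
# built row by row from gradients of the flattened gradient (the
# actual implementation may differ):
def find_hessian_and_gradient(loss, parameters):
    gradients = tf.gradients(loss, parameters)
    full_gradient = tf.concat(
        [tf.reshape(gradient, [-1]) for gradient in gradients],
        axis=0)

    n_parameters = int(full_gradient.shape[0])
    hessian_rows = []
    for i in range(n_parameters):
        # Row i: derivatives of the i-th gradient entry with
        # respect to every parameter.
        second_derivatives = tf.gradients(full_gradient[i], parameters)
        hessian_rows.append(tf.concat(
            [tf.reshape(derivative, [-1])
             for derivative in second_derivatives],
            axis=0))

    return tf.stack(hessian_rows), full_gradient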
def init_train_updates(self):
    iteration = self.variables.iteration
    previous_delta = self.variables.prev_delta
    previous_gradient = self.variables.prev_gradient

    n_parameters = self.network.n_parameters
    variables = self.network.variables
    parameters = [var for var in variables.values() if var.trainable]
    param_vector = make_single_vector(parameters)

    gradients = tf.gradients(self.variables.loss, parameters)
    full_gradient = make_single_vector(gradients)

    beta = self.update_function(
        previous_gradient, full_gradient, previous_delta, self.epsilon)

    # Restarts are disabled. The commented-out version below would
    # reset the search direction to the negative gradient once
    # every n_parameters iterations.
    parameter_delta = -full_gradient + beta * previous_delta
    # parameter_delta = tf.where(
    #     tf.equal(tf.mod(iteration, n_parameters), 0),
    #     -full_gradient,
    #     -full_gradient + beta * previous_delta,
    # )

    step = self.find_optimal_step(param_vector, parameter_delta)
    updated_parameters = param_vector + step * parameter_delta
    updates = setup_parameter_updates(parameters, updated_parameters)

    # These values have to be computed before the assignments run.
    # Tensorflow executes operations in parallel, so without the
    # explicit dependency it can mix the update order and, for
    # example, store the current gradient as the previous one.
    with tf.control_dependencies([full_gradient, parameter_delta]):
        updates.extend([
            previous_gradient.assign(full_gradient),
            previous_delta.assign(parameter_delta),
            iteration.assign(iteration + 1),
        ])

    return updates
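# self.update_function computes the conjugate gradient coefficient
# beta; which formula is used is configurable. As one illustrative
# example (an assumption, not necessarily the configured default),
# the Fletcher-Reeves rule divides the squared norm of the new
# gradient by the squared norm of the previous one, with epsilon
# guarding against division by zero.
def fletcher_reeves(previous_gradient, full_gradient,
                    previous_delta, epsilon):
    return (
        tf.reduce_sum(full_gradient ** 2) /
        (tf.reduce_sum(previous_gradient ** 2) + epsilon)
    )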
def init_train_updates(self):
    self.init_variables()

    iteration = self.variables.iteration
    inv_hessian = self.variables.inv_hessian
    prev_params = self.variables.prev_params
    prev_full_gradient = self.variables.prev_full_gradient

    variables = self.network.variables
    params = [var for var in variables.values() if var.trainable]
    param_vector = make_single_vector(params)

    gradients = tf.gradients(self.variables.loss, params)
    full_gradient = make_single_vector(gradients)

    # On the first iteration there is no previous state, so the
    # inverse hessian approximation is left unchanged.
    new_inv_hessian = tf.where(
        tf.equal(iteration, 0),
        inv_hessian,
        self.update_function(
            inv_H=inv_hessian,
            delta_w=param_vector - prev_params,
            delta_grad=full_gradient - prev_full_gradient,
            epsilon=self.epsilon))

    param_delta = -dot(new_inv_hessian, full_gradient)
    step = self.find_optimal_step(param_vector, param_delta)
    updated_params = param_vector + step * param_delta
    updates = setup_parameter_updates(params, updated_params)

    # These values have to be computed before the assignments run.
    # Tensorflow executes operations in parallel, so without the
    # explicit dependency it can mix the update order and, for
    # example, store the current gradient as the previous one.
    required_variables = [new_inv_hessian, param_vector, full_gradient]
    with tf.control_dependencies(required_variables):
        updates.extend([
            inv_hessian.assign(new_inv_hessian),
            prev_params.assign(param_vector),
            prev_full_gradient.assign(full_gradient),
            iteration.assign(iteration + 1),
        ])

    return updates
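# Here self.update_function refreshes the inverse hessian
# approximation. As one illustrative example (an assumption about
# which quasi-Newton formula is configured), the classic BFGS
# update is
#     H' = (I - rho * s * y^T) H (I - rho * y * s^T) + rho * s * s^T,
# with s = delta_w, y = delta_grad and rho = 1 / (y^T s); epsilon
# guards against division by zero.
def bfgs(inv_H, delta_w, delta_grad, epsilon):
    n_parameters = int(inv_H.shape[0])
    identity = tf.eye(n_parameters)

    s = tf.reshape(delta_w, [-1, 1])     # column vector s
    y = tf.reshape(delta_grad, [-1, 1])  # column vector y
    rho = 1.0 / (tf.matmul(y, s, transpose_a=True) + epsilon)

    left = identity - rho * tf.matmul(s, y, transpose_b=True)
    right = identity - rho * tf.matmul(y, s, transpose_b=True)
    return (tf.matmul(left, tf.matmul(inv_H, right)) +
            rho * tf.matmul(s, s, transpose_b=True))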