def hestenes_stiefel(old_g, new_g, delta_w, epsilon=1e-7):
    """
    Hestenes-Stiefel beta update for the conjugate gradient method.
    """
    gradient_delta = new_g - old_g
    return safe_division(
        dot(gradient_delta, new_g),
        dot(delta_w, gradient_delta),
        epsilon,
    )

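# The betas in this module rely on helpers that aren't shown here.
# Below is a minimal NumPy sketch (np_safe_division and the _demo_*
# functions are illustrative, not part of the original module), under
# the assumption that `dot` and `outer` map to their NumPy equivalents
# and that `safe_division` guards against near-zero denominators.
def _demo_hestenes_stiefel():
    import numpy as np

    def np_safe_division(numerator, denominator, epsilon=1e-7):
        # Keep the denominator's sign, but never let its magnitude
        # drop below epsilon.
        if abs(denominator) < epsilon:
            denominator = epsilon if denominator >= 0 else -epsilon
        return numerator / denominator

    old_g = np.array([1.0, -2.0])
    new_g = np.array([0.5, -1.0])
    delta_w = np.array([0.1, 0.2])

    gradient_delta = new_g - old_g
    return np_safe_division(
        gradient_delta.dot(new_g),
        delta_w.dot(gradient_delta),
    )
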
def dfp(inv_H, delta_w, delta_grad, epsilon=1e-7):
    """
    Davidon-Fletcher-Powell (DFP) update for the inverse hessian.
    It's a rank-2 update formula, very similar to BFGS. It can
    suffer from round-off error and inaccurate line searches.
    """
    inv_H_dot_grad = dot(inv_H, delta_grad)
    x = safe_division(
        outer(delta_w, delta_w),
        dot(delta_grad, delta_w),
        epsilon,
    )
    y = safe_division(
        tf.matmul(outer(inv_H_dot_grad, delta_grad), inv_H),
        dot(delta_grad, inv_H_dot_grad),
        epsilon,
    )
    return inv_H - y + x

def sr1(inv_H, delta_w, delta_grad, epsilon=1e-7):
    """
    Symmetric rank 1 (SR1). Generates an update for the inverse
    hessian matrix by adding a symmetric rank-1 matrix. A valid
    rank-1 update doesn't always exist; in that case the update
    is skipped and the original inverse hessian is returned.
    """
    param = delta_w - dot(inv_H, delta_grad)
    denominator = dot(param, delta_grad)

    return tf.where(
        # This check protects against the cases when the update
        # doesn't exist. During certain iterations there might be
        # no rank-1 update for the matrix.
        tf.less(
            tf.abs(denominator),
            epsilon * tf.norm(param) * tf.norm(delta_grad),
        ),
        inv_H,
        inv_H + outer(param, param) / denominator,
    )

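# A hedged NumPy sketch of the skip condition above: when the
# denominator is tiny relative to the update vectors, the rank-1
# correction would explode numerically, so the inverse hessian is
# returned unchanged instead.
def _demo_sr1_skip():
    import numpy as np

    def np_sr1(inv_H, delta_w, delta_grad, epsilon=1e-7):
        param = delta_w - inv_H.dot(delta_grad)
        denominator = param.dot(delta_grad)
        threshold = (
            epsilon * np.linalg.norm(param) * np.linalg.norm(delta_grad))
        if abs(denominator) < threshold:
            return inv_H  # no valid rank-1 update this iteration
        return inv_H + np.outer(param, param) / denominator

    inv_H = np.eye(2)
    delta_grad = np.array([1.0, 0.0])
    # Chosen so that param = [0, 1] is orthogonal to delta_grad,
    # which makes the denominator exactly zero.
    delta_w = np.array([1.0, 1.0])
    assert np.allclose(np_sr1(inv_H, delta_w, delta_grad), inv_H)
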
def bfgs(inv_H, delta_w, delta_grad, epsilon=1e-7):
    """
    Broyden-Fletcher-Goldfarb-Shanno (BFGS) update for the inverse
    hessian. It can suffer from round-off error and inaccurate
    line searches.
    """
    n_parameters = int(inv_H.shape[0])
    I = tf.eye(n_parameters)

    rho = safe_reciprocal(dot(delta_grad, delta_w), epsilon)
    X = I - outer(delta_w, delta_grad) * rho
    X_T = tf.transpose(X)
    Z = rho * outer(delta_w, delta_w)

    return tf.matmul(X, tf.matmul(inv_H, X_T)) + Z

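# A rough, self-contained check (NumPy in place of the TF helpers,
# no epsilon guard; np_bfgs and _demo_bfgs_quadratic are illustrative
# names) that the BFGS update drives the estimate towards the true
# inverse hessian. The dfp or sr1 updates above could be dropped into
# the same harness.
import numpy as np


def np_bfgs(inv_H, delta_w, delta_grad):
    # NumPy version of the update above, without the epsilon guard.
    n = inv_H.shape[0]
    I = np.eye(n)
    rho = 1.0 / delta_w.dot(delta_grad)
    X = I - rho * np.outer(delta_w, delta_grad)
    return X.dot(inv_H).dot(X.T) + rho * np.outer(delta_w, delta_w)


def _demo_bfgs_quadratic():
    # For f(w) = 0.5 * w^T A w the hessian is A everywhere, and with
    # exact line searches BFGS recovers inv(A) in n = 2 updates.
    A = np.array([[3.0, 1.0], [1.0, 2.0]])
    w = np.array([1.0, 1.0])
    inv_H = np.eye(2)
    grad = A.dot(w)

    for _ in range(2):
        direction = -inv_H.dot(grad)
        # Exact line search, valid for a quadratic loss.
        step = -grad.dot(direction) / direction.dot(A).dot(direction)
        new_w = w + step * direction
        new_grad = A.dot(new_w)
        inv_H = np_bfgs(inv_H, new_w - w, new_grad - grad)
        w, grad = new_w, new_grad

    assert np.allclose(inv_H, np.linalg.inv(A))
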
def init_train_updates(self):
    self.init_variables()

    iteration = self.variables.iteration
    inv_hessian = self.variables.inv_hessian
    prev_params = self.variables.prev_params
    prev_full_gradient = self.variables.prev_full_gradient

    variables = self.network.variables
    params = [var for var in variables.values() if var.trainable]
    param_vector = make_single_vector(params)

    gradients = tf.gradients(self.variables.loss, params)
    full_gradient = make_single_vector(gradients)

    new_inv_hessian = tf.where(
        tf.equal(iteration, 0),
        inv_hessian,
        self.update_function(
            inv_H=inv_hessian,
            delta_w=param_vector - prev_params,
            delta_grad=full_gradient - prev_full_gradient,
            epsilon=self.epsilon,
        ),
    )
    param_delta = -dot(new_inv_hessian, full_gradient)
    step = self.find_optimal_step(param_vector, param_delta)
    updated_params = param_vector + step * param_delta
    updates = setup_parameter_updates(params, updated_params)

    # These values have to be computed before the assignments below.
    # Otherwise tensorflow, which tries to execute operations in
    # parallel, can mix the update order and, for example, the
    # previous gradient can end up equal to the current gradient.
    required_variables = [new_inv_hessian, param_vector, full_gradient]
    with tf.control_dependencies(required_variables):
        updates.extend([
            inv_hessian.assign(new_inv_hessian),
            prev_params.assign(param_vector),
            prev_full_gradient.assign(full_gradient),
            iteration.assign(iteration + 1),
        ])

    return updates

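# A framework-free sketch of the same training loop (NumPy, toy
# quadratic loss; np_bfgs is the illustrative helper defined above,
# and the fixed step stands in for find_optimal_step's line search):
# skip the inverse-hessian update on iteration 0, step along
# -inv_H * gradient, and remember the current parameters and gradient
# so the next iteration can form delta_w and delta_grad.
def _demo_quasi_newton_loop():
    import numpy as np

    A = np.array([[3.0, 1.0], [1.0, 2.0]])
    w = np.array([1.0, 1.0])
    inv_H = np.eye(2)
    prev_w = prev_grad = None

    for iteration in range(20):
        grad = A.dot(w)  # gradient of the toy loss 0.5 * w^T A w
        if iteration > 0:
            inv_H = np_bfgs(inv_H, w - prev_w, grad - prev_grad)
        prev_w, prev_grad = w, grad
        # Fixed step instead of a line search.
        w = w - 0.1 * inv_H.dot(grad)

    return w
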
def free_energy(visible_sample):
    with tf.name_scope('free-energy'):
        wx = tf.matmul(visible_sample, self.weight)
        wx_b = wx + self.hidden_bias
        visible_bias_term = dot(visible_sample, self.visible_bias)

        # We can get infinity when wx_b is relatively large (say,
        # 100): taking the exponent makes it even larger, and with
        # float32 it overflows to infinity. But when the number is
        # that large, the +1 before taking the logarithm makes no
        # difference to the result, so we can use wx_b as is.
        hidden_terms = tf.where(
            # exp(30) is such a big number that +1 won't
            # make any difference in the outcome.
            tf.greater(wx_b, 30),
            wx_b,
            tf.log1p(tf.exp(wx_b)),
        )
        hidden_term = tf.reduce_sum(hidden_terms, axis=1)

        return -(visible_bias_term + hidden_term)

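# A short NumPy aside showing the overflow this guard avoids: the
# naive log1p(exp(x)) blows up for large x, while returning x itself
# is correct to within float precision, because log(1 + exp(x))
# approaches x as x grows. TensorFlow's tf.nn.softplus applies the
# same kind of stabilization internally.
def _demo_stable_softplus():
    import numpy as np

    x = np.float32(100.0)
    with np.errstate(over='ignore'):
        naive = np.log1p(np.exp(x))  # exp(100) overflows float32
    guarded = x if x > 30 else np.log1p(np.exp(x))
    return naive, guarded            # (inf, 100.0)
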
def dai_yuan(old_g, new_g, delta_w, epsilon=1e-7):
    """
    Dai-Yuan beta update for the conjugate gradient method.
    """
    return safe_division(
        dot(new_g, new_g),
        dot(new_g - old_g, delta_w),
        epsilon,
    )


def liu_storey(old_g, new_g, delta_w, epsilon=1e-7):
    """
    Liu-Storey beta update for the conjugate gradient method.
    """
    return -safe_division(
        dot(new_g, new_g - old_g),
        dot(delta_w, old_g),
        epsilon,
    )


def polak_ribiere(old_g, new_g, delta_w, epsilon=1e-7):
    """
    Polak-Ribiere beta update for the conjugate gradient method.
    """
    return safe_division(
        dot(new_g, new_g - old_g),
        dot(old_g, old_g),
        epsilon,
    )


def fletcher_reeves(old_g, new_g, delta_w, epsilon=1e-7):
    """
    Fletcher-Reeves beta update for the conjugate gradient method.
    """
    return safe_division(
        dot(new_g, new_g),
        dot(old_g, old_g),
        epsilon,
    )
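

# An end-to-end sketch of how any of these betas plugs into a
# nonlinear conjugate gradient iteration (NumPy, toy quadratic loss;
# np_fletcher_reeves and _demo_conjugate_gradient are illustrative
# names). With exact line searches on a 2-dimensional quadratic,
# conjugate gradient reaches the minimum in two iterations.
def _demo_conjugate_gradient():
    import numpy as np

    def np_fletcher_reeves(old_g, new_g, epsilon=1e-7):
        return new_g.dot(new_g) / max(old_g.dot(old_g), epsilon)

    A = np.array([[3.0, 1.0], [1.0, 2.0]])
    w = np.array([1.0, 1.0])
    g = A.dot(w)  # gradient of the toy loss 0.5 * w^T A w
    direction = -g

    for _ in range(2):
        # Exact line search, valid for the quadratic loss.
        step = -g.dot(direction) / direction.dot(A).dot(direction)
        w = w + step * direction
        new_g = A.dot(w)
        direction = -new_g + np_fletcher_reeves(g, new_g) * direction
        g = new_g

    assert np.allclose(w, 0.0, atol=1e-8)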