def append(self, S, Y, remove=True):
    """
    Add a vector pair. Skipped if the inner product is small or
    non-positive.

    Update skip approach similar to that taken in SciPy Git master
    0a7bc723d105288f4b728305733ed8cb3c8feeb5, June 20 2020,
        scipy/optimize/lbfgsb_src/lbfgsb.f mainlb subroutine
    and in (3.9) of
        R. H. Byrd, P. Lu, J. Nocedal, and C. Zhu, "A limited memory
        algorithm for bound constrained optimization", SIAM Journal on
        Scientific Computing 16(5), 1190--1208, 1995

    Specifically, given a step s and gradient change y, the update is
    accepted only if

        y^T s > max(skip_atol, skip_rtol * sqrt(| s^T M s y^T M_inv y |))

    where the absolute value is used to work around possible underflows.

    Arguments:

    S       Step
    Y       Gradient change
    remove  Whether to remove any excess vector pairs

    Returns:

    (S_inner_Y, S_Y_added, S_Y_removed)

    with

    S_inner_Y    y^T s
    S_Y_added    Whether the given vector pair was added
    S_Y_removed  A list of any removed vector pairs
    """

    if is_function(S):
        S = (S,)
    if is_function(Y):
        Y = (Y,)
    if len(S) != len(Y):
        raise OptimizationException("Incompatible shape")

    if self._skip_rtol == 0.0:
        skip_tol = self._skip_atol
    else:
        skip_tol = max(
            self._skip_atol,
            self._skip_rtol * np.sqrt(
                abs(functions_inner(S, self._M(*S))
                    * functions_inner(self._M_inv(*Y), Y))))

    S_inner_Y = functions_inner(S, Y)
    if S_inner_Y > skip_tol:
        rho = 1.0 / S_inner_Y
        self._iterates.append((rho, functions_copy(S), functions_copy(Y)))
        if remove:
            S_Y_removed = self.remove()
        else:
            S_Y_removed = []
        return S_inner_Y, True, S_Y_removed
    else:
        return S_inner_Y, False, []
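
# Illustrative sketch, not part of the implementation: the skip test above,
# specialised to plain NumPy arrays with M = M_inv = I. All names here are
# hypothetical and for illustration only.
#
#     import numpy as np
#
#     def accept_pair(s, y, skip_atol=0.0, skip_rtol=1.0e-12):
#         # Accept only if y^T s > max(skip_atol,
#         #                            skip_rtol * sqrt(|s^T s * y^T y|))
#         skip_tol = max(skip_atol,
#                        skip_rtol * np.sqrt(abs(np.dot(s, s)
#                                                * np.dot(y, y))))
#         return np.dot(y, s) > skip_tol
#
#     accept_pair(np.array([1.0, 0.0]), np.array([0.5, 0.0]))  # True
#     accept_pair(np.array([1.0, 0.0]), np.array([0.0, 1.0]))  # False: y^T s = 0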
def wrapped_action(M):
    # Wrap an action callable so that it always receives, and returns, a
    # tuple of functions of matching length
    M_arg = M

    def M(*X):
        M_X = M_arg(*X)
        if is_function(M_X):
            M_X = (M_X,)
        if len(M_X) != len(X):
            raise OptimizationException("Incompatible shape")
        return M_X

    return M
def action(self, X, H_0=None, theta=1.0):
    """
    Compute the action of the approximate Hessian inverse.

    Implementation of the L-BFGS approximate Hessian inverse action, as in
    Algorithm 7.4 of
        J. Nocedal and S. J. Wright, Numerical optimization, second
        edition, Springer, 2006
    with theta scaling, see equation (3.2) of
        R. H. Byrd, P. Lu, J. Nocedal, and C. Zhu, "A limited memory
        algorithm for bound constrained optimization", SIAM Journal on
        Scientific Computing 16(5), 1190--1208, 1995

    Arguments:

    X      Vector on which to compute the action
    H_0    A callable defining the action of the unscaled initial
           approximate Hessian inverse. Must correspond to the action of a
           symmetric positive definite matrix. Must not modify input data.
           Identity used if not supplied.
    theta  Scaling parameter(s) for the initial approximate Hessian
           inverse: the action of (1 / theta) H_0 is used. A scalar, or a
           sequence with one value per control function.
    """

    if is_function(X):
        X = (X,)
    X = functions_copy(X)

    if H_0 is None:
        def H_0(*X):
            return X  # copy not required
    else:
        H_0 = wrapped_action(H_0)

    alphas = []
    for rho, S, Y in reversed(self._iterates):
        alpha = rho * functions_inner(S, X)
        functions_axpy(X, -alpha, Y)
        alphas.append(alpha)
    alphas.reverse()

    R = functions_copy(H_0(*X))
    if not np.all(theta == 1.0):
        if isinstance(theta, (int, np.integer, float, np.floating)):
            theta = [theta for r in R]
        assert len(R) == len(theta)
        for r, th in zip(R, theta):
            function_set_values(r, function_get_values(r) / th)

    assert len(self._iterates) == len(alphas)
    for (rho, S, Y), alpha in zip(self._iterates, alphas):
        beta = rho * functions_inner(R, Y)
        functions_axpy(R, alpha - beta, S)

    return R[0] if len(R) == 1 else R
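
# Illustrative sketch, not part of the implementation: the two-loop recursion
# of Algorithm 7.4 for plain NumPy arrays, with scalar theta scaling and an
# identity unscaled H_0, mirroring the method above. Names are hypothetical.
#
#     import numpy as np
#
#     def two_loop(g, pairs, theta=1.0):
#         # pairs is a sequence of (rho, s, y), oldest first, with
#         # rho = 1 / (y^T s); returns an approximation to H applied to g
#         q = g.copy()
#         alphas = []
#         for rho, s, y in reversed(pairs):
#             alpha = rho * np.dot(s, q)
#             q -= alpha * y
#             alphas.append(alpha)
#         alphas.reverse()
#         r = q / theta  # H_0 q with H_0 = (1 / theta) I
#         for (rho, s, y), alpha in zip(pairs, alphas):
#             beta = rho * np.dot(y, r)
#             r += (alpha - beta) * s
#         return r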
def inverse_action(self, X, B_0=None, B_approx_decomp=None):
    """
    Compute the action of the approximate Hessian.

    Computed using the representation in Theorem 7.4 of
        J. Nocedal and S. J. Wright, Numerical optimization, second
        edition, Springer, 2006

    Arguments:

    X                A Function, or list or tuple of Function objects,
                     defining the vector on which to compute the action
    B_0              A callable defining the action of the initial
                     approximate Hessian. Must correspond to the action of
                     a symmetric positive definite matrix. Must not modify
                     input data. Identity used if not supplied.
    B_approx_decomp  As returned by self.inverse_update_decomposition
    """

    if is_function(X):
        X = (X,)

    if B_0 is None:
        def B_0(*X):
            return X  # copy not required
    else:
        B_0 = wrapped_action(B_0)

    m = len(self._iterates)
    if B_approx_decomp is None:
        G, G_solve, F = self.inverse_update_decomposition(B_0=B_0)
    else:
        G, G_solve, F = B_approx_decomp

    F_X = np.zeros(2 * m, dtype=np.float64)
    for i in range(2 * m):
        F_X[i] = functions_inner(X, F[i])
    G_inv_F_x = G_solve(F_X)

    R = functions_copy(B_0(*X))
    for i in range(2 * m):
        functions_axpy(R, -G_inv_F_x[i], F[i])

    return R[0] if len(R) == 1 else R
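
# Usage sketch, hedged: with theta = 1 and B_0 equal to the inverse of H_0
# (both default to the identity), action and inverse_action apply
# approximately mutually inverse operators, which gives a sanity check on a
# supplied H_0/B_0 pair. For an H_approximation instance H_approx:
#
#     HX = H_approx.action(X)           # approximate Hessian inverse action
#     X2 = H_approx.inverse_action(HX)  # approximate Hessian action
#     # X2 should recover X up to round-off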
def l_bfgs(F, Fp, X0, m, s_atol, g_atol, converged=None, max_its=1000,
           H_0=None, theta_scale=True, block_theta_scale=True, delta=1.0,
           skip_atol=0.0, skip_rtol=1.0e-12, M=None, M_inv=None,
           c1=1.0e-4, c2=0.9, old_F_val=None,
           line_search_rank0=line_search_rank0_scipy_line_search,
           line_search_rank0_kwargs={}, comm=None):
    """
    Minimization using L-BFGS, following Algorithm 7.5 of
        J. Nocedal and S. J. Wright, Numerical optimization, second
        edition, Springer, 2006

    Theta scaling is similar to that in
        R. H. Byrd, P. Lu, J. Nocedal, and C. Zhu, "A limited memory
        algorithm for bound constrained optimization", SIAM Journal on
        Scientific Computing 16(5), 1190--1208, 1995
    but using y_k^T H_0 y_k in place of y_k^T y_k, and with a general H_0.

    On the first iteration, and when restarting due to line search
    failures, theta is set equal to

        theta = { sqrt(| g^T M_inv g |) / delta  if delta is not None
                { 1                              if delta is None

    where g is the (previous) gradient vector -- see 'Implementation' in
    section 6.1 of
        J. Nocedal and S. J. Wright, Numerical optimization, second
        edition, Springer, 2006

    Restart on line search failure is similar to the approach described in
        C. Zhu, R. H. Byrd, P. Lu, and J. Nocedal, "Algorithm 778:
        L-BFGS-B: Fortran subroutines for large-scale bound-constrained
        optimization", ACM Transactions on Mathematical Software 23(4),
        550--560, 1997
    but using a gradient descent direction defined using the H_0 norm in
    place of the identity.

    Arguments:

    F       A callable defining the functional
    Fp      A callable defining the functional gradient
    X0      Initial guess
    m       Keep the last m vector pairs
    s_atol  Step absolute tolerance. If None then the step norm
            convergence test is disabled.
    g_atol  Gradient absolute tolerance. If None then the gradient change
            norm convergence test is disabled.
    converged
            A callable of the form

                def converged(it, F_old, F_new, X_new, G_new, S, Y):

            where X_new, G_new, S, and Y are a Function, or a list or
            tuple of Function objects, and with

                it     The iteration number, an integer
                F_old  The old value of the functional
                F_new  The new value of the functional
                X_new  The new value of X, with F_new = F(X_new)
                G_new  The new gradient, Fp(X_new)
                S      The step
                Y      The gradient change

            and returning True if the problem has converged, and False
            otherwise. X_new, G_new, S, and Y must not be modified.
    max_its  Maximum number of iterations
    H_0     A callable defining the action of the unscaled initial
            approximate Hessian inverse. Must correspond to the action of
            a symmetric positive definite matrix. Must not modify input
            data. Identity used if not supplied. If supplied then M must
            be supplied.
    theta_scale  Whether to apply theta scaling (see above).
    block_theta_scale  Whether to apply separate theta scaling to each
            control function. Intended to be used with block-diagonal H_0
            (and M_inv if delta is not None).
    delta   Defines the initial theta scaling (see above). If delta is
            None then no scaling is applied on the first iteration, or
            when restarting due to line search failures.
    skip_atol  Skip absolute tolerance (see H_approximation.append)
    skip_rtol  Skip relative tolerance (see H_approximation.append)
    M       A callable defining the action of a symmetric positive
            definite matrix, used to define the step norm. Must not modify
            input data. Identity used if not supplied. If supplied then
            H_0 or M_inv must be supplied.
    M_inv   A callable defining the action of a symmetric positive
            definite matrix, used to define the gradient norm. Must not
            modify input data. Defaults to H_0. If supplied then M must be
            supplied.
    c1, c2  Parameters in the Wolfe conditions.
            See section 3.1 (where values are suggested) and (3.6) of
                J. Nocedal and S. J. Wright, Numerical optimization,
                second edition, Springer, 2006
    old_F_val  Value of F at the initial guess
    line_search_rank0  See below.
    line_search_rank0_kwargs  See below.
    comm    MPI communicator

    line_search_rank0 is a callable implementing a one dimensional line
    search algorithm, yielding a value of alpha_k such that the Wolfe
    conditions are satisfied, as defined in (3.6) of
        J. Nocedal and S. J. Wright, Numerical optimization, second
        edition, Springer, 2006
    for the case x_k = [0] and p_k = [1]. This has interface

        def line_search_rank0(
                F, Fp, c1, c2, old_F_val=None, old_Fp_val=None, **kwargs):

    with arguments:

    F       A callable, with a floating point input x, returning the
            value of the functional F(x)
    Fp      A callable, with a floating point input x, returning the
            value of the functional derivative F'(x)
    c1, c2  Parameters in the Wolfe conditions. See (3.6) of
                J. Nocedal and S. J. Wright, Numerical optimization,
                second edition, Springer, 2006
    old_F_val   Value of the functional at x = 0, F(x = 0)
    old_Fp_val  Value of the functional derivative at x = 0, F'(x = 0)

    and with remaining keyword arguments given by
    line_search_rank0_kwargs. This returns

        (alpha_k, new_F_val)

    with:

    alpha_k    Resulting value of alpha_k, or None on failure
    new_F_val  Value of the functional, F(x = alpha_k), or None if not
               available

    Returns:

    (X, its, conv, reason, F_calls, Fp_calls, H_approx)

    with:

    X         Result of the minimization
    its       Number of iterations taken
    conv      Whether converged
    reason    A string describing the reason for return
    F_calls   Number of functional evaluation calls
    Fp_calls  Number of functional gradient evaluation calls
    H_approx  The inverse Hessian approximation
    """

    logger = logging.getLogger("fenics_ice.l_bfgs")

    F_arg = F
    F_calls = [0]

    def F(*X):
        F_calls[0] += 1
        return F_arg(*X)

    Fp_arg = Fp
    Fp_calls = [0]

    def Fp(*X):
        Fp_calls[0] += 1
        Fp_val = Fp_arg(*X)
        if is_function(Fp_val):
            Fp_val = (Fp_val,)
        if len(Fp_val) != len(X):
            raise OptimizationException("Incompatible shape")
        return Fp_val

    if is_function(X0):
        X0 = (X0,)

    if converged is None:
        def converged(it, F_old, F_new, X_new, G_new, S, Y):
            return False
    else:
        converged_arg = converged

        def converged(it, F_old, F_new, X_new, G_new, S, Y):
            return converged_arg(
                it, F_old, F_new,
                X_new[0] if len(X_new) == 1 else X_new,
                G_new[0] if len(G_new) == 1 else G_new,
                S[0] if len(S) == 1 else S,
                Y[0] if len(Y) == 1 else Y)

    if (H_0 is None and M_inv is None) and M is not None:
        raise OptimizationException("If M is supplied, then H_0 or M_inv "
                                    "must be supplied")
    if (H_0 is not None or M_inv is not None) and M is None:
        raise OptimizationException("If H_0 or M_inv are supplied, then M "
                                    "must be supplied")

    if H_0 is None:
        def H_0(*X):
            return X  # copy not required
    else:
        H_0 = wrapped_action(H_0)

    if M is None:
        def M(*X):
            return X  # copy not required
    else:
        M = wrapped_action(M)

    if M_inv is None:
        M_inv = H_0
    else:
        M_inv = wrapped_action(M_inv)

    if comm is None:
        comm = function_comm(X0[0])

    X = functions_copy(X0)
    del X0
    if old_F_val is None:
        old_F_val = F(*X)
    old_Fp_val = functions_copy(Fp(*X))
    old_Fp_norm_sq = abs(functions_inner(M_inv(*old_Fp_val), old_Fp_val))

    H_approx = H_approximation(m=m,
                               skip_atol=skip_atol, skip_rtol=skip_rtol,
                               M=M, M_inv=M_inv)
    if theta_scale and delta is not None:
        if block_theta_scale and len(old_Fp_val) > 1:
            old_M_inv_Fp = M_inv(*old_Fp_val)
            assert len(old_Fp_val) == len(old_M_inv_Fp)
            theta = [
                np.sqrt(abs(function_inner(old_M_inv_Fp[i],
                                           old_Fp_val[i]))) / delta
                for i in range(len(old_Fp_val))]
            del old_M_inv_Fp
        else:
            theta = np.sqrt(old_Fp_norm_sq) / delta
    else:
        theta = 1.0

    it = 0
    conv = None
    reason = None
    logger.info(f"L-BFGS: Iteration {it:d}, "
                f"F calls {F_calls[0]:d}, "
                f"Fp calls {Fp_calls[0]:d}, "
                f"functional value {old_F_val:.6e}")
    while True:
        logger.debug(f"  Gradient norm = {np.sqrt(old_Fp_norm_sq):.6e}")
        if g_atol is not None and old_Fp_norm_sq <= g_atol * g_atol:
            conv = True
            reason = "g_atol reached"
            break

        minus_P = H_approx.action(old_Fp_val, H_0=H_0, theta=theta)
        if is_function(minus_P):
            minus_P = (minus_P,)
        alpha, old_Fp_val_rank0, new_F_val, new_Fp_val, new_Fp_val_rank0 = line_search(  # noqa: E501
            F, Fp, X, minus_P, c1=c1, c2=c2,
            old_F_val=old_F_val, old_Fp_val=old_Fp_val,
            line_search_rank0=line_search_rank0,
            line_search_rank0_kwargs=line_search_rank0_kwargs,
            comm=comm)
        if is_function(new_Fp_val):
            new_Fp_val = (new_Fp_val,)

        if alpha is None:
            if it == 0:
                raise OptimizationException(
                    "L-BFGS: Line search failure -- consider changing "
                    "l-bfgs 'delta_lbfgs' value")
            logger.warning("L-BFGS: Line search failure -- resetting "
                           "Hessian inverse approximation")
            H_approx.reset()
            if theta_scale and delta is not None:
                if block_theta_scale and len(old_Fp_val) > 1:
                    old_M_inv_Fp = M_inv(*old_Fp_val)
                    assert len(old_Fp_val) == len(old_M_inv_Fp)
                    theta = [
                        np.sqrt(abs(function_inner(old_M_inv_Fp[i],
                                                   old_Fp_val[i]))) / delta
                        for i in range(len(old_Fp_val))]
                    del old_M_inv_Fp
                else:
                    theta = np.sqrt(old_Fp_norm_sq) / delta
            else:
                theta = 1.0
            minus_P = H_approx.action(old_Fp_val, H_0=H_0, theta=theta)
            if is_function(minus_P):
                minus_P = (minus_P,)
            alpha, old_Fp_val_rank0, new_F_val, new_Fp_val, new_Fp_val_rank0 = line_search(  # noqa: E501
                F, Fp, X, minus_P, c1=c1, c2=c2,
                old_F_val=old_F_val, old_Fp_val=old_Fp_val,
                line_search_rank0=line_search_rank0,
                line_search_rank0_kwargs=line_search_rank0_kwargs,
                comm=comm)
            if is_function(new_Fp_val):
                new_Fp_val = (new_Fp_val,)
            if alpha is None:
                raise OptimizationException("L-BFGS: Line search failure")

        if new_F_val > old_F_val + c1 * alpha * old_Fp_val_rank0:
            raise OptimizationException("L-BFGS: Armijo condition not "
                                        "satisfied")
        if new_Fp_val_rank0 < c2 * old_Fp_val_rank0:
            raise OptimizationException("L-BFGS: Curvature condition not "
                                        "satisfied")
        if abs(new_Fp_val_rank0) > c2 * abs(old_Fp_val_rank0):
            logger.warning("L-BFGS: Strong curvature condition not "
                           "satisfied")

        S = functions_new(minus_P)
        functions_axpy(S, -alpha, minus_P)
        functions_axpy(X, 1.0, S)

        Y = functions_copy(new_Fp_val)
        functions_axpy(Y, -1.0, old_Fp_val)

        S_inner_Y, S_Y_added, S_Y_removed = H_approx.append(S, Y,
                                                            remove=True)
        if S_Y_added:
            if theta_scale:
                H_0_Y = H_0(*Y)
                if block_theta_scale and len(Y) > 1:
                    assert len(S) == len(Y)
                    assert len(S) == len(H_0_Y)
                    theta = [abs(function_inner(H_0_y, y)
                                 / function_inner(s, y))
                             for s, y, H_0_y in zip(S, Y, H_0_Y)]
                else:
                    theta = functions_inner(H_0_Y, Y) / S_inner_Y
                del H_0_Y
        else:
            logger.warning(f"L-BFGS: Iteration {it + 1:d}, small or "
                           f"negative inner product {S_inner_Y:.6e} -- "
                           f"update skipped")
        del S_Y_removed

        it += 1
        logger.info(f"L-BFGS: Iteration {it:d}, "
                    f"F calls {F_calls[0]:d}, "
                    f"Fp calls {Fp_calls[0]:d}, "
                    f"functional value {new_F_val:.6e}")

        if s_atol is not None:
            s_norm_sq = abs(functions_inner(S, M(*S)))
            logger.debug(f"  Change norm = {np.sqrt(s_norm_sq):.6e}")
            if s_norm_sq <= s_atol * s_atol:
                conv = True
                reason = "s_atol reached"
                break
        if converged(it, old_F_val, new_F_val, X, new_Fp_val, S, Y):
            conv = True
            reason = "converged"
            break
        if it >= max_its:
            conv = False
            reason = "max_its reached"
            break

        old_F_val = new_F_val
        old_Fp_val = new_Fp_val
        del new_F_val, new_Fp_val, new_Fp_val_rank0
        old_Fp_norm_sq = abs(functions_inner(M_inv(*old_Fp_val),
                                             old_Fp_val))

    assert conv is not None
    assert reason is not None
    return (X[0] if len(X) == 1 else X,
            it, conv, reason, F_calls[0], Fp_calls[0], H_approx)
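
# Illustrative sketch, not part of the implementation: the theta scaling used
# above, for plain NumPy arrays with H_0 = M_inv = I. Names are hypothetical.
#
#     import numpy as np
#
#     def initial_theta(g, delta):
#         # First iteration, or restart after a line search failure:
#         # theta = sqrt(|g^T M_inv g|) / delta, or 1 if delta is None
#         return 1.0 if delta is None else np.sqrt(abs(np.dot(g, g))) / delta
#
#     def updated_theta(s, y):
#         # After an accepted vector pair update:
#         # theta = y^T H_0 y / y^T s
#         return np.dot(y, y) / np.dot(y, s)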
def line_search(F, Fp, X, minus_P, c1=1.0e-4, c2=0.9,
                old_F_val=None, old_Fp_val=None,
                line_search_rank0=line_search_rank0_scipy_line_search,
                line_search_rank0_kwargs={}, comm=None):
    Fp = wrapped_action(Fp)

    if is_function(X):
        X_rank1 = (X,)
    else:
        X_rank1 = X
    del X

    if is_function(minus_P):
        minus_P = (minus_P,)
    if len(minus_P) != len(X_rank1):
        raise OptimizationException("Incompatible shape")

    if comm is None:
        comm = function_comm(X_rank1[0])
    comm = comm.Dup()

    last_F = [None, None]

    def F_rank0(x):
        X_rank0 = x
        del x
        X = functions_copy(X_rank1)
        functions_axpy(X, -X_rank0, minus_P)
        last_F[0] = float(X_rank0)
        last_F[1] = F(*X)
        return last_F[1]

    last_Fp = [None, None, None]

    def Fp_rank0(x):
        X_rank0 = x
        del x
        X = functions_copy(X_rank1)
        functions_axpy(X, -X_rank0, minus_P)
        last_Fp[0] = float(X_rank0)
        last_Fp[1] = functions_copy(Fp(*X))
        last_Fp[2] = -functions_inner(minus_P, last_Fp[1])
        return last_Fp[2]

    if old_F_val is None:
        old_F_val = F_rank0(0.0)

    if old_Fp_val is None:
        old_Fp_val_rank0 = Fp_rank0(0.0)
    else:
        if is_function(old_Fp_val):
            old_Fp_val = (old_Fp_val,)
        if len(old_Fp_val) != len(X_rank1):
            raise OptimizationException("Incompatible shape")
        old_Fp_val_rank0 = -functions_inner(minus_P, old_Fp_val)
    del old_Fp_val

    if comm.rank == 0:
        def F_rank0_bcast(x):
            comm.bcast(("F_rank0", (x,)), root=0)
            return F_rank0(x)

        def Fp_rank0_bcast(x):
            comm.bcast(("Fp_rank0", (x,)), root=0)
            return Fp_rank0(x)

        alpha, new_F_val = line_search_rank0(
            F_rank0_bcast, Fp_rank0_bcast, c1, c2,
            old_F_val=old_F_val, old_Fp_val=old_Fp_val_rank0,
            **line_search_rank0_kwargs)
        comm.bcast(("return", (alpha, new_F_val)), root=0)
    else:
        while True:
            action, data = comm.bcast(None, root=0)
            if action == "F_rank0":
                X_rank0, = data
                F_rank0(X_rank0)
            elif action == "Fp_rank0":
                X_rank0, = data
                Fp_rank0(X_rank0)
            elif action == "return":
                alpha, new_F_val = data
                break
            else:
                raise OptimizationException(f"Unexpected action "
                                            f"'{action:s}'")
    comm.Free()

    if alpha is None:
        return None, old_Fp_val_rank0, None, None, None
    else:
        if new_F_val is None:
            if last_F[0] is not None and last_F[0] == alpha:
                new_F_val = last_F[1]
            else:
                new_F_val = F_rank0(alpha)

        if last_Fp[0] is not None and last_Fp[0] == alpha:
            new_Fp_val_rank1 = last_Fp[1]
            new_Fp_val_rank0 = last_Fp[2]
        else:
            new_Fp_val_rank0 = Fp_rank0(alpha)
            assert last_Fp[0] == alpha
            new_Fp_val_rank1 = last_Fp[1]
            assert last_Fp[2] == new_Fp_val_rank0

        return (alpha, old_Fp_val_rank0, new_F_val,
                new_Fp_val_rank1[0] if len(new_Fp_val_rank1) == 1 else new_Fp_val_rank1,  # noqa: E501
                new_Fp_val_rank0)
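
# Illustrative sketch, not part of the implementation: the rank-0
# coordination pattern used in line_search above. Rank 0 drives the serial
# line search and broadcasts each requested evaluation point; the remaining
# ranks loop, taking part in each collective evaluation, until a "return"
# message arrives. A minimal mpi4py analogue, with hypothetical names:
#
#     from mpi4py import MPI
#
#     comm = MPI.COMM_WORLD.Dup()
#     evaluate = lambda *args: None  # stand-in for a collective F/Fp call
#     if comm.rank == 0:
#         for x in (0.0, 0.5, 1.0):  # stand-in for the line search iterates
#             comm.bcast(("F_rank0", (x,)), root=0)
#             evaluate(x)
#         comm.bcast(("return", (1.0, None)), root=0)
#     else:
#         while True:
#             action, data = comm.bcast(None, root=0)
#             if action == "return":
#                 break
#             evaluate(*data)
#     comm.Free()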
def minimize_l_bfgs(forward, M0, m, s_atol, g_atol, J0=None, manager=None,
                    **kwargs):
    if not isinstance(M0, Sequence):
        (x,), optimization_data = minimize_l_bfgs(
            forward, (M0,), m, s_atol, g_atol, J0=J0, manager=manager,
            **kwargs)
        return x, optimization_data

    M0 = [m0 if is_function(m0) else m0.m() for m0 in M0]
    if manager is None:
        manager = _manager()

    M = [function_new(m0, static=function_is_static(m0),
                      cache=function_is_cached(m0),
                      checkpoint=function_is_checkpointed(m0))
         for m0 in M0]

    last_F = [None, None, None]
    if J0 is not None:
        last_F[0] = functions_copy(M0)
        last_F[1] = M0
        last_F[2] = J0

    @restore_manager
    def F(*X, force=False):
        if not force and last_F[0] is not None:
            change_norm = 0.0
            assert len(X) == len(last_F[0])
            for m, last_m in zip(X, last_F[0]):
                change = function_copy(m)
                function_axpy(change, -1.0, last_m)
                change_norm = max(change_norm,
                                  function_linf_norm(change))
            if change_norm == 0.0:
                return last_F[2].value()

        last_F[0] = functions_copy(X)
        functions_assign(M, X)
        clear_caches(*M)

        set_manager(manager)
        manager.reset()
        manager.stop()
        clear_caches()

        last_F[1] = M
        manager.start()
        last_F[2] = forward(last_F[1])
        manager.stop()
        return last_F[2].value()

    def Fp(*X):
        F(*X, force=last_F[1] is None)
        dJ = manager.compute_gradient(last_F[2], last_F[1])
        if manager._cp_method not in ["memory", "periodic_disk"]:
            last_F[1] = None
        return dJ

    X, its, conv, reason, F_calls, Fp_calls, H_approx = l_bfgs(
        F, Fp, M0, m, s_atol, g_atol, comm=manager.comm(), **kwargs)

    if is_function(X):
        X = (X,)
    return X, (its, conv, reason, F_calls, Fp_calls, H_approx)
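
# Usage sketch, hedged; everything apart from minimize_l_bfgs is
# hypothetical. forward receives the control (a Function, or a list of
# Function objects for multiple controls), records the forward model on the
# manager, and returns the functional:
#
#     def forward(m):
#         ...  # run the forward model, recording it on the manager
#         return J
#
#     x, (its, conv, reason, F_calls, Fp_calls, H_approx) = minimize_l_bfgs(
#         forward, m0, m=30, s_atol=None, g_atol=1.0e-8)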