def minimize_frank_wolfe(
    fun,
    x0,
    lmo,
    x0_rep=None,
    variant='vanilla',
    jac="2-point",
    step="backtracking",
    lipschitz=None,
    args=(),
    max_iter=400,
    tol=1e-12,
    callback=None,
    verbose=0,
    eps=1e-8,
):
    r"""Frank-Wolfe algorithm.

    Implements the Frank-Wolfe algorithm; see :ref:`frank_wolfe` for a more
    detailed description.

    Args:
      fun : callable
          The objective function to be minimized.

              ``fun(x, *args) -> float``

          where x is a 1-D array with shape (n,) and `args` is a tuple of the
          fixed parameters needed to completely specify the function.

      x0 : array-like
          Initial guess for the solution.

      lmo : callable
          Linear minimization oracle. Called as ``lmo(u, x, active_set)``,
          where u is a vector of the same size as x0. Returns the update
          direction, the Frank-Wolfe and away vertex representations, and the
          maximum admissible step-size.

      x0_rep : immutable
          Used to initialize the active set when variant == 'pairwise'.

      variant : {'vanilla', 'pairwise'}
          Determines which Frank-Wolfe variant to use, along with lmo. The
          pairwise variant sets up and updates an active set of vertices; this
          is needed to make sure the iterate does not move out of the
          constraint set when using a pairwise LMO.

      jac : {callable, '2-point', bool}, optional
          Method for computing the gradient vector. If it is a callable, it
          should be a function that returns the gradient vector:

              ``jac(x, *args) -> array_like, shape (n,)``

          where x is an array with shape (n,) and `args` is a tuple with the
          fixed parameters. Alternatively, the string '2-point' selects a
          finite difference scheme for numerical estimation of the gradient.
          If `jac` is a Boolean and is True, `fun` is assumed to return the
          gradient along with the objective function. If False, the gradient
          will be estimated using '2-point' finite difference estimation.

      step : str or callable, optional
          Step-size strategy to use. Should be one of

              - "backtracking", will use the backtracking line-search from
                [PANJ2020]_

              - "DR", will use the Demyanov-Rubinov step-size. This step-size
                minimizes a quadratic upper bound of the objective using the
                gradient's Lipschitz constant, passed in the keyword argument
                `lipschitz`. [P2018]_

              - "sublinear", will use a decreasing step-size of the form
                2/(k+2). [J2013]_

              - callable, if step is a callable function, it will use the
                step-size returned by step(locals()).

      lipschitz : None or float, optional
          Estimate of the Lipschitz constant of the gradient. Required when
          step="DR".

      max_iter : integer, optional
          Maximum number of iterations.

      tol : float, optional
          Tolerance of the stopping criterion. The algorithm will stop
          whenever the Frank-Wolfe gap is below tol or the maximum number of
          iterations is exceeded.

      callback : callable, optional
          Callback to execute at each iteration. If the callable returns False
          then the algorithm will immediately return.

      eps : float or ndarray
          If jac is approximated, use this value for the step size.

      verbose : int, optional
          Verbosity level.

    Returns:
      scipy.optimize.OptimizeResult
          The optimization result represented as a
          ``scipy.optimize.OptimizeResult`` object. Important attributes are:
          ``x`` the solution array, ``success`` a Boolean flag indicating if
          the optimizer exited successfully and ``message`` which describes
          the cause of the termination. See `scipy.optimize.OptimizeResult`
          for a description of other attributes.

    References:
      .. [J2013] Jaggi, Martin. `"Revisiting Frank-Wolfe: Projection-Free
        Sparse Convex Optimization."
        <http://proceedings.mlr.press/v28/jaggi13-supp.pdf>`_ ICML 2013.

      .. [P2018] Pedregosa, Fabian. `"Notes on the Frank-Wolfe Algorithm"
        <http://fa.bianp.net/blog/2018/notes-on-the-frank-wolfe-algorithm-part-i/>`_,
        2018.
      .. [PANJ2020] Pedregosa, Fabian, Armin Askari, Geoffrey Negiar, and
        Martin Jaggi. `"Step-Size Adaptivity in Projection-Free Optimization."
        <https://arxiv.org/pdf/1806.05123.pdf>`_ arXiv:1806.05123 (2020).

    Examples:
      * :ref:`sphx_glr_auto_examples_frank_wolfe_plot_sparse_benchmark.py`
      * :ref:`sphx_glr_auto_examples_frank_wolfe_plot_vertex_overlap.py`
    """
    x0 = np.asanyarray(x0, dtype=float)
    if tol < 0:
        raise ValueError("Tol must be non-negative")
    x = x0.copy()
    if variant == 'vanilla':
        active_set = None
    elif variant == 'pairwise':
        active_set = defaultdict(float)
        active_set[x0_rep] = 1.0
    else:
        raise ValueError("Variant must be one of {'vanilla', 'pairwise'}.")

    lipschitz_t = None
    step_size = None
    if lipschitz is not None:
        lipschitz_t = lipschitz

    func_and_grad = utils.build_func_grad(jac, fun, args, eps)

    f_t, grad = func_and_grad(x)
    old_f_t = None
    for it in range(max_iter):
        update_direction, fw_vertex_rep, away_vertex_rep, max_step_size = lmo(
            -grad, x, active_set)
        norm_update_direction = linalg.norm(update_direction) ** 2
        certificate = np.dot(update_direction, -grad)

        # .. compute an initial estimate for the ..
        # .. Lipschitz estimate if not given ..
        if lipschitz_t is None:
            eps = 1e-3
            grad_eps = func_and_grad(x + eps * update_direction)[1]
            lipschitz_t = linalg.norm(grad - grad_eps) / (
                eps * np.sqrt(norm_update_direction))
            if verbose:
                print("Estimated L_t = %s" % lipschitz_t)

        if certificate <= tol:
            break
        if callable(step):
            step_size = step(locals())
            f_next, grad_next = func_and_grad(x + step_size * update_direction)
        elif step == "backtracking":
            step_size, lipschitz_t, f_next, grad_next = backtracking_step_size(
                x,
                f_t,
                old_f_t,
                func_and_grad,
                certificate,
                lipschitz_t,
                max_step_size,
                update_direction,
                norm_update_direction,
            )
        elif step == "DR":
            if lipschitz is None:
                raise ValueError(
                    'lipschitz needs to be specified with step="DR"')
            step_size = min(
                certificate / (norm_update_direction * lipschitz_t),
                max_step_size)
            f_next, grad_next = func_and_grad(x + step_size * update_direction)
        elif step == "sublinear":
            # .. without knowledge of the Lipschitz constant ..
            # .. we take the sublinear 2/(k+2) step-size ..
            step_size = 2.0 / (it + 2)
            f_next, grad_next = func_and_grad(x + step_size * update_direction)
        else:
            raise ValueError("Invalid option step=%s" % step)
        if callback is not None:
            if callback(locals()) is False:  # pylint: disable=g-bool-id-comparison
                break
        x += step_size * update_direction
        if variant == 'pairwise':
            update_active_set(active_set, fw_vertex_rep, away_vertex_rep,
                              step_size)
        old_f_t = f_t
        f_t, grad = f_next, grad_next
    if callback is not None:
        callback(locals())
    return optimize.OptimizeResult(
        x=x, nit=it, certificate=certificate, active_set=active_set)
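

# A minimal, self-contained usage sketch for ``minimize_frank_wolfe`` (kept as
# a private helper so nothing runs at import time). It minimizes a
# least-squares objective over an L1 ball of radius ``alpha`` with the
# 'vanilla' variant. The inline ``lmo`` is illustrative only: it returns the
# ``(update_direction, fw_vertex_rep, away_vertex_rep, max_step_size)`` tuple
# that ``minimize_frank_wolfe`` unpacks, with the vertex representations left
# as None since the vanilla variant does not use them. The names
# ``_example_frank_wolfe_l1`` and ``alpha`` are hypothetical.
def _example_frank_wolfe_l1(alpha=1.0, n_samples=20, n_features=5, seed=0):
    rng = np.random.RandomState(seed)
    A = rng.randn(n_samples, n_features)
    b = rng.randn(n_samples)

    def fun(x):
        # smooth objective: 0.5 * ||A x - b||^2
        return 0.5 * np.linalg.norm(A.dot(x) - b) ** 2

    def grad(x):
        return A.T.dot(A.dot(x) - b)

    def lmo(u, x, active_set=None):
        # L1-ball linear minimization oracle: the vertex of alpha * B_1 most
        # aligned with u is +/- alpha on the coordinate of largest |u|.
        idx = np.argmax(np.abs(u))
        vertex = np.zeros_like(x)
        vertex[idx] = alpha * np.sign(u[idx])
        # the update direction points from the current iterate to that vertex;
        # the maximum admissible step-size is 1.
        return vertex - x, None, None, 1.0

    x0 = np.zeros(n_features)
    # e.g. _example_frank_wolfe_l1().x gives the (sparse) approximate solution
    return minimize_frank_wolfe(fun, x0, lmo, jac=grad, tol=1e-10)
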
def minimize_proximal_gradient(
    fun,
    x0,
    prox=None,
    jac="2-point",
    tol=1e-6,
    max_iter=500,
    args=(),
    verbose=0,
    callback=None,
    step="backtracking",
    accelerated=False,
    eps=1e-8,
    max_iter_backtracking=1000,
    backtracking_factor=0.6,
    trace_certificate=False,
):
    """Proximal gradient descent.

    Solves problems of the form

            minimize_x f(x) + g(x)

    where f is a differentiable function and we have access to the proximal
    operator of g.

    Args:
      fun : callable
          The objective function to be minimized.

              ``fun(x, *args) -> float``

          where x is a 1-D array with shape (n,) and `args` is a tuple of the
          fixed parameters needed to completely specify the function.

      x0 : ndarray, shape (n,)
          Initial guess. Array of real elements of size (n,), where 'n' is the
          number of independent variables.

      jac : {callable, '2-point', bool}, optional
          Method for computing the gradient vector. If it is a callable, it
          should be a function that returns the gradient vector:

              ``jac(x, *args) -> array_like, shape (n,)``

          where x is an array with shape (n,) and `args` is a tuple with the
          fixed parameters. Alternatively, the string '2-point' selects a
          finite difference scheme for numerical estimation of the gradient.
          If `jac` is a Boolean and is True, `fun` is assumed to return the
          gradient along with the objective function. If False, the gradient
          will be estimated using '2-point' finite difference estimation.

      prox : callable, optional
          Proximal operator of g. Called as ``prox(x, step_size)``.

      args : tuple, optional
          Extra arguments passed to the objective function and its derivatives
          (`fun`, `jac` and `hess` functions).

      tol : float, optional
          Tolerance of the optimization procedure. The iteration stops when
          the gradient mapping (a generalization of the gradient to non-smooth
          functions) is below this tolerance.

      max_iter : int, optional
          Maximum number of iterations.

      verbose : int, optional
          Verbosity level, from 0 (no output) to 2 (output on each iteration).

      callback : callable, optional
          Callback executed at each iteration with the dictionary of local
          variables as its single argument. The algorithm will exit if the
          callback returns False.

      step : "backtracking" or callable
          Step-size strategy to use. "backtracking" will use a backtracking
          line-search, while a callable will use the value returned by
          step(locals()).

      accelerated : boolean
          Whether to use the accelerated variant of the algorithm.

      eps : float or ndarray
          If jac is approximated, use this value for the step size.

      max_iter_backtracking : int
          Maximum number of backtracking line-search iterations.

      backtracking_factor : float
          Factor by which the step-size is reduced at each backtracking
          iteration.

      trace_certificate : bool
          Whether to store the certificate of each iteration in the
          ``trace_certificate`` attribute of the result.

    Returns:
      res : scipy.optimize.OptimizeResult
          The optimization result represented as a
          ``scipy.optimize.OptimizeResult`` object. Important attributes are:
          ``x`` the solution array, ``success`` a Boolean flag indicating if
          the optimizer exited successfully and ``message`` which describes
          the cause of the termination. See `scipy.optimize.OptimizeResult`
          for a description of other attributes.

    References:
      Beck, Amir, and Marc Teboulle. "Gradient-based algorithms with
      applications to signal recovery."
      Convex optimization in signal processing and communications (2009).

    Examples:
      * :ref:`sphx_glr_auto_examples_plot_group_lasso.py`
    """
    x = np.asarray(x0).flatten()
    if max_iter_backtracking <= 0:
        raise ValueError("Line search iterations need to be greater than 0")

    if prox is None:

        def _prox(x, _):
            return x

        prox = _prox

    success = False
    certificate = np.nan

    func_and_grad = utils.build_func_grad(jac, fun, args, eps)

    # find initial step-size
    if step == "backtracking":
        step_size = 1.8 / utils.init_lipschitz(func_and_grad, x0)
    else:
        # to avoid step_size being undefined upon return
        step_size = None

    n_iterations = 0
    certificate_list = []
    # .. a while loop instead of a for loop ..
    # .. allows for infinite or floating point max_iter ..
    if not accelerated:
        fk, grad_fk = func_and_grad(x)
        while True:
            if callback is not None:
                if callback(locals()) is False:  # pylint: disable=g-bool-id-comparison
                    break
            # .. compute gradient and step size ..
            if callable(step):
                step_size = step(locals())
                x_next = prox(x - step_size * grad_fk, step_size)
                update_direction = x_next - x
                f_next, grad_next = func_and_grad(x_next)
            elif step == "backtracking":
                x_next = prox(x - step_size * grad_fk, step_size)
                update_direction = x_next - x
                step_size *= 1.1
                for _ in range(max_iter_backtracking):
                    f_next, grad_next = func_and_grad(x_next)
                    rhs = (fk + grad_fk.dot(update_direction)
                           + update_direction.dot(update_direction)
                           / (2.0 * step_size))
                    if f_next <= rhs:
                        # .. step size found ..
                        break
                    else:
                        # .. backtracking, reduce step size ..
                        step_size *= backtracking_factor
                        x_next = prox(x - step_size * grad_fk, step_size)
                        update_direction = x_next - x
                else:
                    warnings.warn(
                        "Maximum number of line-search iterations reached")
            elif step == "fixed":
                x_next = prox(x - step_size * grad_fk, step_size)
                update_direction = x_next - x
                f_next, grad_next = func_and_grad(x_next)
            else:
                raise ValueError("Step-size strategy not understood")
            certificate = np.linalg.norm((x - x_next) / step_size)
            if trace_certificate:
                certificate_list.append(certificate)
            x[:] = x_next
            fk = f_next
            grad_fk = grad_next
            if certificate < tol:
                success = True
                break
            if n_iterations >= max_iter:
                break
            else:
                n_iterations += 1
        else:
            warnings.warn(
                "minimize_proximal_gradient did not reach the desired tolerance level",
                RuntimeWarning,
            )
    else:
        tk = 1
        # .. a while loop instead of a for loop ..
        # .. allows for infinite or floating point max_iter ..
        yk = x.copy()
        while True:
            grad_fk = func_and_grad(yk)[1]
            if callback is not None:
                if callback(locals()) is False:  # pylint: disable=g-bool-id-comparison
                    break
            # .. compute gradient and step size ..
            if callable(step):
                current_step_size = step(locals())
                x_next = prox(yk - current_step_size * grad_fk,
                              current_step_size)
                t_next = (1 + np.sqrt(1 + 4 * tk * tk)) / 2
                yk = x_next + ((tk - 1.0) / t_next) * (x_next - x)
                x_prox = prox(
                    x_next - current_step_size * func_and_grad(x_next)[1],
                    current_step_size,
                )
                certificate = np.linalg.norm((x - x_prox) / current_step_size)
                tk = t_next
                x = x_next.copy()
            elif step == "backtracking":
                current_step_size = step_size
                x_next = prox(yk - current_step_size * grad_fk,
                              current_step_size)
                for _ in range(max_iter_backtracking):
                    update_direction = x_next - yk
                    if func_and_grad(x_next)[0] <= func_and_grad(yk)[0] + grad_fk.dot(
                            update_direction) + update_direction.dot(
                                update_direction) / (2.0 * current_step_size):
                        # .. step size found ..
                        break
                    else:
                        # .. backtracking, reduce step size ..
                        current_step_size *= backtracking_factor
                        x_next = prox(yk - current_step_size * grad_fk,
                                      current_step_size)
                else:
                    warnings.warn(
                        "Maximum number of line-search iterations reached")
                t_next = (1 + np.sqrt(1 + 4 * tk * tk)) / 2
                yk = x_next + ((tk - 1.0) / t_next) * (x_next - x)
                x_prox = prox(
                    x_next - current_step_size * func_and_grad(x_next)[1],
                    current_step_size,
                )
                certificate = np.linalg.norm((x - x_prox) / current_step_size)
                if trace_certificate:
                    certificate_list.append(certificate)
                tk = t_next
                x = x_next.copy()
            if certificate < tol:
                success = True
                break
            if n_iterations >= max_iter:
                break
            else:
                n_iterations += 1

    if n_iterations >= max_iter:
        warnings.warn(
            "minimize_proximal_gradient did not reach the desired tolerance level",
            RuntimeWarning,
        )
    return optimize.OptimizeResult(
        x=x,
        success=success,
        certificate=certificate,
        nit=n_iterations,
        step_size=step_size,
        trace_certificate=certificate_list,
    )
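

# A minimal, self-contained usage sketch for ``minimize_proximal_gradient``
# (kept as a private helper so nothing runs at import time). It solves a lasso
# problem 0.5 * ||A x - b||^2 + l1_reg * ||x||_1 by passing the
# soft-thresholding operator as ``prox``; the prox callable receives the point
# and the current step size, matching the calls made above. The names
# ``_example_proximal_gradient_lasso`` and ``l1_reg`` are hypothetical.
def _example_proximal_gradient_lasso(l1_reg=0.1, n_samples=20, n_features=5,
                                     seed=0):
    rng = np.random.RandomState(seed)
    A = rng.randn(n_samples, n_features)
    b = rng.randn(n_samples)

    def fun(x):
        # smooth part: least squares
        return 0.5 * np.linalg.norm(A.dot(x) - b) ** 2

    def grad(x):
        return A.T.dot(A.dot(x) - b)

    def prox_l1(z, step_size):
        # proximal operator of step_size * l1_reg * ||.||_1 (soft-thresholding)
        return np.sign(z) * np.maximum(np.abs(z) - step_size * l1_reg, 0.0)

    x0 = np.zeros(n_features)
    # e.g. _example_proximal_gradient_lasso().x gives the sparse solution
    return minimize_proximal_gradient(fun, x0, prox=prox_l1, jac=grad,
                                      accelerated=True)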