Example #1
def test_optimize(accelerated, loss, penalty):
    """Test a method on both the line_search and fixed step size strategy."""
    max_iter = 200
    for alpha in np.logspace(-1, 3, 3):
        obj = loss(A, b, alpha)
        if penalty is not None:
            prox = penalty(1e-3).prox
        else:
            prox = None
        opt = cp.minimize_proximal_gradient(
            obj.f_grad,
            np.zeros(n_features),
            prox=prox,
            jac=True,
            step="backtracking",
            max_iter=max_iter,
            accelerated=accelerated,
        )
        grad_x = obj.f_grad(opt.x)[1]
        assert certificate(opt.x, grad_x, prox) < 1e-5

        opt_2 = cp.minimize_proximal_gradient(
            obj.f_grad,
            np.zeros(n_features),
            prox=prox,
            jac=True,
            max_iter=max_iter,
            step=lambda x: 1 / obj.lipschitz,
            accelerated=accelerated,
        )
        grad_2x = obj.f_grad(opt_2.x)[1]
        assert certificate(opt_2.x, grad_2x, prox) < 1e-5
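
# The test above relies on module-level fixtures (A, b, n_features) and a
# `certificate` helper defined elsewhere in the test module. A minimal sketch
# of such a helper, assuming the usual prox-fixed-point criterion with a unit
# step size (names and step size are assumptions, not copt's actual code):
def certificate(x, grad_x, prox):
    if prox is None:
        def prox(z, step_size):
            return z
    return np.linalg.norm(x - prox(x - grad_x, 1.0))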
Example #2
    def run(self, n_iter):
        X, y, solver = self.X, self.y, self.solver
        n_features = X.shape[1]

        x0 = np.zeros(n_features)
        if n_iter == 0:
            self.beta = x0
            return

        f = cp.loss.LogLoss(X, y)
        g = cp.penalty.L1Norm(self.lmbd / X.shape[0])

        warnings.filterwarnings('ignore', category=RuntimeWarning)

        if solver == 'pgd':
            if self.line_search:
                step = 'backtracking'
            else:

                def step(x):
                    return 1.0 / f.lipschitz

            result = cp.minimize_proximal_gradient(
                f.f_grad,
                x0,
                g.prox,
                step=step,
                tol=0,
                max_iter=n_iter,
                jac=True,
                accelerated=self.accelerated,
            )
        elif solver == 'saga':
            step_size = 1.0 / (3 * f.max_lipschitz)
            result = cp.minimize_saga(
                f.partial_deriv,
                X,
                y,
                x0,
                prox=g.prox_factory(n_features),
                step_size=step_size,
                tol=0,
                max_iter=n_iter,
            )
        else:
            assert solver == 'svrg'
            step_size = 1.0 / (3 * f.max_lipschitz)
            result = cp.minimize_svrg(
                f.partial_deriv,
                X,
                y,
                x0,
                prox=g.prox_factory(n_features),
                step_size=step_size,
                tol=0,
                max_iter=n_iter,
            )

        self.beta = result.x
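
# The `run` method above is written for a benchopt-style Solver class and
# reads several attributes (X, y, solver, lmbd, line_search, accelerated)
# that are set elsewhere. A minimal, illustrative skeleton of such a class is
# sketched below; the attribute names follow the method above, everything
# else is an assumption rather than the benchmark's actual definition.
import warnings

import numpy as np
import copt as cp


class ProxGradSolver:
    def __init__(self, X, y, lmbd, solver='pgd',
                 line_search=True, accelerated=False):
        # data and hyperparameters consumed by run()
        self.X, self.y, self.lmbd = X, y, lmbd
        self.solver = solver
        self.line_search = line_search
        self.accelerated = accelerated

    # run(self, n_iter) as defined above would be placed here

    def get_result(self):
        # benchopt-style solvers expose the estimated coefficients after run()
        return self.beta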
Example #3
def test_vrtos_fl(A_data):
    """Test on overlapping group lasso"""
    n_samples, n_features = A_data.shape
    alpha = 1.0 / n_samples
    f = cp.utils.LogLoss(A_data, b, alpha)
    for beta in np.logspace(-3, 3, 3):
        pen = cp.utils.FusedLasso(beta)
        L = cp.utils.get_max_lipschitz(A_data, "logloss") + alpha / density

        opt_vrtos = cp.minimize_vrtos(
            f.partial_deriv,
            A_data,
            b,
            np.zeros(n_features),
            1 / (3 * L),
            alpha=alpha,
            max_iter=2000,
            prox_1=pen.prox_1_factory(n_features),
            prox_2=pen.prox_2_factory(n_features),
            tol=0,
        )

        opt_pgd = cp.minimize_proximal_gradient(
            f.f_grad, np.zeros(n_features), prox=pen.prox, max_iter=2000, tol=0
        )

        norm = np.linalg.norm(opt_pgd.x)
        if norm < 1e-10:
            norm = 1
        assert np.linalg.norm(opt_vrtos.x - opt_pgd.x) / norm < 1e-4

        # check also the gradient mapping
        ss = 1.0 / L
        grad = f.f_grad(opt_vrtos.x)[1]
        grad_map = (opt_vrtos.x - pen.prox(opt_vrtos.x - ss * grad, ss)) / ss
        assert np.linalg.norm(grad_map) < 1e-6
Example #4
import jax
from jax import numpy as np  # JAX's numpy, used inside the differentiated loss
import numpy as onp  # plain NumPy for the initial point
import matplotlib.pyplot as plt
from sklearn import datasets

import copt as cp

# .. construct a (random) regression dataset ..
X, y = datasets.make_regression()
n_samples, n_features = X.shape


def loss(w):
    """Squared error loss."""
    z = np.dot(X, w) - y
    return np.sum(z * z) / n_samples


# .. use JAX's value_and_grad to compute both the objective value and ..
# .. the gradient of the loss; this (value, gradient) pair is the ..
# .. format that COPT accepts when jac=True ..
f_grad = jax.value_and_grad(loss)

w0 = onp.zeros(n_features)

l1_ball = cp.penalty.L1Norm(0.1)
cb = cp.utils.Trace(lambda x: loss(x) + l1_ball(x))
sol = cp.minimize_proximal_gradient(f_grad,
                                    w0,
                                    prox=l1_ball.prox,
                                    callback=cb,
                                    jac=True)
plt.plot(cb.trace_fx, lw=3)
plt.yscale("log")
plt.xlabel("# Iterations")
plt.ylabel("Objective value")
plt.grid()
plt.show()
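
# For reference, the (value, gradient) pair produced by jax.value_and_grad
# can also be written by hand. A plain-NumPy sketch for the same squared
# error loss (reusing X, y, n_samples from above; the name f_grad_numpy is
# illustrative):
def f_grad_numpy(w):
    z = onp.dot(X, w) - y
    value = onp.sum(z * z) / n_samples
    grad = 2 * onp.dot(X.T, z) / n_samples
    return value, grad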
Example #5
def minimize_accelerated(*args, **kw):
    kw["accelerated"] = True
    return cp.minimize_proximal_gradient(*args, **kw)
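
# A minimal sketch of how such a wrapper might be called, assuming copt is
# imported as cp and numpy as np (the data and shapes here are illustrative):
import numpy as np
import copt as cp

X = np.random.randn(100, 20)
y = np.random.rand(100)
f = cp.utils.SquareLoss(X, y)
result = minimize_accelerated(f.f_grad, np.zeros(20), jac=True, tol=1e-8)
print(result.x)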
Example #6
all_betas = [0, 1e-2, 1e-1, 0.2]
all_trace_ls, all_trace_nols = [], []
out_img = []
for i, beta in enumerate(all_betas):
    print("beta = %s" % beta)
    G1 = cp.utils.GroupL1(beta, groups)

    def loss(x):
        return f(x) + G1(x)

    x0 = np.zeros(n_features)
    pgd = cp.minimize_proximal_gradient(
        f.f_grad,
        x0,
        G1.prox,
        jac=True,
        max_iter=max_iter,
        tol=1e-10,
        trace_certificate=True,
    )
    out_img.append(pgd.x)


# .. plot the results ..
fig, ax = plt.subplots(2, 4, sharey=False)
xlim = [0.02, 0.02, 0.1]
markevery = [1000, 1000, 100, 100]
for i, beta in enumerate(all_betas):
    ax[0, i].set_title("regularization=%s" % beta)
    ax[0, i].set_title("$regularization=%s" % beta)
    ax[0, i].plot(out_img[i])
Example #7
import numpy as np
import pylab as plt

import copt as cp

# .. construct (random) dataset ..
n_samples, n_features = 1000, 200
np.random.seed(0)
X = np.random.randn(n_samples, n_features)
y = np.random.rand(n_samples)

f = cp.utils.LogLoss(X, y)
step_size = 1. / f.lipschitz

cb_pgd = cp.utils.Trace(f)
result_pgd = cp.minimize_proximal_gradient(f.f_grad,
                                           np.zeros(n_features),
                                           step_size=step_size,
                                           callback=cb_pgd,
                                           tol=0,
                                           accelerated=False)

cb_apgd = cp.utils.Trace(f)
result_apgd = cp.minimize_proximal_gradient(f.f_grad,
                                            np.zeros(n_features),
                                            step_size=step_size,
                                            callback=cb_apgd,
                                            tol=0,
                                            accelerated=True)

# .. plot the result ..
fmin = min(np.min(cb_pgd.trace_fx), np.min(cb_apgd.trace_fx))
plt.title('Comparison of full gradient optimizers')
plt.plot(cb_apgd.trace_fx - fmin, lw=4, label='accelerated gradient descent')
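# .. the snippet above stops after the first curve; a plausible completion ..
# .. in the same plotting style (labels and scales are guesses, not the ..
# .. original script) might be ..
plt.plot(cb_pgd.trace_fx - fmin, lw=4, label='gradient descent')
plt.xlabel('Iterations')
plt.ylabel('Function suboptimality')
plt.yscale('log')
plt.legend()
plt.grid()
plt.show()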
Example #8
def sgl_estimator(
    x_train,
    y_train,
    x_test,
    y_test,
    groups,
    bias_index=None,
    beta0=None,
    alpha1=0.0,
    alpha2=0.0,
    eta=1.0,
    transform_type=None,
    max_iter=5000,
    tol=1e-6,
    verbose=0,
    suppress_warnings=True,
    cb_trace=False,
    accelerate=False,
    loss_type="logloss",
    clf_threshold=0.5,
    random_state=None,
):
    """Find solution to sparse group lasso problem by proximal gradient descent

    Solve sparse group lasso [1]_ problem for feature matrix `x_train` and
    target vector `y_train` with features partitioned into groups. Solve using
    the proximal gradient descent (PGD) algorithm. Compute accuracy and ROC AUC
    using `x_test` and `y_test`.

    Parameters
    ----------
    x_train : numpy.ndarray
        Training feature matrix

    y_train : numpy.ndarray
        Training target array

    x_test : numpy.ndarray
        Testing feature matrix

    y_test : numpy.ndarray
        Testing target array

    groups : numpy.ndarray
        Array of non-overlapping indices for each group. For example, if nine
        features are grouped into equal contiguous groups of three, then groups
        would be an ndarray like [[0, 1, 2], [3, 4, 5], [6, 7, 8]].

    bias_index : int or None, default=None
        The index of the bias feature in x_train and x_test. If None, assume
        no bias feature.

    beta0 : numpy.ndarray
        Initial guess for coefficient array

    alpha1 : float, default=0.0
        Group lasso regularization parameter. This encourages groupwise
        sparsity.

    alpha2 : float, default=0.0
        Lasso regularization parameter. This encourages within group sparsity.

    eta : float, default=1.0
        Target variable transformation parameter.

    transform_type : ["power", "exponentiation", None], default=None
        Type of transformation, see insight.target_transformation

    max_iter : int, default=5000
        Maximum number of iterations for PGD algorithm.

    tol : float, default=1e-6
        Convergence tolerance for PGD algorithm.

    verbose : int, default=0
        Verbosity flag for PGD algorithm.

    suppress_warnings : bool, default=True
        If True, suppress convergence warnings from PGD algorithm.
        This is useful for hyperparameter tuning when some combinations
        of hyperparameters may not converge.

    cb_trace : bool, default=False
        If True, include copt.utils.Trace() object in return

    accelerate : bool, default=False
        If True, use accelerated PGD algorithm, otherwise use standard PGD.

    loss_type : {'logloss', 'square', 'huber'}
        The type of loss function to use. If 'logloss', treat this problem as
        a binary classification problem using logistic regression. Otherwise,
        treat this problem as a regression problem using either the mean
        square error or the Huber loss.

    clf_threshold : float, default=0.5
        Decision threshold for binary classification

    random_state : int, numpy.RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If numpy.RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    dict
        dict with keys:
        alpha1 - group lasso regularization parameter,
        alpha2 - lasso regularization parameter,
        eta - target variable transformation parameter,
        beta_hat - estimate of the optimal beta,
        test - scores dict for test set,
        train - scores dict for train set,
        trace - copt.utils.Trace object if cb_trace is True, None otherwise

    References
    ----------
    .. [1]  Noah Simon, Jerome Friedman, Trevor Hastie & Robert Tibshirani,
        "A Sparse-Group Lasso," Journal of Computational and Graphical
        Statistics, vol. 22:2, pp. 231-245, 2013
        DOI: 10.1080/10618600.2012.681250
    """
    n_features = x_train.shape[1]

    rng = check_random_state(random_state)
    np.random.set_state(rng.get_state())

    if beta0 is None:
        beta0 = np.zeros(n_features)

    sg1 = SparseGroupL1(alpha1, alpha2, groups, bias_index=bias_index)

    if loss_type not in ["logloss", "square", "huber"]:
        raise ValueError("loss_type must be one of "
                         "['logloss', 'square', 'huber'].")

    ind = np.ones(x_train.shape[1], bool)
    if bias_index is not None:
        ind[bias_index] = False

    # Inverse transform target variables
    if loss_type != "logloss":
        y_train = target_transformation(y=y_train,
                                        eta=eta,
                                        transform_type=transform_type,
                                        direction="inverse")

    if loss_type == "logloss":
        f = cp.utils.LogLoss(x_train, y_train)
    elif loss_type == "huber":
        f = cp.utils.HuberLoss(x_train, y_train)
    else:
        f = cp.utils.SquareLoss(x_train, y_train)

    step_size = 1.0 / f.lipschitz

    if cb_trace:
        cb_tos = cp.utils.Trace(f)
    else:
        cb_tos = None

    if suppress_warnings:
        ctx_mgr = warnings.catch_warnings()
    else:
        ctx_mgr = contextlib.suppress()

    with ctx_mgr:
        # For some metaparameters, minimize_PGD or minimize_APGD might not
        # reach the desired tolerance level. This might be okay during
        # hyperparameter optimization. So ignore the warning if the user
        # specifies suppress_warnings=True
        if suppress_warnings:
            warnings.filterwarnings("ignore", category=RuntimeWarning)
        pgd = cp.minimize_proximal_gradient(
            f.f_grad,
            beta0,
            sg1.prox,
            step_size=step_size,
            max_iter=max_iter,
            tol=tol,
            verbose=verbose,
            callback=cb_tos,
            accelerated=accelerate,
        )

    beta_hat = np.copy(pgd.x)

    # Transform the target variables back to original
    if loss_type != "logloss":
        y_train = target_transformation(y=y_train,
                                        eta=eta,
                                        transform_type=transform_type,
                                        direction="forward")

    if loss_type == "logloss":
        train = classification_scores(x=x_train,
                                      y=y_train,
                                      beta_hat=beta_hat,
                                      clf_threshold=clf_threshold)
        test = classification_scores(x=x_test,
                                     y=y_test,
                                     beta_hat=beta_hat,
                                     clf_threshold=clf_threshold)
    else:
        train = regression_scores(
            x=x_train,
            y=y_train,
            beta_hat=beta_hat,
            eta=eta,
            transform_type=transform_type,
        )
        test = regression_scores(
            x=x_test,
            y=y_test,
            beta_hat=beta_hat,
            eta=eta,
            transform_type=transform_type,
        )

    return dict(
        alpha1=alpha1,
        alpha2=alpha2,
        eta=eta,
        transform_type=transform_type,
        beta_hat=beta_hat,
        test=test,
        train=train,
        trace=cb_tos,
        init_random_state=random_state,
    )
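
# A hypothetical usage sketch for the estimator above: synthetic data with
# nine features split into three contiguous groups, as described in the
# `groups` docstring. Everything except sgl_estimator itself is illustrative.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=9, random_state=0)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, random_state=0)
groups = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])

res = sgl_estimator(x_tr, y_tr, x_te, y_te, groups,
                    alpha1=0.1, alpha2=0.01, loss_type="logloss")
print(res["test"])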
Example #9
    def fit(self, X, y, loss="squared_loss"):
        """Fit a linear model using the sparse group lasso.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.

        y : array-like, shape (n_samples,)
            The target values (class labels in classification, real numbers in
            regression).

        loss : ["squared_loss", "huber", "log"]
            The type of loss function to use in the PGD solver.

        Returns
        -------
        self : object
            Returns self.
        """
        if not isinstance(self.warm_start, bool):
            raise ValueError("The argument warm_start must be bool;"
                             " got {0}".format(self.warm_start))

        allowed_losses = ["squared_loss", "huber"]
        if is_regressor(self) and loss.lower() not in allowed_losses:
            raise ValueError(
                "For regression, the argument loss must be one of {0}; "
                "got {1}".format(allowed_losses, loss))

        if not 0 <= self.l1_ratio <= 1:
            raise ValueError(
                "The parameter l1_ratio must satisfy 0 <= l1_ratio <= 1; "
                "got {0}".format(self.l1_ratio))

        if y is None:
            raise ValueError(
                "requires y to be passed, but the target y is None")

        X, y = check_X_y(
            X,
            y,
            accept_sparse=False,
            dtype=[np.float64, np.float32],
            y_numeric=not is_classifier(self),
            multi_output=False,
        )

        _, self.n_features_in_ = X.shape

        if is_classifier(self):
            check_classification_targets(y)
            self.classes_ = np.unique(y)
            y = np.logical_not(y == self.classes_[0]).astype(int)

        n_samples, n_features = X.shape
        if self.fit_intercept:
            X = np.hstack([X, np.ones((n_samples, 1))])

        if self.warm_start and hasattr(self, "coef_"):
            # pylint: disable=access-member-before-definition
            if self.fit_intercept:
                coef = np.concatenate(
                    (self.coef_, np.array([self.intercept_])))
            else:
                coef = self.coef_
        else:
            if self.fit_intercept:
                coef = np.zeros(n_features + 1)
                # Initial bias condition gives 50/50 for binary classification
                coef[-1] = 0.5
            else:
                coef = np.zeros(n_features)

        if loss == "huber":
            f = cp.utils.HuberLoss(X, y)
        elif loss == "log":
            f = cp.utils.LogLoss(X, y)
        else:
            f = cp.utils.SquareLoss(X, y)

        if self.include_solver_trace:
            self.solver_trace_ = cp.utils.Trace(f)
        else:
            self.solver_trace_ = None

        if self.suppress_solver_warnings:
            ctx_mgr = warnings.catch_warnings()
        else:
            ctx_mgr = contextlib.suppress()

        groups = check_groups(self.groups,
                              X,
                              allow_overlap=False,
                              fit_intercept=self.fit_intercept)

        if self.scale_l2_by not in ["group_length", None]:
            raise ValueError("scale_l2_by must be 'group_length' or None; "
                             "got {0}".format(self.scale_l2_by))

        bias_index = n_features if self.fit_intercept else None
        sg1 = SparseGroupL1(
            l1_ratio=self.l1_ratio,
            alpha=self.alpha,
            groups=groups,
            bias_index=bias_index,
            scale_l2_by=self.scale_l2_by,
        )

        with ctx_mgr:
            # For some metaparameters, minimize_PGD might not reach the desired
            # tolerance level. This might be okay during hyperparameter
            # optimization. So ignore the warning if the user specifies
            # suppress_solver_warnings=True
            if self.suppress_solver_warnings:
                warnings.filterwarnings("ignore", category=RuntimeWarning)

            pgd = cp.minimize_proximal_gradient(
                f.f_grad,
                coef,
                sg1.prox,
                jac=True,
                step="backtracking",
                max_iter=self.max_iter,
                tol=self.tol,
                verbose=self.verbose,
                callback=self.solver_trace_,
                accelerated=False,
            )

        if self.fit_intercept:
            self.intercept_ = pgd.x[-1]
            self.coef_ = pgd.x[:-1]
        else:
            # set intercept to zero as the other linear models do
            self.intercept_ = 0.0
            self.coef_ = pgd.x

        self.n_iter_ = pgd.nit

        self.is_fitted_ = True
        return self
Example #10
all_trace_ls, all_trace_nols = [], []
out_img = []
for i, beta in enumerate(all_betas):
    print("beta = %s" % beta)
    G1 = cp.utils.GroupL1(beta, groups)

    def loss(x):
        return f(x) + G1(x)

    cb_tosls = cp.utils.Trace()
    x0 = np.zeros(n_features)
    pgd_ls = cp.minimize_proximal_gradient(
        f.f_grad,
        x0,
        G1.prox,
        step_size=step_size,
        max_iter=max_iter,
        tol=1e-14,
        verbose=1,
        callback=cb_tosls,
    )
    trace_ls = np.array([loss(x) for x in cb_tosls.trace_x])
    all_trace_ls.append(trace_ls)

    cb_tos = cp.utils.Trace()
    x0 = np.zeros(n_features)
    pgd = cp.minimize_proximal_gradient(
        f.f_grad,
        x0,
        G1.prox,
        step_size=step_size,
        max_iter=max_iter,