Example #1
def elastic_net(X, Y, params):
    """

    :param X: np.ndarray [K x N]
    :param Y: np.ndarray [1 x N]
    :param params: dict
    :return:
    """
    assert check.argument_type(X, np.ndarray)
    assert check.argument_type(Y, np.ndarray)

    (K, N) = X.shape
    X = X.T  # Make X into [N, K]
    Y = Y.flatten()  # Make Y into [N, ]

    # Fit the linear model using the elastic net
    model = ElasticNetCV(**params).fit(X, Y)

    # Set coefficients below threshold to 0
    coefs = model.coef_  # Get all model coefficients [K, ]
    coefs[np.abs(coefs) < MIN_COEF] = 0.  # Threshold coefficients
    coef_nonzero = coefs != 0  # Create a boolean array where coefficients are nonzero [K, ]

    # If there are non-zero coefficients, redo the linear regression with them alone
    # And calculate beta_resc
    if coef_nonzero.sum() > 0:
        x = X[:, coef_nonzero]
        utils.make_array_2d(Y)
        betas = base_regression.recalculate_betas_from_selected(x, Y)
        betas_resc = base_regression.predict_error_reduction(x, Y, betas)
        return dict(pp=coef_nonzero, betas=betas, betas_resc=betas_resc)
    else:
        return dict(pp=np.repeat(True, K).tolist(),
                    betas=np.zeros(K),
                    betas_resc=np.zeros(K))
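
A minimal usage sketch with synthetic data follows. It assumes the module-level names referenced above (MIN_COEF, check, utils, base_regression, ElasticNetCV) are importable from the surrounding package; the l1_ratio and cv values passed through params are illustrative only.

import numpy as np

# Hypothetical data: K = 10 predictors across N = 50 samples
rng = np.random.default_rng(42)
X = rng.normal(size=(10, 50))  # [K x N], as the docstring expects
Y = rng.normal(size=(1, 50))   # [1 x N]

# Keyword arguments forwarded to sklearn.linear_model.ElasticNetCV
params = {'l1_ratio': [0.5, 0.7, 0.9], 'cv': 3, 'max_iter': 1000}

result = elastic_net(X, Y, params)
print(result['pp'])          # Boolean selection mask over the K predictors
print(result['betas'])       # Refit coefficients for the selected predictors
print(result['betas_resc'])  # Error-reduction score per selected predictor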
def best_subset_regression(x, y, gprior):
    """

    :param x: np.ndarray
        Independent (predictor) variables [n x k]
    :param y: np.ndarray
        Dependent (response) variable [n x 1]
    :param gprior: np.ndarray
        Weighted priors [k x 1]
    :return:
    """
    (n, k) = x.shape
    combos = combo_index(k)

    bic_combos = calc_all_expected_BIC(x, y, gprior, combos, check_rank=False)

    best_betas = np.zeros(k, dtype=np.dtype(float))
    try:
        best_combo = combos[:, _best_combo_idx(x, bic_combos, combos)]
    except np.linalg.LinAlgError:
        return best_betas

    if best_combo.sum() > 0:
        best_betas = base_regression.recalculate_betas_from_selected(
            x, y, best_combo)

    return best_betas
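
best_subset_regression scores every subset of the k predictors by expected BIC, so combo_index must enumerate all 2^k on/off combinations; this is why the approach is only feasible for small k. A minimal sketch of such a helper, assuming the [k x 2^k] boolean layout implied by the combos[:, idx] indexing above (the package's actual implementation may differ):

import itertools
import numpy as np

def combo_index_sketch(k):
    # All 2^k boolean combinations of k predictors,
    # one combination per column
    combos = np.array(list(itertools.product([False, True], repeat=k)))
    return combos.T

print(combo_index_sketch(3).shape)  # (3, 8): 8 subsets of 3 predictors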
def test_recalculate_betas_from_selected(self):
    # Full-rank case: rank(xTx) == shape(xTx), so the normal
    # equations have a unique solution
    x = np.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]])
    y = np.array([0, 1, 0, 1, 0])
    result = base_regression.recalculate_betas_from_selected(x, y, idx=None)
    np.testing.assert_array_almost_equal(result, np.array([-0.4, 0.4]))

def test_recalculate_betas_from_selected_matrix_rank(self):
    # Rank-deficient case: the rows of x are linearly dependent, so
    # rank(xTx) < shape(xTx) and the betas fall back to zero
    x = np.array([[2, 4, 6], [4, 8, 12]])
    y = np.array([1, 1])
    result = base_regression.recalculate_betas_from_selected(x, y, idx=None)
    np.testing.assert_array_almost_equal(result, np.array([0.0, 0.0, 0.0]), 2)
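
In the full-rank case, recalculate_betas_from_selected reduces to ordinary least squares without an intercept, so the value asserted in the first test can be checked directly:

import numpy as np

x = np.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]])
y = np.array([0, 1, 0, 1, 0])

# Ordinary least squares: beta = (x^T x)^-1 x^T y
beta = np.linalg.solve(x.T @ x, x.T @ y)
print(beta)  # [-0.4  0.4], matching the asserted result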
def sklearn_gene(x, y, model, min_coef=None, **kwargs):
    """
    Use a scikit-learn model for regression

    :param x: Feature array
    :type x: np.ndarray [N x K]
    :param y: Response array
    :type y: np.ndarray [N x 1]
    :param model: Instance of a scikit-learn BaseEstimator-derived model
    :type model: BaseEstimator
    :param min_coef: Minimum coefficient threshold; any coefficient with an
        absolute value below this will be set to 0
    :type min_coef: numeric
    :return: A dict of results for this gene
    :rtype: dict
    """
    assert check.argument_type(x, np.ndarray)
    assert check.argument_type(y, np.ndarray)
    assert check.argument_is_subclass(model, BaseEstimator)

    (N, K) = x.shape

    # Fit the model
    model.fit(x, y, **kwargs)

    # Get all model coefficients [K, ]
    try:
        coefs = model.coef_
    except AttributeError:
        coefs = model.estimator_.coef_

    # Set coefficients below threshold to 0
    if min_coef is not None:
        coefs[np.abs(coefs) < min_coef] = 0.  # Threshold coefficients

    coef_nonzero = coefs != 0  # Create a boolean array where coefficients are nonzero [K, ]

    # If there are non-zero coefficients, redo the linear regression with them alone
    # And calculate beta_resc
    if coef_nonzero.sum() > 0:
        x = x[:, coef_nonzero]
        utils.make_array_2d(y)
        betas = base_regression.recalculate_betas_from_selected(x, y)
        betas_resc = base_regression.predict_error_reduction(x, y, betas)
        return dict(pp=coef_nonzero, betas=betas, betas_resc=betas_resc)
    else:
        return dict(pp=np.repeat(True, K).tolist(),
                    betas=np.zeros(K),
                    betas_resc=np.zeros(K))
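
A hedged usage sketch with scikit-learn's Lasso; the alpha and min_coef values are illustrative, and the helper modules used above (check, utils, base_regression) are assumed to come from the surrounding package.

import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.default_rng(0)
x = rng.normal(size=(50, 10))                       # [N x K] feature array
y = 1.5 * x[:, 2] + rng.normal(scale=0.1, size=50)  # Response driven by feature 2

# Any estimator exposing coef_ after fitting works; the try/except
# above also covers wrappers that expose estimator_.coef_
result = sklearn_gene(x, y, Lasso(alpha=0.05), min_coef=0.01)
print(result['pp'])  # Expect feature 2 to be selected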
Example #6
def stars_model_select(x, y, alphas, threshold=_DEFAULT_THRESHOLD, num_subsamples=_DEFAULT_NUM_SUBSAMPLES,
                       random_seed=_DEFAULT_SEED, method='lasso', **kwargs):
    """
    Model using StARS (Stability Approach to Regularization Selection) for model selection

    :param x:
    :param y:
    :param alphas:
    :param threshold:
    :param num_subsamples:
    :param random_seed:
    :param method:
    :param kwargs:
    :return:
    """

    if method.lower() == 'lasso':
        _regress_func = lasso
    elif method.lower() == 'ridge':
        _regress_func = ridge
    else:
        raise ValueError("Method must be 'lasso' or 'ridge'")

    # Number of observations and predictors
    n, k = x.shape

    if n < num_subsamples:
        msg = "The number of subsamples ({ns}) for StARS exceeds the number of samples ({n})".format(ns=num_subsamples, n=n)
        raise ValueError(msg)

    # Calculate the number of obs per subsample
    b = math.floor(n / num_subsamples)

    # Make an index for subsampling
    idx = _make_subsample_idx(n, b, num_subsamples, random_seed=random_seed)

    # Calculate betas for stability selection
    betas = {a: [] for a in alphas}
    for sample in range(num_subsamples):
        # Sample and put into column-major (the coordinate descent implementation in sklearn wants that order)
        x_samp = np.asarray(x[idx == sample, :], order='F')
        y_samp = y[idx == sample]

        for a in alphas:
            betas[a].append(_regress_func(x_samp, y_samp, a, **kwargs))

    # Calculate edge stability
    stabilities = {a: _calculate_stability(betas[a]) for a in alphas}

    # Calculate mean edge instability, forced to be monotonically
    # non-decreasing as alpha decreases (alphas sorted high to low)
    alphas = np.sort(alphas)[::-1]
    total_instability = np.array([np.mean(stabilities[a]) for a in alphas])

    for i in range(1, len(total_instability)):
        if total_instability[i] < total_instability[i - 1]:
            total_instability[i] = total_instability[i - 1]

    threshold_alphas = np.array(alphas)[total_instability < threshold]
    selected_alpha = np.min(threshold_alphas) if len(threshold_alphas) > 0 else alphas[0]

    refit_betas = _regress_func(x, y, selected_alpha, **kwargs)
    beta_nonzero = _make_bool_matrix(refit_betas)

    if beta_nonzero.sum() == 0:
        return dict(pp=np.repeat(True, k).tolist(),
                    betas=np.zeros(k),
                    betas_resc=np.zeros(k))
    else:
        x = x[:, beta_nonzero]
        utils.make_array_2d(y)
        betas = base_regression.recalculate_betas_from_selected(x, y)
        betas_resc = base_regression.predict_error_reduction(x, y, betas)

        return dict(pp=beta_nonzero,
                    betas=betas,
                    betas_resc=betas_resc)
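
StARS (Liu, Roeder & Wasserman, 2010) scores each coefficient by how unstably it is selected across subsamples: if theta is the fraction of subsamples in which a coefficient is nonzero, its instability is 2 * theta * (1 - theta), maximal when selection is a coin flip. A sketch of what _calculate_stability could compute under that definition (the package's actual implementation may differ):

import numpy as np

def calculate_stability_sketch(betas):
    # betas: list of [K,] coefficient arrays, one per subsample fit
    selection = np.array([b != 0 for b in betas])

    # Fraction of subsamples in which each coefficient was selected
    theta = selection.mean(axis=0)

    # StARS instability, in [0, 0.5], per coefficient
    return 2. * theta * (1. - theta)

The selection logic above then takes the smallest alpha whose monotonicized mean instability stays below the threshold, falling back to the largest alpha when none qualifies.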