def elastic_net(X, Y, params):
    """
    Fit an ElasticNetCV model for one response and re-fit a regression on the predictors it keeps.

    :param X: np.ndarray [K x N]
        Predictor features; transposed to [N x K] before fitting
    :param Y: np.ndarray [1 x N]
        Response values; flattened to [N, ] before fitting
    :param params: dict
        Keyword arguments forwarded to the ElasticNetCV constructor
    :return: dict
        pp: Boolean array marking predictors whose thresholded coefficient is nonzero,
            or a list of K True values when no coefficient survives
        betas: Output of base_regression.recalculate_betas_from_selected for the retained
            predictors, or K zeros when no coefficient survives
        betas_resc: Output of base_regression.predict_error_reduction for the retained
            predictors, or K zeros when no coefficient survives
    """
    assert check.argument_type(X, np.ndarray)
    assert check.argument_type(Y, np.ndarray)

    num_predictors, _ = X.shape

    design = X.T            # [N, K]
    response = Y.flatten()  # [N, ]

    # Fit the linear model using the elastic net
    fitted = ElasticNetCV(**params).fit(design, response)

    # Zero out (in place) every coefficient whose magnitude is below MIN_COEF
    coefficients = fitted.coef_
    too_small = np.abs(coefficients) < MIN_COEF
    coefficients[too_small] = 0.

    # Boolean mask of the predictors that are still in the model
    selected = coefficients != 0

    # Nothing survived thresholding: report every predictor with zero betas
    if selected.sum() == 0:
        return {
            'pp': np.repeat(True, num_predictors).tolist(),
            'betas': np.zeros(num_predictors),
            'betas_resc': np.zeros(num_predictors),
        }

    # Redo the regression with only the selected predictors and calculate betas_resc
    design_selected = design[:, selected]
    # Return value is unused, so any reshaping of the response happens in place
    utils.make_array_2d(response)
    betas = base_regression.recalculate_betas_from_selected(design_selected, response)
    betas_resc = base_regression.predict_error_reduction(design_selected, response, betas)

    return {'pp': selected, 'betas': betas, 'betas_resc': betas_resc}
def test_predict_error_reduction(self):
    """predict_error_reduction on three identical predictor columns (len(pp_idx) != 1)."""
    # Five observations; every row repeats its own index across the three predictors
    predictors = np.array([[row, row, row] for row in range(5)])
    response = np.array([0, 1, 0, 1, 0])
    betas = np.array([1, 1, 2])

    expected = np.array([-133.333] * 3)
    observed = base_regression.predict_error_reduction(predictors, response, betas)

    # Compare to two decimal places
    np.testing.assert_array_almost_equal(observed, expected, decimal=2)
def bbsr(X, y, pp, weights, max_k):
    """
    Run BBSR to regress a response variable y against the predictors in X.

    The candidate predictors flagged in pp are first narrowed by reduce_predictors (which is
    given max_k), and the survivors are then passed to best_subset_regression.

    :param X: np.ndarray [K x N]
        Predictor features
    :param y: np.ndarray [N,]
        Response variable; handed to utils.make_array_2d, whose return value is unused
    :param pp: np.ndarray [K,]
        Boolean mask of predictors to model with. NOTE: when any predictor is flagged, this
        array is overwritten in place with the output of reduce_predictors.
    :param weights: np.ndarray [K,]
        Weights; the entries for the flagged predictors are cast to float and used as the g-prior
    :param max_k: int
        Max number of predictors, forwarded to reduce_predictors
    :return: dict
        pp: The (updated) pp array, or a list of K True values when pp flags no predictor
        betas: Output of best_subset_regression, or K zeros when pp flags no predictor
        betas_resc: Output of base_regression.predict_error_reduction, or K zeros when pp
            flags no predictor
    """

    def _subset(indices):
        # Pull the chosen predictor rows (as columns) and their float weights
        return X[indices, :].T, weights[indices].astype(np.dtype(float))

    # Skip everything if there are no predictors in pp
    if pp.sum() == 0:
        num_predictors = pp.shape[0]
        return {
            'pp': np.repeat(True, num_predictors).tolist(),
            'betas': np.zeros(num_predictors),
            'betas_resc': np.zeros(num_predictors),
        }

    # Subset data to desired predictors
    candidate_idx = base_regression.bool_to_index(pp)
    utils.Debug.vprint(
        "Beginning regression with {pp_len} predictors".format(pp_len=len(candidate_idx)),
        level=2)
    x_candidates, g_candidates = _subset(candidate_idx)

    # Make sure arrays are 2d
    utils.make_array_2d(x_candidates)
    utils.make_array_2d(y)
    utils.make_array_2d(g_candidates)

    # Reduce predictors to max_k; the caller's pp array is modified here
    pp[candidate_idx] = reduce_predictors(x_candidates, y, g_candidates, max_k)
    reduced_idx = base_regression.bool_to_index(pp)
    utils.Debug.vprint(
        "Reduced to {pp_len} predictors".format(pp_len=len(reduced_idx)),
        level=2)

    # Resubset with the newly reduced predictors
    x_reduced, g_reduced = _subset(reduced_idx)
    utils.make_array_2d(g_reduced)

    betas = best_subset_regression(x_reduced, y, g_reduced)
    betas_resc = base_regression.predict_error_reduction(x_reduced, y, betas)

    return {'pp': pp, 'betas': betas, 'betas_resc': betas_resc}
def sklearn_gene(x, y, model, min_coef=None, **kwargs):
    """
    Use a scikit-learn model for regression on a single gene.

    The model is fit on (x, y). Predictors with a nonzero coefficient are kept and a
    regression is recalculated on those predictors alone.

    :param x: Feature array
    :type x: np.ndarray [N x K]
    :param y: Response array
    :type y: np.ndarray [N x 1]
    :param model: Instance of a scikit BaseEstimator-derived model. It is fit in place, and
        when min_coef is given its coefficient array is thresholded in place as well.
    :type model: BaseEstimator
    :param min_coef: A minimum coefficient magnitude to include in the model.
        Any coefficient with a smaller absolute value is set to 0. None disables thresholding.
    :type min_coef: numeric
    :param kwargs: Extra keyword arguments forwarded to model.fit
    :return: A dict of results for this gene with keys pp, betas and betas_resc.
        When no coefficient is nonzero, pp is a list of K True values and both
        betas and betas_resc are K zeros.
    :rtype: dict
    """
    assert check.argument_type(x, np.ndarray)
    assert check.argument_type(y, np.ndarray)
    assert check.argument_is_subclass(model, BaseEstimator)

    _, num_predictors = x.shape

    # Fit the model
    model.fit(x, y, **kwargs)

    # Coefficients live on the model itself, or on a wrapped estimator_ if the model has no coef_
    try:
        coefficients = model.coef_
    except AttributeError:
        coefficients = model.estimator_.coef_

    # Set coefficients below threshold to 0
    if min_coef is not None:
        below_threshold = np.abs(coefficients) < min_coef
        coefficients[below_threshold] = 0.

    # Boolean mask of the predictors that are still in the model
    selected = coefficients != 0

    # No predictors left: report every predictor with zero betas
    if selected.sum() == 0:
        return {
            'pp': np.repeat(True, num_predictors).tolist(),
            'betas': np.zeros(num_predictors),
            'betas_resc': np.zeros(num_predictors),
        }

    # Redo the regression with only the selected predictors and calculate betas_resc
    x_selected = x[:, selected]
    utils.make_array_2d(y)
    betas = base_regression.recalculate_betas_from_selected(x_selected, y)
    betas_resc = base_regression.predict_error_reduction(x_selected, y, betas)

    return {'pp': selected, 'betas': betas, 'betas_resc': betas_resc}
def stars_model_select(x, y, alphas, threshold=_DEFAULT_THRESHOLD, num_subsamples=_DEFAULT_NUM_SUBSAMPLES,
                       random_seed=_DEFAULT_SEED, method='lasso', **kwargs):
    """
    Model using StARS (Stability Approach to Regularization Selection) for model selection.

    The observations are split into num_subsamples groups, the chosen regression is run on
    every group for every alpha, and the per-alpha results are scored with
    _calculate_stability. The smallest alpha whose (monotone-adjusted) mean score is below
    threshold is used to refit on all of the data; if none qualifies, the largest alpha is used.

    :param x: Feature array; rows are observations, columns are predictors
    :param y: Response array, indexed by observation
    :param alphas: Iterable of regularization strengths to evaluate
    :param threshold: Upper bound (exclusive) on the mean score for an alpha to be eligible
    :param num_subsamples: Number of subsamples; must not exceed the number of observations
    :param random_seed: Seed forwarded to _make_subsample_idx
    :param method: 'lasso' or 'ridge' (case-insensitive)
    :param kwargs: Extra keyword arguments forwarded to the regression function
    :return: dict with keys pp, betas and betas_resc. When the refit selects no predictor,
        pp is a list of k True values and both betas and betas_resc are k zeros.
    :raises ValueError: If method is not recognised, or if there are fewer observations
        than subsamples
    """
    method_key = method.lower()
    if method_key == 'lasso':
        _regress_func = lasso
    elif method_key == 'ridge':
        _regress_func = ridge
    else:
        raise ValueError("Method must be 'lasso' or 'ridge'")

    # Number of obs
    n, k = x.shape

    if n < num_subsamples:
        raise ValueError(
            "Subsamples ({ns}) for StARS is larger than the number of samples ({n})".format(
                ns=num_subsamples, n=n))

    # Calculate the number of obs per subsample
    obs_per_subsample = math.floor(n / num_subsamples)

    # Make an index for subsampling
    subsample_of = _make_subsample_idx(n, obs_per_subsample, num_subsamples, random_seed=random_seed)

    # Calculate betas for stability selection
    betas_by_alpha = {a: [] for a in alphas}
    for subsample in range(num_subsamples):
        members = subsample_of == subsample

        # Column-major order (the coordinate descent implementation in sklearn wants that order)
        x_sub = np.asarray(x[members, :], order='F')
        y_sub = y[members]

        for a in alphas:
            betas_by_alpha[a].append(_regress_func(x_sub, y_sub, a, **kwargs))

    # Score each alpha across its subsample fits
    stabilities = {a: _calculate_stability(betas_by_alpha[a]) for a in alphas}

    # Walk the alphas from largest to smallest and force the mean score to be non-decreasing
    alphas_desc = np.sort(alphas)[::-1]
    mean_scores = np.array([np.mean(stabilities[a]) for a in alphas_desc])
    for pos in range(1, len(mean_scores)):
        previous = mean_scores[pos - 1]
        if mean_scores[pos] < previous:
            mean_scores[pos] = previous

    # Smallest eligible alpha, falling back to the largest alpha when none is eligible
    eligible = np.array(alphas_desc)[mean_scores < threshold]
    if len(eligible) > 0:
        selected_alpha = np.min(eligible)
    else:
        selected_alpha = alphas_desc[0]

    # Refit on every observation with the selected alpha
    refit_betas = _regress_func(x, y, selected_alpha, **kwargs)
    beta_nonzero = _make_bool_matrix(refit_betas)

    if beta_nonzero.sum() != 0:
        x_selected = x[:, beta_nonzero]
        utils.make_array_2d(y)
        betas = base_regression.recalculate_betas_from_selected(x_selected, y)
        betas_resc = base_regression.predict_error_reduction(x_selected, y, betas)

        return {'pp': beta_nonzero, 'betas': betas, 'betas_resc': betas_resc}

    # No predictor selected: report every predictor with zero betas
    return {
        'pp': np.repeat(True, k).tolist(),
        'betas': np.zeros(k),
        'betas_resc': np.zeros(k),
    }
def test_PredictErrorReduction_all_zero_predictors(self):
    """predict_error_reduction returns all zeros when every predictor and every beta is zero."""
    self.X = np.array([[0, 0],
                       [0, 0]])
    self.Y = np.array([1, 2])
    zero_betas = np.array([0., 0.])

    result = base_regression.predict_error_reduction(self.X, self.Y, zero_betas)

    # Element-wise comparison; every entry has to match
    matches = result == [0., 0.]
    self.assertTrue(matches.all())