# Imports assumed by the test functions and snippets below.
import numpy as np
from numpy.testing import assert_almost_equal
from scipy.sparse import csr_matrix
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from stability_selection import (RandomizedLasso, StabilitySelection,
                                 plot_stability_path)


def test_with_stratified_bootstrap():
    n, p, k = 1000, 1000, 5

    X, y, important_betas = _generate_dummy_classification_data(n=n, k=k)
    selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25),
                                  verbose=1, bootstrap_func='stratified')
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)
    assert_almost_equal(important_betas, chosen_betas)
def test_stability_selection_classification():
    n, p, k = 1000, 1000, 5

    X, y, important_betas = _generate_dummy_classification_data(n=n, k=k)
    selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25), verbose=1)
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)
    X_r = selector.transform(X)

    assert_almost_equal(important_betas, chosen_betas)
    assert X_r.shape == (n, k)
    assert selector.stability_scores_.shape == (p, selector.lambda_grid.shape[0])
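# The _generate_dummy_classification_data and _generate_dummy_regression_data
# helpers used by these tests are not shown in this section. Below is a minimal
# sketch of the classification helper, assuming a standard-normal design with
# k informative features and labels obtained by thresholding a logistic
# transform of the linear predictor; the signature, seed, and label rule are
# assumptions, not the package's actual helper.
from sklearn.utils import check_random_state


def _generate_dummy_classification_data(p=1000, n=1000, k=5, random_state=123321):
    """Gaussian design with p features; only k randomly chosen features matter."""
    rng = check_random_state(random_state)

    X = rng.normal(loc=0.0, scale=1.0, size=(n, p))
    betas = np.zeros(p)
    important_betas = np.sort(rng.choice(a=np.arange(p), size=k))
    betas[important_betas] = rng.uniform(size=k)

    # Binary labels from a thresholded logistic transform of the linear predictor.
    probs = 1.0 / (1.0 + np.exp(-np.matmul(X, betas)))
    y = (probs > 0.5).astype(int)

    return X, y, important_betas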
def test_different_shape():
    n, p, k = 100, 200, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
    base_estimator = Pipeline([('scaler', StandardScaler()),
                               ('model', Lasso())])
    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)
    selector.transform(X[:, :-2])
def test_issparse():
    n, p = 200, 200
    rho = 0.6
    weakness = 0.2

    X, y = generate_experiment_data(n, p, rho)
    lambda_grid = np.linspace(0.01, 0.5, num=100)

    estimator = RandomizedLasso(weakness=weakness)
    selector = StabilitySelection(base_estimator=estimator, lambda_name='alpha',
                                  lambda_grid=lambda_grid, threshold=0.9,
                                  verbose=1)
    selector.fit(csr_matrix(X), y)
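# The generate_experiment_data helper used by test_issparse and
# test_randomized_lasso is not reproduced here. A plausible sketch, assuming a
# correlated Gaussian design in which only the first two coefficients are
# non-zero (which is what the np.array([0, 1]) assertion further below
# expects); the covariance structure, noise level, and seed are assumptions.
from sklearn.utils import check_random_state


def generate_experiment_data(n=200, p=200, rho=0.6, random_state=3245):
    """Design where feature 2 is correlated with the two informative features."""
    rng = check_random_state(random_state)

    sigma = np.eye(p)
    sigma[0, 2] = rho
    sigma[2, 0] = rho
    sigma[1, 2] = rho
    sigma[2, 1] = rho

    X = rng.multivariate_normal(mean=np.zeros(p), cov=sigma, size=(n,))
    beta = np.zeros(p)
    beta[:2] = 1.0
    noise = rng.normal(loc=0.0, scale=0.25, size=(n,))
    y = np.matmul(X, beta) + noise

    return X, y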
def test_stability_plot():
    n, p, k = 500, 200, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
    base_estimator = Pipeline([('scaler', StandardScaler()),
                               ('model', Lasso())])
    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)

    plot_stability_path(selector, threshold_highlight=0.5)
def test_no_features():
    n, p, k = 100, 200, 0

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
    base_estimator = Pipeline([('scaler', StandardScaler()),
                               ('model', Lasso())])
    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)

    assert_almost_equal(selector.transform(X),
                        np.empty(0).reshape((X.shape[0], 0)))
def stability_selection(self, X, y):
    """
    Wrapper around the stability-selection package, which implements the
    stability selection feature selection algorithm [1]. Bootstrapping can
    also be performed with complementary pairs subsampling [2].

    https://github.com/scikit-learn-contrib/stability-selection

    [1] Meinshausen, N. and Buhlmann, P., 2010. Stability selection.
        Journal of the Royal Statistical Society: Series B (Statistical
        Methodology), 72(4), pp.417-473.
    [2] Shah, R.D. and Samworth, R.J., 2013. Variable selection with error
        control: another look at stability selection. Journal of the Royal
        Statistical Society: Series B (Statistical Methodology), 75(1),
        pp.55-80.
    """
    # Constructor parameters for StabilitySelection, taken from fit_params
    # when provided and falling back to the defaults shown here.
    init_params_dic = {
        'base_estimator': self.fit_params.get(
            'base_estimator',
            LogisticRegression(penalty='l1', solver='liblinear')),
        'lambda_name': self.fit_params.get('lambda_name', 'C'),
        'lambda_grid': self.fit_params.get('lambda_grid',
                                           np.logspace(-5, -2, 25)),
        'n_bootstrap_iterations': self.fit_params.get('n_bootstrap_iterations',
                                                      self.n_bsamples),
        'sample_fraction': self.fit_params.get('sample_fraction', 0.5),
        'threshold': self.fit_params.get('threshold', 0.6),
        'bootstrap_func': self.fit_params.get('bootstrap_func', 'subsample'),
        'bootstrap_threshold': self.fit_params.get('bootstrap_threshold', None),
        'verbose': self.fit_params.get('verbose', 0),
        'n_jobs': self.fit_params.get('n_jobs', 1),
        'pre_dispatch': self.fit_params.get('pre_dispatch', '2*n_jobs'),
        'random_state': self.fit_params.get('random_state', self.random_state)
    }

    # Strip the constructor parameters out of a copy of fit_params.
    fit_params = self.fit_params.copy()
    for init_param in init_params_dic:
        if init_param in fit_params:
            fit_params.pop(init_param)

    feature_selector = StabilitySelection(**init_params_dic)
    feature_selector.fit(X, y)
    self.accepted_features_index = feature_selector.get_support(indices=True)
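# The docstring above also mentions complementary pairs subsampling [2], which
# StabilitySelection selects through bootstrap_func='complementary_pairs'.
# Below is a minimal sketch of that configuration on synthetic data; the
# dataset, grid, and iteration count are illustrative assumptions, not values
# used elsewhere in this code.
from sklearn.datasets import make_classification


def _complementary_pairs_example():
    X, y = make_classification(n_samples=500, n_features=50, n_informative=5,
                               random_state=0)
    selector = StabilitySelection(
        base_estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        lambda_name='C',
        lambda_grid=np.logspace(-5, -2, 25),
        n_bootstrap_iterations=50,
        bootstrap_func='complementary_pairs',  # Shah & Samworth (2013) subsampling
        threshold=0.6,
    )
    selector.fit(X, y)
    return selector.get_support(indices=True)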
def test_stability_selection_regression():
    n, p, k = 500, 1000, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
    base_estimator = Pipeline([('scaler', StandardScaler()),
                               ('model', Lasso())])
    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)
    assert_almost_equal(important_betas, chosen_betas)
def test_randomized_lasso():
    n, p = 200, 200
    rho = 0.6
    weakness = 0.2

    X, y = generate_experiment_data(n, p, rho)
    lambda_grid = np.linspace(0.01, 0.5, num=100)

    estimator = RandomizedLasso(weakness=weakness)
    selector = StabilitySelection(base_estimator=estimator, lambda_name='alpha',
                                  lambda_grid=lambda_grid, threshold=0.9,
                                  verbose=1)
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)
    assert_almost_equal(np.array([0, 1]), chosen_betas)
if model == 'LogisticRegression':
    clf = LogisticRegression(penalty='l1', class_weight='balanced',
                             solver='liblinear', random_state=None)
if model == 'RidgeClassifier':
    clf = RidgeClassifier(alpha=1.0, class_weight='balanced', solver='auto',
                          random_state=None)
self.model = clf

base_estimator = Pipeline([
    ('scaler', StandardScaler()),
    ('model', clf)
])

# Note: 'model__C' is the regularisation parameter of LogisticRegression;
# for RidgeClassifier the corresponding parameter name is 'model__alpha'.
selector = StabilitySelection(base_estimator=base_estimator,
                              lambda_name='model__C',
                              lambda_grid=np.logspace(-5, -1, 50))
selector.fit(X, y)

fig, ax = plot_stability_path(selector)
fig.show()

selected_variables = selector.get_support(indices=True)
selected_scores = selector.stability_scores_.max(axis=1)

print('Selected variables are:')
print('-----------------------')
for idx, (variable, score) in enumerate(
        zip(selected_variables, selected_scores[selected_variables])):
    print('Variable %d: [%d], score %.3f' % (idx + 1, variable, score))
# Extract names of each column (using pandas)
headers = np.array(list(data.columns.values))
names = headers[2:]
# print("Feature names shape is {}".format(names.shape))

# Extract features (using pandas and numpy)
np_array = data.to_numpy()   # DataFrame.as_matrix() was removed from pandas
X = np_array[:, 2:]
# print("Features shape is {}".format(X.shape))

# Extract labels (using pandas)
Y = data['class_label'].to_numpy()

lambda_grid = np.linspace(0.001, 0.5, num=100)

base_estimator = Pipeline([('scaler', StandardScaler()),
                           ('model', LogisticRegression(penalty='l1',
                                                        solver='liblinear'))])
selector = StabilitySelection(base_estimator=base_estimator,
                              lambda_name='model__C',
                              lambda_grid=np.logspace(-5, -1, 50))
selector.fit(X, Y)

selected_variables = selector.get_support(indices=True, threshold=0.00001)
selected_scores = selector.stability_scores_.max(axis=1)

print('Ranking is:')
print('-----------------------')
# for idx, (variable, score) in enumerate(zip(selected_variables, selected_scores[selected_variables])):
#     print('Variable %d: [%d], score %.3f' % (idx + 1, variable, score))
rank = sorted(zip(selected_scores, names), reverse=True)
for el in rank:
    print(el)