def test_mutual_info_regression(): X, y = make_regression(n_samples=100, n_features=10, n_informative=2, shuffle=False, random_state=0, noise=10) # Test in KBest mode. univariate_filter = SelectKBest(mutual_info_regression, k=2) X_r = univariate_filter.fit(X, y).transform(X) assert_best_scores_kept(univariate_filter) X_r2 = GenericUnivariateSelect(mutual_info_regression, mode='k_best', param=2).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.zeros(10) gtruth[:2] = 1 assert_array_equal(support, gtruth) # Test in Percentile mode. univariate_filter = SelectPercentile(mutual_info_regression, percentile=20) X_r = univariate_filter.fit(X, y).transform(X) X_r2 = GenericUnivariateSelect(mutual_info_regression, mode='percentile', param=20).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.zeros(10) gtruth[:2] = 1 assert_array_equal(support, gtruth)
def single_fdr(alpha, n_informative, random_state): X, y = make_regression(n_samples=150, n_features=20, n_informative=n_informative, shuffle=False, random_state=random_state, noise=10) with warnings.catch_warnings(record=True): # Warnings can be raised when no features are selected # (low alpha or very noisy data) univariate_filter = SelectFdr(f_regression, alpha=alpha) X_r = univariate_filter.fit(X, y).transform(X) X_r2 = GenericUnivariateSelect(f_regression, mode='fdr', param=alpha).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() num_false_positives = np.sum(support[n_informative:] == 1) num_true_positives = np.sum(support[:n_informative] == 1) if num_false_positives == 0: return 0. false_discovery_rate = (num_false_positives / (num_true_positives + num_false_positives)) return false_discovery_rate
def test_select_percentile_regression(): # Test whether the relative univariate feature selection # gets the correct items in a simple regression problem # with the percentile heuristic X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0) univariate_filter = SelectPercentile(f_regression, percentile=25) X_r = univariate_filter.fit(X, y).transform(X) assert_best_scores_kept(univariate_filter) X_r2 = GenericUnivariateSelect(f_regression, mode='percentile', param=25).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.zeros(20) gtruth[:5] = 1 assert_array_equal(support, gtruth) X_2 = X.copy() X_2[:, np.logical_not(support)] = 0 assert_array_equal(X_2, univariate_filter.inverse_transform(X_r)) # Check inverse_transform respects dtype assert_array_equal(X_2.astype(bool), univariate_filter.inverse_transform(X_r.astype(bool)))
def test_csr_preprocess_data(): # Test output format of _preprocess_data, when input is csr X, y = make_regression() X[X < 2.5] = 0.0 csr = sparse.csr_matrix(X) csr_, y, _, _, _ = _preprocess_data(csr, y, True) assert csr_.getformat() == 'csr'
def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False): """Generate a regression dataset with the given parameters.""" if verbose: print("generating dataset...") X, y, coef = make_regression(n_samples=n_train + n_test, n_features=n_features, noise=noise, coef=True) random_seed = 13 X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=n_train, test_size=n_test, random_state=random_seed) X_train, y_train = shuffle(X_train, y_train, random_state=random_seed) X_scaler = StandardScaler() X_train = X_scaler.fit_transform(X_train) X_test = X_scaler.transform(X_test) y_scaler = StandardScaler() y_train = y_scaler.fit_transform(y_train[:, None])[:, 0] y_test = y_scaler.transform(y_test[:, None])[:, 0] gc.collect() if verbose: print("ok") return X_train, y_train, X_test, y_test
def test_f_regression(): # Test whether the F test yields meaningful results # on a simple simulated regression problem X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0) F, pv = f_regression(X, y) assert (F > 0).all() assert (pv > 0).all() assert (pv < 1).all() assert (pv[:5] < 0.05).all() assert (pv[5:] > 1.e-4).all() # with centering, compare with sparse F, pv = f_regression(X, y, center=True) F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=True) assert_array_almost_equal(F_sparse, F) assert_array_almost_equal(pv_sparse, pv) # again without centering, compare with sparse F, pv = f_regression(X, y, center=False) F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=False) assert_array_almost_equal(F_sparse, F) assert_array_almost_equal(pv_sparse, pv)
def test_linear_regression_multiple_outcome(random_state=0): # Test multiple-outcome linear regressions X, y = make_regression(random_state=random_state) Y = np.vstack((y, y)).T n_features = X.shape[1] reg = LinearRegression() reg.fit((X), Y) assert reg.coef_.shape == (2, n_features) Y_pred = reg.predict(X) reg.fit(X, y) y_pred = reg.predict(X) assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
def test_invalid_percentile(): X, y = make_regression(n_samples=10, n_features=20, n_informative=2, shuffle=False, random_state=0) with pytest.raises(ValueError): SelectPercentile(percentile=-1).fit(X, y) with pytest.raises(ValueError): SelectPercentile(percentile=101).fit(X, y) with pytest.raises(ValueError): GenericUnivariateSelect(mode='percentile', param=-1).fit(X, y) with pytest.raises(ValueError): GenericUnivariateSelect(mode='percentile', param=101).fit(X, y)
def test_select_percentile_regression_full(): # Test whether the relative univariate feature selection # selects all features when '100%' is asked. X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0) univariate_filter = SelectPercentile(f_regression, percentile=100) X_r = univariate_filter.fit(X, y).transform(X) assert_best_scores_kept(univariate_filter) X_r2 = GenericUnivariateSelect(f_regression, mode='percentile', param=100).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.ones(20) assert_array_equal(support, gtruth)
def test_preprocess_copy_data_no_checks(is_sparse, to_copy): X, y = make_regression() X[X < 2.5] = 0.0 if is_sparse: X = sparse.csr_matrix(X) X_, y_, _, _, _ = _preprocess_data(X, y, True, copy=to_copy, check_input=False) if to_copy and is_sparse: assert not np.may_share_memory(X_.data, X.data) elif to_copy: assert not np.may_share_memory(X_, X) elif is_sparse: assert np.may_share_memory(X_.data, X.data) else: assert np.may_share_memory(X_, X)
def test_select_fwe_regression(): # Test whether the relative univariate feature selection # gets the correct items in a simple regression problem # with the fwe heuristic X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0) univariate_filter = SelectFwe(f_regression, alpha=0.01) X_r = univariate_filter.fit(X, y).transform(X) X_r2 = GenericUnivariateSelect(f_regression, mode='fwe', param=0.01).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.zeros(20) gtruth[:5] = 1 assert_array_equal(support[:5], np.ones((5, ), dtype=np.bool)) assert np.sum(support[5:] == 1) < 2
def test_select_kbest_regression(): # Test whether the relative univariate feature selection # gets the correct items in a simple regression problem # with the k best heuristic X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0, noise=10) univariate_filter = SelectKBest(f_regression, k=5) X_r = univariate_filter.fit(X, y).transform(X) assert_best_scores_kept(univariate_filter) X_r2 = GenericUnivariateSelect(f_regression, mode='k_best', param=5).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.zeros(20) gtruth[:5] = 1 assert_array_equal(support, gtruth)
def compute_bench(alpha, n_samples, n_features, precompute): lasso_results = [] lars_lasso_results = [] it = 0 for ns in n_samples: for nf in n_features: it += 1 print('==================') print('Iteration %s of %s' % (it, max(len(n_samples), len(n_features)))) print('==================') n_informative = nf // 10 X, Y, coef_ = make_regression(n_samples=ns, n_features=nf, n_informative=n_informative, noise=0.1, coef=True) X /= np.sqrt(np.sum(X ** 2, axis=0)) # Normalize data gc.collect() print("- benchmarking Lasso") clf = Lasso(alpha=alpha, fit_intercept=False, precompute=precompute) tstart = time() clf.fit(X, Y) lasso_results.append(time() - tstart) gc.collect() print("- benchmarking LassoLars") clf = LassoLars(alpha=alpha, fit_intercept=False, normalize=False, precompute=precompute) tstart = time() clf.fit(X, Y) lars_lasso_results.append(time() - tstart) return lasso_results, lars_lasso_results
def compute_bench(samples_range, features_range): it = 0 results = defaultdict(lambda: []) max_it = len(samples_range) * len(features_range) for n_samples in samples_range: for n_features in features_range: it += 1 print('====================') print('Iteration %03d of %03d' % (it, max_it)) print('====================') dataset_kwargs = { 'n_samples': n_samples, 'n_features': n_features, 'n_informative': n_features // 10, 'effective_rank': min(n_samples, n_features) / 10, #'effective_rank': None, 'bias': 0.0, } print("n_samples: %d" % n_samples) print("n_features: %d" % n_features) X, y = make_regression(**dataset_kwargs) gc.collect() print("benchmarking lars_path (with Gram):", end='') sys.stdout.flush() tstart = time() G = np.dot(X.T, X) # precomputed Gram matrix Xy = np.dot(X.T, y) lars_path_gram(Xy=Xy, Gram=G, n_samples=y.size, method='lasso') delta = time() - tstart print("%0.3fs" % delta) results['lars_path (with Gram)'].append(delta) gc.collect() print("benchmarking lars_path (without Gram):", end='') sys.stdout.flush() tstart = time() lars_path(X, y, method='lasso') delta = time() - tstart print("%0.3fs" % delta) results['lars_path (without Gram)'].append(delta) gc.collect() print("benchmarking lasso_path (with Gram):", end='') sys.stdout.flush() tstart = time() lasso_path(X, y, precompute=True) delta = time() - tstart print("%0.3fs" % delta) results['lasso_path (with Gram)'].append(delta) gc.collect() print("benchmarking lasso_path (without Gram):", end='') sys.stdout.flush() tstart = time() lasso_path(X, y, precompute=False) delta = time() - tstart print("%0.3fs" % delta) results['lasso_path (without Gram)'].append(delta) return results
if __name__ == "__main__": list_n_samples = np.linspace(100, 10000, 5).astype(np.int) list_n_features = [10, 100, 1000] n_test = 1000 max_iter = 1000 noise = 0.1 alpha = 0.01 sgd_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) elnet_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) ridge_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) asgd_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) for i, n_train in enumerate(list_n_samples): for j, n_features in enumerate(list_n_features): X, y, coef = make_regression(n_samples=n_train + n_test, n_features=n_features, noise=noise, coef=True) X_train = X[:n_train] y_train = y[:n_train] X_test = X[n_train:] y_test = y[n_train:] print("=======================") print("Round %d %d" % (i, j)) print("n_features:", n_features) print("n_samples:", n_train) # Shuffle data idx = np.arange(n_train) np.random.seed(13)
""" print(__doc__) from time import time from scipy import sparse from scipy import linalg from mrex.datasets.samples_generator import make_regression from mrex.linear_model import Lasso # ############################################################################# # The two Lasso implementations on Dense data print("--- Dense matrices") X, y = make_regression(n_samples=200, n_features=5000, random_state=0) X_sp = sparse.coo_matrix(X) alpha = 1 sparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000) dense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000) t0 = time() sparse_lasso.fit(X_sp, y) print("Sparse Lasso done in %fs" % (time() - t0)) t0 = time() dense_lasso.fit(X, y) print("Dense Lasso done in %fs" % (time() - t0)) print("Distance between coefficients : %s" %
scikit_results = [] glmnet_results = [] n = 20 step = 500 n_features = 1000 n_informative = n_features / 10 n_test_samples = 1000 for i in range(1, n + 1): print('==================') print('Iteration %s of %s' % (i, n)) print('==================') X, Y, coef_ = make_regression(n_samples=(i * step) + n_test_samples, n_features=n_features, noise=0.1, n_informative=n_informative, coef=True) X_test = X[-n_test_samples:] Y_test = Y[-n_test_samples:] X = X[:(i * step)] Y = Y[:(i * step)] print("benchmarking scikit-learn: ") scikit_results.append(bench(ScikitLasso, X, Y, X_test, Y_test, coef_)) print("benchmarking glmnet: ") glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_)) plt.clf() xx = range(0, n * step, step)