def test_f_regression():
    """
    Check that the F test gives sensible scores and p-values on a
    simple simulated regression problem.

    The data has 20 features, 5 of them informative, generated with
    shuffle=False; the test expects the first five p-values to be
    significant and the remaining ones not to be extremely small.
    """
    data, target = make_regression(n_samples=200, n_features=20,
                                   n_informative=5, shuffle=False,
                                   random_state=0)
    f_scores, p_values = f_regression(data, target)

    # F statistics are strictly positive, p-values lie strictly in (0, 1).
    assert (f_scores > 0).all()
    assert (p_values > 0).all()
    assert (p_values < 1).all()

    # Leading five features must be significant, the rest must not be
    # overwhelmingly so.
    leading = p_values[:5]
    assert (leading < 0.05).all()
    trailing = p_values[5:]
    assert (trailing > 1.e-4).all()
def test_select_percentile_regression_full():
    """
    Check that percentile-based univariate feature selection keeps
    every feature when the full 100 percent is requested.
    """
    n_features = 20
    data, target = make_regression(n_samples=200, n_features=n_features,
                                   n_informative=5, shuffle=False,
                                   random_state=0)

    # Dedicated selector.
    selector = SelectPercentile(f_regression, percentile=100)
    fitted = selector.fit(data, target)
    reduced = fitted.transform(data)

    # The generic selector configured the same way must agree with it.
    generic = GenericUnivariateSelect(f_regression, mode='percentile',
                                      param=100)
    reduced_generic = generic.fit(data, target).transform(data)
    assert_array_equal(reduced, reduced_generic)

    # Every feature is expected to be flagged as selected.
    mask = selector.get_support()
    assert_array_equal(mask, np.ones(n_features))
def test_select_fdr_regression():
    """
    Check that univariate feature selection with the fdr heuristic
    picks the expected features in a simple regression problem.
    """
    data, target = make_regression(n_samples=200, n_features=20,
                                   n_informative=5, shuffle=False,
                                   random_state=0)

    # Dedicated selector.
    selector = SelectFdr(f_regression, alpha=0.01)
    fitted = selector.fit(data, target)
    reduced = fitted.transform(data)

    # The generic selector in 'fdr' mode must produce the same output.
    generic = GenericUnivariateSelect(f_regression, mode='fdr', param=0.01)
    reduced_generic = generic.fit(data, target).transform(data)
    assert_array_equal(reduced, reduced_generic)

    # Only the first five features are expected to be selected.
    mask = selector.get_support()
    expected = np.zeros(20)
    expected[:5] = 1
    assert_array_equal(mask, expected)
def test_select_percentile_regression():
    """
    Check that univariate feature selection with the percentile
    heuristic picks the expected features in a simple regression
    problem, and that inverse_transform zero-fills the dropped columns.
    """
    data, target = make_regression(n_samples=200, n_features=20,
                                   n_informative=5, shuffle=False,
                                   random_state=0)

    # Dedicated selector keeping the top 25 percent of the features.
    selector = SelectPercentile(f_regression, percentile=25)
    fitted = selector.fit(data, target)
    reduced = fitted.transform(data)

    # The generic selector configured the same way must agree with it.
    generic = GenericUnivariateSelect(f_regression, mode='percentile',
                                      param=25)
    reduced_generic = generic.fit(data, target).transform(data)
    assert_array_equal(reduced, reduced_generic)

    # Only the first five features are expected to be selected.
    support = selector.get_support()
    expected = np.zeros(20)
    expected[:5] = 1
    assert_array_equal(support, expected)

    # Mapping the reduced data back must reproduce the input with the
    # unselected columns set to zero.
    zero_filled = data.copy()
    dropped = np.logical_not(support)
    zero_filled[:, dropped] = 0
    assert_array_equal(zero_filled, selector.inverse_transform(reduced))
def compute_bench(alpha, n_samples, n_features, precompute):
    """Time Lasso and LassoLars fits over a grid of problem sizes.

    For every (ns, nf) pair taken from ``n_samples`` x ``n_features`` a
    regression problem is generated with ``make_regression``, its columns
    are scaled to unit Euclidean norm, and both estimators are fitted
    while the wall-clock time of each ``fit`` call is recorded.

    Parameters
    ----------
    alpha : value forwarded as ``alpha`` to both ``Lasso`` and ``LassoLars``.
    n_samples : sized iterable of sample counts to benchmark.
    n_features : sized iterable of feature counts to benchmark.
    precompute : value forwarded as ``precompute`` to both ``fit`` calls.

    Returns
    -------
    (lasso_results, lars_lasso_results) : tuple of two lists
        Elapsed seconds of each fit, one entry per (ns, nf) pair, in
        iteration order (``n_features`` varies fastest).
    """
    lasso_results = []
    lars_lasso_results = []

    # The nested loops below run once per (ns, nf) pair, so the total shown
    # in the progress banner is the product of the two lengths.
    n_iterations = len(n_samples) * len(n_features)

    it = 0
    for ns in n_samples:
        for nf in n_features:
            it += 1
            print('==================')
            print('Iteration %s of %s' % (it, n_iterations))
            print('==================')

            # One tenth of the features (rounded down) are informative.
            n_informative = nf // 10
            X, Y, coef_ = make_regression(n_samples=ns, n_features=nf,
                                          n_informative=n_informative,
                                          noise=0.1, coef=True)

            X /= np.sqrt(np.sum(X ** 2, axis=0))  # Normalize data

            # Collect garbage before starting the clock so that a
            # collection is less likely to land inside the timed region.
            gc.collect()
            print("- benching Lasso")
            clf = Lasso(alpha=alpha, fit_intercept=False)
            tstart = time()
            clf.fit(X, Y, precompute=precompute)
            lasso_results.append(time() - tstart)

            gc.collect()
            print("- benching LassoLars")
            clf = LassoLars(alpha=alpha, fit_intercept=False)
            tstart = time()
            clf.fit(X, Y, normalize=False, precompute=precompute)
            lars_lasso_results.append(time() - tstart)

    return lasso_results, lars_lasso_results
from scikits.learn.metrics import mean_square_error
from scikits.learn.datasets.samples_generator import make_regression

if __name__ == "__main__":
    # Five training-set sizes evenly spaced between 100 and 10000.
    # The builtin ``int`` is used as dtype: ``np.int`` was only an alias
    # for it and is no longer available in recent NumPy releases.
    list_n_samples = np.linspace(100, 10000, 5).astype(int)
    list_n_features = [10, 100, 1000]
    n_test = 1000   # extra samples generated on top of the training set
    noise = 0.1     # forwarded to make_regression
    alpha = 0.01

    # One result array per model, indexed by (training size, feature count).
    # NOTE(review): the trailing axis of length 2 presumably holds two
    # measurements per run (e.g. error and duration) -- confirm against the
    # code that fills these arrays.
    sgd_results = np.zeros((len(list_n_samples), len(list_n_features), 2))
    elnet_results = np.zeros((len(list_n_samples), len(list_n_features), 2))
    ridge_results = np.zeros((len(list_n_samples), len(list_n_features), 2))

    for i, n_train in enumerate(list_n_samples):
        for j, n_features in enumerate(list_n_features):
            # Generate training and test data in one call, then split:
            # the first n_train rows are for training, the rest for testing.
            X, y, coef = make_regression(
                n_samples=n_train + n_test, n_features=n_features,
                noise=noise, coef=True)

            X_train = X[:n_train]
            y_train = y[:n_train]
            X_test = X[n_train:]
            y_test = y[n_train:]

            print("=======================")
            print("Round %d %d" % (i, j))
            print("n_features: %s" % n_features)
            print("n_samples: %s" % n_train)

            # Shuffle data
            # NOTE(review): within the visible part of this script ``idx``
            # is created and the global NumPy RNG is seeded, but no shuffle
            # is applied yet.
            idx = np.arange(n_train)
            np.random.seed(13)
def compute_bench(samples_range, features_range):
    """Time lars_path and lasso_path, each with and without a Gram matrix.

    For every (n_samples, n_features) pair taken from ``samples_range`` x
    ``features_range`` a regression problem is generated with
    ``make_regression`` and four solver runs are timed.

    Parameters
    ----------
    samples_range : sized iterable of sample counts to benchmark.
    features_range : sized iterable of feature counts to benchmark.

    Returns
    -------
    results : defaultdict(list)
        Maps each benchmark label ('lars_path (with Gram)',
        'lars_path (without Gram)', 'lasso_path (with Gram)',
        'lasso_path (without Gram)') to the list of elapsed seconds, one
        entry per (n_samples, n_features) pair, in iteration order.
    """
    it = 0
    results = defaultdict(list)

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            dataset_kwargs = {
                'n_samples': n_samples,
                'n_features': n_features,
                # Floor division keeps both counts integral regardless of
                # the division semantics in effect.
                'n_informative': n_features // 10,
                'effective_rank': min(n_samples, n_features) // 10,
                #'effective_rank': None,
                'bias': 0.0,
            }
            print("n_samples: %d" % n_samples)
            print("n_features: %d" % n_features)
            X, y = make_regression(**dataset_kwargs)

            # Each label is written without a newline and flushed so it is
            # visible while the solver runs; the timing completes the line.
            gc.collect()
            sys.stdout.write("benching lars_path (with Gram): ")
            sys.stdout.flush()
            tstart = time()
            # The Gram matrix and X^T y are computed inside the timed
            # region, so their cost is part of this measurement.
            G = np.dot(X.T, X)  # precomputed Gram matrix
            Xy = np.dot(X.T, y)
            lars_path(X, y, Xy=Xy, Gram=G, method='lasso')
            delta = time() - tstart
            print("%0.3fs" % delta)
            results['lars_path (with Gram)'].append(delta)

            gc.collect()
            sys.stdout.write("benching lars_path (without Gram): ")
            sys.stdout.flush()
            tstart = time()
            lars_path(X, y, method='lasso')
            delta = time() - tstart
            print("%0.3fs" % delta)
            results['lars_path (without Gram)'].append(delta)

            gc.collect()
            sys.stdout.write("benching lasso_path (with Gram): ")
            sys.stdout.flush()
            tstart = time()
            lasso_path(X, y, precompute=True)
            delta = time() - tstart
            print("%0.3fs" % delta)
            results['lasso_path (with Gram)'].append(delta)

            gc.collect()
            sys.stdout.write("benching lasso_path (without Gram): ")
            sys.stdout.flush()
            tstart = time()
            lasso_path(X, y, precompute=False)
            delta = time() - tstart
            print("%0.3fs" % delta)
            results['lasso_path (without Gram)'].append(delta)

    return results