def test_randomized_lasso(): """Check randomized lasso""" scaling = 0.3 selection_threshold = 0.5 # or with 1 alpha clf = RandomizedLasso(verbose=False, alpha=1, random_state=42, scaling=scaling, selection_threshold=selection_threshold) feature_scores = clf.fit(X, y).scores_ assert_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:]) # or with many alphas clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42, scaling=scaling, selection_threshold=selection_threshold) feature_scores = clf.fit(X, y).scores_ assert_equal(clf.all_scores_.shape, (X.shape[1], 2)) assert_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:]) X_r = clf.transform(X) X_full = clf.inverse_transform(X_r) assert_equal(X_r.shape[1], np.sum(feature_scores > selection_threshold)) assert_equal(X_full.shape, X.shape) clf = RandomizedLasso(verbose=False, alpha='aic', random_state=42, scaling=scaling) feature_scores = clf.fit(X, y).scores_ assert_equal(feature_scores, X.shape[1] * [1.]) clf = RandomizedLasso(verbose=False, scaling=-0.1) assert_raises(ValueError, clf.fit, X, y) clf = RandomizedLasso(verbose=False, scaling=1.1) assert_raises(ValueError, clf.fit, X, y)
def set_selection_method(config):
    """
    Given the configuration settings, this function instantiates the
    configured feature selection method, initialized with the preset
    parameters.

    TODO: implement the same method using reflection (load the class
    dynamically at runtime)

    @param config: the configuration file object loaded using yaml.load()
    @return: an object that implements the TransformerMixin interface
    (with fit(), fit_transform() and transform() methods).
    """
    transformer = None

    selection_cfg = config.get("feature_selection", None)
    if selection_cfg:
        method_name = selection_cfg.get("method", None)

        # checks for RandomizedLasso
        if method_name == "RandomizedLasso":
            p = selection_cfg.get("parameters", None)
            if p:
                transformer = \
                    RandomizedLasso(alpha=p.get("alpha", "aic"),
                                    scaling=p.get("scaling", .5),
                                    sample_fraction=p.get('sample_fraction', .75),
                                    n_resampling=p.get('n_resampling', 200),
                                    selection_threshold=p.get('selection_threshold', .25),
                                    fit_intercept=p.get('fit_intercept', True),
                                    # TODO: set verbosity according to global level
                                    verbose=True,
                                    normalize=p.get('normalize', True),
                                    max_iter=p.get('max_iter', 500),
                                    n_jobs=p.get('n_jobs', 1))
            else:
                transformer = RandomizedLasso()

        # checks for ExtraTreesClassifier
        elif method_name == "ExtraTreesClassifier":
            p = selection_cfg.get("parameters", None)
            if p:
                transformer = \
                    ExtraTreesClassifier(n_estimators=p.get('n_estimators', 10),
                                         max_depth=p.get('max_depth', None),
                                         min_samples_split=p.get('min_samples_split', 1),
                                         min_samples_leaf=p.get('min_samples_leaf', 1),
                                         min_density=p.get('min_density', 1),
                                         max_features=p.get('max_features', 'auto'),
                                         bootstrap=p.get('bootstrap', False),
                                         compute_importances=p.get('compute_importances', True),
                                         n_jobs=p.get('n_jobs', 1),
                                         random_state=p.get('random_state', None),
                                         # TODO: set verbosity according to global level
                                         verbose=True)
            else:
                transformer = ExtraTreesClassifier()

    return transformer
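# A minimal usage sketch for set_selection_method(), assuming a YAML config of
# the shape the function expects; the file name "config.yaml" and the
# parameter values are hypothetical, and X/y stand for the usual feature
# matrix and labels.
#
#     feature_selection:
#         method: RandomizedLasso
#         parameters:
#             alpha: aic
#             scaling: 0.5
import yaml

with open("config.yaml") as f:  # hypothetical config file
    config = yaml.safe_load(f)

transformer = set_selection_method(config)
if transformer is not None:
    X_reduced = transformer.fit_transform(X, y)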
def run_feature_selection(self, data, labels,
                          feature_selector=None,
                          feature_selection_params=None,
                          feature_selection_threshold=.25,
                          plot_filename="./featureselection.pdf"):
    # avoid a mutable default argument: the setdefault() calls below would
    # otherwise mutate a dict shared across calls
    p = feature_selection_params if feature_selection_params is not None else {}
    transformer = None
    attributes = {}

    if feature_selector == "RandomizedLasso":
        transformer = RandomizedLasso(alpha=p.setdefault("alpha", "aic"),
                                      scaling=p.setdefault("scaling", .5),
                                      sample_fraction=p.setdefault('sample_fraction', .75),
                                      n_resampling=p.setdefault('n_resampling', 200),
                                      selection_threshold=feature_selection_threshold,
                                      fit_intercept=p.setdefault('fit_intercept', True),
                                      # TODO: set verbosity according to global level
                                      verbose=True,
                                      normalize=p.setdefault('normalize', True),
                                      max_iter=p.setdefault('max_iter', 500),
                                      n_jobs=p.setdefault('n_jobs', 1))
    elif feature_selector == "ExtraTreesClassifier":
        transformer = ExtraTreesClassifier(n_estimators=p.get('n_estimators', 10),
                                           max_depth=p.get('max_depth', None),
                                           min_samples_split=p.get('min_samples_split', 1),
                                           min_samples_leaf=p.get('min_samples_leaf', 1),
                                           min_density=p.get('min_density', 1),
                                           max_features=p.get('max_features', 'auto'),
                                           bootstrap=p.get('bootstrap', False),
                                           compute_importances=p.get('compute_importances', True),
                                           n_jobs=p.get('n_jobs', 1),
                                           random_state=p.get('random_state', None),
                                           # TODO: set verbosity according to global level
                                           verbose=True)
    elif feature_selector == "GP":
        # TODO: add here Gaussian Processes
        transformer = None
    elif feature_selector == "RFECV_SVC":
        return self._fs_rfecv(data, labels, plot_filename,
                              sample=p.get("sample", 0.05))
    elif feature_selector == "RFE_SVC":
        return self._fs_rfe(data, labels, plot_filename,
                            n_features=p.get("n_features", 10))

    if transformer:
        log.info("scikit: Running feature selection {}".format(feature_selector))
        log.info("scikit: data dimensions before fit_transform(): {}".format(data.shape))
        log.info("scikit: labels dimensions before fit_transform(): {}".format(labels.shape))
        data = transformer.fit_transform(data, labels)
        log.info("scikit: Dimensions after fit_transform(): %s,%s" % data.shape)

    return transformer, data, attributes
def test_randomized_lasso_precompute():
    # Check randomized lasso for different values of precompute
    n_resampling = 20
    alpha = 1
    random_state = 42

    G = np.dot(X.T, X)  # precomputed Gram matrix
    clf = RandomizedLasso(alpha=alpha, random_state=random_state,
                          precompute=G, n_resampling=n_resampling)
    feature_scores_1 = clf.fit(X, y).scores_

    for precompute in [True, False, None, 'auto']:
        clf = RandomizedLasso(alpha=alpha, random_state=random_state,
                              precompute=precompute,
                              n_resampling=n_resampling)
        feature_scores_2 = clf.fit(X, y).scores_
        assert_array_equal(feature_scores_1, feature_scores_2)
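# The test above hinges on a precomputed Gram matrix X^T X giving the same
# result as letting the estimator compute it. A standalone sketch of that
# equivalence on a plain Lasso (synthetic data, not part of the test suite;
# fit_intercept=False so the Gram matrix is used as-is):
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(0)
X_demo = rng.randn(50, 5)
y_demo = X_demo[:, 0] + 0.01 * rng.randn(50)

G_demo = np.dot(X_demo.T, X_demo)
coef_gram = Lasso(alpha=0.1, precompute=G_demo,
                  fit_intercept=False).fit(X_demo, y_demo).coef_
coef_plain = Lasso(alpha=0.1, fit_intercept=False).fit(X_demo, y_demo).coef_
assert np.allclose(coef_gram, coef_plain)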
def test_randomized_lasso_error_memory():
    scaling = 0.3
    selection_threshold = 0.5
    tempdir = 5  # deliberately invalid: not a string or a joblib.Memory
    clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42,
                          scaling=scaling,
                          selection_threshold=selection_threshold,
                          memory=tempdir)
    assert_raises_regex(
        ValueError,
        "'memory' should either be a string or"
        " a sklearn.externals.joblib.Memory instance",
        clf.fit, X, y)
def test_randomized_lasso():
    # Check randomized lasso
    scaling = 0.3
    selection_threshold = 0.5
    n_resampling = 20

    # or with 1 alpha
    clf = RandomizedLasso(verbose=False, alpha=1, random_state=42,
                          scaling=scaling, n_resampling=n_resampling,
                          selection_threshold=selection_threshold)
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:])

    # or with many alphas
    clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42,
                          scaling=scaling, n_resampling=n_resampling,
                          selection_threshold=selection_threshold)
    feature_scores = clf.fit(X, y).scores_
    assert_equal(clf.all_scores_.shape, (X.shape[1], 2))
    assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:])

    # test caching; create the temp dir before the try block so that
    # shutil.rmtree() in the finally clause always has a valid path
    tempdir = mkdtemp()
    try:
        clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42,
                              scaling=scaling,
                              selection_threshold=selection_threshold,
                              memory=tempdir)
        feature_scores = clf.fit(X, y).scores_
        assert_equal(clf.all_scores_.shape, (X.shape[1], 2))
        assert_array_equal(np.argsort(F)[-3:],
                           np.argsort(feature_scores)[-3:])
    finally:
        shutil.rmtree(tempdir)

    X_r = clf.transform(X)
    X_full = clf.inverse_transform(X_r)
    assert_equal(X_r.shape[1], np.sum(feature_scores > selection_threshold))
    assert_equal(X_full.shape, X.shape)

    clf = RandomizedLasso(verbose=False, alpha='aic', random_state=42,
                          scaling=scaling, n_resampling=100)
    feature_scores = clf.fit(X, y).scores_
    assert_allclose(feature_scores, [1., 1., 1., 0.225, 1.], rtol=0.2)

    clf = RandomizedLasso(verbose=False, scaling=-0.1)
    assert_raises(ValueError, clf.fit, X, y)

    clf = RandomizedLasso(verbose=False, scaling=1.1)
    assert_raises(ValueError, clf.fit, X, y)
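# A sketch of the caching mechanism the test above exercises: `memory` may be
# a cache directory path (as in the test) or, assuming the older scikit-learn
# vintage these tests target (where joblib was vendored), a Memory object.
from tempfile import mkdtemp
import shutil

from sklearn.externals.joblib import Memory  # vendored joblib in older sklearn

cachedir = mkdtemp()
try:
    memory = Memory(cachedir=cachedir, verbose=0)
    clf = RandomizedLasso(alpha=[1, 0.8], random_state=42, memory=memory)
    clf.fit(X, y)  # the per-resampling fits are memoized on disk
finally:
    shutil.rmtree(cachedir)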
def test_randomized_lasso():
    # Check randomized lasso
    scaling = 0.3
    selection_threshold = 0.5

    # or with 1 alpha
    clf = RandomizedLasso(verbose=False, alpha=1, random_state=42,
                          scaling=scaling,
                          selection_threshold=selection_threshold)
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:])

    # or with many alphas
    clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42,
                          scaling=scaling,
                          selection_threshold=selection_threshold)
    feature_scores = clf.fit(X, y).scores_
    assert_equal(clf.all_scores_.shape, (X.shape[1], 2))
    assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:])

    X_r = clf.transform(X)
    X_full = clf.inverse_transform(X_r)
    assert_equal(X_r.shape[1], np.sum(feature_scores > selection_threshold))
    assert_equal(X_full.shape, X.shape)

    clf = RandomizedLasso(verbose=False, alpha='aic', random_state=42,
                          scaling=scaling)
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(feature_scores, X.shape[1] * [1.])

    clf = RandomizedLasso(verbose=False, scaling=-0.1)
    assert_raises(ValueError, clf.fit, X, y)

    clf = RandomizedLasso(verbose=False, scaling=1.1)
    assert_raises(ValueError, clf.fit, X, y)
def get_lasso_feature_scores(x, y, mode=CLASSIFICATION, scaling=0.5,
                             sample_fraction=0.75, n_resampling=200,
                             random_state=None):
    '''
    Calculate feature scores using a randomized lasso (regression) or
    randomized logistic regression (classification). This is also known
    as stability selection.

    see http://scikit-learn.org/stable/modules/feature_selection.html for
    details.

    Parameters
    ----------
    x : structured array
    y : 1D nd.array
    mode : {CLASSIFICATION, REGRESSION}
    scaling : float, optional
              scaling parameter, should be between 0 and 1
    sample_fraction : float, optional
                      the fraction of samples to be used in each randomized
                      dataset
    n_resampling : int, optional
                   the number of times the model is trained on a random
                   subset of the data
    random_state : int, optional
                   if it is an int, it specifies the seed to use, defaults
                   to None.

    Returns
    -------
    pandas DataFrame
        (uncertainty, feature score) pairs, sorted in descending order of
        feature score

    '''
    uncs = recfunctions.get_names(x.dtype)

    x = _prepare_experiments(x)

    if mode == CLASSIFICATION:
        lfs = RandomizedLogisticRegression(scaling=scaling,
                                           sample_fraction=sample_fraction,
                                           n_resampling=n_resampling,
                                           random_state=random_state)
        lfs.fit(x, y)
    elif mode == REGRESSION:
        # we use LassoLarsCV to determine alpha, see
        # http://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_recovery.html
        lars_cv = LassoLarsCV(cv=6).fit(x, y)
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)

        # fit the randomized lasso
        lfs = RandomizedLasso(alpha=alphas, scaling=scaling,
                              sample_fraction=sample_fraction,
                              n_resampling=n_resampling,
                              random_state=random_state)
        lfs.fit(x, y)
    else:
        raise ValueError('{} invalid value for mode'.format(mode))

    importances = lfs.scores_
    importances = zip(uncs, importances)
    importances = list(importances)
    importances.sort(key=itemgetter(1), reverse=True)
    importances = pd.DataFrame(importances)

    return importances
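# A rough sketch of what the stability selection inside RandomizedLasso boils
# down to: fit a Lasso on many random subsamples with randomly down-weighted
# features and record how often each feature gets a nonzero coefficient. The
# data and the alpha value here are synthetic stand-ins.
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(42)
x_demo = rng.randn(100, 10)
y_demo = x_demo[:, 0] + 2 * x_demo[:, 1] + 0.1 * rng.randn(100)

n_resampling, sample_fraction, scaling = 200, 0.75, 0.5
scores = np.zeros(x_demo.shape[1])
for _ in range(n_resampling):
    idx = rng.choice(len(x_demo), int(sample_fraction * len(x_demo)),
                     replace=False)
    # randomly rescale each feature by 1 or (1 - scaling), as in the
    # randomized lasso
    weights = 1.0 - scaling * rng.randint(0, 2, size=x_demo.shape[1])
    coef = Lasso(alpha=0.05).fit(x_demo[idx] * weights, y_demo[idx]).coef_
    scores += coef != 0
scores /= n_resampling  # per-feature selection frequency, akin to lfs.scores_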
    'PCA': PCA(),
    'PLSCanonical': PLSCanonical(),
    'PLSRegression': PLSRegression(),
    'PLSSVD': PLSSVD(),
    'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
    'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
    'Perceptron': Perceptron(),
    'ProjectedGradientNMF': ProjectedGradientNMF(),
    'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
    'RANSACRegressor': RANSACRegressor(),
    'RBFSampler': RBFSampler(),
    'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
    'RadiusNeighborsRegressor': RadiusNeighborsRegressor(),
    'RandomForestClassifier': RandomForestClassifier(),
    'RandomForestRegressor': RandomForestRegressor(),
    'RandomizedLasso': RandomizedLasso(),
    'RandomizedLogisticRegression': RandomizedLogisticRegression(),
    'RandomizedPCA': RandomizedPCA(),
    'Ridge': Ridge(),
    'RidgeCV': RidgeCV(),
    'RidgeClassifier': RidgeClassifier(),
    'RidgeClassifierCV': RidgeClassifierCV(),
    'RobustScaler': RobustScaler(),
    'SGDClassifier': SGDClassifier(),
    'SGDRegressor': SGDRegressor(),
    'SVC': SVC(),
    'SVR': SVR(),
    'SelectFdr': SelectFdr(),
    'SelectFpr': SelectFpr(),
    'SelectFwe': SelectFwe(),
    'SelectKBest': SelectKBest(),
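# Hedged usage sketch for the registry fragment above: assuming the complete
# mapping is bound to a name such as ESTIMATORS (hypothetical), an estimator
# can be resolved from its string name and cloned so each caller receives a
# fresh, unfitted instance.
from sklearn.base import clone

estimator = clone(ESTIMATORS['RandomizedLasso'])  # fresh RandomizedLasso()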
def get_lasso_feature_scores(results, classify, scaling=0.5,
                             sample_fraction=0.75, n_resampling=200,
                             random_state=None):
    '''
    Calculate feature scores using a randomized lasso (regression) or
    randomized logistic regression (classification). This is also known
    as stability selection.

    see http://scikit-learn.org/stable/modules/feature_selection.html for
    details.

    Parameters
    ----------
    results : tuple
    classify : callable or str
               a classify function or variable analogous to PRIM
    scaling : float, optional
              scaling parameter, should be between 0 and 1
    sample_fraction : float, optional
                      the fraction of samples to be used in each randomized
                      dataset
    n_resampling : int, optional
                   the number of times the model is trained on a random
                   subset of the data
    random_state : int, optional
                   if it is an int, it specifies the seed to use, defaults
                   to None.

    Returns
    -------
    list of tuples
        (uncertainty, feature score) tuples sorted in descending order of
        feature score

    '''
    experiments, outcomes = results
    uncs = recfunctions.get_names(experiments.dtype)

    x = _prepare_experiments(experiments)
    y, categorical = _prepare_outcomes(outcomes, classify)

    if categorical:
        lfs = RandomizedLogisticRegression(scaling=scaling,
                                           sample_fraction=sample_fraction,
                                           n_resampling=n_resampling,
                                           random_state=random_state)
        lfs.fit(x, y)
    else:
        # we use LassoLarsCV to determine alpha, see
        # http://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_recovery.html
        lars_cv = LassoLarsCV(cv=6).fit(x, y)
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)

        # fit the randomized lasso
        lfs = RandomizedLasso(alpha=alphas, scaling=scaling,
                              sample_fraction=sample_fraction,
                              n_resampling=n_resampling,
                              random_state=random_state)
        lfs.fit(x, y)

    importances = lfs.scores_
    importances = zip(uncs, importances)
    importances = list(importances)
    importances.sort(key=itemgetter(1), reverse=True)

    return importances
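# Hedged usage sketch: `results` is the (experiments, outcomes) tuple returned
# by the EMA Workbench's perform_experiments(); `model` and the outcome name
# 'max_P' are hypothetical stand-ins, and the import path may vary by version.
from ema_workbench import perform_experiments

results = perform_experiments(model, 1000)  # `model` defined elsewhere (assumption)
scores = get_lasso_feature_scores(results, classify='max_P')
print(scores[:10])  # the ten highest-scoring uncertainties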