Example #1
def test_randomized_lasso():
    """Check randomized lasso"""
    scaling = 0.3
    selection_threshold = 0.5

    # or with 1 alpha
    clf = RandomizedLasso(verbose=False, alpha=1, random_state=42,
                          scaling=scaling,
                          selection_threshold=selection_threshold)
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:])

    # or with many alphas
    clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42,
                          scaling=scaling,
                          selection_threshold=selection_threshold)
    feature_scores = clf.fit(X, y).scores_
    assert_equal(clf.all_scores_.shape, (X.shape[1], 2))
    assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:])

    X_r = clf.transform(X)
    X_full = clf.inverse_transform(X_r)
    assert_equal(X_r.shape[1], np.sum(feature_scores > selection_threshold))
    assert_equal(X_full.shape, X.shape)

    clf = RandomizedLasso(verbose=False, alpha='aic', random_state=42,
                          scaling=scaling)
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(feature_scores, X.shape[1] * [1.])

    clf = RandomizedLasso(verbose=False, scaling=-0.1)
    assert_raises(ValueError, clf.fit, X, y)

    clf = RandomizedLasso(verbose=False, scaling=1.1)
    assert_raises(ValueError, clf.fit, X, y)
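These tests assume module-level fixtures X, y, and F. A minimal sketch of what such a setup might look like, patterned on scikit-learn's own test_randomized_l1.py (the exact column subset kept is an assumption); in the scikit-learn versions that still ship it (< 0.21), RandomizedLasso is imported from sklearn.linear_model:

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler

# Hedged fixture sketch -- not part of the example above.
diabetes = load_diabetes()
X = StandardScaler().fit_transform(diabetes.data)
X = X[:, [2, 3, 6, 7, 8]]   # keep a few informative columns (assumption)
y = diabetes.target
F, _ = f_regression(X, y)   # univariate F-scores; the tests compare rankings against these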
Example #2
def set_selection_method(config):
    """
    Given the configuration settings, this function instantiates the configured
    feature selection method initialized with the preset parameters.
    
    TODO: implement the same method using reflection (load the class dynamically
    at runtime)
    
    @param config: the configuration file object loaded using yaml.load()
    @return: an object implementing the TransformerMixin interface (with fit(),
    fit_transform(), and transform() methods).
    """
    transformer = None

    selection_cfg = config.get("feature_selection", None)
    if selection_cfg:
        method_name = selection_cfg.get("method", None)

        # checks for RandomizedLasso
        if method_name == "RandomizedLasso":
            p = selection_cfg.get("parameters", None)
            if p:
                transformer = RandomizedLasso(
                    alpha=p.get("alpha", "aic"),
                    scaling=p.get("scaling", .5),
                    sample_fraction=p.get('sample_fraction', .75),
                    n_resampling=p.get('n_resampling', 200),
                    selection_threshold=p.get('selection_threshold', .25),
                    fit_intercept=p.get('fit_intercept', True),
                    # TODO: set verbosity according to global level
                    verbose=True,
                    normalize=p.get('normalize', True),
                    max_iter=p.get('max_iter', 500),
                    n_jobs=p.get('n_jobs', 1))
            else:
                transformer = RandomizedLasso()

        # checks for ExtraTreesClassifier
        elif method_name == "ExtraTreesClassifier":
            p = selection_cfg.get("parameters", None)
            if p:
                transformer = ExtraTreesClassifier(
                    n_estimators=p.get('n_estimators', 10),
                    max_depth=p.get('max_depth', None),
                    min_samples_split=p.get('min_samples_split', 1),
                    min_samples_leaf=p.get('min_samples_leaf', 1),
                    min_density=p.get('min_density', 1),
                    max_features=p.get('max_features', 'auto'),
                    bootstrap=p.get('bootstrap', False),
                    compute_importances=p.get('compute_importances', True),
                    n_jobs=p.get('n_jobs', 1),
                    random_state=p.get('random_state', None),
                    # TODO: set verbosity according to global level
                    verbose=True)
            else:
                transformer = ExtraTreesClassifier()

    return transformer
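For context, a hypothetical configuration and call exercising the RandomizedLasso branch above. The YAML keys mirror the .get() lookups in the code; X and y are assumed to exist, and yaml.safe_load stands in for the yaml.load() mentioned in the docstring:

import yaml

config = yaml.safe_load("""
feature_selection:
    method: RandomizedLasso
    parameters:
        alpha: aic
        scaling: 0.5
        n_resampling: 200
""")
transformer = set_selection_method(config)
# transformer behaves like any scikit-learn transformer:
# X_selected = transformer.fit_transform(X, y)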
Example #3
def run_feature_selection(self, data, labels, feature_selector=None,
                          feature_selection_params=None,
                          feature_selection_threshold=.25,
                          plot_filename="./featureselection.pdf"):
    # Guard against the mutable-default-argument pitfall: setdefault() below
    # mutates the dict, which must not be shared across calls.
    p = feature_selection_params if feature_selection_params is not None else {}
    transformer = None
    attributes = {}
    if feature_selector == "RandomizedLasso":
        transformer = RandomizedLasso(
            alpha=p.setdefault("alpha", "aic"),
            scaling=p.setdefault("scaling", .5),
            sample_fraction=p.setdefault('sample_fraction', .75),
            n_resampling=p.setdefault('n_resampling', 200),
            selection_threshold=feature_selection_threshold,
            fit_intercept=p.setdefault('fit_intercept', True),
            # TODO: set verbosity according to global level
            verbose=True,
            normalize=p.setdefault('normalize', True),
            max_iter=p.setdefault('max_iter', 500),
            n_jobs=p.setdefault('n_jobs', 1))
    elif feature_selector == "ExtraTreesClassifier":
        transformer = ExtraTreesClassifier(
            n_estimators=p.get('n_estimators', 10),
            max_depth=p.get('max_depth', None),
            min_samples_split=p.get('min_samples_split', 1),
            min_samples_leaf=p.get('min_samples_leaf', 1),
            min_density=p.get('min_density', 1),
            max_features=p.get('max_features', 'auto'),
            bootstrap=p.get('bootstrap', False),
            compute_importances=p.get('compute_importances', True),
            n_jobs=p.get('n_jobs', 1),
            random_state=p.get('random_state', None),
            # TODO: set verbosity according to global level
            verbose=True)
    elif feature_selector == "GP":
        # TODO: add here Gaussian Processes
        transformer = None

    elif feature_selector == "RFECV_SVC":
        return self._fs_rfecv(data, labels, plot_filename,
                              sample=p.get("sample", 0.05))

    elif feature_selector == "RFE_SVC":
        return self._fs_rfe(data, labels, plot_filename,
                            n_features=p.get("n_features", 10))

    if transformer:
        log.info("scikit: Running feature selection {}".format(feature_selector))
        log.info("scikit: data dimensions before fit_transform(): {}".format(data.shape))
        log.info("scikit: labels dimensions before fit_transform(): {}".format(labels.shape))
        data = transformer.fit_transform(data, labels)
        log.info("scikit: Dimensions after fit_transform(): %s,%s" % data.shape)

    return transformer, data, attributes
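A hypothetical call, assuming obj is an instance of the class that defines this method (the names below are illustrative, not from the source). Because the method fills feature_selection_params in place via setdefault(), the effective defaults can be read back from the dict afterwards:

params = {"scaling": 0.3, "n_resampling": 100}
transformer, reduced, attrs = obj.run_feature_selection(
    data, labels,
    feature_selector="RandomizedLasso",
    feature_selection_params=params,
    feature_selection_threshold=0.25)
print(params)  # setdefault() has filled in the remaining defaults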
Example #4
def test_randomized_lasso_precompute():
    # Check randomized lasso for different values of precompute
    n_resampling = 20
    alpha = 1
    random_state = 42

    G = np.dot(X.T, X)

    clf = RandomizedLasso(alpha=alpha, random_state=random_state,
                          precompute=G, n_resampling=n_resampling)
    feature_scores_1 = clf.fit(X, y).scores_

    for precompute in [True, False, None, 'auto']:
        clf = RandomizedLasso(alpha=alpha, random_state=random_state,
                              precompute=precompute, n_resampling=n_resampling)
        feature_scores_2 = clf.fit(X, y).scores_
        assert_array_equal(feature_scores_1, feature_scores_2)
Example #5
def test_randomized_lasso_error_memory():
    scaling = 0.3
    selection_threshold = 0.5
    tempdir = 5
    clf = RandomizedLasso(verbose=False,
                          alpha=[1, 0.8],
                          random_state=42,
                          scaling=scaling,
                          selection_threshold=selection_threshold,
                          memory=tempdir)
    assert_raises_regex(
        ValueError, "'memory' should either be a string or"
        " a sklearn.externals.joblib.Memory instance", clf.fit, X, y)
Example #6
def test_randomized_lasso_precompute():
    # Check randomized lasso for different values of precompute
    n_resampling = 20
    alpha = 1
    random_state = 42

    G = np.dot(X.T, X)

    clf = RandomizedLasso(alpha=alpha,
                          random_state=random_state,
                          precompute=G,
                          n_resampling=n_resampling)
    feature_scores_1 = clf.fit(X, y).scores_

    for precompute in [True, False, None, 'auto']:
        clf = RandomizedLasso(alpha=alpha,
                              random_state=random_state,
                              precompute=precompute,
                              n_resampling=n_resampling)
        feature_scores_2 = clf.fit(X, y).scores_
        assert_array_equal(feature_scores_1, feature_scores_2)
Example #7
def test_randomized_lasso():
    # Check randomized lasso
    scaling = 0.3
    selection_threshold = 0.5
    n_resampling = 20

    # or with 1 alpha
    clf = RandomizedLasso(verbose=False,
                          alpha=1,
                          random_state=42,
                          scaling=scaling,
                          n_resampling=n_resampling,
                          selection_threshold=selection_threshold)
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:])

    # or with many alphas
    clf = RandomizedLasso(verbose=False,
                          alpha=[1, 0.8],
                          random_state=42,
                          scaling=scaling,
                          n_resampling=n_resampling,
                          selection_threshold=selection_threshold)
    feature_scores = clf.fit(X, y).scores_
    assert_equal(clf.all_scores_.shape, (X.shape[1], 2))
    assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:])
    # test caching
    tempdir = mkdtemp()
    try:
        clf = RandomizedLasso(verbose=False,
                              alpha=[1, 0.8],
                              random_state=42,
                              scaling=scaling,
                              selection_threshold=selection_threshold,
                              memory=tempdir)
        feature_scores = clf.fit(X, y).scores_
        assert_equal(clf.all_scores_.shape, (X.shape[1], 2))
        assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:])
    finally:
        shutil.rmtree(tempdir)

    X_r = clf.transform(X)
    X_full = clf.inverse_transform(X_r)
    assert_equal(X_r.shape[1], np.sum(feature_scores > selection_threshold))
    assert_equal(X_full.shape, X.shape)

    clf = RandomizedLasso(verbose=False,
                          alpha='aic',
                          random_state=42,
                          scaling=scaling,
                          n_resampling=100)
    feature_scores = clf.fit(X, y).scores_
    assert_allclose(feature_scores, [1., 1., 1., 0.225, 1.], rtol=0.2)

    clf = RandomizedLasso(verbose=False, scaling=-0.1)
    assert_raises(ValueError, clf.fit, X, y)

    clf = RandomizedLasso(verbose=False, scaling=1.1)
    assert_raises(ValueError, clf.fit, X, y)
Example #8
def test_randomized_lasso():
    # Check randomized lasso
    scaling = 0.3
    selection_threshold = 0.5

    # or with 1 alpha
    clf = RandomizedLasso(verbose=False,
                          alpha=1,
                          random_state=42,
                          scaling=scaling,
                          selection_threshold=selection_threshold)
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:])

    # or with many alphas
    clf = RandomizedLasso(verbose=False,
                          alpha=[1, 0.8],
                          random_state=42,
                          scaling=scaling,
                          selection_threshold=selection_threshold)
    feature_scores = clf.fit(X, y).scores_
    assert_equal(clf.all_scores_.shape, (X.shape[1], 2))
    assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:])

    X_r = clf.transform(X)
    X_full = clf.inverse_transform(X_r)
    assert_equal(X_r.shape[1], np.sum(feature_scores > selection_threshold))
    assert_equal(X_full.shape, X.shape)

    clf = RandomizedLasso(verbose=False,
                          alpha='aic',
                          random_state=42,
                          scaling=scaling)
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(feature_scores, X.shape[1] * [1.])

    clf = RandomizedLasso(verbose=False, scaling=-0.1)
    assert_raises(ValueError, clf.fit, X, y)

    clf = RandomizedLasso(verbose=False, scaling=1.1)
    assert_raises(ValueError, clf.fit, X, y)
Example #9
def test_randomized_lasso():
    # Check randomized lasso
    scaling = 0.3
    selection_threshold = 0.5
    n_resampling = 20

    # or with 1 alpha
    clf = RandomizedLasso(verbose=False, alpha=1, random_state=42,
                          scaling=scaling, n_resampling=n_resampling,
                          selection_threshold=selection_threshold)
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:])

    # or with many alphas
    clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42,
                          scaling=scaling, n_resampling=n_resampling,
                          selection_threshold=selection_threshold)
    feature_scores = clf.fit(X, y).scores_
    assert_equal(clf.all_scores_.shape, (X.shape[1], 2))
    assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:])
    # test caching
    tempdir = mkdtemp()
    try:
        clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42,
                              scaling=scaling,
                              selection_threshold=selection_threshold,
                              memory=tempdir)
        feature_scores = clf.fit(X, y).scores_
        assert_equal(clf.all_scores_.shape, (X.shape[1], 2))
        assert_array_equal(np.argsort(F)[-3:], np.argsort(feature_scores)[-3:])
    finally:
        shutil.rmtree(tempdir)

    X_r = clf.transform(X)
    X_full = clf.inverse_transform(X_r)
    assert_equal(X_r.shape[1], np.sum(feature_scores > selection_threshold))
    assert_equal(X_full.shape, X.shape)

    clf = RandomizedLasso(verbose=False, alpha='aic', random_state=42,
                          scaling=scaling, n_resampling=100)
    feature_scores = clf.fit(X, y).scores_
    assert_allclose(feature_scores, [1., 1., 1., 0.225, 1.], rtol=0.2)

    clf = RandomizedLasso(verbose=False, scaling=-0.1)
    assert_raises(ValueError, clf.fit, X, y)

    clf = RandomizedLasso(verbose=False, scaling=1.1)
    assert_raises(ValueError, clf.fit, X, y)
Example #10
def get_lasso_feature_scores(x, y, mode=CLASSIFICATION, scaling=0.5, 
                             sample_fraction=0.75, n_resampling=200,
                             random_state=None):
    '''
    Calculate feature scores using a randomized lasso (regression) or
    randomized logistic regression (classification). This is also known as
    stability selection.

    See http://scikit-learn.org/stable/modules/feature_selection.html for
    details.

    Parameters
    ----------
    x : structured array
    y : 1D nd.array
    mode : {CLASSIFICATION, REGRESSION}
    scaling : float, optional
              scaling parameter, should be between 0 and 1
    sample_fraction : float, optional
                      the fraction of samples to be used in each randomized
                      dataset
    n_resampling : int, optional
                   the number of times the model is trained on a random subset
                   of the data
    random_state : int, optional
                   if it is an int, it specifies the seed to use; defaults to
                   None

    Returns
    -------
    pandas DataFrame
        (uncertainty, feature score) tuples, sorted in descending order of
        feature score

    '''
    
    uncs = recfunctions.get_names(x.dtype)
    
    x = _prepare_experiments(x)
    
    if mode == CLASSIFICATION:
        lfs = RandomizedLogisticRegression(scaling=scaling,
                                           sample_fraction=sample_fraction,
                                           n_resampling=n_resampling,
                                           random_state=random_state)
        lfs.fit(x, y)
    elif mode == REGRESSION:
        # we use LassoLarsCV to determine alpha, see
        # http://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_recovery.html
        lars_cv = LassoLarsCV(cv=6).fit(x, y)
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)

        # fit the randomized lasso
        lfs = RandomizedLasso(alpha=alphas, scaling=scaling,
                              sample_fraction=sample_fraction,
                              n_resampling=n_resampling,
                              random_state=random_state)
        lfs.fit(x, y)
    else:
        raise ValueError('{} invalid value for mode'.format(mode))

    importances = list(zip(uncs, lfs.scores_))
    importances.sort(key=itemgetter(1), reverse=True)
    importances = pd.DataFrame(importances)

    return importances
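A practical note: RandomizedLasso and RandomizedLogisticRegression were deprecated in scikit-learn 0.19 and removed in 0.21, so functions like the one above only run on older versions. For newer versions, a minimal hand-rolled sketch of the same stability-selection idea (an assumption-laden stand-in, not scikit-learn's implementation):

import numpy as np
from sklearn.linear_model import Lasso

def stability_scores(X, y, alpha=1.0, n_resampling=200,
                     sample_fraction=0.75, scaling=0.5, random_state=0):
    rng = np.random.RandomState(random_state)
    n_samples, n_features = X.shape
    counts = np.zeros(n_features)
    for _ in range(n_resampling):
        # draw a random subsample of the rows...
        idx = rng.choice(n_samples, int(sample_fraction * n_samples),
                         replace=False)
        # ...and randomly rescale the columns (the "randomized" part)
        weights = 1.0 - scaling * rng.randint(0, 2, size=n_features)
        coef = Lasso(alpha=alpha).fit(X[idx] * weights, y[idx]).coef_
        counts += coef != 0
    # fraction of resamples in which each feature was selected
    return counts / n_resampling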
Example #11
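			# fragment of a larger name -> estimator registry; entries before
			# 'PCA' and after 'SelectKBest' are omitted in the source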
			'PCA':PCA(),
			'PLSCanonical':PLSCanonical(),
			'PLSRegression':PLSRegression(),
			'PLSSVD':PLSSVD(),
			'PassiveAggressiveClassifier':PassiveAggressiveClassifier(),
			'PassiveAggressiveRegressor':PassiveAggressiveRegressor(),
			'Perceptron':Perceptron(),
			'ProjectedGradientNMF':ProjectedGradientNMF(),
			'QuadraticDiscriminantAnalysis':QuadraticDiscriminantAnalysis(),
			'RANSACRegressor':RANSACRegressor(),
			'RBFSampler':RBFSampler(),
			'RadiusNeighborsClassifier':RadiusNeighborsClassifier(),
			'RadiusNeighborsRegressor':RadiusNeighborsRegressor(),
			'RandomForestClassifier':RandomForestClassifier(),
			'RandomForestRegressor':RandomForestRegressor(),
			'RandomizedLasso':RandomizedLasso(),
			'RandomizedLogisticRegression':RandomizedLogisticRegression(),
			'RandomizedPCA':RandomizedPCA(),
			'Ridge':Ridge(),
			'RidgeCV':RidgeCV(),
			'RidgeClassifier':RidgeClassifier(),
			'RidgeClassifierCV':RidgeClassifierCV(),
			'RobustScaler':RobustScaler(),
			'SGDClassifier':SGDClassifier(),
			'SGDRegressor':SGDRegressor(),
			'SVC':SVC(),
			'SVR':SVR(),
			'SelectFdr':SelectFdr(),
			'SelectFpr':SelectFpr(),
			'SelectFwe':SelectFwe(),
			'SelectKBest':SelectKBest(),
Example #12
def get_lasso_feature_scores(results,
                             classify,
                             scaling=0.5,
                             sample_fraction=0.75,
                             n_resampling=200,
                             random_state=None):
    '''
    Calculate feature scores using a randomized lasso (regression) or
    randomized logistic regression (classification). This is also known as
    stability selection.

    See http://scikit-learn.org/stable/modules/feature_selection.html for
    details.

    Parameters
    ----------
    results : tuple
    classify : callable or str
               a classify function or variable analogous to PRIM
    scaling : float, optional
              scaling parameter, should be between 0 and 1
    sample_fraction : float, optional
                      the fraction of samples to be used in each randomized
                      dataset
    n_resampling : int, optional
                   the number of times the model is trained on a random subset
                   of the data
    random_state : int, optional
                   if it is an int, it specifies the seed to use; defaults to
                   None

    Returns
    -------
    list of tuples
        (uncertainty, feature score) tuples, sorted in descending order of
        feature score

    '''

    experiments, outcomes = results
    uncs = recfunctions.get_names(experiments.dtype)

    x = _prepare_experiments(experiments)
    y, categorical = _prepare_outcomes(outcomes, classify)

    if categorical:
        lfs = RandomizedLogisticRegression(scaling=scaling,
                                           sample_fraction=sample_fraction,
                                           n_resampling=n_resampling,
                                           random_state=random_state)
        lfs.fit(x, y)
    else:
        # we use LassoLarsCV to determine alpha, see
        # http://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_recovery.html
        lars_cv = LassoLarsCV(cv=6).fit(x, y)
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)

        # fit the randomized lasso
        lfs = RandomizedLasso(alpha=alphas,
                              scaling=scaling,
                              sample_fraction=sample_fraction,
                              n_resampling=n_resampling,
                              random_state=random_state)
        lfs.fit(x, y)

    importances = list(zip(uncs, lfs.scores_))
    importances.sort(key=itemgetter(1), reverse=True)

    return importances
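For illustration, a hypothetical call, assuming results comes from an EMA Workbench-style perform_experiments() run and that 'max_P' names one of its outcomes (both assumptions, not from the source):

scores = get_lasso_feature_scores(results, classify='max_P',
                                  n_resampling=100, random_state=42)
for unc, score in scores[:5]:  # the five most stable uncertainties
    print(unc, score)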