Example #1
import numpy as np
import boruta
import mifs
from sklearn import feature_selection as fs
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV


def fs_continuous(X, y, method):
    """
    All 4 methods are implemented, but for Boruta and MIFS the method is
    overridden and set to L1.
    """
    n, p = X.shape
    if method == 'Boruta':
        rf = RandomForestRegressor(n_jobs=-1)
        Boruta = boruta.BorutaPy(rf, n_estimators='auto')
        Boruta.fit(X, y)
        selected = np.where(Boruta.support_)[0]
    elif method == 'JMI':
        MIFS = mifs.MutualInformationFeatureSelector(method='JMI',
                                                     categorical=False)
        MIFS.fit(X, y)
        selected = np.where(MIFS.support_)[0]
    elif method == 'L1':
        lasso = LassoCV(n_jobs=-1)  # normalize=False dropped; it was the default and was removed in scikit-learn 1.2
        sfm = SelectFromModel(lasso)
        sfm.fit(X, y)
        selected = sfm.transform(np.arange(p).reshape(1, -1))[0]
    elif method == 'FDR':
        FDR = fs.SelectFdr(fs.f_regression, alpha=.05)
        FDR.fit(X, y)
        selected = FDR.transform(np.arange(p).reshape(1, -1))[0]
    return selected
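
A minimal smoke test for fs_continuous, assuming the snippet's imports above are in scope; the dataset and loop below are illustrative only ('Boruta' and 'JMI' additionally need the boruta and mifs packages installed):

from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, n_features=30, n_informative=5,
                       noise=1.0, random_state=0)
for m in ('L1', 'FDR'):
    print(m, fs_continuous(X, y, m))  # indices of the selected columns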
Example #2
import sys

from sklearn import feature_selection as fs_scikit
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


def get_fs_model(model, method, train, target=None, cv=None):
    """Connects given model with specified feature selection method and trains
    the final structure.
    """
    if method == "RFE":
        model = fs_scikit.RFE(model, n_features_to_select=2, step=5)
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)  # note: RFE.fit requires a target, so this raises
    if method == "RFECV":
        model = fs_scikit.RFECV(model, step=3, cv=cv)
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)  # note: RFECV.fit requires a target, so this raises
    elif method == "linearSVC":
        sel = SelectFromModel(LinearSVC(penalty='l1', dual=False))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "fromModel":
        fm = fs_scikit.SelectFromModel(model)
        if target is not None:
            fm.fit(train, target)
        else:
            fm.fit(train)
        # note: Pipeline.fit will refit `fm`, making the explicit fit above redundant
        model = Pipeline([('feature_selection', fm), ('data_mining', model)])

    # elif method == "Anova":
    # ANOVA SVM-C
    # anova_filter = fs_scikit.SelectKBest(f_regression, k=5)
    # model = Pipeline([
    #     ('feature_selection', anova_filter),
    #     ('data_mining', model)
    # ])
    elif method == "VarianceThreshold":
        # for Boolean features: drop those identical in more than 80% of samples
        sel = fs_scikit.VarianceThreshold(threshold=(.8 * (1 - .8)))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectPercentile":
        sel = fs_scikit.SelectPercentile(fs_scikit.f_classif, percentile=30)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFpr":
        sel = fs_scikit.SelectFpr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFdr":
        sel = fs_scikit.SelectFdr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFwe":
        sel = fs_scikit.SelectFwe(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "ch2":
        sel = fs_scikit.SelectKBest(fs_scikit.chi2, k=2)  # chi2 requires non-negative features
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    else:
        print("Feature selection method was not found: " + method)
        sys.exit(1)
    return model
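
A hedged usage sketch for get_fs_model: the "SelectPercentile" branch builds and returns an unfitted Pipeline, so it can go straight into cross-validation (the classifier and data below are placeholders, not part of the original):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=300, n_features=40, random_state=0)
pipe = get_fs_model(LogisticRegression(max_iter=1000), "SelectPercentile", X)
print(cross_val_score(pipe, X, y, cv=3).mean())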
Example #3
import numpy as np
import boruta
import mifs
from sklearn import feature_selection as fs
from sklearn.ensemble import RandomForestClassifier

import lsvc_cv  # project-local helper module


def do_fs(X, y, method):
    s, f = X.shape
    y_test = np.arange(f).reshape(1, -1)  # row of column indices; transform() keeps the selected ones
    if method == "fdr":
        sel = fs.SelectFdr(fs.f_classif, alpha=.05).fit(X, y).transform(y_test)[0]
    elif method == "l1svc":
        sel = lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)
    elif method == "boruta":
        rf = RandomForestClassifier(n_jobs=-1)
        b = boruta.BorutaPy(rf, n_estimators='auto')
        b.fit(X, y)
        sel = np.where(b.support_)[0]
    elif method == "jmi":
        MIFS = mifs.MutualInformationFeatureSelector(method='JMI')
        MIFS.fit(X, y)
        sel = np.where(MIFS.support_)[0]
    else:
        raise ValueError("unknown feature selection method: %r" % method)
    return sel
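
The y_test row is just a vector of column indices, so pushing it through transform() recovers the positions of the kept features. On current scikit-learn, get_support(indices=True) gives the same answer, as this sketch with hypothetical data shows:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, n_features=20, random_state=0)
fdr = fs.SelectFdr(fs.f_classif, alpha=.05).fit(X, y)
via_trick = fdr.transform(np.arange(X.shape[1]).reshape(1, -1))[0]
via_api = fdr.get_support(indices=True)
assert np.array_equal(via_trick, via_api)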
Example #4
import numpy as np
import boruta
import mifs
from sklearn import feature_selection as fs
from sklearn.ensemble import RandomForestClassifier

import lsvc_cv  # project-local helper module


def fs_categorical(X, y, method):
    n, p = X.shape
    selected = []
    if method == 'Boruta':
        rf = RandomForestClassifier(n_jobs=-1)
        Boruta = boruta.BorutaPy(rf, n_estimators='auto')
        Boruta.fit(X, y)
        selected = np.where(Boruta.support_)[0]
    elif method == 'JMI':
        MIFS = mifs.MutualInformationFeatureSelector(method='JMI')
        MIFS.fit(X, y)
        selected = np.where(MIFS.support_)[0]
    elif method == 'L1':
        selected = lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)
    elif method == 'FDR':
        FDR = fs.SelectFdr(fs.f_classif, alpha=.05)
        FDR.fit(X, y)
        selected = FDR.transform(np.arange(p).reshape(1, -1))[0]
    return selected
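
A quick check of the np.where(support_)[0] idiom used in the Boruta and JMI branches: a boolean column mask becomes an array of selected indices (the mask below is made up):

support_ = np.array([True, False, True, True, False])
print(np.where(support_)[0])  # -> [0 2 3]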
Example #5
import numpy as np
import pandas as pd
from sklearn import feature_selection as fs

# project-local imports omitted here: acw, acz, tpt, HierarchyName


def select_best():
    df = pd.merge(
        acw.gen_long_data(tpt)
            .normalize(columns="metric")
            .add_net_meta(tpt.net_hierarchy(HierarchyName.RESTRICTED_PERIPHERY_CORE))
            .groupby(["task", "subject", "region", "net_meta"]).mean().reset_index()
            .rename(columns={"metric": "acw"}),
        acz.gen_long_data(tpt)
            .normalize(columns="metric")
            .add_net_meta(tpt.net_hierarchy(HierarchyName.RESTRICTED_PERIPHERY_CORE))
            .groupby(["task", "subject", "region", "net_meta"]).mean().reset_index()
            .rename(columns={"metric": "acz"}),
        on=["task", "subject", "region", "net_meta"], sort=False).and_filter(NOTnet_meta="M")

    X = df.iloc[:, -2:].values
    y = df.net_meta.map({"C": 0, "P": 1}).values

    functions = [fs.mutual_info_classif, fs.f_classif, fs.chi2]
    for func in functions:
        for method in [fs.SelectKBest(func, k=1), fs.SelectPercentile(func), fs.SelectFdr(func), fs.SelectFpr(func),
                       fs.SelectFwe(func)]:
            method.fit(X, y)
            # print the 1-based index of the highest-scoring feature
            print(f'{str(method).split("(")[0]} {func.__name__}: {np.argmax(method.scores_) + 1}')
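
One caveat for select_best: fs.chi2 requires non-negative features, and the normalized acw/acz columns can be negative, so the chi2 pass may raise a ValueError. A defensive variant (a guard of my own, not in the original) rescales X first:

from sklearn.preprocessing import MinMaxScaler

X_chi2 = MinMaxScaler().fit_transform(X)  # chi2 needs X >= 0
print(fs.SelectKBest(fs.chi2, k=1).fit(X_chi2, y).get_support(indices=True))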
Example #6
except:  # the matching try: (version-dependent imports) precedes this excerpt
    if _sklearn_ver > 17:
        raise

_feature_selectors = []
_feature_selectors.append((feature_selection.SelectKBest(k=1),
                           pd_feature_selection.SelectKBest(k=1), True))
_feature_selectors.append(
    (feature_selection.SelectKBest(k=1),
     pickle.loads(pickle.dumps(pd_feature_selection.SelectKBest(k=1))), True))
_feature_selectors.append((feature_selection.SelectKBest(k=2),
                           pd_feature_selection.SelectKBest(k=2), True))
_feature_selectors.append((feature_selection.SelectPercentile(),
                           pd_feature_selection.SelectPercentile(), True))
_feature_selectors.append(
    (feature_selection.SelectFdr(), pd_feature_selection.SelectFdr(), True))
_feature_selectors.append(
    (feature_selection.SelectFwe(), pd_feature_selection.SelectFwe(), True))
# Tmp Ami
if False:
    _feature_selectors.append(
        (feature_selection.RFE(linear_model.LogisticRegression()),
         pd_feature_selection.RFE(pd_linear_model.LogisticRegression()), True))

_keras_estimators = []
if _level > 0:
    _keras_estimators.append(
        (KerasClassifier(_build_classifier_nn, verbose=0),
         PdKerasClassifier(_build_classifier_nn,
                           _load_iris()[0]['class'].unique(),
                           verbose=0), False))
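
Each tuple pairs a plain scikit-learn selector with its pandas-aware counterpart (pd_feature_selection is assumed to be the ibex adapter) plus a flag. A parity check over the pairs might look like this sketch:

import pandas as pd
from sklearn import datasets

iris = datasets.load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target)

for sk_est, pd_est, _ in _feature_selectors:
    a = sk_est.fit_transform(X.values, y.values)
    b = pd_est.fit_transform(X, y)  # the adapter is expected to return a DataFrame
    assert a.shape == b.shape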
Example #7
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # a leading ':' in the search list asks for estimator evaluation
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO: maybe add a regular expression check
            ev = safe_eval_es(search_list)
            preprocessings = (
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(), preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(), feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(), feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS), skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
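
From the code alone, the expected params_builder layout can be inferred: a 'param_set' list of dicts with 'sp_name' and 'sp_list' keys. A hypothetical input (assuming SafeEval, NON_SEARCHABLE and N_JOBS are defined as in the original tool) would be:

params_builder = {
    'param_set': [
        # a plain literal list, handled by SafeEval
        {'sp_name': 'estimator__C', 'sp_list': '[0.1, 1.0, 10.0]'},
        # a leading ':' switches to estimator evaluation
        {'sp_name': 'preprocessing', 'sp_list': ": ['fs_all']"},
    ]
}
search_params = _eval_search_params(params_builder)
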
def get_search_params(params_builder):
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    for p in params_builder['param_set']:
        search_p = p['search_param_selector']['search_p']
        if search_p.strip() == '':
            continue
        param_type = p['search_param_selector']['selected_param_type']

        lst = search_p.split(':')
        assert (
            len(lst) == 2
        ), "Error, make sure there is one and only one colon in search parameter input."
        literal = lst[1].strip()
        param_name = lst[0].strip()
        if param_name:
            if param_name.lower() == 'n_jobs':
                sys.exit("Parameter `%s` is invalid for search." % param_name)
            elif not param_name.endswith('-'):
                ev = safe_eval(literal)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name] = ev
            else:
                # a trailing `-` on the param name requests estimator evaluation
                # TODO: maybe add a regular expression check
                ev = safe_eval_es(literal)
                for obj in ev:
                    if 'n_jobs' in obj.get_params():
                        obj.set_params(n_jobs=N_JOBS)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name[:-1]] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name[:-1]] = ev
        elif param_type != 'final_estimator_p':
            # TODO: regular expression check?
            ev = safe_eval_es(literal)
            preprocessors = [
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.Imputer(),  # removed in scikit-learn 0.22; SimpleImputer is the modern equivalent
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0)
            ]
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessors[0:36])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessors[0:8])
                elif obj == 'fs_all':
                    newlist.extend(preprocessors[8:15])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessors[15:26])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessors[26:30])
                elif obj == 'reb_all':
                    newlist.extend(preprocessors[31:36])
                elif obj == 'imb_all':
                    newlist.extend(preprocessors[36:55])
                elif type(obj) is int and -1 < obj < len(preprocessors):
                    newlist.append(preprocessors[obj])
                elif hasattr(obj, 'get_params'):  # user object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported preprocessor type: %r" % (obj))
            search_params['preprocessing_' + param_type[5:6]] = newlist
        else:
            sys.exit("Parameter name of the final estimator can't be skipped!")

    return search_params
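
Likewise for get_search_params: each entry carries a 'search_param_selector' dict with 'search_p' (a "name: values" string) and 'selected_param_type'. Judging by the param_type[5:6] slice, the preprocessing types look like 'prep_1_p' for the first slot; a hypothetical call:

params_builder = {
    'param_set': [
        {'search_param_selector': {
            'search_p': 'C: [0.1, 1.0, 10.0]',
            'selected_param_type': 'final_estimator_p'}},
        {'search_param_selector': {
            'search_p': ": ['fs_all']",           # empty name: swap the whole step
            'selected_param_type': 'prep_1_p'}},  # hypothetical slot label
    ]
}
print(get_search_params(params_builder))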
Example #9
import numpy as np
import boruta
import mifs
from sklearn import feature_selection as fs
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RandomizedLogisticRegression  # scikit-learn < 0.21
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

import lsvc_cv  # project-local helper module


def do_fs(X, y):
    s, f = X.shape
    y_test = np.arange(f).reshape(1, -1)  # row of column indices; transform() keeps the selected ones

    # --------------------------------------------------------------
    # UNIVARIATE FEATURE SELECTION
    # percentile - take the top 10% of features
    sel_uni_perc = (fs.SelectPercentile(fs.f_classif, percentile=10)
                    .fit(X, y).transform(y_test)[0])

    # fdr - minimize false discovery rate at alpha = .05
    sel_uni_fdr = (fs.SelectFdr(fs.f_classif, alpha=.05)
                   .fit(X, y).transform(y_test)[0])

    # --------------------------------------------------------------
    # RFECV
    # do a cross-validated grid search for the optimal C
    gridC = {'C': np.logspace(-6, 3, 10)}
    svc = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=1e-4)
    grid_cv = GridSearchCV(svc, gridC, scoring='accuracy', cv=3, n_jobs=-1)  # 3-fold, matching cv_num below
    grid_cv.fit(X, y)

    # set the optimal C
    # adjust for the smaller training sample size, due to cross validation
    # http://scikit-learn.org/stable/auto_examples/svm/plot_svm_scale_c.html
    cv_num = 3
    train_size = 1 - 1 / float(cv_num)
    adjust_c = float(s * train_size)
    svc.set_params(C=grid_cv.best_params_['C'] * adjust_c)
    # do a stratified 3 fold cross-validated recursive feature elimination,
    # with 1% of the worst features removed each round

    rfecv = fs.RFECV(estimator=svc, step=.01, cv=cv_num, scoring='accuracy')
    rfecv.fit(X, y)
    sel_rfecv = rfecv.transform(y_test)[0]

    # --------------------------------------------------------------
    # L1 SVC
    sel_lsvc = lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)

    # --------------------------------------------------------------
    # STABILITY SELECTION
    # (RandomizedLogisticRegression was removed in scikit-learn 0.21,
    # so this step needs an older release)
    rlr = RandomizedLogisticRegression(n_resampling=1000,
                                       C=np.logspace(-2, 2, 5),
                                       selection_threshold=0.7,
                                       sample_fraction=0.5)
    sel_rlr = rlr.fit(X, y).transform(y_test)[0]

    # --------------------------------------------------------------
    # BORUTA
    rf = RandomForestClassifier(n_jobs=-1)
    b = boruta.BorutaPy(rf, n_estimators='auto')
    b.fit(X, y)
    sel_b_rf = np.where(b.support_)[0]

    # --------------------------------------------------------------
    # JMI
    MIFS = mifs.MutualInformationFeatureSelector(method='JMI')
    MIFS.fit(X, y)
    sel_jmi = np.where(MIFS.support_)[0]

    return (sel_uni_perc, sel_uni_fdr, sel_rfecv, sel_lsvc, sel_rlr, sel_b_rf,
            sel_jmi)
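
Because RandomizedLogisticRegression is gone from scikit-learn 0.21 onwards, do_fs as written needs an older release. A rough modern stand-in for the stability-selection step (an approximation of the idea, not the original estimator) resamples an L1 logistic model by hand:

from sklearn.linear_model import LogisticRegression

def stability_selection(X, y, n_resampling=200, threshold=0.7, seed=0):
    """Rough stand-in for stability selection: count how often each
    feature survives an L1 logistic fit on random half-samples."""
    rng = np.random.RandomState(seed)
    n, p = X.shape
    hits = np.zeros(p)
    for _ in range(n_resampling):
        idx = rng.choice(n, n // 2, replace=False)
        clf = LogisticRegression(penalty='l1', solver='liblinear')
        clf.fit(X[idx], y[idx])
        hits += (np.abs(clf.coef_).max(axis=0) > 1e-9)
    return np.where(hits / n_resampling >= threshold)[0]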