Exemplo n.º 1
0
def test_surfstar_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): SURF* accepts pandas inputs.

    Builds a SURF* -> random forest pipeline and requires the mean 3-fold
    cross-validation score on ``features_df`` / ``labels_s`` to exceed 0.7.
    """
    np.random.seed(9238745)

    # Feature selection keeps the two top-ranked features for the forest.
    selector = SURFstar(n_features_to_select=2)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    fold_scores = cross_val_score(clf, features_df, labels_s, cv=3, n_jobs=-1)
    assert np.mean(fold_scores) > 0.7
Exemplo n.º 2
0
def test_surfstar_pipeline_parallel():
    """Check: Data (Binary Endpoint, Discrete Features): parallel SURF* in a pipeline.

    SURF* is created with ``n_jobs=-1`` and chained with a random forest; the
    mean 3-fold cross-validation score must exceed 0.7.
    """
    np.random.seed(9238745)

    selector = SURFstar(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    fold_scores = cross_val_score(clf, features, labels, cv=3)
    assert np.mean(fold_scores) > 0.7
Exemplo n.º 3
0
def test_surfstar_pipeline():
    """Ensure that SURF* works in a sklearn pipeline when it is parallelized.

    The selector is built with ``n_jobs=-1``; the mean 3-fold
    cross-validation score of the pipeline must exceed 0.7.
    """
    np.random.seed(9238745)

    selector = SURFstar(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    fold_scores = cross_val_score(clf, features, labels, cv=3)
    assert np.mean(fold_scores) > 0.7
Exemplo n.º 4
0
def test_surfstar_pipeline_mixed_attributes():
    """Check: Data (Mixed Attributes): SURF* works in a sklearn pipeline.

    Requires the mean 3-fold cross-validation score on the mixed-attribute
    fixtures to exceed 0.7.
    """
    np.random.seed(9238745)

    selector = SURFstar(n_features_to_select=2)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, forest)

    fold_scores = cross_val_score(
        clf,
        features_mixed_attributes,
        labels_mixed_attributes,
        cv=3,
        n_jobs=-1,
    )
    assert np.mean(fold_scores) > 0.7
Exemplo n.º 5
0
def test_surfstar_pipeline_cont_endpoint():
    """Check: Data (Continuous Endpoint): SURF* works in a sklearn pipeline.

    Uses a random forest regressor downstream of SURF* and requires the
    absolute value of the mean 3-fold cross-validation score to stay below 0.5.
    """
    np.random.seed(9238745)

    selector = SURFstar(n_features_to_select=2)
    regressor = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, regressor)

    fold_scores = cross_val_score(
        clf,
        features_cont_endpoint,
        labels_cont_endpoint,
        cv=3,
        n_jobs=-1,
    )
    mean_score = np.mean(fold_scores)
    assert abs(mean_score) < 0.5
Exemplo n.º 6
0
def test_surfstar_pipeline_missing_values():
    """Ensure that SURF* works in a sklearn pipeline with missing values.

    The pipeline is SURF* (``n_jobs=-1``) -> imputer -> random forest; the
    mean 3-fold cross-validation score must exceed 0.7.
    """
    np.random.seed(9238745)

    selector = SURFstar(n_features_to_select=2, n_jobs=-1)
    # NOTE(review): `Imputer` is the legacy scikit-learn name (superseded by
    # `sklearn.impute.SimpleImputer`); confirm it exists in the pinned version.
    imputer = Imputer()
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, imputer, forest)

    fold_scores = cross_val_score(
        clf, features_missing_values, labels_missing_values, cv=3)
    assert np.mean(fold_scores) > 0.7
Exemplo n.º 7
0
def test_surfstar_pipeline_multiclass():
    """Check: Data (Multiclass Endpoint): SURF* works in a sklearn pipeline.

    The pipeline is SURF* -> SimpleImputer -> random forest; the mean 3-fold
    cross-validation score on the multiclass fixtures must exceed 0.7.
    """
    np.random.seed(9238745)

    selector = SURFstar(n_features_to_select=2)
    imputer = SimpleImputer()
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, imputer, forest)

    fold_scores = cross_val_score(
        clf, features_multiclass, labels_multiclass, cv=3, n_jobs=-1)
    assert np.mean(fold_scores) > 0.7
Exemplo n.º 8
0
        RandomForestClassifier(n_jobs=-1, random_state=args.randomseed),
        'ext':
        ExtraTreesClassifier(n_estimators=100,
                             n_jobs=-1,
                             random_state=args.randomseed),
        'svm':
        SVC(kernel='sigmoid', random_state=args.randomseed)
    }

    # Initialize the dictionary of feature-selection methods (fs)
    fs = {
        'ReliefF': ReliefF(n_features_to_select=100, verbose=False, n_jobs=-1),
        # 'TuRF': TuRF(core_algorithm="ReliefF", n_features_to_select=100, verbose=False, n_jobs=-1),
        'SURF': SURF(n_features_to_select=100, verbose=False, n_jobs=-1),
        'SURFstar': SURFstar(n_features_to_select=100,
                             verbose=False,
                             n_jobs=-1)
    }

    print('\nClassifier parameters:', clf[args.classifier])

    print('\nStarting cross validating without feature selection...\n')

    # Incremental-feature prediction before feature ranking; the classifier is looked up in the dict by name
    y_pred_list = [
        cross_val_predict(clf[args.classifier],
                          X[:, 0:i + 1],
                          y,
                          cv=args.kfolds,
                          n_jobs=-1) for i in trange(0, X.shape[1])
    ]
Exemplo n.º 9
0
def rank_features_by_rebate_methods(data_split_list,
                                    fs_method,
                                    iterate,
                                    remove_percent=0.1,
                                    verbose=False):
    """Compute per-fold feature importance scores with a ReBATE method.

    Args:
        data_split_list: Sequence with one entry per fold. Each entry is a
            4-item tuple whose first two items are the training feature
            matrix and the training response vector; the last two items are
            ignored here. Both used items are read through ``.values`` and
            the feature matrix also through ``.columns`` (pandas-like).
        fs_method: Feature ranking method to use: 'SURF', 'SURFstar',
            'MultiSURF', or 'MultiSURFstar'.
        iterate: Whether to wrap the method in TuRF, which removes
            low-ranking features after each iteration (useful when the number
            of features is large).
        remove_percent: Fraction of features removed at each TuRF iteration
            (passed as ``pct``); only used when ``iterate`` is true.
        verbose: Whether to print a progress message for each fold.

    Returns:
        A pandas DataFrame with one column per fold ('Fold_0', 'Fold_1', ...)
        holding the fitted ``feature_importances_``, indexed by the feature
        matrix columns of the last fold.

    Raises:
        ValueError: If ``fs_method`` is not one of the supported names, or if
            ``data_split_list`` is empty.
    """
    # Validate up front: previously an unknown method left `fs` unbound and an
    # empty fold list left `feat_train` unbound, both failing with an obscure
    # NameError/UnboundLocalError far from the real cause.
    supported_methods = ('SURF', 'SURFstar', 'MultiSURF', 'MultiSURFstar')
    if fs_method not in supported_methods:
        raise ValueError(
            'Unknown fs_method %r; expected one of %s'
            % (fs_method, ', '.join(supported_methods)))
    if len(data_split_list) == 0:
        raise ValueError('data_split_list must contain at least one fold')

    ## 1. Build the feature ranking estimator
    if iterate:
        # TuRF takes the core algorithm by name, so no per-method branch is needed.
        fs = TuRF(core_algorithm=fs_method, pct=remove_percent)
    else:
        constructors = {
            'SURF': SURF,
            'SURFstar': SURFstar,
            'MultiSURF': MultiSURF,
            'MultiSURFstar': MultiSURFstar,
        }
        fs = constructors[fs_method]()

    ## 2. Perform feature ranking on each fold of training data
    feat_impt_dict = {}
    feature_names = None
    for i, (feat_train, label_train, _, _) in enumerate(data_split_list):
        # intermediate output
        if verbose:
            print('Computing feature importance scores using data from fold ' +
                  str(i) + '\n')
        # fit feature ranking model using the specified method
        if iterate:
            # TuRF additionally receives the list of feature names (headers).
            fs.fit(feat_train.values, label_train.values, list(feat_train))
        else:
            fs.fit(feat_train.values, label_train.values)
        # store this fold's importance scores under its fold name
        feat_impt_dict['Fold_' + str(i)] = fs.feature_importances_
        feature_names = feat_train.columns

    # aggregate results from multiple folds into one data frame
    feat_impt_df = pd.DataFrame(feat_impt_dict)
    feat_impt_df.index = feature_names

    return feat_impt_df