Exemplo n.º 1
0
def test_multisurfstar_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): MultiSURF* accepts pandas inputs.

    Chains MultiSURF* (keeping 2 features) with a 100-tree random forest
    classifier and asserts that the mean 3-fold cross-validation score on
    ``features_df`` / ``labels_s`` (defined outside this function) is
    above 0.7.
    """
    # Seed NumPy's global RNG before any estimator is built.
    np.random.seed(320931)
    selector = MultiSURFstar(n_features_to_select=2)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)
    fold_scores = cross_val_score(pipeline, features_df, labels_s,
                                  cv=3, n_jobs=-1)
    assert np.mean(fold_scores) > 0.7
Exemplo n.º 2
0
def test_multisurfstar_pipeline_parallel():
    """Check: Data (Binary Endpoint, Discrete Features): parallel MultiSURF* in a pipeline.

    MultiSURF* is constructed with ``n_jobs=-1`` and chained with a 100-tree
    random forest classifier. The test asserts that the mean 3-fold
    cross-validation score on ``features`` / ``labels`` (defined outside
    this function) is above 0.7.
    """
    # Seed NumPy's global RNG before any estimator is built.
    np.random.seed(320931)

    selector = MultiSURFstar(n_features_to_select=2, n_jobs=-1)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)

    fold_scores = cross_val_score(pipeline, features, labels, cv=3)
    assert np.mean(fold_scores) > 0.7
Exemplo n.º 3
0
def test_multisurfstar_pipeline():
    """Ensure that parallelized MultiSURF* works inside a sklearn pipeline.

    The selector is built with ``n_jobs=-1`` and keeps 2 features; it feeds
    a 100-tree random forest classifier. The mean 3-fold cross-validation
    score on ``features`` / ``labels`` (defined outside this function) must
    be above 0.7.
    """
    # Seed NumPy's global RNG before any estimator is built.
    np.random.seed(320931)

    steps = (
        MultiSURFstar(n_features_to_select=2, n_jobs=-1),
        RandomForestClassifier(n_estimators=100, n_jobs=-1),
    )
    pipeline = make_pipeline(*steps)

    mean_score = np.mean(cross_val_score(pipeline, features, labels, cv=3))
    assert mean_score > 0.7
Exemplo n.º 4
0
def test_multisurfstar_pipeline_mixed_attributes():
    """Check: Data (Mixed Attributes): MultiSURF* works in a sklearn pipeline.

    Chains MultiSURF* (keeping 2 features) with a 100-tree random forest
    classifier and asserts that the mean 3-fold cross-validation score on
    ``features_mixed_attributes`` / ``labels_mixed_attributes`` (defined
    outside this function) is above 0.7.
    """
    # Seed NumPy's global RNG before any estimator is built.
    np.random.seed(320931)

    selector = MultiSURFstar(n_features_to_select=2)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)

    fold_scores = cross_val_score(pipeline,
                                  features_mixed_attributes,
                                  labels_mixed_attributes,
                                  cv=3,
                                  n_jobs=-1)
    assert np.mean(fold_scores) > 0.7
Exemplo n.º 5
0
def test_multisurfstar_pipeline_cont_endpoint():
    """Check: Data (Continuous Endpoint): MultiSURF* works in a sklearn pipeline.

    Chains MultiSURF* (keeping 2 features) with a 100-tree random forest
    regressor and asserts that the absolute value of the mean 3-fold
    cross-validation score on ``features_cont_endpoint`` /
    ``labels_cont_endpoint`` (defined outside this function) is below 0.5.
    """
    # Seed NumPy's global RNG before any estimator is built.
    np.random.seed(320931)

    selector = MultiSURFstar(n_features_to_select=2)
    regressor = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, regressor)

    fold_scores = cross_val_score(pipeline,
                                  features_cont_endpoint,
                                  labels_cont_endpoint,
                                  cv=3,
                                  n_jobs=-1)
    mean_score = np.mean(fold_scores)
    assert abs(mean_score) < 0.5
Exemplo n.º 6
0
def rebate(df, target, n_features):
    """
    Run the MultiSURF* algorithm on a dataframe, returning the reduced df.

    The selected columns are taken positionally from the fitted selector's
    ranking rather than by comparing column values, so columns holding
    identical values or NaNs are handled correctly and exactly
    ``n_features`` columns are returned.

    Args:
        df (pandas.DataFrame): A dataframe
        target (str): The target key (must be present in df)
        n_features (int): The number of features desired to be returned.

    Returns:
        pd.DataFrame The dataframe with fewer features, and no target.
        Columns are ordered from highest to lowest ranked feature.

    Raises:
        ValueError: If ``n_features`` exceeds the number of non-target
            columns in ``df``.

    """
    X = df.drop(target, axis=1)
    y = df[target]

    # Fail fast, before the (potentially expensive) fit is started.
    n_available = X.shape[1]
    if n_features > n_available:
        raise ValueError(
            'Number of features to select ({}) is larger than the number '
            'of features in the dataset ({}).'.format(n_features, n_available))

    rf = MultiSURFstar(n_features_to_select=n_features, n_jobs=-1)
    rf.fit(X.values, y.values)

    # top_features_ holds column positions sorted by descending importance;
    # positional selection avoids ambiguity between equal-valued columns.
    selected_positions = rf.top_features_[:n_features]
    return X.iloc[:, selected_positions]
Exemplo n.º 7
0
def rank_features_by_rebate_methods(data_split_list,
                                    fs_method,
                                    iterate,
                                    remove_percent=0.1,
                                    verbose=False):
    """Compute ReBATE feature-importance scores on every fold.

    Args:
        data_split_list: Sequence of folds. Each fold must unpack into four
            items; only the first two are used: the training feature frame
            (needs ``.values`` and ``.columns``) and the training labels
            (needs ``.values``).
        fs_method (str): Feature ranking method: 'SURF', 'SURFstar',
            'MultiSURF', or 'MultiSURFstar'.
        iterate (bool): Whether to wrap the method in TuRF.
        remove_percent (float): Passed to TuRF as ``pct``; only used when
            ``iterate`` is true.
        verbose (bool): Whether to print a progress line for each fold.

    Returns:
        pandas.DataFrame: One column per fold, named ``Fold_<i>``, holding
        the fitted estimator's ``feature_importances_``. The index is set
        to the column names of the last fold's training features.

    Raises:
        ValueError: If ``fs_method`` is not one of the supported names, or
            if ``data_split_list`` is empty.
    """
    ## 0. Validate the inputs before any estimator is constructed
    supported_methods = ('SURF', 'SURFstar', 'MultiSURF', 'MultiSURFstar')
    if fs_method not in supported_methods:
        raise ValueError(
            'Unsupported fs_method {!r}; expected one of {}.'.format(
                fs_method, ', '.join(supported_methods)))
    if len(data_split_list) == 0:
        raise ValueError('data_split_list must contain at least one fold.')

    ## 1. Define function for feature ranking method
    if iterate:
        # TuRF wraps the chosen core algorithm.
        fs = TuRF(core_algorithm=fs_method, pct=remove_percent)
    elif fs_method == 'SURF':
        fs = SURF()
    elif fs_method == 'SURFstar':
        fs = SURFstar()
    elif fs_method == 'MultiSURF':
        fs = MultiSURF()
    else:
        fs = MultiSURFstar()

    ## 2. Perform feature ranking on each fold of training data
    feat_impt_dict = {}
    for i, fold in enumerate(data_split_list):
        # intermediate output
        if verbose:
            print('Computing feature importance scores using data from fold ' +
                  str(i) + '\n')
        # obtain training feature matrix and response vector
        feat_train, label_train, _, _ = fold
        # fit feature ranking model using the specified method
        if iterate:
            # TuRF.fit additionally takes the list of feature names.
            fs.fit(feat_train.values, label_train.values, list(feat_train))
        else:
            fs.fit(feat_train.values, label_train.values)
        # record this fold's feature importance scores
        feat_impt_dict['Fold_' + str(i)] = fs.feature_importances_

    # aggregate results from multiple folds into one data frame
    feat_impt_df = pd.DataFrame(feat_impt_dict)
    # feat_train still refers to the last fold processed by the loop
    feat_impt_df.index = feat_train.columns

    return feat_impt_df