示例#1
0
def test_sparse_support():
    """Check that the selector can be fitted on, and transform, a CSR matrix."""
    dense_X, y = make_regression(n_features=10)
    sparse_X = scipy.sparse.csr_matrix(dense_X)

    selector = SequentialFeatureSelector(LinearRegression(), cv=2)
    selector.fit(sparse_X, y)
    # The test only requires that transforming sparse input does not raise.
    selector.transform(sparse_X)
示例#2
0
def test_nan_support():
    """Check that NaN handling is left to the wrapped estimator.

    Fitting and transforming must work when the estimator accepts NaNs
    (HistGradientBoostingRegressor), and fitting must raise a ValueError
    matching 'Input contains NaN' when it does not (LinearRegression).
    """
    n_samples, n_features = 100, 10
    X, y = make_regression(n_samples, n_features, random_state=0)

    # Overwrite the entries picked by a random boolean mask with NaN.
    rng = np.random.RandomState(0)
    missing = rng.randint(0, 2, size=(n_samples, n_features), dtype=bool)
    X[missing] = np.nan

    nan_tolerant = SequentialFeatureSelector(
        HistGradientBoostingRegressor(), cv=2
    )
    nan_tolerant.fit(X, y)
    nan_tolerant.transform(X)

    # LinearRegression does not support nans
    nan_intolerant = SequentialFeatureSelector(LinearRegression(), cv=2)
    with pytest.raises(ValueError, match='Input contains NaN'):
        nan_intolerant.fit(X, y)
def run_sfs(x, y, output=None, caption=''):
    """Reduce ``x`` to the features chosen by a KNN-based sequential selector.

    Parameters
    ----------
    x : DataFrame-like
        Feature table. Its ``columns`` attribute is indexed with the
        selector's support mask to label the reduced frame.
    y : array-like
        Targets passed to the selector's ``fit``.
    output : str or None, optional
        Directory in which ``{caption}-sfs.csv`` is written. When ``None``
        (the default) no file is written.
    caption : str, optional
        Prefix of the CSV file name.

    Returns
    -------
    pandas.DataFrame
        The transformed data, labelled with the selected column names.
    """
    sfs = SequentialFeatureSelector(estimator=KNeighborsClassifier())
    sfs.fit(x, y)
    x_reduced = pd.DataFrame(sfs.transform(x), columns=x.columns[sfs.support_])
    print(f'reduced columns: {x_reduced.columns}')
    # Previously the default ``None`` was formatted into the path, so the
    # file was aimed at a literal "None/" directory. Only write when a
    # directory was actually supplied.
    if output is not None:
        x_reduced.to_csv(f'{output}/{caption}-sfs.csv', index=False)
    return x_reduced
示例#4
0
文件: prepare.py 项目: czosel/aml
    def select_greedy(data):
        """Greedily select features with an RBF-kernel SVR and apply the mask.

        ``data`` is unpacked as ``(X, X_test, y)``. The selector is fitted on
        ``(X, y)`` using ``direction`` and ``n_features`` from the enclosing
        scope, its boolean support mask is dumped to "joblib/greedy_support",
        and the mask and the fit duration are printed.

        Returns the tuple ``(X reduced, X_test reduced, y)``.
        """
        X, X_test, y = data

        # The estimator is deliberately left unfitted: the selector works on
        # clones of it, so fitting it here first would only waste time.
        svr = svm.SVR(kernel="rbf", C=100, tol=1)
        tic = time()
        select = SequentialFeatureSelector(svr,
                                           direction=direction,
                                           n_features_to_select=n_features,
                                           n_jobs=-1).fit(X, y)
        toc = time()

        # Compute the mask once and reuse it for both the dump and the log.
        support = select.get_support()
        joblib.dump(support, "joblib/greedy_support")

        print(f"features selected: {support}")
        print(f"done in: {toc - tic:.2f}s")

        return select.transform(X), select.transform(X_test), y
示例#5
0
def test_pipeline_support():
    """Check that the selector composes with pipelines in both directions.

    A pipeline may be the estimator wrapped by the selector, and the
    selector may itself be a step of a pipeline.
    """
    n_samples, n_features = 50, 3
    X, y = make_regression(n_samples, n_features, random_state=0)

    # Case 1: a pipeline is the estimator driving the selection.
    scaled_regressor = make_pipeline(StandardScaler(), LinearRegression())
    selector = SequentialFeatureSelector(scaled_regressor, cv=2)
    selector.fit(X, y)
    selector.transform(X)

    # Case 2: the selector is the final step of a pipeline.
    selecting_pipeline = make_pipeline(
        StandardScaler(),
        SequentialFeatureSelector(LinearRegression(), cv=2),
    )
    selecting_pipeline.fit(X, y)
    selecting_pipeline.transform(X)
示例#6
0
def test_unsupervised_model_fit(n_features_to_select):
    """Check that the selector can be fitted without targets.

    KMeans is wrapped and ``fit`` is called with ``X`` only; the transformed
    data must keep exactly ``n_features_to_select`` columns.
    """
    X, _ = make_blobs(n_features=6)

    selector = SequentialFeatureSelector(
        KMeans(), n_features_to_select=n_features_to_select
    )
    selector.fit(X)

    reduced = selector.transform(X)
    assert reduced.shape[1] == n_features_to_select
示例#7
0
def test_n_features_to_select(direction, n_features_to_select):
    """Check that the requested number of features is the number selected.

    The support mask, the fitted ``n_features_to_select_`` attribute and the
    width of the transformed data must all agree with the request.
    """
    X, y = make_regression(n_features=10)

    selector = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select=n_features_to_select,
        direction=direction,
        cv=2,
    )
    selector.fit(X, y)

    # A request of None is expected to resolve to 5, i.e. n_features // 2.
    expected = 5 if n_features_to_select is None else n_features_to_select

    assert selector.get_support(indices=True).shape[0] == expected
    assert selector.n_features_to_select_ == expected
    assert selector.transform(X).shape[1] == expected
示例#8
0
def test_n_features_to_select_auto(direction):
    """Check `n_features_to_select="auto"` combined with a `tol` value.

    The number of selected features must stay strictly below the number of
    input features, and the support mask, the fitted attribute and the
    transformed data must be consistent with each other.
    """
    n_features = 10
    X, y = make_regression(n_features=n_features, random_state=0)

    selector = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select="auto",
        tol=1e-3,
        direction=direction,
        cv=2,
    )
    selector.fit(X, y)

    n_selected = selector.get_support(indices=True).shape[0]
    upper_bound = n_features - 1

    assert n_selected <= upper_bound
    assert selector.n_features_to_select_ <= upper_bound
    assert selector.transform(X).shape[1] <= upper_bound
    assert n_selected == selector.n_features_to_select_
示例#9
0
    'AY_Dias_passive', 'AZ_Dias_passive', 'AT_Dias_passive'
]

# Forward sequential feature selection: keep 5 features, scoring each
# candidate subset by negative root-mean-squared error.
sfs1 = SequentialFeatureSelector(reg_model,
                                 n_features_to_select=5,
                                 direction='forward',
                                 scoring='neg_root_mean_squared_error')
sfs1.fit(X_train_test, y_train_test)
# Map the selection back to the names listed in feature_set.
select_features = sfs1.get_feature_names_out(input_features=feature_set)
# Print the selected features.
print('Selected features: ')
print(select_features)
print('\n')

# Transform the data so that only the selected features are used for model
# development.
X_select_train_test = sfs1.transform(X_train_test)
X_select_val = sfs1.transform(X_val)

# From here on the original names refer to the reduced matrices.
X_train_test = X_select_train_test
X_val = X_select_val

# Feature weights: refit reg_model on the reduced data and read coef_.
# NOTE(review): reg_model is defined outside this view. If it is an SVR with a
# non-linear kernel, `coef_` is not available and this raises - confirm.
fit = reg_model.fit(X_train_test, y_train_test)
weights = fit.coef_
print('Feature Weights: ')
print(weights)
print('\n')

# Grid search on the training-testing set.
estimator = SVR()  # regression model for this analysis
param_dist = {
示例#10
0
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file."""
    # NOTE(review): unpickling can execute arbitrary code; only load model
    # files that come from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def main():
    """Evaluate the pickled regression models on the NEWS test set.

    Reads the training/test data and label CSV files, loads the baseline
    model and five further pickled models, and reports performance for:

    * a trivial predictor (mean of the training labels) and the baseline
      linear model on the raw test data;
    * the five models on the raw test data;
    * the five models on the preprocessed test data;
    * the five models after each feature-selection method (SelectKBest with
      ``f_regression`` and with ``mutual_info_regression``, RFE, and
      backward sequential feature selection), each fitted on the
      preprocessed training data and keeping 50 features.

    Raises
    ------
    FileNotFoundError
        If one of the model files is missing; the error names the file that
        could not be found. Other loading errors propagate unchanged.
    """
    # train_file_name = sys.argv[1]
    # output_file_name = sys.argv[2]
    train_data_file_name = "NEWS_Training_data.csv"
    train_label_file_name = "NEWS_Training_label.csv"
    test_data_file_name = "NEWS_Test_data.csv"
    test_label_file_name = "NEWS_Test_label.csv"

    tr_data_frame_X = read_csv(train_data_file_name)
    tr_data_frame_y = read_csv(train_label_file_name)
    te_data_frame_X = read_csv(test_data_file_name)
    te_data_frame_y = read_csv(test_label_file_name)
    alias = test_data_file_name.split('/')[-1].split('_')[0]
    model_filename = "model_{}.pkl".format(alias)
    mode = "rfe"  # prefix to open the models
    try:
        linear_reg_model = _load_pickle(model_filename)
        lasso_reg_model = _load_pickle(mode + "lasso_reg_model.pkl")
        logistic_reg_model = _load_pickle(mode + "logistic_reg_model.pkl")
        # logistic_reg_model = pickle.load(open("model_NEWS.pkl", 'rb'))
        extra_forest_reg_model = _load_pickle(
            mode + "extra_forest_reg_model.pkl")
        svm_reg_model = _load_pickle(mode + "svm_reg_model.pkl")
        knn_reg_model = _load_pickle(mode + "knn_reg_model.pkl")
    except FileNotFoundError as err:
        # Only a genuinely missing file is reported as such, and the error
        # names the file that is actually absent rather than always the
        # baseline model file.
        print(
            "ERROR: You must provide model.pkl file in the current directory.")
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                err.filename or model_filename) from err

    # The five models that are re-evaluated on every variant of the test set.
    models = (
        lasso_reg_model,
        logistic_reg_model,
        extra_forest_reg_model,
        svm_reg_model,
        knn_reg_model,
    )

    # Assumes toNumpy returns 2-D numpy arrays (samples x features) - the
    # code below relies on `.shape[0]` / `.shape[1]` and column slicing.
    X_train, y_train = toNumpy(tr_data_frame_X, tr_data_frame_y)
    X_test, y_test = toNumpy(te_data_frame_X, te_data_frame_y)
    print(X_test.shape[0])
    trivial_y_pred = np.repeat(np.mean(y_train), y_test.shape[0])
    baseline_y_pred = model_predict(linear_reg_model, X_test)
    measure_performance(y_pred=trivial_y_pred,
                        y_test=y_test,
                        mode="trivial testing")
    measure_performance(y_pred=baseline_y_pred,
                        y_test=y_test,
                        mode="baseline testing")
    measure_model(*models, X_test=X_test, y_test=y_test)

    # Column indices per preprocessing scheme; together they cover 0..57.
    one_hot_cols = [11, 12, 13, 14, 15, 16, 29, 30, 31, 32, 33, 34, 35, 36]
    z_score_cols = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 37, 42, 44, 45, 46, 47, 48, 49,
        50, 54, 56, 57
    ]
    normal_cols = [17, 19, 38, 39, 40, 41, 43, 51, 52, 53, 55]
    min_max_cols = [20, 21, 22, 23, 24, 25, 26, 27, 28]
    prepro_tr_X_data = np.zeros((X_train.shape[0], X_train.shape[1]),
                                dtype='float')
    prepro_te_X_data = np.zeros((X_test.shape[0], X_test.shape[1]),
                                dtype='float')

    # The lists hold COLUMN indices, so every selection uses `[:, cols]`.
    # Indexing with `[cols]` would pick rows instead and leave most of the
    # preprocessed matrices at zero.

    # One-hot columns are copied through unchanged.
    prepro_tr_X_data[:, one_hot_cols] = X_train[:, one_hot_cols]
    prepro_te_X_data[:, one_hot_cols] = X_test[:, one_hot_cols]

    # Standardize: statistics come from the training data only.
    z_score_scaler = StandardScaler()
    z_score_scaler.fit(X_train[:, z_score_cols])
    prepro_tr_X_data[:, z_score_cols] = z_score_scaler.transform(
        X_train[:, z_score_cols])
    prepro_te_X_data[:, z_score_cols] = z_score_scaler.transform(
        X_test[:, z_score_cols])

    # L2-normalize each feature column (axis=0); train and test are each
    # normalized by their own column norms, as before.
    prepro_tr_X_data[:, normal_cols] = normalize(X_train[:, normal_cols],
                                                 norm='l2',
                                                 axis=0)
    prepro_te_X_data[:, normal_cols] = normalize(X_test[:, normal_cols],
                                                 norm='l2',
                                                 axis=0)

    # Min-max scale: ranges come from the training data only.
    minmax_scaler = MinMaxScaler()
    minmax_scaler.fit(X_train[:, min_max_cols])
    prepro_tr_X_data[:, min_max_cols] = minmax_scaler.transform(
        X_train[:, min_max_cols])
    prepro_te_X_data[:, min_max_cols] = minmax_scaler.transform(
        X_test[:, min_max_cols])

    measure_model(*models, X_test=prepro_te_X_data, y_test=y_test)

    estimator = linear_model.LinearRegression()
    selectors = (
        # Univariate feature selection (UFS), F-test scores
        SelectKBest(f_regression, k=50),
        # Univariate feature selection (UFS), mutual-information scores
        SelectKBest(mutual_info_regression, k=50),
        # Recursive feature elimination
        RFE(estimator, n_features_to_select=50, step=1),
        # Sequential Feature Selection (SFS)
        SequentialFeatureSelector(estimator,
                                  n_features_to_select=50,
                                  direction='backward'),
    )
    # Each selector is fitted on the preprocessed training data, then the
    # models are measured on the correspondingly reduced test data.
    for selector in selectors:
        selector.fit(prepro_tr_X_data, y_train)
        selected_te_X_data = selector.transform(prepro_te_X_data)
        measure_model(*models, X_test=selected_te_X_data, y_test=y_test)
示例#11
0
def test_n_features_to_select_stopping_criterion(direction):
    """Check the stopping criterion used with `n_features_to_select="auto"`.

    After fitting the selector with a tolerance `tol`, two variants of the
    selected data are built:

    * "added": the selected features plus one feature drawn at random from
      those that were not selected;
    * "removed": the selected features minus one drawn at random.

    Mean 2-fold cross-validation scores of a LinearRegression are then
    compared. The selected subset must score at least as well as the full
    data. When `direction` is `'forward'`, the selected subset must not beat
    the "added" variant by more than `tol`, and must beat the "removed"
    variant by at least `tol`. Otherwise, neither variant may beat the
    selected subset by more than `tol`.
    """
    tol = 1e-3
    X, y = make_regression(n_features=50, n_informative=10, random_state=0)

    selector = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select="auto",
        tol=tol,
        direction=direction,
        cv=2,
    )
    selector.fit(X, y)
    selected_X = selector.transform(X)

    rng = np.random.RandomState(0)

    # "Added" variant: append one randomly drawn unselected column.
    unselected = list(
        set(range(X.shape[1])) - set(selector.get_support(indices=True)))
    extra_column = X[:, rng.choice(unselected)]
    added_X = np.hstack([selected_X, extra_column[:, np.newaxis]])

    # "Removed" variant: drop one randomly drawn selected column.
    dropped_position = rng.choice(
        list(range(selector.n_features_to_select_)))
    removed_X = np.delete(selected_X, dropped_position, axis=1)

    def mean_cv_score(features):
        # Mean 2-fold CV score of a fresh LinearRegression on `features`.
        return cross_val_score(LinearRegression(), features, y, cv=2).mean()

    plain_cv_score = mean_cv_score(X)
    sfs_cv_score = mean_cv_score(selected_X)
    added_cv_score = mean_cv_score(added_X)
    removed_cv_score = mean_cv_score(removed_X)

    assert sfs_cv_score >= plain_cv_score

    if direction == "forward":
        assert sfs_cv_score - added_cv_score <= tol
        assert sfs_cv_score - removed_cv_score >= tol
    else:
        assert added_cv_score - sfs_cv_score <= tol
        assert removed_cv_score - sfs_cv_score <= tol