def test_check_pandas_dataframe_fit():

    knn = KNeighborsClassifier(n_neighbors=4)
    iris = load_iris()
    X = iris.data
    y = iris.target
    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)

    df = pd.DataFrame(X, columns=['sepal length', 'sepal width',
                                  'petal length', 'petal width'])

    efs1 = efs1.fit(X, y)
    assert efs1.best_idx_ == (2, 3), efs1.best_idx_
    assert efs1.best_feature_names_ == ('2', '3')
    assert efs1.interrupted_ is False

    efs1._TESTING_INTERRUPT_MODE = True
    efs1 = efs1.fit(df, y)
    assert efs1.best_idx_ == (0, 1), efs1.best_idx_
    assert efs1.best_feature_names_ == ('sepal length', 'sepal width')
    assert efs1.interrupted_ is True
def test_knn_cv3_groups():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=GroupKFold(n_splits=3),
               print_progress=False)
    np.random.seed(1630672634)
    groups = np.random.randint(0, 6, size=len(y))
    efs1 = efs1.fit(X, y, groups=groups)
    # print(efs1.subsets_)
    expect = {0: {'cv_scores': np.array([0.97916667, 0.93877551, 0.9245283]),
                  'feature_idx': (0, 1, 2),
                  'avg_score': 0.9474901595858469,
                  'feature_names': ('0', '1', '2')},
              1: {'cv_scores': np.array([1., 0.93877551, 0.9245283]),
                  'feature_idx': (0, 1, 3),
                  'avg_score': 0.9544346040302915,
                  'feature_names': ('0', '1', '3')},
              2: {'cv_scores': np.array([0.97916667, 0.95918367, 0.9245283]),
                  'feature_idx': (0, 2, 3),
                  'avg_score': 0.9542928806742822,
                  'feature_names': ('0', '2', '3')},
              3: {'cv_scores': np.array([0.97916667, 0.95918367, 0.94339623]),
                  'feature_idx': (1, 2, 3),
                  'avg_score': 0.9605821888503829,
                  'feature_names': ('1', '2', '3')}}
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
def test_knn_cv3():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y)
    expect = {0: {'avg_score': 0.9391025641025641,
                  'feature_idx': (0, 1, 2),
                  'cv_scores': np.array([0.97435897, 0.94871795,
                                         0.88888889, 0.94444444])},
              1: {'avg_score': 0.94017094017094016,
                  'feature_idx': (0, 1, 3),
                  'cv_scores': np.array([0.92307692, 0.94871795,
                                         0.91666667, 0.97222222])},
              2: {'avg_score': 0.95299145299145294,
                  'feature_idx': (0, 2, 3),
                  'cv_scores': np.array([0.97435897, 0.94871795,
                                         0.91666667, 0.97222222])},
              3: {'avg_score': 0.97275641025641035,
                  'feature_idx': (1, 2, 3),
                  'cv_scores': np.array([0.97435897, 1.,
                                         0.94444444, 0.97222222])}}
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (1, 2, 3)
    assert round(efs1.best_score_, 4) == 0.9728
def test_knn_wo_cv():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=2,
               max_features=3,
               scoring='accuracy',
               cv=0,
               print_progress=False)
    efs1 = efs1.fit(X, y)
    expect = {0: {'feature_idx': (0, 1),
                  'feature_names': ('0', '1'),
                  'avg_score': 0.82666666666666666,
                  'cv_scores': np.array([0.82666667])},
              1: {'feature_idx': (0, 2),
                  'feature_names': ('0', '2'),
                  'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96])},
              2: {'feature_idx': (0, 3),
                  'feature_names': ('0', '3'),
                  'avg_score': 0.96666666666666667,
                  'cv_scores': np.array([0.96666667])},
              3: {'feature_idx': (1, 2),
                  'feature_names': ('1', '2'),
                  'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96])},
              4: {'feature_idx': (1, 3),
                  'feature_names': ('1', '3'),
                  'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96])},
              5: {'feature_idx': (2, 3),
                  'feature_names': ('2', '3'),
                  'avg_score': 0.97333333333333338,
                  'cv_scores': np.array([0.97333333])},
              6: {'feature_idx': (0, 1, 2),
                  'feature_names': ('0', '1', '2'),
                  'avg_score': 0.95999999999999996,
                  'cv_scores': np.array([0.96])},
              7: {'feature_idx': (0, 1, 3),
                  'feature_names': ('0', '1', '3'),
                  'avg_score': 0.96666666666666667,
                  'cv_scores': np.array([0.96666667])},
              8: {'feature_idx': (0, 2, 3),
                  'feature_names': ('0', '2', '3'),
                  'avg_score': 0.96666666666666667,
                  'cv_scores': np.array([0.96666667])},
              9: {'feature_idx': (1, 2, 3),
                  'feature_names': ('1', '2', '3'),
                  'avg_score': 0.97333333333333338,
                  'cv_scores': np.array([0.97333333])}}
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
def test_regression():
    boston = load_boston()
    X, y = boston.data[:, [1, 2, 6, 8, 12]], boston.target
    lr = LinearRegression()
    efs_r = EFS(lr,
                min_features=3,
                max_features=4,
                scoring='neg_mean_squared_error',
                cv=10,
                print_progress=False)
    efs_r = efs_r.fit(X, y)
    assert efs_r.best_idx_ == (0, 2, 4)
    assert round(efs_r.best_score_, 4) == -40.8777
Example #8
def test_fit_params():
    iris = load_iris()
    X = iris.data
    y = iris.target
    sample_weight = np.ones(X.shape[0])
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    efs1 = EFS(forest,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y, sample_weight=sample_weight)
    expect = {0: {'feature_idx': (0, 1, 2),
                  'feature_names': ('0', '1', '2'),
                  'cv_scores': np.array([0.947, 0.868, 0.919, 0.973]),
                  'avg_score': 0.9269203413940257},
              1: {'feature_idx': (0, 1, 3),
                  'feature_names': ('0', '1', '3'),
                  'cv_scores': np.array([0.921, 0.921, 0.892, 1.]),
                  'avg_score': 0.9337606837606838},
              2: {'feature_idx': (0, 2, 3),
                  'feature_names': ('0', '2', '3'),
                  'cv_scores': np.array([0.974, 0.947, 0.919, 0.973]),
                  'avg_score': 0.9532361308677098},
              3: {'feature_idx': (1, 2, 3),
                  'feature_names': ('1', '2', '3'),
                  'cv_scores': np.array([0.974, 0.947, 0.892, 1.]),
                  'avg_score': 0.9532361308677098}}

    if Version(sklearn_version) < Version("0.22"):
        expect[0]['avg_score'] = 0.9401709401709402
        expect[0]['cv_scores'] = np.array([0.94871795, 0.92307692,
                                           0.91666667, 0.97222222])
        expect[1]['cv_scores'] = np.array([0.94871795, 0.92307692,
                                           0.91666667, 0.97222222])
        expect[2]['cv_scores'] = np.array([0.94871795, 0.92307692,
                                           0.91666667, 0.97222222])
        expect[2]['avg_score'] = 0.9599358974358974
        expect[3]['avg_score'] = 0.9599358974358974
        expect[3]['cv_scores'] = np.array([0.97435897, 0.94871795,
                                           0.91666667, 1.])
        assert round(efs1.best_score_, 4) == 0.9599

    else:
        assert round(efs1.best_score_, 4) == 0.9532

    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (0, 2, 3)
def test_clone_params_pass():
    iris = load_iris()
    X = iris.data
    y = iris.target
    lr = SoftmaxRegression(random_seed=1)
    efs1 = EFS(lr,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)
    efs1 = efs1.fit(X, y)
    assert(efs1.best_idx_ == (1, 3))
Example #11
def test_knn_cv3():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y)
    expect = {0: {'avg_score': 0.9391025641025641,
                  'feature_idx': (0, 1, 2),
                  'feature_names': ('0', '1', '2'),
                  'cv_scores': np.array([0.974, 0.947, 0.892, 0.946])},
              1: {'avg_score': 0.9400782361308677,
                  'feature_idx': (0, 1, 3),
                  'feature_names': ('0', '1', '3'),
                  'cv_scores': np.array([0.921, 0.947, 0.919, 0.973])},
              2: {'avg_score': 0.95299145299145294,
                  'feature_idx': (0, 2, 3),
                  'feature_names': ('0', '2', '3'),
                  'cv_scores': np.array([0.974, 0.947, 0.919, 0.973])},
              3: {'avg_score': 0.97275641025641035,
                  'feature_idx': (1, 2, 3),
                  'feature_names': ('1', '2', '3'),
                  'cv_scores': np.array([0.974, 1.   , 0.946, 0.973])}}

    if Version(sklearn_version) < Version("0.22"):
        expect[0]['cv_scores'] = np.array([0.97435897, 0.94871795,
                                           0.88888889, 0.94444444])
        expect[1]['cv_scores'] = np.array([0.92307692, 0.94871795,
                                           0.91666667, 0.97222222])
        expect[2]['cv_scores'] = np.array([0.97435897, 0.94871795,
                                           0.91666667, 0.97222222])
        expect[3]['cv_scores'] = np.array([0.97435897, 0.94871795,
                                           0.91666667, 0.97222222])
        expect[1]['avg_score'] = 0.94017094017094016
        assert round(efs1.best_score_, 4) == 0.9728
    else:
        assert round(efs1.best_score_, 4) == 0.9732

    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (1, 2, 3)
    assert efs1.best_feature_names_ == ('1', '2', '3')
def test_fit_transform():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)

    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)

    X_t = efs1.fit_transform(X, y)
    assert X_t.shape == (150, 2)
def test_knn_cv3():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y)
    expect = {
        0: {
            'avg_score': 0.9391025641025641,
            'feature_idx': (0, 1, 2),
            'feature_names': ('0', '1', '2'),
            'cv_scores':
            np.array([0.97435897, 0.94871795, 0.88888889, 0.94444444])
        },
        1: {
            'avg_score': 0.94017094017094016,
            'feature_idx': (0, 1, 3),
            'feature_names': ('0', '1', '3'),
            'cv_scores':
            np.array([0.92307692, 0.94871795, 0.91666667, 0.97222222])
        },
        2: {
            'avg_score': 0.95299145299145294,
            'feature_idx': (0, 2, 3),
            'feature_names': ('0', '2', '3'),
            'cv_scores':
            np.array([0.97435897, 0.94871795, 0.91666667, 0.97222222])
        },
        3: {
            'avg_score': 0.97275641025641035,
            'feature_idx': (1, 2, 3),
            'feature_names': ('1', '2', '3'),
            'cv_scores': np.array([0.97435897, 1., 0.94444444, 0.97222222])
        }
    }
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (1, 2, 3)
    assert efs1.best_feature_names_ == ('1', '2', '3')
    assert round(efs1.best_score_, 4) == 0.9728
def perform_efs(curr_model, X, y, min_cols, max_cols):

    efs1 = EFS(curr_model,
               min_features=min_cols,
               max_features=max_cols,
               print_progress=True,
               scoring='accuracy',
               cv=5,
               n_jobs=-1)

    efs1 = efs1.fit(X, y)

    df = pd.DataFrame.from_dict(efs1.get_metric_dict()).T
    #    df['test_acc'] = df['feature_idx'].apply(
    #        lambda x: make_predictions_on_test(efs1, curr_model, X_train, X_test, y_train, y_test, x)
    #    )

    return df
Example #16
def test_custom_feature_names():
    knn = KNeighborsClassifier(n_neighbors=4)
    iris = load_iris()
    X = iris.data
    y = iris.target
    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)

    efs1 = efs1.fit(X, y, custom_feature_names=(
          'sepal length', 'sepal width', 'petal length', 'petal width'))
    assert efs1.best_idx_ == (2, 3), efs1.best_idx_
    assert efs1.best_feature_names_ == ('petal length', 'petal width')
Example #18
def ExhaustiveFeatureSelector(X, y, min_features=1, max_features=4):
    from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
    from sklearn.linear_model import LinearRegression
    lr = LinearRegression()

    efs1 = EFS(lr,
               min_features=min_features,
               max_features=max_features,
               scoring='r2',
               print_progress=True,
               cv=5)

    efs1 = efs1.fit(X, y)

    #print('Best subset:', efs1.best_idx_)
    print('Best subset (corresponding names):', efs1.best_feature_names_)
    print('Best R² score: %.2f' % efs1.best_score_)
    return efs1.best_feature_names_, efs1.best_score_
def test_fit_params():
    iris = load_iris()
    X = iris.data
    y = iris.target
    sample_weight = np.ones(X.shape[0])
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    efs1 = EFS(forest,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y, sample_weight=sample_weight)
    expect = {
        0: {
            'feature_idx': (0, 1, 2),
            'feature_names': ('0', '1', '2'),
            'cv_scores':
            np.array([0.94871795, 0.92307692, 0.91666667, 0.97222222]),
            'avg_score': 0.9401709401709402
        },
        1: {
            'feature_idx': (0, 1, 3),
            'feature_names': ('0', '1', '3'),
            'cv_scores': np.array([0.92307692, 0.92307692, 0.88888889, 1.]),
            'avg_score': 0.9337606837606838
        },
        2: {
            'feature_idx': (0, 2, 3),
            'feature_names': ('0', '2', '3'),
            'cv_scores':
            np.array([0.97435897, 0.94871795, 0.94444444, 0.97222222]),
            'avg_score': 0.9599358974358974
        },
        3: {
            'feature_idx': (1, 2, 3),
            'feature_names': ('1', '2', '3'),
            'cv_scores': np.array([0.97435897, 0.94871795, 0.91666667, 1.]),
            'avg_score': 0.9599358974358974
        }
    }
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (0, 2, 3)
    assert round(efs1.best_score_, 4) == 0.9599
def test_check_pandas_dataframe_transform():
    knn = KNeighborsClassifier(n_neighbors=4)
    iris = load_iris()
    X = iris.data
    y = iris.target
    efs1 = EFS(knn,
               min_features=2,
               max_features=2,
               scoring='accuracy',
               cv=0,
               clone_estimator=False,
               print_progress=False,
               n_jobs=1)

    df = pd.DataFrame(X, columns=['sepal length', 'sepal width',
                                  'petal length', 'petal width'])
    efs1 = efs1.fit(df, y)
    assert efs1.best_idx_ == (2, 3)
    assert (150, 2) == efs1.transform(df).shape
Example #22
def test_knn_cv3_groups():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=4)
    efs1 = EFS(knn,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=GroupKFold(n_splits=3),
               print_progress=False)
    np.random.seed(1630672634)
    groups = np.random.randint(0, 6, size=len(y))
    efs1 = efs1.fit(X, y, groups=groups)

    expect = {
        0: {
            'cv_scores': np.array([0.97916667, 0.93877551, 0.9245283]),
            'feature_idx': (0, 1, 2),
            'avg_score': 0.9474901595858469,
            'feature_names': ('0', '1', '2')
        },
        1: {
            'cv_scores': np.array([1., 0.93877551, 0.9245283]),
            'feature_idx': (0, 1, 3),
            'avg_score': 0.9544346040302915,
            'feature_names': ('0', '1', '3')
        },
        2: {
            'cv_scores': np.array([0.97916667, 0.95918367, 0.9245283]),
            'feature_idx': (0, 2, 3),
            'avg_score': 0.9542928806742822,
            'feature_names': ('0', '2', '3')
        },
        3: {
            'cv_scores': np.array([0.97916667, 0.95918367, 0.94339623]),
            'feature_idx': (1, 2, 3),
            'avg_score': 0.9605821888503829,
            'feature_names': ('1', '2', '3')
        }
    }
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
Example #23
def wrapper_selection():
    print('--------------------------------------------------------')
    print('Using the wrapper technique...')
    models = [
        svm.SVC(),
        RandomForestClassifier(),
        GaussianNB(),
        LogisticRegression(),
        KNeighborsClassifier()
    ]
    for model in models:
        efs = ExhaustiveFeatureSelector(model,
                                        min_features=1,
                                        max_features=5,
                                        scoring='accuracy',
                                        cv=5)
        efs = efs.fit(data, labels)
        selected_features = columns[list(efs.best_idx_)]
        print(
            f'Selected features using {model}: {selected_features}')
Example #24
def wrapper(x_train_df):
    x_train = x_train_df.drop(["id", "failed test"], axis=1)
    y_train = x_train_df["failed test"]
    feature_selector = EFS(RandomForestClassifier(max_depth=17,
                                                  n_estimators=136,
                                                  max_features=0.307,
                                                  min_samples_split=30,
                                                  random_state=42),
                           min_features=6,
                           max_features=7,
                           scoring='neg_log_loss',
                           print_progress=True,
                           n_jobs=1,
                           cv=5)
    features = feature_selector.fit(x_train, y_train)
    print('Best neg_log_loss score: %.2f' % feature_selector.best_score_)
    print('Best subset (indices):', feature_selector.best_idx_)
    print('Best subset (corresponding names):',
          feature_selector.best_feature_names_)
    print('Subsets_: ', feature_selector.subsets_)
Example #25
def exhaustive_feature_selection(x_data, y_data, min_feat, max_feat):
    print(f"Applying exhaustive feature selection to numeric data")
    print(
        f"cat variables before backward feature selection {x_data.select_dtypes(include='object').columns.shape}"
    )
    print(
        f"numeric variables before backward feature selection {x_data.select_dtypes(include='number').columns.shape}"
    )

    numeric_cols = x_data.select_dtypes(include='number').columns

    temp = x_data[numeric_cols]

    efs = EFS(RandomForestRegressor(n_jobs=4),
              max_features=max_feat,
              min_features=min_feat,
              scoring='r2',
              print_progress=True,
              cv=2)

    efs.fit(temp, y_data)

    idx = efs.best_idx_

    print(idx)

    idx = list(idx)

    cols_to_keep = numeric_cols[idx]
    cols_to_drop = [x for x in numeric_cols if x not in cols_to_keep]

    print(len(cols_to_drop))

    x_data.drop(labels=cols_to_drop, axis=1, inplace=True)
    print(
        f"cat variables after exhaustive feature selection {x_data.select_dtypes(include='object').columns}"
    )
    print(
        f"numeric variables after exhaustive  feature selection {x_data.select_dtypes(include='number').columns}"
    )
    return x_data
    def create_data(X: dt.Frame = None) -> pd.DataFrame:
        if X is None:
            return []

        from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

        X = X.to_pandas()
        y = X[TARGET_COLUMN].values
        X.drop(TARGET_COLUMN, axis=1, inplace=True)

        efs = EFS(ESTIMATOR,
                  min_features=MIN_FEATURES,
                  max_features=MAX_FEATURES,
                  scoring=SCORING,
                  cv=CV,
                  n_jobs=-1)

        efs.fit(X, y)

        X_fs = X.iloc[:, list(efs.best_idx_)]

        return X_fs
Example #27
def exhaustive_feature_selection(x_train, y_train, model=None, num_features=[2, 5], classification_tasks=True,
                                 scoring=None):
    print("============== Exhaustive feature selection ===================")
    if not model:
        if classification_tasks:
            model = LogisticRegression(multi_class='multinomial',
                                       solver='lbfgs',
                                       random_state=123)
        else:
            model = Ridge()

    if not scoring:
        if classification_tasks:
            scoring = "accuracy"
        else:
            scoring = "neg_mean_absolute_error"

    efs = EFS(estimator=model,
               min_features=num_features[0],
               max_features=num_features[1],
               scoring=scoring,
               print_progress=False,
               clone_estimator=False,
               cv=10,
               n_jobs=2)

    efs = efs.fit(x_train.values, y_train.values)
    print('Best %s score: %.2f' % (scoring, efs.best_score_))
    col_list = []
    col_list.extend(efs.best_idx_)
    col_names = x_train.columns
    print('Best subset:', col_names[col_list].values)
    x_train = x_train.iloc[:,col_list]

    print("=================================")
    return x_train
def test_fit_params():
    iris = load_iris()
    X = iris.data
    y = iris.target
    sample_weight = np.ones(X.shape[0])
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    efs1 = EFS(forest,
               min_features=3,
               max_features=3,
               scoring='accuracy',
               cv=4,
               print_progress=False)
    efs1 = efs1.fit(X, y, sample_weight=sample_weight)
    expect = {0: {'feature_idx': (0, 1, 2),
                  'feature_names': ('0', '1', '2'),
                  'cv_scores': np.array([0.94871795, 0.92307692,
                                         0.91666667, 0.97222222]),
                  'avg_score': 0.9401709401709402},
              1: {'feature_idx': (0, 1, 3),
                  'feature_names': ('0', '1', '3'),
                  'cv_scores': np.array([0.92307692, 0.92307692,
                                         0.88888889, 1.]),
                  'avg_score': 0.9337606837606838},
              2: {'feature_idx': (0, 2, 3),
                  'feature_names': ('0', '2', '3'),
                  'cv_scores': np.array([0.97435897, 0.94871795,
                                         0.94444444, 0.97222222]),
                  'avg_score': 0.9599358974358974},
              3: {'feature_idx': (1, 2, 3),
                  'feature_names': ('1', '2', '3'),
                  'cv_scores': np.array([0.97435897, 0.94871795,
                                         0.91666667, 1.]),
                  'avg_score': 0.9599358974358974}}
    dict_compare_utility(d1=expect, d2=efs1.subsets_)
    assert efs1.best_idx_ == (0, 2, 3)
    assert round(efs1.best_score_, 4) == 0.9599
Example #29
    def brute_force(self, X, y, y_type):
        if y_type == "binary":
            est = LinearRegression()
            efs = EFS(
                estimator=est,
                min_features=1,
                max_features=2,
                scoring="neg_mean_squared_error",
                cv=5,
            )

            efs = efs.fit(X, y)
            efs_df = pd.DataFrame.from_dict(efs.get_metric_dict()).T
            efs_df.sort_values("avg_score", inplace=True, ascending=False)

        else:
            est = LogisticRegression()
            efs = EFS(
                estimator=est,
                min_features=1,
                max_features=2,
                scoring="neg_mean_squared_error",
                cv=5,
            )

            efs = efs.fit(X, y)
            efs_df = pd.DataFrame.from_dict(efs.get_metric_dict()).T
            efs_df.sort_values("avg_score", inplace=True, ascending=False)

        # horizontal bar chart
        fig, ax = plt.subplots(figsize=(12, 9))
        y_pos = np.arange(len(efs_df))
        ax.barh(y_pos, efs_df["avg_score"], xerr=efs_df["std_dev"])
        ax.set_yticks(y_pos)
        ax.set_xlabel("Avg Score")
        ax.set_ylabel("Feature Names")
        ax.tick_params(labelleft=False)
        plt.show()

        return efs_df
Example #30
X = pd.DataFrame(
    sc.fit_transform(newdata.drop(['Loan_ID', 'Loan_Status'], axis=1)),
    columns=[
        'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
        'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Gender_Male',
        'Married_Yes', 'Dependents_1', 'Dependents_2', 'Dependents_3+',
        'Education_Not Graduate', 'Self_Employed_Yes'
    ])
# Target class
Y = pd.DataFrame(newdata['Loan_Status'])
#Visualization
sns.pairplot(df1, hue="Loan_Status")
#Splitting data into train and test samples
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
#Running ExhaustiveFeatureSelector() for feature_selection on 3 different classifiers
efs = ExhaustiveFeatureSelector(RandomForestClassifier(),
                                max_features=6,
                                scoring='roc_auc',
                                cv=5)
efs_fit = efs.fit(X_train, Y_train)
selected_features = X_train.columns[list(efs_fit.best_idx_)]
print(selected_features)
print(efs_fit.best_score_)
rClassifier = RandomForestClassifier(random_state=0)
rClassifier.fit(X_train[selected_features], Y_train)
Y_RCF = rClassifier.predict(X_test[selected_features])
print(classification_report(Y_test, Y_RCF))
efs_naive = ExhaustiveFeatureSelector(GaussianNB(),
                                      max_features=6,
                                      scoring='roc_auc',
                                      cv=4)
efs_naive_fit = efs_naive.fit(X_train, Y_train)
selected_features_naive = X_train.columns[list(efs_naive_fit.best_idx_)]
Example #31
print('ROC AUC on training set: {}'.format(
    roc_auc_score(train_labels, train_pred[:, 1])))

test_pred = clf.predict_proba(test_features[filtered_features].fillna(0))
print('ROC AUC on test set: {}'.format(
    roc_auc_score(test_labels, test_pred[:, 1])))

##################################### XGBoost - Exhaustive feature Selector ############################################

from mlxtend.feature_selection import ExhaustiveFeatureSelector
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

feature_selector = ExhaustiveFeatureSelector(XGBClassifier(),
                                             min_features=2,
                                             max_features=10,
                                             scoring='roc_auc',
                                             print_progress=True,
                                             cv=2)

features = feature_selector.fit(np.array(train_features.fillna(0)),
                                train_labels)

print(type(features))
filtered_features = train_features.columns[list(features.best_idx_)]
print(filtered_features)

# see result of XGBoost:

clf = XGBClassifier(min_child_weight=5,
                    gamma=0.5,
                    subsample=0.6,
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.selector = ExhaustiveFeatureSelector(**kwargs)
        self.transform_cols = None
        self.stat_df = None
correlated_features = set()
correlation_matrix = paribas_data.corr()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[
                i, j]) > 0.8:  #0.8 is a correlation threshold value
            column_name = correlation_matrix.columns[i]
            correlated_features.add(column_name)

train_features.drop(labels=correlated_features, axis=1, inplace=True)
test_features.drop(labels=correlated_features, axis=1, inplace=True)

feature_selector = ExhaustiveFeatureSelector(RandomForestClassifier(n_jobs=-1),
                                             min_features=2,
                                             max_features=4,
                                             scoring='roc_auc',
                                             print_progress=True,
                                             cv=2)
features = feature_selector.fit(np.array(train_features.fillna(0)),
                                train_labels)

filtered_features = train_features.columns[list(features.best_idx_)]

clf = RandomForestClassifier(n_estimators=100, random_state=41, max_depth=3)
clf.fit(train_features[filtered_features].fillna(0), train_labels)

train_pred = clf.predict_proba(train_features[filtered_features].fillna(0))
print('ROC AUC on training set: {}'.format(
    roc_auc_score(train_labels, train_pred[:, 1])))

test_pred = clf.predict_proba(test_features[filtered_features].fillna(0))
that employs a search strategy to explore the space of possible feature subsets,
evaluating each subset by the performance of a given algorithm.
"""
"""
EXHAUSTIVE FEATURE SELECTION.
This method searches across all possible feature combinations.
Its aim is to find the best performing feature subset.
"""
# import the algorithm you want to evaluate on your features.
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.ensemble import RandomForestClassifier

# create the ExhaustiveFeatureSelector object.
efs = ExhaustiveFeatureSelector(RandomForestClassifier(),
                                min_features=45,
                                max_features=70,
                                scoring='accuracy',
                                cv=2)

# fit the object to the training data.
efs = efs.fit(x, y)

# print the selected features.
selected_features1 = x.columns[list(efs.best_idx_)]
print('selected features from exhaustive selection:', selected_features1)

# print the final prediction score.
print('accuracy:', efs.best_score_)

# transform to the newly selected features.
#X_train = efs.transform(X_train)
"""
mlxtend's ExhaustiveFeatureSelector selects the best subset of features from all possible combinations of the features.
link: http://rasbt.github.io/mlxtend/user_guide/feature_selection/ExhaustiveFeatureSelector/
"""
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
efs_1 = EFS(knn,
            min_features=1,
            max_features=4,
            scoring='accuracy',
            print_progress=True,
            cv=5)  # where knn is the model

efs_1 = efs_1.fit(X, y)  # fit on the training data before querying results
efs_1.get_metric_dict()
print('Selected features:', efs_1.best_idx_)
"""
Sequential feature selection is computationally cheaper than EFS; however, it should not be combined with embedded feature selection methods such as LASSO.
Compared to RFE, it is more computationally intensive because it relies on a scoring metric for feature selection,
whereas RFE relies on weight coefficients (linear models) or feature importances (tree-based algorithms).

There are 4 different flavors of SFAs available via the SequentialFeatureSelector
(each is mapped to the forward/floating parameters in the sketch after this docstring):

Sequential Forward Selection (SFS)
Sequential Backward Selection (SBS)
Sequential Forward Floating Selection (SFFS)
Sequential Backward Floating Selection (SBFS)
link: http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/
"""
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

sfs1 = SFS(knn,
           k_features=3,
Example #36
    print("The selected feature list:")
    print(feat_cols)
elif (choice == 2):
    sfs1 = sfs(clf,
               k_features=4,
               forward=False,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=5)
    # Perform SBS (sequential backward selection: forward=False, floating=False)
    sfs1 = sfs1.fit(X_train, y_train)

    feat_cols = list(sfs1.k_feature_idx_)
    print("******")
    print("The selected feature list:")
    print(feat_cols)
elif (choice == 3):
    efs1 = EFS(knn,
               min_features=4,
               max_features=5,
               scoring='accuracy',
               print_progress=True,
               cv=5)
    efs1 = efs1.fit(X_train, y_train)
    feat_cols = list(efs1.best_idx_)
    print("******")
    print("The selected feature list:")
    print(feat_cols)
else:
    print("Wrong Input")
Example #37
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

#%% load sample data
iris = load_iris()
x = pd.DataFrame(iris.data, \
    columns=iris.feature_names)

#%% create a logistic regression object
lr = LogisticRegression()

#%% create an EFS object
efs = EFS(estimator=lr,        
          min_features=1,      
          max_features=3,      
          scoring='accuracy',  
          cv=5)

#%% fit the model
efs = efs.fit(x, iris.target)

#%% show the selected features
efs.best_feature_names_
# console output:
# ('sepal length (cm)', 'petal length (cm)', 
# 'petal width (cm)')

#%% show a full report on the feature selection
efs_results = pd.DataFrame(efs.get_metric_dict()).\
    T. \
Example #38
def correlation(dataset, threshold):
    # collect columns whose correlation with an earlier column exceeds the threshold
    col_corr = set()
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colname = corr_matrix.columns[i]
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_train, 0.8)
print('correlated features:', len(set(corr_features)))

X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

efs1 = EFS(RandomForestClassifier(n_jobs=4, random_state=0),
           min_features=1,
           max_features=4,
           scoring='roc_auc',
           print_progress=True,
           cv=2)

efs1 = efs1.fit(np.array(X_train[X_train.columns[0:4]].fillna(0)), y_train)
select_feat = X_train.columns[list(efs1.best_idx_)]
select_feat

def run_randomForests(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc_auc: {}'.format(roc_auc_score(y_train, pred[:, 1])))
    print('Test set')
Example #39
df = df.sort_values(by=['importances'])
print('\n\n')

for feature_choices in [10, 20, 30, 40, 50]:
    for max_len in [5, 10]:

        these_choices = df.tail(feature_choices)
        #print(these_choices)
        #print(df)
        test_cols = these_choices['feature'].values
        print(test_cols)
        efs = EFS(
            estimator=rfc,
            min_features=3,
            max_features=max_len,
            print_progress=False,
            scoring='accuracy',
            n_jobs=15,
            cv=4,
        )

        start_time = time.time()
        try:
            efs = efs.fit(X_train[test_cols], y_train)
        except Exception:
            # skip feature subsets that fail to fit
            continue
        end_time = time.time()
        #print()
        #print(feature_choices, end_time - start_time)
        best_features = list(efs.best_feature_names_)
        best_score = efs.best_score_
Example #40
num_chunks = pd.read_csv("train_numeric.csv",
                         index_col=0,
                         usecols=list(range(969)),
                         chunksize=100000,
                         dtype=np.float32)
X = pd.concat([
    pd.concat([dchunk, nchunk], axis=1).sample(frac=0.05)
    for dchunk, nchunk in zip(date_chunks, num_chunks)
])
y = pd.read_csv("train_numeric.csv",
                index_col=0,
                usecols=[0, 969],
                dtype=np.float32).loc[X.index].values.ravel()
X = X.values
model = XGBClassifier()
efs1 = EFS(model, min_features=100, max_features=900, scoring='accuracy', cv=5)

efs1 = efs1.fit(X, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
important_indices = efs1.best_idx_

# Got important_indices from above code
#important_indices = []
print("Found important features %s" % important_indices)
# load entire dataset for these features.
# note where the feature indices are split so we can load the correct ones straight from read_csv
n_date_features = 1156
X = np.concatenate([
    pd.read_csv("train_date.csv",
                index_col=0,