Exemplo n.º 1
0
def test_selective_imputer_bad_strategies():
    # raises for a bad strategy string
    imputer = SelectiveImputer(strategy="bad strategy")
    assert_raises(ValueError, imputer.fit, X)

    # raises for a dim mismatch in cols and strategy
    imputer = SelectiveImputer(cols=['a'], strategy=['mean', 'mean'])
    assert_raises(ValueError, imputer.fit, X)

    # test type error for bad strategy
    imputer = SelectiveImputer(strategy=1)
    assert_raises(TypeError, imputer.fit, X)

    # test dict input that does not match dim-wise
    imputer = SelectiveImputer(cols=['a'],
                               strategy={
                                   'a': 'mean',
                                   'b': 'median'
                               })
    assert_raises(ValueError, imputer.fit, X)

    # test a dict input with bad columns breaks
    imputer = SelectiveImputer(strategy={'a': 'mean', 'D': 'median'})
    assert_raises(ValueError, imputer.fit, X)
Exemplo n.º 2
0
def test_complex_grid_search():
    # build a pipeline
    pipe = Pipeline([
        ('dropper', FeatureFilter()),  # won't drop any
        ('collinearity', MultiCorrFilter(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through since all full
        ('scaler', SelectiveMaxAbsScaler()),
        ('boxcox', BoxCoxTransformer(suppress_warnings=True)),
        ('nzv', NearZeroVarianceFilter()),
        ('pca', SelectivePCA(n_components=0.9)),
        ('custom', make_transformer(subtract_k, k=1)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'custom__k': [1, 2, 3],
        'custom__func': [subtract_k, add_k],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15)
    }

    # define the gridsearch
    search = RandomizedSearchCV(
        pipe,
        hp,
        n_iter=2,  # just to test it even works
        scoring='accuracy',
        cv=cv,
        random_state=42,
        # in parallel so we are testing pickling of the classes
        n_jobs=2)

    # fit the search
    search.fit(X_train, y_train)

    # Show we can profile the best estimator
    profile_estimator(search.best_estimator_)

    # Assert that it's persistable
    assert_persistable(pipe, "location.pkl", X_train, y_train)
Exemplo n.º 3
0
def test_complex_grid_search():
    # build a pipeline
    pipe = Pipeline([
        ('dropper', FeatureFilter()),  # won't drop any
        ('collinearity', MultiCorrFilter(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through since all full
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilter()),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [None, RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15)
    }

    # define the gridsearch
    search = RandomizedSearchCV(
        pipe,
        hp,
        n_iter=2,  # just to test it even works
        scoring='accuracy',
        cv=cv,
        random_state=42)

    # fit the search
    search.fit(X_train, y_train)