def test_complex_grid_search():
    """Run a small randomized search over a multi-stage pipeline.

    The pipeline chains several transformers in front of a random forest
    classifier. After fitting the search, the best estimator is handed to
    ``profile_estimator`` and the pipeline object is passed to
    ``assert_persistable``.
    """
    # Pre-processing stages followed by the final classifier.
    steps = [
        ('dropper', FeatureFilter()),  # no features get dropped here
        ('collinearity', MultiCorrFilter(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # data is complete: pass-through
        ('scaler', SelectiveMaxAbsScaler()),
        ('boxcox', BoxCoxTransformer(suppress_warnings=True)),
        ('nzv', NearZeroVarianceFilter()),
        ('pca', SelectivePCA(n_components=0.9)),
        ('custom', make_transformer(subtract_k, k=1)),
        ('model', RandomForestClassifier(n_jobs=1)),
    ]
    pipeline = Pipeline(steps)

    # Distributions / candidate lists sampled by the randomized search.
    param_distributions = {
        'collinearity__threshold': uniform(loc=0.8, scale=0.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'pca__n_components': uniform(loc=0.75, scale=0.2),
        'pca__whiten': [True, False],
        'custom__k': [1, 2, 3],
        'custom__func': [subtract_k, add_k],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=0.5, scale=0.5),
        'model__max_leaf_nodes': randint(10, 15),
    }

    # Two iterations are enough for a smoke test; n_jobs=2 runs the fits in
    # parallel so that pickling of the custom classes is exercised as well.
    searcher = RandomizedSearchCV(
        pipeline,
        param_distributions,
        n_iter=2,
        scoring='accuracy',
        cv=cv,
        random_state=42,
        n_jobs=2,
    )
    searcher.fit(X_train, y_train)

    # The winning estimator must be profilable.
    profile_estimator(searcher.best_estimator_)

    # Persistence is checked on the pipeline object given to the search,
    # not on ``best_estimator_``.
    assert_persistable(pipeline, "location.pkl", X_train, y_train)
def test_haversine_persistable():
    """Check ``HaversineFeatures`` with ``assert_persistable``.

    The transformer is configured with two (lat, lon) column pairs.
    """
    coordinate_pairs = [('from_lat', 'from_lon'), ('to_lat', 'to_lon')]
    transformer = HaversineFeatures(cols=coordinate_pairs)
    assert_persistable(transformer, "location.pkl", X)
def test_schema_persistable():
    """Check a ``SchemaNormalizer`` built from ``schema`` with
    ``assert_persistable``."""
    normalizer = SchemaNormalizer(schema)
    assert_persistable(normalizer, "location.pkl", X)
def test_date_transformer_persistable():
    """Check ``DateTransformer`` with ``assert_persistable``.

    Three columns are converted; only the first is given an explicit
    date format, the other two are passed ``None``.
    """
    date_columns = ["b", "c", "d"]
    formats = ["%m/%d/%Y", None, None]
    transformer = DateTransformer(cols=date_columns, date_format=formats)
    assert_persistable(transformer, "location.pkl", df)
def test_all_persistable():
    """Check the power transformers with ``assert_persistable``.

    Each class is instantiated with its default arguments.
    """
    # NOTE(review): other functions in this view are also named
    # ``test_all_persistable``; if they share a module with this one, only
    # the last definition is collected by pytest -- confirm.
    transformer_classes = (BoxCoxTransformer, YeoJohnsonTransformer)
    for transformer_cls in transformer_classes:
        instance = transformer_cls()
        assert_persistable(instance, "location.pkl", X)
def test_all_persistable():
    """Check the selective scalers with ``assert_persistable``.

    Each class is instantiated with its default arguments.
    """
    # NOTE(review): other functions in this view are also named
    # ``test_all_persistable``; if they share a module with this one, only
    # the last definition is collected by pytest -- confirm.
    scaler_classes = (
        SelectiveStandardScaler,
        SelectiveRobustScaler,
        SelectiveMinMaxScaler,
        SelectiveMaxAbsScaler,
    )
    for scaler_cls in scaler_classes:
        instance = scaler_cls()
        assert_persistable(instance, "location.pkl", X)
def test_all_persistable():
    """Check the selective decomposition classes with
    ``assert_persistable``.

    Each class is instantiated with its default arguments.
    """
    # NOTE(review): other functions in this view are also named
    # ``test_all_persistable``; if they share a module with this one, only
    # the last definition is collected by pytest -- confirm.
    decomposition_classes = (
        SelectivePCA,
        SelectiveTruncatedSVD,
        SelectiveNMF,
        SelectiveKernelPCA,
        SelectiveIncrementalPCA,
    )
    for decomposition_cls in decomposition_classes:
        instance = decomposition_cls()
        assert_persistable(instance, location="location.pkl", X=X)
def test_all_persistable():
    """Check the feature-selection filters with ``assert_persistable``.

    Each class is instantiated with its default arguments and checked
    against ``iris``.
    """
    # NOTE(review): other functions in this view are also named
    # ``test_all_persistable``; if they share a module with this one, only
    # the last definition is collected by pytest -- confirm.
    filter_classes = (
        FeatureFilter,
        SparseFeatureFilter,
        MultiCorrFilter,
        NearZeroVarianceFilter,
    )
    for filter_cls in filter_classes:
        instance = filter_cls()
        assert_persistable(instance, location="loc.pkl", X=iris)
def test_classifier_imputers_persistable():
    """Check the classifier-based imputers with ``assert_persistable``.

    Each imputer targets the ``'label'`` column and uses a fixed
    ``random_state``.
    """
    imputer_classes = (BaggedClassifierImputer,)
    for imputer_cls in imputer_classes:
        imputer = imputer_cls(cols=['label'], random_state=42)
        assert_persistable(imputer, "location.pkl", Y)
def test_regressor_imputers_persistable():
    """Check the default-constructed imputers with
    ``assert_persistable``."""
    imputer_classes = (SelectiveImputer, BaggedRegressorImputer)
    for imputer_cls in imputer_classes:
        imputer = imputer_cls()
        assert_persistable(imputer, "location.pkl", X)
def test_binning_persistable():
    """Check a default ``BinningTransformer`` with
    ``assert_persistable``."""
    binner = BinningTransformer()
    assert_persistable(binner, "location.pkl", iris)
def test_interaction_persistable():
    """Check ``InteractionTermTransformer`` on columns ``'a'`` and ``'b'``
    with ``assert_persistable``."""
    transformer = InteractionTermTransformer(cols=['a', 'b'])
    assert_persistable(transformer, location='loc.pkl', X=X_pd)
def test_combos_persistable():
    """Check a default ``LinearCombinationFilter`` with
    ``assert_persistable``."""
    combo_filter = LinearCombinationFilter()
    assert_persistable(combo_filter, location="loc.pkl", X=X)
def test_time_deltas_persistable():
    """Check ``TimeDeltaFeatures`` on columns ``'b'`` and ``'c'`` with
    ``assert_persistable``."""
    transformer = TimeDeltaFeatures(cols=['b', 'c'])
    assert_persistable(transformer, location='loc.pkl', X=df2)
def test_date_factorizer_persistable():
    """Check ``DateFactorizer`` on column ``'b'`` with
    ``assert_persistable``."""
    factorizer = DateFactorizer(cols=['b'])
    assert_persistable(factorizer, location="loc.pkl", X=df)
def test_dummy_persistable():
    """Check ``DummyEncoder`` with ``assert_persistable``.

    The encoder is given every column name of ``iris``.
    """
    all_columns = iris.columns.tolist()
    encoder = DummyEncoder(cols=all_columns)
    assert_persistable(encoder, "location.pkl", iris)