def test_stacked_different_input_pipelines_classification():
    """Constructing a stacked classifier from pipelines of mixed problem types raises ValueError."""
    mixed_problem_types = (ProblemTypes.MULTICLASS, ProblemTypes.BINARY)
    input_pipelines = [
        make_pipeline_from_components([RandomForestClassifier()], kind)
        for kind in mixed_problem_types
    ]
    expected_message = "All pipelines must have the same problem type."
    with pytest.raises(ValueError, match=expected_message):
        StackedEnsembleClassifier(input_pipelines=input_pipelines)
def test_stacked_fit_predict_classification(X_y_binary, X_y_multi, stackable_classifiers, problem_type):
    """Fit a stacked ensemble classifier and sanity-check ``predict`` / ``predict_proba``.

    The same checks run twice: first with the ensemble built without a
    ``final_estimator`` argument, then with an explicit
    ``RandomForestClassifier`` final estimator.

    NOTE(review): ``problem_type`` is presumably supplied by a parametrize
    decorator or fixture outside this view -- confirm.
    """
    if problem_type == ProblemTypes.BINARY:
        (X, y), num_classes = X_y_binary, 2
    elif problem_type == ProblemTypes.MULTICLASS:
        (X, y), num_classes = X_y_multi, 3

    input_pipelines = [
        make_pipeline_from_components([classifier], problem_type)
        for classifier in stackable_classifiers
    ]

    # Pass 1: no final estimator given.
    ensemble = StackedEnsembleClassifier(input_pipelines=input_pipelines, n_jobs=1)
    ensemble.fit(X, y)

    predictions = ensemble.predict(X)
    assert len(predictions) == len(y)
    assert isinstance(predictions, ww.DataColumn)
    # Only guards against the degenerate "every prediction is NaN" outcome.
    every_prediction_nan = np.isnan(predictions.to_series()).all()
    assert not every_prediction_nan

    probabilities = ensemble.predict_proba(X)
    assert isinstance(probabilities, ww.DataTable)
    assert probabilities.shape == (len(y), num_classes)
    every_probability_nan = np.isnan(probabilities.to_dataframe()).all().all()
    assert not every_probability_nan

    # Pass 2: explicit random forest final estimator.
    ensemble = StackedEnsembleClassifier(
        input_pipelines=input_pipelines,
        final_estimator=RandomForestClassifier(),
        n_jobs=1,
    )
    ensemble.fit(X, y)

    predictions = ensemble.predict(X)
    assert len(predictions) == len(y)
    assert isinstance(predictions, ww.DataColumn)
    every_prediction_nan = np.isnan(predictions.to_series()).all()
    assert not every_prediction_nan

    probabilities = ensemble.predict_proba(X)
    assert probabilities.shape == (len(y), num_classes)
    assert isinstance(probabilities, ww.DataTable)
    every_probability_nan = np.isnan(probabilities.to_dataframe()).all().all()
    assert not every_probability_nan
def test_get_component(example_graph):
    """``get_component`` yields classes before ``instantiate`` and equal instances afterwards."""
    component_graph = ComponentGraph(example_graph)

    # Before instantiation the graph hands back component classes.
    encoder = component_graph.get_component('OneHot_ElasticNet')
    assert encoder == OneHotEncoder
    estimator = component_graph.get_component('Logistic Regression')
    assert estimator == LogisticRegressionClassifier

    # Unknown names are rejected.
    with pytest.raises(ValueError, match='not in the graph'):
        component_graph.get_component('Fake Component')

    instantiation_parameters = {
        'OneHot_RandomForest': {'top_n': 3},
        'Random Forest': {'max_depth': 4, 'n_estimators': 50},
    }
    component_graph.instantiate(instantiation_parameters)

    # After instantiation the graph hands back instances carrying the given parameters.
    encoder = component_graph.get_component('OneHot_ElasticNet')
    assert encoder == OneHotEncoder()
    encoder = component_graph.get_component('OneHot_RandomForest')
    assert encoder == OneHotEncoder(top_n=3)
    estimator = component_graph.get_component('Random Forest')
    assert estimator == RandomForestClassifier(n_estimators=50, max_depth=4)
def test_scikit_learn_wrapper_invalid_problem_type():
    """Wrapping a pipeline whose ``problem_type`` was set to None raises ValueError."""
    evalml_pipeline = make_pipeline_from_components(
        [RandomForestClassifier()],
        ProblemTypes.MULTICLASS,
    )
    # Clear the problem type so the wrapper has nothing valid to work with.
    evalml_pipeline.problem_type = None

    expected_message = "Could not wrap EvalML object in scikit-learn wrapper."
    with pytest.raises(ValueError, match=expected_message):
        scikit_learn_wrapped_estimator(evalml_pipeline)
def test_get_estimators(example_graph):
    """``get_estimators`` raises before instantiation and lists the estimators afterwards."""
    graph = ComponentGraph(example_graph)

    # Asking for estimators before instantiation is an error.
    with pytest.raises(ValueError, match='Cannot get estimators until'):
        graph.get_estimators()

    graph.instantiate({})
    estimators = graph.get_estimators()
    assert estimators == [
        RandomForestClassifier(),
        ElasticNetClassifier(),
        LogisticRegressionClassifier(),
    ]

    # A graph built from these two non-estimator components reports an empty list.
    transformer_only_graph = ComponentGraph.from_list(['Imputer', 'One Hot Encoder'])
    transformer_only_graph.instantiate({})
    estimators = transformer_only_graph.get_estimators()
    assert estimators == []
def test_generate_code_errors():
    """``generate_component_code`` rejects a pipeline and bare component classes with ValueError."""
    expected_message = "Element must be a component instance"

    # A pipeline object; it is built inside the context manager, as before.
    with pytest.raises(ValueError, match=expected_message):
        generate_component_code(
            make_pipeline_from_components([RandomForestClassifier()], ProblemTypes.BINARY)
        )

    # Component classes (as opposed to instances) are rejected the same way.
    for component_class in (LinearRegressor, Imputer, ComponentBase):
        with pytest.raises(ValueError, match=expected_message):
            generate_component_code(component_class)
def test_iteration(example_graph):
    """Iterating a ComponentGraph yields classes before ``instantiate`` and instances after."""
    component_graph = ComponentGraph(example_graph)

    expected_classes = [
        Imputer,
        OneHotEncoder,
        ElasticNetClassifier,
        OneHotEncoder,
        RandomForestClassifier,
        LogisticRegressionClassifier,
    ]
    visited = list(component_graph)
    assert visited == expected_classes

    component_graph.instantiate({'OneHot_RandomForest': {'top_n': 32}})

    expected_instances = [
        Imputer(),
        OneHotEncoder(),
        ElasticNetClassifier(),
        OneHotEncoder(top_n=32),
        RandomForestClassifier(),
        LogisticRegressionClassifier(),
    ]
    visited = list(component_graph)
    assert visited == expected_instances
def test_serialization(X_y_binary, tmpdir, helper_functions):
    """Round-trip every component through ``save`` / ``ComponentBase.load``.

    Each class returned by ``all_components()`` is instantiated, fitted on the
    binary dataset, and saved and reloaded once per pickle protocol from 0 up
    to ``cloudpickle.DEFAULT_PROTOCOL``. The reloaded component must report the
    same parameters and description, and, for estimators other than the
    stacked ensembles, the same feature importance.
    """
    X, y = X_y_binary
    path = os.path.join(str(tmpdir), 'component.pkl')
    for component_class in all_components():
        print('Testing serialization of component {}'.format(component_class.name))
        try:
            component = helper_functions.safe_init_component_with_njobs_1(component_class)
        except EnsembleMissingPipelinesError:
            # Stacked ensembles need input pipelines, so build them by hand.
            if component_class == StackedEnsembleClassifier:
                component = component_class(
                    input_pipelines=[
                        make_pipeline_from_components([RandomForestClassifier()], ProblemTypes.BINARY)
                    ],
                    n_jobs=1,
                )
            elif component_class == StackedEnsembleRegressor:
                component = component_class(
                    input_pipelines=[
                        make_pipeline_from_components([RandomForestRegressor()], ProblemTypes.REGRESSION)
                    ],
                    n_jobs=1,
                )
            else:
                # Without this branch ``component`` would be unbound or, worse,
                # still bound to the previous iteration's component, and the
                # wrong object would be serialized under this class's name.
                raise

        component.fit(X, y)

        for pickle_protocol in range(cloudpickle.DEFAULT_PROTOCOL + 1):
            component.save(path, pickle_protocol=pickle_protocol)
            loaded_component = ComponentBase.load(path)
            assert component.parameters == loaded_component.parameters
            assert component.describe(return_dict=True) == loaded_component.describe(return_dict=True)
            # The feature-importance comparison is skipped for the stacked ensembles.
            is_stacked_ensemble = isinstance(
                component, (StackedEnsembleClassifier, StackedEnsembleRegressor)
            )
            if issubclass(component_class, Estimator) and not is_stacked_ensemble:
                assert (component.feature_importance == loaded_component.feature_importance).all()
def test_describe_component():
    """Check the name and parameters reported by ``describe(return_dict=True)``.

    Transformers are covered first, then estimators. The XGBoost, CatBoost and
    LightGBM checks are each wrapped so that an ImportError skips that group.
    """

    def _described(component):
        # Shorthand for the dictionary form of a component's description.
        return component.describe(return_dict=True)

    # --- transformers -----------------------------------------------------
    one_hot = OneHotEncoder()
    imputer = Imputer()
    simple_imputer = SimpleImputer("mean")
    per_column_imputer = PerColumnImputer({"a": "mean", "b": ("constant", 100)})
    scaler = StandardScaler()
    rf_select_clf = RFClassifierSelectFromModel(
        n_estimators=10, number_features=5, percent_features=0.3, threshold=-np.inf
    )
    rf_select_reg = RFRegressorSelectFromModel(
        n_estimators=10, number_features=5, percent_features=0.3, threshold=-np.inf
    )
    drop_columns = DropColumns(columns=['col_one', 'col_two'])
    drop_null_columns = DropNullColumns()
    datetime_featurizer = DateTimeFeaturizer()
    text_featurizer = TextFeaturizer()
    lsa = LSA()
    pca = PCA()
    lda = LinearDiscriminantAnalysis()
    dfs = DFSTransformer()
    undersampler = Undersampler()

    assert _described(one_hot) == {
        'name': 'One Hot Encoder',
        'parameters': {
            'top_n': 10,
            'features_to_encode': None,
            'categories': None,
            'drop': 'if_binary',
            'handle_unknown': 'ignore',
            'handle_missing': 'error',
        },
    }
    assert _described(imputer) == {
        'name': 'Imputer',
        'parameters': {
            'categorical_impute_strategy': "most_frequent",
            'categorical_fill_value': None,
            'numeric_impute_strategy': "mean",
            'numeric_fill_value': None,
        },
    }
    assert _described(simple_imputer) == {
        'name': 'Simple Imputer',
        'parameters': {'impute_strategy': 'mean', 'fill_value': None},
    }
    assert _described(per_column_imputer) == {
        'name': 'Per Column Imputer',
        'parameters': {
            'impute_strategies': {'a': 'mean', 'b': ('constant', 100)},
            'default_impute_strategy': 'most_frequent',
        },
    }
    assert _described(scaler) == {'name': 'Standard Scaler', 'parameters': {}}

    # Both random-forest feature selectors were built with the same arguments.
    rf_select_parameters = {
        'number_features': 5,
        'n_estimators': 10,
        'max_depth': None,
        'percent_features': 0.3,
        'threshold': -np.inf,
        'n_jobs': -1,
    }
    assert _described(rf_select_clf) == {
        'name': 'RF Classifier Select From Model',
        'parameters': rf_select_parameters,
    }
    assert _described(rf_select_reg) == {
        'name': 'RF Regressor Select From Model',
        'parameters': rf_select_parameters,
    }

    assert _described(drop_columns) == {
        'name': 'Drop Columns Transformer',
        'parameters': {'columns': ['col_one', 'col_two']},
    }
    assert _described(drop_null_columns) == {
        'name': 'Drop Null Columns Transformer',
        'parameters': {'pct_null_threshold': 1.0},
    }
    assert _described(datetime_featurizer) == {
        'name': 'DateTime Featurization Component',
        'parameters': {
            'features_to_extract': ['year', 'month', 'day_of_week', 'hour'],
            'encode_as_categories': False,
        },
    }
    assert _described(text_featurizer) == {'name': 'Text Featurization Component', 'parameters': {}}
    assert _described(lsa) == {'name': 'LSA Transformer', 'parameters': {}}
    assert _described(pca) == {
        'name': 'PCA Transformer',
        'parameters': {'n_components': None, 'variance': 0.95},
    }
    assert _described(lda) == {
        'name': 'Linear Discriminant Analysis Transformer',
        'parameters': {'n_components': None},
    }
    assert _described(dfs) == {'name': 'DFS Transformer', 'parameters': {"index": "index"}}
    assert _described(undersampler) == {
        'name': 'Undersampler',
        'parameters': {"balanced_ratio": 4, "min_samples": 100, "min_percentage": 0.1},
    }

    # --- estimators -------------------------------------------------------
    baseline_clf = BaselineClassifier()
    baseline_reg = BaselineRegressor()
    logistic_clf = LogisticRegressionClassifier()
    elastic_net_clf = ElasticNetClassifier()
    elastic_net_reg = ElasticNetRegressor()
    extra_trees_clf = ExtraTreesClassifier(n_estimators=10, max_features="auto")
    extra_trees_reg = ExtraTreesRegressor(n_estimators=10, max_features="auto")
    random_forest_clf = RandomForestClassifier(n_estimators=10, max_depth=3)
    random_forest_reg = RandomForestRegressor(n_estimators=10, max_depth=3)
    linear_reg = LinearRegressor()
    svm_clf = SVMClassifier()
    svm_reg = SVMRegressor()

    assert _described(baseline_clf) == {
        'name': 'Baseline Classifier',
        'parameters': {'strategy': 'mode'},
    }
    assert _described(baseline_reg) == {
        'name': 'Baseline Regressor',
        'parameters': {'strategy': 'mean'},
    }
    assert _described(logistic_clf) == {
        'name': 'Logistic Regression Classifier',
        'parameters': {
            'penalty': 'l2',
            'C': 1.0,
            'n_jobs': -1,
            'multi_class': 'auto',
            'solver': 'lbfgs',
        },
    }
    assert _described(elastic_net_clf) == {
        'name': 'Elastic Net Classifier',
        'parameters': {
            'alpha': 0.5,
            'l1_ratio': 0.5,
            'n_jobs': -1,
            'max_iter': 1000,
            "loss": 'log',
            'penalty': 'elasticnet',
        },
    }
    assert _described(elastic_net_reg) == {
        'name': 'Elastic Net Regressor',
        'parameters': {'alpha': 0.5, 'l1_ratio': 0.5, 'max_iter': 1000, 'normalize': False},
    }

    extra_trees_parameters = {
        'n_estimators': 10,
        'max_features': 'auto',
        'max_depth': 6,
        'min_samples_split': 2,
        'min_weight_fraction_leaf': 0.0,
        'n_jobs': -1,
    }
    assert _described(extra_trees_clf) == {
        'name': 'Extra Trees Classifier',
        'parameters': extra_trees_parameters,
    }
    assert _described(extra_trees_reg) == {
        'name': 'Extra Trees Regressor',
        'parameters': extra_trees_parameters,
    }

    random_forest_parameters = {'n_estimators': 10, 'max_depth': 3, 'n_jobs': -1}
    assert _described(random_forest_clf) == {
        'name': 'Random Forest Classifier',
        'parameters': random_forest_parameters,
    }
    assert _described(random_forest_reg) == {
        'name': 'Random Forest Regressor',
        'parameters': random_forest_parameters,
    }

    assert _described(linear_reg) == {
        'name': 'Linear Regressor',
        'parameters': {'fit_intercept': True, 'normalize': False, 'n_jobs': -1},
    }
    assert _described(svm_clf) == {
        'name': 'SVM Classifier',
        'parameters': {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale', 'probability': True},
    }
    assert _described(svm_reg) == {
        'name': 'SVM Regressor',
        'parameters': {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale'},
    }

    # --- estimators guarded by ``except ImportError`` ----------------------
    try:
        xgboost_clf = XGBoostClassifier(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75)
        xgboost_reg = XGBoostRegressor(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75)
        xgboost_parameters = {
            'eta': 0.1,
            'max_depth': 3,
            'min_child_weight': 1,
            'n_estimators': 75,
        }
        assert _described(xgboost_clf) == {
            'name': 'XGBoost Classifier',
            'parameters': xgboost_parameters,
        }
        assert _described(xgboost_reg) == {
            'name': 'XGBoost Regressor',
            'parameters': xgboost_parameters,
        }
    except ImportError:
        pass

    try:
        catboost_clf = CatBoostClassifier()
        catboost_reg = CatBoostRegressor()
        # The two expectations differ only in the 'silent' value.
        assert _described(catboost_clf) == {
            'name': 'CatBoost Classifier',
            'parameters': {
                'allow_writing_files': False,
                'n_estimators': 10,
                'eta': 0.03,
                'max_depth': 6,
                'bootstrap_type': None,
                'silent': True,
            },
        }
        assert _described(catboost_reg) == {
            'name': 'CatBoost Regressor',
            'parameters': {
                'allow_writing_files': False,
                'n_estimators': 10,
                'eta': 0.03,
                'max_depth': 6,
                'bootstrap_type': None,
                'silent': False,
            },
        }
    except ImportError:
        pass

    try:
        lightgbm_clf = LightGBMClassifier()
        lightgbm_reg = LightGBMRegressor()
        # The two expectations differ only in the 'n_estimators' value.
        assert _described(lightgbm_clf) == {
            'name': 'LightGBM Classifier',
            'parameters': {
                'boosting_type': 'gbdt',
                'learning_rate': 0.1,
                'n_estimators': 100,
                'max_depth': 0,
                'num_leaves': 31,
                'min_child_samples': 20,
                'n_jobs': -1,
                'bagging_fraction': 0.9,
                'bagging_freq': 0,
            },
        }
        assert _described(lightgbm_reg) == {
            'name': 'LightGBM Regressor',
            'parameters': {
                'boosting_type': 'gbdt',
                'learning_rate': 0.1,
                'n_estimators': 20,
                'max_depth': 0,
                'num_leaves': 31,
                'min_child_samples': 20,
                'n_jobs': -1,
                'bagging_fraction': 0.9,
                'bagging_freq': 0,
            },
        }
    except ImportError:
        pass
def test_make_pipeline_from_components(X_y_binary, logistic_regression_binary_pipeline_class):
    """Exercise ``make_pipeline_from_components``: validation errors and built pipelines.

    Covers, in order:
      * the error raised for each kind of invalid argument,
      * a two-component pipeline (class list, seeds, problem type, name, parameters),
      * a custom ``Estimator`` subclass whose own seed differs from the pipeline's,
      * a pipeline rebuilt from another pipeline's components, which must
        predict and describe itself identically to the source pipeline.
    """
    # --- argument validation ----------------------------------------------
    with pytest.raises(ValueError, match="Pipeline needs to have an estimator at the last position of the component list"):
        make_pipeline_from_components([Imputer()], problem_type='binary')

    with pytest.raises(KeyError, match="Problem type 'invalid_type' does not exist"):
        make_pipeline_from_components([RandomForestClassifier()], problem_type='invalid_type')

    with pytest.raises(TypeError, match="Custom pipeline name must be a string"):
        make_pipeline_from_components([RandomForestClassifier()], problem_type='binary', custom_name=True)

    # A component class, rather than an instance, is rejected.
    with pytest.raises(TypeError, match="Every element of `component_instances` must be an instance of ComponentBase"):
        make_pipeline_from_components([RandomForestClassifier], problem_type='binary')

    # So is a component name given as a string.
    with pytest.raises(TypeError, match="Every element of `component_instances` must be an instance of ComponentBase"):
        make_pipeline_from_components(['RandomForestClassifier'], problem_type='binary')

    # --- pipeline built from two component instances -----------------------
    imp = Imputer(numeric_impute_strategy='median', random_seed=5)
    est = RandomForestClassifier(random_seed=7)
    pipeline = make_pipeline_from_components([imp, est], ProblemTypes.BINARY, custom_name='My Pipeline', random_seed=15)
    assert [c.__class__ for c in pipeline] == [Imputer, RandomForestClassifier]
    # Every component must carry the pipeline's seed, not the seed it was
    # created with. (Asserting the bare list was always truthy, so the check
    # could never fail.)
    assert all(c.random_seed == 15 for c in pipeline)
    assert pipeline.problem_type == ProblemTypes.BINARY
    assert pipeline.custom_name == 'My Pipeline'
    expected_parameters = {
        'Imputer': {
            'categorical_impute_strategy': 'most_frequent',
            'numeric_impute_strategy': 'median',
            'categorical_fill_value': None,
            'numeric_fill_value': None},
        'Random Forest Classifier': {
            'n_estimators': 100,
            'max_depth': 6,
            'n_jobs': -1}
    }
    assert pipeline.parameters == expected_parameters
    assert pipeline.random_seed == 15

    # --- custom estimator subclass ------------------------------------------
    class DummyEstimator(Estimator):
        name = "Dummy!"
        model_family = "foo"
        supported_problem_types = [ProblemTypes.BINARY]
        parameters = {'bar': 'baz'}

    random_seed = 42
    pipeline = make_pipeline_from_components([DummyEstimator(random_seed=3)], ProblemTypes.BINARY, random_seed=random_seed)
    components_list = [c for c in pipeline]
    assert len(components_list) == 1
    assert isinstance(components_list[0], DummyEstimator)
    # The component was created with seed 3 but must report the pipeline's seed.
    assert components_list[0].random_seed == random_seed
    expected_parameters = {'Dummy!': {'bar': 'baz'}}
    assert pipeline.parameters == expected_parameters
    assert pipeline.random_seed == random_seed

    # --- pipeline rebuilt from an existing pipeline's components -----------
    X, y = X_y_binary
    pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, random_seed=42)
    component_instances = [c for c in pipeline]
    new_pipeline = make_pipeline_from_components(component_instances, ProblemTypes.BINARY)
    pipeline.fit(X, y)
    predictions = pipeline.predict(X)
    new_pipeline.fit(X, y)
    new_predictions = new_pipeline.predict(X)
    assert np.array_equal(predictions, new_predictions)
    assert np.array_equal(pipeline.feature_importance, new_pipeline.feature_importance)
    assert new_pipeline.name == 'Templated Pipeline'
    assert pipeline.parameters == new_pipeline.parameters
    for component, new_component in zip(pipeline._component_graph, new_pipeline._component_graph):
        assert isinstance(new_component, type(component))
    assert pipeline.describe() == new_pipeline.describe()