def test_baseline_binary_mode(data_type, make_data_type):
    """Check the "mode" strategy on a binary target.

    The target holds 10 three times and 11 once, so the test expects every
    prediction to be 10, a probability of 1.0 for class 10 and 0.0 for
    class 11, and an all-zero feature importance vector.
    """
    raw_features = pd.DataFrame({
        'one': [1, 2, 3, 4],
        'two': [2, 3, 4, 5],
        'three': [1, 2, 3, 4],
    })
    raw_target = pd.Series([10, 11, 10, 10])
    X = make_data_type(data_type, raw_features)
    y = make_data_type(data_type, raw_target)

    clf = BaselineClassifier(strategy="mode")
    # fit() is expected to hand back a BaselineClassifier instance.
    assert isinstance(clf.fit(X, y), BaselineClassifier)
    assert clf.classes_ == [10, 11]

    n_rows = X.shape[0]
    expected_predictions = pd.Series(np.array([10] * n_rows), dtype="Int64")
    predictions = clf.predict(X)
    assert_series_equal(expected_predictions, predictions.to_series())

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (X.shape[0], 2)
    expected_predictions_proba = pd.DataFrame({
        10: [1.0, 1.0, 1.0, 1.0],
        11: [0.0, 0.0, 0.0, 0.0],
    })
    assert_frame_equal(expected_predictions_proba, predicted_proba.to_dataframe())

    n_features = X.shape[1]
    np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * n_features))
def test_stacked_ensemble_nonstackable_model_families():
    """A baseline-classifier pipeline must be rejected as a stacking base.

    Building a StackedEnsembleClassifier from a pipeline that consists only
    of a BaselineClassifier is expected to raise ValueError with the message
    matched below.
    """
    expected_message = "Pipelines with any of the following model families cannot be used as base pipelines"
    with pytest.raises(ValueError, match=expected_message):
        # Both constructions stay inside the context manager so that a
        # ValueError from either one is what the test observes.
        baseline_pipeline = make_pipeline_from_components([BaselineClassifier()], ProblemTypes.BINARY)
        StackedEnsembleClassifier(input_pipelines=[baseline_pipeline])
def test_baseline_binary_random_weighted(X_y_binary):
    """Check the "random_weighted" strategy on a binary target.

    Predictions are compared against a draw from ``get_random_state(0)``
    weighted by the observed class frequencies, and every row of the
    predicted probabilities is expected to equal those frequencies.
    Feature importance is expected to be all zeros.
    """
    X, y = X_y_binary
    values, counts = np.unique(y, return_counts=True)
    percent_freq = counts.astype(float) / len(y)
    # The frequencies come from floating-point division, so their sum is
    # compared to 1.0 with a tolerance rather than with exact equality.
    assert np.isclose(percent_freq.sum(), 1.0)

    clf = BaselineClassifier(strategy="random_weighted", random_state=0)
    clf.fit(X, y)
    assert clf.classes_ == [0, 1]

    # `values` already holds np.unique(y), so it is reused as the sample pool.
    expected_predictions = pd.Series(
        get_random_state(0).choice(values, len(X), p=percent_freq),
        dtype="Int64",
    )
    predictions = clf.predict(X)
    assert_series_equal(expected_predictions, predictions.to_series())

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (len(X), 2)
    # One identical row of class frequencies per sample.
    expected_predictions_proba = pd.DataFrame(np.array([list(percent_freq)] * len(X)))
    assert_frame_equal(expected_predictions_proba, predicted_proba.to_dataframe())

    np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * X.shape[1]))
def test_baseline_multiclass_mode():
    """Check the "mode" strategy on a three-class target.

    Class 11 appears twice and classes 10 and 12 once each, so the test
    expects every prediction to be 11 with probability 1.0, and an all-zero
    feature importance vector.
    """
    features = pd.DataFrame({
        'one': [1, 2, 3, 4],
        'two': [2, 3, 4, 5],
        'three': [1, 2, 3, 4],
    })
    target = pd.Series([10, 12, 11, 11])

    clf = BaselineClassifier(strategy="mode")
    clf.fit(features, target)
    assert clf.classes_ == [10, 11, 12]

    n_rows = len(features)
    predictions = clf.predict(features)
    expected_predictions = pd.Series([11] * n_rows, dtype="Int64")
    assert_series_equal(expected_predictions, predictions.to_series())

    predicted_proba = clf.predict_proba(features)
    assert predicted_proba.shape == (len(features), 3)
    expected_predictions_proba = pd.DataFrame({
        10: [0.0, 0.0, 0.0, 0.0],
        11: [1.0, 1.0, 1.0, 1.0],
        12: [0.0, 0.0, 0.0, 0.0],
    })
    assert_frame_equal(expected_predictions_proba, predicted_proba.to_dataframe())

    n_features = features.shape[1]
    np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * n_features))
def test_baseline_binary_mode(data_type, X_y_binary):
    """Check the "mode" strategy on a binary target, optionally via woodwork.

    When ``data_type`` is ``'ww'`` the inputs are wrapped in woodwork
    containers before fitting. The test expects every prediction to be 10
    (the most frequent label), probability 1.0 for class 10, and an all-zero
    feature importance vector.
    """
    # NOTE(review): the X_y_binary fixture is requested but never used in
    # this body; the data below is built inline instead.
    X = pd.DataFrame({
        'one': [1, 2, 3, 4],
        'two': [2, 3, 4, 5],
        'three': [1, 2, 3, 4],
    })
    y = pd.Series([10, 11, 10, 10])
    if data_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)

    clf = BaselineClassifier(strategy="mode")
    assert isinstance(clf.fit(X, y), BaselineClassifier)
    assert clf.classes_ == [10, 11]

    predictions = clf.predict(X)
    np.testing.assert_allclose(predictions, np.array([10] * X.shape[0]))

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (X.shape[0], 2)
    expected_predicted_proba = pd.DataFrame({
        10: [1.0, 1.0, 1.0, 1.0],
        11: [0.0, 0.0, 0.0, 0.0],
    })
    pd.testing.assert_frame_equal(expected_predicted_proba, predicted_proba)

    n_features = X.shape[1]
    np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * n_features))
def test_baseline_no_mode():
    """Check the default strategy when no single label is most frequent.

    Labels 0 and 1 both occur twice in the target; the test expects class 0
    to be predicted with probability 1.0 and feature importance to be all
    zeros.
    """
    # NOTE(review): this builds a frame with one row and five columns, so
    # len(X) is 1 while y has five labels -- confirm the mismatch is intended.
    X = pd.DataFrame([[1, 2, 3, 0, 1]])
    y = pd.Series([1, 0, 2, 0, 1])

    clf = BaselineClassifier()
    clf.fit(X, y)
    assert clf.classes_ == [0, 1, 2]

    predictions = clf.predict(X)
    np.testing.assert_allclose(predictions, np.array([0] * len(X)))

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (len(X), 3)
    # All probability mass on the first of the three classes, once per row.
    expected_proba = np.array([[1.0, 0.0, 0.0]] * len(X))
    np.testing.assert_allclose(predicted_proba, expected_proba)

    n_features = X.shape[1]
    np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * n_features))
def test_baseline_multiclass_random(X_y_multi):
    """Check the "random" strategy on a three-class target.

    Predictions are compared against an unweighted draw from
    ``get_random_state(0)`` over the unique labels, and every predicted
    probability is expected to be 1/3. Feature importance is expected to be
    all zeros.
    """
    X, y = X_y_multi
    values = np.unique(y)

    clf = BaselineClassifier(strategy="random", random_seed=0)
    clf.fit(X, y)
    assert clf.classes_ == [0, 1, 2]

    n_rows = len(X)
    sampled = get_random_state(0).choice(values, n_rows)
    expected_predictions = pd.Series(sampled, dtype="Int64")
    predictions = clf.predict(X)
    assert_series_equal(expected_predictions, predictions.to_series())

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (len(X), 3)
    # One row of equal probabilities per sample, one column per unique label.
    uniform_row = [1. / 3] * len(values)
    expected_proba = pd.DataFrame(np.array([uniform_row] * len(X)))
    assert_frame_equal(expected_proba, predicted_proba.to_dataframe())

    n_features = X.shape[1]
    np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * n_features))
def test_baseline_no_mode():
    """Check the default strategy when no single label is most frequent.

    Labels 0 and 1 both occur twice in the target; the test expects class 0
    to be predicted with probability 1.0 and feature importance to be all
    zeros.
    """
    # NOTE(review): this builds a frame with one row and five columns, so
    # len(X) is 1 while y has five labels -- confirm the mismatch is intended.
    X = pd.DataFrame([[1, 2, 3, 0, 1]])
    y = pd.Series([1, 0, 2, 0, 1])

    clf = BaselineClassifier()
    clf.fit(X, y)
    assert clf.classes_ == [0, 1, 2]

    n_rows = len(X)
    expected_predictions = pd.Series([0] * n_rows, dtype="Int64")
    predictions = clf.predict(X)
    assert_series_equal(expected_predictions, predictions.to_series())

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (len(X), 3)
    # All probability mass on the first of the three classes, once per row.
    expected_proba = pd.DataFrame(np.array([[1.0, 0.0, 0.0]] * len(X)))
    assert_frame_equal(expected_proba, predicted_proba.to_dataframe())

    n_features = X.shape[1]
    np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * n_features))
def test_baseline_binary_random(X_y_binary):
    """Check the "random" strategy on a binary target.

    Predictions are compared against an unweighted draw from
    ``get_random_state(0)`` over the unique labels, and every predicted
    probability is expected to be 0.5. Feature importance is expected to be
    all zeros.
    """
    X, y = X_y_binary
    values = np.unique(y)

    clf = BaselineClassifier(strategy="random", random_state=0)
    clf.fit(X, y)
    assert clf.classes_ == [0, 1]

    predictions = clf.predict(X)
    expected_predictions = get_random_state(0).choice(values, len(X))
    np.testing.assert_allclose(predictions, expected_predictions)

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (len(X), 2)
    # One row of equal probabilities per sample, one column per unique label.
    even_row = [0.5] * len(values)
    np.testing.assert_allclose(predicted_proba, np.array([even_row] * len(X)))

    n_features = X.shape[1]
    np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * n_features))
def test_baseline_binary_random_weighted(X_y_binary):
    """Check the "random_weighted" strategy on a binary target.

    Predictions are compared against a draw from ``get_random_state(0)``
    weighted by the observed class frequencies, and every row of the
    predicted probabilities is expected to equal those frequencies.
    Feature importance is expected to be all zeros.
    """
    X, y = X_y_binary
    values, counts = np.unique(y, return_counts=True)
    percent_freq = counts.astype(float) / len(y)
    # The frequencies come from floating-point division, so their sum is
    # compared to 1.0 with a tolerance rather than with exact equality.
    assert np.isclose(percent_freq.sum(), 1.0)

    clf = BaselineClassifier(strategy="random_weighted", random_state=0)
    clf.fit(X, y)
    assert clf.classes_ == [0, 1]

    predictions = clf.predict(X)
    # `values` already holds np.unique(y), so it is reused as the sample pool.
    expected_predictions = get_random_state(0).choice(values, len(X), p=percent_freq)
    np.testing.assert_allclose(predictions, expected_predictions)

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (len(X), 2)
    # One identical row of class frequencies per sample.
    expected_proba = np.array([list(percent_freq)] * len(X))
    np.testing.assert_allclose(predicted_proba, expected_proba)

    np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * X.shape[1]))
def test_describe_component():
    """Verify ``describe(return_dict=True)`` for transformers and estimators.

    Each case pairs a constructed component with the display name and the
    parameter dictionary its description is expected to contain. The
    XGBoost, CatBoost and LightGBM cases are skipped when an ImportError is
    raised while exercising them.
    """

    def check_descriptions(cases):
        # Compare each component's description with the expected name/parameters pair.
        for component, name, parameters in cases:
            assert component.describe(return_dict=True) == {'name': name, 'parameters': parameters}

    # Transformers: all are constructed first, then checked in order.
    transformer_cases = [
        (OneHotEncoder(), 'One Hot Encoder',
         {'top_n': 10, 'features_to_encode': None, 'categories': None,
          'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}),
        (Imputer(), 'Imputer',
         {'categorical_impute_strategy': "most_frequent", 'categorical_fill_value': None,
          'numeric_impute_strategy': "mean", 'numeric_fill_value': None}),
        (SimpleImputer("mean"), 'Simple Imputer',
         {'impute_strategy': 'mean', 'fill_value': None}),
        (PerColumnImputer({"a": "mean", "b": ("constant", 100)}), 'Per Column Imputer',
         {'impute_strategies': {'a': 'mean', 'b': ('constant', 100)},
          'default_impute_strategy': 'most_frequent'}),
        (StandardScaler(), 'Standard Scaler', {}),
        (RFClassifierSelectFromModel(n_estimators=10, number_features=5,
                                     percent_features=0.3, threshold=-np.inf),
         'RF Classifier Select From Model',
         {'number_features': 5, 'n_estimators': 10, 'max_depth': None,
          'percent_features': 0.3, 'threshold': -np.inf, 'n_jobs': -1}),
        (RFRegressorSelectFromModel(n_estimators=10, number_features=5,
                                    percent_features=0.3, threshold=-np.inf),
         'RF Regressor Select From Model',
         {'number_features': 5, 'n_estimators': 10, 'max_depth': None,
          'percent_features': 0.3, 'threshold': -np.inf, 'n_jobs': -1}),
        (DropColumns(columns=['col_one', 'col_two']), 'Drop Columns Transformer',
         {'columns': ['col_one', 'col_two']}),
        (DropNullColumns(), 'Drop Null Columns Transformer',
         {'pct_null_threshold': 1.0}),
        (DateTimeFeaturizer(), 'DateTime Featurization Component',
         {'features_to_extract': ['year', 'month', 'day_of_week', 'hour'],
          'encode_as_categories': False}),
        (TextFeaturizer(), 'Text Featurization Component', {}),
        (LSA(), 'LSA Transformer', {}),
        (PCA(), 'PCA Transformer', {'n_components': None, 'variance': 0.95}),
        (LinearDiscriminantAnalysis(), 'Linear Discriminant Analysis Transformer',
         {'n_components': None}),
        (DFSTransformer(), 'DFS Transformer', {"index": "index"}),
        (Undersampler(), 'Undersampler',
         {"balanced_ratio": 4, "min_samples": 100, "min_percentage": 0.1}),
    ]
    check_descriptions(transformer_cases)

    # testing estimators
    estimator_cases = [
        (BaselineClassifier(), 'Baseline Classifier', {'strategy': 'mode'}),
        (BaselineRegressor(), 'Baseline Regressor', {'strategy': 'mean'}),
        (LogisticRegressionClassifier(), 'Logistic Regression Classifier',
         {'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'}),
        (ElasticNetClassifier(), 'Elastic Net Classifier',
         {'alpha': 0.5, 'l1_ratio': 0.5, 'n_jobs': -1, 'max_iter': 1000,
          "loss": 'log', 'penalty': 'elasticnet'}),
        (ElasticNetRegressor(), 'Elastic Net Regressor',
         {'alpha': 0.5, 'l1_ratio': 0.5, 'max_iter': 1000, 'normalize': False}),
        (ExtraTreesClassifier(n_estimators=10, max_features="auto"), 'Extra Trees Classifier',
         {'n_estimators': 10, 'max_features': 'auto', 'max_depth': 6,
          'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1}),
        (ExtraTreesRegressor(n_estimators=10, max_features="auto"), 'Extra Trees Regressor',
         {'n_estimators': 10, 'max_features': 'auto', 'max_depth': 6,
          'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1}),
        (RandomForestClassifier(n_estimators=10, max_depth=3), 'Random Forest Classifier',
         {'n_estimators': 10, 'max_depth': 3, 'n_jobs': -1}),
        (RandomForestRegressor(n_estimators=10, max_depth=3), 'Random Forest Regressor',
         {'n_estimators': 10, 'max_depth': 3, 'n_jobs': -1}),
        (LinearRegressor(), 'Linear Regressor',
         {'fit_intercept': True, 'normalize': False, 'n_jobs': -1}),
        (SVMClassifier(), 'SVM Classifier',
         {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale', 'probability': True}),
        (SVMRegressor(), 'SVM Regressor',
         {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale'}),
    ]
    check_descriptions(estimator_cases)

    # The remaining estimators are exercised on a best-effort basis: an
    # ImportError raised inside a block skips that block.
    try:
        xgboost_cases = [
            (XGBoostClassifier(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75),
             'XGBoost Classifier',
             {'eta': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 75}),
            (XGBoostRegressor(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75),
             'XGBoost Regressor',
             {'eta': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 75}),
        ]
        check_descriptions(xgboost_cases)
    except ImportError:
        pass

    try:
        catboost_cases = [
            (CatBoostClassifier(), 'CatBoost Classifier',
             {'allow_writing_files': False, 'n_estimators': 10, 'eta': 0.03,
              'max_depth': 6, 'bootstrap_type': None, 'silent': True}),
            (CatBoostRegressor(), 'CatBoost Regressor',
             {'allow_writing_files': False, 'n_estimators': 10, 'eta': 0.03,
              'max_depth': 6, 'bootstrap_type': None, 'silent': False}),
        ]
        check_descriptions(catboost_cases)
    except ImportError:
        pass

    try:
        lightgbm_cases = [
            (LightGBMClassifier(), 'LightGBM Classifier',
             {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 100,
              'max_depth': 0, 'num_leaves': 31, 'min_child_samples': 20, 'n_jobs': -1,
              'bagging_fraction': 0.9, 'bagging_freq': 0}),
            (LightGBMRegressor(), 'LightGBM Regressor',
             {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 20,
              'max_depth': 0, 'num_leaves': 31, 'min_child_samples': 20, 'n_jobs': -1,
              'bagging_fraction': 0.9, 'bagging_freq': 0}),
        ]
        check_descriptions(lightgbm_cases)
    except ImportError:
        pass
def test_baseline_y_is_None(X_y_binary):
    """Fitting a BaselineClassifier with ``y=None`` must raise ValueError."""
    # Only the feature matrix from the fixture is needed.
    X = X_y_binary[0]
    with pytest.raises(ValueError):
        clf = BaselineClassifier()
        clf.fit(X, y=None)
def test_baseline_invalid_strategy():
    """Constructing a BaselineClassifier with an unknown strategy must raise ValueError."""
    bad_strategy = "unfortunately invalid strategy"
    with pytest.raises(ValueError):
        BaselineClassifier(strategy=bad_strategy)
def test_baseline_init():
    """A default-constructed BaselineClassifier has the expected initial state.

    The test expects the "mode" strategy, the BASELINE model family, and no
    classes before fitting.
    """
    clf = BaselineClassifier()
    strategy = clf.parameters["strategy"]
    assert strategy == "mode"
    assert clf.model_family == ModelFamily.BASELINE
    assert clf.classes_ is None