def test_foreshadow_titanic(tmpdir):
    """Fit Foreshadow with a TPOT-backed AutoEstimator on the Titanic data.

    The "Name" column intent is overridden to TEXT before fitting. The
    held-out score is only printed; no assertion is made on its value.

    Args:
        tmpdir: Accepted in the signature but unused in the test body.
    """
    import pandas as pd
    from foreshadow.estimators import AutoEstimator

    titanic = pd.read_csv(get_file_path("data", "titanic-train.csv"))
    features = titanic.loc[:, "Pclass":"Embarked"].drop(
        columns=["SibSp", "Parch", "Cabin"]
    )
    labels = titanic.loc[:, "Survived"]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    tpot_estimator = AutoEstimator(
        problem_type=ProblemType.CLASSIFICATION,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1, "random_state": 42},
    )
    shadow = Foreshadow(
        estimator=tpot_estimator, problem_type=ProblemType.CLASSIFICATION
    )
    shadow.override_intent(column_name="Name", intent=IntentType.TEXT)
    shadow.fit(X_train, y_train)

    print(shadow.score(X_test, y_test))
def test_foreshadow_sampling_performance_comparison():
    """Check that preparing data with sampling is faster than without it.

    Two Foreshadow objects are built the same way; the second one has
    sampling explicitly disabled via ``configure_sampling``. The first is
    left at its default configuration (presumably sampling enabled -- the
    default itself is not visible in this test). The time spent in
    ``X_preparer.fit_transform`` is measured for each and compared.
    """
    import time

    X_train, X_test, y_train, y_test = train_test_split_local_file_common(
        file_path=get_file_path("data", "adult_small.csv"),
        X_start="age",
        X_end="workclass",
        target="class",
    )

    # perf_counter() is monotonic and high resolution, so the measured
    # durations cannot be skewed by wall-clock adjustments during the run.
    sampled = construct_foreshadow_object_common(
        problem_type=ProblemType.CLASSIFICATION
    )
    start = time.perf_counter()
    sampled.X_preparer.fit_transform(X_train, y_train)
    time_with_sampling = time.perf_counter() - start

    unsampled = construct_foreshadow_object_common(
        problem_type=ProblemType.CLASSIFICATION
    )
    unsampled.configure_sampling(enable_sampling=False)
    start = time.perf_counter()
    unsampled.X_preparer.fit_transform(X_train, y_train)
    time_without_sampling = time.perf_counter() - start

    # using sampling should be faster than without sampling on this dataset
    # as it has more than 40,000 rows.
    assert time_with_sampling < time_without_sampling
def test_foreshadow_serialization_adults_small_classification_override():
    """Refit Foreshadow after overriding the intents of two columns.

    A LogisticRegression-backed Foreshadow is fit once, then "age" and
    "workclass" are overridden to CATEGORICAL and the model is refit. The
    test asserts the overrides are reported back by ``get_intent``; both
    scores are only printed.
    """
    from foreshadow.foreshadow import Foreshadow
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from foreshadow.intents import IntentType

    # Seed the global NumPy RNG before the (unseeded) split below.
    np.random.seed(1337)

    adult = pd.read_csv(get_file_path("data", "adult_small.csv"))
    X_train, X_test, y_train, y_test = train_test_split(
        adult.loc[:, "age":"workclass"], adult.loc[:, "class"], test_size=0.2
    )

    shadow = Foreshadow(
        estimator=LogisticRegression(),
        problem_type=ProblemType.CLASSIFICATION,
    )
    shadow.fit(X_train, y_train)
    baseline_score = shadow.score(X_test, y_test)

    overridden_columns = ("age", "workclass")
    for column in overridden_columns:
        shadow.override_intent(column, IntentType.CATEGORICAL)
    shadow.fit(X_train, y_train)
    for column in overridden_columns:
        assert shadow.get_intent(column) == IntentType.CATEGORICAL

    override_score = shadow.score(X_test, y_test)
    print(baseline_score, override_score)
def test_smarttransformer_function(smart_child):
    """Test overall SmartTransformer functionality.

    The smart transformer's output on the "crim" column is compared with a
    plain StandardScaler, both via ``fit_transform`` and via separate
    ``fit`` / ``transform`` calls.

    Args:
        smart_child: A subclass of SmartTransformer.
    """
    import numpy as np
    import pandas as pd
    from foreshadow.concrete import StandardScaler

    frame = pd.read_csv(get_file_path("data", "boston_housing.csv"))
    cols = ["crim"]

    smart = smart_child(cache_manager=CacheManager())
    smart_out = smart.fit_transform(frame[cols])

    scaler = StandardScaler()
    scaler_out = scaler.fit_transform(frame[cols])
    assert smart_out.equals(scaler_out)

    smart.fit(frame[cols])
    smart_out = smart.transform(frame[cols])

    scaler.fit(frame[cols])
    scaler_out = scaler.transform(frame[cols])

    # TODO, remove when SmartTransformer is no longer wrapped
    # Column names will be different, thus np.allclose() is used
    assert np.allclose(smart_out, scaler_out)
def test_foreshadow_abort_on_empty_data_frame_after_cleaning(
    filename, problem_type, X_start, X_end, target
):
    """Expect ``Foreshadow.fit`` to raise when every column gets dropped.

    Args:
        filename: Name of the CSV file under the "data" test directory.
        problem_type: Problem type passed to the estimator and Foreshadow.
        X_start: Label of the first feature column (inclusive slice start).
        X_end: Label of the last feature column (inclusive slice end).
        target: Label of the target column.
    """
    from foreshadow.foreshadow import Foreshadow
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    from foreshadow.estimators import AutoEstimator

    # Seed the global NumPy RNG before the (unseeded) split below.
    np.random.seed(1337)

    frame = pd.read_csv(get_file_path("data", filename))
    X_train, X_test, y_train, y_test = train_test_split(
        frame.loc[:, X_start:X_end], frame.loc[:, target], test_size=0.2
    )

    shadow = Foreshadow(
        estimator=AutoEstimator(
            problem_type=problem_type,
            auto="tpot",
            estimator_kwargs={"max_time_mins": 1},
        ),
        problem_type=problem_type,
    )

    expected_message = (
        "All columns are dropped since they all have over 90% of "
        "missing values. Aborting foreshadow."
    )
    with pytest.raises(ValueError) as excinfo:
        shadow.fit(X_train, y_train)
    assert expected_message in str(excinfo.value)
def test_transformer_wrapper_function():
    """Compare the wrapped StandardScaler against scikit-learn's original.

    Both scalers are run on the "crim" column, first with separate
    ``fit`` / ``transform`` calls and then with ``fit_transform``; the
    resulting values must be exactly equal each time.
    """
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler
    from foreshadow.concrete import StandardScaler as CustomScaler

    frame = pd.read_csv(get_file_path("data", "boston_housing.csv"))
    cols = ["crim"]

    wrapped = CustomScaler()
    reference = StandardScaler()

    # Separate fit / transform path.
    wrapped.fit(frame[cols])
    reference.fit(frame[cols])
    wrapped_out = wrapped.transform(frame[cols])
    reference_out = reference.transform(frame[cols])
    assert np.array_equal(wrapped_out.values, reference_out)

    # Combined fit_transform path.
    wrapped_out = wrapped.fit_transform(frame[cols])
    reference_out = reference.fit_transform(frame[cols])
    assert np.array_equal(wrapped_out.values, reference_out)
def test_foreshadow_param_optimize():
    """Check ``param_mapping`` output keys against a pickled truth file.

    A Foreshadow object is built from the JSON optimizer config, its
    pipeline is replaced with an explicit preparer/estimator Pipeline, and
    the keys of the first ``param_mapping`` result are compared with the
    keys of the first entry of the stored search space.
    """
    # TODO: Make this test faster
    import pickle
    import json

    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.preparer import DataPreparer
    from foreshadow.optimizers.param_mapping import param_mapping

    boston_path = get_file_path("data", "boston_housing.csv")
    test_json_path = get_file_path("configs", "optimizer_test.json")
    truth_path = get_file_path("configs", "search_space_optimize.pkl")

    data = pd.read_csv(boston_path)
    # Use a context manager so the config file handle is always closed.
    with open(test_json_path, "r") as fopen:
        js = json.load(fopen)

    fs = Foreshadow(
        DataPreparer(from_json=js),
        False,
        LinearRegression(),
        ProblemType.REGRESSION,
        GridSearchCV,
    )
    fs.pipeline = Pipeline(
        [("preparer", fs.X_preparer), ("estimator", fs.estimator)]
    )

    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]
    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.25)

    results = param_mapping(fs.pipeline, x_train, y_train)

    # (If you change default configs) or file structure, you will need to
    # verify the outputs are correct manually and regenerate the pickle
    # truth file.
    # NOTE: pickle is only safe here because the truth file is a trusted
    # fixture that ships with the test suite.
    with open(truth_path, "rb") as fopen:
        truth = pickle.load(fopen)

    assert results[0].keys() == truth[0].keys()
def test_console_generate_level3(filename, y_var, problem_type, estimator):
    """Check that ``--level 3`` produces a model backed by AutoEstimator.

    Args:
        filename: Name of the CSV file under the "data" test directory.
        y_var: Target variable name passed on the command line.
        problem_type: Problem type string passed on the command line.
        estimator: Accepted in the signature but unused in the test body.
    """
    cli_args = [
        "--level",
        "3",
        get_file_path("data", filename),
        y_var,
        problem_type,
    ]
    generated = generate_model(cli_args)
    assert isinstance(generated[0].estimator, AutoEstimator)
def test_foreshadow_integration_data_cleaner_can_drop(
    filename, problem_type, X_start, X_end, target, tmpdir
):
    """Fit, pickle and reload a pipeline, then score with an emptied column.

    The fitted Foreshadow object and the pipeline reloaded from the pickle
    file must produce scores that agree to two decimal places, even after
    the ``X_start`` column of the test set has been replaced with NaN.

    Args:
        filename: Name of the CSV file under the "data" test directory.
        problem_type: Problem type passed to the estimator and Foreshadow.
        X_start: Label of the first feature column (inclusive slice start).
        X_end: Label of the last feature column (inclusive slice end).
        target: Label of the target column.
        tmpdir: Directory object whose ``join`` yields the pickle location.
    """
    from foreshadow.foreshadow import Foreshadow
    import pickle
    import unittest
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    from foreshadow.estimators import AutoEstimator

    # Seed the global NumPy RNG before the (unseeded) split below.
    np.random.seed(1337)

    frame = pd.read_csv(get_file_path("data", filename))
    X_train, X_test, y_train, y_test = train_test_split(
        frame.loc[:, X_start:X_end], frame.loc[:, target], test_size=0.2
    )

    shadow = Foreshadow(
        estimator=AutoEstimator(
            problem_type=problem_type,
            auto="tpot",
            estimator_kwargs={"max_time_mins": 1},
        ),
        problem_type=problem_type,
    )

    pipeline_file = tmpdir.join("fitted_pipeline.p")
    shadow.fit(X_train, y_train)
    shadow.pickle_fitted_pipeline(pipeline_file)
    with open(pipeline_file, "rb") as fopen:
        restored_pipeline = pickle.load(fopen)

    # If there are new empty columns in the test set, the program should
    # not fail.
    X_test[X_start] = np.nan
    live_score = shadow.score(X_test, y_test)
    restored_score = restored_pipeline.score(X_test, y_test)

    # given the randomness of the tpot algorithm and the short run
    # time we configured, there is no guarantee the performance can
    # converge. The test here aims to evaluate if both cases have
    # produced a reasonable score and the difference is small.
    # (An absolute threshold of 0.76 on both scores used to be asserted
    # here; it is intentionally disabled.)
    checker = unittest.TestCase("__init__")
    checker.assertAlmostEqual(live_score, restored_score, places=2)
def test_smart_impute_simple_mean():
    """Compare SimpleFillImputer output on "chol" with the stored truth CSV."""
    import numpy as np
    import pandas as pd
    from foreshadow.smart import SimpleFillImputer

    source_csv = get_file_path("data", "heart-h.csv")
    expected_csv = get_file_path("data", "heart-h_impute_mean.csv")

    imputer = SimpleFillImputer()
    chol = pd.read_csv(source_csv)[["chol"]]

    imputer.fit(chol)
    imputed = imputer.transform(chol)

    expected = pd.read_csv(expected_csv, index_col=0)
    assert np.array_equal(imputed, expected)
def test_console_generate_ignore_time():
    """Expect a UserWarning when ``--time`` is given together with level 2."""
    from foreshadow.console import generate_model

    cli_args = [
        get_file_path("data", "boston_housing.csv"),
        "medv",
        "--level",
        "2",
        "--time",
        "20",
    ]
    with pytest.warns(UserWarning, match="Time parameter not applicable"):
        generate_model(cli_args)
def test_smart_impute_multiple():
    """Compare MultiImputer output on four columns with the stored truth CSV."""
    import numpy as np
    import pandas as pd
    from foreshadow.smart import MultiImputer

    source_csv = get_file_path("data", "heart-h.csv")
    expected_csv = get_file_path("data", "heart-h_impute_multi.csv")

    imputer = MultiImputer()
    subset = pd.read_csv(source_csv)[["thalach", "chol", "trestbps", "age"]]

    imputer.fit(subset)
    imputed = imputer.transform(subset)

    expected = pd.read_csv(expected_csv, index_col=0)
    assert np.allclose(expected.values, imputed.values)
def test_console_generate_invalid_file():
    """Expect a ValueError mentioning a load failure for a missing CSV."""
    from foreshadow.console import generate_model

    cli_args = [
        "--level",
        "5",
        get_file_path("data", "missing_file.csv"),
        "badtarget",
        "regression",
    ]
    with pytest.raises(ValueError) as excinfo:
        generate_model(cli_args)

    assert "Failed to load file." in str(excinfo.value)
def test_transformer_fancy_impute_set_params():
    """Check FancyImputer("SimpleFill", mean) against the stored truth CSV."""
    import numpy as np
    import pandas as pd
    from foreshadow.concrete import FancyImputer

    imputer = FancyImputer(
        method="SimpleFill", impute_kwargs={"fill_method": "mean"}
    )

    source_csv = get_file_path("data", "heart-h.csv")
    expected_csv = get_file_path("data", "heart-h_impute_mean.csv")

    chol = pd.read_csv(source_csv)[["chol"]]
    imputer.fit(chol)
    imputed = imputer.transform(chol)

    expected = pd.read_csv(expected_csv, index_col=0)
    assert np.array_equal(imputed, expected)
def test_console_generate_invalid_target():
    """Expect a ValueError for a target column that is not in the data."""
    from foreshadow.console import generate_model

    cli_args = [
        "--level",
        "5",
        get_file_path("data", "boston_housing.csv"),
        "badtarget",
        "regression",
    ]
    with pytest.raises(ValueError) as excinfo:
        generate_model(cli_args)

    assert "Invalid target variable" in str(excinfo.value)
def test_transformer_keep_cols():
    """Check that ``keep_columns=True`` yields two output columns."""
    import pandas as pd
    from foreshadow.concrete import StandardScaler as CustomScaler

    frame = pd.read_csv(get_file_path("data", "boston_housing.csv"))

    scaler = CustomScaler(keep_columns=True)
    scaled = scaler.fit_transform(frame[["crim"]])

    _, n_columns = scaled.shape[0], scaled.shape[1]
    assert n_columns == 2
def test_console_parse_args_multiprocess():
    """Check that ``--multiprocess`` toggles the parsed ``multiprocess`` flag."""
    from foreshadow.console import process_argument

    data_path = get_file_path("data", "boston_housing.csv")
    positional = [data_path, "medv", "regression"]

    # Flag absent: multiprocess must be exactly False.
    parsed = process_argument(["--level", "1"] + positional)
    assert parsed.multiprocess is False

    # Flag present: multiprocess must be exactly True.
    parsed = process_argument(["--level", "1", "--multiprocess"] + positional)
    assert parsed.multiprocess is True
def test_transformer_naming_default():
    """Check that the output column keeps the input name "crim"."""
    from foreshadow.concrete import StandardScaler
    import pandas as pd

    frame = pd.read_csv(get_file_path("data", "boston_housing.csv"))

    scaled = StandardScaler(keep_columns=False).fit_transform(frame[["crim"]])
    first_column = scaled.iloc[:, 0]

    assert first_column.name == "crim"
def test_foreshadow_param_optimize_invalid_array_idx():
    """Expect ``param_mapping`` to reject an invalid optimizer config.

    A Foreshadow object is built from ``invalid_optimizer_config.json`` and
    ``param_mapping`` must raise a ValueError whose message starts with
    "Attempted to index list".
    """
    import json

    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.preparer import DataPreparer
    from foreshadow.cachemanager import CacheManager
    # Imported locally (as test_foreshadow_param_optimize does) so that the
    # name is actually bound instead of relying on a suppressed F821.
    from foreshadow.optimizers.param_mapping import param_mapping

    boston_path = get_file_path("data", "boston_housing.csv")
    test_path = get_file_path("configs", "invalid_optimizer_config.json")

    data = pd.read_csv(boston_path)
    # Use a context manager so the config file handle is always closed.
    with open(test_path, "r") as fopen:
        cfg = json.load(fopen)

    fs = Foreshadow(
        DataPreparer(CacheManager(), from_json=cfg),
        False,
        LinearRegression(),
        ProblemType.REGRESSION,
        GridSearchCV,
    )
    fs.pipeline = Pipeline(
        [("preprocessor", fs.X_preparer), ("estimator", fs.estimator)]
    )

    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]
    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.25)

    with pytest.raises(ValueError) as e:
        param_mapping(fs.pipeline, x_train, y_train)

    assert str(e.value).startswith("Attempted to index list")
def test_console_generate_and_execute_model(filename, family, y_var,
                                            problem_type, estimator):
    """Generate a model for an estimator family and then execute it.

    Args:
        filename: Name of the CSV file under the "data" test directory.
        family: Value passed to the ``--family`` command-line option.
        y_var: Target variable name passed on the command line.
        problem_type: Problem type string passed on the command line.
        estimator: Class the generated model's estimator must be an
            instance of.
    """
    from foreshadow.console import generate_model, execute_model

    cli_args = [
        "--family",
        family,
        get_file_path("data", filename),
        y_var,
        problem_type,
    ]
    generated = generate_model(cli_args)

    assert isinstance(generated[0].estimator, estimator)

    execute_model(*generated)
def test_foreshadow_param_optimize_no_combinations():
    """Check ``param_mapping`` keys when the JSON config is empty.

    The DataPreparer is created with ``from_json={}`` and the keys of the
    first ``param_mapping`` result are compared with the keys of the first
    entry in ``search_space_no_combo.pkl``.
    """
    import pickle

    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.preparer import DataPreparer
    from foreshadow.cachemanager import CacheManager
    # Imported locally (as test_foreshadow_param_optimize does) so that the
    # name is actually bound instead of relying on a suppressed F821.
    from foreshadow.optimizers.param_mapping import param_mapping

    boston_path = get_file_path("data", "boston_housing.csv")
    test_path = get_file_path("configs", "search_space_no_combo.pkl")

    data = pd.read_csv(boston_path)

    fs = Foreshadow(
        DataPreparer(cache_manager=CacheManager(), from_json={}),
        False,
        LinearRegression(),
        ProblemType.REGRESSION,
        GridSearchCV,
    )
    fs.pipeline = Pipeline(
        [("preprocessor", fs.X_preparer), ("estimator", fs.estimator)]
    )

    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]
    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.25)

    results = param_mapping(fs.pipeline, x_train, y_train)

    # NOTE: pickle is only safe here because the truth file is a trusted
    # fixture that ships with the test suite. The context manager makes
    # sure the file handle is closed.
    with open(test_path, "rb") as fopen:
        truth = pickle.load(fopen)

    assert results[0].keys() == truth[0].keys()
def test_smarttransformer_attributeerror(smart_child, mocker):
    """Expect TransformerNotFound when ``pick_transformer`` returns junk.

    Args:
        smart_child: Callable that builds the smart transformer under test.
        mocker: Object providing ``Mock`` (used to stub ``pick_transformer``).
    """
    import pandas as pd
    from foreshadow.exceptions import TransformerNotFound

    frame = pd.read_csv(get_file_path("data", "boston_housing.csv"))

    smart = smart_child()
    # Replace the picker with a stub that hands back a non-transformer.
    smart.pick_transformer = mocker.Mock(return_value="INVALID")

    with pytest.raises(TransformerNotFound):
        smart.fit(frame[["crim"]])
def test_foreshadow_adults_classification():
    """Fit Foreshadow on the adult dataset and print the held-out score.

    No assertion is made on the score; the test only requires that fitting
    and scoring complete without raising.
    """
    split = train_test_split_local_file_common(
        file_path=get_file_path("data", "adult.csv"),
        X_start="age",
        X_end="native-country",
        target="class",
    )
    X_train, X_test, y_train, y_test = split

    shadow = construct_foreshadow_object_common(
        problem_type=ProblemType.CLASSIFICATION
    )
    shadow.fit(X_train, y_train)

    print(shadow.score(X_test, y_test))
def test_foreshadow_param_optimize_fit(mocker):
    """Check where the estimator ends up in the pipeline with an optimizer.

    With default preparers, the last pipeline step is expected to wrap the
    estimator (reachable via ``.estimator``); with both preparers disabled
    the last step is expected to be the estimator itself.

    Args:
        mocker: Object providing ``patch`` (used to replace DataPreparer).
    """
    import pandas as pd
    from foreshadow.base import BaseEstimator, TransformerMixin
    from sklearn.model_selection._search import BaseSearchCV
    from foreshadow.foreshadow import Foreshadow

    frame = pd.read_csv(get_file_path("data", "boston_housing.csv"))

    # Estimator stub: fitting is a no-op.
    class DummyRegressor(BaseEstimator, TransformerMixin):
        def fit(self, X, y):
            return self

    # Search stub: exposes the wrapped estimator as the "best" one.
    class DummySearch(BaseSearchCV):
        def __init__(self, estimator, params):
            self.best_estimator_ = estimator

        def fit(self, X, y=None, **fit_params):
            return self

    # Preparer stub: fitting is a no-op.
    class DummyDataPreparer(BaseEstimator, TransformerMixin):
        def fit(self, X, y):
            return self

    mocker.patch(
        "foreshadow.preparer.DataPreparer", return_value=DummyDataPreparer
    )

    features = frame.drop(["medv"], axis=1, inplace=False)
    target = frame[["medv"]]

    with_preparers = Foreshadow(
        problem_type=ProblemType.REGRESSION,
        estimator=DummyRegressor(),
        optimizer=DummySearch,
    )
    with_preparers.fit(features, target)
    final_step = with_preparers.pipeline.steps[-1][1]
    assert isinstance(final_step.estimator, DummyRegressor)

    without_preparers = Foreshadow(
        problem_type=ProblemType.REGRESSION,
        X_preparer=False,
        y_preparer=False,
        estimator=DummyRegressor(),
        optimizer=DummySearch,
    )
    without_preparers.fit(features, target)
    final_step = without_preparers.pipeline.steps[-1][1]
    assert isinstance(final_step, DummyRegressor)
def test_smart_impute_simple_none():
    """Check that SimpleFillImputer(threshold=0.05) leaves "chol" unchanged.

    The output is compared with the input using ``equal_nan=True`` so NaNs
    in the same positions count as equal.
    """
    import numpy as np
    import pandas as pd
    from foreshadow.smart import SimpleFillImputer

    imputer = SimpleFillImputer(threshold=0.05)
    chol = pd.read_csv(get_file_path("data", "heart-h.csv"))[["chol"]]

    imputer.fit(chol)
    result = imputer.transform(chol)

    assert np.allclose(chol, result, equal_nan=True)
def test_console_generate_ignore_method():
    """Expect a UserWarning when ``--method`` is given together with level 3."""
    from foreshadow.console import generate_model

    cli_args = ["--level", "3"]
    cli_args += [get_file_path("data", "boston_housing.csv")]
    cli_args += ["medv", "regression"]
    cli_args += ["--method", "method"]

    with pytest.warns(UserWarning, match="Method will be ignored"):
        generate_model(cli_args)
def test_data_preparer_fit(cleaner_kwargs):
    """Test fitting of DataPreparer after creation with kwargs.

    The test passes as long as ``fit`` completes without raising.

    Args:
        cleaner_kwargs: kwargs to CleanerMapper step
    """
    from foreshadow.preparer import DataPreparer
    from foreshadow.cachemanager import CacheManager
    import pandas as pd

    frame = pd.read_csv(get_file_path("data", "boston_housing.csv"))

    preparer = DataPreparer(CacheManager(), cleaner_kwargs=cleaner_kwargs)
    preparer.fit(frame)
def test_smart_impute_multiple_none():
    """Check that MultiImputer resolves to a Pipeline whose first step is "null".

    Uses the "crim", "nox" and "indus" columns of the Boston housing data.
    """
    import pandas as pd
    from sklearn.pipeline import Pipeline
    from foreshadow.smart import MultiImputer
    from foreshadow.utils import PipelineStep

    imputer = MultiImputer()
    frame = pd.read_csv(get_file_path("data", "boston_housing.csv"))
    subset = frame[["crim", "nox", "indus"]]

    imputer.fit(subset)
    imputer.transform(subset)

    chosen = imputer.transformer
    assert isinstance(chosen, Pipeline)
    first_step = chosen.steps[0]
    assert first_step[PipelineStep["NAME"]] == "null"
def test_smarttransformer_function_override(smart_child):
    """Test SmartTransformer override through parameter specification.

    The smart transformer is told to use "SimpleImputer" and its output on
    the "crim" column is compared with a directly constructed SimpleImputer.

    Args:
        smart_child: A subclass of SmartTransformer.
    """
    import numpy as np
    import pandas as pd
    from foreshadow.concrete import SimpleImputer

    frame = pd.read_csv(get_file_path("data", "boston_housing.csv"))
    cols = ["crim"]

    smart = smart_child(
        transformer="SimpleImputer",
        name="impute",
        cache_manager=CacheManager(),
    )
    smart_out = smart.fit_transform(frame[cols])
    assert isinstance(smart.transformer, SimpleImputer)
    # The transformer's name ("impute") is no longer checked here; that
    # assertion is not relevant anymore.

    imputer = SimpleImputer()
    imputer_out = imputer.fit_transform(frame[cols])
    assert smart_out.equals(imputer_out)

    smart.fit(frame[cols])
    smart_out = smart.transform(frame[cols])

    imputer.fit(frame[cols])
    imputer_out = imputer.transform(frame[cols])
    assert imputer_out.columns[0] == "crim"

    # TODO, remove when SmartTransformer is no longer wrapped
    # Column names will be different, thus np.allclose() is used
    assert np.allclose(smart_out, imputer_out)
def test_get_config_only_sys():
    """Compare the resolved configuration with the pickled default config."""
    import pickle
    from foreshadow.config import config
    from foreshadow.utils.testing import get_file_path

    actual = config.get_config()
    truth_file = get_file_path("configs", "configs_default.pkl")

    # (If you change default configs) or file structure, you will need to
    # verify the outputs are correct manually and regenerate the pickle
    # truth file.
    # with open(test_data_path, "wb") as fopen:
    #     pickle.dump(config[cfg_hash], fopen)

    with open(truth_file, "rb") as fopen:
        expected = pickle.load(fopen)

    assert actual == expected