def test_cache_manager_len(store, expected):
    """Check that ``len`` of a CacheManager matches the expected size.

    Args:
        store: the internal dictionary to install on the manager.
        expected: the value ``len(manager)`` should report for ``store``.

    """
    from foreshadow.cachemanager import CacheManager

    manager = CacheManager()
    # Replace the internal store wholesale so the length depends only on it.
    manager.store = store
    size = len(manager)
    assert size == expected
def test_json_flattening_with_non_json_columns():
    """Flatten a JSON column that sits next to a plain numeric column."""
    json_rows = [
        '{"date": "2019-04-11"}',
        '{"financial": "$1.0"}',
        '{"financial": "$1000.00"}',
        '{"random": "asdf"}',
    ]
    frame = pd.DataFrame(
        {"json": json_rows, "num": [1, 2, 3, 4]},
        columns=["json", "num"],
    )

    mapper = FlattenMapper(cache_manager=CacheManager())
    mapper.fit(frame)
    flattened = mapper.transform(frame)

    expected = pd.DataFrame(
        [
            ["2019-04-11", np.nan, np.nan, 1],
            [np.nan, "$1.0", np.nan, 2],
            [np.nan, "$1000.00", np.nan, 3],
            [np.nan, np.nan, "asdf", 4],
        ],
        columns=["json_date", "json_financial", "json_random", "num"],
    )

    # Only the non-null cells are compared; NaN never equals itself.
    actual_values = flattened.values[flattened.notna()]
    expected_values = expected.values[expected.notna()]
    assert np.all(np.equal(actual_values, expected_values))
def test_smarttransformer_function(smart_child):
    """Compare a SmartTransformer subclass against a plain StandardScaler.

    Both are run on the ``crim`` column of the Boston housing data, first
    through ``fit_transform`` and then through separate ``fit`` and
    ``transform`` calls.

    Args:
        smart_child: A subclass of SmartTransformer.

    """
    import numpy as np
    import pandas as pd
    from foreshadow.concrete import StandardScaler

    housing = pd.read_csv(get_file_path("data", "boston_housing.csv"))

    def crim():
        # Select a fresh single-column frame for every call.
        return housing[["crim"]]

    smart = smart_child(cache_manager=CacheManager())
    smart_out = smart.fit_transform(crim())

    reference = StandardScaler()
    reference_out = reference.fit_transform(crim())

    assert smart_out.equals(reference_out)

    smart.fit(crim())
    smart_out = smart.transform(crim())

    reference.fit(crim())
    reference_out = reference.transform(crim())

    # TODO, remove when SmartTransformer is no longer wrapped
    # Column names will be different, thus np.allclose() is used
    assert np.allclose(smart_out, reference_out)
def test_feature_reducer_get_mapping_by_intent():
    """Check the mapping FeatureReducerMapper builds from cached intents.

    Every key of the produced mapping must exist in a hand-built
    PreparerMapping, and the string forms of the entries must match.
    """
    import pandas as pd
    from foreshadow.cachemanager import CacheManager
    from foreshadow.steps import FeatureReducerMapper
    from foreshadow.steps.preparerstep import PreparerMapping
    from foreshadow.smart import FeatureReducer

    frame = pd.DataFrame(
        {
            "age": [10, 20, 33, 44],
            "weights": [20, 30, 50, 60],
            "occupation": ["engineer", "artist", "doctor", "inspector"],
        },
        columns=["age", "weights", "occupation"],
    )

    cache = CacheManager()
    intents = (
        ("age", "Numeric"),
        ("weights", "Numeric"),
        ("occupation", "Categorical"),
    )
    for column, intent in intents:
        cache["intent", column] = intent

    mapper = FeatureReducerMapper(cache_manager=cache)
    produced = mapper.get_mapping(frame)

    reference = PreparerMapping()
    reference.add(
        ["age", "weights"], [FeatureReducer(cache_manager=cache)], "Numeric"
    )
    reference.add(
        ["occupation"], [FeatureReducer(cache_manager=cache)], "Categorical"
    )

    for key in produced.store:
        assert key in reference.store
        produced_text = str(produced.store[key])
        reference_text = str(reference.store[key])
        assert produced_text == reference_text
def test_feature_reducer_fit_no_ops():
    """Fit and transform with FeatureReducerMapper; values must be unchanged."""
    import numpy as np
    import pandas as pd
    from foreshadow.cachemanager import CacheManager
    from foreshadow.steps import FeatureReducerMapper

    frame = pd.DataFrame(
        {
            "age": [10, 20, 33, 44],
            "weights": [20, 30, 50, 60],
            "occupation": ["engineer", "artist", "doctor", "inspector"],
        },
        columns=["age", "weights", "occupation"],
    )

    cache = CacheManager()
    intents = (
        ("age", "Numeric"),
        ("weights", "Numeric"),
        ("occupation", "Categorical"),
    )
    for column, intent in intents:
        cache["intent", column] = intent

    reducer = FeatureReducerMapper(cache_manager=cache)
    reducer.fit(frame)
    reduced = reducer.transform(frame)

    # Compare only the non-null cells of input and output.
    input_values = frame.values[frame.notna()]
    output_values = reduced.values[reduced.notna()]
    assert np.all(np.equal(input_values, output_values))
def test_cache_manager_setitem(capsys, key, item_to_set, expected, warning):
    """Set an item on a CacheManager and read it back.

    Args:
        capsys: captures stdout and stderr. Pytest fixture.
        key (list): one- or two-element key to set on the CacheManager.
        item_to_set: the value stored under the key.
        expected: the value expected when the key is read back.
        warning: True if "WARNING" must appear in captured stdout.

    Raises:
        NotImplementedError: if ``key`` has neither one nor two elements.

    """
    from foreshadow.cachemanager import CacheManager

    cs = CacheManager()
    depth = len(key)
    if depth not in (1, 2):
        raise NotImplementedError("test case not implemented")

    if depth == 1:
        cs[key[0]] = item_to_set
        stored = cs[key[0]]
    else:
        cs[key[0], key[1]] = item_to_set
        # Printed before the read-back, so it lands in the captured stdout.
        print(cs.store)
        stored = cs[key[0], key[1]]
    assert stored == expected

    if warning:
        out, _ = capsys.readouterr()
        assert "WARNING" in out
def test_json_flattening():
    """Check that a column of JSON strings is flattened into key columns."""
    json_rows = [
        '{"date": "2019-04-11"}',
        '{"financial": "$1.0"}',
        '{"financial": "$1000.00"}',
        '{"random": "asdf"}',
    ]
    frame = pd.DataFrame({"json": json_rows}, columns=["json"])

    mapper = FlattenMapper(cache_manager=CacheManager())
    mapper.fit(frame)
    flattened = mapper.transform(frame)

    expected = pd.DataFrame(
        [
            ["2019-04-11", np.nan, np.nan],
            [np.nan, "$1.0", np.nan],
            [np.nan, "$1000.00", np.nan],
            [np.nan, np.nan, "asdf"],
        ],
        columns=["json_date", "json_financial", "json_random"],
    )

    # Only the non-null cells are compared; NaN never equals itself.
    actual_values = flattened.values[flattened.notna()]
    expected_values = expected.values[expected.notna()]
    assert np.all(np.equal(actual_values, expected_values))
def __init__(
    self,
    problem_type,
    random_state=None,
    n_jobs=1,
    estimator=None,
    allowed_seconds=300,
    auto_estimator_kwargs=None,
):
    """Validate the arguments and set up preparers and estimator wrapper.

    Args:
        problem_type: must be ProblemType.CLASSIFICATION or
            ProblemType.REGRESSION.
        random_state: stored on the instance as given.
        n_jobs: stored on the instance and passed to
            ``configure_multiprocessing``.
        estimator: stored on the instance; cannot be combined with
            ``auto_estimator_kwargs``.
        allowed_seconds: stored on the instance as given.
        auto_estimator_kwargs: stored on the instance; cannot be combined
            with ``estimator``.

    Raises:
        ValueError: if ``problem_type`` is not a supported value, or if
            both ``estimator`` and ``auto_estimator_kwargs`` are given.

    """
    supported_types = (ProblemType.CLASSIFICATION, ProblemType.REGRESSION)
    if problem_type not in supported_types:
        raise ValueError(
            "Unknown Problem Type {}. Please choose from {} or {}".format(
                problem_type, *supported_types
            )
        )

    self.problem_type = problem_type
    self.random_state = random_state
    self.n_jobs = n_jobs

    self._X_preparer = DataPreparer(cache_manager=CacheManager())
    self.configure_multiprocessing(self.n_jobs)
    # NOTE(review): this attribute is spelled `_y_preprarer`, while the code
    # below reads `self.y_preparer`. Presumably a property defined elsewhere
    # in the class bridges the two -- confirm the spelling is intentional.
    self._y_preprarer = DataPreparer(
        cache_manager=CacheManager(),
        y_var=True,
        problem_type=self.problem_type,
    )

    if not (estimator is None or auto_estimator_kwargs is None):
        raise ValueError(
            "estimator and estimator_kwargs are mutually exclusive")

    self.allowed_seconds = allowed_seconds
    self.auto_estimator_kwargs = auto_estimator_kwargs
    self.estimator = estimator
    self.pipeline = None
    self.data_columns = None
    self.has_fitted = False

    # Wrap the estimator with the y preparer when one is available.
    self.estimator_wrapper = (
        EstimatorWrapper(self.estimator, self.y_preparer)
        if self.y_preparer is not None
        else self.estimator
    )
def test_data_exporter_determine_export_path_default(is_train):
    """Check the export path used when the cache holds no override.

    Args:
        is_train: True to ask for the training path, False for the test
            path.

    """
    exporter = DataExporterMapper(cache_manager=CacheManager())
    actual_path = exporter._determine_export_path(is_train=is_train)

    if is_train:
        default_path = DefaultConfig.PROCESSED_TRAINING_DATA_EXPORT_PATH
    else:
        default_path = DefaultConfig.PROCESSED_TEST_DATA_EXPORT_PATH

    assert actual_path == default_path
def test_cache_manager_checkkey(capsys, key, expected):
    """Check what ``check_key`` writes to stdout for a given key.

    Args:
        capsys: captures stdout and stderr. Pytest fixture.
        key (list): key passed to ``CacheManager.check_key``.
        expected: text that must appear in stdout, or None if stdout must
            stay empty.

    """
    from foreshadow.cachemanager import CacheManager

    manager = CacheManager()
    manager.check_key(key)
    captured_out, _ = capsys.readouterr()

    if expected is None:
        assert captured_out == ""  # nothing in out.
    else:
        assert expected in captured_out
def test_cache_manager_convert_key(key, expected):
    """Test that key conversion and error raising works as expected.

    Args:
        key: key to convert with ``CacheManager._convert_key``.
        expected: an exception class if conversion must raise, otherwise
            the value the conversion must return.

    """
    from foreshadow.cachemanager import CacheManager

    cs = CacheManager()

    # Decide up front whether `expected` is an exception class. The earlier
    # try/except TypeError around the whole check also swallowed TypeErrors
    # raised by the code under test, and ran no assertion at all when
    # `expected` was a class that is not an exception.
    expects_error = isinstance(expected, type) and issubclass(
        expected, BaseException
    )

    if expects_error:
        with pytest.raises(expected) as e:
            cs._convert_key(key)
        assert issubclass(e.type, expected)
    else:
        assert cs._convert_key(key) == expected
def test_preprocessor_numbers(mocker):
    """Run Preprocessor on a numeric column with a patched configuration.

    The config lookup and the intent resolver's transformer pick are both
    patched, then the transformed values are compared with hard-coded
    expected numbers.

    Args:
        mocker: A pytest-mocker instance

    """
    import numpy as np
    import pandas as pd
    from foreshadow.cachemanager import CacheManager
    from foreshadow.steps import Preprocessor
    from foreshadow.concrete import StandardScaler

    patched_config = {
        "Cleaner": [],
        "Tiebreak": [DummyIntent],
        "DummyIntent": {"Preprocessor": [StandardScaler]},
    }
    mocker.patch(
        "foreshadow.steps.preprocessor.config.get_config",
        return_value=patched_config,
        create=True,
    )
    mocker.patch(
        "foreshadow.smart.intent_resolving.intentresolver.IntentResolver"
        ".pick_transformer",
        return_value=DummyIntent(),
        create=True,
    )

    frame = pd.DataFrame({"financials": np.arange(10)})
    preprocessor = Preprocessor(cache_manager=CacheManager())
    preprocessor = preprocessor.fit(frame)
    transformed = preprocessor.transform(frame)

    expected_values = [
        -1.5666989036012806,
        -1.2185435916898848,
        -0.8703882797784892,
        -0.5222329678670935,
        -0.17407765595569785,
        0.17407765595569785,
        0.5222329678670935,
        0.8703882797784892,
        1.2185435916898848,
        1.5666989036012806,
    ]
    expected_frame = pd.DataFrame({"financials": expected_values})

    matches = (transformed == expected_frame).squeeze()
    assert matches.all()
def test_data_exporter_fit_transform(tmpdir):
    """Run ``fit_transform`` with a training export path set in the cache.

    Args:
        tmpdir: pytest fixture providing a temporary directory.

    """
    target = tmpdir.join("data_export_training.csv")

    cache = CacheManager()
    config = cache[AcceptedKey.CONFIG]
    config[ConfigKey.PROCESSED_TRAINING_DATA_EXPORT_PATH] = target

    exporter = DataExporterMapper(cache_manager=cache)
    source_df = _prepare_data_common()
    result_df = exporter.fit_transform(X=source_df)

    _assert_common(target, result_df, source_df)
def test_smarttransformer_empty_inverse(smart_child):
    """Call ``inverse_transform`` with an empty list after fitting.

    The test has no assertion; it passes as long as nothing raises.

    Args:
        smart_child: A subclass of SmartTransformer.

    """
    fit_values = [1, 2, 10]
    transformer = smart_child(cache_manager=CacheManager())
    transformer.fit(fit_values)
    transformer.inverse_transform([])
def test_cache_manager_iter(store):
    """Rebuild the internal store from the keys produced by iteration.

    Args:
        store: the internal dictionary to use.

    """
    from foreshadow.cachemanager import CacheManager

    cs = CacheManager()
    cs.store = store

    # Recreate the nested dict using only the keys the manager iterates over.
    rebuilt = {}
    for key in cs:
        outer = key[0]
        if rebuilt.get(outer) is None:
            rebuilt[outer] = {}
        inner = key[1]
        if inner is None:
            # No second-level key: the value replaces the whole entry.
            rebuilt[outer] = cs[key]
        else:
            rebuilt[outer][inner] = cs[key]

    assert rebuilt == cs.store
def test_cache_manager_create(args, kwargs):
    """Test creation of a CacheManager object.

    Args:
        args: positional arguments passed to the CacheManager constructor.
        kwargs: keyword arguments passed to the CacheManager constructor.

    """
    # The ABC aliases in `collections` were removed in Python 3.10;
    # `collections.abc` is the supported location.
    from collections.abc import MutableMapping
    from foreshadow.cachemanager import CacheManager

    cs = CacheManager(*args, **kwargs)
    assert isinstance(cs, MutableMapping)
def test_smarttransformer_function_override_invalid(smart_child):
    """Construct a SmartTransformer subclass with an unknown override.

    Args:
        smart_child: A subclass of SmartTransformer.

    """
    from foreshadow.exceptions import TransformerNotFound

    with pytest.raises(TransformerNotFound) as excinfo:
        smart_child(transformer="BAD", cache_manager=CacheManager())

    message = str(excinfo.value)
    assert "Could not find transformer BAD in" in message
def test_numerical_input_fittransform():
    """Check that CleanerMapper returns numeric input unchanged."""
    import numpy as np
    import pandas as pd
    from foreshadow.preparer import CleanerMapper
    from foreshadow.cachemanager import CacheManager

    frame = pd.DataFrame(
        {"financials": np.arange(10)}, columns=["financials"]
    )
    cleaner = CleanerMapper(cache_manager=CacheManager())
    cleaned = cleaner.fit_transform(frame)

    assert np.array_equal(cleaned, frame)
def test_cache_manager_getitem(key, item_to_set, expected):
    """Test that getitem works for all valid key combinations or error raised.

    Args:
        key (list): one- or two-element key to read from the CacheManager.
        item_to_set: the item placed in the internal store under ``key[0]``
            as starting data. Dependent on the length of the key.
        expected: an exception class if the lookup must raise, otherwise
            the value the lookup must return.

    Raises:
        NotImplementedError: if ``key`` has neither one nor two elements.

    """
    from foreshadow.cachemanager import CacheManager

    cs = CacheManager()

    if len(key) == 1:
        lookup = key[0]
    elif len(key) == 2:
        # `cs[a, b]` is the same as indexing with the tuple `(a, b)`.
        lookup = (key[0], key[1])
    else:
        raise NotImplementedError("test case not implemented")

    cs.store[key[0]] = item_to_set

    # Decide up front whether `expected` is an exception class. The earlier
    # try/except TypeError around the whole check also swallowed TypeErrors
    # raised by the lookup itself, and ran no assertion at all when
    # `expected` was a class that is not an exception.
    expects_error = isinstance(expected, type) and issubclass(
        expected, BaseException
    )

    if expects_error:
        with pytest.raises(expected) as e:
            cs[lookup]
        assert issubclass(e.type, expected)
    else:
        assert cs[lookup] == expected
def test_resolver_overall():
    """Fit IntentMapper on one numeric column and check the cached intent."""
    import numpy as np
    import pandas as pd
    from foreshadow.cachemanager import CacheManager
    from foreshadow.steps import IntentMapper

    frame = pd.DataFrame(
        {"financials": np.arange(100)}, columns=["financials"]
    )
    cache = CacheManager()
    IntentMapper(cache_manager=cache).fit(frame)

    resolved_intent = cache["intent", "financials"]
    assert resolved_intent == "Droppable"
def test_smarttransformer_should_resolve(smart_child, mocker):
    """Exercise ``should_resolve`` and ``force_reresolve`` across fits.

    By default the transformer is chosen on the first fit and kept on
    later fits. Setting ``should_resolve`` lets the next fit choose again,
    but only once. Setting ``force_reresolve`` makes every fit choose
    again.

    Args:
        smart_child: A subclass of SmartTransformer.
        mocker: A pytest-mocker instance (not used in the body).

    """
    import pandas as pd
    from foreshadow.concrete import StandardScaler, MinMaxScaler

    def pick_transformer(X, y=None, **fit_params):
        # The first value of the first column decides the scaler.
        first_column = X.iloc[:, 0]
        return StandardScaler() if first_column[0] == 0 else MinMaxScaler()

    smart = smart_child(cache_manager=CacheManager())
    smart.pick_transformer = pick_transformer

    zero_frame = pd.DataFrame([0])
    one_frame = pd.DataFrame([1])

    # Default: chosen on the first fit and kept afterwards.
    smart.fit(zero_frame)
    assert isinstance(smart.transformer, StandardScaler)
    smart.fit(one_frame)
    assert isinstance(smart.transformer, StandardScaler)

    # should_resolve: the next fit picks again, the one after does not.
    smart.should_resolve = True
    smart.fit(one_frame)
    assert isinstance(smart.transformer, MinMaxScaler)
    smart.fit(zero_frame)
    assert isinstance(smart.transformer, MinMaxScaler)

    # force_reresolve: every fit picks again.
    smart.force_reresolve = True
    smart.fit(zero_frame)
    assert isinstance(smart.transformer, StandardScaler)
    smart.fit(one_frame)
    assert isinstance(smart.transformer, MinMaxScaler)
def test_data_exporter_determine_export_path_user_specified(
    is_train, user_specified_path
):
    """Check that a path stored in the cache config is the one returned.

    Args:
        is_train: True to use the training path key, False for the test
            path key.
        user_specified_path: the path written into the cache config.

    """
    if is_train:
        config_key = ConfigKey.PROCESSED_TRAINING_DATA_EXPORT_PATH
    else:
        config_key = ConfigKey.PROCESSED_TEST_DATA_EXPORT_PATH

    cache = CacheManager()
    config = cache[AcceptedKey.CONFIG]
    config[config_key] = user_specified_path

    exporter = DataExporterMapper(cache_manager=cache)
    actual_path = exporter._determine_export_path(is_train=is_train)

    assert actual_path == user_specified_path
def test_data_exporter_transform(tmpdir):
    """Run ``fit`` then ``transform`` with a test export path in the cache.

    Args:
        tmpdir: pytest fixture providing a temporary directory.

    """
    target = tmpdir.join("data_export_test.csv")

    cache = CacheManager()
    config = cache[AcceptedKey.CONFIG]
    config[ConfigKey.PROCESSED_TEST_DATA_EXPORT_PATH] = target

    exporter = DataExporterMapper(cache_manager=cache)
    source_df = _prepare_data_common()

    # Need to fit before transform, even though this step doesn't fit
    # anything. This is to stay consistent with all other transformers.
    exporter.fit(X=source_df)
    result_df = exporter.transform(X=source_df)

    _assert_common(target, result_df, source_df)
def test_cache_manager_delitem(key, expected):
    """Delete keys from a CacheManager and check the raised error, if any.

    Args:
        key (list): key to delete from the CacheManager.
        expected: the exception class the deletion must raise, or None if
            a two-element deletion must succeed.

    """
    from foreshadow.cachemanager import CacheManager

    cs = CacheManager()
    cs.store["domain"] = {"test": True}

    # Single-element and string keys are always expected to raise.
    if len(key) == 1 or isinstance(key, str):
        with pytest.raises(expected) as e:
            del cs[key]
        assert issubclass(e.type, expected)

    if len(key) != 2:
        return

    if expected is None:
        del cs[key[0], key[1]]
        return

    with pytest.raises(expected) as e:
        del cs[key[0], key[1]]
def test_smart_text_wrong_intent():
    """Fit a TextEncoder on a column whose cached intent is TEXT.

    The fit is expected to raise a ValueError whose message mentions an
    empty vocabulary.
    """
    import pandas as pd
    from foreshadow.smart import TextEncoder

    X1 = pd.DataFrame(data=["1", "4", "a", "a"], columns=["col1"])
    manager = CacheManager()
    manager[AcceptedKey.INTENT, "col1"] = IntentType.TEXT
    encoder1 = TextEncoder(cache_manager=manager)

    with pytest.raises(ValueError) as e:
        encoder1.fit(X1)

    # Inspect the exception itself. `str(e)` stringifies the ExceptionInfo
    # wrapper, whose text varies by pytest version and may be truncated.
    assert "empty vocabulary" in str(e.value)
def test_data_cleaner_transform_before_fit():
    """Check that ``transform`` on an unfitted CleanerMapper raises."""
    import pandas as pd
    from foreshadow.steps import CleanerMapper
    from foreshadow.cachemanager import CacheManager

    amounts = ["$1.00", "$550.01", "$1234", "$12353.3345"]
    frame = pd.DataFrame({"financials": amounts}, columns=["financials"])
    cleaner = CleanerMapper(cache_manager=CacheManager())

    with pytest.raises(ValueError) as excinfo:
        cleaner.transform(frame)

    message = str(excinfo.value)
    assert message == "Cleaner has not been fitted yet."
def step():
    """Yield an instance of a locally defined PreparerStep subclass.

    The subclass also mixes in AutoIntentMixin, and its ``get_mapping``
    only calls ``check_resolve`` on the input.

    NOTE(review): the previous docstring said "Always returns
    StandardScaler"; nothing in this block shows that -- confirm.
    """
    from foreshadow.cachemanager import CacheManager
    from foreshadow.steps.autointentmap import AutoIntentMixin
    from foreshadow.steps.preparerstep import PreparerStep

    class Step(PreparerStep, AutoIntentMixin):
        def get_mapping(self, X):
            self.check_resolve(X)

    instance = Step(cache_manager=CacheManager())
    yield instance
def test_data_cleaner_fit():
    """Fit and transform CleanerMapper on date, JSON and financial columns."""
    import pandas as pd
    import numpy as np
    from foreshadow.steps import CleanerMapper
    from foreshadow.cachemanager import CacheManager

    raw = pd.DataFrame(
        {
            "dates": ["2019-02-11", "2019/03/12", "2000-04-15", "1900/01/55"],
            "json": [
                '{"date": "2019-04-11"}',
                '{"financial": "$1.0"}',
                '{"financial": "$1000.00"}',
                '{"random": "asdf"}',
            ],
            "financials": ["$1.00", "$550.01", "$1234", "$12353.3345"],
        },
        columns=["dates", "json", "financials"],
    )

    cleaner = CleanerMapper(cache_manager=CacheManager())
    cleaner.fit(raw)
    cleaned = cleaner.transform(raw)

    expected_columns = [
        "dates0",
        "dates1",
        "dates2",
        "json_date0",
        "json_date1",
        "json_date2",
        "json_financial",
        "json_random",
        "financials",
    ]
    expected_rows = [
        ["2019", "02", "11", "2019", "04", "11", np.nan, np.nan, "1.00"],
        ["2019", "03", "12", np.nan, "", "", "1.0", np.nan, "550.01"],
        ["2000", "04", "15", np.nan, "", "", "1000.00", np.nan, "1234"],
        ["1900", "01", "55", np.nan, "", "", np.nan, "asdf", "12353.3345"],
    ]
    expected = pd.DataFrame(expected_rows, columns=expected_columns)

    print(cleaned.values)
    print(expected.values)

    # Only the non-null cells are compared; NaN never equals itself.
    cleaned_values = cleaned.values[cleaned.notna()]
    expected_values = expected.values[expected.notna()]
    assert np.all(np.equal(cleaned_values, expected_values))
def test_drop_entire_data_frame():
    """Check that a frame of only empty strings makes the cleaner abort."""
    import pandas as pd
    from foreshadow.preparer import CleanerMapper
    from foreshadow.cachemanager import CacheManager
    import pytest

    frame = pd.DataFrame(
        {"financials": ["", "", "", ""]}, columns=["financials"]
    )
    cleaner = CleanerMapper(cache_manager=CacheManager())

    with pytest.raises(ValueError) as excinfo:
        cleaner.fit_transform(frame)

    expected_message = (
        "All columns are dropped since they all have over 90% of "
        "missing values. Aborting foreshadow."
    )
    raised_message = str(excinfo.value)
    assert expected_message in raised_message
def test_data_preparer_fit(cleaner_kwargs):
    """Fit a DataPreparer, built with cleaner kwargs, on the Boston data.

    The test has no assertion; it passes as long as fitting does not raise.

    Args:
        cleaner_kwargs: value passed to DataPreparer as ``cleaner_kwargs``.

    """
    from foreshadow.preparer import DataPreparer
    from foreshadow.cachemanager import CacheManager
    import pandas as pd

    housing = pd.read_csv(get_file_path("data", "boston_housing.csv"))

    cache = CacheManager()
    preparer = DataPreparer(cache, cleaner_kwargs=cleaner_kwargs)
    preparer.fit(housing)