예제 #1
0
def test_cache_manager_len(store, expected):
    """Check that len() of a CacheManager matches the expected count.

    Args:
        store: the internal dictionary to install on the manager.
        expected: the value len() should report for that store.

    """
    from foreshadow.cachemanager import CacheManager

    manager = CacheManager()
    manager.store = store
    reported_length = len(manager)
    assert reported_length == expected
예제 #2
0
def test_json_flattening_with_non_json_columns():
    """Flatten a JSON column that sits next to a plain numeric column."""
    json_rows = [
        '{"date": "2019-04-11"}',
        '{"financial": "$1.0"}',
        '{"financial": "$1000.00"}',
        '{"random": "asdf"}',
    ]
    frame = pd.DataFrame(
        {"json": json_rows, "num": [1, 2, 3, 4]}, columns=["json", "num"]
    )

    mapper = FlattenMapper(cache_manager=CacheManager())
    mapper.fit(frame)
    flattened = mapper.transform(frame)

    expected_frame = pd.DataFrame(
        [
            ["2019-04-11", np.nan, np.nan, 1],
            [np.nan, "$1.0", np.nan, 2],
            [np.nan, "$1000.00", np.nan, 3],
            [np.nan, np.nan, "asdf", 4],
        ],
        columns=["json_date", "json_financial", "json_random", "num"],
    )

    # Only the populated cells are compared; missing cells are masked out
    # on both sides before the element-wise equality check.
    actual_cells = flattened.values[flattened.notna()]
    expected_cells = expected_frame.values[expected_frame.notna()]
    assert np.all(np.equal(actual_cells, expected_cells))
예제 #3
0
def test_smarttransformer_function(smart_child):
    """Compare a SmartTransformer subclass against a plain StandardScaler.

    Both are run on the "crim" column of the Boston housing data, first via
    fit_transform and then via separate fit and transform calls.

    Args:
        smart_child: A subclass of SmartTransformer.

    """
    import numpy as np
    import pandas as pd

    from foreshadow.concrete import StandardScaler

    housing = pd.read_csv(get_file_path("data", "boston_housing.csv"))

    def crim_column():
        # Every call site gets its own freshly selected single-column frame.
        return housing[["crim"]]

    smart = smart_child(cache_manager=CacheManager())
    smart_out = smart.fit_transform(crim_column())

    scaler = StandardScaler()
    scaler_out = scaler.fit_transform(crim_column())

    assert smart_out.equals(scaler_out)

    smart.fit(crim_column())
    smart_out = smart.transform(crim_column())

    scaler.fit(crim_column())
    scaler_out = scaler.transform(crim_column())

    # TODO, remove when SmartTransformer is no longer wrapped
    # Column names will be different, thus np.allclose() is used
    assert np.allclose(smart_out, scaler_out)
예제 #4
0
def test_feature_reducer_get_mapping_by_intent():
    """Check that get_mapping groups columns by their cached intent."""
    import pandas as pd

    from foreshadow.cachemanager import CacheManager
    from foreshadow.steps import FeatureReducerMapper
    from foreshadow.steps.preparerstep import PreparerMapping
    from foreshadow.smart import FeatureReducer

    frame = pd.DataFrame(
        {
            "age": [10, 20, 33, 44],
            "weights": [20, 30, 50, 60],
            "occupation": ["engineer", "artist", "doctor", "inspector"],
        },
        columns=["age", "weights", "occupation"],
    )

    manager = CacheManager()
    intents = (
        ("age", "Numeric"),
        ("weights", "Numeric"),
        ("occupation", "Categorical"),
    )
    for column, intent in intents:
        manager["intent", column] = intent

    mapper = FeatureReducerMapper(cache_manager=manager)
    actual_mapping = mapper.get_mapping(frame)

    expected_mapping = PreparerMapping()
    expected_mapping.add(
        ["age", "weights"], [FeatureReducer(cache_manager=manager)], "Numeric"
    )
    expected_mapping.add(
        ["occupation"], [FeatureReducer(cache_manager=manager)], "Categorical"
    )

    # Entries are compared through their string form.
    for key in actual_mapping.store:
        assert key in expected_mapping.store
        actual_entry = str(actual_mapping.store[key])
        expected_entry = str(expected_mapping.store[key])
        assert actual_entry == expected_entry
예제 #5
0
def test_feature_reducer_fit_no_ops():
    """Check that fit followed by transform leaves the data unchanged."""
    import numpy as np
    import pandas as pd

    from foreshadow.cachemanager import CacheManager
    from foreshadow.steps import FeatureReducerMapper

    frame = pd.DataFrame(
        {
            "age": [10, 20, 33, 44],
            "weights": [20, 30, 50, 60],
            "occupation": ["engineer", "artist", "doctor", "inspector"],
        },
        columns=["age", "weights", "occupation"],
    )

    manager = CacheManager()
    intents = (
        ("age", "Numeric"),
        ("weights", "Numeric"),
        ("occupation", "Categorical"),
    )
    for column, intent in intents:
        manager["intent", column] = intent

    mapper = FeatureReducerMapper(cache_manager=manager)
    mapper.fit(frame)
    reduced = mapper.transform(frame)

    # Compare the non-missing cells of the input and the output.
    input_cells = frame.values[frame.notna()]
    output_cells = reduced.values[reduced.notna()]
    assert np.all(np.equal(input_cells, output_cells))
예제 #6
0
def test_cache_manager_setitem(capsys, key, item_to_set, expected, warning):
    """Test that setitem stores a value for one- and two-part keys.

    Args:
        capsys: captures stdout and stderr. Pytest fixture.
        key (list): key to set on the CacheManager; length 1 or 2.
        item_to_set: the item to store under the key.
        expected: the value expected when the key is read back.
        warning: True to also require "WARNING" in captured stdout.

    Raises:
        NotImplementedError: if the key has a length other than 1 or 2.

    """
    from foreshadow.cachemanager import CacheManager

    cs = CacheManager()
    if len(key) not in (1, 2):
        raise NotImplementedError("test case not implemented")

    if len(key) == 1:
        cs[key[0]] = item_to_set
        assert cs[key[0]] == expected
    else:
        cs[key[0], key[1]] = item_to_set
        print(cs.store)
        assert cs[key[0], key[1]] == expected

    if warning:
        out, _ = capsys.readouterr()
        assert "WARNING" in out
예제 #7
0
def test_json_flattening():
    """Check that a single JSON column is expanded into one column per key."""
    json_rows = [
        '{"date": "2019-04-11"}',
        '{"financial": "$1.0"}',
        '{"financial": "$1000.00"}',
        '{"random": "asdf"}',
    ]
    frame = pd.DataFrame({"json": json_rows}, columns=["json"])

    mapper = FlattenMapper(cache_manager=CacheManager())
    mapper.fit(frame)
    flattened = mapper.transform(frame)

    expected_frame = pd.DataFrame(
        [
            ["2019-04-11", np.nan, np.nan],
            [np.nan, "$1.0", np.nan],
            [np.nan, "$1000.00", np.nan],
            [np.nan, np.nan, "asdf"],
        ],
        columns=["json_date", "json_financial", "json_random"],
    )

    # Missing cells are masked out on both sides before comparing.
    actual_cells = flattened.values[flattened.notna()]
    expected_cells = expected_frame.values[expected_frame.notna()]
    assert np.all(np.equal(actual_cells, expected_cells))
예제 #8
0
    def __init__(
        self,
        problem_type,
        random_state=None,
        n_jobs=1,
        estimator=None,
        allowed_seconds=300,
        auto_estimator_kwargs=None,
    ):
        """Validate the problem type and set up preparers and estimator state.

        Args:
            problem_type: must be ProblemType.CLASSIFICATION or
                ProblemType.REGRESSION.
            random_state: stored on the instance as-is.
            n_jobs: stored on the instance and passed to
                configure_multiprocessing.
            estimator: stored on the instance; mutually exclusive with
                auto_estimator_kwargs.
            allowed_seconds: stored on the instance as-is.
            auto_estimator_kwargs: stored on the instance; mutually
                exclusive with estimator.

        Raises:
            ValueError: if problem_type is not one of the two supported
                values, or if both estimator and auto_estimator_kwargs are
                given.

        """

        if problem_type not in [
                ProblemType.CLASSIFICATION,
                ProblemType.REGRESSION,
        ]:
            raise ValueError("Unknown Problem Type {}. Please choose from {} "
                             "or {}".format(
                                 problem_type,
                                 ProblemType.CLASSIFICATION,
                                 ProblemType.REGRESSION,
                             ))
        self.problem_type = problem_type
        self.random_state = random_state
        self.n_jobs = n_jobs

        # Each preparer gets its own, separate CacheManager instance.
        self._X_preparer = DataPreparer(cache_manager=CacheManager())
        self.configure_multiprocessing(self.n_jobs)
        # NOTE(review): "_y_preprarer" looks like a misspelling of
        # "_y_preparer". self.y_preparer, read further down, is presumably a
        # property defined elsewhere in the class that returns this
        # attribute -- confirm before renaming.
        self._y_preprarer = DataPreparer(
            cache_manager=CacheManager(),
            y_var=True,
            problem_type=self.problem_type,
        )
        # NOTE(review): the message says "estimator_kwargs" while the
        # parameter is named auto_estimator_kwargs.
        if estimator is not None and auto_estimator_kwargs is not None:
            raise ValueError(
                "estimator and estimator_kwargs are mutually exclusive")
        self.allowed_seconds = allowed_seconds
        self.auto_estimator_kwargs = auto_estimator_kwargs
        self.estimator = estimator
        self.pipeline = None
        self.data_columns = None
        self.has_fitted = False

        # Wrap the estimator together with the y preparer when one is
        # available; otherwise use the estimator directly.
        if self.y_preparer is not None:
            self.estimator_wrapper = EstimatorWrapper(self.estimator,
                                                      self.y_preparer)
        else:
            self.estimator_wrapper = self.estimator
예제 #9
0
def test_data_exporter_determine_export_path_default(is_train):
    """Check the export path equals the DefaultConfig value when unset.

    Args:
        is_train: True to check the training path, False for the test path.

    """
    exporter = DataExporterMapper(cache_manager=CacheManager())

    data_path = exporter._determine_export_path(is_train=is_train)

    if is_train:
        expected_data_path = DefaultConfig.PROCESSED_TRAINING_DATA_EXPORT_PATH
    else:
        expected_data_path = DefaultConfig.PROCESSED_TEST_DATA_EXPORT_PATH
    assert data_path == expected_data_path
예제 #10
0
def test_cache_manager_checkkey(capsys, key, expected):
    """Test what check_key writes to stdout for a given key.

    Args:
        capsys: captures stdout and stderr. Pytest fixture.
        key (list): key to check on the CacheManager
        expected: text that must appear in stdout, or None when stdout
            must stay empty.

    """
    from foreshadow.cachemanager import CacheManager

    manager = CacheManager()
    manager.check_key(key)
    out, _ = capsys.readouterr()

    if expected is None:
        assert not out  # nothing in out.
    else:
        assert expected in out
예제 #11
0
def test_cache_manager_convert_key(key, expected):
    """Test that key conversion and error raising works as expected.

    Args:
        key: key to convert with CacheManager._convert_key
        expected: an exception class if the conversion must raise,
            otherwise the converted key that must be returned.

    """
    from foreshadow.cachemanager import CacheManager

    cs = CacheManager()
    # Decide up front whether an error is expected. Relying on
    # issubclass() raising TypeError would also catch a TypeError raised by
    # the code under test, and would assert nothing at all when expected is
    # a class that is not an exception.
    expects_error = isinstance(expected, type) and issubclass(
        expected, BaseException
    )
    if expects_error:
        with pytest.raises(expected) as e:
            cs._convert_key(key)
        assert issubclass(e.type, expected)
    else:
        assert cs._convert_key(key) == expected
예제 #12
0
def test_preprocessor_numbers(mocker):
    """Run Preprocessor on one numeric column with a patched configuration.

    The config lookup and IntentResolver.pick_transformer are both patched
    so that the column is handled by StandardScaler through DummyIntent.

    Args:
        mocker: A pytest-mocker instance

    """
    import numpy as np
    import pandas as pd
    from foreshadow.cachemanager import CacheManager
    from foreshadow.steps import Preprocessor
    from foreshadow.concrete import StandardScaler

    patched_config = {
        "Cleaner": [],
        "Tiebreak": [DummyIntent],
        "DummyIntent": {"Preprocessor": [StandardScaler]},
    }
    mocker.patch(
        "foreshadow.steps.preprocessor.config.get_config",
        return_value=patched_config,
        create=True,
    )
    mocker.patch(
        "foreshadow.smart.intent_resolving.intentresolver.IntentResolver"
        ".pick_transformer",
        return_value=DummyIntent(),
        create=True,
    )

    frame = pd.DataFrame({"financials": np.arange(10)})
    preprocessor = Preprocessor(cache_manager=CacheManager())
    preprocessor = preprocessor.fit(frame)
    scaled = preprocessor.transform(frame)

    expected_values = [
        -1.5666989036012806,
        -1.2185435916898848,
        -0.8703882797784892,
        -0.5222329678670935,
        -0.17407765595569785,
        0.17407765595569785,
        0.5222329678670935,
        0.8703882797784892,
        1.2185435916898848,
        1.5666989036012806,
    ]
    expected_frame = pd.DataFrame({"financials": expected_values})

    matches = scaled == expected_frame
    assert matches.squeeze().all()
예제 #13
0
def test_data_exporter_fit_transform(tmpdir):
    """Run fit_transform with a configured training export path.

    Args:
        tmpdir: pytest fixture providing a temporary directory.

    """
    export_path = tmpdir.join("data_export_training.csv")

    cache_manager = CacheManager()
    config = cache_manager[AcceptedKey.CONFIG]
    config[ConfigKey.PROCESSED_TRAINING_DATA_EXPORT_PATH] = export_path

    exporter = DataExporterMapper(cache_manager=cache_manager)

    source_df = _prepare_data_common()
    processed_df = exporter.fit_transform(X=source_df)
    _assert_common(export_path, processed_df, source_df)
예제 #14
0
def test_smarttransformer_empty_inverse(smart_child):
    """Call inverse_transform with an empty list on a fitted transformer.

    The test passes as long as neither call raises.

    Args:
        smart_child: A subclass of SmartTransformer.

    """
    fitted = smart_child(cache_manager=CacheManager())
    fitted.fit([1, 2, 10])

    fitted.inverse_transform([])
예제 #15
0
def test_cache_manager_iter(store):
    """Test that iterating a CacheManager visits its whole internal dict.

    Args:
        store: the internal dictionary to use.

    """
    from foreshadow.cachemanager import CacheManager

    cs = CacheManager()
    cs.store = store

    # Rebuild the nested dictionary from the keys produced by iteration and
    # compare it with the store that was installed.
    rebuilt = {}
    for key in cs:
        outer = key[0]
        if rebuilt.get(outer) is None:
            rebuilt[outer] = {}
        inner = key[1]
        if inner is None:
            rebuilt[outer] = cs[key]
        else:
            rebuilt[outer][inner] = cs[key]
    assert rebuilt == cs.store
예제 #16
0
def test_cache_manager_create(args, kwargs):
    """Test creation of a CacheManager object.

    Args:
        args: positional args to CacheManager init
        kwargs: keyword args to CacheManager init

    """
    # The ABC aliases were removed from ``collections`` in Python 3.10;
    # ``collections.abc`` is the supported location.
    from collections.abc import MutableMapping
    from foreshadow.cachemanager import CacheManager

    cs = CacheManager(*args, **kwargs)
    assert isinstance(cs, MutableMapping)
예제 #17
0
def test_smarttransformer_function_override_invalid(smart_child):
    """Check that an unknown transformer name raises TransformerNotFound.

    Args:
        smart_child: A subclass of SmartTransformer.

    """
    from foreshadow.exceptions import TransformerNotFound

    with pytest.raises(TransformerNotFound) as excinfo:
        smart_child(transformer="BAD", cache_manager=CacheManager())

    message = str(excinfo.value)
    assert "Could not find transformer BAD in" in message
예제 #18
0
def test_numerical_input_fittransform():
    """Check that CleanerMapper.fit_transform leaves integer data as is."""
    import numpy as np
    import pandas as pd
    from foreshadow.preparer import CleanerMapper
    from foreshadow.cachemanager import CacheManager

    frame = pd.DataFrame(
        {"financials": np.arange(10)}, columns=["financials"]
    )
    cleaner = CleanerMapper(cache_manager=CacheManager())
    cleaned = cleaner.fit_transform(frame)
    assert np.array_equal(cleaned, frame)
예제 #19
0
def test_cache_manager_getitem(key, item_to_set, expected):
    """Test that getitem works for all valid key combinations or error raised.

    Args:
        key (list): key to access on the CacheManager; length 1 or 2.
        item_to_set: the item stored under key[0] as starting data.
            Dependent on the length of the key.
        expected: an exception class if the lookup must raise, otherwise
            the value the lookup must return.

    Raises:
        NotImplementedError: if the key has a length other than 1 or 2.

    """
    from foreshadow.cachemanager import CacheManager

    cs = CacheManager()
    if len(key) not in (1, 2):
        raise NotImplementedError("test case not implemented")

    # Both key shapes seed the store the same way; only the lookup differs.
    cs.store[key[0]] = item_to_set

    def lookup():
        if len(key) == 1:
            return cs[key[0]]
        return cs[key[0], key[1]]

    # Decide up front whether an error is expected. Relying on
    # issubclass() raising TypeError would also catch a TypeError raised by
    # the lookup itself, and would assert nothing at all when expected is a
    # class that is not an exception.
    expects_error = isinstance(expected, type) and issubclass(
        expected, BaseException
    )
    if expects_error:
        with pytest.raises(expected) as e:
            lookup()
        assert issubclass(e.type, expected)
    else:
        assert lookup() == expected
예제 #20
0
def test_resolver_overall():
    """Fit IntentMapper on one column and check the intent it caches."""

    import numpy as np
    import pandas as pd
    from foreshadow.cachemanager import CacheManager
    from foreshadow.steps import IntentMapper

    frame = pd.DataFrame(
        {"financials": np.arange(100)}, columns=["financials"]
    )
    manager = CacheManager()
    intent_mapper = IntentMapper(cache_manager=manager)
    intent_mapper.fit(frame)

    resolved_intent = manager["intent", "financials"]
    assert resolved_intent == "Droppable"
예제 #21
0
def test_smarttransformer_should_resolve(smart_child, mocker):
    """Test SmartTransformer should_resolve functionality.

    Three phases are exercised in order:

    1. By default the transformer is resolved once and is not changed by a
       later fit on different data.
    2. After should_resolve is set, the next fit picks a new transformer,
       but only that one time.
    3. After force_reresolve is set, every fit picks the transformer again.

    Args:
        smart_child: A subclass of SmartTransformer.
        mocker: A pytest-mocker instance (not used in the body).

    """
    import pandas as pd

    from foreshadow.concrete import StandardScaler, MinMaxScaler

    def pick_transformer(X, y=None, **fit_params):
        # The first value of the first column decides which scaler is used.
        first_column = X.iloc[:, 0]
        return StandardScaler() if first_column[0] == 0 else MinMaxScaler()

    smart = smart_child(cache_manager=CacheManager())
    smart.pick_transformer = pick_transformer

    zero_frame = pd.DataFrame([0])
    one_frame = pd.DataFrame([1])

    def fit_and_expect(frame, transformer_cls):
        smart.fit(frame)
        assert isinstance(smart.transformer, transformer_cls)

    fit_and_expect(zero_frame, StandardScaler)
    fit_and_expect(one_frame, StandardScaler)

    smart.should_resolve = True
    fit_and_expect(one_frame, MinMaxScaler)
    fit_and_expect(zero_frame, MinMaxScaler)

    smart.force_reresolve = True
    fit_and_expect(zero_frame, StandardScaler)
    fit_and_expect(one_frame, MinMaxScaler)
예제 #22
0
def test_data_exporter_determine_export_path_user_specified(
        is_train, user_specified_path):
    """Check that a path stored in the cached config is the one returned.

    Args:
        is_train: True to configure the training path, False for the test
            path.
        user_specified_path: the path stored in the config before the
            exporter is created.

    """
    cache_manager = CacheManager()
    if is_train:
        config_key = ConfigKey.PROCESSED_TRAINING_DATA_EXPORT_PATH
    else:
        config_key = ConfigKey.PROCESSED_TEST_DATA_EXPORT_PATH

    config = cache_manager[AcceptedKey.CONFIG]
    config[config_key] = user_specified_path

    exporter = DataExporterMapper(cache_manager=cache_manager)

    data_path = exporter._determine_export_path(is_train=is_train)

    assert data_path == user_specified_path
예제 #23
0
def test_data_exporter_transform(tmpdir):
    """Run fit then transform with a configured test export path.

    Args:
        tmpdir: pytest fixture providing a temporary directory.

    """
    export_path = tmpdir.join("data_export_test.csv")

    cache_manager = CacheManager()
    config = cache_manager[AcceptedKey.CONFIG]
    config[ConfigKey.PROCESSED_TEST_DATA_EXPORT_PATH] = export_path

    exporter = DataExporterMapper(cache_manager=cache_manager)

    source_df = _prepare_data_common()
    # Need to fit before transform, even though this step doesn't fit
    # anything. This is to stay consistent with all other transformers.
    exporter.fit(X=source_df)
    processed_df = exporter.transform(X=source_df)
    _assert_common(export_path, processed_df, source_df)
예제 #24
0
def test_cache_manager_delitem(key, expected):
    """Test that delitem works for all valid key combinations or error raised.

    Args:
        key (list): key to delete on the CacheManager
        expected: the exception class the deletion must raise, or None (for
            two-part keys only) when the deletion must succeed.

    """
    from foreshadow.cachemanager import CacheManager

    cs = CacheManager()
    cs.store["domain"] = {"test": True}
    if len(key) == 1 or isinstance(key, str):
        with pytest.raises(expected) as e:
            del cs[key]
        assert issubclass(e.type, expected)
    # elif, not if: a two-character string key belongs to the branch above
    # only and must not also be treated as a two-part key.
    elif len(key) == 2:
        if expected is not None:
            with pytest.raises(expected):
                del cs[key[0], key[1]]
        else:
            del cs[key[0], key[1]]
예제 #25
0
def test_smart_text_wrong_intent():
    """Check that fitting TextEncoder on this column raises ValueError.

    The column is tagged with IntentType.TEXT in the cache, and the error
    message is expected to mention an empty vocabulary.
    """
    import pandas as pd

    from foreshadow.smart import TextEncoder

    X1 = pd.DataFrame(data=["1", "4", "a", "a"], columns=["col1"])

    manager = CacheManager()
    manager[AcceptedKey.INTENT, "col1"] = IntentType.TEXT

    encoder1 = TextEncoder(cache_manager=manager)

    with pytest.raises(ValueError) as e:
        encoder1.fit(X1)
    # The message check has to sit outside the with-block: statements after
    # the raising call inside the block are never executed. The exception
    # itself is available as e.value.
    assert "empty vocabulary" in str(e.value)
예제 #26
0
def test_data_cleaner_transform_before_fit():
    """Check that transform before fit raises ValueError with a fixed text."""
    import pandas as pd
    from foreshadow.steps import CleanerMapper
    from foreshadow.cachemanager import CacheManager

    amounts = ["$1.00", "$550.01", "$1234", "$12353.3345"]
    frame = pd.DataFrame({"financials": amounts}, columns=["financials"])
    cleaner = CleanerMapper(cache_manager=CacheManager())

    with pytest.raises(ValueError) as excinfo:
        cleaner.transform(frame)

    message = str(excinfo.value)
    assert message == "Cleaner has not been fitted yet."
예제 #27
0
def step():
    """Yield an instance of a minimal PreparerStep subclass.

    Note:
        The subclass also mixes in AutoIntentMixin. Its get_mapping only
        calls check_resolve on the input and returns nothing.

    """
    from foreshadow.steps.preparerstep import PreparerStep
    from foreshadow.steps.autointentmap import AutoIntentMixin
    from foreshadow.cachemanager import CacheManager

    class Step(PreparerStep, AutoIntentMixin):
        def get_mapping(self, X):
            self.check_resolve(X)

    instance = Step(cache_manager=CacheManager())
    yield instance
예제 #28
0
def test_data_cleaner_fit():
    """Fit and apply CleanerMapper on date, JSON and currency columns."""
    import pandas as pd
    import numpy as np
    from foreshadow.steps import CleanerMapper
    from foreshadow.cachemanager import CacheManager

    raw = pd.DataFrame(
        {
            "dates": ["2019-02-11", "2019/03/12", "2000-04-15", "1900/01/55"],
            "json": [
                '{"date": "2019-04-11"}',
                '{"financial": "$1.0"}',
                '{"financial": "$1000.00"}',
                '{"random": "asdf"}',
            ],
            "financials": ["$1.00", "$550.01", "$1234", "$12353.3345"],
        },
        columns=["dates", "json", "financials"],
    )
    cleaner = CleanerMapper(cache_manager=CacheManager())
    cleaner.fit(raw)
    cleaned = cleaner.transform(raw)

    expected_columns = [
        "dates0",
        "dates1",
        "dates2",
        "json_date0",
        "json_date1",
        "json_date2",
        "json_financial",
        "json_random",
        "financials",
    ]
    expected = pd.DataFrame(
        [
            ["2019", "02", "11", "2019", "04", "11", np.nan, np.nan, "1.00"],
            ["2019", "03", "12", np.nan, "", "", "1.0", np.nan, "550.01"],
            ["2000", "04", "15", np.nan, "", "", "1000.00", np.nan, "1234"],
            ["1900", "01", "55", np.nan, "", "", np.nan, "asdf", "12353.3345"],
        ],
        columns=expected_columns,
    )
    print(cleaned.values)
    print(expected.values)

    # Only the non-missing cells of each frame take part in the comparison.
    cleaned_cells = cleaned.values[cleaned.notna()]
    expected_cells = expected.values[expected.notna()]
    assert np.all(np.equal(cleaned_cells, expected_cells))
예제 #29
0
def test_drop_entire_data_frame():
    """Check that a frame of only empty strings aborts with ValueError."""
    import pandas as pd
    import pytest
    from foreshadow.preparer import CleanerMapper
    from foreshadow.cachemanager import CacheManager

    frame = pd.DataFrame(
        {"financials": ["", "", "", ""]}, columns=["financials"]
    )
    cleaner = CleanerMapper(cache_manager=CacheManager())

    with pytest.raises(ValueError) as excinfo:
        cleaner.fit_transform(frame)

    expected_fragment = (
        "All columns are dropped since they all have over 90% of "
        "missing values. Aborting foreshadow."
    )
    assert expected_fragment in str(excinfo.value)
예제 #30
0
def test_data_preparer_fit(cleaner_kwargs):
    """Fit a DataPreparer built with cleaner kwargs on the Boston data.

    The test passes as long as construction and fit do not raise.

    Args:
          cleaner_kwargs: kwargs to CleanerMapper step

    """
    from foreshadow.preparer import DataPreparer
    from foreshadow.cachemanager import CacheManager
    import pandas as pd

    housing = pd.read_csv(get_file_path("data", "boston_housing.csv"))

    manager = CacheManager()
    preparer = DataPreparer(manager, cleaner_kwargs=cleaner_kwargs)
    preparer.fit(housing)