예제 #1
0
def test_foreshadow_abort_on_empty_data_frame_after_cleaning(
        filename, problem_type, X_start, X_end, target):
    """Expect ``fit`` to abort with a ValueError once every column is dropped.

    Args:
        filename: CSV file name resolved via ``get_file_path("data", ...)``.
        problem_type: problem type handed to the estimator and Foreshadow.
        X_start: label of the first feature column of the ``.loc`` slice.
        X_end: label of the last feature column of the ``.loc`` slice.
        target: label of the target column.

    """
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    from foreshadow.estimators import AutoEstimator
    from foreshadow.foreshadow import Foreshadow

    np.random.seed(1337)

    frame = pd.read_csv(get_file_path("data", filename))
    features = frame.loc[:, X_start:X_end]
    labels = frame.loc[:, target]

    # The held-out part of the split is not used by this test.
    X_train, _, y_train, _ = train_test_split(features,
                                              labels,
                                              test_size=0.2)

    tpot_estimator = AutoEstimator(
        problem_type=problem_type,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1},
    )
    shadow = Foreshadow(estimator=tpot_estimator, problem_type=problem_type)

    expected_message = (
        "All columns are dropped since they all have over 90% of "
        "missing values. Aborting foreshadow.")
    with pytest.raises(ValueError) as excinfo:
        shadow.fit(X_train, y_train)
    assert expected_message in str(excinfo.value)
예제 #2
0
def test_foreshadow_pickling_and_unpickling_unfitted(tmpdir):
    """Check that pickling the pipeline of an unfitted Foreshadow fails.

    Args:
        tmpdir: directory-like object whose ``join`` builds the pickle path
            (presumably pytest's ``tmpdir`` fixture).

    """
    from foreshadow.estimators import AutoEstimator
    from foreshadow.foreshadow import Foreshadow

    classification = ProblemType.CLASSIFICATION
    tpot_estimator = AutoEstimator(
        problem_type=classification,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1},
    )
    unfitted = Foreshadow(estimator=tpot_estimator,
                          problem_type=classification)

    # ``fit`` was never called, so a ValueError is expected here.
    with pytest.raises(ValueError):
        unfitted.pickle_fitted_pipeline(tmpdir.join("fitted_pipeline.p"))
예제 #3
0
def test_text_classification_foreshadow():
    """Fit a TPOT-backed Foreshadow on four 20 Newsgroups categories.

    The test-set score is printed; nothing is asserted about its value.
    """
    import pandas as pd
    from sklearn.datasets import fetch_20newsgroups

    from foreshadow.estimators import AutoEstimator

    newsgroups = [
        "alt.atheism",
        "soc.religion.christian",
        "comp.graphics",
        "sci.med",
    ]
    # Options shared by the train and test downloads.
    shared_fetch_args = {
        "categories": newsgroups,
        "shuffle": True,
        "random_state": 42,
    }

    train_bunch = fetch_20newsgroups(subset="train", **shared_fetch_args)
    # The training frame and series carry an explicit 0..n-1 list index.
    X_train = pd.DataFrame(
        data=train_bunch.data,
        columns=["text"],
        index=list(range(len(train_bunch.data))),
    )
    y_train = pd.Series(
        data=train_bunch.target,
        name="category",
        index=list(range(len(train_bunch.target))),
    )

    test_bunch = fetch_20newsgroups(subset="test", **shared_fetch_args)
    X_test = pd.DataFrame(data=test_bunch.data, columns=["text"])
    y_test = pd.Series(data=test_bunch.target, name="category")

    tpot_estimator = AutoEstimator(
        problem_type=ProblemType.CLASSIFICATION,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1, "random_state": 42},
    )
    shadow = Foreshadow(estimator=tpot_estimator,
                        problem_type=ProblemType.CLASSIFICATION)

    shadow.fit(X_train, y_train)

    # Per the original author's note, this gives about 87.5%.
    print(shadow.score(X_test, y_test))
예제 #4
0
def test_foreshadow_get_params_keys(deep):
    """Check that ``get_params`` exposes the expected top-level keys.

    Args:
        deep: value forwarded as the ``deep`` argument of ``get_params``.

    """
    from foreshadow.foreshadow import Foreshadow

    model = Foreshadow(problem_type=ProblemType.CLASSIFICATION)
    reported = model.get_params(deep=deep)

    for expected_key in ("problem_type", "estimator", "data_columns"):
        assert expected_key in reported
예제 #5
0
def test_foreshadow_unknown_problem_type(problem_type):
    """Check that an unsupported problem type is rejected at construction.

    Args:
        problem_type: the problem type value expected to be rejected.

    """
    from foreshadow.foreshadow import Foreshadow

    with pytest.raises(ValueError) as raised:
        Foreshadow(problem_type=problem_type)

    message = str(raised.value)
    assert "Unknown Problem Type" in message
예제 #6
0
def test_foreshadow_param_optimize():  # TODO: Make this test faster
    """Compare the ``param_mapping`` search-space keys with a pickled truth.

    A regression Foreshadow object is built from the optimizer test config,
    its pipeline is assembled by hand, and the keys of the first entry
    returned by ``param_mapping`` are compared with the keys of the first
    entry stored in ``search_space_optimize.pkl``.
    """
    import json
    import pickle

    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.optimizers.param_mapping import param_mapping
    from foreshadow.preparer import DataPreparer

    boston_path = get_file_path("data", "boston_housing.csv")
    test_json_path = get_file_path("configs", "optimizer_test.json")

    truth_path = get_file_path("configs", "search_space_optimize.pkl")

    data = pd.read_csv(boston_path)
    # Use a context manager so the config file handle is closed promptly
    # instead of being left to the garbage collector.
    with open(test_json_path, "r") as config_file:
        js = json.load(config_file)

    fs = Foreshadow(
        DataPreparer(from_json=js),
        False,
        LinearRegression(),
        ProblemType.REGRESSION,
        GridSearchCV,
    )

    # The pipeline is assembled manually rather than through ``fit``.
    fs.pipeline = Pipeline([("preparer", fs.X_preparer),
                            ("estimator", fs.estimator)])

    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]

    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.25)

    results = param_mapping(fs.pipeline, x_train, y_train)

    # (If you change default configs) or file structure, you will need to
    # verify the outputs are correct manually and regenerate the pickle
    # truth file.
    with open(truth_path, "rb") as truth_file:
        truth = pickle.load(truth_file)

    assert results[0].keys() == truth[0].keys()
예제 #7
0
def test_foreshadow_estimator_custom():
    """Check that a user-supplied estimator is kept on the Foreshadow object."""
    from foreshadow.base import BaseEstimator
    from foreshadow.foreshadow import Foreshadow

    custom_estimator = BaseEstimator()
    model = Foreshadow(estimator=custom_estimator,
                       problem_type=ProblemType.CLASSIFICATION)

    assert isinstance(model.estimator, BaseEstimator)
예제 #8
0
def test_foreshadow_serialization_boston_housing_regression_multiprocessing(
        tmpdir):
    """Fit and score a LinearRegression Foreshadow with multiprocessing on.

    The score on the held-out 20% of the Boston data is printed, not
    asserted.

    Args:
        tmpdir: accepted but not used by the body of this test.

    """
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_boston
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split

    from foreshadow.foreshadow import Foreshadow

    np.random.seed(1337)

    dataset = load_boston()
    features = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    targets = pd.DataFrame(dataset.target, columns=["target"])

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        targets,
                                                        test_size=0.2)

    shadow = Foreshadow(estimator=LinearRegression(),
                        problem_type=ProblemType.REGRESSION)
    shadow.configure_multiprocessing(n_job=-1)
    shadow.fit(X_train, y_train)

    print(shadow.score(X_test, y_test))
예제 #9
0
def test_foreshadow_titanic(tmpdir):
    """Fit a TPOT-backed Foreshadow on Titanic with ``Name`` forced to TEXT.

    The score on the held-out 20% is printed, not asserted.

    Args:
        tmpdir: accepted but not used by the body of this test.

    """
    import pandas as pd

    from foreshadow.estimators import AutoEstimator

    titanic = pd.read_csv(get_file_path("data", "titanic-train.csv"))
    survived = titanic.loc[:, "Survived"]
    # Feature columns are the label slice Pclass..Embarked minus three
    # columns that this test excludes.
    features = titanic.loc[:, "Pclass":"Embarked"].drop(
        columns=["SibSp", "Parch", "Cabin"])

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        survived,
                                                        test_size=0.2,
                                                        random_state=42)

    tpot_estimator = AutoEstimator(
        problem_type=ProblemType.CLASSIFICATION,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1, "random_state": 42},
    )
    shadow = Foreshadow(estimator=tpot_estimator,
                        problem_type=ProblemType.CLASSIFICATION)

    # The override is registered before fitting.
    shadow.override_intent(column_name="Name", intent=IntentType.TEXT)
    shadow.fit(X_train, y_train)

    print(shadow.score(X_test, y_test))
예제 #10
0
def test_foreshadow_estimator_error():
    """Check that a non-estimator value is rejected with a clear message."""
    from foreshadow.foreshadow import Foreshadow

    bogus_estimator = "Invalid"
    with pytest.raises(ValueError) as raised:
        Foreshadow(estimator=bogus_estimator,
                   problem_type=ProblemType.CLASSIFICATION)

    assert str(raised.value) == "Invalid value passed as estimator"
예제 #11
0
def test_foreshadow_predict_before_fit():
    """Check that predicting with an unfitted Foreshadow raises ValueError."""
    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split

    from foreshadow.foreshadow import Foreshadow

    np.random.seed(0)
    features = np.arange(200).reshape((-1, 2))
    targets = np.random.normal(0, 1, 100).reshape((-1, 1))
    # Only the held-out features are needed for the ``predict`` call.
    _, X_test, _, _ = train_test_split(features, targets, test_size=0.1)

    unfitted = Foreshadow(problem_type=ProblemType.REGRESSION,
                          estimator=LinearRegression())

    with pytest.raises(ValueError) as raised:
        unfitted.predict(X_test)

    assert str(raised.value) == "Foreshadow has not been fit yet"
예제 #12
0
def test_foreshadow_integration_data_cleaner_can_drop(filename, problem_type,
                                                      X_start, X_end, target,
                                                      tmpdir):
    """Score a fitted model and its unpickled pipeline on data with a NaN col.

    After fitting and pickling, the ``X_start`` column of the test features
    is overwritten with NaN; the in-memory model and the unpickled pipeline
    must then produce scores that agree to two decimal places.

    Args:
        filename: CSV file name resolved via ``get_file_path("data", ...)``.
        problem_type: problem type handed to the estimator and Foreshadow.
        X_start: label of the first feature column; also the column blanked
            out in the test set.
        X_end: label of the last feature column of the ``.loc`` slice.
        target: label of the target column.
        tmpdir: directory-like object whose ``join`` builds the pickle path.

    """
    import pickle
    import unittest

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    from foreshadow.estimators import AutoEstimator
    from foreshadow.foreshadow import Foreshadow

    np.random.seed(1337)

    frame = pd.read_csv(get_file_path("data", filename))
    features = frame.loc[:, X_start:X_end]
    labels = frame.loc[:, target]

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.2)

    tpot_estimator = AutoEstimator(
        problem_type=problem_type,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1},
    )
    shadow = Foreshadow(estimator=tpot_estimator, problem_type=problem_type)

    pickle_path = tmpdir.join("fitted_pipeline.p")
    shadow.fit(X_train, y_train)
    shadow.pickle_fitted_pipeline(pickle_path)

    with open(pickle_path, "rb") as pickled:
        restored_pipeline = pickle.load(pickled)

    # A column that is entirely empty in the test set must not make
    # scoring fail.
    X_test[X_start] = np.nan
    live_score = shadow.score(X_test, y_test)
    restored_score = restored_pipeline.score(X_test, y_test)

    # TPOT is randomized and only runs briefly here, so no absolute score
    # threshold is asserted; only the agreement between the two scores is.
    checker = unittest.TestCase("__init__")
    checker.assertAlmostEqual(live_score, restored_score, places=2)
예제 #13
0
def test_core_foreshadow_example_regression():
    """Run the basic regression example on Boston and print the R2 score."""
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_boston
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split

    from foreshadow.foreshadow import Foreshadow

    np.random.seed(0)

    dataset = load_boston()
    features = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    targets = pd.DataFrame(dataset.target, columns=["target"])
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        targets,
                                                        test_size=0.2)

    regressor = Foreshadow(estimator=LinearRegression(),
                           problem_type=ProblemType.REGRESSION)
    regressor.fit(X_train, y_train)

    predictions = regressor.predict(X_test)
    score = r2_score(y_test, predictions)
    print("Boston score: %f" % score)
예제 #14
0
def test_foreshadow_param_optimize_invalid_array_idx():
    """Check that an invalid list index in the optimizer config is rejected.

    A regression Foreshadow object is built from
    ``invalid_optimizer_config.json``; ``param_mapping`` on the hand-built
    pipeline must raise a ValueError whose message starts with
    "Attempted to index list".
    """
    import json

    import pandas as pd

    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.preparer import DataPreparer
    from foreshadow.cachemanager import CacheManager

    boston_path = get_file_path("data", "boston_housing.csv")
    test_path = get_file_path("configs", "invalid_optimizer_config.json")

    data = pd.read_csv(boston_path)
    # Use a context manager so the config file handle is closed promptly
    # instead of being left to the garbage collector.
    with open(test_path, "r") as config_file:
        cfg = json.load(config_file)

    fs = Foreshadow(
        DataPreparer(CacheManager(), from_json=cfg),
        False,
        LinearRegression(),
        ProblemType.REGRESSION,
        GridSearchCV,
    )

    # The pipeline is assembled manually rather than through ``fit``.
    fs.pipeline = Pipeline([("preprocessor", fs.X_preparer),
                            ("estimator", fs.estimator)])

    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]

    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.25)

    # NOTE(review): ``param_mapping`` is not imported in this function (hence
    # the noqa); it has to be available at module level - confirm.
    with pytest.raises(ValueError) as e:
        param_mapping(fs.pipeline, x_train, y_train)  # noqa: F821

    assert str(e.value).startswith("Attempted to index list")
예제 #15
0
def test_foreshadow_param_optimize_invalid_dict_key():
    """Check that an unknown key in the combinations config is rejected.

    ``param_mapping`` on the hand-built pipeline must raise a ValueError
    with the exact message "Invalid JSON Key fake in {}".
    """
    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline

    from foreshadow.cachemanager import CacheManager
    from foreshadow.foreshadow import Foreshadow
    from foreshadow.preparer import DataPreparer

    housing = pd.read_csv(get_file_path("data", "boston_housing.csv"))

    # Config whose only combination refers to a key that does not exist.
    bad_config = {"combinations": [{"fake.fake": "[1,2]"}]}
    preparer = DataPreparer(cache_manager=CacheManager(),
                            from_json=bad_config)

    fs = Foreshadow(
        preparer,
        False,
        LinearRegression(),
        ProblemType.REGRESSION,
        GridSearchCV,
    )
    # The pipeline is assembled manually rather than through ``fit``.
    fs.pipeline = Pipeline([
        ("preprocessor", fs.X_preparer),
        ("estimator", fs.estimator),
    ])

    features = housing.drop(["medv"], axis=1, inplace=False)
    target = housing[["medv"]]
    x_train, _, y_train, _ = train_test_split(features,
                                              target,
                                              test_size=0.25)

    with pytest.raises(ValueError) as raised:
        param_mapping(fs.pipeline, x_train, y_train)  # noqa: F821

    assert str(raised.value) == "Invalid JSON Key fake in {}"
예제 #16
0
def construct_foreshadow_object_common(estimator=None,
                                       problem_type=None,
                                       estimator_kwargs=None):
    """Build a Foreshadow object, defaulting to a TPOT AutoEstimator.

    Args:
        estimator: estimator to wrap. When ``None``, an ``AutoEstimator``
            with ``auto="tpot"`` is created for ``problem_type``.
        problem_type: problem type passed to the ``AutoEstimator`` (when one
            is created) and to ``Foreshadow``.
        estimator_kwargs: keyword arguments for the ``AutoEstimator``; only
            used when ``estimator`` is ``None``. ``None`` (the default)
            means ``{"max_time_mins": 1}``.

    Returns:
        The constructed ``Foreshadow`` object.

    """
    # A fresh dict per call: a dict literal in the signature would be one
    # object shared (and possibly mutated) across every call.
    if estimator_kwargs is None:
        estimator_kwargs = {"max_time_mins": 1}

    # Compare against None explicitly; the truthiness of an arbitrary
    # estimator object is not a reliable "was it provided" check, since
    # objects defining __len__/__bool__ may be falsy or raise.
    if estimator is None:
        from foreshadow.estimators import AutoEstimator

        estimator = AutoEstimator(
            problem_type=problem_type,
            auto="tpot",
            estimator_kwargs=estimator_kwargs,
        )
    shadow = Foreshadow(estimator=estimator, problem_type=problem_type)
    return shadow
예제 #17
0
def test_foreshadow_param_optimize_no_combinations():
    """Compare the search-space keys for an empty config with a pickled truth.

    A regression Foreshadow object is built with an empty ``from_json``
    config; the keys of the first entry returned by ``param_mapping`` must
    equal the keys of the first entry stored in ``search_space_no_combo.pkl``.
    """
    import pickle

    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.preparer import DataPreparer
    from foreshadow.cachemanager import CacheManager

    boston_path = get_file_path("data", "boston_housing.csv")
    test_path = get_file_path("configs", "search_space_no_combo.pkl")

    data = pd.read_csv(boston_path)

    fs = Foreshadow(
        DataPreparer(cache_manager=CacheManager(), from_json={}),
        False,
        LinearRegression(),
        ProblemType.REGRESSION,
        GridSearchCV,
    )

    # The pipeline is assembled manually rather than through ``fit``.
    fs.pipeline = Pipeline([("preprocessor", fs.X_preparer),
                            ("estimator", fs.estimator)])

    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]

    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.25)

    # NOTE(review): ``param_mapping`` is not imported in this function (hence
    # the noqa); it has to be available at module level - confirm.
    results = param_mapping(fs.pipeline, x_train, y_train)  # noqa: F821

    # Use a context manager so the truth file handle is closed promptly
    # instead of being left to the garbage collector.
    with open(test_path, "rb") as truth_file:
        truth = pickle.load(truth_file)

    assert results[0].keys() == truth[0].keys()
예제 #18
0
def test_foreshadow_param_optimize_fit(mocker):
    """Check where the estimator ends up in the pipeline when fitting with
    a custom optimizer, with and without data preparers.

    Args:
        mocker: object providing ``patch`` (presumably the pytest-mock
            fixture).

    """
    import pandas as pd
    from foreshadow.base import BaseEstimator, TransformerMixin
    from sklearn.model_selection._search import BaseSearchCV

    from foreshadow.foreshadow import Foreshadow

    boston_path = get_file_path("data", "boston_housing.csv")
    data = pd.read_csv(boston_path)

    # Stand-in estimator whose fit does nothing.
    class DummyRegressor(BaseEstimator, TransformerMixin):
        def fit(self, X, y):
            return self

    # Stand-in search object: it records the estimator it was given as
    # ``best_estimator_`` and its fit does nothing. ``params`` is ignored.
    class DummySearch(BaseSearchCV):
        def __init__(self, estimator, params):
            self.best_estimator_ = estimator

        def fit(self, X, y=None, **fit_params):
            return self

    # Stand-in data preparer whose fit does nothing.
    class DummyDataPreparer(BaseEstimator, TransformerMixin):
        def fit(self, X, y):
            return self

    # NOTE(review): ``return_value`` is the DummyDataPreparer class itself,
    # not an instance, and the patch targets the name in
    # ``foreshadow.preparer`` - confirm this is what Foreshadow looks up.
    mocker.patch("foreshadow.preparer.DataPreparer",
                 return_value=DummyDataPreparer)

    fs = Foreshadow(
        problem_type=ProblemType.REGRESSION,
        estimator=DummyRegressor(),
        optimizer=DummySearch,
    )
    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]

    fs.fit(x, y)
    # With default preparers, the last pipeline step exposes the regressor
    # through its ``estimator`` attribute.
    assert isinstance(fs.pipeline.steps[-1][1].estimator, DummyRegressor)

    fs2 = Foreshadow(
        problem_type=ProblemType.REGRESSION,
        X_preparer=False,
        y_preparer=False,
        estimator=DummyRegressor(),
        optimizer=DummySearch,
    )

    fs2.fit(x, y)
    # With both preparers disabled, the last pipeline step is the regressor
    # itself.
    assert isinstance(fs2.pipeline.steps[-1][1], DummyRegressor)
예제 #19
0
def test_core_foreshadow_example_classification():
    """Run the basic classification example on Iris and print the F1 score."""
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import f1_score
    from sklearn.model_selection import train_test_split

    from foreshadow.foreshadow import Foreshadow

    np.random.seed(0)

    dataset = load_iris()
    features = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    labels = pd.DataFrame(dataset.target, columns=["target"])
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.2)

    classifier = Foreshadow(estimator=LogisticRegression(),
                            problem_type=ProblemType.CLASSIFICATION)
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(X_test)
    score = f1_score(y_test, predictions, average="weighted")
    print("Iris score: %f" % score)
예제 #20
0
def test_foreshadow_defaults():
    """Check the default attributes of a freshly constructed Foreshadow."""
    from foreshadow.estimators import AutoEstimator
    from foreshadow.estimators import EstimatorWrapper
    from foreshadow.foreshadow import Foreshadow
    from foreshadow.preparer import DataPreparer

    default_model = Foreshadow(problem_type=ProblemType.CLASSIFICATION)

    # One assertion per default, checked in the same order as before.
    assert isinstance(default_model.X_preparer, DataPreparer)
    assert isinstance(default_model.y_preparer, DataPreparer)
    assert isinstance(default_model.estimator_wrapper, EstimatorWrapper)
    assert isinstance(default_model.estimator, AutoEstimator)
    # The optimizer attribute is not checked (that check is disabled).
    assert default_model.pipeline is None
    assert default_model.data_columns is None
예제 #21
0
def test_foreshadow_serialization_breast_cancer_non_auto_estimator():
    """Fit a LogisticRegression Foreshadow on breast cancer data.

    The score on the held-out 20% is printed, not asserted.
    """
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_breast_cancer
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    from foreshadow.foreshadow import Foreshadow

    np.random.seed(1337)

    dataset = load_breast_cancer()
    features = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    labels = pd.DataFrame(dataset.target, columns=["target"])

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.2)

    shadow = Foreshadow(estimator=LogisticRegression(),
                        problem_type=ProblemType.CLASSIFICATION)
    shadow.fit(X_train, y_train)

    print(shadow.score(X_test, y_test))
예제 #22
0
def test_foreshadow_pickling_and_unpickling_tpot(tmpdir):
    """Check that an unpickled TPOT pipeline scores like the live model.

    A TPOT-backed Foreshadow is fitted on breast cancer data and its fitted
    pipeline is pickled and loaded back; both must score the held-out data
    equally to two decimal places.

    Args:
        tmpdir: directory-like object whose ``join`` builds the pickle path.

    """
    import pickle
    import unittest

    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    from foreshadow.estimators import AutoEstimator
    from foreshadow.foreshadow import Foreshadow

    np.random.seed(1337)

    dataset = load_breast_cancer()
    features = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    labels = pd.DataFrame(dataset.target, columns=["target"])

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.2)

    tpot_estimator = AutoEstimator(
        problem_type=ProblemType.CLASSIFICATION,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1},
    )
    shadow = Foreshadow(estimator=tpot_estimator,
                        problem_type=ProblemType.CLASSIFICATION)

    pickle_path = tmpdir.join("fitted_pipeline.p")
    shadow.fit(X_train, y_train)
    shadow.pickle_fitted_pipeline(pickle_path)

    with open(pickle_path, "rb") as pickled:
        restored_pipeline = pickle.load(pickled)

    live_score = shadow.score(X_test, y_test)
    restored_score = restored_pipeline.score(X_test, y_test)

    # TPOT is randomized and only runs briefly here, so only the agreement
    # between the two scores is asserted, not an absolute score.
    # NOTE(review): an earlier comment said the precision was relaxed to 1
    # decimal place because of an Azure pipeline failure that could not be
    # reproduced locally, but the assertion uses places=2 - confirm.
    checker = unittest.TestCase("__init__")
    checker.assertAlmostEqual(live_score, restored_score, places=2)
예제 #23
0
def test_foreshadow_y_preparer(mocker):
    """Check predictions and score when ``y_preparer`` is replaced by a
    StandardScaler pipeline.

    The expected values below are hard-coded, so they depend on the seeded
    random state and on the order in which it is consumed.

    Args:
        mocker: object providing ``PropertyMock`` and ``patch.object``
            (presumably the pytest-mock fixture).

    """
    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from foreshadow.foreshadow import Foreshadow
    import pandas as pd

    np.random.seed(0)

    y_pipeline = Pipeline([("yohe", StandardScaler())])
    # Give the pipeline a ``pipeline`` attribute that points back at itself.
    # NOTE(review): presumably Foreshadow reads ``y_preparer.pipeline`` -
    # confirm.
    setattr(y_pipeline, "pipeline", y_pipeline)
    estimator = LinearRegression()

    # One binary feature column: fifty 0s followed by fifty 1s.
    X = pd.DataFrame(np.array([0] * 50 + [1] * 50).reshape((-1, 1)),
                     columns=["col1"])
    y = pd.DataFrame(np.random.normal(100, 10, 100).reshape((-1, 1)),
                     columns=["y"])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

    # Let foreshadow set to defaults, we will overwrite them
    y_preparer = mocker.PropertyMock(return_value=y_pipeline)
    mocker.patch.object(Foreshadow, "y_preparer", y_preparer)
    foreshadow = Foreshadow(problem_type=ProblemType.REGRESSION,
                            estimator=estimator)
    foreshadow.fit(X_train, y_train)
    foreshadow_predict = foreshadow.predict(X_test)
    foreshadow_score = foreshadow.score(X_test, y_test)
    # Hard-coded reference values for the ten held-out rows.
    expected_predict = np.array([
        [102.19044770619593],
        [102.19044770619593],
        [102.19044770619593],
        [100.05275170774354],
        [102.19044770619593],
        [102.19044770619593],
        [102.19044770619593],
        [102.19044770619593],
        [100.05275170774354],
        [100.05275170774354],
    ])
    expected_score = -0.3576910440975052

    assert np.allclose(foreshadow_predict, expected_predict)
    assert np.allclose(foreshadow_score, expected_score)
예제 #24
0
def test_set_processed_data_export_path():
    """Check that export paths for train and test data land in the config."""
    from sklearn.linear_model import LogisticRegression

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.utils import ConfigKey

    shadow = Foreshadow(estimator=LogisticRegression(),
                        problem_type=ProblemType.CLASSIFICATION)

    def current_config():
        # Looked up afresh on every call so each assertion sees the state
        # left behind by the preceding setter call.
        return shadow.X_preparer.cache_manager[AcceptedKey.CONFIG]

    train_export_path = "datapath1.csv"
    shadow.set_processed_data_export_path(data_path=train_export_path,
                                          is_train=True)
    stored_train_path = current_config()[
        ConfigKey.PROCESSED_TRAINING_DATA_EXPORT_PATH]
    assert stored_train_path == train_export_path

    test_export_path = "datapath2.csv"
    shadow.set_processed_data_export_path(data_path=test_export_path,
                                          is_train=False)
    stored_test_path = current_config()[
        ConfigKey.PROCESSED_TEST_DATA_EXPORT_PATH]
    assert stored_test_path == test_export_path
예제 #25
0
def test_foreshadow_configure_sampling():
    """Check that ``configure_sampling`` writes its settings to the config."""
    from sklearn.linear_model import LogisticRegression

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.utils import ConfigKey

    shadow = Foreshadow(estimator=LogisticRegression(),
                        problem_type=ProblemType.CLASSIFICATION)

    def config_value(key):
        # Looked up afresh on every call so each assertion sees the state
        # left behind by the preceding ``configure_sampling`` call.
        return shadow.X_preparer.cache_manager[AcceptedKey.CONFIG][key]

    # Sampling switched off.
    shadow.configure_sampling(enable_sampling=False)
    assert config_value(ConfigKey.ENABLE_SAMPLING) is False

    # Sampling switched on with an explicit fraction and no replacement.
    shadow.configure_sampling(enable_sampling=True,
                              sampling_fraction=0.3,
                              replace=False)
    assert config_value(ConfigKey.ENABLE_SAMPLING) is True
    assert config_value(ConfigKey.SAMPLING_FRACTION) == 0.3
    assert config_value(ConfigKey.SAMPLING_WITH_REPLACEMENT) is False
예제 #26
0
def generate_model(args):  # noqa: C901
    """Process command line args and generate a Foreshadow model to fit.

    The CSV named by the parsed arguments is loaded, the target column is
    separated from the features, and the data is split 80/20 into train and
    test parts. Level 1 builds a Foreshadow around the estimator returned by
    ``get_method``; level 3 builds one around a TPOT ``AutoEstimator`` whose
    ``max_time_mins`` is taken from the parsed ``time`` argument unless the
    estimator's existing kwargs already define it.

    Args:
        args (list): A list of string arguments to process

    Returns:
        tuple: A tuple of `fs, X_train, y_train, X_test, y_test` which \
            represents the foreshadow model along with the split data.

    Raises:
        ValueError: if the file cannot be loaded, the target column is
            invalid, or the level is neither 1 nor 3. When a lower-level
            error triggered the ValueError, it is attached as its cause.

    """
    cargs = process_argument(args)

    if cargs.level == 3 and cargs.method is not None:
        warnings.warn(
            "WARNING: Level 3 model search enabled. Method will be ignored.")

    if cargs.level != 3 and cargs.time != 10:
        warnings.warn("WARNING: Time parameter not applicable "
                      "to feature engineering. Must be in level 3.")

    try:
        df = pd.read_csv(cargs.data)
    except Exception as err:
        # Chain explicitly so the underlying I/O or parser error is kept as
        # the cause of the user-facing ValueError.
        raise ValueError(
            "Failed to load file. Please verify it exists and is a valid CSV."
        ) from err

    try:
        X_df = df.drop(columns=cargs.target)
        y_df = df[[cargs.target]]
    except Exception as err:
        raise ValueError("Invalid target variable") from err

    X_train, X_test, y_train, y_test = train_test_split(X_df,
                                                        y_df,
                                                        test_size=0.2)

    if cargs.level == 1:
        # Default everything with basic estimator
        fs = Foreshadow(
            problem_type=cargs.problem_type,
            estimator=get_method(cargs.method, y_train, cargs.family,
                                 cargs.problem_type),
        )

    # elif cargs.level == 2:
    #     # Parameter search on all matched intents
    #
    #     if cargs.x_config is not None:
    #         try:
    #             with open(cargs.x_config, "r") as f:
    #                 X_search = Preprocessor(from_json=json.load(f))
    #         except Exception:
    #             raise ValueError(
    #                 "Could not read X config file {}".format(cargs.x_config)
    #             )
    #         print("Reading config for X Preprocessor")
    #     else:
    #         X_search = search_intents(X_train)
    #         print("Searching over valid intent space for X data")
    #
    #     if cargs.y_config is not None:
    #         try:
    #             with open(cargs.y_config, "r") as f:
    #                 y_search = Preprocessor(from_json=json.load(f))
    #         except Exception:
    #             raise ValueError(
    #                 "Could not read y config file {}".format(cargs.y_config)
    #             )
    #         print("Reading config for y Preprocessor")
    #     else:
    #         y_search = search_intents(y_train, y_var=True)
    #         print("Searching over valid intent space for y data")
    #
    #     # If level 3 also do model parameter search with AutoEstimator
    #     # Input time limit into Foreshadow to be passed into AutoEstimator
    #
    #     fs = Foreshadow(
    #         X_preparer=X_search,
    #         y_preparer=y_search,
    #         estimator=get_method(cargs.method, y_train),
    #         optimizer=GridSearchCV,
    #     )
    #
    elif cargs.level == 3:
        # Default intent and advanced model search using 3rd party AutoML

        estimator = AutoEstimator(problem_type=cargs.problem_type, auto="tpot")
        estimator.construct_estimator(y_train)

        # TODO move this into the configure_estimator method "max_time_mins"
        #  is an argument for the TPOT library. We cannot assign it
        #   based on the problem type here. For testing purpose, I'm going
        #   to hardcode it for TPOT.
        # kwargs = (
        #     "max_time_mins"
        #     if estimator.problem_type == ProblemType.REGRESSION
        #     else "time_left_for_this_task"
        # )
        kwargs = "max_time_mins"
        # The existing kwargs are unpacked last, so a value already present
        # under the same key takes precedence over ``cargs.time``.
        estimator.estimator_kwargs = {
            kwargs: cargs.time,
            **estimator.estimator_kwargs,
        }

        fs = Foreshadow(problem_type=cargs.problem_type, estimator=estimator)

    else:
        raise ValueError("Invalid Level. Only levels 1 and 3 supported.")

    if cargs.multiprocess:
        fs.configure_multiprocessing(-1)
        logging.info("multiprocessing enabled.")

    return fs, X_train, y_train, X_test, y_test
예제 #27
0
def test_foreshadow_adults_small_user_provided_cleaner():
    """Register a user-defined lowercase cleaner and check its effect.

    A custom cleaner that only matches the ``workclass`` column is
    registered on the Foreshadow object; after running the second step of
    the X preparer, no value of ``workclass`` may contain an uppercase
    character.
    """
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    from foreshadow.concrete.internals.cleaners.customizable_base import (
        CustomizableBaseCleaner, )
    from foreshadow.foreshadow import Foreshadow

    np.random.seed(1337)

    adult = pd.read_csv(get_file_path("data", "adult_small.csv"))
    features = adult.loc[:, "age":"workclass"]
    labels = adult.loc[:, "class"]

    # Only the training features are needed by this test.
    X_train, _, _, _ = train_test_split(features, labels, test_size=0.2)

    shadow = Foreshadow(estimator=LogisticRegression(),
                        problem_type=ProblemType.CLASSIFICATION)

    def lowercase_row(row):
        """Return the lowercased string form of a row, or None unchanged.

        Args:
            row: a single cell value.

        Returns:
            ``None`` if the row is ``None``, otherwise ``str(row).lower()``.

        """
        # The customizable base cleaner is used so that this function does
        # not have to return a (value, matched length) pair itself.
        if row is None:
            return row
        return str(row).lower()

    class LowerCaseCleaner(CustomizableBaseCleaner):
        def __init__(self):
            super().__init__(transformation=lowercase_row)

        def metric_score(self, X: pd.DataFrame) -> float:
            """Return the confidence that this cleaner applies to a column.

            The score is 1 when the first column of ``X`` is named
            "workclass" and 0 otherwise.

            Args:
                X: a column as a dataframe.

            Returns:
                the confidence score.

            """
            # Users should know which cleaners already exist so they do not
            # add a duplicate or overlapping one.
            first_column = list(X.columns)[0]
            return 1 if first_column == "workclass" else 0

    shadow.register_customized_data_cleaner(data_cleaners=[LowerCaseCleaner])

    print(list(X_train["workclass"].unique()))

    # The second step of the X preparer is run on its own.
    cleaning_step = shadow.X_preparer.steps[1][1]
    cleaned = cleaning_step.fit_transform(X_train)

    for value in list(cleaned["workclass"].unique()):
        assert not any(char.isupper() for char in value)
예제 #28
0
def test_foreshadow_adults_small_classification_override_upfront():
    """Override intents before fitting and check they are reported back.

    Both ``age`` and ``workclass`` are forced to CATEGORICAL before ``fit``;
    the intents returned by ``get_intent`` and by the data summary must
    match. The score is printed, not asserted.
    """
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.intents import IntentType

    np.random.seed(1337)

    adult = pd.read_csv(get_file_path("data", "adult_small.csv"))
    features = adult.loc[:, "age":"workclass"]
    labels = adult.loc[:, "class"]

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.2)

    shadow = Foreshadow(estimator=LogisticRegression(),
                        problem_type=ProblemType.CLASSIFICATION)

    overridden_columns = ("age", "workclass")
    for column in overridden_columns:
        shadow.override_intent(column, IntentType.CATEGORICAL)

    shadow.fit(X_train, y_train)

    for column in overridden_columns:
        assert shadow.get_intent(column) == IntentType.CATEGORICAL

    score = shadow.score(X_test, y_test)

    summary = shadow.get_data_summary()
    # Three summary columns are expected: the two features and the label.
    assert summary.shape[1] == 3
    for column in overridden_columns:
        assert summary.at["intent", column] == IntentType.CATEGORICAL
    assert summary.at["intent", "class"] == "Label"

    print(score)