Example #1
def test_foreshadow_serialization_boston_housing_regression_multiprocessing(
        tmpdir):
    from foreshadow.foreshadow import Foreshadow
    # ProblemType import path assumed from the library's other tests.
    from foreshadow.utils import ProblemType
    import pandas as pd
    import numpy as np
    from sklearn.datasets import load_boston
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression

    np.random.seed(1337)

    boston = load_boston()
    X_df = pd.DataFrame(boston.data, columns=boston.feature_names)
    y_df = pd.DataFrame(boston.target, columns=["target"])

    X_train, X_test, y_train, y_test = train_test_split(X_df,
                                                        y_df,
                                                        test_size=0.2)

    shadow = Foreshadow(estimator=LinearRegression(),
                        problem_type=ProblemType.REGRESSION)

    shadow.configure_multiprocessing(n_job=-1)

    shadow.fit(X_train, y_train)

    score = shadow.score(X_test, y_test)
    print(score)
Example #2
def test_foreshadow_abort_on_empty_data_frame_after_cleaning(
        filename, problem_type, X_start, X_end, target):
    import pytest

    from foreshadow.foreshadow import Foreshadow
    # get_file_path is assumed to be the test-data helper from foreshadow.utils.testing.
    from foreshadow.utils.testing import get_file_path
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split

    np.random.seed(1337)

    data_path = get_file_path("data", filename)

    data = pd.read_csv(data_path)
    X_df = data.loc[:, X_start:X_end]
    y_df = data.loc[:, target]

    X_train, X_test, y_train, y_test = train_test_split(X_df,
                                                        y_df,
                                                        test_size=0.2)

    from foreshadow.estimators import AutoEstimator

    estimator = AutoEstimator(
        problem_type=problem_type,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1},
    )

    shadow = Foreshadow(estimator=estimator, problem_type=problem_type)

    with pytest.raises(ValueError) as excinfo:
        shadow.fit(X_train, y_train)
    error_msg = ("All columns are dropped since they all have over 90% of "
                 "missing values. Aborting foreshadow.")
    assert error_msg in str(excinfo.value)
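
For context, the ValueError asserted above fires when the cleaning step drops every column because each one is more than 90% missing. A small standalone sketch of that triggering condition (toy data, not the test fixture):

import numpy as np
import pandas as pd

# Toy frame in which every column is more than 90% missing values,
# so a cleaner that drops such columns would drop them all.
n_rows = 100
toy = pd.DataFrame({
    "a": [1.0] + [np.nan] * (n_rows - 1),
    "b": [np.nan] * n_rows,
})
missing_ratio = toy.isna().mean()
print(missing_ratio)                 # both columns above 0.9
print((missing_ratio > 0.9).all())   # True -> nothing would survive cleaning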
Example #3
def test_foreshadow_titanic(tmpdir):
    import pandas as pd
    from sklearn.model_selection import train_test_split

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.intents import IntentType
    from foreshadow.utils import ProblemType
    from foreshadow.utils.testing import get_file_path

    train_data = pd.read_csv(get_file_path("data", "titanic-train.csv"))
    X_train_df = train_data.loc[:, "Pclass":"Embarked"]
    y_train_df = train_data.loc[:, "Survived"]

    X_train_df = X_train_df.drop(columns=["SibSp", "Parch", "Cabin"])

    X_train, X_test, y_train, y_test = train_test_split(X_train_df,
                                                        y_train_df,
                                                        test_size=0.2,
                                                        random_state=42)

    from foreshadow.estimators import AutoEstimator

    estimator = AutoEstimator(
        problem_type=ProblemType.CLASSIFICATION,
        auto="tpot",
        estimator_kwargs={
            "max_time_mins": 1,
            "random_state": 42
        },
    )

    shadow = Foreshadow(estimator=estimator,
                        problem_type=ProblemType.CLASSIFICATION)

    shadow.override_intent(column_name="Name", intent=IntentType.TEXT)
    shadow.fit(X_train, y_train)

    score = shadow.score(X_test, y_test)
    print(score)
Example #4
def test_foreshadow_serialization_adults_small_classification_override():
    from foreshadow.foreshadow import Foreshadow
    from foreshadow.utils import ProblemType
    from foreshadow.utils.testing import get_file_path
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression

    np.random.seed(1337)

    data_path = get_file_path("data", "adult_small.csv")

    adult = pd.read_csv(data_path)
    X_df = adult.loc[:, "age":"workclass"]
    y_df = adult.loc[:, "class"]

    X_train, X_test, y_train, y_test = train_test_split(X_df,
                                                        y_df,
                                                        test_size=0.2)

    shadow = Foreshadow(estimator=LogisticRegression(),
                        problem_type=ProblemType.CLASSIFICATION)
    shadow.fit(X_train, y_train)
    score1 = shadow.score(X_test, y_test)

    from foreshadow.intents import IntentType

    shadow.override_intent("age", IntentType.CATEGORICAL)
    shadow.override_intent("workclass", IntentType.CATEGORICAL)
    shadow.fit(X_train, y_train)

    assert shadow.get_intent("age") == IntentType.CATEGORICAL
    assert shadow.get_intent("workclass") == IntentType.CATEGORICAL
    score2 = shadow.score(X_test, y_test)
    print(score1, score2)
Example #5
def test_foreshadow_integration_data_cleaner_can_drop(filename, problem_type,
                                                      X_start, X_end, target,
                                                      tmpdir):
    from foreshadow.foreshadow import Foreshadow
    from foreshadow.utils.testing import get_file_path
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split

    np.random.seed(1337)

    data_path = get_file_path("data", filename)

    data = pd.read_csv(data_path)
    # local_file_folder = "examples"
    # data = pd.read_csv("/".join([local_file_folder, filename]))

    X_df = data.loc[:, X_start:X_end]
    y_df = data.loc[:, target]

    X_train, X_test, y_train, y_test = train_test_split(X_df,
                                                        y_df,
                                                        test_size=0.2)

    from foreshadow.estimators import AutoEstimator

    estimator = AutoEstimator(
        problem_type=problem_type,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1},
    )

    shadow = Foreshadow(estimator=estimator, problem_type=problem_type)

    pickled_fitted_pipeline_location = tmpdir.join("fitted_pipeline.p")
    shadow.fit(X_train, y_train)
    shadow.pickle_fitted_pipeline(pickled_fitted_pipeline_location)

    import pickle

    with open(pickled_fitted_pipeline_location, "rb") as fopen:
        pipeline = pickle.load(fopen)

    # If there are new empty columns in the test set, the program should
    # not fail.
    X_test[X_start] = np.nan
    score1 = shadow.score(X_test, y_test)
    score2 = pipeline.score(X_test, y_test)

    import unittest

    assertions = unittest.TestCase("__init__")
    # Given the randomness of the TPOT algorithm and the short run time
    # configured here, there is no guarantee that performance converges.
    # The test checks that both runs produce a reasonable score and that
    # the difference between them is small.
    # assert score1 > 0.76 and score2 > 0.76
    assertions.assertAlmostEqual(score1, score2, places=2)
Example #6
def test_text_classification_foreshadow():
    import pandas as pd

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.utils import ProblemType

    categories = [
        "alt.atheism",
        "soc.religion.christian",
        "comp.graphics",
        "sci.med",
    ]
    from sklearn.datasets import fetch_20newsgroups

    twenty_train = fetch_20newsgroups(subset="train",
                                      categories=categories,
                                      shuffle=True,
                                      random_state=42)
    X_train = pd.DataFrame(
        data=twenty_train.data,
        columns=["text"],
        index=list(range(len(twenty_train.data))),
    )
    y_train = pd.Series(
        data=twenty_train.target,
        name="category",
        index=list(range(len(twenty_train.target))),
    )

    twenty_test = fetch_20newsgroups(subset="test",
                                     categories=categories,
                                     shuffle=True,
                                     random_state=42)
    X_test = pd.DataFrame(data=twenty_test.data, columns=["text"])
    y_test = pd.Series(data=twenty_test.target, name="category")

    from foreshadow.estimators import AutoEstimator

    estimator = AutoEstimator(
        problem_type=ProblemType.CLASSIFICATION,
        auto="tpot",
        estimator_kwargs={
            "max_time_mins": 1,
            "random_state": 42
        },
    )

    shadow = Foreshadow(estimator=estimator,
                        problem_type=ProblemType.CLASSIFICATION)

    shadow.fit(X_train, y_train)

    score = shadow.score(X_test, y_test)
    print(score)  # this gives about 87.5%
Example #7
def test_foreshadow_pickling_and_unpickling_tpot(tmpdir):
    from foreshadow.foreshadow import Foreshadow
    from foreshadow.utils import ProblemType
    import pandas as pd
    import numpy as np
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    np.random.seed(1337)

    cancer = load_breast_cancer()
    cancerX_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
    cancery_df = pd.DataFrame(cancer.target, columns=["target"])

    X_train, X_test, y_train, y_test = train_test_split(cancerX_df,
                                                        cancery_df,
                                                        test_size=0.2)

    from foreshadow.estimators import AutoEstimator

    estimator = AutoEstimator(
        problem_type=ProblemType.CLASSIFICATION,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1},
    )

    shadow = Foreshadow(estimator=estimator,
                        problem_type=ProblemType.CLASSIFICATION)
    pickled_file_location = tmpdir.join("fitted_pipeline.p")
    shadow.fit(X_train, y_train)
    shadow.pickle_fitted_pipeline(pickled_file_location)

    import pickle

    with open(pickled_file_location, "rb") as fopen:
        pipeline = pickle.load(fopen)

    score1 = shadow.score(X_test, y_test)
    score2 = pipeline.score(X_test, y_test)

    import unittest

    assertions = unittest.TestCase("__init__")
    # Given the randomness of the TPOT algorithm and the short run time
    # configured here, there is no guarantee that performance converges.
    # The test checks that both runs produce a reasonable score and that
    # the difference between them is small.

    # The comparison tolerance was loosened after a failure on the Azure
    # pipeline that could not be reproduced locally.
    assertions.assertAlmostEqual(score1, score2, places=2)
Example #8
def test_foreshadow_param_optimize_fit(mocker):
    import pandas as pd
    from foreshadow.base import BaseEstimator, TransformerMixin
    from sklearn.model_selection._search import BaseSearchCV

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.utils import ProblemType
    from foreshadow.utils.testing import get_file_path

    boston_path = get_file_path("data", "boston_housing.csv")
    data = pd.read_csv(boston_path)

    class DummyRegressor(BaseEstimator, TransformerMixin):
        def fit(self, X, y):
            return self

    class DummySearch(BaseSearchCV):
        def __init__(self, estimator, params):
            self.best_estimator_ = estimator

        def fit(self, X, y=None, **fit_params):
            return self

    class DummyDataPreparer(BaseEstimator, TransformerMixin):
        def fit(self, X, y):
            return self

    mocker.patch("foreshadow.preparer.DataPreparer",
                 return_value=DummyDataPreparer)

    fs = Foreshadow(
        problem_type=ProblemType.REGRESSION,
        estimator=DummyRegressor(),
        optimizer=DummySearch,
    )
    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]

    fs.fit(x, y)
    assert isinstance(fs.pipeline.steps[-1][1].estimator, DummyRegressor)

    fs2 = Foreshadow(
        problem_type=ProblemType.REGRESSION,
        X_preparer=False,
        y_preparer=False,
        estimator=DummyRegressor(),
        optimizer=DummySearch,
    )

    fs2.fit(x, y)
    assert isinstance(fs2.pipeline.steps[-1][1], DummyRegressor)
Example #9
def test_foreshadow_y_preparer(mocker):
    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from foreshadow.foreshadow import Foreshadow
    from foreshadow.utils import ProblemType
    import pandas as pd

    np.random.seed(0)

    y_pipeline = Pipeline([("yohe", StandardScaler())])
    setattr(y_pipeline, "pipeline", y_pipeline)
    estimator = LinearRegression()

    X = pd.DataFrame(np.array([0] * 50 + [1] * 50).reshape((-1, 1)),
                     columns=["col1"])
    y = pd.DataFrame(np.random.normal(100, 10, 100).reshape((-1, 1)),
                     columns=["y"])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

    # Let Foreshadow use its defaults; the y_preparer is overridden below.
    y_preparer = mocker.PropertyMock(return_value=y_pipeline)
    mocker.patch.object(Foreshadow, "y_preparer", y_preparer)
    foreshadow = Foreshadow(problem_type=ProblemType.REGRESSION,
                            estimator=estimator)
    foreshadow.fit(X_train, y_train)
    foreshadow_predict = foreshadow.predict(X_test)
    foreshadow_score = foreshadow.score(X_test, y_test)
    expected_predict = np.array([
        [102.19044770619593],
        [102.19044770619593],
        [102.19044770619593],
        [100.05275170774354],
        [102.19044770619593],
        [102.19044770619593],
        [102.19044770619593],
        [102.19044770619593],
        [100.05275170774354],
        [100.05275170774354],
    ])
    expected_score = -0.3576910440975052

    assert np.allclose(foreshadow_predict, expected_predict)
    assert np.allclose(foreshadow_score, expected_score)
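
The expected predictions above are on the original target scale (around 100), even though the mocked y_preparer standard-scales y; Foreshadow evidently inverts the target transformation at prediction time. A standalone sketch (not part of the test) showing why that round trip leaves a linear model's predictions unchanged:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 1))
y = 3.0 * X[:, 0] + rng.normal(scale=0.1, size=100)

# Fit directly on the raw target.
direct = LinearRegression().fit(X, y).predict(X)

# Fit on the scaled target, then undo the scaling on the predictions,
# mirroring what a StandardScaler y_preparer pipeline would do.
scaler = StandardScaler()
y_scaled = scaler.fit_transform(y.reshape(-1, 1)).ravel()
scaled_pred = LinearRegression().fit(X, y_scaled).predict(X)
inverted = scaler.inverse_transform(scaled_pred.reshape(-1, 1)).ravel()

print(np.allclose(direct, inverted))  # True: the predictions agree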
Example #10
def test_core_foreshadow_example_regression():
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_boston
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split
    from foreshadow.foreshadow import Foreshadow
    from foreshadow.utils import ProblemType

    np.random.seed(0)
    boston = load_boston()
    bostonX_df = pd.DataFrame(boston.data, columns=boston.feature_names)
    bostony_df = pd.DataFrame(boston.target, columns=["target"])
    X_train, X_test, y_train, y_test = train_test_split(bostonX_df,
                                                        bostony_df,
                                                        test_size=0.2)
    model = Foreshadow(estimator=LinearRegression(),
                       problem_type=ProblemType.REGRESSION)
    model.fit(X_train, y_train)
    score = r2_score(y_test, model.predict(X_test))
    print("Boston score: %f" % score)
Example #11
def test_core_foreshadow_example_classification():
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import f1_score
    from sklearn.model_selection import train_test_split
    from foreshadow.foreshadow import Foreshadow
    from foreshadow.utils import ProblemType

    np.random.seed(0)
    iris = load_iris()
    irisX_df = pd.DataFrame(iris.data, columns=iris.feature_names)
    irisy_df = pd.DataFrame(iris.target, columns=["target"])
    X_train, X_test, y_train, y_test = train_test_split(irisX_df,
                                                        irisy_df,
                                                        test_size=0.2)

    model = Foreshadow(estimator=LogisticRegression(),
                       problem_type=ProblemType.CLASSIFICATION)
    model.fit(X_train, y_train)

    score = f1_score(y_test, model.predict(X_test), average="weighted")
    print("Iris score: %f" % score)
Example #12
def test_foreshadow_serialization_breast_cancer_non_auto_estimator():
    from foreshadow.foreshadow import Foreshadow
    from foreshadow.utils import ProblemType
    import pandas as pd
    import numpy as np
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression

    np.random.seed(1337)

    cancer = load_breast_cancer()
    cancerX_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
    cancery_df = pd.DataFrame(cancer.target, columns=["target"])

    X_train, X_test, y_train, y_test = train_test_split(cancerX_df,
                                                        cancery_df,
                                                        test_size=0.2)

    shadow = Foreshadow(estimator=LogisticRegression(),
                        problem_type=ProblemType.CLASSIFICATION)

    shadow.fit(X_train, y_train)
    score = shadow.score(X_test, y_test)
    print(score)
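
Example #12 stops at scoring even though its name mentions serialization; a minimal follow-up sketch, continuing directly from the variables above and reusing the pickle_fitted_pipeline call shown in Examples #5 and #7 (the file name is illustrative):

import pickle

# Persist the fitted pipeline (illustrative path).
shadow.pickle_fitted_pipeline("fitted_pipeline.p")

# Reload it and confirm it scores the same hold-out data.
with open("fitted_pipeline.p", "rb") as fopen:
    pipeline = pickle.load(fopen)
print(pipeline.score(X_test, y_test))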