Example #1
def test_do_almost_nothing():
    X, target = get_random_data()
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=0, featsel_runs=0)
    df = afreg.fit_transform(pd.DataFrame(X, columns=["x1", "x2", "x3"]),
                             target)
    assert list(df.columns) == ["x1", "x2", "x3"], "Only original columns"
    df = afreg.transform(pd.DataFrame(X, columns=["x1", "x2", "x3"]))
    assert list(df.columns) == ["x1", "x2", "x3"], "Only original columns"
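These tests rely on a get_random_data() helper that is not shown in the listings. A plausible minimal implementation, inferred from the data generation in Example #4 and Example #5 (the seed and the exact target formula are assumptions), would be:

import numpy as np

def get_random_data(seed=15):
    # three random feature columns and a nonlinear target, mirroring test_units / test_categorical_cols
    np.random.seed(seed)
    x1 = np.random.rand(1000)
    x2 = np.random.randn(1000)
    x3 = np.random.rand(1000)
    target = 2 + 15 * x1 + 3 / (x2 - 1 / x3) + 5 * (x2 + np.log(x1))**3
    X = np.vstack([x1, x2, x3]).T
    return X, target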
Example #2
def test_regular_df_X_y():
    # autofeat with df without column names
    X, target = get_random_data()
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=3)
    df = afreg.fit_transform(pd.DataFrame(X), pd.DataFrame(target))
    # score once with original, once with transformed data
    assert afreg.score(pd.DataFrame(X), target) >= 0.999, "R^2 should be 1."
    assert afreg.score(df, target) >= 0.999, "R^2 should be 1."
    assert list(df.columns)[:3] == ["0", "1", "2"], "Wrong column names"
Example #3
def test_regular_X_y():
    # autofeat with numpy arrays
    X, target = get_random_data()
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=3)
    df = afreg.fit_transform(X, target)
    assert afreg.score(X, target) >= 0.999, "R^2 should be 1."
    assert afreg.score(df, target) >= 0.999, "R^2 should be 1."
    assert list(df.columns)[:3] == ["x000", "x001",
                                    "x002"], "Wrong column names"
Example #4
def test_units():
    np.random.seed(15)
    x1 = np.random.rand(1000)
    x2 = np.random.randn(1000)
    x3 = np.random.rand(1000)
    target = 2 + 15 * x1 + 3 / (x2 - 1 / x3) + 5 * (x2 * np.log(x1))**3
    X = np.vstack([x1, x2, x3]).T
    units = {"x2": "m/sec", "x3": "min/mm"}
    afreg = AutoFeatRegressor(verbose=1, units=units, feateng_steps=3)
    _ = afreg.fit_transform(pd.DataFrame(X, columns=["x1", "x2", "x3"]),
                            target)
    assert afreg.score(pd.DataFrame(X, columns=["x1", "x2", "x3"]),
                       target) >= 0.999, "R^2 should be 1."
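The units dictionary tells autofeat to combine features only in dimensionally consistent ways; the unit strings follow the notation of the pint library. As a standalone illustration of the unit arithmetic involved (not autofeat's internal code), the units carried by a ratio such as x2/x3 can be computed with pint directly:

import pint

ureg = pint.UnitRegistry()
x2_unit = ureg("m/sec")         # unit of x2
x3_unit = ureg("min/mm")        # unit of x3
ratio_unit = x2_unit / x3_unit  # unit a feature like x2/x3 would carry
print(ratio_unit.to_base_units())  # meter ** 2 / second ** 2, up to a scale factor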
Example #5
def test_categorical_cols():
    np.random.seed(15)
    x1 = np.random.rand(1000)
    x2 = np.random.randn(1000)
    x3 = np.random.rand(1000)
    x4 = np.array(200 * [4] + 300 * [5] + 500 * [2], dtype=int)
    target = 2 + 15 * x1 + 3 / (x2 - 1 / x3) + 5 * (x2 + np.log(x1))**3 + x4
    X = np.vstack([x1, x2, x3, x4]).T
    afreg = AutoFeatRegressor(verbose=1,
                              categorical_cols=["x4", "x5"],
                              feateng_steps=3)
    try:
        df = afreg.fit_transform(
            pd.DataFrame(X, columns=["x1", "x2", "x3", "x4"]), target)
    except ValueError:
        pass
    else:
        raise AssertionError(
            "categorical_cols not in df should throw an error")
    afreg = AutoFeatRegressor(verbose=1,
                              categorical_cols=["x4"],
                              feateng_steps=3)
    df = afreg.fit_transform(pd.DataFrame(X, columns=["x1", "x2", "x3", "x4"]),
                             target)
    assert list(df.columns)[3:6] == [
        "cat_x4_2.0", "cat_x4_4.0", "cat_x4_5.0"
    ], "categorical_cols were not transformed correctly"
    assert "x4" not in df.columns, "categorical_cols weren't deleted from df"
    df = afreg.transform(pd.DataFrame(X, columns=["x1", "x2", "x3", "x4"]))
    assert list(df.columns)[3:6] == [
        "cat_x4_2.0", "cat_x4_4.0", "cat_x4_5.0"
    ], "categorical_cols were not transformed correctly"
    assert "x4" not in df.columns, "categorical_cols weren't deleted from df"
    assert afreg.score(pd.DataFrame(X, columns=["x1", "x2", "x3", "x4"]),
                       target) >= 0.999, "R^2 should be 1."
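The cat_x4_* columns asserted above are a one-hot encoding of x4, with the original column removed afterwards. The resulting layout can be reproduced with plain pandas; this is only an illustration, not autofeat's internal code:

import numpy as np
import pandas as pd

x4 = np.array(200 * [4] + 300 * [5] + 500 * [2], dtype=float)
one_hot = pd.get_dummies(pd.Series(x4), prefix="cat_x4")
print(list(one_hot.columns))  # ['cat_x4_2.0', 'cat_x4_4.0', 'cat_x4_5.0']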
Example #6
def test_nans():
    # nans are ok in transform but not fit or predict (due to sklearn model)
    X, target = get_random_data()
    X[998, 0] = np.nan
    X[999, 1] = np.nan
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=3)
    try:
        _ = afreg.fit_transform(pd.DataFrame(X, columns=["x 1.1", 2, "x/3"]),
                                pd.DataFrame(target))
    except ValueError:
        pass
    else:
        raise AssertionError("fit with NaNs should throw an error")
    _ = afreg.fit_transform(pd.DataFrame(X[:900], columns=["x 1.1", 2, "x/3"]),
                            pd.DataFrame(target[:900]))
    try:
        _ = afreg.predict(pd.DataFrame(X[900:], columns=["x 1.1", 2, "x/3"]))
    except ValueError:
        pass
    else:
        raise AssertionError("predict with NaNs should throw an error")
    df = afreg.transform(pd.DataFrame(X, columns=["x 1.1", 2, "x/3"]))
    assert all([pd.isna(df.iloc[998, 0]),
                pd.isna(df.iloc[999, 1])]), "Original features should be NaNs"
    assert np.sum(
        np.array(pd.isna(df.iloc[998]),
                 dtype=int)) >= 2, "There should be at least 2 NaNs in row 998"
    assert np.sum(
        np.array(pd.isna(df.iloc[999]),
                 dtype=int)) >= 2, "There should be at least 2 NaNs in row 999"
Example #7
def test_weird_colnames():
    # autofeat with df with weird column names
    X, target = get_random_data()
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=3)
    df = afreg.fit_transform(pd.DataFrame(X, columns=["x 1.1", 2, "x/3"]),
                             pd.DataFrame(target))
    assert afreg.score(pd.DataFrame(X, columns=["x 1.1", 2, "x/3"]),
                       target) >= 0.999, "R^2 should be 1."
    assert list(df.columns)[:3] == ["x 1.1", "2", "x/3"], "Wrong column names"
    # error if the column names aren't the same as before
    try:
        afreg.score(pd.DataFrame(X, columns=["x 11", 2, "x/3"]), target)
    except ValueError:
        pass
    else:
        raise AssertionError("Should throw error on mismatch column names")
Example #8
def test_feateng_cols():
    X, target = get_random_data()
    afreg = AutoFeatRegressor(verbose=1,
                              feateng_cols=["x1", "x3", "x4"],
                              feateng_steps=3)
    try:
        df = afreg.fit_transform(pd.DataFrame(X, columns=["x1", "x2", "x3"]),
                                 target)
    except ValueError:
        pass
    else:
        raise AssertionError("feateng_cols not in df should throw an error")
    afreg = AutoFeatRegressor(verbose=1,
                              feateng_cols=["x1", "x3"],
                              feateng_steps=3)
    df = afreg.fit_transform(pd.DataFrame(X, columns=["x1", "x2", "x3"]),
                             target)
    for c in df.columns[3:]:
        assert "x2" not in c, "only feateng_cols should occur in engineered features"
Example #9
    def regression(self):
        X_train, X_test, y_train, y_test, categoricalCols = self.dataFunction(
            preprocessed=True,
            specifics="AUTOFEAT",
            trainSize=self.trainSize,
            steps=self.steps,
            nDataPoints=self.nDataPoints)

        featureEngColumns = None
        # If feature engineering is not wanted for the categorical columns, uncomment:
        # if categoricalCols is not None:
        #    featureEngColumns = X_train.columns.values.tolist()
        #    featureEngColumns = [i for i in featureEngColumns + categoricalCols if i not in featureEngColumns or i not in categoricalCols]

        # Measure runtime
        start_time = time.time()
        print(f"Start time: {start_time}")

        # Automated feature engineering with autofeat
        model = AutoFeatRegressor(verbose=1,
                                  feateng_steps=self.feateng_steps,
                                  featsel_runs=self.featuresel_steps,
                                  categorical_cols=categoricalCols,
                                  feateng_cols=featureEngColumns)

        # Fit model and get transformed dataframe with additional features
        x_train_extended = model.fit_transform(X_train, y_train)
        total_time = int(divmod(time.time() - start_time, 60)[0])
        print(f"Time: {total_time} min")

        # Export model
        dump(
            model,
            f"{self.savePath}/feng{model.feateng_steps}_fsel{model.featsel_runs}_time{total_time}_model.joblib"
        )

        x_test_extended = model.transform(X_test)

        # Predictions
        predictions = {}

        predictionModel = DecisionTreeRegressor()
        predictionModel.fit(x_train_extended, y_train)
        predictions["DecisionTree"] = mean_squared_error(
            y_test, predictionModel.predict(x_test_extended))
        print(f"Final MSE prediction score: {predictions['DecisionTree']}")

        predictionModel = RandomForestRegressor(n_estimators=10)
        predictionModel.fit(x_train_extended, y_train)
        predictions["RandomForest"] = mean_squared_error(
            y_test, predictionModel.predict(x_test_extended))
        print(f"Final MSE prediction score: {predictions['RandomForest']}")

        predictionModel = LinearRegression()
        predictionModel.fit(x_train_extended, y_train)
        predictions["LinearRegression"] = mean_squared_error(
            y_test, predictionModel.predict(x_test_extended))
        print(f"Final MSE prediction score: {predictions['LinearRegression']}")

        predictionModel = LassoLarsCV(cv=5)
        predictionModel.fit(x_train_extended, y_train)
        predictions["LassoLarsCV"] = mean_squared_error(
            y_test, predictionModel.predict(x_test_extended))
        print(f"Final MSE prediction score: {predictions['LassoLarsCV']}")

        # Additionally record the engineered feature names, since they are not saved in the joblib file
        predictions["new_features"] = model.new_feat_cols_

        # Export predictions
        with open(
                f"{self.savePath}/feng{model.feateng_steps}_fsel{model.featsel_runs}_performance.pkl",
                'wb') as file:
            pickle.dump(predictions, file)

        return model
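The exported artifacts can be loaded back later to reuse the learned transformations; a minimal sketch (the file names below are placeholders for the paths written by the method above, and X_new stands for a new DataFrame with the original columns):

import pickle
from joblib import load

# reload the fitted AutoFeatRegressor and apply its transformations to new data
model = load("feng2_fsel5_time3_model.joblib")  # placeholder file name
# X_new_extended = model.transform(X_new)       # X_new: new DataFrame with the same original columns

# reload the recorded MSE scores and the list of engineered feature names
with open("feng2_fsel5_performance.pkl", "rb") as file:
    predictions = pickle.load(file)
print(predictions["new_features"])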
Example #10
def test_classification():
    # autofeat with numpy arrays, but as a classification task
    X, target = get_random_data()
    target = np.array(target > target.mean(), dtype=int)
    afreg = AutoFeatClassifier(verbose=1, feateng_steps=3)
    df = afreg.fit_transform(X, target)
    assert afreg.score(X, target) >= 0.9999, "Accuracy should be 1."
    assert afreg.score(df, target) >= 0.9999, "Accuracy should be 1."
    assert list(df.columns)[:3] == ["x000", "x001",
                                    "x002"], "Wrong column names"


if __name__ == '__main__':
    print("## Running sklearn Regressor tests")
    # we allow for nan in transform
    successful_tests = set(["check_estimators_nan_inf"])
    for estimator, check in check_estimator(AutoFeatRegressor(
            feateng_steps=1, featsel_runs=1, always_return_numpy=True),
                                            generate_only=True):
        if check.func.__name__ not in successful_tests:
            print(check.func.__name__)
            successful_tests.add(check.func.__name__)
            check(estimator)
    # additionally check the class, but don't run all the other tests
    for estimator, check in check_estimator(AutoFeatRegressor,
                                            generate_only=True):
        if check.func.__name__ not in successful_tests:
            print(check.func.__name__)
            successful_tests.add(check.func.__name__)
            check(estimator)

    print("## Running sklearn Classifier tests")
    # we allow for nan in transform
Example #11
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from autofeat import FeatureSelector, AutoFeatRegressor
from sklearn.pipeline import make_pipeline
import pickle
def main():
    # Get the dataset from the user's GitHub repository
    dataset_path = "https://raw.githubusercontent.com/" + os.environ["GITHUB_REPOSITORY"] + "/master/dataset.csv"
    df = pd.read_csv(dataset_path)
    print()
    print(df.describe())
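FeatureSelector and make_pipeline are imported above but not used in this snippet. A minimal sketch of one plausible way to combine them, assuming FeatureSelector's scikit-learn-style fit/transform interface (the toy data and the Ridge estimator are illustrative choices, not part of the original example):

import numpy as np
import pandas as pd
from autofeat import FeatureSelector
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline

# toy data: only x1 and x2 actually drive the target
np.random.seed(0)
X_toy = pd.DataFrame(np.random.rand(500, 5), columns=[f"x{i}" for i in range(5)])
y_toy = 3 * X_toy["x1"] - 2 * X_toy["x2"]
# select informative columns first, then fit a plain linear model on them
pipe = make_pipeline(FeatureSelector(verbose=0), Ridge())
pipe.fit(X_toy, y_toy)
print("Pipeline R^2:", pipe.score(X_toy, y_toy))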

for steps in range(5):
    np.random.seed(55)
    print("### AutoFeat with %i feateng_steps" % steps)
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=steps)
    df = afreg.fit_transform(df_org, target)
    r2 = afreg.score(df_org, target)
    print("## Final R^2: %.4f" % r2)
    plt.figure()
    plt.scatter(afreg.predict(df_org), target, s=2);
    plt.title("%i FE steps (R^2: %.4f; %i new features)" % (steps, r2, len(afreg.new_feat_cols_)))
afreg = AutoFeatRegressor(verbose=1, feateng_steps=3)
# train on noisy data
df = afreg.fit_transform(df_org, target_noisy)
# test on real targets
print("Final R^2: %.4f" % afreg.score(df, target))
plt.figure()
plt.scatter(afreg.predict(df), target, s=2);
afreg = AutoFeatRegressor(verbose=1, feateng_steps=3)
# train on noisy data