def test_do_almost_nothing():
    """Check that 0 feateng steps and 0 featsel runs leave the columns untouched.

    Both ``fit_transform`` and a subsequent ``transform`` must hand back a
    dataframe whose columns are exactly the three input columns.
    """
    column_names = ["x1", "x2", "x3"]
    X, target = get_random_data()
    model = AutoFeatRegressor(verbose=1, feateng_steps=0, featsel_runs=0)

    # fitting must not add any engineered features
    train_frame = pd.DataFrame(X, columns=column_names)
    fitted = model.fit_transform(train_frame, target)
    assert list(fitted.columns) == ["x1", "x2", "x3"], "Only original columns"

    # neither must a plain transform on a fresh dataframe
    fresh_frame = pd.DataFrame(X, columns=column_names)
    transformed = model.transform(fresh_frame)
    assert list(transformed.columns) == ["x1", "x2", "x3"], "Only original columns"
def test_regular_df_X_y():
    """Fit on dataframes built without explicit column names.

    The score is checked on both the raw and the transformed data, and the
    first three output columns must be the stringified positional names.
    """
    # autofeat with df without column names
    X, target = get_random_data()
    model = AutoFeatRegressor(verbose=1, feateng_steps=3)
    features = pd.DataFrame(X)
    labels = pd.DataFrame(target)
    transformed = model.fit_transform(features, labels)

    # score once with original, once with transformed data
    raw_score = model.score(pd.DataFrame(X), target)
    assert raw_score >= 0.999, "R^2 should be 1."
    transformed_score = model.score(transformed, target)
    assert transformed_score >= 0.999, "R^2 should be 1."

    leading_columns = list(transformed.columns)[:3]
    assert leading_columns == ["0", "1", "2"], "Wrong column names"
def test_regular_X_y():
    """Fit directly on numpy arrays and check scores and generated names.

    The first three columns of the returned dataframe must be called
    ``x000``, ``x001`` and ``x002``.
    """
    # autofeat with numpy arrays
    X, target = get_random_data()
    model = AutoFeatRegressor(verbose=1, feateng_steps=3)
    transformed = model.fit_transform(X, target)

    raw_score = model.score(X, target)
    assert raw_score >= 0.999, "R^2 should be 1."
    transformed_score = model.score(transformed, target)
    assert transformed_score >= 0.999, "R^2 should be 1."

    leading_columns = list(transformed.columns)[:3]
    assert leading_columns == ["x000", "x001", "x002"], "Wrong column names"
def test_units():
    """Fit with a ``units`` mapping for x2 and x3 and expect a near-perfect R^2."""
    column_names = ["x1", "x2", "x3"]

    # draw order (rand, randn, rand) matters for reproducibility with this seed
    np.random.seed(15)
    x1 = np.random.rand(1000)
    x2 = np.random.randn(1000)
    x3 = np.random.rand(1000)

    # same left-to-right summation as a single expression would give
    linear_part = 2 + 15 * x1
    ratio_part = 3 / (x2 - 1 / x3)
    cubic_part = 5 * (x2 * np.log(x1))**3
    target = linear_part + ratio_part + cubic_part

    X = np.vstack([x1, x2, x3]).T
    units = {"x2": "m/sec", "x3": "min/mm"}
    model = AutoFeatRegressor(verbose=1, units=units, feateng_steps=3)
    _ = model.fit_transform(pd.DataFrame(X, columns=column_names), target)

    r2 = model.score(pd.DataFrame(X, columns=column_names), target)
    assert r2 >= 0.999, "R^2 should be 1."
def test_categorical_cols():
    """Exercise the ``categorical_cols`` option.

    First, naming a categorical column that is missing from the dataframe
    must raise ``ValueError``. Then, with a valid column, the output of both
    ``fit_transform`` and ``transform`` must contain the ``cat_x4_*`` columns
    at positions 3..5 and no longer contain ``x4``.
    """
    column_names = ["x1", "x2", "x3", "x4"]
    expected_cat_columns = ["cat_x4_2.0", "cat_x4_4.0", "cat_x4_5.0"]

    np.random.seed(15)
    x1 = np.random.rand(1000)
    x2 = np.random.randn(1000)
    x3 = np.random.rand(1000)
    x4 = np.array(200 * [4] + 300 * [5] + 500 * [2], dtype=int)
    target = 2 + 15 * x1 + 3 / (x2 - 1 / x3) + 5 * (x2 + np.log(x1))**3 + x4
    X = np.vstack([x1, x2, x3, x4]).T

    # "x5" does not exist in the dataframe, so fitting has to fail
    model = AutoFeatRegressor(verbose=1, categorical_cols=["x4", "x5"], feateng_steps=3)
    rejected = False
    try:
        model.fit_transform(pd.DataFrame(X, columns=column_names), target)
    except ValueError:
        rejected = True
    if not rejected:
        raise AssertionError(
            "categorical_cols not in df should throw an error")

    # valid configuration: only "x4" is categorical
    model = AutoFeatRegressor(verbose=1, categorical_cols=["x4"], feateng_steps=3)
    fitted = model.fit_transform(pd.DataFrame(X, columns=column_names), target)
    assert list(fitted.columns)[3:6] == expected_cat_columns, \
        "categorical_cols were not transformed correctly"
    assert "x4" not in fitted.columns, "categorical_cols weren't deleted from df"

    transformed = model.transform(pd.DataFrame(X, columns=column_names))
    assert list(transformed.columns)[3:6] == expected_cat_columns, \
        "categorical_cols were not transformed correctly"
    assert "x4" not in transformed.columns, "categorical_cols weren't deleted from df"

    r2 = model.score(pd.DataFrame(X, columns=column_names), target)
    assert r2 >= 0.999, "R^2 should be 1."
def test_nans():
    """Check how NaNs are handled by fit, predict and transform.

    Fitting and predicting on data containing NaNs must raise ``ValueError``;
    ``transform`` must accept them and keep the NaNs in the original feature
    columns, with at least two NaN entries in each affected row.
    """
    # nans are ok in transform but not fit or predict (due to sklearn model)
    column_names = ["x 1.1", 2, "x/3"]
    X, target = get_random_data()
    X[998, 0] = np.nan
    X[999, 1] = np.nan
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=3)

    # the full data contains the NaN rows -> fit has to fail
    try:
        _ = afreg.fit_transform(pd.DataFrame(X, columns=column_names), pd.DataFrame(target))
    except ValueError:
        pass
    else:
        raise AssertionError("fit with NaNs should throw an error")

    # the first 900 rows are NaN-free, so this fit goes through
    _ = afreg.fit_transform(pd.DataFrame(X[:900], columns=column_names), pd.DataFrame(target[:900]))

    # rows 900.. include the two NaN rows -> predict has to fail
    try:
        _ = afreg.predict(pd.DataFrame(X[900:], columns=column_names))
    except ValueError:
        pass
    else:
        raise AssertionError("predict with NaNs should throw an error")

    df = afreg.transform(pd.DataFrame(X, columns=column_names))
    assert all([pd.isna(df.iloc[998, 0]), pd.isna(df.iloc[999, 1])]), "Original features should be NaNs"
    assert np.sum(
        np.array(pd.isna(df.iloc[998]), dtype=int)) >= 2, "There should be at least 2 NaNs in row 998"
    # message now states the same threshold the condition checks (was "at least 3")
    assert np.sum(
        np.array(pd.isna(df.iloc[999]), dtype=int)) >= 2, "There should be at least 2 NaNs in row 999"
def test_weird_colnames():
    """Fit on a dataframe whose column names contain spaces, dots, slashes and an int.

    The first three output columns must be the stringified input names, and
    scoring on a dataframe with a different column name must raise
    ``ValueError``.
    """
    # autofeat with df with weird column names
    odd_names = ["x 1.1", 2, "x/3"]
    X, target = get_random_data()
    model = AutoFeatRegressor(verbose=1, feateng_steps=3)
    transformed = model.fit_transform(pd.DataFrame(X, columns=odd_names), pd.DataFrame(target))

    r2 = model.score(pd.DataFrame(X, columns=odd_names), target)
    assert r2 >= 0.999, "R^2 should be 1."
    leading_columns = list(transformed.columns)[:3]
    assert leading_columns == ["x 1.1", "2", "x/3"], "Wrong column names"

    # error if the column names aren't the same as before
    mismatch_detected = False
    try:
        model.score(pd.DataFrame(X, columns=["x 11", 2, "x/3"]), target)
    except ValueError:
        mismatch_detected = True
    if not mismatch_detected:
        raise AssertionError("Should throw error on mismatch column names")
def test_feateng_cols():
    """Exercise the ``feateng_cols`` option.

    Naming a column that is missing from the dataframe must raise
    ``ValueError``; with valid columns, no output column beyond the first
    three may mention the excluded feature ``x2``.
    """
    column_names = ["x1", "x2", "x3"]
    X, target = get_random_data()

    # "x4" does not exist in the dataframe, so fitting has to fail
    model = AutoFeatRegressor(verbose=1, feateng_cols=["x1", "x3", "x4"], feateng_steps=3)
    rejected = False
    try:
        model.fit_transform(pd.DataFrame(X, columns=column_names), target)
    except ValueError:
        rejected = True
    if not rejected:
        raise AssertionError("feateng_cols not in df should throw an error")

    # valid configuration: engineer features from x1 and x3 only
    model = AutoFeatRegressor(verbose=1, feateng_cols=["x1", "x3"], feateng_steps=3)
    transformed = model.fit_transform(pd.DataFrame(X, columns=column_names), target)
    for column in transformed.columns[3:]:
        assert "x2" not in column, "only feateng_cols should occur in engineered features"
def regression(self):
    """Run autofeat feature engineering, then score four regressors on the result.

    Loads the data through ``self.dataFunction``, fits an
    ``AutoFeatRegressor``, dumps the fitted model to ``self.savePath``, and
    evaluates a decision tree, a random forest, a linear regression and a
    LassoLarsCV model on the extended features. The mean squared errors plus
    the list of new feature columns are pickled next to the model.

    Returns:
        The fitted ``AutoFeatRegressor``.
    """
    X_train, X_test, y_train, y_test, categoricalCols = self.dataFunction(
        preprocessed=True,
        specifics="AUTOFEAT",
        trainSize=self.trainSize,
        steps=self.steps,
        nDataPoints=self.nDataPoints,
    )

    featureEngColumns = None
    # If feature engineering not wanted for categorical values, uncomment
    # if categoricalCols is not None:
    #     featureEngColumns = X_train.columns.values.tolist()
    #     featureEngColumns = [i for i in featureEngColumns + categoricalCols if i not in featureEngColumns or i not in categoricalCols]

    # Measure runtime
    start_time = time.time()
    print(f"Start time: {start_time}")

    # Automated feature engineering with autofeat
    model = AutoFeatRegressor(
        verbose=1,
        feateng_steps=self.feateng_steps,
        featsel_runs=self.featuresel_steps,
        categorical_cols=categoricalCols,
        feateng_cols=featureEngColumns,
    )

    # Fit model and get transformed dataframe with additional features
    x_train_extended = model.fit_transform(X_train, y_train)
    elapsed_seconds = time.time() - start_time
    total_time = int(elapsed_seconds // 60)  # whole minutes only
    print(f"Time: {total_time}")

    # Export model
    model_path = f"{self.savePath}/feng{model.feateng_steps}_fsel{model.featsel_runs}_time{total_time}_model.joblib"
    dump(model, model_path)

    x_test_extended = model.transform(X_test)

    # Predictions: each estimator is created lazily, right before it is fitted,
    # and the estimators are evaluated in the order listed here
    estimator_factories = (
        ("DecisionTree", lambda: DecisionTreeRegressor()),
        ("RandomForest", lambda: RandomForestRegressor(n_estimators=10)),
        ("LinearRegression", lambda: LinearRegression()),
        ("LassoLarsCV", lambda: LassoLarsCV(cv=5)),
    )
    predictions = {}
    for label, make_estimator in estimator_factories:
        predictionModel = make_estimator()
        predictionModel.fit(x_train_extended, y_train)
        predictions[label] = mean_squared_error(
            y_test, predictionModel.predict(x_test_extended))
        print(f"Final MSE prediction score: {predictions[label]}")

    # Additionally save transformations steps since not saved in joblib file
    predictions["new_features"] = model.new_feat_cols_

    # Export predictions
    performance_path = f"{self.savePath}/feng{model.feateng_steps}_fsel{model.featsel_runs}_performance.pkl"
    with open(performance_path, 'wb') as file:
        pickle.dump(predictions, file)

    return model
# autofeat with numpy arrays but as classification X, target = get_random_data() target = np.array(target > target.mean(), dtype=int) afreg = AutoFeatClassifier(verbose=1, feateng_steps=3) df = afreg.fit_transform(X, target) assert afreg.score(X, target) >= 0.9999, "Accuracy should be 1." assert afreg.score(df, target) >= 0.9999, "Accuracy should be 1." assert list(df.columns)[:3] == ["x000", "x001", "x002"], "Wrong column names" if __name__ == '__main__': print("## Running sklearn Regressor tests") # we allow for nan in transform successful_tests = set(["check_estimators_nan_inf"]) for estimator, check in check_estimator(AutoFeatRegressor( feateng_steps=1, featsel_runs=1, always_return_numpy=True), generate_only=True): if check.func.__name__ not in successful_tests: print(check.func.__name__) successful_tests.add(check.func.__name__) check(estimator) # additionally check the class, but don't run all the other tests for estimator, check in check_estimator(AutoFeatRegressor, generate_only=True): if check.func.__name__ not in successful_tests: print(check.func.__name__) successful_tests.add(check.func.__name__) check(estimator) print("## Running sklearn Classifier tests") # we allow for nan in transform
import pandas as pd import matplotlib.pyplot as plt from autofeat import FeatureSelector, AutoFeatRegressor from sklearn.pipeline import make_pipeline import pickle def main(): # Get the dataset from the users GitHub repository dataset_path = "https://raw.githubusercontent.com/" + os.environ["GITHUB_REPOSITORY"] +"/master/dataset.csv" df = pd.read_csv(dataset_path) print() print(df.describe()) for steps in range(5): np.random.seed(55) print("### AutoFeat with %i feateng_steps" % steps) afreg = AutoFeatRegressor(verbose=1, feateng_steps=steps) df = afreg.fit_transform(df_org, target) r2 = afreg.score(df_org, target) print("## Final R^2: %.4f" % r2) plt.figure() plt.scatter(afreg.predict(df_org), target, s=2); plt.title("%i FE steps (R^2: %.4f; %i new features)" % (steps, r2, len(afreg.new_feat_cols_))) afreg = AutoFeatRegressor(verbose=1, feateng_steps=3) # train on noisy data df = afreg.fit_transform(df_org, target_noisy) # test on real targets print("Final R^2: %.4f" % afreg.score(df, target)) plt.figure() plt.scatter(afreg.predict(df), target, s=2); afreg = AutoFeatRegressor(verbose=1, feateng_steps=3) # train on noisy data