def test_nans():
    """Check how the regressor reacts to NaNs in the input features.

    Two NaNs are planted in the feature matrix (row 998, column 0 and
    row 999, column 1). The test then expects:

    * ``fit_transform`` on the full data raises ``ValueError``;
    * ``fit_transform`` on the first 900 rows (which exclude the planted
      NaNs) succeeds;
    * ``predict`` on the remaining rows (which include the planted NaNs)
      raises ``ValueError``;
    * ``transform`` on the full data succeeds, keeps the planted NaNs in the
      original feature columns, and yields at least 2 NaNs in each of the
      two affected rows.

    Raises:
        AssertionError: if any of the expectations above is violated.
    """
    # nans are ok in transform but not fit or predict (due to sklearn model)
    # Deliberately awkward column names (space, dot, int, slash), reused for
    # every DataFrame built below.
    columns = ["x 1.1", 2, "x/3"]

    X, target = get_random_data()
    X[998, 0] = np.nan
    X[999, 1] = np.nan
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=3)

    # Fitting on data that contains NaNs must be rejected.
    try:
        _ = afreg.fit_transform(pd.DataFrame(X, columns=columns), pd.DataFrame(target))
    except ValueError:
        pass
    else:
        raise AssertionError("fit with NaNs should throw an error")

    # Rows 0..899 do not include the planted NaNs, so this fit is expected to work.
    _ = afreg.fit_transform(pd.DataFrame(X[:900], columns=columns), pd.DataFrame(target[:900]))

    # Rows 900.. include both planted NaNs, so predicting must be rejected.
    try:
        _ = afreg.predict(pd.DataFrame(X[900:], columns=columns))
    except ValueError:
        pass
    else:
        raise AssertionError("predict with NaNs should throw an error")

    # Transforming is allowed with NaNs; they must show up in the output.
    df = afreg.transform(pd.DataFrame(X, columns=columns))
    assert pd.isna(df.iloc[998, 0]) and pd.isna(df.iloc[999, 1]), "Original features should be NaNs"
    # Each affected row should hold the planted NaN plus at least one more NaN.
    assert pd.isna(df.iloc[998]).sum() >= 2, "There should be at least 2 NaNs in row 998"
    # The message previously said "at least 3" although the check enforces 2.
    assert pd.isna(df.iloc[999]).sum() >= 2, "There should be at least 2 NaNs in row 999"
def main():
    """Download a dataset summary and run a series of AutoFeat experiments.

    Steps, in order:

    1. Read ``dataset.csv`` from the ``master`` branch of the repository named
       by the ``GITHUB_REPOSITORY`` environment variable and print its
       ``describe()`` summary.
    2. For ``feateng_steps`` in 0..4, fit an ``AutoFeatRegressor`` on
       ``df_org``/``target``, print the R^2 score and draw a scatter plot of
       predictions against ``target``.
    3. Fit two more regressors (``feateng_steps=3``) on ``target_noisy`` and
       ``target_very_noisy`` and score/plot each against ``target``.

    NOTE(review): ``df_org``, ``target``, ``target_noisy`` and
    ``target_very_noisy`` are not assigned inside this function; presumably
    they are module-level names -- confirm. The downloaded dataset is only
    used for the printed summary.

    Raises:
        KeyError: if ``GITHUB_REPOSITORY`` is not set in the environment.
    """

    def fit_on_noisy_and_plot(noisy_target):
        # Fit on a noisy target, then score and plot against the real target.
        model = AutoFeatRegressor(verbose=1, feateng_steps=3)
        # train on noisy data
        transformed = model.fit_transform(df_org, noisy_target)
        # test on real targets
        print("Final R^2: %.4f" % model.score(transformed, target))
        plt.figure()
        predictions = model.predict(transformed)
        plt.scatter(predictions, target, s=2)

    # Get the dataset from the users GitHub repository
    repo_slug = os.environ["GITHUB_REPOSITORY"]
    dataset_path = "https://raw.githubusercontent.com/" + repo_slug + "/master/dataset.csv"
    dataset = pd.read_csv(dataset_path)
    print()
    print(dataset.describe())

    for steps in range(5):
        # Re-seed on every iteration so each run starts from the same random state.
        np.random.seed(55)
        print("### AutoFeat with %i feateng_steps" % steps)
        model = AutoFeatRegressor(verbose=1, feateng_steps=steps)
        # Only the fitted model is needed here; the transformed frame is discarded.
        model.fit_transform(df_org, target)
        r2 = model.score(df_org, target)
        print("## Final R^2: %.4f" % r2)
        plt.figure()
        predictions = model.predict(df_org)
        plt.scatter(predictions, target, s=2)
        n_new_features = len(model.new_feat_cols_)
        plt.title("%i FE steps (R^2: %.4f; %i new features)" % (steps, r2, n_new_features))

    fit_on_noisy_and_plot(target_noisy)
    fit_on_noisy_and_plot(target_very_noisy)