def test_DataCleaner_big_nan_handler_warning(self):
    """Ensure the DataCleaner records a warning when the number of nan
    samples and the nan fraction are high (i.e., something has gone
    horribly wrong in featurization!).

    A clean frame must produce zero warnings; a frame with one column
    10% nan (exceeding max_na_frac=0.01) and another almost entirely
    nan must produce exactly one warning.
    """
    dc = DataCleaner(max_na_frac=0.01)
    # 100 samples x 4 integer columns, "D" is the target: fully clean,
    # so fitting must not generate any warnings.
    df = pd.DataFrame(
        np.random.randint(0, 100, size=(100, 4)), columns=list("ABCD")
    )
    dc.fit(df, "D")
    self.assertEqual(len(dc.warnings), 0)

    # Inject nans with a single .loc assignment. The original chained
    # assignment (df["A"].iloc[10:20] = np.nan) raises
    # SettingWithCopyWarning and silently fails to mutate `df` under
    # pandas copy-on-write (pandas >= 2.0), which would leave the frame
    # clean and invalidate the test.
    df.loc[df.index[10:20], "A"] = np.nan  # 10% nan: over max_na_frac
    df.loc[df.index[:99], "B"] = np.nan    # nearly all nan
    dc.fit(df, "D")
    self.assertEqual(len(dc.warnings), 1)
def test_DataCleaner_emergency_na_transform_imputation(self):
    """For the case where a fit DataCleaner must include feature X, but
    in the df-to-be-transformed that feature is all nan, which makes it
    unable to be imputed correctly. Current implementation dictates this
    "emergency" be resolved by imputing with the mean of feature X from
    the fitted_df.

    After transform, every value in the all-nan column should equal the
    fit-set mean (so the column mean matches and its std is 0).
    """
    dc = DataCleaner()  # should work regardless of default
    df = self.test_df
    fit_df = df.iloc[:150]
    # Original used iloc[151:], silently dropping row 150 from both
    # splits — fit covers rows 0-149, so transform starts at 150.
    # .copy() gives an independent frame: assigning a column on a bare
    # iloc slice mutates a view of self.test_df (chained-assignment
    # hazard; fails outright under pandas >= 2.0 copy-on-write) and
    # could corrupt the shared fixture for later tests.
    trs_df = df.iloc[150:].copy()
    trs_df["range X"] = np.nan  # scalar broadcasts to the whole column
    dc.fit(fit_df, self.target)
    trs_df2 = dc.transform(trs_df, self.target)
    # All-nan column must be imputed with the fit-set mean of "range X".
    self.assertAlmostEqual(
        trs_df2["range X"].mean(), fit_df["range X"].mean()
    )
    # Every imputed value is identical, so the spread is exactly zero.
    self.assertAlmostEqual(trs_df2["range X"].std(), 0.0)