def test_get_stats(self):
    """Check the PreprocessingUtils statistics helpers on data with NaN.

    The expected values correspond to the NaN entries being skipped, both
    for a plain NumPy array and for pandas DataFrame columns.
    """
    # NumPy array: the non-NaN entries are 1, 2, 3.
    values = np.array([1, np.nan, 2, 3, np.nan, np.nan])
    self.assertEqual(1, PreprocessingUtils.get_min(values))
    self.assertEqual(2, PreprocessingUtils.get_mean(values))
    self.assertEqual(2, PreprocessingUtils.get_median(values))

    # DataFrame with one numeric and one string column, each holding a NaN.
    frame = pd.DataFrame(
        data={
            "col1": [1, 2, 1, 3, 1, np.nan],
            "col2": ["a", np.nan, "b", "a", "c", "a"],
        }
    )
    numeric_col = frame["col1"]
    text_col = frame["col2"]

    self.assertEqual(1, PreprocessingUtils.get_min(numeric_col))
    # Non-NaN entries of col1 sum to 8 over 5 values.
    self.assertEqual(8.0 / 5.0, PreprocessingUtils.get_mean(numeric_col))
    self.assertEqual(1, PreprocessingUtils.get_median(numeric_col))
    self.assertEqual(1, PreprocessingUtils.get_most_frequent(numeric_col))
    self.assertEqual("a", PreprocessingUtils.get_most_frequent(text_col))
def _get_fill_value(self, x):
    """Return the value used to fill missing entries of ``x``.

    The choice depends on the type reported by
    ``PreprocessingUtils.get_type(x)`` and on ``self._na_fill_method``:

    * categorical, ``FILL_NA_MIN``: ``PreprocessingMissingValues.MISSING_VALUE``
      (a new categorical value);
    * categorical, any other method: the most frequent value of ``x``;
    * numerical, ``FILL_NA_MIN``: the minimum of ``x`` minus 1.0;
    * numerical, ``FILL_NA_MEAN``: the mean of ``x``;
    * numerical, any other method: the median of ``x``.
    """
    is_categorical = (
        PreprocessingUtils.get_type(x) == PreprocessingUtils.CATEGORICAL
    )
    fill_method = self._na_fill_method

    if is_categorical:
        if fill_method != PreprocessingMissingValues.FILL_NA_MIN:
            return PreprocessingUtils.get_most_frequent(x)
        # Missing entries become their own, new categorical value.
        return PreprocessingMissingValues.MISSING_VALUE

    # Numerical column from here on.
    if fill_method == PreprocessingMissingValues.FILL_NA_MIN:
        # One below the observed minimum.
        fill_value = PreprocessingUtils.get_min(x) - 1.0
    elif fill_method == PreprocessingMissingValues.FILL_NA_MEAN:
        fill_value = PreprocessingUtils.get_mean(x)
    else:
        # Any other method falls back to the median.
        fill_value = PreprocessingUtils.get_median(x)
    return fill_value