def test_empty_column(self):
    """Check that a column configured with ``remove_column`` is dropped.

    ``col1`` holds only NaN values and is configured with the
    ``remove_column`` step. The test asserts the column is absent (and
    three columns remain) after ``fit_and_transform``, after a further
    ``transform`` on the same object, and after ``transform`` on a
    ``Preprocessing`` restored through ``to_json`` / ``from_json``.
    """
    # training data
    d = {
        "col1": [np.nan, np.nan, np.nan, np.nan],
        "col2": [5, 6, 7, 0],
        "col3": [1, 1, 1, 3],
        "col4": [2, 2, 4, 3],
        "y": [0, 1, 0, 1],
    }
    df = pd.DataFrame(data=d)
    X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
    y_train = df.loc[:, "y"]

    preprocessing_params = {"columns_preprocessing": {"col1": ["remove_column"]}}
    ps = Preprocessing(preprocessing_params)

    # Fitting and transforming in one step drops the column.
    X_train1, _ = ps.fit_and_transform(X_train, y_train)
    self.assertNotIn("col1", X_train1.columns)
    self.assertEqual(3, len(X_train1.columns))

    # A plain transform on the already fitted object drops it as well.
    X_train2, _ = ps.transform(X_train, y_train)
    self.assertNotIn("col1", X_train2.columns)
    self.assertEqual(3, len(X_train2.columns))
    for col in ["col2", "col3", "col4"]:
        self.assertIn(col, X_train2.columns)

    # The behaviour must survive a to_json / from_json round trip.
    params_json = ps.to_json()
    ps2 = Preprocessing()
    ps2.from_json(params_json)

    X_train3, _ = ps2.transform(X_train, y_train)
    self.assertNotIn("col1", X_train3.columns)
    self.assertEqual(3, len(X_train3.columns))
    for col in ["col2", "col3", "col4"]:
        self.assertIn(col, X_train3.columns)
def test_run_fill_median_convert_integer(self):
    """Fit fill-median + convert-to-integer steps and inspect the results.

    Every feature column is configured with ``FILL_NA_MEDIAN`` followed by
    ``CONVERT_INTEGER``. The test asserts that all four columns are kept,
    that the missing entries in ``col1`` and ``col2`` are replaced (by 1
    and 0 respectively), that ``col4`` is encoded as 0, 0, 1, 2, and that
    ``to_json()`` reports the ``missing_values`` and ``categorical``
    sections but no ``categorical_y`` section.
    """
    # training data
    d = {
        "col1": [1, 1, np.nan, 3],
        "col2": ["a", "a", np.nan, "a"],
        "col3": [1, 1, 1, 3],
        "col4": ["a", "a", "b", "c"],
        "y": [0, 1, 0, 1],
    }
    df = pd.DataFrame(data=d)
    feature_cols = ["col1", "col2", "col3", "col4"]
    X_train = df.loc[:, feature_cols]
    y_train = df.loc[:, "y"]

    # Same two steps for every column; the comprehension builds a separate
    # list per column so no list object is shared between columns.
    preprocessing_params = {
        "columns_preprocessing": {
            col: [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ]
            for col in feature_cols
        }
    }

    ps = Preprocessing(preprocessing_params)
    X_train, y_train = ps.fit_and_transform(X_train, y_train)

    for col in feature_cols:
        self.assertIn(col, X_train.columns)

    # Row 2 held the missing values in col1 and col2.
    self.assertEqual(X_train["col1"][2], 1)
    self.assertEqual(X_train["col2"][2], 0)
    self.assertEqual(X_train["col4"][0], 0)
    self.assertEqual(X_train["col4"][1], 0)
    self.assertEqual(X_train["col4"][2], 1)
    self.assertEqual(X_train["col4"][3], 2)

    params_json = ps.to_json()

    self.assertIn("missing_values", params_json)
    self.assertIn("categorical", params_json)
    self.assertNotIn("categorical_y", params_json)

    self.assertIn("fill_params", params_json["missing_values"][0])
    self.assertEqual(
        "na_fill_median", params_json["missing_values"][0]["fill_method"]
    )
    self.assertIn("convert_params", params_json["categorical"][0])
    self.assertEqual(
        "categorical_to_int", params_json["categorical"][0]["convert_method"]
    )
def test_to_and_from_json_run_fill_median_convert_integer(self):
    """Transform test data with a ``Preprocessing`` rebuilt from JSON.

    A ``Preprocessing`` is fitted on the training frame, serialized with
    ``to_json`` and loaded into a fresh instance with ``from_json``; the
    fitted original is then deleted. The restored instance transforms a
    test frame whose target has two NaN entries, and the test asserts that
    two target rows remain (summing to 2) and that the first transformed
    row has ``col1 == 1`` and ``col2 == 0``.
    """
    feature_cols = ["col1", "col2", "col3", "col4"]

    # training data
    train_data = {
        "col1": [1, 1, np.nan, 3],
        "col2": ["a", "a", np.nan, "a"],
        "col3": [1, 1, 1, 3],
        "col4": ["a", "a", "b", "c"],
        "y": [0, 1, 0, 1],
    }
    train_df = pd.DataFrame(data=train_data)
    X_train = train_df.loc[:, feature_cols]
    y_train = train_df.loc[:, "y"]

    fill_then_encode = [
        PreprocessingMissingValues.FILL_NA_MEDIAN,
        PreprocessingCategorical.CONVERT_INTEGER,
    ]
    preprocessing_params = {
        "columns_preprocessing": {
            "col1": [PreprocessingMissingValues.FILL_NA_MEDIAN],
            "col2": list(fill_then_encode),
            "col4": list(fill_then_encode),
        },
        "target_preprocessing": [],
    }

    fitted = Preprocessing(preprocessing_params)
    _, _ = fitted.fit_and_transform(X_train, y_train)

    # Rebuild from the serialized state and drop the fitted original so
    # only the restored object is used below.
    restored = Preprocessing()
    restored.from_json(fitted.to_json())
    del fitted

    # test data: same features as training, target with two missing values
    test_data = {
        "col1": [1, 1, np.nan, 3],
        "col2": ["a", "a", np.nan, "a"],
        "col3": [1, 1, 1, 3],
        "col4": ["a", "a", "b", "c"],
        "y": [np.nan, np.nan, 1, 1],
    }
    test_df = pd.DataFrame(data=test_data)
    X_test = test_df.loc[:, feature_cols]
    y_test = test_df.loc[:, "y"]

    X_out, y_out = restored.transform(X_test, y_test)

    self.assertEqual(2, y_out.shape[0])
    self.assertEqual(2, np.sum(y_out))
    self.assertEqual(1, X_out["col1"].iloc[0])
    self.assertEqual(0, X_out["col2"].iloc[0])
def test_run_all_good(self):
    """Fit on purely numeric data that has no missing values.

    Every feature column is configured with ``FILL_NA_MEDIAN`` followed by
    ``CONVERT_INTEGER``. The test asserts that all four columns are kept
    and that ``to_json()`` then contains exactly one entry, ``"params"``.
    """
    # training data
    d = {
        "col1": [1, 1, 1, 3],
        "col2": [5, 6, 7, 0],
        "col3": [1, 1, 1, 3],
        "col4": [2, 2, 4, 3],
        "y": [0, 1, 0, 1],
    }
    df = pd.DataFrame(data=d)
    feature_cols = ["col1", "col2", "col3", "col4"]
    X_train = df.loc[:, feature_cols]
    y_train = df.loc[:, "y"]

    # Same two steps for every column; the comprehension builds a separate
    # list per column so no list object is shared between columns.
    preprocessing_params = {
        "columns_preprocessing": {
            col: [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ]
            for col in feature_cols
        }
    }

    ps = Preprocessing(preprocessing_params)
    X_train, y_train = ps.fit_and_transform(X_train, y_train)

    for col in feature_cols:
        self.assertIn(col, X_train.columns)

    params_json = ps.to_json()
    self.assertEqual(len(params_json), 1)  # should store params only
    self.assertIn("params", params_json)