def test_fit(self):
    # training data
    d = {
        "col1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10.0],
        "col2": [21, 22, 23, 24, 25, 26, 27, 28, 29, 30.0],
    }
    df = pd.DataFrame(data=d)
    scale = PreprocessingScale(["col1"])
    scale.fit(df)
    df = scale.transform(df)
    assert_almost_equal(np.mean(df["col1"]), 0)
    assert_almost_equal(np.mean(df["col2"]), 25.5)
def test_to_and_from_json(self):
    # training data
    d = {
        "col1": [1, 2, 3, 4, 5, 6, 7, 8.0, 9, 10],
        "col2": [21, 22.0, 23, 24, 25, 26, 27, 28, 29, 30],
    }
    df = pd.DataFrame(data=d)
    scale = PreprocessingScale(["col1"])
    scale.fit(df)
    # do not transform
    assert_almost_equal(np.mean(df["col1"]), 5.5)
    assert_almost_equal(np.mean(df["col2"]), 25.5)
    # to and from json
    json_data = scale.to_json()
    scale2 = PreprocessingScale()
    scale2.from_json(json_data)
    # transform with loaded scaler
    df = scale2.transform(df)
    assert_almost_equal(np.mean(df["col1"]), 0)
    assert_almost_equal(np.mean(df["col2"]), 25.5)
def run(self, train_data=None, validation_data=None):
    log.debug("PreprocessingStep.run")
    X_train, y_train = None, None
    if train_data is not None:
        if "X" in train_data:
            X_train = train_data.get("X").copy()
        if "y" in train_data:
            y_train = train_data.get("y").copy()
    X_validation, y_validation = None, None
    if validation_data is not None:
        if "X" in validation_data:
            X_validation = validation_data.get("X").copy()
        if "y" in validation_data:
            y_validation = validation_data.get("y").copy()

    if y_train is not None:
        # target preprocessing
        # this must run first, because rows with missing target values may be dropped
        target_preprocessing = self._params.get("target_preprocessing")
        log.debug("target_preprocessing -> {}".format(target_preprocessing))

        # if PreprocessingMissingValues.NA_EXCLUDE in target_preprocessing:
        X_train, y_train = PreprocessingExcludeMissingValues.transform(
            X_train, y_train)
        if validation_data is not None:
            X_validation, y_validation = PreprocessingExcludeMissingValues.transform(
                X_validation, y_validation)

        if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
            # encode target labels as integers
            self._categorical_y = LabelEncoder()
            self._categorical_y.fit(y_train)
            y_train = pd.Series(self._categorical_y.transform(y_train))
            if y_validation is not None and self._categorical_y is not None:
                y_validation = pd.Series(
                    self._categorical_y.transform(y_validation))

        if PreprocessingScale.SCALE_LOG_AND_NORMAL in target_preprocessing:
            log.error("not implemented SCALE_LOG_AND_NORMAL")
            raise Exception("not implemented SCALE_LOG_AND_NORMAL")

        if PreprocessingScale.SCALE_NORMAL in target_preprocessing:
            log.error("not implemented SCALE_NORMAL")
            raise Exception("not implemented SCALE_NORMAL")

    # columns preprocessing
    columns_preprocessing = self._params.get("columns_preprocessing")
    for column in columns_preprocessing:
        transforms = columns_preprocessing[column]
        log.debug("Preprocess column -> {}, {}".format(column, transforms))

    # remove empty or constant columns
    cols_to_remove = list(
        filter(
            lambda k: "remove_column" in columns_preprocessing[k],
            columns_preprocessing,
        ))
    if X_train is not None:
        X_train.drop(cols_to_remove, axis=1, inplace=True)
    if X_validation is not None:
        X_validation.drop(cols_to_remove, axis=1, inplace=True)
    self._remove_columns = cols_to_remove

    # fill missing values
    for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
        cols_to_process = list(
            filter(
                lambda k: missing_method in columns_preprocessing[k],
                columns_preprocessing,
            ))
        missing = PreprocessingMissingValues(cols_to_process, missing_method)
        missing.fit(X_train)
        X_train = missing.transform(X_train)
        if X_validation is not None:
            X_validation = missing.transform(X_validation)
        self._missing_values += [missing]

    # convert categorical columns
    for convert_method in [PreprocessingCategorical.CONVERT_INTEGER]:
        cols_to_process = list(
            filter(
                lambda k: convert_method in columns_preprocessing[k],
                columns_preprocessing,
            ))
        convert = PreprocessingCategorical(cols_to_process, convert_method)
        convert.fit(X_train)
        X_train = convert.transform(X_train)
        if X_validation is not None:
            X_validation = convert.transform(X_validation)
        self._categorical += [convert]

    # scale numeric columns
    for scale_method in [PreprocessingScale.SCALE_NORMAL]:
        cols_to_process = list(
            filter(
                lambda k: scale_method in columns_preprocessing[k],
                columns_preprocessing,
            ))
        if len(cols_to_process):
            scale = PreprocessingScale(cols_to_process)
            scale.fit(X_train)
            X_train = scale.transform(X_train)
            if X_validation is not None:
                X_validation = scale.transform(X_validation)
            self._scale += [scale]

    return {"X": X_train, "y": y_train}, {"X": X_validation, "y": y_validation}