Exemplo n.º 1
0
    def from_json(self, data_json):

        self._params = data_json.get("params", self._params)

        if "remove_columns" in data_json:
            self._remove_columns = data_json.get("remove_columns", [])
        if "missing_values" in data_json:
            self._missing_values = []
            for mv_data in data_json["missing_values"]:
                mv = PreprocessingMissingValues()
                mv.from_json(mv_data)
                self._missing_values += [mv]
        if "categorical" in data_json:
            self._categorical = []
            for cat_data in data_json["categorical"]:
                cat = PreprocessingCategorical()
                cat.from_json(cat_data)
                self._categorical += [cat]

        if "datetime_transforms" in data_json:
            self._datetime_transforms = []
            for dtt_params in data_json["datetime_transforms"]:
                dtt = DateTimeTransformer()
                dtt.from_json(dtt_params)
                self._datetime_transforms += [dtt]

        if "text_transforms" in data_json:
            self._text_transforms = []
            for tt_params in data_json["text_transforms"]:
                tt = TextTransformer()
                tt.from_json(tt_params)
                self._text_transforms += [tt]

        if "golden_features" in data_json:
            self._golden_features = GoldenFeaturesTransformer()
            self._golden_features.from_json(data_json["golden_features"])

        if "scale" in data_json:
            self._scale = []
            for scale_data in data_json["scale"]:
                sc = Scale()
                sc.from_json(scale_data)
                self._scale += [sc]
        if "categorical_y" in data_json:
            if "new_columns" in data_json["categorical_y"]:
                self._categorical_y = LabelBinarizer()
            else:
                self._categorical_y = LabelEncoder()

            self._categorical_y.from_json(data_json["categorical_y"])
        if "scale_y" in data_json:
            self._scale_y = Scale()
            self._scale_y.from_json(data_json["scale_y"])
        if "ml_task" in data_json:
            self._params["ml_task"] = data_json["ml_task"]

        self._add_random_feature = data_json.get("add_random_feature", False)
        self._drop_features = data_json.get("drop_features", [])
Exemplo n.º 2
0
    def fit_and_transform(self, X_train, y_train, sample_weight=None):
        logger.debug("Preprocessing.fit_and_transform")

        if y_train is not None:
            # target preprocessing
            # this must be used first, maybe we will drop some rows because of missing target values
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug(
                "target_preprocessing params: {}".format(target_preprocessing))

            X_train, y_train, sample_weight = ExcludeRowsMissingTarget.transform(
                X_train, y_train, sample_weight)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                logger.debug("Convert target to integer")
                self._categorical_y = LabelEncoder(try_to_fit_numeric=True)
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                logger.debug("Convert target to one-hot coding")
                self._categorical_y = LabelBinarizer()
                self._categorical_y.fit(pd.DataFrame({"target": y_train}),
                                        "target")
                y_train = self._categorical_y.transform(
                    pd.DataFrame({"target": y_train}), "target")

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                logger.debug("Scale log and normal")

                self._scale_y = Scale(["target"],
                                      scale_method=Scale.SCALE_LOG_AND_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                logger.debug("Scale normal")

                self._scale_y = Scale(["target"],
                                      scale_method=Scale.SCALE_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        for column in columns_preprocessing:
            transforms = columns_preprocessing[column]
            # logger.debug("Preprocess column {} with: {}".format(column, transforms))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        numeric_cols = []  # get numeric cols before text transformations
        # needed for golden features
        if X_train is not None and ("golden_features" in self._params
                                    or "kmeans_features" in self._params):
            numeric_cols = X_train.select_dtypes(
                include="number").columns.tolist()

        # there can be missing values in the text data,
        # but we don't want to handle it by fill missing methods
        # zeros will be imputed by text_transform method
        cols_to_process = list(
            filter(
                lambda k: "text_transform" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        new_text_columns = []
        for col in cols_to_process:
            t = TextTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._text_transforms += [t]
            new_text_columns += t._new_columns
        # end of text transform

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            missing = PreprocessingMissingValues(cols_to_process,
                                                 missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            self._missing_values += [missing]

        # golden features
        golden_columns = []
        if "golden_features" in self._params:
            results_path = self._params["golden_features"]["results_path"]
            ml_task = self._params["golden_features"]["ml_task"]
            self._golden_features = GoldenFeaturesTransformer(
                results_path, ml_task)
            self._golden_features.fit(X_train[numeric_cols], y_train)
            X_train = self._golden_features.transform(X_train)
            golden_columns = self._golden_features._new_columns

        kmeans_columns = []
        if "kmeans_features" in self._params:
            results_path = self._params["kmeans_features"]["results_path"]
            self._kmeans = KMeansTransformer(results_path, self._model_name,
                                             self._k_fold)
            self._kmeans.fit(X_train[numeric_cols], y_train)
            X_train = self._kmeans.transform(X_train)
            kmeans_columns = self._kmeans._new_features

        for convert_method in [
                PreprocessingCategorical.CONVERT_INTEGER,
                PreprocessingCategorical.CONVERT_ONE_HOT,
                PreprocessingCategorical.CONVERT_LOO,
        ]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train, y_train)
            X_train = convert.transform(X_train)
            self._categorical += [convert]

        # datetime transform
        cols_to_process = list(
            filter(
                lambda k: "datetime_transform" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        new_datetime_columns = []
        for col in cols_to_process:

            t = DateTimeTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._datetime_transforms += [t]
            new_datetime_columns += t._new_columns

        # SCALE
        for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            if (len(cols_to_process) and len(new_datetime_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += new_datetime_columns
            if (len(cols_to_process) and len(new_text_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += new_text_columns

            if (len(cols_to_process) and len(golden_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += golden_columns

            if (len(cols_to_process) and len(kmeans_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += kmeans_columns

            if len(cols_to_process):
                scale = Scale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                self._scale += [scale]

        if self._add_random_feature:
            # -1, 1, with 0 mean
            X_train["random_feature"] = np.random.rand(
                X_train.shape[0]) * 2.0 - 1.0

        if self._drop_features:
            available_cols = X_train.columns.tolist()
            drop_cols = [c for c in self._drop_features if c in available_cols]
            if len(drop_cols) == X_train.shape[1]:
                raise AutoMLException(
                    "All features are droppped! Your data looks like random data."
                )
            if drop_cols:
                X_train.drop(drop_cols, axis=1, inplace=True)
            self._drop_features = drop_cols

        if X_train is not None:
            # there can be catagorical columns (in CatBoost) which cant be clipped
            numeric_cols = X_train.select_dtypes(
                include="number").columns.tolist()
            X_train[numeric_cols] = X_train[numeric_cols].clip(
                lower=np.finfo(np.float32).min + 1000,
                upper=np.finfo(np.float32).max - 1000,
            )

        return X_train, y_train, sample_weight
Exemplo n.º 3
0
    def fit_and_transform(self, X_train, y_train):
        logger.debug("Preprocessing.fit_and_transform")

        if y_train is not None:
            # target preprocessing
            # this must be used first, maybe we will drop some rows because of missing target values
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug("target_preprocessing params: {}".format(target_preprocessing))

            X_train, y_train = ExcludeRowsMissingTarget.transform(X_train, y_train)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                logger.debug("Convert target to integer")
                self._categorical_y = LabelEncoder()
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                logger.debug("Convert target to one-hot coding")
                self._categorical_y = LabelBinarizer()
                self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target")
                y_train = self._categorical_y.transform(
                    pd.DataFrame({"target": y_train}), "target"
                )

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                logger.debug("Scale log and normal")

                self._scale_y = Scale(
                    ["target"], scale_method=Scale.SCALE_LOG_AND_NORMAL
                )
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                logger.debug("Scale normal")

                self._scale_y = Scale(["target"], scale_method=Scale.SCALE_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        for column in columns_preprocessing:
            transforms = columns_preprocessing[column]
            # logger.debug("Preprocess column {} with: {}".format(column, transforms))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        # there can be missing values in the text data,
        # but we don't want to handle it by fill missing methods
        # zeros will be imputed by text_transform method
        cols_to_process = list(
            filter(
                lambda k: "text_transform" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        new_text_columns = []
        for col in cols_to_process:
            t = TextTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._text_transforms += [t]
            new_text_columns += t._new_columns
        # end of text transform

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            missing = PreprocessingMissingValues(cols_to_process, missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            self._missing_values += [missing]

        for convert_method in [
            PreprocessingCategorical.CONVERT_INTEGER,
            PreprocessingCategorical.CONVERT_ONE_HOT,
        ]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train)
            X_train = convert.transform(X_train)
            self._categorical += [convert]

        # datetime transform
        cols_to_process = list(
            filter(
                lambda k: "datetime_transform" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        new_datetime_columns = []
        for col in cols_to_process:

            t = DateTimeTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._datetime_transforms += [t]
            new_datetime_columns += t._new_columns

        # SCALE
        for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            if (
                len(cols_to_process)
                and len(new_datetime_columns)
                and scale_method == Scale.SCALE_NORMAL
            ):
                cols_to_process += new_datetime_columns
            if (
                len(cols_to_process)
                and len(new_text_columns)
                and scale_method == Scale.SCALE_NORMAL
            ):
                cols_to_process += new_text_columns

            if len(cols_to_process):
                scale = Scale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                self._scale += [scale]

        return X_train, y_train
Exemplo n.º 4
0
    def test_transformer(self):

        d = {
            "col1": [
                "This is the first document.",
                "This document is the second document.",
                "And this is the third one.",
                None,
                "Is this the first document?",
            ]
        }
        df = pd.DataFrame(data=d)
        df_org = df.copy()

        transf = TextTransformer()
        transf.fit(df, "col1")
        df = transf.transform(df)
        self.assertTrue(df.shape[0] == 5)
        self.assertTrue("col1" not in df.columns)

        transf2 = TextTransformer()
        transf2.from_json(transf.to_json())
        df2 = transf2.transform(df_org)
        self.assertTrue("col1" not in df2.columns)

        assert_almost_equal(df.iloc[0, 0], df2.iloc[0, 0])