예제 #1
0
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Label-encode the selected columns with the fitted categories.

        For every selected column the values are looked up in
        ``self._uniques[col]`` via ``get_indexer`` and the resulting integer
        codes are stored in a new column named
        ``self._output_prefix + col + self._output_suffix``.  When
        ``self._unseen == "n_unique"``, values that are not missing and not
        among the fitted categories are encoded as ``len(self._uniques[col])``.

        Columns are taken from ``self._input_cols`` (all columns of the frame
        when that is empty) minus ``self._exclude_cols``.

        Args:
            input_df (XDataFrame): Input data frame. It is copied, not modified.
        Returns:
            XDataFrame : Copy of the input with the encoded columns added.
        """
        new_df = input_df.copy()

        input_cols = self._input_cols
        if not input_cols:
            input_cols = new_df.columns.tolist()

        if self._exclude_cols:
            # Filter into a fresh list instead of calling ``remove`` in place:
            # ``input_cols`` may be the very list stored on ``self``, and
            # mutating it made a second ``transform`` call raise ValueError.
            input_cols = [
                col for col in input_cols if col not in self._exclude_cols
            ]

        for col in input_cols:
            out_col = self._output_prefix + col + self._output_suffix
            if cudf_is_available() and isinstance(new_df, cudf.DataFrame):
                # cudf columns are converted to a host array before the lookup.
                X = self._uniques[col].get_indexer(new_df[col].to_array())
            else:
                X = self._uniques[col].get_indexer(new_df[col])
            if self._unseen == "n_unique":
                missing_values = new_df[col].isna()
                unseen_values = np.invert(new_df[col].isin(self._uniques[col]))
                # XOR keeps rows flagged by exactly one of the two masks.
                # NOTE(review): presumably the fitted categories never contain
                # NaN, so this selects "unseen but not missing" - confirm.
                unseen_mask = np.bitwise_xor(missing_values, unseen_values)
                X[unseen_mask] = len(self._uniques[col])

            new_df[out_col] = X

        return new_df
예제 #2
0
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Encode the selected columns and remap the codes through labels.

        For every column in ``self._input_cols`` (all columns of the frame
        when that is empty) the values are looked up in ``self._uniques[col]``
        via ``get_indexer``.  Rows outside the "unseen" mask then have their
        code replaced by ``self._labels[col][code]``.  When
        ``self._unseen == "n_unique"``, rows inside the mask are set to
        ``len(self._uniques[col])``; otherwise they keep the ``get_indexer``
        result.  The output is written to a column named
        ``self._output_prefix + col + self._output_suffix``.

        Args:
            input_df (XDataFrame): Input data frame. It is copied, not modified.
        Returns:
            XDataFrame : Copy of the input with the encoded columns added.
        """
        new_df = input_df.copy()

        input_cols = self._input_cols
        if not input_cols:
            input_cols = new_df.columns.tolist()

        for col in input_cols:
            out_col = self._output_prefix + col + self._output_suffix
            X = self._uniques[col].get_indexer(new_df[col])

            # The mask depends only on the (unchanged) source column, so it is
            # computed once and shared by both steps below.
            missing_values = new_df[col].isna()
            unseen_values = np.invert(new_df[col].isin(self._uniques[col]))
            # XOR keeps rows flagged by exactly one of the two masks.
            unseen_mask = np.bitwise_xor(missing_values, unseen_values)

            if self._unseen == "n_unique":
                X[unseen_mask] = len(self._uniques[col])

            # NOTE(review): a row that is both missing and unseen is outside
            # the mask while its code is still -1, so it picks up the last
            # label here - confirm that this is intended.
            X[~unseen_mask] = np.array(self._labels[col])[X[~unseen_mask]]

            new_df[out_col] = X

        return new_df
    def fit_transform(self, input_df: XDataFrame) -> XDataFrame:
        """Resolve the columns to encode, then delegate to ``transform``.

        When ``self._input_cols`` is empty it is set to every column of the
        frame that is not listed in ``self._exclude_cols``.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Result of ``self.transform`` on a copy of the input.
        """
        working_df = input_df.copy()

        if self._input_cols:
            return self.transform(working_df)

        # No explicit selection: use every non-excluded column.
        selected = []
        for name in working_df.columns.tolist():
            if name in self._exclude_cols:
                continue
            selected.append(name)
        self._input_cols = selected

        return self.transform(working_df)
예제 #4
0
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Apply the fitted count encoders to their columns.

        Each column in ``self._input_cols`` is passed (as a copy) to the
        matching encoder in ``self._count_encoders`` and the result is stored
        under ``self._output_prefix + col + self._output_suffix``.

        Args:
            input_df (XDataFrame): Input data frame.

        Returns:
            XDataFrame: Copy of the input with the encoded columns added.
        """
        encoded_df = input_df.copy()

        for name in self._input_cols:
            target_name = self._output_prefix + name + self._output_suffix
            encoder = self._count_encoders[name]
            source = encoded_df[name].copy()
            encoded_df[target_name] = encoder.transform(source)

        return encoded_df
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Combine columns arithmetically for every column combination.

        ``self._include_cols`` take part in every combination; the remaining
        ``self._r - len(self._include_cols)`` operands are drawn from
        ``self._input_cols`` with ``itertools.combinations``.  The operands are
        folded left to right with ``self._operator`` (one of ``+ - * / %``) and
        stored under prefix + joined column names + suffix.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Only the generated columns when ``self._drop_origin``
            is truthy, otherwise a copy of the input with them added.
        Raises:
            RuntimeError: If a combination has more than one operand and
                ``self._operator`` is not a supported symbol.
        """
        binary_ops = {
            "+": lambda lhs, rhs: lhs + rhs,
            "-": lambda lhs, rhs: lhs - rhs,
            "*": lambda lhs, rhs: lhs * rhs,
            "/": lambda lhs, rhs: lhs / rhs,
            "%": lambda lhs, rhs: lhs % rhs,
        }

        result_df = input_df.copy()
        created = []
        free_slots = self._r - len(self._include_cols)

        for combo in combinations(self._input_cols, r=free_slots):
            name = (self._output_prefix + "".join(self._include_cols) +
                    "".join(combo) + self._output_suffix)
            created.append(name)

            operands = self._include_cols + list(combo)
            acc = None
            for position, operand in enumerate(operands):
                if position == 0:
                    # The first operand seeds the running result.
                    acc = result_df[operand].copy()
                    continue

                apply_op = binary_ops.get(self._operator)
                if apply_op is None:
                    raise RuntimeError("Unknown operator is used.")
                acc = apply_op(acc, result_df[operand])

            result_df[name] = acc

        return result_df[created] if self._drop_origin else result_df
예제 #6
0
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Apply the fitted target encoders, optionally adding Gaussian noise.

        Each column in ``self._input_cols`` is encoded with the matching entry
        of ``self._target_encoders`` and stored under
        ``self._output_prefix + col + self._output_suffix``.  When
        ``self.noise_level > 0``, zero-mean normal noise with that standard
        deviation is added to the generated columns only.

        Args:
            input_df (XDataFrame): Input data frame. It is copied, not modified.
        Returns:
            XDataFrame : Copy of the input with the encoded columns added.
        Raises:
            TypeError: If a column is neither a pandas nor a cudf series.
        """
        out_df = input_df.copy()
        out_cols = []

        for col in self._input_cols:
            out_col = self._output_prefix + col + self._output_suffix
            if isinstance(input_df[col], pd.Series):
                X = column_or_1d(input_df[col], warn=True)
            elif cudf and isinstance(input_df[col], cudf.Series):
                X = input_df[col]
            else:
                raise TypeError(
                    f"Unsupported column type: {type(input_df[col])}")

            out_df[out_col] = self._target_encoders[col].transform(X)
            if out_col not in out_cols:
                out_cols.append(out_col)

        if self.noise_level > 0:
            # Perturb only the encoded columns. Adding noise to the whole frame
            # would also alter the original features and fails on non-numeric
            # columns. A local generator leaves the global NumPy RNG untouched.
            rng = np.random.RandomState(self.random_state)
            for out_col in out_cols:
                noise = rng.normal(0, self.noise_level, len(out_df))
                out_df[out_col] = out_df[out_col] + noise

        return out_df
예제 #7
0
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Apply ``self._lambda_func`` element-wise to the selected columns.

        cudf input is converted to pandas for the computation and converted
        back before returning.  Columns come from ``self._input_cols`` (all
        columns when that is empty) minus ``self._exclude_cols``.  When
        ``self._fillna`` is not None, missing values are filled with it before
        the function is applied.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Only the generated columns when ``self._drop_origin``
            is truthy, otherwise the frame with the generated columns added.
        Raises:
            RuntimeError: If the input is neither a pandas nor a cudf frame.
        """
        came_from_cudf = False
        if isinstance(input_df, pd.DataFrame):
            work_df = input_df.copy()
        elif cudf_is_available() and isinstance(input_df, cudf.DataFrame):
            work_df = input_df.to_pandas()
            came_from_cudf = True
        else:
            raise RuntimeError("Unexpected data type: {}".format(type(input_df)))

        target_cols = self._input_cols
        if not target_cols:
            target_cols = work_df.columns.tolist()
        if len(self._exclude_cols) > 0:
            target_cols = [
                name for name in target_cols if name not in self._exclude_cols
            ]

        created = []
        for name in target_cols:
            out_name = self._output_prefix + name + self._output_suffix
            source = work_df[name]
            if self._fillna is not None:
                source = source.fillna(self._fillna)
            work_df[out_name] = source.apply(self._lambda_func)
            created.append(out_name)

        # Hand the caller back the same frame flavour it passed in.
        if came_from_cudf:
            work_df = cudf.from_pandas(work_df)

        return work_df[created] if self._drop_origin else work_df
예제 #8
0
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Apply the fitted target encoders to their columns.

        Each column in ``self._input_cols`` is encoded with the matching entry
        of ``self._target_encoders`` and stored under
        ``self._output_prefix + col + self._output_suffix``.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Copy of the input with the encoded columns added.
        Raises:
            TypeError: If a column is neither a pandas nor a cudf series.
        """
        result_df = input_df.copy()

        for name in self._input_cols:
            out_name = self._output_prefix + name + self._output_suffix
            feature = input_df[name]

            if isinstance(feature, pd.Series):
                # pandas columns are passed through ``column_or_1d`` first.
                values = column_or_1d(feature, warn=True)
            elif cudf and isinstance(feature, cudf.Series):
                values = feature
            else:
                raise TypeError

            encoder = self._target_encoders[name]
            result_df[out_name] = encoder.transform(values)

        return result_df
예제 #9
0
def aggregation(
    input_df: XDataFrame,
    group_key: str,
    group_values: List[str],
    agg_methods: List[str],
):
    """Attach per-group aggregate statistics to every row.

    For each aggregation method and each value column, the column is
    aggregated per ``group_key`` group and the result is left-joined back
    onto the rows as ``agg_{method}_{column}_grpby_{group_key}``.

    Arguments:
        input_df (XDataFrame) : Input data frame. It is copied, not modified.
        group_key (str) : Column that defines the groups.
        group_values (List[str]) : Columns whose values are aggregated.
        agg_methods (List[str]) : Aggregation names, e.g. ['mean', 'max', 'min', 'std'].
    Returns:
        Tuple[XDataFrame, List[str]] : Output data frame and the names of the
        added columns, in creation order.
    """
    result_df = input_df.copy()
    created = []

    pairs = ((method, value_col) for method in agg_methods
             for value_col in group_values)
    for method, value_col in pairs:
        name = f"agg_{method}_{value_col}_grpby_{group_key}"

        # NOTE(smly):
        # Merging a cudf.DataFrame with a cudf.Series fails, so the aggregate
        # is kept as a one-column data frame (double brackets) as a workaround.
        # Ref: http://github.com/rapidsai/cudf/issues/5013
        grouped = input_df[[value_col, group_key]].groupby(group_key)
        stats = grouped[[value_col]].agg(method)
        stats.columns = [name]
        created.append(name)

        result_df = result_df.merge(
            stats,
            how="left",
            right_index=True,
            left_on=group_key,
        )

    return result_df, created
예제 #10
0
    def fit_transform(self, input_df: XDataFrame) -> XDataFrame:
        """Fit one count encoder per column and return the encoded frame.

        When ``self._input_cols`` is empty it is set to all columns of the
        frame.  A fresh ``_CountEncoder`` is created for every column, stored
        in ``self._count_encoders`` and fitted on a copy of the column; its
        output is written to ``self._output_prefix + col + self._output_suffix``.

        Args:
            input_df (XDataFrame): Input data frame.

        Returns:
            XDataFrame: Copy of the input with the encoded columns added.
        """
        encoded_df = input_df.copy()

        if not self._input_cols:
            # No explicit selection: encode every column and remember that.
            self._input_cols = encoded_df.columns.tolist()

        for name in self._input_cols:
            target_name = self._output_prefix + name + self._output_suffix
            encoder = _CountEncoder()
            self._count_encoders[name] = encoder
            source = encoded_df[name].copy()
            encoded_df[target_name] = encoder.fit_transform(source)

        return encoded_df
예제 #11
0
    def fit_transform(self, input_df: XDataFrame, y: XSeries = None) -> XDataFrame:
        """Fit one target encoder per column and return the encoded frame.

        When ``self._input_cols`` is empty it is set to all columns of the
        frame; ``self._target_col`` is removed from it if present.  A fresh
        ``_TargetEncoder(self.fold)`` is created per column, stored in
        ``self._target_encoders`` and fitted against the target.  When
        ``self.noise_level > 0``, zero-mean normal noise with that standard
        deviation is added to the generated columns only.

        Args:
            input_df (XDataFrame): Input data frame. It is copied, not modified.
            y (XSeries): Target values. When None, the target is read from
                ``input_df[self._target_col]``.
        Returns:
            XDataFrame : Copy of the input with the encoded columns added.
        Raises:
            TypeError: If a column is neither a pandas nor a cudf series.
        """
        out_df = input_df.copy()
        out_cols = []

        input_cols = self._input_cols
        if not input_cols:
            input_cols = input_df.columns.tolist()
            self._input_cols = input_cols

        # Remove `target_col` from `self._input_cols`.
        if self._target_col in self._input_cols:
            self._input_cols.remove(self._target_col)

        for col in self._input_cols:
            out_col = self._output_prefix + col + self._output_suffix
            target_encoder = _TargetEncoder(self.fold)
            self._target_encoders[col] = target_encoder

            if isinstance(input_df[col], pd.Series):
                X = column_or_1d(input_df[col], warn=True)
                if y is None:
                    y = column_or_1d(input_df[self._target_col], warn=True)
                else:
                    y = column_or_1d(y, warn=True)
            elif cudf and isinstance(input_df[col], cudf.Series):
                X = input_df[col]
                if y is None:
                    y = input_df[self._target_col]
            else:
                raise TypeError(
                    f"Unsupported column type: {type(input_df[col])}")

            out_df[out_col] = target_encoder.fit_transform(X, y).copy()
            if out_col not in out_cols:
                out_cols.append(out_col)

        if self.noise_level > 0:
            # Perturb only the encoded columns. Adding noise to the whole frame
            # would also alter the original features and the target, and fails
            # on non-numeric columns. A local generator leaves the global NumPy
            # RNG untouched.
            rng = np.random.RandomState(self.random_state)
            for out_col in out_cols:
                noise = rng.normal(0, self.noise_level, len(out_df))
                out_df[out_col] = out_df[out_col] + noise

        return out_df
예제 #12
0
    def fit_transform(self, input_df: XDataFrame) -> XDataFrame:
        """Fit one target encoder per column and return the encoded frame.

        When ``self._input_cols`` is empty it is set to all columns of the
        frame; ``self._target_col`` is removed from it if present.  A fresh
        ``_TargetEncoder(self.fold)`` is created per column, stored in
        ``self._target_encoders`` and fitted against
        ``input_df[self._target_col]``.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Copy of the input with the encoded columns added.
        Raises:
            TypeError: If a column is neither a pandas nor a cudf series.
        """
        result_df = input_df.copy()

        if not self._input_cols:
            self._input_cols = input_df.columns.tolist()

        # The target itself must never be encoded as a feature.
        if self._target_col in self._input_cols:
            self._input_cols.remove(self._target_col)

        for name in self._input_cols:
            out_name = self._output_prefix + name + self._output_suffix
            encoder = _TargetEncoder(self.fold)
            self._target_encoders[name] = encoder

            feature = input_df[name]
            if isinstance(feature, pd.Series):
                X = column_or_1d(feature, warn=True)
                y = column_or_1d(input_df[self._target_col], warn=True)
            elif cudf and isinstance(feature, cudf.Series):
                X, y = feature, input_df[self._target_col]
            else:
                raise TypeError

            result_df[out_name] = encoder.fit_transform(X, y).copy()

        return result_df
예제 #13
0
    def cv(
        self,
        y_train: AoS,
        train_features: XDataFrame,
        test_features: XDataFrame,
        y_valid: Optional[AoS],
        valid_features: Optional[XDataFrame],
        feature_name: List[str],
        folds_ids: List[Tuple[np.ndarray, np.ndarray]],
        target_scaler: Optional[MinMaxScaler],
        config: dict,
        log: bool = True,
    ) -> Tuple[List[Model], np.ndarray, np.ndarray, Optional[np.ndarray],
               pd.DataFrame, dict]:
        """Train one model per fold and collect predictions and scores.

        For every ``(train_idx, valid_idx)`` pair in ``folds_ids`` a model is
        fitted with ``self.fit``.  Out-of-fold predictions are written at the
        fold's validation indices, while test (and optional hold-out)
        predictions are averaged over the folds.  Afterwards the predictions
        go through ``self.post_process`` and are scored with ``calc_metric``.

        Side effects: sets ``self.fold`` to the current fold index and stores
        the pre-post-processing predictions in ``self.raw_oof_preds``,
        ``self.raw_test_preds`` and ``self.raw_valid_preds``.  Scores are
        printed, and also logged when ``log`` is true.

        Args:
            y_train: Training target; a pandas Series is converted to its
                ``.values`` for fold indexing.
            train_features: Training features, indexed positionally per fold.
            test_features: Features to predict with every fold model.
            y_valid: Optional hold-out target.
            valid_features: Optional hold-out features; hold-out prediction
                and scoring are skipped when this is None.
            feature_name: Index used for the feature-importance table.
            folds_ids: Train/validation index pairs, one per fold.
            target_scaler: Passed through to ``self.post_process``.
            config: Passed to ``get_sampling``, ``self.fit`` and
                ``self.post_process``.
            log: Whether to also emit the scores through ``logging.info``.

        Returns:
            Tuple of (fold models, out-of-fold predictions, test predictions,
            hold-out predictions or None, mean feature importance, evaluation
            summary dict).
        """
        # Prediction buffers and per-fold bookkeeping.
        valid_exists = True if valid_features is not None else False
        test_preds = np.zeros(len(test_features))
        oof_preds = np.zeros(len(train_features))
        if valid_exists:
            valid_preds = np.zeros(len(valid_features))
        else:
            valid_preds = None
        importances = pd.DataFrame(index=feature_name)
        best_iteration = 0.0
        cv_score_list: List[dict] = []
        models: List[Model] = []

        with timer("make X"):
            X_train = train_features.copy()
            X_test = test_features.copy()
            # NOTE(review): X_valid is not referenced again in this method;
            # hold-out prediction below uses valid_features directly.
            X_valid = valid_features.copy(
            ) if valid_features is not None else None

        with timer("make y"):
            # Plain arrays so the fold indices can be applied positionally.
            y = y_train.values if isinstance(y_train, pd.Series) else y_train
            y_valid = y_valid.values if isinstance(y_valid,
                                                   pd.Series) else y_valid

        for i_fold, (trn_idx, val_idx) in enumerate(folds_ids):
            self.fold = i_fold
            # Split the training data into this fold's train/validation parts.
            x_trn = X_train.iloc[trn_idx]
            y_trn = y[trn_idx]
            x_val = X_train.iloc[val_idx]
            y_val = y[val_idx]

            # Only the training part is passed through get_sampling.
            x_trn, y_trn = get_sampling(x_trn, y_trn, config)

            # Fit the fold model; best_iteration is averaged over the folds.
            model, best_score = self.fit(x_trn, y_trn, x_val, y_val, config)
            cv_score_list.append(best_score)
            models.append(model)
            best_iteration += self.get_best_iteration(model) / len(folds_ids)

            # OOF predictions fill this fold's validation rows; test
            # predictions are accumulated as a mean over the folds.
            oof_preds[val_idx] = self.predict(model, x_val).reshape(-1)
            test_preds += self.predict(model,
                                       X_test).reshape(-1) / len(folds_ids)

            if valid_exists:
                valid_preds += self.predict(
                    model, valid_features).reshape(-1) / len(folds_ids)

            # One importance column per fold, named gain_1, gain_2, ...
            importances_tmp = pd.DataFrame(
                self.get_feature_importance(model),
                columns=[f"gain_{i_fold+1}"],
                index=feature_name,
            )
            importances = importances.join(importances_tmp, how="inner")

        # Mean importance per feature across the fold columns.
        feature_importance = importances.mean(axis=1)

        # Keep the predictions as they were before post-processing.
        self.raw_oof_preds = oof_preds
        self.raw_test_preds = test_preds
        self.raw_valid_preds = valid_preds

        # Post-processing hook; it receives the original y_train and returns
        # the targets and predictions that are scored below.
        y, oof_preds, test_preds, y_valid, valid_preds = self.post_process(
            oof_preds=oof_preds,
            test_preds=test_preds,
            valid_preds=valid_preds,
            y_train=y_train,
            y_valid=y_valid,
            train_features=train_features,
            test_features=test_features,
            valid_features=valid_features,
            target_scaler=target_scaler,
            config=config,
        )

        # Score and report the out-of-fold predictions.
        oof_score = calc_metric(y, oof_preds)
        print(f"oof score: {oof_score:.5f}")

        if valid_exists:
            valid_score = calc_metric(y_valid, valid_preds)
            print(f"valid score: {valid_score:.5f}")

        if log:
            logging.info(f"oof score: {oof_score:.5f}")
            if valid_exists:
                logging.info(f"valid score: {valid_score:.5f}")

        evals_results = {
            "evals_result": {
                "oof_score":
                oof_score,
                "cv_score": {
                    f"cv{i + 1}": cv_score
                    for i, cv_score in enumerate(cv_score_list)
                },
                "n_data":
                np.shape(X_train)[0],
                "best_iteration":
                best_iteration,
                "n_features":
                np.shape(X_train)[1],
                "feature_importance":
                feature_importance.sort_values(ascending=False).to_dict(),
            }
        }

        # NOTE(review): valid_score is stored at the top level of the dict,
        # next to "evals_result" rather than inside it - confirm with readers.
        if valid_exists:
            evals_results["valid_score"] = valid_score
        # NOTE(review): feature_importance comes from DataFrame.mean(axis=1),
        # presumably a Series although the annotation says pd.DataFrame.
        return (
            models,
            oof_preds,
            test_preds,
            valid_preds,
            feature_importance,
            evals_results,
        )