示例#1
0
def compress_df(df: XDataFrame, verbose=False) -> XDataFrame:
    """Reduce memory usage by downcasting numeric column dtypes.

    Integer columns are narrowed to the smallest integer type that can
    hold their value range; float64 columns are narrowed to float32 when
    the values fit.  For compatibility with feather, float16 is not used.

    Note: the conversion happens in place — ``df`` itself is mutated and
    also returned.

    Args:
        df (XDataFrame): Input data frame (mutated in place).
        verbose (bool): If True, log memory usage before/after.
    Returns:
        The reduced data frame (same object as ``df``).
    """
    _num_dtypes = [
        "int16",
        "int32",
        "int64",
        "float32",
        "float64",
    ]
    # Candidate integer targets, narrowest first; the first type whose
    # range covers [min_val, max_val] wins.
    _int_targets = [np.int8, np.int16, np.int32, np.int64]

    start_mem_usage = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in _num_dtypes:
            continue
        min_val, max_val = df[col].min(), df[col].max()
        if str(col_type).startswith("int"):
            for int_type in _int_targets:
                info = np.iinfo(int_type)
                if info.min <= min_val and max_val <= info.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            # NOTE: half float is not supported in feather.
            if (min_val >= np.finfo(np.float32).min
                    and max_val <= np.finfo(np.float32).max):
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem_usage = df.memory_usage().sum() / 1024**2
    if verbose:
        logger.warning("Memory reduced from {:.2f} MB to {:.2f} MB".format(
            start_mem_usage,
            end_mem_usage,
        ))

    return df
示例#2
0
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Transform data frame.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Output data frame.
        """
        new_df = input_df.copy()

        # Work on a copy of the fitted column list: the original aliased
        # ``self._input_cols`` here, so removing excluded columns below
        # permanently mutated the fitted state across calls.
        input_cols = list(self._input_cols)
        if not input_cols:
            input_cols = new_df.columns.tolist()

        if self._exclude_cols:
            for col in self._exclude_cols:
                input_cols.remove(col)

        for col in input_cols:
            out_col = self._output_prefix + col + self._output_suffix
            if cudf_is_available() and isinstance(new_df, cudf.DataFrame):
                X = self._uniques[col].get_indexer(new_df[col].to_array())
            else:
                X = self._uniques[col].get_indexer(new_df[col])
            if self._unseen == "n_unique":
                # XOR keeps genuinely-missing values (NaN) out of the
                # unseen mask: they are both "na" and "not in uniques".
                missing_values = new_df[col].isna()
                unseen_values = np.invert(new_df[col].isin(self._uniques[col]))
                unseen_mask = np.bitwise_xor(missing_values, unseen_values)
                X[unseen_mask] = len(self._uniques[col])

            new_df[out_col] = X

        return new_df
示例#3
0
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Transform data frame.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Output data frame.
        """
        new_df = input_df.copy()

        input_cols = self._input_cols
        if not input_cols:
            input_cols = new_df.columns.tolist()

        for col in input_cols:
            out_col = self._output_prefix + col + self._output_suffix
            X = self._uniques[col].get_indexer(new_df[col])

            # A value is "unseen" when it is absent from the fitted uniques
            # but not missing (XOR keeps NaN out of the mask).  Computed
            # once here; the original computed the identical mask twice.
            missing_values = new_df[col].isna()
            unseen_values = np.invert(new_df[col].isin(self._uniques[col]))
            unseen_mask = np.bitwise_xor(missing_values, unseen_values)

            if self._unseen == "n_unique":
                X[unseen_mask] = len(self._uniques[col])

            # Map indices of seen values back to their fitted labels.
            X[~unseen_mask] = np.array(self._labels[col])[X[~unseen_mask]]

            new_df[out_col] = X

        return new_df
示例#4
0
def karunru_analyze_columns(
        input_df: XDataFrame) -> Tuple[List[str], List[str]]:
    """Classify columns to numerical or categorical.

    Columns with an explicit "category" dtype are categorical; remaining
    columns are classified by whether their dtype is numeric.

    Args:
        input_df (XDataFrame) : Input data frame.
    Returns:
        Tuple[List[str], List[str]] : List of num cols and cat cols.

    Example:
        ::
            >>> import pandas as pd
            >>> df = pd.DataFrame({"col1": [1, 2], "col2": [2, 3], "col3": ["a", "b"]})
            >>> karunru_analyze_columns(df)
            (['col1', 'col2'], ['col3'])
    """
    numerical_cols = []
    categorical_cols = input_df.select_dtypes("category").columns.tolist()
    # Snapshot the non-category columns before the loop appends to
    # ``categorical_cols``.
    non_category_cols = [
        col for col in input_df.columns if col not in categorical_cols
    ]
    for col in non_category_cols:
        if pd.api.types.is_numeric_dtype(input_df[col]):
            numerical_cols.append(col)
        else:
            categorical_cols.append(col)
    return numerical_cols, categorical_cols
示例#5
0
def reduce_mem_usage(df: XDataFrame,
                     verbose: bool = True,
                     debug: bool = True) -> XDataFrame:
    """Compress ``df``'s dtypes and report the memory saved.

    Args:
        df (XDataFrame): Input data frame.
        verbose (bool): Print the summary message when True.
        debug (bool): Emit the summary message at DEBUG level when True.
    Returns:
        XDataFrame: The compressed data frame.
    """
    mem_before = df.memory_usage().sum() / 1024**2
    df = compress_df(df)
    mem_after = df.memory_usage().sum() / 1024**2

    saved_ratio = (mem_before - mem_after) / mem_before
    msg = (f"Mem. usage decreased to {mem_after:5.2f} MB" +
           f" ({saved_ratio * 100:.1f} % reduction)")

    if verbose:
        print(msg)
    if debug:
        logging.debug(msg)

    return df
示例#6
0
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Transform data frame.

        Applies ``self._lambda_func`` element-wise to each selected column
        and writes the result to a prefixed/suffixed output column.  cudf
        frames round-trip through pandas for the apply.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Output data frame.
        """
        if isinstance(input_df, pd.DataFrame):
            new_df = input_df.copy()
        elif cudf_is_available() and isinstance(input_df, cudf.DataFrame):
            new_df = input_df.to_pandas()
        else:
            raise RuntimeError("Unexpected data type: {}".format(type(input_df)))

        cols = self._input_cols or new_df.columns.tolist()
        if len(self._exclude_cols) > 0:
            cols = [c for c in cols if c not in self._exclude_cols]

        generated_cols = []
        for col in cols:
            target_col = self._output_prefix + col + self._output_suffix
            series = new_df[col]
            if self._fillna is not None:
                series = series.fillna(self._fillna)
            new_df[target_col] = series.apply(self._lambda_func)
            generated_cols.append(target_col)

        # Convert back when the caller handed us a cudf frame.
        if cudf_is_available() and isinstance(input_df, cudf.DataFrame):
            new_df = cudf.from_pandas(new_df)

        if self._drop_origin:
            return new_df[generated_cols]

        return new_df
    def fit_transform(self, input_df: XDataFrame) -> XDataFrame:
        """Fit to data frame, then transform it.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Output data frame.
        """
        new_df = input_df.copy()

        # Default the fitted columns to everything not explicitly excluded.
        if not self._input_cols:
            excluded = self._exclude_cols
            self._input_cols = [
                c for c in new_df.columns.tolist() if c not in excluded
            ]

        return self.transform(new_df)
示例#8
0
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Transform data frame.

        Args:
            input_df (XDataFrame): Input data frame.

        Returns:
            XDataFrame: Output data frame.
        """
        out = input_df.copy()

        # Encode each fitted column with its own count encoder.
        for name in self._input_cols:
            encoded_name = self._output_prefix + name + self._output_suffix
            encoder = self._count_encoders[name]
            out[encoded_name] = encoder.transform(out[name].copy())

        return out
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Transform data frame.

        Args:
            input_df (XDataFrame): Input data frame.
        """
        out_df = input_df.copy()
        new_cols = []

        # Binary combine functions keyed by the configured operator symbol.
        ops = {
            "+": lambda a, b: a + b,
            "-": lambda a, b: a - b,
            "*": lambda a, b: a * b,
            "/": lambda a, b: a / b,
            "%": lambda a, b: a % b,
        }
        combine = ops.get(self._operator)

        fixed_prefix = "".join(self._include_cols)
        n_free = self._r - len(self._include_cols)

        for chosen in combinations(self._input_cols, r=n_free):
            col_name = (self._output_prefix + fixed_prefix + "".join(chosen) +
                        self._output_suffix)
            new_cols.append(col_name)

            members = self._include_cols + list(chosen)
            acc = None
            for member in members:
                if acc is None:
                    acc = out_df[member].copy()
                elif combine is None:
                    raise RuntimeError("Unknown operator is used.")
                else:
                    acc = combine(acc, out_df[member])

            out_df[col_name] = acc

        if self._drop_origin:
            return out_df[new_cols]

        return out_df
示例#10
0
    def fit_transform(self, input_df: XDataFrame) -> XDataFrame:
        """Fit to data frame, then transform it.

        Keeps only the first occurrence of duplicated columns (compared by
        full content via a transpose) and returns the de-duplicated frame.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Output data frame.
        """
        # Transpose-based duplicate detection needs pandas; convert cudf.
        if cudf_is_available() and isinstance(input_df, cudf.DataFrame):
            frame = input_df.to_pandas()
        else:
            frame = input_df

        self._selected_cols = (
            frame.T.drop_duplicates(keep="first").index.values.tolist()
        )
        return input_df[self._selected_cols]
示例#11
0
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Encode each fitted column with its target encoder, optionally
        adding seeded Gaussian noise to the whole output frame.
        """
        out_df = input_df.copy()

        for col in self._input_cols:
            encoded_col = self._output_prefix + col + self._output_suffix
            series = input_df[col]
            if isinstance(series, pd.Series):
                values = column_or_1d(series, warn=True)
            elif cudf and isinstance(series, cudf.Series):
                values = series
            else:
                raise TypeError
            out_df[encoded_col] = self._target_encoders[col].transform(values)

        if self.noise_level > 0:
            # Reproducible noise applied to the entire frame, not only the
            # newly encoded columns.
            np.random.seed(self.random_state)
            out_df += np.random.normal(0, self.noise_level, out_df.shape)

        return out_df
示例#12
0
    def transform(self, input_df: XDataFrame) -> XDataFrame:
        """Transform data frame.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Output data frame.
        """
        out_df = input_df.copy()

        for col in self._input_cols:
            encoded_col = self._output_prefix + col + self._output_suffix
            series = input_df[col]
            if isinstance(series, pd.Series):
                values = column_or_1d(series, warn=True)
            elif cudf and isinstance(series, cudf.Series):
                values = series
            else:
                raise TypeError
            out_df[encoded_col] = self._target_encoders[col].transform(values)

        return out_df
示例#13
0
    def predict(self, model: TabNetModel, features: XDataFrame) -> np.ndarray:
        """Predict with a fitted TabNet-style model.

        NOTE(review): ``features`` is mutated in place below (category
        codes, NaN imputation) — the caller's frame changes; confirm that
        is intended.

        Args:
            model (TabNetModel): Fitted model.
            features (XDataFrame): Feature frame; mutated in place.
        Returns:
            np.ndarray: 1-D array of predictions.
        """
        # Category columns: add an explicit "Unknown" category so NaNs can
        # be filled, then replace values with integer category codes.
        for col in features.select_dtypes(include="category").columns:
            features[col] = features[col].cat.add_categories("Unknown")
            features[col] = features[col].fillna("Unknown")
            features[col] = features[col].cat.codes

        # Mean-impute every column not declared categorical in the config.
        numerical_cols = [
            col
            for col in features.columns
            if col not in self.config["categorical_cols"]
        ]
        for col in numerical_cols:
            features[col] = features[col].fillna(features[col].mean())

        if self.mode != "multiclass":
            return model.predict(features.values).reshape(
                -1,
            )
        else:
            # NOTE(review): `ntree_limit` / `best_ntree_limit` are XGBoost
            # attributes — confirm this model wrapper actually exposes them.
            preds = model.predict_proba(features, ntree_limit=model.best_ntree_limit)
            # Expectation over 4 classes scaled by 3 — presumably a
            # dataset-specific ordinal target; TODO confirm the constants.
            return preds @ np.arange(4) / 3
示例#14
0
def aggregation(
    input_df: XDataFrame,
    group_key: str,
    group_values: List[str],
    agg_methods: List[str],
):
    """Aggregate values after grouping table rows by a given key.

    Arguments:
        input_df (XDataFrame) : Input data frame.
        group_key (str) : Used to determine the groups for the groupby.
        group_values (List[str]) : Used to aggregate values for the groupby.
        agg_methods (List[str]) : List of function names, e.g. ['mean', 'max', 'min', 'std'].
    Returns:
        Tuple[XDataFrame, List[str]] : Tuple of output dataframe and new column names.
    """
    new_df = input_df.copy()
    new_cols = []

    for method in agg_methods:
        for value_col in group_values:
            agg_col = f"agg_{method}_{value_col}_grpby_{group_key}"

            # NOTE(smly):
            # Failed when cudf.DataFrame try to merge with cudf.Series.
            # Use workaround to merge with cudf.DataFrame.
            # Ref: http://github.com/rapidsai/cudf/issues/5013
            grouped = input_df[[value_col] + [group_key]].groupby(group_key)
            df_agg = grouped[[value_col]].agg(method)
            df_agg.columns = [agg_col]
            new_cols.append(agg_col)
            new_df = new_df.merge(
                df_agg, how="left", right_index=True, left_on=group_key)

    return new_df, new_cols
示例#15
0
    def fit_transform(self, input_df: XDataFrame) -> XDataFrame:
        """Fit to data frame, then transform it.

        Args:
            input_df (XDataFrame): Input data frame.

        Returns:
            XDataFrame: Output data frame.
        """
        new_df = input_df.copy()

        # Default the fitted columns to every column of the frame.
        if not self._input_cols:
            self._input_cols = new_df.columns.tolist()

        # Fit one fresh count encoder per column, keep it for later
        # transform() calls, and append the encoded column.
        for col in self._input_cols:
            out_col = self._output_prefix + col + self._output_suffix
            encoder = _CountEncoder()
            self._count_encoders[col] = encoder
            new_df[out_col] = encoder.fit_transform(new_df[col].copy())

        return new_df
示例#16
0
    def fit_transform(self, input_df: XDataFrame, y: XSeries = None) -> XDataFrame:
        """Fit a target encoder per column, then transform, optionally
        adding seeded Gaussian noise to the whole output frame.
        """
        out_df = input_df.copy()

        if not self._input_cols:
            self._input_cols = input_df.columns.tolist()

        # Never encode the target column itself.
        if self._target_col in self._input_cols:
            self._input_cols.remove(self._target_col)

        for col in self._input_cols:
            out_col = self._output_prefix + col + self._output_suffix
            encoder = _TargetEncoder(self.fold)
            self._target_encoders[col] = encoder

            series = input_df[col]
            if isinstance(series, pd.Series):
                X = column_or_1d(series, warn=True)
                # Use the target column when no explicit ``y`` was given.
                y = column_or_1d(
                    input_df[self._target_col] if y is None else y, warn=True)
            elif cudf and isinstance(series, cudf.Series):
                X = series
                if y is None:
                    y = input_df[self._target_col]
            else:
                raise TypeError

            out_df[out_col] = encoder.fit_transform(X, y).copy()

        if self.noise_level > 0:
            # Reproducible noise applied to the whole frame.
            np.random.seed(self.random_state)
            out_df += np.random.normal(0, self.noise_level, out_df.shape)

        return out_df
示例#17
0
    def fit_transform(self, input_df: XDataFrame) -> XDataFrame:
        """Fit to data frame, then transform it.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Output data frame.
        """
        out_df = input_df.copy()

        if not self._input_cols:
            self._input_cols = input_df.columns.tolist()

        # Never encode the target column itself.
        if self._target_col in self._input_cols:
            self._input_cols.remove(self._target_col)

        for col in self._input_cols:
            out_col = self._output_prefix + col + self._output_suffix
            encoder = _TargetEncoder(self.fold)
            self._target_encoders[col] = encoder

            series = input_df[col]
            if isinstance(series, pd.Series):
                X = column_or_1d(series, warn=True)
                y = column_or_1d(input_df[self._target_col], warn=True)
            elif cudf and isinstance(series, cudf.Series):
                X = series
                y = input_df[self._target_col]
            else:
                raise TypeError

            out_df[out_col] = encoder.fit_transform(X, y).copy()

        return out_df
示例#18
0
    def cv(
        self,
        y_train: AoS,
        train_features: XDataFrame,
        test_features: XDataFrame,
        y_valid: Optional[AoS],
        valid_features: Optional[XDataFrame],
        feature_name: List[str],
        folds_ids: List[Tuple[np.ndarray, np.ndarray]],
        target_scaler: Optional[MinMaxScaler],
        config: dict,
        log: bool = True,
    ) -> Tuple[List[Model], np.ndarray, np.ndarray, Optional[np.ndarray],
               pd.DataFrame, dict]:
        """Run K-fold cross validation.

        Fits one model per fold, fills out-of-fold (OOF) predictions,
        averages test (and optional holdout) predictions over folds,
        joins per-fold feature importances, runs ``self.post_process``,
        and builds an evaluation-results dict.

        Args:
            y_train: Training target (array or Series).
            train_features: Training feature frame.
            test_features: Test feature frame, predicted by every fold model.
            y_valid: Optional holdout target.
            valid_features: Optional holdout feature frame.
            feature_name: Feature names used as the importance index.
            folds_ids: (train_idx, valid_idx) pairs, one per fold.
            target_scaler: Passed through to ``post_process``.
            config: Passed to sampling, fit and post_process.
            log: When True, also write the scores via ``logging``.
        Returns:
            Tuple of (models, oof_preds, test_preds, valid_preds,
            feature_importance, evals_results).
        """
        # initialize
        valid_exists = True if valid_features is not None else False
        test_preds = np.zeros(len(test_features))
        oof_preds = np.zeros(len(train_features))
        if valid_exists:
            valid_preds = np.zeros(len(valid_features))
        else:
            valid_preds = None
        importances = pd.DataFrame(index=feature_name)
        best_iteration = 0.0
        cv_score_list: List[dict] = []
        models: List[Model] = []

        # NOTE(review): X_valid is copied here but predictions below use
        # ``valid_features`` directly — confirm X_valid is needed at all.
        with timer("make X"):
            X_train = train_features.copy()
            X_test = test_features.copy()
            X_valid = valid_features.copy(
            ) if valid_features is not None else None

        with timer("make y"):
            y = y_train.values if isinstance(y_train, pd.Series) else y_train
            y_valid = y_valid.values if isinstance(y_valid,
                                                   pd.Series) else y_valid

        for i_fold, (trn_idx, val_idx) in enumerate(folds_ids):
            self.fold = i_fold
            # get train data and valid data
            x_trn = X_train.iloc[trn_idx]
            y_trn = y[trn_idx]
            x_val = X_train.iloc[val_idx]
            y_val = y[val_idx]

            x_trn, y_trn = get_sampling(x_trn, y_trn, config)

            # train model
            model, best_score = self.fit(x_trn, y_trn, x_val, y_val, config)
            cv_score_list.append(best_score)
            models.append(model)
            # running mean of per-fold best iterations
            best_iteration += self.get_best_iteration(model) / len(folds_ids)

            # predict oof and test (test/valid preds averaged over folds)
            oof_preds[val_idx] = self.predict(model, x_val).reshape(-1)
            test_preds += self.predict(model,
                                       X_test).reshape(-1) / len(folds_ids)

            if valid_exists:
                valid_preds += self.predict(
                    model, valid_features).reshape(-1) / len(folds_ids)

            # get feature importances
            importances_tmp = pd.DataFrame(
                self.get_feature_importance(model),
                columns=[f"gain_{i_fold+1}"],
                index=feature_name,
            )
            importances = importances.join(importances_tmp, how="inner")

        # summary of feature importance
        feature_importance = importances.mean(axis=1)

        # save raw prediction
        self.raw_oof_preds = oof_preds
        self.raw_test_preds = test_preds
        self.raw_valid_preds = valid_preds

        # post_process (if you have any)
        y, oof_preds, test_preds, y_valid, valid_preds = self.post_process(
            oof_preds=oof_preds,
            test_preds=test_preds,
            valid_preds=valid_preds,
            y_train=y_train,
            y_valid=y_valid,
            train_features=train_features,
            test_features=test_features,
            valid_features=valid_features,
            target_scaler=target_scaler,
            config=config,
        )

        # print oof score
        oof_score = calc_metric(y, oof_preds)
        print(f"oof score: {oof_score:.5f}")

        if valid_exists:
            valid_score = calc_metric(y_valid, valid_preds)
            print(f"valid score: {valid_score:.5f}")

        if log:
            logging.info(f"oof score: {oof_score:.5f}")
            if valid_exists:
                logging.info(f"valid score: {valid_score:.5f}")

        evals_results = {
            "evals_result": {
                "oof_score":
                oof_score,
                "cv_score": {
                    f"cv{i + 1}": cv_score
                    for i, cv_score in enumerate(cv_score_list)
                },
                "n_data":
                np.shape(X_train)[0],
                "best_iteration":
                best_iteration,
                "n_features":
                np.shape(X_train)[1],
                "feature_importance":
                feature_importance.sort_values(ascending=False).to_dict(),
            }
        }

        if valid_exists:
            evals_results["valid_score"] = valid_score
        return (
            models,
            oof_preds,
            test_preds,
            valid_preds,
            feature_importance,
            evals_results,
        )
示例#19
0
    def fit(self, input_df: XDataFrame) -> None:
        """Fit: select columns by dropping one of each highly correlated pair.

        For every pair with ``abs(corr) > self._threshold`` the second
        column is dropped.  Already-inspected and already-removed pairs are
        persisted as pickles under ``self.save_path`` so repeated runs can
        resume where they left off.

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            None: The surviving column names are stored in
            ``self._selected_cols``.
        """
        org_cols = input_df.columns.tolist()

        # NOTE(review): if the optional cudf import failed and ``cudf`` is
        # None, this isinstance check raises AttributeError — other blocks
        # guard with ``cudf and isinstance(...)``; confirm cudf is always
        # importable on this path.
        input_df = (input_df.to_pandas()
                    if isinstance(input_df, cudf.DataFrame) else input_df)

        # Resume state from previous runs, if the pickle files exist.
        seen_cols_pairs = (load_pickle(self.save_path /
                                       "seen_feats_pairs.pkl") if
                           (self.save_path / "seen_feats_pairs.pkl").exists()
                           else defaultdict(list))
        removed_cols_pairs = (load_pickle(self.save_path /
                                          "removed_feats_pairs.pkl") if
                              (self.save_path /
                               "removed_feats_pairs.pkl").exists() else
                              defaultdict(list))
        removed_cols = sum(removed_cols_pairs.values(), [])
        if self.dry_run:
            # Dry run: only apply the removals recorded previously.
            self._selected_cols = [
                col for col in org_cols if col not in set(removed_cols)
            ]
            return

        org_cols = [col for col in org_cols if col not in removed_cols]
        counter = 0
        for i in tqdm(range(len(org_cols) - 1)):
            feat_a_name = org_cols[i]
            if feat_a_name in removed_cols:
                continue

            feat_a = input_df[feat_a_name]

            for j in range(i + 1, len(org_cols)):
                feat_b_name = org_cols[j]

                if self._has_seen(feat_a_name, feat_b_name, seen_cols_pairs):
                    continue
                else:
                    # Record the pair as seen in both directions.
                    seen_cols_pairs[feat_a_name].append(feat_b_name)
                    seen_cols_pairs[feat_b_name].append(feat_a_name)

                if self._has_removed(feat_a_name, feat_b_name, removed_cols):
                    continue

                feat_b = input_df[feat_b_name]
                # Pearson correlation between the two columns.
                c = np.corrcoef(feat_a, feat_b)[0][1]

                if abs(c) > self._threshold:
                    counter += 1
                    # Keep feat_a, drop feat_b.
                    removed_cols.append(feat_b_name)
                    removed_cols_pairs[feat_a_name].append(feat_b_name)
                    print("{}: FEAT_A: {} FEAT_B: {} - Correlation: {}".format(
                        counter, feat_a_name, feat_b_name, c))

        # Persist the resume state for subsequent runs.
        save_pickle(removed_cols_pairs,
                    self.save_path / "removed_feats_pairs.pkl")
        save_pickle(seen_cols_pairs, self.save_path / "seen_feats_pairs.pkl")
        self._selected_cols = [
            col for col in org_cols if col not in set(removed_cols)
        ]