Пример #1
0
def _train_test_split_with_object(X,
                                  y,
                                  shuffle=True,
                                  random_state=None,
                                  stratify=None,
                                  **kwargs):
    """
    Split ``X``/``y`` into train and test parts when ``y`` holds object labels.

    cuml.train_test_split raises an exception if y.dtype == 'object', so the
    target is label-encoded before the split and decoded again afterwards.

    Parameters
    ----------
    X, y
        Features and target; handed to ``train_test_split`` (``y`` in its
        encoded form).
    shuffle, random_state
        Forwarded unchanged to ``train_test_split``.
    stratify
        Optional stratification labels. When it is the very same object as
        ``y`` the encoded target is reused; when it is another object-dtype
        series it is encoded with its own ``LabelEncoder``; otherwise it is
        forwarded untouched.
    **kwargs
        Any further keyword arguments for ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where both target parts carry
        the original labels and the index produced by the split.
    """
    target_encoder = LabelEncoder()
    encoded_y = target_encoder.fit_transform(y)

    if stratify is y:
        # Same object as the target: reuse the codes we already computed.
        stratify = encoded_y
    elif stratify is not None:
        if str(stratify.dtype) == 'object':
            stratify = LabelEncoder().fit_transform(stratify)

    X_train, X_test, y_train_codes, y_test_codes = train_test_split(
        X,
        encoded_y,
        shuffle=shuffle,
        random_state=random_state,
        stratify=stratify,
        **kwargs)

    def _decode(codes):
        # Map codes back to labels and keep the row index the split assigned.
        labels = target_encoder.inverse_transform(codes)
        labels.index = codes.index
        return labels

    return X_train, X_test, _decode(y_train_codes), _decode(y_test_codes)
    def create_features(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
    ):
        """Build label-encoded categorical combination features.

        Train and test are stacked, every 2-, 3- and 4-way combination of the
        ``cat_cols`` columns is generated with ``xfeat.ConcatCombination``,
        each combined column is label-encoded and cast to ``category``, and
        the newly created columns are split back into ``self.train`` and
        ``self.test``.

        Args:
            train_df: Training frame; copied, never modified.
            test_df: Test frame; copied, never modified.
            NOTE(review): both are annotated as ``pd.DataFrame`` but are fed
            to ``cudf.concat`` -- confirm which frame type callers pass.

        ``cat_cols`` is not a parameter; it is read from an enclosing scope
        that is not visible in this block.

        Returns:
            None. Results are stored on ``self.train`` and ``self.test``.
        """
        with timer("load data"):
            train_part = train_df.copy()
            test_part = test_df.copy()
            n_train_rows = len(train_part)
            original_columns = train_part.columns.tolist()

        with timer("concat train and test"):
            # reset_index() keeps the stacked row position as a column
            # (sorted on as "index" below) so row order can be restored.
            total = cudf.concat(
                [train_part, test_part], ignore_index=True).reset_index()
            del train_part, test_part
            gc.collect()

        with timer("combi cats"):
            combination_frames = []
            for r in (2, 3, 4):
                combiner = xfeat.ConcatCombination(drop_origin=True, r=r)
                categorical_text = total[cat_cols].astype(str).fillna("none")
                combination_frames.append(
                    combiner.fit_transform(categorical_text))
            new_cat_df = cudf.concat(combination_frames, axis="columns")

            for name in new_cat_df.columns:
                codes = LabelEncoder().fit_transform(new_cat_df[name])
                new_cat_df[name] = codes.astype("category")

            total = cudf.concat([total, new_cat_df], axis="columns")

        with timer("end"):
            total = total.sort_values("index")

            # Everything that was neither an input column nor the helper
            # "index" column counts as a newly engineered feature.
            excluded = set(original_columns) | {"index"}
            new_cols = [name for name in total.columns if name not in excluded]

            engineered = total[new_cols]
            self.train = engineered.iloc[:n_train_rows].reset_index(drop=True)
            self.test = engineered.iloc[n_train_rows:].reset_index(drop=True)