Exemplo n.º 1
0
def test_target_encoder_with_categorical_values(dataframes):
    """Categorical frames get per-fold target-encoded ``*_te`` columns."""
    expected_col1 = np.array([0.0, 0.0, 0.0, 0.66666667, 1.0, 1.0, 1.0])

    for frame in dataframes:
        splitter = KFold(n_splits=2, shuffle=False)
        encoder = TargetEncoder(input_cols=["col1", "col2"], fold=splitter)
        encoded = encoder.fit_transform(frame)

        # The fold object and one fitted encoder per input column are kept.
        assert encoder.fold.get_n_splits() == 2
        assert sorted(encoder._target_encoders.keys()) == ["col1", "col2"]

        assert allclose(encoded["col1_te"], expected_col1)

        # The source frame is untouched; the result gains the *_te columns.
        assert frame.columns.tolist() == ["col1", "col2", "target"]
        assert encoded.columns.tolist() == [
            "col1",
            "col2",
            "target",
            "col1_te",
            "col2_te",
        ]
Exemplo n.º 2
0
    def transform(self, X_train, y_train, X_test):
        """Fit a target encoder on the train split and encode ``X_test``.

        Returns a frame holding only the newly generated encoded columns
        (every pre-existing ``X_test`` column is dropped).
        """
        X_train = X_train.copy()
        X_test = X_test.copy()

        if self.group_cols:
            # Grouping appends combined-key columns to both frames; the new
            # keys also become encoder inputs.
            X_train, _ = self._grouping(X_train, self.group_cols)
            X_test, new_keys = self._grouping(X_test, self.group_cols)
            self.cols.extend(new_keys)

        train_frame = pd.concat([X_train, y_train], axis=1)
        original_cols = X_test.columns

        self.encoder = TargetEncoder(
            input_cols=self.cols,
            target_col=self.target,
            fold=self.splitter,
            output_suffix=f"_TE_{self.target}",
        )
        self.encoder.fit_transform(train_frame)

        # Keep only the columns the encoder appended.
        return self.encoder.transform(X_test).drop(columns=original_cols)
Exemplo n.º 3
0
def main():
    """Benchmark TargetEncoder on pandas vs. cuDF over growing sample sizes.

    Writes the timing records to ``./benchmark_target_encoding.csv``.
    """
    key_nunique = 5_000
    n_samples = 1_000_000

    def measure(frame):
        # Five timed runs of a fresh 5-fold TargetEncoder on `frame`.
        timings = []
        for _ in range(5):
            start = time.process_time()
            fold = KFold(n_splits=5, shuffle=False)
            TargetEncoder(input_cols=["col"], fold=fold).fit_transform(frame)
            timings.append(time.process_time() - start)
        return timings

    records = []
    for i in range(1, 8):
        size = n_samples * i
        # NOTE(review): randint(-1, 1, ...) yields values in {-1, 0};
        # confirm that is the intended target distribution.
        df = pd.DataFrame({
            "target":
            np.random.randint(-1, 1, size),
            "col":
            np.random.randint(0, key_nunique, size),
        })
        df_cudf = cudf.from_pandas(df)

        for frame, method in ((df, "CPU-pandas"), (df_cudf, "GPU-cuDF")):
            timings = measure(frame)
            records.append({
                "n_samples": size,
                "n_unique_keys": key_nunique,
                "process_time_mean": np.mean(timings),
                "process_time_std": np.std(timings),
                "method": method,
            })
            print(records[-1])

    pd.DataFrame(records).to_csv("./benchmark_target_encoding.csv",
                                 index=False)
Exemplo n.º 4
0
def xfeat_target_encoding(target_col: str, input_df: pd.DataFrame,
                          output_filename: str) -> None:
    """Target-encode rows with a known target and apply to the rest.

    Rows where ``target_col`` is non-null form the training split; rows
    where it is null are treated as test data.  The concatenated result
    (minus the target column) is written to ``output_filename`` as feather.
    """
    has_target = input_df[target_col].notnull()
    train_df = input_df[has_target].copy()
    test_df = input_df[~has_target].copy()

    splitter = KFold(n_splits=5, shuffle=True, random_state=111)
    encoder = TargetEncoder(fold=splitter, target_col=target_col,
                            output_suffix="")
    train_df = encoder.fit_transform(train_df)
    test_df = encoder.transform(test_df)

    combined = pd.concat([train_df, test_df], sort=False)
    combined = combined.drop(target_col, axis=1).reset_index(drop=True)
    combined.to_feather(output_filename)
Exemplo n.º 5
0
def test_target_encoder(dataframes_targetencoder):
    """fit_transform encodes train frames; transform handles unseen frames."""
    for train_frame, test_frame in dataframes_targetencoder:
        splitter = KFold(n_splits=2, shuffle=False)
        encoder = TargetEncoder(input_cols=["col1", "col2"], fold=splitter)

        encoded = encoder.fit_transform(train_frame)
        assert allclose(
            encoded["col1_te"],
            np.array([0.0, 0.0, 0.0, 0.66666667, 1.0, 1.0, 1.0]),
        )
        assert encoded.columns.tolist() == [
            "col1",
            "col2",
            "target",
            "col1_te",
            "col2_te",
        ]
        # The source frame is not modified in place.
        assert train_frame.columns.tolist() == [
            "col1",
            "col2",
            "target",
        ]

        test_encoded = encoder.transform(test_frame)

        assert allclose(test_encoded["col1_te"],
                        np.array([0.333333, 0.833333]))
        assert allclose(test_encoded["col2_te"], np.array([0.5, 0.5]))
        assert test_encoded.columns.tolist() == [
            "col1",
            "col2",
            "col1_te",
            "col2_te",
        ]
        assert test_frame.columns.tolist() == [
            "col1",
            "col2",
        ]
Exemplo n.º 6
0
    def fit_transform(self, input_df, y):
        """Out-of-fold target encoding driven by a precomputed ``fold`` column.

        For each fold id the encoder is fitted on the remaining folds and
        applied to the held-out rows; only the generated columns survive in
        the returned frame, restored to the original row order.
        """
        input_df = input_df.copy()
        num_fold = input_df["fold"].nunique()
        result = pd.DataFrame()

        if self.group_cols:
            input_df, new_keys = self._grouping(input_df, self.group_cols)
            self.cols.extend(new_keys)
        # Columns present before encoding; dropped from the output later.
        original_cols = input_df.columns.append(y.columns)

        self.encoder = TargetEncoder(
            input_cols=self.cols,
            target_col=self.target,
            fold=self.splitter,
            output_suffix=f"_TE_{self.target}",
        )

        for fold_id in range(num_fold):
            in_train = input_df["fold"] != fold_id
            train_rows = input_df[in_train]
            valid_rows = input_df[~in_train]

            # NOTE(review): .iloc with the frame's index assumes a default
            # positional RangeIndex on input_df — confirm with callers.
            train_frame = pd.concat([train_rows, y.iloc[train_rows.index]],
                                    axis=1)
            valid_frame = pd.concat([valid_rows, y.iloc[valid_rows.index]],
                                    axis=1)

            self.encoder.fit_transform(train_frame)
            result = pd.concat(
                [result, self.encoder.transform(valid_frame)], axis=0)

        return result.sort_index().drop(columns=original_cols)
Exemplo n.º 7
0
            LabelEncoder(output_suffix=''),
            ConcatCombination(drop_origin=True, r=2),
            LabelEncoder(output_suffix=''),
        ],
        input_df=train[categorical_cols],
        output_filename=
        '../input/petfinder-adoption-prediction/ConcatCombinationR2.ftr')

    # ConcatCombination r=2 & CountEncoder
    # Label-encode the categoricals, build pairwise concatenated column
    # combinations, then count-encode the combined keys.
    xfeat_runner(
        pipelines=[
            LabelEncoder(output_suffix=''),
            ConcatCombination(drop_origin=True, r=2),
            CountEncoder(),
        ],
        input_df=train[categorical_cols],
        output_filename=
        '../input/petfinder-adoption-prediction/ConcatCombinationCountEncoder.ftr'
    )

    # TargetEncoder
    # Out-of-fold target encoding with a fixed-seed 5-fold split for
    # reproducibility; the target column must ride along in input_df.
    xfeat_runner(pipelines=[
        TargetEncoder(
            fold=KFold(n_splits=5, shuffle=True, random_state=7),
            target_col=target_col,
        ),
    ],
                 input_df=train[categorical_cols + [target_col]],
                 output_filename=
                 '../input/petfinder-adoption-prediction/TargetEncoder.ftr')
Exemplo n.º 8
0
class TargetEncodingBlock(BaseBlock):
    """Target-encoding feature block built on xfeat's ``TargetEncoder``.

    Optionally builds combined grouping keys (string concatenations of
    several categorical columns) and target-encodes both the base columns
    and the grouping keys with out-of-fold statistics.
    """

    def __init__(self, cols, group_cols, target, splitter):
        """Store configuration; the encoder itself is created lazily.

        Args:
            cols: base column names to target-encode.
            group_cols: optional iterable of column groups (each a list of
                column names) to combine into grouping keys, or None.
            target: name of the target column.
            splitter: fold splitter forwarded to ``TargetEncoder``.
        """
        self.cols = cols
        # The original `x if x is not None else None` was a no-op;
        # assign the arguments directly.
        self.group_cols = group_cols
        self.target = target
        self.splitter = splitter
        self.encoder = None

    def fit_transform(self, input_df, y):
        """Out-of-fold encode ``input_df`` using its precomputed "fold" column.

        Returns a frame containing only the generated ``*_TE_<target>``
        columns, restored to the original row order.
        """
        input_df = input_df.copy()
        num_fold = input_df["fold"].nunique()
        output_df = pd.DataFrame()

        if self.group_cols:
            input_df, group_keys = self._grouping(input_df, self.group_cols)
            df_cols = input_df.columns.append(y.columns)
            self._extend_cols(group_keys)
        else:
            df_cols = input_df.columns.append(y.columns)

        self.encoder = TargetEncoder(
            input_cols=self.cols,
            target_col=self.target,
            fold=self.splitter,
            output_suffix=f"_TE_{self.target}",
        )

        for fold_id in range(num_fold):
            X_train_fold = input_df[input_df["fold"] != fold_id]
            X_valid_fold = input_df[input_df["fold"] == fold_id]

            # NOTE(review): .iloc with the frame's index assumes a default
            # positional RangeIndex on input_df — confirm with callers.
            y_train_fold = y.iloc[X_train_fold.index]
            y_valid_fold = y.iloc[X_valid_fold.index]

            _train_df = pd.concat([X_train_fold, y_train_fold], axis=1)
            _valid_df = pd.concat([X_valid_fold, y_valid_fold], axis=1)

            self.encoder.fit_transform(_train_df)
            _valid_df = self.encoder.transform(_valid_df)

            output_df = pd.concat([output_df, _valid_df], axis=0)

        output_df = output_df.sort_index()
        output_df = output_df.drop(columns=df_cols)

        return output_df

    def transform(self, X_train, y_train, X_test):
        """Fit on (X_train, y_train) and encode ``X_test``.

        Returns only the newly generated encoded columns for ``X_test``.
        """
        X_train = X_train.copy()
        X_test = X_test.copy()

        if self.group_cols:
            X_train, _ = self._grouping(X_train, self.group_cols)
            X_test, group_keys = self._grouping(X_test, self.group_cols)
            _train_df = pd.concat([X_train, y_train], axis=1)
            df_cols = X_test.columns
            self._extend_cols(group_keys)
        else:
            _train_df = pd.concat([X_train, y_train], axis=1)
            df_cols = X_test.columns

        self.encoder = TargetEncoder(
            input_cols=self.cols,
            target_col=self.target,
            fold=self.splitter,
            output_suffix=f"_TE_{self.target}",
        )

        self.encoder.fit_transform(_train_df)

        output_df = self.encoder.transform(X_test)
        output_df = output_df.drop(columns=df_cols)

        return output_df

    def _extend_cols(self, group_keys):
        # Bug fix: an unconditional list.extend duplicated the grouping keys
        # in self.cols whenever fit_transform / transform was called more
        # than once on the same block instance.
        self.cols.extend(key for key in group_keys if key not in self.cols)

    def _grouping(self, input_df, group_cols):
        """Add one concatenated string key column per column group.

        Generalized from the original 2-/3-column special cases: any group
        length now works.  (The original silently skipped other lengths,
        appending a key whose column was never created.)
        """
        group_keys = []
        for cols in group_cols:
            key = "grouping_" + "_and_".join(cols)
            if cols:
                combined = input_df[cols[0]].astype(str)
                for col in cols[1:]:
                    combined = combined + "_" + input_df[col].astype(str)
                input_df[key] = combined
            group_keys.append(key)

        return input_df, group_keys
        X_train, X_valid, X_test = extract_text_tfidf(X_train, X_valid, X_test)

    # Count-encode the sparse categorical source columns; the encoder is
    # fitted on train and re-applied to valid / test.
    with t.timer('sparse count encording'):
        encoder = CountEncoder(input_cols=count_encording_source)
        X_train = encoder.fit_transform(X_train)
        X_valid = encoder.transform(X_valid)
        X_test = encoder.transform(X_test)

    # Target-encode the sparse source columns with an out-of-fold KFold split.
    with t.timer('sparse target encording'):
        # Labels come from the solution file: one value per row, read as a
        # single header-less column and reshaped to (n, 1) for scaling.
        y_train = pd.read_csv(f'{INPUT_DIR}/train.solution',
                              header=None).T.values[0].reshape(-1, 1)
        scaler, y_train = label_scaling(y_train)
        X_train['target'] = y_train
        fold = KFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True)
        # NOTE(review): target_col is a list here but a plain string at the
        # other TargetEncoder call sites in this file — confirm both work.
        encoder = TargetEncoder(input_cols=target_encording_source,
                                target_col=['target'],
                                fold=fold)
        X_train = encoder.fit_transform(X_train)
        X_valid = encoder.transform(X_valid)
        X_test = encoder.transform(X_test)

    # Variable-length (multi-valued) sparse features use a dedicated
    # count-encoding helper, applied one feature column at a time.
    with t.timer('varlen sparse count encording'):
        for feat in varlen_count_encording_source:
            logging.info(f'[varlen sparse count encording]{feat}')
            X_train, X_valid, X_test = varlen_count_encording(
                feat, X_train, X_valid, X_test)

    with t.timer('varlen sparse target encording'):
        y_train = pd.read_csv(f'{INPUT_DIR}/train.solution',
                              header=None).T.values[0].reshape(-1, 1)
        scaler, y_train = label_scaling(y_train)