Example #1
import numpy as np

def _add_fold(s, kfold, fold_seed=None):
    """Deterministically computes a '__fold__' column, given an optional
    random seed"""
    # Smallest integer dtype that can hold the fold labels
    typ = np.min_scalar_type(kfold * 2)
    if fold_seed is None:
        # If we don't have a specific seed,
        # just use a simple modulo-based mapping
        fold = _arange(len(s), like_df=s, dtype=typ)
        np.mod(fold, kfold, out=fold)
        return fold
    else:
        state = _random_state(fold_seed, like_df=s)
        return state.choice(_arange(kfold, like_df=s, dtype=typ), len(s))
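The _arange and _random_state helpers dispatch to NumPy or CuPy depending on whether the incoming frame is pandas or cuDF. A minimal pure-NumPy sketch of the same logic (the name add_fold_cpu is ours, not part of the library):

import numpy as np

def add_fold_cpu(n_rows, kfold, fold_seed=None):
    # Smallest integer dtype that can hold the fold labels
    typ = np.min_scalar_type(kfold * 2)
    if fold_seed is None:
        # No seed: cycle 0, 1, ..., kfold-1 over the rows
        return np.arange(n_rows, dtype=typ) % kfold
    # Seeded: draw fold ids at random, reproducibly
    state = np.random.RandomState(fold_seed)
    return state.choice(np.arange(kfold, dtype=typ), n_rows)

add_fold_cpu(6, 3)                # array([0, 1, 2, 0, 1, 2], dtype=uint8)
add_fold_cpu(6, 3, fold_seed=42)  # same array on every call with this seed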
Example #2
    def transform(self, col_selector: ColumnSelector,
                  df: DataFrameType) -> DataFrameType:
        new_df = type(df)()
        tmp = "__tmp__"  # Temporary column for sorting
        df[tmp] = _arange(len(df), like_df=df, dtype="int32")

        cat_names, multi_col_group = nvt_cat._get_multicolumn_names(
            col_selector, df.columns, self.name_sep)

        _read_pq_func = _read_parquet_dispatch(df)
        for name in cat_names:
            storage_name = self.storage_name.get(name, name)
            name = multi_col_group.get(name, name)
            path = self.categories[storage_name]
            selection_l = list(name) if isinstance(name, tuple) else [name]
            selection_r = list(name) if isinstance(name,
                                                   tuple) else [storage_name]

            stat_df = nvt_cat._read_groupby_stat_df(path, storage_name,
                                                    self.cat_cache,
                                                    _read_pq_func)
            tran_df = df[selection_l + [tmp]].merge(stat_df,
                                                    left_on=selection_l,
                                                    right_on=selection_r,
                                                    how="left")
            tran_df = tran_df.sort_values(tmp)
            tran_df.drop(columns=selection_l + [tmp], inplace=True)
            new_cols = [c for c in tran_df.columns if c not in new_df.columns]
            new_part = tran_df[new_cols].reset_index(drop=True)
            new_df = _concat_columns([new_df, new_part])
        df.drop(columns=[tmp], inplace=True)
        return new_df
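The __tmp__ column exists because a left merge is free to reorder rows (pandas happens to preserve the left frame's order for how="left", but cuDF and Dask make no such guarantee). Tagging each row with its position lets the result be sorted back afterwards. A toy pandas illustration with made-up data:

import pandas as pd

left = pd.DataFrame({"cat": ["b", "a", "b", "c"]})
stats = pd.DataFrame({"cat": ["a", "b"], "cat_count": [10, 20]})

left["__tmp__"] = range(len(left))  # remember the original row order
out = left.merge(stats, on="cat", how="left")
out = out.sort_values("__tmp__")    # undo any reordering from the merge
out = out.drop(columns=["cat", "__tmp__"]).reset_index(drop=True)
# out["cat_count"] is row-aligned with the input: [20.0, 10.0, 20.0, NaN]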
Example #3
    def transform(self, columns: ColumnNames, df: DataFrameType) -> DataFrameType:
        # Add temporary column for sorting
        tmp = "__tmp__"
        df[tmp] = _arange(len(df), like_df=df, dtype="int32")

        fit_folds = self.kfold > 1
        if fit_folds:
            df[self.fold_name] = _add_fold(df.index, self.kfold, self.fold_seed)

        # Need the mean of the continuous target column
        y_mean = self.target_mean or self.means

        # Loop over categorical-column groups and apply logic
        new_df = None
        for ind, cat_group in enumerate(columns):
            if isinstance(cat_group, tuple):
                cat_group = list(cat_group)
            elif isinstance(cat_group, str):
                cat_group = [cat_group]

            if new_df is None:
                new_df = self._op_group_logic(cat_group, df, y_mean, fit_folds, ind)
            else:
                _df = self._op_group_logic(cat_group, df, y_mean, fit_folds, ind)
                new_df = _concat_columns([new_df, _df])

        # Drop temporary columns
        drop_cols = [tmp, self.fold_name] if fit_folds and self.drop_folds else [tmp]
        df.drop(columns=drop_cols, inplace=True)
        if fit_folds and not self.drop_folds:
            new_df[self.fold_name] = df[self.fold_name]
        return new_df
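_op_group_logic (not shown) merges the per-category target statistics for one column group back onto the rows. A conceptual pandas sketch of out-of-fold target encoding, without the smoothing the real op applies (names and data here are illustrative):

import pandas as pd

df = pd.DataFrame({
    "cat":    ["a", "a", "b", "b", "a", "b"],
    "target": [1.0, 0.0, 1.0, 1.0, 1.0, 0.0],
    "fold":   [0, 1, 0, 1, 0, 1],
})
y_mean = df["target"].mean()  # global fallback for unseen categories

encoded = pd.Series(index=df.index, dtype="float64")
for fold in df["fold"].unique():
    mask = df["fold"] == fold
    # Statistics come only from the other folds, so a row never sees
    # its own target value (avoids target leakage)
    means = df[~mask].groupby("cat")["target"].mean()
    encoded[mask] = df.loc[mask, "cat"].map(means).fillna(y_mean)
df["TE_cat"] = encoded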
Example #4
    def transform(self, col_selector: ColumnSelector,
                  df: DataFrameType) -> DataFrameType:
        self.cpu = isinstance(df, pd.DataFrame)
        tmp = "__tmp__"  # Temporary column for sorting
        df[tmp] = _arange(len(df), like_df=df, dtype="int32")
        new_df = self._merge(df, self._ext)
        new_df = new_df.sort_values(tmp)
        new_df.drop(columns=[tmp], inplace=True)
        df.drop(columns=[tmp], inplace=True)
        new_df.reset_index(drop=True, inplace=True)
        return new_df
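The isinstance(df, pd.DataFrame) check records whether this op is running on the CPU (pandas) or GPU (cuDF) backend; the type(df)() constructions in the other examples serve the same purpose, keeping intermediate frames on the input's backend. A small sketch of the pattern:

import pandas as pd

def same_backend_empty(df):
    # type(df) is pd.DataFrame on CPU and cudf.DataFrame on GPU,
    # so the result stays on whatever device the input lives on
    return type(df)()

df = pd.DataFrame({"a": [1, 2]})
on_cpu = isinstance(df, pd.DataFrame)  # True here; False for a cuDF frame
assert isinstance(same_backend_empty(df), pd.DataFrame)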
Example #5
def _encode(
    name,
    storage_name,
    path,
    df,
    cat_cache,
    na_sentinel=-1,
    freq_threshold=0,
    search_sorted=False,
    buckets=None,
    encode_type="joint",
    cat_names=None,
    max_size=0,
):
    if isinstance(buckets, int):
        buckets = {name: buckets for name in cat_names}
    # this is to apply freq_hashing logic
    if max_size:
        freq_threshold = 1
    value = None
    # Key columns for the left/right sides of the merge against the
    # persisted category table
    selection_l = name if isinstance(name, list) else [name]
    selection_r = name if isinstance(name, list) else [storage_name]
    list_col = _is_list_col(selection_l, df)
    if path:
        read_pq_func = _read_parquet_dispatch(df)
        if cat_cache is not None:
            cat_cache = (
                cat_cache if isinstance(cat_cache, str) else cat_cache.get(storage_name, "disk")
            )
            if len(df):
                with get_worker_cache("cats") as cache:
                    value = fetch_table_data(
                        cache,
                        path,
                        columns=selection_r,
                        cache=cat_cache,
                        cats_only=True,
                        reader=read_pq_func,
                    )
        else:
            value = read_pq_func(path, columns=selection_r)
            value.index.name = "labels"
            value.reset_index(drop=False, inplace=True)

    if value is None:
        # No persisted categories; build a one-row null table so the
        # merge below still works
        value = type(df)()
        for c in selection_r:
            typ = df[selection_l[0]].dtype if len(selection_l) == 1 else df[c].dtype
            value[c] = df._constructor_sliced([None], dtype=typ)
        value.index.name = "labels"
        value.reset_index(drop=False, inplace=True)

    if not search_sorted:
        if list_col:
            codes = _flatten_list_column(df[selection_l[0]])
            codes["order"] = _arange(len(codes), like_df=df)
        else:
            codes = type(df)({"order": _arange(len(df), like_df=df)}, index=df.index)
            for c in selection_l:
                codes[c] = df[c].copy()
        if buckets and storage_name in buckets:
            # Note: na_sentinel is reused here to hold the per-row hashed
            # bucket ids rather than a scalar sentinel
            na_sentinel = _hash_bucket(df, buckets, selection_l, encode_type=encode_type)
        # apply frequency hashing
        if freq_threshold and buckets and storage_name in buckets:
            merged_df = codes.merge(
                value, left_on=selection_l, right_on=selection_r, how="left"
            ).sort_values("order")
            merged_df.reset_index(drop=True, inplace=True)
            max_id = merged_df["labels"].max()
            merged_df["labels"].fillna(
                df._constructor_sliced(na_sentinel + max_id + 1), inplace=True
            )
            labels = merged_df["labels"].values
        # only do hashing
        elif buckets and storage_name in buckets:
            labels = na_sentinel
        # no hashing
        else:
            na_sentinel = 0
            labels = codes.merge(
                value, left_on=selection_l, right_on=selection_r, how="left"
            ).sort_values("order")["labels"]
            labels.fillna(na_sentinel, inplace=True)
            labels = labels.values
    else:
        # Use `searchsorted` if we are using a "full" encoding
        if list_col:
            labels = value[selection_r].searchsorted(
                df[selection_l[0]].list.leaves, side="left", na_position="first"
            )
        else:
            labels = value[selection_r].searchsorted(
                df[selection_l], side="left", na_position="first"
            )
        labels[labels >= len(value[selection_r])] = na_sentinel

    if list_col:
        labels = _encode_list_column(df[selection_l[0]], labels)

    return labels
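Two pandas sketches of the core lookup, against a hand-built category table in which label 0 is reserved for nulls and out-of-vocabulary values (data is made up):

import numpy as np
import pandas as pd

# Category table as fit would persist it: row 0 is the null/OOV slot
value = pd.DataFrame({"cat": [None, "a", "b", "c"]})
value.index.name = "labels"
value.reset_index(drop=False, inplace=True)

df = pd.DataFrame({"cat": ["b", "a", "d", "b"]})

# Merge path (search_sorted=False): tag rows with "order" so the left
# merge can be sorted back, then send unseen values to sentinel 0
codes = pd.DataFrame({"order": np.arange(len(df)), "cat": df["cat"]})
merged = codes.merge(value, on="cat", how="left").sort_values("order")
labels = merged["labels"].fillna(0).astype("int64").to_numpy()  # [2 1 0 2]

# searchsorted path (search_sorted=True): assumes a complete, sorted
# category list; shift by one so label 0 stays the null/OOV sentinel
uniques = np.array(["a", "b", "c"])
ss = np.searchsorted(uniques, np.array(["b", "a", "d", "b"])) + 1
ss[ss > len(uniques)] = 0  # "d" falls past the table -> sentinel 0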