def fit(self, X: XSeries, y: XSeries) -> None:
    """Fit one mean encoder per training fold of ``self.fold``.

    The fitted encoders are stored, in fold order, in
    ``self.mean_encoders_``.

    Args:
        X : Input values. Either a ``numpy.ndarray`` or, when cuDF is
            available, a ``cudf.Series``.
        y : Target values, indexed with the same fold indices as ``X``.

    Raises:
        RuntimeError: If ``X`` is neither of the supported types.
    """
    # TODO(smly): warn to use fit_transform instead of fit().
    # transform() is recommended for encoding test set.

    # The input type is fixed for the whole call, so pick the encoder
    # implementation once instead of re-checking inside the fold loop.
    if cudf_is_available() and isinstance(X, cudf.Series):
        encoder_cls = _CuPy_MeanEncoder
    elif isinstance(X, np.ndarray):
        X = column_or_1d(X, warn=True)
        y = column_or_1d(y, warn=True)
        encoder_cls = _MeanEncoder
    else:
        raise RuntimeError("Unexpected data type: {}".format(type(X)))

    self.mean_encoders_ = []

    # Only the training part of each split is needed for fitting; the
    # held-out part is consumed later by `fit_transform()`.
    for trn_idx, _ in self.fold.split(X):
        encoder = encoder_cls()
        encoder.fit(X[trn_idx], y[trn_idx])
        self.mean_encoders_.append(encoder)
def test_concat_combination(dataframes):
    """Check ConcatCombination output columns and values for each backend."""

    def column_values(frame, name):
        # cuDF columns are read back through Arrow; pandas columns directly.
        if cudf_is_available() and isinstance(frame, cudf.DataFrame):
            return frame[name].to_arrow().to_pylist()
        return frame[name].tolist()

    # Default settings: originals kept, pairwise columns get "_combi".
    for df in dataframes:
        df_encoded = ConcatCombination().fit_transform(df)
        expected_cols = [
            "col1",
            "col2",
            "col3",
            "col1col2_combi",
            "col1col3_combi",
            "col2col3_combi",
        ]
        assert df_encoded.columns.tolist() == expected_cols
        assert column_values(df_encoded, "col1col3_combi") == ["aX", "bY"]

    # No suffix and originals dropped: only the pairwise columns remain.
    for df in dataframes:
        encoder = ConcatCombination(output_suffix="", drop_origin=True)
        df_encoded = encoder.fit_transform(df)
        expected_cols = ["col1col2", "col1col3", "col2col3"]
        assert df_encoded.columns.tolist() == expected_cols
        assert column_values(df_encoded, "col2col3") == ["@X", "%Y"]

    # r=3: a single three-way combination column.
    for df in dataframes:
        encoder = ConcatCombination(output_suffix="", drop_origin=True, r=3)
        df_encoded = encoder.fit_transform(df)
        assert df_encoded.columns.tolist() == ["col1col2col3"]
        assert column_values(df_encoded, "col1col2col3") == ["a@X", "b%Y"]
def transform(self, X):
    """Replace each value of a 1-d input with its entry from ``self.lut_``.

    Values not found in ``self.classes_`` (and not NaN) are first replaced
    by ``max(self.classes_)``; NaN positions receive
    ``self._default_missing``. The array is modified in place and returned.
    """
    check_is_fitted(self, "classes_")

    if cudf_is_available() and isinstance(X, cudf.Series):
        X = X.to_array()
    X = column_or_1d(X, warn=True)

    # Label encoding if necessary
    uniques = self._label_encoding_uniques
    if uniques is not None:
        X = uniques.get_indexer(pd.Series(X))

    is_missing = np.isnan(X)
    is_present = np.invert(is_missing)
    not_in_classes = np.isin(X, self.classes_, invert=True)
    is_unseen = np.bitwise_xor(not_in_classes, is_missing)

    # Redirect unseen values to the largest class before the lookup.
    X[is_unseen] = np.max(self.classes_)

    lut_keys = self.lut_[:, 0]
    lut_values = self.lut_[:, 1]
    class_positions = np.searchsorted(self.classes_, X[is_present])
    lut_rows = np.take(np.searchsorted(lut_keys, self.classes_), class_positions)
    X[is_present] = np.take(lut_values, lut_rows)

    if np.any(is_missing):
        X[is_missing] = self._default_missing
    return X
def transform(self, X: XSeries) -> XSeries:
    """Encode ``X`` with every fitted fold encoder and average the results.

    Args:
        X : Input values to encode.

    Returns:
        Row-wise mean over the per-fold encodings.
    """
    check_is_fitted(self, "mean_encoders_")

    # Encoding for testing part. Different result from `fit_transform()`
    # result.
    on_gpu = cudf_is_available() and isinstance(X, cudf.Series)
    array_module = cupy if on_gpu else np

    n_splits = self.fold.get_n_splits()
    per_fold = array_module.zeros((X.shape[0], n_splits))
    for fold_idx, mean_encoder in enumerate(self.mean_encoders_):
        per_fold[:, fold_idx] = mean_encoder.transform(X)
    return np.mean(per_fold, axis=1)
def transform(self, input_df: XDataFrame) -> XDataFrame:
    """Transform data frame.

    For every selected column, an output column named
    ``output_prefix + col + output_suffix`` is added, holding the positions
    returned by ``self._uniques[col].get_indexer``.

    Args:
        input_df (XDataFrame): Input data frame.

    Returns:
        XDataFrame : Copy of the input with the encoded columns added.
    """
    new_df = input_df.copy()

    input_cols = self._input_cols
    if not input_cols:
        input_cols = new_df.columns.tolist()
    if self._exclude_cols:
        # Build a filtered copy rather than calling `list.remove()`:
        # removing in place would edit `self._input_cols` itself, making a
        # second `transform()` call fail, and would raise for an excluded
        # column that is not among the inputs.
        input_cols = [col for col in input_cols if col not in self._exclude_cols]

    for col in input_cols:
        out_col = self._output_prefix + col + self._output_suffix
        uniques = self._uniques[col]
        if cudf_is_available() and isinstance(new_df, cudf.DataFrame):
            X = uniques.get_indexer(new_df[col].to_array())
        else:
            X = uniques.get_indexer(new_df[col])

        if self._unseen == "n_unique":
            # Rows flagged by exactly one of "is missing" / "not in uniques"
            # are assigned the number of known uniques.
            missing_values = new_df[col].isna()
            unseen_values = np.invert(new_df[col].isin(uniques))
            unseen_mask = np.bitwise_xor(missing_values, unseen_values)
            X[unseen_mask] = len(uniques)

        new_df[out_col] = X

    return new_df
def fit_transform(self, X: XSeries, y: XSeries) -> XNDArray:
    """Fit the per-fold encoders and encode each fold's held-out rows.

    Args:
        X : Input values.
        y : Target values.

    Returns:
        XNDArray : One encoded value per row of ``X``; each row is encoded
        by the encoder of the split in which it was held out.
    """
    self.fit(X, y)
    check_is_fitted(self, "mean_encoders_")

    # Encoding for training data.
    if cudf_is_available() and isinstance(X, cudf.Series):
        array_module = cupy
    elif isinstance(X, np.ndarray):
        array_module = np
    else:
        raise RuntimeError

    encoded = array_module.zeros(X.shape[0])
    for idx, (_, tst_idx) in enumerate(self.fold.split(X)):
        fold_encoder = self.mean_encoders_[idx]
        encoded[tst_idx] = fold_encoder.transform(X[tst_idx])
    return encoded
def test_internal_target_encoder_with_cudf():
    """Exercise _TargetEncoder fit_transform/transform on cuDF input."""
    # NOTE(review): this guard returns whenever `cudf` or `cupy` is not
    # None, so the body appears to never run when the GPU stack is present.
    # It was probably meant to skip when they ARE None; confirm the body
    # still passes before changing the condition.
    if not (cudf_is_available() and cudf is None and cupy is None):
        # Skip test.
        return

    features = np.array([[2, 2], [2, 4], [2, 6], [8, 7], [8, 8], [8, 9], [8, 10]])
    X = cudf.Series(features)
    y = cudf.Series(np.array([1, 1, 0, 1, 1, 1, 0]))

    fold = KFold(n_splits=2, shuffle=False)
    _, first_tst_idx = next(fold.split(X))
    assert np.array_equal(first_tst_idx, np.array([0, 1, 2, 3]))

    encoder = _TargetEncoder(fold=fold)

    # Test `fit_transform()`.
    y_trn = encoder.fit_transform(X[:, 0], y)
    expected_trn = np.array([0.0, 0.0, 0.0, 0.66666667, 1.0, 1.0, 1.0])
    assert np.allclose(y_trn.values, expected_trn)

    X_tst = np.array([8, 0, 2])
    y_tst = encoder.transform(X_tst)
    expected_tst = np.array([0.83333334, 0., 0.33333334])
    assert np.allclose(y_tst.values, expected_tst)
def test_select_numerical_cudf(pandas_dataframe):
    """SelectNumerical on a cuDF frame keeps only the "num" column."""
    if not cudf_is_available():
        # Nothing to check without cuDF.
        return

    gpu_frame = cudf.from_pandas(pandas_dataframe)
    selected = SelectNumerical().fit_transform(gpu_frame)
    assert selected.columns.tolist() == ["num"]
def dataframes():
    """Return the sample frame as pandas and, if available, as cuDF."""
    frames = [pd.DataFrame({"var1": [1, 2, 3]})]
    if cudf_is_available():
        frames.append(cudf.from_pandas(frames[0]))
    return frames
def dataframes():
    """Return the sample frame as pandas and, if available, as cuDF."""
    columns = {
        "a": [1, 2, 3],
        "b": [1, 2, 3],
        "c": [1, 2, 3],
        "d": [1, 2, 9],
    }
    frames = [pd.DataFrame(columns)]
    if cudf_is_available():
        frames.append(cudf.from_pandas(frames[0]))
    return frames
def dataframes():
    """Return the single-column string frame as pandas and, if available, cuDF."""
    frames = [pd.DataFrame({"col": ["a", "a", "b"]})]
    if cudf_is_available():
        frames.append(cudf.from_pandas(frames[0]))
    return frames
def dataframes():
    """Return the mixed numeric/string frame as pandas and, if available, cuDF."""
    columns = {
        "a": [1, 2, 3, 4, 5],
        "b": ["a", "a", "a", "b", "b"],
        "c": [0, 0, 1, 1, 1],
    }
    frames = [pd.DataFrame(columns)]
    if cudf_is_available():
        frames.append(cudf.from_pandas(frames[0]))
    return frames
def dataframes():
    """Return a frame with "target" plus col0..col99, as pandas and, if available, cuDF."""
    # Collect every column first and build the frame in one go.
    columns = {"target": [1, 0, 0]}
    for idx in range(100):
        columns["col{}".format(idx)] = np.array([1, 2, 3])

    frames = [pd.DataFrame(columns)]
    if cudf_is_available():
        frames.append(cudf.from_pandas(frames[0]))
    return frames
def dataframes():
    """Return the categorical/numeric frame as pandas and, if available, cuDF."""
    columns = {"col": ["A", "B", "B"], "num": [1, 2, 3]}
    frames = [pd.DataFrame(columns)]
    if cudf_is_available():
        frames.append(cudf.from_pandas(frames[0]))
    return frames
def dataframes():
    """Return the target-encoding sample frame as pandas and, if available, cuDF."""
    columns = {
        "col1": ["2", "2", "2", "8", "8", "8", "8"],
        "col2": [2, 4, 6, 7, 8, 9, 10],
        "target": [1, 1, 0, 1, 1, 1, 0],
    }
    frames = [pd.DataFrame(columns)]
    if cudf_is_available():
        frames.append(cudf.from_pandas(frames[0]))
    return frames
def transform(self, input_df: XDataFrame) -> XDataFrame:
    """Apply ``self._lambda_func`` element-wise to the selected columns.

    cuDF input is converted to pandas for the computation and converted
    back before returning.

    Args:
        input_df (XDataFrame): Input data frame.

    Returns:
        XDataFrame : The generated columns only when ``self._drop_origin``
        is set, otherwise the frame with the generated columns added.

    Raises:
        RuntimeError: If ``input_df`` is neither a pandas nor a cuDF frame.
    """
    if isinstance(input_df, pd.DataFrame):
        work_df = input_df.copy()
    elif cudf_is_available() and isinstance(input_df, cudf.DataFrame):
        work_df = input_df.to_pandas()
    else:
        raise RuntimeError("Unexpected data type: {}".format(type(input_df)))

    selected = self._input_cols
    if not selected:
        selected = work_df.columns.tolist()
    if len(self._exclude_cols) > 0:
        selected = [name for name in selected if name not in self._exclude_cols]

    generated_cols = []
    for name in selected:
        target_name = self._output_prefix + name + self._output_suffix
        values = work_df[name]
        if self._fillna is not None:
            values = values.fillna(self._fillna)
        work_df[target_name] = values.apply(self._lambda_func)
        generated_cols.append(target_name)

    # Hand back the same frame flavour that was passed in.
    if cudf_is_available() and isinstance(input_df, cudf.DataFrame):
        work_df = cudf.from_pandas(work_df)

    return work_df[generated_cols] if self._drop_origin else work_df
def dataframes():
    """Return the three-column numeric frame as pandas and, if available, cuDF."""
    df = pd.DataFrame({
        "col1": [1, 2, 3, 4, 5],
        "col2": [2, 3, 4, 5, 6],
        "col3": [3, 4, 5, 6, 7],
    })

    # Both paths return here; the former trailing `return [df]` could
    # never be reached and has been removed.
    if cudf_is_available():
        df_cuda = cudf.from_pandas(df)
        return [df, df_cuda]
    return [df]
def dataframes_targetencoder():
    """Return (train, test) frame pairs as pandas and, if available, as cuDF."""
    train = pd.DataFrame({
        "col1": [2, 2, 2, 8, 8, 8, 8],
        "col2": [2, 4, 6, 7, 8, 9, 10],
        "target": [1, 1, 0, 1, 1, 1, 0],
    })
    test = pd.DataFrame({
        "col1": [2, 8],
        "col2": [2, 8],
    })

    pairs = [(train, test)]
    if cudf_is_available():
        pairs.append((cudf.from_pandas(train), cudf.from_pandas(test)))
    return pairs
def test_arithmetic_combinations(dataframes):
    """Check pairwise "+" combinations for each backend frame."""
    expected_cols = [
        "col1",
        "col2",
        "col3",
        "col1col2_plus",
        "col1col3_plus",
        "col2col3_plus",
    ]
    expected_sum = [5, 7, 9, 11, 13]

    for df in dataframes:
        encoder = ArithmeticCombinations(operator="+", output_suffix="_plus")
        df_new = encoder.fit_transform(df)
        assert df_new.columns.tolist() == expected_cols

        # cuDF columns are read back through Arrow; pandas columns directly.
        if cudf_is_available() and isinstance(df_new, cudf.DataFrame):
            actual_sum = df_new["col2col3_plus"].to_arrow().to_pylist()
        else:
            actual_sum = df_new["col2col3_plus"].tolist()
        assert actual_sum == expected_sum
def fit_transform(self, input_df: XDataFrame) -> XDataFrame:
    """Fit to data frame, then transform it.

    Columns are compared by transposing the frame and dropping duplicated
    rows (first occurrence kept); the surviving column labels are stored in
    ``self._selected_cols``.

    Args:
        input_df (XDataFrame): Input data frame.

    Returns:
        XDataFrame : ``input_df`` restricted to the selected columns.
    """
    # The duplicate search runs on a pandas frame; cuDF input is converted.
    if cudf_is_available() and isinstance(input_df, cudf.DataFrame):
        host_df = input_df.to_pandas()
    else:
        host_df = input_df

    deduplicated = host_df.T.drop_duplicates(keep="first")
    self._selected_cols = deduplicated.index.values.tolist()
    return input_df[self._selected_cols]
def fit(self, X, y=None):
    """Fit the count lookup table on a 1-d input.

    Args:
        X: Input values; a ``cudf.Series`` is converted to an array first.
        y: Unused.

    Returns:
        The fitted encoder (``self``).
    """
    if cudf_is_available() and isinstance(X, cudf.Series):
        X = X.to_array()
    X = column_or_1d(X, warn=True)

    # Drop uniques left over from an earlier fit so that refitting on
    # integer data does not leave `transform()` applying a stale mapping.
    self._label_encoding_uniques = None

    # Label encoding if necessary
    if not np.can_cast(X.dtype, np.int64):
        X, uniques = pd.Series(X).factorize()
        self._label_encoding_uniques = uniques

    self.classes_, self.counts_ = np.unique(X[np.isfinite(X)], return_counts=True)

    # Append one extra class (max + 1) carrying the default count for
    # unseen values.
    self.classes_ = np.append(self.classes_, [np.max(self.classes_) + 1])
    self.counts_ = np.append(self.counts_, [self._default_unseen])

    # Two-column lookup table: class value, count.
    self.lut_ = np.hstack(
        [self.classes_.reshape(-1, 1), self.counts_.reshape(-1, 1)])
    return self
def fit(self, X: CSeries, y: CSeries):
    """Compute the mean of ``y`` for every distinct value of ``X``.

    Sets ``classes_``, ``class_means_`` and ``lut_``; an extra class
    (max + 1) holding ``self.default_unseen_`` is appended at the end.

    Args:
        X (cupy.ndarray): Input cupy ndarray.
        y (cupy.ndarray): Target cupy ndarray. NOTE(review): the body
            asserts ``y`` is a ``cudf.Series`` - confirm the intended type.
    """
    # Label encoding if necessary
    if not cupy.can_cast(X.dtype, cupy.int):
        if cudf_is_available() and isinstance(X, cudf.Series):
            X = X.to_array()
        codes, uniques = pd.Series(cupy.asnumpy(X)).factorize()
        X = cudf.Series(codes)
        self._label_encoding_uniques = uniques

    self.classes_, _ = cupy.unique(X, return_counts=True)
    self.class_means_ = cupy.zeros_like(self.classes_, dtype="float64")

    assert isinstance(y, cudf.Series)
    frame = cudf.DataFrame()
    frame.insert(0, "X", X)
    frame.insert(0, "y", y.values)
    group_means = frame.groupby("X").agg("mean").to_pandas()

    # Copy each group's mean into the slot of the matching class.
    for slot, class_value in enumerate(self.classes_):
        key = cupy.asnumpy(class_value).item()
        self.class_means_[slot] = group_means.loc[key]["y"]

    # Append the extra "unseen" class and its default mean.
    unseen_class = cupy.asnumpy(cupy.max(self.classes_)) + 1
    self.classes_ = cupy.array(
        np.append(cupy.asnumpy(self.classes_), [unseen_class]))
    unseen_mean = cupy.asnumpy(self.default_unseen_)
    self.class_means_ = cupy.array(
        np.append(cupy.asnumpy(self.class_means_), [unseen_mean]))

    # Two-column lookup table: class value, class mean.
    self.lut_ = cupy.hstack(
        [self.classes_.reshape(-1, 1), self.class_means_.reshape(-1, 1)])
def test_cudf_is_available():
    """cudf_is_available() must report exactly whether `cudf` is not None."""
    expected = cudf is not None
    assert cudf_is_available() is expected
def test_internal_cupy_mean_encoder_fit_transform():
    """Exercise _CuPy_MeanEncoder on seen, unseen and missing values."""
    # NOTE(review): this guard returns whenever `cudf` or `cupy` is not
    # None, so the body appears to never run when the GPU stack is present.
    # It was probably meant to skip when they ARE None; confirm the body
    # still passes before changing the condition.
    if not (cudf_is_available() and cudf is None and cupy is None):
        # Skip test.
        return

    X = cupy.asarray(
        np.array([[2, 2], [2, 4], [2, 6], [8, 7], [8, 8], [8, 9], [8, 10]]))
    y = cupy.asarray(np.array([1, 1, 0, 0, 1, 1, 0]))
    col_idx = 0

    encoder = _CuPy_MeanEncoder()
    y_mean = encoder.fit_transform(X[:, col_idx], y)

    # 9 (max + 1) is assigned for unseen values.
    expected_classes = np.array([2, 8, 9])
    assert np.array_equal(cupy.asnumpy(encoder.classes_), expected_classes)

    # Class 2 -> 2/3, class 8 -> 2/4, extra class -> 0.0.
    expected_means = cupy.array([0.66666667, 0.5, 0.0])
    assert cupy.allclose(encoder.class_means_, expected_means)

    expected_encoded = cupy.array(
        [0.66666667, 0.66666667, 0.66666667, 0.5, 0.5, 0.5, 0.5])
    assert cupy.allclose(y_mean, expected_encoded)

    # Unseen values
    X_test = cupy.array([9, 1, 8, 2])
    y_mean = encoder.transform(X_test)
    expected_unseen = cupy.array([
        0.0,  # 9 = recognized as seen value since (max+1) is assigned for unseen value.
        0.0,  # 1 = unseen value
        0.5,  # 8 = 2/4
        0.66666667,  # 2 = 2/3
    ])
    assert cupy.allclose(y_mean, expected_unseen)

    # Missing value
    X_test = cupy.array([[cupy.nan, 2], [1, 1], [8, 4]])
    y_mean = encoder.transform(X_test[:, col_idx])
    expected_missing = cupy.array([
        0.0,  # NaN = missing value
        0.0,  # 1 = unseen value
        0.5,  # 8 = 2/4
    ])
    assert cupy.allclose(y_mean, expected_missing)