def test_multi_label_encoder(self):
    """Check MultiLabelEncoder output values and column dtypes.

    The same input frame is encoded once with ``dtype=np.int64`` and once
    with ``dtype=np.int32``; both runs must reproduce the expected codes
    and report the requested dtype for every column.
    """
    source = pd.DataFrame({"A": [1, 2, 3, 4], "B": ['a', 'a', 'a', 'b']})
    expected = pd.DataFrame({"A": [0, 1, 2, 3], "B": [0, 0, 0, 1]})

    for target_dtype in (np.int64, np.int32):
        encoder = skex.MultiLabelEncoder(dtype=target_dtype)
        # Encode a copy so each run starts from the untouched source frame.
        encoded = encoder.fit_transform(source.copy())

        # Count cells that differ from the expected codes.
        mismatches = np.where(expected.values == encoded.values, 0, 1).sum()
        assert mismatches == 0

        wanted_dtypes = pd.Series(dict(A=target_dtype, B=target_dtype))
        assert all(encoded.dtypes == wanted_dtypes)
def _categorical_encoding(self, X):
    """Label-encode the categorical columns of ``X``.

    Builds a ``skex.MultiLabelEncoder`` from the columns returned by
    ``self.get_categorical_columns()``, fits and applies it to ``X``, and
    stores the fitted encoder in ``self.X_transformers['label_encoder']``.

    Args:
        X: Data passed to the encoder's ``fit_transform``.

    Returns:
        The result of ``fit_transform`` on ``X``.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    logger.info('Categorical encoding...')

    # Named explicitly so the ``vars`` builtin is not shadowed.
    categorical_columns = self.get_categorical_columns()
    encoder = skex.MultiLabelEncoder(categorical_columns)
    X = encoder.fit_transform(X)
    self.X_transformers['label_encoder'] = encoder

    # Report the timing through the same logger as the start message
    # instead of writing to stdout.
    elapsed = time.perf_counter() - start
    logger.info(f'Categorical encoding taken {elapsed}s')
    return X
def test_func_transformer(self):
    """Check the output dtypes of a DataFrameMapper imputer + encoder pipeline.

    The mapper applies a constant-strategy ``SimpleImputer`` followed by a
    ``skex.MultiLabelEncoder`` to the ``column_object_category_bool``
    selection and requests a ``'category'`` output dtype for
    ``column_object``. The resulting frame must have three categorical
    columns with the expected category sets.
    """
    pipeline_steps = [
        SimpleImputer(strategy='constant'),
        skex.MultiLabelEncoder(),
    ]
    dtype_transforms = [(column_object, 'category')]
    mapper = DataFrameMapper(
        [(column_object_category_bool, pipeline_steps)],
        input_df=True,
        df_out=True,
        df_out_dtype_transforms=dtype_transforms,
    )

    X, y = get_df()
    transformed = mapper.fit_transform(X, y)

    expected_dtypes = [
        pd.CategoricalDtype(categories=[0, 1, 2], ordered=False),
        pd.CategoricalDtype(categories=[0, 1], ordered=False),
        pd.CategoricalDtype(categories=[0, 1, 2], ordered=False),
    ]
    assert transformed.dtypes.to_list() == expected_dtypes
def test_cache():
    """Compare CachedMultiLabelEncoder results against the plain encoder.

    After ``clear()``, the bank dataset is encoded once with
    ``skex.MultiLabelEncoder`` and twice with fresh
    ``CachedMultiLabelEncoder`` instances; all three outputs must hash
    identically. Two further instances are run through
    ``fit_transform_as_tuple_result``; each result must be a tuple or list
    and their second elements must hash identically.
    """
    clear()
    bank = dsutils.load_bank()

    # Reference output from the plain encoder.
    plain_encoder = skex.MultiLabelEncoder()
    plain_out = plain_encoder.fit_transform(bank.copy())

    # Two independent cached encoders over the same input.
    cached_a = CachedMultiLabelEncoder()
    cached_out_a = cached_a.fit_transform(bank.copy())
    cached_b = CachedMultiLabelEncoder()
    cached_out_b = cached_b.fit_transform(bank.copy())

    hasher = get_tool_box(bank).data_hasher()
    assert hasher(plain_out) == hasher(cached_out_a) == hasher(cached_out_b)

    # Same comparison for the tuple-returning entry point.
    tuple_a = CachedMultiLabelEncoder()
    tuple_out_a = tuple_a.fit_transform_as_tuple_result(bank.copy())
    tuple_b = CachedMultiLabelEncoder()
    tuple_out_b = tuple_b.fit_transform_as_tuple_result(bank.copy())

    for outcome in (tuple_out_a, tuple_out_b):
        assert isinstance(outcome, (tuple, list))
    assert hasher(tuple_out_a[1]) == hasher(tuple_out_b[1])
def as_local(self):
    """Return a ``sk_ex.MultiLabelEncoder`` carrying this encoder's state.

    ``columns`` and ``dtype`` are copied across unchanged; every value in
    ``self.encoders`` is converted through its own ``as_local()`` and
    stored under the same key.
    """
    local_encoder = sk_ex.MultiLabelEncoder()
    local_encoder.columns = self.columns
    local_encoder.dtype = self.dtype

    converted = {}
    for key, encoder in self.encoders.items():
        converted[key] = encoder.as_local()
    local_encoder.encoders = converted

    return local_encoder
def load_data():
    """Load the bank dataset and return it after MultiLabelEncoder encoding.

    Returns:
        The result of ``skex.MultiLabelEncoder().fit_transform`` applied to
        the frame returned by ``dsutils.load_bank()``.
    """
    bank = dsutils.load_bank()
    encoder = skex.MultiLabelEncoder()
    return encoder.fit_transform(bank)