예제 #1
0
    def test_multi_label_encoder(self):
        """Check the codes and output dtype produced by ``skex.MultiLabelEncoder``.

        The same two-column frame is fitted once with ``dtype=np.int64`` and once
        with ``dtype=np.int32``. Each run must reproduce the expected codes and
        report the requested dtype for both columns.
        """
        source = pd.DataFrame({"A": [1, 2, 3, 4], "B": ['a', 'a', 'a', 'b']})
        expected = pd.DataFrame({"A": [0, 1, 2, 3], "B": [0, 0, 0, 1]})

        for dtype in (np.int64, np.int32):
            encoder = skex.MultiLabelEncoder(dtype=dtype)
            # Each run is handed its own copy of the input frame.
            encoded = encoder.fit_transform(source.copy())

            # No cell may differ from the expected codes.
            mismatches = np.count_nonzero(expected.values != encoded.values)
            assert mismatches == 0

            wanted_dtypes = pd.Series({"A": dtype, "B": dtype})
            assert all(encoded.dtypes == wanted_dtypes)
예제 #2
0
 def _categorical_encoding(self, X):
     """Label-encode the categorical columns of ``X`` and remember the encoder.

     The columns come from ``self.get_categorical_columns()``. The fitted
     ``skex.MultiLabelEncoder`` is stored in ``self.X_transformers`` under the
     key ``'label_encoder'``, and the elapsed wall-clock time is printed.

     :param X: data passed to the encoder's ``fit_transform``.
     :return: whatever ``fit_transform`` returns for ``X``.
     """
     start = time.time()
     logger.info('Categorical encoding...')

     categorical_columns = self.get_categorical_columns()
     encoder = skex.MultiLabelEncoder(categorical_columns)
     encoded = encoder.fit_transform(X)

     # Keep the fitted encoder on the instance under a fixed key.
     self.X_transformers['label_encoder'] = encoder

     print(f'Categorical encoding taken {time.time() - start}s')
     return encoded
예제 #3
0
 def test_func_transformer(self):
     """Impute then label-encode through a ``DataFrameMapper`` and check dtypes.

     The columns selected by ``column_object_category_bool`` go through
     ``SimpleImputer(strategy='constant')`` followed by
     ``skex.MultiLabelEncoder()``. The mapper is asked to convert the
     ``column_object`` columns to ``'category'``, and the resulting frame must
     expose exactly three unordered categorical dtypes.
     """
     steps = [
         SimpleImputer(strategy='constant'),
         skex.MultiLabelEncoder(),
     ]
     mapper = DataFrameMapper(
         [(column_object_category_bool, steps)],
         input_df=True,
         df_out=True,
         df_out_dtype_transforms=[(column_object, 'category')],
     )

     X, y = get_df()
     transformed = mapper.fit_transform(X, y)

     # One entry per output column, in column order.
     expected_dtypes = [
         pd.CategoricalDtype(categories=levels, ordered=False)
         for levels in ([0, 1, 2], [0, 1], [0, 1, 2])
     ]
     assert transformed.dtypes.to_list() == expected_dtypes
예제 #4
0
def test_cache():
    """Compare ``CachedMultiLabelEncoder`` output with the plain encoder.

    After ``clear()``, the plain ``skex.MultiLabelEncoder`` result and two
    successive ``CachedMultiLabelEncoder.fit_transform`` results must hash
    identically. Two successive ``fit_transform_as_tuple_result`` calls must
    each return a tuple or list whose element at index 1 hashes identically.
    """
    clear()

    bank = dsutils.load_bank()
    plain_encoder = skex.MultiLabelEncoder()
    plain_result = plain_encoder.fit_transform(bank.copy())

    # Two separate cached encoders are run back to back on copies of the data.
    first_cached = CachedMultiLabelEncoder()
    first_result = first_cached.fit_transform(bank.copy())
    second_cached = CachedMultiLabelEncoder()
    second_result = second_cached.fit_transform(bank.copy())

    hasher = get_tool_box(bank).data_hasher()
    reference_digest = hasher(plain_result)
    assert all(
        hasher(result) == reference_digest
        for result in (first_result, second_result)
    )

    # Same back-to-back pattern for the tuple-returning entry point.
    first_tuple_encoder = CachedMultiLabelEncoder()
    first_tuple = first_tuple_encoder.fit_transform_as_tuple_result(bank.copy())
    second_tuple_encoder = CachedMultiLabelEncoder()
    second_tuple = second_tuple_encoder.fit_transform_as_tuple_result(bank.copy())

    for tuple_result in (first_tuple, second_tuple):
        assert isinstance(tuple_result, (tuple, list))
    assert hasher(first_tuple[1]) == hasher(second_tuple[1])
예제 #5
0
 def as_local(self):
     """Return an ``sk_ex.MultiLabelEncoder`` populated from this instance.

     ``columns`` and ``dtype`` are assigned as-is. Every value in
     ``self.encoders`` is replaced by the result of its own ``as_local()``,
     stored under the same key.
     """
     target = sk_ex.MultiLabelEncoder()
     target.columns = self.columns
     target.dtype = self.dtype

     local_encoders = {}
     for key, encoder in self.encoders.items():
         local_encoders[key] = encoder.as_local()
     target.encoders = local_encoders

     return target
예제 #6
0
 def load_data():
     """Return the frame from ``dsutils.load_bank()`` after ``MultiLabelEncoder.fit_transform``."""
     bank = dsutils.load_bank()
     encoder = skex.MultiLabelEncoder()
     encoded = encoder.fit_transform(bank)
     return encoded