def test_basen(self):
        """Smoke-test BaseNEncoder under several constructor options.

        For every configuration the encoder is fitted on a dataset created
        with ``n_rows=1000`` and applied to one created with ``n_rows=100``.
        The transformed output is passed to ``verify_numeric``; for
        ``return_df=False`` the output is instead checked to be an ndarray.
        """
        explicit_cols = ['C1', 'D', 'E', 'F']
        X = self.create_dataset(n_rows=1000)
        X_t = self.create_dataset(n_rows=100)

        # Extra keyword arguments for each run validated through verify_numeric.
        numeric_configs = (
            {'cols': explicit_cols},
            {},
            {'drop_invariant': True},
        )
        for extra_kwargs in numeric_configs:
            enc = encoders.BaseNEncoder(verbose=1, **extra_kwargs)
            enc.fit(X, None)
            self.verify_numeric(enc.transform(X_t))

        array_enc = encoders.BaseNEncoder(verbose=1, return_df=False)
        array_enc.fit(X, None)
        self.assertTrue(isinstance(array_enc.transform(X_t), np.ndarray))
示例#2
0
 def num_cols(nvals, base):
     """Return how many columns BaseNEncoder outputs for ``nvals`` distinct values.

     A one-column frame holding the strings ``'0'`` .. ``str(nvals - 1)`` is
     fitted and transformed with an encoder of the given ``base``; the number
     of columns of the transformed result is returned.
     """
     frame = pd.DataFrame({'vals': list(map(str, range(nvals)))})
     fitted = encoders.BaseNEncoder(base=base)
     fitted.fit(frame)
     encoded = fitted.transform(frame)
     return len(list(encoded))
    def test_inv_transform_ct_11(self):
        """
        Check inverse_transform on a ColumnTransformer that wraps a
        BaseNEncoder and uses remainder='passthrough'.
        """
        target = pd.DataFrame(data=[0, 1], columns=['y'])
        fit_frame = pd.DataFrame({'city': ['chicago', 'paris'],
                                  'state': ['US', 'FR'],
                                  'other': ['A', 'B']})

        transformer = ColumnTransformer(
            transformers=[('basen', ce.BaseNEncoder(), ['city', 'state'])],
            remainder='passthrough')
        transformer.fit(fit_frame, target)

        # The same values are expected back, under prefixed column names.
        cities = ['chicago', 'chicago', 'paris']
        states = ['US', 'FR', 'FR']
        others = ['A', 'B', 'C']
        test = pd.DataFrame({'city': cities, 'state': states, 'other': others})
        expected = pd.DataFrame({'basen_city': cities,
                                 'basen_state': states,
                                 'other': others})

        encoded = pd.DataFrame(transformer.transform(test))
        encoded.columns = ['col1_0', 'col1_1', 'col2_0', 'col2_1', 'other']
        restored = inverse_transform(encoded, transformer)
        pd.testing.assert_frame_equal(restored, expected)
示例#4
0
 def Base_N_Coder(self, path, data, target):
     """Base-N encode the categorical columns of a CSV file and save the result.

     Reads ``path + data`` with pandas. When at least one column has object
     dtype, the frame is split by ``self.refining``, the target column is
     encoded with ``LabelEncoder``, the remaining categorical columns are
     encoded with ``BaseNEncoder`` and the combined dataset is handed to
     ``self.convert_to_csv``. Progress is reported via ``self.log.writeToLog``.

     :param path: prefix concatenated in front of ``data`` to locate the CSV
     :param data: CSV file name; also forwarded to ``convert_to_csv``
     :param target: name of the target column
     """
     self.log.writeToLog('Performing Base N Encoding...')
     encodetype = 'base_ncoder'
     df = pd.read_csv(path + data)
     has_object_column = any(dtype == 'O' for dtype in df.dtypes)
     if has_object_column:
         # NOTE(review): refining() presumably returns (categorical part,
         # remaining columns) -- confirm against its definition.
         category, droped_data = self.refining(df)
         column_count_before = len(list(df.columns))
         self.log.writeToLog('No. of columns before encoding : ' +
                             str(column_count_before))
         label_encoder = LabelEncoder()
         basen_encoder = ce.BaseNEncoder()
         # The target gets its own encoding below, so keep it out of the
         # BaseN input.
         if target in list(category.columns):
             category = category.drop(target, axis=1)
         df[target] = label_encoder.fit_transform(df[target])
         self.log.writeToLog('Target column has been encoded !')
         if category.empty:
             self.log.writeToLog(
                 'Dependant variables has no categories to be encoded !')
         else:
             self.log.writeToLog('Dependant variables encoded is/are:  ' +
                                 str(list(category.columns)))
             encoded_part = basen_encoder.fit_transform(category)
             dataset = droped_data.join(encoded_part)
             dataset[target] = df[target]
             self.convert_to_csv(data, dataset, encodetype)
             self.log.writeToLog('No. of columns after encoding : ' +
                                 str(len(list(dataset.columns))))
     else:
         self.log.writeToLog(
             'No categorical columns found in the dataset to be encoded !')
     '''
示例#5
0
def apply_baseN_encoding(df, categorical_columns):
    """Encode the given categorical columns of ``df`` with a base-3 BaseNEncoder.

    Args:
        df: pandas DataFrame to encode.
        categorical_columns: columns handed to the encoder as ``cols``.

    Returns:
        The result of transforming ``df`` with the fitted encoder.

    Raises:
        DataFrameTypeError: if ``df`` is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    # Fit on the DataFrame itself rather than ``df.values``: the bare array
    # carries no column labels, so ``cols`` could not refer to the same
    # columns at fit time and at transform time.
    encoder = ce.BaseNEncoder(base=3, cols=categorical_columns).fit(df)
    return encoder.transform(df)
def nominalToNumeric(keyArr, valueArr):
    """Build a DataFrame from parallel key/value arrays and BaseN-encode it.

    ``keyArr[i]`` becomes a column name whose data is ``valueArr[i]``; all keys
    are passed to the encoder as the columns to encode. Returns the result of
    the encoder's ``fit_transform``.
    """
    # Index-based lookup on both arrays, one entry per position of keyArr.
    columns = {keyArr[pos]: valueArr[pos] for pos in range(len(keyArr))}
    frame = pd.DataFrame(columns)
    return ce.BaseNEncoder(cols=keyArr).fit_transform(frame)
示例#7
0
    def test_inverse_transform_HaveData_ExpectResultReturned(self):
        """Encode a four-letter column with base 2 and expect inverse_transform to restore it."""
        letters = pd.Series(list('abcd'))
        train = letters.to_frame('letter')

        encoder = encoders.BaseNEncoder(base=2)
        encoded = encoder.fit_transform(train)

        pd.testing.assert_frame_equal(train, encoder.inverse_transform(encoded))
示例#8
0
    def test_inverse_transform_HaveNanInTrainAndHandleMissingReturnNan_ExpectReturnedWithNan(self):
        """With handle_missing='return_nan', a NaN in the training data must survive the round trip."""
        train = pd.DataFrame({'city': ['chicago', np.nan]})
        options = {'handle_missing': 'return_nan', 'handle_unknown': 'value'}

        encoder = encoders.BaseNEncoder(**options)
        encoded = encoder.fit_transform(train)
        restored = encoder.inverse_transform(encoded)

        pd.testing.assert_frame_equal(train, restored)
示例#9
0
 def create_features(self, df_train, df_test):
     """Add BaseN-encoded versions of ``self.columns`` to ``self.train`` and ``self.test``.

     The encoder is fitted on the training columns, with the target column
     values passed as ``y``. Each encoded column is stored under its name plus
     the suffix ``'_BaseNEncoder'``.
     """
     basen = ce.BaseNEncoder(cols=self.columns)
     target_values = df_train[self.target_column].values.tolist()
     basen.fit(df_train[self.columns], target_values)

     train_encoded = basen.transform(df_train[self.columns])
     test_encoded = basen.transform(df_test[self.columns])

     for name in train_encoded.columns:
         feature_name = name + '_BaseNEncoder'
         self.train[feature_name] = train_encoded[name]
         self.test[feature_name] = test_encoded[name]
示例#10
0
    def test_HandleMissingIndicator_HaveNoNan_ExpectThirdColumn(self):
        """Base-2 encoding with handle_missing='indicator' and no NaN in the input."""
        train = pd.Series(['a', 'b', 'c'])
        expected_rows = [
            [0, 0, 1],
            [0, 1, 0],
            [0, 1, 1],
        ]

        encoder = encoders.BaseNEncoder(handle_missing='indicator', base=2)
        result = encoder.fit_transform(train)

        self.assertEqual(3, result.shape[0])
        for row_idx, expected in enumerate(expected_rows):
            self.assertListEqual(expected, result.iloc[row_idx, :].tolist())
示例#11
0
    def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self):
        """Encoding two known values with handle_unknown='indicator' yields the two expected rows."""
        train = ['A', 'B']
        expected_rows = [
            [0, 1],
            [1, 0],
        ]

        result = encoders.BaseNEncoder(handle_unknown='indicator').fit_transform(train)

        self.assertEqual(2, result.shape[0])
        for row_idx, expected in enumerate(expected_rows):
            self.assertListEqual(expected, result.iloc[row_idx, :].tolist())
 def test_inverse_transform_13(self):
     """
     Round-trip a column containing NaN through a BaseN encoder that uses
     'value' for both handle_missing and handle_unknown.
     """
     train = pd.DataFrame({'city': ['chicago', np.nan]})
     encoder = ce.BaseNEncoder(handle_missing='value', handle_unknown='value')
     encoded = encoder.fit_transform(train)
     restored = inverse_transform(encoded, encoder)
     pd.testing.assert_frame_equal(train, restored)
 def test_inverse_transform_12(self):
     """
     Encode a four-letter column with a base-2 encoder and expect
     inverse_transform to return the original frame.
     """
     letters = pd.Series(list('abcd'))
     train = letters.to_frame('letter')
     encoder = ce.BaseNEncoder(base=2)
     encoded = encoder.fit_transform(train)
     pd.testing.assert_frame_equal(train, inverse_transform(encoded, encoder))
示例#14
0
 def __init__(self, cols=None, drop_invariant=False, return_df=True,
              handle_unknown='value', handle_missing='value'):
     """Store the options and build the wrapped base-2 ``BaseNEncoder``.

     Every argument is kept as an attribute of the same name and then
     forwarded to ``ce.BaseNEncoder`` together with ``base=2``.
     """
     self.cols = cols
     self.drop_invariant = drop_invariant
     self.return_df = return_df
     self.handle_unknown = handle_unknown
     self.handle_missing = handle_missing

     encoder_options = dict(
         base=2,
         cols=self.cols,
         drop_invariant=self.drop_invariant,
         return_df=self.return_df,
         handle_unknown=self.handle_unknown,
         handle_missing=self.handle_missing,
     )
     self.base_n_encoder = ce.BaseNEncoder(**encoder_options)
 def test_inverse_transform_ce_basen(self):
     """
     inverse_transform must restore the cleaned titanic frame after a
     base-3 BaseN encoding of the 'Age' and 'Sex' columns.
     """
     encoder = ce.BaseNEncoder(cols=['Age', 'Sex'], return_df=True, base=3)
     encoded = encoder.fit_transform(self.ds_titanic_clean)
     restored = inverse_transform(encoded, encoder)
     pd.testing.assert_frame_equal(restored, self.ds_titanic_clean)
示例#16
0
    def test_HaveIndicatorAndNanValue_ExpectNewColumn(self):
        """Base-2 encoding with handle_missing='indicator' when the input ends with a NaN."""
        train = pd.Series(['a', 'b', 'c', np.nan])
        expected_rows = [
            [0, 0, 1],
            [0, 1, 0],
            [0, 1, 1],
            [1, 0, 0],
        ]

        encoder = encoders.BaseNEncoder(handle_missing='indicator', base=2)
        result = encoder.fit_transform(train)

        self.assertEqual(4, result.shape[0])
        for row_idx, expected in enumerate(expected_rows):
            self.assertListEqual(expected, result.iloc[row_idx, :].tolist())
示例#17
0
    def test_inverse_transform_HaveMissingAndNoUnknown_ExpectInversed(self):
        """Fit on data with a NaN, transform data with an unseen city, and expect
        the inverse transform to equal the training frame."""
        train = pd.DataFrame({'city': ['chicago', np.nan]})
        test = pd.DataFrame({'city': ['chicago', 'los angeles']})
        options = {'handle_missing': 'value', 'handle_unknown': 'return_nan'}

        encoder = encoders.BaseNEncoder(**options)
        encoder.fit(train)
        encoded = encoder.transform(test)
        restored = encoder.inverse_transform(encoded)

        pd.testing.assert_frame_equal(train, restored)
示例#18
0
    def test_fit_transform_have_base_2_expect_Correct_Encoding(self):
        """Four distinct values encoded with base 2 must give the four expected rows."""
        train = pd.Series(['a', 'b', 'c', 'd'])
        expected_rows = [
            [0, 0, 1],
            [0, 1, 0],
            [0, 1, 1],
            [1, 0, 0],
        ]

        encoder = encoders.BaseNEncoder(base=2)
        result = encoder.fit_transform(train)

        self.assertEqual(4, result.shape[0])
        for row_idx, expected in enumerate(expected_rows):
            self.assertListEqual(expected, result.iloc[row_idx, :].tolist())
    def test_inverse_transform_14(self):
        """
        With handle_missing='return_nan', a NaN present in the training data
        is expected back from inverse_transform.
        """
        train = pd.DataFrame({'city': ['chicago', np.nan]})
        options = {'handle_missing': 'return_nan', 'handle_unknown': 'value'}

        encoder = ce.BaseNEncoder(**options)
        encoded = encoder.fit_transform(train)
        restored = inverse_transform(encoded, encoder)

        pd.testing.assert_frame_equal(train, restored)
    def test_basen_np(self):
        """Fit BaseNEncoder on the output of ``create_array`` and check the
        transformed result with ``verify_numeric``."""
        fit_data = self.create_array(n_rows=1000)
        transform_data = self.create_array(n_rows=100)

        encoder = encoders.BaseNEncoder(verbose=1)
        encoder.fit(fit_data, None)
        transformed = encoder.transform(transform_data)
        self.verify_numeric(transformed)
    def __init__(self, encoder_type, columns_name=None):
        """
        Instantiate the category encoder selected by ``encoder_type``.

        :param encoder_type: key naming the encoder to build (see the table below)
        :param columns_name: list of feature names, passed to the encoder as ``cols``
        :raises ValueError: if ``encoder_type`` matches none of the known keys
        """
        # (encoder_type key, class name on ``ce``), tried in this order.
        known_encoders = (
            ("BackwardDe", "BackwardDifferenceEncoder"),  # backward difference coding
            ("BaseN", "BaseNEncoder"),                    # BaseN coding
            ("Binary", "BinaryEncoder"),                  # binary coding
            ("Catboost", "CatBoostEncoder"),
            ("Hash", "HashingEncoder"),
            ("Helmert", "HelmertEncoder"),
            ("JamesStein", "JamesSteinEncoder"),
            ("LOO", "LeaveOneOutEncoder"),                # leave-one-out coding
            ("ME", "MEstimateEncoder"),                   # M-estimate encoder
            ("OneHot", "OneHotEncoder"),
            ("OridinalEncoder", "OrdinalEncoder"),        # ordinal coding; key spelling kept as callers use it
            ("Sum", "SumEncoder"),                        # sum coding
            ("Polynomial", "PolynomialEncoder"),          # polynomial coding
            ("Target", "TargetEncoder"),                  # target coding
            ("WOE", "WOEEncoder"),                        # weight-of-evidence encoder
        )
        for key, class_name in known_encoders:
            if encoder_type == key:
                # Look the class up only for the selected key.
                self.encoder = getattr(ce, class_name)(cols=columns_name)
                break
        else:
            # The message reads "please choose a valid encoding method".
            raise ValueError("请选择正确的编码方式")
 def test_inverse_transform_16(self):
     """
     Fit on data containing a NaN, transform data containing both a NaN and
     an unseen city, and expect both to come back as NaN.
     """
     train = pd.DataFrame({'city': ['chicago', np.nan]})
     test = pd.DataFrame({'city': ['chicago', np.nan, 'los angeles']})
     expected = pd.DataFrame({'city': ['chicago', np.nan, np.nan]})

     options = {'handle_missing': 'value', 'handle_unknown': 'return_nan'}
     encoder = ce.BaseNEncoder(**options)
     encoder.fit(train)
     encoded = encoder.transform(test)
     restored = inverse_transform(encoded, encoder)
     pd.testing.assert_frame_equal(expected, restored)
示例#23
0
    def test_HandleUnknown_HaveUnknown_ExpectIndicatorInTest(self):
        """With handle_unknown='indicator', a value unseen during fit gets its own expected row."""
        train = ['A', 'B', 'C']
        test = train + ['D']
        expected_rows = [
            [0, 0, 1],
            [0, 1, 0],
            [0, 1, 1],
            [1, 0, 0],
        ]

        encoder = encoders.BaseNEncoder(handle_unknown='indicator')
        encoder.fit(train)
        result = encoder.transform(test)

        self.assertEqual(4, result.shape[0])
        for row_idx, expected in enumerate(expected_rows):
            self.assertListEqual(expected, result.iloc[row_idx, :].tolist())
示例#24
0
    def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self):
        """With handle_missing='indicator', a NaN that appears only at transform time gets its own expected row."""
        train = pd.Series(['a', 'b', 'c'])
        test = pd.Series(['a', 'b', 'c', np.nan])
        expected_rows = [
            [0, 0, 1],
            [0, 1, 0],
            [0, 1, 1],
            [1, 0, 0],
        ]

        encoder = encoders.BaseNEncoder(handle_missing='indicator')
        encoder.fit(train)
        result = encoder.transform(test)

        self.assertEqual(4, result.shape[0])
        for row_idx, expected in enumerate(expected_rows):
            self.assertListEqual(expected, result.iloc[row_idx, :].tolist())
示例#25
0
 def Base_N_Coder(self, data, target, filename):
     """Base-N encode the categorical part of ``data`` and save the result.

     ``self.refining`` supplies the categorical columns, the frame they are
     joined onto and the frame holding the target column. The encoded columns
     and the target are joined on and the dataset is handed to
     ``self.convert_to_csv``. Progress is reported via ``self.log.writeToLog``.
     """
     self.log.writeToLog('Performing Base N Encoding...')
     encodetype = 'base_ncoder'
     # NOTE(review): the meaning of refining()'s three return values is
     # inferred from how they are used here -- confirm against its definition.
     category, x, df = self.refining(data, target)
     categorical_names = list(category.columns)
     self.log.writeToLog('Categorical columns to be encoded:  ' +
                         str(categorical_names))
     encoded = ce.BaseNEncoder().fit_transform(category)
     encoded_names = list(encoded.columns)
     self.log.writeToLog('Encoded as:  ' + str(encoded_names))
     dataset = x.join(encoded).join(df[target])
     self.convert_to_csv(data, dataset, encodetype, filename)
示例#26
0
    def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self):
        """inverse_transform must warn when both handlers are 'return_nan'.

        The encoder is fitted on data containing a NaN and applied to data
        containing an unseen city; inverting that result is expected to emit
        a UserWarning carrying the message checked below.
        """
        import re

        train = pd.DataFrame({'city': ['chicago', np.nan]})
        test = pd.DataFrame({'city': ['chicago', 'los angeles']})

        enc = encoders.BaseNEncoder(handle_missing='return_nan', handle_unknown='return_nan')
        enc.fit(train)
        result = enc.transform(test)

        message = 'inverse_transform is not supported because transform impute '\
                  'the unknown category nan when encode city'

        # The ``msg`` keyword of assertWarns is only the text shown when the
        # assertion fails; assertWarnsRegex is what checks the warning's text.
        with self.assertWarnsRegex(UserWarning, re.escape(message)):
            enc.inverse_transform(result)
示例#27
0
    def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self):
        """With both handlers set to 'return_nan', inverse_transform is expected
        to record exactly one warning with the message checked below."""
        expected_message = ('inverse_transform is not supported because transform impute '
                            'the unknown category nan when encode city')
        train = pd.DataFrame({'city': ['chicago', np.nan]})
        test = pd.DataFrame({'city': ['chicago', 'los angeles']})

        options = {'handle_missing': 'return_nan', 'handle_unknown': 'return_nan'}
        encoder = encoders.BaseNEncoder(**options)
        encoder.fit(train)
        encoded = encoder.transform(test)

        with warnings.catch_warnings(record=True) as caught:
            encoder.inverse_transform(encoded)

            self.assertEqual(1, len(caught))
            self.assertEqual(expected_message, str(caught[0].message))
示例#28
0
def get_encoder_dict():
    """Return a new dict mapping each encoder name to a freshly built encoder instance."""
    # Instances are created in the listed order on every call.
    named_encoders = (
        ('OneHotEncoder', ce.OneHotEncoder()),
        ('BinaryEncoder', ce.BinaryEncoder()),
        ('HashingEncoder', ce.HashingEncoder()),
        ('LabelEncoder', le.MultiColumnLabelEncoder()),
        ('FrequencyEncoder', fe.FrequencyEncoder()),
        ('TargetEncoder', ce.TargetEncoder()),
        ('HelmertEncoder', ce.HelmertEncoder()),
        ('JamesSteinEncoder', ce.JamesSteinEncoder()),
        ('BaseNEncoder', ce.BaseNEncoder()),
        ('SumEncoder', ce.SumEncoder()),
    )
    return dict(named_encoders)
示例#29
0
 def __init__(self,
              verbose=0,
              cols=None,
              mapping=None,
              drop_invariant=False,
              return_df=True,
              handle_unknown='value',
              handle_missing='value'):
     """Build the wrapped base-2 ``BaseNEncoder``.

     All arguments are forwarded unchanged to ``ce.BaseNEncoder`` together
     with ``base=2``; the result is stored as ``self.base_n_encoder``.
     """
     encoder_options = dict(
         base=2,
         verbose=verbose,
         cols=cols,
         mapping=mapping,
         drop_invariant=drop_invariant,
         return_df=return_df,
         handle_unknown=handle_unknown,
         handle_missing=handle_missing,
     )
     self.base_n_encoder = ce.BaseNEncoder(**encoder_options)
 def test_inverse_transform_contributions_ce_basen(self):
     """
     inverse_transform_contributions on a base-3 BaseN encoding must return a
     DataFrame with the shape of the original data and the same row sums as
     the input contributions.
     """
     encoder = ce.BaseNEncoder(cols=['Age', 'Sex'], return_df=True, base=3)
     encoded = encoder.fit_transform(self.ds_titanic_clean)

     # Random contributions, one value per encoded cell.
     n_rows, n_cols = encoded.shape
     contributions = pd.DataFrame(data=np.random.rand(n_rows, n_cols),
                                  columns=encoded.columns,
                                  index=self.ds_titanic_clean.index)

     output = inverse_transform_contributions(contributions, encoder)

     assert isinstance(output, pd.DataFrame)
     assert self.ds_titanic_clean.shape == output.shape
     expected_row_sums = contributions.values.sum(axis=1)
     actual_row_sums = output.values.sum(axis=1)
     np.testing.assert_almost_equal(expected_row_sums, actual_row_sums)