def check_preprocessing(self): """ Check that all transformation of the preprocessing are supported. """ return check_preprocessing(self.preprocessing)
def test_check_preprocessing_1(self): """ Test check preprocessing on multiple preprocessing """ train = pd.DataFrame({'Onehot1': ['A', 'B', 'A', 'B'], 'Onehot2': ['C', 'D', 'C', 'D'], 'Binary1': ['E', 'F', 'E', 'F'], 'Binary2': ['G', 'H', 'G', 'H'], 'Ordinal1': ['I', 'J', 'I', 'J'], 'Ordinal2': ['K', 'L', 'K', 'L'], 'BaseN1': ['M', 'N', 'M', 'N'], 'BaseN2': ['O', 'P', 'O', 'P'], 'Target1': ['Q', 'R', 'Q', 'R'], 'Target2': ['S', 'T', 'S', 'T'], 'other': ['other', np.nan, 'other', 'other']}) y = pd.DataFrame(data=[0, 1, 0, 0], columns=['y']) enc_onehot = ce.OneHotEncoder(cols=['Onehot1', 'Onehot2']).fit(train) train_onehot = enc_onehot.transform(train) enc_binary = ce.BinaryEncoder(cols=['Binary1', 'Binary2']).fit(train_onehot) train_binary = enc_binary.transform(train_onehot) enc_ordinal = ce.OrdinalEncoder(cols=['Ordinal1', 'Ordinal2']).fit(train_binary) train_ordinal = enc_ordinal.transform(train_binary) enc_basen = ce.BaseNEncoder(cols=['BaseN1', 'BaseN2']).fit(train_ordinal) train_basen = enc_basen.transform(train_ordinal) enc_target = ce.TargetEncoder(cols=['Target1', 'Target2']).fit(train_basen, y) input_dict1 = dict() input_dict1['col'] = 'Onehot2' input_dict1['mapping'] = pd.Series(data=['C', 'D', np.nan], index=['C', 'D', 'missing']) input_dict1['data_type'] = 'object' input_dict2 = dict() input_dict2['col'] = 'Binary2' input_dict2['mapping'] = pd.Series(data=['G', 'H', np.nan], index=['G', 'H', 'missing']) input_dict2['data_type'] = 'object' input_dict = dict() input_dict['col'] = 'state' input_dict['mapping'] = pd.Series(data=['US', 'FR-1', 'FR-2'], index=['US', 'FR', 'FR']) input_dict['data_type'] = 'object' input_dict3 = dict() input_dict3['col'] = 'Ordinal2' input_dict3['mapping'] = pd.Series(data=['K', 'L', np.nan], index=['K', 'L', 'missing']) input_dict3['data_type'] = 'object' list_dict = [input_dict2, input_dict3] y = pd.DataFrame(data=[0, 1], columns=['y']) train = pd.DataFrame({'city': ['chicago', 'paris'], 'state': ['US', 'FR'], 'other': ['A', 'B']}) enc = ColumnTransformer( transformers=[ ('onehot', skp.OneHotEncoder(), ['city', 'state']) ], remainder='drop') enc.fit(train, y) wrong_prepro = skp.OneHotEncoder().fit(train, y) check_preprocessing([enc_onehot, enc_binary, enc_ordinal, enc_basen, enc_target, input_dict1, list_dict]) for preprocessing in [enc_onehot, enc_binary, enc_ordinal, enc_basen, enc_target]: check_preprocessing(preprocessing) check_preprocessing(input_dict2) check_preprocessing(enc) check_preprocessing(None) with self.assertRaises(Exception): check_preprocessing(wrong_prepro)