Exemplo n.º 1
0
    def test_check_consistency_model_features_2(self):
        """
        Unit test check_consistency_model_features 2

        mask_params["features_to_hide"] is set to 'Binary3', a name that is
        not one of the training columns; the check is expected to raise a
        ValueError.
        """
        # Each categorical column alternates between two letters over 4 rows.
        letter_pairs = {
            'Onehot1': ('A', 'B'), 'Onehot2': ('C', 'D'),
            'Binary1': ('E', 'F'), 'Binary2': ('G', 'H'),
            'Ordinal1': ('I', 'J'), 'Ordinal2': ('K', 'L'),
            'BaseN1': ('M', 'N'), 'BaseN2': ('O', 'P'),
            'Target1': ('Q', 'R'), 'Target2': ('S', 'T'),
        }
        data = {
            name: [first, second, first, second]
            for name, (first, second) in letter_pairs.items()
        }
        data['other'] = ['other', np.nan, 'other', 'other']
        train = pd.DataFrame(data)
        all_columns = list(train.columns)

        features_dict = None
        columns_dict = dict(enumerate(all_columns))
        features_types = {
            name: str(dtype) for name, dtype in train.dtypes.items()
        }

        mask_params = {
            "features_to_hide": 'Binary3',
            "threshold": None,
            "positive": True,
            "max_contrib": 5
        }

        # A single ordinal encoder covering every column is the preprocessing.
        preprocessing = ce.OrdinalEncoder(cols=all_columns).fit(train)
        encoded_train = preprocessing.transform(train)

        y = pd.DataFrame({'y_class': [0, 0, 0, 1]})
        model = cb.CatBoostClassifier(n_estimators=1).fit(encoded_train, y)

        with self.assertRaises(ValueError):
            check_consistency_model_features(
                features_dict, model, columns_dict,
                features_types, mask_params, preprocessing)
Exemplo n.º 2
0
    def test_check_consistency_model_features_4(self):
        """
        Unit test check_consistency_model_features 4

        Fits every model of self.modellist on the same ordinal-encoded
        frame and runs the consistency check with mask_params=None.
        No exception is expected for any model.
        """
        train = pd.DataFrame({'Onehot1': ['A', 'B', 'A', 'B'], 'Onehot2': ['C', 'D', 'C', 'D'],
                              'Binary1': ['E', 'F', 'E', 'F'], 'Binary2': ['G', 'H', 'G', 'H'],
                              'Ordinal1': ['I', 'J', 'I', 'J'], 'Ordinal2': ['K', 'L', 'K', 'L'],
                              'BaseN1': ['M', 'N', 'M', 'N'], 'BaseN2': ['O', 'P', 'O', 'P'],
                              'Target1': ['Q', 'R', 'Q', 'R'], 'Target2': ['S', 'T', 'S', 'T'],
                              'other': ['other', np.nan, 'other', 'other']})

        features_dict = None
        columns_dict = {i: features for i, features in enumerate(train.columns)}
        features_types = {features: str(train[features].dtypes) for features in train.columns}
        mask_params = None

        enc_ordinal_all = ce.OrdinalEncoder(cols=['Onehot1', 'Onehot2', 'Binary1', 'Binary2', 'Ordinal1', 'Ordinal2',
                                                  'BaseN1', 'BaseN2', 'Target1', 'Target2', 'other']).fit(train)
        train_ordinal_all = enc_ordinal_all.transform(train)
        preprocessing = enc_ordinal_all

        y = pd.DataFrame({'y_class': [0, 0, 0, 1]})

        for model in self.modellist:
            # subTest labels a failure with the model class, which replaces
            # the former debug print of type(model).
            # NOTE(review): requires the enclosing class to be a
            # unittest.TestCase - confirm.
            with self.subTest(model=type(model).__name__):
                model.fit(train_ordinal_all, y)

                check_consistency_model_features(features_dict, model, columns_dict,
                                                 features_types, mask_params, preprocessing)
Exemplo n.º 3
0
    def __init__(
        self,
        features_dict,
        model,
        columns_dict,
        explainer,
        features_types,
        label_dict=None,
        preprocessing=None,
        postprocessing=None,
        features_groups=None,
        mask_params=None):
        """
        Store the components of the object and run the consistency checks.

        Parameters
        ----------
        features_dict : dict or None
            Must be a dict when provided.
        model : object
            Stored as self.model; inspected by self.check_model().
        columns_dict : dict or None
            Must be a dict when provided.
        explainer : object
            Passed through self.check_explainer() before being stored.
        features_types : dict or None
            Must be a dict when provided.
        label_dict : dict, optional
            Must be a dict when provided; checked by self.check_label_dict().
        preprocessing : object, optional
            Stored as-is; checked by self.check_preprocessing().
        postprocessing : dict, optional
            Must be a dict when provided.
        features_groups : object, optional
            Stored as-is and forwarded to check_consistency_model_features.
        mask_params : dict, optional
            Expected keys: "features_to_hide", "threshold", "positive",
            "max_contrib". When omitted (None), a new dict with those four
            keys set to None is created for this instance, so instances
            never share the same default dict.

        Raises
        ------
        ValueError
            If features_dict, features_types, label_dict, columns_dict or
            postprocessing is neither None nor a dict. The check_* helpers
            called below may raise as well (not visible from here).
        """
        # A dict literal used as default argument would be created once and
        # shared by every instance; build a fresh one per call instead.
        if mask_params is None:
            mask_params = {
                "features_to_hide": None,
                "threshold": None,
                "positive": None,
                "max_contrib": None
            }

        dict_only_params = [
            features_dict, features_types, label_dict, columns_dict,
            postprocessing
        ]

        for params in dict_only_params:
            if params is not None and not isinstance(params, dict):
                raise ValueError("""
                    {0} must be a dict.
                    """.format(str(params)))

        # Assignment order matters: each check_* method is called right
        # after the attribute(s) it is expected to read have been set.
        self.model = model
        self._case, self._classes = self.check_model()
        self.explainer = self.check_explainer(explainer)
        self.preprocessing = preprocessing
        self.check_preprocessing()
        self.features_dict = features_dict
        self.features_types = features_types
        self.label_dict = label_dict
        self.check_label_dict()
        self.columns_dict = columns_dict
        self.mask_params = mask_params
        self.check_mask_params()
        self.postprocessing = postprocessing
        self.features_groups = features_groups
        list_preprocessing = preprocessing_tolist(self.preprocessing)
        check_consistency_model_features(
            self.features_dict, self.model, self.columns_dict,
            self.features_types, self.mask_params, self.preprocessing,
            self.postprocessing, list_preprocessing, self.features_groups)
        check_consistency_model_label(self.columns_dict, self.label_dict)
        self._drop_option = check_preprocessing_options(
            columns_dict, features_dict, preprocessing, list_preprocessing)
Exemplo n.º 4
0
    def test_check_consistency_model_features_5(self):
        """
        Unit test check_consistency_model_features 5

        Runs the consistency check for several preprocessing layouts:
        a ColumnTransformer with remainder='passthrough', one with
        remainder='drop', a category_encoders OneHotEncoder, and lists
        combining that encoder with one or two mapping dicts.
        No exception is expected from any of the calls.
        """
        train = pd.DataFrame(
            {'city': ['chicago', 'paris'], 'state': ['US', 'FR'], 'other': [5, 10]},
            index=['index1', 'index2'])
        target = np.array([1, 0])

        features_dict = None
        columns_dict = dict(enumerate(train.columns))
        features_types = {
            name: str(dtype) for name, dtype in train.dtypes.items()
        }
        mask_params = None

        categorical = ['city', 'state']
        encoded_cols = ["city_ce", "state_ce", "city_skp", "state_skp"]
        passthrough_cols = encoded_cols + ["other"]
        onehot_cols = ["city_1", "city_2", "state_1", "state_2", "other"]

        def build_encoder(remainder):
            # Both ordinal encoders are applied to the same two columns.
            return ColumnTransformer(
                transformers=[
                    ('Ordinal_ce', ce.OrdinalEncoder(), categorical),
                    ('Ordinal_skp', skp.OrdinalEncoder(), categorical),
                ],
                remainder=remainder)

        enc = build_encoder('passthrough')
        enc_2 = build_encoder('drop')

        # Case 1: the untouched 'other' column is kept after the encoders.
        enc.fit(train)
        train_1 = pd.DataFrame(enc.transform(train), columns=passthrough_cols)
        train_1["y"] = target
        clf_1 = cb.CatBoostClassifier(n_estimators=1)
        clf_1 = clf_1.fit(train_1[passthrough_cols], train_1['y'])

        # Case 2: the 'other' column is dropped by the transformer.
        enc_2.fit(train)
        train_2 = pd.DataFrame(enc_2.transform(train), columns=encoded_cols)
        train_2["y"] = target
        clf_2 = cb.CatBoostClassifier(n_estimators=1)
        clf_2 = clf_2.fit(train_2[encoded_cols], train_2['y'])

        # Case 3: one-hot encoding of the two categorical columns.
        enc_3 = ce.OneHotEncoder(cols=['city', 'state'])
        enc_3.fit(train)
        train_3 = enc_3.transform(train)
        train_3["y"] = target
        clf_3 = cb.CatBoostClassifier(n_estimators=1)
        clf_3 = clf_3.fit(train_3[onehot_cols], train_3['y'])

        dict_4 = {
            'col': 'state',
            'mapping': pd.Series(data=[1, 2], index=['US', 'FR']),
            'data_type': 'object'
        }
        dict_5 = {
            'col': 'city',
            'mapping': pd.Series(data=[1, 2], index=['chicago', 'paris']),
            'data_type': 'object'
        }

        # Cases 4 and 5: the one-hot encoder followed by mapping dicts.
        enc_4 = [enc_3, [dict_4]]
        enc_5 = [enc_3, [dict_4, dict_5]]

        # (model, preprocessing, list_preprocessing) for each check call.
        cases = [
            (clf_1, enc, [enc]),
            (clf_2, enc_2, [enc_2]),
            (clf_3, enc_3, [enc_3]),
            (clf_3, enc_4, enc_4),
            (clf_3, enc_5, enc_5),
        ]
        for model, preprocessing, list_preprocessing in cases:
            check_consistency_model_features(
                features_dict,
                model,
                columns_dict,
                features_types,
                mask_params,
                preprocessing,
                list_preprocessing=list_preprocessing)