예제 #1
0
    def test_fit(self):
        # training data
        d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]}
        df = pd.DataFrame(data=d)
        lb = LabelBinarizer()
        # check first column
        lb.fit(df, "col1")
        data_json = lb.to_json()
        self.assertTrue("new_columns" in data_json)
        # we take alphabetical order
        self.assertTrue("col1_c" in data_json["new_columns"])
        self.assertTrue("col1_a" not in data_json["new_columns"])
        self.assertTrue("unique_values" in data_json)
        self.assertTrue("a" in data_json["unique_values"])
        self.assertTrue("c" in data_json["unique_values"])

        lb = LabelBinarizer()
        # check second column
        lb.fit(df, "col2")
        data_json = lb.to_json()
        self.assertTrue("new_columns" in data_json)
        self.assertTrue("col2_w" in data_json["new_columns"])
        self.assertTrue("col2_e" in data_json["new_columns"])
        self.assertTrue("col2_d" in data_json["new_columns"])
        self.assertTrue("unique_values" in data_json)
        self.assertTrue("w" in data_json["unique_values"])
        self.assertTrue("e" in data_json["unique_values"])
        self.assertTrue("d" in data_json["unique_values"])
예제 #2
0
    def test_to_and_from_json_booleans(self):
        # training data
        d = {"col1": ["a", "a", "c"], "col2": [True, True, False]}
        df = pd.DataFrame(data=d)
        # fit binarizer
        lb1 = LabelBinarizer()
        lb1.fit(df, "col1")
        lb2 = LabelBinarizer()
        lb2.fit(df, "col2")
        # test data
        d_test = {
            "col1": ["c", "c", "a"],
            "col2": [False, False, True],
            "col3": [2, 3, 4],
        }
        df_test = pd.DataFrame(data=d_test)
        # to json and from json
        new_lb1 = LabelBinarizer()
        new_lb2 = LabelBinarizer()
        new_lb1.from_json(lb1.to_json())
        new_lb2.from_json(json.loads(json.dumps(lb2.to_json(), indent=4)))

        # transform
        df_test = new_lb1.transform(df_test, "col1")
        df_test = new_lb2.transform(df_test, "col2")
        # for binary column, only one value is left, old column should be deleted
        self.assertTrue("col1_c" in df_test.columns)
        self.assertTrue("col1" not in df_test.columns)
        self.assertEqual(2, np.sum(df_test["col1_c"]))
        # for multiple value colum, all columns should be added
        self.assertTrue("col2_True" in df_test.columns)
        self.assertTrue("col2" not in df_test.columns)
        self.assertEqual(1, np.sum(df_test["col2_True"]))
        # do not touch continuous attribute
        self.assertTrue("col3" in df_test.columns)
    def _fit_categorical_convert(self, X):
        for column in self._columns:

            if PreprocessingUtils.get_type(
                    X[column]) != PreprocessingUtils.CATEGORICAL:
                # no need to convert, already a number
                continue
            # limit categories - it is needed when doing one hot encoding
            # this code is also used in predict.py file
            # and transform_utils.py
            # TODO it needs refactoring !!!
            too_much_categories = len(np.unique(list(X[column].values))) > 200
            lbl = None
            if (self._convert_method
                    == PreprocessingCategorical.CONVERT_ONE_HOT
                    and not too_much_categories):
                lbl = LabelBinarizer()
                lbl.fit(X, column)
            else:
                lbl = LabelEncoder()
                lbl.fit(X[column])

            if lbl is not None:
                self._convert_params[column] = lbl.to_json()