def test_fit(self):
    """Fit LabelBinarizer on two columns and check its serialized state.

    For a binary column ("col1" with values a/c) only one new column is
    created, named after the alphabetically-last value. For a column with
    three distinct values ("col2") one new column per value is created.
    """
    # training data
    d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]}
    df = pd.DataFrame(data=d)

    lb = LabelBinarizer()
    # check first column (binary: only one new column expected)
    lb.fit(df, "col1")
    data_json = lb.to_json()
    self.assertIn("new_columns", data_json)
    # we take alphabetical order, so "c" (the last value) names the column
    self.assertIn("col1_c", data_json["new_columns"])
    self.assertNotIn("col1_a", data_json["new_columns"])
    self.assertIn("unique_values", data_json)
    self.assertIn("a", data_json["unique_values"])
    self.assertIn("c", data_json["unique_values"])

    lb = LabelBinarizer()
    # check second column (three values: one new column per value)
    lb.fit(df, "col2")
    data_json = lb.to_json()
    self.assertIn("new_columns", data_json)
    self.assertIn("col2_w", data_json["new_columns"])
    self.assertIn("col2_e", data_json["new_columns"])
    self.assertIn("col2_d", data_json["new_columns"])
    self.assertIn("unique_values", data_json)
    self.assertIn("w", data_json["unique_values"])
    self.assertIn("e", data_json["unique_values"])
    self.assertIn("d", data_json["unique_values"])
def test_to_and_from_json_booleans(self):
    """Round-trip LabelBinarizer through JSON and transform boolean data.

    Fits two binarizers (string column, boolean column), serializes them
    with ``to_json``, restores them into fresh instances with
    ``from_json`` (one via an actual JSON string round-trip), and checks
    that ``transform`` on new data behaves like the original binarizers.
    Columns the binarizers were not fitted on must be left untouched.
    """
    # training data
    d = {"col1": ["a", "a", "c"], "col2": [True, True, False]}
    df = pd.DataFrame(data=d)
    # fit binarizer
    lb1 = LabelBinarizer()
    lb1.fit(df, "col1")
    lb2 = LabelBinarizer()
    lb2.fit(df, "col2")
    # test data
    d_test = {
        "col1": ["c", "c", "a"],
        "col2": [False, False, True],
        "col3": [2, 3, 4],
    }
    df_test = pd.DataFrame(data=d_test)
    # to json and from json (lb2 goes through a real string round-trip
    # to make sure booleans survive json.dumps/json.loads)
    new_lb1 = LabelBinarizer()
    new_lb2 = LabelBinarizer()
    new_lb1.from_json(lb1.to_json())
    new_lb2.from_json(json.loads(json.dumps(lb2.to_json(), indent=4)))
    # transform
    df_test = new_lb1.transform(df_test, "col1")
    df_test = new_lb2.transform(df_test, "col2")
    # for binary column, only one value is left, old column should be deleted
    self.assertIn("col1_c", df_test.columns)
    self.assertNotIn("col1", df_test.columns)
    self.assertEqual(2, np.sum(df_test["col1_c"]))
    # for multiple value column, all columns should be added
    self.assertIn("col2_True", df_test.columns)
    self.assertNotIn("col2", df_test.columns)
    self.assertEqual(1, np.sum(df_test["col2_True"]))
    # do not touch continuous attribute
    self.assertIn("col3", df_test.columns)
def _fit_categorical_convert(self, X):
    """Fit a categorical converter for every column in ``self._columns``.

    For each categorical column, chooses one-hot encoding
    (``LabelBinarizer``) when the configured method is CONVERT_ONE_HOT
    and the cardinality is manageable, otherwise falls back to integer
    ``LabelEncoder``. The fitted converter is serialized into
    ``self._convert_params[column]``.

    :param X: pandas DataFrame with the training data; only columns
        listed in ``self._columns`` are inspected.
    """
    for column in self._columns:
        if PreprocessingUtils.get_type(
                X[column]) != PreprocessingUtils.CATEGORICAL:
            # no need to convert, already a number
            continue
        # limit categories - one-hot encoding a high-cardinality column
        # would explode the number of output columns
        # this code is also used in predict.py file
        # and transform_utils.py
        # TODO it needs refactoring !!!
        too_much_categories = len(np.unique(list(X[column].values))) > 200
        # NOTE: both branches below always assign a converter, so the old
        # ``lbl = None`` sentinel and ``if lbl is not None`` guard were
        # dead code and have been removed.
        if (self._convert_method == PreprocessingCategorical.CONVERT_ONE_HOT
                and not too_much_categories):
            lbl = LabelBinarizer()
            lbl.fit(X, column)
        else:
            lbl = LabelEncoder()
            lbl.fit(X[column])
        self._convert_params[column] = lbl.to_json()