def _fit(self, df):
    """Resolve the column groups used by the transformer.

    When ``self.groups`` is truthy, every entry of ``self.col_groups_init``
    is resolved against ``df`` with ``fit_columns``. If that yields nothing
    (or groups are disabled), each column in ``self.cols`` becomes its own
    single-element group.

    Args:
        df (pandas.DataFrame): Dataframe the group specs are resolved on.
    """
    # Reset before resolving, exactly as the fallback check below expects.
    self.col_groups = None
    if self.groups:
        resolved = []
        for group_spec in self.col_groups_init:
            resolved.append(fit_columns(df, group_spec))
        self.col_groups = resolved
    if self.col_groups:
        return
    # Fallback: one group per already-fitted column.
    self.col_groups = [[name] for name in self.cols]
def fit(self, df, y=None):
    """Fit the transformer.

    Resolves the input columns on ``df`` (minus the excluded ones), runs the
    ``_fit`` hook, then stores the column mapping returned by
    ``get_column_mapping`` together with the input/output column lists
    derived from it.

    Args:
        df (pandas.DataFrame): Dataframe used to fit the transformation.
        y: Not used by this method.

    Returns:
        The transformer itself.

    Raises:
        ValueError: If ``keep_original`` is truthy and at least one column
            name is both an input and an output column.
    """
    selected = fit_columns(
        df, self.cols_init, self.dtype, self.cols_not_found_error)
    if self.exclude:
        excluded = fit_columns(
            df, self.exclude, None, self.cols_not_found_error)
    else:
        excluded = []
    self.cols = [name for name in selected if name not in excluded]

    self._fit(df)

    # Store the column mapping first; the lists below are derived from it.
    self.col_map = self.get_column_mapping()

    # Rebuild cols from the mapping keys: get_column_mapping may have been
    # overridden without any cols_init being specified.
    input_names = flatten_list(self.col_map.keys())
    self.cols = list(collections.OrderedDict.fromkeys(input_names))

    forward_map, inverse_map = {}, {}
    for source, target in self.col_map.items():
        add_to_map_dict(forward_map, source, target)
        add_to_map_dict(inverse_map, target, source)
    self.col_map_1_n = forward_map
    self.col_map_1_n_inverse = inverse_map

    output_names = flatten_list(self.col_map.values())
    self.cols_out = list(collections.OrderedDict.fromkeys(output_names))

    # Names present in both the input and output columns must be dropped
    # from df_in during transform: df_in is joined with df_out, so duplicate
    # column names are not wanted.
    self.cols_in_out = set(self.cols) & set(self.cols_out)
    if not (self.keep_original and self.cols_in_out):
        return self

    raise ValueError("Rename the output columns if you want to keep "
                     "the original columns, name collisions in "
                     f"{self.cols_in_out}")
def test_fit_columns_keep_order(self):
    """Matched columns keep the dataframe order instead of being sorted."""
    frame = pd.DataFrame({"c2": [], "c1": []})
    expected = ["c2", "c1"]
    matched = fit_columns(frame, ["c*"])
    self.assertListEqual(matched, expected)
def test_fit_columns_duplicates_drop(self):
    """A column matched by both a pattern and its exact name appears once."""
    frame = create_df_3dtypes()
    matched = fit_columns(frame, cols=["c*", "c1"])
    expected = ["c1", "c2"]
    self.assertListEqual(matched, expected)
def test_fit_columns_no_dtype_in_df(self):
    """Filtering by ``float`` on the fixture dataframe yields no columns."""
    frame = create_df_3dtypes()
    matched = fit_columns(frame, dtype=float)
    expected = []
    self.assertListEqual(matched, expected)
def test_fit_columns_no_df_cols_no_cols_dtype(self):
    """An empty dataframe with no filters yields an empty column list."""
    empty_frame = pd.DataFrame()
    matched = fit_columns(empty_frame)
    expected = []
    self.assertListEqual(matched, expected)
def test_fit_columns_no_cols_dtype(self):
    """With neither cols nor dtype given, all fixture columns are returned."""
    frame = create_df_3dtypes()
    matched = fit_columns(frame)
    expected = ["c1", "c2", "t1"]
    self.assertListEqual(matched, expected)
def test_fit_columns_no_match_no_error(self):
    """With ``raise_error=False`` an unmatched pattern gives an empty list."""
    frame = create_df_3dtypes()
    matched = fit_columns(frame, ["r*"], raise_error=False)
    expected = []
    self.assertListEqual(matched, expected)
def test_fit_columns_no_match(self):
    """An unmatched pattern raises ``ValueError`` with the default settings."""
    frame = create_df_3dtypes()
    # The call is expected to raise, so its result is deliberately not bound.
    with self.assertRaises(ValueError):
        fit_columns(frame, ["r*"])
def test_fit_columns_cols_dtype(self):
    """Combining the ``c*`` pattern with dtype ``int`` keeps only ``c1``."""
    frame = create_df_3dtypes()
    matched = fit_columns(frame, ["c*"], int)
    expected = ["c1"]
    self.assertListEqual(matched, expected)
def test_fit_columns_cols_empty(self):
    """Empty lists for both cols and dtype select every fixture column."""
    frame = create_df_3dtypes()
    matched = fit_columns(frame, [], [])
    expected = ["c1", "c2", "t1"]
    self.assertListEqual(matched, expected)
def test_fit_columns_cols(self):
    """The ``c*`` pattern selects the fixture columns ``c1`` and ``c2``."""
    frame = create_df_3dtypes()
    matched = fit_columns(frame, ["c*"])
    expected = ["c1", "c2"]
    self.assertListEqual(matched, expected)