예제 #1
0
 def _fit(self, df):
     """Resolve the column groups against ``df``.

     When ``self.groups`` is truthy, every entry of ``self.col_groups_init``
     is resolved through ``fit_columns``. If no groups result from that
     (or ``self.groups`` is falsy), each entry of ``self.cols`` becomes its
     own single-element group.

     Args:
         df: Dataframe forwarded to ``fit_columns``.
     """
     # Reset before resolving; if fit_columns raises, col_groups stays None.
     self.col_groups = None
     resolved = []
     if self.groups:
         for group_spec in self.col_groups_init:
             resolved.append(fit_columns(df, group_spec))
     if resolved:
         self.col_groups = resolved
     else:
         # Fallback: one group per fitted column.
         self.col_groups = [[name] for name in self.cols]
예제 #2
0
    def fit(self, df, y=None):
        """Fit the transformer.

        Resolves the input columns, runs the ``_fit`` hook and then derives
        the column mappings and the input/output column lists from
        ``get_column_mapping()``.

        Args:
            df (pandas.DataFrame): Dataframe used to fit the transformation.
            y: Not used by this method.

        Returns:
            The transformer itself (``self``).

        Raises:
            ValueError: If ``keep_original`` is truthy and at least one
                column name is both an input and an output column.
        """
        matched = fit_columns(
            df, self.cols_init, self.dtype, self.cols_not_found_error)
        if self.exclude:
            excluded = fit_columns(
                df, self.exclude, None, self.cols_not_found_error)
        else:
            excluded = []
        self.cols = [name for name in matched if name not in excluded]
        self._fit(df)

        # Save column maps and lists.
        self.col_map = self.get_column_mapping()
        # Rebuild cols from the mapping keys, in case get_column_mapping was
        # overridden without any cols_init being specified.
        unique_inputs = collections.OrderedDict.fromkeys(
            flatten_list(self.col_map.keys()))
        self.cols = list(unique_inputs)

        forward, backward = {}, {}
        for source, target in self.col_map.items():
            add_to_map_dict(forward, source, target)
            add_to_map_dict(backward, target, source)
        self.col_map_1_n = forward
        self.col_map_1_n_inverse = backward

        unique_outputs = collections.OrderedDict.fromkeys(
            flatten_list(self.col_map.values()))
        self.cols_out = list(unique_outputs)

        # Names that are both input and output columns must be removed from
        # df_in during the transform phase: df_in is joined with df_out, so
        # duplicate column names are not wanted.
        self.cols_in_out = set(self.cols) & set(self.cols_out)

        if self.keep_original and self.cols_in_out:
            message = ("Rename the output columns if you want to keep "
                       "the original columns, name collisions in "
                       f"{self.cols_in_out}")
            raise ValueError(message)

        return self
예제 #3
0
    def test_fit_columns_keep_order(self):
        """Matched columns keep the dataframe order, not alphabetical order."""
        frame = pd.DataFrame({"c2": [], "c1": []})
        expected = ["c2", "c1"]
        self.assertListEqual(fit_columns(frame, ["c*"]), expected)
예제 #4
0
    def test_fit_columns_duplicates_drop(self):
        """A column matched by both ``"c*"`` and ``"c1"`` is listed once."""
        frame = create_df_3dtypes()
        matched = fit_columns(frame, cols=["c*", "c1"])
        self.assertListEqual(matched, ["c1", "c2"])
예제 #5
0
    def test_fit_columns_no_dtype_in_df(self):
        """Filtering the fixture by ``dtype=float`` is expected to match nothing."""
        frame = create_df_3dtypes()
        matched = fit_columns(frame, dtype=float)
        self.assertListEqual(matched, [])
예제 #6
0
    def test_fit_columns_no_df_cols_no_cols_dtype(self):
        """An empty dataframe with no cols/dtype given yields an empty list."""
        empty_frame = pd.DataFrame()
        matched = fit_columns(empty_frame)
        self.assertListEqual(matched, [])
예제 #7
0
    def test_fit_columns_no_cols_dtype(self):
        """With neither cols nor dtype given, c1, c2 and t1 are expected."""
        frame = create_df_3dtypes()
        expected = ["c1", "c2", "t1"]
        self.assertListEqual(fit_columns(frame), expected)
예제 #8
0
    def test_fit_columns_no_match_no_error(self):
        """An unmatched pattern with ``raise_error=False`` yields an empty list."""
        frame = create_df_3dtypes()
        matched = fit_columns(frame, ["r*"], raise_error=False)
        self.assertListEqual(matched, [])
예제 #9
0
    def test_fit_columns_no_match(self):
        """An unmatched pattern is expected to raise ``ValueError``."""
        with self.assertRaises(ValueError):
            # The return value is irrelevant here; only the raise is checked,
            # so the result is not bound to a name.
            fit_columns(create_df_3dtypes(), ["r*"])
예제 #10
0
    def test_fit_columns_cols_dtype(self):
        """Pattern ``"c*"`` combined with dtype ``int`` is expected to give c1."""
        frame = create_df_3dtypes()
        matched = fit_columns(frame, ["c*"], int)
        self.assertListEqual(matched, ["c1"])
예제 #11
0
    def test_fit_columns_cols_empty(self):
        """Empty lists as second and third argument: c1, c2, t1 are expected."""
        frame = create_df_3dtypes()
        expected = ["c1", "c2", "t1"]
        self.assertListEqual(fit_columns(frame, [], []), expected)
예제 #12
0
    def test_fit_columns_cols(self):
        """Pattern ``"c*"`` alone is expected to give c1 and c2."""
        frame = create_df_3dtypes()
        matched = fit_columns(frame, ["c*"])
        self.assertListEqual(matched, ["c1", "c2"])