Пример #1
0
def test__nbcols():
    """Check `_nbcols` on a two-column DataFrame, its array view, one column and that column's array."""
    frame = pd.DataFrame({"a": np.arange(10), "b": ["aa", "bb", "cc"] * 3 + ["dd"]})

    # (input, expected number of columns)
    expectations = [
        (frame, 2),
        (frame.values, 2),
        (frame["a"], 1),
        (frame["a"].values, 1),
    ]
    for data, expected in expectations:
        assert _nbcols(data) == expected
Пример #2
0
    def fit_transform(self, X, y=None, **fit_params):
        """Fit the pass-through on ``X`` and return ``X`` (with prefixed column names if requested).

        Parameters
        ----------
        X : input data. When it exposes a ``columns`` attribute and ``self.column_prefix``
            is set, a copy with renamed columns is returned; otherwise ``X`` itself.
        y : unused.
        **fit_params : only printed (when ``self.verbose``) and stored (when ``self.debug``).

        Returns
        -------
        ``X`` itself, or a renamed copy of it.
        """
        if self.verbose:
            print("within 'DebugPassThrough' fit_transform named %s" % self.name)
            if fit_params:
                print("fit_params given")
                print(fit_params)

        if self.debug:
            self._expected_type = dsh.get_type(X)
            self._expected_nbcols = dsh._nbcols(X)
            if self._expected_type in (dsh.DataTypes.DataFrame, dsh.DataTypes.SparseDataFrame):
                self._expected_columns = list(X.columns)

            self.fit_params = fit_params  # stored, just to help test

        Xres = X
        # Only objects that expose `columns` can be renamed; anything else (e.g. a numpy
        # array) is passed through untouched, which is what `fit` does as well.
        if self.column_prefix is not None and hasattr(X, "columns"):
            Xres = X.copy()  # copy so the caller's object keeps its original column names
            Xres.columns = [self.column_prefix + "_" + c for c in Xres.columns]

        self._features = getattr(Xres, "columns", None)
        if self._features is not None:
            self._features = list(self._features)

        return Xres
Пример #3
0
    def fit(self, X, y=None, **fit_params):
        """Record what ``X`` looks like and the feature names this step will output.

        Nothing is learned from the data: when ``self.debug`` is set the type, number of
        columns and (for DataFrames) column names of ``X`` are stored, and ``self._features``
        is set to the (optionally prefixed) column names, or ``None`` if ``X`` has no columns.

        Returns
        -------
        self
        """
        if self.verbose:
            print("within 'DebugPassThrough' fit named %s" % self.name)
            if fit_params:
                print("fit_params given")
                print(fit_params)

        if self.debug:
            self._expected_type = dsh.get_type(X)
            self._expected_nbcols = dsh._nbcols(X)
            frame_types = (dsh.DataTypes.DataFrame, dsh.DataTypes.SparseDataFrame)
            if self._expected_type in frame_types:
                self._expected_columns = list(X.columns)

            self.fit_params = fit_params  # stored, just to help test

        if self.column_prefix is not None:
            # prefixed names, only possible when X carries column names
            if not hasattr(X, "columns"):
                self._features = None
            else:
                self._features = [self.column_prefix + "_" + c for c in X.columns]
        else:
            names = getattr(X, "columns", None)
            self._features = None if names is None else list(names)

        return self
Пример #4
0
    def transform(self, X):
        """Return the columns of ``X`` that were selected at fit time.

        Raises
        ------
        ValueError
            If the type or number of columns of ``X`` differs from what was seen at fit,
            if a selected column is missing from ``X``, or if the selection is by name
            while ``X`` is not a DataFrame.
        """
        self._check_is_fitted()

        observed_type = dsh.get_type(X)
        observed_nbcols = dsh._nbcols(X)

        if self._expected_type != observed_type:
            raise ValueError(
                "I don't have the correct type as input, expected : %s, got : %s"
                % (self._expected_type, observed_type)
            )

        if self._expected_nbcols != observed_nbcols:
            raise ValueError(
                "I don't have the correct number of columns, expected : %d, got : %d"
                % (self._expected_nbcols, observed_nbcols)
            )

        is_frame = self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame)
        by_position = bool(self._columns_to_use_is_integer)

        # Selecting by name only makes sense on a DataFrame
        if not is_frame and not by_position:
            raise ValueError("columns_to_use must be integers when type if array or sparseArray")

        if by_position:
            known = set(range(X.shape[1]))
            message = "Column %d isn't in the column of the DataFrame"
        else:
            known = set(X.columns)
            message = "Column %s isn't in the column of the DataFrame"

        for wanted in self._final_columns_to_use:
            if wanted not in known:
                raise ValueError(message % wanted)

        if not is_frame:
            return X[:, self._final_columns_to_use]
        if by_position:
            return X.iloc[:, self._final_columns_to_use]
        return X.loc[:, self._final_columns_to_use]
Пример #5
0
    def transform(self, X):
        """Return the columns of ``X`` that were selected at fit time.

        ``X`` itself is returned (no copy) when the selector was fitted to keep everything.

        Raises
        ------
        ValueError
            If the type of ``X`` differs from the one seen at fit, if the number of columns
            differs and ``self.raise_if_shape_differs`` is set, if a selected column is
            missing from ``X``, or if the selection is by name while ``X`` is not a DataFrame.
        """
        self._check_is_fitted()

        observed_type = dsh.get_type(X)
        observed_nbcols = dsh._nbcols(X)

        if self._expected_type != observed_type:
            raise ValueError(
                "I don't have the correct type as input, expected : %s, got : %s"
                % (self._expected_type, observed_type)
            )

        # TODO : remove that check in some cases
        if self.raise_if_shape_differs and self._expected_nbcols != observed_nbcols:
            raise ValueError(
                "I don't have the correct number of columns, expected : %d, got : %d"
                % (self._expected_nbcols, observed_nbcols)
            )

        if self._return_data_as_inputed:
            return X  # So no copy is made

        is_frame = self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame)
        by_position = bool(self._columns_to_use_is_integer)

        # Selecting by name only makes sense on a DataFrame
        if not is_frame and not by_position:
            raise ValueError("columns_to_use must be integers when type if array or sparseArray")

        if by_position:
            known = set(range(X.shape[1]))
            message = "Column %d isn't in the column of the DataFrame"
        else:
            known = set(X.columns)
            message = "Column %s isn't in the column of the DataFrame"

        for wanted in self._final_columns_to_use:
            if wanted not in known:
                raise ValueError(message % wanted)

        if is_frame:
            if by_position:
                return X.iloc[:, self._final_columns_to_use]
            return X.loc[:, self._final_columns_to_use]

        if isinstance(X, sps.coo_matrix):
            # because COO matrix are not subscriptable
            return X.tocsc()[:, self._final_columns_to_use].tocoo()
        return X[:, self._final_columns_to_use]
Пример #6
0
    def _verif(self):
        """Validate ``self.all_datas`` and cache its overall dimensions.

        Sets ``self._is_dict`` (whether the container exposes ``items``), ``self._nbrows``
        (row count of the first object) and ``self._nbcols`` (sum of the column counts).

        Raises
        ------
        TypeError
            If ``self.all_datas`` is neither a list nor a dict.
        ValueError
            If the contained objects do not all have the same number of rows.
        """
        container = self.all_datas
        if not isinstance(container, (list, dict)):
            raise TypeError("I don't know how to handle that type of Data : %s" % type(container))

        is_mapping = hasattr(container, "items")
        if is_mapping:
            datas = [data for _, data in container.items()]
        else:
            datas = list(container)

        # all row counts first, then all column counts
        row_counts = [_nbrows(data) for data in datas]
        col_counts = [_nbcols(data) for data in datas]
        self._is_dict = is_mapping

        if len(set(row_counts)) > 1:
            raise ValueError("All objects don't have the same length")

        self._nbrows = row_counts[0]
        self._nbcols = sum(col_counts)
Пример #7
0
    def _fit_transform(self, X, y, is_fit, is_transform, fit_params=None):
        """Internal method that handles fit, transform and fit_transform.

        Parameters
        ----------
        X : input data; a ``ColumnsSelector`` first extracts the columns to work on.
        y : target, forwarded to the underlying model(s) when fitting.
        is_fit : bool
            When True the selector and the underlying model(s) are created and fitted,
            and the expected type / number of columns / column names are recorded.
            When False those recorded values are checked against the incoming data.
        is_transform : bool
            When True the transformed data is computed and returned.
        fit_params : dict or None
            Extra keyword arguments forwarded to ``fit`` / ``fit_transform`` of the model(s).

        Returns
        -------
        The transformed data when ``is_transform`` is True, otherwise ``self``.

        Raises
        ------
        ValueError
            If ``X`` has a ``shape`` with 0 rows, or (when not fitting) if the selected data
            does not have the type, number of columns or column names recorded at fit time.
        """

        if fit_params is None:
            fit_params = {}

        if is_fit:
            # (Re)build the column selector: "auto" lets the wrapper pick the columns itself
            if isinstance(self.columns_to_use, str) and self.columns_to_use == "auto":
                columns = self._get_default_columns_to_use(X, y)
                self.selector = ColumnsSelector(columns_to_use=columns)
            else:
                self.selector = ColumnsSelector(columns_to_use=self.columns_to_use, regex_match=self.regex_match)

        if hasattr(X, "shape"):
            if X.shape[0] == 0:
                raise ValueError("the X object has 0 rows")

        Xindex = dsh._get_index(X)  # if X has an index retrieve it
        #        if self.columns_to_use is not None:
        if is_fit:
            Xsubset = self.selector.fit_transform(X)
        else:
            Xsubset = self.selector.transform(X)
        # TODO (maybe): here allow a preprocessing pipeline
        #        if self.has_preprocessing:
        #            if is_fit:
        #                self.preprocessing = self._get_preprocessing()
        #                Xsubset = self.preprocessing.fit_transform(Xsubset)
        #            else:
        #                Xsubset = self.preprocessing.transform(Xsubset)

        # Store columns and shape BEFORE any modification
        if self.selector is not None:
            Xsubset_columns = self.selector.get_feature_names()
        else:
            raise NotImplementedError("should not go there anymore")
            # Xsubset_columns = getattr(Xsubset, "columns", None)

        Xsubset_shape = getattr(Xsubset, "shape", None)
        # TODO : find a way to use here something along the lines of
        # https://github.com/scikit-learn/scikit-learn/issues/6425

        if is_fit:
            # Remember what the selected data looks like, to validate later calls
            self._expected_type = dsh.get_type(Xsubset)
            self._expected_nbcols = dsh._nbcols(Xsubset)
            self._expected_columns = dsh._get_columns(Xsubset)

        else:
            # Not fitting: the selected data must match what was recorded at fit time
            Xtype = dsh.get_type(Xsubset)
            if Xtype != self._expected_type:
                raise ValueError(
                    "I don't have the correct type as input, expected : %s, got : %s" % (self._expected_type, Xtype)
                )

            nbcols = dsh._nbcols(Xsubset)
            if nbcols != self._expected_nbcols:
                raise ValueError(
                    "I don't have the correct nb of colmns as input, expected : %d, got : %d"
                    % (self._expected_nbcols, nbcols)
                )

            columns = dsh._get_columns(Xsubset)
            expected_columns = getattr(self, "_expected_columns", None)  # to allow pickle compatibility

            # NOTE(review): the comparison reads self._expected_columns rather than the local
            # `expected_columns`; it is only reached when the attribute exists, so both agree.
            if expected_columns is not None and columns is not None and columns != self._expected_columns:
                raise ValueError("I don't have the correct names of columns")

        # Convert to the first accepted input type when the current type is not accepted
        if self.accepted_input_types is not None and self._expected_type not in self.accepted_input_types:
            Xsubset = dsh.convert_generic(
                Xsubset, mapped_type=self._expected_type, output_type=self.accepted_input_types[0]
            )

        if is_fit:
            self._verif_params()
            # `_empty_data` flags a selection with a second dimension of size 0 (no columns)
            self._empty_data = False
            s = getattr(Xsubset, "shape", None)
            if s is not None and len(s) > 1 and s[1] == 0:
                self._empty_data = True

        if self.all_columns_at_once or self._empty_data:

            if is_fit:
                self._model = self._get_model(Xsubset, y)

            ##############################################
            ### Apply the model on ALL columns at ONCE ###
            ##############################################

            if self.work_on_one_column_only:
                Xsubset = dsh.make1dimension(Xsubset)  # will generate an error if 2 dimensions
            else:
                Xsubset = dsh.make2dimensions(Xsubset)

            # Call to underlying model
            Xres = None
            if is_fit and is_transform:
                ##############################
                ###  fit_transform method  ###
                ##############################
                # test if the the data to transform actually has some columns

                if not self._empty_data:
                    # normal case
                    Xres = self._model.fit_transform(Xsubset, y, **fit_params)
                else:
                    # It means there is no columns to transform
                    Xres = Xsubset  # don't do anything

            elif is_fit and not is_transform:
                ####################
                ###  fit method  ###
                ####################
                if self.must_transform_to_get_features_name:
                    # the result is only needed to read the output columns/shape below
                    Xres = self._model.fit_transform(Xsubset, y, **fit_params)
                else:
                    self._model.fit(Xsubset, y, **fit_params)
            else:
                ####################
                ###  transform   ###
                ####################
                if not self._empty_data:
                    Xres = self._model.transform(Xsubset)
                else:
                    Xres = Xsubset

            if is_fit:
                # Xres may still be None here (plain fit), in which case both "output_*" are None
                self._columns_informations = {
                    "output_columns": getattr(Xres, "columns", None),  # names of transformed columns if exist
                    "output_shape": getattr(Xres, "shape", None),  # shape of transformed result if exist
                    "input_columns": Xsubset_columns,  # name of input columns
                    "input_shape": Xsubset_shape,  # shape of input data
                }

                self._feature_names_for_transform = self.try_to_find_feature_names_all_at_once(
                    output_columns=self._columns_informations["output_columns"],
                    output_shape=self._columns_informations["output_shape"],
                    input_columns=self._columns_informations["input_columns"],
                    input_shape=self._columns_informations["input_shape"],
                )

                # self.kept_features_names = None  # for now

            if is_transform:
                Xres = dsh.convert_generic(Xres, output_type=self.desired_output_type)
                Xres = dsh._set_index(Xres, Xindex)

        else:
            ########################################
            ### Apply the model COLUMN BY COLUMN ###
            ########################################
            if is_fit:
                self._models = []

            # Per-column results are collected only when they are returned or needed for names
            if is_transform or self.must_transform_to_get_features_name:
                all_Xres = []
            else:
                all_Xres = None

            Xsubset = dsh.make2dimensions(Xsubset)

            for j in range(self._expected_nbcols):

                # NOTE(review): the dispatch uses `_expected_type`, i.e. the type recorded before
                # the optional `convert_generic` above — confirm both always agree on this path.
                if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame, DataTypes.Serie):
                    Xsubset_j = Xsubset.iloc[:, j]
                else:
                    Xsubset_j = Xsubset[:, j]

                if is_fit:
                    # NOTE(review): the whole `Xsubset` (not `Xsubset_j`) is handed to `_get_model`
                    # — confirm this is intended.
                    sub_model = self._get_model(Xsubset, y)
                    self._models.append(sub_model)
                else:
                    sub_model = self._models[j]

                if not self.work_on_one_column_only:
                    Xsubset_j = dsh.make2dimensions(Xsubset_j)

                if is_fit and is_transform:
                    # fit_transform method
                    Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)

                    all_Xres.append(Xres_j)

                elif is_fit and not is_transform:
                    # fit method
                    if self.must_transform_to_get_features_name:
                        Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                        all_Xres.append(Xres_j)

                    else:
                        sub_model.fit(Xsubset_j, y, **fit_params)

                elif is_transform:
                    # transform method

                    Xres_j = sub_model.transform(Xsubset_j)
                    all_Xres.append(Xres_j)

            if is_fit:

                self._columns_informations = {
                    "all_output_columns": None
                    if all_Xres is None
                    else [getattr(Xres, "columns", None) for Xres in all_Xres],
                    "all_output_shape": None
                    if all_Xres is None
                    else [getattr(Xres, "shape", None) for Xres in all_Xres],
                    "input_columns": Xsubset_columns,  # name of input columns
                    "input_shape": Xsubset_shape,  # shape of input data
                }

                self._feature_names_for_transform = list(
                    self.try_to_find_feature_names_separate(
                        all_output_columns=self._columns_informations["all_output_columns"],
                        all_output_shape=self._columns_informations["all_output_shape"],
                        input_columns=self._columns_informations["input_columns"],
                        input_shape=self._columns_informations["input_shape"],
                    )
                )

                # self.kept_features_names = None  # for now

            if is_transform:
                # Glue the per-column results back together, side by side
                Xres = dsh.generic_hstack(all_Xres, output_type=self.desired_output_type)
                Xres = dsh._set_index(Xres, Xindex)

        if is_transform:
            if self._feature_names_for_transform is not None:
                ### NOTE (original author): this does not work in transform !!!
                Xres = dsh._set_columns(Xres, self._feature_names_for_transform)

        if is_transform:
            return Xres
        else:
            return self
Пример #8
0
    def fit(self, X, y=None):
        """Work out which columns of ``X`` will be kept by ``transform``.

        The selection is ``columns_to_use`` (everything when it resolves to ``None``) minus
        ``columns_to_drop``. Both are resolved through ``self._get_list_of_columns`` and must
        be of the same kind: integers (positions) or strings (names, DataFrame only;
        treated as regular expressions when ``self.regex_match`` is set).

        Parameters
        ----------
        X : data whose layout (type, number of columns, column names) is recorded.
        y : unused.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``columns_to_use`` and ``columns_to_drop`` mix integers and strings, if a
            requested column does not exist, if ``regex_match`` is used with integer
            columns, or if string columns are used on something that is not a DataFrame.
        """
        self._expected_type = dsh.get_type(X)
        self._expected_nbcols = dsh._nbcols(X)

        ######################################
        ### Special case : keep everything ###
        ######################################
        self._return_data_as_inputed = False
        if isinstance(self.columns_to_use, str) and self.columns_to_use == "all" and self.columns_to_drop is None:
            self._already_fitted = True
            self._columns_to_use_is_integer = True
            # One position per COLUMN (X.shape[0] would be the number of rows)
            self._final_columns_to_use = list(range(self._expected_nbcols))
            self._return_data_as_inputed = True
            if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
                self._Xcolumns = list(X.columns)
            else:
                self._Xcolumns = list(range(self._expected_nbcols))

            # Everything is set: stop here, like the "no columns" special case below,
            # instead of falling through and recomputing the selection.
            return self

        ### Columns to use ###
        list_columns_to_use = self._get_list_of_columns(columns=self.columns_to_use, X=X, regex_match=self.regex_match)
        list_columns_to_drop = self._get_list_of_columns(
            columns=self.columns_to_drop, X=X, regex_match=self.regex_match
        )

        #################################
        ### Special case : no columns ###
        #################################
        if list_columns_to_use is not None and len(list_columns_to_use) == 0:
            # This means that there is nothing to do : no columns will be kept
            self._already_fitted = True
            self._columns_to_use_is_integer = True
            self._final_columns_to_use = []

            if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
                self._Xcolumns = list(X.columns)
            else:
                self._Xcolumns = list(range(self._expected_nbcols))

            return self

        ### What is the type of columns_to_use and columns_to_drop :
        # (sniffed from the first element; list_columns_to_use is non-empty here)
        if list_columns_to_use is not None:
            is_int = "int" in str(type(list_columns_to_use[0]))
        else:
            is_int = None

        if list_columns_to_drop is not None and len(list_columns_to_drop) > 0:
            is_int_to_drop = "int" in str(type(list_columns_to_drop[0]))
        else:
            is_int_to_drop = is_int

        ### Verify type:
        if is_int is not None and is_int_to_drop is not None:
            if is_int != is_int_to_drop:
                raise ValueError(
                    "Please be consistent between 'columns_to_use' and 'columns_to_drop', both can be integer or str, but they should have the same type"
                )

        # When nothing tells otherwise, positions (integers) are assumed
        if is_int is None and is_int_to_drop is None:
            is_int = True
            is_int_to_drop = True

        if is_int is None and is_int_to_drop is not None:
            is_int = is_int_to_drop
        if is_int_to_drop is None and is_int is not None:
            is_int_to_drop = is_int

        if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            if is_int:

                ##############################################
                ### Case 1 : DataFrame + Integer selection ###
                ##############################################

                if self.regex_match:
                    #######################
                    ## Case 1a : + Regex ##
                    #######################
                    raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

                cols_set = set(range(self._expected_nbcols))
                if list_columns_to_use is not None:

                    # Check all column are available

                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = list_columns_to_use
                else:
                    final_columns_to_use = list(range(self._expected_nbcols))

                # No `else` here: without columns to drop, the selection above is kept as is
                if list_columns_to_drop is not None:

                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

            else:

                #############################################
                ### Case 2 : DataFrame + String selection ###
                #############################################
                if self.regex_match:
                    #######################
                    ## Case 2a : + Regex ##
                    #######################
                    # A column is selected as soon as one pattern matches it
                    if list_columns_to_use is not None:
                        cols_that_match = []
                        for col in list(X.columns):
                            for r in list_columns_to_use:
                                if re.search(r, col) is not None:  # TODO : allow a compiled regex
                                    cols_that_match.append(col)
                                    break

                    if list_columns_to_drop is not None:
                        cols_that_match_drop = []
                        for col in list(X.columns):
                            for r in list_columns_to_drop:
                                if re.search(r, col) is not None:  # TODO : allow a compiled regex
                                    cols_that_match_drop.append(col)
                                    break

                    if list_columns_to_use is not None:
                        final_columns_to_use = cols_that_match
                    else:
                        final_columns_to_use = list(X.columns)

                    if list_columns_to_drop is not None:
                        final_columns_to_use = diff(final_columns_to_use, cols_that_match_drop)

                else:
                    ########################
                    ## Case 2b : no Regex ##
                    ########################
                    cols_set = set(X.columns)
                    if list_columns_to_use is not None:

                        for l in list_columns_to_use:
                            if l not in cols_set:
                                raise ValueError("Column %s isn't in the columns of the DataFrame" % l)
                        final_columns_to_use = list_columns_to_use

                    else:
                        final_columns_to_use = list(X.columns)

                    # No `else` here: without columns to drop, the selection above is kept as is
                    if list_columns_to_drop is not None:

                        for l in list_columns_to_drop:
                            if l not in cols_set:
                                raise ValueError("Column %s isn't in the columns of the DataFrame" % l)

                        final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

        else:

            if is_int:
                ##########################################
                ### Case 3 : Array + Integer selection ###
                ##########################################
                if self.regex_match:

                    ########################
                    ## Case 3a  : + Regex ##
                    ########################
                    raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

                ########################
                ## Case 3b : no Regex ##
                ########################
                cols_set = set(range(self._expected_nbcols))
                if list_columns_to_use is not None:

                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = intersect(list_columns_to_use, list(range(self._expected_nbcols)))
                else:
                    final_columns_to_use = list(range(self._expected_nbcols))

                # No `else` here: without columns to drop, the selection above is kept as is
                if list_columns_to_drop is not None:

                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

            else:
                #########################################
                ### Case 4 : Array + String selection ###
                #########################################
                raise ValueError("columns_to_use must be integers when type is array or sparseArray")

        self._columns_to_use_is_integer = is_int
        self._final_columns_to_use = final_columns_to_use

        if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            self._Xcolumns = list(X.columns)
        else:
            self._Xcolumns = list(range(self._expected_nbcols))

        ## TODO : here make a simplification into a slice when it is possible

        self._already_fitted = True

        return self
Пример #9
0
    def fit(self, X, y=None):
        """Resolve the columns of ``X`` that ``transform`` will keep.

        The selection is ``columns_to_use`` (everything when ``None``) minus
        ``columns_to_drop``. Both must be of the same kind: integers (positions) or strings
        (names, DataFrame only; regular expressions when ``self.regex_match`` is set).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``columns_to_use`` is empty, if the two specifications mix integers and
            strings, if a requested column does not exist, if ``regex_match`` is used with
            integer columns, or if string columns are used on a non-DataFrame input.
        """
        self._expected_type = dsh.get_type(X)
        self._expected_nbcols = dsh._nbcols(X)

        ### Normalise both specifications into lists (None means "not specified") ###
        to_use = None
        if self.columns_to_use is not None:
            to_use = self.convert_to_list(cols_list=self.columns_to_use)

        to_drop = None
        if self.columns_to_drop is not None:
            to_drop = self.convert_to_list(cols_list=self.columns_to_drop)

        if to_use is not None and len(to_use) == 0:
            raise ValueError("columns_to_use is empty")

        ### Integer or string ? (sniffed from the first element of each list) ###
        use_is_int = None if to_use is None else "int" in str(type(to_use[0]))
        if to_drop is not None and len(to_drop) > 0:
            drop_is_int = "int" in str(type(to_drop[0]))
        else:
            drop_is_int = use_is_int

        if use_is_int is not None and drop_is_int is not None and use_is_int != drop_is_int:
            raise ValueError(
                "Please be consistent between columns_to_use and columns_to_drop, both can be integer or str, but they should have the same type"
            )

        if use_is_int is None:
            # nothing to keep was specified: follow the drop list, integers by default
            use_is_int = True if drop_is_int is None else drop_is_int

        is_frame = self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame)

        ### Case 4 : Array + String selection ###
        if not is_frame and not use_is_int:
            raise ValueError("columns_to_use must be integers when type is array or sparseArray")

        ### Cases 1a / 3a : Integer selection + Regex ###
        if use_is_int and self.regex_match:
            raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

        if use_is_int:
            ### Cases 1 / 3b : selection by position ###
            positions = set(range(self._expected_nbcols))
            if to_use is None:
                kept = list(range(self._expected_nbcols))
            else:
                for col in to_use:
                    if col not in positions:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % col)
                if is_frame:
                    kept = to_use
                else:
                    kept = intersect(to_use, list(range(self._expected_nbcols)))

            if to_drop is not None:
                for col in to_drop:
                    if col not in positions:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % col)
                kept = diff(kept, to_drop)

        elif self.regex_match:
            ### Case 2a : DataFrame + names matched by regex ###
            # a column is retained as soon as one of the patterns matches it
            if to_use is None:
                kept = list(X.columns)
            else:
                kept = [
                    name for name in list(X.columns) if any(re.search(p, name) is not None for p in to_use)
                ]

            if to_drop is not None:
                rejected = [
                    name for name in list(X.columns) if any(re.search(p, name) is not None for p in to_drop)
                ]
                kept = diff(kept, rejected)

        else:
            ### Case 2b : DataFrame + exact names ###
            names = set(X.columns)
            if to_use is None:
                kept = list(X.columns)
            else:
                for col in to_use:
                    if col not in names:
                        raise ValueError("Column %s isn't in the columns of the DataFrame" % col)
                kept = to_use

            if to_drop is not None:
                for col in to_drop:
                    if col not in names:
                        raise ValueError("Column %s isn't in the columns of the DataFrame" % col)
                kept = diff(kept, to_drop)

        self._columns_to_use_is_integer = use_is_int
        self._final_columns_to_use = kept

        if is_frame:
            self._Xcolumns = list(X.columns)
        else:
            self._Xcolumns = list(range(self._expected_nbcols))

        ## TODO : here make a simplification into a slice when it is possible

        self._already_fitted = True

        return self