Пример #1
0
    def predict(self, X):
        """Predict the decoded target for X using the underlying regressor.

        The regressor output is rounded to the nearest integer code, clipped
        to the range of codes known by the target encoder (when the encoder
        exposes ``categories_``) and mapped back to the original labels with
        ``self._target_encoder.inverse_transform``.

        Parameters
        ----------
        X : array-like
            the data, passed as-is to ``self.regressor.predict``

        Returns
        -------
        y_hat : array
            the decoded predictions, cast to ``self._target_dtype``;
            1-dimensional when ``self._mono_target`` is True
        """
        y_hat = self.regressor.predict(X)  # call regressor

        # Round half up using floor: a bare integer cast truncates toward zero
        # and mis-rounds negative values (-1.7 + 0.5 = -1.2 would become -1).
        y_int_hat = make2dimensions(
            np.floor(np.asarray(y_hat) + 0.5).astype(np.int32))

        # A regressor may extrapolate outside the encoded range: a negative
        # code would silently wrap around when used as an index and a too
        # large one would raise an IndexError, so clip to the valid codes.
        categories = getattr(self._target_encoder, "categories_", None)
        if categories is not None:
            last_codes = np.array([len(cats) - 1 for cats in categories],
                                  dtype=np.int32)
            y_int_hat = np.clip(y_int_hat, 0, last_codes)

        y_hat = self._target_encoder.inverse_transform(y_int_hat)

        if self._mono_target:
            y_hat = y_hat[:, 0]

        return y_hat.astype(self._target_dtype)
Пример #2
0
    def _prepare_target(self, y, klass, conversion_type):
        """Encode the target so that it can be given to the underlying model.

        Parameters
        ----------
        y : array
            the original target (must expose ``ndim`` and ``dtype``)

        klass : type
            the encoder class to use for the target; it is instantiated with
            ``categories=...`` and ``dtype=np.int32``

        conversion_type : DataType
            the data type the target is converted to before being encoded

        Set
        ---
        self._mono_target : bool
            whether the original problem has a single (1-dimensional) target
        self._target_dtype : dtype
            the dtype of the original target
        self._target_encoder : object
            the fitted encoder used on the target

        Returns
        -------
        y_encoded : array
            the encoded target

        Raises
        ------
        TypeError
            if the target is multi-dimensional and ``self.classes`` is neither
            "auto" nor a list
        """
        self._mono_target = y.ndim == 1
        self._target_dtype = y.dtype

        if isinstance(self.classes, str) and self.classes == "auto":
            categories = "auto"
        elif self._mono_target:
            # the encoder expects one list of categories per target column
            categories = [self.classes]
        else:
            if not isinstance(self.classes, list):
                raise TypeError(
                    "For multi-target classes should be a list, instead I got %s"
                    % str(type(self.classes)))
            categories = self.classes

        self._target_encoder = klass(categories=categories, dtype=np.int32)

        yd2 = convert_generic(make2dimensions(y), output_type=conversion_type)

        if conversion_type == DataTypes.NumpyArray and yd2.dtype.kind == 'U':
            # Unicode arrays are handed to the encoder as object arrays.
            # The ``np.object`` alias was removed in NumPy 1.24, the builtin
            # ``object`` is the equivalent spelling on every NumPy version.
            yd2 = yd2.astype(object, copy=False)

        return self._target_encoder.fit_transform(yd2)
Пример #3
0
    def fit(self, X, y):
        """Cluster the target, then fit the classifier on the cluster labels.

        The target is clustered with ``self.y_clusterer`` (or with the result
        of ``self.get_default_y_cluster(y)`` when it is None), the classifier
        is fitted to predict those clusters from X, and the per-class values
        returned by ``self._compute_y_mean`` are stacked, in the order of the
        classifier's ``classes_``, into ``self._all_y_mean_matrix``.

        Parameters
        ----------
        X : array-like
            the features, passed as-is to ``self.classifier.fit``

        y : array
            the target (must expose ``ndim``)

        Returns
        -------
        self

        Raises
        ------
        TypeError
            if ``self.classifier`` is not a classifier
        ValueError
            if the target is 1-dimensional but the clusterer returns more
            than one column of clusters
        """
        self._mono_target = y.ndim == 1

        if self.y_clusterer is None:
            y_clusterer = self.get_default_y_cluster(y)
        else:
            y_clusterer = self.y_clusterer
            # TODO : check that it is a clusterer

        if not is_classifier(self.classifier):
            raise TypeError("classifier should be a classifier")

        yd2 = make2dimensions(y)

        if hasattr(y_clusterer, "fit_predict"):
            y_cl = y_clusterer.fit_predict(yd2)
        else:
            # transformer-style clusterer: its output is used as integer labels
            y_cl = y_clusterer.fit_transform(yd2).astype('int32')

        if y_cl.ndim == 1:
            y_cl = y_cl[:, np.newaxis]

        if self._mono_target and y_cl.shape[1] > 1:
            raise ValueError(
                "The cluster should return only 1 dimensional clusters")

        self._mono_cluster = y_cl.shape[1] == 1

        # fit classifier on result of cluster
        if self._mono_cluster:
            # Give a 1-D target for the single-output case: a (n, 1) column
            # vector makes scikit-learn emit a DataConversionWarning, and
            # ``classes_`` is handled below as one single array.
            self.classifier.fit(X, np.ravel(y_cl))
            classes = [self.classifier.classes_]
        else:
            self.classifier.fit(X, y_cl)
            classes = self.classifier.classes_

        all_mean_mapping = self._compute_y_mean(yd2, y_cl)

        # one matrix per cluster column, rows ordered like the classifier classes
        self._all_y_mean_matrix = [
            np.concatenate([y_mean_mapping[cl] for cl in classe], axis=0)
            for classe, y_mean_mapping in zip(classes, all_mean_mapping)
        ]

        return self
def test_make2dimensions():
    """Check ``make2dimensions`` on DataFrame, Series and numpy inputs."""
    n_rows = 10
    frame = pd.DataFrame({"a": np.arange(n_rows), "b": ["aa", "bb", "cc"] * 3 + ["dd"]})

    # A DataFrame must come back as the very same object
    same_frame = make2dimensions(frame)
    assert same_frame is frame
    assert same_frame.shape == (n_rows, 2)

    # Expected output shape for each kind of 1D / 2D input
    expected_shapes = [
        (frame["a"], (n_rows, 1)),
        (frame.values, (n_rows, 2)),
        (frame["a"].values, (n_rows, 1)),
    ]
    for data, shape in expected_shapes:
        assert make2dimensions(data).shape == shape

    # An input with 3 dimensions must be rejected
    cube = np.zeros((n_rows, 2, 2))
    with pytest.raises(ValueError):
        make2dimensions(cube)
Пример #5
0
    def _fit_transform(self, X, y, is_fit, is_transform, fit_params=None):
        """Internal method that handles the fit, the transform and the fit_transform.

        Parameters
        ----------
        X : array-like
            the input data; when it exposes ``shape`` it must have at least one row

        y : array-like or None
            the target, forwarded to the underlying model(s) when fitting

        is_fit : bool
            if True the columns selector is (re)created, the expected input
            type / number of columns / column names are recorded and the
            underlying model(s) are fitted

        is_transform : bool
            if True the transformed data is computed and returned

        fit_params : dict or None
            extra keyword arguments forwarded to the ``fit`` / ``fit_transform``
            of the underlying model(s)

        Returns
        -------
        the transformed data if ``is_transform`` is True, otherwise ``self``

        Raises
        ------
        ValueError
            if X has 0 rows, or (when not fitting) if the type, the number of
            columns or the names of the columns differ from what was seen at fit
        """

        if fit_params is None:
            fit_params = {}

        # At fit time: build the selector that extracts the columns to work on
        if is_fit:
            if isinstance(self.columns_to_use, str) and self.columns_to_use == "auto":
                columns = self._get_default_columns_to_use(X, y)
                self.selector = ColumnsSelector(columns_to_use=columns)
            else:
                self.selector = ColumnsSelector(columns_to_use=self.columns_to_use, regex_match=self.regex_match)

        if hasattr(X, "shape"):
            if X.shape[0] == 0:
                raise ValueError("the X object has 0 rows")

        Xindex = dsh._get_index(X)  # if X has an index retrieve it
        #        if self.columns_to_use is not None:
        if is_fit:
            Xsubset = self.selector.fit_transform(X)
        else:
            Xsubset = self.selector.transform(X)
        # TODO (maybe): here allow a preprocessing pipeline
        #        if self.has_preprocessing:
        #            if is_fit:
        #                self.preprocessing = self._get_preprocessing()
        #                Xsubset = self.preprocessing.fit_transform(Xsubset)
        #            else:
        #                Xsubset = self.preprocessing.transform(Xsubset)

        # Store columns and shape BEFORE any modification
        if self.selector is not None:
            Xsubset_columns = self.selector.get_feature_names()
        else:
            raise NotImplementedError("should not go there anymore")
            # Xsubset_columns = getattr(Xsubset, "columns", None)

        Xsubset_shape = getattr(Xsubset, "shape", None)
        # TODO : here, make use (one way or another) of what is discussed in
        # https://github.com/scikit-learn/scikit-learn/issues/6425

        if is_fit:
            # Remember what the selected input looks like, to validate later calls
            self._expected_type = dsh.get_type(Xsubset)
            self._expected_nbcols = dsh._nbcols(Xsubset)
            self._expected_columns = dsh._get_columns(Xsubset)

        else:
            # Not fitting: the selected input must match what was recorded at fit time
            Xtype = dsh.get_type(Xsubset)
            if Xtype != self._expected_type:
                raise ValueError(
                    "I don't have the correct type as input, expected : %s, got : %s" % (self._expected_type, Xtype)
                )

            nbcols = dsh._nbcols(Xsubset)
            if nbcols != self._expected_nbcols:
                raise ValueError(
                    "I don't have the correct nb of colmns as input, expected : %d, got : %d"
                    % (self._expected_nbcols, nbcols)
                )

            columns = dsh._get_columns(Xsubset)
            expected_columns = getattr(self, "_expected_columns", None)  # to allow pickle compatibility

            # Column names are only compared when both sides actually have names
            if expected_columns is not None and columns is not None and columns != self._expected_columns:
                raise ValueError("I don't have the correct names of columns")

        # Convert the input to the first accepted type when its type is not accepted as-is
        if self.accepted_input_types is not None and self._expected_type not in self.accepted_input_types:
            Xsubset = dsh.convert_generic(
                Xsubset, mapped_type=self._expected_type, output_type=self.accepted_input_types[0]
            )

        if is_fit:
            self._verif_params()
            # Flag the case where the selection has a second dimension of size 0 (no column)
            self._empty_data = False
            s = getattr(Xsubset, "shape", None)
            if s is not None and len(s) > 1 and s[1] == 0:
                self._empty_data = True

        # Data without any column also goes through the "all at once" branch
        if self.all_columns_at_once or self._empty_data:

            if is_fit:
                self._model = self._get_model(Xsubset, y)

            ##############################################
            ### Apply the model on ALL columns at ONCE ###
            ##############################################

            if self.work_on_one_column_only:
                Xsubset = dsh.make1dimension(Xsubset)  # will generate an error if 2 dimensions
            else:
                Xsubset = dsh.make2dimensions(Xsubset)

            # Call to underlying model
            Xres = None
            if is_fit and is_transform:
                ##############################
                ###  fit_transform method  ###
                ##############################
                # test if the the data to transform actually has some columns

                if not self._empty_data:
                    # normal case
                    Xres = self._model.fit_transform(Xsubset, y, **fit_params)
                else:
                    # It means there is no columns to transform
                    Xres = Xsubset  # don't do anything

            elif is_fit and not is_transform:
                ####################
                ###  fit method  ###
                ####################
                if self.must_transform_to_get_features_name:
                    # the transformed result is computed even in a plain fit,
                    # it is used below to fill the columns informations
                    Xres = self._model.fit_transform(Xsubset, y, **fit_params)
                else:
                    self._model.fit(Xsubset, y, **fit_params)
            else:
                ####################
                ###  transform   ###
                ####################
                if not self._empty_data:
                    Xres = self._model.transform(Xsubset)
                else:
                    Xres = Xsubset

            if is_fit:
                # Xres is None here when the model was fitted without transforming,
                # in which case the output columns / shape are recorded as None
                self._columns_informations = {
                    "output_columns": getattr(Xres, "columns", None),  # names of transformed columns if exist
                    "output_shape": getattr(Xres, "shape", None),  # shape of transformed result if exist
                    "input_columns": Xsubset_columns,  # name of input columns
                    "input_shape": Xsubset_shape,  # shape of input data
                }

                self._feature_names_for_transform = self.try_to_find_feature_names_all_at_once(
                    output_columns=self._columns_informations["output_columns"],
                    output_shape=self._columns_informations["output_shape"],
                    input_columns=self._columns_informations["input_columns"],
                    input_shape=self._columns_informations["input_shape"],
                )

                # self.kept_features_names = None  # for now

            if is_transform:
                # convert to the desired output type and put back the index of X
                Xres = dsh.convert_generic(Xres, output_type=self.desired_output_type)
                Xres = dsh._set_index(Xres, Xindex)

        else:
            ########################################
            ### Apply the model COLUMN BY COLUMN ###
            ########################################
            if is_fit:
                self._models = []

            # per-column results are only collected when they are needed
            if is_transform or self.must_transform_to_get_features_name:
                all_Xres = []
            else:
                all_Xres = None

            Xsubset = dsh.make2dimensions(Xsubset)

            for j in range(self._expected_nbcols):

                # extract the j-th column, positionally for the pandas-like types
                if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame, DataTypes.Serie):
                    Xsubset_j = Xsubset.iloc[:, j]
                else:
                    Xsubset_j = Xsubset[:, j]

                if is_fit:
                    # one model per column; note that _get_model receives the whole Xsubset
                    sub_model = self._get_model(Xsubset, y)
                    self._models.append(sub_model)
                else:
                    sub_model = self._models[j]

                if not self.work_on_one_column_only:
                    Xsubset_j = dsh.make2dimensions(Xsubset_j)

                if is_fit and is_transform:
                    # fit_transform method
                    Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)

                    all_Xres.append(Xres_j)

                elif is_fit and not is_transform:
                    # fit method
                    if self.must_transform_to_get_features_name:
                        Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                        all_Xres.append(Xres_j)

                    else:
                        sub_model.fit(Xsubset_j, y, **fit_params)

                elif is_transform:
                    # transform method

                    Xres_j = sub_model.transform(Xsubset_j)
                    all_Xres.append(Xres_j)

            if is_fit:

                # same informations as in the "all at once" branch, but with one
                # entry per column for the outputs (None when nothing was transformed)
                self._columns_informations = {
                    "all_output_columns": None
                    if all_Xres is None
                    else [getattr(Xres, "columns", None) for Xres in all_Xres],
                    "all_output_shape": None
                    if all_Xres is None
                    else [getattr(Xres, "shape", None) for Xres in all_Xres],
                    "input_columns": Xsubset_columns,  # name of input columns
                    "input_shape": Xsubset_shape,  # shape of input data
                }

                self._feature_names_for_transform = list(
                    self.try_to_find_feature_names_separate(
                        all_output_columns=self._columns_informations["all_output_columns"],
                        all_output_shape=self._columns_informations["all_output_shape"],
                        input_columns=self._columns_informations["input_columns"],
                        input_shape=self._columns_informations["input_shape"],
                    )
                )

                # self.kept_features_names = None  # for now

            if is_transform:
                # stack the per-column results side by side and put back the index of X
                Xres = dsh.generic_hstack(all_Xres, output_type=self.desired_output_type)
                Xres = dsh._set_index(Xres, Xindex)

        if is_transform:
            if self._feature_names_for_transform is not None:
                ### NOTE (original author): this does not work in transform !!!
                Xres = dsh._set_columns(Xres, self._feature_names_for_transform)

        if is_transform:
            return Xres
        else:
            return self