Пример #1
0
    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """ Predict the likelihood of each class.

        The input is passed through the fitted scaler first when one is
        present (`self.scaler_` is not `None`), then wrapped in an
        `xgb.DMatrix` and scored with the best booster.

        This function will only work as expected if training
        used the `binary:logistic` loss.

        Parameters
        ----------
        X : numpy.ndarray
            The input data

        Returns
        -------
        y_proba_pred : numpy.ndarray
            The probabilistic predictions, as returned by
            `xgbooster_predict_proba`
        """
        validation_utils.check_is_fitted(self, 'best_booster_', self.name)

        # apply the same scaling as used during training, if any
        if self.scaler_ is not None:
            self.log("transforming the input data", logging.DEBUG)
            X = self.scaler_.transform(X)

        return xgbooster_predict_proba(self.best_booster_, xgb.DMatrix(X))
Пример #2
0
    def inverse_transform(self, y):
        """Transform labels back to original encoding.

        Missing entries (anything flagged by `pd.isnull`) are decoded as the
        last entry of `classes_`. The caller's `y` is not modified.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Encoded target values. Non-missing entries must be integers in
            the half-open range [0, n_classes).

        Returns
        -------
        y : numpy array of shape [n_samples]
            The entries of `classes_` selected by the encoded values.
            NOTE(review): this indexes `classes_` with an integer array, so
            `classes_` is assumed to be a numpy array -- confirm.

        Raises
        ------
        ValueError
            If `y` contains a non-missing value which is not a valid index
            into `classes_`.
        """
        check_is_fitted(self, 'classes_')

        # work on a copy so the marker written below never leaks into the
        # caller's data
        y = np.array(y)

        # mark the nan's; they are decoded through the last class index
        m_nan = pd.isnull(y)
        y[m_nan] = len(self.classes_) - 1

        diff = np.setdiff1d(y[~m_nan],
                            np.arange(len(self.classes_), dtype=object))

        # the truth value of an array is ambiguous (several elements) or
        # deprecated/an error (no elements), so test the size explicitly
        if diff.size > 0:
            raise ValueError("y contains new labels: {}".format(str(diff)))

        y = np.asarray(y, dtype=int)
        return self.classes_[y]
Пример #3
0
    def kneighbors_graph(self, n_neighbors=None, as_nx=True):
        """ Build the k-nearest neighbors graph for the training data

        Please see the `sklearn` documentation for more details of the
        semantics of this method:

            http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html

        Parameters
        ----------
        n_neighbors: int
            The number of neighbors. Default: the value passed to the constructor

        as_nx: bool
            Whether to return the graph as a networkx Graph data structure
            (`True`) or a scipy sparse matrix (`False`).

        Returns
        -------
        kneighbors_graph: graph
            The k-nearest neighbors graph. Please see the documentation
            referenced above for more details.
        """
        check_is_fitted(self, ["knn_"])

        if n_neighbors is None:
            n_neighbors = self.n_neighbors

        kneighbors_graph = self.knn_.kneighbors_graph(n_neighbors=n_neighbors)

        if as_nx:
            # networkx 3.0 removed `from_scipy_sparse_matrix`; prefer its
            # replacement when it exists and fall back on older releases
            to_graph = getattr(nx, "from_scipy_sparse_array", None)
            if to_graph is None:
                to_graph = nx.from_scipy_sparse_matrix
            kneighbors_graph = to_graph(kneighbors_graph)

        return kneighbors_graph
Пример #4
0
    def transform(self, y):
        """Transform labels to normalized encoding.

        Missing values (as detected by `pd.isnull`) are replaced by
        `self.missing_value_marker` for the lookup and come back as `np.nan`
        in the result. When `self.treat_unknown_as_missing` is set, labels
        which are not in `self.classes_` are handled the same way.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Target values.

        Returns
        -------
        y : array-like of shape [n_samples]
            Float array holding `self.classes_[label]` for every label, with
            `np.nan` at the missing positions.

        Raises
        ------
        ValueError
            If, after substituting the missing-value marker, a label is not
            found in `self.classes_`.
        """
        check_is_fitted(self, 'classes_')

        # an object-dtype copy lets the marker be stored next to any label type
        labels = np.array(
            sklearn.utils.column_or_1d(y, warn=True), dtype=object)

        # positions which must come back as NaN
        is_missing = pd.isnull(labels)

        # optionally treat unknown labels as missing, too
        if self.treat_unknown_as_missing:
            is_missing |= np.array(
                [label not in self.classes_ for label in labels])

        labels[is_missing] = self.missing_value_marker

        # then make sure we know all of the labels
        observed_labels = np.unique(labels)

        if len(self.classes_) < 20:
            msg = ("[nan_label_encoder.transform] observed labels: {}. "
                   "self.classes_: {}".format(observed_labels, self.classes_))
        else:
            msg = ("[nan_le.transform] too many classes to print observed "
                   "labels")
        logger.debug(msg)

        unknown_labels = [
            label for label in observed_labels if label not in self.classes_
        ]
        if unknown_labels:
            msg = ("[nan_label_encoder.transform] y contains new labels: {}".
                   format(str(unknown_labels)))
            raise ValueError(msg)

        # encode, then put the NaNs back in
        encoded = np.array(
            [self.classes_[label] for label in labels]).astype(float)
        encoded[is_missing] = np.nan
        return encoded
Пример #5
0
    def transform(self, X, *_):
        """ Standardize the fitted columns of `X`

        The columns listed in `self.columns_` are shifted by `self.col_mean_`
        and divided by `self.col_std_`. Before scaling, the positions in
        `self.col_ignore_` (relative to the selected columns) are set to 0.
        The work is done on a copy; the argument itself is not modified.

        Parameters
        ----------
        X: pd.DataFrame or np.ndarray
            The data to scale. A one-dimensional array is reshaped to a
            single column first.

        Returns
        -------
        X: pd.DataFrame or np.ndarray
            A copy of the input with the selected columns replaced by their
            scaled values

        Raises
        ------
        ValueError
            If `X` is neither a `pd.DataFrame` nor an `np.ndarray`
        """
        validation_utils.check_is_fitted(
            self, ['col_mean_', 'col_std_', 'columns_', 'col_ignore_'])

        # columns which were not seen in training, or which had only a single
        # value, cannot be scaled in any meaningful way; those are the ones
        # zeroed out through `col_ignore_` below

        # do not overwrite our original information
        X = X.copy()
        is_frame = isinstance(X, pd.DataFrame)

        # pull out the columns to scale, depending on the type of X
        if is_frame:
            selected = X[self.columns_].copy()
            selected.iloc[:, self.col_ignore_] = 0
        elif isinstance(X, np.ndarray):
            # a single vector is treated as one column
            if X.ndim == 1:
                X = X.reshape(-1, 1)

            selected = X[:, self.columns_]
            selected[:, self.col_ignore_] = 0
        else:
            msg = ("[NanStandardScaler.transform]: unrecognized data type: {}".
                   format(type(X)))
            raise ValueError(msg)

        scaled = (selected - self.col_mean_) / self.col_std_

        # and stick the columns back
        if is_frame:
            X[self.columns_] = scaled
        else:
            X[:, self.columns_] = scaled

        return X
Пример #6
0
    def transform(self, *_, **__):
        """ Transform the data provided to the constructor

        All arguments are ignored. The numeric data (`self.num_data`) goes
        through `self.num_pipeline_`, each entry of `self.bow_tokens` goes
        through the pipeline at the same position in `self.bow_pipelines_`,
        and the results are stacked horizontally with the numeric block last.

        Returns
        -------
        Xt: scipy sparse matrix in CSR format
            The stacked, transformed data
        """
        validation_utils.check_is_fitted(
            self, ["num_pipeline_", "bow_pipelines_"])

        numeric_block = scipy.sparse.csr_matrix(
            self.num_pipeline_.transform(self.num_data))

        blocks = [
            pipeline.transform(tokens)
            for pipeline, tokens in zip(self.bow_pipelines_, self.bow_tokens)
        ]
        blocks.append(numeric_block)

        # we generally want to index, so convert to csr
        return scipy.sparse.hstack(blocks).tocsr()
Пример #7
0
    def transform(self, X, *_):
        """ Encode the categorical features of `X`, tolerating missing values

        Missing categorical entries are temporarily replaced by 0 so that the
        fitted encoder (`self.enc_`) accepts them. Afterwards, for each
        categorical feature, the output column at that feature's offset in
        `self.enc_.feature_indices_` is reset to 0 in the affected rows.
        The work is done on a copy of `X`.

        Parameters
        ----------
        X: array
            The data to encode. A one-dimensional input is reshaped to a
            single column.

        Returns
        -------
        Xt: matrix
            The encoded data, as produced by `_encode_selected` (converted to
            CSR format when `self.sparse` is set)
        """
        check_is_fitted(self, "enc_")

        X = X.copy()

        # make sure we have a matrix rather than a vector
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)

        # first, replace the missing categorical values with 0, remembering
        # where they were
        missing_rows = {}
        for feature in self.categorical_features:
            is_missing = pd.isnull(X[:, feature])
            missing_rows[feature] = is_missing

            # only the local copy of X is modified here
            X[is_missing, feature] = 0

        # now, encode the categorical values (ignoring whatever is in the
        # other fields)
        Xt = _encode_selected(X,
                              self.enc_,
                              selected=self.categorical_features,
                              copy=True)

        if self.sparse:
            Xt = Xt.tocsr()

        # and clear out the placeholder written for the missing values
        for position, feature in enumerate(self.categorical_features):
            column = self.enc_.feature_indices_[position:position + 1]
            Xt[missing_rows[feature], column] = 0

        return Xt
Пример #8
0
    def inverse_transform(self, X, *_, **__):
        """ Transform labels back to the original encoding

        Each column in `self.columns` is decoded with its own encoder from
        `self.le_`. All other columns are carried over unchanged, and the
        input itself is not modified.

        Parameters
        ----------
        X: np.ndarray or pd.DataFrame
            The encoded data. Columns are addressed positionally
            (`X[:, c]`) when `self.is_np_array_` is set and by label
            (`X[c]`) otherwise.

        Returns
        -------
        X: same type as the input
            A copy of the input with the selected columns decoded
        """
        check_is_fitted(self, "le_")

        # work on a copy so that everything we do not decode is kept as-is
        X = X.copy()

        for column in self.columns:
            decoder = self.le_[column]

            if self.is_np_array_:
                # np.array: slice the column out positionally
                X[:, column] = decoder.inverse_transform(X[:, column])
            else:
                # pd.DataFrame: select the column by its label
                X[column] = decoder.inverse_transform(X[column])

        return X
Пример #9
0
    def transform(self, X, *_, **__):
        """ Encode the respective columns of X

        Each column in `self.columns` is encoded with its own encoder from
        `self.le_`. All other columns are carried over unchanged, and the
        input itself is not modified.

        Parameters
        ----------
        X: np.ndarray or pd.DataFrame
            The data to encode. Columns are addressed positionally
            (`X[:, c]`) when `self.is_np_array_` is set and by label
            (`X[c]`) otherwise.

        Returns
        -------
        X: same type as the input
            A copy of the input with the selected columns encoded
        """
        check_is_fitted(self, "le_")

        # work on a copy so that everything we do not encode is kept as-is
        X = X.copy()

        for column in self.columns:
            encoder = self.le_[column]

            if self.is_np_array_:
                # np.array: slice the column out positionally
                X[:, column] = encoder.transform(X[:, column])
            else:
                # pd.DataFrame: select the column by its label
                X[column] = encoder.transform(X[column])

        return X
Пример #10
0
    def kneighbors(self,
                   X,
                   n_neighbors=None,
                   return_distance=False,
                   as_np=False):
        """ Find the k nearest neighbor of each instance in `X`

        If specified in the constructor, then the data will be scaled (using
        the respective parameters learned from the training data).

        The full query-by-training distance matrix is computed with
        `scipy.spatial.distance.cdist` and handed to the fitted `knn_` model.

        N.B. This method is not implemented especially efficiently.

        Parameters
        ----------
        X: data matrix
            Missing values are represented using np.nan

        n_neighbors: int
            The number of neighbors. Default: the value passed to the
            constructor

        return_distance: bool
            Whether to return the distances to the neighbors (`True`) or not
            (`False`)

        as_np: bool
            Whether to return the neighbors as an `np.array` (`True`) or a list
            of lists (`False`)

        Returns
        -------
        distance: np.array
            The distances to the neighbors. This is only present if
            `return_distance` is `True`.

        neighbors: np.array or list of lists
            The indices of the nearest neighbors of each entity in `X` from the
            original training set.
        """
        check_is_fitted(self, ["knn_"])

        # check if we need to scale the data
        if self.scale:
            X = self.scaler_.transform(X)

        # a single query vector becomes a one-row matrix
        X = np.atleast_2d(X)

        # distance from each query point (rows) to each indexed training
        # point (columns)
        pairwise = scipy.spatial.distance.cdist(X,
                                                self.X,
                                                metric=self.knn_metric_)

        # replace infs by very large finite numbers
        # NOTE(review): `np.nan_to_num` also turns NaN distances into 0, which
        # makes them look like exact matches -- confirm the metric never
        # yields NaN
        pairwise = np.nan_to_num(pairwise)

        if n_neighbors is None:
            n_neighbors = self.n_neighbors

        result = self.knn_.kneighbors(X=pairwise,
                                      n_neighbors=n_neighbors,
                                      return_distance=return_distance)

        # with distances requested, the model returns (distances, indices)
        if return_distance:
            distances, neighbors = result[0], result[1]
        else:
            neighbors = result

        # check if we want the list of lists
        if not as_np:
            neighbors = [list(row) for row in neighbors]

        if return_distance:
            return (distances, neighbors)
        return neighbors