Пример #1
0
class BSketch:
    """BSketch: binning sketch for numerical values and binary target.

    Parameters
    ----------
    sketch : str, optional (default="gk")
        Sketch algorithm. Supported algorithms are "gk" (Greenwald-Khanna's)
        and "t-digest" (Ted Dunning) algorithm. Algorithm "t-digest" relies on
        `tdigest <https://github.com/CamDavidsonPilon/tdigest>`_.

    eps : float (default=0.01)
        Relative error epsilon.

    K : int (default=25)
        Parameter excess growth K to compute compress threshold in t-digest.

    special_codes : array-like or None, optional (default=None)
        List of special codes. Use special codes to specify the data values
        that must be treated separately.
    """
    def __init__(self, sketch="gk", eps=0.01, K=25, special_codes=None):
        self.sketch = sketch
        self.eps = eps
        self.K = K
        self.special_codes = special_codes

        _check_parameters(sketch, eps, K, special_codes)

        self._count_missing_e = 0
        self._count_missing_ne = 0
        self._count_special_e = 0
        self._count_special_ne = 0

        if sketch == "gk":
            self._sketch_e = GK(eps)
            self._sketch_ne = GK(eps)
        elif sketch == "t-digest":
            self._sketch_e = TDigest(eps, K)
            self._sketch_ne = TDigest(eps, K)

    def add(self, x, y, check_input=False):
        """Add arrays to the sketch.

        Parameters
        ----------
        x : array-like, shape = (n_samples,)
            Training vector, where n_samples is the number of samples.

        y : array-like, shape = (n_samples,)
            Target vector relative to x.

        check_input : bool (default=False)
            Whether to check input arrays.
        """
        xc, yc, xm, ym, xs, ys, _, _, _, _, _, _, _ = split_data(
            dtype=None,
            x=x,
            y=y,
            special_codes=self.special_codes,
            check_input=check_input)

        # Add values to sketch
        mask = yc == 1

        if self.sketch == "gk":
            for v1 in xc[mask]:
                self._sketch_e.add(v1)

            for v0 in xc[~mask]:
                self._sketch_ne.add(v0)

        if self.sketch == "t-digest":
            self._sketch_e.batch_update(xc[mask])
            self._sketch_ne.batch_update(xc[~mask])

        # Keep track of missing and special counts
        n_missing = len(ym)
        if n_missing:
            self._count_missing_e += np.count_nonzero(ym == 1)
            self._count_missing_ne += np.count_nonzero(ym == 0)

        n_special = len(ys)
        if n_special:
            self._count_special_e += np.count_nonzero(ys == 1)
            self._count_special_ne += np.count_nonzero(ys == 0)

    def bins(self, splits):
        """Event and non-events counts for each bin given a list of split
        points.

        Parameters
        ----------
        splits : array-like, shape = (n_splits,)
            List of split points.

        Returns
        -------
        bins : tuple of arrays of size n_splits + 1.
        """
        n_bins = len(splits) + 1
        bins_e = np.zeros(n_bins).astype(np.int64)
        bins_ne = np.zeros(n_bins).astype(np.int64)

        indices_e, count_e = _indices_count(self.sketch, self._sketch_e,
                                            splits)
        indices_ne, count_ne = _indices_count(self.sketch, self._sketch_ne,
                                              splits)

        for i in range(n_bins):
            bins_e[i] = count_e[(indices_e == i)].sum()
            bins_ne[i] = count_ne[(indices_ne == i)].sum()

        return bins_e, bins_ne

    def merge(self, bsketch):
        """Merge current instance with another BSketch instance.

        Parameters
        ----------
        bsketch : object
            BSketch instance.
        """
        if not self._mergeable(bsketch):
            raise Exception("bsketch does not share signature.")

        if bsketch._sketch_e.n == 0 and bsketch._sketch_ne.n == 0:
            return

        if self._sketch_e.n == 0 and self._sketch_ne.n == 0:
            self._copy(bsketch)
            return

        # Merge sketches
        if self.sketch == "gk":
            self._sketch_e.merge(bsketch._sketch_e)
            self._sketch_ne.merge(bsketch._sketch_ne)
        elif self.sketch == "t-digest":
            self._sketch_e += bsketch._sketch_e
            self._sketch_ne += bsketch._sketch_ne

        # Merge missing and special counts
        self._count_missing_e += bsketch._count_missing_e
        self._count_missing_ne += bsketch._count_missing_ne
        self._count_special_e += bsketch._count_special_e
        self._count_special_ne += bsketch._count_special_ne

    def merge_sketches(self):
        """Merge event and non-event data internal sketches."""
        if self.sketch == "gk":
            new_sketch = GK(self.eps)

            new_sketch.merge(self._sketch_e)
            new_sketch.merge(self._sketch_ne)
        else:
            new_sketch = self._sketch_e + self._sketch_ne

        return new_sketch

    def _copy(self, bsketch):
        self._sketch_e = bsketch._sketch_e
        self._sketch_ne = bsketch._sketch_ne

        # Merge missing and special counts
        self._count_missing_e = bsketch._count_missing_e
        self._count_missing_ne = bsketch._count_missing_ne
        self._count_special_e = bsketch._count_special_e
        self._count_special_ne = bsketch._count_special_ne

    def _mergeable(self, other):
        special_eq = True
        if self.special_codes is not None and other.special_codes is not None:
            special_eq = set(self.special_codes) == set(other.special_codes)

        return (self.sketch == other.sketch and self.eps == other.eps
                and self.K == other.K and special_eq)

    @property
    def n_event(self):
        """Event count.

        Returns
        -------
        n_event : int
        """
        count = self._sketch_e.n
        return count + self._count_missing_e + self._count_special_e

    @property
    def n_nonevent(self):
        """Non-event count.

        Returns
        -------
        n_nonevent : int
        """
        count = self._sketch_ne.n
        return count + self._count_missing_ne + self._count_special_ne

    @property
    def n(self):
        """Records count.

        Returns
        -------
        n : int
        """
        return self.n_event + self.n_nonevent