def _preprocess_data(X,
                     y,
                     fit_intercept,
                     epsilon=1.0,
                     bounds_X=None,
                     bounds_y=None,
                     copy=True,
                     check_input=True,
                     **unused_args):
    warn_unused_args(unused_args)

    if check_input:
        X = check_array(X, copy=copy, accept_sparse=False, dtype=FLOAT_DTYPES)
    elif copy:
        X = X.copy(order='K')

    y = np.asarray(y, dtype=X.dtype)
    X_scale = np.ones(X.shape[1], dtype=X.dtype)

    if fit_intercept:
        bounds_X = check_bounds(bounds_X, X.shape[1])
        bounds_y = check_bounds(bounds_y, y.shape[1] if y.ndim > 1 else 1)

        X = clip_to_bounds(X, bounds_X)
        y = clip_to_bounds(y, bounds_y)

        # A throwaway accountant is used here so the caller's (or the global
        # default) budget accountant is not charged for these means
        X_offset = mean(X,
                        axis=0,
                        bounds=bounds_X,
                        epsilon=epsilon,
                        accountant=BudgetAccountant())
        X -= X_offset
        y_offset = mean(y,
                        axis=0,
                        bounds=bounds_y,
                        epsilon=epsilon,
                        accountant=BudgetAccountant())
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_scale
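
A minimal usage sketch for the helper above, assuming numpy is imported as np and the diffprivlib utilities used in the snippet (check_array, check_bounds, clip_to_bounds, mean, BudgetAccountant, warn_unused_args) are in scope at module level; the data and bounds are illustrative:

import numpy as np

# Centre a toy regression dataset with differentially private means.
# Note: each DP mean inside _preprocess_data is computed with the full epsilon.
X = np.array([[0.1, 0.9], [0.4, 0.6], [0.8, 0.2]])
y = np.array([1.0, 2.0, 3.0])

X_c, y_c, X_offset, y_offset, X_scale = _preprocess_data(
    X, y, fit_intercept=True, epsilon=1.0,
    bounds_X=(np.zeros(2), np.ones(2)), bounds_y=(0.0, 3.0))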
    def test_incorrect_parameterisation(self):
        # A plain list is rejected: the array must be a numpy ndarray
        with self.assertRaises(TypeError):
            clip_to_bounds([1, 2, 3], (0, 5))

        # Bounds must be a (lower, upper) tuple, not a list
        with self.assertRaises(TypeError):
            clip_to_bounds(np.ones((5, 1)), [1, 2])

        # Non-numeric bounds are rejected
        with self.assertRaises(Exception):
            clip_to_bounds(np.ones((5, 1)), ("One", "Two"))
Example #3
def _sum(array,
         epsilon=1.0,
         bounds=None,
         accountant=None,
         axis=None,
         dtype=None,
         keepdims=False,
         nan=False):
    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.",
            PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    if axis is not None or keepdims:
        return _wrap_axis(_sum,
                          array,
                          epsilon=epsilon,
                          bounds=bounds,
                          accountant=accountant,
                          axis=axis,
                          dtype=dtype,
                          keepdims=keepdims,
                          nan=nan)

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Let's ravel array to be single-dimensional
    array = clip_to_bounds(np.ravel(array), bounds)

    _func = np.nansum if nan else np.sum
    actual_sum = _func(array, axis=axis, dtype=dtype, keepdims=keepdims)

    # Use a discrete mechanism for integer dtypes, a continuous one otherwise
    if dtype is not None and issubclass(dtype, Integral):
        mech_class = GeometricTruncated
    else:
        mech_class = LaplaceTruncated

    mech = mech_class(epsilon=epsilon,
                      sensitivity=upper - lower,
                      lower=lower * array.size,
                      upper=upper * array.size)
    output = mech.randomise(actual_sum)

    accountant.spend(epsilon, 0)

    return output
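
A usage sketch for _sum, assuming the module-level imports implied by the snippet (numpy as np, the diffprivlib mechanisms and validation helpers); the public wrapper for this helper is diffprivlib.tools.sum:

import numpy as np

rng = np.random.default_rng(42)
counts = rng.integers(0, 10, size=100)

# With explicit bounds no PrivacyLeakWarning is raised; an integer dtype
# selects the GeometricTruncated mechanism, keeping the noisy sum an integer
dp_total = _sum(counts, epsilon=0.5, bounds=(0, 9), dtype=int)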
Example #4
def _var(array,
         epsilon=1.0,
         bounds=None,
         axis=None,
         dtype=None,
         keepdims=False,
         accountant=None,
         nan=False):
    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.",
            PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    if axis is not None or keepdims:
        return _wrap_axis(_var,
                          array,
                          epsilon=epsilon,
                          bounds=bounds,
                          axis=axis,
                          dtype=dtype,
                          keepdims=keepdims,
                          accountant=accountant,
                          nan=nan)

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Let's ravel array to be single-dimensional
    array = clip_to_bounds(np.ravel(array), bounds)

    _func = np.nanvar if nan else np.var
    actual_var = _func(array, axis=axis, dtype=dtype, keepdims=keepdims)

    dp_mech = LaplaceBoundedDomain(
        epsilon=epsilon,
        delta=0,
        sensitivity=((upper - lower) / array.size)**2 * (array.size - 1),
        lower=0,
        upper=float("inf"))
    output = np.minimum(dp_mech.randomise(actual_var), (upper - lower)**2)

    accountant.spend(epsilon, 0)

    return output
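
A usage sketch for _var under the same import assumptions; the public wrapper is diffprivlib.tools.var:

import numpy as np

data = np.array([0.2, 0.4, 0.6, 0.8])

# The noisy variance is clamped to (upper - lower)**2 so that the added
# noise cannot produce an implausibly large estimate
dp_variance = _var(data, epsilon=1.0, bounds=(0, 1))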
    def test_iris(self):
        from sklearn import datasets
        dataset = datasets.load_iris()

        X_train, y_train = dataset.data, dataset.target

        # Pick a clipping threshold between the maxima of the first two samples
        maxes = np.max(X_train, axis=1)
        clip_max = (maxes[0] + maxes[1]) / 2

        X_clipped = clip_to_bounds(X_train, (np.min(X_train), clip_max))
        clipped_maxes = np.max(X_clipped, axis=0)
        self.assertLessEqual(clipped_maxes[0], maxes[0])
        self.assertLessEqual(clipped_maxes[1], maxes[1])
        self.assertTrue(
            np.isclose(clipped_maxes[0], clip_max)
            or np.isclose(clipped_maxes[1], clip_max))
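
As the test above relies on, clip_to_bounds also accepts scalar bounds, clipping every feature to the same interval; a small sketch assuming clip_to_bounds from diffprivlib.validation:

import numpy as np
from diffprivlib.validation import clip_to_bounds

X = np.array([[1.0, 10.0],
              [5.0, 2.0]])
X_clipped = clip_to_bounds(X, (0.0, 4.0))
# X_clipped is array([[1., 4.],
#                     [4., 2.]])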
Example #6
def _mean(array,
          epsilon=1.0,
          bounds=None,
          axis=None,
          dtype=None,
          keepdims=False,
          accountant=None,
          nan=False):
    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.",
            PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    if axis is not None or keepdims:
        return _wrap_axis(_mean,
                          array,
                          epsilon=epsilon,
                          bounds=bounds,
                          axis=axis,
                          dtype=dtype,
                          keepdims=keepdims,
                          accountant=accountant,
                          nan=nan)

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    array = clip_to_bounds(np.ravel(array), bounds)

    _func = np.nanmean if nan else np.mean
    actual_mean = _func(array, axis=axis, dtype=dtype, keepdims=keepdims)

    mech = LaplaceTruncated(epsilon=epsilon,
                            delta=0,
                            sensitivity=(upper - lower) / array.size,
                            lower=lower,
                            upper=upper)
    output = mech.randomise(actual_mean)

    accountant.spend(epsilon, 0)

    return output
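
A usage sketch for _mean under the same import assumptions; the public wrapper is diffprivlib.tools.mean:

import numpy as np

ages = np.array([23.0, 45.0, 31.0, 62.0, 28.0])

# Sensitivity of the mean is (upper - lower) / n, so larger samples need
# proportionally less noise for the same epsilon
dp_mean = _mean(ages, epsilon=0.1, bounds=(18, 80))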
Example #7
    def fit(self, X, y=None, sample_weight=None):
        """Computes k-means clustering with differential privacy.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            Training instances to cluster.

        y : Ignored
            Not used, present here for API consistency by convention.

        sample_weight : ignored
            Ignored by diffprivlib.  Present for consistency with sklearn API.

        Returns
        -------
        self : object

        """
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        del y

        X = check_array(X, accept_sparse=False, dtype=[np.float64, np.float32])
        n_samples, n_dims = X.shape

        if n_samples < self.n_clusters:
            raise ValueError("n_samples=%d should be >= n_clusters=%d" % (n_samples, self.n_clusters))

        iters = self._calc_iters(n_dims, n_samples)

        if self.bounds is None:
            warnings.warn("Bounds have not been specified and will be calculated on the data provided.  This will "
                          "result in additional privacy leakage. To ensure differential privacy and no additional "
                          "privacy leakage, specify `bounds` for each dimension.", PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = check_bounds(self.bounds, n_dims, min_separation=1e-5)
        X = clip_to_bounds(X, self.bounds)

        centers = self._init_centers(n_dims)
        labels = None
        distances = None

        # Run _update_centers first to ensure consistency of `labels` and `centers`, since convergence is unlikely
        for _ in range(-1, iters):
            if labels is not None:
                centers = self._update_centers(X, centers=centers, labels=labels, dims=n_dims, total_iters=iters)

            distances, labels = self._distances_labels(X, centers)

        self.cluster_centers_ = centers
        self.labels_ = labels
        self.inertia_ = distances[np.arange(len(labels)), labels].sum()
        self.n_iter_ = iters

        self.accountant.spend(self.epsilon, 0)

        return self
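
A usage sketch for the fit method via the public estimator; this assumes diffprivlib.models.KMeans accepts these keyword arguments, with bounds supplied up front to avoid the PrivacyLeakWarning:

import numpy as np
from diffprivlib.models import KMeans

rng = np.random.default_rng(0)
X = rng.random((100, 2))

kmeans = KMeans(n_clusters=3, epsilon=2.0, bounds=(np.zeros(2), np.ones(2)))
kmeans.fit(X)
print(kmeans.cluster_centers_)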
def quantile(array,
             quant,
             epsilon=1.0,
             bounds=None,
             axis=None,
             keepdims=False,
             accountant=None,
             **unused_args):
    r"""
    Compute the differentially private quantile of the array.

    Returns the specified quantile with differential privacy.  The quantile is calculated over the flattened array.
    Differential privacy is achieved with the :class:`.Exponential` mechanism, using the method first proposed by
    Smith, 2011.

    Paper link: https://dl.acm.org/doi/pdf/10.1145/1993636.1993743

    Parameters
    ----------
    array : array_like
        Array containing numbers whose quantile is sought.  If `array` is not an array, a conversion is attempted.

    quant : float or array-like
        Quantile or array of quantiles.  Each quantile must be in the unit interval [0, 1].  If quant is array-like,
        quantiles are returned over the flattened array.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.  Differential privacy is achieved over the entire output, with epsilon split
        evenly between each output value.

    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).

    axis : None or int or tuple of ints, optional
        Axis or axes along which the quantile is computed.  The default, axis=None, computes the quantile over the
        flattened array.  If axis is negative it counts from the last to the first axis.

        If axis is a tuple of ints, the quantile is computed over all of the axes specified in the tuple instead of a
        single axis or all the axes as before.

    keepdims : bool, default: False
        If this is set to True, the axes which are reduced are left in the result as dimensions with size one.  With
        this option, the result will broadcast correctly against the input array.

        If the default value is passed, then `keepdims` will not be passed through to the `quantile` method of
        sub-classes of `ndarray`, however any non-default value will be.  If the sub-class' method does not implement
        `keepdims`, any exceptions will be raised.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    m : ndarray
        Returns a new array containing the quantile values.

    See Also
    --------
    numpy.quantile : Equivalent non-private method.

    percentile, median

    """
    warn_unused_args(unused_args)

    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.",
            PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    quant = np.ravel(quant)

    if np.any(quant < 0) or np.any(quant > 1):
        raise ValueError("Quantiles must be in the unit interval [0, 1].")

    if len(quant) > 1:
        return np.array([
            quantile(array,
                     q_i,
                     epsilon=epsilon / len(quant),
                     bounds=bounds,
                     axis=axis,
                     keepdims=keepdims,
                     accountant=accountant) for q_i in quant
        ])

    # Dealing with a single quant from now on
    quant = quant.item()

    if axis is not None or keepdims:
        return _wrap_axis(quantile,
                          array,
                          quant=quant,
                          epsilon=epsilon,
                          bounds=bounds,
                          axis=axis,
                          keepdims=keepdims,
                          accountant=accountant)

    # Dealing with a scalar output from now on
    bounds = check_bounds(bounds, shape=0, min_separation=1e-5)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Let's ravel array to be single-dimensional
    array = clip_to_bounds(np.ravel(array), bounds)

    k = array.size
    array = np.append(array, list(bounds))
    array.sort()

    interval_sizes = np.diff(array)

    # Todo: Need to find a way to do this in a differentially private way
    if np.isnan(interval_sizes).any():
        return np.nan

    mech = Exponential(epsilon=epsilon,
                       sensitivity=1,
                       utility=list(-np.abs(np.arange(0, k + 1) - quant * k)),
                       measure=list(interval_sizes))
    idx = mech.randomise()
    # Sample uniformly within the chosen interval
    output = mech._rng.random() * (array[idx + 1] - array[idx]) + array[idx]

    accountant.spend(epsilon, 0)

    return output
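
A usage sketch for quantile; epsilon is split evenly across the requested quantiles, as documented above. The data and bounds are illustrative:

import numpy as np

rng = np.random.default_rng(1)
incomes = rng.exponential(scale=30000, size=1000)

# Differentially private median and interquartile range
q25, q50, q75 = quantile(incomes, [0.25, 0.5, 0.75], epsilon=1.0,
                         bounds=(0, 200000))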
Example #9
    def _partial_fit(self,
                     X,
                     y,
                     classes=None,
                     _refit=False,
                     sample_weight=None):
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        X, y = check_X_y(X, y)

        if self.bounds is None:
            warnings.warn(
                "Bounds have not been specified and will be calculated on the data provided. This will "
                "result in additional privacy leakage. To ensure differential privacy and no additional "
                "privacy leakage, specify bounds for each dimension.",
                PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = check_bounds(self.bounds, shape=X.shape[1])
        X = clip_to_bounds(X, self.bounds)

        # Note: epsilon_ is sklearn's variance-smoothing term added to sigma_;
        # it is unrelated to the privacy parameter self.epsilon
        self.epsilon_ = self.var_smoothing

        if _refit:
            self.classes_ = None

        if _check_partial_fit_first_call(self, classes):
            n_features = X.shape[1]
            n_classes = len(self.classes_)
            self.theta_ = np.zeros((n_classes, n_features))
            self.sigma_ = np.zeros((n_classes, n_features))

            self.class_count_ = np.zeros(n_classes, dtype=np.float64)

            if self.priors is not None:
                priors = np.asarray(self.priors)

                if len(priors) != n_classes:
                    raise ValueError(
                        "Number of priors must match number of classes.")
                if not np.isclose(priors.sum(), 1.0):
                    raise ValueError("The sum of the priors should be 1.")
                if (priors < 0).any():
                    raise ValueError("Priors must be non-negative.")
                self.class_prior_ = priors
            else:
                # Initialize the priors to zeros for each class
                self.class_prior_ = np.zeros(len(self.classes_),
                                             dtype=np.float64)
        else:
            if X.shape[1] != self.theta_.shape[1]:
                raise ValueError(
                    "Number of features %d does not match previous data %d." %
                    (X.shape[1], self.theta_.shape[1]))
            # Put epsilon back in each time
            self.sigma_[:, :] -= self.epsilon_

        classes = self.classes_

        unique_y = np.unique(y)
        unique_y_in_classes = np.isin(unique_y, classes)

        if not np.all(unique_y_in_classes):
            raise ValueError(
                "The target label(s) %s in y do not exist in the initial classes %s"
                % (unique_y[~unique_y_in_classes], classes))

        noisy_class_counts = self._noisy_class_counts(y)

        for _i, y_i in enumerate(unique_y):
            i = classes.searchsorted(y_i)
            X_i = X[y == y_i, :]

            n_i = noisy_class_counts[_i]

            new_theta, new_sigma = self._update_mean_variance(
                self.class_count_[i],
                self.theta_[i, :],
                self.sigma_[i, :],
                X_i,
                n_noisy=n_i)

            self.theta_[i, :] = new_theta
            self.sigma_[i, :] = new_sigma
            self.class_count_[i] += n_i

        self.sigma_[:, :] += self.epsilon_

        # Only update the priors if none were provided
        if self.priors is None:
            # Empirical prior (sample_weight is ignored by diffprivlib)
            self.class_prior_ = self.class_count_ / self.class_count_.sum()

        self.accountant.spend(self.epsilon, 0)

        return self
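
A usage sketch for _partial_fit via the public estimator; this assumes diffprivlib.models.GaussianNB with epsilon and bounds parameters. The data are illustrative:

import numpy as np
from diffprivlib.models import GaussianNB

X = np.array([[0.1, 0.2], [0.3, 0.4], [0.8, 0.9], [0.7, 0.6]])
y = np.array([0, 0, 1, 1])

clf = GaussianNB(epsilon=1.0, bounds=(np.zeros(2), np.ones(2)))
clf.fit(X, y)
print(clf.predict(np.array([[0.2, 0.3]])))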
    def partial_fit(self, X, y=None, sample_weight=None):
        """Online computation of mean and std with differential privacy on X for later scaling.  All of X is processed
        as a single batch.  This is intended for cases when `fit` is not feasible due to very large number of
        `n_samples` or because X is read from a continuous stream.

        The algorithm for incremental mean and std is given in Equation 1.5a,b in Chan, Tony F., Gene H. Golub, and
        Randall J. LeVeque. "Algorithms for computing the sample variance: Analysis and recommendations." The American
        Statistician 37.3 (1983): 242-247:

        Parameters
        ----------
        X : {array-like}, shape [n_samples, n_features]
            The data used to compute the mean and standard deviation used for later scaling along the features axis.

        y
            Ignored

        sample_weight
            Ignored by diffprivlib.  Present for consistency with sklearn API.

        """
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        epsilon_0 = self.epsilon / 2 if self.with_std else self.epsilon

        X = check_array(X,
                        accept_sparse=False,
                        copy=self.copy,
                        estimator=self,
                        dtype=FLOAT_DTYPES,
                        force_all_finite='allow-nan')
        # Hotfix for sklearn v 0.23
        self.n_features_in_ = X.shape[1]

        if self.bounds is None:
            warnings.warn(
                "Bounds haven't been specified, so falling back to determining bounds from the data.\n"
                "This will result in additional privacy leakage.  To ensure differential privacy with no "
                "additional privacy loss, specify `bounds` for each dimension.",
                PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = check_bounds(self.bounds, X.shape[1])
        X = clip_to_bounds(X, self.bounds)

        # Even in the case of `with_mean=False`, we update the mean anyway, as it is needed for the incremental
        # computation of the variance.  See incr_mean_variance_axis and _incremental_mean_variance_axis.

        # if n_samples_seen_ is an integer (i.e. no missing values), we need to transform it to a NumPy array of
        # shape (n_features,) required by incr_mean_variance_axis and _incremental_variance_axis
        if hasattr(self, 'n_samples_seen_') and isinstance(
                self.n_samples_seen_, (int, np.integer)):
            self.n_samples_seen_ = np.repeat(self.n_samples_seen_,
                                             X.shape[1]).astype(np.int64)

        if not hasattr(self, 'n_samples_seen_'):
            self.n_samples_seen_ = np.zeros(X.shape[1], dtype=np.int64)

        # First pass
        if not hasattr(self, 'scale_'):
            self.mean_ = 0.0
            if self.with_std:
                self.var_ = 0.0
            else:
                self.var_ = None

        if not self.with_mean and not self.with_std:
            self.mean_ = None
            self.var_ = None
            self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
        else:
            self.mean_, self.var_, self.n_samples_seen_ = _incremental_mean_and_var(
                X, epsilon_0, self.bounds, self.mean_, self.var_,
                self.n_samples_seen_)

        # for backward-compatibility, reduce n_samples_seen_ to an integer
        # if the number of samples is the same for each feature (i.e. no
        # missing values)
        if np.ptp(self.n_samples_seen_) == 0:
            self.n_samples_seen_ = self.n_samples_seen_[0]

        if self.with_std:
            self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
        else:
            self.scale_ = None

        self.accountant.spend(self.epsilon, 0)

        return self
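
A usage sketch for partial_fit via the public transformer; this assumes diffprivlib.models.StandardScaler, with bounds given to avoid the PrivacyLeakWarning:

import numpy as np
from diffprivlib.models import StandardScaler

rng = np.random.default_rng(2)
X = rng.random((50, 3))

scaler = StandardScaler(epsilon=1.0, bounds=(np.zeros(3), np.ones(3)))
X_scaled = scaler.fit_transform(X)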
    def test_different_bounds(self):
        X = np.ones((10, 2))

        X_clipped = clip_to_bounds(X, ([0, 0], [0.5, 1]))
        self.assertTrue(np.all(X_clipped[:, 0] == 0.5))
        self.assertTrue(np.all(X_clipped[:, 1] == 1))