Example #1
    def __init__(self,
                 epsilon=1.0,
                 data_norm=None,
                 tol=1e-4,
                 C=1.0,
                 fit_intercept=True,
                 max_iter=100,
                 verbose=0,
                 warm_start=False,
                 n_jobs=None,
                 accountant=None,
                 **unused_args):
        super().__init__(penalty='l2',
                         dual=False,
                         tol=tol,
                         C=C,
                         fit_intercept=fit_intercept,
                         intercept_scaling=1.0,
                         class_weight=None,
                         random_state=None,
                         solver='lbfgs',
                         max_iter=max_iter,
                         multi_class='ovr',
                         verbose=verbose,
                         warm_start=warm_start,
                         n_jobs=n_jobs)
        self.epsilon = epsilon
        self.data_norm = data_norm
        self.classes_ = None
        self.accountant = BudgetAccountant.load_default(accountant)

        warn_unused_args(unused_args)
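
A minimal usage sketch for the constructor above. It assumes the snippet belongs to diffprivlib's differentially private LogisticRegression and that diffprivlib.models.LogisticRegression is the import path; both are assumptions, not shown in this excerpt.

import numpy as np
from diffprivlib.models import LogisticRegression  # assumed import path

X = np.random.rand(100, 3)            # rows lie in [0, 1)^3, so their 2-norm is below sqrt(3)
y = np.random.randint(0, 2, 100)

# Supplying data_norm up front avoids a PrivacyLeakWarning and extra leakage
clf = LogisticRegression(epsilon=1.0, data_norm=np.sqrt(3))
clf.fit(X, y)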
Example #2
    def _partial_fit(self,
                     X,
                     y,
                     classes=None,
                     _refit=False,
                     sample_weight=None):
        if sample_weight is not None:
            warn_unused_args("sample_weight")

        # Store size of current X to apply differential privacy later on
        self.new_n_samples = X.shape[0]

        if self.bounds is None:
            warnings.warn(
                "Bounds have not been specified and will be calculated on the data provided. This will "
                "result in additional privacy leakage. To ensure differential privacy and no additional "
                "privacy leakage, specify bounds for each dimension.",
                PrivacyLeakWarning)
            self.bounds = list(zip(np.min(X, axis=0), np.max(X, axis=0)))

        self.bounds = _check_bounds(self.bounds, X.shape[1])

        super()._partial_fit(X, y, classes, _refit, sample_weight=None)

        del self.new_n_samples
        return self
Example #3
def percentile(array, percent, epsilon=1.0, bounds=None, axis=None, keepdims=False, accountant=None, **unused_args):
    r"""
    Compute the differentially private percentile of the array.

    This method calls :obj:`.quantile`, where quantile = percentile / 100.

    Parameters
    ----------
    array : array_like
        Array containing numbers whose percentile is sought.  If `array` is not an array, a conversion is attempted.

    percent : float or array-like
        Percentile or list of percentiles sought.  Each percentile must be in [0, 100].  If percent is array-like,
        percentiles are returned over the flattened array.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.  Differential privacy is achieved over the entire output, with epsilon split
        evenly between each output value.

    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).

    axis : None or int or tuple of ints, optional
        Axis or axes along which the percentiles are computed.  The default, axis=None, computes the percentile over
        the flattened array.  If axis is negative it counts from the last to the first axis.

        If axis is a tuple of ints, the percentile is computed over all of the axes specified in the tuple instead of
        a single axis or all the axes as before.

    keepdims : bool, default: False
        If this is set to True, the axes which are reduced are left in the result as dimensions with size one.  With
        this option, the result will broadcast correctly against the input array.

        If the default value is passed, then `keepdims` will not be passed through to the `percentile` method of
        sub-classes of `ndarray`, however any non-default value will be.  If the sub-class' method does not implement
        `keepdims` any exceptions will be raised.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    m : ndarray
        Returns a new array containing the percentile values.

    See Also
    --------
    numpy.percentile : Equivalent non-private method.

    quantile, median

    """
    warn_unused_args(unused_args)

    quant = np.asarray(percent) / 100

    if np.any(quant < 0) or np.any(quant > 1):
        raise ValueError("Percentiles must be between 0 and 100 inclusive")

    return quantile(array, quant, epsilon=epsilon, bounds=bounds, axis=axis, keepdims=keepdims, accountant=accountant)
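
A short usage sketch of percentile as documented above; the bounds are assumed to be known ahead of time (e.g. from the data schema), so the underlying quantile call raises no PrivacyLeakWarning.

import numpy as np

ages = np.array([23, 35, 41, 29, 52, 64, 18, 47])

# Single percentile with its own privacy budget
p90 = percentile(ages, 90, epsilon=0.5, bounds=(0, 100))

# Several percentiles at once: epsilon is split evenly across the three outputs
quartiles = percentile(ages, [25, 50, 75], epsilon=1.0, bounds=(0, 100))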
Example #4
    def fit(self, X, y=None, sample_weight=None):
        """Computes k-means clustering with differential privacy.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            Training instances to cluster.

        y : Ignored
            not used, present here for API consistency by convention.

        sample_weight : Ignored
            Not used in diffprivlib, present here for consistency with :obj:`sklearn.cluster.KMeans`. Specifying this
            parameter will result in a :class:`.DiffprivlibCompatibilityWarning`.

        Returns
        -------
        self : class

        """
        if sample_weight is not None:
            warn_unused_args("sample_weight")

        del y

        if X.ndim != 2:
            raise ValueError(
                "Expected 2D array, got array with %d dimensions instead. Reshape your data using array.reshape(-1, 1),"
                "or array.reshape(1, -1) if your data contains only one sample." % X.ndim)

        n_samples, n_dims = X.shape

        iters = self._calc_iters(n_dims, n_samples)

        if self.bounds is None:
            warnings.warn("Bounds have not been specified and will be calculated on the data provided.  This will "
                          "result in additional privacy leakage. To ensure differential privacy and no additional "
                          "privacy leakage, specify `bounds` for each dimension.", PrivacyLeakWarning)
            self.bounds = list(zip(np.min(X, axis=0), np.max(X, axis=0)))

        self.bounds = _check_bounds(self.bounds, n_dims)

        centers = self._init_centers(n_dims)
        labels = None
        distances = None

        # Run _update_centers first to ensure consistency of `labels` and `centers`, since convergence unlikely
        for _ in range(-1, iters):
            if labels is not None:
                centers = self._update_centers(X, centers=centers, labels=labels, dims=n_dims, total_iters=iters)

            distances, labels = self._distances_labels(X, centers)

        self.cluster_centers_ = centers
        self.labels_ = labels
        self.inertia_ = distances[np.arange(len(labels)), labels].sum()
        self.n_iter_ = iters

        return self
Example #5
def nanmean(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=np._NoValue, accountant=None,
            **unused_args):
    r"""
    Compute the differentially private arithmetic mean along the specified axis, ignoring NaNs.

    Returns the average of the array elements with differential privacy.  The average is taken over the flattened array
    by default, otherwise over the specified axis.  Noise is added using :class:`.Laplace` to satisfy differential
    privacy, where sensitivity is calculated using `bounds`.  Users are advised to consult the documentation of
    :obj:`numpy.mean` for further details, as the behaviour of `mean` closely follows its Numpy variant.

    For all-NaN slices, NaN is returned and a `RuntimeWarning` is raised.

    Parameters
    ----------
    array : array_like
        Array containing numbers whose mean is desired.  If `array` is not an array, a conversion is attempted.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.

    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).

    axis : int or tuple of ints, optional
        Axis or axes along which the means are computed.  The default is to compute the mean of the flattened array.

        If this is a tuple of ints, a mean is performed over multiple axes, instead of a single axis or all the axes as
        before.

    dtype : data-type, optional
        Type to use in computing the mean.  For integer inputs, the default is `float64`; for floating point inputs, it
        is the same as the input dtype.

    keepdims : bool, optional
        If this is set to True, the axes which are reduced are left in the result as dimensions with size one.  With
        this option, the result will broadcast correctly against the input array.

        If the default value is passed, then `keepdims` will not be passed through to the `mean` method of sub-classes
        of `ndarray`, however any non-default value will be.  If the sub-class' method does not implement `keepdims` any
        exceptions will be raised.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    m : ndarray, see dtype parameter above
        Returns a new array containing the mean values.

    See Also
    --------
    std, var, mean

    """
    warn_unused_args(unused_args)

    return _mean(array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims,
                 accountant=accountant, nan=True)
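
A brief usage sketch of nanmean; the single (min, max) pair is assumed to bound every column, and NaN entries are ignored as described above.

import numpy as np

readings = np.array([[1.2, 3.4, np.nan],
                     [2.0, np.nan, 4.5]])

# Column-wise private means, values assumed to lie in [0, 5]
col_means = nanmean(readings, epsilon=1.0, bounds=(0, 5), axis=0)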
Example #6
def std(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=np._NoValue, accountant=None, **unused_args):
    r"""
    Compute the standard deviation along the specified axis.

    Returns the standard deviation of the array elements, a measure of the spread of a distribution, with differential
    privacy.  The standard deviation is computed for the flattened array by default, otherwise over the specified axis.
    Noise is added using :class:`.LaplaceBoundedDomain` to satisfy differential privacy, where sensitivity is
    calculated using `bounds`.  Users are advised to consult the documentation of :obj:`numpy.std` for further details,
    as the behaviour of `std` closely follows its Numpy variant.

    Parameters
    ----------
    array : array_like
        Calculate the standard deviation of these values.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.

    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).

    axis : int or tuple of ints, optional
        Axis or axes along which the standard deviation is computed.  The default is to compute the standard deviation
        of the flattened array.

        If this is a tuple of ints, a standard deviation is performed over multiple axes, instead of a single axis or
        all the axes as before.

    dtype : dtype, optional
        Type to use in computing the standard deviation.  For arrays of integer type the default is float64, for arrays
        of float types it is the same as the array type.

    keepdims : bool, optional
        If this is set to True, the axes which are reduced are left in the result as dimensions with size one.  With
        this option, the result will broadcast correctly against the input array.

        If the default value is passed, then `keepdims` will not be passed through to the `std` method of sub-classes of
        `ndarray`, however any non-default value will be.  If the sub-class' method does not implement `keepdims` any
        exceptions will be raised.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    standard_deviation : ndarray, see dtype parameter above.
        Return a new array containing the standard deviation.

    See Also
    --------
    var, mean, nanstd

    """
    warn_unused_args(unused_args)

    return _std(array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims,
                accountant=accountant, nan=False)
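
A brief usage sketch of std; bounds are given explicitly so no warning is raised and the sensitivity is well defined.

import numpy as np

heights = np.array([1.62, 1.75, 1.80, 1.68, 1.71])

# Private standard deviation, values assumed to lie in [1.5, 2.0]
sd = std(heights, epsilon=1.0, bounds=(1.5, 2.0))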
Example #7
def nanvar(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, accountant=None, **unused_args):
    r"""
    Compute the differentially private variance along the specified axis, ignoring NaNs.

    Returns the variance of the array elements, a measure of the spread of a distribution, with differential privacy.
    The variance is computed for the flattened array by default, otherwise over the specified axis.  Noise is added
    using :class:`.LaplaceBoundedDomain` to satisfy differential privacy, where sensitivity is calculated using
    `bounds`.  Users are advised to consult the documentation of :obj:`numpy.var` for further details, as the behaviour
    of `var` closely follows its Numpy variant.

    For all-NaN slices, NaN is returned and a `RuntimeWarning` is raised.

    Parameters
    ----------
    array : array_like
        Array containing numbers whose variance is desired.  If `array` is not an array, a conversion is attempted.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.

    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).

    axis : int or tuple of ints, optional
        Axis or axes along which the variance is computed.  The default is to compute the variance of the flattened
        array.

        If this is a tuple of ints, a variance is performed over multiple axes, instead of a single axis or all the axes
        as before.

    dtype : data-type, optional
        Type to use in computing the variance.  For arrays of integer type the default is `float64`; for arrays of float
        types it is the same as the array type.

    keepdims : bool, default: False
        If this is set to True, the axes which are reduced are left in the result as dimensions with size one.  With
        this option, the result will broadcast correctly against the input array.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    variance : ndarray, see dtype parameter above
        Returns a new array containing the variance.

    See Also
    --------
    std, mean, var

    """
    warn_unused_args(unused_args)

    return _var(array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims, accountant=accountant,
                nan=True)
Example #8
def nansum(array, epsilon=1.0, bounds=None, accountant=None, axis=None, dtype=None, keepdims=np._NoValue,
           **unused_args):
    r"""Sum of array elements over a given axis with differential privacy, ignoring NaNs.

    Parameters
    ----------
    array : array_like
        Elements to sum.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.

    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    axis : None or int or tuple of ints, optional
        Axis or axes along which a sum is performed.  The default, axis=None, will sum all of the elements of the input
        array.  If axis is negative it counts from the last to the first axis.

        If axis is a tuple of ints, a sum is performed on all of the axes specified in the tuple instead of a single
        axis or all the axes as before.

    dtype : dtype, optional
        The type of the returned array and of the accumulator in which the elements are summed.  The dtype of `array` is
        used by default unless `array` has an integer dtype of less precision than the default platform integer.  In
        that case, if `array` is signed then the platform integer is used while if `array` is unsigned then an unsigned
        integer of the same precision as the platform integer is used.

    keepdims : bool, optional
        If this is set to True, the axes which are reduced are left in the result as dimensions with size one.  With
        this option, the result will broadcast correctly against the input array.

        If the default value is passed, then `keepdims` will not be passed through to the `sum` method of sub-classes of
        `ndarray`, however any non-default value will be.  If the sub-class' method does not implement `keepdims` any
        exceptions will be raised.

    Returns
    -------
    sum_along_axis : ndarray
        An array with the same shape as `array`, with the specified axis removed.  If `array` is a 0-d array, or if
        `axis` is None, a scalar is returned.

    See Also
    --------
    numpy.nansum : Equivalent non-private method.

    mean, sum

    """
    warn_unused_args(unused_args)

    return _sum(array, epsilon=epsilon, bounds=bounds, accountant=accountant, axis=axis, dtype=dtype,
                keepdims=keepdims, nan=True)
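
A brief usage sketch of nansum; the bounds are assumed known a priori and the NaN entry is ignored.

import numpy as np

purchases = np.array([12.5, np.nan, 3.0, 7.25])

# Private total, per-value bounds assumed to be [0, 20]
total = nansum(purchases, epsilon=0.5, bounds=(0, 20))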
Example #9
    def __init__(self, epsilon=1.0, data_norm=None, range_X=None, range_y=None, fit_intercept=True, copy_X=True,
                 **unused_args):
        super().__init__(fit_intercept=fit_intercept, normalize=False, copy_X=copy_X, n_jobs=None)

        self.epsilon = epsilon
        self.data_norm = data_norm
        self.range_X = range_X
        self.range_y = range_y

        warn_unused_args(unused_args)
Example #10
    def __init__(self, n_components=None, centered=False, epsilon=1.0, data_norm=None, bounds=None, copy=True,
                 whiten=False, random_state=None, accountant=None, **unused_args):
        super().__init__(n_components=n_components, copy=copy, whiten=whiten, svd_solver='full', tol=0.0,
                         iterated_power='auto', random_state=random_state)
        self.centered = centered
        self.epsilon = epsilon
        self.data_norm = data_norm
        self.bounds = bounds
        self.accountant = BudgetAccountant.load_default(accountant)

        warn_unused_args(unused_args)
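
A brief usage sketch for the constructor above, assuming it belongs to diffprivlib's differentially private PCA and that diffprivlib.models.PCA is the import path; both are assumptions.

import numpy as np
from diffprivlib.models import PCA  # assumed import path

X = np.random.rand(200, 5)

# data_norm is an a-priori bound on the row 2-norms; rows lie in [0, 1)^5, so sqrt(5) is valid.
# bounds are supplied because the (private) centring step needs them when centered=False.
pca = PCA(n_components=2, epsilon=1.0, data_norm=np.sqrt(5), centered=False,
          bounds=(np.zeros(5), np.ones(5)))
pca.fit(X)
X_reduced = pca.transform(X)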
Example #11
def median(array, epsilon=1.0, bounds=None, axis=None, keepdims=False, accountant=None, **unused_args):
    r"""
    Compute the differentially private median of the array.

    Returns the median with differential privacy.  The median is calculated over each axis, or the flattened array
    if an axis is not provided.  This method calls the :obj:`.quantile` method, for the 0.5 quantile.

    Parameters
    ----------
    array : array_like
        Array containing numbers whose median is sought.  If `array` is not an array, a conversion is attempted.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.  Differential privacy is achieved over the entire output, with epsilon split
        evenly between each output value.

    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).

    axis : None or int or tuple of ints, optional
        Axis or axes along which the medians are computed.  The default, axis=None, computes the median over the
        flattened array.  If axis is negative it counts from the last to the first axis.

        If axis is a tuple of ints, the median is computed over all of the axes specified in the tuple instead of a
        single axis or all the axes as before.

    keepdims : bool, default: False
        If this is set to True, the axes which are reduced are left in the result as dimensions with size one.  With
        this option, the result will broadcast correctly against the input array.

        If the default value is passed, then `keepdims` will not be passed through to the `median` method of sub-classes
        of `ndarray`, however any non-default value will be.  If the sub-class' method does not implement `keepdims` any
        exceptions will be raised.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    m : ndarray
        Returns a new array containing the median values.

    See Also
    --------
    numpy.median : Equivalent non-private method.

    quantile, percentile

    """
    warn_unused_args(unused_args)

    return quantile(array, 0.5, epsilon=epsilon, bounds=bounds, axis=axis, keepdims=keepdims, accountant=accountant)
Example #12
    def __init__(self, epsilon=1.0, bounds=None, n_clusters=8, **unused_args):
        super().__init__(n_clusters=n_clusters)

        self.epsilon = epsilon
        self.bounds = bounds

        warn_unused_args(unused_args)

        self.cluster_centers_ = None
        self.bounds_processed = None
        self.labels_ = None
        self.inertia_ = None
        self.n_iter_ = None
Example #13
    def __init__(self, epsilon=1.0, bounds=None, n_clusters=8, accountant=None, **unused_args):
        super().__init__(n_clusters=n_clusters)

        self.epsilon = epsilon
        self.bounds = bounds
        self.accountant = BudgetAccountant.load_default(accountant)

        warn_unused_args(unused_args)

        self.cluster_centers_ = None
        self.bounds_processed = None
        self.labels_ = None
        self.inertia_ = None
        self.n_iter_ = None
        self._n_threads = 1
Example #14
def _preprocess_data(X,
                     y,
                     fit_intercept,
                     epsilon=1.0,
                     bounds_X=None,
                     bounds_y=None,
                     copy=True,
                     check_input=True,
                     **unused_args):
    warn_unused_args(unused_args)

    if check_input:
        X = check_array(X, copy=copy, accept_sparse=False, dtype=FLOAT_DTYPES)
    elif copy:
        X = X.copy(order='K')

    y = np.asarray(y, dtype=X.dtype)
    X_scale = np.ones(X.shape[1], dtype=X.dtype)

    if fit_intercept:
        bounds_X = check_bounds(bounds_X, X.shape[1])
        bounds_y = check_bounds(bounds_y, y.shape[1] if y.ndim > 1 else 1)

        X = clip_to_bounds(X, bounds_X)
        y = clip_to_bounds(y, bounds_y)

        X_offset = mean(X,
                        axis=0,
                        bounds=bounds_X,
                        epsilon=epsilon,
                        accountant=BudgetAccountant())
        X -= X_offset
        y_offset = mean(y,
                        axis=0,
                        bounds=bounds_y,
                        epsilon=epsilon,
                        accountant=BudgetAccountant())
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_scale
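
The helper above centres X and y with differentially private means so that the intercept can later be recovered without further leakage. Below is a plain-numpy sketch of the same idea, using a hand-rolled Laplace mean instead of the library's mean function; the helper name and the sensitivity argument are illustrative assumptions.

import numpy as np

def dp_center(X, bounds, epsilon, rng=None):
    """Clip columns of X to bounds, then subtract a Laplace-noised column mean."""
    rng = np.random.default_rng() if rng is None else rng
    lower, upper = bounds
    X = np.clip(X, lower, upper)
    # With values bounded in [lower, upper], the (replace-one) sensitivity of a
    # column mean over n rows is (upper - lower) / n.
    scale = (upper - lower) / (X.shape[0] * epsilon)
    noisy_mean = X.mean(axis=0) + rng.laplace(scale=scale, size=X.shape[1])
    return X - noisy_mean, noisy_mean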
Example #15
    def __init__(self,
                 epsilon=1.0,
                 bounds_X=None,
                 bounds_y=None,
                 fit_intercept=True,
                 copy_X=True,
                 accountant=None,
                 **unused_args):
        super().__init__(fit_intercept=fit_intercept,
                         normalize=False,
                         copy_X=copy_X,
                         n_jobs=None)

        self.epsilon = epsilon
        self.bounds_X = bounds_X
        self.bounds_y = bounds_y
        self.accountant = BudgetAccountant.load_default(accountant)

        warn_unused_args(unused_args)
Example #16
    def fit(self, X, y, sample_weight=None):
        """
        Fit linear model.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Training data

        y : array_like, shape (n_samples, n_targets)
            Target values. Will be cast to X's dtype if necessary

        sample_weight : ignored
            Ignored by diffprivlib. Present for consistency with sklearn API.

        Returns
        -------
        self : returns an instance of self.
        """

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        max_norm = np.linalg.norm(X, axis=1).max()

        if self.data_norm is None:
            warnings.warn("Data norm has not been specified and will be calculated on the data provided.  This will "
                          "result in additional privacy leakage. To ensure differential privacy and no additional "
                          "privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning)
            self.data_norm = max_norm

        if max_norm > self.data_norm:
            warnings.warn("Differential privacy is only guaranteed for data whose rows have a 2-norm of at most %g. "
                          "Got %f\n"
                          "Translate and/or scale the data accordingly to ensure differential privacy is achieved."
                          % (self.data_norm, max_norm), PrivacyLeakWarning)

        if self.fit_intercept and (self.range_X is None or self.range_y is None):
            warnings.warn("Range parameters haven't been specified, so falling back to determining range from the "
                          "data.\n"
                          "This will result in additional privacy leakage. To ensure differential privacy with no "
                          "additional privacy loss, specify `range_X` and `range_y`.",
                          PrivacyLeakWarning)

            if self.range_X is None:
                self.range_X = np.maximum(np.ptp(X, axis=0), 1e-5)
            if self.range_y is None:
                self.range_y = np.maximum(np.ptp(y, axis=0), 1e-5)

        X, y = check_X_y(X, y, accept_sparse=False, y_numeric=True, multi_output=True)

        n_features = X.shape[1]
        epsilon_intercept_scale = 1 / (n_features + 1) if self.fit_intercept else 0

        X, y, X_offset, y_offset, X_scale = self._preprocess_data(X, y, fit_intercept=self.fit_intercept,
                                                                  range_X=self.range_X, range_y=self.range_y,
                                                                  epsilon=self.epsilon * epsilon_intercept_scale,
                                                                  copy=self.copy_X)

        A = np.hstack((X, y[:, np.newaxis] if y.ndim == 1 else y))
        AtA = np.dot(A.T, A)

        mech = Wishart().set_epsilon(self.epsilon * (1 - epsilon_intercept_scale)).set_sensitivity(self.data_norm)
        noisy_AtA = mech.randomise(AtA)

        noisy_AtA = noisy_AtA[:n_features, :]
        XtX = noisy_AtA[:, :n_features]
        Xty = noisy_AtA[:, n_features:]

        self.coef_, self._residues, self.rank_, self.singular_ = np.linalg.lstsq(XtX, Xty, rcond=-1)
        self.coef_ = self.coef_.T

        if y.ndim == 1:
            self.coef_ = np.ravel(self.coef_)
        self._set_intercept(X_offset, y_offset, X_scale)
        return self
Example #17
    def _update_mean_variance(self, n_past, mu, var, X, sample_weight=None):
        """Compute online update of Gaussian mean and variance.

        Given starting sample count, mean, and variance, and a new set of points X, return the updated mean and
        variance.  (NB - each dimension (column) in X is treated as independent -- you get variance, not covariance).

        Can take scalar mean and variance, or vector mean and variance to simultaneously update a number of
        independent Gaussians.

        See Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:

        http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf

        Parameters
        ----------
        n_past : int
            Number of samples represented in old mean and variance. If sample weights were given, this should contain
            the sum of sample weights represented in old mean and variance.

        mu : array-like, shape (number of Gaussians,)
            Means for Gaussians in original set.

        var : array-like, shape (number of Gaussians,)
            Variances for Gaussians in original set.

        sample_weight : ignored
            Ignored in diffprivlib.

        Returns
        -------
        total_mu : array-like, shape (number of Gaussians,)
            Updated mean for each Gaussian over the combined set.

        total_var : array-like, shape (number of Gaussians,)
            Updated variance for each Gaussian over the combined set.
        """
        if X.shape[0] == 0:
            return mu, var

        # Compute (potentially weighted) mean and variance of new datapoints
        if sample_weight is not None:
            warn_unused_args("sample_weight")

        n_new = X.shape[0]
        new_var = np.var(X, axis=0)
        new_mu = np.mean(X, axis=0)

        # Apply differential privacy to the new means and variances
        new_mu, new_var = self._randomise(new_mu, new_var, self.new_n_samples)

        if n_past == 0:
            return new_mu, new_var

        n_total = float(n_past + n_new)

        # Combine mean of old and new data, taking into consideration
        # (weighted) number of observations
        total_mu = (n_new * new_mu + n_past * mu) / n_total

        # Combine variance of old and new data, taking into consideration
        # (weighted) number of observations. This is achieved by combining
        # the sum-of-squared-differences (ssd)
        old_ssd = n_past * var
        new_ssd = n_new * new_var
        total_ssd = old_ssd + new_ssd + (n_past / float(n_new * n_total)) * (n_new * mu - n_new * new_mu) ** 2
        total_var = total_ssd / n_total

        return total_mu, total_var
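
The pooled update above follows the one-pass combination formula of Chan, Golub and LeVeque; the small numpy check below reproduces just that arithmetic (no privacy involved) and verifies it against a direct computation.

import numpy as np

def combine_mean_var(n_old, mu_old, var_old, n_new, mu_new, var_new):
    """Combine per-batch means and (population) variances as in Chan, Golub & LeVeque (1983)."""
    n_total = n_old + n_new
    mu = (n_old * mu_old + n_new * mu_new) / n_total
    ssd = (n_old * var_old + n_new * var_new
           + (n_old * n_new / n_total) * (mu_old - mu_new) ** 2)
    return mu, ssd / n_total

rng = np.random.default_rng(0)
a, b = rng.normal(size=100), rng.normal(size=50)
mu, var = combine_mean_var(a.size, a.mean(), a.var(), b.size, b.mean(), b.var())
assert np.isclose(mu, np.concatenate([a, b]).mean())
assert np.isclose(var, np.concatenate([a, b]).var())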
Example #18
    def fit(self, X, y, sample_weight=None):
        """
        Fit linear model.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Training data

        y : array_like, shape (n_samples, n_targets)
            Target values.  Will be cast to X's dtype if necessary

        sample_weight : ignored
            Ignored by diffprivlib.  Present for consistency with sklearn API.

        Returns
        -------
        self : returns an instance of self.
        """
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        X, y = check_X_y(X,
                         y,
                         accept_sparse=False,
                         y_numeric=True,
                         multi_output=True)

        if self.bounds_X is None or self.bounds_y is None:
            warnings.warn(
                "Bounds parameters haven't been specified, so falling back to determining bounds from the "
                "data.\n"
                "This will result in additional privacy leakage. To ensure differential privacy with no "
                "additional privacy loss, specify `bounds_X` and `bounds_y`.",
                PrivacyLeakWarning)

            if self.bounds_X is None:
                self.bounds_X = (np.min(X, axis=0), np.max(X, axis=0))
            if self.bounds_y is None:
                self.bounds_y = (np.min(y, axis=0), np.max(y, axis=0))

        self.bounds_X = check_bounds(self.bounds_X, X.shape[1])
        self.bounds_y = check_bounds(self.bounds_y,
                                     y.shape[1] if y.ndim > 1 else 1)

        n_features = X.shape[1]
        n_targets = y.shape[1] if y.ndim > 1 else 1
        epsilon_intercept_scale = 1 / (n_features +
                                       1) if self.fit_intercept else 0

        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            bounds_X=self.bounds_X,
            bounds_y=self.bounds_y,
            epsilon=self.epsilon * epsilon_intercept_scale,
            copy=self.copy_X)

        bounds_X = (self.bounds_X[0] - X_offset, self.bounds_X[1] - X_offset)
        bounds_y = (self.bounds_y[0] - y_offset, self.bounds_y[1] - y_offset)

        objs, obj_coefs = _construct_regression_obj(
            X,
            y,
            bounds_X,
            bounds_y,
            epsilon=self.epsilon * (1 - epsilon_intercept_scale),
            alpha=0)
        coef = np.zeros((n_features, n_targets))
        residues = []

        for i, obj in enumerate(objs):
            opt_result = minimize(obj, np.zeros(n_features), jac=True)
            coef[:, i] = opt_result.x
            residues += [opt_result.fun]

        self.coef_ = coef.T
        self._residues = residues
        self._obj_coefs = obj_coefs

        if y.ndim == 1:
            self.coef_ = np.ravel(self.coef_)
            self._residues = self._residues[0]
        self._set_intercept(X_offset, y_offset, X_scale)

        self.accountant.spend(self.epsilon, 0)

        return self
Example #19
    def partial_fit(self, X, y=None, sample_weight=None):
        """Online computation of mean and std with differential privacy on X for later scaling.  All of X is processed
        as a single batch.  This is intended for cases when `fit` is not feasible due to very large number of
        `n_samples` or because X is read from a continuous stream.

        The algorithm for incremental mean and std is given in Equation 1.5a,b in Chan, Tony F., Gene H. Golub, and
        Randall J. LeVeque. "Algorithms for computing the sample variance: Analysis and recommendations." The American
        Statistician 37.3 (1983): 242-247:

        Parameters
        ----------
        X : {array-like}, shape [n_samples, n_features]
            The data used to compute the mean and standard deviation used for later scaling along the features axis.

        y
            Ignored

        sample_weight
            Ignored by diffprivlib.  Present for consistency with sklearn API.

        """
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        epsilon_0 = self.epsilon / 2 if self.with_std else self.epsilon

        X = check_array(X,
                        accept_sparse=False,
                        copy=self.copy,
                        estimator=self,
                        dtype=FLOAT_DTYPES,
                        force_all_finite='allow-nan')
        # Hotfix for sklearn v 0.23
        self.n_features_in_ = X.shape[1]

        if self.bounds is None:
            warnings.warn(
                "Range parameter hasn't been specified, so falling back to determining range from the data.\n"
                "This will result in additional privacy leakage.  To ensure differential privacy with no "
                "additional privacy loss, specify `range` for each valued returned by np.mean().",
                PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = check_bounds(self.bounds, X.shape[1])
        X = clip_to_bounds(X, self.bounds)

        # Even in the case of `with_mean=False`, we update the mean anyway.  This is needed for the incremental
        # computation of the variance.  See incr_mean_variance_axis and _incremental_mean_variance_axis.

        # if n_samples_seen_ is an integer (i.e. no missing values), we need to transform it to a NumPy array of
        # shape (n_features,) required by incr_mean_variance_axis and _incremental_variance_axis
        if hasattr(self, 'n_samples_seen_') and isinstance(
                self.n_samples_seen_, (int, np.integer)):
            self.n_samples_seen_ = np.repeat(self.n_samples_seen_,
                                             X.shape[1]).astype(np.int64)

        if not hasattr(self, 'n_samples_seen_'):
            self.n_samples_seen_ = np.zeros(X.shape[1], dtype=np.int64)

        # First pass
        if not hasattr(self, 'scale_'):
            self.mean_ = .0
            if self.with_std:
                self.var_ = .0
            else:
                self.var_ = None

        if not self.with_mean and not self.with_std:
            self.mean_ = None
            self.var_ = None
            self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
        else:
            self.mean_, self.var_, self.n_samples_seen_ = _incremental_mean_and_var(
                X, epsilon_0, self.bounds, self.mean_, self.var_,
                self.n_samples_seen_)

        # for backward-compatibility, reduce n_samples_seen_ to an integer
        # if the number of samples is the same for each feature (i.e. no
        # missing values)
        if np.ptp(self.n_samples_seen_) == 0:
            self.n_samples_seen_ = self.n_samples_seen_[0]

        if self.with_std:
            self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
        else:
            self.scale_ = None

        self.accountant.spend(self.epsilon, 0)

        return self
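
A short usage sketch, assuming the method above belongs to diffprivlib's StandardScaler and that diffprivlib.models.StandardScaler is the import path; both are assumptions.

import numpy as np
from diffprivlib.models import StandardScaler  # assumed import path

X = np.random.rand(200, 4)

# Supplying bounds up front avoids the PrivacyLeakWarning raised in partial_fit
scaler = StandardScaler(epsilon=1.0, bounds=(np.zeros(4), np.ones(4)))
X_scaled = scaler.fit_transform(X)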
Example #20
    def fit(self, X, y, sample_weight=None):
        """
        Fit linear model.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Training data

        y : array_like, shape (n_samples, n_targets)
            Target values.  Will be cast to X's dtype if necessary

        sample_weight : ignored
            Ignored by diffprivlib.  Present for consistency with sklearn API.

        Returns
        -------
        self : returns an instance of self.
        """
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        X, y = check_X_y(X,
                         y,
                         accept_sparse=False,
                         y_numeric=True,
                         multi_output=True)

        if self.fit_intercept:
            if self.bounds_X is None or self.bounds_y is None:
                warnings.warn(
                    "Bounds parameters haven't been specified, so falling back to determining bounds from the "
                    "data.\n"
                    "This will result in additional privacy leakage. To ensure differential privacy with no "
                    "additional privacy loss, specify `bounds_X` and `bounds_y`.",
                    PrivacyLeakWarning)

                if self.bounds_X is None:
                    self.bounds_X = (np.min(X, axis=0), np.max(X, axis=0))
                if self.bounds_y is None:
                    self.bounds_y = (np.min(y, axis=0), np.max(y, axis=0))

            self.bounds_X = check_bounds(self.bounds_X, X.shape[1])
            self.bounds_y = check_bounds(self.bounds_y,
                                         y.shape[1] if y.ndim > 1 else 1)

        n_features = X.shape[1]
        epsilon_intercept_scale = 1 / (n_features +
                                       1) if self.fit_intercept else 0

        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            bounds_X=self.bounds_X,
            bounds_y=self.bounds_y,
            epsilon=self.epsilon * epsilon_intercept_scale,
            copy=self.copy_X)

        if self.data_norm is None:
            warnings.warn(
                "Data norm has not been specified and will be calculated on the data provided.  This will "
                "result in additional privacy leakage. To ensure differential privacy and no additional "
                "privacy leakage, specify `data_norm` at initialisation.",
                PrivacyLeakWarning)
            self.data_norm = np.linalg.norm(X, axis=1).max()

        X = clip_to_norm(X, self.data_norm)

        A = np.hstack((X, y[:, np.newaxis] if y.ndim == 1 else y))
        AtA = np.dot(A.T, A)

        mech = Wishart().set_epsilon(
            self.epsilon * (1 - epsilon_intercept_scale)).set_sensitivity(
                self.data_norm)
        noisy_AtA = mech.randomise(AtA)

        noisy_AtA = noisy_AtA[:n_features, :]
        XtX = noisy_AtA[:, :n_features]
        Xty = noisy_AtA[:, n_features:]

        self.coef_, self._residues, self.rank_, self.singular_ = np.linalg.lstsq(
            XtX, Xty, rcond=-1)
        self.coef_ = self.coef_.T

        if y.ndim == 1:
            self.coef_ = np.ravel(self.coef_)
        self._set_intercept(X_offset, y_offset, X_scale)

        self.accountant.spend(self.epsilon, 0)

        return self
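
The fit above perturbs the Gram matrix of A = [X | y] once and then solves the normal equations from the noisy statistics. The plain-numpy sketch below illustrates that sufficient-statistics step for a single target, with generic symmetric noise standing in for the Wishart mechanism; the function name and noise model are illustrative assumptions.

import numpy as np

def noisy_normal_equations(X, y, noise_scale, rng=None):
    """Solve least squares from a symmetrically perturbed Gram matrix of [X | y]."""
    rng = np.random.default_rng() if rng is None else rng
    A = np.hstack((X, y[:, np.newaxis]))   # assumes a single 1-D target
    AtA = A.T @ A
    # Stand-in noise: symmetric Gaussian perturbation (the library uses the Wishart mechanism)
    E = rng.normal(scale=noise_scale, size=AtA.shape)
    AtA_noisy = AtA + (E + E.T) / 2
    n_features = X.shape[1]
    XtX = AtA_noisy[:n_features, :n_features]
    Xty = AtA_noisy[:n_features, n_features:]
    coef, *_ = np.linalg.lstsq(XtX, Xty, rcond=None)
    return coef.ravel()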
Example #21
    def _update_mean_variance(self,
                              n_past,
                              mu,
                              var,
                              X,
                              sample_weight=None,
                              n_noisy=None):
        """Compute online update of Gaussian mean and variance.

        Given starting sample count, mean, and variance, and a new set of points X, return the updated mean and
        variance.  (NB - each dimension (column) in X is treated as independent -- you get variance, not covariance).

        Can take scalar mean and variance, or vector mean and variance to simultaneously update a number of
        independent Gaussians.

        See Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:

        http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf

        Parameters
        ----------
        n_past : int
            Number of samples represented in old mean and variance.  If sample weights were given, this should contain
            the sum of sample weights represented in old mean and variance.

        mu : array-like, shape (number of Gaussians,)
            Means for Gaussians in original set.

        var : array-like, shape (number of Gaussians,)
            Variances for Gaussians in original set.

        sample_weight : ignored
            Ignored in diffprivlib.

        n_noisy : int, optional
            Noisy count of the given class, satisfying differential privacy.

        Returns
        -------
        total_mu : array-like, shape (number of Gaussians,)
            Updated mean for each Gaussian over the combined set.

        total_var : array-like, shape (number of Gaussians,)
            Updated variance for each Gaussian over the combined set.
        """
        if n_noisy is None:
            warnings.warn(
                "Noisy class count has not been specified and will be read from the data. To use this "
                "method correctly, make sure it is run by the parent GaussianNB class.",
                PrivacyLeakWarning)
            n_noisy = X.shape[0]

        if not n_noisy:
            return mu, var

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        # Split epsilon between each feature, using 1/3 of total budget for each of mean and variance
        n_features = X.shape[1]
        local_epsilon = self.epsilon / 3 / n_features

        new_mu = np.zeros((n_features, ))
        new_var = np.zeros((n_features, ))

        for feature in range(n_features):
            _X = X[:, feature]
            lower, upper = self.bounds[0][feature], self.bounds[1][feature]
            local_diameter = upper - lower

            mech_mu = LaplaceTruncated(epsilon=local_epsilon,
                                       delta=0,
                                       sensitivity=local_diameter,
                                       lower=lower * n_noisy,
                                       upper=upper * n_noisy)
            _mu = mech_mu.randomise(_X.sum()) / n_noisy

            local_sq_sens = max(_mu - lower, upper - _mu)**2
            mech_var = LaplaceBoundedDomain(epsilon=local_epsilon,
                                            delta=0,
                                            sensitivity=local_sq_sens,
                                            lower=0,
                                            upper=local_sq_sens * n_noisy)
            _var = mech_var.randomise(((_X - _mu)**2).sum()) / n_noisy

            new_mu[feature] = _mu
            new_var[feature] = _var

        if n_past == 0:
            return new_mu, new_var

        n_total = float(n_past + n_noisy)

        # Combine mean of old and new data, taking into consideration
        # (weighted) number of observations
        total_mu = (n_noisy * new_mu + n_past * mu) / n_total

        # Combine variance of old and new data, taking into consideration
        # (weighted) number of observations. This is achieved by combining
        # the sum-of-squared-differences (ssd)
        old_ssd = n_past * var
        new_ssd = n_noisy * new_var
        total_ssd = old_ssd + new_ssd + (n_past / float(n_noisy * n_total)) * (
            n_noisy * mu - n_noisy * new_mu)**2
        total_var = total_ssd / n_total

        return total_mu, total_var
Example #22
    def fit(self, X, y=None, sample_weight=None):
        """Computes k-means clustering with differential privacy.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            Training instances to cluster.

        y : Ignored
            not used, present here for API consistency by convention.

        sample_weight : ignored
            Ignored by diffprivlib.  Present for consistency with sklearn API.

        Returns
        -------
        self : class

        """
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        del y

        X = check_array(X, accept_sparse=False, dtype=[np.float64, np.float32])
        n_samples, n_dims = X.shape

        if n_samples < self.n_clusters:
            raise ValueError("n_samples=%d should be >= n_clusters=%d" % (n_samples, self.n_clusters))

        iters = self._calc_iters(n_dims, n_samples)

        if self.bounds is None:
            warnings.warn("Bounds have not been specified and will be calculated on the data provided.  This will "
                          "result in additional privacy leakage. To ensure differential privacy and no additional "
                          "privacy leakage, specify `bounds` for each dimension.", PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = check_bounds(self.bounds, n_dims, min_separation=1e-5)
        X = clip_to_bounds(X, self.bounds)

        centers = self._init_centers(n_dims)
        labels = None
        distances = None

        # Run _update_centers first to ensure consistency of `labels` and `centers`, since convergence unlikely
        for _ in range(-1, iters):
            if labels is not None:
                centers = self._update_centers(X, centers=centers, labels=labels, dims=n_dims, total_iters=iters)

            distances, labels = self._distances_labels(X, centers)

        self.cluster_centers_ = centers
        self.labels_ = labels
        self.inertia_ = distances[np.arange(len(labels)), labels].sum()
        self.n_iter_ = iters

        self.accountant.spend(self.epsilon, 0)

        return self
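
A usage sketch for the differentially private k-means fit above, combined with the constructor shown in Example #13; the class name and import path (diffprivlib.models.KMeans) are assumptions.

import numpy as np
from diffprivlib.models import KMeans  # assumed import path

X = np.random.rand(500, 2)

# Data lie in the unit square, so per-dimension bounds of (0, 1) are valid
kmeans = KMeans(epsilon=2.0, bounds=(np.zeros(2), np.ones(2)), n_clusters=3)
kmeans.fit(X)
print(kmeans.cluster_centers_, kmeans.inertia_)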
Example #23
def quantile(array,
             quant,
             epsilon=1.0,
             bounds=None,
             axis=None,
             keepdims=False,
             accountant=None,
             **unused_args):
    r"""
    Compute the differentially private quantile of the array.

    Returns the specified quantile with differential privacy.  The quantile is calculated over the flattened array by
    default, or along the specified axis.
    Differential privacy is achieved with the :class:`.Exponential` mechanism, using the method first proposed by
    Smith, 2011.

    Paper link: https://dl.acm.org/doi/pdf/10.1145/1993636.1993743

    Parameters
    ----------
    array : array_like
        Array containing numbers whose quantile is sought.  If `array` is not an array, a conversion is attempted.

    quant : float or array-like
        Quantile or array of quantiles.  Each quantile must be in the unit interval [0, 1].  If quant is array-like,
        quantiles are returned over the flattened array.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.  Differential privacy is achieved over the entire output, with epsilon split
        evenly between each output value.

    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).

    axis : None or int or tuple of ints, optional
        Axis or axes along which the quantiles are computed.  The default, axis=None, computes the quantile over the
        flattened array.  If axis is negative it counts from the last to the first axis.

        If axis is a tuple of ints, the quantile is computed over all of the axes specified in the tuple instead of a
        single axis or all the axes as before.

    keepdims : bool, default: False
        If this is set to True, the axes which are reduced are left in the result as dimensions with size one.  With
        this option, the result will broadcast correctly against the input array.

        If the default value is passed, then `keepdims` will not be passed through to the `quantile` method of
        sub-classes of `ndarray`, however any non-default value will be.  If the sub-class' method does not implement
        `keepdims` any exceptions will be raised.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    m : ndarray
        Returns a new array containing the quantile values.

    See Also
    --------
    numpy.quantile : Equivalent non-private method.

    percentile, median

    """
    warn_unused_args(unused_args)

    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.",
            PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    quant = np.ravel(quant)

    if np.any(quant < 0) or np.any(quant > 1):
        raise ValueError("Quantiles must be in the unit interval [0, 1].")

    if len(quant) > 1:
        return np.array([
            quantile(array,
                     q_i,
                     epsilon=epsilon / len(quant),
                     bounds=bounds,
                     axis=axis,
                     keepdims=keepdims,
                     accountant=accountant) for q_i in quant
        ])

    # Dealing with a single quant from now on
    quant = quant.item()

    if axis is not None or keepdims:
        return _wrap_axis(quantile,
                          array,
                          quant=quant,
                          epsilon=epsilon,
                          bounds=bounds,
                          axis=axis,
                          keepdims=keepdims,
                          accountant=accountant)

    # Dealing with a scalar output from now on
    bounds = check_bounds(bounds, shape=0, min_separation=1e-5)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Let's ravel array to be single-dimensional
    array = clip_to_bounds(np.ravel(array), bounds)

    k = array.size
    array = np.append(array, list(bounds))
    array.sort()

    interval_sizes = np.diff(array)

    # Todo: Need to find a way to do this in a differentially private way
    if np.isnan(interval_sizes).any():
        return np.nan

    mech = Exponential(epsilon=epsilon,
                       sensitivity=1,
                       utility=list(-np.abs(np.arange(0, k + 1) - quant * k)),
                       measure=list(interval_sizes))
    idx = mech.randomise()
    output = mech._rng.random() * (array[idx + 1] - array[idx]) + array[idx]

    accountant.spend(epsilon, 0)

    return output
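
The core of quantile above is the interval-selection step of Smith's exponential-mechanism estimator. The standalone numpy re-implementation below is purely illustrative: it mirrors the utility and measure used above but samples with numpy's RNG instead of the library's Exponential mechanism, and the helper name is an assumption.

import numpy as np

def dp_quantile_sketch(data, quant, epsilon, bounds, rng=None):
    """Exponential-mechanism quantile (Smith, 2011), numpy-only sketch."""
    rng = np.random.default_rng() if rng is None else rng
    lower, upper = bounds
    data = np.clip(np.ravel(data), lower, upper)
    k = data.size
    data = np.sort(np.append(data, [lower, upper]))
    sizes = np.diff(data)                             # interval widths (the "measure")
    utility = -np.abs(np.arange(k + 1) - quant * k)   # distance from the target rank
    # Exponential mechanism with sensitivity 1: weight each interval by width * exp(eps * utility / 2)
    weights = sizes * np.exp(epsilon * utility / 2)
    idx = rng.choice(k + 1, p=weights / weights.sum())
    return rng.uniform(data[idx], data[idx + 1])      # uniform point within the chosen interval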
Example #24
    def _partial_fit(self,
                     X,
                     y,
                     classes=None,
                     _refit=False,
                     sample_weight=None):
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        X, y = check_X_y(X, y)

        if self.bounds is None:
            warnings.warn(
                "Bounds have not been specified and will be calculated on the data provided. This will "
                "result in additional privacy leakage. To ensure differential privacy and no additional "
                "privacy leakage, specify bounds for each dimension.",
                PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = check_bounds(self.bounds, shape=X.shape[1])
        X = clip_to_bounds(X, self.bounds)

        self.epsilon_ = self.var_smoothing

        if _refit:
            self.classes_ = None

        if _check_partial_fit_first_call(self, classes):
            n_features = X.shape[1]
            n_classes = len(self.classes_)
            self.theta_ = np.zeros((n_classes, n_features))
            self.sigma_ = np.zeros((n_classes, n_features))

            self.class_count_ = np.zeros(n_classes, dtype=np.float64)

            if self.priors is not None:
                priors = np.asarray(self.priors)

                if len(priors) != n_classes:
                    raise ValueError(
                        "Number of priors must match number of classes.")
                if not np.isclose(priors.sum(), 1.0):
                    raise ValueError("The sum of the priors should be 1.")
                if (priors < 0).any():
                    raise ValueError("Priors must be non-negative.")
                self.class_prior_ = priors
            else:
                # Initialize the priors to zeros for each class
                self.class_prior_ = np.zeros(len(self.classes_),
                                             dtype=np.float64)
        else:
            if X.shape[1] != self.theta_.shape[1]:
                raise ValueError(
                    "Number of features %d does not match previous data %d." %
                    (X.shape[1], self.theta_.shape[1]))
            # Put epsilon back in each time
            self.sigma_[:, :] -= self.epsilon_

        classes = self.classes_

        unique_y = np.unique(y)
        unique_y_in_classes = np.in1d(unique_y, classes)

        if not np.all(unique_y_in_classes):
            raise ValueError(
                "The target label(s) %s in y do not exist in the initial classes %s"
                % (unique_y[~unique_y_in_classes], classes))

        noisy_class_counts = self._noisy_class_counts(y)

        for _i, y_i in enumerate(unique_y):
            i = classes.searchsorted(y_i)
            X_i = X[y == y_i, :]

            n_i = noisy_class_counts[_i]

            new_theta, new_sigma = self._update_mean_variance(
                self.class_count_[i],
                self.theta_[i, :],
                self.sigma_[i, :],
                X_i,
                n_noisy=n_i)

            self.theta_[i, :] = new_theta
            self.sigma_[i, :] = new_sigma
            self.class_count_[i] += n_i

        self.sigma_[:, :] += self.epsilon_

        # Update only if no priors were provided
        if self.priors is None:
            # Empirical prior, with sample_weight taken into account
            self.class_prior_ = self.class_count_ / self.class_count_.sum()

        self.accountant.spend(self.epsilon, 0)

        return self
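
A brief usage sketch, assuming the method above belongs to diffprivlib's GaussianNB and that diffprivlib.models.GaussianNB is the import path; both are assumptions.

import numpy as np
from diffprivlib.models import GaussianNB  # assumed import path

X = np.random.rand(300, 4)
y = np.random.randint(0, 3, 300)

# Bounds supplied as a (mins, maxs) tuple up front, so no PrivacyLeakWarning is raised
clf = GaussianNB(epsilon=1.0, bounds=(np.zeros(4), np.ones(4)))
clf.fit(X, y)
print(clf.predict(X[:5]))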
Example #25
def histogram2d(array_x,
                array_y,
                epsilon=1.0,
                bins=10,
                range=None,
                weights=None,
                density=None,
                accountant=None,
                **unused_args):
    r"""
    Compute the differentially private bi-dimensional histogram of two data samples.

    Parameters
    ----------
    array_x : array_like, shape (N,)
        An array containing the x coordinates of the points to be histogrammed.

    array_y : array_like, shape (N,)
        An array containing the y coordinates of the points to be histogrammed.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon` to be applied.

    bins : int or array_like or [int, int] or [array, array], default: 10
        The bin specification:

          * If int, the number of bins for the two dimensions (nx=ny=bins).
          * If array_like, the bin edges for the two dimensions (x_edges=y_edges=bins).
          * If [int, int], the number of bins in each dimension (nx, ny = bins).
          * If [array, array], the bin edges in each dimension (x_edges, y_edges = bins).
          * A combination [int, array] or [array, int], where int is the number of bins and array is the bin edges.

    range : array_like, shape(2,2), optional
        The leftmost and rightmost edges of the bins along each dimension (if not specified explicitly in the `bins`
        parameters): ``[[xmin, xmax], [ymin, ymax]]``.  All values outside of this range will be considered outliers and
        not tallied in the histogram.

    density : bool, optional
        If False, the default, returns the number of samples in each bin.  If True, returns the probability *density*
        function at the bin, ``bin_count / sample_count / bin_area``.

    weights : array_like, shape(N,), optional
        An array of values ``w_i`` weighing each sample ``(x_i, y_i)``.  Weights are normalized to 1 if `density` is
        True.  If `density` is False, the values of the returned histogram are equal to the sum of the weights
        belonging to the samples falling into each bin.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    H : ndarray, shape(nx, ny)
        The bi-dimensional histogram of samples `x` and `y`.  Values in `x` are histogrammed along the first dimension
        and values in `y` are histogrammed along the second dimension.

    xedges : ndarray, shape(nx+1,)
        The bin edges along the first dimension.

    yedges : ndarray, shape(ny+1,)
        The bin edges along the second dimension.

    See Also
    --------
    histogram : 1D differentially private histogram
    histogramdd : Differentially private Multidimensional histogram

    Notes
    -----
    When `density` is True, the returned histogram is the sample density, defined such that the sum over bins of the
    product ``bin_value * bin_area`` is 1.

    Please note that the histogram does not follow the Cartesian convention where `x` values are on the abscissa and `y`
    values on the ordinate axis.  Rather, `x` is histogrammed along the first dimension of the array (vertical), and `y`
    along the second dimension of the array (horizontal).  This ensures compatibility with `histogramdd`.

    """
    warn_unused_args(unused_args)

    try:
        num_bins = len(bins)
    except TypeError:
        num_bins = 1

    if num_bins not in (1, 2):
        xedges = yedges = np.asarray(bins)
        bins = [xedges, yedges]

    hist, edges = histogramdd([array_x, array_y],
                              epsilon=epsilon,
                              bins=bins,
                              range=range,
                              weights=weights,
                              density=density,
                              accountant=accountant)
    return hist, edges[0], edges[1]
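A short usage sketch for the function above; the import path follows diffprivlib's public tools module. Fixing `range` from domain knowledge keeps the bin edges data-independent, so the underlying histogramdd call does not raise a PrivacyLeakWarning:

import numpy as np
from diffprivlib.tools import histogram2d

rng = np.random.default_rng(42)
x = rng.uniform(0, 10, size=1000)
y = rng.uniform(0, 5, size=1000)

# Bin counts per dimension and a data-independent range for each axis
dp_hist, xedges, yedges = histogram2d(x, y, epsilon=0.5, bins=[10, 5],
                                      range=[[0, 10], [0, 5]])
print(dp_hist.shape)   # (10, 5)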
Example #26
def histogramdd(sample,
                epsilon=1.0,
                bins=10,
                range=None,
                weights=None,
                density=None,
                accountant=None,
                **unused_args):
    r"""
    Compute the differentially private multidimensional histogram of some data.

    The histogram is computed using :obj:`numpy.histogramdd`, and noise added using :class:`.GeometricTruncated` to
    satisfy differential privacy.  If the `range` parameter is not specified correctly, a :class:`.PrivacyLeakWarning`
    is thrown.  Users are referred to :obj:`numpy.histogramdd` for more usage notes.

    Parameters
    ----------
    sample : (N, D) array, or (D, N) array_like
        The data to be histogrammed.

        Note the unusual interpretation of sample when an array_like:

        * When an array, each row is a coordinate in a D-dimensional space - such as
          ``histogramdd(np.array([p1, p2, p3]))``.
        * When an array_like, each element is the list of values for a single coordinate - such as
          ``histogramdd((X, Y, Z))``.

        The first form should be preferred.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon` to be applied.

    bins : sequence or int, default: 10
        The bin specification:

        * A sequence of arrays describing the monotonically increasing bin edges along each dimension.
        * The number of bins for each dimension (nx, ny, ... =bins)
        * The number of bins for all dimensions (nx=ny=...=bins).

    range : sequence, optional
        A sequence of length D, each an optional (lower, upper) tuple giving the outer bin edges to be used if the edges
        are not given explicitly in `bins`.
        An entry of None in the sequence results in the minimum and maximum values being used for the corresponding
        dimension.
        The default, None, is equivalent to passing a tuple of D None values.

    density : bool, optional
        If False, the default, returns the number of samples in each bin.  If True, returns the probability *density*
        function at the bin, ``bin_count / sample_count / bin_volume``.

    weights : (N,) array_like, optional
        An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`.  Weights are normalized to 1 if `density`
        is True.  If `density` is False, the values of the returned histogram are equal to the sum of the weights
        belonging to the samples falling into each bin.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    H : ndarray
        The multidimensional histogram of sample x.  See `density` and `weights` for the different possible semantics.
    edges : list
        A list of D arrays describing the bin edges for each dimension.

    See Also
    --------
    histogram: 1-D differentially private histogram
    histogram2d: 2-D differentially private histogram

    """
    warn_unused_args(unused_args)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Range only required if bin edges not specified
    if np.array(bins, dtype=object).ndim == 0 or not np.all(
        [np.ndim(_bin) for _bin in bins]):
        if range is None or (isinstance(range, list) and None in range):
            warnings.warn(
                "Range parameter has not been specified (or has missing elements). Falling back to taking "
                "range from the data.\n "
                "To ensure differential privacy, and no additional privacy leakage, the range must be "
                "specified for each dimension independently of the data (i.e., using domain knowledge).",
                PrivacyLeakWarning)

    # Compute the raw counts first; density normalisation is applied only after noise has been added
    hist, bin_edges = np.histogramdd(sample,
                                     bins=bins,
                                     range=range,
                                     weights=weights,
                                     density=None)

    dp_mech = GeometricTruncated(epsilon=epsilon,
                                 sensitivity=1,
                                 lower=0,
                                 upper=maxsize)

    dp_hist = np.zeros_like(hist)
    iterator = np.nditer(hist, flags=['multi_index'])

    while not iterator.finished:
        dp_hist[iterator.multi_index] = dp_mech.randomise(int(iterator[0]))
        iterator.iternext()

    dp_hist = dp_hist.astype(float, casting='safe')

    if density:
        # calculate the probability density function
        dims = len(dp_hist.shape)
        dp_hist_sum = dp_hist.sum()
        for i in np.arange(dims):
            shape = np.ones(dims, int)
            shape[i] = dp_hist.shape[i]
            # noinspection PyUnresolvedReferences
            dp_hist = dp_hist / np.diff(bin_edges[i]).reshape(shape)

        if dp_hist_sum > 0:
            dp_hist /= dp_hist_sum

    accountant.spend(epsilon, 0)

    return dp_hist, bin_edges
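A brief usage sketch of the function above under the same assumptions (diffprivlib's public tools module). One (lower, upper) pair is given per dimension so the bin edges do not depend on the data, and density=True exercises the post-noise normalisation branch:

import numpy as np
from diffprivlib.tools import histogramdd

rng = np.random.default_rng(0)
sample = rng.uniform(0, 1, size=(500, 3))

# One (lower, upper) range per dimension, chosen independently of the data
dp_hist, edges = histogramdd(sample, epsilon=1.0, bins=(4, 4, 4),
                             range=[(0, 1), (0, 1), (0, 1)], density=True)
print(dp_hist.shape, len(edges))   # (4, 4, 4) 3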
Example #27
    def fit(self, X, y, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X.

        sample_weight : ignored
            Ignored by diffprivlib.  Present for consistency with sklearn API.

        Returns
        -------
        self : class

        """
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        if not isinstance(self.C, numbers.Real) or self.C < 0:
            raise ValueError("Penalty term must be positive; got (C=%r)" %
                             self.C)
        if not isinstance(self.max_iter,
                          numbers.Integral) or self.max_iter < 0:
            raise ValueError(
                "Maximum number of iteration must be positive; got (max_iter=%r)"
                % self.max_iter)
        if not isinstance(self.tol, numbers.Real) or self.tol < 0:
            raise ValueError(
                "Tolerance for stopping criteria must be positive; got (tol=%r)"
                % self.tol)

        solver = _check_solver(self.solver, self.penalty, self.dual)
        X, y = check_X_y(X,
                         y,
                         accept_sparse='csr',
                         dtype=np.float64,
                         order="C",
                         accept_large_sparse=solver != 'liblinear')
        check_classification_targets(y)
        self.classes_ = np.unique(y)
        _, n_features = X.shape

        if self.data_norm is None:
            warnings.warn(
                "Data norm has not been specified and will be calculated on the data provided.  This will "
                "result in additional privacy leakage. To ensure differential privacy and no additional "
                "privacy leakage, specify `data_norm` at initialisation.",
                PrivacyLeakWarning)
            self.data_norm = np.linalg.norm(X, axis=1).max()

        X = clip_to_norm(X, self.data_norm)

        self.multi_class = _check_multi_class(self.multi_class, solver,
                                              len(self.classes_))

        n_classes = len(self.classes_)
        classes_ = self.classes_
        if n_classes < 2:
            raise ValueError(
                "This solver needs samples of at least 2 classes in the data, but the data contains only "
                "one class: %r" % classes_[0])

        if len(self.classes_) == 2:
            n_classes = 1
            classes_ = classes_[1:]

        if self.warm_start:
            warm_start_coef = getattr(self, 'coef_', None)
        else:
            warm_start_coef = None
        if warm_start_coef is not None and self.fit_intercept:
            warm_start_coef = np.append(warm_start_coef,
                                        self.intercept_[:, np.newaxis],
                                        axis=1)

        self.coef_ = list()
        self.intercept_ = np.zeros(n_classes)

        if warm_start_coef is None:
            warm_start_coef = [None] * n_classes

        path_func = delayed(_logistic_regression_path)

        fold_coefs_ = Parallel(
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            **_joblib_parallel_args(prefer='processes'))(
                path_func(X,
                          y,
                          epsilon=self.epsilon / n_classes,
                          data_norm=self.data_norm,
                          pos_class=class_,
                          Cs=[self.C],
                          fit_intercept=self.fit_intercept,
                          max_iter=self.max_iter,
                          tol=self.tol,
                          verbose=self.verbose,
                          coef=warm_start_coef_,
                          check_input=False)
                for class_, warm_start_coef_ in zip(classes_, warm_start_coef))

        fold_coefs_, _, n_iter_ = zip(*fold_coefs_)
        self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0]

        self.coef_ = np.asarray(fold_coefs_)
        self.coef_ = self.coef_.reshape(n_classes,
                                        n_features + int(self.fit_intercept))

        if self.fit_intercept:
            self.intercept_ = self.coef_[:, -1]
            self.coef_ = self.coef_[:, :-1]

        self.accountant.spend(self.epsilon, 0)

        return self
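A minimal sketch of fitting the class this method belongs to, assuming it is diffprivlib's LogisticRegression. Passing `data_norm` at construction avoids the PrivacyLeakWarning and the data-dependent norm fallback in the fit above:

import numpy as np
from diffprivlib.models import LogisticRegression

X = np.array([[0.2, 0.1], [0.4, 0.3], [0.8, 0.9], [0.9, 0.7]])
y = np.array([0, 0, 1, 1])

# data_norm is an a-priori bound on the L2 norm of each row; rows above it are clipped
clf = LogisticRegression(epsilon=1.0, data_norm=1.5)
clf.fit(X, y)
print(clf.predict(np.array([[0.85, 0.8]])))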
Example #28
def histogram(sample,
              epsilon=1.0,
              bins=10,
              range=None,
              weights=None,
              density=None,
              accountant=None,
              **unused_args):
    r"""
    Compute the differentially private histogram of a set of data.

    The histogram is computed using :obj:`numpy.histogram`, and noise added using :class:`.GeometricTruncated` to
    satisfy differential privacy.  If the `range` parameter is not specified correctly, a :class:`.PrivacyLeakWarning`
    is thrown.  Users are referred to :obj:`numpy.histogram` for more usage notes.

    Parameters
    ----------
    sample : array_like
        Input data.  The histogram is computed over the flattened array.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon` to be applied.

    bins : int or sequence of scalars or str, default: 10
        If `bins` is an int, it defines the number of equal-width bins in the given range (10, by default).  If `bins`
        is a sequence, it defines a monotonically increasing array of bin edges, including the rightmost edge, allowing
        for non-uniform bin widths.

        If `bins` is a string, it defines the method used to calculate the optimal bin width, as defined by
        `histogram_bin_edges`.

    range : (float, float), optional
        The lower and upper range of the bins.  If not provided, range is simply ``(sample.min(), sample.max())``.  Values outside
        the range are ignored.  The first element of the range must be less than or equal to the second. `range` affects
        the automatic bin computation as well.  While bin width is computed to be optimal based on the actual data
        within `range`, the bin count will fill the entire range including portions containing no data.

    weights : array_like, optional
        An array of weights, of the same shape as `sample`.  Each value in `sample` only contributes its associated weight
        towards the bin count (instead of 1).  If `density` is True, the weights are normalized, so that the integral
        of the density over the range remains 1.

    density : bool, optional
        If ``False``, the result will contain the number of samples in each bin.  If ``True``, the result is the value
        of the probability *density* function at the bin, normalized such that the *integral* over the range is 1.
        Note that the sum of the histogram values will not be equal to 1 unless bins of unity width are chosen; it is
        not a probability *mass* function.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    hist : array
        The values of the histogram.  See `density` and `weights` for a
        description of the possible semantics.
    bin_edges : array of dtype float
        Return the bin edges ``(length(hist)+1)``.


    See Also
    --------
    histogramdd, histogram2d

    Notes
    -----
    All but the last (righthand-most) bin is half-open.  In other words, if `bins` is::

      [1, 2, 3, 4]

    then the first bin is ``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``.  The last bin, however,
    is ``[3, 4]``, which *includes* 4.

    """
    warn_unused_args(unused_args)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    if range is None:
        warnings.warn(
            "Range parameter has not been specified. Falling back to taking range from the data.\n"
            "To ensure differential privacy, and no additional privacy leakage, the range must be "
            "specified independently of the data (i.e., using domain knowledge).",
            PrivacyLeakWarning)

    hist, bin_edges = np.histogram(sample,
                                   bins=bins,
                                   range=range,
                                   weights=weights,
                                   density=None)

    dp_mech = GeometricTruncated(epsilon=epsilon,
                                 sensitivity=1,
                                 lower=0,
                                 upper=maxsize)

    dp_hist = np.zeros_like(hist)

    for i in np.arange(dp_hist.shape[0]):
        dp_hist[i] = dp_mech.randomise(int(hist[i]))

    # dp_hist = dp_hist.astype(float, casting='safe')

    accountant.spend(epsilon, 0)

    if density:
        bin_sizes = np.array(np.diff(bin_edges), float)
        return dp_hist / bin_sizes / (dp_hist.sum()
                                      if dp_hist.sum() else 1), bin_edges

    return dp_hist, bin_edges
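A small usage sketch for the function above, again assuming diffprivlib's public tools and accountant modules. An explicit BudgetAccountant shows how each call's epsilon is checked against and then deducted from a total budget:

import numpy as np
from diffprivlib.accountant import BudgetAccountant
from diffprivlib.tools import histogram

rng = np.random.default_rng(1)
ages = rng.integers(18, 90, size=2000)

acc = BudgetAccountant(epsilon=2.0, delta=0)   # total budget this accountant will allow
dp_hist, edges = histogram(ages, epsilon=0.5, bins=8, range=(18, 90), accountant=acc)
print(dp_hist.sum(), acc.remaining())          # noisy total count and the budget left for one more query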
Example #29
def _logistic_regression_path(X,
                              y,
                              epsilon,
                              data_norm,
                              pos_class=None,
                              Cs=10,
                              fit_intercept=True,
                              max_iter=100,
                              tol=1e-4,
                              verbose=0,
                              coef=None,
                              check_input=True,
                              **unused_args):
    """Compute a Logistic Regression model with differential privacy for a list of regularization parameters.  Takes
    inspiration from ``_logistic_regression_path`` in scikit-learn, specialised to the LBFGS solver and one-vs-rest
    multiclass fitting.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Input data.

    y : array-like, shape (n_samples,) or (n_samples, n_targets)
        Input data, target values.

    epsilon : float
        Privacy parameter for differential privacy.

    data_norm : float
        Max norm of the data for which differential privacy is satisfied.

    pos_class : int, optional
        The class with respect to which we perform a one-vs-all fit.  If None, then it is assumed that the given problem
        is binary.

    Cs : int | array-like, shape (n_cs,), default: 10
        List of values for the regularization parameter or integer specifying the number of regularization parameters
        that should be used.  In this case, the parameters will be chosen in a logarithmic scale between 1e-4 and 1e4.

    fit_intercept : bool, default: True
        Whether to fit an intercept for the model.  In this case the shape of the returned array is
        (n_cs, n_features + 1).

    max_iter : int, default: 100
        Maximum number of iterations for the solver.

    tol : float, default: 1e-4
        Stopping criterion.  For the newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i | i = 1,
        ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient.

    verbose : int, default: 0
        For the liblinear and lbfgs solvers set verbose to any positive number for verbosity.

    coef : array-like, shape (n_features,), optional
        Initialization value for coefficients of logistic regression.  Useless for liblinear solver.

    check_input : bool, default: True
        If False, the input arrays X and y will not be checked.

    Returns
    -------
    coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1)
        List of coefficients for the Logistic Regression model.  If fit_intercept is set to True then the second
        dimension will be n_features + 1, where the last item represents the intercept.  For
        ``multi_class='multinomial'``, the shape is (n_classes, n_cs, n_features) or (n_classes, n_cs, n_features + 1).

    Cs : ndarray
        Grid of Cs used for cross-validation.

    n_iter : array, shape (n_cs,)
        Actual number of iteration for each Cs.

    """
    warn_unused_args(unused_args)

    if isinstance(Cs, numbers.Integral):
        Cs = np.logspace(-4, 4, int(Cs))

    solver = 'lbfgs'

    # Data norm increases if intercept is included
    if fit_intercept:
        data_norm = np.sqrt(data_norm**2 + 1)

    # Pre-processing.
    if check_input:
        X = check_array(X,
                        accept_sparse='csr',
                        dtype=np.float64,
                        accept_large_sparse=solver != 'liblinear')
        y = check_array(y, ensure_2d=False, dtype=None)
        check_consistent_length(X, y)
    _, n_features = X.shape

    classes = np.unique(y)

    if pos_class is None:
        if classes.size > 2:
            raise ValueError('To fit OvR, use the pos_class argument')
        # np.unique(y) gives labels in sorted order.
        pos_class = classes[1]

    sample_weight = np.ones(X.shape[0], dtype=X.dtype)

    # For doing a ovr, we need to mask the labels first.
    output_vec = np.zeros(n_features + int(fit_intercept), dtype=X.dtype)
    mask = (y == pos_class)
    y_bin = np.ones(y.shape, dtype=X.dtype)
    y_bin[~mask] = -1.
    # for compute_class_weight

    if coef is not None:
        # it must work both giving the bias term and not
        if coef.size not in (n_features, output_vec.size):
            raise ValueError(
                'Initialization coef is of shape %d, expected shape %d or %d' %
                (coef.size, n_features, output_vec.size))
        output_vec[:coef.size] = coef

    target = y_bin

    coefs = list()
    n_iter = np.zeros(len(Cs), dtype=np.int32)
    for i, C in enumerate(Cs):
        vector_mech = Vector(epsilon=epsilon,
                             dimension=n_features + int(fit_intercept),
                             alpha=1. / C,
                             function_sensitivity=0.25,
                             data_sensitivity=data_norm)
        noisy_logistic_loss = vector_mech.randomise(_logistic_loss_and_grad)

        iprint = [-1, 50, 1, 100, 101][np.searchsorted(np.array([0, 1, 2, 3]),
                                                       verbose)]
        output_vec, _, info = optimize.fmin_l_bfgs_b(noisy_logistic_loss,
                                                     output_vec,
                                                     fprime=None,
                                                     args=(X, target, 1. / C,
                                                           sample_weight),
                                                     iprint=iprint,
                                                     pgtol=tol,
                                                     maxiter=max_iter)
        if info["warnflag"] == 1:
            warnings.warn(
                "lbfgs failed to converge. Increase the number of iterations.",
                ConvergenceWarning)

        coefs.append(output_vec.copy())

        n_iter[i] = info['nit']

    return np.array(coefs), np.array(Cs), n_iter
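A toy, hedged sketch of calling this private helper directly on a binary problem; the import path is an assumption about diffprivlib's internal layout, and in normal use the helper is invoked once per class by LogisticRegression.fit with epsilon already split across classes:

import numpy as np
from diffprivlib.models.logistic_regression import _logistic_regression_path  # assumed internal path

rng = np.random.default_rng(7)
X = rng.uniform(-1, 1, size=(200, 2))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

# data_norm bounds each row's L2 norm; for rows in [-1, 1]^2 that bound is sqrt(2)
coefs, Cs, n_iter = _logistic_regression_path(X, y, epsilon=1.0,
                                              data_norm=np.sqrt(2), Cs=[1.0])
print(coefs.shape, n_iter)   # (1, 3): two weights plus the intercept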