def _sum(array, epsilon=1.0, bounds=None, accountant=None, axis=None, dtype=None, keepdims=False, nan=False):
    """Return a noisy sum of ``array`` after clipping it to ``bounds``.

    When ``axis`` is given or ``keepdims`` is truthy the work is delegated to
    ``_wrap_axis``, which is handed this function itself; otherwise the array
    is flattened and a single scalar is produced.

    Parameters
    ----------
    array : array_like
        Values to be summed.
    epsilon : float, default: 1.0
        Value checked against, and then spent on, the accountant, and passed
        to the randomising mechanism.
    bounds : tuple, optional
        ``(lower, upper)`` used for clipping.  If omitted, the minimum and
        maximum of ``array`` are used and a ``PrivacyLeakWarning`` is issued.
    accountant : optional
        Passed to ``BudgetAccountant.load_default``.
    axis, dtype, keepdims
        Forwarded to the underlying NumPy reduction.
    nan : bool, default: False
        Use ``np.nansum`` instead of ``np.sum``.

    Returns
    -------
    The randomised sum (or whatever ``_wrap_axis`` returns on that path).
    """
    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.",
            PrivacyLeakWarning,
        )
        bounds = (np.min(array), np.max(array))

    # Non-scalar outputs are handled by the generic axis wrapper.
    if axis is not None or keepdims:
        return _wrap_axis(
            _sum,
            array,
            epsilon=epsilon,
            bounds=bounds,
            accountant=accountant,
            axis=axis,
            dtype=dtype,
            keepdims=keepdims,
            nan=nan,
        )

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Flatten first, then clip (clipping uses the caller-supplied bounds).
    flat = clip_to_bounds(np.ravel(array), bounds)

    if nan:
        reducer = np.nansum
    else:
        reducer = np.sum
    true_total = reducer(flat, axis=axis, dtype=dtype, keepdims=keepdims)

    # Integer dtypes get the geometric mechanism, everything else Laplace.
    if dtype is not None and issubclass(dtype, Integral):
        mechanism_cls = GeometricTruncated
    else:
        mechanism_cls = LaplaceTruncated

    n_values = flat.size
    mechanism = mechanism_cls(
        epsilon=epsilon,
        sensitivity=upper - lower,
        lower=lower * n_values,
        upper=upper * n_values,
    )
    noisy_total = mechanism.randomise(true_total)

    accountant.spend(epsilon, 0)

    return noisy_total
def __init__(self, n_components=None, centered=False, epsilon=1.0, data_norm=None, bounds=None, copy=True,
             whiten=False, random_state=None, accountant=None, **unused_args):
    """Initialise the estimator and record its privacy-related settings.

    ``n_components``, ``copy``, ``whiten`` and ``random_state`` are forwarded
    to the parent constructor together with the fixed settings
    ``svd_solver='full'``, ``tol=0.0`` and ``iterated_power='auto'``.
    ``centered``, ``epsilon``, ``data_norm`` and ``bounds`` are stored on the
    instance unchanged.  ``accountant`` is resolved through
    ``BudgetAccountant.load_default``.  Any extra keyword arguments are only
    reported via ``warn_unused_args``.
    """
    parent_kwargs = {
        "n_components": n_components,
        "copy": copy,
        "whiten": whiten,
        "svd_solver": 'full',
        "tol": 0.0,
        "iterated_power": 'auto',
        "random_state": random_state,
    }
    super().__init__(**parent_kwargs)

    self.centered = centered
    self.epsilon = epsilon
    self.data_norm = data_norm
    self.bounds = bounds
    self.accountant = BudgetAccountant.load_default(accountant)

    warn_unused_args(unused_args)
def __init__(self, epsilon=1.0, data_norm=None, tol=1e-4, C=1.0, fit_intercept=True, max_iter=100, verbose=0,
             warm_start=False, n_jobs=None, accountant=None, **unused_args):
    """Initialise the estimator with a fixed parent configuration.

    The parent constructor always receives ``penalty='l2'``, ``dual=False``,
    ``intercept_scaling=1.0``, ``class_weight=None``, ``random_state=None``,
    ``solver='lbfgs'`` and ``multi_class='ovr'``; ``tol``, ``C``,
    ``fit_intercept``, ``max_iter``, ``verbose``, ``warm_start`` and ``n_jobs``
    are passed through from the caller.  ``epsilon`` and ``data_norm`` are
    stored as given, ``classes_`` starts out as ``None``, and ``accountant``
    is resolved through ``BudgetAccountant.load_default``.  Extra keyword
    arguments are only reported via ``warn_unused_args``.
    """
    # Settings that callers cannot override.
    fixed = {
        "penalty": 'l2',
        "dual": False,
        "intercept_scaling": 1.0,
        "class_weight": None,
        "random_state": None,
        "solver": 'lbfgs',
        "multi_class": 'ovr',
    }
    # Settings taken from the caller.
    tunable = {
        "tol": tol,
        "C": C,
        "fit_intercept": fit_intercept,
        "max_iter": max_iter,
        "verbose": verbose,
        "warm_start": warm_start,
        "n_jobs": n_jobs,
    }
    super().__init__(**fixed, **tunable)

    self.epsilon = epsilon
    self.data_norm = data_norm
    self.classes_ = None
    self.accountant = BudgetAccountant.load_default(accountant)

    warn_unused_args(unused_args)
def _var(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, accountant=None, nan=False):
    """Return a noisy variance of ``array`` after clipping it to ``bounds``.

    When ``axis`` is given or ``keepdims`` is truthy the work is delegated to
    ``_wrap_axis``, which is handed this function itself; otherwise the array
    is flattened and a single scalar is produced.

    Parameters
    ----------
    array : array_like
        Values whose variance is computed.
    epsilon : float, default: 1.0
        Value checked against, and then spent on, the accountant, and passed
        to the randomising mechanism.
    bounds : tuple, optional
        ``(lower, upper)`` used for clipping.  If omitted, the minimum and
        maximum of ``array`` are used and a ``PrivacyLeakWarning`` is issued.
    axis, dtype, keepdims
        Forwarded to the underlying NumPy reduction.
    accountant : optional
        Passed to ``BudgetAccountant.load_default``.
    nan : bool, default: False
        Use ``np.nanvar`` instead of ``np.var``.

    Returns
    -------
    The randomised variance (or whatever ``_wrap_axis`` returns on that path).
    """
    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.",
            PrivacyLeakWarning,
        )
        bounds = (np.min(array), np.max(array))

    # Non-scalar outputs are handled by the generic axis wrapper.
    if axis is not None or keepdims:
        return _wrap_axis(
            _var,
            array,
            epsilon=epsilon,
            bounds=bounds,
            axis=axis,
            dtype=dtype,
            keepdims=keepdims,
            accountant=accountant,
            nan=nan,
        )

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Flatten first, then clip (clipping uses the caller-supplied bounds).
    flat = clip_to_bounds(np.ravel(array), bounds)

    if nan:
        reducer = np.nanvar
    else:
        reducer = np.var
    true_var = reducer(flat, axis=axis, dtype=dtype, keepdims=keepdims)

    n_values = flat.size
    mechanism = LaplaceBoundedDomain(
        epsilon=epsilon,
        delta=0,
        sensitivity=((upper - lower) / n_values) ** 2 * (n_values - 1),
        lower=0,
        upper=((upper - lower) ** 2) / 4,
    )
    noisy_var = mechanism.randomise(true_var)

    accountant.spend(epsilon, 0)

    return noisy_var
def __init__(self, n_estimators=10, *, epsilon=1.0, cat_feature_threshold=10, n_jobs=1, verbose=0, accountant=None,
             max_depth=15, random_state=None, feature_domains=None, **unused_args):
    """Initialise the ensemble and record its settings.

    The parent constructor receives a fresh ``DecisionTreeClassifier()`` as
    ``base_estimator``, the names of the parameters to propagate to each
    estimator, and ``n_estimators``, ``n_jobs``, ``random_state`` and
    ``verbose`` from the caller.  ``epsilon``, ``cat_feature_threshold``,
    ``max_depth`` and ``feature_domains`` are stored as given; ``accountant``
    is resolved through ``BudgetAccountant.load_default``.

    NOTE: when ``random_state`` is not ``None`` this constructor calls
    ``np.random.seed(random_state)``, which reseeds NumPy's *global* random
    state.  Extra keyword arguments are reported via
    ``self._warn_unused_args``.
    """
    propagated_params = ("cat_feature_threshold", "max_depth", "epsilon", "random_state")
    super().__init__(
        base_estimator=DecisionTreeClassifier(),
        n_estimators=n_estimators,
        estimator_params=propagated_params,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
    )

    self.epsilon = epsilon
    self.cat_feature_threshold = cat_feature_threshold
    self.max_depth = max_depth
    self.accountant = BudgetAccountant.load_default(accountant)
    self.feature_domains = feature_domains

    # Global side effect: reseeds NumPy's module-level generator.
    if random_state is not None:
        np.random.seed(random_state)

    self._warn_unused_args(unused_args)
def __init__(self, epsilon=1.0, bounds=None, priors=None, var_smoothing=1e-9, accountant=None):
    """Initialise the estimator.

    ``priors`` and ``var_smoothing`` go to the parent constructor;
    ``epsilon`` and ``bounds`` are stored as given; ``accountant`` is
    resolved through ``BudgetAccountant.load_default``.
    """
    parent_kwargs = {"priors": priors, "var_smoothing": var_smoothing}
    super().__init__(**parent_kwargs)

    self.epsilon = epsilon
    self.bounds = bounds
    self.accountant = BudgetAccountant.load_default(accountant)
def __init__(self, epsilon=1.0, bounds=None, copy=True, with_mean=True, with_std=True, accountant=None):
    """Initialise the transformer.

    ``copy``, ``with_mean`` and ``with_std`` go to the parent constructor;
    ``epsilon`` and ``bounds`` are stored as given; ``accountant`` is
    resolved through ``BudgetAccountant.load_default``.
    """
    parent_kwargs = {"copy": copy, "with_mean": with_mean, "with_std": with_std}
    super().__init__(**parent_kwargs)

    self.epsilon = epsilon
    self.bounds = bounds
    self.accountant = BudgetAccountant.load_default(accountant)
def _sum(array, epsilon=1.0, bounds=None, accountant=None, axis=None, dtype=None, keepdims=np._NoValue, nan=False):
    """Return a noisy sum of ``array``, per output element.

    The shape of the result is discovered by running the reduction on a
    zero array of the same shape.  Each output element is randomised with
    its own mechanism; the budget is spent once for the whole call.

    Parameters
    ----------
    array : array_like
        Values to be summed.
    epsilon : float, default: 1.0
        Value checked against, and then spent on, the accountant, and set on
        every mechanism.
    bounds : tuple, optional
        ``(lower, upper)`` used for clipping.  If omitted they are computed
        from ``array`` and a ``PrivacyLeakWarning`` is issued.
    accountant : optional
        Passed to ``BudgetAccountant.load_default``.
    axis, dtype, keepdims
        Forwarded to the underlying NumPy reduction.
    nan : bool, default: False
        Use ``np.nansum`` instead of ``np.sum``.

    Returns
    -------
    An array of randomised sums when the reduction yields an ``ndarray``,
    otherwise a single randomised value.
    """
    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    if nan:
        reducer = np.nansum
    else:
        reducer = np.sum

    # Dry run on zeros: only the *shape* of the result is of interest.
    output_form = reducer(np.zeros_like(array), axis=axis, keepdims=keepdims)
    out_ndim = np.ndim(output_form)
    vector_out = (out_ndim == 1)
    # Number of inputs folded into each output element.
    n_datapoints = np.sum(np.ones_like(array, dtype=int), axis=axis, keepdims=keepdims).flat[0]

    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.",
            PrivacyLeakWarning,
        )
        if out_ndim <= 1:
            bounds = (
                np.min(array, axis=axis, keepdims=keepdims),
                np.max(array, axis=axis, keepdims=keepdims),
            )
        else:
            bounds = (np.min(array), np.max(array))

    n_bounds = output_form.shape[0] if vector_out else 1
    lower, upper = check_bounds(bounds, n_bounds, dtype=dtype or float)
    clipped = np.clip(array, lower, upper)
    true_sum = reducer(clipped, axis=axis, dtype=dtype, keepdims=keepdims)

    # Integer dtypes get the geometric mechanism, everything else Laplace.
    if dtype is not None and issubclass(dtype, Integral):
        mechanism_cls = GeometricTruncated
    else:
        mechanism_cls = LaplaceTruncated

    def _mechanism_for(low, high):
        # Configure a fresh mechanism for one output element.
        return mechanism_cls().set_epsilon(epsilon).set_sensitivity(high - low).\
            set_bounds(low * n_datapoints, high * n_datapoints)

    if not isinstance(true_sum, np.ndarray):
        scalar_mech = _mechanism_for(lower[0], upper[0])
        accountant.spend(epsilon, 0)
        return scalar_mech.randomise(true_sum)

    noisy_sum = np.zeros_like(true_sum, dtype=dtype)
    walker = np.nditer(true_sum, flags=['multi_index'])
    for _ in walker:
        idx = walker.multi_index
        if vector_out:
            low, high = lower[idx], upper[idx]
        else:
            low, high = lower[0], upper[0]
        noisy_sum[idx] = _mechanism_for(low, high).randomise(true_sum[idx])

    accountant.spend(epsilon, 0)

    return noisy_sum
def test_load_wrong_type(self):
    """``BudgetAccountant.load_default`` must raise ``TypeError`` for an int, a list and a str."""
    invalid_accountants = (0, [1, 2, 3], "BudgetAccountant")

    for candidate in invalid_accountants:
        with self.assertRaises(TypeError):
            BudgetAccountant.load_default(candidate)
def __init__(self, epsilon=1.0, bounds=None, n_clusters=8, accountant=None, **unused_args):
    """Initialise the estimator.

    ``n_clusters`` goes to the parent constructor; ``epsilon`` and
    ``bounds`` are stored as given; ``accountant`` is resolved through
    ``BudgetAccountant.load_default``.  Extra keyword arguments are only
    reported via ``warn_unused_args``.  The result attributes
    (``cluster_centers_``, ``bounds_processed``, ``labels_``, ``inertia_``,
    ``n_iter_``) start out as ``None`` and ``_n_threads`` is fixed to 1.
    """
    super().__init__(n_clusters=n_clusters)

    self.epsilon = epsilon
    self.bounds = bounds
    self.accountant = BudgetAccountant.load_default(accountant)

    warn_unused_args(unused_args)

    # Placeholders for values that are not known at construction time.
    for attr_name in ("cluster_centers_", "bounds_processed", "labels_", "inertia_", "n_iter_"):
        setattr(self, attr_name, None)

    self._n_threads = 1
def _var(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=np._NoValue, accountant=None, nan=False):
    """Return a noisy variance of ``array``, per output element.

    The shape of the result is discovered by running the reduction on a
    zero array of the same shape.  Each output element is randomised with
    its own mechanism and capped at the squared width of its bounds; the
    budget is spent once for the whole call.

    Parameters
    ----------
    array : array_like
        Values whose variance is computed.
    epsilon : float, default: 1.0
        Value checked against, and then spent on, the accountant, and set on
        every mechanism.
    bounds : tuple, optional
        ``(lower, upper)`` used for clipping.  If omitted they are computed
        from ``array`` and a ``PrivacyLeakWarning`` is issued.
    axis, dtype, keepdims
        Forwarded to the underlying NumPy reduction.
    accountant : optional
        Passed to ``BudgetAccountant.load_default``.
    nan : bool, default: False
        Use ``np.nanvar`` instead of ``np.var``.

    Returns
    -------
    An array of randomised variances when the reduction yields an
    ``ndarray``, otherwise a single randomised value.
    """
    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    if nan:
        reducer = np.nanvar
    else:
        reducer = np.var

    # Dry run on zeros: only the *shape* of the result is of interest.
    output_form = reducer(np.zeros_like(array), axis=axis, keepdims=keepdims)
    out_ndim = np.ndim(output_form)
    vector_out = (out_ndim == 1)
    # Number of inputs folded into each output element.
    n_datapoints = np.sum(np.ones_like(array, dtype=int), axis=axis, keepdims=keepdims).flat[0]

    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.",
            PrivacyLeakWarning,
        )
        if out_ndim <= 1:
            bounds = (
                np.min(array, axis=axis, keepdims=keepdims),
                np.max(array, axis=axis, keepdims=keepdims),
            )
        else:
            bounds = (np.min(array), np.max(array))

    n_bounds = output_form.shape[0] if vector_out else 1
    lower, upper = check_bounds(bounds, n_bounds, dtype=dtype or float)
    clipped = np.clip(array, lower, upper)
    true_var = reducer(clipped, axis=axis, dtype=dtype, keepdims=keepdims)

    def _mechanism_for(width):
        # Configure a fresh mechanism for one output element.
        return LaplaceBoundedDomain().set_epsilon(epsilon).set_bounds(0, float("inf")) \
            .set_sensitivity((width / n_datapoints) ** 2 * (n_datapoints - 1))

    if not isinstance(true_var, np.ndarray):
        local_diam = upper[0] - lower[0]
        scalar_mech = _mechanism_for(local_diam)
        accountant.spend(epsilon, 0)
        return np.minimum(scalar_mech.randomise(true_var), local_diam ** 2)

    noisy_var = np.zeros_like(true_var)
    walker = np.nditer(true_var, flags=['multi_index'])
    for _ in walker:
        idx = walker.multi_index
        if vector_out:
            local_diam = upper[idx] - lower[idx]
        else:
            local_diam = upper[0] - lower[0]
        element_mech = _mechanism_for(local_diam)
        # Cap the noisy value at the squared width of the bounds.
        noisy_var[idx] = np.minimum(element_mech.randomise(true_var[idx]), local_diam ** 2)

    accountant.spend(epsilon, 0)

    return noisy_var
def __init__(self, *, epsilon=1.0, bounds_X=None, bounds_y=None, fit_intercept=True, copy_X=True, accountant=None,
             **unused_args):
    """Initialise the estimator (keyword-only arguments).

    ``fit_intercept`` and ``copy_X`` go to the parent constructor, which is
    always given ``n_jobs=None``.  ``epsilon``, ``bounds_X`` and
    ``bounds_y`` are stored as given; ``accountant`` is resolved through
    ``BudgetAccountant.load_default``.  Extra keyword arguments are reported
    via ``self._warn_unused_args``.
    """
    parent_kwargs = {"fit_intercept": fit_intercept, "copy_X": copy_X, "n_jobs": None}
    super().__init__(**parent_kwargs)

    self.epsilon = epsilon
    self.bounds_X = bounds_X
    self.bounds_y = bounds_y
    self.accountant = BudgetAccountant.load_default(accountant)

    self._warn_unused_args(unused_args)
def histogram(sample, epsilon=1.0, bins=10, range=None, weights=None, density=None, accountant=None, **unused_args):
    r"""
    Compute the differentially private histogram of a set of data.

    The counts come from :obj:`numpy.histogram`; each count is then randomised with a
    :class:`.GeometricTruncated` mechanism. If `range` is not given, a :class:`.PrivacyLeakWarning` is issued
    because the range is then derived from the data.

    See :obj:`numpy.histogram` for further usage notes.

    Parameters
    ----------
    sample : array_like
        Input data.  The histogram is computed over the flattened array.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon` to be applied.

    bins : int or sequence of scalars or str, default: 10
        Passed to :obj:`numpy.histogram`.  An int gives the number of equal-width bins in the range, a sequence
        gives monotonically increasing bin edges (including the rightmost edge), and a string names a bin-width
        selection method.

    range : (float, float), optional
        Lower and upper range of the bins.  If not provided, the range is taken from ``sample`` (its minimum and
        maximum).  Values outside the range are ignored.

    weights : array_like, optional
        Weights of the same shape as `sample`, passed to :obj:`numpy.histogram`.  Note that every bin value is
        converted with ``int()`` before noise is added.

    density : bool, optional
        If falsy, the noisy per-bin values are returned.  If truthy, the noisy values are divided by the bin widths
        and by the noisy total (or by 1 if that total is zero).

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    hist : array
        The (noisy) values of the histogram.

    bin_edges : array of dtype float
        The bin edges ``(length(hist)+1)``.

    See Also
    --------
    histogramdd, histogram2d

    Notes
    -----
    All but the last (righthand-most) bin is half-open.  For bin edges ``[1, 2, 3, 4]`` the first bin is ``[1, 2)``,
    the second ``[2, 3)``, and the last ``[3, 4]``, which *includes* 4.

    """
    warn_unused_args(unused_args)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    if range is None:
        warnings.warn(
            "Range parameter has not been specified. Falling back to taking range from the data.\n"
            "To ensure differential privacy, and no additional privacy leakage, the range must be "
            "specified independently of the data (i.e., using domain knowledge).",
            PrivacyLeakWarning,
        )

    # Density normalisation is applied *after* the noise, hence density=None here.
    hist, bin_edges = np.histogram(sample, bins=bins, range=range, weights=weights, density=None)

    dp_mech = GeometricTruncated(epsilon=epsilon, sensitivity=1, lower=0, upper=maxsize)

    dp_hist = np.zeros_like(hist)
    for position, count in enumerate(hist):
        dp_hist[position] = dp_mech.randomise(int(count))

    accountant.spend(epsilon, 0)

    if not density:
        return dp_hist, bin_edges

    bin_sizes = np.array(np.diff(bin_edges), float)
    noisy_total = dp_hist.sum()
    # Guard against dividing by an all-zero histogram.
    normaliser = noisy_total if noisy_total else 1

    return dp_hist / bin_sizes / normaliser, bin_edges
def histogramdd(sample, epsilon=1.0, bins=10, range=None, weights=None, density=None, accountant=None, **unused_args):
    r"""
    Compute the differentially private multidimensional histogram of some data.

    The histogram is computed using :obj:`numpy.histogramdd`, and noise added using :class:`.GeometricTruncated` to
    satisfy differential privacy. If the `range` parameter is not specified correctly, a
    :class:`.PrivacyLeakWarning` is issued. Users are referred to :obj:`numpy.histogramdd` for more usage notes.

    Parameters
    ----------
    sample : (N, D) array, or (D, N) array_like
        The data to be histogrammed.

        Note the unusual interpretation of sample when an array_like:

        * When an array, each row is a coordinate in a D-dimensional space - such as
          ``histogramdd(np.array([p1, p2, p3]))``.
        * When an array_like, each element is the list of values for single coordinate - such as
          ``histogramdd((X, Y, Z))``.

        The first form should be preferred.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon` to be applied.

    bins : sequence or int, default: 10
        The bin specification:

        * A sequence of arrays describing the monotonically increasing bin edges along each dimension.
        * The number of bins for each dimension (nx, ny, ... =bins)
        * The number of bins for all dimensions (nx=ny=...=bins).

    range : sequence, optional
        A sequence of length D, each an optional (lower, upper) tuple giving the outer bin edges to be used if the
        edges are not given explicitly in `bins`. An entry of None in the sequence results in the minimum and
        maximum values being used for the corresponding dimension. The default, None, is equivalent to passing a
        tuple of D None values.

    density : bool, optional
        If False, the default, returns the (noisy) number of samples in each bin. If True, returns the probability
        *density* function at the bin, ``bin_count / sample_count / bin_volume``, computed from the noisy counts.

    weights : (N,) array_like, optional
        An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`, passed to :obj:`numpy.histogramdd`.
        Note that every bin value is converted with ``int()`` before noise is added.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    H : ndarray
        The multidimensional histogram of sample x. See `density` and `weights` for the different possible
        semantics.

    edges : list
        A list of D arrays describing the bin edges for each dimension.

    See Also
    --------
    histogram: 1-D differentially private histogram
    histogram2d: 2-D differentially private histogram

    """
    warn_unused_args(unused_args)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Range only required if bin edges not specified
    if np.array(bins, dtype=object).ndim == 0 or not np.all([np.ndim(_bin) for _bin in bins]):
        # Identity test (rather than ``None in range``) so that tuples are covered too and array-valued
        # entries never trigger an element-wise comparison.
        range_incomplete = range is None or (
            isinstance(range, (list, tuple)) and any(_dim_range is None for _dim_range in range)
        )
        if range_incomplete:
            warnings.warn(
                "Range parameter has not been specified (or has missing elements). Falling back to taking "
                "range from the data.\n "
                "To ensure differential privacy, and no additional privacy leakage, the range must be "
                "specified for each dimension independently of the data (i.e., using domain knowledge).",
                PrivacyLeakWarning)

    # ``normed`` is deliberately not passed: it was removed from numpy.histogramdd in NumPy 1.24 and
    # passing it (even as None) raises TypeError there.  Density is applied below, after the noise.
    hist, bin_edges = np.histogramdd(sample, bins=bins, range=range, weights=weights, density=None)

    dp_mech = GeometricTruncated(epsilon=epsilon, sensitivity=1, lower=0, upper=maxsize)

    dp_hist = np.zeros_like(hist)
    iterator = np.nditer(hist, flags=['multi_index'])

    while not iterator.finished:
        dp_hist[iterator.multi_index] = dp_mech.randomise(int(iterator[0]))
        iterator.iternext()

    dp_hist = dp_hist.astype(float, casting='safe')

    if density:
        # calculate the probability density function
        dims = len(dp_hist.shape)
        dp_hist_sum = dp_hist.sum()
        for i in np.arange(dims):
            # Broadcast the bin widths of dimension i along that axis only.
            shape = np.ones(dims, int)
            shape[i] = dp_hist.shape[i]
            # noinspection PyUnresolvedReferences
            dp_hist = dp_hist / np.diff(bin_edges[i]).reshape(shape)

        # Skip normalisation for an all-zero histogram to avoid dividing by zero.
        if dp_hist_sum > 0:
            dp_hist /= dp_hist_sum

    accountant.spend(epsilon, 0)

    return dp_hist, bin_edges
def quantile(array, quant, epsilon=1.0, bounds=None, axis=None, keepdims=False, accountant=None, **unused_args):
    r"""
    Compute the differentially private quantile of the array.

    Returns the specified quantile with differential privacy.  The quantile is calculated over the flattened array.
    Differential privacy is achieved with the :class:`.Exponential` mechanism, using the method first proposed by
    Smith, 2011.

    Paper link: https://dl.acm.org/doi/pdf/10.1145/1993636.1993743

    Parameters
    ----------
    array : array_like
        Array containing numbers whose quantile is sought.

    quant : float or array-like
        Quantile or array of quantiles.  Each quantile must be in the unit interval [0, 1].  If quant is
        array-like, it is flattened and one result is produced per quantile.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.  When several quantiles are requested, epsilon is split evenly between
        them.

    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).  If omitted they are computed from the data and
        a :class:`.PrivacyLeakWarning` is issued.

    axis : None or int or tuple of ints, optional
        Axis or axes along which the quantile is computed.  With the default, axis=None, the flattened array is
        used.  A non-None value is handled by ``_wrap_axis``.

    keepdims : bool, default: False
        If truthy, the computation is handled by ``_wrap_axis``, which receives this flag.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    m : ndarray
        The quantile value(s).  ``nan`` is returned (without spending budget) if the clipped data contains NaN.

    Raises
    ------
    ValueError
        If any requested quantile lies outside [0, 1].

    See Also
    --------
    numpy.quantile : Equivalent non-private method.
    percentile, median

    """
    warn_unused_args(unused_args)

    if bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.",
            PrivacyLeakWarning,
        )
        bounds = (np.min(array), np.max(array))

    quant = np.ravel(quant)

    if np.any(quant < 0) or np.any(quant > 1):
        raise ValueError("Quantiles must be in the unit interval [0, 1].")

    n_quantiles = len(quant)
    if n_quantiles > 1:
        # One recursive call per quantile, each with an equal share of epsilon.
        per_quantile = []
        for q_i in quant:
            per_quantile.append(
                quantile(array, q_i, epsilon=epsilon / n_quantiles, bounds=bounds, axis=axis, keepdims=keepdims,
                         accountant=accountant)
            )
        return np.array(per_quantile)

    # Dealing with a single quant from now on
    quant = quant.item()

    if axis is not None or keepdims:
        return _wrap_axis(quantile, array, quant=quant, epsilon=epsilon, bounds=bounds, axis=axis,
                          keepdims=keepdims, accountant=accountant)

    # Dealing with a scalar output from now on
    bounds = check_bounds(bounds, shape=0, min_separation=1e-5)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Flatten, clip, then add both bounds as sentinels before sorting.
    clipped = clip_to_bounds(np.ravel(array), bounds)
    k = clipped.size
    ordered = np.append(clipped, list(bounds))
    ordered.sort()

    interval_sizes = np.diff(ordered)

    # Todo: Need to find a way to do this in a differentially private way
    if np.isnan(interval_sizes).any():
        return np.nan

    # Utility of each interval: negative distance of its rank from the target rank.
    rank_utility = -np.abs(np.arange(0, k + 1) - quant * k)
    mech = Exponential(epsilon=epsilon, sensitivity=1, utility=list(rank_utility), measure=list(interval_sizes))
    idx = mech.randomise()

    # Sample uniformly inside the selected interval, using the mechanism's own RNG.
    left_edge = ordered[idx]
    right_edge = ordered[idx + 1]
    output = mech._rng.random() * (right_edge - left_edge) + left_edge

    accountant.spend(epsilon, 0)

    return output
def sample_model2(epsilon=1.0, accountant=None):
    """Check and then spend ``(epsilon, 0.0)`` on the accountant resolved from ``accountant``.

    The accountant is obtained through ``BudgetAccountant.load_default``.
    Nothing is returned.
    """
    delta = 0.0
    budget = BudgetAccountant.load_default(accountant)

    budget.check(epsilon, delta)
    budget.spend(epsilon, delta)