def test_scalar_output(self):
    """With shape=0, check_bounds returns scalar bounds of the requested dtype."""
    for dtype, expected_type in ((None, Real), (int, int), (float, float)):
        kwargs = {} if dtype is None else {"dtype": dtype}
        lower, upper = check_bounds((1, 2), shape=0, **kwargs)
        self.assertIsInstance(lower, expected_type)
        self.assertIsInstance(upper, expected_type)
def test_min_separation(self):
    """Bounds narrower than min_separation are widened symmetrically about their centre."""
    cases = (
        ((1, 1), 2, 0, 2),
        ((1., 1.), 1, 0.5, 1.5),
        ((0.9, 1.1), 1, 0.5, 1.5),
    )
    for given_bounds, separation, expected_lower, expected_upper in cases:
        lower, upper = check_bounds(given_bounds, min_separation=separation)
        self.assertEqual(expected_lower, lower)
        self.assertEqual(expected_upper, upper)
def _var(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, accountant=None, nan=False):
    """Differentially private variance of ``array``, computed over the flattened array.

    Noise is added via a :class:`LaplaceBoundedDomain` mechanism; axis-wise output is delegated to
    ``_wrap_axis`` so this body only ever handles the scalar case.
    """
    if bounds is None:
        # Deriving bounds from the data itself leaks privacy; warn the caller accordingly.
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    if axis is not None or keepdims:
        # Per-axis outputs: evaluate this function once per scalar output element.
        return _wrap_axis(_var, array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims,
                          accountant=accountant, nan=nan)

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Let's ravel array to be single-dimensional
    array = clip_to_bounds(np.ravel(array), bounds)

    _func = np.nanvar if nan else np.var
    actual_var = _func(array, axis=axis, dtype=dtype, keepdims=keepdims)

    # Sensitivity of the sample variance for one changed record; output domain is
    # [0, (upper - lower)^2 / 4], the maximum possible variance given the bounds.
    dp_mech = LaplaceBoundedDomain(epsilon=epsilon, delta=0,
                                   sensitivity=((upper - lower) / array.size) ** 2 * (array.size - 1),
                                   lower=0, upper=((upper - lower) ** 2) / 4)
    output = dp_mech.randomise(actual_var)

    # Budget is only spent once randomisation has succeeded.
    accountant.spend(epsilon, 0)

    return output
def _sum(array, epsilon=1.0, bounds=None, accountant=None, axis=None, dtype=None, keepdims=False, nan=False):
    """Differentially private sum of ``array``, computed over the flattened array.

    Integer dtypes use the discrete :class:`GeometricTruncated` mechanism, all others
    :class:`LaplaceTruncated`.  Axis-wise output is delegated to ``_wrap_axis``.
    """
    if bounds is None:
        # Deriving bounds from the data itself leaks privacy; warn the caller accordingly.
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    if axis is not None or keepdims:
        # Per-axis outputs: evaluate this function once per scalar output element.
        return _wrap_axis(_sum, array, epsilon=epsilon, bounds=bounds, accountant=accountant, axis=axis, dtype=dtype,
                          keepdims=keepdims, nan=nan)

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Let's ravel array to be single-dimensional
    array = clip_to_bounds(np.ravel(array), bounds)

    _func = np.nansum if nan else np.sum
    actual_sum = _func(array, axis=axis, dtype=dtype, keepdims=keepdims)

    # Choose the discrete mechanism for integral dtypes so the noisy sum stays an integer.
    mech = GeometricTruncated if dtype is not None and issubclass(dtype, Integral) else LaplaceTruncated
    # Sensitivity is the bound diameter; output is truncated to the feasible range of the sum.
    mech = mech(epsilon=epsilon, sensitivity=upper - lower, lower=lower * array.size, upper=upper * array.size)
    output = mech.randomise(actual_sum)

    accountant.spend(epsilon, 0)

    return output
def _preprocess_data(X, y, fit_intercept, epsilon=1.0, bounds_X=None, bounds_y=None, copy=True, check_input=True,
                     **unused_args):
    """Differentially private analogue of sklearn's ``_preprocess_data``.

    When ``fit_intercept`` is True, clips ``X`` and ``y`` to their bounds and centres both using
    differentially private means (each with its own throw-away ``BudgetAccountant``).  Returns the
    processed data together with the offsets and an all-ones ``X_scale``.
    """
    warn_unused_args(unused_args)

    if check_input:
        X = check_array(X, copy=copy, accept_sparse=False, dtype=FLOAT_DTYPES)
    elif copy:
        # Preserve the input's memory layout while still protecting the caller's array.
        X = X.copy(order='K')

    y = np.asarray(y, dtype=X.dtype)
    X_scale = np.ones(X.shape[1], dtype=X.dtype)

    if fit_intercept:
        bounds_X = check_bounds(bounds_X, X.shape[1])
        bounds_y = check_bounds(bounds_y, y.shape[1] if y.ndim > 1 else 1)

        X = clip_to_bounds(X, bounds_X)
        y = clip_to_bounds(y, bounds_y)

        # Private means; fresh accountants so this spend is tracked by the caller's epsilon split.
        X_offset = mean(X, axis=0, bounds=bounds_X, epsilon=epsilon, accountant=BudgetAccountant())
        X -= X_offset
        y_offset = mean(y, axis=0, bounds=bounds_y, epsilon=epsilon, accountant=BudgetAccountant())
        y = y - y_offset
    else:
        # No centring requested: zero offsets of the appropriate shape/dtype.
        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_scale
def _sum(array, epsilon=1.0, bounds=None, accountant=None, axis=None, dtype=None, keepdims=np._NoValue, nan=False):
    """Differentially private sum with per-element noise for vector outputs (legacy API).

    Supports axis-wise output directly: each output element is randomised independently with the
    full ``epsilon`` via a truncated mechanism (geometric for integral dtypes, Laplace otherwise).
    """
    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    _func = np.nansum if nan else np.sum
    # Probe the output shape without touching the data values.
    output_form = _func(np.zeros_like(array), axis=axis, keepdims=keepdims)
    vector_out = (np.ndim(output_form) == 1)
    # Number of data points contributing to each output element.
    n_datapoints = np.sum(np.ones_like(array, dtype=int), axis=axis, keepdims=keepdims).flat[0]

    if bounds is None:
        # Deriving bounds from the data itself leaks privacy; warn the caller accordingly.
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        if np.ndim(output_form) <= 1:
            bounds = (np.min(array, axis=axis, keepdims=keepdims), np.max(array, axis=axis, keepdims=keepdims))
        else:
            bounds = (np.min(array), np.max(array))

    lower, upper = check_bounds(bounds, output_form.shape[0] if vector_out else 1, dtype=dtype or float)

    array = np.clip(array, lower, upper)

    actual_sum = _func(array, axis=axis, dtype=dtype, keepdims=keepdims)

    # Discrete mechanism for integral dtypes so the noisy sum stays an integer.
    dp_mech = GeometricTruncated if dtype is not None and issubclass(dtype, Integral) else LaplaceTruncated

    if isinstance(actual_sum, np.ndarray):
        dp_sum = np.zeros_like(actual_sum, dtype=dtype)
        iterator = np.nditer(actual_sum, flags=['multi_index'])

        while not iterator.finished:
            idx = iterator.multi_index
            # Per-element bounds when the output is a vector, shared scalar bounds otherwise.
            _lower, _upper = (lower[idx], upper[idx]) if vector_out else (lower[0], upper[0])
            local_diam = _upper - _lower
            mech = dp_mech().set_epsilon(epsilon).set_sensitivity(local_diam).\
                set_bounds(_lower * n_datapoints, _upper * n_datapoints)

            dp_sum[idx] = mech.randomise(actual_sum[idx])
            iterator.iternext()

        accountant.spend(epsilon, 0)

        return dp_sum

    # Scalar output path.
    local_diam = upper[0] - lower[0]
    mech = dp_mech().set_epsilon(epsilon).set_sensitivity(local_diam).set_bounds(lower[0] * n_datapoints,
                                                                                upper[0] * n_datapoints)

    accountant.spend(epsilon, 0)

    return mech.randomise(actual_sum)
def test_incorrect_entries(self):
    """Mismatched lengths or an extra bounds entry raise ValueError."""
    bad_inputs = (
        ([1, 2], 1),
        ([1, 2], [1, 2, 3]),
        ([1, 2], [1, 2], [1, 2]),
    )
    for bounds in bad_inputs:
        with self.assertRaises(ValueError):
            check_bounds(bounds)
def _var(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=np._NoValue, accountant=None, nan=False):
    """Differentially private variance with per-element noise for vector outputs (legacy API).

    Each output element is randomised independently with a :class:`LaplaceBoundedDomain` mechanism
    and capped at ``local_diam ** 2`` (the maximum value here exceeds the tight diam^2/4 bound —
    presumably intentional slack; TODO confirm).
    """
    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    _func = np.nanvar if nan else np.var
    # Probe the output shape without touching the data values.
    output_form = _func(np.zeros_like(array), axis=axis, keepdims=keepdims)
    vector_out = (np.ndim(output_form) == 1)
    # Number of data points contributing to each output element.
    n_datapoints = np.sum(np.ones_like(array, dtype=int), axis=axis, keepdims=keepdims).flat[0]

    if bounds is None:
        # Deriving bounds from the data itself leaks privacy; warn the caller accordingly.
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        if np.ndim(output_form) <= 1:
            bounds = (np.min(array, axis=axis, keepdims=keepdims), np.max(array, axis=axis, keepdims=keepdims))
        else:
            bounds = (np.min(array), np.max(array))

    lower, upper = check_bounds(bounds, output_form.shape[0] if vector_out else 1, dtype=dtype or float)

    array = np.clip(array, lower, upper)

    actual_var = _func(array, axis=axis, dtype=dtype, keepdims=keepdims)

    if isinstance(actual_var, np.ndarray):
        dp_var = np.zeros_like(actual_var)
        iterator = np.nditer(actual_var, flags=['multi_index'])

        while not iterator.finished:
            idx = iterator.multi_index
            # Per-element diameter when the output is a vector, shared scalar diameter otherwise.
            local_diam = upper[idx] - lower[idx] if vector_out else upper[0] - lower[0]
            dp_mech = LaplaceBoundedDomain().set_epsilon(epsilon).set_bounds(0, float("inf")) \
                .set_sensitivity((local_diam / n_datapoints) ** 2 * (n_datapoints - 1))

            dp_var[iterator.multi_index] = np.minimum(dp_mech.randomise(actual_var[idx]), local_diam ** 2)
            iterator.iternext()

        accountant.spend(epsilon, 0)

        return dp_var

    # Scalar output path.
    local_diam = upper[0] - lower[0]
    dp_mech = LaplaceBoundedDomain().set_epsilon(epsilon).set_bounds(0, float("inf")). \
        set_sensitivity((local_diam / n_datapoints) ** 2 * (n_datapoints - 1))

    accountant.spend(epsilon, 0)

    return np.minimum(dp_mech.randomise(actual_var), local_diam ** 2)
def _wrap_axis(func, array, *, axis, keepdims, epsilon, bounds, **kwargs):
    """Wrapper for functions with axis and keepdims parameters to ensure the function only needs to be evaluated on
    scalar outputs.

    ``epsilon`` is split evenly across the output elements, and per-dimension bounds are sliced out
    for each element when the output is one-dimensional.
    """
    # `dummy` has the shape of the eventual output and doubles as the result buffer.
    dummy = np.zeros_like(array).sum(axis=axis, keepdims=keepdims)
    array = np.asarray(array)
    ndim = array.ndim
    bounds = check_bounds(bounds, np.size(dummy) if np.ndim(dummy) == 1 else 0)

    if isinstance(axis, int):
        axis = (axis, )
    elif axis is None:
        axis = tuple(range(ndim))

    # Ensure all axes are non-negative
    axis = tuple(ndim + ax if ax < 0 else ax for ax in axis)

    if isinstance(dummy, np.ndarray):
        iterator = np.nditer(dummy, flags=['multi_index'])

        while not iterator.finished:
            idx = list(iterator.multi_index)  # Multi index on 'dummy'
            # Slice out this element's bounds when bounds are per-dimension (1-d output).
            _bounds = (bounds[0][idx], bounds[1][idx]) if np.ndim(dummy) == 1 else bounds

            # Construct slicing tuple on 'array'
            if len(idx) + len(axis) > ndim:
                # keepdims case: `idx` already indexes every dimension of `array`.
                full_slice = tuple(slice(None) if ax in axis else idx[ax] for ax in range(ndim))
            else:
                # Reduced dims were dropped: consume `idx` entries (reversed, via pop) only on
                # the non-reduced axes.
                idx.reverse()
                full_slice = tuple(slice(None) if ax in axis else idx.pop() for ax in range(ndim))

            # Epsilon is divided evenly between the output elements.
            dummy[iterator.multi_index] = func(array[full_slice], epsilon=epsilon / dummy.size, bounds=_bounds,
                                               **kwargs)
            iterator.iternext()

        return dummy

    return func(array, bounds=bounds, epsilon=epsilon, **kwargs)
def test_wrong_order(self):
    """A lower bound greater than the upper bound is rejected."""
    self.assertRaises(ValueError, check_bounds, (2, 1))
def fit(self, X, y, sample_weight=None):
    """
    Fit linear model.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Training data

    y : array_like, shape (n_samples, n_targets)
        Target values. Will be cast to X's dtype if necessary

    sample_weight : ignored
        Ignored by diffprivlib. Present for consistency with sklearn API.

    Returns
    -------
    self : returns an instance of self.
    """
    self.accountant.check(self.epsilon, 0)

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    X, y = check_X_y(X, y, accept_sparse=False, y_numeric=True, multi_output=True)

    if self.bounds_X is None or self.bounds_y is None:
        # Deriving bounds from the data itself leaks privacy; warn the caller accordingly.
        warnings.warn(
            "Bounds parameters haven't been specified, so falling back to determining bounds from the "
            "data.\n"
            "This will result in additional privacy leakage. To ensure differential privacy with no "
            "additional privacy loss, specify `bounds_X` and `bounds_y`.", PrivacyLeakWarning)

        if self.bounds_X is None:
            self.bounds_X = (np.min(X, axis=0), np.max(X, axis=0))
        if self.bounds_y is None:
            self.bounds_y = (np.min(y, axis=0), np.max(y, axis=0))

    self.bounds_X = check_bounds(self.bounds_X, X.shape[1])
    self.bounds_y = check_bounds(self.bounds_y, y.shape[1] if y.ndim > 1 else 1)

    n_features = X.shape[1]
    n_targets = y.shape[1] if y.ndim > 1 else 1
    # When fitting an intercept, one (n_features + 1)-th of the budget goes to centring the data.
    epsilon_intercept_scale = 1 / (n_features + 1) if self.fit_intercept else 0

    X, y, X_offset, y_offset, X_scale = self._preprocess_data(
        X, y, fit_intercept=self.fit_intercept, bounds_X=self.bounds_X, bounds_y=self.bounds_y,
        epsilon=self.epsilon * epsilon_intercept_scale, copy=self.copy_X)

    # Shift the bounds to match the (privately) centred data.
    bounds_X = (self.bounds_X[0] - X_offset, self.bounds_X[1] - X_offset)
    bounds_y = (self.bounds_y[0] - y_offset, self.bounds_y[1] - y_offset)

    # Remaining budget is spent on the noisy regression objective(s), one per target.
    objs, obj_coefs = _construct_regression_obj(
        X, y, bounds_X, bounds_y, epsilon=self.epsilon * (1 - epsilon_intercept_scale), alpha=0)
    coef = np.zeros((n_features, n_targets))
    residues = []

    for i, obj in enumerate(objs):
        # jac=True: the objective returns (value, gradient).
        opt_result = minimize(obj, np.zeros(n_features), jac=True)
        coef[:, i] = opt_result.x
        residues += [opt_result.fun]

    self.coef_ = coef.T
    self._residues = residues
    self._obj_coefs = obj_coefs

    if y.ndim == 1:
        self.coef_ = np.ravel(self.coef_)
        self._residues = self._residues[0]
    self._set_intercept(X_offset, y_offset, X_scale)

    self.accountant.spend(self.epsilon, 0)

    return self
def _fit_full(self, X, n_components):
    """Fit differentially private PCA on ``X``.

    Centres the data with a private mean (unless ``self.centered``), clips rows to
    ``self.data_norm``, perturbs the covariance ``X^T X`` with the Wishart mechanism, and derives
    the components from its SVD.  Returns ``u, s, v`` of the noisy covariance.
    """
    self.accountant.check(self.epsilon, 0)

    n_samples, n_features = X.shape

    if self.centered:
        self.mean_ = np.zeros_like(np.mean(X, axis=0))
    else:
        if self.bounds is None:
            # Deriving bounds from the data itself leaks privacy; warn the caller accordingly.
            warnings.warn(
                "Bounds parameter hasn't been specified, so falling back to determining range from the data.\n"
                "This will result in additional privacy leakage. To ensure differential privacy with no "
                "additional privacy loss, specify `range` for each valued returned by np.mean().",
                PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = check_bounds(self.bounds, n_features)
        # Half the budget is spent on the private mean; the rest on the Wishart mechanism below.
        self.mean_ = mean(X, epsilon=self.epsilon / 2, bounds=self.bounds, axis=0, accountant=BudgetAccountant())

    X -= self.mean_

    if self.data_norm is None:
        # Deriving the norm from the data itself leaks privacy; warn the caller accordingly.
        warnings.warn(
            "Data norm has not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning)
        self.data_norm = np.linalg.norm(X, axis=1).max()

    X = clip_to_norm(X, self.data_norm)

    XtX = np.dot(X.T, X)

    # Full budget if already centred, otherwise the half not spent on the mean.
    mech = Wishart().set_epsilon(self.epsilon if self.centered else self.epsilon / 2).\
        set_sensitivity(self.data_norm)
    noisy_input = mech.randomise(XtX)

    u, s, v = np.linalg.svd(noisy_input)
    u, v = svd_flip(u, v)
    # Singular values of XtX are the squares of those of X.
    s = np.sqrt(s)

    components_ = v

    # Get variance explained by singular values
    explained_variance_ = (s ** 2) / (n_samples - 1)
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var
    singular_values_ = s.copy()  # Store the singular values.

    # Post-process the number of components required
    if n_components == 'mle':
        # sklearn renamed _infer_dimension_ to _infer_dimension; support both.
        try:
            n_components = sk_pca._infer_dimension(explained_variance_, n_samples)
        except AttributeError:
            n_components = sk_pca._infer_dimension_(explained_variance_, n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulated explained
        # variance percentage is superior to the desired threshold
        ratio_cumsum = stable_cumsum(explained_variance_ratio_)
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
    self.singular_values_ = singular_values_[:n_components]

    self.accountant.spend(self.epsilon, 0)

    return u, s, v
def test_non_numeric(self):
    """Non-numeric bound values raise ValueError."""
    self.assertRaises(ValueError, check_bounds, ("One", "Two"))
def fit(self, X, y, sample_weight=None):
    """
    Fit linear model.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Training data

    y : array_like, shape (n_samples, n_targets)
        Target values. Will be cast to X's dtype if necessary

    sample_weight : ignored
        Ignored by diffprivlib. Present for consistency with sklearn API.

    Returns
    -------
    self : returns an instance of self.
    """
    self.accountant.check(self.epsilon, 0)

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    X, y = check_X_y(X, y, accept_sparse=False, y_numeric=True, multi_output=True)

    if self.fit_intercept:
        # Bounds are only needed for the private centring performed when fitting an intercept.
        if self.bounds_X is None or self.bounds_y is None:
            # Deriving bounds from the data itself leaks privacy; warn the caller accordingly.
            warnings.warn(
                "Bounds parameters haven't been specified, so falling back to determining bounds from the "
                "data.\n"
                "This will result in additional privacy leakage. To ensure differential privacy with no "
                "additional privacy loss, specify `bounds_X` and `bounds_y`.", PrivacyLeakWarning)

            if self.bounds_X is None:
                self.bounds_X = (np.min(X, axis=0), np.max(X, axis=0))
            if self.bounds_y is None:
                self.bounds_y = (np.min(y, axis=0), np.max(y, axis=0))

        self.bounds_X = check_bounds(self.bounds_X, X.shape[1])
        self.bounds_y = check_bounds(self.bounds_y, y.shape[1] if y.ndim > 1 else 1)

    n_features = X.shape[1]
    # When fitting an intercept, one (n_features + 1)-th of the budget goes to centring the data.
    epsilon_intercept_scale = 1 / (n_features + 1) if self.fit_intercept else 0

    X, y, X_offset, y_offset, X_scale = self._preprocess_data(
        X, y, fit_intercept=self.fit_intercept, bounds_X=self.bounds_X, bounds_y=self.bounds_y,
        epsilon=self.epsilon * epsilon_intercept_scale, copy=self.copy_X)

    if self.data_norm is None:
        # Deriving the norm from the data itself leaks privacy; warn the caller accordingly.
        warnings.warn(
            "Data norm has not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning)
        self.data_norm = np.linalg.norm(X, axis=1).max()

    X = clip_to_norm(X, self.data_norm)

    # Perturb the Gram matrix of the stacked [X | y] with the Wishart mechanism, then read off
    # the noisy X^T X and X^T y blocks.
    A = np.hstack((X, y[:, np.newaxis] if y.ndim == 1 else y))
    AtA = np.dot(A.T, A)

    mech = Wishart().set_epsilon(self.epsilon * (1 - epsilon_intercept_scale)).set_sensitivity(self.data_norm)
    noisy_AtA = mech.randomise(AtA)

    noisy_AtA = noisy_AtA[:n_features, :]
    XtX = noisy_AtA[:, :n_features]
    Xty = noisy_AtA[:, n_features:]

    self.coef_, self._residues, self.rank_, self.singular_ = np.linalg.lstsq(XtX, Xty, rcond=-1)
    self.coef_ = self.coef_.T

    if y.ndim == 1:
        self.coef_ = np.ravel(self.coef_)
    self._set_intercept(X_offset, y_offset, X_scale)

    self.accountant.spend(self.epsilon, 0)

    return self
def fit(self, X, y=None, sample_weight=None):
    """Computes k-means clustering with differential privacy.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        Training instances to cluster.

    y : Ignored
        not used, present here for API consistency by convention.

    sample_weight : ignored
        Ignored by diffprivlib. Present for consistency with sklearn API.

    Returns
    -------
    self : class
    """
    self.accountant.check(self.epsilon, 0)

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    del y

    X = check_array(X, accept_sparse=False, dtype=[np.float64, np.float32])
    n_samples, n_dims = X.shape

    if n_samples < self.n_clusters:
        raise ValueError("n_samples=%d should be >= n_clusters=%d" % (n_samples, self.n_clusters))

    # Number of iterations is fixed up-front so the per-iteration budget split is known.
    iters = self._calc_iters(n_dims, n_samples)

    if self.bounds is None:
        # Deriving bounds from the data itself leaks privacy; warn the caller accordingly.
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify `bounds` for each dimension.", PrivacyLeakWarning)
        self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

    self.bounds = check_bounds(self.bounds, n_dims, min_separation=1e-5)
    X = clip_to_bounds(X, self.bounds)

    centers = self._init_centers(n_dims)
    labels = None
    distances = None

    # Run _update_centers first to ensure consistency of `labels` and `centers`, since convergence unlikely
    for _ in range(-1, iters):
        if labels is not None:
            centers = self._update_centers(X, centers=centers, labels=labels, dims=n_dims, total_iters=iters)

        distances, labels = self._distances_labels(X, centers)

    self.cluster_centers_ = centers
    self.labels_ = labels
    self.inertia_ = distances[np.arange(len(labels)), labels].sum()
    self.n_iter_ = iters

    self.accountant.spend(self.epsilon, 0)

    return self
def test_complex(self):
    """Complex-valued bounds are rejected even when dtype=complex is requested."""
    self.assertRaises(TypeError, check_bounds, (1.0, 1 + 2j), dtype=complex)
def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
    """Incremental fit of the differentially private Gaussian naive Bayes model.

    Clips ``X`` to bounds, then updates per-class means and variances using noisy class counts.
    ``self.epsilon_`` (var_smoothing) is subtracted from the variances before the update and
    re-added afterwards.
    """
    self.accountant.check(self.epsilon, 0)

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    X, y = check_X_y(X, y)

    if self.bounds is None:
        # Deriving bounds from the data itself leaks privacy; warn the caller accordingly.
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

    self.bounds = check_bounds(self.bounds, shape=X.shape[1])
    X = clip_to_bounds(X, self.bounds)

    # NOTE: `epsilon_` here is the variance-smoothing term, not a privacy parameter.
    self.epsilon_ = self.var_smoothing

    if _refit:
        self.classes_ = None

    if _check_partial_fit_first_call(self, classes):
        # First call: initialise per-class statistics.
        n_features = X.shape[1]
        n_classes = len(self.classes_)
        self.theta_ = np.zeros((n_classes, n_features))
        self.sigma_ = np.zeros((n_classes, n_features))

        self.class_count_ = np.zeros(n_classes, dtype=np.float64)

        if self.priors is not None:
            priors = np.asarray(self.priors)

            # Check that the provide prior match the number of classes
            if len(priors) != n_classes:
                raise ValueError("Number of priors must match number of classes.")
            if not np.isclose(priors.sum(), 1.0):
                raise ValueError("The sum of the priors should be 1.")
            if (priors < 0).any():
                raise ValueError("Priors must be non-negative.")
            self.class_prior_ = priors
        else:
            # Initialize the priors to zeros for each class
            self.class_prior_ = np.zeros(len(self.classes_), dtype=np.float64)
    else:
        if X.shape[1] != self.theta_.shape[1]:
            raise ValueError("Number of features %d does not match previous data %d." %
                             (X.shape[1], self.theta_.shape[1]))

        # Put epsilon back in each time
        self.sigma_[:, :] -= self.epsilon_

    classes = self.classes_

    unique_y = np.unique(y)
    unique_y_in_classes = np.in1d(unique_y, classes)

    if not np.all(unique_y_in_classes):
        raise ValueError("The target label(s) %s in y do not exist in the initial classes %s" %
                         (unique_y[~unique_y_in_classes], classes))

    # Class counts are randomised once and reused, so the update is differentially private.
    noisy_class_counts = self._noisy_class_counts(y)

    for _i, y_i in enumerate(unique_y):
        i = classes.searchsorted(y_i)
        X_i = X[y == y_i, :]

        n_i = noisy_class_counts[_i]

        new_theta, new_sigma = self._update_mean_variance(self.class_count_[i], self.theta_[i, :],
                                                          self.sigma_[i, :], X_i, n_noisy=n_i)

        self.theta_[i, :] = new_theta
        self.sigma_[i, :] = new_sigma
        self.class_count_[i] += n_i

    self.sigma_[:, :] += self.epsilon_

    # Update if only no priors is provided
    if self.priors is None:
        # Empirical prior, with sample_weight taken into account
        self.class_prior_ = self.class_count_ / self.class_count_.sum()

    self.accountant.spend(self.epsilon, 0)

    return self
def quantile(array, quant, epsilon=1.0, bounds=None, axis=None, keepdims=False, accountant=None, **unused_args):
    r"""
    Compute the differentially private quantile of the array.

    Returns the specified quantile with differential privacy.  The quantile is calculated over the flattened array.
    Differential privacy is achieved with the :class:`.Exponential` mechanism, using the method first proposed by
    Smith, 2011.

    Paper link: https://dl.acm.org/doi/pdf/10.1145/1993636.1993743

    Parameters
    ----------
    array : array_like
        Array containing numbers whose quantile is sought.  If `array` is not an array, a conversion is attempted.

    quant : float or array-like
        Quantile or array of quantiles.  Each quantile must be in the unit interval [0, 1].  If quant is array-like,
        quantiles are returned over the flattened array.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.  Differential privacy is achieved over the entire output, with epsilon
        split evenly between each output value.

    bounds : tuple, optional
        Bounds of the values of the array, of the form (min, max).

    axis : None or int or tuple of ints, optional
        Axis or axes along which a sum is performed.  The default, axis=None, will sum all of the elements of the
        input array.  If axis is negative it counts from the last to the first axis.

        If axis is a tuple of ints, a sum is performed on all of the axes specified in the tuple instead of a single
        axis or all the axes as before.

    keepdims : bool, default: False
        If this is set to True, the axes which are reduced are left in the result as dimensions with size one.  With
        this option, the result will broadcast correctly against the input array.

        If the default value is passed, then `keepdims` will not be passed through to the `mean` method of
        sub-classes of `ndarray`, however any non-default value will be.  If the sub-class' method does not implement
        `keepdims` any exceptions will be raised.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Returns
    -------
    m : ndarray
        Returns a new array containing the quantile values.

    See Also
    --------
    numpy.quantile : Equivalent non-private method.

    percentile, median
    """
    warn_unused_args(unused_args)

    if bounds is None:
        # Deriving bounds from the data itself leaks privacy; warn the caller accordingly.
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    quant = np.ravel(quant)

    if np.any(quant < 0) or np.any(quant > 1):
        raise ValueError("Quantiles must be in the unit interval [0, 1].")

    if len(quant) > 1:
        # Multiple quantiles: split epsilon evenly and recurse on each.
        return np.array([quantile(array, q_i, epsilon=epsilon / len(quant), bounds=bounds, axis=axis,
                                  keepdims=keepdims, accountant=accountant) for q_i in quant])

    # Dealing with a single quant from now on
    quant = quant.item()

    if axis is not None or keepdims:
        return _wrap_axis(quantile, array, quant=quant, epsilon=epsilon, bounds=bounds, axis=axis,
                          keepdims=keepdims, accountant=accountant)

    # Dealing with a scalar output from now on
    bounds = check_bounds(bounds, shape=0, min_separation=1e-5)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    # Let's ravel array to be single-dimensional
    array = clip_to_bounds(np.ravel(array), bounds)

    k = array.size
    # Append the bounds so every interval between consecutive order statistics is defined.
    array = np.append(array, list(bounds))
    array.sort()

    interval_sizes = np.diff(array)

    # Todo: Need to find a way to do this in a differentially private way
    if np.isnan(interval_sizes).any():
        return np.nan

    # Exponential mechanism over the k+1 intervals: utility decays with distance from the true
    # quantile's rank, weighted by interval size.
    mech = Exponential(epsilon=epsilon, sensitivity=1, utility=list(-np.abs(np.arange(0, k + 1) - quant * k)),
                       measure=list(interval_sizes))
    idx = mech.randomise()
    # Sample uniformly within the chosen interval.
    output = mech._rng.random() * (array[idx + 1] - array[idx]) + array[idx]

    accountant.spend(epsilon, 0)

    return output
def test_non_tuple(self):
    """Bounds supplied as a list instead of a tuple raise TypeError."""
    self.assertRaises(TypeError, check_bounds, [1, 2, 3])
def partial_fit(self, X, y=None, sample_weight=None):
    """Online computation of mean and std with differential privacy on X for later scaling.  All of X is processed
    as a single batch.  This is intended for cases when `fit` is not feasible due to very large number of
    `n_samples` or because X is read from a continuous stream.

    The algorithm for incremental mean and std is given in Equation 1.5a,b in Chan, Tony F., Gene H. Golub, and
    Randall J. LeVeque. "Algorithms for computing the sample variance: Analysis and recommendations." The American
    Statistician 37.3 (1983): 242-247:

    Parameters
    ----------
    X : {array-like}, shape [n_samples, n_features]
        The data used to compute the mean and standard deviation used for later scaling along the features axis.

    y
        Ignored

    sample_weight
        Ignored by diffprivlib.  Present for consistency with sklearn API.
    """
    self.accountant.check(self.epsilon, 0)

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    # Budget is split between mean and variance when both are computed.
    epsilon_0 = self.epsilon / 2 if self.with_std else self.epsilon

    X = check_array(X, accept_sparse=False, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES,
                    force_all_finite='allow-nan')

    # Hotfix for sklearn v 0.23
    self.n_features_in_ = X.shape[1]

    if self.bounds is None:
        # Deriving bounds from the data itself leaks privacy; warn the caller accordingly.
        warnings.warn(
            "Range parameter hasn't been specified, so falling back to determining range from the data.\n"
            "This will result in additional privacy leakage. To ensure differential privacy with no "
            "additional privacy loss, specify `range` for each valued returned by np.mean().",
            PrivacyLeakWarning)
        self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

    self.bounds = check_bounds(self.bounds, X.shape[1])
    X = clip_to_bounds(X, self.bounds)

    # Even in the case of `with_mean=False`, we update the mean anyway. This is needed for the incremental
    # computation of the var See incr_mean_variance_axis and _incremental_mean_variance_axis

    # if n_samples_seen_ is an integer (i.e. no missing values), we need to transform it to a NumPy array of
    # shape (n_features,) required by incr_mean_variance_axis and _incremental_variance_axis
    if hasattr(self, 'n_samples_seen_') and isinstance(self.n_samples_seen_, (int, np.integer)):
        self.n_samples_seen_ = np.repeat(self.n_samples_seen_, X.shape[1]).astype(np.int64)

    if not hasattr(self, 'n_samples_seen_'):
        self.n_samples_seen_ = np.zeros(X.shape[1], dtype=np.int64)

    # First pass
    if not hasattr(self, 'scale_'):
        self.mean_ = .0
        if self.with_std:
            self.var_ = .0
        else:
            self.var_ = None

    if not self.with_mean and not self.with_std:
        self.mean_ = None
        self.var_ = None
        self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
    else:
        self.mean_, self.var_, self.n_samples_seen_ = _incremental_mean_and_var(
            X, epsilon_0, self.bounds, self.mean_, self.var_, self.n_samples_seen_)

    # for backward-compatibility, reduce n_samples_seen_ to an integer
    # if the number of samples is the same for each feature (i.e. no
    # missing values)
    if np.ptp(self.n_samples_seen_) == 0:
        self.n_samples_seen_ = self.n_samples_seen_[0]

    if self.with_std:
        self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
    else:
        self.scale_ = None

    self.accountant.spend(self.epsilon, 0)

    return self
def test_non_numeric(self):
    """Non-numeric bound values raise an error."""
    # NOTE(review): a duplicate test elsewhere asserts ValueError specifically; this one
    # deliberately stays broad — confirm which exception this check_bounds version raises.
    self.assertRaises(Exception, check_bounds, ("One", "Two"))
def test_bad_shape(self):
    """A negative shape raises ValueError; a non-integer shape raises TypeError."""
    self.assertRaises(ValueError, check_bounds, ([1, 1], [2, 2]), shape=-2)
    self.assertRaises(TypeError, check_bounds, ([1, 1], [2, 2]), shape=2.0)
def test_wrong_dims(self):
    """A shape that disagrees with the bounds' dimensionality raises ValueError."""
    self.assertRaises(ValueError, check_bounds, ([1, 1], [2, 2]), shape=3)
def test_consistency(self):
    """check_bounds is idempotent: re-checking its own output leaves the bounds unchanged."""
    first_pass = check_bounds(([1, 1], [2, 2]), shape=2)
    second_pass = check_bounds(first_pass, shape=2)

    for original, rechecked in zip(first_pass, second_pass):
        self.assertTrue(np.all(original == rechecked))
def test_array_output(self):
    """Vector-shaped bounds are returned as numpy arrays."""
    lower, upper = check_bounds(([1, 1], [2, 2]), shape=2)
    self.assertIsInstance(lower, np.ndarray)
    self.assertIsInstance(upper, np.ndarray)