Example #1
# Imports assumed from the surrounding diffprivlib module (paths follow the
# current diffprivlib layout):
import warnings

import numpy as np

from diffprivlib.accountant import BudgetAccountant
from diffprivlib.mechanisms import LaplaceTruncated
from diffprivlib.utils import PrivacyLeakWarning
from diffprivlib.validation import check_bounds


def _mean(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=np._NoValue, accountant=None, nan=False):
    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    _func = np.nanmean if nan else np.mean
    # Evaluate the query on zeros first to learn the output shape, and count the
    # number of datapoints contributing to each output entry.
    output_form = _func(np.zeros_like(array), axis=axis, keepdims=keepdims)
    vector_out = (np.ndim(output_form) == 1)
    n_datapoints = np.sum(np.ones_like(array), axis=axis, keepdims=keepdims).flat[0]

    if bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        if np.ndim(output_form) <= 1:
            bounds = (np.min(array, axis=axis, keepdims=keepdims), np.max(array, axis=axis, keepdims=keepdims))
        else:
            bounds = (np.min(array), np.max(array))

    lower, upper = check_bounds(bounds, output_form.shape[0] if vector_out else 1, dtype=dtype or float)
    array = np.clip(array, lower, upper)

    actual_mean = _func(array, axis=axis, dtype=dtype, keepdims=keepdims)

    if isinstance(actual_mean, np.ndarray):
        dp_mean = np.zeros_like(actual_mean)
        iterator = np.nditer(actual_mean, flags=['multi_index'])

        while not iterator.finished:
            idx = iterator.multi_index
            _lower, _upper = (lower[idx], upper[idx]) if vector_out else (lower[0], upper[0])
            local_diam = _upper - _lower
            # Sensitivity of the mean: changing one record moves it by at most diameter / n.
            dp_mech = LaplaceTruncated().set_epsilon(epsilon).set_sensitivity(local_diam / n_datapoints).\
                set_bounds(_lower, _upper)

            dp_mean[idx] = dp_mech.randomise(actual_mean[idx])
            iterator.iternext()

        accountant.spend(epsilon, 0)

        return dp_mean

    local_diam = upper[0] - lower[0]
    dp_mech = LaplaceTruncated().set_epsilon(epsilon).set_sensitivity(local_diam / n_datapoints).\
        set_bounds(lower[0], upper[0])

    accountant.spend(epsilon, 0)

    return dp_mech.randomise(actual_mean)
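# A minimal usage sketch of the function above (illustrative, not from the
# library): passing explicit bounds avoids the PrivacyLeakWarning and the
# extra leakage it flags.
ages = np.array([23., 45., 31., 52., 38., 29.])
dp_age = _mean(ages, epsilon=0.5, bounds=(18, 65))
assert 18 <= dp_age <= 65  # LaplaceTruncated clamps its output to the bounds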
Example #2
# Same imports as Example #1, plus the _wrap_axis and clip_to_bounds helpers
# from diffprivlib.
def _mean(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, accountant=None, nan=False):
    if bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
        bounds = (np.min(array), np.max(array))

    if axis is not None or keepdims:
        # Axis-wise queries are delegated to _wrap_axis, which applies this
        # scalar _mean along the requested axis.
        return _wrap_axis(_mean, array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims,
                          accountant=accountant, nan=nan)

    lower, upper = check_bounds(bounds, shape=0, dtype=dtype)

    accountant = BudgetAccountant.load_default(accountant)
    accountant.check(epsilon, 0)

    array = clip_to_bounds(np.ravel(array), bounds)

    _func = np.nanmean if nan else np.mean
    actual_mean = _func(array, axis=axis, dtype=dtype, keepdims=keepdims)

    mech = LaplaceTruncated(epsilon=epsilon, delta=0, sensitivity=(upper - lower) / array.size, lower=lower,
                            upper=upper)
    output = mech.randomise(actual_mean)

    accountant.spend(epsilon, 0)

    return output
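# Why sensitivity = (upper - lower) / n: with every value clipped to
# [lower, upper], changing a single record moves the mean by at most that
# amount. A quick numeric check of the bound (illustrative only):
data = np.clip(np.array([0.2, 0.9, 0.4, 0.7]), 0, 1)
neighbour = data.copy()
neighbour[0] = 1.0  # worst-case change to one record
assert abs(neighbour.mean() - data.mean()) <= (1 - 0) / data.size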
Example #3
# Assumes numpy as np, sys.maxsize and diffprivlib's LaplaceTruncated are in
# scope; contingency_table, Factor, JPT and the distribution helpers below come
# from the host library.
def dp_contingency_table(data, epsilon):
    """Compute differentially private contingency table of input data."""
    contingency_table_ = contingency_table(data)

    # If we remove one record from X, the count in one cell decreases by 1 while the rest stay the same.
    sensitivity = 1
    dp_mech = LaplaceTruncated(epsilon=epsilon,
                               lower=0,
                               upper=maxsize,
                               sensitivity=sensitivity)

    contingency_table_values = contingency_table_.values.flatten()
    dp_contingency_table = np.zeros_like(contingency_table_values)
    for i in np.arange(dp_contingency_table.shape[0]):
        # round counts upwards to preserve bins with noisy count between [0, 1]
        dp_contingency_table[i] = np.ceil(
            dp_mech.randomise(contingency_table_values[i]))

    return Factor(dp_contingency_table, states=contingency_table_.states)
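# Counting queries have sensitivity 1: adding or removing a record changes a
# single cell count by at most 1. The same noisy-count step standalone (a
# sketch assuming only numpy, sys.maxsize and diffprivlib):
from sys import maxsize
from diffprivlib.mechanisms import LaplaceTruncated

count_mech = LaplaceTruncated(epsilon=1.0, lower=0, upper=maxsize, sensitivity=1)
noisy_count = np.ceil(count_mech.randomise(42))  # ceil keeps bins with noisy count in [0, 1] alive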
Example #4
def dp_joint_distribution(data, epsilon):
    """Compute differentially private joint distribution of input data."""
    joint_distribution_ = joint_distribution(data)

    # Removing one record from X decreases the probability by ~1/n in one cell
    # of the joint distribution and increases it by ~1/n in total across the
    # remaining cells, giving an L1 sensitivity of ~2/n.
    sensitivity = 2 / data.shape[0]
    dp_mech = LaplaceTruncated(epsilon=epsilon,
                               lower=0,
                               upper=maxsize,
                               sensitivity=sensitivity)

    joint_distribution_values = joint_distribution_.values.flatten()
    dp_joint_distribution_ = np.zeros_like(joint_distribution_values)

    for i in np.arange(dp_joint_distribution_.shape[0]):
        dp_joint_distribution_[i] = dp_mech.randomise(
            joint_distribution_values[i])

    dp_joint_distribution_ = _normalize_distribution(dp_joint_distribution_)
    return JPT(dp_joint_distribution_, states=joint_distribution_.states)
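# _normalize_distribution is not shown above; a plausible sketch (an assumption,
# not the host library's actual helper) clips negative noisy mass and
# re-normalises so the probabilities sum to 1:
def _normalize_distribution_sketch(values):
    values = np.clip(values, 0, None)  # no negative probabilities
    total = values.sum()
    return values / total if total > 0 else np.full_like(values, 1 / values.size)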
Example #5
def dp_marginal_distribution(data, epsilon):
    """Compute differentially private marginal distribution of input data."""
    marginal_ = marginal_distribution(data)

    # Removing one record from X decreases the probability by ~1/n in one cell
    # of the marginal distribution and increases it by ~1/n in total across the
    # remaining cells, giving an L1 sensitivity of ~2/n (checked numerically below).
    sensitivity = 2 / data.shape[0]
    dp_mech = LaplaceTruncated(epsilon=epsilon,
                               lower=0,
                               upper=maxsize,
                               sensitivity=sensitivity)

    marginal_values = marginal_.values.flatten()
    dp_marginal = np.zeros_like(marginal_values)

    for i in np.arange(dp_marginal.shape[0]):
        dp_marginal[i] = dp_mech.randomise(marginal_values[i])

    dp_marginal = _normalize_distribution(dp_marginal)
    return Factor(dp_marginal, states=marginal_.states)
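# Numeric check of the 2/n sensitivity used above: changing one record moves
# 1/n of probability mass from one cell to another, so the L1 distance between
# neighbouring empirical distributions is at most 2/n. (Illustrative only.)
records = np.array([0, 0, 1, 2, 2, 2])
neighbour = records.copy()
neighbour[0] = 1  # change a single record
n = records.size
dist_a = np.bincount(records, minlength=3) / n
dist_b = np.bincount(neighbour, minlength=3) / n
assert np.abs(dist_a - dist_b).sum() <= 2 / n + 1e-12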
Example #6
from unittest import TestCase

import numpy as np

from diffprivlib.mechanisms import LaplaceTruncated
from diffprivlib.utils import global_seed


class TestLaplaceTruncated(TestCase):
    def setup_method(self, method):
        if method.__name__.endswith("prob"):
            global_seed(314159)

        self.mech = LaplaceTruncated()

    def teardown_method(self, method):
        del self.mech

    def test_not_none(self):
        self.assertIsNotNone(self.mech)

    def test_class(self):
        from diffprivlib.mechanisms import DPMechanism
        self.assertTrue(issubclass(LaplaceTruncated, DPMechanism))

    def test_no_params(self):
        with self.assertRaises(ValueError):
            self.mech.randomise(1)

    def test_no_sensitivity(self):
        self.mech.set_epsilon(1).set_bounds(0, 1)
        with self.assertRaises(ValueError):
            self.mech.randomise(1)

    def test_no_epsilon(self):
        self.mech.set_sensitivity(1).set_bounds(0, 1)
        with self.assertRaises(ValueError):
            self.mech.randomise(1)

    def test_inf_epsilon(self):
        self.mech.set_sensitivity(1).set_epsilon(float("inf")).set_bounds(0, 1)

        for i in range(1000):
            self.assertEqual(self.mech.randomise(0.5), 0.5)

    def test_complex_epsilon(self):
        with self.assertRaises(TypeError):
            self.mech.set_epsilon(1 + 2j)

    def test_string_epsilon(self):
        with self.assertRaises(TypeError):
            self.mech.set_epsilon("Two")

    def test_no_bounds(self):
        self.mech.set_sensitivity(1).set_epsilon(1)
        with self.assertRaises(ValueError):
            self.mech.randomise(1)

    def test_non_numeric(self):
        self.mech.set_sensitivity(1).set_epsilon(1).set_bounds(0, 1)
        with self.assertRaises(TypeError):
            self.mech.randomise("Hello")

    def test_zero_median_prob(self):
        self.mech.set_sensitivity(1).set_epsilon(1).set_bounds(0, 1)
        vals = []

        for i in range(10000):
            vals.append(self.mech.randomise(0.5))

        median = float(np.median(vals))
        self.assertAlmostEqual(median, 0.5, delta=0.1)

    def test_neighbors_prob(self):
        epsilon = 1
        runs = 10000
        self.mech.set_sensitivity(1).set_epsilon(epsilon).set_bounds(0, 1)
        count = [0, 0]

        for i in range(runs):
            val0 = self.mech.randomise(0)
            if val0 <= 0.5:
                count[0] += 1

            val1 = self.mech.randomise(1)
            if val1 <= 0.5:
                count[1] += 1

        self.assertGreater(count[0], count[1])
        self.assertLessEqual(count[0] / runs,
                             np.exp(epsilon) * count[1] / runs + 0.1)

    def test_within_bounds(self):
        self.mech.set_sensitivity(1).set_epsilon(1).set_bounds(0, 1)
        vals = []

        for i in range(1000):
            vals.append(self.mech.randomise(0.5))

        vals = np.array(vals)

        self.assertTrue(np.all(vals >= 0))
        self.assertTrue(np.all(vals <= 1))
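# test_neighbors_prob above is the differential-privacy inequality made
# empirical: for any event S, P[M(D) in S] <= e^eps * P[M(D') in S], up to
# sampling error. The same check standalone, using the newer keyword
# constructor (a sketch, not part of the test suite):
import numpy as np
from diffprivlib.mechanisms import LaplaceTruncated

mech = LaplaceTruncated(epsilon=1, sensitivity=1, lower=0, upper=1)
runs = 10000
hits0 = sum(mech.randomise(0) <= 0.5 for _ in range(runs))
hits1 = sum(mech.randomise(1) <= 0.5 for _ in range(runs))
assert hits0 / runs <= np.exp(1) * hits1 / runs + 0.1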
Example #7
    # Method of diffprivlib's GaussianNB; assumes numpy as np, warnings, and
    # diffprivlib's LaplaceTruncated, LaplaceBoundedDomain, PrivacyLeakWarning
    # and warn_unused_args are in scope.
    def _update_mean_variance(self,
                              n_past,
                              mu,
                              var,
                              X,
                              sample_weight=None,
                              n_noisy=None):
        """Compute online update of Gaussian mean and variance.

        Given starting sample count, mean, and variance, and a new set of points X, return the updated mean and
        variance. (NB - each dimension (column) in X is treated as independent -- you get variance, not covariance.)

        Can take scalar mean and variance, or vector mean and variance to simultaneously update a number of
        independent Gaussians.

        See Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:

        http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf

        Parameters
        ----------
        n_past : int
            Number of samples represented in old mean and variance.  If sample weights were given, this should contain
            the sum of sample weights represented in old mean and variance.

        mu : array-like, shape (number of Gaussians,)
            Means for Gaussians in original set.

        var : array-like, shape (number of Gaussians,)
            Variances for Gaussians in original set.

        X : array-like, shape (n_samples, n_features)
            New data to incorporate into the update.

        sample_weight : ignored
            Ignored in diffprivlib.

        n_noisy : int, optional
            Noisy count of the given class, satisfying differential privacy.

        Returns
        -------
        total_mu : array-like, shape (number of Gaussians,)
            Updated mean for each Gaussian over the combined set.

        total_var : array-like, shape (number of Gaussians,)
            Updated variance for each Gaussian over the combined set.
        """
        if n_noisy is None:
            warnings.warn(
                "Noisy class count has not been specified and will be read from the data. To use this "
                "method correctly, make sure it is run by the parent GaussianNB class.",
                PrivacyLeakWarning)
            n_noisy = X.shape[0]

        if not n_noisy:
            return mu, var

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        # Split epsilon evenly across features, using 1/3 of the total budget
        # for each of the mean and the variance.
        n_features = X.shape[1]
        local_epsilon = self.epsilon / 3 / n_features

        new_mu = np.zeros((n_features, ))
        new_var = np.zeros((n_features, ))

        for feature in range(n_features):
            _X = X[:, feature]
            lower, upper = self.bounds[0][feature], self.bounds[1][feature]
            local_diameter = upper - lower

            mech_mu = LaplaceTruncated(epsilon=local_epsilon,
                                       delta=0,
                                       sensitivity=local_diameter,
                                       lower=lower * n_noisy,
                                       upper=upper * n_noisy)
            _mu = mech_mu.randomise(_X.sum()) / n_noisy

            local_sq_sens = max(_mu - lower, upper - _mu)**2
            mech_var = LaplaceBoundedDomain(epsilon=local_epsilon,
                                            delta=0,
                                            sensitivity=local_sq_sens,
                                            lower=0,
                                            upper=local_sq_sens * n_noisy)
            _var = mech_var.randomise(((_X - _mu)**2).sum()) / n_noisy

            new_mu[feature] = _mu
            new_var[feature] = _var

        if n_past == 0:
            return new_mu, new_var

        n_total = float(n_past + n_noisy)

        # Combine mean of old and new data, taking into consideration
        # (weighted) number of observations
        total_mu = (n_noisy * new_mu + n_past * mu) / n_total

        # Combine variance of old and new data, taking into consideration
        # (weighted) number of observations. This is achieved by combining
        # the sum-of-squared-differences (ssd)
        old_ssd = n_past * var
        new_ssd = n_noisy * new_var
        total_ssd = old_ssd + new_ssd + (n_past / float(n_noisy * n_total)) * (
            n_noisy * mu - n_noisy * new_mu)**2
        total_var = total_ssd / n_total

        return total_mu, total_var
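# Sanity check of the Chan-Golub-LeVeque combine step (noise-free, illustrative
# only): merging old stats with new-data stats should match computing the mean
# and population variance of the concatenated data directly.
import numpy as np

old, new = np.array([1.0, 2.0, 3.0]), np.array([4.0, 5.0])
n_past, n_new = old.size, new.size
mu, var, new_mu, new_var = old.mean(), old.var(), new.mean(), new.var()

n_total = n_past + n_new
total_mu = (n_new * new_mu + n_past * mu) / n_total
total_ssd = (n_past * var + n_new * new_var
             + (n_past / (n_new * n_total)) * (n_new * mu - n_new * new_mu) ** 2)
total_var = total_ssd / n_total

combined = np.concatenate([old, new])
assert np.isclose(total_mu, combined.mean()) and np.isclose(total_var, combined.var())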