def test_annotate_call(self, mock_annotate_fn):
    """Checks the annotate hook is called once per DP engine operation.

    One select_partitions call and two aggregate calls are issued against an
    accountant created with num_aggregations=3. Every recorded annotate call
    is expected to carry a third of the total epsilon and delta.

    Args:
        mock_annotate_fn: mock that records the annotate calls (presumably
            injected by a patch decorator that is outside this view).
    """
    # Arrange
    total_epsilon, total_delta = 3, 0.0001
    budget_accountant = NaiveBudgetAccountant(total_epsilon,
                                              total_delta,
                                              num_aggregations=3)
    dp_engine = self._create_dp_engine_default(budget_accountant)
    aggregate_params, public_partitions = self._create_params_default()
    select_partition_params = SelectPartitionsParams(2)
    extractors = self._get_default_extractors()
    input_col = [1, 2, 3]

    # Act
    dp_engine.select_partitions(input_col, select_partition_params,
                                extractors)
    dp_engine.aggregate(input_col, aggregate_params, extractors,
                        public_partitions)
    dp_engine.aggregate(input_col, aggregate_params, extractors,
                        public_partitions)
    budget_accountant.compute_budgets()

    # Assert
    self.assertEqual(3, mock_annotate_fn.call_count)
    for recorded_call in mock_annotate_fn.call_args_list:
        # Index 1 of a recorded call is its keyword-argument dict.
        budget = recorded_call[1]['budget']
        self.assertEqual(total_epsilon / 3, budget.epsilon)
        self.assertEqual(total_delta / 3, budget.delta)
def test_with_noise(self):
    """Checks a noisy CountAccumulator stays close to the true count.

    The accumulator starts with five values; each later add_value call is
    expected to raise the count by one, whatever the payload is.
    """
    accountant = NaiveBudgetAccountant(total_epsilon=10, total_delta=1e-5)
    budget = accountant.request_budget(pipeline_dp.MechanismType.GAUSSIAN)
    accountant.compute_budgets()

    params = pipeline_dp.AggregateParams(
        min_value=0,
        max_value=1,
        max_partitions_contributed=1,
        max_contributions_per_partition=1,
        noise_kind=NoiseKind.GAUSSIAN,
        metrics=[pipeline_dp.Metrics.COUNT])
    count_accumulator = accumulator.CountAccumulator(
        accumulator.CountParams(budget, params), list(range(5)))

    # Allowed absolute deviation caused by the Gaussian noise.
    tolerance = 4
    self.assertAlmostEqual(first=count_accumulator.compute_metrics(),
                           second=5,
                           delta=tolerance)

    # (value to add, expected count after adding it)
    steps = (
        (50, 6),
        (list(range(49)), 7),
        ('*' * 100, 8),
    )
    for value, expected_count in steps:
        count_accumulator.add_value(value)
        self.assertAlmostEqual(first=count_accumulator.compute_metrics(),
                               second=expected_count,
                               delta=tolerance)
def run_e2e_private_partition_selection_large_budget(col, backend):
    """Runs a COUNT/SUM DP aggregation over `col` with a very large budget.

    Each element serves as its own privacy id and value; its partition key
    is f"pk{x//2}". No public partitions are passed to the engine.

    Args:
        col: input collection to aggregate.
        backend: pipeline backend handed to DPEngine.

    Returns:
        The collection produced by DPEngine.aggregate, returned after the
        budgets have been computed.
    """
    # Arrange
    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[agg.Metrics.COUNT, agg.Metrics.SUM],
        min_value=1,
        max_value=10,
        max_partitions_contributed=1,
        max_contributions_per_partition=1)

    # A large budget keeps the noise small and keeps all partition keys.
    accountant = NaiveBudgetAccountant(total_epsilon=100000, total_delta=1)

    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x,
        partition_extractor=lambda x: f"pk{x//2}",
        value_extractor=lambda x: x)

    dp_engine = pipeline_dp.DPEngine(accountant, backend)
    result = dp_engine.aggregate(col=col,
                                 params=params,
                                 data_extractors=extractors)

    accountant.compute_budgets()
    return result
def test_without_noise(self):
    """Checks exact counts when the budget is large enough for assertEqual.

    Covers construction from a list and from a string, add_value, and
    add_accumulator (merging two accumulators).
    """
    accountant = NaiveBudgetAccountant(total_epsilon=1000000,
                                       total_delta=0.9999999)
    budget = accountant.request_budget(pipeline_dp.MechanismType.GAUSSIAN)
    accountant.compute_budgets()

    no_noise = pipeline_dp.AggregateParams(
        min_value=0,
        max_value=1,
        max_partitions_contributed=1,
        max_contributions_per_partition=1,
        noise_kind=NoiseKind.GAUSSIAN,
        metrics=[pipeline_dp.Metrics.COUNT])

    def new_accumulator(values):
        # Every accumulator gets its own CountParams built from the same
        # budget and aggregate params.
        return accumulator.CountAccumulator(
            accumulator.CountParams(budget, no_noise), values)

    # Built from a list.
    self.assertEqual(new_accumulator(list(range(5))).compute_metrics(), 5)

    # Built from a string of 50 characters.
    self.assertEqual(new_accumulator('a' * 50).compute_metrics(), 50)

    # One extra value on top of 50 initial ones.
    grown = new_accumulator(list(range(50)))
    grown.add_value(49)
    self.assertEqual(grown.compute_metrics(), 51)

    # Merging two accumulators of 50 values each.
    left = new_accumulator(list(range(50)))
    right = new_accumulator('a' * 50)
    left.add_accumulator(right)
    self.assertEqual(left.compute_metrics(), 100)
def test_aggregate_report(self):
    """Checks the exact report text produced for three engine calls.

    Two aggregate calls (private partitions, then public partitions) and
    one select_partitions call are issued; each must register a report
    generator whose report matches the expected text verbatim.
    """
    col = [[1], [2], [3], [3]]
    data_extractor = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: f"pid{x}",
        partition_extractor=lambda x: f"pk{x}",
        value_extractor=lambda x: x)

    params1 = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        max_partitions_contributed=3,
        max_contributions_per_partition=2,
        min_value=1,
        max_value=5,
        metrics=[
            pipeline_dp.Metrics.PRIVACY_ID_COUNT,
            pipeline_dp.Metrics.COUNT,
            pipeline_dp.Metrics.MEAN,
        ],
    )
    params2 = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        max_partitions_contributed=1,
        max_contributions_per_partition=3,
        min_value=2,
        max_value=10,
        metrics=[pipeline_dp.Metrics.SUM, pipeline_dp.Metrics.MEAN],
        public_partitions=list(range(1, 40)),
    )
    select_partitions_params = SelectPartitionsParams(
        max_partitions_contributed=2)

    budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                              total_delta=1e-10)
    engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                  backend=pipeline_dp.LocalBackend())

    engine.aggregate(col, params1, data_extractor)
    engine.aggregate(col, params2, data_extractor)
    engine.select_partitions(col, select_partitions_params, data_extractor)

    # One report generator per engine call.
    self.assertEqual(3, len(engine._report_generators))  # pylint: disable=protected-access

    # The budget values quoted in the reports exist only after this call.
    budget_accountant.compute_budgets()

    expected_private_aggregate = (
        "Differentially private: Computing <Metrics: ['privacy_id_count', 'count', 'mean']>"
        "\n1. Per-partition contribution bounding: randomly selected not more than 2 contributions"
        "\n2. Cross-partition contribution bounding: randomly selected not more than 3 partitions per user"
        "\n3. Private Partition selection: using Truncated Geometric method with (eps= 0.1111111111111111, delta = 1.1111111111111111e-11)"
    )
    expected_public_aggregate = (
        "Differentially private: Computing <Metrics: ['sum', 'mean']>"
        "\n1. Public partition selection: dropped non public partitions"
        "\n2. Per-partition contribution bounding: randomly selected not more than 3 contributions"
        "\n3. Cross-partition contribution bounding: randomly selected not more than 1 partitions per user"
        "\n4. Adding empty partitions to public partitions that are missing in data"
    )
    expected_select_partitions = (
        "Differentially private: Computing <Private Partitions>"
        "\n1. Private Partition selection: using Truncated Geometric method with (eps= 0.3333333333333333, delta = 3.3333333333333335e-11)"
    )

    self.assertEqual(engine._report_generators[0].report(),
                     expected_private_aggregate)
    self.assertEqual(engine._report_generators[1].report(),
                     expected_public_aggregate)
    self.assertEqual(engine._report_generators[2].report(),
                     expected_select_partitions)
def test_two_calls_compute_budgets_raise_exception(self):
    """Checks that a second compute_budgets call raises."""
    accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-6)
    accountant.request_budget(mechanism_type=MechanismType.LAPLACE)

    # The first computation is legitimate.
    accountant.compute_budgets()

    # Budget can be computed only once.
    with self.assertRaises(Exception):
        accountant.compute_budgets()
def test_request_after_compute_raise_exception(self):
    """Checks that request_budget raises once budgets have been computed."""
    accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-6)
    accountant.request_budget(mechanism_type=MechanismType.LAPLACE)
    accountant.compute_budgets()

    # Budget can not be requested after it has been already computed.
    with self.assertRaises(Exception):
        accountant.request_budget(mechanism_type=MechanismType.LAPLACE)
def test_num_aggregations(self, num_aggregations):
    """Checks the total budget is split evenly across the aggregations.

    Args:
        num_aggregations: number of aggregations the accountant is
            configured for (presumably supplied by a parameterized
            decorator outside this view).
    """
    total_epsilon, total_delta = 1, 1e-6
    accountant = NaiveBudgetAccountant(total_epsilon=total_epsilon,
                                       total_delta=total_delta,
                                       num_aggregations=num_aggregations)

    for _ in range(num_aggregations):
        # Unit weight for every aggregation, so each gets an equal share.
        aggregation_budget = accountant._compute_budget_for_aggregation(1)
        self.assertAlmostEqual(total_epsilon / num_aggregations,
                               aggregation_budget.epsilon)
        self.assertAlmostEqual(total_delta / num_aggregations,
                               aggregation_budget.delta)

    accountant.compute_budgets()
def test_compute_budgets(self):
    """Checks epsilon is split by weight and delta goes to Gaussian only.

    A Laplace request with the default weight and a Gaussian request with
    weight=3 are expected to get epsilon 0.25 and 0.75 respectively.
    """
    accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-6)
    laplace_budget = accountant.request_budget(noise_kind=NoiseKind.LAPLACE)
    gaussian_budget = accountant.request_budget(
        noise_kind=NoiseKind.GAUSSIAN, weight=3)
    accountant.compute_budgets()

    self.assertEqual(laplace_budget.eps, 0.25)
    # Delta should be 0 if mechanism is Laplace.
    self.assertEqual(laplace_budget.delta, 0)

    self.assertEqual(gaussian_budget.eps, 0.75)
    # The Gaussian request is expected to receive the whole total delta.
    self.assertEqual(gaussian_budget.delta, 1e-6)
def test_select_partitions(self):
    """Checks select_partitions keeps only the partition with many users.

    This test is probabilistic, but the parameters were chosen to ensure
    the test has passed at least 10000 runs.
    """
    # Arrange
    params = SelectPartitionsParams(max_partitions_contributed=1)
    budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                              total_delta=1e-5)

    # The dataset is a list of (user, partition_key) tuples. These
    # partitions are generated to reflect several scenarios.

    # A partition with a sufficient number of users.
    many_users = [(user, "pk-many-contribs") for user in range(25)]

    # A partition with many contributions, but only a few unique users.
    few_users = [(100 + user // 10, "pk-many-contribs-few-users")
                 for user in range(30)]

    # A partition with few contributions.
    few_contribs = [(200 + user, "pk-few-contribs") for user in range(3)]

    # 30 partitions, each with the same group of 25 users. 25 users is
    # sufficient to keep a partition, but because of contribution bounding,
    # far fewer users per partition will be kept.
    bounded_away = [(500 + user, f"few-contribs-after-bound{i}")
                    for i in range(30)
                    for user in range(25)]

    rows = many_users + few_users + few_contribs + bounded_away

    data_extractor = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x[0],
        partition_extractor=lambda x: x[1])
    engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                  backend=pipeline_dp.LocalBackend())

    # Act
    selected = engine.select_partitions(col=rows,
                                        params=params,
                                        data_extractors=data_extractor)
    # Budgets are computed before the result is materialized.
    budget_accountant.compute_budgets()
    selected = list(selected)

    # Assert
    # Only one partition is retained: the one that still has many unique
    # users _after_ applying the "max_partitions_contributed" bound.
    self.assertEqual(["pk-many-contribs"], selected)
def test_compute_budgets(self):
    """Checks epsilon is split by weight and delta goes to Gaussian only.

    A Laplace request with the default weight and a Gaussian request with
    weight=3 are expected to get epsilon 0.25 and 0.75 respectively.
    """
    accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-6)
    laplace_budget = accountant.request_budget(
        mechanism_type=MechanismType.LAPLACE)
    gaussian_budget = accountant.request_budget(
        mechanism_type=MechanismType.GAUSSIAN, weight=3)
    accountant.compute_budgets()

    self.assertEqual(laplace_budget.eps, 0.25)
    # Delta should be 0 if mechanism is Laplace.
    self.assertEqual(laplace_budget.delta, 0)

    self.assertEqual(gaussian_budget.eps, 0.75)
    # The Gaussian request is expected to receive the whole total delta.
    self.assertEqual(gaussian_budget.delta, 1e-6)
def test_aggregation_weights(self):
    """Checks each aggregation gets a budget share proportional to its weight."""
    total_epsilon, total_delta = 1, 1e-6
    weights = [1, 2, 5]
    weight_sum = sum(weights)
    accountant = NaiveBudgetAccountant(total_epsilon=total_epsilon,
                                       total_delta=total_delta,
                                       aggregation_weights=weights)

    for weight in weights:
        aggregation_budget = accountant._compute_budget_for_aggregation(
            weight)
        self.assertAlmostEqual(total_epsilon * weight / weight_sum,
                               aggregation_budget.epsilon)
        self.assertAlmostEqual(total_delta * weight / weight_sum,
                               aggregation_budget.delta)

    accountant.compute_budgets()
def test_not_enough_aggregations(self, use_num_aggregations):
    """Checks compute_budgets raises if fewer aggregations ran than declared.

    Args:
        use_num_aggregations: if truthy, two aggregations are declared via
            num_aggregations; otherwise via aggregation_weights=[1, 1].
    """
    # Exactly one of the two ways of declaring 2 aggregations is used; the
    # other argument stays None.
    num_aggregations = 2 if use_num_aggregations else None
    weights = None if use_num_aggregations else [1, 1]

    accountant = NaiveBudgetAccountant(total_epsilon=1,
                                       total_delta=1e-6,
                                       num_aggregations=num_aggregations,
                                       aggregation_weights=weights)

    # Only one of the two declared aggregations takes place.
    accountant._compute_budget_for_aggregation(1)

    # num_aggregations = 2, but only 1 aggregation_scope was created
    with self.assertRaises(ValueError):
        accountant.compute_budgets()
def test_budget_scopes_no_parentscope(self):
    """Checks a top-level request shares budget with a weighted scope.

    The top-level request is expected to count with weight 1.0 next to a
    scope of weight 0.5.
    """
    accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-6)

    # Allocated in the top-level scope with no weight specified
    top_level_budget = accountant.request_budget(
        mechanism_type=MechanismType.LAPLACE)

    with accountant.scope(weight=0.5):
        scoped_budget = accountant.request_budget(
            mechanism_type=MechanismType.LAPLACE)

    accountant.compute_budgets()

    total_weight = 1.0 + 0.5
    self.assertEqual(top_level_budget.eps, 1.0 / total_weight)
    self.assertEqual(scoped_budget.eps, 0.5 / total_weight)
def test_aggregate_public_partitions_add_empty_public_partitions(self):
    """Checks public partitions missing from the data appear in the output.

    Data partitions are pk0..pk9; public partitions are pk0, pk10, pk11,
    so the output must hold exactly these three keys, with near-zero
    metrics for the added empty partition that is checked ("pk10").
    """
    # Arrange
    aggregator_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[
            agg.Metrics.COUNT,
            agg.Metrics.SUM,
            agg.Metrics.PRIVACY_ID_COUNT,
        ],
        min_value=0,
        max_value=1,
        max_partitions_contributed=1,
        max_contributions_per_partition=1,
        public_partitions=["pk0", "pk10", "pk11"])

    # Set a high budget to add close to 0 noise.
    budget_accountant = NaiveBudgetAccountant(total_epsilon=100000,
                                              total_delta=1 - 1e-10)

    # Input collection has 10 elements, such that each privacy id
    # contributes 1 time and each partition has 1 element.
    rows = list(range(10))
    data_extractor = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x,
        partition_extractor=lambda x: f"pk{x}",
        value_extractor=lambda x: 1)

    engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                  backend=pipeline_dp.LocalBackend())

    # Act
    result = engine.aggregate(col=rows,
                              params=aggregator_params,
                              data_extractors=data_extractor)
    budget_accountant.compute_budgets()
    result = list(result)
    partition_keys = [row[0] for row in result]

    # Assert
    # Only public partitions ("pk0") should be kept and empty public
    # partitions ("pk10", "pk11") should be added.
    self.assertEqual(["pk0", "pk10", "pk11"], partition_keys)

    # Metric positions follow the `metrics` list order:
    # 0 = COUNT, 1 = SUM, 2 = PRIVACY_ID_COUNT.
    pk0_metrics = result[0][1]
    for metric_index in range(3):
        self.assertAlmostEqual(1, pk0_metrics[metric_index])  # "pk0" ≈ 1

    pk10_metrics = result[1][1]
    for metric_index in range(3):
        self.assertAlmostEqual(0, pk10_metrics[metric_index])  # "pk10" ≈ 0
def test_aggregate_public_partitions_drop_non_public(self):
    """Checks partitions that are not public are dropped from the output.

    Data partitions are pk0..pk9; public partitions are pk0, pk1, pk10.
    """
    # Arrange
    aggregator_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[
            agg.Metrics.COUNT,
            agg.Metrics.SUM,
            agg.Metrics.PRIVACY_ID_COUNT,
        ],
        min_value=0,
        max_value=1,
        max_partitions_contributed=1,
        max_contributions_per_partition=1,
        public_partitions=["pk0", "pk1", "pk10"])

    # Set an arbitrary budget, we are not interested in the DP outputs, only
    # the partition keys.
    budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                              total_delta=1e-10)

    # Input collection has 10 elements, such that each privacy id
    # contributes 1 time and each partition has 1 element.
    rows = list(range(10))
    data_extractor = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x,
        partition_extractor=lambda x: f"pk{x}",
        value_extractor=lambda x: x)

    engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                  backend=pipeline_dp.LocalBackend())

    # Act
    result = engine.aggregate(col=rows,
                              params=aggregator_params,
                              data_extractors=data_extractor)
    budget_accountant.compute_budgets()
    partition_keys = [row[0] for row in list(result)]

    # Assert
    # Only the public partitions ("pk0", "pk1", "pk10") should be present;
    # the remaining data partitions ("pk2" .. "pk9") should be dropped.
    self.assertEqual(["pk0", "pk1", "pk10"], partition_keys)
def test_budget_scopes(self):
    """Checks budget is split across scopes, then by weight inside each.

    Two sibling scopes weighted 0.4 and 0.6 each hold two Laplace
    requests: one with the default weight and one with an explicit weight.
    """
    accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-6)

    with accountant.scope(weight=0.4):
        first_default = accountant.request_budget(
            mechanism_type=MechanismType.LAPLACE)
        first_weighted = accountant.request_budget(
            mechanism_type=MechanismType.LAPLACE, weight=3)

    with accountant.scope(weight=0.6):
        second_default = accountant.request_budget(
            mechanism_type=MechanismType.LAPLACE)
        second_weighted = accountant.request_budget(
            mechanism_type=MechanismType.LAPLACE, weight=4)

    accountant.compute_budgets()

    # Expected epsilon = scope weight * (request weight / weights in scope).
    expectations = (
        (first_default, 0.4 * (1 / 4)),
        (first_weighted, 0.4 * (3 / 4)),
        (second_default, 0.6 * (1 / 5)),
        (second_weighted, 0.6 * (4 / 5)),
    )
    for budget, expected_eps in expectations:
        self.assertEqual(budget.eps, expected_eps)
def test_aggregate_private_partition_selection_drop_many(self):
    """Checks private partition selection drops most single-user partitions.

    No public partitions are given, so partitions are selected privately.
    """
    # Arrange
    aggregator_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[agg.Metrics.COUNT],
        max_partitions_contributed=1,
        max_contributions_per_partition=1)

    # Set a small budget for dropping most partition keys.
    budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                              total_delta=1e-10)

    # Input collection has 100 elements, such that each privacy id
    # contributes 1 time and each partition has 1 element.
    rows = list(range(100))
    data_extractor = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x,
        partition_extractor=lambda x: f"pk{x}",
        value_extractor=lambda x: None)

    engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                  backend=pipeline_dp.LocalBackend())

    # Act
    result = engine.aggregate(col=rows,
                              params=aggregator_params,
                              data_extractors=data_extractor)
    budget_accountant.compute_budgets()
    kept_partitions = list(result)

    # Assert
    # Most partitions should be dropped by private partition selection.
    # This test is non-deterministic, but it should pass with probability
    # very close to 1.
    self.assertLess(len(kept_partitions), 5)
def test_aggregate_private_partition_selection_keep_everything(self):
    """Checks a large budget keeps every partition with near-exact counts.

    Elements 0..9 map to "pk0" and 100..119 map to "pk1", so the expected
    counts are about 10 and 20.
    """
    # Arrange
    aggregator_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[agg.Metrics.COUNT],
        max_partitions_contributed=1,
        max_contributions_per_partition=1)

    # Set a large budget for having the small noise and keeping all
    # partition keys.
    budget_accountant = NaiveBudgetAccountant(total_epsilon=100000,
                                              total_delta=1e-10)

    rows = list(range(10)) + list(range(100, 120))
    data_extractor = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x,
        partition_extractor=lambda x: f"pk{x//100}",
        value_extractor=lambda x: None)

    engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                  backend=pipeline_dp.LocalBackend())

    # Act
    result = engine.aggregate(col=rows,
                              params=aggregator_params,
                              data_extractors=data_extractor)
    budget_accountant.compute_budgets()
    result = list(result)

    # Assert
    approximate_expected = {"pk0": 10, "pk1": 20}
    self.assertEqual(2, len(result))  # all partition keys are kept.
    for partition_key, metrics_tuple in result:
        self.assertAlmostEqual(approximate_expected[partition_key],
                               metrics_tuple.count,
                               delta=1e-3)
def test_aggregate_report(self):
    """Checks each engine call's report contains the expected fragments.

    Two aggregate calls (the second with public partitions passed as an
    argument) and one select_partitions call are issued; each must
    register a report generator whose text includes the listed substrings.
    """
    col = [[1], [2], [3], [3]]
    data_extractor = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: f"pid{x}",
        partition_extractor=lambda x: f"pk{x}",
        value_extractor=lambda x: x)

    params1 = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        max_partitions_contributed=3,
        max_contributions_per_partition=2,
        min_value=1,
        max_value=5,
        metrics=[
            pipeline_dp.Metrics.PRIVACY_ID_COUNT,
            pipeline_dp.Metrics.COUNT,
            pipeline_dp.Metrics.MEAN,
        ],
    )
    params2 = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        max_partitions_contributed=1,
        max_contributions_per_partition=3,
        min_value=2,
        max_value=10,
        metrics=[pipeline_dp.Metrics.SUM, pipeline_dp.Metrics.MEAN],
    )
    select_partitions_params = SelectPartitionsParams(
        max_partitions_contributed=2)

    budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                              total_delta=1e-10)
    engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                  backend=pipeline_dp.LocalBackend())

    engine.aggregate(col, params1, data_extractor)
    engine.aggregate(col, params2, data_extractor, list(range(1, 40)))
    engine.select_partitions(col, select_partitions_params, data_extractor)

    # One report generator per engine call.
    self.assertEqual(3, len(engine._report_generators))  # pylint: disable=protected-access

    budget_accountant.compute_budgets()

    private_aggregate_fragments = [
        "DPEngine method: aggregate",
        "metrics=['privacy_id_count', 'count', 'mean']",
        " noise_kind=gaussian",
        "max_value=5",
        "Partition selection: private partitions",
        "Cross-partition contribution bounding: for each privacy id "
        "randomly select max(actual_partition_contributed, 3)",
        "Private Partition selection: using Truncated Geometric method "
        "with (eps=",
    ]
    # NOTE(review): params2 sets max_value=10, yet "max_value=5" is expected
    # in this report - confirm against _check_string_contains_strings and
    # the report format.
    public_aggregate_fragments = [
        "metrics=['sum', 'mean']",
        " noise_kind=gaussian",
        "max_value=5",
        "Partition selection: public partitions",
        "Per-partition contribution bounding: for each privacy_id and each"
        "partition, randomly select max(actual_contributions_per_partition, 3)",
        "Adding empty partitions for public partitions that are missing "
        "in data",
    ]
    select_partitions_fragments = [
        "DPEngine method: select_partitions",
        " budget_weight=1",
        "max_partitions_contributed=2",
        "Private Partition selection: using Truncated Geometric method with",
    ]

    self._check_string_contains_strings(
        engine._report_generators[0].report(),
        private_aggregate_fragments,
    )
    self._check_string_contains_strings(
        engine._report_generators[1].report(),
        public_aggregate_fragments,
    )
    self._check_string_contains_strings(
        engine._report_generators[2].report(),
        select_partitions_fragments,
    )