def test_mean_calls_aggregate_with_correct_params(self, mock_aggregate):
    """Checks that prdd.mean() delegates to aggregate() with translated params.

    aggregate() is mocked, so this only verifies the delegation contract:
    the keyed RDD passed through and the MeanParams -> AggregateParams
    translation.
    """
    # Arrange
    dist_data = PrivateRDDTest.sc.parallelize([(1, 2.0, "pk1"),
                                               (2, 2.0, "pk1")])
    mock_aggregate.return_value = PrivateRDDTest.sc.parallelize([(2.0,
                                                                  ["pk1"])])
    budget_accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)

    def privacy_id_extractor(x):
        return x[1]

    prdd = private_spark.make_private(dist_data, budget_accountant,
                                      privacy_id_extractor)
    mean_params = agg.MeanParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        max_partitions_contributed=2,
        max_contributions_per_partition=3,
        min_value=1.5,
        max_value=5.78,
        budget_weight=1.1,
        public_partitions=None,
        partition_extractor=lambda x: x[0],
        value_extractor=lambda x: x)

    # Act
    result = prdd.mean(mean_params)

    # Assert
    mock_aggregate.assert_called_once()
    call_args = mock_aggregate.call_args[0]
    # The RDD handed to aggregate() must be keyed by privacy id.
    expected_rdd = dist_data.map(lambda x: (privacy_id_extractor(x), x))
    self.assertListEqual(call_args[0].collect(), expected_rdd.collect())
    expected_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[pipeline_dp.Metrics.MEAN],
        max_partitions_contributed=mean_params.max_partitions_contributed,
        max_contributions_per_partition=mean_params.
        max_contributions_per_partition,
        min_value=mean_params.min_value,
        max_value=mean_params.max_value,
        budget_weight=mean_params.budget_weight,
        public_partitions=mean_params.public_partitions)
    self.assertEqual(call_args[1], expected_params)
    # mean() unwraps the mocked return value: (value, [pk]) -> (value, pk).
    self.assertEqual(result.collect(), [(2.0, "pk1")])
def test_mean_calls_aggregate_with_params(self, mock_aggregate):
    """Checks that the Beam Mean transform delegates to aggregate().

    aggregate() is mocked; the test verifies only the budget accountant
    wiring and the MeanParams -> AggregateParams translation.
    """
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange
        pcol = pipeline | 'Create produce' >> beam.Create([1, 2, 3, 4, 5, 6])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))
        mean_params = aggregate_params.MeanParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            min_value=1,
            max_value=5,
            budget_weight=1,
            public_partitions=[],
            partition_extractor=lambda x: f"pk:{x // 10}",
            value_extractor=lambda x: x)

        # Act
        transformer = private_beam.Mean(mean_params=mean_params)
        private_collection | transformer

        # Assert
        self.assertEqual(transformer._budget_accountant, budget_accountant)
        mock_aggregate.assert_called_once()
        call_args = mock_aggregate.call_args[0]
        expected_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.MEAN],
            max_partitions_contributed=mean_params.
            max_partitions_contributed,
            max_contributions_per_partition=mean_params.
            max_contributions_per_partition,
            min_value=mean_params.min_value,
            max_value=mean_params.max_value,
            public_partitions=mean_params.public_partitions)
        self.assertEqual(expected_params, call_args[1])
def test_mean_with_public_partitions_returns_sensible_result(self):
    """End-to-end health check of prdd.mean() with public partitions.

    Values are clipped into [min_value, max_value]; "pubK2" has no data
    and so should come out near the midpoint of that range.
    """
    # Arrange
    col = [(u, "pubK1", -100) for u in range(30)]
    col += [(u + 30, "pubK1", 100) for u in range(10)]
    col += [(u + 40, "privK1", 100) for u in range(30)]
    dist_data = PrivateRDDTest.sc.parallelize(col)

    # Use very high epsilon and delta to minimize noise and test
    # flakiness.
    budget_accountant = budget_accounting.NaiveBudgetAccountant(
        total_epsilon=800, total_delta=0.999)

    def privacy_id_extractor(x):
        return x[0]

    prdd = private_spark.make_private(dist_data, budget_accountant,
                                      privacy_id_extractor)
    mean_params = agg.MeanParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        max_partitions_contributed=2,
        max_contributions_per_partition=3,
        min_value=1.55,
        max_value=2.789,
        budget_weight=1,
        partition_extractor=lambda x: x[1],
        value_extractor=lambda x: x[2],
        public_partitions=["pubK1", "pubK2"])

    # Act
    result = prdd.mean(mean_params)
    budget_accountant.compute_budgets()

    # Assert
    # This is a health check to validate that the result is sensible.
    # Hence, we use a very large tolerance to reduce test flakiness.
    expected_result_dict = {"pubK1": 1.859, "pubK2": 2.1695}
    actual_result_dict = self.to_dict(result.collect())
    for partition_key, mean in actual_result_dict.items():
        self.assertTrue(
            self.value_per_key_within_tolerance(
                mean, expected_result_dict[partition_key], 5.0))
def test_mean_with_public_partitions_returns_sensible_result(self):
    """End-to-end health check of the Beam Mean transform with public partitions.

    Values are clipped into [min_value, max_value]; "pubK2" has no data
    and so should come out near the midpoint of that range.
    """
    with TestPipeline() as pipeline:
        # Arrange
        col = [(f"{u}", "pubK1", -100.0) for u in range(30)]
        col += [(f"{u + 30}", "pubK1", 100.0) for u in range(10)]
        col += [(f"{u + 40}", "privK1", 100.0) for u in range(30)]
        pcol = pipeline | 'Create produce' >> beam.Create(col)

        # Use very high epsilon and delta to minimize noise and test
        # flakiness.
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=lambda x: x[0]))
        mean_params = aggregate_params.MeanParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            min_value=1.55,  # -100 should be clipped to this value
            max_value=2.7889,  # 100 should be clipped to this value
            budget_weight=1,
            partition_extractor=lambda x: x[1],
            value_extractor=lambda x: x[2])

        # Act
        result = private_collection | private_beam.Mean(
            mean_params=mean_params, public_partitions=["pubK1", "pubK2"])
        budget_accountant.compute_budgets()

        # Assert
        # This is a health check to validate that the result is sensible.
        # Hence, we use a very large tolerance to reduce test flakiness.
        beam_util.assert_that(
            result,
            # pubK2 has no data points therefore the dataset is assumed to be {min_value, max_value}
            beam_util.equal_to(
                [("pubK1", 1.859), ("pubK2", 2.1695)],
                equals_fn=lambda e, a: PrivateBeamTest.
                value_per_key_within_tolerance(e, a, 0.1)))