def test_select_partitions_returns_sensible_result(self):
    """End-to-end sanity check for private partition selection on Spark.

    Builds two partitions, each backed by 50 distinct privacy ids, and
    verifies that both survive differentially-private partition selection.
    """
    # Arrange: 50 distinct privacy ids contribute to each of two partitions.
    records = [(uid, "pk1") for uid in range(50)]
    records += [(50 + uid, "pk2") for uid in range(50)]
    input_rdd = PrivateRDDTest.sc.parallelize(records)

    # Use very high epsilon and delta to minimize noise and test
    # flakiness.
    accountant = budget_accounting.NaiveBudgetAccountant(total_epsilon=800,
                                                         total_delta=0.999)
    max_partitions_contributed = 2

    def privacy_id_extractor(record):
        return record[0]

    def partition_extractor(record):
        return record[1]

    # Act
    prdd = private_spark.make_private(input_rdd, accountant,
                                      privacy_id_extractor)
    params = agg.SelectPartitionsParams(
        max_partitions_contributed=max_partitions_contributed)
    selected = prdd.select_partitions(params, partition_extractor)
    accountant.compute_budgets()

    # Assert
    # This is a health check to validate that the result is sensible.
    # Hence, we use a very large tolerance to reduce test flakiness.
    self.assertEqual(sorted(selected.collect()), ["pk1", "pk2"])
def test_select_private_partitions_returns_sensible_result(self):
    """End-to-end sanity check for private partition selection on Beam.

    Two partitions, each with 50 distinct privacy ids, should both be
    kept by the SelectPartitions transform.
    """
    with TestPipeline() as pipeline:
        # Arrange: 50 privacy ids per partition across two partitions.
        records = [(uid, "pk1") for uid in range(50)]
        records += [(50 + uid, "pk2") for uid in range(50)]
        pcol = pipeline | 'Create produce' >> beam.Create(records)

        # Use very high epsilon and delta to minimize noise and test
        # flakiness.
        accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=lambda x: x[0]))

        params = aggregate_params.SelectPartitionsParams(
            max_partitions_contributed=2, budget_weight=0.9)
        partition_extractor = lambda x: x[1]

        # Act
        result = private_collection | private_beam.SelectPartitions(
            select_partitions_params=params,
            partition_extractor=partition_extractor,
            label="Test select partitions")
        accountant.compute_budgets()

        # Assert
        # This is a health check to validate that the result is sensible.
        # Hence, we use a very large tolerance to reduce test flakiness.
        beam_util.assert_that(result, beam_util.equal_to(["pk1", "pk2"]))
def test_select_partitions_calls_select_partitions_with_params(
        self, mock_select_partitions):
    """Verifies the Beam transform forwards its params to the DP engine.

    The engine's select_partitions is mocked (decorator outside this
    view); the test only checks wiring, not DP output.
    """
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange
        pcol = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        params = aggregate_params.SelectPartitionsParams(
            max_partitions_contributed=2, budget_weight=0.5)
        partition_extractor = lambda x: f"pk:{x // 10}"

        # Act
        transformer = private_beam.SelectPartitions(
            select_partitions_params=params,
            partition_extractor=partition_extractor,
            label="Test select partitions")
        private_collection | transformer

        # Assert: the transform captured the accountant and passed the
        # params through to the (mocked) engine call.
        self.assertEqual(transformer._budget_accountant, accountant)
        mock_select_partitions.assert_called_once()
        call_args = mock_select_partitions.call_args[0]
        self.assertEqual(call_args[1], params)
def test_select_partitions_calls_select_partitions_with_correct_params(
        self, mock_aggregate):
    """Verifies the Spark wrapper forwards data and params to the engine.

    The engine call is mocked (decorator outside this view), so only the
    wiring — the keyed RDD and the SelectPartitionsParams — is checked.
    """
    # Arrange
    dist_data = PrivateRDDTest.sc.parallelize([(1, "pk1"), (2, "pk2")])
    expected_result_partitions = ["pk1", "pk2"]
    mock_aggregate.return_value = PrivateRDDTest.sc.parallelize(
        expected_result_partitions)
    budget_accountant = budget_accounting.NaiveBudgetAccountant(
        total_epsilon=1, total_delta=0.01)
    max_partitions_contributed = 2

    def privacy_id_extractor(x):
        return x[0]

    def partition_extractor(x):
        # Fixed: previously returned a one-element set {x[1]}; the
        # extractor should return the partition key itself, matching
        # the sibling tests. Harmless here only because the engine is
        # mocked and never invokes it.
        return x[1]

    # Act
    prdd = private_spark.make_private(dist_data, budget_accountant,
                                      privacy_id_extractor)
    select_partitions_params = agg.SelectPartitionsParams(
        max_partitions_contributed=max_partitions_contributed)
    actual_result = prdd.select_partitions(select_partitions_params,
                                           partition_extractor)

    # Assert
    mock_aggregate.assert_called_once()
    actual_args = mock_aggregate.call_args[0]
    # The wrapper keys each record by its privacy id before delegating.
    actual_rdd = actual_args[0].collect()
    actual_select_partition_params = actual_args[1]
    self.assertListEqual(actual_rdd, [(1, (1, "pk1")), (2, (2, "pk2"))])
    self.assertEqual(
        actual_select_partition_params.max_partitions_contributed,
        max_partitions_contributed)
    # The mocked engine result is passed back through unchanged.
    self.assertEqual(actual_result.collect(), expected_result_partitions)