def test_combine_per_returns_sensible_result(self):
    """Health check for private_beam.CombinePerKey with SumCombineFn.

    Feeds 30 rows of +100.0 and 30 rows of -100.0, each with a distinct
    privacy id and all keyed by "pk1", and expects the combined output to
    match [("pk1", 0.0)] under
    PrivateBeamTest.value_per_key_within_tolerance with tolerance 10.0.
    """
    with TestPipeline() as pipeline:
        # Arrange
        rows = [(f"{u}", "pk1", 100.0) for u in range(30)]
        rows.extend((f"{u + 30}", "pk1", -100.0) for u in range(30))
        source = pipeline | 'Create produce' >> beam.Create(rows)

        # Very high epsilon and delta keep the noise small and the test
        # stable.
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)
        make_private = private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=lambda row: row[0])
        private_rows = source | 'Create private collection' >> make_private
        # Reduce each row to a (partition key, value) pair.
        keyed_values = private_rows | private_beam.Map(
            lambda row: (row[1], row[2]))

        # Act
        combine_params = private_beam.CombinePerKeyParams(
            max_partitions_contributed=2,
            max_contributions_per_partition=1)
        result = keyed_values | private_beam.CombinePerKey(
            SumCombineFn(), combine_params)
        budget_accountant.compute_budgets()

        # Assert
        # Only a sanity check that the output is sensible, hence the large
        # tolerance.
        expected = [("pk1", 0.0)]
        beam_util.assert_that(
            result,
            beam_util.equal_to(
                expected,
                equals_fn=lambda e, a: PrivateBeamTest.
                value_per_key_within_tolerance(e, a, 10.0)))
def test_select_partitions_calls_select_partitions_with_params(
        self, mock_select_partitions):
    """Checks that SelectPartitions forwards its params to the mocked call.

    Args:
        mock_select_partitions: mock that records the underlying
            select-partitions call (presumably injected by a patch
            decorator that is not visible in this chunk).
    """
    with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
        # Arrange
        numbers = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        make_private = private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
        private_collection = (
            numbers | 'Create private collection' >> make_private)
        select_partitions_params = aggregate_params.SelectPartitionsParams(
            max_partitions_contributed=2, budget_weight=0.5)

        # Act
        transformer = private_beam.SelectPartitions(
            select_partitions_params=select_partitions_params,
            partition_extractor=lambda x: f"pk:{x // 10}",
            label="Test select partitions")
        private_collection | transformer

        # Assert
        self.assertEqual(transformer._budget_accountant, budget_accountant)
        mock_select_partitions.assert_called_once()
        # call_args[0] is the tuple of positional arguments; the params
        # object is expected in the second position.
        positional_args = mock_select_partitions.call_args[0]
        self.assertEqual(positional_args[1], select_partitions_params)
def test_select_private_partitions_returns_sensible_result(self):
    """Health check for private_beam.SelectPartitions.

    Builds two partitions, "pk1" and "pk2", each backed by 50 distinct
    privacy ids, and expects both to be present in the output.
    """
    with TestPipeline() as pipeline:
        # Arrange
        rows = [(u, "pk1") for u in range(50)]
        rows.extend((50 + u, "pk2") for u in range(50))
        source = pipeline | 'Create produce' >> beam.Create(rows)

        # Very high epsilon and delta keep the noise small and the test
        # stable.
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)
        make_private = private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=lambda row: row[0])
        private_collection = (
            source | 'Create private collection' >> make_private)
        select_partitions_params = aggregate_params.SelectPartitionsParams(
            max_partitions_contributed=2, budget_weight=0.9)

        # Act
        result = private_collection | private_beam.SelectPartitions(
            select_partitions_params=select_partitions_params,
            partition_extractor=lambda row: row[1],
            label="Test select partitions")
        budget_accountant.compute_budgets()

        # Assert
        # Sanity check only: with this budget both partitions are expected
        # to be kept. No tolerance is involved; equal_to is used with its
        # default comparison.
        expected_partitions = ["pk1", "pk2"]
        beam_util.assert_that(result,
                              beam_util.equal_to(expected_partitions))
def test_map_returns_correct_results_and_accountant(self):
    """Checks that private_beam.Map transforms values and keeps privacy ids.

    The transformed collection must still be a PrivatePCollection, its
    underlying PCollection must hold (privacy id, squared second field)
    pairs, and it must carry the same budget accountant as its input.
    """
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange
        pcol_input = [(1, 2), (2, 3), (3, 4), (4, 5)]
        pcol = pipeline | 'Create produce' >> beam.Create(pcol_input)
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        # Act
        transformed = private_collection | private_beam.Map(
            fn=lambda x: x[1]**2)

        # Assert
        self.assertIsInstance(transformed, private_beam.PrivatePCollection)
        # Build the expectation as a concrete list rather than a lazy
        # `map` object: a map iterator can be consumed only once, so it
        # would appear empty if the matcher iterated it a second time.
        expected = [(PrivateBeamTest.privacy_id_extractor(x), x[1]**2)
                    for x in pcol_input]
        beam_util.assert_that(transformed._pcol,
                              beam_util.equal_to(expected))
        self.assertEqual(transformed._budget_accountant, budget_accountant)
def test_flatmap_returns_correct_results_and_accountant(self):
    """Checks that private_beam.FlatMap expands values and keeps privacy ids.

    Each input pair is expanded into two pairs; every expanded element is
    expected to stay attached to the privacy id of the element it came
    from, and the result must carry the original budget accountant.
    """

    def flat_map_fn(x):
        # Emit (first, second + 0) and (first, second + 1).
        first, second = x[0], x[1]
        expanded = []
        for offset in range(2):
            expanded.append((first, second + offset))
        return expanded

    with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
        # Arrange
        pairs = [(1, 2), (2, 3), (3, 4)]
        source = pipeline | 'Create produce' >> beam.Create(pairs)
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        make_private = private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
        private_collection = (
            source | 'Create private collection' >> make_private)

        # Act
        transformed = private_collection | private_beam.FlatMap(flat_map_fn)

        # Assert
        self.assertIsInstance(transformed, private_beam.PrivatePCollection)
        expected = [
            ('pid:(1, 2)', (1, 2)),
            ('pid:(1, 2)', (1, 3)),
            ('pid:(2, 3)', (2, 3)),
            ('pid:(2, 3)', (2, 4)),
            ('pid:(3, 4)', (3, 4)),
            ('pid:(3, 4)', (3, 5)),
        ]
        beam_util.assert_that(transformed._pcol,
                              beam_util.equal_to(expected))
        self.assertEqual(transformed._budget_accountant, budget_accountant)
def test_privacy_id_count_returns_sensible_result(self):
    """Health check for private_beam.PrivacyIdCount.

    Thirty distinct privacy ids contribute to the single partition "pk1";
    the output is expected to match [("pk1", 30)] under
    PrivateBeamTest.value_per_key_within_tolerance with tolerance 5.
    """
    with TestPipeline() as pipeline:
        # Arrange
        rows = [(u, "pk1") for u in range(30)]
        source = pipeline | 'Create produce' >> beam.Create(rows)

        # Very high epsilon and delta keep the noise small and the test
        # stable.
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)
        make_private = private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=lambda row: row[0])
        private_collection = (
            source | 'Create private collection' >> make_private)
        privacy_id_count_params = aggregate_params.PrivacyIdCountParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            budget_weight=1,
            partition_extractor=lambda row: row[1])

        # Act
        count_transform = private_beam.PrivacyIdCount(
            privacy_id_count_params=privacy_id_count_params)
        result = private_collection | count_transform
        budget_accountant.compute_budgets()

        # Assert
        # Only a sanity check that the output is sensible, hence the large
        # tolerance.
        expected = [("pk1", 30)]
        beam_util.assert_that(
            result,
            beam_util.equal_to(
                expected,
                equals_fn=lambda e, a: PrivateBeamTest.
                value_per_key_within_tolerance(e, a, 5)))
def main(unused_argv):
    """Computes a private per-day sum of money spent from a visits CSV.

    Reads FLAGS.input_file with pandas, wraps the rows into a private
    collection keyed by user_id, applies private_beam.Sum over spent_money
    partitioned by day, and writes the result to FLAGS.output_file.

    Args:
        unused_argv: ignored.

    Returns:
        0.
    """
    # Setup Beam.
    # A local Beam runner is used here. For a truly distributed calculation,
    # connect to a Beam cluster (e.g. running on some cloud provider).
    local_runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=local_runner) as pipeline:
        # Privacy budget available for the whole computation.
        budget_accountant = pipeline_dp.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=1e-6)

        # Load the input and normalize its column names.
        column_names = {
            'VisitorId': 'user_id',
            'Time entered': 'enter_time',
            'Time spent (minutes)': 'spent_minutes',
            'Money spent (euros)': 'spent_money',
            'Day': 'day'
        }
        df = pd.read_csv(FLAGS.input_file)
        df.rename(columns=column_names, inplace=True)
        # iterrows() yields (index, row) pairs; only the rows are needed.
        visit_rows = [row for _, row in df.iterrows()]
        visits_pcol = pipeline | beam.Create(visit_rows)

        # Wrap Beam's PCollection into its private version.
        private_visits = visits_pcol | private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=lambda row: row.user_id)

        # Calculate the private sum.
        sum_params = SumParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=7,
            max_contributions_per_partition=2,
            min_value=1,
            max_value=100,
            budget_weight=1,
            public_partitions=None,
            partition_extractor=lambda row: row.day,
            value_extractor=lambda row: row.spent_money)
        dp_result = private_visits | private_beam.Sum(sum_params)
        budget_accountant.compute_budgets()

        # Save the results.
        dp_result | beam.io.WriteToText(FLAGS.output_file)

    return 0
def test_sum_calls_aggregate_with_params(self, mock_aggregate):
    """Checks that private_beam.Sum passes matching AggregateParams on.

    Args:
        mock_aggregate: mock that records the underlying aggregate call
            (presumably injected by a patch decorator that is not visible
            in this chunk).
    """
    with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
        # Arrange
        values = pipeline | 'Create produce' >> beam.Create(
            float(i) for i in range(1, 7))
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        make_private = private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
        private_collection = (
            values | 'Create private collection' >> make_private)
        sum_params = aggregate_params.SumParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            min_value=1,
            max_value=5,
            budget_weight=1,
            public_partitions=[],
            partition_extractor=lambda x: f"pk:{x // 10}",
            value_extractor=lambda x: x)

        # Act
        transformer = private_beam.Sum(sum_params=sum_params)
        private_collection | transformer

        # Assert
        self.assertEqual(transformer._budget_accountant, budget_accountant)
        mock_aggregate.assert_called_once()
        # call_args[0] is the tuple of positional arguments; the
        # AggregateParams object is expected in the second position.
        positional_args = mock_aggregate.call_args[0]
        expected_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.SUM],
            max_partitions_contributed=(
                sum_params.max_partitions_contributed),
            max_contributions_per_partition=(
                sum_params.max_contributions_per_partition),
            min_value=sum_params.min_value,
            max_value=sum_params.max_value,
            public_partitions=sum_params.public_partitions)
        self.assertEqual(expected_params, positional_args[1])
def main(unused_argv):
    """Computes a private per-movie sum of ratings from a text file.

    Reads FLAGS.input_file, parses it with ParseFile, wraps the parsed
    views into a private collection keyed by user_id, combines the ratings
    per movie_id with DPSumCombineFn, and writes the result to
    FLAGS.output_file.

    Args:
        unused_argv: ignored.

    Returns:
        0.
    """
    # Setup Beam.
    # A local Beam runner is used here. For a truly distributed calculation,
    # connect to a Beam cluster (e.g. running on some cloud provider).
    local_runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=local_runner) as pipeline:
        # Privacy budget available for the whole computation.
        budget_accountant = pipeline_dp.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=1e-6)

        # Load and parse input data.
        movie_views_pcol = (pipeline
                            | beam.io.ReadFromText(FLAGS.input_file)
                            | beam.ParDo(ParseFile()))

        # Wrap Beam's PCollection into its private version.
        make_private = pbeam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=lambda mv: mv.user_id)
        private_movie_views = (
            movie_views_pcol | 'Create private collection' >> make_private)
        # Reduce each view to a (movie_id, rating) pair.
        movie_ratings = private_movie_views | pbeam.Map(
            lambda mv: (mv.movie_id, mv.rating))

        # Calculate the private sum.
        # Limits to how much one user can contribute:
        #   - at most two movies rated per user,
        #   - at most one rating for each movie.
        contribution_bounds = pbeam.CombinePerKeyParams(
            max_partitions_contributed=2,
            max_contributions_per_partition=1)
        dp_result = movie_ratings | pbeam.CombinePerKey(
            DPSumCombineFn(min_value=1, max_value=5), contribution_bounds)
        budget_accountant.compute_budgets()

        # Save the results.
        dp_result | beam.io.WriteToText(FLAGS.output_file)

    return 0
def test_make_private_transform_succeeds(self):
    """Checks that MakePrivate returns a PrivatePCollection.

    The result must also carry the budget accountant that was passed to
    MakePrivate.
    """
    with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
        # Arrange
        numbers = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)

        # Act
        make_private = private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
        private_collection = (
            numbers | 'Create private collection' >> make_private)

        # Assert
        self.assertIsInstance(private_collection,
                              private_beam.PrivatePCollection)
        self.assertEqual(private_collection._budget_accountant,
                         budget_accountant)
def test_transform_with_return_anonymized_enabled_returns_pcollection(self):
    """Checks the output type when return_anonymized=True.

    Applying SimplePrivatePTransform(return_anonymized=True) to a private
    collection must produce a pvalue.PCollection.
    """
    with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
        # Arrange
        numbers = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        make_private = private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
        private_collection = (
            numbers | 'Create private collection' >> make_private)

        # Act
        anonymizing_transform = SimplePrivatePTransform(
            return_anonymized=True)
        transformed = private_collection | anonymizing_transform

        # Assert
        self.assertIsInstance(transformed, pvalue.PCollection)
def test_variance_with_public_partitions_returns_sensible_result(self):
    """Health check for private_beam.Variance with public partitions.

    Input rows go to "pubK1" (30 values of -100.0 and 10 of 100.0) and to
    "privK1" (30 values of 100.0). Only "pubK1" and "pubK2" are passed as
    public partitions, and the output is expected to match
    [("pubK1", 0.288), ("pubK2", 0.0)] under
    PrivateBeamTest.value_per_key_within_tolerance with tolerance 0.1.
    """
    with TestPipeline() as pipeline:
        # Arrange
        rows = [(f"{u}", "pubK1", -100.0) for u in range(30)]
        rows.extend((f"{u + 30}", "pubK1", 100.0) for u in range(10))
        rows.extend((f"{u + 40}", "privK1", 100.0) for u in range(30))
        source = pipeline | 'Create produce' >> beam.Create(rows)

        # Very high epsilon and delta keep the noise small and the test
        # stable.
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=8000, total_delta=0.9999999)
        make_private = private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=lambda row: row[0])
        private_collection = (
            source | 'Create private collection' >> make_private)
        variance_params = aggregate_params.VarianceParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            min_value=1.55,  # -100 should be clipped to this value
            max_value=2.7889,  # 100 should be clipped to this value
            budget_weight=1,
            partition_extractor=lambda row: row[1],
            value_extractor=lambda row: row[2])

        # Act
        variance_transform = private_beam.Variance(
            variance_params=variance_params,
            public_partitions=["pubK1", "pubK2"])
        result = private_collection | variance_transform
        budget_accountant.compute_budgets()

        # Assert
        # Only a sanity check that the output is sensible, hence the large
        # tolerance.
        # Original author's note: pubK2 has no data points, therefore the
        # dataset is assumed to be {min_value, max_value}.
        # NOTE(review): confirm this against the Variance implementation.
        expected = [("pubK1", 0.288), ("pubK2", 0.0)]
        beam_util.assert_that(
            result,
            beam_util.equal_to(
                expected,
                equals_fn=lambda e, a: PrivateBeamTest.
                value_per_key_within_tolerance(e, a, 0.1)))
def test_private_collection_with_non_private_transform_throws_error(self):
    """Checks that a plain Beam transform is rejected on a private collection.

    Piping a PrivatePCollection into beam.Map must raise a TypeError whose
    message says that a PrivatePTransform was expected.
    """
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange
        pcol = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        # Act and Assert
        with self.assertRaises(TypeError) as context:
            (private_collection | 'Non private transform on '
             'PrivatePCollection' >> beam.Map(lambda x: x))

        # These checks are deliberately placed after the assertRaises
        # block: statements following the raising line inside that block
        # would never execute, and context.exception is only populated
        # once the block has exited.
        self.assertIsInstance(private_collection,
                              private_beam.PrivatePCollection)
        # assertIn reports both strings on failure, unlike
        # assertTrue(... in ...), which only reports "False is not true".
        self.assertIn(
            "private_transform should be of type "
            "PrivatePTransform but is ", str(context.exception))
def test_privacy_id_count_calls_aggregate_with_params(self, mock_aggregate):
    """Checks that PrivacyIdCount passes matching AggregateParams on.

    Args:
        mock_aggregate: mock that records the underlying aggregate call
            (presumably injected by a patch decorator that is not visible
            in this chunk).
    """
    with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
        # Arrange
        numbers = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        make_private = private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
        private_collection = (
            numbers | 'Create private collection' >> make_private)
        privacy_id_count_params = aggregate_params.PrivacyIdCountParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            budget_weight=1,
            partition_extractor=lambda x: f"pk:{x // 10}")

        # Act
        transformer = private_beam.PrivacyIdCount(
            privacy_id_count_params=privacy_id_count_params)
        private_collection | transformer

        # Assert
        self.assertEqual(transformer._budget_accountant, budget_accountant)
        mock_aggregate.assert_called_once()
        # call_args[0] is the tuple of positional arguments; the
        # AggregateParams object is expected in the second position.
        positional_args = mock_aggregate.call_args[0]
        expected_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
            max_partitions_contributed=(
                privacy_id_count_params.max_partitions_contributed),
            max_contributions_per_partition=1,
            public_partitions=privacy_id_count_params.public_partitions)
        self.assertEqual(positional_args[1], expected_params)