def test_contribution_bounds_already_enforced_sensible_result(self):
    """Checks aggregation without a privacy-id extractor.

    When contribution_bounds_already_enforced is set, no privacy-id
    extractor is required, and per-partition sums should still come out
    close to the true values given a very large budget.
    """
    # Arrange.
    # Set a huge budget so the added noise is negligible and the
    # almost-equal assertion below is stable.
    accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1000,
                                                   total_delta=0.999)
    engine = self._create_dp_engine_default(accountant=accountant)
    aggregate_params, public_partitions = self._create_params_default()
    aggregate_params.contribution_bounds_already_enforced = True
    aggregate_params.metrics = [pipeline_dp.Metrics.SUM]
    # One value of 1 per public partition. Renamed from `input` to avoid
    # shadowing the builtin.
    input_data = [(pk, 1) for pk in public_partitions]
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda x: x[0], value_extractor=lambda x: x[1])
    # No privacy-id extractor: contribution bounds are already enforced.
    data_extractors.privacy_id_extractor = None

    # Act.
    col = engine.aggregate(input_data, aggregate_params, data_extractors,
                           public_partitions)
    accountant.compute_budgets()
    col = list(col)

    # Assert.
    self.assertLen(col, len(public_partitions))
    values = [x[1].sum for x in col]
    # Each partition holds exactly one value of 1; noise is tiny.
    self.assertSequenceAlmostEqual(values, [1.0] * len(public_partitions))
def get_private_movies(movie_views, backend):
    """Obtains the list of movies in a differentially private manner.

    This does not calculate any metrics; it merely returns the list of
    movies, making sure the result is differentially private.
    """
    # Total privacy budget for the partition selection.
    accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=0.1,
                                                   total_delta=1e-6)

    engine = pipeline_dp.DPEngine(accountant, backend)

    # How to extract privacy_id and partition_key from an element of the
    # movie view collection (no value extractor is needed here).
    extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=lambda mv: mv.user_id)

    # Run the DP partition selection.
    select_params = pipeline_dp.SelectPartitionsParams(
        max_partitions_contributed=2)
    dp_result = engine.select_partitions(movie_views,
                                         select_params,
                                         data_extractors=extractors)

    accountant.compute_budgets()
    return dp_result
def calc_dp_rating_metrics(movie_views, backend, public_partitions):
    """Computes DP metrics."""
    # Total privacy budget for the computation.
    accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                   total_delta=1e-6)

    engine = pipeline_dp.DPEngine(accountant, backend)

    # Metrics come from the custom combiner, hence metrics=None.
    agg_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=None,
        max_partitions_contributed=2,
        max_contributions_per_partition=1,
        min_value=1,
        max_value=5,
        public_partitions=public_partitions,
        custom_combiners=[CountCombiner()])

    # How to extract privacy_id, partition_key and value from an element
    # of the movie view collection.
    extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=lambda mv: mv.user_id,
        value_extractor=lambda mv: mv.rating)

    # Run aggregation.
    dp_result = engine.aggregate(movie_views, agg_params, extractors)

    accountant.compute_budgets()
    return dp_result
def main(unused_argv):
    # A local backend runs everything in memory, in pure Python, without
    # any pipeline framework; it is not optimized for large data. For
    # datasets smaller than ~tens of megabytes, local execution without a
    # framework is faster than local mode with Beam or Spark.
    backend = pipeline_dp.LocalBackend()

    # Total privacy budget available for our computation.
    accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                   total_delta=1e-6)

    # Load and parse input data.
    movie_views = parse_file(FLAGS.input_file)

    engine = pipeline_dp.DPEngine(accountant, backend)

    agg_params = pipeline_dp.AggregateParams(
        # Multiple metrics can be computed at once.
        metrics=[
            pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.SUM,
            pipeline_dp.Metrics.PRIVACY_ID_COUNT
        ],
        # Per-user contribution limits:
        # ... at most two movies rated per user,
        max_partitions_contributed=2,
        # ... at most one rating for each movie,
        max_contributions_per_partition=1,
        # ... with ratings clipped to the range [1, 5].
        min_value=1,
        max_value=5)

    # How to extract privacy_id, partition_key and value from an element
    # of movie_views.
    extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=lambda mv: mv.user_id,
        value_extractor=lambda mv: mv.rating)

    # Build the computational graph for the aggregation. All computations
    # are lazy: dp_result is iterable, but iterating it would fail until
    # the budget is computed (below). DPEngine.aggregate may be called
    # multiple times with different metrics to compute.
    dp_result = engine.aggregate(movie_views, agg_params, extractors)

    accountant.compute_budgets()

    # Iterating the lazy result triggers the actual computation.
    dp_result = list(dp_result)

    # Save the results.
    write_to_file(dp_result, FLAGS.output_file)

    return 0
def calc_dp_rating_metrics(movie_views, backend, public_partitions): """Computes DP metrics.""" # Set the total privacy budget. budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-6) # Create a DPEngine instance. dp_engine = pipeline_dp.DPEngine(budget_accountant, backend) params = pipeline_dp.AggregateParams( noise_kind=pipeline_dp.NoiseKind.LAPLACE, metrics=[ pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.SUM, pipeline_dp.Metrics.MEAN, pipeline_dp.Metrics.VARIANCE ] + ([pipeline_dp.Metrics.PRIVACY_ID_COUNT] if not FLAGS.contribution_bounds_already_enforced else []), max_partitions_contributed=2, max_contributions_per_partition=1, min_value=1, max_value=5, contribution_bounds_already_enforced=FLAGS. contribution_bounds_already_enforced) value_extractor = lambda mv: mv.rating if FLAGS.vector_metrics: # Specify which DP aggregated metrics to compute for vector values. params.metrics = [pipeline_dp.Metrics.VECTOR_SUM] params.vector_size = 5 # Size of ratings vector params.vector_max_norm = 1 value_extractor = lambda mv: encode_one_hot(mv.rating - 1, params. vector_size) # Specify how to extract privacy_id, partition_key and value from an # element of movie view collection. data_extractors = pipeline_dp.DataExtractors( partition_extractor=lambda mv: mv.movie_id, privacy_id_extractor=(lambda mv: mv.user_id) if not FLAGS.contribution_bounds_already_enforced else None, value_extractor=value_extractor) # Run aggregation. dp_result = dp_engine.aggregate(movie_views, params, data_extractors, public_partitions) budget_accountant.compute_budgets() reports = dp_engine.explain_computations_report() for report in reports: print(report) return dp_result
def main(unused_argv):
    # Setup Beam with a local runner. For a truly distributed calculation,
    # connect to a Beam cluster (e.g. running on some cloud provider).
    runner = fn_api_runner.FnApiRunner()  # Local Beam runner
    with beam.Pipeline(runner=runner) as pipeline:
        # Total privacy budget available for our computation.
        accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                       total_delta=1e-6)

        # Load the input CSV and normalize its column names.
        df = pd.read_csv(FLAGS.input_file)
        df.rename(inplace=True,
                  columns={
                      'VisitorId': 'user_id',
                      'Time entered': 'enter_time',
                      'Time spent (minutes)': 'spent_minutes',
                      'Money spent (euros)': 'spent_money',
                      'Day': 'day'
                  })
        restaurant_visit_rows = [row for _, row in df.iterrows()]
        beam_data = pipeline | beam.Create(restaurant_visit_rows)

        # Wrap Beam's PCollection into its private version.
        private_visits = beam_data | private_beam.MakePrivate(
            budget_accountant=accountant,
            privacy_id_extractor=lambda row: row.user_id)

        # Calculate the private sum of money spent per day.
        dp_result = private_visits | private_beam.Sum(
            SumParams(noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                      max_partitions_contributed=7,
                      max_contributions_per_partition=2,
                      min_value=1,
                      max_value=100,
                      budget_weight=1,
                      public_partitions=None,
                      partition_extractor=lambda row: row.day,
                      value_extractor=lambda row: row.spent_money))

        accountant.compute_budgets()

        # Save the results.
        dp_result | beam.io.WriteToText(FLAGS.output_file)

    return 0
def main(unused_argv):
    delete_if_exists(FLAGS.output_file)

    # Setup Spark. One worker thread loads the file as a single partition;
    # for a truly distributed calculation, connect to a Spark cluster
    # (e.g. running on some cloud provider).
    master = "local[1]"  # use one worker thread to load the file as 1 partition
    conf = pyspark.SparkConf().setMaster(master)
    sc = pyspark.SparkContext(conf=conf)
    movie_views = sc \
        .textFile(FLAGS.input_file) \
        .mapPartitions(parse_partition)

    # Total privacy budget available for our computation.
    accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                   total_delta=1e-6)

    # Wrap Spark's RDD into its private version.
    private_movie_views = make_private(movie_views, accountant,
                                       lambda mv: mv.user_id)

    # Calculate the private sum of ratings per movie.
    sum_params = SumParams(
        # Per-user contribution limits:
        # ... at most two movies rated per user,
        max_partitions_contributed=2,
        # ... at most one rating for each movie,
        max_contributions_per_partition=1,
        # ... with ratings clipped to [1, 5].
        min_value=1,
        max_value=5,
        # The aggregation key: we're grouping by movies.
        partition_extractor=lambda mv: mv.movie_id,
        # The value we're aggregating: we're summing up ratings.
        value_extractor=lambda mv: mv.rating)
    dp_result = private_movie_views.sum(sum_params)

    accountant.compute_budgets()

    # Save the results.
    dp_result.saveAsTextFile(FLAGS.output_file)

    return 0
def main(unused_argv):
    # Setup Beam with a local runner. For a truly distributed calculation,
    # connect to a Beam cluster (e.g. running on some cloud provider).
    runner = fn_api_runner.FnApiRunner()  # Local Beam runner
    with beam.Pipeline(runner=runner) as pipeline:
        # Total privacy budget available for our computation.
        accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                       total_delta=1e-6)

        # Load and parse input data.
        movie_views_pcol = pipeline | \
            beam.io.ReadFromText(FLAGS.input_file) | \
            beam.ParDo(ParseFile())

        # Wrap Beam's PCollection into its private version.
        private_movie_views = (movie_views_pcol
                               | 'Create private collection' >> MakePrivate(
                                   budget_accountant=accountant,
                                   privacy_id_extractor=lambda mv: mv.user_id))

        # Calculate the private sum of ratings per movie.
        dp_result = private_movie_views | "Private Sum" >> private_beam.Sum(
            SumParams(
                # Per-user contribution limits:
                # ... at most two movies rated per user,
                max_partitions_contributed=2,
                # ... at most one rating for each movie,
                max_contributions_per_partition=1,
                # ... with ratings clipped to [1, 5].
                min_value=1,
                max_value=5,
                # The aggregation key: we're grouping data by movies.
                partition_extractor=lambda mv: mv.movie_id,
                # The value we're aggregating: we're summing up ratings.
                value_extractor=lambda mv: mv.rating))

        accountant.compute_budgets()

        # Save the results.
        dp_result | beam.io.WriteToText(FLAGS.output_file)

    return 0
def test_aggregate_sketches_sum(self, sketches, epsilon, delta,
                                size_lower_bound, size_upper_bound):
    """Checks that aggregating sketches yields a plausible result size."""
    agg_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[pipeline_dp.Metrics.SUM],
        max_partitions_contributed=1,
        max_contributions_per_partition=10,
        min_value=0,
        max_value=10)
    accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=epsilon,
                                                   total_delta=delta)
    engine = peeker_engine.PeekerEngine(accountant, self._pipeline_backend)

    results = engine.aggregate_sketches(sketches, agg_params)
    accountant.compute_budgets()
    results = list(results)

    # The number of output partitions must fall within the expected bounds.
    self.assertLessEqual(len(results), size_upper_bound)
    self.assertGreaterEqual(len(results), size_lower_bound)
def test_aggregate_public_partition_applied(self):
    """Checks that public partitions filter and pad the output."""
    # Arrange.
    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        metrics=[pipeline_dp.Metrics.COUNT],
        max_partitions_contributed=1,
        max_contributions_per_partition=1)
    accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                   total_delta=1e-10)
    public_partitions = ["pk0", "pk1", "pk101"]

    # Input collection has 100 elements, such that each privacy id
    # contributes 1 time and each partition has 1 element.
    col = list(range(100))
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda x: x,
        partition_extractor=lambda x: f"pk{x}",
        value_extractor=lambda x: None)
    engine = dp_engine.UtilityAnalysisEngine(
        budget_accountant=accountant, backend=pipeline_dp.LocalBackend())

    # Act.
    col = engine.aggregate(col=col,
                           params=params,
                           data_extractors=extractors,
                           public_partitions=public_partitions)
    accountant.compute_budgets()
    col = list(col)

    # Assert public partitions are applied, i.e. that pk0 and pk1 are
    # kept, and pk101 is added.
    self.assertEqual(len(col), 3)
    self.assertTrue(any(x[0] == "pk101" for x in col))
def test_create_compound_combiner_with_custom_combiners(self):
    """Checks that a compound combiner built from custom combiners
    requests budget from each of them exactly once and does not return
    named tuples."""
    # Arrange.
    # Create CustomCombiners and stub out their budget-request method.
    custom_combiners = [
        dp_combiners.CustomCombiner(),
        dp_combiners.CustomCombiner()
    ]
    # The enumerate index in the original loop was unused; plain iteration
    # is sufficient.
    for combiner in custom_combiners:
        combiner.request_budget = mock.Mock()
    aggregate_params = self._create_aggregate_params(None)
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(1, 1e-10)

    # Act.
    compound_combiner = dp_combiners.create_compound_combiner_with_custom_combiners(
        aggregate_params, budget_accountant, custom_combiners)

    # Assert.
    # Custom combiners produce plain results, not named tuples.
    self.assertFalse(compound_combiner._return_named_tuple)
    for combiner in custom_combiners:
        combiner.request_budget.assert_called_once()
def main(unused_argv):
    # A local backend runs everything in memory, in pure Python, without
    # any pipeline framework; it is not optimized for large data. For
    # datasets smaller than ~tens of megabytes, local execution without a
    # framework is faster than local mode with Beam or Spark.
    backend = pipeline_dp.LocalBackend()

    # Total privacy budget available for our computation.
    accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                   total_delta=1e-6)

    # Load the input CSV and normalize its column names.
    df = pd.read_csv(FLAGS.input_file)
    df.rename(inplace=True,
              columns={
                  'VisitorId': 'user_id',
                  'Time entered': 'enter_time',
                  'Time spent (minutes)': 'spent_minutes',
                  'Money spent (euros)': 'spent_money',
                  'Day': 'day'
              })
    restaurant_visit_rows = [row for _, row in df.iterrows()]

    engine = pipeline_dp.DPEngine(accountant, backend)

    agg_params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.SUM],
        max_partitions_contributed=3,
        max_contributions_per_partition=2,
        min_value=0,
        max_value=60)

    # How to extract privacy_id, partition_key and value from an element
    # of the visit rows.
    extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda row: row.day,
        privacy_id_extractor=lambda row: row.user_id,
        value_extractor=lambda row: row.spent_money)

    # Build the computational graph for the aggregation. All computations
    # are lazy: dp_result is iterable, but iterating it would fail until
    # the budget is computed (below). DPEngine.aggregate may be called
    # multiple times with different metrics to compute.
    dp_result = engine.aggregate(restaurant_visit_rows,
                                 agg_params,
                                 extractors,
                                 public_partitions=list(range(1, 8)))

    accountant.compute_budgets()

    # Iterating the lazy result triggers the actual computation.
    dp_result = list(dp_result)

    # Save the results.
    write_to_file(dp_result, FLAGS.output_file)

    return 0
def main(unused_argv):
    # A local backend runs everything in memory, in pure Python, without
    # any pipeline framework; it is not optimized for large data. For
    # datasets smaller than ~tens of megabytes, local execution without a
    # framework is faster than local mode with Beam or Spark.
    backend = pipeline_dp.LocalBackend()

    # Total privacy budget available for our computation.
    accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                   total_delta=1e-6)

    # Load the input CSV and normalize its column names.
    df = pd.read_csv(FLAGS.input_file)
    df.rename(inplace=True,
              columns={
                  'VisitorId': 'user_id',
                  'Time entered': 'enter_time',
                  'Time spent (minutes)': 'spent_minutes',
                  'Money spent (euros)': 'spent_money',
                  'Day': 'day'
              })
    # Double the inputs so we have twice as many contributions per
    # partition.
    df_double = pd.concat([df, df])
    df_double.columns = df.columns
    restaurant_visits_rows = [row for _, row in df_double.iterrows()]

    # Create a UtilityAnalysisEngine instance.
    utility_analysis_engine = UtilityAnalysisEngine(accountant, backend)

    # Limit contributions to 1 per partition; contribution error will be
    # half of the count.
    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[pipeline_dp.Metrics.COUNT],
        max_partitions_contributed=1,
        max_contributions_per_partition=1)

    # How to extract privacy_id, partition_key and value from an element
    # of the visit rows.
    extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda row: row.day,
        privacy_id_extractor=lambda row: row.user_id,
        value_extractor=lambda row: row.spent_money)

    # Days 1..7 are treated as public partitions when the flag is set.
    public_partitions = list(range(1, 8)) if FLAGS.public_partitions else None

    dp_result = utility_analysis_engine.aggregate(restaurant_visits_rows,
                                                  params, extractors,
                                                  public_partitions)

    accountant.compute_budgets()

    # Iterating the lazy result triggers the actual computation.
    dp_result = list(dp_result)

    # Save the results.
    write_to_file(dp_result, FLAGS.output_file)

    return 0