Example #1
    def test_contribution_bounds_already_enforced_sensible_result(self):
        # Arrange.
        # Set a large budget so that the noise is very small.
        accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1000,
                                                       total_delta=0.999)
        engine = self._create_dp_engine_default(accountant=accountant)
        aggregate_params, public_partitions = self._create_params_default()
        aggregate_params.contribution_bounds_already_enforced = True
        aggregate_params.metrics = [pipeline_dp.Metrics.SUM]

        input = [(pk, 1) for pk in public_partitions]

        data_extractors = pipeline_dp.DataExtractors(
            partition_extractor=lambda x: x[0], value_extractor=lambda x: x[1])
        data_extractors.privacy_id_extractor = None

        # Act.
        col = engine.aggregate(input, aggregate_params, data_extractors,
                               public_partitions)
        accountant.compute_budgets()
        col = list(col)

        # Assert.
        self.assertLen(col, len(public_partitions))
        values = [x[1].sum for x in col]
        self.assertSequenceAlmostEqual(values, [1.0] * len(public_partitions))
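    # The test above assumes two fixture helpers that are not shown. A
    # minimal sketch of plausible versions (the noise kind, bounds and
    # partition keys here are assumptions, not the real fixture):
    def _create_dp_engine_default(self, accountant):
        # Hypothetical: build a DPEngine on the in-memory backend.
        return pipeline_dp.DPEngine(accountant, pipeline_dp.LocalBackend())

    def _create_params_default(self):
        # Hypothetical: default aggregation params plus public partition
        # keys; the test overrides metrics and contribution bounds.
        params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.LAPLACE,
            metrics=[pipeline_dp.Metrics.COUNT],
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            min_value=0,
            max_value=1)
        return params, ["pk0", "pk1"]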
Example #2
def get_private_movies(movie_views, backend):
    """Obtains the list of movies in a differentially private manner.

    This does not calculate any metrics; it merely returns the list of
    movies, making sure the result is differentially private.
    """

    # Set the total privacy budget.
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=0.1,
                                                          total_delta=1e-6)

    # Create a DPEngine instance.
    dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)

    # Specify how to extract privacy_id, partition_key and value from an
    # element of the movie views collection.
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=lambda mv: mv.user_id)

    # Run aggregation.
    dp_result = dp_engine.select_partitions(
        movie_views,
        pipeline_dp.SelectPartitionsParams(max_partitions_contributed=2),
        data_extractors=data_extractors)

    budget_accountant.compute_budgets()
    return dp_result
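# A minimal usage sketch for the function above, assuming the in-memory
# backend used in the other examples and a movie_views iterable of records
# with movie_id and user_id attributes:
backend = pipeline_dp.LocalBackend()
private_movies = get_private_movies(movie_views, backend)
# The result is lazy on the local backend; materialize it to run.
private_movies = list(private_movies)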
Example #3
def calc_dp_rating_metrics(movie_views, backend, public_partitions):
    """Computes DP metrics."""

    # Set the total privacy budget.
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)

    # Create a DPEngine instance.
    dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)

    # Specify which DP aggregated metrics to compute.
    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=None,
        max_partitions_contributed=2,
        max_contributions_per_partition=1,
        min_value=1,
        max_value=5,
        public_partitions=public_partitions,
        custom_combiners=[CountCombiner()])

    # Specify how to extract privacy_id, partition_key and value from an
    # element of the movie views collection.
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=lambda mv: mv.user_id,
        value_extractor=lambda mv: mv.rating)

    # Run aggregation.
    dp_result = dp_engine.aggregate(movie_views, params, data_extractors)

    budget_accountant.compute_budgets()

    return dp_result
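# CountCombiner is not defined above. A minimal sketch of a custom count
# combiner, assuming PipelineDP's CustomCombiner interface with
# create_accumulator / merge_accumulators / compute_metrics plus
# request_budget (treat the exact signatures and the mechanism-type
# argument as assumptions; check pipeline_dp.combiners in your version):
class CountCombiner(pipeline_dp.combiners.CustomCombiner):
    """Counts elements per partition; noise handling is elided."""

    def create_accumulator(self, values):
        return len(values)

    def merge_accumulators(self, count1, count2):
        return count1 + count2

    def compute_metrics(self, count):
        # A real combiner would add noise calibrated to the budget
        # obtained in request_budget().
        return count

    def request_budget(self, budget_accountant):
        # Assumed API: request a share of the total budget.
        self._budget = budget_accountant.request_budget(
            pipeline_dp.aggregate_params.MechanismType.LAPLACE)

    def metrics_names(self):
        return ['count']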
Example #4
def main(unused_argv):
    # Here, we use a local backend for computations. This does not depend on
    # any pipeline framework and it is implemented in pure Python in
    # PipelineDP. It keeps all data in memory and is not optimized for large data.
    # For datasets smaller than ~tens of megabytes, local execution without any
    # framework is faster than local mode with Beam or Spark.
    backend = pipeline_dp.LocalBackend()

    # Define the privacy budget available for our computation.
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)

    # Load and parse input data
    movie_views = parse_file(FLAGS.input_file)

    # Create a DPEngine instance.
    dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)

    params = pipeline_dp.AggregateParams(
        metrics=[
            # we can compute multiple metrics at once.
            pipeline_dp.Metrics.COUNT,
            pipeline_dp.Metrics.SUM,
            pipeline_dp.Metrics.PRIVACY_ID_COUNT
        ],
        # Limits to how much one user can contribute:
        # .. at most two movies rated per user
        max_partitions_contributed=2,
        # .. at most one rating for each movie
        max_contributions_per_partition=1,
        # .. with minimal rating of "1"
        min_value=1,
        # .. and maximum rating of "5"
        max_value=5)

    # Specify how to extract privacy_id, partition_key and value from an
    # element of movie_views.
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=lambda mv: mv.user_id,
        value_extractor=lambda mv: mv.rating)

    # Create a computational graph for the aggregation.
    # All computations are lazy. dp_result is iterable, but iterating it would
    # fail until budget is computed (below).
    # It’s possible to call DPEngine.aggregate multiple times with different
    # metrics to compute.
    dp_result = dp_engine.aggregate(movie_views, params, data_extractors)

    budget_accountant.compute_budgets()

    # Here's where the lazy iterator initiates computations and gets transformed
    # into actual results
    dp_result = list(dp_result)

    # Save the results
    write_to_file(dp_result, FLAGS.output_file)

    return 0
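# parse_file and write_to_file are assumed helpers, not PipelineDP API.
# A plausible sketch, assuming a CSV input of user_id,movie_id,rating
# lines and a MovieView record with the attributes the extractors read:
import collections
import csv

MovieView = collections.namedtuple('MovieView',
                                   ['user_id', 'movie_id', 'rating'])

def parse_file(filename):
    # Hypothetical format: one "user_id,movie_id,rating" row per line.
    with open(filename) as f:
        return [
            MovieView(int(row[0]), int(row[1]), int(row[2]))
            for row in csv.reader(f)
        ]

def write_to_file(rows, filename):
    with open(filename, 'w') as f:
        f.writelines(f'{row}\n' for row in rows)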
Example #5
def calc_dp_rating_metrics(movie_views, backend, public_partitions):
    """Computes DP metrics."""

    # Set the total privacy budget.
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)

    # Create a DPEngine instance.
    dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)

    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[
            pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.SUM,
            pipeline_dp.Metrics.MEAN, pipeline_dp.Metrics.VARIANCE
        ] + ([pipeline_dp.Metrics.PRIVACY_ID_COUNT]
             if not FLAGS.contribution_bounds_already_enforced else []),
        max_partitions_contributed=2,
        max_contributions_per_partition=1,
        min_value=1,
        max_value=5,
        contribution_bounds_already_enforced=(
            FLAGS.contribution_bounds_already_enforced))

    value_extractor = lambda mv: mv.rating

    if FLAGS.vector_metrics:
        # Specify which DP aggregated metrics to compute for vector values.
        params.metrics = [pipeline_dp.Metrics.VECTOR_SUM]
        params.vector_size = 5  # Size of ratings vector
        params.vector_max_norm = 1
        value_extractor = lambda mv: encode_one_hot(mv.rating - 1,
                                                    params.vector_size)

    # Specify how to extract privacy_id, partition_key and value from an
    # element of the movie views collection.
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=(lambda mv: mv.user_id)
        if not FLAGS.contribution_bounds_already_enforced else None,
        value_extractor=value_extractor)

    # Run aggregation.
    dp_result = dp_engine.aggregate(movie_views, params, data_extractors,
                                    public_partitions)

    budget_accountant.compute_budgets()

    reports = dp_engine.explain_computations_report()
    for report in reports:
        print(report)

    return dp_result
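# encode_one_hot is an assumed helper; a minimal sketch matching its use
# above (a rating of 1..5 becomes a 5-element one-hot vector):
def encode_one_hot(index, size):
    # E.g. encode_one_hot(2, 5) -> [0, 0, 1, 0, 0].
    vec = [0] * size
    vec[index] = 1
    return vec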
Example #6
def main(unused_argv):
    # Set up Beam

    # Here, we use a local Beam runner.
    # For a truly distributed calculation, connect to a Beam cluster (e.g.
    # running on some cloud provider).
    runner = fn_api_runner.FnApiRunner()  # Local Beam runner
    with beam.Pipeline(runner=runner) as pipeline:

        # Define the privacy budget available for our computation.
        budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                              total_delta=1e-6)

        # Load and parse input data
        df = pd.read_csv(FLAGS.input_file)
        df.rename(inplace=True,
                  columns={
                      'VisitorId': 'user_id',
                      'Time entered': 'enter_time',
                      'Time spent (minutes)': 'spent_minutes',
                      'Money spent (euros)': 'spent_money',
                      'Day': 'day'
                  })
        restaurant_visits_rows = [index_row[1] for index_row in df.iterrows()]
        beam_data = pipeline | beam.Create(restaurant_visits_rows)

        # Wrap Beam's PCollection into its private version
        private_restaurant_visits = beam_data | private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=lambda row: row.user_id)

        # Calculate the private sum
        dp_result = private_restaurant_visits | private_beam.Sum(
            SumParams(noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                      max_partitions_contributed=7,
                      max_contributions_per_partition=2,
                      min_value=1,
                      max_value=100,
                      budget_weight=1,
                      public_partitions=None,
                      partition_extractor=lambda row: row.day,
                      value_extractor=lambda row: row.spent_money))
        budget_accountant.compute_budgets()

        # Save the results
        dp_result | beam.io.WriteToText(FLAGS.output_file)

    return 0
Example #7
def main(unused_argv):
    delete_if_exists(FLAGS.output_file)

    # Set up Spark

    # Here, we use one worker thread to load the file as 1 partition.
    # For a truly distributed calculation, connect to a Spark cluster (e.g.
    # running on some cloud provider).
    master = "local[1]"
    conf = pyspark.SparkConf().setMaster(master)
    sc = pyspark.SparkContext(conf=conf)
    movie_views = sc \
        .textFile(FLAGS.input_file) \
        .mapPartitions(parse_partition)

    # Define the privacy budget available for our computation.
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)

    # Wrap Spark's RDD into its private version
    private_movie_views = \
        make_private(movie_views, budget_accountant, lambda mv: mv.user_id)

    # Calculate the private sum
    dp_result = private_movie_views.sum(
        SumParams(
            # Limits to how much one user can contribute:
            # .. at most two movies rated per user
            max_partitions_contributed=2,
            # .. at most one rating for each movie
            max_contributions_per_partition=1,
            # .. with minimal rating of "1"
            min_value=1,
            # .. and maximum rating of "5"
            max_value=5,
            # The aggregation key: we're grouping by movies
            partition_extractor=lambda mv: mv.movie_id,
            # The value we're aggregating: we're summing up ratings
            value_extractor=lambda mv: mv.rating))

    budget_accountant.compute_budgets()

    # Save the results
    dp_result.saveAsTextFile(FLAGS.output_file)

    return 0
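# parse_partition is an assumed helper. A plausible sketch, reusing the
# hypothetical MovieView record from Example #4 and the same CSV
# assumption (header or malformed lines are skipped):
def parse_partition(lines):
    # Runs once per Spark partition; yields parsed MovieView records.
    for line in lines:
        try:
            user_id, movie_id, rating = line.split(',')
            yield MovieView(int(user_id), int(movie_id), int(rating))
        except ValueError:
            pass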
Example #8
def main(unused_argv):
    # Set up Beam

    # Here, we use a local Beam runner.
    # For a truly distributed calculation, connect to a Beam cluster (e.g.
    # running on some cloud provider).
    runner = fn_api_runner.FnApiRunner()  # Local Beam runner
    with beam.Pipeline(runner=runner) as pipeline:

        # Define the privacy budget available for our computation.
        budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                              total_delta=1e-6)

        # Load and parse input data
        movie_views_pcol = pipeline | \
                           beam.io.ReadFromText(FLAGS.input_file) | \
                           beam.ParDo(ParseFile())

        # Wrap Beam's PCollection into its private version
        private_movie_views = (movie_views_pcol
                               | 'Create private collection' >> MakePrivate(
                                   budget_accountant=budget_accountant,
                                   privacy_id_extractor=lambda mv: mv.user_id))

        # Calculate the private sum
        dp_result = private_movie_views | "Private Sum" >> private_beam.Sum(
            SumParams(
                # Limits to how much one user can contribute:
                # .. at most two movies rated per user
                max_partitions_contributed=2,
                # .. at most one rating for each movie
                max_contributions_per_partition=1,
                # .. with minimal rating of "1"
                min_value=1,
                # .. and maximum rating of "5"
                max_value=5,
                # The aggregation key: we're grouping data by movies
                partition_extractor=lambda mv: mv.movie_id,
                # The value we're aggregating: we're summing up ratings
                value_extractor=lambda mv: mv.rating))
        budget_accountant.compute_budgets()

        # Save the results
        dp_result | beam.io.WriteToText(FLAGS.output_file)

    return 0
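# ParseFile is an assumed beam.DoFn; a minimal sketch under the same CSV
# assumption as the other examples:
class ParseFile(beam.DoFn):

    def process(self, line):
        # One "user_id,movie_id,rating" line -> MovieView record.
        try:
            user_id, movie_id, rating = line.split(',')
            yield MovieView(int(user_id), int(movie_id), int(rating))
        except ValueError:
            pass  # Skip the header or malformed lines.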
Example #9
    def test_aggregate_sketches_sum(self, sketches, epsilon, delta,
                                    size_lower_bound, size_upper_bound):
        params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.LAPLACE,
            metrics=[pipeline_dp.Metrics.SUM],
            max_partitions_contributed=1,
            max_contributions_per_partition=10,
            min_value=0,
            max_value=10)
        budget_accountant = pipeline_dp.NaiveBudgetAccountant(
            total_epsilon=epsilon, total_delta=delta)
        dp_engine = peeker_engine.PeekerEngine(budget_accountant,
                                               self._pipeline_backend)
        dp_results = dp_engine.aggregate_sketches(sketches, params)
        budget_accountant.compute_budgets()
        dp_results = list(dp_results)
        self.assertLessEqual(len(dp_results), size_upper_bound)
        self.assertGreaterEqual(len(dp_results), size_lower_bound)
Example #10
    def test_aggregate_public_partition_applied(self):
        # Arrange
        aggregator_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.COUNT],
            max_partitions_contributed=1,
            max_contributions_per_partition=1)

        budget_accountant = pipeline_dp.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=1e-10)

        public_partitions = ["pk0", "pk1", "pk101"]

        # Input collection has 100 elements, such that each privacy id
        # contributes 1 time and each partition has 1 element.
        col = list(range(100))
        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: f"pk{x}",
            value_extractor=lambda x: None)

        engine = dp_engine.UtilityAnalysisEngine(
            budget_accountant=budget_accountant,
            backend=pipeline_dp.LocalBackend())

        col = engine.aggregate(col=col,
                               params=aggregator_params,
                               data_extractors=data_extractor,
                               public_partitions=public_partitions)
        budget_accountant.compute_budgets()

        col = list(col)

        # Assert public partitions are applied, i.e. that pk0 and pk1 are kept,
        # and pk101 is added.
        self.assertEqual(len(col), 3)
        self.assertTrue(any(map(lambda x: x[0] == "pk101", col)))
Example #11
    def test_create_compound_combiner_with_custom_combiners(self):
        # Arrange.
        # Create Mock CustomCombiners.
        custom_combiners = [
            dp_combiners.CustomCombiner(),
            dp_combiners.CustomCombiner()
        ]

        # Mock the request_budget functions.
        for combiner in custom_combiners:
            combiner.request_budget = mock.Mock()

        aggregate_params = self._create_aggregate_params(None)

        budget_accountant = pipeline_dp.NaiveBudgetAccountant(1, 1e-10)

        # Act
        compound_combiner = dp_combiners.create_compound_combiner_with_custom_combiners(
            aggregate_params, budget_accountant, custom_combiners)

        # Assert
        self.assertFalse(compound_combiner._return_named_tuple)
        for combiner in custom_combiners:
            combiner.request_budget.assert_called_once()
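    # The _create_aggregate_params fixture is not shown. A minimal sketch
    # of a plausible version (noise kind and bounds are assumptions;
    # metrics=None is what the test passes when the custom combiners
    # supply the metrics themselves):
    def _create_aggregate_params(self, metrics):
        return pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.LAPLACE,
            metrics=metrics,
            max_partitions_contributed=1,
            max_contributions_per_partition=1)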
Example #12
def main(unused_argv):
    # Here, we use a local backend for computations. This does not depend on
    # any pipeline framework and it is implemented in pure Python in
    # PipelineDP. It keeps all data in memory and is not optimized for large data.
    # For datasets smaller than ~tens of megabytes, local execution without any
    # framework is faster than local mode with Beam or Spark.
    backend = pipeline_dp.LocalBackend()

    # Define the privacy budget available for our computation.
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)

    # Load and parse input data
    df = pd.read_csv(FLAGS.input_file)
    df.rename(inplace=True,
              columns={
                  'VisitorId': 'user_id',
                  'Time entered': 'enter_time',
                  'Time spent (minutes)': 'spent_minutes',
                  'Money spent (euros)': 'spent_money',
                  'Day': 'day'
              })
    restaurant_visits_rows = [index_row[1] for index_row in df.iterrows()]

    # Create a DPEngine instance.
    dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)

    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.SUM],
        max_partitions_contributed=3,
        max_contributions_per_partition=2,
        min_value=0,
        max_value=60)

    # Specify how to extract privacy_id, partition_key and value from an
    # element of restaurant_visits_rows.
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda row: row.day,
        privacy_id_extractor=lambda row: row.user_id,
        value_extractor=lambda row: row.spent_money)

    # Create a computational graph for the aggregation.
    # All computations are lazy. dp_result is iterable, but iterating it would
    # fail until budget is computed (below).
    # It’s possible to call DPEngine.aggregate multiple times with different
    # metrics to compute.
    dp_result = dp_engine.aggregate(restaurant_visits_rows,
                                    params,
                                    data_extractors,
                                    public_partitions=list(range(1, 8)))

    budget_accountant.compute_budgets()

    # Here's where the lazy iterator initiates computations and gets transformed
    # into actual results
    dp_result = list(dp_result)

    # Save the results
    write_to_file(dp_result, FLAGS.output_file)

    return 0
Example #13
def main(unused_argv):
    # Here, we use a local backend for computations. This does not depend on
    # any pipeline framework and it is implemented in pure Python in
    # PipelineDP. It keeps all data in memory and is not optimized for large data.
    # For datasets smaller than ~tens of megabytes, local execution without any
    # framework is faster than local mode with Beam or Spark.
    backend = pipeline_dp.LocalBackend()

    # Define the privacy budget available for our computation.
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)

    # Load and parse input data
    df = pd.read_csv(FLAGS.input_file)
    df.rename(inplace=True,
              columns={
                  'VisitorId': 'user_id',
                  'Time entered': 'enter_time',
                  'Time spent (minutes)': 'spent_minutes',
                  'Money spent (euros)': 'spent_money',
                  'Day': 'day'
              })
    # Double the inputs so we have twice as many contributions per partition
    df_double = pd.concat([df, df])
    df_double.columns = df.columns
    restaurant_visits_rows = [
        index_row[1] for index_row in df_double.iterrows()
    ]

    # Create a UtilityAnalysisEngine instance.
    utility_analysis_engine = UtilityAnalysisEngine(budget_accountant, backend)

    # Limit contributions to 1 per partition, so the contribution error will
    # be half of the count.
    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[pipeline_dp.Metrics.COUNT],
        max_partitions_contributed=1,
        max_contributions_per_partition=1)

    # Specify how to extract privacy_id, partition_key and value from an
    # element of restaurant_visits_rows.
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda row: row.day,
        privacy_id_extractor=lambda row: row.user_id,
        value_extractor=lambda row: row.spent_money)

    public_partitions = list(range(1, 8)) if FLAGS.public_partitions else None

    dp_result = utility_analysis_engine.aggregate(restaurant_visits_rows,
                                                  params, data_extractors,
                                                  public_partitions)

    budget_accountant.compute_budgets()

    # Here's where the lazy iterator initiates computations and gets transformed
    # into actual results
    dp_result = list(dp_result)

    # Save the results
    write_to_file(dp_result, FLAGS.output_file)

    return 0
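# The main functions above rely on absl-style FLAGS. A minimal entry-point
# sketch (flag names are inferred from the usage above; the exact
# definitions and defaults are assumptions):
from absl import app
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string('input_file', None, 'Input CSV file.')
flags.DEFINE_string('output_file', None, 'Where to write DP results.')
flags.DEFINE_boolean('public_partitions', False,
                     'Whether to use a public list of partition keys.')
flags.DEFINE_boolean('contribution_bounds_already_enforced', False,
                     'Assume contribution bounds are already enforced.')
flags.DEFINE_boolean('vector_metrics', False,
                     'Compute vector metrics instead of scalar ones.')

if __name__ == '__main__':
    app.run(main)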