Example #1
    def test_run_e2e_local(self):
        input = list(range(10))

        output = self.run_e2e_private_partition_selection_large_budget(
            input, pipeline_dp.LocalBackend())

        self.assertEqual(5, len(list(output)))
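The helper run_e2e_private_partition_selection_large_budget is defined outside this snippet. Below is a minimal sketch of what it could look like, assembled only from APIs used elsewhere on this page (the actual helper in the test suite may differ): it runs private partition selection with a very large budget and maps every two input elements to one partition, which is consistent with the 5 partitions asserted above for 10 inputs.

    def run_e2e_private_partition_selection_large_budget(self, col, backend):
        # A large budget, so that private partition selection keeps
        # (almost) every partition key.
        budget_accountant = pipeline_dp.NaiveBudgetAccountant(
            total_epsilon=100000, total_delta=1e-5)
        engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                      backend=backend)
        data_extractors = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: f"pk{x // 2}")  # 2 ids per partition
        col = engine.select_partitions(
            col=col,
            params=pipeline_dp.SelectPartitionsParams(
                max_partitions_contributed=1),
            data_extractors=data_extractors)
        budget_accountant.compute_budgets()
        return col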
Example #2
 def test_check_invalid_bounding_params(self, error_msg, min_value,
                                        max_value,
                                        max_partitions_contributed,
                                        max_contributions_per_partition,
                                        metrics):
     with self.assertRaises(Exception, msg=error_msg):
         budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                   total_delta=1e-10)
         engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                       backend=pipeline_dp.LocalBackend())
         engine.aggregate(
             [0],
             pipeline_dp.AggregateParams(
                 noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                 max_partitions_contributed=max_partitions_contributed,
                 max_contributions_per_partition=
                 max_contributions_per_partition,
                 min_value=min_value,
                 max_value=max_value,
                 metrics=metrics),
             pipeline_dp.DataExtractors(
                 privacy_id_extractor=lambda x: x,
                 partition_extractor=lambda x: x,
                 value_extractor=lambda x: x,
             ))
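The arguments here (error_msg, min_value, and so on) are supplied by a parameterized-test decorator that the snippet omits. A hypothetical illustration of one row, assuming absl's parameterized decorators, which would sit directly above the method definition (the real test's rows and messages differ):

    from absl.testing import parameterized

    @parameterized.named_parameters(
        dict(testcase_name="min_value_greater_than_max_value",
             error_msg="min_value should be smaller",  # hypothetical message
             min_value=10,
             max_value=1,
             max_partitions_contributed=1,
             max_contributions_per_partition=1,
             metrics=[pipeline_dp.Metrics.COUNT]))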
Example #3
    def test_aggregate_computation_graph_verification(
            self, mock_bound_contributions):
        # Arrange
        aggregator_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[agg.Metrics.COUNT],
            max_partitions_contributed=5,
            max_contributions_per_partition=3)
        budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                  total_delta=1e-10)

        col = [[1], [2], [3], [3]]
        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: f"pid{x}",
            partition_extractor=lambda x: f"pk{x}",
            value_extractor=lambda x: x)

        mock_bound_contributions.return_value = [
            [("pid1", "pk1"), (1, [1])],
            [("pid2", "pk2"), (1, [1])],
            [("pid3", "pk3"), (1, [2])],
        ]

        engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                      backend=pipeline_dp.LocalBackend())
        col = engine.aggregate(col=col,
                               params=aggregator_params,
                               data_extractors=data_extractor)

        # Assert
        mock_bound_contributions.assert_called_with(
            unittest.mock.ANY, aggregator_params.max_partitions_contributed,
            aggregator_params.max_contributions_per_partition,
            unittest.mock.ANY)
Example #4
    def test_aggregate_report(self):
        col = [[1], [2], [3], [3]]
        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: f"pid{x}",
            partition_extractor=lambda x: f"pk{x}",
            value_extractor=lambda x: x)
        params1 = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=3,
            max_contributions_per_partition=2,
            min_value=1,
            max_value=5,
            metrics=[
                pipeline_dp.Metrics.PRIVACY_ID_COUNT,
                pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.MEAN
            ],
        )
        params2 = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=1,
            max_contributions_per_partition=3,
            min_value=2,
            max_value=10,
            metrics=[pipeline_dp.Metrics.SUM, pipeline_dp.Metrics.MEAN],
            public_partitions=list(range(1, 40)),
        )

        select_partitions_params = SelectPartitionsParams(
            max_partitions_contributed=2)

        budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                  total_delta=1e-10)
        engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                      backend=pipeline_dp.LocalBackend())
        engine.aggregate(col, params1, data_extractor)
        engine.aggregate(col, params2, data_extractor)
        engine.select_partitions(col, select_partitions_params, data_extractor)
        self.assertEqual(3, len(engine._report_generators))  # pylint: disable=protected-access
        budget_accountant.compute_budgets()
        self.assertEqual(
            engine._report_generators[0].report(),
            "Differentially private: Computing <Metrics: ['privacy_id_count', 'count', 'mean']>"
            "\n1. Per-partition contribution bounding: randomly selected not more than 2 contributions"
            "\n2. Cross-partition contribution bounding: randomly selected not more than 3 partitions per user"
            "\n3. Private Partition selection: using Truncated Geometric method with (eps= 0.1111111111111111, delta = 1.1111111111111111e-11)"
        )
        self.assertEqual(
            engine._report_generators[1].report(),
            "Differentially private: Computing <Metrics: ['sum', 'mean']>"
            "\n1. Public partition selection: dropped non public partitions"
            "\n2. Per-partition contribution bounding: randomly selected not more than 3 contributions"
            "\n3. Cross-partition contribution bounding: randomly selected not more than 1 partitions per user"
            "\n4. Adding empty partitions to public partitions that are missing in data"
        )
        self.assertEqual(
            engine._report_generators[2].report(),
            "Differentially private: Computing <Private Partitions>"
            "\n1. Private Partition selection: using Truncated Geometric method with (eps= 0.3333333333333333, delta = 3.3333333333333335e-11)"
        )
Example #5
def main(unused_argv):
    # Here, we use a local backend for computations. It does not depend on
    # any pipeline framework and is implemented in pure Python inside
    # PipelineDP. It keeps all data in memory and is not optimized for large
    # datasets. For datasets smaller than ~tens of megabytes, local execution
    # without any framework is faster than local mode with Beam or Spark.
    backend = pipeline_dp.LocalBackend()

    # Define the privacy budget available for our computation.
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)

    # Load and parse input data
    movie_views = parse_file(FLAGS.input_file)

    # Create a DPEngine instance.
    dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)

    params = pipeline_dp.AggregateParams(
        metrics=[
            # We can compute multiple metrics at once.
            pipeline_dp.Metrics.COUNT,
            pipeline_dp.Metrics.SUM,
            pipeline_dp.Metrics.PRIVACY_ID_COUNT
        ],
        # Limits to how much one user can contribute:
        # .. at most two movies rated per user
        max_partitions_contributed=2,
        # .. at most one rating for each movie
        max_contributions_per_partition=1,
        # .. with minimal rating of "1"
        min_value=1,
        # .. and maximum rating of "5"
        max_value=5)

    # Specify how to extract privacy_id, partition_key and value from an
    # element of movie_views.
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda mv: mv.movie_id,
        privacy_id_extractor=lambda mv: mv.user_id,
        value_extractor=lambda mv: mv.rating)

    # Create a computational graph for the aggregation.
    # All computations are lazy. dp_result is iterable, but iterating it would
    # fail until budget is computed (below).
    # It’s possible to call DPEngine.aggregate multiple times with different
    # metrics to compute.
    dp_result = dp_engine.aggregate(movie_views, params, data_extractors)

    budget_accountant.compute_budgets()

    # This is where the lazy iterator performs the computations and is
    # materialized into actual results.
    dp_result = list(dp_result)

    # Save the results
    write_to_file(dp_result, FLAGS.output_file)

    return 0
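Example #5 relies on parse_file and write_to_file helpers that are defined elsewhere in the demo. A minimal sketch, assuming the input file holds one comma-separated user_id,movie_id,rating triple per line (the real demo's file format may differ):

import collections

MovieView = collections.namedtuple("MovieView",
                                   ["user_id", "movie_id", "rating"])

def parse_file(filename):
    # Parse each "user_id,movie_id,rating" line into a MovieView.
    movie_views = []
    with open(filename) as f:
        for line in f:
            user_id, movie_id, rating = line.strip().split(",")
            movie_views.append(
                MovieView(int(user_id), int(movie_id), int(rating)))
    return movie_views

def write_to_file(rows, filename):
    # Write one result row per line.
    with open(filename, "w") as f:
        f.writelines(f"{row}\n" for row in rows)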
Example #6
    def _run_contribution_bounding(self, input, max_contributions):
        params = MaxContributionsParams(max_contributions)

        bounder = contribution_bounders.SamplingPerPrivacyIdContributionBounder()
        return list(
            bounder.bound_contributions(input, params,
                                        pipeline_dp.LocalBackend(),
                                        _create_report_generator(),
                                        aggregate_fn))
Example #7
    def test_check_aggregate_params(self):
        default_extractors = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: x,
            value_extractor=lambda x: x,
        )
        default_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT])

        test_cases = [
            {
                "desc": "None col",
                "col": None,
                "params": default_params,
                "data_extractor": default_extractors,
            },
            {
                "desc": "empty col",
                "col": [],
                "params": default_params,
                "data_extractor": default_extractors
            },
            {
                "desc": "none params",
                "col": [0],
                "params": None,
                "data_extractor": default_extractors,
            },
            {
                "desc": "None data_extractor",
                "col": [0],
                "params": default_params,
                "data_extractor": None,
            },
            {
                "desc": "data_extractor with an incorrect type",
                "col": [0],
                "params": default_params,
                "data_extractor": 1,
            },
        ]

        for test_case in test_cases:
            with self.assertRaises(Exception, msg=test_case["desc"]):
                budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-10)
                engine = pipeline_dp.DPEngine(
                    budget_accountant=budget_accountant,
                    backend=pipeline_dp.LocalBackend())
                engine.aggregate(test_case["col"], test_case["params"],
                                 test_case["data_extractor"])
Example #8
    def _run_contribution_bounding(self, input, max_partitions_contributed,
                                   max_contributions_per_partition):
        params = CrossAndPerPartitionContributionParams(
            max_partitions_contributed, max_contributions_per_partition)

        bounder = contribution_bounders.SamplingCrossAndPerPartitionContributionBounder()
        return list(
            bounder.bound_contributions(input, params,
                                        pipeline_dp.LocalBackend(),
                                        _create_report_generator(),
                                        aggregate_fn))
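Examples #6 and #8 both rely on module-level test helpers _create_report_generator and aggregate_fn that are not shown. A minimal sketch: the report generator mirrors the ReportGenerator(aggregator_params) usage from Example #11, and aggregate_fn is a hypothetical stand-in that folds the bounded values into a (count, sum) pair; the real helpers may differ.

def _create_report_generator():
    # Mirrors ReportGenerator(aggregator_params) from Example #11.
    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[],
        max_partitions_contributed=1,
        max_contributions_per_partition=1)
    return ReportGenerator(params)

def aggregate_fn(input_values):
    # Hypothetical stand-in: fold the bounded values into (count, sum).
    values = list(input_values)
    return len(values), sum(values)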
Example #9
    def test_utility_analysis_params(self):
        default_extractors = self._get_default_extractors()
        default_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            metrics=[pipeline_dp.Metrics.COUNT])
        params_with_custom_combiners = copy.copy(default_params)
        params_with_custom_combiners.custom_combiners = sum
        params_with_unsupported_metric = copy.copy(default_params)
        params_with_unsupported_metric.metrics = [pipeline_dp.Metrics.MEAN]
        # Copy, so that default_params is not mutated for the other cases.
        params_with_contribution_bounds_already_enforced = copy.copy(
            default_params)
        params_with_contribution_bounds_already_enforced.contribution_bounds_already_enforced = True

        test_cases = [
            {
                "desc": "custom combiners",
                "params": params_with_custom_combiners,
                "data_extractor": default_extractors,
                "public_partitions": [1]
            },
            {
                "desc": "unsupported metric in metrics",
                "params": params_with_unsupported_metric,
                "data_extractor": default_extractors,
                "public_partitions": [1]
            },
            {
                "desc": "contribution bounds are already enforced",
                "params": params_with_contribution_bounds_already_enforced,
                "data_extractor": default_extractors,
                "public_partitions": [1]
            },
        ]

        for test_case in test_cases:
            with self.assertRaisesRegex(Exception,
                                        expected_regex=test_case["desc"]):
                                        expected_regex=test_case["desc"]):
                budget_accountant = budget_accounting.NaiveBudgetAccountant(
                    total_epsilon=1, total_delta=1e-10)
                engine = dp_engine.UtilityAnalysisEngine(
                    budget_accountant=budget_accountant,
                    backend=pipeline_dp.LocalBackend())
                col = [0, 1, 2]
                engine.aggregate(
                    col,
                    test_case["params"],
                    test_case["data_extractor"],
                    public_partitions=test_case["public_partitions"])
Example #10
    def test_select_partitions(self):
        # This test is probabilistic, but the parameters were chosen so that
        # it passes reliably (verified over at least 10000 runs).

        # Arrange
        params = SelectPartitionsParams(max_partitions_contributed=1)

        budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                  total_delta=1e-5)

        # Generate the dataset as a list of (user, partition_key) tuples.
        # The partitions are generated to reflect several scenarios.

        # A partition with sufficient amount of users.
        col = [(u, "pk-many-contribs") for u in range(25)]

        # A partition with many contributions, but only a few unique users.
        col += [(100 + u // 10, "pk-many-contribs-few-users")
                for u in range(30)]

        # A partition with few contributions.
        col += [(200 + u, "pk-few-contribs") for u in range(3)]

        # Generate 30 partitions, each with the same group of 25 users.
        # 25 users would be sufficient to keep a partition, but because of
        # contribution bounding, far fewer users per partition will remain.
        for i in range(30):
            col += [(500 + u, f"few-contribs-after-bound{i}")
                    for u in range(25)]

        col = list(col)
        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x[0],
            partition_extractor=lambda x: x[1])

        engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                      backend=pipeline_dp.LocalBackend())

        col = engine.select_partitions(col=col,
                                       params=params,
                                       data_extractors=data_extractor)
        budget_accountant.compute_budgets()

        col = list(col)

        # Assert
        # Only one partition is retained: the one that still has many unique
        # users _after_ applying the "max_partitions_contributed" bound.
        self.assertEqual(["pk-many-contribs"], col)
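A back-of-the-envelope check of why only pk-many-contribs survives: each of the 25 users behind the few-contribs-after-bound partitions contributes to all 30 of them, but max_partitions_contributed=1 keeps each user in only one randomly chosen partition, leaving roughly 25/30 ≈ 0.8 users per partition on average. pk-many-contribs-few-users has 30 contributions from only 3 unique users, and pk-few-contribs has 3 users, so all of these fall below the selection threshold, while pk-many-contribs keeps its full 25 unique users.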
Example #11
 def create_dp_engine_default(accountant: NaiveBudgetAccountant = None,
                              backend: PipelineBackend = None):
     if not accountant:
         accountant = NaiveBudgetAccountant(total_epsilon=1,
                                            total_delta=1e-10)
     if not backend:
         backend = pipeline_dp.LocalBackend()
     dp_engine = pipeline_dp.DPEngine(accountant, backend)
     aggregator_params = pipeline_dp.AggregateParams(
         noise_kind=pipeline_dp.NoiseKind.LAPLACE,
         metrics=[],
         max_partitions_contributed=1,
         max_contributions_per_partition=1)
     dp_engine._report_generators.append(ReportGenerator(aggregator_params))
     dp_engine._add_report_stage("DP Engine Test")
     return dp_engine
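A hedged usage sketch for this factory (the argument values below are illustrative only): callers can take the defaults or inject their own accountant and backend.

# Defaults: total_epsilon=1, total_delta=1e-10, LocalBackend.
engine = create_dp_engine_default()

# Or inject a custom accountant and backend.
engine = create_dp_engine_default(
    accountant=NaiveBudgetAccountant(total_epsilon=2, total_delta=1e-8),
    backend=pipeline_dp.LocalBackend())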
Example #12
    def test_aggregate_public_partitions_add_empty_public_partitions(self):
        # Arrange
        aggregator_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[
                agg.Metrics.COUNT, agg.Metrics.SUM,
                agg.Metrics.PRIVACY_ID_COUNT
            ],
            min_value=0,
            max_value=1,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            public_partitions=["pk0", "pk10", "pk11"])

        # Set a high budget to add close to 0 noise.
        budget_accountant = NaiveBudgetAccountant(total_epsilon=100000,
                                                  total_delta=1 - 1e-10)

        # Input collection has 10 elements, such that each privacy id
        # contributes 1 time and each partition has 1 element.
        col = list(range(10))
        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: f"pk{x}",
            value_extractor=lambda x: 1)

        engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                      backend=pipeline_dp.LocalBackend())

        col = engine.aggregate(col=col,
                               params=aggregator_params,
                               data_extractors=data_extractor)
        budget_accountant.compute_budgets()

        col = list(col)
        partition_keys = [x[0] for x in col]
        # Assert

        # Only public partitions ("pk0") should be kept and empty public
        # partitions ("pk10", "pk11") should be added.
        self.assertEqual(["pk0", "pk10", "pk11"], partition_keys)
        self.assertAlmostEqual(1, col[0][1][0])  # "pk0" COUNT ≈ 1
        self.assertAlmostEqual(1, col[0][1][1])  # "pk0" SUM ≈ 1
        self.assertAlmostEqual(1, col[0][1][2])  # "pk0" PRIVACY_ID_COUNT ≈ 1
        self.assertAlmostEqual(0, col[1][1][0])  # "pk10" COUNT ≈ 0
        self.assertAlmostEqual(0, col[1][1][1])  # "pk10" SUM ≈ 0
        self.assertAlmostEqual(0, col[1][1][2])  # "pk10" PRIVACY_ID_COUNT ≈ 0
Example #13
    def test_aggregate_public_partitions_drop_non_public(self):
        # Arrange
        aggregator_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[
                agg.Metrics.COUNT, agg.Metrics.SUM,
                agg.Metrics.PRIVACY_ID_COUNT
            ],
            min_value=0,
            max_value=1,
            max_partitions_contributed=1,
            max_contributions_per_partition=1,
            public_partitions=["pk0", "pk1", "pk10"])

        # Set an arbitrary budget; we are interested only in the partition
        # keys, not in the DP outputs.
        budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                  total_delta=1e-10)

        # Input collection has 10 elements, such that each privacy id
        # contributes 1 time and each partition has 1 element.
        col = list(range(10))
        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: f"pk{x}",
            value_extractor=lambda x: x)

        engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                      backend=pipeline_dp.LocalBackend())

        col = engine.aggregate(col=col,
                               params=aggregator_params,
                               data_extractors=data_extractor)
        budget_accountant.compute_budgets()

        col = list(col)
        partition_keys = [x[0] for x in col]
        # Assert

        # Only the public partitions ("pk0", "pk1", "pk10") should be kept
        # and the rest of the partitions should be dropped.
        self.assertEqual(["pk0", "pk1", "pk10"], partition_keys)
Example #14
 def test_check_invalid_bounding_params(self, error_msg, min_value,
                                        max_value,
                                        max_partitions_contributed,
                                        max_contributions_per_partition,
                                        max_contributions, metrics):
     with self.assertRaisesRegex(ValueError, error_msg):
         budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                   total_delta=1e-10)
         engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                       backend=pipeline_dp.LocalBackend())
         engine.aggregate(
             [0],
             pipeline_dp.AggregateParams(
                 noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                 max_partitions_contributed=max_partitions_contributed,
                 max_contributions_per_partition=
                 max_contributions_per_partition,
                 min_value=min_value,
                 max_value=max_value,
                 max_contributions=max_contributions,
                 metrics=metrics), self._get_default_extractors())
Example #15
    def test_aggregate_public_partition_applied(self):
        # Arrange
        aggregator_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.COUNT],
            max_partitions_contributed=1,
            max_contributions_per_partition=1)

        budget_accountant = pipeline_dp.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=1e-10)

        public_partitions = ["pk0", "pk1", "pk101"]

        # Input collection has 100 elements, such that each privacy id
        # contributes 1 time and each partition has 1 element.
        col = list(range(100))
        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: f"pk{x}",
            value_extractor=lambda x: None)

        engine = dp_engine.UtilityAnalysisEngine(
            budget_accountant=budget_accountant,
            backend=pipeline_dp.LocalBackend())

        col = engine.aggregate(col=col,
                               params=aggregator_params,
                               data_extractors=data_extractor,
                               public_partitions=public_partitions)
        budget_accountant.compute_budgets()

        col = list(col)

        # Assert public partitions are applied, i.e. that pk0 and pk1 are kept,
        # and pk101 is added.
        self.assertEqual(len(col), 3)
        self.assertTrue(any(map(lambda x: x[0] == "pk101", col)))
Example #16
    def test_aggregate_private_partition_selection_keep_everything(self):
        # Arrange
        aggregator_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[agg.Metrics.COUNT],
            max_partitions_contributed=1,
            max_contributions_per_partition=1)
        # Set a large budget so that the noise is small and all partition
        # keys are kept.
        budget_accountant = NaiveBudgetAccountant(total_epsilon=100000,
                                                  total_delta=1e-10)

        col = list(range(10)) + list(range(100, 120))
        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: f"pk{x//100}",
            value_extractor=lambda x: None)

        engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                      backend=pipeline_dp.LocalBackend())

        col = engine.aggregate(col=col,
                               params=aggregator_params,
                               data_extractors=data_extractor)
        budget_accountant.compute_budgets()

        col = list(col)

        # Assert
        approximate_expected = {"pk0": 10, "pk1": 20}
        self.assertEqual(2, len(col))  # all partition keys are kept.
        for pk, metrics_tuple in col:
            dp_count = metrics_tuple.count
            self.assertAlmostEqual(approximate_expected[pk],
                                   dp_count,
                                   delta=1e-3)
Example #17
    def test_aggregate_private_partition_selection_drop_many(self):
        # Arrange
        aggregator_params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[agg.Metrics.COUNT],
            max_partitions_contributed=1,
            max_contributions_per_partition=1)

        # Set a small budget so that most partition keys are dropped.
        budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                  total_delta=1e-10)

        # Input collection has 100 elements, such that each privacy id
        # contributes 1 time and each partition has 1 element.
        col = list(range(100))
        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: f"pk{x}",
            value_extractor=lambda x: None)

        engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                      backend=pipeline_dp.LocalBackend())

        col = engine.aggregate(col=col,
                               params=aggregator_params,
                               data_extractors=data_extractor)
        budget_accountant.compute_budgets()

        col = list(col)

        # Assert

        # Most partitions should be dropped by private partition selection.
        # This test is non-deterministic, but it should pass with probability
        # very close to 1.
        self.assertLess(len(col), 5)
Example #18
def main(unused_argv):
    # Here, we use a local backend for computations. It does not depend on
    # any pipeline framework and is implemented in pure Python inside
    # PipelineDP. It keeps all data in memory and is not optimized for large
    # datasets. For datasets smaller than ~tens of megabytes, local execution
    # without any framework is faster than local mode with Beam or Spark.
    backend = pipeline_dp.LocalBackend()

    # Define the privacy budget available for our computation.
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)

    # Load and parse input data
    df = pd.read_csv(FLAGS.input_file)
    df.rename(inplace=True,
              columns={
                  'VisitorId': 'user_id',
                  'Time entered': 'enter_time',
                  'Time spent (minutes)': 'spent_minutes',
                  'Money spent (euros)': 'spent_money',
                  'Day': 'day'
              })
    restaurant_visits_rows = [index_row[1] for index_row in df.iterrows()]

    # Create a DPEngine instance.
    dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)

    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.SUM],
        max_partitions_contributed=3,
        max_contributions_per_partition=2,
        min_value=0,
        max_value=60)

    # Specify how to extract privacy_id, partition_key and value from an
    # element of restaurant_visits_rows.
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda row: row.day,
        privacy_id_extractor=lambda row: row.user_id,
        value_extractor=lambda row: row.spent_money)

    # Create a computational graph for the aggregation.
    # All computations are lazy. dp_result is iterable, but iterating it would
    # fail until budget is computed (below).
    # It’s possible to call DPEngine.aggregate multiple times with different
    # metrics to compute.
    dp_result = dp_engine.aggregate(restaurant_visits_rows,
                                    params,
                                    data_extractors,
                                    public_partitions=list(range(1, 8)))

    budget_accountant.compute_budgets()

    # This is where the lazy iterator performs the computations and is
    # materialized into actual results.
    dp_result = list(dp_result)

    # Save the results
    write_to_file(dp_result, FLAGS.output_file)

    return 0
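Note that public_partitions=list(range(1, 8)) declares days 1 through 7 as publicly known partition keys. With public partitions, no privacy budget is spent on partition selection: non-public partitions are dropped and missing public partitions are added as empty ones, as the report in Example #4 shows.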
Example #19
 def setUp(self):
     super().setUp()
     self._pipeline_backend = pipeline_dp.LocalBackend()
Example #20
def compute_on_local_backend():
    movie_views = parse_file(FLAGS.input_file)
    pipeline_backend = pipeline_dp.LocalBackend()
    dp_result = list(calculate_private_result(movie_views, pipeline_backend))
    write_to_file(dp_result, FLAGS.output_file)
Example #21
def main(unused_argv):
    # Here, we use a local backend for computations. It does not depend on
    # any pipeline framework and is implemented in pure Python inside
    # PipelineDP. It keeps all data in memory and is not optimized for large
    # datasets. For datasets smaller than ~tens of megabytes, local execution
    # without any framework is faster than local mode with Beam or Spark.
    backend = pipeline_dp.LocalBackend()

    # Define the privacy budget available for our computation.
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)

    # Load and parse input data
    df = pd.read_csv(FLAGS.input_file)
    df.rename(inplace=True,
              columns={
                  'VisitorId': 'user_id',
                  'Time entered': 'enter_time',
                  'Time spent (minutes)': 'spent_minutes',
                  'Money spent (euros)': 'spent_money',
                  'Day': 'day'
              })
    # Double the inputs so we have twice as many contributions per partition
    df_double = pd.concat([df, df])
    df_double.columns = df.columns
    restaurant_visits_rows = [
        index_row[1] for index_row in df_double.iterrows()
    ]

    # Create a UtilityAnalysisEngine instance.
    utility_analysis_engine = UtilityAnalysisEngine(budget_accountant, backend)

    # Limit contributions to 1 per partition; the contribution error will be
    # about half of the count.
    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[pipeline_dp.Metrics.COUNT],
        max_partitions_contributed=1,
        max_contributions_per_partition=1)

    # Specify how to extract privacy_id, partition_key and value from an
    # element of restaurant_visits_rows.
    data_extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda row: row.day,
        privacy_id_extractor=lambda row: row.user_id,
        value_extractor=lambda row: row.spent_money)

    public_partitions = list(range(1, 8)) if FLAGS.public_partitions else None

    dp_result = utility_analysis_engine.aggregate(restaurant_visits_rows,
                                                  params, data_extractors,
                                                  public_partitions)

    budget_accountant.compute_budgets()

    # This is where the lazy iterator performs the computations and is
    # materialized into actual results.
    dp_result = list(dp_result)

    # Save the results
    write_to_file(dp_result, FLAGS.output_file)

    return 0
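The doubling of the input above is what makes this utility analysis interesting: each user now has two contributions per partition, while max_contributions_per_partition=1 keeps only one of them, so per-partition contribution bounding discards about half of the raw count, which is exactly the contribution error the analysis is set up to surface.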
Example #22
    def test_check_select_partitions(self):
        """ Tests validation of parameters for select_partitions()"""
        default_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: x,
            partition_extractor=lambda x: x,
            value_extractor=lambda x: x,
        )

        test_cases = [
            {
                "desc": "None col",
                "col": None,
                "params": pipeline_dp.SelectPartitionsParams(
                    max_partitions_contributed=1),
                "data_extractor": default_extractor,
            },
            {
                "desc": "empty col",
                "col": [],
                "params": pipeline_dp.SelectPartitionsParams(
                    max_partitions_contributed=1),
                "data_extractor": default_extractor,
            },
            {
                "desc": "none params",
                "col": [0],
                "params": None,
                "data_extractor": default_extractor,
            },
            {
                "desc": "negative max_partitions_contributed",
                "col": [0],
                "params": pipeline_dp.SelectPartitionsParams(
                    max_partitions_contributed=-1),
                "data_extractor": default_extractor,
            },
            {
                "desc": "float max_partitions_contributed",
                "col": [0],
                "params": pipeline_dp.SelectPartitionsParams(
                    max_partitions_contributed=1.1),
                "data_extractor": default_extractor,
            },
            {
                "desc": "None data_extractor",
                "col": [0],
                "params": pipeline_dp.SelectPartitionsParams(
                    max_partitions_contributed=1),
                "data_extractor": None,
            },
            {
                "desc": "Not a function data_extractor",
                "col": [0],
                "params": pipeline_dp.SelectPartitionsParams(
                    max_partitions_contributed=1),
                "data_extractor": 1,
            },
        ]

        for test_case in test_cases:
            with self.assertRaises(Exception, msg=test_case["desc"]):
                budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-10)
                engine = pipeline_dp.DPEngine(
                    budget_accountant=budget_accountant,
                    backend=pipeline_dp.LocalBackend())
                engine.select_partitions(test_case["col"], test_case["params"],
                                         test_case["data_extractor"])
Example #23
    def test_aggregate_report(self):
        col = [[1], [2], [3], [3]]
        data_extractor = pipeline_dp.DataExtractors(
            privacy_id_extractor=lambda x: f"pid{x}",
            partition_extractor=lambda x: f"pk{x}",
            value_extractor=lambda x: x)
        params1 = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=3,
            max_contributions_per_partition=2,
            min_value=1,
            max_value=5,
            metrics=[
                pipeline_dp.Metrics.PRIVACY_ID_COUNT, pipeline_dp.Metrics.COUNT,
                pipeline_dp.Metrics.MEAN
            ],
        )
        params2 = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=1,
            max_contributions_per_partition=3,
            min_value=2,
            max_value=10,
            metrics=[pipeline_dp.Metrics.SUM, pipeline_dp.Metrics.MEAN],
        )

        select_partitions_params = SelectPartitionsParams(
            max_partitions_contributed=2)

        budget_accountant = NaiveBudgetAccountant(total_epsilon=1,
                                                  total_delta=1e-10)
        engine = pipeline_dp.DPEngine(budget_accountant=budget_accountant,
                                      backend=pipeline_dp.LocalBackend())
        engine.aggregate(col, params1, data_extractor)
        engine.aggregate(col, params2, data_extractor, list(range(1, 40)))
        engine.select_partitions(col, select_partitions_params, data_extractor)
        self.assertEqual(3, len(engine._report_generators))  # pylint: disable=protected-access
        budget_accountant.compute_budgets()
        self._check_string_contains_strings(
            engine._report_generators[0].report(),
            [
                "DPEngine method: aggregate",
                "metrics=['privacy_id_count', 'count', 'mean']",
                " noise_kind=gaussian", "max_value=5",
                "Partition selection: private partitions",
                "Cross-partition contribution bounding: for each privacy id randomly select max(actual_partition_contributed, 3)",
                "Private Partition selection: using Truncated Geometric method with (eps="
            ],
        )

        self._check_string_contains_strings(
            engine._report_generators[1].report(),
            [
                "metrics=['sum', 'mean']", " noise_kind=gaussian",
                "max_value=5", "Partition selection: public partitions",
                "Per-partition contribution bounding: for each privacy_id and eachpartition, randomly select max(actual_contributions_per_partition, 3)",
                "Adding empty partitions for public partitions that are missing in data"
            ],
        )

        self._check_string_contains_strings(
            engine._report_generators[2].report(),
            [
                "DPEngine method: select_partitions",
                " budget_weight=1",
                "max_partitions_contributed=2",
                "Private Partition selection: using Truncated Geometric method with",
            ],
        )