Example #1
    def test_privacy_id_count_returns_sensible_result(self):
        # Arrange
        col = [(u, "pk1") for u in range(30)]
        dist_data = PrivateRDDTest.sc.parallelize(col)
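        # Use very high epsilon and delta to minimize noise and test
        # flakiness.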
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)

        def privacy_id_extractor(x):
            return x[0]

        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)
        privacy_id_count_params = agg.PrivacyIdCountParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            budget_weight=1,
            partition_extractor=lambda x: x[1])

        # Act
        actual_result = prdd.privacy_id_count(privacy_id_count_params)
        budget_accountant.compute_budgets()

        # Assert
        # This is a health check to validate that the result is sensible.
        # Hence, we use a very large tolerance to reduce test flakiness.
        expected_result_dict = {"pk1": 30.0}
        actual_result_dict = self.to_dict(actual_result.collect())

        for pk, count in actual_result_dict.items():
            self.assertTrue(
                self.value_per_key_within_tolerance(count,
                                                    expected_result_dict[pk],
                                                    5.0))
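These assertions rely on two helpers, to_dict and value_per_key_within_tolerance, defined elsewhere in the test class. A minimal sketch of plausible implementations (only the names come from the tests; the bodies are assumptions):

    @staticmethod
    def to_dict(rows):
        # Turn a list of (partition_key, value) pairs into a dict.
        return dict(rows)

    @staticmethod
    def value_per_key_within_tolerance(actual, expected, tolerance):
        # DP results are noisy, so compare against an absolute tolerance.
        return abs(actual - expected) <= tolerance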
Example #2
    def test_select_partitions_returns_sensible_result(self):
        # Arrange
        col = [(u, "pk1") for u in range(50)]
        col += [(50 + u, "pk2") for u in range(50)]
        dist_data = PrivateRDDTest.sc.parallelize(col)

        # Use very high epsilon and delta to minimize noise and test
        # flakiness.
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)
        max_partitions_contributed = 2

        def privacy_id_extractor(x):
            return x[0]

        def partition_extractor(x):
            return x[1]

        # Act
        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)

        select_partitions_params = agg.SelectPartitionsParams(
            max_partitions_contributed=max_partitions_contributed)
        actual_result = prdd.select_partitions(select_partitions_params,
                                               partition_extractor)
        budget_accountant.compute_budgets()

        # Assert
        # This is a health check to validate that the result is sensible.
        # Hence, we use a very large tolerance to reduce test flakiness.
        self.assertEqual(sorted(actual_result.collect()), ["pk1", "pk2"])
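Every example references a class-level SparkContext, PrivateRDDTest.sc. A minimal sketch of that scaffolding, assuming a local single-threaded Spark (the real test setup may differ):

import unittest

import pyspark


class PrivateRDDTest(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        # One shared local SparkContext for the whole test class.
        conf = pyspark.SparkConf().setMaster("local[1]")
        cls.sc = pyspark.SparkContext.getOrCreate(conf=conf)

    @classmethod
    def tearDownClass(cls):
        cls.sc.stop()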
Example #3
    def test_variance_calls_aggregate_with_correct_params(
            self, mock_aggregate):
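        # mock_aggregate is injected by an @patch decorator not shown in
        # this excerpt.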
        # Arrange
        dist_data = PrivateRDDTest.sc.parallelize([(1, 0.0, "pk1"),
                                                   (2, 10.0, "pk1")])
        MetricsTuple = collections.namedtuple('MetricsTuple', ['variance'])
        mock_aggregate.return_value = PrivateRDDTest.sc.parallelize([
            ("pk1", MetricsTuple(variance=25.0))
        ])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)

        def privacy_id_extractor(x):
            return x[1]

        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)
        variance_params = agg.VarianceParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            min_value=1.5,
            max_value=5.78,
            budget_weight=1.1,
            partition_extractor=lambda x: x[0],
            value_extractor=lambda x: x)

        # Act
        actual_result = prdd.variance(variance_params)

        # Assert
        mock_aggregate.assert_called_once()
        args = mock_aggregate.call_args[0]

        rdd = dist_data.map(lambda x: (privacy_id_extractor(x), x))
        self.assertListEqual(args[0].collect(), rdd.collect())

        params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.VARIANCE],
            max_partitions_contributed=variance_params.max_partitions_contributed,
            max_contributions_per_partition=variance_params.max_contributions_per_partition,
            min_value=variance_params.min_value,
            max_value=variance_params.max_value,
            budget_weight=variance_params.budget_weight,
            public_partitions=variance_params.public_partitions)
        self.assertEqual(args[1], params)

        self.assertEqual(actual_result.collect(), [("pk1", 25.0)])
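The assertEqual on args[1] works because pipeline_dp.AggregateParams is a dataclass, so two instances compare field by field.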
Example #4
def main(unused_argv):
    delete_if_exists(FLAGS.output_file)

    # Set up Spark.

    # Here, we use one worker thread to load the file as 1 partition.
    # For a truly distributed calculation, connect to a Spark cluster (e.g.
    # running on some cloud provider).
    master = "local[1]"
    conf = pyspark.SparkConf().setMaster(master)
    sc = pyspark.SparkContext(conf=conf)
    movie_views = sc \
        .textFile(FLAGS.input_file) \
        .mapPartitions(parse_partition)

    # Define the privacy budget available for our computation.
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                          total_delta=1e-6)

    # Wrap Spark's RDD into its private version
    private_movie_views = \
        make_private(movie_views, budget_accountant, lambda mv: mv.user_id)

    # Calculate the private sum
    dp_result = private_movie_views.sum(
        SumParams(
            # Limits on how much one user can contribute:
            # .. at most two movies rated per user
            max_partitions_contributed=2,
            # .. at most one rating for each movie
            max_contributions_per_partition=1,
            # .. with minimal rating of "1"
            min_value=1,
            # .. and maximum rating of "5"
            max_value=5,
            # The aggregation key: we're grouping by movies
            partition_extractor=lambda mv: mv.movie_id,
            # The value we're aggregating: we're summing up ratings
            value_extractor=lambda mv: mv.rating))

    budget_accountant.compute_budgets()

    # Save the results
    dp_result.saveAsTextFile(FLAGS.output_file)

    return 0
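main() leans on several helpers that sit outside this excerpt: FLAGS, delete_if_exists, parse_partition, and the MovieView records consumed by the extractors. A hedged sketch of what they might look like, assuming a simple user_id,movie_id,rating CSV layout (the real demo's input format may differ):

import shutil
from dataclasses import dataclass


@dataclass
class MovieView:
    user_id: int
    movie_id: int
    rating: int


def delete_if_exists(path):
    # Spark refuses to write into an existing output directory.
    shutil.rmtree(path, ignore_errors=True)


def parse_partition(lines):
    # mapPartitions hands us an iterator over the raw text lines.
    for line in lines:
        user_id, movie_id, rating = line.strip().split(",")
        yield MovieView(int(user_id), int(movie_id), int(rating))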
Example #5
    def test_mean_calls_aggregate_with_correct_params(self, mock_aggregate):
        # Arrange
        dist_data = PrivateRDDTest.sc.parallelize([(1, 2.0, "pk1"),
                                                   (2, 2.0, "pk1")])
        mock_aggregate.return_value = PrivateRDDTest.sc.parallelize([(2.0,
                                                                      ["pk1"])])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)

        def privacy_id_extractor(x):
            return x[1]

        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)
        mean_params = agg.MeanParams(noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                                     max_partitions_contributed=2,
                                     max_contributions_per_partition=3,
                                     min_value=1.5,
                                     max_value=5.78,
                                     budget_weight=1.1,
                                     public_partitions=None,
                                     partition_extractor=lambda x: x[0],
                                     value_extractor=lambda x: x)

        # Act
        actual_result = prdd.mean(mean_params)

        # Assert
        mock_aggregate.assert_called_once()
        args = mock_aggregate.call_args[0]

        rdd = dist_data.map(lambda x: (privacy_id_extractor(x), x))
        self.assertListEqual(args[0].collect(), rdd.collect())

        params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.MEAN],
            max_partitions_contributed=mean_params.max_partitions_contributed,
            max_contributions_per_partition=mean_params.max_contributions_per_partition,
            min_value=mean_params.min_value,
            max_value=mean_params.max_value,
            budget_weight=mean_params.budget_weight,
            public_partitions=mean_params.public_partitions)
        self.assertEqual(args[1], params)

        self.assertEqual(actual_result.collect(), [(2.0, "pk1")])
Example #6
    def test_variance_with_public_partitions_returns_sensible_result(self):
        # Arrange
        col = [(u, "pubK1", -100) for u in range(30)]
        col += [(u + 30, "pubK1", 100) for u in range(10)]
        col += [(u + 40, "privK1", 100) for u in range(30)]

        dist_data = PrivateRDDTest.sc.parallelize(col)
        # Use very high epsilon and delta to minimize noise and test
        # flakiness.
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=8000, total_delta=0.9999999)

        def privacy_id_extractor(x):
            return x[0]

        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)
        variance_params = agg.VarianceParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            min_value=1.55,  # -100 should be clipped to this value
            max_value=2.7889,  # 100 should be clipped to this value
            budget_weight=1,
            partition_extractor=lambda x: x[1],
            value_extractor=lambda x: x[2])

        # Act
        actual_result = prdd.variance(variance_params,
                                      public_partitions=["pubK1", "pubK2"])
        budget_accountant.compute_budgets()

        # Assert
        # This is a health check to validate that the result is sensible.
        # Hence, we use a very large tolerance to reduce test flakiness.
        expected_result_dict = {"pubK1": 0.288, "pubK2": 0.0}
        actual_result_dict = self.to_dict(actual_result.collect())

        for pk, variance in actual_result_dict.items():
            self.assertTrue(
                self.value_per_key_within_tolerance(variance,
                                                    expected_result_dict[pk],
                                                    0.1))
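The expected value 0.288 is simply the variance of the clipped inputs, with no DP noise involved; a quick sanity check:

# In "pubK1", 30 contributions of -100 are clipped up to min_value=1.55
# and 10 contributions of 100 are clipped down to max_value=2.7889.
values = [1.55] * 30 + [2.7889] * 10
mean = sum(values) / len(values)  # ~1.8597
variance = sum((v - mean) ** 2 for v in values) / len(values)
print(round(variance, 3))  # 0.288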
Example #7
    def test_sum_returns_sensible_result(self):
        # Arrange
        col = [(f"{u}", "pk1", 100.0) for u in range(30)]
        col += [(f"{u + 30}", "pk1", -100.0) for u in range(30)]

        dist_data = PrivateRDDTest.sc.parallelize(col)
        # Use very high epsilon and delta to minimize noise and test
        # flakiness.
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=800, total_delta=0.999)

        def privacy_id_extractor(x):
            return x[0]

        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)
        sum_params = agg.SumParams(noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                                   max_partitions_contributed=2,
                                   max_contributions_per_partition=3,
                                   min_value=1.55,
                                   max_value=2.7889,
                                   budget_weight=1,
                                   public_partitions=None,
                                   partition_extractor=lambda x: x[1],
                                   value_extractor=lambda x: x[2])

        # Act
        actual_result = prdd.sum(sum_params)
        budget_accountant.compute_budgets()

        # Assert
        # This is a health check to validate that the result is sensible.
        # Hence, we use a very large tolerance to reduce test flakiness.
        expected_result_dict = {"pk1": 130.167}
        actual_result_dict = self.to_dict(actual_result.collect())

        for pk, dp_sum in actual_result_dict.items():
            self.assertTrue(
                self.value_per_key_within_tolerance(dp_sum,
                                                    expected_result_dict[pk],
                                                    5.0))
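Likewise, the expected sum 130.167 is just the sum of the clipped inputs:

# 30 ratings of 100.0 are clipped down to max_value=2.7889 and
# 30 ratings of -100.0 are clipped up to min_value=1.55.
print(round(30 * 2.7889 + 30 * 1.55, 3))  # 130.167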
Example #8
    def test_privacy_id_count_calls_aggregate_with_correct_params(
            self, mock_aggregate):
        # Arrange
        dist_data = PrivateRDDTest.sc.parallelize([(1, "pk1"), (2, "pk1")])
        MetricsTuple = collections.namedtuple('MetricsTuple',
                                              ['privacy_id_count'])
        mock_aggregate.return_value = PrivateRDDTest.sc.parallelize([
            ("pk1", MetricsTuple(privacy_id_count=2))
        ])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(1, 1e-10)

        def privacy_id_extractor(x):
            return x[0]

        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)
        privacy_id_count_params = agg.PrivacyIdCountParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            budget_weight=1,
            partition_extractor=lambda x: x[1])

        # Act
        actual_result = prdd.privacy_id_count(privacy_id_count_params)

        # Assert
        mock_aggregate.assert_called_once()
        args = mock_aggregate.call_args[0]

        rdd = dist_data.map(lambda x: (privacy_id_extractor(x), x))
        self.assertListEqual(args[0].collect(), rdd.collect())

        params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
            max_partitions_contributed=privacy_id_count_params.max_partitions_contributed,
            max_contributions_per_partition=1)
        self.assertEqual(args[1], params)

        self.assertEqual([("pk1", 2)], actual_result.collect())
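Note how the expected AggregateParams pin max_contributions_per_partition to 1: for a privacy ID count, each privacy ID adds at most one unit to a partition, no matter how many rows it contributes there.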
Example #9
    def test_select_partitions_calls_select_partitions_with_correct_params(
            self, mock_select_partitions):
        # Arrange
        dist_data = PrivateRDDTest.sc.parallelize([(1, "pk1"), (2, "pk2")])
        expected_result_partitions = ["pk1", "pk2"]
        mock_select_partitions.return_value = PrivateRDDTest.sc.parallelize(
            expected_result_partitions)
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        max_partitions_contributed = 2

        def privacy_id_extractor(x):
            return x[0]

        def partition_extractor(x):
            return x[1]

        # Act
        prdd = private_spark.make_private(dist_data, budget_accountant,
                                          privacy_id_extractor)

        select_partitions_params = agg.SelectPartitionsParams(
            max_partitions_contributed=max_partitions_contributed)
        actual_result = prdd.select_partitions(select_partitions_params,
                                               partition_extractor)

        # Assert
        mock_select_partitions.assert_called_once()
        actual_args = mock_select_partitions.call_args[0]
        actual_rdd = actual_args[0].collect()
        actual_select_partition_params = actual_args[1]

        self.assertListEqual(actual_rdd, [(1, (1, "pk1")), (2, (2, "pk2"))])

        self.assertEqual(
            actual_select_partition_params.max_partitions_contributed,
            max_partitions_contributed)
        self.assertEqual(actual_result.collect(), expected_result_partitions)